diff options
Diffstat (limited to 'arch')
123 files changed, 6096 insertions, 2650 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7cb7d635fbcc..cf6d1cd8b6dc 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1988,6 +1988,7 @@ config ARM64_MTE depends on ARM64_PAN select ARCH_HAS_SUBPAGE_FAULTS select ARCH_USES_HIGH_VMA_FLAGS + select ARCH_USES_PG_ARCH_X help Memory Tagging (part of the ARMv8.5 Extensions) provides architectural support for run-time, always-on detection of diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 8aa8492dafc0..0df3fc3a0173 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -135,7 +135,7 @@ * 40 bits wide (T0SZ = 24). Systems with a PARange smaller than 40 bits are * not known to exist and will break with this configuration. * - * The VTCR_EL2 is configured per VM and is initialised in kvm_arm_setup_stage2(). + * The VTCR_EL2 is configured per VM and is initialised in kvm_init_stage2_mmu. * * Note that when using 4K pages, we concatenate two first level page tables * together. With 16K pages, we concatenate 16 first level page tables. @@ -340,9 +340,13 @@ * We have * PAR [PA_Shift - 1 : 12] = PA [PA_Shift - 1 : 12] * HPFAR [PA_Shift - 9 : 4] = FIPA [PA_Shift - 1 : 12] + * + * Always assume 52 bit PA since at this point, we don't know how many PA bits + * the page table has been set up for. This should be safe since unused address + * bits in PAR are res0. */ #define PAR_TO_HPFAR(par) \ - (((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8) + (((par) & GENMASK_ULL(52 - 1, 12)) >> 8) #define ECN(x) { ESR_ELx_EC_##x, #x } diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 53035763e48e..43c3bc0f9544 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -76,6 +76,9 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs, __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs, __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_init_traps, + __KVM_HOST_SMCCC_FUNC___pkvm_init_vm, + __KVM_HOST_SMCCC_FUNC___pkvm_init_vcpu, + __KVM_HOST_SMCCC_FUNC___pkvm_teardown_vm, }; #define DECLARE_KVM_VHE_SYM(sym) extern char sym[] @@ -106,7 +109,7 @@ enum __kvm_host_smccc_func { #define per_cpu_ptr_nvhe_sym(sym, cpu) \ ({ \ unsigned long base, off; \ - base = kvm_arm_hyp_percpu_base[cpu]; \ + base = kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu]; \ off = (unsigned long)&CHOOSE_NVHE_SYM(sym) - \ (unsigned long)&CHOOSE_NVHE_SYM(__per_cpu_start); \ base ? (typeof(CHOOSE_NVHE_SYM(sym))*)(base + off) : NULL; \ @@ -211,7 +214,7 @@ DECLARE_KVM_HYP_SYM(__kvm_hyp_vector); #define __kvm_hyp_init CHOOSE_NVHE_SYM(__kvm_hyp_init) #define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector) -extern unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; +extern unsigned long kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[]; DECLARE_KVM_NVHE_SYM(__per_cpu_start); DECLARE_KVM_NVHE_SYM(__per_cpu_end); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index fd34ab155d0b..35a159d131b5 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -73,6 +73,63 @@ u32 __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu); +struct kvm_hyp_memcache { + phys_addr_t head; + unsigned long nr_pages; +}; + +static inline void push_hyp_memcache(struct kvm_hyp_memcache *mc, + phys_addr_t *p, + phys_addr_t (*to_pa)(void *virt)) +{ + *p = mc->head; + mc->head = to_pa(p); + mc->nr_pages++; +} + +static inline void *pop_hyp_memcache(struct kvm_hyp_memcache *mc, + void *(*to_va)(phys_addr_t phys)) +{ + phys_addr_t *p = to_va(mc->head); + + if (!mc->nr_pages) + return NULL; + + mc->head = *p; + mc->nr_pages--; + + return p; +} + +static inline int __topup_hyp_memcache(struct kvm_hyp_memcache *mc, + unsigned long min_pages, + void *(*alloc_fn)(void *arg), + phys_addr_t (*to_pa)(void *virt), + void *arg) +{ + while (mc->nr_pages < min_pages) { + phys_addr_t *p = alloc_fn(arg); + + if (!p) + return -ENOMEM; + push_hyp_memcache(mc, p, to_pa); + } + + return 0; +} + +static inline void __free_hyp_memcache(struct kvm_hyp_memcache *mc, + void (*free_fn)(void *virt, void *arg), + void *(*to_va)(phys_addr_t phys), + void *arg) +{ + while (mc->nr_pages) + free_fn(pop_hyp_memcache(mc, to_va), arg); +} + +void free_hyp_memcache(struct kvm_hyp_memcache *mc); +int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages); + struct kvm_vmid { atomic64_t id; }; @@ -115,6 +172,13 @@ struct kvm_smccc_features { unsigned long vendor_hyp_bmap; }; +typedef unsigned int pkvm_handle_t; + +struct kvm_protected_vm { + pkvm_handle_t handle; + struct kvm_hyp_memcache teardown_mc; +}; + struct kvm_arch { struct kvm_s2_mmu mmu; @@ -163,9 +227,19 @@ struct kvm_arch { u8 pfr0_csv2; u8 pfr0_csv3; + struct { + u8 imp:4; + u8 unimp:4; + } dfr0_pmuver; /* Hypercall features firmware registers' descriptor */ struct kvm_smccc_features smccc_feat; + + /* + * For an untrusted host VM, 'pkvm.handle' is used to lookup + * the associated pKVM instance in the hypervisor. + */ + struct kvm_protected_vm pkvm; }; struct kvm_vcpu_fault_info { @@ -925,8 +999,6 @@ int kvm_set_ipa_limit(void); #define __KVM_HAVE_ARCH_VM_ALLOC struct kvm *kvm_arch_alloc_vm(void); -int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type); - static inline bool kvm_vm_is_protected(struct kvm *kvm) { return false; diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index aa7fa2a08f06..6797eafe7890 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -123,4 +123,7 @@ extern u64 kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val); extern u64 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val); extern u64 kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val); +extern unsigned long kvm_nvhe_sym(__icache_flags); +extern unsigned int kvm_nvhe_sym(kvm_arm_vmid_bits); + #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 7784081088e7..e4a7e6369499 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -166,7 +166,7 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, void free_hyp_pgds(void); void stage2_unmap_vm(struct kvm *kvm); -int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu); +int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type); void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu); int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, phys_addr_t pa, unsigned long size, bool writable); diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 3252eb50ecfe..63f81b27a4e3 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -42,6 +42,8 @@ typedef u64 kvm_pte_t; #define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) #define KVM_PTE_ADDR_51_48 GENMASK(15, 12) +#define KVM_PHYS_INVALID (-1ULL) + static inline bool kvm_pte_valid(kvm_pte_t pte) { return pte & KVM_PTE_VALID; @@ -57,6 +59,18 @@ static inline u64 kvm_pte_to_phys(kvm_pte_t pte) return pa; } +static inline kvm_pte_t kvm_phys_to_pte(u64 pa) +{ + kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK; + + if (PAGE_SHIFT == 16) { + pa &= GENMASK(51, 48); + pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48); + } + + return pte; +} + static inline u64 kvm_granule_shift(u32 level) { /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */ @@ -85,6 +99,8 @@ static inline bool kvm_level_supports_block_mapping(u32 level) * allocation is physically contiguous. * @free_pages_exact: Free an exact number of memory pages previously * allocated by zalloc_pages_exact. + * @free_removed_table: Free a removed paging structure by unlinking and + * dropping references. * @get_page: Increment the refcount on a page. * @put_page: Decrement the refcount on a page. When the * refcount reaches 0 the page is automatically @@ -103,6 +119,7 @@ struct kvm_pgtable_mm_ops { void* (*zalloc_page)(void *arg); void* (*zalloc_pages_exact)(size_t size); void (*free_pages_exact)(void *addr, size_t size); + void (*free_removed_table)(void *addr, u32 level); void (*get_page)(void *addr); void (*put_page)(void *addr); int (*page_count)(void *addr); @@ -162,29 +179,6 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end, enum kvm_pgtable_prot prot); /** - * struct kvm_pgtable - KVM page-table. - * @ia_bits: Maximum input address size, in bits. - * @start_level: Level at which the page-table walk starts. - * @pgd: Pointer to the first top-level entry of the page-table. - * @mm_ops: Memory management callbacks. - * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables. - * @flags: Stage-2 page-table flags. - * @force_pte_cb: Function that returns true if page level mappings must - * be used instead of block mappings. - */ -struct kvm_pgtable { - u32 ia_bits; - u32 start_level; - kvm_pte_t *pgd; - struct kvm_pgtable_mm_ops *mm_ops; - - /* Stage-2 only */ - struct kvm_s2_mmu *mmu; - enum kvm_pgtable_stage2_flags flags; - kvm_pgtable_force_pte_cb_t force_pte_cb; -}; - -/** * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk. * @KVM_PGTABLE_WALK_LEAF: Visit leaf entries, including invalid * entries. @@ -192,17 +186,34 @@ struct kvm_pgtable { * children. * @KVM_PGTABLE_WALK_TABLE_POST: Visit table entries after their * children. + * @KVM_PGTABLE_WALK_SHARED: Indicates the page-tables may be shared + * with other software walkers. */ enum kvm_pgtable_walk_flags { KVM_PGTABLE_WALK_LEAF = BIT(0), KVM_PGTABLE_WALK_TABLE_PRE = BIT(1), KVM_PGTABLE_WALK_TABLE_POST = BIT(2), + KVM_PGTABLE_WALK_SHARED = BIT(3), +}; + +struct kvm_pgtable_visit_ctx { + kvm_pte_t *ptep; + kvm_pte_t old; + void *arg; + struct kvm_pgtable_mm_ops *mm_ops; + u64 addr; + u64 end; + u32 level; + enum kvm_pgtable_walk_flags flags; }; -typedef int (*kvm_pgtable_visitor_fn_t)(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, - void * const arg); +typedef int (*kvm_pgtable_visitor_fn_t)(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit); + +static inline bool kvm_pgtable_walk_shared(const struct kvm_pgtable_visit_ctx *ctx) +{ + return ctx->flags & KVM_PGTABLE_WALK_SHARED; +} /** * struct kvm_pgtable_walker - Hook into a page-table walk. @@ -217,6 +228,94 @@ struct kvm_pgtable_walker { const enum kvm_pgtable_walk_flags flags; }; +/* + * RCU cannot be used in a non-kernel context such as the hyp. As such, page + * table walkers used in hyp do not call into RCU and instead use other + * synchronization mechanisms (such as a spinlock). + */ +#if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__) + +typedef kvm_pte_t *kvm_pteref_t; + +static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walker, + kvm_pteref_t pteref) +{ + return pteref; +} + +static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) +{ + /* + * Due to the lack of RCU (or a similar protection scheme), only + * non-shared table walkers are allowed in the hypervisor. + */ + if (walker->flags & KVM_PGTABLE_WALK_SHARED) + return -EPERM; + + return 0; +} + +static inline void kvm_pgtable_walk_end(struct kvm_pgtable_walker *walker) {} + +static inline bool kvm_pgtable_walk_lock_held(void) +{ + return true; +} + +#else + +typedef kvm_pte_t __rcu *kvm_pteref_t; + +static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walker, + kvm_pteref_t pteref) +{ + return rcu_dereference_check(pteref, !(walker->flags & KVM_PGTABLE_WALK_SHARED)); +} + +static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) +{ + if (walker->flags & KVM_PGTABLE_WALK_SHARED) + rcu_read_lock(); + + return 0; +} + +static inline void kvm_pgtable_walk_end(struct kvm_pgtable_walker *walker) +{ + if (walker->flags & KVM_PGTABLE_WALK_SHARED) + rcu_read_unlock(); +} + +static inline bool kvm_pgtable_walk_lock_held(void) +{ + return rcu_read_lock_held(); +} + +#endif + +/** + * struct kvm_pgtable - KVM page-table. + * @ia_bits: Maximum input address size, in bits. + * @start_level: Level at which the page-table walk starts. + * @pgd: Pointer to the first top-level entry of the page-table. + * @mm_ops: Memory management callbacks. + * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables. + * @flags: Stage-2 page-table flags. + * @force_pte_cb: Function that returns true if page level mappings must + * be used instead of block mappings. + */ +struct kvm_pgtable { + u32 ia_bits; + u32 start_level; + kvm_pteref_t pgd; + struct kvm_pgtable_mm_ops *mm_ops; + + /* Stage-2 only */ + struct kvm_s2_mmu *mmu; + enum kvm_pgtable_stage2_flags flags; + kvm_pgtable_force_pte_cb_t force_pte_cb; +}; + /** * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table. * @pgt: Uninitialised page-table structure to initialise. @@ -297,6 +396,14 @@ u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size); u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift); /** + * kvm_pgtable_stage2_pgd_size() - Helper to compute size of a stage-2 PGD + * @vtcr: Content of the VTCR register. + * + * Return: the size (in bytes) of the stage-2 PGD + */ +size_t kvm_pgtable_stage2_pgd_size(u64 vtcr); + +/** * __kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table. * @pgt: Uninitialised page-table structure to initialise. * @mmu: S2 MMU context for this S2 translation @@ -325,6 +432,17 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); /** + * kvm_pgtable_stage2_free_removed() - Free a removed stage-2 paging structure. + * @mm_ops: Memory management callbacks. + * @pgtable: Unlinked stage-2 paging structure to be freed. + * @level: Level of the stage-2 paging structure to be freed. + * + * The page-table is assumed to be unreachable by any hardware walkers prior to + * freeing and therefore no TLB invalidation is performed. + */ +void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level); + +/** * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address at which to place the mapping. @@ -333,6 +451,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); * @prot: Permissions and attributes for the mapping. * @mc: Cache of pre-allocated and zeroed memory from which to allocate * page-table pages. + * @flags: Flags to control the page-table walk (ex. a shared walk) * * The offset of @addr within a page is ignored, @size is rounded-up to * the next page boundary and @phys is rounded-down to the previous page @@ -354,7 +473,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); */ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot, - void *mc); + void *mc, enum kvm_pgtable_walk_flags flags); /** * kvm_pgtable_stage2_set_owner() - Unmap and annotate pages in the IPA space to diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 9f4ad2a8df59..01129b0d4c68 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -9,11 +9,49 @@ #include <linux/memblock.h> #include <asm/kvm_pgtable.h> +/* Maximum number of VMs that can co-exist under pKVM. */ +#define KVM_MAX_PVMS 255 + #define HYP_MEMBLOCK_REGIONS 128 +int pkvm_init_host_vm(struct kvm *kvm); +int pkvm_create_hyp_vm(struct kvm *kvm); +void pkvm_destroy_hyp_vm(struct kvm *kvm); + extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; extern unsigned int kvm_nvhe_sym(hyp_memblock_nr); +static inline unsigned long +hyp_vmemmap_memblock_size(struct memblock_region *reg, size_t vmemmap_entry_size) +{ + unsigned long nr_pages = reg->size >> PAGE_SHIFT; + unsigned long start, end; + + start = (reg->base >> PAGE_SHIFT) * vmemmap_entry_size; + end = start + nr_pages * vmemmap_entry_size; + start = ALIGN_DOWN(start, PAGE_SIZE); + end = ALIGN(end, PAGE_SIZE); + + return end - start; +} + +static inline unsigned long hyp_vmemmap_pages(size_t vmemmap_entry_size) +{ + unsigned long res = 0, i; + + for (i = 0; i < kvm_nvhe_sym(hyp_memblock_nr); i++) { + res += hyp_vmemmap_memblock_size(&kvm_nvhe_sym(hyp_memory)[i], + vmemmap_entry_size); + } + + return res >> PAGE_SHIFT; +} + +static inline unsigned long hyp_vm_table_pages(void) +{ + return PAGE_ALIGN(KVM_MAX_PVMS * sizeof(void *)) >> PAGE_SHIFT; +} + static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages) { unsigned long total = 0, i; diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index 760c62f8e22f..20dd06d70af5 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -25,7 +25,7 @@ unsigned long mte_copy_tags_to_user(void __user *to, void *from, unsigned long n); int mte_save_tags(struct page *page); void mte_save_page_tags(const void *page_addr, void *tag_storage); -bool mte_restore_tags(swp_entry_t entry, struct page *page); +void mte_restore_tags(swp_entry_t entry, struct page *page); void mte_restore_page_tags(void *page_addr, const void *tag_storage); void mte_invalidate_tags(int type, pgoff_t offset); void mte_invalidate_tags_area(int type); @@ -36,6 +36,58 @@ void mte_free_tag_storage(char *storage); /* track which pages have valid allocation tags */ #define PG_mte_tagged PG_arch_2 +/* simple lock to avoid multiple threads tagging the same page */ +#define PG_mte_lock PG_arch_3 + +static inline void set_page_mte_tagged(struct page *page) +{ + /* + * Ensure that the tags written prior to this function are visible + * before the page flags update. + */ + smp_wmb(); + set_bit(PG_mte_tagged, &page->flags); +} + +static inline bool page_mte_tagged(struct page *page) +{ + bool ret = test_bit(PG_mte_tagged, &page->flags); + + /* + * If the page is tagged, ensure ordering with a likely subsequent + * read of the tags. + */ + if (ret) + smp_rmb(); + return ret; +} + +/* + * Lock the page for tagging and return 'true' if the page can be tagged, + * 'false' if already tagged. PG_mte_tagged is never cleared and therefore the + * locking only happens once for page initialisation. + * + * The page MTE lock state: + * + * Locked: PG_mte_lock && !PG_mte_tagged + * Unlocked: !PG_mte_lock || PG_mte_tagged + * + * Acquire semantics only if the page is tagged (returning 'false'). + */ +static inline bool try_page_mte_tagging(struct page *page) +{ + if (!test_and_set_bit(PG_mte_lock, &page->flags)) + return true; + + /* + * The tags are either being initialised or may have been initialised + * already. Check if the PG_mte_tagged flag has been set or wait + * otherwise. + */ + smp_cond_load_acquire(&page->flags, VAL & (1UL << PG_mte_tagged)); + + return false; +} void mte_zero_clear_page_tags(void *addr); void mte_sync_tags(pte_t old_pte, pte_t pte); @@ -56,6 +108,17 @@ size_t mte_probe_user_range(const char __user *uaddr, size_t size); /* unused if !CONFIG_ARM64_MTE, silence the compiler */ #define PG_mte_tagged 0 +static inline void set_page_mte_tagged(struct page *page) +{ +} +static inline bool page_mte_tagged(struct page *page) +{ + return false; +} +static inline bool try_page_mte_tagging(struct page *page) +{ + return false; +} static inline void mte_zero_clear_page_tags(void *addr) { } diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index b3faf7582a53..6914add66bcf 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1046,8 +1046,8 @@ static inline void arch_swap_invalidate_area(int type) #define __HAVE_ARCH_SWAP_RESTORE static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) { - if (system_supports_mte() && mte_restore_tags(entry, &folio->page)) - set_bit(PG_mte_tagged, &folio->flags); + if (system_supports_mte()) + mte_restore_tags(entry, &folio->page); } #endif /* CONFIG_ARM64_MTE */ diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 316917b98707..a7a857f1784d 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -43,6 +43,7 @@ #define __KVM_HAVE_VCPU_EVENTS #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 +#define KVM_DIRTY_LOG_PAGE_OFFSET 64 #define KVM_REG_SIZE(id) \ (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT)) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 7e76e1fda2a1..a77315b338e6 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2076,8 +2076,10 @@ static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap) * Clear the tags in the zero page. This needs to be done via the * linear map which has the Tagged attribute. */ - if (!test_and_set_bit(PG_mte_tagged, &ZERO_PAGE(0)->flags)) + if (try_page_mte_tagging(ZERO_PAGE(0))) { mte_clear_page_tags(lm_alias(empty_zero_page)); + set_page_mte_tagged(ZERO_PAGE(0)); + } kasan_init_hw_tags_cpu(); } diff --git a/arch/arm64/kernel/elfcore.c b/arch/arm64/kernel/elfcore.c index 27ef7ad3ffd2..353009d7f307 100644 --- a/arch/arm64/kernel/elfcore.c +++ b/arch/arm64/kernel/elfcore.c @@ -47,7 +47,7 @@ static int mte_dump_tag_range(struct coredump_params *cprm, * Pages mapped in user space as !pte_access_permitted() (e.g. * PROT_EXEC only) may not have the PG_mte_tagged flag set. */ - if (!test_bit(PG_mte_tagged, &page->flags)) { + if (!page_mte_tagged(page)) { put_page(page); dump_skip(cprm, MTE_PAGE_TAG_STORAGE); continue; diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index af5df48ba915..788597a6b6a2 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -271,7 +271,7 @@ static int swsusp_mte_save_tags(void) if (!page) continue; - if (!test_bit(PG_mte_tagged, &page->flags)) + if (!page_mte_tagged(page)) continue; ret = save_tags(page, pfn); diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index f31130ba0233..d0e9bb5c91fc 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -63,12 +63,6 @@ KVM_NVHE_ALIAS(nvhe_hyp_panic_handler); /* Vectors installed by hyp-init on reset HVC. */ KVM_NVHE_ALIAS(__hyp_stub_vectors); -/* Kernel symbol used by icache_is_vpipt(). */ -KVM_NVHE_ALIAS(__icache_flags); - -/* VMID bits set by the KVM VMID allocator */ -KVM_NVHE_ALIAS(kvm_arm_vmid_bits); - /* Static keys which are set if a vGIC trap should be handled in hyp. */ KVM_NVHE_ALIAS(vgic_v2_cpuif_trap); KVM_NVHE_ALIAS(vgic_v3_cpuif_trap); @@ -84,9 +78,6 @@ KVM_NVHE_ALIAS(gic_nonsecure_priorities); KVM_NVHE_ALIAS(__start___kvm_ex_table); KVM_NVHE_ALIAS(__stop___kvm_ex_table); -/* Array containing bases of nVHE per-CPU memory regions. */ -KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base); - /* PMU available static key */ #ifdef CONFIG_HW_PERF_EVENTS KVM_NVHE_ALIAS(kvm_arm_pmu_available); @@ -103,12 +94,6 @@ KVM_NVHE_ALIAS_HYP(__memcpy, __pi_memcpy); KVM_NVHE_ALIAS_HYP(__memset, __pi_memset); #endif -/* Kernel memory sections */ -KVM_NVHE_ALIAS(__start_rodata); -KVM_NVHE_ALIAS(__end_rodata); -KVM_NVHE_ALIAS(__bss_start); -KVM_NVHE_ALIAS(__bss_stop); - /* Hyp memory sections */ KVM_NVHE_ALIAS(__hyp_idmap_text_start); KVM_NVHE_ALIAS(__hyp_idmap_text_end); diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 7467217c1eaf..f5bcb0dc6267 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -41,19 +41,17 @@ static void mte_sync_page_tags(struct page *page, pte_t old_pte, if (check_swap && is_swap_pte(old_pte)) { swp_entry_t entry = pte_to_swp_entry(old_pte); - if (!non_swap_entry(entry) && mte_restore_tags(entry, page)) - return; + if (!non_swap_entry(entry)) + mte_restore_tags(entry, page); } if (!pte_is_tagged) return; - /* - * Test PG_mte_tagged again in case it was racing with another - * set_pte_at(). - */ - if (!test_and_set_bit(PG_mte_tagged, &page->flags)) + if (try_page_mte_tagging(page)) { mte_clear_page_tags(page_address(page)); + set_page_mte_tagged(page); + } } void mte_sync_tags(pte_t old_pte, pte_t pte) @@ -69,9 +67,11 @@ void mte_sync_tags(pte_t old_pte, pte_t pte) /* if PG_mte_tagged is set, tags have already been initialised */ for (i = 0; i < nr_pages; i++, page++) { - if (!test_bit(PG_mte_tagged, &page->flags)) + if (!page_mte_tagged(page)) { mte_sync_page_tags(page, old_pte, check_swap, pte_is_tagged); + set_page_mte_tagged(page); + } } /* ensure the tags are visible before the PTE is set */ @@ -96,8 +96,7 @@ int memcmp_pages(struct page *page1, struct page *page2) * pages is tagged, set_pte_at() may zero or change the tags of the * other page via mte_sync_tags(). */ - if (test_bit(PG_mte_tagged, &page1->flags) || - test_bit(PG_mte_tagged, &page2->flags)) + if (page_mte_tagged(page1) || page_mte_tagged(page2)) return addr1 != addr2; return ret; @@ -454,7 +453,7 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, put_page(page); break; } - WARN_ON_ONCE(!test_bit(PG_mte_tagged, &page->flags)); + WARN_ON_ONCE(!page_mte_tagged(page)); /* limit access to the end of the page */ offset = offset_in_page(addr); diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 815cc118c675..05da3c8f7e88 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -32,6 +32,8 @@ menuconfig KVM select KVM_VFIO select HAVE_KVM_EVENTFD select HAVE_KVM_IRQFD + select HAVE_KVM_DIRTY_RING_ACQ_REL + select NEED_KVM_DIRTY_RING_WITH_BITMAP select HAVE_KVM_MSI select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQ_ROUTING diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 94d33e296e10..9c5573bc4614 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -37,6 +37,7 @@ #include <asm/kvm_arm.h> #include <asm/kvm_asm.h> #include <asm/kvm_mmu.h> +#include <asm/kvm_pkvm.h> #include <asm/kvm_emulate.h> #include <asm/sections.h> @@ -50,7 +51,6 @@ DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); -unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); static bool vgic_present; @@ -138,24 +138,24 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int ret; - ret = kvm_arm_setup_stage2(kvm, type); - if (ret) - return ret; - - ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu); + ret = kvm_share_hyp(kvm, kvm + 1); if (ret) return ret; - ret = kvm_share_hyp(kvm, kvm + 1); + ret = pkvm_init_host_vm(kvm); if (ret) - goto out_free_stage2_pgd; + goto err_unshare_kvm; if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL)) { ret = -ENOMEM; - goto out_free_stage2_pgd; + goto err_unshare_kvm; } cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask); + ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu, type); + if (ret) + goto err_free_cpumask; + kvm_vgic_early_init(kvm); /* The maximum number of VCPUs is limited by the host's GIC model */ @@ -164,9 +164,18 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) set_default_spectre(kvm); kvm_arm_init_hypercalls(kvm); - return ret; -out_free_stage2_pgd: - kvm_free_stage2_pgd(&kvm->arch.mmu); + /* + * Initialise the default PMUver before there is a chance to + * create an actual PMU. + */ + kvm->arch.dfr0_pmuver.imp = kvm_arm_pmu_get_pmuver_limit(); + + return 0; + +err_free_cpumask: + free_cpumask_var(kvm->arch.supported_cpus); +err_unshare_kvm: + kvm_unshare_hyp(kvm, kvm + 1); return ret; } @@ -187,6 +196,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_vgic_destroy(kvm); + if (is_protected_kvm_enabled()) + pkvm_destroy_hyp_vm(kvm); + kvm_destroy_vcpus(kvm); kvm_unshare_hyp(kvm, kvm + 1); @@ -569,6 +581,12 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) if (ret) return ret; + if (is_protected_kvm_enabled()) { + ret = pkvm_create_hyp_vm(kvm); + if (ret) + return ret; + } + if (!irqchip_in_kernel(kvm)) { /* * Tell the rest of the code that there are userspace irqchip @@ -746,6 +764,9 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_SUSPEND, vcpu)) return kvm_vcpu_suspend(vcpu); + + if (kvm_dirty_ring_check_request(vcpu)) + return 0; } return 1; @@ -1518,7 +1539,7 @@ static int kvm_init_vector_slots(void) return 0; } -static void cpu_prepare_hyp_mode(int cpu) +static void cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) { struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); unsigned long tcr; @@ -1534,23 +1555,9 @@ static void cpu_prepare_hyp_mode(int cpu) params->mair_el2 = read_sysreg(mair_el1); - /* - * The ID map may be configured to use an extended virtual address - * range. This is only the case if system RAM is out of range for the - * currently configured page size and VA_BITS, in which case we will - * also need the extended virtual range for the HYP ID map, or we won't - * be able to enable the EL2 MMU. - * - * However, at EL2, there is only one TTBR register, and we can't switch - * between translation tables *and* update TCR_EL2.T0SZ at the same - * time. Bottom line: we need to use the extended range with *both* our - * translation tables. - * - * So use the same T0SZ value we use for the ID map. - */ tcr = (read_sysreg(tcr_el1) & TCR_EL2_MASK) | TCR_EL2_RES1; tcr &= ~TCR_T0SZ_MASK; - tcr |= (idmap_t0sz & GENMASK(TCR_TxSZ_WIDTH - 1, 0)) << TCR_T0SZ_OFFSET; + tcr |= TCR_T0SZ(hyp_va_bits); params->tcr_el2 = tcr; params->pgd_pa = kvm_mmu_get_httbr(); @@ -1844,13 +1851,13 @@ static void teardown_hyp_mode(void) free_hyp_pgds(); for_each_possible_cpu(cpu) { free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); - free_pages(kvm_arm_hyp_percpu_base[cpu], nvhe_percpu_order()); + free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order()); } } static int do_pkvm_init(u32 hyp_va_bits) { - void *per_cpu_base = kvm_ksym_ref(kvm_arm_hyp_percpu_base); + void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)); int ret; preempt_disable(); @@ -1870,11 +1877,8 @@ static int do_pkvm_init(u32 hyp_va_bits) return ret; } -static int kvm_hyp_init_protection(u32 hyp_va_bits) +static void kvm_hyp_init_symbols(void) { - void *addr = phys_to_virt(hyp_mem_base); - int ret; - kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1); @@ -1883,6 +1887,14 @@ static int kvm_hyp_init_protection(u32 hyp_va_bits) kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1); + kvm_nvhe_sym(__icache_flags) = __icache_flags; + kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits; +} + +static int kvm_hyp_init_protection(u32 hyp_va_bits) +{ + void *addr = phys_to_virt(hyp_mem_base); + int ret; ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP); if (ret) @@ -1950,7 +1962,7 @@ static int init_hyp_mode(void) page_addr = page_address(page); memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size()); - kvm_arm_hyp_percpu_base[cpu] = (unsigned long)page_addr; + kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu] = (unsigned long)page_addr; } /* @@ -2043,7 +2055,7 @@ static int init_hyp_mode(void) } for_each_possible_cpu(cpu) { - char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu]; + char *percpu_begin = (char *)kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu]; char *percpu_end = percpu_begin + nvhe_percpu_size(); /* Map Hyp percpu pages */ @@ -2054,9 +2066,11 @@ static int init_hyp_mode(void) } /* Prepare the CPU initialization parameters */ - cpu_prepare_hyp_mode(cpu); + cpu_prepare_hyp_mode(cpu, hyp_va_bits); } + kvm_hyp_init_symbols(); + if (is_protected_kvm_enabled()) { init_cpu_logical_map(); @@ -2064,9 +2078,7 @@ static int init_hyp_mode(void) err = -ENODEV; goto out_err; } - } - if (is_protected_kvm_enabled()) { err = kvm_hyp_init_protection(hyp_va_bits); if (err) { kvm_err("Failed to init hyp memory protection\n"); @@ -2130,6 +2142,11 @@ struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) return NULL; } +bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm); +} + bool kvm_arch_has_irq_bypass(void) { return true; diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 2ff13a3f8479..5626ddb540ce 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -1059,7 +1059,7 @@ long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, maddr = page_address(page); if (!write) { - if (test_bit(PG_mte_tagged, &page->flags)) + if (page_mte_tagged(page)) num_tags = mte_copy_tags_to_user(tags, maddr, MTE_GRANULES_PER_PAGE); else @@ -1068,15 +1068,19 @@ long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, clear_user(tags, MTE_GRANULES_PER_PAGE); kvm_release_pfn_clean(pfn); } else { + /* + * Only locking to serialise with a concurrent + * set_pte_at() in the VMM but still overriding the + * tags, hence ignoring the return value. + */ + try_page_mte_tagging(page); num_tags = mte_copy_tags_from_user(maddr, tags, MTE_GRANULES_PER_PAGE); - /* - * Set the flag after checking the write - * completed fully - */ - if (num_tags == MTE_GRANULES_PER_PAGE) - set_bit(PG_mte_tagged, &page->flags); + /* uaccess failed, don't leave stale tags */ + if (num_tags != MTE_GRANULES_PER_PAGE) + mte_clear_page_tags(page); + set_page_mte_tagged(page); kvm_release_pfn_dirty(pfn); } diff --git a/arch/arm64/kvm/hyp/hyp-constants.c b/arch/arm64/kvm/hyp/hyp-constants.c index b3742a6691e8..b257a3b4bfc5 100644 --- a/arch/arm64/kvm/hyp/hyp-constants.c +++ b/arch/arm64/kvm/hyp/hyp-constants.c @@ -2,9 +2,12 @@ #include <linux/kbuild.h> #include <nvhe/memory.h> +#include <nvhe/pkvm.h> int main(void) { DEFINE(STRUCT_HYP_PAGE_SIZE, sizeof(struct hyp_page)); + DEFINE(PKVM_HYP_VM_SIZE, sizeof(struct pkvm_hyp_vm)); + DEFINE(PKVM_HYP_VCPU_SIZE, sizeof(struct pkvm_hyp_vcpu)); return 0; } diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 80e99836eac7..b7bdbe63deed 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -8,8 +8,10 @@ #define __KVM_NVHE_MEM_PROTECT__ #include <linux/kvm_host.h> #include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> #include <asm/kvm_pgtable.h> #include <asm/virt.h> +#include <nvhe/pkvm.h> #include <nvhe/spinlock.h> /* @@ -43,30 +45,45 @@ static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot) return prot & PKVM_PAGE_STATE_PROT_MASK; } -struct host_kvm { +struct host_mmu { struct kvm_arch arch; struct kvm_pgtable pgt; struct kvm_pgtable_mm_ops mm_ops; hyp_spinlock_t lock; }; -extern struct host_kvm host_kvm; +extern struct host_mmu host_mmu; -extern const u8 pkvm_hyp_id; +/* This corresponds to page-table locking order */ +enum pkvm_component_id { + PKVM_ID_HOST, + PKVM_ID_HYP, +}; + +extern unsigned long hyp_nr_cpus; int __pkvm_prot_finalize(void); int __pkvm_host_share_hyp(u64 pfn); int __pkvm_host_unshare_hyp(u64 pfn); +int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages); +int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id); int kvm_host_prepare_stage2(void *pgt_pool_base); +int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd); void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); +int hyp_pin_shared_mem(void *from, void *to); +void hyp_unpin_shared_mem(void *from, void *to); +void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc); +int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, + struct kvm_hyp_memcache *host_mc); + static __always_inline void __load_host_stage2(void) { if (static_branch_likely(&kvm_protected_mode_initialized)) - __load_stage2(&host_kvm.arch.mmu, &host_kvm.arch); + __load_stage2(&host_mmu.arch.mmu, &host_mmu.arch); else write_sysreg(0, vttbr_el2); } diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h index 592b7edb3edb..ab205c4d6774 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/memory.h +++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h @@ -38,6 +38,10 @@ static inline phys_addr_t hyp_virt_to_phys(void *addr) #define hyp_page_to_virt(page) __hyp_va(hyp_page_to_phys(page)) #define hyp_page_to_pool(page) (((struct hyp_page *)page)->pool) +/* + * Refcounting for 'struct hyp_page'. + * hyp_pool::lock must be held if atomic access to the refcount is required. + */ static inline int hyp_page_count(void *addr) { struct hyp_page *p = hyp_virt_to_page(addr); @@ -45,4 +49,27 @@ static inline int hyp_page_count(void *addr) return p->refcount; } +static inline void hyp_page_ref_inc(struct hyp_page *p) +{ + BUG_ON(p->refcount == USHRT_MAX); + p->refcount++; +} + +static inline void hyp_page_ref_dec(struct hyp_page *p) +{ + BUG_ON(!p->refcount); + p->refcount--; +} + +static inline int hyp_page_ref_dec_and_test(struct hyp_page *p) +{ + hyp_page_ref_dec(p); + return (p->refcount == 0); +} + +static inline void hyp_set_page_refcounted(struct hyp_page *p) +{ + BUG_ON(p->refcount); + p->refcount = 1; +} #endif /* __KVM_HYP_MEMORY_H */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h index 42d8eb9bfe72..d5ec972b5c1e 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h @@ -13,9 +13,13 @@ extern struct kvm_pgtable pkvm_pgtable; extern hyp_spinlock_t pkvm_pgd_lock; +int hyp_create_pcpu_fixmap(void); +void *hyp_fixmap_map(phys_addr_t phys); +void hyp_fixmap_unmap(void); + int hyp_create_idmap(u32 hyp_va_bits); int hyp_map_vectors(void); -int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back); +int hyp_back_vmemmap(phys_addr_t back); int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot); int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot); int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot); @@ -24,16 +28,4 @@ int __pkvm_create_private_mapping(phys_addr_t phys, size_t size, unsigned long *haddr); int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr); -static inline void hyp_vmemmap_range(phys_addr_t phys, unsigned long size, - unsigned long *start, unsigned long *end) -{ - unsigned long nr_pages = size >> PAGE_SHIFT; - struct hyp_page *p = hyp_phys_to_page(phys); - - *start = (unsigned long)p; - *end = *start + nr_pages * sizeof(struct hyp_page); - *start = ALIGN_DOWN(*start, PAGE_SIZE); - *end = ALIGN(*end, PAGE_SIZE); -} - #endif /* __KVM_HYP_MM_H */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h new file mode 100644 index 000000000000..82b3d62538a6 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 Google LLC + * Author: Fuad Tabba <tabba@google.com> + */ + +#ifndef __ARM64_KVM_NVHE_PKVM_H__ +#define __ARM64_KVM_NVHE_PKVM_H__ + +#include <asm/kvm_pkvm.h> + +#include <nvhe/gfp.h> +#include <nvhe/spinlock.h> + +/* + * Holds the relevant data for maintaining the vcpu state completely at hyp. + */ +struct pkvm_hyp_vcpu { + struct kvm_vcpu vcpu; + + /* Backpointer to the host's (untrusted) vCPU instance. */ + struct kvm_vcpu *host_vcpu; +}; + +/* + * Holds the relevant data for running a protected vm. + */ +struct pkvm_hyp_vm { + struct kvm kvm; + + /* Backpointer to the host's (untrusted) KVM instance. */ + struct kvm *host_kvm; + + /* The guest's stage-2 page-table managed by the hypervisor. */ + struct kvm_pgtable pgt; + struct kvm_pgtable_mm_ops mm_ops; + struct hyp_pool pool; + hyp_spinlock_t lock; + + /* + * The number of vcpus initialized and ready to run. + * Modifying this is protected by 'vm_table_lock'. + */ + unsigned int nr_vcpus; + + /* Array of the hyp vCPU structures for this VM. */ + struct pkvm_hyp_vcpu *vcpus[]; +}; + +static inline struct pkvm_hyp_vm * +pkvm_hyp_vcpu_to_hyp_vm(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + return container_of(hyp_vcpu->vcpu.kvm, struct pkvm_hyp_vm, kvm); +} + +void pkvm_hyp_vm_table_init(void *tbl); + +int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, + unsigned long pgd_hva); +int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, + unsigned long vcpu_hva); +int __pkvm_teardown_vm(pkvm_handle_t handle); + +struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, + unsigned int vcpu_idx); +void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu); + +#endif /* __ARM64_KVM_NVHE_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h index 4652fd04bdbe..7c7ea8c55405 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h +++ b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h @@ -28,9 +28,17 @@ typedef union hyp_spinlock { }; } hyp_spinlock_t; +#define __HYP_SPIN_LOCK_INITIALIZER \ + { .__val = 0 } + +#define __HYP_SPIN_LOCK_UNLOCKED \ + ((hyp_spinlock_t) __HYP_SPIN_LOCK_INITIALIZER) + +#define DEFINE_HYP_SPINLOCK(x) hyp_spinlock_t x = __HYP_SPIN_LOCK_UNLOCKED + #define hyp_spin_lock_init(l) \ do { \ - *(l) = (hyp_spinlock_t){ .__val = 0 }; \ + *(l) = __HYP_SPIN_LOCK_UNLOCKED; \ } while (0) static inline void hyp_spin_lock(hyp_spinlock_t *lock) diff --git a/arch/arm64/kvm/hyp/nvhe/cache.S b/arch/arm64/kvm/hyp/nvhe/cache.S index 0c367eb5f4e2..85936c17ae40 100644 --- a/arch/arm64/kvm/hyp/nvhe/cache.S +++ b/arch/arm64/kvm/hyp/nvhe/cache.S @@ -12,3 +12,14 @@ SYM_FUNC_START(__pi_dcache_clean_inval_poc) ret SYM_FUNC_END(__pi_dcache_clean_inval_poc) SYM_FUNC_ALIAS(dcache_clean_inval_poc, __pi_dcache_clean_inval_poc) + +SYM_FUNC_START(__pi_icache_inval_pou) +alternative_if ARM64_HAS_CACHE_DIC + isb + ret +alternative_else_nop_endif + + invalidate_icache_by_line x0, x1, x2, x3 + ret +SYM_FUNC_END(__pi_icache_inval_pou) +SYM_FUNC_ALIAS(icache_inval_pou, __pi_icache_inval_pou) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 3cea4b6ac23e..728e01d4536b 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -15,17 +15,93 @@ #include <nvhe/mem_protect.h> #include <nvhe/mm.h> +#include <nvhe/pkvm.h> #include <nvhe/trap_handler.h> DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); +static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt; + + hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state); + hyp_vcpu->vcpu.arch.sve_max_vl = host_vcpu->arch.sve_max_vl; + + hyp_vcpu->vcpu.arch.hw_mmu = host_vcpu->arch.hw_mmu; + + hyp_vcpu->vcpu.arch.hcr_el2 = host_vcpu->arch.hcr_el2; + hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; + hyp_vcpu->vcpu.arch.cptr_el2 = host_vcpu->arch.cptr_el2; + + hyp_vcpu->vcpu.arch.iflags = host_vcpu->arch.iflags; + hyp_vcpu->vcpu.arch.fp_state = host_vcpu->arch.fp_state; + + hyp_vcpu->vcpu.arch.debug_ptr = kern_hyp_va(host_vcpu->arch.debug_ptr); + hyp_vcpu->vcpu.arch.host_fpsimd_state = host_vcpu->arch.host_fpsimd_state; + + hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2; + + hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3 = host_vcpu->arch.vgic_cpu.vgic_v3; +} + +static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + struct vgic_v3_cpu_if *hyp_cpu_if = &hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3; + struct vgic_v3_cpu_if *host_cpu_if = &host_vcpu->arch.vgic_cpu.vgic_v3; + unsigned int i; + + host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt; + + host_vcpu->arch.hcr_el2 = hyp_vcpu->vcpu.arch.hcr_el2; + host_vcpu->arch.cptr_el2 = hyp_vcpu->vcpu.arch.cptr_el2; + + host_vcpu->arch.fault = hyp_vcpu->vcpu.arch.fault; + + host_vcpu->arch.iflags = hyp_vcpu->vcpu.arch.iflags; + host_vcpu->arch.fp_state = hyp_vcpu->vcpu.arch.fp_state; + + host_cpu_if->vgic_hcr = hyp_cpu_if->vgic_hcr; + for (i = 0; i < hyp_cpu_if->used_lrs; ++i) + host_cpu_if->vgic_lr[i] = hyp_cpu_if->vgic_lr[i]; +} + static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) { - DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); + DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 1); + int ret; - cpu_reg(host_ctxt, 1) = __kvm_vcpu_run(kern_hyp_va(vcpu)); + host_vcpu = kern_hyp_va(host_vcpu); + + if (unlikely(is_protected_kvm_enabled())) { + struct pkvm_hyp_vcpu *hyp_vcpu; + struct kvm *host_kvm; + + host_kvm = kern_hyp_va(host_vcpu->kvm); + hyp_vcpu = pkvm_load_hyp_vcpu(host_kvm->arch.pkvm.handle, + host_vcpu->vcpu_idx); + if (!hyp_vcpu) { + ret = -EINVAL; + goto out; + } + + flush_hyp_vcpu(hyp_vcpu); + + ret = __kvm_vcpu_run(&hyp_vcpu->vcpu); + + sync_hyp_vcpu(hyp_vcpu); + pkvm_put_hyp_vcpu(hyp_vcpu); + } else { + /* The host is fully trusted, run its vCPU directly. */ + ret = __kvm_vcpu_run(host_vcpu); + } + +out: + cpu_reg(host_ctxt, 1) = ret; } static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt) @@ -191,6 +267,33 @@ static void handle___pkvm_vcpu_init_traps(struct kvm_cpu_context *host_ctxt) __pkvm_vcpu_init_traps(kern_hyp_va(vcpu)); } +static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1); + DECLARE_REG(unsigned long, vm_hva, host_ctxt, 2); + DECLARE_REG(unsigned long, pgd_hva, host_ctxt, 3); + + host_kvm = kern_hyp_va(host_kvm); + cpu_reg(host_ctxt, 1) = __pkvm_init_vm(host_kvm, vm_hva, pgd_hva); +} + +static void handle___pkvm_init_vcpu(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 2); + DECLARE_REG(unsigned long, vcpu_hva, host_ctxt, 3); + + host_vcpu = kern_hyp_va(host_vcpu); + cpu_reg(host_ctxt, 1) = __pkvm_init_vcpu(handle, host_vcpu, vcpu_hva); +} + +static void handle___pkvm_teardown_vm(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __pkvm_teardown_vm(handle); +} + typedef void (*hcall_t)(struct kvm_cpu_context *); #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x @@ -220,6 +323,9 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__vgic_v3_save_aprs), HANDLE_FUNC(__vgic_v3_restore_aprs), HANDLE_FUNC(__pkvm_vcpu_init_traps), + HANDLE_FUNC(__pkvm_init_vm), + HANDLE_FUNC(__pkvm_init_vcpu), + HANDLE_FUNC(__pkvm_teardown_vm), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c index 9f54833af400..04d194583f1e 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c @@ -23,6 +23,8 @@ u64 cpu_logical_map(unsigned int cpu) return hyp_cpu_logical_map[cpu]; } +unsigned long __ro_after_init kvm_arm_hyp_percpu_base[NR_CPUS]; + unsigned long __hyp_per_cpu_offset(unsigned int cpu) { unsigned long *cpu_base_array; diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 07f9dc9848ef..552653fa18be 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -21,21 +21,33 @@ #define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_NOFWB | KVM_PGTABLE_S2_IDMAP) -extern unsigned long hyp_nr_cpus; -struct host_kvm host_kvm; +struct host_mmu host_mmu; static struct hyp_pool host_s2_pool; -const u8 pkvm_hyp_id = 1; +static DEFINE_PER_CPU(struct pkvm_hyp_vm *, __current_vm); +#define current_vm (*this_cpu_ptr(&__current_vm)) + +static void guest_lock_component(struct pkvm_hyp_vm *vm) +{ + hyp_spin_lock(&vm->lock); + current_vm = vm; +} + +static void guest_unlock_component(struct pkvm_hyp_vm *vm) +{ + current_vm = NULL; + hyp_spin_unlock(&vm->lock); +} static void host_lock_component(void) { - hyp_spin_lock(&host_kvm.lock); + hyp_spin_lock(&host_mmu.lock); } static void host_unlock_component(void) { - hyp_spin_unlock(&host_kvm.lock); + hyp_spin_unlock(&host_mmu.lock); } static void hyp_lock_component(void) @@ -79,6 +91,11 @@ static void host_s2_put_page(void *addr) hyp_put_page(&host_s2_pool, addr); } +static void host_s2_free_removed_table(void *addr, u32 level) +{ + kvm_pgtable_stage2_free_removed(&host_mmu.mm_ops, addr, level); +} + static int prepare_s2_pool(void *pgt_pool_base) { unsigned long nr_pages, pfn; @@ -90,9 +107,10 @@ static int prepare_s2_pool(void *pgt_pool_base) if (ret) return ret; - host_kvm.mm_ops = (struct kvm_pgtable_mm_ops) { + host_mmu.mm_ops = (struct kvm_pgtable_mm_ops) { .zalloc_pages_exact = host_s2_zalloc_pages_exact, .zalloc_page = host_s2_zalloc_page, + .free_removed_table = host_s2_free_removed_table, .phys_to_virt = hyp_phys_to_virt, .virt_to_phys = hyp_virt_to_phys, .page_count = hyp_page_count, @@ -111,7 +129,7 @@ static void prepare_host_vtcr(void) parange = kvm_get_parange(id_aa64mmfr0_el1_sys_val); phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange); - host_kvm.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val, + host_mmu.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val, id_aa64mmfr1_el1_sys_val, phys_shift); } @@ -119,45 +137,170 @@ static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot pr int kvm_host_prepare_stage2(void *pgt_pool_base) { - struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu; + struct kvm_s2_mmu *mmu = &host_mmu.arch.mmu; int ret; prepare_host_vtcr(); - hyp_spin_lock_init(&host_kvm.lock); - mmu->arch = &host_kvm.arch; + hyp_spin_lock_init(&host_mmu.lock); + mmu->arch = &host_mmu.arch; ret = prepare_s2_pool(pgt_pool_base); if (ret) return ret; - ret = __kvm_pgtable_stage2_init(&host_kvm.pgt, mmu, - &host_kvm.mm_ops, KVM_HOST_S2_FLAGS, + ret = __kvm_pgtable_stage2_init(&host_mmu.pgt, mmu, + &host_mmu.mm_ops, KVM_HOST_S2_FLAGS, host_stage2_force_pte_cb); if (ret) return ret; - mmu->pgd_phys = __hyp_pa(host_kvm.pgt.pgd); - mmu->pgt = &host_kvm.pgt; + mmu->pgd_phys = __hyp_pa(host_mmu.pgt.pgd); + mmu->pgt = &host_mmu.pgt; atomic64_set(&mmu->vmid.id, 0); return 0; } +static bool guest_stage2_force_pte_cb(u64 addr, u64 end, + enum kvm_pgtable_prot prot) +{ + return true; +} + +static void *guest_s2_zalloc_pages_exact(size_t size) +{ + void *addr = hyp_alloc_pages(¤t_vm->pool, get_order(size)); + + WARN_ON(size != (PAGE_SIZE << get_order(size))); + hyp_split_page(hyp_virt_to_page(addr)); + + return addr; +} + +static void guest_s2_free_pages_exact(void *addr, unsigned long size) +{ + u8 order = get_order(size); + unsigned int i; + + for (i = 0; i < (1 << order); i++) + hyp_put_page(¤t_vm->pool, addr + (i * PAGE_SIZE)); +} + +static void *guest_s2_zalloc_page(void *mc) +{ + struct hyp_page *p; + void *addr; + + addr = hyp_alloc_pages(¤t_vm->pool, 0); + if (addr) + return addr; + + addr = pop_hyp_memcache(mc, hyp_phys_to_virt); + if (!addr) + return addr; + + memset(addr, 0, PAGE_SIZE); + p = hyp_virt_to_page(addr); + memset(p, 0, sizeof(*p)); + p->refcount = 1; + + return addr; +} + +static void guest_s2_get_page(void *addr) +{ + hyp_get_page(¤t_vm->pool, addr); +} + +static void guest_s2_put_page(void *addr) +{ + hyp_put_page(¤t_vm->pool, addr); +} + +static void clean_dcache_guest_page(void *va, size_t size) +{ + __clean_dcache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size); + hyp_fixmap_unmap(); +} + +static void invalidate_icache_guest_page(void *va, size_t size) +{ + __invalidate_icache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size); + hyp_fixmap_unmap(); +} + +int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) +{ + struct kvm_s2_mmu *mmu = &vm->kvm.arch.mmu; + unsigned long nr_pages; + int ret; + + nr_pages = kvm_pgtable_stage2_pgd_size(vm->kvm.arch.vtcr) >> PAGE_SHIFT; + ret = hyp_pool_init(&vm->pool, hyp_virt_to_pfn(pgd), nr_pages, 0); + if (ret) + return ret; + + hyp_spin_lock_init(&vm->lock); + vm->mm_ops = (struct kvm_pgtable_mm_ops) { + .zalloc_pages_exact = guest_s2_zalloc_pages_exact, + .free_pages_exact = guest_s2_free_pages_exact, + .zalloc_page = guest_s2_zalloc_page, + .phys_to_virt = hyp_phys_to_virt, + .virt_to_phys = hyp_virt_to_phys, + .page_count = hyp_page_count, + .get_page = guest_s2_get_page, + .put_page = guest_s2_put_page, + .dcache_clean_inval_poc = clean_dcache_guest_page, + .icache_inval_pou = invalidate_icache_guest_page, + }; + + guest_lock_component(vm); + ret = __kvm_pgtable_stage2_init(mmu->pgt, mmu, &vm->mm_ops, 0, + guest_stage2_force_pte_cb); + guest_unlock_component(vm); + if (ret) + return ret; + + vm->kvm.arch.mmu.pgd_phys = __hyp_pa(vm->pgt.pgd); + + return 0; +} + +void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) +{ + void *addr; + + /* Dump all pgtable pages in the hyp_pool */ + guest_lock_component(vm); + kvm_pgtable_stage2_destroy(&vm->pgt); + vm->kvm.arch.mmu.pgd_phys = 0ULL; + guest_unlock_component(vm); + + /* Drain the hyp_pool into the memcache */ + addr = hyp_alloc_pages(&vm->pool, 0); + while (addr) { + memset(hyp_virt_to_page(addr), 0, sizeof(struct hyp_page)); + push_hyp_memcache(mc, addr, hyp_virt_to_phys); + WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1)); + addr = hyp_alloc_pages(&vm->pool, 0); + } +} + int __pkvm_prot_finalize(void) { - struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu; + struct kvm_s2_mmu *mmu = &host_mmu.arch.mmu; struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); if (params->hcr_el2 & HCR_VM) return -EPERM; params->vttbr = kvm_get_vttbr(mmu); - params->vtcr = host_kvm.arch.vtcr; + params->vtcr = host_mmu.arch.vtcr; params->hcr_el2 |= HCR_VM; kvm_flush_dcache_to_poc(params, sizeof(*params)); write_sysreg(params->hcr_el2, hcr_el2); - __load_stage2(&host_kvm.arch.mmu, &host_kvm.arch); + __load_stage2(&host_mmu.arch.mmu, &host_mmu.arch); /* * Make sure to have an ISB before the TLB maintenance below but only @@ -175,7 +318,7 @@ int __pkvm_prot_finalize(void) static int host_stage2_unmap_dev_all(void) { - struct kvm_pgtable *pgt = &host_kvm.pgt; + struct kvm_pgtable *pgt = &host_mmu.pgt; struct memblock_region *reg; u64 addr = 0; int i, ret; @@ -195,7 +338,7 @@ struct kvm_mem_range { u64 end; }; -static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) +static struct memblock_region *find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) { int cur, left = 0, right = hyp_memblock_nr; struct memblock_region *reg; @@ -218,18 +361,28 @@ static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) } else { range->start = reg->base; range->end = end; - return true; + return reg; } } - return false; + return NULL; } bool addr_is_memory(phys_addr_t phys) { struct kvm_mem_range range; - return find_mem_range(phys, &range); + return !!find_mem_range(phys, &range); +} + +static bool addr_is_allowed_memory(phys_addr_t phys) +{ + struct memblock_region *reg; + struct kvm_mem_range range; + + reg = find_mem_range(phys, &range); + + return reg && !(reg->flags & MEMBLOCK_NOMAP); } static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range) @@ -250,8 +403,8 @@ static bool range_is_memory(u64 start, u64 end) static inline int __host_stage2_idmap(u64 start, u64 end, enum kvm_pgtable_prot prot) { - return kvm_pgtable_stage2_map(&host_kvm.pgt, start, end - start, start, - prot, &host_s2_pool); + return kvm_pgtable_stage2_map(&host_mmu.pgt, start, end - start, start, + prot, &host_s2_pool, 0); } /* @@ -263,7 +416,7 @@ static inline int __host_stage2_idmap(u64 start, u64 end, #define host_stage2_try(fn, ...) \ ({ \ int __ret; \ - hyp_assert_lock_held(&host_kvm.lock); \ + hyp_assert_lock_held(&host_mmu.lock); \ __ret = fn(__VA_ARGS__); \ if (__ret == -ENOMEM) { \ __ret = host_stage2_unmap_dev_all(); \ @@ -286,8 +439,8 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) u32 level; int ret; - hyp_assert_lock_held(&host_kvm.lock); - ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, &level); + hyp_assert_lock_held(&host_mmu.lock); + ret = kvm_pgtable_get_leaf(&host_mmu.pgt, addr, &pte, &level); if (ret) return ret; @@ -319,7 +472,7 @@ int host_stage2_idmap_locked(phys_addr_t addr, u64 size, int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) { - return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_kvm.pgt, + return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt, addr, size, &host_s2_pool, owner_id); } @@ -348,7 +501,7 @@ static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot pr static int host_stage2_idmap(u64 addr) { struct kvm_mem_range range; - bool is_memory = find_mem_range(addr, &range); + bool is_memory = !!find_mem_range(addr, &range); enum kvm_pgtable_prot prot; int ret; @@ -380,12 +533,6 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) BUG_ON(ret && ret != -EAGAIN); } -/* This corresponds to locking order */ -enum pkvm_component_id { - PKVM_ID_HOST, - PKVM_ID_HYP, -}; - struct pkvm_mem_transition { u64 nr_pages; @@ -399,6 +546,9 @@ struct pkvm_mem_transition { /* Address in the completer's address space */ u64 completer_addr; } host; + struct { + u64 completer_addr; + } hyp; }; } initiator; @@ -412,23 +562,24 @@ struct pkvm_mem_share { const enum kvm_pgtable_prot completer_prot; }; +struct pkvm_mem_donation { + const struct pkvm_mem_transition tx; +}; + struct check_walk_data { enum pkvm_page_state desired; enum pkvm_page_state (*get_page_state)(kvm_pte_t pte); }; -static int __check_page_state_visitor(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, - void * const arg) +static int __check_page_state_visitor(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { - struct check_walk_data *d = arg; - kvm_pte_t pte = *ptep; + struct check_walk_data *d = ctx->arg; - if (kvm_pte_valid(pte) && !addr_is_memory(kvm_pte_to_phys(pte))) + if (kvm_pte_valid(ctx->old) && !addr_is_allowed_memory(kvm_pte_to_phys(ctx->old))) return -EINVAL; - return d->get_page_state(pte) == d->desired ? 0 : -EPERM; + return d->get_page_state(ctx->old) == d->desired ? 0 : -EPERM; } static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size, @@ -459,8 +610,8 @@ static int __host_check_page_state_range(u64 addr, u64 size, .get_page_state = host_get_page_state, }; - hyp_assert_lock_held(&host_kvm.lock); - return check_page_state_range(&host_kvm.pgt, addr, size, &d); + hyp_assert_lock_held(&host_mmu.lock); + return check_page_state_range(&host_mmu.pgt, addr, size, &d); } static int __host_set_page_state_range(u64 addr, u64 size, @@ -511,6 +662,46 @@ static int host_initiate_unshare(u64 *completer_addr, return __host_set_page_state_range(addr, size, PKVM_PAGE_OWNED); } +static int host_initiate_donation(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + u8 owner_id = tx->completer.id; + u64 size = tx->nr_pages * PAGE_SIZE; + + *completer_addr = tx->initiator.host.completer_addr; + return host_stage2_set_owner_locked(tx->initiator.addr, size, owner_id); +} + +static bool __host_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx) +{ + return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) || + tx->initiator.id != PKVM_ID_HYP); +} + +static int __host_ack_transition(u64 addr, const struct pkvm_mem_transition *tx, + enum pkvm_page_state state) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + + if (__host_ack_skip_pgtable_check(tx)) + return 0; + + return __host_check_page_state_range(addr, size, state); +} + +static int host_ack_donation(u64 addr, const struct pkvm_mem_transition *tx) +{ + return __host_ack_transition(addr, tx, PKVM_NOPAGE); +} + +static int host_complete_donation(u64 addr, const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + u8 host_id = tx->completer.id; + + return host_stage2_set_owner_locked(addr, size, host_id); +} + static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte) { if (!kvm_pte_valid(pte)) @@ -531,6 +722,27 @@ static int __hyp_check_page_state_range(u64 addr, u64 size, return check_page_state_range(&pkvm_pgtable, addr, size, &d); } +static int hyp_request_donation(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + u64 addr = tx->initiator.addr; + + *completer_addr = tx->initiator.hyp.completer_addr; + return __hyp_check_page_state_range(addr, size, PKVM_PAGE_OWNED); +} + +static int hyp_initiate_donation(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + int ret; + + *completer_addr = tx->initiator.hyp.completer_addr; + ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, tx->initiator.addr, size); + return (ret != size) ? -EFAULT : 0; +} + static bool __hyp_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx) { return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) || @@ -555,6 +767,9 @@ static int hyp_ack_unshare(u64 addr, const struct pkvm_mem_transition *tx) { u64 size = tx->nr_pages * PAGE_SIZE; + if (tx->initiator.id == PKVM_ID_HOST && hyp_page_count((void *)addr)) + return -EBUSY; + if (__hyp_ack_skip_pgtable_check(tx)) return 0; @@ -562,6 +777,16 @@ static int hyp_ack_unshare(u64 addr, const struct pkvm_mem_transition *tx) PKVM_PAGE_SHARED_BORROWED); } +static int hyp_ack_donation(u64 addr, const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + + if (__hyp_ack_skip_pgtable_check(tx)) + return 0; + + return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE); +} + static int hyp_complete_share(u64 addr, const struct pkvm_mem_transition *tx, enum kvm_pgtable_prot perms) { @@ -580,6 +805,15 @@ static int hyp_complete_unshare(u64 addr, const struct pkvm_mem_transition *tx) return (ret != size) ? -EFAULT : 0; } +static int hyp_complete_donation(u64 addr, + const struct pkvm_mem_transition *tx) +{ + void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE); + enum kvm_pgtable_prot prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_OWNED); + + return pkvm_create_mappings_locked(start, end, prot); +} + static int check_share(struct pkvm_mem_share *share) { const struct pkvm_mem_transition *tx = &share->tx; @@ -732,6 +966,94 @@ static int do_unshare(struct pkvm_mem_share *share) return WARN_ON(__do_unshare(share)); } +static int check_donation(struct pkvm_mem_donation *donation) +{ + const struct pkvm_mem_transition *tx = &donation->tx; + u64 completer_addr; + int ret; + + switch (tx->initiator.id) { + case PKVM_ID_HOST: + ret = host_request_owned_transition(&completer_addr, tx); + break; + case PKVM_ID_HYP: + ret = hyp_request_donation(&completer_addr, tx); + break; + default: + ret = -EINVAL; + } + + if (ret) + return ret; + + switch (tx->completer.id) { + case PKVM_ID_HOST: + ret = host_ack_donation(completer_addr, tx); + break; + case PKVM_ID_HYP: + ret = hyp_ack_donation(completer_addr, tx); + break; + default: + ret = -EINVAL; + } + + return ret; +} + +static int __do_donate(struct pkvm_mem_donation *donation) +{ + const struct pkvm_mem_transition *tx = &donation->tx; + u64 completer_addr; + int ret; + + switch (tx->initiator.id) { + case PKVM_ID_HOST: + ret = host_initiate_donation(&completer_addr, tx); + break; + case PKVM_ID_HYP: + ret = hyp_initiate_donation(&completer_addr, tx); + break; + default: + ret = -EINVAL; + } + + if (ret) + return ret; + + switch (tx->completer.id) { + case PKVM_ID_HOST: + ret = host_complete_donation(completer_addr, tx); + break; + case PKVM_ID_HYP: + ret = hyp_complete_donation(completer_addr, tx); + break; + default: + ret = -EINVAL; + } + + return ret; +} + +/* + * do_donate(): + * + * The page owner transfers ownership to another component, losing access + * as a consequence. + * + * Initiator: OWNED => NOPAGE + * Completer: NOPAGE => OWNED + */ +static int do_donate(struct pkvm_mem_donation *donation) +{ + int ret; + + ret = check_donation(donation); + if (ret) + return ret; + + return WARN_ON(__do_donate(donation)); +} + int __pkvm_host_share_hyp(u64 pfn) { int ret; @@ -797,3 +1119,112 @@ int __pkvm_host_unshare_hyp(u64 pfn) return ret; } + +int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages) +{ + int ret; + u64 host_addr = hyp_pfn_to_phys(pfn); + u64 hyp_addr = (u64)__hyp_va(host_addr); + struct pkvm_mem_donation donation = { + .tx = { + .nr_pages = nr_pages, + .initiator = { + .id = PKVM_ID_HOST, + .addr = host_addr, + .host = { + .completer_addr = hyp_addr, + }, + }, + .completer = { + .id = PKVM_ID_HYP, + }, + }, + }; + + host_lock_component(); + hyp_lock_component(); + + ret = do_donate(&donation); + + hyp_unlock_component(); + host_unlock_component(); + + return ret; +} + +int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) +{ + int ret; + u64 host_addr = hyp_pfn_to_phys(pfn); + u64 hyp_addr = (u64)__hyp_va(host_addr); + struct pkvm_mem_donation donation = { + .tx = { + .nr_pages = nr_pages, + .initiator = { + .id = PKVM_ID_HYP, + .addr = hyp_addr, + .hyp = { + .completer_addr = host_addr, + }, + }, + .completer = { + .id = PKVM_ID_HOST, + }, + }, + }; + + host_lock_component(); + hyp_lock_component(); + + ret = do_donate(&donation); + + hyp_unlock_component(); + host_unlock_component(); + + return ret; +} + +int hyp_pin_shared_mem(void *from, void *to) +{ + u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); + u64 end = PAGE_ALIGN((u64)to); + u64 size = end - start; + int ret; + + host_lock_component(); + hyp_lock_component(); + + ret = __host_check_page_state_range(__hyp_pa(start), size, + PKVM_PAGE_SHARED_OWNED); + if (ret) + goto unlock; + + ret = __hyp_check_page_state_range(start, size, + PKVM_PAGE_SHARED_BORROWED); + if (ret) + goto unlock; + + for (cur = start; cur < end; cur += PAGE_SIZE) + hyp_page_ref_inc(hyp_virt_to_page(cur)); + +unlock: + hyp_unlock_component(); + host_unlock_component(); + + return ret; +} + +void hyp_unpin_shared_mem(void *from, void *to) +{ + u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); + u64 end = PAGE_ALIGN((u64)to); + + host_lock_component(); + hyp_lock_component(); + + for (cur = start; cur < end; cur += PAGE_SIZE) + hyp_page_ref_dec(hyp_virt_to_page(cur)); + + hyp_unlock_component(); + host_unlock_component(); +} diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index 96193cb31a39..318298eb3d6b 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -14,6 +14,7 @@ #include <nvhe/early_alloc.h> #include <nvhe/gfp.h> #include <nvhe/memory.h> +#include <nvhe/mem_protect.h> #include <nvhe/mm.h> #include <nvhe/spinlock.h> @@ -25,6 +26,12 @@ unsigned int hyp_memblock_nr; static u64 __io_map_base; +struct hyp_fixmap_slot { + u64 addr; + kvm_pte_t *ptep; +}; +static DEFINE_PER_CPU(struct hyp_fixmap_slot, fixmap_slots); + static int __pkvm_create_mappings(unsigned long start, unsigned long size, unsigned long phys, enum kvm_pgtable_prot prot) { @@ -129,13 +136,36 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) return ret; } -int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back) +int hyp_back_vmemmap(phys_addr_t back) { - unsigned long start, end; + unsigned long i, start, size, end = 0; + int ret; - hyp_vmemmap_range(phys, size, &start, &end); + for (i = 0; i < hyp_memblock_nr; i++) { + start = hyp_memory[i].base; + start = ALIGN_DOWN((u64)hyp_phys_to_page(start), PAGE_SIZE); + /* + * The begining of the hyp_vmemmap region for the current + * memblock may already be backed by the page backing the end + * the previous region, so avoid mapping it twice. + */ + start = max(start, end); + + end = hyp_memory[i].base + hyp_memory[i].size; + end = PAGE_ALIGN((u64)hyp_phys_to_page(end)); + if (start >= end) + continue; + + size = end - start; + ret = __pkvm_create_mappings(start, size, back, PAGE_HYP); + if (ret) + return ret; + + memset(hyp_phys_to_virt(back), 0, size); + back += size; + } - return __pkvm_create_mappings(start, end - start, back, PAGE_HYP); + return 0; } static void *__hyp_bp_vect_base; @@ -189,6 +219,102 @@ int hyp_map_vectors(void) return 0; } +void *hyp_fixmap_map(phys_addr_t phys) +{ + struct hyp_fixmap_slot *slot = this_cpu_ptr(&fixmap_slots); + kvm_pte_t pte, *ptep = slot->ptep; + + pte = *ptep; + pte &= ~kvm_phys_to_pte(KVM_PHYS_INVALID); + pte |= kvm_phys_to_pte(phys) | KVM_PTE_VALID; + WRITE_ONCE(*ptep, pte); + dsb(ishst); + + return (void *)slot->addr; +} + +static void fixmap_clear_slot(struct hyp_fixmap_slot *slot) +{ + kvm_pte_t *ptep = slot->ptep; + u64 addr = slot->addr; + + WRITE_ONCE(*ptep, *ptep & ~KVM_PTE_VALID); + + /* + * Irritatingly, the architecture requires that we use inner-shareable + * broadcast TLB invalidation here in case another CPU speculates + * through our fixmap and decides to create an "amalagamation of the + * values held in the TLB" due to the apparent lack of a + * break-before-make sequence. + * + * https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03 + */ + dsb(ishst); + __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), (KVM_PGTABLE_MAX_LEVELS - 1)); + dsb(ish); + isb(); +} + +void hyp_fixmap_unmap(void) +{ + fixmap_clear_slot(this_cpu_ptr(&fixmap_slots)); +} + +static int __create_fixmap_slot_cb(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + struct hyp_fixmap_slot *slot = per_cpu_ptr(&fixmap_slots, (u64)ctx->arg); + + if (!kvm_pte_valid(ctx->old) || ctx->level != KVM_PGTABLE_MAX_LEVELS - 1) + return -EINVAL; + + slot->addr = ctx->addr; + slot->ptep = ctx->ptep; + + /* + * Clear the PTE, but keep the page-table page refcount elevated to + * prevent it from ever being freed. This lets us manipulate the PTEs + * by hand safely without ever needing to allocate memory. + */ + fixmap_clear_slot(slot); + + return 0; +} + +static int create_fixmap_slot(u64 addr, u64 cpu) +{ + struct kvm_pgtable_walker walker = { + .cb = __create_fixmap_slot_cb, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = (void *)cpu, + }; + + return kvm_pgtable_walk(&pkvm_pgtable, addr, PAGE_SIZE, &walker); +} + +int hyp_create_pcpu_fixmap(void) +{ + unsigned long addr, i; + int ret; + + for (i = 0; i < hyp_nr_cpus; i++) { + ret = pkvm_alloc_private_va_range(PAGE_SIZE, &addr); + if (ret) + return ret; + + ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, PAGE_SIZE, + __hyp_pa(__hyp_bss_start), PAGE_HYP); + if (ret) + return ret; + + ret = create_fixmap_slot(addr, i); + if (ret) + return ret; + } + + return 0; +} + int hyp_create_idmap(u32 hyp_va_bits) { unsigned long start, end; @@ -213,3 +339,36 @@ int hyp_create_idmap(u32 hyp_va_bits) return __pkvm_create_mappings(start, end - start, start, PAGE_HYP_EXEC); } + +static void *admit_host_page(void *arg) +{ + struct kvm_hyp_memcache *host_mc = arg; + + if (!host_mc->nr_pages) + return NULL; + + /* + * The host still owns the pages in its memcache, so we need to go + * through a full host-to-hyp donation cycle to change it. Fortunately, + * __pkvm_host_donate_hyp() takes care of races for us, so if it + * succeeds we're good to go. + */ + if (__pkvm_host_donate_hyp(hyp_phys_to_pfn(host_mc->head), 1)) + return NULL; + + return pop_hyp_memcache(host_mc, hyp_phys_to_virt); +} + +/* Refill our local memcache by poping pages from the one provided by the host. */ +int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, + struct kvm_hyp_memcache *host_mc) +{ + struct kvm_hyp_memcache tmp = *host_mc; + int ret; + + ret = __topup_hyp_memcache(mc, min_pages, admit_host_page, + hyp_virt_to_phys, &tmp); + *host_mc = tmp; + + return ret; +} diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c index d40f0b30b534..803ba3222e75 100644 --- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -93,11 +93,16 @@ static inline struct hyp_page *node_to_page(struct list_head *node) static void __hyp_attach_page(struct hyp_pool *pool, struct hyp_page *p) { + phys_addr_t phys = hyp_page_to_phys(p); unsigned short order = p->order; struct hyp_page *buddy; memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order); + /* Skip coalescing for 'external' pages being freed into the pool. */ + if (phys < pool->range_start || phys >= pool->range_end) + goto insert; + /* * Only the first struct hyp_page of a high-order page (otherwise known * as the 'head') should have p->order set. The non-head pages should @@ -116,6 +121,7 @@ static void __hyp_attach_page(struct hyp_pool *pool, p = min(p, buddy); } +insert: /* Mark the new head, and insert it */ p->order = order; page_add_to_list(p, &pool->free_area[order]); @@ -144,25 +150,6 @@ static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool, return p; } -static inline void hyp_page_ref_inc(struct hyp_page *p) -{ - BUG_ON(p->refcount == USHRT_MAX); - p->refcount++; -} - -static inline int hyp_page_ref_dec_and_test(struct hyp_page *p) -{ - BUG_ON(!p->refcount); - p->refcount--; - return (p->refcount == 0); -} - -static inline void hyp_set_page_refcounted(struct hyp_page *p) -{ - BUG_ON(p->refcount); - p->refcount = 1; -} - static void __hyp_put_page(struct hyp_pool *pool, struct hyp_page *p) { if (hyp_page_ref_dec_and_test(p)) @@ -249,10 +236,8 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, /* Init the vmemmap portion */ p = hyp_phys_to_page(phys); - for (i = 0; i < nr_pages; i++) { - p[i].order = 0; + for (i = 0; i < nr_pages; i++) hyp_set_page_refcounted(&p[i]); - } /* Attach the unused pages to the buddy tree */ for (i = reserved_pages; i < nr_pages; i++) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 85d3b7ae720f..a06ece14a6d8 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -7,8 +7,17 @@ #include <linux/kvm_host.h> #include <linux/mm.h> #include <nvhe/fixed_config.h> +#include <nvhe/mem_protect.h> +#include <nvhe/memory.h> +#include <nvhe/pkvm.h> #include <nvhe/trap_handler.h> +/* Used by icache_is_vpipt(). */ +unsigned long __icache_flags; + +/* Used by kvm_get_vttbr(). */ +unsigned int kvm_arm_vmid_bits; + /* * Set trap register values based on features in ID_AA64PFR0. */ @@ -183,3 +192,430 @@ void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) pvm_init_traps_aa64mmfr0(vcpu); pvm_init_traps_aa64mmfr1(vcpu); } + +/* + * Start the VM table handle at the offset defined instead of at 0. + * Mainly for sanity checking and debugging. + */ +#define HANDLE_OFFSET 0x1000 + +static unsigned int vm_handle_to_idx(pkvm_handle_t handle) +{ + return handle - HANDLE_OFFSET; +} + +static pkvm_handle_t idx_to_vm_handle(unsigned int idx) +{ + return idx + HANDLE_OFFSET; +} + +/* + * Spinlock for protecting state related to the VM table. Protects writes + * to 'vm_table' and 'nr_table_entries' as well as reads and writes to + * 'last_hyp_vcpu_lookup'. + */ +static DEFINE_HYP_SPINLOCK(vm_table_lock); + +/* + * The table of VM entries for protected VMs in hyp. + * Allocated at hyp initialization and setup. + */ +static struct pkvm_hyp_vm **vm_table; + +void pkvm_hyp_vm_table_init(void *tbl) +{ + WARN_ON(vm_table); + vm_table = tbl; +} + +/* + * Return the hyp vm structure corresponding to the handle. + */ +static struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle) +{ + unsigned int idx = vm_handle_to_idx(handle); + + if (unlikely(idx >= KVM_MAX_PVMS)) + return NULL; + + return vm_table[idx]; +} + +struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, + unsigned int vcpu_idx) +{ + struct pkvm_hyp_vcpu *hyp_vcpu = NULL; + struct pkvm_hyp_vm *hyp_vm; + + hyp_spin_lock(&vm_table_lock); + hyp_vm = get_vm_by_handle(handle); + if (!hyp_vm || hyp_vm->nr_vcpus <= vcpu_idx) + goto unlock; + + hyp_vcpu = hyp_vm->vcpus[vcpu_idx]; + hyp_page_ref_inc(hyp_virt_to_page(hyp_vm)); +unlock: + hyp_spin_unlock(&vm_table_lock); + return hyp_vcpu; +} + +void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + + hyp_spin_lock(&vm_table_lock); + hyp_page_ref_dec(hyp_virt_to_page(hyp_vm)); + hyp_spin_unlock(&vm_table_lock); +} + +static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) +{ + if (host_vcpu) + hyp_unpin_shared_mem(host_vcpu, host_vcpu + 1); +} + +static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[], + unsigned int nr_vcpus) +{ + int i; + + for (i = 0; i < nr_vcpus; i++) + unpin_host_vcpu(hyp_vcpus[i]->host_vcpu); +} + +static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, + unsigned int nr_vcpus) +{ + hyp_vm->host_kvm = host_kvm; + hyp_vm->kvm.created_vcpus = nr_vcpus; + hyp_vm->kvm.arch.vtcr = host_mmu.arch.vtcr; +} + +static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, + struct pkvm_hyp_vm *hyp_vm, + struct kvm_vcpu *host_vcpu, + unsigned int vcpu_idx) +{ + int ret = 0; + + if (hyp_pin_shared_mem(host_vcpu, host_vcpu + 1)) + return -EBUSY; + + if (host_vcpu->vcpu_idx != vcpu_idx) { + ret = -EINVAL; + goto done; + } + + hyp_vcpu->host_vcpu = host_vcpu; + + hyp_vcpu->vcpu.kvm = &hyp_vm->kvm; + hyp_vcpu->vcpu.vcpu_id = READ_ONCE(host_vcpu->vcpu_id); + hyp_vcpu->vcpu.vcpu_idx = vcpu_idx; + + hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu; + hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); +done: + if (ret) + unpin_host_vcpu(host_vcpu); + return ret; +} + +static int find_free_vm_table_entry(struct kvm *host_kvm) +{ + int i; + + for (i = 0; i < KVM_MAX_PVMS; ++i) { + if (!vm_table[i]) + return i; + } + + return -ENOMEM; +} + +/* + * Allocate a VM table entry and insert a pointer to the new vm. + * + * Return a unique handle to the protected VM on success, + * negative error code on failure. + */ +static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm, + struct pkvm_hyp_vm *hyp_vm) +{ + struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu; + int idx; + + hyp_assert_lock_held(&vm_table_lock); + + /* + * Initializing protected state might have failed, yet a malicious + * host could trigger this function. Thus, ensure that 'vm_table' + * exists. + */ + if (unlikely(!vm_table)) + return -EINVAL; + + idx = find_free_vm_table_entry(host_kvm); + if (idx < 0) + return idx; + + hyp_vm->kvm.arch.pkvm.handle = idx_to_vm_handle(idx); + + /* VMID 0 is reserved for the host */ + atomic64_set(&mmu->vmid.id, idx + 1); + + mmu->arch = &hyp_vm->kvm.arch; + mmu->pgt = &hyp_vm->pgt; + + vm_table[idx] = hyp_vm; + return hyp_vm->kvm.arch.pkvm.handle; +} + +/* + * Deallocate and remove the VM table entry corresponding to the handle. + */ +static void remove_vm_table_entry(pkvm_handle_t handle) +{ + hyp_assert_lock_held(&vm_table_lock); + vm_table[vm_handle_to_idx(handle)] = NULL; +} + +static size_t pkvm_get_hyp_vm_size(unsigned int nr_vcpus) +{ + return size_add(sizeof(struct pkvm_hyp_vm), + size_mul(sizeof(struct pkvm_hyp_vcpu *), nr_vcpus)); +} + +static void *map_donated_memory_noclear(unsigned long host_va, size_t size) +{ + void *va = (void *)kern_hyp_va(host_va); + + if (!PAGE_ALIGNED(va)) + return NULL; + + if (__pkvm_host_donate_hyp(hyp_virt_to_pfn(va), + PAGE_ALIGN(size) >> PAGE_SHIFT)) + return NULL; + + return va; +} + +static void *map_donated_memory(unsigned long host_va, size_t size) +{ + void *va = map_donated_memory_noclear(host_va, size); + + if (va) + memset(va, 0, size); + + return va; +} + +static void __unmap_donated_memory(void *va, size_t size) +{ + WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(va), + PAGE_ALIGN(size) >> PAGE_SHIFT)); +} + +static void unmap_donated_memory(void *va, size_t size) +{ + if (!va) + return; + + memset(va, 0, size); + __unmap_donated_memory(va, size); +} + +static void unmap_donated_memory_noclear(void *va, size_t size) +{ + if (!va) + return; + + __unmap_donated_memory(va, size); +} + +/* + * Initialize the hypervisor copy of the protected VM state using the + * memory donated by the host. + * + * Unmaps the donated memory from the host at stage 2. + * + * host_kvm: A pointer to the host's struct kvm. + * vm_hva: The host va of the area being donated for the VM state. + * Must be page aligned. + * pgd_hva: The host va of the area being donated for the stage-2 PGD for + * the VM. Must be page aligned. Its size is implied by the VM's + * VTCR. + * + * Return a unique handle to the protected VM on success, + * negative error code on failure. + */ +int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, + unsigned long pgd_hva) +{ + struct pkvm_hyp_vm *hyp_vm = NULL; + size_t vm_size, pgd_size; + unsigned int nr_vcpus; + void *pgd = NULL; + int ret; + + ret = hyp_pin_shared_mem(host_kvm, host_kvm + 1); + if (ret) + return ret; + + nr_vcpus = READ_ONCE(host_kvm->created_vcpus); + if (nr_vcpus < 1) { + ret = -EINVAL; + goto err_unpin_kvm; + } + + vm_size = pkvm_get_hyp_vm_size(nr_vcpus); + pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.vtcr); + + ret = -ENOMEM; + + hyp_vm = map_donated_memory(vm_hva, vm_size); + if (!hyp_vm) + goto err_remove_mappings; + + pgd = map_donated_memory_noclear(pgd_hva, pgd_size); + if (!pgd) + goto err_remove_mappings; + + init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus); + + hyp_spin_lock(&vm_table_lock); + ret = insert_vm_table_entry(host_kvm, hyp_vm); + if (ret < 0) + goto err_unlock; + + ret = kvm_guest_prepare_stage2(hyp_vm, pgd); + if (ret) + goto err_remove_vm_table_entry; + hyp_spin_unlock(&vm_table_lock); + + return hyp_vm->kvm.arch.pkvm.handle; + +err_remove_vm_table_entry: + remove_vm_table_entry(hyp_vm->kvm.arch.pkvm.handle); +err_unlock: + hyp_spin_unlock(&vm_table_lock); +err_remove_mappings: + unmap_donated_memory(hyp_vm, vm_size); + unmap_donated_memory(pgd, pgd_size); +err_unpin_kvm: + hyp_unpin_shared_mem(host_kvm, host_kvm + 1); + return ret; +} + +/* + * Initialize the hypervisor copy of the protected vCPU state using the + * memory donated by the host. + * + * handle: The handle for the protected vm. + * host_vcpu: A pointer to the corresponding host vcpu. + * vcpu_hva: The host va of the area being donated for the vcpu state. + * Must be page aligned. The size of the area must be equal to + * the page-aligned size of 'struct pkvm_hyp_vcpu'. + * Return 0 on success, negative error code on failure. + */ +int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, + unsigned long vcpu_hva) +{ + struct pkvm_hyp_vcpu *hyp_vcpu; + struct pkvm_hyp_vm *hyp_vm; + unsigned int idx; + int ret; + + hyp_vcpu = map_donated_memory(vcpu_hva, sizeof(*hyp_vcpu)); + if (!hyp_vcpu) + return -ENOMEM; + + hyp_spin_lock(&vm_table_lock); + + hyp_vm = get_vm_by_handle(handle); + if (!hyp_vm) { + ret = -ENOENT; + goto unlock; + } + + idx = hyp_vm->nr_vcpus; + if (idx >= hyp_vm->kvm.created_vcpus) { + ret = -EINVAL; + goto unlock; + } + + ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu, idx); + if (ret) + goto unlock; + + hyp_vm->vcpus[idx] = hyp_vcpu; + hyp_vm->nr_vcpus++; +unlock: + hyp_spin_unlock(&vm_table_lock); + + if (ret) + unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu)); + + return ret; +} + +static void +teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size) +{ + size = PAGE_ALIGN(size); + memset(addr, 0, size); + + for (void *start = addr; start < addr + size; start += PAGE_SIZE) + push_hyp_memcache(mc, start, hyp_virt_to_phys); + + unmap_donated_memory_noclear(addr, size); +} + +int __pkvm_teardown_vm(pkvm_handle_t handle) +{ + struct kvm_hyp_memcache *mc; + struct pkvm_hyp_vm *hyp_vm; + struct kvm *host_kvm; + unsigned int idx; + size_t vm_size; + int err; + + hyp_spin_lock(&vm_table_lock); + hyp_vm = get_vm_by_handle(handle); + if (!hyp_vm) { + err = -ENOENT; + goto err_unlock; + } + + if (WARN_ON(hyp_page_count(hyp_vm))) { + err = -EBUSY; + goto err_unlock; + } + + host_kvm = hyp_vm->host_kvm; + + /* Ensure the VMID is clean before it can be reallocated */ + __kvm_tlb_flush_vmid(&hyp_vm->kvm.arch.mmu); + remove_vm_table_entry(handle); + hyp_spin_unlock(&vm_table_lock); + + /* Reclaim guest pages (including page-table pages) */ + mc = &host_kvm->arch.pkvm.teardown_mc; + reclaim_guest_pages(hyp_vm, mc); + unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus); + + /* Push the metadata pages to the teardown memcache */ + for (idx = 0; idx < hyp_vm->nr_vcpus; ++idx) { + struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx]; + + teardown_donated_memory(mc, hyp_vcpu, sizeof(*hyp_vcpu)); + } + + vm_size = pkvm_get_hyp_vm_size(hyp_vm->kvm.created_vcpus); + teardown_donated_memory(mc, hyp_vm, vm_size); + hyp_unpin_shared_mem(host_kvm, host_kvm + 1); + return 0; + +err_unlock: + hyp_spin_unlock(&vm_table_lock); + return err; +} diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index e8d4ea2fcfa0..110f04627785 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -16,6 +16,7 @@ #include <nvhe/memory.h> #include <nvhe/mem_protect.h> #include <nvhe/mm.h> +#include <nvhe/pkvm.h> #include <nvhe/trap_handler.h> unsigned long hyp_nr_cpus; @@ -24,6 +25,7 @@ unsigned long hyp_nr_cpus; (unsigned long)__per_cpu_start) static void *vmemmap_base; +static void *vm_table_base; static void *hyp_pgt_base; static void *host_s2_pgt_base; static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; @@ -31,16 +33,20 @@ static struct hyp_pool hpool; static int divide_memory_pool(void *virt, unsigned long size) { - unsigned long vstart, vend, nr_pages; + unsigned long nr_pages; hyp_early_alloc_init(virt, size); - hyp_vmemmap_range(__hyp_pa(virt), size, &vstart, &vend); - nr_pages = (vend - vstart) >> PAGE_SHIFT; + nr_pages = hyp_vmemmap_pages(sizeof(struct hyp_page)); vmemmap_base = hyp_early_alloc_contig(nr_pages); if (!vmemmap_base) return -ENOMEM; + nr_pages = hyp_vm_table_pages(); + vm_table_base = hyp_early_alloc_contig(nr_pages); + if (!vm_table_base) + return -ENOMEM; + nr_pages = hyp_s1_pgtable_pages(); hyp_pgt_base = hyp_early_alloc_contig(nr_pages); if (!hyp_pgt_base) @@ -78,7 +84,7 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, if (ret) return ret; - ret = hyp_back_vmemmap(phys, size, hyp_virt_to_phys(vmemmap_base)); + ret = hyp_back_vmemmap(hyp_virt_to_phys(vmemmap_base)); if (ret) return ret; @@ -138,20 +144,17 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, } /* - * Map the host's .bss and .rodata sections RO in the hypervisor, but - * transfer the ownership from the host to the hypervisor itself to - * make sure it can't be donated or shared with another entity. + * Map the host sections RO in the hypervisor, but transfer the + * ownership from the host to the hypervisor itself to make sure they + * can't be donated or shared with another entity. * * The ownership transition requires matching changes in the host * stage-2. This will be done later (see finalize_host_mappings()) once * the hyp_vmemmap is addressable. */ prot = pkvm_mkstate(PAGE_HYP_RO, PKVM_PAGE_SHARED_OWNED); - ret = pkvm_create_mappings(__start_rodata, __end_rodata, prot); - if (ret) - return ret; - - ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, prot); + ret = pkvm_create_mappings(&kvm_vgic_global_state, + &kvm_vgic_global_state + 1, prot); if (ret) return ret; @@ -186,33 +189,20 @@ static void hpool_put_page(void *addr) hyp_put_page(&hpool, addr); } -static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, - void * const arg) +static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { - struct kvm_pgtable_mm_ops *mm_ops = arg; enum kvm_pgtable_prot prot; enum pkvm_page_state state; - kvm_pte_t pte = *ptep; phys_addr_t phys; - if (!kvm_pte_valid(pte)) + if (!kvm_pte_valid(ctx->old)) return 0; - /* - * Fix-up the refcount for the page-table pages as the early allocator - * was unable to access the hyp_vmemmap and so the buddy allocator has - * initialised the refcount to '1'. - */ - mm_ops->get_page(ptep); - if (flag != KVM_PGTABLE_WALK_LEAF) - return 0; - - if (level != (KVM_PGTABLE_MAX_LEVELS - 1)) + if (ctx->level != (KVM_PGTABLE_MAX_LEVELS - 1)) return -EINVAL; - phys = kvm_pte_to_phys(pte); + phys = kvm_pte_to_phys(ctx->old); if (!addr_is_memory(phys)) return -EINVAL; @@ -220,10 +210,10 @@ static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level, * Adjust the host stage-2 mappings to match the ownership attributes * configured in the hypervisor stage-1. */ - state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte)); + state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(ctx->old)); switch (state) { case PKVM_PAGE_OWNED: - return host_stage2_set_owner_locked(phys, PAGE_SIZE, pkvm_hyp_id); + return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP); case PKVM_PAGE_SHARED_OWNED: prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_BORROWED); break; @@ -237,12 +227,25 @@ static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level, return host_stage2_idmap_locked(phys, PAGE_SIZE, prot); } -static int finalize_host_mappings(void) +static int fix_hyp_pgtable_refcnt_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + /* + * Fix-up the refcount for the page-table pages as the early allocator + * was unable to access the hyp_vmemmap and so the buddy allocator has + * initialised the refcount to '1'. + */ + if (kvm_pte_valid(ctx->old)) + ctx->mm_ops->get_page(ctx->ptep); + + return 0; +} + +static int fix_host_ownership(void) { struct kvm_pgtable_walker walker = { - .cb = finalize_host_mappings_walker, - .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, - .arg = pkvm_pgtable.mm_ops, + .cb = fix_host_ownership_walker, + .flags = KVM_PGTABLE_WALK_LEAF, }; int i, ret; @@ -258,6 +261,18 @@ static int finalize_host_mappings(void) return 0; } +static int fix_hyp_pgtable_refcnt(void) +{ + struct kvm_pgtable_walker walker = { + .cb = fix_hyp_pgtable_refcnt_walker, + .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, + .arg = pkvm_pgtable.mm_ops, + }; + + return kvm_pgtable_walk(&pkvm_pgtable, 0, BIT(pkvm_pgtable.ia_bits), + &walker); +} + void __noreturn __pkvm_init_finalise(void) { struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data); @@ -287,10 +302,19 @@ void __noreturn __pkvm_init_finalise(void) }; pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops; - ret = finalize_host_mappings(); + ret = fix_host_ownership(); + if (ret) + goto out; + + ret = fix_hyp_pgtable_refcnt(); + if (ret) + goto out; + + ret = hyp_create_pcpu_fixmap(); if (ret) goto out; + pkvm_hyp_vm_table_init(vm_table_base); out: /* * We tail-called to here from handle___pkvm_init() and will not return, diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index cdf8e76b0be1..b11cf2c618a6 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -49,35 +49,38 @@ #define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) #define KVM_MAX_OWNER_ID 1 +/* + * Used to indicate a pte for which a 'break-before-make' sequence is in + * progress. + */ +#define KVM_INVALID_PTE_LOCKED BIT(10) + struct kvm_pgtable_walk_data { - struct kvm_pgtable *pgt; struct kvm_pgtable_walker *walker; u64 addr; u64 end; }; -#define KVM_PHYS_INVALID (-1ULL) - static bool kvm_phys_is_valid(u64 phys) { return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX)); } -static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level) +static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys) { - u64 granule = kvm_granule_size(level); + u64 granule = kvm_granule_size(ctx->level); - if (!kvm_level_supports_block_mapping(level)) + if (!kvm_level_supports_block_mapping(ctx->level)) return false; - if (granule > (end - addr)) + if (granule > (ctx->end - ctx->addr)) return false; if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule)) return false; - return IS_ALIGNED(addr, granule); + return IS_ALIGNED(ctx->addr, granule); } static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level) @@ -88,7 +91,7 @@ static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level) return (data->addr >> shift) & mask; } -static u32 __kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr) +static u32 kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr) { u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */ u64 mask = BIT(pgt->ia_bits) - 1; @@ -96,11 +99,6 @@ static u32 __kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr) return (addr & mask) >> shift; } -static u32 kvm_pgd_page_idx(struct kvm_pgtable_walk_data *data) -{ - return __kvm_pgd_page_idx(data->pgt, data->addr); -} - static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level) { struct kvm_pgtable pgt = { @@ -108,7 +106,7 @@ static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level) .start_level = start_level, }; - return __kvm_pgd_page_idx(&pgt, -1ULL) + 1; + return kvm_pgd_page_idx(&pgt, -1ULL) + 1; } static bool kvm_pte_table(kvm_pte_t pte, u32 level) @@ -122,16 +120,6 @@ static bool kvm_pte_table(kvm_pte_t pte, u32 level) return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE; } -static kvm_pte_t kvm_phys_to_pte(u64 pa) -{ - kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK; - - if (PAGE_SHIFT == 16) - pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48); - - return pte; -} - static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops) { return mm_ops->phys_to_virt(kvm_pte_to_phys(pte)); @@ -142,16 +130,13 @@ static void kvm_clear_pte(kvm_pte_t *ptep) WRITE_ONCE(*ptep, 0); } -static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp, - struct kvm_pgtable_mm_ops *mm_ops) +static kvm_pte_t kvm_init_table_pte(kvm_pte_t *childp, struct kvm_pgtable_mm_ops *mm_ops) { - kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp)); + kvm_pte_t pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp)); pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE); pte |= KVM_PTE_VALID; - - WARN_ON(kvm_pte_valid(old)); - smp_store_release(ptep, pte); + return pte; } static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level) @@ -172,36 +157,47 @@ static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id) return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id); } -static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr, - u32 level, kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag) +static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, + const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { struct kvm_pgtable_walker *walker = data->walker; - return walker->cb(addr, data->end, level, ptep, flag, walker->arg); + + /* Ensure the appropriate lock is held (e.g. RCU lock for stage-2 MMU) */ + WARN_ON_ONCE(kvm_pgtable_walk_shared(ctx) && !kvm_pgtable_walk_lock_held()); + return walker->cb(ctx, visit); } static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, - kvm_pte_t *pgtable, u32 level); + struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level); static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, - kvm_pte_t *ptep, u32 level) + struct kvm_pgtable_mm_ops *mm_ops, + kvm_pteref_t pteref, u32 level) { - int ret = 0; - u64 addr = data->addr; - kvm_pte_t *childp, pte = *ptep; - bool table = kvm_pte_table(pte, level); enum kvm_pgtable_walk_flags flags = data->walker->flags; + kvm_pte_t *ptep = kvm_dereference_pteref(data->walker, pteref); + struct kvm_pgtable_visit_ctx ctx = { + .ptep = ptep, + .old = READ_ONCE(*ptep), + .arg = data->walker->arg, + .mm_ops = mm_ops, + .addr = data->addr, + .end = data->end, + .level = level, + .flags = flags, + }; + int ret = 0; + kvm_pteref_t childp; + bool table = kvm_pte_table(ctx.old, level); - if (table && (flags & KVM_PGTABLE_WALK_TABLE_PRE)) { - ret = kvm_pgtable_visitor_cb(data, addr, level, ptep, - KVM_PGTABLE_WALK_TABLE_PRE); - } + if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE)) + ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE); - if (!table && (flags & KVM_PGTABLE_WALK_LEAF)) { - ret = kvm_pgtable_visitor_cb(data, addr, level, ptep, - KVM_PGTABLE_WALK_LEAF); - pte = *ptep; - table = kvm_pte_table(pte, level); + if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) { + ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF); + ctx.old = READ_ONCE(*ptep); + table = kvm_pte_table(ctx.old, level); } if (ret) @@ -213,22 +209,20 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, goto out; } - childp = kvm_pte_follow(pte, data->pgt->mm_ops); - ret = __kvm_pgtable_walk(data, childp, level + 1); + childp = (kvm_pteref_t)kvm_pte_follow(ctx.old, mm_ops); + ret = __kvm_pgtable_walk(data, mm_ops, childp, level + 1); if (ret) goto out; - if (flags & KVM_PGTABLE_WALK_TABLE_POST) { - ret = kvm_pgtable_visitor_cb(data, addr, level, ptep, - KVM_PGTABLE_WALK_TABLE_POST); - } + if (ctx.flags & KVM_PGTABLE_WALK_TABLE_POST) + ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_POST); out: return ret; } static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, - kvm_pte_t *pgtable, u32 level) + struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level) { u32 idx; int ret = 0; @@ -237,12 +231,12 @@ static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, return -EINVAL; for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) { - kvm_pte_t *ptep = &pgtable[idx]; + kvm_pteref_t pteref = &pgtable[idx]; if (data->addr >= data->end) break; - ret = __kvm_pgtable_visit(data, ptep, level); + ret = __kvm_pgtable_visit(data, mm_ops, pteref, level); if (ret) break; } @@ -250,11 +244,10 @@ static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, return ret; } -static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data) +static int _kvm_pgtable_walk(struct kvm_pgtable *pgt, struct kvm_pgtable_walk_data *data) { u32 idx; int ret = 0; - struct kvm_pgtable *pgt = data->pgt; u64 limit = BIT(pgt->ia_bits); if (data->addr > limit || data->end > limit) @@ -263,10 +256,10 @@ static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data) if (!pgt->pgd) return -EINVAL; - for (idx = kvm_pgd_page_idx(data); data->addr < data->end; ++idx) { - kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE]; + for (idx = kvm_pgd_page_idx(pgt, data->addr); data->addr < data->end; ++idx) { + kvm_pteref_t pteref = &pgt->pgd[idx * PTRS_PER_PTE]; - ret = __kvm_pgtable_walk(data, ptep, pgt->start_level); + ret = __kvm_pgtable_walk(data, pgt->mm_ops, pteref, pgt->start_level); if (ret) break; } @@ -278,13 +271,20 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_pgtable_walker *walker) { struct kvm_pgtable_walk_data walk_data = { - .pgt = pgt, .addr = ALIGN_DOWN(addr, PAGE_SIZE), .end = PAGE_ALIGN(walk_data.addr + size), .walker = walker, }; + int r; - return _kvm_pgtable_walk(&walk_data); + r = kvm_pgtable_walk_begin(walker); + if (r) + return r; + + r = _kvm_pgtable_walk(pgt, &walk_data); + kvm_pgtable_walk_end(walker); + + return r; } struct leaf_walk_data { @@ -292,13 +292,13 @@ struct leaf_walk_data { u32 level; }; -static int leaf_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, void * const arg) +static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { - struct leaf_walk_data *data = arg; + struct leaf_walk_data *data = ctx->arg; - data->pte = *ptep; - data->level = level; + data->pte = ctx->old; + data->level = ctx->level; return 0; } @@ -329,7 +329,6 @@ int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, struct hyp_map_data { u64 phys; kvm_pte_t attr; - struct kvm_pgtable_mm_ops *mm_ops; }; static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) @@ -383,47 +382,49 @@ enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte) return prot; } -static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, struct hyp_map_data *data) +static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, + struct hyp_map_data *data) { - kvm_pte_t new, old = *ptep; - u64 granule = kvm_granule_size(level), phys = data->phys; + kvm_pte_t new; + u64 granule = kvm_granule_size(ctx->level), phys = data->phys; - if (!kvm_block_mapping_supported(addr, end, phys, level)) + if (!kvm_block_mapping_supported(ctx, phys)) return false; data->phys += granule; - new = kvm_init_valid_leaf_pte(phys, data->attr, level); - if (old == new) + new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level); + if (ctx->old == new) return true; - if (!kvm_pte_valid(old)) - data->mm_ops->get_page(ptep); - else if (WARN_ON((old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) + if (!kvm_pte_valid(ctx->old)) + ctx->mm_ops->get_page(ctx->ptep); + else if (WARN_ON((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) return false; - smp_store_release(ptep, new); + smp_store_release(ctx->ptep, new); return true; } -static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, void * const arg) +static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { - kvm_pte_t *childp; - struct hyp_map_data *data = arg; - struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; + kvm_pte_t *childp, new; + struct hyp_map_data *data = ctx->arg; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; - if (hyp_map_walker_try_leaf(addr, end, level, ptep, arg)) + if (hyp_map_walker_try_leaf(ctx, data)) return 0; - if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1)) + if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1)) return -EINVAL; childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL); if (!childp) return -ENOMEM; - kvm_set_table_pte(ptep, childp, mm_ops); - mm_ops->get_page(ptep); + new = kvm_init_table_pte(childp, mm_ops); + mm_ops->get_page(ctx->ptep); + smp_store_release(ctx->ptep, new); + return 0; } @@ -433,7 +434,6 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, int ret; struct hyp_map_data map_data = { .phys = ALIGN_DOWN(phys, PAGE_SIZE), - .mm_ops = pgt->mm_ops, }; struct kvm_pgtable_walker walker = { .cb = hyp_map_walker, @@ -451,44 +451,39 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, return ret; } -struct hyp_unmap_data { - u64 unmapped; - struct kvm_pgtable_mm_ops *mm_ops; -}; - -static int hyp_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, void * const arg) +static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { - kvm_pte_t pte = *ptep, *childp = NULL; - u64 granule = kvm_granule_size(level); - struct hyp_unmap_data *data = arg; - struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; + kvm_pte_t *childp = NULL; + u64 granule = kvm_granule_size(ctx->level); + u64 *unmapped = ctx->arg; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; - if (!kvm_pte_valid(pte)) + if (!kvm_pte_valid(ctx->old)) return -EINVAL; - if (kvm_pte_table(pte, level)) { - childp = kvm_pte_follow(pte, mm_ops); + if (kvm_pte_table(ctx->old, ctx->level)) { + childp = kvm_pte_follow(ctx->old, mm_ops); if (mm_ops->page_count(childp) != 1) return 0; - kvm_clear_pte(ptep); + kvm_clear_pte(ctx->ptep); dsb(ishst); - __tlbi_level(vae2is, __TLBI_VADDR(addr, 0), level); + __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), ctx->level); } else { - if (end - addr < granule) + if (ctx->end - ctx->addr < granule) return -EINVAL; - kvm_clear_pte(ptep); + kvm_clear_pte(ctx->ptep); dsb(ishst); - __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), level); - data->unmapped += granule; + __tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level); + *unmapped += granule; } dsb(ish); isb(); - mm_ops->put_page(ptep); + mm_ops->put_page(ctx->ptep); if (childp) mm_ops->put_page(childp); @@ -498,12 +493,10 @@ static int hyp_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) { - struct hyp_unmap_data unmap_data = { - .mm_ops = pgt->mm_ops, - }; + u64 unmapped = 0; struct kvm_pgtable_walker walker = { .cb = hyp_unmap_walker, - .arg = &unmap_data, + .arg = &unmapped, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, }; @@ -511,7 +504,7 @@ u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) return 0; kvm_pgtable_walk(pgt, addr, size, &walker); - return unmap_data.unmapped; + return unmapped; } int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, @@ -519,7 +512,7 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, { u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits); - pgt->pgd = (kvm_pte_t *)mm_ops->zalloc_page(NULL); + pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL); if (!pgt->pgd) return -ENOMEM; @@ -532,19 +525,18 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, return 0; } -static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, void * const arg) +static int hyp_free_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { - struct kvm_pgtable_mm_ops *mm_ops = arg; - kvm_pte_t pte = *ptep; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; - if (!kvm_pte_valid(pte)) + if (!kvm_pte_valid(ctx->old)) return 0; - mm_ops->put_page(ptep); + mm_ops->put_page(ctx->ptep); - if (kvm_pte_table(pte, level)) - mm_ops->put_page(kvm_pte_follow(pte, mm_ops)); + if (kvm_pte_table(ctx->old, ctx->level)) + mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops)); return 0; } @@ -554,11 +546,10 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt) struct kvm_pgtable_walker walker = { .cb = hyp_free_walker, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, - .arg = pgt->mm_ops, }; WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); - pgt->mm_ops->put_page(pgt->pgd); + pgt->mm_ops->put_page(kvm_dereference_pteref(&walker, pgt->pgd)); pgt->pgd = NULL; } @@ -573,8 +564,6 @@ struct stage2_map_data { struct kvm_s2_mmu *mmu; void *memcache; - struct kvm_pgtable_mm_ops *mm_ops; - /* Force mappings to page granularity */ bool force_pte; }; @@ -682,19 +671,92 @@ static bool stage2_pte_is_counted(kvm_pte_t pte) return !!pte; } -static void stage2_put_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr, - u32 level, struct kvm_pgtable_mm_ops *mm_ops) +static bool stage2_pte_is_locked(kvm_pte_t pte) +{ + return !kvm_pte_valid(pte) && (pte & KVM_INVALID_PTE_LOCKED); +} + +static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new) +{ + if (!kvm_pgtable_walk_shared(ctx)) { + WRITE_ONCE(*ctx->ptep, new); + return true; + } + + return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old; +} + +/** + * stage2_try_break_pte() - Invalidates a pte according to the + * 'break-before-make' requirements of the + * architecture. + * + * @ctx: context of the visited pte. + * @mmu: stage-2 mmu + * + * Returns: true if the pte was successfully broken. + * + * If the removed pte was valid, performs the necessary serialization and TLB + * invalidation for the old value. For counted ptes, drops the reference count + * on the containing table page. + */ +static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx, + struct kvm_s2_mmu *mmu) +{ + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + + if (stage2_pte_is_locked(ctx->old)) { + /* + * Should never occur if this walker has exclusive access to the + * page tables. + */ + WARN_ON(!kvm_pgtable_walk_shared(ctx)); + return false; + } + + if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED)) + return false; + + /* + * Perform the appropriate TLB invalidation based on the evicted pte + * value (if any). + */ + if (kvm_pte_table(ctx->old, ctx->level)) + kvm_call_hyp(__kvm_tlb_flush_vmid, mmu); + else if (kvm_pte_valid(ctx->old)) + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level); + + if (stage2_pte_is_counted(ctx->old)) + mm_ops->put_page(ctx->ptep); + + return true; +} + +static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new) +{ + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + + WARN_ON(!stage2_pte_is_locked(*ctx->ptep)); + + if (stage2_pte_is_counted(new)) + mm_ops->get_page(ctx->ptep); + + smp_store_release(ctx->ptep, new); +} + +static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu, + struct kvm_pgtable_mm_ops *mm_ops) { /* * Clear the existing PTE, and perform break-before-make with * TLB maintenance if it was valid. */ - if (kvm_pte_valid(*ptep)) { - kvm_clear_pte(ptep); - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level); + if (kvm_pte_valid(ctx->old)) { + kvm_clear_pte(ctx->ptep); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level); } - mm_ops->put_page(ptep); + mm_ops->put_page(ctx->ptep); } static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte) @@ -708,44 +770,42 @@ static bool stage2_pte_executable(kvm_pte_t pte) return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN); } -static bool stage2_leaf_mapping_allowed(u64 addr, u64 end, u32 level, +static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { - if (data->force_pte && (level < (KVM_PGTABLE_MAX_LEVELS - 1))) + if (data->force_pte && (ctx->level < (KVM_PGTABLE_MAX_LEVELS - 1))) return false; - return kvm_block_mapping_supported(addr, end, data->phys, level); + return kvm_block_mapping_supported(ctx, data->phys); } -static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, +static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { - kvm_pte_t new, old = *ptep; - u64 granule = kvm_granule_size(level), phys = data->phys; + kvm_pte_t new; + u64 granule = kvm_granule_size(ctx->level), phys = data->phys; struct kvm_pgtable *pgt = data->mmu->pgt; - struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; - if (!stage2_leaf_mapping_allowed(addr, end, level, data)) + if (!stage2_leaf_mapping_allowed(ctx, data)) return -E2BIG; if (kvm_phys_is_valid(phys)) - new = kvm_init_valid_leaf_pte(phys, data->attr, level); + new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level); else new = kvm_init_invalid_leaf_owner(data->owner_id); - if (stage2_pte_is_counted(old)) { - /* - * Skip updating the PTE if we are trying to recreate the exact - * same mapping or only change the access permissions. Instead, - * the vCPU will exit one more time from guest if still needed - * and then go through the path of relaxing permissions. - */ - if (!stage2_pte_needs_update(old, new)) - return -EAGAIN; + /* + * Skip updating the PTE if we are trying to recreate the exact + * same mapping or only change the access permissions. Instead, + * the vCPU will exit one more time from guest if still needed + * and then go through the path of relaxing permissions. + */ + if (!stage2_pte_needs_update(ctx->old, new)) + return -EAGAIN; - stage2_put_pte(ptep, data->mmu, addr, level, mm_ops); - } + if (!stage2_try_break_pte(ctx, data->mmu)) + return -EAGAIN; /* Perform CMOs before installation of the guest stage-2 PTE */ if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new)) @@ -755,56 +815,43 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, if (mm_ops->icache_inval_pou && stage2_pte_executable(new)) mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule); - smp_store_release(ptep, new); - if (stage2_pte_is_counted(new)) - mm_ops->get_page(ptep); + stage2_make_pte(ctx, new); + if (kvm_phys_is_valid(phys)) data->phys += granule; return 0; } -static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, +static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { - if (data->anchor) - return 0; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops); + int ret; - if (!stage2_leaf_mapping_allowed(addr, end, level, data)) + if (!stage2_leaf_mapping_allowed(ctx, data)) return 0; - data->childp = kvm_pte_follow(*ptep, data->mm_ops); - kvm_clear_pte(ptep); + ret = stage2_map_walker_try_leaf(ctx, data); + if (ret) + return ret; - /* - * Invalidate the whole stage-2, as we may have numerous leaf - * entries below us which would otherwise need invalidating - * individually. - */ - kvm_call_hyp(__kvm_tlb_flush_vmid, data->mmu); - data->anchor = ptep; + mm_ops->free_removed_table(childp, ctx->level); return 0; } -static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, +static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { - struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; - kvm_pte_t *childp, pte = *ptep; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + kvm_pte_t *childp, new; int ret; - if (data->anchor) { - if (stage2_pte_is_counted(pte)) - mm_ops->put_page(ptep); - - return 0; - } - - ret = stage2_map_walker_try_leaf(addr, end, level, ptep, data); + ret = stage2_map_walker_try_leaf(ctx, data); if (ret != -E2BIG) return ret; - if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1)) + if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1)) return -EINVAL; if (!data->memcache) @@ -814,99 +861,62 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, if (!childp) return -ENOMEM; + if (!stage2_try_break_pte(ctx, data->mmu)) { + mm_ops->put_page(childp); + return -EAGAIN; + } + /* * If we've run into an existing block mapping then replace it with * a table. Accesses beyond 'end' that fall within the new table * will be mapped lazily. */ - if (stage2_pte_is_counted(pte)) - stage2_put_pte(ptep, data->mmu, addr, level, mm_ops); - - kvm_set_table_pte(ptep, childp, mm_ops); - mm_ops->get_page(ptep); + new = kvm_init_table_pte(childp, mm_ops); + stage2_make_pte(ctx, new); return 0; } -static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, - struct stage2_map_data *data) -{ - struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; - kvm_pte_t *childp; - int ret = 0; - - if (!data->anchor) - return 0; - - if (data->anchor == ptep) { - childp = data->childp; - data->anchor = NULL; - data->childp = NULL; - ret = stage2_map_walk_leaf(addr, end, level, ptep, data); - } else { - childp = kvm_pte_follow(*ptep, mm_ops); - } - - mm_ops->put_page(childp); - mm_ops->put_page(ptep); - - return ret; -} - /* - * This is a little fiddly, as we use all three of the walk flags. The idea - * is that the TABLE_PRE callback runs for table entries on the way down, - * looking for table entries which we could conceivably replace with a - * block entry for this mapping. If it finds one, then it sets the 'anchor' - * field in 'struct stage2_map_data' to point at the table entry, before - * clearing the entry to zero and descending into the now detached table. + * The TABLE_PRE callback runs for table entries on the way down, looking + * for table entries which we could conceivably replace with a block entry + * for this mapping. If it finds one it replaces the entry and calls + * kvm_pgtable_mm_ops::free_removed_table() to tear down the detached table. * - * The behaviour of the LEAF callback then depends on whether or not the - * anchor has been set. If not, then we're not using a block mapping higher - * up the table and we perform the mapping at the existing leaves instead. - * If, on the other hand, the anchor _is_ set, then we drop references to - * all valid leaves so that the pages beneath the anchor can be freed. - * - * Finally, the TABLE_POST callback does nothing if the anchor has not - * been set, but otherwise frees the page-table pages while walking back up - * the page-table, installing the block entry when it revisits the anchor - * pointer and clearing the anchor to NULL. + * Otherwise, the LEAF callback performs the mapping at the existing leaves + * instead. */ -static int stage2_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, void * const arg) +static int stage2_map_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { - struct stage2_map_data *data = arg; + struct stage2_map_data *data = ctx->arg; - switch (flag) { + switch (visit) { case KVM_PGTABLE_WALK_TABLE_PRE: - return stage2_map_walk_table_pre(addr, end, level, ptep, data); + return stage2_map_walk_table_pre(ctx, data); case KVM_PGTABLE_WALK_LEAF: - return stage2_map_walk_leaf(addr, end, level, ptep, data); - case KVM_PGTABLE_WALK_TABLE_POST: - return stage2_map_walk_table_post(addr, end, level, ptep, data); + return stage2_map_walk_leaf(ctx, data); + default: + return -EINVAL; } - - return -EINVAL; } int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot, - void *mc) + void *mc, enum kvm_pgtable_walk_flags flags) { int ret; struct stage2_map_data map_data = { .phys = ALIGN_DOWN(phys, PAGE_SIZE), .mmu = pgt->mmu, .memcache = mc, - .mm_ops = pgt->mm_ops, .force_pte = pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot), }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, - .flags = KVM_PGTABLE_WALK_TABLE_PRE | - KVM_PGTABLE_WALK_LEAF | - KVM_PGTABLE_WALK_TABLE_POST, + .flags = flags | + KVM_PGTABLE_WALK_TABLE_PRE | + KVM_PGTABLE_WALK_LEAF, .arg = &map_data, }; @@ -930,15 +940,13 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, .phys = KVM_PHYS_INVALID, .mmu = pgt->mmu, .memcache = mc, - .mm_ops = pgt->mm_ops, .owner_id = owner_id, .force_pte = true, }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, .flags = KVM_PGTABLE_WALK_TABLE_PRE | - KVM_PGTABLE_WALK_LEAF | - KVM_PGTABLE_WALK_TABLE_POST, + KVM_PGTABLE_WALK_LEAF, .arg = &map_data, }; @@ -949,30 +957,29 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, return ret; } -static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, - void * const arg) +static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { - struct kvm_pgtable *pgt = arg; + struct kvm_pgtable *pgt = ctx->arg; struct kvm_s2_mmu *mmu = pgt->mmu; - struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; - kvm_pte_t pte = *ptep, *childp = NULL; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + kvm_pte_t *childp = NULL; bool need_flush = false; - if (!kvm_pte_valid(pte)) { - if (stage2_pte_is_counted(pte)) { - kvm_clear_pte(ptep); - mm_ops->put_page(ptep); + if (!kvm_pte_valid(ctx->old)) { + if (stage2_pte_is_counted(ctx->old)) { + kvm_clear_pte(ctx->ptep); + mm_ops->put_page(ctx->ptep); } return 0; } - if (kvm_pte_table(pte, level)) { - childp = kvm_pte_follow(pte, mm_ops); + if (kvm_pte_table(ctx->old, ctx->level)) { + childp = kvm_pte_follow(ctx->old, mm_ops); if (mm_ops->page_count(childp) != 1) return 0; - } else if (stage2_pte_cacheable(pgt, pte)) { + } else if (stage2_pte_cacheable(pgt, ctx->old)) { need_flush = !stage2_has_fwb(pgt); } @@ -981,11 +988,11 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, * block entry and rely on the remaining portions being faulted * back lazily. */ - stage2_put_pte(ptep, mmu, addr, level, mm_ops); + stage2_put_pte(ctx, mmu, mm_ops); if (need_flush && mm_ops->dcache_clean_inval_poc) - mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops), - kvm_granule_size(level)); + mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops), + kvm_granule_size(ctx->level)); if (childp) mm_ops->put_page(childp); @@ -1009,21 +1016,19 @@ struct stage2_attr_data { kvm_pte_t attr_clr; kvm_pte_t pte; u32 level; - struct kvm_pgtable_mm_ops *mm_ops; }; -static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, - void * const arg) +static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { - kvm_pte_t pte = *ptep; - struct stage2_attr_data *data = arg; - struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; + kvm_pte_t pte = ctx->old; + struct stage2_attr_data *data = ctx->arg; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; - if (!kvm_pte_valid(pte)) + if (!kvm_pte_valid(ctx->old)) return 0; - data->level = level; + data->level = ctx->level; data->pte = pte; pte &= ~data->attr_clr; pte |= data->attr_set; @@ -1039,10 +1044,12 @@ static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, * stage-2 PTE if we are going to add executable permission. */ if (mm_ops->icache_inval_pou && - stage2_pte_executable(pte) && !stage2_pte_executable(*ptep)) + stage2_pte_executable(pte) && !stage2_pte_executable(ctx->old)) mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops), - kvm_granule_size(level)); - WRITE_ONCE(*ptep, pte); + kvm_granule_size(ctx->level)); + + if (!stage2_try_set_pte(ctx, pte)) + return -EAGAIN; } return 0; @@ -1051,19 +1058,18 @@ static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr, u64 size, kvm_pte_t attr_set, kvm_pte_t attr_clr, kvm_pte_t *orig_pte, - u32 *level) + u32 *level, enum kvm_pgtable_walk_flags flags) { int ret; kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI; struct stage2_attr_data data = { .attr_set = attr_set & attr_mask, .attr_clr = attr_clr & attr_mask, - .mm_ops = pgt->mm_ops, }; struct kvm_pgtable_walker walker = { .cb = stage2_attr_walker, .arg = &data, - .flags = KVM_PGTABLE_WALK_LEAF, + .flags = flags | KVM_PGTABLE_WALK_LEAF, }; ret = kvm_pgtable_walk(pgt, addr, size, &walker); @@ -1082,14 +1088,14 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) { return stage2_update_leaf_attrs(pgt, addr, size, 0, KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, - NULL, NULL); + NULL, NULL, 0); } kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr) { kvm_pte_t pte = 0; stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0, - &pte, NULL); + &pte, NULL, 0); dsb(ishst); return pte; } @@ -1098,7 +1104,7 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr) { kvm_pte_t pte = 0; stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF, - &pte, NULL); + &pte, NULL, 0); /* * "But where's the TLBI?!", you scream. * "Over in the core code", I sigh. @@ -1111,7 +1117,7 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr) bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr) { kvm_pte_t pte = 0; - stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL); + stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL, 0); return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF; } @@ -1134,26 +1140,25 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, if (prot & KVM_PGTABLE_PROT_X) clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; - ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level); + ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, + KVM_PGTABLE_WALK_SHARED); if (!ret) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level); return ret; } -static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, - void * const arg) +static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { - struct kvm_pgtable *pgt = arg; + struct kvm_pgtable *pgt = ctx->arg; struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; - kvm_pte_t pte = *ptep; - if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pgt, pte)) + if (!kvm_pte_valid(ctx->old) || !stage2_pte_cacheable(pgt, ctx->old)) return 0; if (mm_ops->dcache_clean_inval_poc) - mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops), - kvm_granule_size(level)); + mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops), + kvm_granule_size(ctx->level)); return 0; } @@ -1184,7 +1189,7 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; - pgt->pgd = mm_ops->zalloc_pages_exact(pgd_sz); + pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz); if (!pgt->pgd) return -ENOMEM; @@ -1200,20 +1205,27 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, return 0; } -static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, - void * const arg) +size_t kvm_pgtable_stage2_pgd_size(u64 vtcr) +{ + u32 ia_bits = VTCR_EL2_IPA(vtcr); + u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); + u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; + + return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; +} + +static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { - struct kvm_pgtable_mm_ops *mm_ops = arg; - kvm_pte_t pte = *ptep; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; - if (!stage2_pte_is_counted(pte)) + if (!stage2_pte_is_counted(ctx->old)) return 0; - mm_ops->put_page(ptep); + mm_ops->put_page(ctx->ptep); - if (kvm_pte_table(pte, level)) - mm_ops->put_page(kvm_pte_follow(pte, mm_ops)); + if (kvm_pte_table(ctx->old, ctx->level)) + mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops)); return 0; } @@ -1225,11 +1237,33 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) .cb = stage2_free_walker, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, - .arg = pgt->mm_ops, }; WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE; - pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz); + pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz); pgt->pgd = NULL; } + +void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level) +{ + kvm_pteref_t ptep = (kvm_pteref_t)pgtable; + struct kvm_pgtable_walker walker = { + .cb = stage2_free_walker, + .flags = KVM_PGTABLE_WALK_LEAF | + KVM_PGTABLE_WALK_TABLE_POST, + }; + struct kvm_pgtable_walk_data data = { + .walker = &walker, + + /* + * At this point the IPA really doesn't matter, as the page + * table being traversed has already been removed from the stage + * 2. Set an appropriate range to cover the entire page table. + */ + .addr = 0, + .end = kvm_granule_size(level), + }; + + WARN_ON(__kvm_pgtable_walk(&data, mm_ops, ptep, level + 1)); +} diff --git a/arch/arm64/kvm/hyp/vhe/Makefile b/arch/arm64/kvm/hyp/vhe/Makefile index 96bec0ecf9dd..3b9e5464b5b3 100644 --- a/arch/arm64/kvm/hyp/vhe/Makefile +++ b/arch/arm64/kvm/hyp/vhe/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 # -# Makefile for Kernel-based Virtual Machine module, HYP/nVHE part +# Makefile for Kernel-based Virtual Machine module, HYP/VHE part # asflags-y := -D__KVM_VHE_HYPERVISOR__ diff --git a/arch/arm64/kvm/irq.h b/arch/arm64/kvm/irq.h deleted file mode 100644 index 0d257de42c10..000000000000 --- a/arch/arm64/kvm/irq.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * irq.h: in kernel interrupt controller related definitions - * Copyright (c) 2016 Red Hat, Inc. - * - * This header is included by irqchip.c. However, on ARM, interrupt - * controller declarations are located in include/kvm/arm_vgic.h since - * they are mostly shared between arm and arm64. - */ - -#ifndef __IRQ_H -#define __IRQ_H - -#include <kvm/arm_vgic.h> - -#endif diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 60ee3d9f01f8..31d7fa4c7c14 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -128,6 +128,25 @@ static void kvm_s2_free_pages_exact(void *virt, size_t size) free_pages_exact(virt, size); } +static struct kvm_pgtable_mm_ops kvm_s2_mm_ops; + +static void stage2_free_removed_table_rcu_cb(struct rcu_head *head) +{ + struct page *page = container_of(head, struct page, rcu_head); + void *pgtable = page_to_virt(page); + u32 level = page_private(page); + + kvm_pgtable_stage2_free_removed(&kvm_s2_mm_ops, pgtable, level); +} + +static void stage2_free_removed_table(void *addr, u32 level) +{ + struct page *page = virt_to_page(addr); + + set_page_private(page, (unsigned long)level); + call_rcu(&page->rcu_head, stage2_free_removed_table_rcu_cb); +} + static void kvm_host_get_page(void *addr) { get_page(virt_to_page(addr)); @@ -640,8 +659,8 @@ static struct kvm_pgtable_mm_ops kvm_user_mm_ops = { static int get_user_mapping_size(struct kvm *kvm, u64 addr) { struct kvm_pgtable pgt = { - .pgd = (kvm_pte_t *)kvm->mm->pgd, - .ia_bits = VA_BITS, + .pgd = (kvm_pteref_t)kvm->mm->pgd, + .ia_bits = vabits_actual, .start_level = (KVM_PGTABLE_MAX_LEVELS - CONFIG_PGTABLE_LEVELS), .mm_ops = &kvm_user_mm_ops, @@ -662,6 +681,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { .zalloc_page = stage2_memcache_zalloc_page, .zalloc_pages_exact = kvm_s2_zalloc_pages_exact, .free_pages_exact = kvm_s2_free_pages_exact, + .free_removed_table = stage2_free_removed_table, .get_page = kvm_host_get_page, .put_page = kvm_s2_put_page, .page_count = kvm_host_page_count, @@ -675,15 +695,42 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { * kvm_init_stage2_mmu - Initialise a S2 MMU structure * @kvm: The pointer to the KVM structure * @mmu: The pointer to the s2 MMU structure + * @type: The machine type of the virtual machine * * Allocates only the stage-2 HW PGD level table(s). * Note we don't need locking here as this is only called when the VM is * created, which can only be done once. */ -int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) +int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) { + u32 kvm_ipa_limit = get_kvm_ipa_limit(); int cpu, err; struct kvm_pgtable *pgt; + u64 mmfr0, mmfr1; + u32 phys_shift; + + if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) + return -EINVAL; + + phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); + if (is_protected_kvm_enabled()) { + phys_shift = kvm_ipa_limit; + } else if (phys_shift) { + if (phys_shift > kvm_ipa_limit || + phys_shift < ARM64_MIN_PARANGE_BITS) + return -EINVAL; + } else { + phys_shift = KVM_PHYS_SHIFT; + if (phys_shift > kvm_ipa_limit) { + pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", + current->comm); + return -EINVAL; + } + } + + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); if (mmu->pgt != NULL) { kvm_err("kvm_arch already initialized?\n"); @@ -807,6 +854,32 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) } } +static void hyp_mc_free_fn(void *addr, void *unused) +{ + free_page((unsigned long)addr); +} + +static void *hyp_mc_alloc_fn(void *unused) +{ + return (void *)__get_free_page(GFP_KERNEL_ACCOUNT); +} + +void free_hyp_memcache(struct kvm_hyp_memcache *mc) +{ + if (is_protected_kvm_enabled()) + __free_hyp_memcache(mc, hyp_mc_free_fn, + kvm_host_va, NULL); +} + +int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) +{ + if (!is_protected_kvm_enabled()) + return 0; + + return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn, + kvm_host_pa, NULL); +} + /** * kvm_phys_addr_ioremap - map a device range to guest IPA * @@ -841,7 +914,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, write_lock(&kvm->mmu_lock); ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot, - &cache); + &cache, 0); write_unlock(&kvm->mmu_lock); if (ret) break; @@ -1091,32 +1164,26 @@ static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva) * - mmap_lock protects between a VM faulting a page in and the VMM performing * an mprotect() to add VM_MTE */ -static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, - unsigned long size) +static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, + unsigned long size) { unsigned long i, nr_pages = size >> PAGE_SHIFT; - struct page *page; + struct page *page = pfn_to_page(pfn); if (!kvm_has_mte(kvm)) - return 0; - - /* - * pfn_to_online_page() is used to reject ZONE_DEVICE pages - * that may not support tags. - */ - page = pfn_to_online_page(pfn); - - if (!page) - return -EFAULT; + return; for (i = 0; i < nr_pages; i++, page++) { - if (!test_bit(PG_mte_tagged, &page->flags)) { + if (try_page_mte_tagging(page)) { mte_clear_page_tags(page_address(page)); - set_bit(PG_mte_tagged, &page->flags); + set_page_mte_tagged(page); } } +} - return 0; +static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_MTE_ALLOWED; } static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, @@ -1127,7 +1194,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, bool write_fault, writable, force_pte = false; bool exec_fault; bool device = false; - bool shared; unsigned long mmu_seq; struct kvm *kvm = vcpu->kvm; struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; @@ -1136,7 +1202,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, gfn_t gfn; kvm_pfn_t pfn; bool logging_active = memslot_is_logging(memslot); - bool use_read_lock = false; unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu); unsigned long vma_pagesize, fault_granule; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; @@ -1171,14 +1236,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (logging_active) { force_pte = true; vma_shift = PAGE_SHIFT; - use_read_lock = (fault_status == FSC_PERM && write_fault && - fault_granule == PAGE_SIZE); } else { vma_shift = get_vma_page_shift(vma, hva); } - shared = (vma->vm_flags & VM_SHARED); - switch (vma_shift) { #ifndef __PAGETABLE_PMD_FOLDED case PUD_SHIFT: @@ -1239,7 +1300,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, */ smp_rmb(); - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, write_fault, &writable, NULL); if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(hva, vma_shift); @@ -1271,15 +1332,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (exec_fault && device) return -ENOEXEC; - /* - * To reduce MMU contentions and enhance concurrency during dirty - * logging dirty logging, only acquire read lock for permission - * relaxation. - */ - if (use_read_lock) - read_lock(&kvm->mmu_lock); - else - write_lock(&kvm->mmu_lock); + read_lock(&kvm->mmu_lock); pgt = vcpu->arch.hw_mmu->pgt; if (mmu_invalidate_retry(kvm, mmu_seq)) goto out_unlock; @@ -1298,13 +1351,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) { - /* Check the VMM hasn't introduced a new VM_SHARED VMA */ - if (!shared) - ret = sanitise_mte_tags(kvm, pfn, vma_pagesize); - else + /* Check the VMM hasn't introduced a new disallowed VMA */ + if (kvm_vma_mte_allowed(vma)) { + sanitise_mte_tags(kvm, pfn, vma_pagesize); + } else { ret = -EFAULT; - if (ret) goto out_unlock; + } } if (writable) @@ -1323,15 +1376,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * permissions only if vma_pagesize equals fault_granule. Otherwise, * kvm_pgtable_stage2_map() should be called to change block size. */ - if (fault_status == FSC_PERM && vma_pagesize == fault_granule) { + if (fault_status == FSC_PERM && vma_pagesize == fault_granule) ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot); - } else { - WARN_ONCE(use_read_lock, "Attempted stage-2 map outside of write lock\n"); - + else ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, __pfn_to_phys(pfn), prot, - memcache); - } + memcache, KVM_PGTABLE_WALK_SHARED); /* Mark the page dirty only if the fault is handled successfully */ if (writable && !ret) { @@ -1340,10 +1390,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } out_unlock: - if (use_read_lock) - read_unlock(&kvm->mmu_lock); - else - write_unlock(&kvm->mmu_lock); + read_unlock(&kvm->mmu_lock); kvm_set_pfn_accessed(pfn); kvm_release_pfn_clean(pfn); return ret != -EAGAIN ? ret : 0; @@ -1526,15 +1573,18 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { kvm_pfn_t pfn = pte_pfn(range->pte); - int ret; if (!kvm->arch.mmu.pgt) return false; WARN_ON(range->end - range->start != 1); - ret = sanitise_mte_tags(kvm, pfn, PAGE_SIZE); - if (ret) + /* + * If the page isn't tagged, defer to user_mem_abort() for sanitising + * the MTE tags. The S2 pte should have been unmapped by + * mmu_notifier_invalidate_range_end(). + */ + if (kvm_has_mte(kvm) && !page_mte_tagged(pfn_to_page(pfn))) return false; /* @@ -1549,7 +1599,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) */ kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, PAGE_SIZE, __pfn_to_phys(pfn), - KVM_PGTABLE_PROT_R, NULL); + KVM_PGTABLE_PROT_R, NULL, 0); return false; } @@ -1618,6 +1668,8 @@ static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = { int kvm_mmu_init(u32 *hyp_va_bits) { int err; + u32 idmap_bits; + u32 kernel_bits; hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start); hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); @@ -1631,7 +1683,31 @@ int kvm_mmu_init(u32 *hyp_va_bits) */ BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); - *hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET); + /* + * The ID map may be configured to use an extended virtual address + * range. This is only the case if system RAM is out of range for the + * currently configured page size and VA_BITS_MIN, in which case we will + * also need the extended virtual range for the HYP ID map, or we won't + * be able to enable the EL2 MMU. + * + * However, in some cases the ID map may be configured for fewer than + * the number of VA bits used by the regular kernel stage 1. This + * happens when VA_BITS=52 and the kernel image is placed in PA space + * below 48 bits. + * + * At EL2, there is only one TTBR register, and we can't switch between + * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom + * line: we need to use the extended range with *both* our translation + * tables. + * + * So use the maximum of the idmap VA bits and the regular kernel stage + * 1 VA bits to assure that the hypervisor can both ID map its code page + * and map any kernel memory. + */ + idmap_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET); + kernel_bits = vabits_actual; + *hyp_va_bits = max(idmap_bits, kernel_bits); + kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits); kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); kvm_debug("HYP VA range: %lx:%lx\n", @@ -1740,12 +1816,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, if (!vma) break; - /* - * VM_SHARED mappings are not allowed with MTE to avoid races - * when updating the PG_mte_tagged page flag, see - * sanitise_mte_tags for more details. - */ - if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED) { + if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) { ret = -EINVAL; break; } diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index ebecb7c045f4..cf56958b1492 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -6,6 +6,7 @@ #include <linux/kvm_host.h> #include <linux/memblock.h> +#include <linux/mutex.h> #include <linux/sort.h> #include <asm/kvm_pkvm.h> @@ -53,7 +54,7 @@ static int __init register_memblock_regions(void) void __init kvm_hyp_reserve(void) { - u64 nr_pages, prev, hyp_mem_pages = 0; + u64 hyp_mem_pages = 0; int ret; if (!is_hyp_mode_available() || is_kernel_in_hyp_mode()) @@ -71,21 +72,8 @@ void __init kvm_hyp_reserve(void) hyp_mem_pages += hyp_s1_pgtable_pages(); hyp_mem_pages += host_s2_pgtable_pages(); - - /* - * The hyp_vmemmap needs to be backed by pages, but these pages - * themselves need to be present in the vmemmap, so compute the number - * of pages needed by looking for a fixed point. - */ - nr_pages = 0; - do { - prev = nr_pages; - nr_pages = hyp_mem_pages + prev; - nr_pages = DIV_ROUND_UP(nr_pages * STRUCT_HYP_PAGE_SIZE, - PAGE_SIZE); - nr_pages += __hyp_pgtable_max_pages(nr_pages); - } while (nr_pages != prev); - hyp_mem_pages += nr_pages; + hyp_mem_pages += hyp_vm_table_pages(); + hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE); /* * Try to allocate a PMD-aligned region to reduce TLB pressure once @@ -107,3 +95,121 @@ void __init kvm_hyp_reserve(void) kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20, hyp_mem_base); } + +/* + * Allocates and donates memory for hypervisor VM structs at EL2. + * + * Allocates space for the VM state, which includes the hyp vm as well as + * the hyp vcpus. + * + * Stores an opaque handler in the kvm struct for future reference. + * + * Return 0 on success, negative error code on failure. + */ +static int __pkvm_create_hyp_vm(struct kvm *host_kvm) +{ + size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz; + struct kvm_vcpu *host_vcpu; + pkvm_handle_t handle; + void *pgd, *hyp_vm; + unsigned long idx; + int ret; + + if (host_kvm->created_vcpus < 1) + return -EINVAL; + + pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.vtcr); + + /* + * The PGD pages will be reclaimed using a hyp_memcache which implies + * page granularity. So, use alloc_pages_exact() to get individual + * refcounts. + */ + pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT); + if (!pgd) + return -ENOMEM; + + /* Allocate memory to donate to hyp for vm and vcpu pointers. */ + hyp_vm_sz = PAGE_ALIGN(size_add(PKVM_HYP_VM_SIZE, + size_mul(sizeof(void *), + host_kvm->created_vcpus))); + hyp_vm = alloc_pages_exact(hyp_vm_sz, GFP_KERNEL_ACCOUNT); + if (!hyp_vm) { + ret = -ENOMEM; + goto free_pgd; + } + + /* Donate the VM memory to hyp and let hyp initialize it. */ + ret = kvm_call_hyp_nvhe(__pkvm_init_vm, host_kvm, hyp_vm, pgd); + if (ret < 0) + goto free_vm; + + handle = ret; + + host_kvm->arch.pkvm.handle = handle; + + /* Donate memory for the vcpus at hyp and initialize it. */ + hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE); + kvm_for_each_vcpu(idx, host_vcpu, host_kvm) { + void *hyp_vcpu; + + /* Indexing of the vcpus to be sequential starting at 0. */ + if (WARN_ON(host_vcpu->vcpu_idx != idx)) { + ret = -EINVAL; + goto destroy_vm; + } + + hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT); + if (!hyp_vcpu) { + ret = -ENOMEM; + goto destroy_vm; + } + + ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, host_vcpu, + hyp_vcpu); + if (ret) { + free_pages_exact(hyp_vcpu, hyp_vcpu_sz); + goto destroy_vm; + } + } + + return 0; + +destroy_vm: + pkvm_destroy_hyp_vm(host_kvm); + return ret; +free_vm: + free_pages_exact(hyp_vm, hyp_vm_sz); +free_pgd: + free_pages_exact(pgd, pgd_sz); + return ret; +} + +int pkvm_create_hyp_vm(struct kvm *host_kvm) +{ + int ret = 0; + + mutex_lock(&host_kvm->lock); + if (!host_kvm->arch.pkvm.handle) + ret = __pkvm_create_hyp_vm(host_kvm); + mutex_unlock(&host_kvm->lock); + + return ret; +} + +void pkvm_destroy_hyp_vm(struct kvm *host_kvm) +{ + if (host_kvm->arch.pkvm.handle) { + WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm, + host_kvm->arch.pkvm.handle)); + } + + host_kvm->arch.pkvm.handle = 0; + free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc); +} + +int pkvm_init_host_vm(struct kvm *host_kvm) +{ + mutex_init(&host_kvm->lock); + return 0; +} diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 0003c7d37533..24908400e190 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -15,16 +15,25 @@ #include <kvm/arm_pmu.h> #include <kvm/arm_vgic.h> +#define PERF_ATTR_CFG1_COUNTER_64BIT BIT(0) + DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available); static LIST_HEAD(arm_pmus); static DEFINE_MUTEX(arm_pmus_lock); -static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx); -static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx); -static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc); +static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc); +static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc); + +static struct kvm_vcpu *kvm_pmc_to_vcpu(const struct kvm_pmc *pmc) +{ + return container_of(pmc, struct kvm_vcpu, arch.pmu.pmc[pmc->idx]); +} -#define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1 +static struct kvm_pmc *kvm_vcpu_idx_to_pmc(struct kvm_vcpu *vcpu, int cnt_idx) +{ + return &vcpu->arch.pmu.pmc[cnt_idx]; +} static u32 kvm_pmu_event_mask(struct kvm *kvm) { @@ -47,113 +56,46 @@ static u32 kvm_pmu_event_mask(struct kvm *kvm) } /** - * kvm_pmu_idx_is_64bit - determine if select_idx is a 64bit counter - * @vcpu: The vcpu pointer - * @select_idx: The counter index + * kvm_pmc_is_64bit - determine if counter is 64bit + * @pmc: counter context */ -static bool kvm_pmu_idx_is_64bit(struct kvm_vcpu *vcpu, u64 select_idx) +static bool kvm_pmc_is_64bit(struct kvm_pmc *pmc) { - return (select_idx == ARMV8_PMU_CYCLE_IDX && - __vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_LC); + return (pmc->idx == ARMV8_PMU_CYCLE_IDX || + kvm_pmu_is_3p5(kvm_pmc_to_vcpu(pmc))); } -static struct kvm_vcpu *kvm_pmc_to_vcpu(struct kvm_pmc *pmc) +static bool kvm_pmc_has_64bit_overflow(struct kvm_pmc *pmc) { - struct kvm_pmu *pmu; - struct kvm_vcpu_arch *vcpu_arch; + u64 val = __vcpu_sys_reg(kvm_pmc_to_vcpu(pmc), PMCR_EL0); - pmc -= pmc->idx; - pmu = container_of(pmc, struct kvm_pmu, pmc[0]); - vcpu_arch = container_of(pmu, struct kvm_vcpu_arch, pmu); - return container_of(vcpu_arch, struct kvm_vcpu, arch); + return (pmc->idx < ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LP)) || + (pmc->idx == ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LC)); } -/** - * kvm_pmu_pmc_is_chained - determine if the pmc is chained - * @pmc: The PMU counter pointer - */ -static bool kvm_pmu_pmc_is_chained(struct kvm_pmc *pmc) +static bool kvm_pmu_counter_can_chain(struct kvm_pmc *pmc) { - struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); - - return test_bit(pmc->idx >> 1, vcpu->arch.pmu.chained); + return (!(pmc->idx & 1) && (pmc->idx + 1) < ARMV8_PMU_CYCLE_IDX && + !kvm_pmc_has_64bit_overflow(pmc)); } -/** - * kvm_pmu_idx_is_high_counter - determine if select_idx is a high/low counter - * @select_idx: The counter index - */ -static bool kvm_pmu_idx_is_high_counter(u64 select_idx) -{ - return select_idx & 0x1; -} - -/** - * kvm_pmu_get_canonical_pmc - obtain the canonical pmc - * @pmc: The PMU counter pointer - * - * When a pair of PMCs are chained together we use the low counter (canonical) - * to hold the underlying perf event. - */ -static struct kvm_pmc *kvm_pmu_get_canonical_pmc(struct kvm_pmc *pmc) -{ - if (kvm_pmu_pmc_is_chained(pmc) && - kvm_pmu_idx_is_high_counter(pmc->idx)) - return pmc - 1; - - return pmc; -} -static struct kvm_pmc *kvm_pmu_get_alternate_pmc(struct kvm_pmc *pmc) +static u32 counter_index_to_reg(u64 idx) { - if (kvm_pmu_idx_is_high_counter(pmc->idx)) - return pmc - 1; - else - return pmc + 1; + return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + idx; } -/** - * kvm_pmu_idx_has_chain_evtype - determine if the event type is chain - * @vcpu: The vcpu pointer - * @select_idx: The counter index - */ -static bool kvm_pmu_idx_has_chain_evtype(struct kvm_vcpu *vcpu, u64 select_idx) +static u32 counter_index_to_evtreg(u64 idx) { - u64 eventsel, reg; - - select_idx |= 0x1; - - if (select_idx == ARMV8_PMU_CYCLE_IDX) - return false; - - reg = PMEVTYPER0_EL0 + select_idx; - eventsel = __vcpu_sys_reg(vcpu, reg) & kvm_pmu_event_mask(vcpu->kvm); - - return eventsel == ARMV8_PMUV3_PERFCTR_CHAIN; + return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + idx; } -/** - * kvm_pmu_get_pair_counter_value - get PMU counter value - * @vcpu: The vcpu pointer - * @pmc: The PMU counter pointer - */ -static u64 kvm_pmu_get_pair_counter_value(struct kvm_vcpu *vcpu, - struct kvm_pmc *pmc) +static u64 kvm_pmu_get_pmc_value(struct kvm_pmc *pmc) { - u64 counter, counter_high, reg, enabled, running; - - if (kvm_pmu_pmc_is_chained(pmc)) { - pmc = kvm_pmu_get_canonical_pmc(pmc); - reg = PMEVCNTR0_EL0 + pmc->idx; + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + u64 counter, reg, enabled, running; - counter = __vcpu_sys_reg(vcpu, reg); - counter_high = __vcpu_sys_reg(vcpu, reg + 1); - - counter = lower_32_bits(counter) | (counter_high << 32); - } else { - reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX) - ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + pmc->idx; - counter = __vcpu_sys_reg(vcpu, reg); - } + reg = counter_index_to_reg(pmc->idx); + counter = __vcpu_sys_reg(vcpu, reg); /* * The real counter value is equal to the value of counter register plus @@ -163,6 +105,9 @@ static u64 kvm_pmu_get_pair_counter_value(struct kvm_vcpu *vcpu, counter += perf_event_read_value(pmc->perf_event, &enabled, &running); + if (!kvm_pmc_is_64bit(pmc)) + counter = lower_32_bits(counter); + return counter; } @@ -173,22 +118,37 @@ static u64 kvm_pmu_get_pair_counter_value(struct kvm_vcpu *vcpu, */ u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) { - u64 counter; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc = &pmu->pmc[select_idx]; - if (!kvm_vcpu_has_pmu(vcpu)) return 0; - counter = kvm_pmu_get_pair_counter_value(vcpu, pmc); + return kvm_pmu_get_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx)); +} - if (kvm_pmu_pmc_is_chained(pmc) && - kvm_pmu_idx_is_high_counter(select_idx)) - counter = upper_32_bits(counter); - else if (select_idx != ARMV8_PMU_CYCLE_IDX) - counter = lower_32_bits(counter); +static void kvm_pmu_set_pmc_value(struct kvm_pmc *pmc, u64 val, bool force) +{ + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + u64 reg; - return counter; + kvm_pmu_release_perf_event(pmc); + + reg = counter_index_to_reg(pmc->idx); + + if (vcpu_mode_is_32bit(vcpu) && pmc->idx != ARMV8_PMU_CYCLE_IDX && + !force) { + /* + * Even with PMUv3p5, AArch32 cannot write to the top + * 32bit of the counters. The only possible course of + * action is to use PMCR.P, which will reset them to + * 0 (the only use of the 'force' parameter). + */ + val = __vcpu_sys_reg(vcpu, reg) & GENMASK(63, 32); + val |= lower_32_bits(val); + } + + __vcpu_sys_reg(vcpu, reg) = val; + + /* Recreate the perf event to reflect the updated sample_period */ + kvm_pmu_create_perf_event(pmc); } /** @@ -199,17 +159,10 @@ u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) */ void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) { - u64 reg; - if (!kvm_vcpu_has_pmu(vcpu)) return; - reg = (select_idx == ARMV8_PMU_CYCLE_IDX) - ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx; - __vcpu_sys_reg(vcpu, reg) += (s64)val - kvm_pmu_get_counter_value(vcpu, select_idx); - - /* Recreate the perf event to reflect the updated sample_period */ - kvm_pmu_create_perf_event(vcpu, select_idx); + kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx), val, false); } /** @@ -218,7 +171,6 @@ void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) */ static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc) { - pmc = kvm_pmu_get_canonical_pmc(pmc); if (pmc->perf_event) { perf_event_disable(pmc->perf_event); perf_event_release_kernel(pmc->perf_event); @@ -232,29 +184,20 @@ static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc) * * If this counter has been configured to monitor some event, release it here. */ -static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc) +static void kvm_pmu_stop_counter(struct kvm_pmc *pmc) { - u64 counter, reg, val; + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + u64 reg, val; - pmc = kvm_pmu_get_canonical_pmc(pmc); if (!pmc->perf_event) return; - counter = kvm_pmu_get_pair_counter_value(vcpu, pmc); + val = kvm_pmu_get_pmc_value(pmc); - if (pmc->idx == ARMV8_PMU_CYCLE_IDX) { - reg = PMCCNTR_EL0; - val = counter; - } else { - reg = PMEVCNTR0_EL0 + pmc->idx; - val = lower_32_bits(counter); - } + reg = counter_index_to_reg(pmc->idx); __vcpu_sys_reg(vcpu, reg) = val; - if (kvm_pmu_pmc_is_chained(pmc)) - __vcpu_sys_reg(vcpu, reg + 1) = upper_32_bits(counter); - kvm_pmu_release_perf_event(pmc); } @@ -280,13 +223,10 @@ void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) { unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); - struct kvm_pmu *pmu = &vcpu->arch.pmu; int i; for_each_set_bit(i, &mask, 32) - kvm_pmu_stop_counter(vcpu, &pmu->pmc[i]); - - bitmap_zero(vcpu->arch.pmu.chained, ARMV8_PMU_MAX_COUNTER_PAIRS); + kvm_pmu_stop_counter(kvm_vcpu_idx_to_pmc(vcpu, i)); } /** @@ -297,10 +237,9 @@ void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) { int i; - struct kvm_pmu *pmu = &vcpu->arch.pmu; for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) - kvm_pmu_release_perf_event(&pmu->pmc[i]); + kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, i)); irq_work_sync(&vcpu->arch.pmu.overflow_work); } @@ -325,9 +264,6 @@ u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val) { int i; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc; - if (!kvm_vcpu_has_pmu(vcpu)) return; @@ -335,17 +271,16 @@ void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val) return; for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { + struct kvm_pmc *pmc; + if (!(val & BIT(i))) continue; - pmc = &pmu->pmc[i]; - - /* A change in the enable state may affect the chain state */ - kvm_pmu_update_pmc_chained(vcpu, i); - kvm_pmu_create_perf_event(vcpu, i); + pmc = kvm_vcpu_idx_to_pmc(vcpu, i); - /* At this point, pmc must be the canonical */ - if (pmc->perf_event) { + if (!pmc->perf_event) { + kvm_pmu_create_perf_event(pmc); + } else { perf_event_enable(pmc->perf_event); if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE) kvm_debug("fail to enable perf event\n"); @@ -363,23 +298,18 @@ void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val) void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val) { int i; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc; if (!kvm_vcpu_has_pmu(vcpu) || !val) return; for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { + struct kvm_pmc *pmc; + if (!(val & BIT(i))) continue; - pmc = &pmu->pmc[i]; - - /* A change in the enable state may affect the chain state */ - kvm_pmu_update_pmc_chained(vcpu, i); - kvm_pmu_create_perf_event(vcpu, i); + pmc = kvm_vcpu_idx_to_pmc(vcpu, i); - /* At this point, pmc must be the canonical */ if (pmc->perf_event) perf_event_disable(pmc->perf_event); } @@ -476,14 +406,69 @@ void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) static void kvm_pmu_perf_overflow_notify_vcpu(struct irq_work *work) { struct kvm_vcpu *vcpu; - struct kvm_pmu *pmu; - - pmu = container_of(work, struct kvm_pmu, overflow_work); - vcpu = kvm_pmc_to_vcpu(pmu->pmc); + vcpu = container_of(work, struct kvm_vcpu, arch.pmu.overflow_work); kvm_vcpu_kick(vcpu); } +/* + * Perform an increment on any of the counters described in @mask, + * generating the overflow if required, and propagate it as a chained + * event if possible. + */ +static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu, + unsigned long mask, u32 event) +{ + int i; + + if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) + return; + + /* Weed out disabled counters */ + mask &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); + + for_each_set_bit(i, &mask, ARMV8_PMU_CYCLE_IDX) { + struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i); + u64 type, reg; + + /* Filter on event type */ + type = __vcpu_sys_reg(vcpu, counter_index_to_evtreg(i)); + type &= kvm_pmu_event_mask(vcpu->kvm); + if (type != event) + continue; + + /* Increment this counter */ + reg = __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) + 1; + if (!kvm_pmc_is_64bit(pmc)) + reg = lower_32_bits(reg); + __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) = reg; + + /* No overflow? move on */ + if (kvm_pmc_has_64bit_overflow(pmc) ? reg : lower_32_bits(reg)) + continue; + + /* Mark overflow */ + __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i); + + if (kvm_pmu_counter_can_chain(pmc)) + kvm_pmu_counter_increment(vcpu, BIT(i + 1), + ARMV8_PMUV3_PERFCTR_CHAIN); + } +} + +/* Compute the sample period for a given counter value */ +static u64 compute_period(struct kvm_pmc *pmc, u64 counter) +{ + u64 val; + + if (kvm_pmc_is_64bit(pmc) && kvm_pmc_has_64bit_overflow(pmc)) + val = (-counter) & GENMASK(63, 0); + else + val = (-counter) & GENMASK(31, 0); + + return val; +} + /** * When the perf event overflows, set the overflow status and inform the vcpu. */ @@ -503,10 +488,7 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event, * Reset the sample period to the architectural limit, * i.e. the point where the counter overflows. */ - period = -(local64_read(&perf_event->count)); - - if (!kvm_pmu_idx_is_64bit(vcpu, pmc->idx)) - period &= GENMASK(31, 0); + period = compute_period(pmc, local64_read(&perf_event->count)); local64_set(&perf_event->hw.period_left, 0); perf_event->attr.sample_period = period; @@ -514,6 +496,10 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event, __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx); + if (kvm_pmu_counter_can_chain(pmc)) + kvm_pmu_counter_increment(vcpu, BIT(idx + 1), + ARMV8_PMUV3_PERFCTR_CHAIN); + if (kvm_pmu_overflow_status(vcpu)) { kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); @@ -533,50 +519,7 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event, */ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) { - struct kvm_pmu *pmu = &vcpu->arch.pmu; - int i; - - if (!kvm_vcpu_has_pmu(vcpu)) - return; - - if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) - return; - - /* Weed out disabled counters */ - val &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); - - for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) { - u64 type, reg; - - if (!(val & BIT(i))) - continue; - - /* PMSWINC only applies to ... SW_INC! */ - type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i); - type &= kvm_pmu_event_mask(vcpu->kvm); - if (type != ARMV8_PMUV3_PERFCTR_SW_INCR) - continue; - - /* increment this even SW_INC counter */ - reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1; - reg = lower_32_bits(reg); - __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg; - - if (reg) /* no overflow on the low part */ - continue; - - if (kvm_pmu_pmc_is_chained(&pmu->pmc[i])) { - /* increment the high counter */ - reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) + 1; - reg = lower_32_bits(reg); - __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) = reg; - if (!reg) /* mark overflow on the high counter */ - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i + 1); - } else { - /* mark overflow on low counter */ - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i); - } - } + kvm_pmu_counter_increment(vcpu, val, ARMV8_PMUV3_PERFCTR_SW_INCR); } /** @@ -591,6 +534,12 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) if (!kvm_vcpu_has_pmu(vcpu)) return; + /* Fixup PMCR_EL0 to reconcile the PMU version and the LP bit */ + if (!kvm_pmu_is_3p5(vcpu)) + val &= ~ARMV8_PMU_PMCR_LP; + + __vcpu_sys_reg(vcpu, PMCR_EL0) = val; + if (val & ARMV8_PMU_PMCR_E) { kvm_pmu_enable_counter_mask(vcpu, __vcpu_sys_reg(vcpu, PMCNTENSET_EL0)); @@ -606,49 +555,44 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); mask &= ~BIT(ARMV8_PMU_CYCLE_IDX); for_each_set_bit(i, &mask, 32) - kvm_pmu_set_counter_value(vcpu, i, 0); + kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, i), 0, true); } } -static bool kvm_pmu_counter_is_enabled(struct kvm_vcpu *vcpu, u64 select_idx) +static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc) { + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); return (__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) && - (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(select_idx)); + (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx)); } /** * kvm_pmu_create_perf_event - create a perf event for a counter - * @vcpu: The vcpu pointer - * @select_idx: The number of selected counter + * @pmc: Counter context */ -static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) +static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc) { + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); struct arm_pmu *arm_pmu = vcpu->kvm->arch.arm_pmu; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc; struct perf_event *event; struct perf_event_attr attr; - u64 eventsel, counter, reg, data; + u64 eventsel, reg, data; - /* - * For chained counters the event type and filtering attributes are - * obtained from the low/even counter. We also use this counter to - * determine if the event is enabled/disabled. - */ - pmc = kvm_pmu_get_canonical_pmc(&pmu->pmc[select_idx]); - - reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX) - ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + pmc->idx; + reg = counter_index_to_evtreg(pmc->idx); data = __vcpu_sys_reg(vcpu, reg); - kvm_pmu_stop_counter(vcpu, pmc); + kvm_pmu_stop_counter(pmc); if (pmc->idx == ARMV8_PMU_CYCLE_IDX) eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES; else eventsel = data & kvm_pmu_event_mask(vcpu->kvm); - /* Software increment event doesn't need to be backed by a perf event */ - if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR) + /* + * Neither SW increment nor chained events need to be backed + * by a perf event. + */ + if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR || + eventsel == ARMV8_PMUV3_PERFCTR_CHAIN) return; /* @@ -663,37 +607,25 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) attr.type = arm_pmu->pmu.type; attr.size = sizeof(attr); attr.pinned = 1; - attr.disabled = !kvm_pmu_counter_is_enabled(vcpu, pmc->idx); + attr.disabled = !kvm_pmu_counter_is_enabled(pmc); attr.exclude_user = data & ARMV8_PMU_EXCLUDE_EL0 ? 1 : 0; attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0; attr.exclude_hv = 1; /* Don't count EL2 events */ attr.exclude_host = 1; /* Don't count host events */ attr.config = eventsel; - counter = kvm_pmu_get_pair_counter_value(vcpu, pmc); - - if (kvm_pmu_pmc_is_chained(pmc)) { - /** - * The initial sample period (overflow count) of an event. For - * chained counters we only support overflow interrupts on the - * high counter. - */ - attr.sample_period = (-counter) & GENMASK(63, 0); - attr.config1 |= PERF_ATTR_CFG1_KVM_PMU_CHAINED; + /* + * If counting with a 64bit counter, advertise it to the perf + * code, carefully dealing with the initial sample period + * which also depends on the overflow. + */ + if (kvm_pmc_is_64bit(pmc)) + attr.config1 |= PERF_ATTR_CFG1_COUNTER_64BIT; - event = perf_event_create_kernel_counter(&attr, -1, current, - kvm_pmu_perf_overflow, - pmc + 1); - } else { - /* The initial sample period (overflow count) of an event. */ - if (kvm_pmu_idx_is_64bit(vcpu, pmc->idx)) - attr.sample_period = (-counter) & GENMASK(63, 0); - else - attr.sample_period = (-counter) & GENMASK(31, 0); + attr.sample_period = compute_period(pmc, kvm_pmu_get_pmc_value(pmc)); - event = perf_event_create_kernel_counter(&attr, -1, current, + event = perf_event_create_kernel_counter(&attr, -1, current, kvm_pmu_perf_overflow, pmc); - } if (IS_ERR(event)) { pr_err_once("kvm: pmu event creation failed %ld\n", @@ -705,41 +637,6 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) } /** - * kvm_pmu_update_pmc_chained - update chained bitmap - * @vcpu: The vcpu pointer - * @select_idx: The number of selected counter - * - * Update the chained bitmap based on the event type written in the - * typer register and the enable state of the odd register. - */ -static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc = &pmu->pmc[select_idx], *canonical_pmc; - bool new_state, old_state; - - old_state = kvm_pmu_pmc_is_chained(pmc); - new_state = kvm_pmu_idx_has_chain_evtype(vcpu, pmc->idx) && - kvm_pmu_counter_is_enabled(vcpu, pmc->idx | 0x1); - - if (old_state == new_state) - return; - - canonical_pmc = kvm_pmu_get_canonical_pmc(pmc); - kvm_pmu_stop_counter(vcpu, canonical_pmc); - if (new_state) { - /* - * During promotion from !chained to chained we must ensure - * the adjacent counter is stopped and its event destroyed - */ - kvm_pmu_stop_counter(vcpu, kvm_pmu_get_alternate_pmc(pmc)); - set_bit(pmc->idx >> 1, vcpu->arch.pmu.chained); - return; - } - clear_bit(pmc->idx >> 1, vcpu->arch.pmu.chained); -} - -/** * kvm_pmu_set_counter_event_type - set selected counter to monitor some event * @vcpu: The vcpu pointer * @data: The data guest writes to PMXEVTYPER_EL0 @@ -752,6 +649,7 @@ static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx) void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, u64 select_idx) { + struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, select_idx); u64 reg, mask; if (!kvm_vcpu_has_pmu(vcpu)) @@ -761,20 +659,19 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, mask &= ~ARMV8_PMU_EVTYPE_EVENT; mask |= kvm_pmu_event_mask(vcpu->kvm); - reg = (select_idx == ARMV8_PMU_CYCLE_IDX) - ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + select_idx; + reg = counter_index_to_evtreg(pmc->idx); __vcpu_sys_reg(vcpu, reg) = data & mask; - kvm_pmu_update_pmc_chained(vcpu, select_idx); - kvm_pmu_create_perf_event(vcpu, select_idx); + kvm_pmu_create_perf_event(pmc); } void kvm_host_pmu_init(struct arm_pmu *pmu) { struct arm_pmu_entry *entry; - if (pmu->pmuver == 0 || pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) + if (pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_NI || + pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) return; mutex_lock(&arm_pmus_lock); @@ -827,7 +724,7 @@ static struct arm_pmu *kvm_pmu_probe_armpmu(void) if (event->pmu) { pmu = to_arm_pmu(event->pmu); - if (pmu->pmuver == 0 || + if (pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_NI || pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) pmu = NULL; } @@ -849,6 +746,8 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) if (!pmceid1) { val = read_sysreg(pmceid0_el0); + /* always support CHAIN */ + val |= BIT(ARMV8_PMUV3_PERFCTR_CHAIN); base = 0; } else { val = read_sysreg(pmceid1_el0); @@ -1150,3 +1049,14 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) return -ENXIO; } + +u8 kvm_arm_pmu_get_pmuver_limit(void) +{ + u64 tmp; + + tmp = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); + tmp = cpuid_feature_cap_perfmon_field(tmp, + ID_AA64DFR0_EL1_PMUVer_SHIFT, + ID_AA64DFR0_EL1_PMUVer_V3P5); + return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), tmp); +} diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 5ae18472205a..e0267f672b8a 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -395,32 +395,3 @@ int kvm_set_ipa_limit(void) return 0; } - -int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) -{ - u64 mmfr0, mmfr1; - u32 phys_shift; - - if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) - return -EINVAL; - - phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); - if (phys_shift) { - if (phys_shift > kvm_ipa_limit || - phys_shift < ARM64_MIN_PARANGE_BITS) - return -EINVAL; - } else { - phys_shift = KVM_PHYS_SHIFT; - if (phys_shift > kvm_ipa_limit) { - pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", - current->comm); - return -EINVAL; - } - } - - mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); - mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); - kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); - - return 0; -} diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 608e4f25161d..d5ee52d6bf73 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -639,22 +639,18 @@ static void reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { - u64 pmcr, val; + u64 pmcr; /* No PMU available, PMCR_EL0 may UNDEF... */ if (!kvm_arm_support_pmu_v3()) return; - pmcr = read_sysreg(pmcr_el0); - /* - * Writable bits of PMCR_EL0 (ARMV8_PMU_PMCR_MASK) are reset to UNKNOWN - * except PMCR.E resetting to zero. - */ - val = ((pmcr & ~ARMV8_PMU_PMCR_MASK) - | (ARMV8_PMU_PMCR_MASK & 0xdecafbad)) & (~ARMV8_PMU_PMCR_E); + /* Only preserve PMCR_EL0.N, and reset the rest to 0 */ + pmcr = read_sysreg(pmcr_el0) & ARMV8_PMU_PMCR_N_MASK; if (!kvm_supports_32bit_el0()) - val |= ARMV8_PMU_PMCR_LC; - __vcpu_sys_reg(vcpu, r->reg) = val; + pmcr |= ARMV8_PMU_PMCR_LC; + + __vcpu_sys_reg(vcpu, r->reg) = pmcr; } static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags) @@ -697,13 +693,15 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return false; if (p->is_write) { - /* Only update writeable bits of PMCR */ + /* + * Only update writeable bits of PMCR (continuing into + * kvm_pmu_handle_pmcr() as well) + */ val = __vcpu_sys_reg(vcpu, PMCR_EL0); val &= ~ARMV8_PMU_PMCR_MASK; val |= p->regval & ARMV8_PMU_PMCR_MASK; if (!kvm_supports_32bit_el0()) val |= ARMV8_PMU_PMCR_LC; - __vcpu_sys_reg(vcpu, PMCR_EL0) = val; kvm_pmu_handle_pmcr(vcpu, val); kvm_vcpu_pmu_restore_guest(vcpu); } else { @@ -1062,6 +1060,40 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, return true; } +static u8 vcpu_pmuver(const struct kvm_vcpu *vcpu) +{ + if (kvm_vcpu_has_pmu(vcpu)) + return vcpu->kvm->arch.dfr0_pmuver.imp; + + return vcpu->kvm->arch.dfr0_pmuver.unimp; +} + +static u8 perfmon_to_pmuver(u8 perfmon) +{ + switch (perfmon) { + case ID_DFR0_EL1_PerfMon_PMUv3: + return ID_AA64DFR0_EL1_PMUVer_IMP; + case ID_DFR0_EL1_PerfMon_IMPDEF: + return ID_AA64DFR0_EL1_PMUVer_IMP_DEF; + default: + /* Anything ARMv8.1+ and NI have the same value. For now. */ + return perfmon; + } +} + +static u8 pmuver_to_perfmon(u8 pmuver) +{ + switch (pmuver) { + case ID_AA64DFR0_EL1_PMUVer_IMP: + return ID_DFR0_EL1_PerfMon_PMUv3; + case ID_AA64DFR0_EL1_PMUVer_IMP_DEF: + return ID_DFR0_EL1_PerfMon_IMPDEF; + default: + /* Anything ARMv8.1+ and NI have the same value. For now. */ + return pmuver; + } +} + /* Read a sanitised cpufeature ID register by sys_reg_desc */ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r) { @@ -1111,18 +1143,17 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r /* Limit debug to ARMv8.0 */ val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer); val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), 6); - /* Limit guests to PMUv3 for ARMv8.4 */ - val = cpuid_feature_cap_perfmon_field(val, - ID_AA64DFR0_EL1_PMUVer_SHIFT, - kvm_vcpu_has_pmu(vcpu) ? ID_AA64DFR0_EL1_PMUVer_V3P4 : 0); + /* Set PMUver to the required version */ + val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), + vcpu_pmuver(vcpu)); /* Hide SPE from guests */ val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer); break; case SYS_ID_DFR0_EL1: - /* Limit guests to PMUv3 for ARMv8.4 */ - val = cpuid_feature_cap_perfmon_field(val, - ID_DFR0_EL1_PerfMon_SHIFT, - kvm_vcpu_has_pmu(vcpu) ? ID_DFR0_EL1_PerfMon_PMUv3p4 : 0); + val &= ~ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon), + pmuver_to_perfmon(vcpu_pmuver(vcpu))); break; } @@ -1222,6 +1253,85 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, return 0; } +static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + u8 pmuver, host_pmuver; + bool valid_pmu; + + host_pmuver = kvm_arm_pmu_get_pmuver_limit(); + + /* + * Allow AA64DFR0_EL1.PMUver to be set from userspace as long + * as it doesn't promise more than what the HW gives us. We + * allow an IMPDEF PMU though, only if no PMU is supported + * (KVM backward compatibility handling). + */ + pmuver = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), val); + if ((pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF && pmuver > host_pmuver)) + return -EINVAL; + + valid_pmu = (pmuver != 0 && pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF); + + /* Make sure view register and PMU support do match */ + if (kvm_vcpu_has_pmu(vcpu) != valid_pmu) + return -EINVAL; + + /* We can only differ with PMUver, and anything else is an error */ + val ^= read_id_reg(vcpu, rd); + val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer); + if (val) + return -EINVAL; + + if (valid_pmu) + vcpu->kvm->arch.dfr0_pmuver.imp = pmuver; + else + vcpu->kvm->arch.dfr0_pmuver.unimp = pmuver; + + return 0; +} + +static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + u8 perfmon, host_perfmon; + bool valid_pmu; + + host_perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit()); + + /* + * Allow DFR0_EL1.PerfMon to be set from userspace as long as + * it doesn't promise more than what the HW gives us on the + * AArch64 side (as everything is emulated with that), and + * that this is a PMUv3. + */ + perfmon = FIELD_GET(ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon), val); + if ((perfmon != ID_DFR0_EL1_PerfMon_IMPDEF && perfmon > host_perfmon) || + (perfmon != 0 && perfmon < ID_DFR0_EL1_PerfMon_PMUv3)) + return -EINVAL; + + valid_pmu = (perfmon != 0 && perfmon != ID_DFR0_EL1_PerfMon_IMPDEF); + + /* Make sure view register and PMU support do match */ + if (kvm_vcpu_has_pmu(vcpu) != valid_pmu) + return -EINVAL; + + /* We can only differ with PerfMon, and anything else is an error */ + val ^= read_id_reg(vcpu, rd); + val &= ~ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon); + if (val) + return -EINVAL; + + if (valid_pmu) + vcpu->kvm->arch.dfr0_pmuver.imp = perfmon_to_pmuver(perfmon); + else + vcpu->kvm->arch.dfr0_pmuver.unimp = perfmon_to_pmuver(perfmon); + + return 0; +} + /* * cpufeature ID register user accessors * @@ -1443,7 +1553,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* CRm=1 */ AA32_ID_SANITISED(ID_PFR0_EL1), AA32_ID_SANITISED(ID_PFR1_EL1), - AA32_ID_SANITISED(ID_DFR0_EL1), + { SYS_DESC(SYS_ID_DFR0_EL1), .access = access_id_reg, + .get_user = get_id_reg, .set_user = set_id_dfr0_el1, + .visibility = aa32_id_visibility, }, ID_HIDDEN(ID_AFR0_EL1), AA32_ID_SANITISED(ID_MMFR0_EL1), AA32_ID_SANITISED(ID_MMFR1_EL1), @@ -1483,7 +1595,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_UNALLOCATED(4,7), /* CRm=5 */ - ID_SANITISED(ID_AA64DFR0_EL1), + { SYS_DESC(SYS_ID_AA64DFR0_EL1), .access = access_id_reg, + .get_user = get_id_reg, .set_user = set_id_aa64dfr0_el1, }, ID_SANITISED(ID_AA64DFR1_EL1), ID_UNALLOCATED(5,2), ID_UNALLOCATED(5,3), diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 733b53055f97..94a666dd1443 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -2743,6 +2743,7 @@ static int vgic_its_has_attr(struct kvm_device *dev, static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) { const struct vgic_its_abi *abi = vgic_its_get_abi(its); + struct vgic_dist *dist = &kvm->arch.vgic; int ret = 0; if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */ @@ -2762,7 +2763,9 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) vgic_its_reset(kvm, its); break; case KVM_DEV_ARM_ITS_SAVE_TABLES: + dist->save_its_tables_in_progress = true; ret = abi->save_tables(its); + dist->save_its_tables_in_progress = false; break; case KVM_DEV_ARM_ITS_RESTORE_TABLES: ret = abi->restore_tables(its); @@ -2775,6 +2778,23 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) return ret; } +/* + * kvm_arch_allow_write_without_running_vcpu - allow writing guest memory + * without the running VCPU when dirty ring is enabled. + * + * The running VCPU is required to track dirty guest pages when dirty ring + * is enabled. Otherwise, the backup bitmap should be used to track the + * dirty guest pages. When vgic/its tables are being saved, the backup + * bitmap is used to track the dirty guest pages due to the missed running + * VCPU in the period. + */ +bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + + return dist->save_its_tables_in_progress; +} + static int vgic_its_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c index 24913271e898..8dd5a8fe64b4 100644 --- a/arch/arm64/mm/copypage.c +++ b/arch/arm64/mm/copypage.c @@ -21,9 +21,12 @@ void copy_highpage(struct page *to, struct page *from) copy_page(kto, kfrom); - if (system_supports_mte() && test_bit(PG_mte_tagged, &from->flags)) { - set_bit(PG_mte_tagged, &to->flags); + if (system_supports_mte() && page_mte_tagged(from)) { + page_kasan_tag_reset(to); + /* It's a new page, shouldn't have been tagged yet */ + WARN_ON_ONCE(!try_page_mte_tagging(to)); mte_copy_page_tags(kto, kfrom); + set_page_mte_tagged(to); } } EXPORT_SYMBOL(copy_highpage); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 3eb2825d08cf..596f46dabe4e 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -943,6 +943,8 @@ struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, void tag_clear_highpage(struct page *page) { + /* Newly allocated page, shouldn't have been tagged yet */ + WARN_ON_ONCE(!try_page_mte_tagging(page)); mte_zero_clear_page_tags(page_address(page)); - set_bit(PG_mte_tagged, &page->flags); + set_page_mte_tagged(page); } diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c index bed803d8e158..cd508ba80ab1 100644 --- a/arch/arm64/mm/mteswap.c +++ b/arch/arm64/mm/mteswap.c @@ -24,7 +24,7 @@ int mte_save_tags(struct page *page) { void *tag_storage, *ret; - if (!test_bit(PG_mte_tagged, &page->flags)) + if (!page_mte_tagged(page)) return 0; tag_storage = mte_allocate_tag_storage(); @@ -46,21 +46,17 @@ int mte_save_tags(struct page *page) return 0; } -bool mte_restore_tags(swp_entry_t entry, struct page *page) +void mte_restore_tags(swp_entry_t entry, struct page *page) { void *tags = xa_load(&mte_pages, entry.val); if (!tags) - return false; + return; - /* - * Test PG_mte_tagged again in case it was racing with another - * set_pte_at(). - */ - if (!test_and_set_bit(PG_mte_tagged, &page->flags)) + if (try_page_mte_tagging(page)) { mte_restore_page_tags(page_address(page), tags); - - return true; + set_page_mte_tagged(page); + } } void mte_invalidate_tags(int type, pgoff_t offset) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index e9744b41a226..4939f57b6f6a 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -598,7 +598,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu, write_ok = true; } else { /* Call KVM generic code to do the slow-path check */ - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, writing, &write_ok, NULL); if (is_error_noslot_pfn(pfn)) return -EFAULT; diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 5d5e12f3bf86..9d3743ca16d5 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -846,7 +846,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, unsigned long pfn; /* Call KVM generic code to do the slow-path check */ - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, writing, upgrade_p, NULL); if (is_error_noslot_pfn(pfn)) return -EFAULT; diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h deleted file mode 100644 index e6463f866abc..000000000000 --- a/arch/powerpc/kvm/irq.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __IRQ_H -#define __IRQ_H - -#include <linux/kvm_host.h> - -static inline int irqchip_in_kernel(struct kvm *kvm) -{ - int ret = 0; - -#ifdef CONFIG_KVM_MPIC - ret = ret || (kvm->arch.mpic != NULL); -#endif -#ifdef CONFIG_KVM_XICS - ret = ret || (kvm->arch.xics != NULL); - ret = ret || (kvm->arch.xive != NULL); -#endif - smp_rmb(); - return ret; -} - -#endif diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index b850b0efa201..04494a4fb37a 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -36,7 +36,6 @@ #include <asm/setup.h> #include "timing.h" -#include "irq.h" #include "../mm/mmu_decl.h" #define CREATE_TRACE_POINTS @@ -2165,10 +2164,25 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) return 0; } +bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) +{ + int ret = 0; + +#ifdef CONFIG_KVM_MPIC + ret = ret || (kvm->arch.mpic != NULL); +#endif +#ifdef CONFIG_KVM_XICS + ret = ret || (kvm->arch.xics != NULL); + ret = ret || (kvm->arch.xive != NULL); +#endif + smp_rmb(); + return ret; +} + int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, bool line_status) { - if (!irqchip_in_kernel(kvm)) + if (!kvm_arch_irqchip_in_kernel(kvm)) return -ENXIO; irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index b1e98a9ed152..d67ce719d16a 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -142,8 +142,7 @@ struct mcck_volatile_info { CR14_EXTERNAL_DAMAGE_SUBMASK) #define SIDAD_SIZE_MASK 0xff -#define sida_origin(sie_block) \ - ((sie_block)->sidad & PAGE_MASK) +#define sida_addr(sie_block) phys_to_virt((sie_block)->sidad & PAGE_MASK) #define sida_size(sie_block) \ ((((sie_block)->sidad & SIDAD_SIZE_MASK) + 1) * PAGE_SIZE) @@ -276,6 +275,7 @@ struct kvm_s390_sie_block { #define ECB3_AES 0x04 #define ECB3_RI 0x01 __u8 ecb3; /* 0x0063 */ +#define ESCA_SCAOL_MASK ~0x3fU __u32 scaol; /* 0x0064 */ __u8 sdf; /* 0x0068 */ __u8 epdx; /* 0x0069 */ @@ -942,6 +942,8 @@ struct kvm_s390_pv { unsigned long stor_base; void *stor_var; bool dumping; + void *set_aside; + struct list_head need_cleanup; struct mmu_notifier mmu_notifier; }; @@ -1017,7 +1019,13 @@ void kvm_arch_crypto_clear_masks(struct kvm *kvm); void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm, unsigned long *aqm, unsigned long *adm); -extern int sie64a(struct kvm_s390_sie_block *, u64 *); +int __sie64a(phys_addr_t sie_block_phys, struct kvm_s390_sie_block *sie_block, u64 *rsa); + +static inline int sie64a(struct kvm_s390_sie_block *sie_block, u64 *rsa) +{ + return __sie64a(virt_to_phys(sie_block), sie_block, rsa); +} + extern char sie_exit; extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc); diff --git a/arch/s390/include/asm/mem_encrypt.h b/arch/s390/include/asm/mem_encrypt.h index 08a8b96606d7..b85e13505a0f 100644 --- a/arch/s390/include/asm/mem_encrypt.h +++ b/arch/s390/include/asm/mem_encrypt.h @@ -4,8 +4,8 @@ #ifndef __ASSEMBLY__ -int set_memory_encrypted(unsigned long addr, int numpages); -int set_memory_decrypted(unsigned long addr, int numpages); +int set_memory_encrypted(unsigned long vaddr, int numpages); +int set_memory_decrypted(unsigned long vaddr, int numpages); #endif /* __ASSEMBLY__ */ diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h index b23c658dce77..1802be5abb5d 100644 --- a/arch/s390/include/asm/stacktrace.h +++ b/arch/s390/include/asm/stacktrace.h @@ -46,6 +46,7 @@ struct stack_frame { unsigned long sie_savearea; unsigned long sie_reason; unsigned long sie_flags; + unsigned long sie_control_block_phys; }; }; unsigned long gprs[10]; diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index be3ef9dd6972..28a9ad57b6f1 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -34,6 +34,7 @@ #define UVC_CMD_INIT_UV 0x000f #define UVC_CMD_CREATE_SEC_CONF 0x0100 #define UVC_CMD_DESTROY_SEC_CONF 0x0101 +#define UVC_CMD_DESTROY_SEC_CONF_FAST 0x0102 #define UVC_CMD_CREATE_SEC_CPU 0x0120 #define UVC_CMD_DESTROY_SEC_CPU 0x0121 #define UVC_CMD_CONV_TO_SEC_STOR 0x0200 @@ -81,6 +82,7 @@ enum uv_cmds_inst { BIT_UVC_CMD_UNSHARE_ALL = 20, BIT_UVC_CMD_PIN_PAGE_SHARED = 21, BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22, + BIT_UVC_CMD_DESTROY_SEC_CONF_FAST = 23, BIT_UVC_CMD_DUMP_INIT = 24, BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE = 25, BIT_UVC_CMD_DUMP_CPU = 26, @@ -230,6 +232,14 @@ struct uv_cb_nodata { u64 reserved20[4]; } __packed __aligned(8); +/* Destroy Configuration Fast */ +struct uv_cb_destroy_fast { + struct uv_cb_header header; + u64 reserved08[2]; + u64 handle; + u64 reserved20[5]; +} __packed __aligned(8); + /* Set Shared Access */ struct uv_cb_share { struct uv_cb_header header; diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index d8ce965c0a97..3f8e760298c2 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -62,6 +62,7 @@ int main(void) OFFSET(__SF_SIE_SAVEAREA, stack_frame, sie_savearea); OFFSET(__SF_SIE_REASON, stack_frame, sie_reason); OFFSET(__SF_SIE_FLAGS, stack_frame, sie_flags); + OFFSET(__SF_SIE_CONTROL_PHYS, stack_frame, sie_control_block_phys); DEFINE(STACK_FRAME_OVERHEAD, sizeof(struct stack_frame)); BLANK(); /* idle data offsets */ diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index e0d11f3adfcc..0f423e9df095 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -207,18 +207,20 @@ ENDPROC(__switch_to) #if IS_ENABLED(CONFIG_KVM) /* - * sie64a calling convention: - * %r2 pointer to sie control block - * %r3 guest register save area + * __sie64a calling convention: + * %r2 pointer to sie control block phys + * %r3 pointer to sie control block virt + * %r4 guest register save area */ -ENTRY(sie64a) +ENTRY(__sie64a) stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers lg %r12,__LC_CURRENT - stg %r2,__SF_SIE_CONTROL(%r15) # save control block pointer - stg %r3,__SF_SIE_SAVEAREA(%r15) # save guest register save area + stg %r2,__SF_SIE_CONTROL_PHYS(%r15) # save sie block physical.. + stg %r3,__SF_SIE_CONTROL(%r15) # ...and virtual addresses + stg %r4,__SF_SIE_SAVEAREA(%r15) # save guest register save area xc __SF_SIE_REASON(8,%r15),__SF_SIE_REASON(%r15) # reason code = 0 mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r12) # copy thread flags - lmg %r0,%r13,0(%r3) # load guest gprs 0-13 + lmg %r0,%r13,0(%r4) # load guest gprs 0-13 lg %r14,__LC_GMAP # get gmap pointer ltgr %r14,%r14 jz .Lsie_gmap @@ -230,6 +232,7 @@ ENTRY(sie64a) jnz .Lsie_skip TSTMSK __LC_CPU_FLAGS,_CIF_FPU jo .Lsie_skip # exit if fp/vx regs changed + lg %r14,__SF_SIE_CONTROL_PHYS(%r15) # get sie block phys addr BPEXIT __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) .Lsie_entry: sie 0(%r14) @@ -240,13 +243,14 @@ ENTRY(sie64a) BPOFF BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) .Lsie_skip: + lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce .Lsie_done: # some program checks are suppressing. C code (e.g. do_protection_exception) # will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There # are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable. -# Other instructions between sie64a and .Lsie_done should not cause program +# Other instructions between __sie64a and .Lsie_done should not cause program # interrupts. So lets use 3 nops as a landing pad for all possible rewinds. .Lrewind_pad6: nopr 7 @@ -275,8 +279,8 @@ sie_exit: EX_TABLE(.Lrewind_pad4,.Lsie_fault) EX_TABLE(.Lrewind_pad2,.Lsie_fault) EX_TABLE(sie_exit,.Lsie_fault) -ENDPROC(sie64a) -EXPORT_SYMBOL(sie64a) +ENDPROC(__sie64a) +EXPORT_SYMBOL(__sie64a) EXPORT_SYMBOL(sie_exit) #endif @@ -355,7 +359,7 @@ ENTRY(pgm_check_handler) j 3f # -> fault in user space .Lpgm_skip_asce: #if IS_ENABLED(CONFIG_KVM) - # cleanup critical section for program checks in sie64a + # cleanup critical section for program checks in __sie64a OUTSIDE %r9,.Lsie_gmap,.Lsie_done,1f SIEEXIT lghi %r10,_PIF_GUEST_FAULT diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index f9810d2a267c..9f18a4af9c13 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -255,6 +255,13 @@ static int make_secure_pte(pte_t *ptep, unsigned long addr, */ static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) { + /* + * The misc feature indicates, among other things, that importing a + * shared page from a different protected VM will automatically also + * transfer its ownership. + */ + if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications)) + return false; if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) return false; return atomic_read(&mm->context.protected_count) > 1; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 88112065d941..0ee02dae14b2 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -217,7 +217,7 @@ static int handle_itdb(struct kvm_vcpu *vcpu) return 0; if (current->thread.per_flags & PER_FLAG_NO_TE) return 0; - itdb = (struct kvm_s390_itdb *)vcpu->arch.sie_block->itdba; + itdb = phys_to_virt(vcpu->arch.sie_block->itdba); rc = write_guest_lc(vcpu, __LC_PGM_TDB, itdb, sizeof(*itdb)); if (rc) return rc; @@ -409,8 +409,7 @@ int handle_sthyi(struct kvm_vcpu *vcpu) out: if (!cc) { if (kvm_s390_pv_cpu_is_protected(vcpu)) { - memcpy((void *)(sida_origin(vcpu->arch.sie_block)), - sctns, PAGE_SIZE); + memcpy(sida_addr(vcpu->arch.sie_block), sctns, PAGE_SIZE); } else { r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE); if (r) { @@ -464,7 +463,7 @@ static int handle_operexc(struct kvm_vcpu *vcpu) static int handle_pv_spx(struct kvm_vcpu *vcpu) { - u32 pref = *(u32 *)vcpu->arch.sie_block->sidad; + u32 pref = *(u32 *)sida_addr(vcpu->arch.sie_block); kvm_s390_set_prefix(vcpu, pref); trace_kvm_s390_handle_prefix(vcpu, 1, pref); @@ -497,7 +496,7 @@ static int handle_pv_sclp(struct kvm_vcpu *vcpu) static int handle_pv_uvc(struct kvm_vcpu *vcpu) { - struct uv_cb_share *guest_uvcb = (void *)vcpu->arch.sie_block->sidad; + struct uv_cb_share *guest_uvcb = sida_addr(vcpu->arch.sie_block); struct uv_cb_cts uvcb = { .header.cmd = UVC_CMD_UNPIN_PAGE_SHARED, .header.len = sizeof(uvcb), diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index ab569faf0df2..1dae78deddf2 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -314,11 +314,6 @@ static inline u8 gisa_get_ipm(struct kvm_s390_gisa *gisa) return READ_ONCE(gisa->ipm); } -static inline void gisa_clear_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) -{ - clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); -} - static inline int gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { return test_and_clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); diff --git a/arch/s390/kvm/irq.h b/arch/s390/kvm/irq.h deleted file mode 100644 index 484608c71dd0..000000000000 --- a/arch/s390/kvm/irq.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * s390 irqchip routines - * - * Copyright IBM Corp. 2014 - * - * Author(s): Cornelia Huck <cornelia.huck@de.ibm.com> - */ -#ifndef __KVM_IRQ_H -#define __KVM_IRQ_H - -#include <linux/kvm_host.h> - -static inline int irqchip_in_kernel(struct kvm *kvm) -{ - return 1; -} - -#endif diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index bc491a73815c..e4890e04b210 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -210,6 +210,14 @@ module_param(diag9c_forwarding_hz, uint, 0644); MODULE_PARM_DESC(diag9c_forwarding_hz, "Maximum diag9c forwarding per second, 0 to turn off"); /* + * allow asynchronous deinit for protected guests; enable by default since + * the feature is opt-in anyway + */ +static int async_destroy = 1; +module_param(async_destroy, int, 0444); +MODULE_PARM_DESC(async_destroy, "Asynchronous destroy for protected guests"); + +/* * For now we handle at most 16 double words as this is what the s390 base * kernel handles and stores in the prefix page. If we ever need to go beyond * this, this requires changes to code, but the external uapi can stay. @@ -616,6 +624,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_BPB: r = test_facility(82); break; + case KVM_CAP_S390_PROTECTED_ASYNC_DISABLE: + r = async_destroy && is_prot_virt_host(); + break; case KVM_CAP_S390_PROTECTED: r = is_prot_virt_host(); break; @@ -2519,9 +2530,13 @@ static int kvm_s390_pv_dmp(struct kvm *kvm, struct kvm_pv_cmd *cmd, static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) { + const bool need_lock = (cmd->cmd != KVM_PV_ASYNC_CLEANUP_PERFORM); + void __user *argp = (void __user *)cmd->data; int r = 0; u16 dummy; - void __user *argp = (void __user *)cmd->data; + + if (need_lock) + mutex_lock(&kvm->lock); switch (cmd->cmd) { case KVM_PV_ENABLE: { @@ -2555,6 +2570,31 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) set_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); break; } + case KVM_PV_ASYNC_CLEANUP_PREPARE: + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm) || !async_destroy) + break; + + r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc); + /* + * If a CPU could not be destroyed, destroy VM will also fail. + * There is no point in trying to destroy it. Instead return + * the rc and rrc from the first CPU that failed destroying. + */ + if (r) + break; + r = kvm_s390_pv_set_aside(kvm, &cmd->rc, &cmd->rrc); + + /* no need to block service interrupts any more */ + clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); + break; + case KVM_PV_ASYNC_CLEANUP_PERFORM: + r = -EINVAL; + if (!async_destroy) + break; + /* kvm->lock must not be held; this is asserted inside the function. */ + r = kvm_s390_pv_deinit_aside_vm(kvm, &cmd->rc, &cmd->rrc); + break; case KVM_PV_DISABLE: { r = -EINVAL; if (!kvm_s390_pv_is_protected(kvm)) @@ -2568,7 +2608,7 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) */ if (r) break; - r = kvm_s390_pv_deinit_vm(kvm, &cmd->rc, &cmd->rrc); + r = kvm_s390_pv_deinit_cleanup_all(kvm, &cmd->rc, &cmd->rrc); /* no need to block service interrupts any more */ clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); @@ -2718,6 +2758,9 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) default: r = -ENOTTY; } + if (need_lock) + mutex_unlock(&kvm->lock); + return r; } @@ -2922,9 +2965,8 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EINVAL; break; } - mutex_lock(&kvm->lock); + /* must be called without kvm->lock */ r = kvm_s390_handle_pv(kvm, &args); - mutex_unlock(&kvm->lock); if (copy_to_user(argp, &args, sizeof(args))) { r = -EFAULT; break; @@ -3243,6 +3285,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_s390_vsie_init(kvm); if (use_gisa) kvm_s390_gisa_init(kvm); + INIT_LIST_HEAD(&kvm->arch.pv.need_cleanup); + kvm->arch.pv.set_aside = NULL; KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid); return 0; @@ -3287,11 +3331,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm) /* * We are already at the end of life and kvm->lock is not taken. * This is ok as the file descriptor is closed by now and nobody - * can mess with the pv state. To avoid lockdep_assert_held from - * complaining we do not use kvm_s390_pv_is_protected. + * can mess with the pv state. */ - if (kvm_s390_pv_get_handle(kvm)) - kvm_s390_pv_deinit_vm(kvm, &rc, &rrc); + kvm_s390_pv_deinit_cleanup_all(kvm, &rc, &rrc); /* * Remove the mmu notifier only when the whole KVM VM is torn down, * and only if one was registered to begin with. If the VM is @@ -3344,28 +3386,30 @@ static void sca_del_vcpu(struct kvm_vcpu *vcpu) static void sca_add_vcpu(struct kvm_vcpu *vcpu) { if (!kvm_s390_use_sca_entries()) { - struct bsca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(vcpu->kvm->arch.sca); /* we still need the basic sca for the ipte control */ - vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); - vcpu->arch.sie_block->scaol = (__u32)(__u64)sca; + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys; return; } read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { struct esca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(sca); - sca->cpu[vcpu->vcpu_id].sda = (__u64) vcpu->arch.sie_block; - vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); - vcpu->arch.sie_block->scaol = (__u32)(__u64)sca & ~0x3fU; + sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK; vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; set_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn); } else { struct bsca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(sca); - sca->cpu[vcpu->vcpu_id].sda = (__u64) vcpu->arch.sie_block; - vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); - vcpu->arch.sie_block->scaol = (__u32)(__u64)sca; + sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys; set_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn); } read_unlock(&vcpu->kvm->arch.sca_lock); @@ -3396,6 +3440,7 @@ static int sca_switch_to_extended(struct kvm *kvm) struct kvm_vcpu *vcpu; unsigned long vcpu_idx; u32 scaol, scaoh; + phys_addr_t new_sca_phys; if (kvm->arch.use_esca) return 0; @@ -3404,8 +3449,9 @@ static int sca_switch_to_extended(struct kvm *kvm) if (!new_sca) return -ENOMEM; - scaoh = (u32)((u64)(new_sca) >> 32); - scaol = (u32)(u64)(new_sca) & ~0x3fU; + new_sca_phys = virt_to_phys(new_sca); + scaoh = new_sca_phys >> 32; + scaol = new_sca_phys & ESCA_SCAOL_MASK; kvm_s390_vcpu_block_all(kvm); write_lock(&kvm->arch.sca_lock); @@ -3625,15 +3671,18 @@ static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu) { - free_page(vcpu->arch.sie_block->cbrlo); + free_page((unsigned long)phys_to_virt(vcpu->arch.sie_block->cbrlo)); vcpu->arch.sie_block->cbrlo = 0; } int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu) { - vcpu->arch.sie_block->cbrlo = get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.sie_block->cbrlo) + void *cbrlo_page = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + + if (!cbrlo_page) return -ENOMEM; + + vcpu->arch.sie_block->cbrlo = virt_to_phys(cbrlo_page); return 0; } @@ -3643,7 +3692,7 @@ static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->ibc = model->ibc; if (test_kvm_facility(vcpu->kvm, 7)) - vcpu->arch.sie_block->fac = (u32)(u64) model->fac_list; + vcpu->arch.sie_block->fac = virt_to_phys(model->fac_list); } static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) @@ -3700,9 +3749,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u", vcpu->arch.sie_block->gd & 0x3, vcpu->vcpu_id); } - vcpu->arch.sie_block->sdnxo = ((unsigned long) &vcpu->run->s.regs.sdnx) - | SDNXC; - vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb; + vcpu->arch.sie_block->sdnxo = virt_to_phys(&vcpu->run->s.regs.sdnx) | SDNXC; + vcpu->arch.sie_block->riccbd = virt_to_phys(&vcpu->run->s.regs.riccb); if (sclp.has_kss) kvm_s390_set_cpuflags(vcpu, CPUSTAT_KSS); @@ -3752,7 +3800,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) return -ENOMEM; vcpu->arch.sie_block = &sie_page->sie_block; - vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb; + vcpu->arch.sie_block->itdba = virt_to_phys(&sie_page->itdb); /* the real guest size will always be smaller than msl */ vcpu->arch.sie_block->mso = 0; @@ -5169,6 +5217,7 @@ static long kvm_s390_vcpu_sida_op(struct kvm_vcpu *vcpu, struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; + void *sida_addr; int r = 0; if (mop->flags || !mop->size) @@ -5180,16 +5229,16 @@ static long kvm_s390_vcpu_sida_op(struct kvm_vcpu *vcpu, if (!kvm_s390_pv_cpu_is_protected(vcpu)) return -EINVAL; + sida_addr = (char *)sida_addr(vcpu->arch.sie_block) + mop->sida_offset; + switch (mop->op) { case KVM_S390_MEMOP_SIDA_READ: - if (copy_to_user(uaddr, (void *)(sida_origin(vcpu->arch.sie_block) + - mop->sida_offset), mop->size)) + if (copy_to_user(uaddr, sida_addr, mop->size)) r = -EFAULT; break; case KVM_S390_MEMOP_SIDA_WRITE: - if (copy_from_user((void *)(sida_origin(vcpu->arch.sie_block) + - mop->sida_offset), uaddr, mop->size)) + if (copy_from_user(sida_addr, uaddr, mop->size)) r = -EFAULT; break; } @@ -5567,6 +5616,11 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } +bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) +{ + return true; +} + /* Section: memory related */ int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 4755492dfabc..d48588c207d8 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -23,7 +23,8 @@ /* Transactional Memory Execution related macros */ #define IS_TE_ENABLED(vcpu) ((vcpu->arch.sie_block->ecb & ECB_TE)) #define TDB_FORMAT1 1 -#define IS_ITDB_VALID(vcpu) ((*(char *)vcpu->arch.sie_block->itdba == TDB_FORMAT1)) +#define IS_ITDB_VALID(vcpu) \ + ((*(char *)phys_to_virt((vcpu)->arch.sie_block->itdba) == TDB_FORMAT1)) extern debug_info_t *kvm_s390_dbf; extern debug_info_t *kvm_s390_dbf_uv; @@ -233,7 +234,7 @@ static inline unsigned long kvm_s390_get_gfn_end(struct kvm_memslots *slots) static inline u32 kvm_s390_get_gisa_desc(struct kvm *kvm) { - u32 gd = (u32)(u64)kvm->arch.gisa_int.origin; + u32 gd = virt_to_phys(kvm->arch.gisa_int.origin); if (gd && sclp.has_gisaf) gd |= GISA_FORMAT1; @@ -243,6 +244,9 @@ static inline u32 kvm_s390_get_gisa_desc(struct kvm *kvm) /* implemented in pv.c */ int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); +int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc); int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc); int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc); int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 3335fa09b6f1..9f8a192bd750 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -924,8 +924,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu) return -EREMOTE; } if (kvm_s390_pv_cpu_is_protected(vcpu)) { - memcpy((void *)sida_origin(vcpu->arch.sie_block), (void *)mem, - PAGE_SIZE); + memcpy(sida_addr(vcpu->arch.sie_block), (void *)mem, PAGE_SIZE); rc = 0; } else { rc = write_guest(vcpu, operand2, ar, (void *)mem, PAGE_SIZE); diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index 7cb7799a0acb..e032ebbf51b9 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -18,6 +18,29 @@ #include <linux/mmu_notifier.h> #include "kvm-s390.h" +/** + * struct pv_vm_to_be_destroyed - Represents a protected VM that needs to + * be destroyed + * + * @list: list head for the list of leftover VMs + * @old_gmap_table: the gmap table of the leftover protected VM + * @handle: the handle of the leftover protected VM + * @stor_var: pointer to the variable storage of the leftover protected VM + * @stor_base: address of the base storage of the leftover protected VM + * + * Represents a protected VM that is still registered with the Ultravisor, + * but which does not correspond any longer to an active KVM VM. It should + * be destroyed at some point later, either asynchronously or when the + * process terminates. + */ +struct pv_vm_to_be_destroyed { + struct list_head list; + unsigned long old_gmap_table; + u64 handle; + void *stor_var; + unsigned long stor_base; +}; + static void kvm_s390_clear_pv_state(struct kvm *kvm) { kvm->arch.pv.handle = 0; @@ -44,7 +67,7 @@ int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) free_pages(vcpu->arch.pv.stor_base, get_order(uv_info.guest_cpu_stor_len)); - free_page(sida_origin(vcpu->arch.sie_block)); + free_page((unsigned long)sida_addr(vcpu->arch.sie_block)); vcpu->arch.sie_block->pv_handle_cpu = 0; vcpu->arch.sie_block->pv_handle_config = 0; memset(&vcpu->arch.pv, 0, sizeof(vcpu->arch.pv)); @@ -66,6 +89,7 @@ int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) .header.cmd = UVC_CMD_CREATE_SEC_CPU, .header.len = sizeof(uvcb), }; + void *sida_addr; int cc; if (kvm_s390_pv_cpu_get_handle(vcpu)) @@ -79,16 +103,17 @@ int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) /* Input */ uvcb.guest_handle = kvm_s390_pv_get_handle(vcpu->kvm); uvcb.num = vcpu->arch.sie_block->icpua; - uvcb.state_origin = (u64)vcpu->arch.sie_block; - uvcb.stor_origin = (u64)vcpu->arch.pv.stor_base; + uvcb.state_origin = virt_to_phys(vcpu->arch.sie_block); + uvcb.stor_origin = virt_to_phys((void *)vcpu->arch.pv.stor_base); /* Alloc Secure Instruction Data Area Designation */ - vcpu->arch.sie_block->sidad = __get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!vcpu->arch.sie_block->sidad) { + sida_addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!sida_addr) { free_pages(vcpu->arch.pv.stor_base, get_order(uv_info.guest_cpu_stor_len)); return -ENOMEM; } + vcpu->arch.sie_block->sidad = virt_to_phys(sida_addr); cc = uv_call(0, (u64)&uvcb); *rc = uvcb.header.rc; @@ -159,23 +184,192 @@ out_err: return -ENOMEM; } -/* this should not fail, but if it does, we must not free the donated memory */ -int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +/** + * kvm_s390_pv_dispose_one_leftover - Clean up one leftover protected VM. + * @kvm: the KVM that was associated with this leftover protected VM + * @leftover: details about the leftover protected VM that needs a clean up + * @rc: the RC code of the Destroy Secure Configuration UVC + * @rrc: the RRC code of the Destroy Secure Configuration UVC + * + * Destroy one leftover protected VM. + * On success, kvm->mm->context.protected_count will be decremented atomically + * and all other resources used by the VM will be freed. + * + * Return: 0 in case of success, otherwise 1 + */ +static int kvm_s390_pv_dispose_one_leftover(struct kvm *kvm, + struct pv_vm_to_be_destroyed *leftover, + u16 *rc, u16 *rrc) { int cc; - cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), - UVC_CMD_DESTROY_SEC_CONF, rc, rrc); + /* It used the destroy-fast UVC, nothing left to do here */ + if (!leftover->handle) + goto done_fast; + cc = uv_cmd_nodata(leftover->handle, UVC_CMD_DESTROY_SEC_CONF, rc, rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY LEFTOVER VM: rc %x rrc %x", *rc, *rrc); + WARN_ONCE(cc, "protvirt destroy leftover vm failed rc %x rrc %x", *rc, *rrc); + if (cc) + return cc; + /* + * Intentionally leak unusable memory. If the UVC fails, the memory + * used for the VM and its metadata is permanently unusable. + * This can only happen in case of a serious KVM or hardware bug; it + * is not expected to happen in normal operation. + */ + free_pages(leftover->stor_base, get_order(uv_info.guest_base_stor_len)); + free_pages(leftover->old_gmap_table, CRST_ALLOC_ORDER); + vfree(leftover->stor_var); +done_fast: + atomic_dec(&kvm->mm->context.protected_count); + return 0; +} + +/** + * kvm_s390_destroy_lower_2g - Destroy the first 2GB of protected guest memory. + * @kvm: the VM whose memory is to be cleared. + * + * Destroy the first 2GB of guest memory, to avoid prefix issues after reboot. + * The CPUs of the protected VM need to be destroyed beforehand. + */ +static void kvm_s390_destroy_lower_2g(struct kvm *kvm) +{ + const unsigned long pages_2g = SZ_2G / PAGE_SIZE; + struct kvm_memory_slot *slot; + unsigned long len; + int srcu_idx; + + srcu_idx = srcu_read_lock(&kvm->srcu); + + /* Take the memslot containing guest absolute address 0 */ + slot = gfn_to_memslot(kvm, 0); + /* Clear all slots or parts thereof that are below 2GB */ + while (slot && slot->base_gfn < pages_2g) { + len = min_t(u64, slot->npages, pages_2g - slot->base_gfn) * PAGE_SIZE; + s390_uv_destroy_range(kvm->mm, slot->userspace_addr, slot->userspace_addr + len); + /* Take the next memslot */ + slot = gfn_to_memslot(kvm, slot->base_gfn + slot->npages); + } + + srcu_read_unlock(&kvm->srcu, srcu_idx); +} + +static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct uv_cb_destroy_fast uvcb = { + .header.cmd = UVC_CMD_DESTROY_SEC_CONF_FAST, + .header.len = sizeof(uvcb), + .handle = kvm_s390_pv_get_handle(kvm), + }; + int cc; + + cc = uv_call_sched(0, (u64)&uvcb); + if (rc) + *rc = uvcb.header.rc; + if (rrc) + *rrc = uvcb.header.rrc; WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM FAST: rc %x rrc %x", + uvcb.header.rc, uvcb.header.rrc); + WARN_ONCE(cc, "protvirt destroy vm fast failed handle %llx rc %x rrc %x", + kvm_s390_pv_get_handle(kvm), uvcb.header.rc, uvcb.header.rrc); + /* Inteded memory leak on "impossible" error */ + if (!cc) + kvm_s390_pv_dealloc_vm(kvm); + return cc ? -EIO : 0; +} + +static inline bool is_destroy_fast_available(void) +{ + return test_bit_inv(BIT_UVC_CMD_DESTROY_SEC_CONF_FAST, uv_info.inst_calls_list); +} + +/** + * kvm_s390_pv_set_aside - Set aside a protected VM for later teardown. + * @kvm: the VM + * @rc: return value for the RC field of the UVCB + * @rrc: return value for the RRC field of the UVCB + * + * Set aside the protected VM for a subsequent teardown. The VM will be able + * to continue immediately as a non-secure VM, and the information needed to + * properly tear down the protected VM is set aside. If another protected VM + * was already set aside without starting its teardown, this function will + * fail. + * The CPUs of the protected VM need to be destroyed beforehand. + * + * Context: kvm->lock needs to be held + * + * Return: 0 in case of success, -EINVAL if another protected VM was already set + * aside, -ENOMEM if the system ran out of memory. + */ +int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *priv; + int res = 0; + + lockdep_assert_held(&kvm->lock); /* - * if the mm still has a mapping, make all its pages accessible - * before destroying the guest + * If another protected VM was already prepared for teardown, refuse. + * A normal deinitialization has to be performed instead. */ - if (mmget_not_zero(kvm->mm)) { - s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE); - mmput(kvm->mm); + if (kvm->arch.pv.set_aside) + return -EINVAL; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + if (is_destroy_fast_available()) { + res = kvm_s390_pv_deinit_vm_fast(kvm, rc, rrc); + } else { + priv->stor_var = kvm->arch.pv.stor_var; + priv->stor_base = kvm->arch.pv.stor_base; + priv->handle = kvm_s390_pv_get_handle(kvm); + priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table; + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + if (s390_replace_asce(kvm->arch.gmap)) + res = -ENOMEM; } + if (res) { + kfree(priv); + return res; + } + + kvm_s390_destroy_lower_2g(kvm); + kvm_s390_clear_pv_state(kvm); + kvm->arch.pv.set_aside = priv; + + *rc = UVC_RC_EXECUTED; + *rrc = 42; + return 0; +} + +/** + * kvm_s390_pv_deinit_vm - Deinitialize the current protected VM + * @kvm: the KVM whose protected VM needs to be deinitialized + * @rc: the RC code of the UVC + * @rrc: the RRC code of the UVC + * + * Deinitialize the current protected VM. This function will destroy and + * cleanup the current protected VM, but it will not cleanup the guest + * memory. This function should only be called when the protected VM has + * just been created and therefore does not have any guest memory, or when + * the caller cleans up the guest memory separately. + * + * This function should not fail, but if it does, the donated memory must + * not be freed. + * + * Context: kvm->lock needs to be held + * + * Return: 0 in case of success, otherwise -EIO + */ +int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + int cc; + + cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_DESTROY_SEC_CONF, rc, rrc); + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); if (!cc) { atomic_dec(&kvm->mm->context.protected_count); kvm_s390_pv_dealloc_vm(kvm); @@ -189,11 +383,137 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) return cc ? -EIO : 0; } +/** + * kvm_s390_pv_deinit_cleanup_all - Clean up all protected VMs associated + * with a specific KVM. + * @kvm: the KVM to be cleaned up + * @rc: the RC code of the first failing UVC + * @rrc: the RRC code of the first failing UVC + * + * This function will clean up all protected VMs associated with a KVM. + * This includes the active one, the one prepared for deinitialization with + * kvm_s390_pv_set_aside, and any still pending in the need_cleanup list. + * + * Context: kvm->lock needs to be held unless being called from + * kvm_arch_destroy_vm. + * + * Return: 0 if all VMs are successfully cleaned up, otherwise -EIO + */ +int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *cur; + bool need_zap = false; + u16 _rc, _rrc; + int cc = 0; + + /* Make sure the counter does not reach 0 before calling s390_uv_destroy_range */ + atomic_inc(&kvm->mm->context.protected_count); + + *rc = 1; + /* If the current VM is protected, destroy it */ + if (kvm_s390_pv_get_handle(kvm)) { + cc = kvm_s390_pv_deinit_vm(kvm, rc, rrc); + need_zap = true; + } + + /* If a previous protected VM was set aside, put it in the need_cleanup list */ + if (kvm->arch.pv.set_aside) { + list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup); + kvm->arch.pv.set_aside = NULL; + } + + /* Cleanup all protected VMs in the need_cleanup list */ + while (!list_empty(&kvm->arch.pv.need_cleanup)) { + cur = list_first_entry(&kvm->arch.pv.need_cleanup, typeof(*cur), list); + need_zap = true; + if (kvm_s390_pv_dispose_one_leftover(kvm, cur, &_rc, &_rrc)) { + cc = 1; + /* + * Only return the first error rc and rrc, so make + * sure it is not overwritten. All destroys will + * additionally be reported via KVM_UV_EVENT(). + */ + if (*rc == UVC_RC_EXECUTED) { + *rc = _rc; + *rrc = _rrc; + } + } + list_del(&cur->list); + kfree(cur); + } + + /* + * If the mm still has a mapping, try to mark all its pages as + * accessible. The counter should not reach zero before this + * cleanup has been performed. + */ + if (need_zap && mmget_not_zero(kvm->mm)) { + s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE); + mmput(kvm->mm); + } + + /* Now the counter can safely reach 0 */ + atomic_dec(&kvm->mm->context.protected_count); + return cc ? -EIO : 0; +} + +/** + * kvm_s390_pv_deinit_aside_vm - Teardown a previously set aside protected VM. + * @kvm: the VM previously associated with the protected VM + * @rc: return value for the RC field of the UVCB + * @rrc: return value for the RRC field of the UVCB + * + * Tear down the protected VM that had been previously prepared for teardown + * using kvm_s390_pv_set_aside_vm. Ideally this should be called by + * userspace asynchronously from a separate thread. + * + * Context: kvm->lock must not be held. + * + * Return: 0 in case of success, -EINVAL if no protected VM had been + * prepared for asynchronous teardowm, -EIO in case of other errors. + */ +int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *p; + int ret = 0; + + lockdep_assert_not_held(&kvm->lock); + mutex_lock(&kvm->lock); + p = kvm->arch.pv.set_aside; + kvm->arch.pv.set_aside = NULL; + mutex_unlock(&kvm->lock); + if (!p) + return -EINVAL; + + /* When a fatal signal is received, stop immediately */ + if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX)) + goto done; + if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc)) + ret = -EIO; + kfree(p); + p = NULL; +done: + /* + * p is not NULL if we aborted because of a fatal signal, in which + * case queue the leftover for later cleanup. + */ + if (p) { + mutex_lock(&kvm->lock); + list_add(&p->list, &kvm->arch.pv.need_cleanup); + mutex_unlock(&kvm->lock); + /* Did not finish, but pretend things went well */ + *rc = UVC_RC_EXECUTED; + *rrc = 42; + } + return ret; +} + static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription, struct mm_struct *mm) { struct kvm *kvm = container_of(subscription, struct kvm, arch.pv.mmu_notifier); u16 dummy; + int r; /* * No locking is needed since this is the last thread of the last user of this @@ -202,7 +522,9 @@ static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription, * unregistered. This means that if this notifier runs, then the * struct kvm is still valid. */ - kvm_s390_cpus_from_pv(kvm, &dummy, &dummy); + r = kvm_s390_cpus_from_pv(kvm, &dummy, &dummy); + if (!r && is_destroy_fast_available() && kvm_s390_pv_get_handle(kvm)) + kvm_s390_pv_deinit_vm_fast(kvm, &dummy, &dummy); } static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = { @@ -226,8 +548,9 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */ uvcb.guest_stor_len = kvm->arch.pv.guest_len; uvcb.guest_asce = kvm->arch.gmap->asce; - uvcb.guest_sca = (unsigned long)kvm->arch.sca; - uvcb.conf_base_stor_origin = (u64)kvm->arch.pv.stor_base; + uvcb.guest_sca = virt_to_phys(kvm->arch.sca); + uvcb.conf_base_stor_origin = + virt_to_phys((void *)kvm->arch.pv.stor_base); uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var; cc = uv_call_sched(0, (u64)&uvcb); diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index ace2541ababd..b6a0219e470a 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -656,7 +656,7 @@ static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa) page = gfn_to_page(kvm, gpa_to_gfn(gpa)); if (is_error_page(page)) return -EINVAL; - *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK); + *hpa = (hpa_t)page_to_phys(page) + (gpa & ~PAGE_MASK); return 0; } @@ -871,7 +871,7 @@ static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, WARN_ON_ONCE(rc); return 1; } - vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa; + vsie_page->scb_o = phys_to_virt(hpa); return 0; } diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 8947451ae021..74e1d873dce0 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -72,7 +72,7 @@ static struct gmap *gmap_alloc(unsigned long limit) goto out_free; page->index = 0; list_add(&page->lru, &gmap->crst_list); - table = (unsigned long *) page_to_phys(page); + table = page_to_virt(page); crst_table_init(table, etype); gmap->table = table; gmap->asce = atype | _ASCE_TABLE_LENGTH | @@ -311,12 +311,12 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); if (!page) return -ENOMEM; - new = (unsigned long *) page_to_phys(page); + new = page_to_virt(page); crst_table_init(new, init); spin_lock(&gmap->guest_table_lock); if (*table & _REGION_ENTRY_INVALID) { list_add(&page->lru, &gmap->crst_list); - *table = (unsigned long) new | _REGION_ENTRY_LENGTH | + *table = __pa(new) | _REGION_ENTRY_LENGTH | (*table & _REGION_ENTRY_TYPE_MASK); page->index = gaddr; page = NULL; @@ -556,7 +556,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, gaddr & _REGION1_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; @@ -564,7 +564,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, gaddr & _REGION2_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; @@ -572,7 +572,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, gaddr & _REGION3_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; /* Walk the parent mm page table */ @@ -812,7 +812,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION2: table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; @@ -820,7 +820,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION3: table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; @@ -828,7 +828,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_SEGMENT: table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; @@ -836,7 +836,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); + table = __va(*table & _SEGMENT_ENTRY_ORIGIN); table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT; } return table; @@ -1149,7 +1149,7 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { address = pte_val(pte) & PAGE_MASK; address += gaddr & ~PAGE_MASK; - *val = *(unsigned long *) address; + *val = *(unsigned long *)__va(address); set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG))); /* Do *NOT* clear the _PAGE_INVALID bit! */ rc = 0; @@ -1334,7 +1334,8 @@ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) { - unsigned long sto, *ste, *pgt; + unsigned long *ste; + phys_addr_t sto, pgt; struct page *page; BUG_ON(!gmap_is_shadow(sg)); @@ -1342,13 +1343,13 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) return; gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1); - sto = (unsigned long) (ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); + sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); - pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN); + pgt = *ste & _SEGMENT_ENTRY_ORIGIN; *ste = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, pgt); + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ - page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); + page = phys_to_page(pgt); list_del(&page->lru); page_table_free_pgste(page); } @@ -1364,19 +1365,19 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, unsigned long *sgt) { - unsigned long *pgt; struct page *page; + phys_addr_t pgt; int i; BUG_ON(!gmap_is_shadow(sg)); for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) { if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) continue; - pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN); + pgt = sgt[i] & _REGION_ENTRY_ORIGIN; sgt[i] = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, pgt); + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ - page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); + page = phys_to_page(pgt); list_del(&page->lru); page_table_free_pgste(page); } @@ -1391,7 +1392,8 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) { - unsigned long r3o, *r3e, *sgt; + unsigned long r3o, *r3e; + phys_addr_t sgt; struct page *page; BUG_ON(!gmap_is_shadow(sg)); @@ -1400,12 +1402,12 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1); r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT)); - gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr); - sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr); + sgt = *r3e & _REGION_ENTRY_ORIGIN; *r3e = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, sgt); + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); /* Free segment table */ - page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); + page = phys_to_page(sgt); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1421,19 +1423,19 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, unsigned long *r3t) { - unsigned long *sgt; struct page *page; + phys_addr_t sgt; int i; BUG_ON(!gmap_is_shadow(sg)); for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) { if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) continue; - sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN); + sgt = r3t[i] & _REGION_ENTRY_ORIGIN; r3t[i] = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, sgt); + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); /* Free segment table */ - page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); + page = phys_to_page(sgt); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1448,7 +1450,8 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) { - unsigned long r2o, *r2e, *r3t; + unsigned long r2o, *r2e; + phys_addr_t r3t; struct page *page; BUG_ON(!gmap_is_shadow(sg)); @@ -1457,12 +1460,12 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1); r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT)); - gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr); - r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr); + r3t = *r2e & _REGION_ENTRY_ORIGIN; *r2e = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, r3t); + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); /* Free region 3 table */ - page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); + page = phys_to_page(r3t); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1478,7 +1481,7 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, unsigned long *r2t) { - unsigned long *r3t; + phys_addr_t r3t; struct page *page; int i; @@ -1486,11 +1489,11 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) { if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) continue; - r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN); + r3t = r2t[i] & _REGION_ENTRY_ORIGIN; r2t[i] = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, r3t); + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); /* Free region 3 table */ - page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); + page = phys_to_page(r3t); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1505,8 +1508,9 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) { - unsigned long r1o, *r1e, *r2t; + unsigned long r1o, *r1e; struct page *page; + phys_addr_t r2t; BUG_ON(!gmap_is_shadow(sg)); r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ @@ -1514,12 +1518,12 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1); r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT)); - gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr); - r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr); + r2t = *r1e & _REGION_ENTRY_ORIGIN; *r1e = _REGION1_ENTRY_EMPTY; - __gmap_unshadow_r2t(sg, raddr, r2t); + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); /* Free region 2 table */ - page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); + page = phys_to_page(r2t); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1535,22 +1539,23 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, unsigned long *r1t) { - unsigned long asce, *r2t; + unsigned long asce; struct page *page; + phys_addr_t r2t; int i; BUG_ON(!gmap_is_shadow(sg)); - asce = (unsigned long) r1t | _ASCE_TYPE_REGION1; + asce = __pa(r1t) | _ASCE_TYPE_REGION1; for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) { if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) continue; - r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN); - __gmap_unshadow_r2t(sg, raddr, r2t); + r2t = r1t[i] & _REGION_ENTRY_ORIGIN; + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); /* Clear entry and flush translation r1t -> r2t */ gmap_idte_one(asce, raddr); r1t[i] = _REGION1_ENTRY_EMPTY; /* Free region 2 table */ - page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); + page = phys_to_page(r2t); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1572,7 +1577,7 @@ static void gmap_unshadow(struct gmap *sg) sg->removed = 1; gmap_call_notifier(sg, 0, -1UL); gmap_flush_tlb(sg); - table = (unsigned long *)(sg->asce & _ASCE_ORIGIN); + table = __va(sg->asce & _ASCE_ORIGIN); switch (sg->asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: __gmap_unshadow_r1t(sg, 0, table); @@ -1747,7 +1752,8 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_r2t, *table; + unsigned long *table; + phys_addr_t s_r2t; struct page *page; int rc; @@ -1759,7 +1765,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, page->index = r2t & _REGION_ENTRY_ORIGIN; if (fake) page->index |= GMAP_SHADOW_FAKE_TABLE; - s_r2t = (unsigned long *) page_to_phys(page); + s_r2t = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ @@ -1774,9 +1780,9 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY); + crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH | + *table = s_r2t | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= (r2t & _REGION_ENTRY_PROTECT); @@ -1797,8 +1803,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 4); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_r2t) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1831,7 +1836,8 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_r3t, *table; + unsigned long *table; + phys_addr_t s_r3t; struct page *page; int rc; @@ -1843,7 +1849,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, page->index = r3t & _REGION_ENTRY_ORIGIN; if (fake) page->index |= GMAP_SHADOW_FAKE_TABLE; - s_r3t = (unsigned long *) page_to_phys(page); + s_r3t = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */ @@ -1858,9 +1864,9 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY); + crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH | + *table = s_r3t | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= (r3t & _REGION_ENTRY_PROTECT); @@ -1881,8 +1887,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 3); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_r3t) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1915,7 +1920,8 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_sgt, *table; + unsigned long *table; + phys_addr_t s_sgt; struct page *page; int rc; @@ -1927,7 +1933,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, page->index = sgt & _REGION_ENTRY_ORIGIN; if (fake) page->index |= GMAP_SHADOW_FAKE_TABLE; - s_sgt = (unsigned long *) page_to_phys(page); + s_sgt = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */ @@ -1942,9 +1948,9 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY); + crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH | + *table = s_sgt | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= sgt & _REGION_ENTRY_PROTECT; @@ -1965,8 +1971,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 2); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_sgt) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -2039,8 +2044,9 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, int fake) { unsigned long raddr, origin; - unsigned long *s_pgt, *table; + unsigned long *table; struct page *page; + phys_addr_t s_pgt; int rc; BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); @@ -2051,7 +2057,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, page->index = pgt & _SEGMENT_ENTRY_ORIGIN; if (fake) page->index |= GMAP_SHADOW_FAKE_TABLE; - s_pgt = (unsigned long *) page_to_phys(page); + s_pgt = page_to_phys(page); /* Install shadow page table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ @@ -2084,8 +2090,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 1); - if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != - (unsigned long) s_pgt) + if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_SEGMENT_ENTRY_INVALID; diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 1a25d456d865..30ab55f868f6 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -141,25 +141,25 @@ void mark_rodata_ro(void) debug_checkwx(); } -int set_memory_encrypted(unsigned long addr, int numpages) +int set_memory_encrypted(unsigned long vaddr, int numpages) { int i; /* make specified pages unshared, (swiotlb, dma_free) */ for (i = 0; i < numpages; ++i) { - uv_remove_shared(addr); - addr += PAGE_SIZE; + uv_remove_shared(virt_to_phys((void *)vaddr)); + vaddr += PAGE_SIZE; } return 0; } -int set_memory_decrypted(unsigned long addr, int numpages) +int set_memory_decrypted(unsigned long vaddr, int numpages) { int i; /* make specified pages shared (swiotlb, dma_alloca) */ for (i = 0; i < numpages; ++i) { - uv_set_shared(addr); - addr += PAGE_SIZE; + uv_set_shared(virt_to_phys((void *)vaddr)); + vaddr += PAGE_SIZE; } return 0; } diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 017baba56b01..1f21f576ca77 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1603,10 +1603,8 @@ clear_arch_lbr: * x86_perf_get_lbr - get the LBR records information * * @lbr: the caller's memory to store the LBR records information - * - * Returns: 0 indicates the LBR info has been successfully obtained */ -int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) +void x86_perf_get_lbr(struct x86_pmu_lbr *lbr) { int lbr_fmt = x86_pmu.intel_cap.lbr_format; @@ -1614,8 +1612,6 @@ int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) lbr->from = x86_pmu.lbr_from; lbr->to = x86_pmu.lbr_to; lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? x86_pmu.lbr_info : 0; - - return 0; } EXPORT_SYMBOL_GPL(x86_perf_get_lbr); diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 2dd2691b5ee1..61012476d66e 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -311,6 +311,9 @@ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ +#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* "" CMPccXADD instructions */ +#define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ +#define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 6d9368ea3701..08e822bd7aa6 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -61,6 +61,8 @@ #define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE BIT(10) /* Support for debug MSRs available */ #define HV_FEATURE_DEBUG_MSRS_AVAILABLE BIT(11) +/* Support for extended gva ranges for flush hypercalls available */ +#define HV_FEATURE_EXT_GVA_RANGES_FLUSH BIT(14) /* * Support for returning hypercall output block via XMM * registers is available @@ -607,6 +609,41 @@ struct hv_enlightened_vmcs { #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF +/* + * Note, Hyper-V isn't actually stealing bit 28 from Intel, just abusing it by + * pairing it with architecturally impossible exit reasons. Bit 28 is set only + * on SMI exits to a SMI transfer monitor (STM) and if and only if a MTF VM-Exit + * is pending. I.e. it will never be set by hardware for non-SMI exits (there + * are only three), nor will it ever be set unless the VMM is an STM. + */ +#define HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH 0x10000031 + +/* + * Hyper-V uses the software reserved 32 bytes in VMCB control area to expose + * SVM enlightenments to guests. + */ +struct hv_vmcb_enlightenments { + struct __packed hv_enlightenments_control { + u32 nested_flush_hypercall:1; + u32 msr_bitmap:1; + u32 enlightened_npt_tlb: 1; + u32 reserved:29; + } __packed hv_enlightenments_control; + u32 hv_vp_id; + u64 hv_vm_id; + u64 partition_assist_page; + u64 reserved; +} __packed; + +/* + * Hyper-V uses the software reserved clean bit in VMCB. + */ +#define HV_VMCB_NESTED_ENLIGHTENMENTS 31 + +/* Synthetic VM-Exit */ +#define HV_SVM_EXITCODE_ENL 0xf0000000 +#define HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH (1) + struct hv_partition_assist_pg { u32 tlb_lock_count; }; diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 82ba4a564e58..abccd51dcfca 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -110,10 +110,12 @@ KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt) KVM_X86_OP_OPTIONAL(set_hv_timer) KVM_X86_OP_OPTIONAL(cancel_hv_timer) KVM_X86_OP(setup_mce) +#ifdef CONFIG_KVM_SMM KVM_X86_OP(smi_allowed) KVM_X86_OP(enter_smm) KVM_X86_OP(leave_smm) KVM_X86_OP(enable_smi_window) +#endif KVM_X86_OP_OPTIONAL(mem_enc_ioctl) KVM_X86_OP_OPTIONAL(mem_enc_register_region) KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) @@ -123,7 +125,7 @@ KVM_X86_OP_OPTIONAL(guest_memory_reclaimed) KVM_X86_OP(get_msr_feature) KVM_X86_OP(can_emulate_instruction) KVM_X86_OP(apic_init_signal_blocked) -KVM_X86_OP_OPTIONAL(enable_direct_tlbflush) +KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush) KVM_X86_OP_OPTIONAL(migrate_timers) KVM_X86_OP(msr_filter_changed) KVM_X86_OP(complete_emulated_msr) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f05ebaa26f0f..f35f1ff4427b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -25,6 +25,7 @@ #include <linux/clocksource.h> #include <linux/irqbypass.h> #include <linux/hyperv.h> +#include <linux/kfifo.h> #include <asm/apic.h> #include <asm/pvclock-abi.h> @@ -81,7 +82,9 @@ #define KVM_REQ_NMI KVM_ARCH_REQ(9) #define KVM_REQ_PMU KVM_ARCH_REQ(10) #define KVM_REQ_PMI KVM_ARCH_REQ(11) +#ifdef CONFIG_KVM_SMM #define KVM_REQ_SMI KVM_ARCH_REQ(12) +#endif #define KVM_REQ_MASTERCLOCK_UPDATE KVM_ARCH_REQ(13) #define KVM_REQ_MCLOCK_INPROGRESS \ KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) @@ -108,6 +111,8 @@ KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_MMU_FREE_OBSOLETE_ROOTS \ KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_HV_TLB_FLUSH \ + KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -204,6 +209,7 @@ typedef enum exit_fastpath_completion fastpath_t; struct x86_emulate_ctxt; struct x86_exception; +union kvm_smram; enum x86_intercept; enum x86_intercept_stage; @@ -253,16 +259,16 @@ enum x86_intercept_stage; #define PFERR_GUEST_PAGE_BIT 33 #define PFERR_IMPLICIT_ACCESS_BIT 48 -#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) -#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) -#define PFERR_USER_MASK (1U << PFERR_USER_BIT) -#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) -#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) -#define PFERR_PK_MASK (1U << PFERR_PK_BIT) -#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT) -#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT) -#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT) -#define PFERR_IMPLICIT_ACCESS (1ULL << PFERR_IMPLICIT_ACCESS_BIT) +#define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT) +#define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT) +#define PFERR_USER_MASK BIT(PFERR_USER_BIT) +#define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT) +#define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT) +#define PFERR_PK_MASK BIT(PFERR_PK_BIT) +#define PFERR_SGX_MASK BIT(PFERR_SGX_BIT) +#define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT) +#define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT) +#define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT) #define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ PFERR_WRITE_MASK | \ @@ -488,17 +494,19 @@ enum pmc_type { struct kvm_pmc { enum pmc_type type; u8 idx; + bool is_paused; + bool intr; u64 counter; + u64 prev_counter; u64 eventsel; struct perf_event *perf_event; struct kvm_vcpu *vcpu; /* + * only for creating or reusing perf_event, * eventsel value for general purpose counters, * ctrl value for fixed counters. */ u64 current_config; - bool is_paused; - bool intr; }; /* More counters may conflict with other existing Architectural MSRs */ @@ -524,7 +532,16 @@ struct kvm_pmu { struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED]; struct irq_work irq_work; - DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX); + + /* + * Overlay the bitmap with a 64-bit atomic so that all bits can be + * set in a single access, e.g. to reprogram all counters when the PMU + * filter changes. + */ + union { + DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX); + atomic64_t __reprogram_pmi; + }; DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX); DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX); @@ -602,6 +619,29 @@ struct kvm_vcpu_hv_synic { bool dont_zero_synic_pages; }; +/* The maximum number of entries on the TLB flush fifo. */ +#define KVM_HV_TLB_FLUSH_FIFO_SIZE (16) +/* + * Note: the following 'magic' entry is made up by KVM to avoid putting + * anything besides GVA on the TLB flush fifo. It is theoretically possible + * to observe a request to flush 4095 PFNs starting from 0xfffffffffffff000 + * which will look identical. KVM's action to 'flush everything' instead of + * flushing these particular addresses is, however, fully legitimate as + * flushing more than requested is always OK. + */ +#define KVM_HV_TLB_FLUSHALL_ENTRY ((u64)-1) + +enum hv_tlb_flush_fifos { + HV_L1_TLB_FLUSH_FIFO, + HV_L2_TLB_FLUSH_FIFO, + HV_NR_TLB_FLUSH_FIFOS, +}; + +struct kvm_vcpu_hv_tlb_flush_fifo { + spinlock_t write_lock; + DECLARE_KFIFO(entries, u64, KVM_HV_TLB_FLUSH_FIFO_SIZE); +}; + /* Hyper-V per vcpu emulation context */ struct kvm_vcpu_hv { struct kvm_vcpu *vcpu; @@ -623,6 +663,19 @@ struct kvm_vcpu_hv { u32 nested_eax; /* HYPERV_CPUID_NESTED_FEATURES.EAX */ u32 nested_ebx; /* HYPERV_CPUID_NESTED_FEATURES.EBX */ } cpuid_cache; + + struct kvm_vcpu_hv_tlb_flush_fifo tlb_flush_fifo[HV_NR_TLB_FLUSH_FIFOS]; + + /* Preallocated buffer for handling hypercalls passing sparse vCPU set */ + u64 sparse_banks[HV_MAX_SPARSE_VCPU_BANKS]; + + struct hv_vp_assist_page vp_assist_page; + + struct { + u64 pa_page_gpa; + u64 vm_id; + u32 vp_id; + } nested; }; /* Xen HVM per vcpu emulation context */ @@ -633,6 +686,7 @@ struct kvm_vcpu_xen { struct gfn_to_pfn_cache vcpu_info_cache; struct gfn_to_pfn_cache vcpu_time_info_cache; struct gfn_to_pfn_cache runstate_cache; + struct gfn_to_pfn_cache runstate2_cache; u64 last_steal; u64 runstate_entry_time; u64 runstate_times[4]; @@ -1059,6 +1113,7 @@ struct msr_bitmap_range { struct kvm_xen { u32 xen_version; bool long_mode; + bool runstate_update_flag; u8 upcall_vector; struct gfn_to_pfn_cache shinfo_cache; struct idr evtchn_ports; @@ -1156,7 +1211,18 @@ struct kvm_arch { struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; struct list_head active_mmu_pages; struct list_head zapped_obsolete_pages; - struct list_head lpage_disallowed_mmu_pages; + /* + * A list of kvm_mmu_page structs that, if zapped, could possibly be + * replaced by an NX huge page. A shadow page is on this list if its + * existence disallows an NX huge page (nx_huge_page_disallowed is set) + * and there are no other conditions that prevent a huge page, e.g. + * the backing host page is huge, dirtly logging is not enabled for its + * memslot, etc... Note, zapping shadow pages on this list doesn't + * guarantee an NX huge page will be created in its stead, e.g. if the + * guest attempts to execute from the region then KVM obviously can't + * create an NX huge page (without hanging the guest). + */ + struct list_head possible_nx_huge_pages; struct kvm_page_track_notifier_node mmu_sp_tracker; struct kvm_page_track_notifier_head track_notifier_head; /* @@ -1272,7 +1338,7 @@ struct kvm_arch { bool sgx_provisioning_allowed; struct kvm_pmu_event_filter __rcu *pmu_event_filter; - struct task_struct *nx_lpage_recovery_thread; + struct task_struct *nx_huge_page_recovery_thread; #ifdef CONFIG_X86_64 /* @@ -1284,6 +1350,9 @@ struct kvm_arch { */ bool tdp_mmu_enabled; + /* The number of TDP MMU pages across all roots. */ + atomic64_t tdp_mmu_pages; + /* * List of kvm_mmu_page structs being used as roots. * All kvm_mmu_page structs in the list should have @@ -1305,20 +1374,12 @@ struct kvm_arch { struct list_head tdp_mmu_roots; /* - * List of kvm_mmu_page structs not being used as roots. - * All kvm_mmu_page structs in the list should have - * tdp_mmu_page set and a tdp_mmu_root_count of 0. - */ - struct list_head tdp_mmu_pages; - - /* * Protects accesses to the following fields when the MMU lock * is held in read mode: * - tdp_mmu_roots (above) - * - tdp_mmu_pages (above) * - the link field of kvm_mmu_page structs used by the TDP MMU - * - lpage_disallowed_mmu_pages - * - the lpage_disallowed_link field of kvm_mmu_page structs used + * - possible_nx_huge_pages; + * - the possible_nx_huge_page_link field of kvm_mmu_page structs used * by the TDP MMU * It is acceptable, but not necessary, to acquire this lock when * the thread holds the MMU lock in write mode. @@ -1612,10 +1673,12 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); +#ifdef CONFIG_KVM_SMM int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); - int (*enter_smm)(struct kvm_vcpu *vcpu, char *smstate); - int (*leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); + int (*enter_smm)(struct kvm_vcpu *vcpu, union kvm_smram *smram); + int (*leave_smm)(struct kvm_vcpu *vcpu, const union kvm_smram *smram); void (*enable_smi_window)(struct kvm_vcpu *vcpu); +#endif int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp); int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp); @@ -1630,7 +1693,7 @@ struct kvm_x86_ops { void *insn, int insn_len); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); - int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu); + int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu); void (*migrate_timers)(struct kvm_vcpu *vcpu); void (*msr_filter_changed)(struct kvm_vcpu *vcpu); @@ -1663,6 +1726,7 @@ struct kvm_x86_nested_ops { int (*enable_evmcs)(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu); + void (*hv_inject_synthetic_vmexit_post_tlb_flush)(struct kvm_vcpu *vcpu); }; struct kvm_x86_init_ops { @@ -1844,6 +1908,7 @@ int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu); int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); @@ -1909,8 +1974,6 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu); gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); -gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, @@ -1994,14 +2057,18 @@ enum { #define HF_NMI_MASK (1 << 3) #define HF_IRET_MASK (1 << 4) #define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ + +#ifdef CONFIG_KVM_SMM #define HF_SMM_MASK (1 << 6) #define HF_SMM_INSIDE_NMI_MASK (1 << 7) -#define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE -#define KVM_ADDRESS_SPACE_NUM 2 - -#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) -#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) +# define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE +# define KVM_ADDRESS_SPACE_NUM 2 +# define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) +# define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) +#else +# define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0) +#endif #define KVM_ARCH_WANT_MMU_NOTIFIER @@ -2089,14 +2156,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #endif } -#define put_smstate(type, buf, offset, val) \ - *(type *)((buf) + (offset) - 0x7e00) = val - -#define GET_SMSTATE(type, buf, offset) \ - (*(type *)((buf) + (offset) - 0x7e00)) - -int kvm_cpu_dirty_log_size(void); - int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); #define KVM_CLOCK_VALID_FLAGS \ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 9ac46dbe57d4..5d0f6891ae61 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -543,12 +543,12 @@ static inline void perf_check_microcode(void) { } #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data); -extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr); +extern void x86_perf_get_lbr(struct x86_pmu_lbr *lbr); #else struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data); -static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) +static inline void x86_perf_get_lbr(struct x86_pmu_lbr *lbr) { - return -1; + memset(lbr, 0, sizeof(*lbr)); } #endif diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 0361626841bc..cb1ee53ad3b1 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -5,6 +5,8 @@ #include <uapi/asm/svm.h> #include <uapi/asm/kvm.h> +#include <asm/hyperv-tlfs.h> + /* * 32-bit intercept words in the VMCB Control Area, starting * at Byte offset 000h. @@ -161,7 +163,10 @@ struct __attribute__ ((__packed__)) vmcb_control_area { * Offset 0x3e0, 32 bytes reserved * for use by hypervisor/software. */ - u8 reserved_sw[32]; + union { + struct hv_vmcb_enlightenments hv_enlightenments; + u8 reserved_sw[32]; + }; }; @@ -293,12 +298,13 @@ struct vmcb_save_area { struct vmcb_seg ldtr; struct vmcb_seg idtr; struct vmcb_seg tr; - u8 reserved_1[42]; + /* Reserved fields are named following their struct offset */ + u8 reserved_0xa0[42]; u8 vmpl; u8 cpl; - u8 reserved_2[4]; + u8 reserved_0xcc[4]; u64 efer; - u8 reserved_3[112]; + u8 reserved_0xd8[112]; u64 cr4; u64 cr3; u64 cr0; @@ -306,7 +312,7 @@ struct vmcb_save_area { u64 dr6; u64 rflags; u64 rip; - u8 reserved_4[88]; + u8 reserved_0x180[88]; u64 rsp; u64 s_cet; u64 ssp; @@ -321,14 +327,14 @@ struct vmcb_save_area { u64 sysenter_esp; u64 sysenter_eip; u64 cr2; - u8 reserved_5[32]; + u8 reserved_0x248[32]; u64 g_pat; u64 dbgctl; u64 br_from; u64 br_to; u64 last_excp_from; u64 last_excp_to; - u8 reserved_6[72]; + u8 reserved_0x298[72]; u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */ } __packed; @@ -349,12 +355,12 @@ struct sev_es_save_area { u64 vmpl2_ssp; u64 vmpl3_ssp; u64 u_cet; - u8 reserved_1[2]; + u8 reserved_0xc8[2]; u8 vmpl; u8 cpl; - u8 reserved_2[4]; + u8 reserved_0xcc[4]; u64 efer; - u8 reserved_3[104]; + u8 reserved_0xd8[104]; u64 xss; u64 cr4; u64 cr3; @@ -371,7 +377,7 @@ struct sev_es_save_area { u64 dr1_addr_mask; u64 dr2_addr_mask; u64 dr3_addr_mask; - u8 reserved_4[24]; + u8 reserved_0x1c0[24]; u64 rsp; u64 s_cet; u64 ssp; @@ -386,21 +392,21 @@ struct sev_es_save_area { u64 sysenter_esp; u64 sysenter_eip; u64 cr2; - u8 reserved_5[32]; + u8 reserved_0x248[32]; u64 g_pat; u64 dbgctl; u64 br_from; u64 br_to; u64 last_excp_from; u64 last_excp_to; - u8 reserved_7[80]; + u8 reserved_0x298[80]; u32 pkru; - u8 reserved_8[20]; - u64 reserved_9; /* rax already available at 0x01f8 */ + u32 tsc_aux; + u8 reserved_0x2f0[24]; u64 rcx; u64 rdx; u64 rbx; - u64 reserved_10; /* rsp already available at 0x01d8 */ + u64 reserved_0x320; /* rsp already available at 0x01d8 */ u64 rbp; u64 rsi; u64 rdi; @@ -412,7 +418,7 @@ struct sev_es_save_area { u64 r13; u64 r14; u64 r15; - u8 reserved_11[16]; + u8 reserved_0x380[16]; u64 guest_exit_info_1; u64 guest_exit_info_2; u64 guest_exit_int_info; @@ -425,7 +431,7 @@ struct sev_es_save_area { u64 pcpu_id; u64 event_inj; u64 xcr0; - u8 reserved_12[16]; + u8 reserved_0x3f0[16]; /* Floating point area */ u64 x87_dp; @@ -443,23 +449,23 @@ struct sev_es_save_area { } __packed; struct ghcb_save_area { - u8 reserved_1[203]; + u8 reserved_0x0[203]; u8 cpl; - u8 reserved_2[116]; + u8 reserved_0xcc[116]; u64 xss; - u8 reserved_3[24]; + u8 reserved_0x148[24]; u64 dr7; - u8 reserved_4[16]; + u8 reserved_0x168[16]; u64 rip; - u8 reserved_5[88]; + u8 reserved_0x180[88]; u64 rsp; - u8 reserved_6[24]; + u8 reserved_0x1e0[24]; u64 rax; - u8 reserved_7[264]; + u8 reserved_0x200[264]; u64 rcx; u64 rdx; u64 rbx; - u8 reserved_8[8]; + u8 reserved_0x320[8]; u64 rbp; u64 rsi; u64 rdi; @@ -471,12 +477,12 @@ struct ghcb_save_area { u64 r13; u64 r14; u64 r15; - u8 reserved_9[16]; + u8 reserved_0x380[16]; u64 sw_exit_code; u64 sw_exit_info_1; u64 sw_exit_info_2; u64 sw_scratch; - u8 reserved_10[56]; + u8 reserved_0x3b0[56]; u64 xcr0; u8 valid_bitmap[16]; u64 x87_state_gpa; @@ -490,7 +496,7 @@ struct ghcb { u8 shared_buffer[GHCB_SHARED_BUF_SIZE]; - u8 reserved_1[10]; + u8 reserved_0xff0[10]; u16 protocol_version; /* negotiated SEV-ES/GHCB protocol version */ u32 ghcb_usage; } __packed; @@ -502,6 +508,9 @@ struct ghcb { #define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024 #define EXPECTED_GHCB_SIZE PAGE_SIZE +#define BUILD_BUG_RESERVED_OFFSET(x, y) \ + ASSERT_STRUCT_OFFSET(struct x, reserved ## _ ## y, y) + static inline void __unused_size_checks(void) { BUILD_BUG_ON(sizeof(struct vmcb_save_area) != EXPECTED_VMCB_SAVE_AREA_SIZE); @@ -509,6 +518,39 @@ static inline void __unused_size_checks(void) BUILD_BUG_ON(sizeof(struct sev_es_save_area) != EXPECTED_SEV_ES_SAVE_AREA_SIZE); BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE); BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE); + + /* Check offsets of reserved fields */ + + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xa0); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xcc); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xd8); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x180); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x248); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x298); + + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xc8); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xcc); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xd8); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x1c0); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x248); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x298); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x2f0); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x320); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x380); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x3f0); + + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x0); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0xcc); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x148); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x168); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x180); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x1e0); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x200); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x320); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x380); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x3b0); + + BUILD_BUG_RESERVED_OFFSET(ghcb, 0xff0); } struct vmcb { diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 46de10a809ec..e48deab8901d 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -53,14 +53,6 @@ /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 -struct kvm_memory_alias { - __u32 slot; /* this has a different namespace than memory slots */ - __u32 flags; - __u64 guest_phys_addr; - __u64 memory_size; - __u64 target_phys_addr; -}; - /* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */ struct kvm_pic_state { __u8 last_irr; /* edge detection */ @@ -214,6 +206,8 @@ struct kvm_msr_list { struct kvm_msr_filter_range { #define KVM_MSR_FILTER_READ (1 << 0) #define KVM_MSR_FILTER_WRITE (1 << 1) +#define KVM_MSR_FILTER_RANGE_VALID_MASK (KVM_MSR_FILTER_READ | \ + KVM_MSR_FILTER_WRITE) __u32 flags; __u32 nmsrs; /* number of msrs in bitmap */ __u32 base; /* MSR index the bitmap starts at */ @@ -222,8 +216,11 @@ struct kvm_msr_filter_range { #define KVM_MSR_FILTER_MAX_RANGES 16 struct kvm_msr_filter { +#ifndef __KERNEL__ #define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0) +#endif #define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0) +#define KVM_MSR_FILTER_VALID_MASK (KVM_MSR_FILTER_DEFAULT_DENY) __u32 flags; struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES]; }; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 4d053cb2c48a..1cceac5984da 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -349,7 +349,7 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) static void kvm_guest_cpu_init(void) { if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { - u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); + u64 pa; WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled)); diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 67be7f217e37..fbeaa9ddef59 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -118,6 +118,17 @@ config KVM_AMD_SEV Provides support for launching Encrypted VMs (SEV) and Encrypted VMs with Encrypted State (SEV-ES) on AMD processors. +config KVM_SMM + bool "System Management Mode emulation" + default y + depends on KVM + help + Provides support for KVM to emulate System Management Mode (SMM) + in virtual machines. This can be used by the virtual machine + firmware to implement UEFI secure boot. + + If unsure, say Y. + config KVM_XEN bool "Support for Xen hypercall interface" depends on KVM diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f453a0f96e24..80e3fe184d17 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -20,12 +20,14 @@ endif kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o kvm-$(CONFIG_KVM_XEN) += xen.o +kvm-$(CONFIG_KVM_SMM) += smm.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ - vmx/evmcs.o vmx/nested.o vmx/posted_intr.o + vmx/hyperv.o vmx/nested.o vmx/posted_intr.o kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o -kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o +kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o \ + svm/sev.o svm/hyperv.o ifdef CONFIG_HYPERV kvm-amd-y += svm/svm_onhyperv.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c92c49a0b35b..b14653b61470 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -62,10 +62,16 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted) * This one is tied to SSB in the user API, and not * visible in /proc/cpuinfo. */ -#define KVM_X86_FEATURE_PSFD (13*32+28) /* Predictive Store Forwarding Disable */ +#define KVM_X86_FEATURE_AMD_PSFD (13*32+28) /* Predictive Store Forwarding Disable */ #define F feature_bit -#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0) + +/* Scattered Flag - For features that are scattered by cpufeatures.h. */ +#define SF(name) \ +({ \ + BUILD_BUG_ON(X86_FEATURE_##name >= MAX_CPU_FEATURES); \ + (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0); \ +}) /* * Magic value used by KVM when querying userspace-provided CPUID entries and @@ -543,9 +549,9 @@ static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf) } static __always_inline -void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask) +void kvm_cpu_cap_init_kvm_defined(enum kvm_only_cpuid_leafs leaf, u32 mask) { - /* Use kvm_cpu_cap_mask for non-scattered leafs. */ + /* Use kvm_cpu_cap_mask for leafs that aren't KVM-only. */ BUILD_BUG_ON(leaf < NCAPINTS); kvm_cpu_caps[leaf] = mask; @@ -555,7 +561,7 @@ void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask) static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) { - /* Use kvm_cpu_cap_init_scattered for scattered leafs. */ + /* Use kvm_cpu_cap_init_kvm_defined for KVM-only leafs. */ BUILD_BUG_ON(leaf >= NCAPINTS); kvm_cpu_caps[leaf] &= mask; @@ -657,14 +663,19 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); kvm_cpu_cap_mask(CPUID_7_1_EAX, - F(AVX_VNNI) | F(AVX512_BF16) + F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) | F(AMX_FP16) | + F(AVX_IFMA) + ); + + kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, + F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) ); kvm_cpu_cap_mask(CPUID_D_1_EAX, F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd ); - kvm_cpu_cap_init_scattered(CPUID_12_EAX, + kvm_cpu_cap_init_kvm_defined(CPUID_12_EAX, SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA) ); @@ -694,7 +705,7 @@ void kvm_set_cpu_caps(void) F(CLZERO) | F(XSAVEERPTR) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) | - __feature_bit(KVM_X86_FEATURE_PSFD) + __feature_bit(KVM_X86_FEATURE_AMD_PSFD) ); /* @@ -913,9 +924,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) goto out; cpuid_entry_override(entry, CPUID_7_1_EAX); + cpuid_entry_override(entry, CPUID_7_1_EDX); entry->ebx = 0; entry->ecx = 0; - entry->edx = 0; } break; case 0xa: { /* Architectural Performance Monitoring */ @@ -1220,8 +1231,12 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) * Other defined bits are for MSRs that KVM does not expose: * EAX 3 SPCL, SMM page configuration lock * EAX 13 PCMSR, Prefetch control MSR + * + * KVM doesn't support SMM_CTL. + * EAX 9 SMM_CTL MSR is not supported */ entry->eax &= BIT(0) | BIT(2) | BIT(6); + entry->eax |= BIT(9); if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)) entry->eax |= BIT(2); if (!static_cpu_has_bug(X86_BUG_NULL_SEG)) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4a43261d25a2..5cc3efa0e21c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -242,37 +242,6 @@ enum x86_transfer_type { X86_TRANSFER_TASK_SWITCH, }; -static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) -{ - if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) - nr &= NR_EMULATOR_GPRS - 1; - - if (!(ctxt->regs_valid & (1 << nr))) { - ctxt->regs_valid |= 1 << nr; - ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr); - } - return ctxt->_regs[nr]; -} - -static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr) -{ - if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) - nr &= NR_EMULATOR_GPRS - 1; - - BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS); - BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS); - - ctxt->regs_valid |= 1 << nr; - ctxt->regs_dirty |= 1 << nr; - return &ctxt->_regs[nr]; -} - -static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr) -{ - reg_read(ctxt, nr); - return reg_write(ctxt, nr); -} - static void writeback_registers(struct x86_emulate_ctxt *ctxt) { unsigned long dirty = ctxt->regs_dirty; @@ -2338,335 +2307,15 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt) return rc; } -static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) -{ -#ifdef CONFIG_X86_64 - return ctxt->ops->guest_has_long_mode(ctxt); -#else - return false; -#endif -} - -static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags) -{ - desc->g = (flags >> 23) & 1; - desc->d = (flags >> 22) & 1; - desc->l = (flags >> 21) & 1; - desc->avl = (flags >> 20) & 1; - desc->p = (flags >> 15) & 1; - desc->dpl = (flags >> 13) & 3; - desc->s = (flags >> 12) & 1; - desc->type = (flags >> 8) & 15; -} - -static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, const char *smstate, - int n) -{ - struct desc_struct desc; - int offset; - u16 selector; - - selector = GET_SMSTATE(u32, smstate, 0x7fa8 + n * 4); - - if (n < 3) - offset = 0x7f84 + n * 12; - else - offset = 0x7f2c + (n - 3) * 12; - - set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8)); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, offset)); - ctxt->ops->set_segment(ctxt, selector, &desc, 0, n); - return X86EMUL_CONTINUE; -} - -#ifdef CONFIG_X86_64 -static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate, - int n) -{ - struct desc_struct desc; - int offset; - u16 selector; - u32 base3; - - offset = 0x7e00 + n * 16; - - selector = GET_SMSTATE(u16, smstate, offset); - rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smstate, offset + 2) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4)); - set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8)); - base3 = GET_SMSTATE(u32, smstate, offset + 12); - - ctxt->ops->set_segment(ctxt, selector, &desc, base3, n); - return X86EMUL_CONTINUE; -} -#endif - -static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, - u64 cr0, u64 cr3, u64 cr4) -{ - int bad; - u64 pcid; - - /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */ - pcid = 0; - if (cr4 & X86_CR4_PCIDE) { - pcid = cr3 & 0xfff; - cr3 &= ~0xfff; - } - - bad = ctxt->ops->set_cr(ctxt, 3, cr3); - if (bad) - return X86EMUL_UNHANDLEABLE; - - /* - * First enable PAE, long mode needs it before CR0.PG = 1 is set. - * Then enable protected mode. However, PCID cannot be enabled - * if EFER.LMA=0, so set it separately. - */ - bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); - if (bad) - return X86EMUL_UNHANDLEABLE; - - bad = ctxt->ops->set_cr(ctxt, 0, cr0); - if (bad) - return X86EMUL_UNHANDLEABLE; - - if (cr4 & X86_CR4_PCIDE) { - bad = ctxt->ops->set_cr(ctxt, 4, cr4); - if (bad) - return X86EMUL_UNHANDLEABLE; - if (pcid) { - bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid); - if (bad) - return X86EMUL_UNHANDLEABLE; - } - - } - - return X86EMUL_CONTINUE; -} - -static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, - const char *smstate) -{ - struct desc_struct desc; - struct desc_ptr dt; - u16 selector; - u32 val, cr0, cr3, cr4; - int i; - - cr0 = GET_SMSTATE(u32, smstate, 0x7ffc); - cr3 = GET_SMSTATE(u32, smstate, 0x7ff8); - ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED; - ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0); - - for (i = 0; i < 8; i++) - *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4); - - val = GET_SMSTATE(u32, smstate, 0x7fcc); - - if (ctxt->ops->set_dr(ctxt, 6, val)) - return X86EMUL_UNHANDLEABLE; - - val = GET_SMSTATE(u32, smstate, 0x7fc8); - - if (ctxt->ops->set_dr(ctxt, 7, val)) - return X86EMUL_UNHANDLEABLE; - - selector = GET_SMSTATE(u32, smstate, 0x7fc4); - set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f64)); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f60)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f5c)); - ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR); - - selector = GET_SMSTATE(u32, smstate, 0x7fc0); - set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f80)); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f7c)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f78)); - ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR); - - dt.address = GET_SMSTATE(u32, smstate, 0x7f74); - dt.size = GET_SMSTATE(u32, smstate, 0x7f70); - ctxt->ops->set_gdt(ctxt, &dt); - - dt.address = GET_SMSTATE(u32, smstate, 0x7f58); - dt.size = GET_SMSTATE(u32, smstate, 0x7f54); - ctxt->ops->set_idt(ctxt, &dt); - - for (i = 0; i < 6; i++) { - int r = rsm_load_seg_32(ctxt, smstate, i); - if (r != X86EMUL_CONTINUE) - return r; - } - - cr4 = GET_SMSTATE(u32, smstate, 0x7f14); - - ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7ef8)); - - return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); -} - -#ifdef CONFIG_X86_64 -static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, - const char *smstate) -{ - struct desc_struct desc; - struct desc_ptr dt; - u64 val, cr0, cr3, cr4; - u32 base3; - u16 selector; - int i, r; - - for (i = 0; i < 16; i++) - *reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8); - - ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78); - ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED; - - val = GET_SMSTATE(u64, smstate, 0x7f68); - - if (ctxt->ops->set_dr(ctxt, 6, val)) - return X86EMUL_UNHANDLEABLE; - - val = GET_SMSTATE(u64, smstate, 0x7f60); - - if (ctxt->ops->set_dr(ctxt, 7, val)) - return X86EMUL_UNHANDLEABLE; - - cr0 = GET_SMSTATE(u64, smstate, 0x7f58); - cr3 = GET_SMSTATE(u64, smstate, 0x7f50); - cr4 = GET_SMSTATE(u64, smstate, 0x7f48); - ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7f00)); - val = GET_SMSTATE(u64, smstate, 0x7ed0); - - if (ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA)) - return X86EMUL_UNHANDLEABLE; - - selector = GET_SMSTATE(u32, smstate, 0x7e90); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e92) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e94)); - set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e98)); - base3 = GET_SMSTATE(u32, smstate, 0x7e9c); - ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR); - - dt.size = GET_SMSTATE(u32, smstate, 0x7e84); - dt.address = GET_SMSTATE(u64, smstate, 0x7e88); - ctxt->ops->set_idt(ctxt, &dt); - - selector = GET_SMSTATE(u32, smstate, 0x7e70); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e72) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e74)); - set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e78)); - base3 = GET_SMSTATE(u32, smstate, 0x7e7c); - ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR); - - dt.size = GET_SMSTATE(u32, smstate, 0x7e64); - dt.address = GET_SMSTATE(u64, smstate, 0x7e68); - ctxt->ops->set_gdt(ctxt, &dt); - - r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); - if (r != X86EMUL_CONTINUE) - return r; - - for (i = 0; i < 6; i++) { - r = rsm_load_seg_64(ctxt, smstate, i); - if (r != X86EMUL_CONTINUE) - return r; - } - - return X86EMUL_CONTINUE; -} -#endif - static int em_rsm(struct x86_emulate_ctxt *ctxt) { - unsigned long cr0, cr4, efer; - char buf[512]; - u64 smbase; - int ret; - if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0) return emulate_ud(ctxt); - smbase = ctxt->ops->get_smbase(ctxt); - - ret = ctxt->ops->read_phys(ctxt, smbase + 0xfe00, buf, sizeof(buf)); - if (ret != X86EMUL_CONTINUE) - return X86EMUL_UNHANDLEABLE; - - if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) - ctxt->ops->set_nmi_mask(ctxt, false); - - ctxt->ops->exiting_smm(ctxt); - - /* - * Get back to real mode, to prepare a safe state in which to load - * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU - * supports long mode. - */ - if (emulator_has_longmode(ctxt)) { - struct desc_struct cs_desc; - - /* Zero CR4.PCIDE before CR0.PG. */ - cr4 = ctxt->ops->get_cr(ctxt, 4); - if (cr4 & X86_CR4_PCIDE) - ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); - - /* A 32-bit code segment is required to clear EFER.LMA. */ - memset(&cs_desc, 0, sizeof(cs_desc)); - cs_desc.type = 0xb; - cs_desc.s = cs_desc.g = cs_desc.p = 1; - ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS); - } - - /* For the 64-bit case, this will clear EFER.LMA. */ - cr0 = ctxt->ops->get_cr(ctxt, 0); - if (cr0 & X86_CR0_PE) - ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); - - if (emulator_has_longmode(ctxt)) { - /* Clear CR4.PAE before clearing EFER.LME. */ - cr4 = ctxt->ops->get_cr(ctxt, 4); - if (cr4 & X86_CR4_PAE) - ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); - - /* And finally go back to 32-bit mode. */ - efer = 0; - ctxt->ops->set_msr(ctxt, MSR_EFER, efer); - } - - /* - * Give leave_smm() a chance to make ISA-specific changes to the vCPU - * state (e.g. enter guest mode) before loading state from the SMM - * state-save area. - */ - if (ctxt->ops->leave_smm(ctxt, buf)) - goto emulate_shutdown; - -#ifdef CONFIG_X86_64 - if (emulator_has_longmode(ctxt)) - ret = rsm_load_state_64(ctxt, buf); - else -#endif - ret = rsm_load_state_32(ctxt, buf); - - if (ret != X86EMUL_CONTINUE) - goto emulate_shutdown; + if (ctxt->ops->leave_smm(ctxt)) + ctxt->ops->triple_fault(ctxt); - /* - * Note, the ctxt->ops callbacks are responsible for handling side - * effects when writing MSRs and CRs, e.g. MMU context resets, CPUID - * runtime updates, etc... If that changes, e.g. this flow is moved - * out of the emulator to make it look more like enter_smm(), then - * those side effects need to be explicitly handled for both success - * and shutdown. - */ return emulator_recalc_and_set_mode(ctxt); - -emulate_shutdown: - ctxt->ops->triple_fault(ctxt); - return X86EMUL_CONTINUE; } static void diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 0adf4a437e85..2c7f2a26421e 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -23,22 +23,25 @@ #include "ioapic.h" #include "cpuid.h" #include "hyperv.h" +#include "mmu.h" #include "xen.h" #include <linux/cpu.h> #include <linux/kvm_host.h> #include <linux/highmem.h> #include <linux/sched/cputime.h> +#include <linux/spinlock.h> #include <linux/eventfd.h> #include <asm/apicdef.h> +#include <asm/mshyperv.h> #include <trace/events/kvm.h> #include "trace.h" #include "irq.h" #include "fpu.h" -#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64) +#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, HV_VCPUS_PER_SPARSE_BANK) static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, bool vcpu_kick); @@ -897,13 +900,15 @@ bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_hv_assist_page_enabled); -bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu, - struct hv_vp_assist_page *assist_page) +int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu) { - if (!kvm_hv_assist_page_enabled(vcpu)) - return false; - return !kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, - assist_page, sizeof(*assist_page)); + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + if (!hv_vcpu || !kvm_hv_assist_page_enabled(vcpu)) + return -EFAULT; + + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, + &hv_vcpu->vp_assist_page, sizeof(struct hv_vp_assist_page)); } EXPORT_SYMBOL_GPL(kvm_hv_get_assist_page); @@ -954,6 +959,11 @@ int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) hv_vcpu->vp_index = vcpu->vcpu_idx; + for (i = 0; i < HV_NR_TLB_FLUSH_FIFOS; i++) { + INIT_KFIFO(hv_vcpu->tlb_flush_fifo[i].entries); + spin_lock_init(&hv_vcpu->tlb_flush_fifo[i].write_lock); + } + return 0; } @@ -1736,6 +1746,28 @@ static void sparse_set_to_vcpu_mask(struct kvm *kvm, u64 *sparse_banks, } } +static bool hv_is_vp_in_sparse_set(u32 vp_id, u64 valid_bank_mask, u64 sparse_banks[]) +{ + int valid_bit_nr = vp_id / HV_VCPUS_PER_SPARSE_BANK; + unsigned long sbank; + + if (!test_bit(valid_bit_nr, (unsigned long *)&valid_bank_mask)) + return false; + + /* + * The index into the sparse bank is the number of preceding bits in + * the valid mask. Optimize for VMs with <64 vCPUs by skipping the + * fancy math if there can't possibly be preceding bits. + */ + if (valid_bit_nr) + sbank = hweight64(valid_bank_mask & GENMASK_ULL(valid_bit_nr - 1, 0)); + else + sbank = 0; + + return test_bit(vp_id % HV_VCPUS_PER_SPARSE_BANK, + (unsigned long *)&sparse_banks[sbank]); +} + struct kvm_hv_hcall { u64 param; u64 ingpa; @@ -1749,57 +1781,173 @@ struct kvm_hv_hcall { sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS]; }; -static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc, - int consumed_xmm_halves, - u64 *sparse_banks, gpa_t offset) -{ - u16 var_cnt; - int i; - - if (hc->var_cnt > 64) - return -EINVAL; - /* Ignore banks that cannot possibly contain a legal VP index. */ - var_cnt = min_t(u16, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS); +static int kvm_hv_get_hc_data(struct kvm *kvm, struct kvm_hv_hcall *hc, + u16 orig_cnt, u16 cnt_cap, u64 *data, + int consumed_xmm_halves, gpa_t offset) +{ + /* + * Preserve the original count when ignoring entries via a "cap", KVM + * still needs to validate the guest input (though the non-XMM path + * punts on the checks). + */ + u16 cnt = min(orig_cnt, cnt_cap); + int i, j; if (hc->fast) { /* * Each XMM holds two sparse banks, but do not count halves that * have already been consumed for hypercall parameters. */ - if (hc->var_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - consumed_xmm_halves) + if (orig_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - consumed_xmm_halves) return HV_STATUS_INVALID_HYPERCALL_INPUT; - for (i = 0; i < var_cnt; i++) { - int j = i + consumed_xmm_halves; + + for (i = 0; i < cnt; i++) { + j = i + consumed_xmm_halves; if (j % 2) - sparse_banks[i] = sse128_hi(hc->xmm[j / 2]); + data[i] = sse128_hi(hc->xmm[j / 2]); else - sparse_banks[i] = sse128_lo(hc->xmm[j / 2]); + data[i] = sse128_lo(hc->xmm[j / 2]); } return 0; } - return kvm_read_guest(kvm, hc->ingpa + offset, sparse_banks, - var_cnt * sizeof(*sparse_banks)); + return kvm_read_guest(kvm, hc->ingpa + offset, data, + cnt * sizeof(*data)); +} + +static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc, + u64 *sparse_banks, int consumed_xmm_halves, + gpa_t offset) +{ + if (hc->var_cnt > HV_MAX_SPARSE_VCPU_BANKS) + return -EINVAL; + + /* Cap var_cnt to ignore banks that cannot contain a legal VP index. */ + return kvm_hv_get_hc_data(kvm, hc, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS, + sparse_banks, consumed_xmm_halves, offset); +} + +static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 entries[], + int consumed_xmm_halves, gpa_t offset) +{ + return kvm_hv_get_hc_data(kvm, hc, hc->rep_cnt, hc->rep_cnt, + entries, consumed_xmm_halves, offset); +} + +static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu, + struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo, + u64 *entries, int count) +{ + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + u64 flush_all_entry = KVM_HV_TLB_FLUSHALL_ENTRY; + + if (!hv_vcpu) + return; + + spin_lock(&tlb_flush_fifo->write_lock); + + /* + * All entries should fit on the fifo leaving one free for 'flush all' + * entry in case another request comes in. In case there's not enough + * space, just put 'flush all' entry there. + */ + if (count && entries && count < kfifo_avail(&tlb_flush_fifo->entries)) { + WARN_ON(kfifo_in(&tlb_flush_fifo->entries, entries, count) != count); + goto out_unlock; + } + + /* + * Note: full fifo always contains 'flush all' entry, no need to check the + * return value. + */ + kfifo_in(&tlb_flush_fifo->entries, &flush_all_entry, 1); + +out_unlock: + spin_unlock(&tlb_flush_fifo->write_lock); +} + +int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + u64 entries[KVM_HV_TLB_FLUSH_FIFO_SIZE]; + int i, j, count; + gva_t gva; + + if (!tdp_enabled || !hv_vcpu) + return -EINVAL; + + tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu)); + + count = kfifo_out(&tlb_flush_fifo->entries, entries, KVM_HV_TLB_FLUSH_FIFO_SIZE); + + for (i = 0; i < count; i++) { + if (entries[i] == KVM_HV_TLB_FLUSHALL_ENTRY) + goto out_flush_all; + + /* + * Lower 12 bits of 'address' encode the number of additional + * pages to flush. + */ + gva = entries[i] & PAGE_MASK; + for (j = 0; j < (entries[i] & ~PAGE_MASK) + 1; j++) + static_call(kvm_x86_flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE); + + ++vcpu->stat.tlb_flush; + } + return 0; + +out_flush_all: + kfifo_reset_out(&tlb_flush_fifo->entries); + + /* Fall back to full flush. */ + return -ENOSPC; } static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + u64 *sparse_banks = hv_vcpu->sparse_banks; struct kvm *kvm = vcpu->kvm; struct hv_tlb_flush_ex flush_ex; struct hv_tlb_flush flush; DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); + struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo; + /* + * Normally, there can be no more than 'KVM_HV_TLB_FLUSH_FIFO_SIZE' + * entries on the TLB flush fifo. The last entry, however, needs to be + * always left free for 'flush all' entry which gets placed when + * there is not enough space to put all the requested entries. + */ + u64 __tlb_flush_entries[KVM_HV_TLB_FLUSH_FIFO_SIZE - 1]; + u64 *tlb_flush_entries; u64 valid_bank_mask; - u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; + struct kvm_vcpu *v; + unsigned long i; bool all_cpus; + int consumed_xmm_halves = 0; + gpa_t data_offset; /* - * The Hyper-V TLFS doesn't allow more than 64 sparse banks, e.g. the - * valid mask is a u64. Fail the build if KVM's max allowed number of - * vCPUs (>4096) would exceed this limit, KVM will additional changes - * for Hyper-V support to avoid setting the guest up to fail. + * The Hyper-V TLFS doesn't allow more than HV_MAX_SPARSE_VCPU_BANKS + * sparse banks. Fail the build if KVM's max allowed number of + * vCPUs (>4096) exceeds this limit. */ - BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > 64); + BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > HV_MAX_SPARSE_VCPU_BANKS); + + /* + * 'Slow' hypercall's first parameter is the address in guest's memory + * where hypercall parameters are placed. This is either a GPA or a + * nested GPA when KVM is handling the call from L2 ('direct' TLB + * flush). Translate the address here so the memory can be uniformly + * read with kvm_read_guest(). + */ + if (!hc->fast && is_guest_mode(vcpu)) { + hc->ingpa = translate_nested_gpa(vcpu, hc->ingpa, 0, NULL); + if (unlikely(hc->ingpa == INVALID_GPA)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST || hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE) { @@ -1807,14 +1955,17 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) flush.address_space = hc->ingpa; flush.flags = hc->outgpa; flush.processor_mask = sse128_lo(hc->xmm[0]); + consumed_xmm_halves = 1; } else { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush, sizeof(flush)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; + data_offset = sizeof(flush); } trace_kvm_hv_flush_tlb(flush.processor_mask, - flush.address_space, flush.flags); + flush.address_space, flush.flags, + is_guest_mode(vcpu)); valid_bank_mask = BIT_ULL(0); sparse_banks[0] = flush.processor_mask; @@ -1834,16 +1985,18 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) flush_ex.flags = hc->outgpa; memcpy(&flush_ex.hv_vp_set, &hc->xmm[0], sizeof(hc->xmm[0])); + consumed_xmm_halves = 2; } else { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex, sizeof(flush_ex)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; + data_offset = sizeof(flush_ex); } trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask, flush_ex.hv_vp_set.format, flush_ex.address_space, - flush_ex.flags); + flush_ex.flags, is_guest_mode(vcpu)); valid_bank_mask = flush_ex.hv_vp_set.valid_bank_mask; all_cpus = flush_ex.hv_vp_set.format != @@ -1852,29 +2005,95 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) if (hc->var_cnt != hweight64(valid_bank_mask)) return HV_STATUS_INVALID_HYPERCALL_INPUT; - if (all_cpus) - goto do_flush; + if (!all_cpus) { + if (!hc->var_cnt) + goto ret_success; - if (!hc->var_cnt) - goto ret_success; + if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks, + consumed_xmm_halves, data_offset)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } - if (kvm_get_sparse_vp_set(kvm, hc, 2, sparse_banks, - offsetof(struct hv_tlb_flush_ex, - hv_vp_set.bank_contents))) + /* + * Hyper-V TLFS doesn't explicitly forbid non-empty sparse vCPU + * banks (and, thus, non-zero 'var_cnt') for the 'all vCPUs' + * case (HV_GENERIC_SET_ALL). Always adjust data_offset and + * consumed_xmm_halves to make sure TLB flush entries are read + * from the correct offset. + */ + data_offset += hc->var_cnt * sizeof(sparse_banks[0]); + consumed_xmm_halves += hc->var_cnt; + } + + if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE || + hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX || + hc->rep_cnt > ARRAY_SIZE(__tlb_flush_entries)) { + tlb_flush_entries = NULL; + } else { + if (kvm_hv_get_tlb_flush_entries(kvm, hc, __tlb_flush_entries, + consumed_xmm_halves, data_offset)) return HV_STATUS_INVALID_HYPERCALL_INPUT; + tlb_flush_entries = __tlb_flush_entries; } -do_flush: /* * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't * analyze it here, flush TLB regardless of the specified address space. */ - if (all_cpus) { - kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST); - } else { + if (all_cpus && !is_guest_mode(vcpu)) { + kvm_for_each_vcpu(i, v, kvm) { + tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false); + hv_tlb_flush_enqueue(v, tlb_flush_fifo, + tlb_flush_entries, hc->rep_cnt); + } + + kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH); + } else if (!is_guest_mode(vcpu)) { sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask); - kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, vcpu_mask); + for_each_set_bit(i, vcpu_mask, KVM_MAX_VCPUS) { + v = kvm_get_vcpu(kvm, i); + if (!v) + continue; + tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false); + hv_tlb_flush_enqueue(v, tlb_flush_fifo, + tlb_flush_entries, hc->rep_cnt); + } + + kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask); + } else { + struct kvm_vcpu_hv *hv_v; + + bitmap_zero(vcpu_mask, KVM_MAX_VCPUS); + + kvm_for_each_vcpu(i, v, kvm) { + hv_v = to_hv_vcpu(v); + + /* + * The following check races with nested vCPUs entering/exiting + * and/or migrating between L1's vCPUs, however the only case when + * KVM *must* flush the TLB is when the target L2 vCPU keeps + * running on the same L1 vCPU from the moment of the request until + * kvm_hv_flush_tlb() returns. TLB is fully flushed in all other + * cases, e.g. when the target L2 vCPU migrates to a different L1 + * vCPU or when the corresponding L1 vCPU temporary switches to a + * different L2 vCPU while the request is being processed. + */ + if (!hv_v || hv_v->nested.vm_id != hv_vcpu->nested.vm_id) + continue; + + if (!all_cpus && + !hv_is_vp_in_sparse_set(hv_v->nested.vp_id, valid_bank_mask, + sparse_banks)) + continue; + + __set_bit(i, vcpu_mask); + tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, true); + hv_tlb_flush_enqueue(v, tlb_flush_fifo, + tlb_flush_entries, hc->rep_cnt); + } + + kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask); } ret_success: @@ -1883,8 +2102,8 @@ ret_success: ((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET); } -static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector, - unsigned long *vcpu_bitmap) +static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector, + u64 *sparse_banks, u64 valid_bank_mask) { struct kvm_lapic_irq irq = { .delivery_mode = APIC_DM_FIXED, @@ -1894,7 +2113,9 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector, unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) { - if (vcpu_bitmap && !test_bit(i, vcpu_bitmap)) + if (sparse_banks && + !hv_is_vp_in_sparse_set(kvm_hv_get_vpindex(vcpu), + valid_bank_mask, sparse_banks)) continue; /* We fail only when APIC is disabled */ @@ -1904,12 +2125,12 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector, static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + u64 *sparse_banks = hv_vcpu->sparse_banks; struct kvm *kvm = vcpu->kvm; struct hv_send_ipi_ex send_ipi_ex; struct hv_send_ipi send_ipi; - DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); u64 valid_bank_mask; - u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; u32 vector; bool all_cpus; @@ -1959,7 +2180,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) if (!hc->var_cnt) goto ret_success; - if (kvm_get_sparse_vp_set(kvm, hc, 1, sparse_banks, + if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks, 1, offsetof(struct hv_send_ipi_ex, vp_set.bank_contents))) return HV_STATUS_INVALID_HYPERCALL_INPUT; @@ -1969,13 +2190,10 @@ check_and_send_ipi: if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) return HV_STATUS_INVALID_HYPERCALL_INPUT; - if (all_cpus) { - kvm_send_ipi_to_many(kvm, vector, NULL); - } else { - sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask); - - kvm_send_ipi_to_many(kvm, vector, vcpu_mask); - } + if (all_cpus) + kvm_hv_send_ipi_to_many(kvm, vector, NULL, 0); + else + kvm_hv_send_ipi_to_many(kvm, vector, sparse_banks, valid_bank_mask); ret_success: return HV_STATUS_SUCCESS; @@ -2062,10 +2280,25 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result) { + u32 tlb_lock_count = 0; + int ret; + + if (hv_result_success(result) && is_guest_mode(vcpu) && + kvm_hv_is_tlb_flush_hcall(vcpu) && + kvm_read_guest(vcpu->kvm, to_hv_vcpu(vcpu)->nested.pa_page_gpa, + &tlb_lock_count, sizeof(tlb_lock_count))) + result = HV_STATUS_INVALID_HYPERCALL_INPUT; + trace_kvm_hv_hypercall_done(result); kvm_hv_hypercall_set_result(vcpu, result); ++vcpu->stat.hypercalls; - return kvm_skip_emulated_instruction(vcpu); + + ret = kvm_skip_emulated_instruction(vcpu); + + if (tlb_lock_count) + kvm_x86_ops.nested_ops->hv_inject_synthetic_vmexit_post_tlb_flush(vcpu); + + return ret; } static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) @@ -2502,6 +2735,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->ebx |= HV_DEBUGGING; ent->edx |= HV_X64_GUEST_DEBUGGING_AVAILABLE; ent->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE; + ent->edx |= HV_FEATURE_EXT_GVA_RANGES_FLUSH; /* * Direct Synthetic timers only make sense with in-kernel @@ -2545,6 +2779,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, case HYPERV_CPUID_NESTED_FEATURES: ent->eax = evmcs_ver; + ent->eax |= HV_X64_NESTED_DIRECT_FLUSH; ent->eax |= HV_X64_NESTED_MSR_BITMAP; ent->ebx |= HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL; break; diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 1030b1b50552..9f96414a31c5 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -22,6 +22,7 @@ #define __ARCH_X86_KVM_HYPERV_H__ #include <linux/kvm_host.h> +#include "x86.h" /* "Hv#1" signature */ #define HYPERV_CPUID_SIGNATURE_EAX 0x31237648 @@ -107,8 +108,7 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages); void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu); bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu); -bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu, - struct hv_vp_assist_page *assist_page); +int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu); static inline struct kvm_vcpu_hv_stimer *to_hv_stimer(struct kvm_vcpu *vcpu, int timer_index) @@ -151,4 +151,64 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args); int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); +static inline struct kvm_vcpu_hv_tlb_flush_fifo *kvm_hv_get_tlb_flush_fifo(struct kvm_vcpu *vcpu, + bool is_guest_mode) +{ + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + int i = is_guest_mode ? HV_L2_TLB_FLUSH_FIFO : + HV_L1_TLB_FLUSH_FIFO; + + return &hv_vcpu->tlb_flush_fifo[i]; +} + +static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo; + + if (!to_hv_vcpu(vcpu) || !kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) + return; + + tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu)); + + kfifo_reset_out(&tlb_flush_fifo->entries); +} + +static inline bool guest_hv_cpuid_has_l2_tlb_flush(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + return hv_vcpu && + (hv_vcpu->cpuid_cache.nested_eax & HV_X64_NESTED_DIRECT_FLUSH); +} + +static inline bool kvm_hv_is_tlb_flush_hcall(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + u16 code; + + if (!hv_vcpu) + return false; + + code = is_64_bit_hypercall(vcpu) ? kvm_rcx_read(vcpu) : + kvm_rax_read(vcpu); + + return (code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE || + code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST || + code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX || + code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX); +} + +static inline int kvm_hv_verify_vp_assist(struct kvm_vcpu *vcpu) +{ + if (!to_hv_vcpu(vcpu)) + return 0; + + if (!kvm_hv_assist_page_enabled(vcpu)) + return 0; + + return kvm_hv_get_assist_page(vcpu); +} + +int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu); + #endif diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index f371f1292ca3..a70952eca905 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -31,7 +31,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) return r; } -EXPORT_SYMBOL(kvm_cpu_has_pending_timer); /* * check if there is a pending userspace external interrupt @@ -150,7 +149,6 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) if (kvm_xen_timer_enabled(vcpu)) kvm_xen_inject_timer_irqs(vcpu); } -EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); void __kvm_migrate_timers(struct kvm_vcpu *vcpu) { @@ -165,3 +163,8 @@ bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args) return resample ? irqchip_kernel(kvm) : irqchip_in_kernel(kvm); } + +bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm); +} diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 3febc342360c..c09174f73a34 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -200,9 +200,4 @@ static inline bool is_guest_mode(struct kvm_vcpu *vcpu) return vcpu->arch.hflags & HF_GUEST_MASK; } -static inline bool is_smm(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.hflags & HF_SMM_MASK; -} - #endif diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 89246446d6aa..2d9662be8333 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -117,16 +117,6 @@ struct x86_emulate_ops { struct x86_exception *fault, bool system); /* - * read_phys: Read bytes of standard (non-emulated/special) memory. - * Used for descriptor reading. - * @addr: [IN ] Physical address from which to read. - * @val: [OUT] Value read from memory. - * @bytes: [IN ] Number of bytes to read from memory. - */ - int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr, - void *val, unsigned int bytes); - - /* * write_std: Write bytes of standard (non-emulated/special) memory. * Used for descriptor writing. * @addr: [IN ] Linear address to which to write. @@ -209,11 +199,8 @@ struct x86_emulate_ops { int (*cpl)(struct x86_emulate_ctxt *ctxt); void (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); - u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt); - void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase); int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); - int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc); int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata); @@ -234,8 +221,7 @@ struct x86_emulate_ops { void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); - void (*exiting_smm)(struct x86_emulate_ctxt *ctxt); - int (*leave_smm)(struct x86_emulate_ctxt *ctxt, const char *smstate); + int (*leave_smm)(struct x86_emulate_ctxt *ctxt); void (*triple_fault)(struct x86_emulate_ctxt *ctxt); int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); }; @@ -292,7 +278,6 @@ enum x86emul_mode { /* These match some of the HF_* flags defined in kvm_host.h */ #define X86EMUL_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ #define X86EMUL_SMM_MASK (1 << 6) -#define X86EMUL_SMM_INSIDE_NMI_MASK (1 << 7) /* * fastop functions are declared as taking a never-defined fastop parameter, @@ -526,4 +511,35 @@ void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt); void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt); bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt); +static inline ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) +{ + if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) + nr &= NR_EMULATOR_GPRS - 1; + + if (!(ctxt->regs_valid & (1 << nr))) { + ctxt->regs_valid |= 1 << nr; + ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr); + } + return ctxt->_regs[nr]; +} + +static inline ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr) +{ + if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) + nr &= NR_EMULATOR_GPRS - 1; + + BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS); + BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS); + + ctxt->regs_valid |= 1 << nr; + ctxt->regs_dirty |= 1 << nr; + return &ctxt->_regs[nr]; +} + +static inline ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr) +{ + reg_read(ctxt, nr); + return reg_write(ctxt, nr); +} + #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d7639d126e6c..4efdb4a4d72c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -42,6 +42,7 @@ #include "x86.h" #include "cpuid.h" #include "hyperv.h" +#include "smm.h" #ifndef CONFIG_X86_64 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) @@ -159,7 +160,6 @@ bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu) && !(kvm_mwait_in_guest(vcpu->kvm) || kvm_can_post_timer_interrupt(vcpu)); } -EXPORT_SYMBOL_GPL(kvm_can_use_hv_timer); static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu) { @@ -1170,9 +1170,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; case APIC_DM_SMI: - result = 1; - kvm_make_request(KVM_REQ_SMI, vcpu); - kvm_vcpu_kick(vcpu); + if (!kvm_inject_smi(vcpu)) { + kvm_vcpu_kick(vcpu); + result = 1; + } break; case APIC_DM_NMI: @@ -1912,7 +1913,6 @@ bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) return vcpu->arch.apic->lapic_timer.hv_timer_in_use; } -EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); static void cancel_hv_timer(struct kvm_lapic *apic) { @@ -2430,7 +2430,6 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) apic->isr_count = count_vectors(apic->regs + APIC_ISR); } } -EXPORT_SYMBOL_GPL(kvm_apic_update_apicv); void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) { @@ -2722,8 +2721,6 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); } - } else { - kvm_lapic_xapic_id_updated(vcpu->arch.apic); } return 0; @@ -2759,6 +2756,9 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) } memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s)); + if (!apic_x2apic_mode(apic)) + kvm_lapic_xapic_id_updated(apic); + atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); kvm_recalculate_apic_map(vcpu->kvm); kvm_apic_set_version(vcpu); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index a5ac4a5a5179..28e3769066e2 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -7,7 +7,7 @@ #include <linux/kvm_host.h> #include "hyperv.h" -#include "kvm_cache_regs.h" +#include "smm.h" #define KVM_APIC_INIT 0 #define KVM_APIC_SIPI 1 diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b6f96d47e596..835426254e76 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -22,6 +22,7 @@ #include "tdp_mmu.h" #include "x86.h" #include "kvm_cache_regs.h" +#include "smm.h" #include "kvm_emulate.h" #include "cpuid.h" #include "spte.h" @@ -802,15 +803,31 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); } -void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - if (sp->lpage_disallowed) + /* + * If it's possible to replace the shadow page with an NX huge page, + * i.e. if the shadow page is the only thing currently preventing KVM + * from using a huge page, add the shadow page to the list of "to be + * zapped for NX recovery" pages. Note, the shadow page can already be + * on the list if KVM is reusing an existing shadow page, i.e. if KVM + * links a shadow page at multiple points. + */ + if (!list_empty(&sp->possible_nx_huge_page_link)) return; ++kvm->stat.nx_lpage_splits; - list_add_tail(&sp->lpage_disallowed_link, - &kvm->arch.lpage_disallowed_mmu_pages); - sp->lpage_disallowed = true; + list_add_tail(&sp->possible_nx_huge_page_link, + &kvm->arch.possible_nx_huge_pages); +} + +static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, + bool nx_huge_page_possible) +{ + sp->nx_huge_page_disallowed = true; + + if (nx_huge_page_possible) + track_possible_nx_huge_page(kvm, sp); } static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -830,11 +847,20 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_allow_lpage(slot, gfn); } -void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { + if (list_empty(&sp->possible_nx_huge_page_link)) + return; + --kvm->stat.nx_lpage_splits; - sp->lpage_disallowed = false; - list_del(&sp->lpage_disallowed_link); + list_del_init(&sp->possible_nx_huge_page_link); +} + +static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + sp->nx_huge_page_disallowed = false; + + untrack_possible_nx_huge_page(kvm, sp); } static struct kvm_memory_slot * @@ -1645,7 +1671,7 @@ static int is_empty_shadow_page(u64 *spt) u64 *pos; u64 *end; - for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) + for (pos = spt, end = pos + SPTE_ENT_PER_PAGE; pos != end; pos++) if (is_shadow_present_pte(*pos)) { printk(KERN_ERR "%s: %p %llx\n", __func__, pos, *pos); @@ -1793,7 +1819,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, continue; } - child = to_shadow_page(ent & SPTE_BASE_ADDR_MASK); + child = spte_to_child_sp(ent); if (child->unsync_children) { if (mmu_pages_add(pvec, child, i)) @@ -1894,7 +1920,7 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) if (sp->role.invalid) return true; - /* TDP MMU pages due not use the MMU generation. */ + /* TDP MMU pages do not use the MMU generation. */ return !sp->tdp_mmu_page && unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); } @@ -2129,6 +2155,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + INIT_LIST_HEAD(&sp->possible_nx_huge_page_link); + /* * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() * depends on valid pages being added to the head of the list. See @@ -2350,7 +2378,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, * so we should update the spte at this point to get * a new sp with the correct access. */ - child = to_shadow_page(*sptep & SPTE_BASE_ADDR_MASK); + child = spte_to_child_sp(*sptep); if (child->role.access == direct_access) return; @@ -2371,7 +2399,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, if (is_last_spte(pte, sp->role.level)) { drop_spte(kvm, spte); } else { - child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK); + child = spte_to_child_sp(pte); drop_parent_pte(child, spte); /* @@ -2487,8 +2515,8 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, zapped_root = !is_obsolete_sp(kvm, sp); } - if (sp->lpage_disallowed) - unaccount_huge_nx_page(kvm, sp); + if (sp->nx_huge_page_disallowed) + unaccount_nx_huge_page(kvm, sp); sp->role.invalid = 1; @@ -2811,7 +2839,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, struct kvm_mmu_page *child; u64 pte = *sptep; - child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK); + child = spte_to_child_sp(pte); drop_parent_pte(child, sptep); flush = true; } else if (pfn != spte_to_pfn(*sptep)) { @@ -3085,7 +3113,8 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_ if (cur_level > PG_LEVEL_4K && cur_level == fault->goal_level && is_shadow_present_pte(spte) && - !is_large_pte(spte)) { + !is_large_pte(spte) && + spte_to_child_sp(spte)->nx_huge_page_disallowed) { /* * A small SPTE exists for this pfn, but FNAME(fetch) * and __direct_map would like to create a large PTE @@ -3127,9 +3156,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) continue; link_shadow_page(vcpu, it.sptep, sp); - if (fault->is_tdp && fault->huge_page_disallowed && - fault->req_level >= it.level) - account_huge_nx_page(vcpu->kvm, sp); + if (fault->huge_page_disallowed) + account_nx_huge_page(vcpu->kvm, sp, + fault->req_level >= it.level); } if (WARN_ON_ONCE(it.level != fault->goal_level)) @@ -3149,8 +3178,13 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct * send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk); } -static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) +static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) { + if (is_sigpending_pfn(pfn)) { + kvm_handle_signal_exit(vcpu); + return -EINTR; + } + /* * Do not cache the mmio info caused by writing the readonly gfn * into the spte otherwise read access on readonly gfn also can @@ -3172,7 +3206,7 @@ static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fau { /* The pfn is invalid, report the error! */ if (unlikely(is_error_pfn(fault->pfn))) - return kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn); + return kvm_handle_error_pfn(vcpu, fault->gfn, fault->pfn); if (unlikely(!fault->slot)) { gva_t gva = fault->is_tdp ? 0 : fault->addr; @@ -3423,7 +3457,11 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, if (!VALID_PAGE(*root_hpa)) return; - sp = to_shadow_page(*root_hpa & SPTE_BASE_ADDR_MASK); + /* + * The "root" may be a special root, e.g. a PAE entry, treat it as a + * SPTE to ensure any non-PA bits are dropped. + */ + sp = spte_to_child_sp(*root_hpa); if (WARN_ON(!sp)) return; @@ -3908,8 +3946,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->arch.mmu->pae_root[i]; if (IS_VALID_PAE_ROOT(root)) { - root &= SPTE_BASE_ADDR_MASK; - sp = to_shadow_page(root); + sp = spte_to_child_sp(root); mmu_sync_children(vcpu, sp, true); } } @@ -4170,7 +4207,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) } async = false; - fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async, + fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async, fault->write, &fault->map_writable, &fault->hva); if (!async) @@ -4187,7 +4224,12 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) } } - fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL, + /* + * Allow gup to bail on pending non-fatal signals when it's also allowed + * to wait for IO. Note, gup always bails if it is unable to quickly + * get a page and a fatal signal, i.e. SIGKILL, is pending. + */ + fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL, fault->write, &fault->map_writable, &fault->hva); return RET_PF_CONTINUE; @@ -5972,7 +6014,7 @@ int kvm_mmu_init_vm(struct kvm *kvm) INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); - INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); r = kvm_mmu_init_tdp_mmu(kvm); @@ -6657,7 +6699,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) kvm_mmu_zap_all_fast(kvm); mutex_unlock(&kvm->slots_lock); - wake_up_process(kvm->arch.nx_lpage_recovery_thread); + wake_up_process(kvm->arch.nx_huge_page_recovery_thread); } mutex_unlock(&kvm_lock); } @@ -6789,7 +6831,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) - wake_up_process(kvm->arch.nx_lpage_recovery_thread); + wake_up_process(kvm->arch.nx_huge_page_recovery_thread); mutex_unlock(&kvm_lock); } @@ -6797,9 +6839,10 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel return err; } -static void kvm_recover_nx_lpages(struct kvm *kvm) +static void kvm_recover_nx_huge_pages(struct kvm *kvm) { unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits; + struct kvm_memory_slot *slot; int rcu_idx; struct kvm_mmu_page *sp; unsigned int ratio; @@ -6820,24 +6863,55 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) ratio = READ_ONCE(nx_huge_pages_recovery_ratio); to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0; for ( ; to_zap; --to_zap) { - if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) + if (list_empty(&kvm->arch.possible_nx_huge_pages)) break; /* * We use a separate list instead of just using active_mmu_pages - * because the number of lpage_disallowed pages is expected to - * be relatively small compared to the total. + * because the number of shadow pages that be replaced with an + * NX huge page is expected to be relatively small compared to + * the total number of shadow pages. And because the TDP MMU + * doesn't use active_mmu_pages. */ - sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages, + sp = list_first_entry(&kvm->arch.possible_nx_huge_pages, struct kvm_mmu_page, - lpage_disallowed_link); - WARN_ON_ONCE(!sp->lpage_disallowed); - if (is_tdp_mmu_page(sp)) { + possible_nx_huge_page_link); + WARN_ON_ONCE(!sp->nx_huge_page_disallowed); + WARN_ON_ONCE(!sp->role.direct); + + /* + * Unaccount and do not attempt to recover any NX Huge Pages + * that are being dirty tracked, as they would just be faulted + * back in as 4KiB pages. The NX Huge Pages in this slot will be + * recovered, along with all the other huge pages in the slot, + * when dirty logging is disabled. + * + * Since gfn_to_memslot() is relatively expensive, it helps to + * skip it if it the test cannot possibly return true. On the + * other hand, if any memslot has logging enabled, chances are + * good that all of them do, in which case unaccount_nx_huge_page() + * is much cheaper than zapping the page. + * + * If a memslot update is in progress, reading an incorrect value + * of kvm->nr_memslots_dirty_logging is not a problem: if it is + * becoming zero, gfn_to_memslot() will be done unnecessarily; if + * it is becoming nonzero, the page will be zapped unnecessarily. + * Either way, this only affects efficiency in racy situations, + * and not correctness. + */ + slot = NULL; + if (atomic_read(&kvm->nr_memslots_dirty_logging)) { + slot = gfn_to_memslot(kvm, sp->gfn); + WARN_ON_ONCE(!slot); + } + + if (slot && kvm_slot_dirty_track_enabled(slot)) + unaccount_nx_huge_page(kvm, sp); + else if (is_tdp_mmu_page(sp)) flush |= kvm_tdp_mmu_zap_sp(kvm, sp); - } else { + else kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - WARN_ON_ONCE(sp->lpage_disallowed); - } + WARN_ON_ONCE(sp->nx_huge_page_disallowed); if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); @@ -6857,7 +6931,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, rcu_idx); } -static long get_nx_lpage_recovery_timeout(u64 start_time) +static long get_nx_huge_page_recovery_timeout(u64 start_time) { bool enabled; uint period; @@ -6868,19 +6942,19 @@ static long get_nx_lpage_recovery_timeout(u64 start_time) : MAX_SCHEDULE_TIMEOUT; } -static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) +static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data) { u64 start_time; long remaining_time; while (true) { start_time = get_jiffies_64(); - remaining_time = get_nx_lpage_recovery_timeout(start_time); + remaining_time = get_nx_huge_page_recovery_timeout(start_time); set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop() && remaining_time > 0) { schedule_timeout(remaining_time); - remaining_time = get_nx_lpage_recovery_timeout(start_time); + remaining_time = get_nx_huge_page_recovery_timeout(start_time); set_current_state(TASK_INTERRUPTIBLE); } @@ -6889,7 +6963,7 @@ static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) if (kthread_should_stop()) return 0; - kvm_recover_nx_lpages(kvm); + kvm_recover_nx_huge_pages(kvm); } } @@ -6897,17 +6971,17 @@ int kvm_mmu_post_init_vm(struct kvm *kvm) { int err; - err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0, + err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0, "kvm-nx-lpage-recovery", - &kvm->arch.nx_lpage_recovery_thread); + &kvm->arch.nx_huge_page_recovery_thread); if (!err) - kthread_unpark(kvm->arch.nx_lpage_recovery_thread); + kthread_unpark(kvm->arch.nx_huge_page_recovery_thread); return err; } void kvm_mmu_pre_destroy_vm(struct kvm *kvm) { - if (kvm->arch.nx_lpage_recovery_thread) - kthread_stop(kvm->arch.nx_lpage_recovery_thread); + if (kvm->arch.nx_huge_page_recovery_thread) + kthread_stop(kvm->arch.nx_huge_page_recovery_thread); } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 582def531d4d..dbaf6755c5a7 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -57,7 +57,13 @@ struct kvm_mmu_page { bool tdp_mmu_page; bool unsync; u8 mmu_valid_gen; - bool lpage_disallowed; /* Can't be replaced by an equiv large page */ + + /* + * The shadow page can't be replaced by an equivalent huge page + * because it is being used to map an executable page in the guest + * and the NX huge page mitigation is enabled. + */ + bool nx_huge_page_disallowed; /* * The following two entries are used to key the shadow page in the @@ -100,7 +106,14 @@ struct kvm_mmu_page { }; }; - struct list_head lpage_disallowed_link; + /* + * Tracks shadow pages that, if zapped, would allow KVM to create an NX + * huge page. A shadow page will have nx_huge_page_disallowed set but + * not be on the list if a huge page is disallowed for other reasons, + * e.g. because KVM is shadowing a PTE at the same gfn, the memslot + * isn't properly aligned, etc... + */ + struct list_head possible_nx_huge_page_link; #ifdef CONFIG_X86_32 /* * Used out of the mmu-lock to avoid reading spte values while an @@ -120,18 +133,6 @@ struct kvm_mmu_page { extern struct kmem_cache *mmu_page_header_cache; -static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page) -{ - struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); - - return (struct kvm_mmu_page *)page_private(page); -} - -static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) -{ - return to_shadow_page(__pa(sptep)); -} - static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role) { return role.smm ? 1 : 0; @@ -315,7 +316,7 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); -void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); -void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); +void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); +void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); #endif /* __KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 5ab5f94dcb6f..0f6455072055 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -713,9 +713,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, continue; link_shadow_page(vcpu, it.sptep, sp); - if (fault->huge_page_disallowed && - fault->req_level >= it.level) - account_huge_nx_page(vcpu->kvm, sp); + if (fault->huge_page_disallowed) + account_nx_huge_page(vcpu->kvm, sp, + fault->req_level >= it.level); } if (WARN_ON_ONCE(it.level != fault->goal_level)) diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 2e08b2a45361..c0fd7e049b4e 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -161,6 +161,18 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (!prefetch) spte |= spte_shadow_accessed_mask(spte); + /* + * For simplicity, enforce the NX huge page mitigation even if not + * strictly necessary. KVM could ignore the mitigation if paging is + * disabled in the guest, as the guest doesn't have an page tables to + * abuse. But to safely ignore the mitigation, KVM would have to + * ensure a new MMU is loaded (or all shadow pages zapped) when CR0.PG + * is toggled on, and that's a net negative for performance when TDP is + * enabled. When TDP is disabled, KVM will always switch to a new MMU + * when CR0.PG is toggled, but leveraging that to ignore the mitigation + * would tie make_spte() further to vCPU/MMU state, and add complexity + * just to optimize a mode that is anything but performance critical. + */ if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(vcpu->kvm)) { pte_access &= ~ACC_EXEC_MASK; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 7670c13ce251..1f03701b943a 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -188,7 +188,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; * should not modify the SPTE. * * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on - * bot AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF + * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF * vulnerability. Use only low bits to avoid 64-bit immediates. * * Only used by the TDP MMU. @@ -219,6 +219,23 @@ static inline int spte_index(u64 *sptep) */ extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; +static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page) +{ + struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT); + + return (struct kvm_mmu_page *)page_private(page); +} + +static inline struct kvm_mmu_page *spte_to_child_sp(u64 spte) +{ + return to_shadow_page(spte & SPTE_BASE_ADDR_MASK); +} + +static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) +{ + return to_shadow_page(__pa(sptep)); +} + static inline bool is_mmio_spte(u64 spte) { return (spte & shadow_mmio_mask) == shadow_mmio_value && diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 672f0432d777..771210ce5181 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -29,7 +29,6 @@ int kvm_mmu_init_tdp_mmu(struct kvm *kvm) kvm->arch.tdp_mmu_enabled = true; INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); - INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); kvm->arch.tdp_mmu_zap_wq = wq; return 1; } @@ -54,7 +53,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) /* Also waits for any queued work items. */ destroy_workqueue(kvm->arch.tdp_mmu_zap_wq); - WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages)); + WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); /* @@ -284,6 +283,8 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep, gfn_t gfn, union kvm_mmu_page_role role) { + INIT_LIST_HEAD(&sp->possible_nx_huge_page_link); + set_page_private(virt_to_page(sp->spt), (unsigned long)sp); sp->role = role; @@ -375,11 +376,13 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, +1); + atomic64_inc(&kvm->arch.tdp_mmu_pages); } static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, -1); + atomic64_dec(&kvm->arch.tdp_mmu_pages); } /** @@ -395,14 +398,17 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp, bool shared) { tdp_unaccount_mmu_page(kvm, sp); + + if (!sp->nx_huge_page_disallowed) + return; + if (shared) spin_lock(&kvm->arch.tdp_mmu_pages_lock); else lockdep_assert_held_write(&kvm->mmu_lock); - list_del(&sp->link); - if (sp->lpage_disallowed) - unaccount_huge_nx_page(kvm, sp); + sp->nx_huge_page_disallowed = false; + untrack_possible_nx_huge_page(kvm, sp); if (shared) spin_unlock(&kvm->arch.tdp_mmu_pages_lock); @@ -1116,16 +1122,13 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, * @kvm: kvm instance * @iter: a tdp_iter instance currently on the SPTE that should be set * @sp: The new TDP page table to install. - * @account_nx: True if this page table is being installed to split a - * non-executable huge page. * @shared: This operation is running under the MMU lock in read mode. * * Returns: 0 if the new page table was installed. Non-0 if the page table * could not be installed (e.g. the atomic compare-exchange failed). */ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_mmu_page *sp, bool account_nx, - bool shared) + struct kvm_mmu_page *sp, bool shared) { u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled()); int ret = 0; @@ -1138,16 +1141,14 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, tdp_mmu_set_spte(kvm, iter, spte); } - spin_lock(&kvm->arch.tdp_mmu_pages_lock); - list_add(&sp->link, &kvm->arch.tdp_mmu_pages); - if (account_nx) - account_huge_nx_page(kvm, sp); - spin_unlock(&kvm->arch.tdp_mmu_pages_lock); tdp_account_mmu_page(kvm, sp); return 0; } +static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_mmu_page *sp, bool shared); + /* * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing * page tables and SPTEs to translate the faulting guest physical address. @@ -1155,9 +1156,10 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_mmu *mmu = vcpu->arch.mmu; + struct kvm *kvm = vcpu->kvm; struct tdp_iter iter; struct kvm_mmu_page *sp; - int ret; + int ret = RET_PF_RETRY; kvm_mmu_hugepage_adjust(vcpu, fault); @@ -1166,6 +1168,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) rcu_read_lock(); tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) { + int r; + if (fault->nx_huge_page_workaround_enabled) disallowed_hugepage_adjust(fault, iter.old_spte, iter.level); @@ -1173,57 +1177,52 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) break; /* - * If there is an SPTE mapping a large page at a higher level - * than the target, that SPTE must be cleared and replaced - * with a non-leaf SPTE. + * If SPTE has been frozen by another thread, just give up and + * retry, avoiding unnecessary page table allocation and free. */ - if (is_shadow_present_pte(iter.old_spte) && - is_large_pte(iter.old_spte)) { - if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter)) - break; + if (is_removed_spte(iter.old_spte)) + goto retry; - /* - * The iter must explicitly re-read the spte here - * because the new value informs the !present - * path below. - */ - iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep); - } + /* Step down into the lower level page table if it exists. */ + if (is_shadow_present_pte(iter.old_spte) && + !is_large_pte(iter.old_spte)) + continue; - if (!is_shadow_present_pte(iter.old_spte)) { - bool account_nx = fault->huge_page_disallowed && - fault->req_level >= iter.level; + /* + * The SPTE is either non-present or points to a huge page that + * needs to be split. + */ + sp = tdp_mmu_alloc_sp(vcpu); + tdp_mmu_init_child_sp(sp, &iter); - /* - * If SPTE has been frozen by another thread, just - * give up and retry, avoiding unnecessary page table - * allocation and free. - */ - if (is_removed_spte(iter.old_spte)) - break; + sp->nx_huge_page_disallowed = fault->huge_page_disallowed; - sp = tdp_mmu_alloc_sp(vcpu); - tdp_mmu_init_child_sp(sp, &iter); + if (is_shadow_present_pte(iter.old_spte)) + r = tdp_mmu_split_huge_page(kvm, &iter, sp, true); + else + r = tdp_mmu_link_sp(kvm, &iter, sp, true); - if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) { - tdp_mmu_free_sp(sp); - break; - } + /* + * Also force the guest to retry the access if the upper level SPTEs + * aren't in place. + */ + if (r) { + tdp_mmu_free_sp(sp); + goto retry; } - } - /* - * Force the guest to retry the access if the upper level SPTEs aren't - * in place, or if the target leaf SPTE is frozen by another CPU. - */ - if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) { - rcu_read_unlock(); - return RET_PF_RETRY; + if (fault->huge_page_disallowed && + fault->req_level >= iter.level) { + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + track_possible_nx_huge_page(kvm, sp); + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); + } } ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter); - rcu_read_unlock(); +retry: + rcu_read_unlock(); return ret; } @@ -1472,6 +1471,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm, return sp; } +/* Note, the caller is responsible for initializing @sp. */ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, struct kvm_mmu_page *sp, bool shared) { @@ -1479,8 +1479,6 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, const int level = iter->level; int ret, i; - tdp_mmu_init_child_sp(sp, iter); - /* * No need for atomics when writing to sp->spt since the page table has * not been linked in yet and thus is not reachable from any other CPU. @@ -1496,7 +1494,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * correctness standpoint since the translation will be the same either * way. */ - ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared); + ret = tdp_mmu_link_sp(kvm, iter, sp, shared); if (ret) goto out; @@ -1556,6 +1554,8 @@ retry: continue; } + tdp_mmu_init_child_sp(sp, &iter); + if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared)) goto retry; diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index c163f7cc23ca..d3714200b932 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -5,6 +5,8 @@ #include <linux/kvm_host.h> +#include "spte.h" + hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index de1fd7369736..684393c22105 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -101,10 +101,6 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) struct kvm_pmu *pmu = pmc_to_pmu(pmc); bool skip_pmi = false; - /* Ignore counters that have been reprogrammed already. */ - if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) - return; - if (pmc->perf_event && pmc->perf_event->attr.precise_ip) { if (!in_pmi) { /* @@ -122,7 +118,6 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) } else { __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); } - kvm_make_request(KVM_REQ_PMU, pmc->vcpu); if (!pmc->intr || skip_pmi) return; @@ -147,12 +142,22 @@ static void kvm_perf_overflow(struct perf_event *perf_event, { struct kvm_pmc *pmc = perf_event->overflow_handler_context; + /* + * Ignore overflow events for counters that are scheduled to be + * reprogrammed, e.g. if a PMI for the previous event races with KVM's + * handling of a related guest WRMSR. + */ + if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi)) + return; + __kvm_perf_overflow(pmc, true); + + kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } -static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, - u64 config, bool exclude_user, - bool exclude_kernel, bool intr) +static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, + bool exclude_user, bool exclude_kernel, + bool intr) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); struct perf_event *event; @@ -204,14 +209,14 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, if (IS_ERR(event)) { pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n", PTR_ERR(event), pmc->idx); - return; + return PTR_ERR(event); } pmc->perf_event = event; pmc_to_pmu(pmc)->event_count++; - clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); pmc->is_paused = false; pmc->intr = intr || pebs; + return 0; } static void pmc_pause_counter(struct kvm_pmc *pmc) @@ -245,7 +250,6 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) perf_event_enable(pmc->perf_event); pmc->is_paused = false; - clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); return true; } @@ -293,7 +297,7 @@ out: return allow_event; } -void reprogram_counter(struct kvm_pmc *pmc) +static void reprogram_counter(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); u64 eventsel = pmc->eventsel; @@ -303,10 +307,13 @@ void reprogram_counter(struct kvm_pmc *pmc) pmc_pause_counter(pmc); if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc)) - return; + goto reprogram_complete; if (!check_pmu_event_filter(pmc)) - return; + goto reprogram_complete; + + if (pmc->counter < pmc->prev_counter) + __kvm_perf_overflow(pmc, false); if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) printk_once("kvm pmu: pin control bit is ignored\n"); @@ -324,18 +331,29 @@ void reprogram_counter(struct kvm_pmc *pmc) } if (pmc->current_config == new_config && pmc_resume_counter(pmc)) - return; + goto reprogram_complete; pmc_release_perf_event(pmc); pmc->current_config = new_config; - pmc_reprogram_counter(pmc, PERF_TYPE_RAW, - (eventsel & pmu->raw_event_mask), - !(eventsel & ARCH_PERFMON_EVENTSEL_USR), - !(eventsel & ARCH_PERFMON_EVENTSEL_OS), - eventsel & ARCH_PERFMON_EVENTSEL_INT); + + /* + * If reprogramming fails, e.g. due to contention, leave the counter's + * regprogram bit set, i.e. opportunistically try again on the next PMU + * refresh. Don't make a new request as doing so can stall the guest + * if reprogramming repeatedly fails. + */ + if (pmc_reprogram_counter(pmc, PERF_TYPE_RAW, + (eventsel & pmu->raw_event_mask), + !(eventsel & ARCH_PERFMON_EVENTSEL_USR), + !(eventsel & ARCH_PERFMON_EVENTSEL_OS), + eventsel & ARCH_PERFMON_EVENTSEL_INT)) + return; + +reprogram_complete: + clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); + pmc->prev_counter = 0; } -EXPORT_SYMBOL_GPL(reprogram_counter); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) { @@ -345,10 +363,11 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) { struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit); - if (unlikely(!pmc || !pmc->perf_event)) { + if (unlikely(!pmc)) { clear_bit(bit, pmu->reprogram_pmi); continue; } + reprogram_counter(pmc); } @@ -522,14 +541,9 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) { - u64 prev_count; - - prev_count = pmc->counter; + pmc->prev_counter = pmc->counter; pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); - - reprogram_counter(pmc); - if (pmc->counter < prev_count) - __kvm_perf_overflow(pmc, false); + kvm_pmu_request_counter_reprogam(pmc); } static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, @@ -542,12 +556,15 @@ static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, static inline bool cpl_is_matched(struct kvm_pmc *pmc) { bool select_os, select_user; - u64 config = pmc->current_config; + u64 config; if (pmc_is_gp(pmc)) { + config = pmc->eventsel; select_os = config & ARCH_PERFMON_EVENTSEL_OS; select_user = config & ARCH_PERFMON_EVENTSEL_USR; } else { + config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, + pmc->idx - INTEL_PMC_IDX_FIXED); select_os = config & 0x1; select_user = config & 0x2; } @@ -577,6 +594,8 @@ EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) { struct kvm_pmu_event_filter tmp, *filter; + struct kvm_vcpu *vcpu; + unsigned long i; size_t size; int r; @@ -613,9 +632,18 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) mutex_lock(&kvm->lock); filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter, mutex_is_locked(&kvm->lock)); + synchronize_srcu_expedited(&kvm->srcu); + + BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) > + sizeof(((struct kvm_pmu *)0)->__reprogram_pmi)); + + kvm_for_each_vcpu(i, vcpu, kvm) + atomic64_set(&vcpu_to_pmu(vcpu)->__reprogram_pmi, -1ull); + + kvm_make_all_cpus_request(kvm, KVM_REQ_PMU); + mutex_unlock(&kvm->lock); - synchronize_srcu_expedited(&kvm->srcu); r = 0; cleanup: kfree(filter); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 5cc5721f260b..85ff3c0588ba 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -183,7 +183,11 @@ static inline void kvm_init_pmu_capability(void) KVM_PMC_MAX_FIXED); } -void reprogram_counter(struct kvm_pmc *pmc); +static inline void kvm_pmu_request_counter_reprogam(struct kvm_pmc *pmc) +{ + set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); + kvm_make_request(KVM_REQ_PMU, pmc->vcpu); +} void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 4e5b8444f161..042d0aca3c92 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -7,17 +7,30 @@ #include <asm/cpufeatures.h> /* - * Hardware-defined CPUID leafs that are scattered in the kernel, but need to - * be directly used by KVM. Note, these word values conflict with the kernel's - * "bug" caps, but KVM doesn't use those. + * Hardware-defined CPUID leafs that are either scattered by the kernel or are + * unknown to the kernel, but need to be directly used by KVM. Note, these + * word values conflict with the kernel's "bug" caps, but KVM doesn't use those. */ enum kvm_only_cpuid_leafs { CPUID_12_EAX = NCAPINTS, + CPUID_7_1_EDX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, }; +/* + * Define a KVM-only feature flag. + * + * For features that are scattered by cpufeatures.h, __feature_translate() also + * needs to be updated to translate the kernel-defined feature into the + * KVM-defined feature. + * + * For features that are 100% KVM-only, i.e. not defined by cpufeatures.h, + * forego the intermediate KVM_X86_FEATURE and directly define X86_FEATURE_* so + * that X86_FEATURE_* can be used in KVM. No __feature_translate() handling is + * needed in this case. + */ #define KVM_X86_FEATURE(w, f) ((w)*32 + (f)) /* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */ @@ -25,6 +38,11 @@ enum kvm_only_cpuid_leafs { #define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1) #define KVM_X86_FEATURE_SGX_EDECCSSA KVM_X86_FEATURE(CPUID_12_EAX, 11) +/* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */ +#define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4) +#define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) +#define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14) + struct cpuid_reg { u32 function; u32 index; @@ -49,6 +67,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX}, [CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX}, [CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX}, + [CPUID_7_1_EDX] = { 7, 1, CPUID_EDX}, }; /* diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c new file mode 100644 index 000000000000..a9c1c2af8d94 --- /dev/null +++ b/arch/x86/kvm/smm.c @@ -0,0 +1,649 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/kvm_host.h> +#include "x86.h" +#include "kvm_cache_regs.h" +#include "kvm_emulate.h" +#include "smm.h" +#include "cpuid.h" +#include "trace.h" + +#define CHECK_SMRAM32_OFFSET(field, offset) \ + ASSERT_STRUCT_OFFSET(struct kvm_smram_state_32, field, offset - 0xFE00) + +#define CHECK_SMRAM64_OFFSET(field, offset) \ + ASSERT_STRUCT_OFFSET(struct kvm_smram_state_64, field, offset - 0xFE00) + +static void check_smram_offsets(void) +{ + /* 32 bit SMRAM image */ + CHECK_SMRAM32_OFFSET(reserved1, 0xFE00); + CHECK_SMRAM32_OFFSET(smbase, 0xFEF8); + CHECK_SMRAM32_OFFSET(smm_revision, 0xFEFC); + CHECK_SMRAM32_OFFSET(io_inst_restart, 0xFF00); + CHECK_SMRAM32_OFFSET(auto_hlt_restart, 0xFF02); + CHECK_SMRAM32_OFFSET(io_restart_rdi, 0xFF04); + CHECK_SMRAM32_OFFSET(io_restart_rcx, 0xFF08); + CHECK_SMRAM32_OFFSET(io_restart_rsi, 0xFF0C); + CHECK_SMRAM32_OFFSET(io_restart_rip, 0xFF10); + CHECK_SMRAM32_OFFSET(cr4, 0xFF14); + CHECK_SMRAM32_OFFSET(reserved2, 0xFF18); + CHECK_SMRAM32_OFFSET(int_shadow, 0xFF1A); + CHECK_SMRAM32_OFFSET(reserved3, 0xFF1B); + CHECK_SMRAM32_OFFSET(ds, 0xFF2C); + CHECK_SMRAM32_OFFSET(fs, 0xFF38); + CHECK_SMRAM32_OFFSET(gs, 0xFF44); + CHECK_SMRAM32_OFFSET(idtr, 0xFF50); + CHECK_SMRAM32_OFFSET(tr, 0xFF5C); + CHECK_SMRAM32_OFFSET(gdtr, 0xFF6C); + CHECK_SMRAM32_OFFSET(ldtr, 0xFF78); + CHECK_SMRAM32_OFFSET(es, 0xFF84); + CHECK_SMRAM32_OFFSET(cs, 0xFF90); + CHECK_SMRAM32_OFFSET(ss, 0xFF9C); + CHECK_SMRAM32_OFFSET(es_sel, 0xFFA8); + CHECK_SMRAM32_OFFSET(cs_sel, 0xFFAC); + CHECK_SMRAM32_OFFSET(ss_sel, 0xFFB0); + CHECK_SMRAM32_OFFSET(ds_sel, 0xFFB4); + CHECK_SMRAM32_OFFSET(fs_sel, 0xFFB8); + CHECK_SMRAM32_OFFSET(gs_sel, 0xFFBC); + CHECK_SMRAM32_OFFSET(ldtr_sel, 0xFFC0); + CHECK_SMRAM32_OFFSET(tr_sel, 0xFFC4); + CHECK_SMRAM32_OFFSET(dr7, 0xFFC8); + CHECK_SMRAM32_OFFSET(dr6, 0xFFCC); + CHECK_SMRAM32_OFFSET(gprs, 0xFFD0); + CHECK_SMRAM32_OFFSET(eip, 0xFFF0); + CHECK_SMRAM32_OFFSET(eflags, 0xFFF4); + CHECK_SMRAM32_OFFSET(cr3, 0xFFF8); + CHECK_SMRAM32_OFFSET(cr0, 0xFFFC); + + /* 64 bit SMRAM image */ + CHECK_SMRAM64_OFFSET(es, 0xFE00); + CHECK_SMRAM64_OFFSET(cs, 0xFE10); + CHECK_SMRAM64_OFFSET(ss, 0xFE20); + CHECK_SMRAM64_OFFSET(ds, 0xFE30); + CHECK_SMRAM64_OFFSET(fs, 0xFE40); + CHECK_SMRAM64_OFFSET(gs, 0xFE50); + CHECK_SMRAM64_OFFSET(gdtr, 0xFE60); + CHECK_SMRAM64_OFFSET(ldtr, 0xFE70); + CHECK_SMRAM64_OFFSET(idtr, 0xFE80); + CHECK_SMRAM64_OFFSET(tr, 0xFE90); + CHECK_SMRAM64_OFFSET(io_restart_rip, 0xFEA0); + CHECK_SMRAM64_OFFSET(io_restart_rcx, 0xFEA8); + CHECK_SMRAM64_OFFSET(io_restart_rsi, 0xFEB0); + CHECK_SMRAM64_OFFSET(io_restart_rdi, 0xFEB8); + CHECK_SMRAM64_OFFSET(io_restart_dword, 0xFEC0); + CHECK_SMRAM64_OFFSET(reserved1, 0xFEC4); + CHECK_SMRAM64_OFFSET(io_inst_restart, 0xFEC8); + CHECK_SMRAM64_OFFSET(auto_hlt_restart, 0xFEC9); + CHECK_SMRAM64_OFFSET(amd_nmi_mask, 0xFECA); + CHECK_SMRAM64_OFFSET(int_shadow, 0xFECB); + CHECK_SMRAM64_OFFSET(reserved2, 0xFECC); + CHECK_SMRAM64_OFFSET(efer, 0xFED0); + CHECK_SMRAM64_OFFSET(svm_guest_flag, 0xFED8); + CHECK_SMRAM64_OFFSET(svm_guest_vmcb_gpa, 0xFEE0); + CHECK_SMRAM64_OFFSET(svm_guest_virtual_int, 0xFEE8); + CHECK_SMRAM64_OFFSET(reserved3, 0xFEF0); + CHECK_SMRAM64_OFFSET(smm_revison, 0xFEFC); + CHECK_SMRAM64_OFFSET(smbase, 0xFF00); + CHECK_SMRAM64_OFFSET(reserved4, 0xFF04); + CHECK_SMRAM64_OFFSET(ssp, 0xFF18); + CHECK_SMRAM64_OFFSET(svm_guest_pat, 0xFF20); + CHECK_SMRAM64_OFFSET(svm_host_efer, 0xFF28); + CHECK_SMRAM64_OFFSET(svm_host_cr4, 0xFF30); + CHECK_SMRAM64_OFFSET(svm_host_cr3, 0xFF38); + CHECK_SMRAM64_OFFSET(svm_host_cr0, 0xFF40); + CHECK_SMRAM64_OFFSET(cr4, 0xFF48); + CHECK_SMRAM64_OFFSET(cr3, 0xFF50); + CHECK_SMRAM64_OFFSET(cr0, 0xFF58); + CHECK_SMRAM64_OFFSET(dr7, 0xFF60); + CHECK_SMRAM64_OFFSET(dr6, 0xFF68); + CHECK_SMRAM64_OFFSET(rflags, 0xFF70); + CHECK_SMRAM64_OFFSET(rip, 0xFF78); + CHECK_SMRAM64_OFFSET(gprs, 0xFF80); + + BUILD_BUG_ON(sizeof(union kvm_smram) != 512); +} + +#undef CHECK_SMRAM64_OFFSET +#undef CHECK_SMRAM32_OFFSET + + +void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) +{ + BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); + + trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm); + + if (entering_smm) { + vcpu->arch.hflags |= HF_SMM_MASK; + } else { + vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK); + + /* Process a latched INIT or SMI, if any. */ + kvm_make_request(KVM_REQ_EVENT, vcpu); + + /* + * Even if KVM_SET_SREGS2 loaded PDPTRs out of band, + * on SMM exit we still need to reload them from + * guest memory + */ + vcpu->arch.pdptrs_from_userspace = false; + } + + kvm_mmu_reset_context(vcpu); +} + +void process_smi(struct kvm_vcpu *vcpu) +{ + vcpu->arch.smi_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); +} + +static u32 enter_smm_get_segment_flags(struct kvm_segment *seg) +{ + u32 flags = 0; + flags |= seg->g << 23; + flags |= seg->db << 22; + flags |= seg->l << 21; + flags |= seg->avl << 20; + flags |= seg->present << 15; + flags |= seg->dpl << 13; + flags |= seg->s << 12; + flags |= seg->type << 8; + return flags; +} + +static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, + struct kvm_smm_seg_state_32 *state, + u32 *selector, int n) +{ + struct kvm_segment seg; + + kvm_get_segment(vcpu, &seg, n); + *selector = seg.selector; + state->base = seg.base; + state->limit = seg.limit; + state->flags = enter_smm_get_segment_flags(&seg); +} + +#ifdef CONFIG_X86_64 +static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, + struct kvm_smm_seg_state_64 *state, + int n) +{ + struct kvm_segment seg; + + kvm_get_segment(vcpu, &seg, n); + state->selector = seg.selector; + state->attributes = enter_smm_get_segment_flags(&seg) >> 8; + state->limit = seg.limit; + state->base = seg.base; +} +#endif + +static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, + struct kvm_smram_state_32 *smram) +{ + struct desc_ptr dt; + unsigned long val; + int i; + + smram->cr0 = kvm_read_cr0(vcpu); + smram->cr3 = kvm_read_cr3(vcpu); + smram->eflags = kvm_get_rflags(vcpu); + smram->eip = kvm_rip_read(vcpu); + + for (i = 0; i < 8; i++) + smram->gprs[i] = kvm_register_read_raw(vcpu, i); + + kvm_get_dr(vcpu, 6, &val); + smram->dr6 = (u32)val; + kvm_get_dr(vcpu, 7, &val); + smram->dr7 = (u32)val; + + enter_smm_save_seg_32(vcpu, &smram->tr, &smram->tr_sel, VCPU_SREG_TR); + enter_smm_save_seg_32(vcpu, &smram->ldtr, &smram->ldtr_sel, VCPU_SREG_LDTR); + + static_call(kvm_x86_get_gdt)(vcpu, &dt); + smram->gdtr.base = dt.address; + smram->gdtr.limit = dt.size; + + static_call(kvm_x86_get_idt)(vcpu, &dt); + smram->idtr.base = dt.address; + smram->idtr.limit = dt.size; + + enter_smm_save_seg_32(vcpu, &smram->es, &smram->es_sel, VCPU_SREG_ES); + enter_smm_save_seg_32(vcpu, &smram->cs, &smram->cs_sel, VCPU_SREG_CS); + enter_smm_save_seg_32(vcpu, &smram->ss, &smram->ss_sel, VCPU_SREG_SS); + + enter_smm_save_seg_32(vcpu, &smram->ds, &smram->ds_sel, VCPU_SREG_DS); + enter_smm_save_seg_32(vcpu, &smram->fs, &smram->fs_sel, VCPU_SREG_FS); + enter_smm_save_seg_32(vcpu, &smram->gs, &smram->gs_sel, VCPU_SREG_GS); + + smram->cr4 = kvm_read_cr4(vcpu); + smram->smm_revision = 0x00020000; + smram->smbase = vcpu->arch.smbase; + + smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); +} + +#ifdef CONFIG_X86_64 +static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, + struct kvm_smram_state_64 *smram) +{ + struct desc_ptr dt; + unsigned long val; + int i; + + for (i = 0; i < 16; i++) + smram->gprs[15 - i] = kvm_register_read_raw(vcpu, i); + + smram->rip = kvm_rip_read(vcpu); + smram->rflags = kvm_get_rflags(vcpu); + + + kvm_get_dr(vcpu, 6, &val); + smram->dr6 = val; + kvm_get_dr(vcpu, 7, &val); + smram->dr7 = val; + + smram->cr0 = kvm_read_cr0(vcpu); + smram->cr3 = kvm_read_cr3(vcpu); + smram->cr4 = kvm_read_cr4(vcpu); + + smram->smbase = vcpu->arch.smbase; + smram->smm_revison = 0x00020064; + + smram->efer = vcpu->arch.efer; + + enter_smm_save_seg_64(vcpu, &smram->tr, VCPU_SREG_TR); + + static_call(kvm_x86_get_idt)(vcpu, &dt); + smram->idtr.limit = dt.size; + smram->idtr.base = dt.address; + + enter_smm_save_seg_64(vcpu, &smram->ldtr, VCPU_SREG_LDTR); + + static_call(kvm_x86_get_gdt)(vcpu, &dt); + smram->gdtr.limit = dt.size; + smram->gdtr.base = dt.address; + + enter_smm_save_seg_64(vcpu, &smram->es, VCPU_SREG_ES); + enter_smm_save_seg_64(vcpu, &smram->cs, VCPU_SREG_CS); + enter_smm_save_seg_64(vcpu, &smram->ss, VCPU_SREG_SS); + enter_smm_save_seg_64(vcpu, &smram->ds, VCPU_SREG_DS); + enter_smm_save_seg_64(vcpu, &smram->fs, VCPU_SREG_FS); + enter_smm_save_seg_64(vcpu, &smram->gs, VCPU_SREG_GS); + + smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); +} +#endif + +void enter_smm(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs, ds; + struct desc_ptr dt; + unsigned long cr0; + union kvm_smram smram; + + check_smram_offsets(); + + memset(smram.bytes, 0, sizeof(smram.bytes)); + +#ifdef CONFIG_X86_64 + if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) + enter_smm_save_state_64(vcpu, &smram.smram64); + else +#endif + enter_smm_save_state_32(vcpu, &smram.smram32); + + /* + * Give enter_smm() a chance to make ISA-specific changes to the vCPU + * state (e.g. leave guest mode) after we've saved the state into the + * SMM state-save area. + * + * Kill the VM in the unlikely case of failure, because the VM + * can be in undefined state in this case. + */ + if (static_call(kvm_x86_enter_smm)(vcpu, &smram)) + goto error; + + kvm_smm_changed(vcpu, true); + + if (kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, &smram, sizeof(smram))) + goto error; + + if (static_call(kvm_x86_get_nmi_mask)(vcpu)) + vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; + else + static_call(kvm_x86_set_nmi_mask)(vcpu, true); + + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); + kvm_rip_write(vcpu, 0x8000); + + static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0); + + cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG); + static_call(kvm_x86_set_cr0)(vcpu, cr0); + vcpu->arch.cr0 = cr0; + + static_call(kvm_x86_set_cr4)(vcpu, 0); + + /* Undocumented: IDT limit is set to zero on entry to SMM. */ + dt.address = dt.size = 0; + static_call(kvm_x86_set_idt)(vcpu, &dt); + + if (WARN_ON_ONCE(kvm_set_dr(vcpu, 7, DR7_FIXED_1))) + goto error; + + cs.selector = (vcpu->arch.smbase >> 4) & 0xffff; + cs.base = vcpu->arch.smbase; + + ds.selector = 0; + ds.base = 0; + + cs.limit = ds.limit = 0xffffffff; + cs.type = ds.type = 0x3; + cs.dpl = ds.dpl = 0; + cs.db = ds.db = 0; + cs.s = ds.s = 1; + cs.l = ds.l = 0; + cs.g = ds.g = 1; + cs.avl = ds.avl = 0; + cs.present = ds.present = 1; + cs.unusable = ds.unusable = 0; + cs.padding = ds.padding = 0; + + kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_DS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_ES); + kvm_set_segment(vcpu, &ds, VCPU_SREG_FS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_GS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_SS); + +#ifdef CONFIG_X86_64 + if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) + if (static_call(kvm_x86_set_efer)(vcpu, 0)) + goto error; +#endif + + kvm_update_cpuid_runtime(vcpu); + kvm_mmu_reset_context(vcpu); + return; +error: + kvm_vm_dead(vcpu->kvm); +} + +static void rsm_set_desc_flags(struct kvm_segment *desc, u32 flags) +{ + desc->g = (flags >> 23) & 1; + desc->db = (flags >> 22) & 1; + desc->l = (flags >> 21) & 1; + desc->avl = (flags >> 20) & 1; + desc->present = (flags >> 15) & 1; + desc->dpl = (flags >> 13) & 3; + desc->s = (flags >> 12) & 1; + desc->type = (flags >> 8) & 15; + + desc->unusable = !desc->present; + desc->padding = 0; +} + +static int rsm_load_seg_32(struct kvm_vcpu *vcpu, + const struct kvm_smm_seg_state_32 *state, + u16 selector, int n) +{ + struct kvm_segment desc; + + desc.selector = selector; + desc.base = state->base; + desc.limit = state->limit; + rsm_set_desc_flags(&desc, state->flags); + kvm_set_segment(vcpu, &desc, n); + return X86EMUL_CONTINUE; +} + +#ifdef CONFIG_X86_64 + +static int rsm_load_seg_64(struct kvm_vcpu *vcpu, + const struct kvm_smm_seg_state_64 *state, + int n) +{ + struct kvm_segment desc; + + desc.selector = state->selector; + rsm_set_desc_flags(&desc, state->attributes << 8); + desc.limit = state->limit; + desc.base = state->base; + kvm_set_segment(vcpu, &desc, n); + return X86EMUL_CONTINUE; +} +#endif + +static int rsm_enter_protected_mode(struct kvm_vcpu *vcpu, + u64 cr0, u64 cr3, u64 cr4) +{ + int bad; + u64 pcid; + + /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */ + pcid = 0; + if (cr4 & X86_CR4_PCIDE) { + pcid = cr3 & 0xfff; + cr3 &= ~0xfff; + } + + bad = kvm_set_cr3(vcpu, cr3); + if (bad) + return X86EMUL_UNHANDLEABLE; + + /* + * First enable PAE, long mode needs it before CR0.PG = 1 is set. + * Then enable protected mode. However, PCID cannot be enabled + * if EFER.LMA=0, so set it separately. + */ + bad = kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PCIDE); + if (bad) + return X86EMUL_UNHANDLEABLE; + + bad = kvm_set_cr0(vcpu, cr0); + if (bad) + return X86EMUL_UNHANDLEABLE; + + if (cr4 & X86_CR4_PCIDE) { + bad = kvm_set_cr4(vcpu, cr4); + if (bad) + return X86EMUL_UNHANDLEABLE; + if (pcid) { + bad = kvm_set_cr3(vcpu, cr3 | pcid); + if (bad) + return X86EMUL_UNHANDLEABLE; + } + + } + + return X86EMUL_CONTINUE; +} + +static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, + const struct kvm_smram_state_32 *smstate) +{ + struct kvm_vcpu *vcpu = ctxt->vcpu; + struct desc_ptr dt; + int i, r; + + ctxt->eflags = smstate->eflags | X86_EFLAGS_FIXED; + ctxt->_eip = smstate->eip; + + for (i = 0; i < 8; i++) + *reg_write(ctxt, i) = smstate->gprs[i]; + + if (kvm_set_dr(vcpu, 6, smstate->dr6)) + return X86EMUL_UNHANDLEABLE; + if (kvm_set_dr(vcpu, 7, smstate->dr7)) + return X86EMUL_UNHANDLEABLE; + + rsm_load_seg_32(vcpu, &smstate->tr, smstate->tr_sel, VCPU_SREG_TR); + rsm_load_seg_32(vcpu, &smstate->ldtr, smstate->ldtr_sel, VCPU_SREG_LDTR); + + dt.address = smstate->gdtr.base; + dt.size = smstate->gdtr.limit; + static_call(kvm_x86_set_gdt)(vcpu, &dt); + + dt.address = smstate->idtr.base; + dt.size = smstate->idtr.limit; + static_call(kvm_x86_set_idt)(vcpu, &dt); + + rsm_load_seg_32(vcpu, &smstate->es, smstate->es_sel, VCPU_SREG_ES); + rsm_load_seg_32(vcpu, &smstate->cs, smstate->cs_sel, VCPU_SREG_CS); + rsm_load_seg_32(vcpu, &smstate->ss, smstate->ss_sel, VCPU_SREG_SS); + + rsm_load_seg_32(vcpu, &smstate->ds, smstate->ds_sel, VCPU_SREG_DS); + rsm_load_seg_32(vcpu, &smstate->fs, smstate->fs_sel, VCPU_SREG_FS); + rsm_load_seg_32(vcpu, &smstate->gs, smstate->gs_sel, VCPU_SREG_GS); + + vcpu->arch.smbase = smstate->smbase; + + r = rsm_enter_protected_mode(vcpu, smstate->cr0, + smstate->cr3, smstate->cr4); + + if (r != X86EMUL_CONTINUE) + return r; + + static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0); + ctxt->interruptibility = (u8)smstate->int_shadow; + + return r; +} + +#ifdef CONFIG_X86_64 +static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, + const struct kvm_smram_state_64 *smstate) +{ + struct kvm_vcpu *vcpu = ctxt->vcpu; + struct desc_ptr dt; + int i, r; + + for (i = 0; i < 16; i++) + *reg_write(ctxt, i) = smstate->gprs[15 - i]; + + ctxt->_eip = smstate->rip; + ctxt->eflags = smstate->rflags | X86_EFLAGS_FIXED; + + if (kvm_set_dr(vcpu, 6, smstate->dr6)) + return X86EMUL_UNHANDLEABLE; + if (kvm_set_dr(vcpu, 7, smstate->dr7)) + return X86EMUL_UNHANDLEABLE; + + vcpu->arch.smbase = smstate->smbase; + + if (kvm_set_msr(vcpu, MSR_EFER, smstate->efer & ~EFER_LMA)) + return X86EMUL_UNHANDLEABLE; + + rsm_load_seg_64(vcpu, &smstate->tr, VCPU_SREG_TR); + + dt.size = smstate->idtr.limit; + dt.address = smstate->idtr.base; + static_call(kvm_x86_set_idt)(vcpu, &dt); + + rsm_load_seg_64(vcpu, &smstate->ldtr, VCPU_SREG_LDTR); + + dt.size = smstate->gdtr.limit; + dt.address = smstate->gdtr.base; + static_call(kvm_x86_set_gdt)(vcpu, &dt); + + r = rsm_enter_protected_mode(vcpu, smstate->cr0, smstate->cr3, smstate->cr4); + if (r != X86EMUL_CONTINUE) + return r; + + rsm_load_seg_64(vcpu, &smstate->es, VCPU_SREG_ES); + rsm_load_seg_64(vcpu, &smstate->cs, VCPU_SREG_CS); + rsm_load_seg_64(vcpu, &smstate->ss, VCPU_SREG_SS); + rsm_load_seg_64(vcpu, &smstate->ds, VCPU_SREG_DS); + rsm_load_seg_64(vcpu, &smstate->fs, VCPU_SREG_FS); + rsm_load_seg_64(vcpu, &smstate->gs, VCPU_SREG_GS); + + static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0); + ctxt->interruptibility = (u8)smstate->int_shadow; + + return X86EMUL_CONTINUE; +} +#endif + +int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) +{ + struct kvm_vcpu *vcpu = ctxt->vcpu; + unsigned long cr0; + union kvm_smram smram; + u64 smbase; + int ret; + + smbase = vcpu->arch.smbase; + + ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfe00, smram.bytes, sizeof(smram)); + if (ret < 0) + return X86EMUL_UNHANDLEABLE; + + if ((vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK) == 0) + static_call(kvm_x86_set_nmi_mask)(vcpu, false); + + kvm_smm_changed(vcpu, false); + + /* + * Get back to real mode, to prepare a safe state in which to load + * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU + * supports long mode. + */ +#ifdef CONFIG_X86_64 + if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) { + struct kvm_segment cs_desc; + unsigned long cr4; + + /* Zero CR4.PCIDE before CR0.PG. */ + cr4 = kvm_read_cr4(vcpu); + if (cr4 & X86_CR4_PCIDE) + kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PCIDE); + + /* A 32-bit code segment is required to clear EFER.LMA. */ + memset(&cs_desc, 0, sizeof(cs_desc)); + cs_desc.type = 0xb; + cs_desc.s = cs_desc.g = cs_desc.present = 1; + kvm_set_segment(vcpu, &cs_desc, VCPU_SREG_CS); + } +#endif + + /* For the 64-bit case, this will clear EFER.LMA. */ + cr0 = kvm_read_cr0(vcpu); + if (cr0 & X86_CR0_PE) + kvm_set_cr0(vcpu, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); + +#ifdef CONFIG_X86_64 + if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) { + unsigned long cr4, efer; + + /* Clear CR4.PAE before clearing EFER.LME. */ + cr4 = kvm_read_cr4(vcpu); + if (cr4 & X86_CR4_PAE) + kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PAE); + + /* And finally go back to 32-bit mode. */ + efer = 0; + kvm_set_msr(vcpu, MSR_EFER, efer); + } +#endif + + /* + * Give leave_smm() a chance to make ISA-specific changes to the vCPU + * state (e.g. enter guest mode) before loading state from the SMM + * state-save area. + */ + if (static_call(kvm_x86_leave_smm)(vcpu, &smram)) + return X86EMUL_UNHANDLEABLE; + +#ifdef CONFIG_X86_64 + if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) + return rsm_load_state_64(ctxt, &smram.smram64); + else +#endif + return rsm_load_state_32(ctxt, &smram.smram32); +} diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h new file mode 100644 index 000000000000..a1cf2ac5bd78 --- /dev/null +++ b/arch/x86/kvm/smm.h @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ASM_KVM_SMM_H +#define ASM_KVM_SMM_H + +#include <linux/build_bug.h> + +#ifdef CONFIG_KVM_SMM + + +/* + * 32 bit KVM's emulated SMM layout. Based on Intel P6 layout + * (https://www.sandpile.org/x86/smm.htm). + */ + +struct kvm_smm_seg_state_32 { + u32 flags; + u32 limit; + u32 base; +} __packed; + +struct kvm_smram_state_32 { + u32 reserved1[62]; + u32 smbase; + u32 smm_revision; + u16 io_inst_restart; + u16 auto_hlt_restart; + u32 io_restart_rdi; + u32 io_restart_rcx; + u32 io_restart_rsi; + u32 io_restart_rip; + u32 cr4; + + /* A20M#, CPL, shutdown and other reserved/undocumented fields */ + u16 reserved2; + u8 int_shadow; /* KVM extension */ + u8 reserved3[17]; + + struct kvm_smm_seg_state_32 ds; + struct kvm_smm_seg_state_32 fs; + struct kvm_smm_seg_state_32 gs; + struct kvm_smm_seg_state_32 idtr; /* IDTR has only base and limit */ + struct kvm_smm_seg_state_32 tr; + u32 reserved; + struct kvm_smm_seg_state_32 gdtr; /* GDTR has only base and limit */ + struct kvm_smm_seg_state_32 ldtr; + struct kvm_smm_seg_state_32 es; + struct kvm_smm_seg_state_32 cs; + struct kvm_smm_seg_state_32 ss; + + u32 es_sel; + u32 cs_sel; + u32 ss_sel; + u32 ds_sel; + u32 fs_sel; + u32 gs_sel; + u32 ldtr_sel; + u32 tr_sel; + + u32 dr7; + u32 dr6; + u32 gprs[8]; /* GPRS in the "natural" X86 order (EAX/ECX/EDX.../EDI) */ + u32 eip; + u32 eflags; + u32 cr3; + u32 cr0; +} __packed; + + +/* 64 bit KVM's emulated SMM layout. Based on AMD64 layout */ + +struct kvm_smm_seg_state_64 { + u16 selector; + u16 attributes; + u32 limit; + u64 base; +}; + +struct kvm_smram_state_64 { + + struct kvm_smm_seg_state_64 es; + struct kvm_smm_seg_state_64 cs; + struct kvm_smm_seg_state_64 ss; + struct kvm_smm_seg_state_64 ds; + struct kvm_smm_seg_state_64 fs; + struct kvm_smm_seg_state_64 gs; + struct kvm_smm_seg_state_64 gdtr; /* GDTR has only base and limit*/ + struct kvm_smm_seg_state_64 ldtr; + struct kvm_smm_seg_state_64 idtr; /* IDTR has only base and limit*/ + struct kvm_smm_seg_state_64 tr; + + /* I/O restart and auto halt restart are not implemented by KVM */ + u64 io_restart_rip; + u64 io_restart_rcx; + u64 io_restart_rsi; + u64 io_restart_rdi; + u32 io_restart_dword; + u32 reserved1; + u8 io_inst_restart; + u8 auto_hlt_restart; + u8 amd_nmi_mask; /* Documented in AMD BKDG as NMI mask, not used by KVM */ + u8 int_shadow; + u32 reserved2; + + u64 efer; + + /* + * Two fields below are implemented on AMD only, to store + * SVM guest vmcb address if the #SMI was received while in the guest mode. + */ + u64 svm_guest_flag; + u64 svm_guest_vmcb_gpa; + u64 svm_guest_virtual_int; /* unknown purpose, not implemented */ + + u32 reserved3[3]; + u32 smm_revison; + u32 smbase; + u32 reserved4[5]; + + /* ssp and svm_* fields below are not implemented by KVM */ + u64 ssp; + u64 svm_guest_pat; + u64 svm_host_efer; + u64 svm_host_cr4; + u64 svm_host_cr3; + u64 svm_host_cr0; + + u64 cr4; + u64 cr3; + u64 cr0; + u64 dr7; + u64 dr6; + u64 rflags; + u64 rip; + u64 gprs[16]; /* GPRS in a reversed "natural" X86 order (R15/R14/../RCX/RAX.) */ +}; + +union kvm_smram { + struct kvm_smram_state_64 smram64; + struct kvm_smram_state_32 smram32; + u8 bytes[512]; +}; + +static inline int kvm_inject_smi(struct kvm_vcpu *vcpu) +{ + kvm_make_request(KVM_REQ_SMI, vcpu); + return 0; +} + +static inline bool is_smm(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.hflags & HF_SMM_MASK; +} + +void kvm_smm_changed(struct kvm_vcpu *vcpu, bool in_smm); +void enter_smm(struct kvm_vcpu *vcpu); +int emulator_leave_smm(struct x86_emulate_ctxt *ctxt); +void process_smi(struct kvm_vcpu *vcpu); +#else +static inline int kvm_inject_smi(struct kvm_vcpu *vcpu) { return -ENOTTY; } +static inline bool is_smm(struct kvm_vcpu *vcpu) { return false; } + +/* + * emulator_leave_smm is used as a function pointer, so the + * stub is defined in x86.c. + */ +#endif + +#endif diff --git a/arch/x86/kvm/svm/hyperv.c b/arch/x86/kvm/svm/hyperv.c new file mode 100644 index 000000000000..088f6429b24c --- /dev/null +++ b/arch/x86/kvm/svm/hyperv.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD SVM specific code for Hyper-V on KVM. + * + * Copyright 2022 Red Hat, Inc. and/or its affiliates. + */ +#include "hyperv.h" + +void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.exit_code = HV_SVM_EXITCODE_ENL; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH; + svm->vmcb->control.exit_info_2 = 0; + nested_svm_vmexit(svm); +} diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h index 7d6d97968fb9..02f4784b5d44 100644 --- a/arch/x86/kvm/svm/hyperv.h +++ b/arch/x86/kvm/svm/hyperv.h @@ -9,27 +9,37 @@ #include <asm/mshyperv.h> #include "../hyperv.h" +#include "svm.h" -/* - * Hyper-V uses the software reserved 32 bytes in VMCB - * control area to expose SVM enlightenments to guests. - */ -struct hv_enlightenments { - struct __packed hv_enlightenments_control { - u32 nested_flush_hypercall:1; - u32 msr_bitmap:1; - u32 enlightened_npt_tlb: 1; - u32 reserved:29; - } __packed hv_enlightenments_control; - u32 hv_vp_id; - u64 hv_vm_id; - u64 partition_assist_page; - u64 reserved; -} __packed; +static inline void nested_svm_hv_update_vm_vp_ids(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); -/* - * Hyper-V uses the software reserved clean bit in VMCB - */ -#define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW + if (!hv_vcpu) + return; + + hv_vcpu->nested.pa_page_gpa = hve->partition_assist_page; + hv_vcpu->nested.vm_id = hve->hv_vm_id; + hv_vcpu->nested.vp_id = hve->hv_vp_id; +} + +static inline bool nested_svm_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + if (!hv_vcpu) + return false; + + if (!hve->hv_enlightenments_control.nested_flush_hypercall) + return false; + + return hv_vcpu->vp_assist_page.nested_control.features.directhypercall; +} + +void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu); #endif /* __ARCH_X86_KVM_SVM_HYPERV_H__ */ diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 995bc0f90759..bc9cd7086fa9 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -25,6 +25,7 @@ #include "trace.h" #include "mmu.h" #include "x86.h" +#include "smm.h" #include "cpuid.h" #include "lapic.h" #include "svm.h" @@ -149,8 +150,12 @@ void recalc_intercepts(struct vcpu_svm *svm) vmcb_clr_intercept(c, INTERCEPT_VINTR); } - /* We don't want to see VMMCALLs from a nested guest */ - vmcb_clr_intercept(c, INTERCEPT_VMMCALL); + /* + * We want to see VMMCALLs from a nested guest only when Hyper-V L2 TLB + * flush feature is enabled. + */ + if (!nested_svm_l2_tlb_flush_enabled(&svm->vcpu)) + vmcb_clr_intercept(c, INTERCEPT_VMMCALL); for (i = 0; i < MAX_INTERCEPT; i++) c->intercepts[i] |= g->intercepts[i]; @@ -179,8 +184,7 @@ void recalc_intercepts(struct vcpu_svm *svm) */ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { - struct hv_enlightenments *hve = - (struct hv_enlightenments *)svm->nested.ctl.reserved_sw; + struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments; int i; /* @@ -194,7 +198,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) if (!svm->nested.force_msr_bitmap_recalc && kvm_hv_hypercall_enabled(&svm->vcpu) && hve->hv_enlightenments_control.msr_bitmap && - (svm->nested.ctl.clean & BIT(VMCB_HV_NESTED_ENLIGHTENMENTS))) + (svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS))) goto set_msrpm_base_pa; if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) @@ -369,8 +373,8 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, /* Hyper-V extensions (Enlightened VMCB) */ if (kvm_hv_hypercall_enabled(vcpu)) { to->clean = from->clean; - memcpy(to->reserved_sw, from->reserved_sw, - sizeof(struct hv_enlightenments)); + memcpy(&to->hv_enlightenments, &from->hv_enlightenments, + sizeof(to->hv_enlightenments)); } } @@ -474,6 +478,15 @@ static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm, static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu) { /* + * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or + * L2's VP_ID upon request from the guest. Make sure we check for + * pending entries in the right FIFO upon L1/L2 transition as these + * requests are put by other vCPUs asynchronously. + */ + if (to_hv_vcpu(vcpu) && npt_enabled) + kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu); + + /* * TODO: optimize unconditional TLB flush/MMU sync. A partial list of * things to fix before this can be conditional: * @@ -800,6 +813,8 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, if (kvm_vcpu_apicv_active(vcpu)) kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); + nested_svm_hv_update_vm_vp_ids(vcpu); + return 0; } @@ -822,6 +837,13 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) return 1; } + /* This fails when VP assist page is enabled but the supplied GPA is bogus */ + ret = kvm_hv_verify_vp_assist(vcpu); + if (ret) { + kvm_inject_gp(vcpu, 0); + return ret; + } + vmcb12_gpa = svm->vmcb->save.rax; ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map); if (ret == -EINVAL) { @@ -1383,6 +1405,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) return 0; } +#ifdef CONFIG_KVM_SMM if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) { if (block_nested_events) return -EBUSY; @@ -1391,6 +1414,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) nested_svm_simple_vmexit(svm, SVM_EXIT_SMI); return 0; } +#endif if (vcpu->arch.nmi_pending && !svm_nmi_blocked(vcpu)) { if (block_nested_events) @@ -1417,6 +1441,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) int nested_svm_exit_special(struct vcpu_svm *svm) { u32 exit_code = svm->vmcb->control.exit_code; + struct kvm_vcpu *vcpu = &svm->vcpu; switch (exit_code) { case SVM_EXIT_INTR: @@ -1435,6 +1460,13 @@ int nested_svm_exit_special(struct vcpu_svm *svm) return NESTED_EXIT_HOST; break; } + case SVM_EXIT_VMMCALL: + /* Hyper-V L2 TLB flush hypercall is handled by L0 */ + if (guest_hv_cpuid_has_l2_tlb_flush(vcpu) && + nested_svm_l2_tlb_flush_enabled(vcpu) && + kvm_hv_is_tlb_flush_hcall(vcpu)) + return NESTED_EXIT_HOST; + break; default: break; } @@ -1485,7 +1517,7 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst, dst->virt_ext = from->virt_ext; dst->pause_filter_count = from->pause_filter_count; dst->pause_filter_thresh = from->pause_filter_thresh; - /* 'clean' and 'reserved_sw' are not changed by KVM */ + /* 'clean' and 'hv_enlightenments' are not changed by KVM */ } static int svm_get_nested_state(struct kvm_vcpu *vcpu, @@ -1715,6 +1747,9 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) return false; } + if (kvm_hv_verify_vp_assist(vcpu)) + return false; + return true; } @@ -1726,4 +1761,5 @@ struct kvm_x86_nested_ops svm_nested_ops = { .get_nested_state_pages = svm_get_nested_state_pages, .get_state = svm_get_nested_state, .set_state = svm_set_nested_state, + .hv_inject_synthetic_vmexit_post_tlb_flush = svm_hv_inject_synthetic_vmexit_post_tlb_flush, }; diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 9d65cd095691..0e313fbae055 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -159,7 +159,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) data &= ~pmu->reserved_bits; if (data != pmc->eventsel) { pmc->eventsel = data; - reprogram_counter(pmc); + kvm_pmu_request_counter_reprogam(pmc); } return 0; } @@ -212,7 +212,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) struct kvm_pmc *pmc = &pmu->gp_counters[i]; pmc_stop_counter(pmc); - pmc->counter = pmc->eventsel = 0; + pmc->counter = pmc->prev_counter = pmc->eventsel = 0; } } diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index efaaef2b7ae1..86d6897f4806 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -465,9 +465,9 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages) return; for (i = 0; i < npages; i++) { - page_virtual = kmap_atomic(pages[i]); + page_virtual = kmap_local_page(pages[i]); clflush_cache_range(page_virtual, PAGE_SIZE); - kunmap_atomic(page_virtual); + kunmap_local(page_virtual); cond_resched(); } } @@ -2648,7 +2648,7 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) ghcb_scratch_beg = control->ghcb_gpa + offsetof(struct ghcb, shared_buffer); ghcb_scratch_end = control->ghcb_gpa + - offsetof(struct ghcb, reserved_1); + offsetof(struct ghcb, reserved_0xff0); /* * If the scratch area begins within the GHCB, it must be diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ce362e88a567..9a194aa1a75a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -6,6 +6,7 @@ #include "mmu.h" #include "kvm_cache_regs.h" #include "x86.h" +#include "smm.h" #include "cpuid.h" #include "pmu.h" @@ -2708,8 +2709,6 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr) if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; break; - case MSR_IA32_PERF_CAPABILITIES: - return 0; default: return KVM_MSR_RET_INVALID; } @@ -3724,6 +3723,13 @@ static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); /* + * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries. + * A TLB flush for the current ASID flushes both "host" and "guest" TLB + * entries, and thus is a superset of Hyper-V's fine grained flushing. + */ + kvm_hv_vcpu_purge_flush_tlb(vcpu); + + /* * Flush only the current ASID even if the TLB flush was invoked via * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and @@ -3889,8 +3895,14 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { - if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && - to_svm(vcpu)->vmcb->control.exit_info_1) + struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; + + /* + * Note, the next RIP must be provided as SRCU isn't held, i.e. KVM + * can't read guest memory (dereference memslots) to decode the WRMSR. + */ + if (control->exit_code == SVM_EXIT_MSR && control->exit_info_1 && + nrips && control->next_rip) return handle_fastpath_set_msr_irqoff(vcpu); return EXIT_FASTPATH_NONE; @@ -4102,6 +4114,8 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: return false; case MSR_IA32_SMBASE: + if (!IS_ENABLED(CONFIG_KVM_SMM)) + return false; /* SEV-ES guests do not support SMM, so report false */ if (kvm && sev_es_guest(kvm)) return false; @@ -4358,6 +4372,7 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu) vcpu->arch.mcg_cap &= 0x1ff; } +#ifdef CONFIG_KVM_SMM bool svm_smi_blocked(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4385,7 +4400,7 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) return 1; } -static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) +static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_host_map map_save; @@ -4394,10 +4409,16 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) if (!is_guest_mode(vcpu)) return 0; - /* FED8h - SVM Guest */ - put_smstate(u64, smstate, 0x7ed8, 1); - /* FEE0h - SVM Guest VMCB Physical Address */ - put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa); + /* + * 32-bit SMRAM format doesn't preserve EFER and SVM state. Userspace is + * responsible for ensuring nested SVM and SMIs are mutually exclusive. + */ + + if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) + return 1; + + smram->smram64.svm_guest_flag = 1; + smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa; svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; @@ -4419,8 +4440,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) * that, see svm_prepare_switch_to_guest()) which must be * preserved. */ - if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), - &map_save) == -EINVAL) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save)) return 1; BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400); @@ -4432,34 +4452,33 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) return 0; } -static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) +static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_host_map map, map_save; - u64 saved_efer, vmcb12_gpa; struct vmcb *vmcb12; int ret; + const struct kvm_smram_state_64 *smram64 = &smram->smram64; + if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) return 0; /* Non-zero if SMI arrived while vCPU was in guest mode. */ - if (!GET_SMSTATE(u64, smstate, 0x7ed8)) + if (!smram64->svm_guest_flag) return 0; if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) return 1; - saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0); - if (!(saved_efer & EFER_SVME)) + if (!(smram64->efer & EFER_SVME)) return 1; - vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0); - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map)) return 1; ret = 1; - if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save)) goto unmap_map; if (svm_allocate_nested(svm)) @@ -4481,7 +4500,7 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) vmcb12 = map.hva; nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); - ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false); + ret = enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, vmcb12, false); if (ret) goto unmap_save; @@ -4507,6 +4526,7 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu) /* We must be in SMM; RSM will cause a vmexit anyway. */ } } +#endif static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len) @@ -4782,10 +4802,12 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .pi_update_irte = avic_pi_update_irte, .setup_mce = svm_setup_mce, +#ifdef CONFIG_KVM_SMM .smi_allowed = svm_smi_allowed, .enter_smm = svm_enter_smm, .leave_smm = svm_leave_smm, .enable_smi_window = svm_enable_smi_window, +#endif .mem_enc_ioctl = sev_mem_enc_ioctl, .mem_enc_register_region = sev_mem_enc_register_region, @@ -4851,6 +4873,7 @@ static __init void svm_set_cpu_caps(void) { kvm_set_cpu_caps(); + kvm_caps.supported_perf_cap = 0; kvm_caps.supported_xss = 0; /* CPUID 0x80000001 and 0x8000000A (SVM features) */ diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 199a2ecef1ce..4826e6cc611b 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -151,7 +151,10 @@ struct vmcb_ctrl_area_cached { u64 nested_cr3; u64 virt_ext; u32 clean; - u8 reserved_sw[32]; + union { + struct hv_vmcb_enlightenments hv_enlightenments; + u8 reserved_sw[32]; + }; }; struct svm_nested_state { diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c index 8cdc62c74a96..26a89d0da93e 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.c +++ b/arch/x86/kvm/svm/svm_onhyperv.c @@ -14,9 +14,9 @@ #include "kvm_onhyperv.h" #include "svm_onhyperv.h" -int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) +int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) { - struct hv_enlightenments *hve; + struct hv_vmcb_enlightenments *hve; struct hv_partition_assist_pg **p_hv_pa_pg = &to_kvm_hv(vcpu->kvm)->hv_pa_pg; @@ -26,13 +26,13 @@ int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) if (!*p_hv_pa_pg) return -ENOMEM; - hve = (struct hv_enlightenments *)to_svm(vcpu)->vmcb->control.reserved_sw; + hve = &to_svm(vcpu)->vmcb->control.hv_enlightenments; hve->partition_assist_page = __pa(*p_hv_pa_pg); hve->hv_vm_id = (unsigned long)vcpu->kvm; if (!hve->hv_enlightenments_control.nested_flush_hypercall) { hve->hv_enlightenments_control.nested_flush_hypercall = 1; - vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS); + vmcb_mark_dirty(to_svm(vcpu)->vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS); } return 0; diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index e2fc59380465..45faf84476ce 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -13,12 +13,14 @@ static struct kvm_x86_ops svm_x86_ops; -int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu); +int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu); static inline void svm_hv_init_vmcb(struct vmcb *vmcb) { - struct hv_enlightenments *hve = - (struct hv_enlightenments *)vmcb->control.reserved_sw; + struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments; + + BUILD_BUG_ON(sizeof(vmcb->control.hv_enlightenments) != + sizeof(vmcb->control.reserved_sw)); if (npt_enabled && ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) @@ -51,8 +53,8 @@ static inline void svm_hv_hardware_setup(void) vp_ap->nested_control.features.directhypercall = 1; } - svm_x86_ops.enable_direct_tlbflush = - svm_hv_enable_direct_tlbflush; + svm_x86_ops.enable_l2_tlb_flush = + svm_hv_enable_l2_tlb_flush; } } @@ -60,23 +62,20 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments( struct kvm_vcpu *vcpu) { struct vmcb *vmcb = to_svm(vcpu)->vmcb; - struct hv_enlightenments *hve = - (struct hv_enlightenments *)vmcb->control.reserved_sw; + struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments; if (hve->hv_enlightenments_control.msr_bitmap) - vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS); + vmcb_mark_dirty(vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS); } -static inline void svm_hv_update_vp_id(struct vmcb *vmcb, - struct kvm_vcpu *vcpu) +static inline void svm_hv_update_vp_id(struct vmcb *vmcb, struct kvm_vcpu *vcpu) { - struct hv_enlightenments *hve = - (struct hv_enlightenments *)vmcb->control.reserved_sw; + struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments; u32 vp_index = kvm_hv_get_vpindex(vcpu); if (hve->hv_vp_id != vp_index) { hve->hv_vp_id = vp_index; - vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS); + vmcb_mark_dirty(vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS); } } #else diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index bc25589ad588..83843379813e 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -113,12 +113,13 @@ TRACE_EVENT(kvm_hv_hypercall_done, * Tracepoint for Xen hypercall. */ TRACE_EVENT(kvm_xen_hypercall, - TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1, - unsigned long a2, unsigned long a3, unsigned long a4, - unsigned long a5), - TP_ARGS(nr, a0, a1, a2, a3, a4, a5), + TP_PROTO(u8 cpl, unsigned long nr, + unsigned long a0, unsigned long a1, unsigned long a2, + unsigned long a3, unsigned long a4, unsigned long a5), + TP_ARGS(cpl, nr, a0, a1, a2, a3, a4, a5), TP_STRUCT__entry( + __field(u8, cpl) __field(unsigned long, nr) __field(unsigned long, a0) __field(unsigned long, a1) @@ -129,6 +130,7 @@ TRACE_EVENT(kvm_xen_hypercall, ), TP_fast_assign( + __entry->cpl = cpl; __entry->nr = nr; __entry->a0 = a0; __entry->a1 = a1; @@ -138,8 +140,9 @@ TRACE_EVENT(kvm_xen_hypercall, __entry->a4 = a5; ), - TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx a4 0x%lx a5 %lx", - __entry->nr, __entry->a0, __entry->a1, __entry->a2, + TP_printk("cpl %d nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx a4 0x%lx a5 %lx", + __entry->cpl, __entry->nr, + __entry->a0, __entry->a1, __entry->a2, __entry->a3, __entry->a4, __entry->a5) ); @@ -1547,38 +1550,41 @@ TRACE_EVENT(kvm_hv_timer_state, * Tracepoint for kvm_hv_flush_tlb. */ TRACE_EVENT(kvm_hv_flush_tlb, - TP_PROTO(u64 processor_mask, u64 address_space, u64 flags), - TP_ARGS(processor_mask, address_space, flags), + TP_PROTO(u64 processor_mask, u64 address_space, u64 flags, bool guest_mode), + TP_ARGS(processor_mask, address_space, flags, guest_mode), TP_STRUCT__entry( __field(u64, processor_mask) __field(u64, address_space) __field(u64, flags) + __field(bool, guest_mode) ), TP_fast_assign( __entry->processor_mask = processor_mask; __entry->address_space = address_space; __entry->flags = flags; + __entry->guest_mode = guest_mode; ), - TP_printk("processor_mask 0x%llx address_space 0x%llx flags 0x%llx", + TP_printk("processor_mask 0x%llx address_space 0x%llx flags 0x%llx %s", __entry->processor_mask, __entry->address_space, - __entry->flags) + __entry->flags, __entry->guest_mode ? "(L2)" : "") ); /* * Tracepoint for kvm_hv_flush_tlb_ex. */ TRACE_EVENT(kvm_hv_flush_tlb_ex, - TP_PROTO(u64 valid_bank_mask, u64 format, u64 address_space, u64 flags), - TP_ARGS(valid_bank_mask, format, address_space, flags), + TP_PROTO(u64 valid_bank_mask, u64 format, u64 address_space, u64 flags, bool guest_mode), + TP_ARGS(valid_bank_mask, format, address_space, flags, guest_mode), TP_STRUCT__entry( __field(u64, valid_bank_mask) __field(u64, format) __field(u64, address_space) __field(u64, flags) + __field(bool, guest_mode) ), TP_fast_assign( @@ -1586,12 +1592,14 @@ TRACE_EVENT(kvm_hv_flush_tlb_ex, __entry->format = format; __entry->address_space = address_space; __entry->flags = flags; + __entry->guest_mode = guest_mode; ), TP_printk("valid_bank_mask 0x%llx format 0x%llx " - "address_space 0x%llx flags 0x%llx", + "address_space 0x%llx flags 0x%llx %s", __entry->valid_bank_mask, __entry->format, - __entry->address_space, __entry->flags) + __entry->address_space, __entry->flags, + __entry->guest_mode ? "(L2)" : "") ); /* diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 07254314f3dd..cd2ac9536c99 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -395,30 +395,6 @@ static inline bool vmx_pebs_supported(void) return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept; } -static inline u64 vmx_get_perf_capabilities(void) -{ - u64 perf_cap = PMU_CAP_FW_WRITES; - struct x86_pmu_lbr lbr; - u64 host_perf_cap = 0; - - if (!enable_pmu) - return 0; - - if (boot_cpu_has(X86_FEATURE_PDCM)) - rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); - - if (x86_perf_get_lbr(&lbr) >= 0 && lbr.nr) - perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; - - if (vmx_pebs_supported()) { - perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; - if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4) - perf_cap &= ~PERF_CAP_PEBS_BASELINE; - } - - return perf_cap; -} - static inline bool cpu_has_notify_vmexit(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/hyperv.c index d8b23c96d627..ae03d1fe0355 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -3,9 +3,9 @@ #include <linux/errno.h> #include <linux/smp.h> -#include "../hyperv.h" #include "../cpuid.h" -#include "evmcs.h" +#include "hyperv.h" +#include "nested.h" #include "vmcs.h" #include "vmx.h" #include "trace.h" @@ -322,24 +322,17 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = { }; const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); -bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa) +u64 nested_get_evmptr(struct kvm_vcpu *vcpu) { - struct hv_vp_assist_page assist_page; - - *evmcs_gpa = -1ull; - - if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page))) - return false; - - if (unlikely(!assist_page.enlighten_vmentry)) - return false; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); - if (unlikely(!evmptr_is_valid(assist_page.current_nested_vmcs))) - return false; + if (unlikely(kvm_hv_get_assist_page(vcpu))) + return EVMPTR_INVALID; - *evmcs_gpa = assist_page.current_nested_vmcs; + if (unlikely(!hv_vcpu->vp_assist_page.enlighten_vmentry)) + return EVMPTR_INVALID; - return true; + return hv_vcpu->vp_assist_page.current_nested_vmcs; } uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu) @@ -507,3 +500,23 @@ int nested_enable_evmcs(struct kvm_vcpu *vcpu, return 0; } + +bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; + + if (!hv_vcpu || !evmcs) + return false; + + if (!evmcs->hv_enlightenments_control.nested_flush_hypercall) + return false; + + return hv_vcpu->vp_assist_page.nested_control.features.directhypercall; +} + +void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu) +{ + nested_vmx_vmexit(vcpu, HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH, 0, 0); +} diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/hyperv.h index 6f746ef3c038..571e7929d14e 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __KVM_X86_VMX_EVMCS_H -#define __KVM_X86_VMX_EVMCS_H +#ifndef __KVM_X86_VMX_HYPERV_H +#define __KVM_X86_VMX_HYPERV_H #include <linux/jump_label.h> @@ -8,6 +8,8 @@ #include <asm/mshyperv.h> #include <asm/vmx.h> +#include "../hyperv.h" + #include "capabilities.h" #include "vmcs.h" #include "vmcs12.h" @@ -235,11 +237,13 @@ enum nested_evmptrld_status { EVMPTRLD_ERROR, }; -bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa); +u64 nested_get_evmptr(struct kvm_vcpu *vcpu); uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu); int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); int nested_evmcs_check_controls(struct vmcs12 *vmcs12); +bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu); +void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu); -#endif /* __KVM_X86_VMX_EVMCS_H */ +#endif /* __KVM_X86_VMX_HYPERV_H */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 5b0d4859e4b7..b6f4411b613e 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -7,7 +7,6 @@ #include <asm/mmu_context.h> #include "cpuid.h" -#include "evmcs.h" #include "hyperv.h" #include "mmu.h" #include "nested.h" @@ -16,6 +15,7 @@ #include "trace.h" #include "vmx.h" #include "x86.h" +#include "smm.h" static bool __read_mostly enable_shadow_vmcs = 1; module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); @@ -225,6 +225,7 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) { + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { @@ -233,6 +234,12 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) } vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; + + if (hv_vcpu) { + hv_vcpu->nested.pa_page_gpa = INVALID_GPA; + hv_vcpu->nested.vm_id = 0; + hv_vcpu->nested.vp_id = 0; + } } static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, @@ -1126,6 +1133,15 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx = to_vmx(vcpu); /* + * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or + * L2's VP_ID upon request from the guest. Make sure we check for + * pending entries in the right FIFO upon L1/L2 transition as these + * requests are put by other vCPUs asynchronously. + */ + if (to_hv_vcpu(vcpu) && enable_ept) + kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu); + + /* * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a * full TLB flush from the guest's perspective. This is required even @@ -1557,12 +1573,20 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields { struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu); /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ vmcs12->tpr_threshold = evmcs->tpr_threshold; vmcs12->guest_rip = evmcs->guest_rip; if (unlikely(!(hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) { + hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page; + hv_vcpu->nested.vm_id = evmcs->hv_vm_id; + hv_vcpu->nested.vp_id = evmcs->hv_vp_id; + } + + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { vmcs12->guest_rsp = evmcs->guest_rsp; vmcs12->guest_rflags = evmcs->guest_rflags; @@ -1977,7 +2001,8 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( if (likely(!guest_cpuid_has_evmcs(vcpu))) return EVMPTRLD_DISABLED; - if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) { + evmcs_gpa = nested_get_evmptr(vcpu); + if (!evmptr_is_valid(evmcs_gpa)) { nested_release_evmcs(vcpu); return EVMPTRLD_DISABLED; } @@ -2563,12 +2588,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, nested_ept_init_mmu_context(vcpu); /* - * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those - * bits which we consider mandatory enabled. - * The CR0_READ_SHADOW is what L2 should have expected to read given - * the specifications by L1; It's not enough to take - * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we - * have more bits than L1 expected. + * Override the CR0/CR4 read shadows after setting the effective guest + * CR0/CR4. The common helpers also set the shadows, but they don't + * account for vmcs12's cr0/4_guest_host_mask. */ vmx_set_cr0(vcpu, vmcs12->guest_cr0); vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); @@ -3251,6 +3273,12 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) { + /* + * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy + * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory + * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post + * migration. + */ if (!nested_get_evmcs_page(vcpu)) { pr_debug_ratelimited("%s: enlightened vmptrld failed\n", __func__); @@ -4767,6 +4795,17 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx_switch_vmcs(vcpu, &vmx->vmcs01); + /* + * If IBRS is advertised to the vCPU, KVM must flush the indirect + * branch predictors when transitioning from L2 to L1, as L1 expects + * hardware (KVM in this case) to provide separate predictor modes. + * Bare metal isolates VMX root (host) from VMX non-root (guest), but + * doesn't isolate different VMCSs, i.e. in this case, doesn't provide + * separate modes for L2 vs L1. + */ + if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + indirect_branch_prediction_barrier(); + /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); @@ -5100,24 +5139,35 @@ static int handle_vmxon(struct kvm_vcpu *vcpu) | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; /* - * Note, KVM cannot rely on hardware to perform the CR0/CR4 #UD checks - * that have higher priority than VM-Exit (see Intel SDM's pseudocode - * for VMXON), as KVM must load valid CR0/CR4 values into hardware while - * running the guest, i.e. KVM needs to check the _guest_ values. + * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter + * the guest and so cannot rely on hardware to perform the check, + * which has higher priority than VM-Exit (see Intel SDM's pseudocode + * for VMXON). * - * Rely on hardware for the other two pre-VM-Exit checks, !VM86 and - * !COMPATIBILITY modes. KVM may run the guest in VM86 to emulate Real - * Mode, but KVM will never take the guest out of those modes. + * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 + * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't + * force any of the relevant guest state. For a restricted guest, KVM + * does force CR0.PE=1, but only to also force VM86 in order to emulate + * Real Mode, and so there's no need to check CR0.PE manually. */ - if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || - !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { + if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } /* - * CPL=0 and all other checks that are lower priority than VM-Exit must - * be checked manually. + * The CPL is checked for "not in VMX operation" and for "in VMX root", + * and has higher priority than the VM-Fail due to being post-VMXON, + * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root, + * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits + * from L2 to L1, i.e. there's no need to check for the vCPU being in + * VMX non-root. + * + * Forwarding the VM-Exit unconditionally, i.e. without performing the + * #UD checks (see above), is functionally ok because KVM doesn't allow + * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's + * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are + * missed by hardware due to shadowing CR0 and/or CR4. */ if (vmx_get_cpl(vcpu)) { kvm_inject_gp(vcpu, 0); @@ -5127,6 +5177,17 @@ static int handle_vmxon(struct kvm_vcpu *vcpu) if (vmx->nested.vmxon) return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); + /* + * Invalid CR0/CR4 generates #GP. These checks are performed if and + * only if the vCPU isn't already in VMX operation, i.e. effectively + * have lower priority than the VM-Fail above. + */ + if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || + !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) != VMXON_NEEDED_FEATURES) { kvm_inject_gp(vcpu, 0); @@ -5206,7 +5267,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 zero = 0; gpa_t vmptr; - u64 evmcs_gpa; int r; if (!nested_vmx_check_permission(vcpu)) @@ -5232,7 +5292,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) * vmx->nested.hv_evmcs but this shouldn't be a problem. */ if (likely(!guest_cpuid_has_evmcs(vcpu) || - !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) { + !evmptr_is_valid(nested_get_evmptr(vcpu)))) { if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vcpu); @@ -6129,6 +6189,11 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, * Handle L2's bus locks in L0 directly. */ return true; + case EXIT_REASON_VMCALL: + /* Hyper-V L2 TLB flush hypercall is handled by L0 */ + return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && + nested_evmcs_l2_tlb_flush_enabled(vcpu) && + kvm_hv_is_tlb_flush_hcall(vcpu); default: break; } @@ -6980,4 +7045,5 @@ struct kvm_x86_nested_ops vmx_nested_ops = { .write_log_dirty = nested_vmx_write_pml_buffer, .enable_evmcs = nested_enable_evmcs, .get_evmcs_version = nested_get_evmcs_version, + .hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush, }; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 6312c9541c3c..96952263b029 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -79,9 +79,10 @@ static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) } /* - * Return the cr0 value that a nested guest would read. This is a combination - * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by - * its hypervisor (cr0_read_shadow). + * Return the cr0/4 value that a nested guest would read. This is a combination + * of L1's "real" cr0 used to run the guest (guest_cr0), and the bits shadowed + * by the L1 hypervisor (cr0_read_shadow). KVM must emulate CPU behavior as + * the value+mask loaded into vmcs02 may not match the vmcs12 fields. */ static inline unsigned long nested_read_cr0(struct vmcs12 *fields) { diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 10b33da9bd05..e5cec07ca8d9 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -52,7 +52,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); - reprogram_counter(pmc); + kvm_pmu_request_counter_reprogam(pmc); } } @@ -76,7 +76,7 @@ static void reprogram_counters(struct kvm_pmu *pmu, u64 diff) for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) { pmc = intel_pmc_idx_to_pmc(pmu, bit); if (pmc) - reprogram_counter(pmc); + kvm_pmu_request_counter_reprogam(pmc); } } @@ -477,7 +477,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) reserved_bits ^= HSW_IN_TX_CHECKPOINTED; if (!(data & reserved_bits)) { pmc->eventsel = data; - reprogram_counter(pmc); + kvm_pmu_request_counter_reprogam(pmc); return 0; } } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) @@ -631,7 +631,6 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) pmu->fixed_counters[i].current_config = 0; } - vcpu->arch.perf_capabilities = vmx_get_perf_capabilities(); lbr_desc->records.nr = 0; lbr_desc->event = NULL; lbr_desc->msr_passthrough = false; @@ -647,14 +646,14 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu) pmc = &pmu->gp_counters[i]; pmc_stop_counter(pmc); - pmc->counter = pmc->eventsel = 0; + pmc->counter = pmc->prev_counter = pmc->eventsel = 0; } for (i = 0; i < KVM_PMC_MAX_FIXED; i++) { pmc = &pmu->fixed_counters[i]; pmc_stop_counter(pmc); - pmc->counter = 0; + pmc->counter = pmc->prev_counter = 0; } pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 8f95c7c01433..b12da2a6dec9 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -182,8 +182,10 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, /* Enforce CPUID restriction on max enclave size. */ max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 : sgx_12_0->edx; - if (size >= BIT_ULL(max_size_log2)) + if (size >= BIT_ULL(max_size_log2)) { kvm_inject_gp(vcpu, 0); + return 1; + } /* * sgx_virt_ecreate() returns: diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 746129ddd5ae..01936013428b 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -208,9 +208,8 @@ struct __packed vmcs12 { /* * For save/restore compatibility, the vmcs12 field offsets must not change. */ -#define CHECK_OFFSET(field, loc) \ - BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \ - "Offset of " #field " in struct vmcs12 has changed.") +#define CHECK_OFFSET(field, loc) \ + ASSERT_STRUCT_OFFSET(struct vmcs12, field, loc) static inline void vmx_check_vmcs12_offsets(void) { diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 0b5db4de4d09..766c6b3ef5ed 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -269,6 +269,7 @@ SYM_FUNC_END(__vmx_vcpu_run) .section .text, "ax" +#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT /** * vmread_error_trampoline - Trampoline from inline asm to vmread_error() * @field: VMCS field encoding that failed @@ -317,6 +318,7 @@ SYM_FUNC_START(vmread_error_trampoline) RET SYM_FUNC_END(vmread_error_trampoline) +#endif SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff) /* diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 63247c57c72c..fe5615fd8295 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -51,7 +51,6 @@ #include "capabilities.h" #include "cpuid.h" -#include "evmcs.h" #include "hyperv.h" #include "kvm_onhyperv.h" #include "irq.h" @@ -66,6 +65,7 @@ #include "vmcs12.h" #include "vmx.h" #include "x86.h" +#include "smm.h" MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -526,7 +526,7 @@ static unsigned long host_idt_base; static bool __read_mostly enlightened_vmcs = true; module_param(enlightened_vmcs, bool, 0444); -static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) +static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) { struct hv_enlightened_vmcs *evmcs; struct hv_partition_assist_pg **p_hv_pa_pg = @@ -858,7 +858,7 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) * to change it directly without causing a vmexit. In that case read * it after vmexit and store it in vmx->spec_ctrl. */ - if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))) + if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)) flags |= VMX_RUN_SAVE_SPEC_CTRL; return flags; @@ -1348,8 +1348,10 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, /* * No indirect branch prediction barrier needed when switching - * the active VMCS within a guest, e.g. on nested VM-Enter. - * The L1 VMM can protect itself with retpolines, IBPB or IBRS. + * the active VMCS within a vCPU, unless IBRS is advertised to + * the vCPU. To minimize the number of IBPBs executed, KVM + * performs IBPB on nested VM-Exit (a single nested transition + * may switch the active VMCS multiple times). */ if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)) indirect_branch_prediction_barrier(); @@ -1834,12 +1836,42 @@ bool nested_vmx_allowed(struct kvm_vcpu *vcpu) return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX); } -static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, - uint64_t val) +/* + * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of + * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain + * backwards compatibility even though KVM doesn't support emulating SMX. And + * because userspace set "VMX in SMX", the guest must also be allowed to set it, + * e.g. if the MSR is left unlocked and the guest does a RMW operation. + */ +#define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \ + FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \ + FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \ + FEAT_CTL_SGX_LC_ENABLED | \ + FEAT_CTL_SGX_ENABLED | \ + FEAT_CTL_LMCE_ENABLED) + +static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, + struct msr_data *msr) { - uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits; + uint64_t valid_bits; + + /* + * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are + * exposed to the guest. + */ + WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits & + ~KVM_SUPPORTED_FEATURE_CONTROL); + + if (!msr->host_initiated && + (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED)) + return false; + + if (msr->host_initiated) + valid_bits = KVM_SUPPORTED_FEATURE_CONTROL; + else + valid_bits = vmx->msr_ia32_feature_control_valid_bits; - return !(val & ~valid_bits); + return !(msr->data & ~valid_bits); } static int vmx_get_msr_feature(struct kvm_msr_entry *msr) @@ -1849,9 +1881,6 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) if (!nested) return 1; return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); - case MSR_IA32_PERF_CAPABILITIES: - msr->data = vmx_get_perf_capabilities(); - return 0; default: return KVM_MSR_RET_INVALID; } @@ -2029,7 +2058,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; - if ((vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) && + if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) && (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; @@ -2241,10 +2270,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.mcg_ext_ctl = data; break; case MSR_IA32_FEAT_CTL: - if (!vmx_feature_control_msr_valid(vcpu, data) || - (to_vmx(vcpu)->msr_ia32_feature_control & - FEAT_CTL_LOCKED && !msr_info->host_initiated)) + if (!is_vmx_feature_control_msr_valid(vmx, msr_info)) return 1; + vmx->msr_ia32_feature_control = data; if (msr_info->host_initiated && data == 0) vmx_leave_nested(vcpu); @@ -2342,14 +2370,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; if (data & PMU_CAP_LBR_FMT) { if ((data & PMU_CAP_LBR_FMT) != - (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)) + (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT)) return 1; if (!cpuid_model_is_consistent(vcpu)) return 1; } if (data & PERF_CAP_PEBS_FORMAT) { if ((data & PERF_CAP_PEBS_MASK) != - (vmx_get_perf_capabilities() & PERF_CAP_PEBS_MASK)) + (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK)) return 1; if (!guest_cpuid_has(vcpu, X86_FEATURE_DS)) return 1; @@ -6844,6 +6872,8 @@ static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) { switch (index) { case MSR_IA32_SMBASE: + if (!IS_ENABLED(CONFIG_KVM_SMM)) + return false; /* * We cannot do SMM unless we can run the guest in big * real mode. @@ -7669,6 +7699,31 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmx_update_exception_bitmap(vcpu); } +static u64 vmx_get_perf_capabilities(void) +{ + u64 perf_cap = PMU_CAP_FW_WRITES; + struct x86_pmu_lbr lbr; + u64 host_perf_cap = 0; + + if (!enable_pmu) + return 0; + + if (boot_cpu_has(X86_FEATURE_PDCM)) + rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); + + x86_perf_get_lbr(&lbr); + if (lbr.nr) + perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; + + if (vmx_pebs_supported()) { + perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; + if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4) + perf_cap &= ~PERF_CAP_PEBS_BASELINE; + } + + return perf_cap; +} + static __init void vmx_set_cpu_caps(void) { kvm_set_cpu_caps(); @@ -7691,6 +7746,7 @@ static __init void vmx_set_cpu_caps(void) if (!enable_pmu) kvm_cpu_cap_clear(X86_FEATURE_PDCM); + kvm_caps.supported_perf_cap = vmx_get_perf_capabilities(); if (!enable_sgx) { kvm_cpu_cap_clear(X86_FEATURE_SGX); @@ -7906,6 +7962,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu) ~FEAT_CTL_LMCE_ENABLED; } +#ifdef CONFIG_KVM_SMM static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { /* we need a nested vmexit to enter SMM, postpone if run is pending */ @@ -7914,7 +7971,7 @@ static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) return !is_smm(vcpu); } -static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate) +static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7935,7 +7992,7 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate) return 0; } -static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) +static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) { struct vcpu_vmx *vmx = to_vmx(vcpu); int ret; @@ -7960,6 +8017,7 @@ static void vmx_enable_smi_window(struct kvm_vcpu *vcpu) { /* RSM will cause a vmexit anyway. */ } +#endif static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) { @@ -8127,10 +8185,12 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .setup_mce = vmx_setup_mce, +#ifdef CONFIG_KVM_SMM .smi_allowed = vmx_smi_allowed, .enter_smm = vmx_enter_smm, .leave_smm = vmx_leave_smm, .enable_smi_window = vmx_enable_smi_window, +#endif .can_emulate_instruction = vmx_can_emulate_instruction, .apic_init_signal_blocked = vmx_apic_init_signal_blocked, @@ -8490,8 +8550,8 @@ static int __init vmx_init(void) } if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) - vmx_x86_ops.enable_direct_tlbflush - = hv_enable_direct_tlbflush; + vmx_x86_ops.enable_l2_tlb_flush + = hv_enable_l2_tlb_flush; } else { enlightened_vmcs = false; diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index ec268df83ed6..842dc898c972 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -6,19 +6,33 @@ #include <asm/vmx.h> -#include "evmcs.h" +#include "hyperv.h" #include "vmcs.h" #include "../x86.h" void vmread_error(unsigned long field, bool fault); -__attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field, - bool fault); void vmwrite_error(unsigned long field, unsigned long value); void vmclear_error(struct vmcs *vmcs, u64 phys_addr); void vmptrld_error(struct vmcs *vmcs, u64 phys_addr); void invvpid_error(unsigned long ext, u16 vpid, gva_t gva); void invept_error(unsigned long ext, u64 eptp, gpa_t gpa); +#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT +/* + * The VMREAD error trampoline _always_ uses the stack to pass parameters, even + * for 64-bit targets. Preserving all registers allows the VMREAD inline asm + * blob to avoid clobbering GPRs, which in turn allows the compiler to better + * optimize sequences of VMREADs. + * + * Declare the trampoline as an opaque label as it's not safe to call from C + * code; there is no way to tell the compiler to pass params on the stack for + * 64-bit targets. + * + * void vmread_error_trampoline(unsigned long field, bool fault); + */ +extern unsigned long vmread_error_trampoline; +#endif + static __always_inline void vmcs_check16(unsigned long field) { BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 69227f77b201..312aea1854ae 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -30,6 +30,7 @@ #include "hyperv.h" #include "lapic.h" #include "xen.h" +#include "smm.h" #include <linux/clocksource.h> #include <linux/interrupt.h> @@ -119,8 +120,6 @@ static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); -static void process_smi(struct kvm_vcpu *vcpu); -static void enter_smm(struct kvm_vcpu *vcpu); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); static void store_regs(struct kvm_vcpu *vcpu); static int sync_regs(struct kvm_vcpu *vcpu); @@ -464,7 +463,6 @@ u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) { return vcpu->arch.apic_base; } -EXPORT_SYMBOL_GPL(kvm_get_apic_base); enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu) { @@ -492,7 +490,6 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) kvm_recalculate_apic_map(vcpu->kvm); return 0; } -EXPORT_SYMBOL_GPL(kvm_set_apic_base); /* * Handle a fault on a hardware virtualization (VMX or SVM) instruction. @@ -783,7 +780,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code, fault->address); } -EXPORT_SYMBOL_GPL(kvm_inject_page_fault); void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) @@ -812,7 +808,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu) atomic_inc(&vcpu->arch.nmi_queued); kvm_make_request(KVM_REQ_NMI, vcpu); } -EXPORT_SYMBOL_GPL(kvm_inject_nmi); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { @@ -837,7 +832,6 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) kvm_queue_exception_e(vcpu, GP_VECTOR, 0); return false; } -EXPORT_SYMBOL_GPL(kvm_require_cpl); bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) { @@ -1654,6 +1648,9 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr) case MSR_IA32_ARCH_CAPABILITIES: msr->data = kvm_get_arch_capabilities(); break; + case MSR_IA32_PERF_CAPABILITIES: + msr->data = kvm_caps.supported_perf_cap; + break; case MSR_IA32_UCODE_REV: rdmsrl_safe(msr->index, &msr->data); break; @@ -2067,7 +2064,6 @@ int kvm_emulate_as_nop(struct kvm_vcpu *vcpu) { return kvm_skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_as_nop); int kvm_emulate_invd(struct kvm_vcpu *vcpu) { @@ -2315,13 +2311,11 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); /* we verify if the enable bit is set... */ - if (system_time & 1) { - kvm_gpc_activate(vcpu->kvm, &vcpu->arch.pv_time, vcpu, - KVM_HOST_USES_PFN, system_time & ~1ULL, + if (system_time & 1) + kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL, sizeof(struct pvclock_vcpu_time_info)); - } else { - kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time); - } + else + kvm_gpc_deactivate(&vcpu->arch.pv_time); return; } @@ -2513,7 +2507,6 @@ u64 kvm_scale_tsc(u64 tsc, u64 ratio) return _tsc; } -EXPORT_SYMBOL_GPL(kvm_scale_tsc); static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) { @@ -2972,6 +2965,22 @@ static void kvm_update_masterclock(struct kvm *kvm) kvm_end_pvclock_update(kvm); } +/* + * Use the kernel's tsc_khz directly if the TSC is constant, otherwise use KVM's + * per-CPU value (which may be zero if a CPU is going offline). Note, tsc_khz + * can change during boot even if the TSC is constant, as it's possible for KVM + * to be loaded before TSC calibration completes. Ideally, KVM would get a + * notification when calibration completes, but practically speaking calibration + * will complete before userspace is alive enough to create VMs. + */ +static unsigned long get_cpu_tsc_khz(void) +{ + if (static_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return tsc_khz; + else + return __this_cpu_read(cpu_tsc_khz); +} + /* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */ static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) { @@ -2982,7 +2991,8 @@ static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) get_cpu(); data->flags = 0; - if (ka->use_master_clock && __this_cpu_read(cpu_tsc_khz)) { + if (ka->use_master_clock && + (static_cpu_has(X86_FEATURE_CONSTANT_TSC) || __this_cpu_read(cpu_tsc_khz))) { #ifdef CONFIG_X86_64 struct timespec64 ts; @@ -2996,7 +3006,7 @@ static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) data->flags |= KVM_CLOCK_TSC_STABLE; hv_clock.tsc_timestamp = ka->master_cycle_now; hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; - kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, + kvm_get_time_scale(NSEC_PER_SEC, get_cpu_tsc_khz() * 1000LL, &hv_clock.tsc_shift, &hv_clock.tsc_to_system_mul); data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc); @@ -3035,12 +3045,10 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, unsigned long flags; read_lock_irqsave(&gpc->lock, flags); - while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, - offset + sizeof(*guest_hv_clock))) { + while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) { read_unlock_irqrestore(&gpc->lock, flags); - if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, - offset + sizeof(*guest_hv_clock))) + if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock))) return; read_lock_irqsave(&gpc->lock, flags); @@ -3106,7 +3114,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz); + tgt_tsc_khz = get_cpu_tsc_khz(); if (unlikely(tgt_tsc_khz == 0)) { local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); @@ -3389,7 +3397,7 @@ static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data) static void kvmclock_reset(struct kvm_vcpu *vcpu) { - kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time); + kvm_gpc_deactivate(&vcpu->arch.pv_time); vcpu->arch.time = 0; } @@ -3397,6 +3405,9 @@ static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; static_call(kvm_x86_flush_tlb_all)(vcpu); + + /* Flushing all ASIDs flushes the current ASID... */ + kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) @@ -3415,6 +3426,12 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) } static_call(kvm_x86_flush_tlb_guest)(vcpu); + + /* + * Flushing all "guest" TLB is always a superset of Hyper-V's fine + * grained flushing. + */ + kvm_hv_vcpu_purge_flush_tlb(vcpu); } @@ -3566,20 +3583,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.arch_capabilities = data; break; - case MSR_IA32_PERF_CAPABILITIES: { - struct kvm_msr_entry msr_ent = {.index = msr, .data = 0}; - + case MSR_IA32_PERF_CAPABILITIES: if (!msr_info->host_initiated) return 1; - if (kvm_get_msr_feature(&msr_ent)) - return 1; - if (data & ~msr_ent.data) + if (data & ~kvm_caps.supported_perf_cap) return 1; vcpu->arch.perf_capabilities = data; kvm_pmu_refresh(vcpu); return 0; - } case MSR_EFER: return set_efer(vcpu, msr_info); case MSR_K7_HWCR: @@ -3651,7 +3663,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; } case MSR_IA32_SMBASE: - if (!msr_info->host_initiated) + if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated) return 1; vcpu->arch.smbase = data; break; @@ -4067,7 +4079,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.ia32_misc_enable_msr; break; case MSR_IA32_SMBASE: - if (!msr_info->host_initiated) + if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated) return 1; msr_info->data = vcpu->arch.smbase; break; @@ -4425,7 +4437,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL | KVM_XEN_HVM_CONFIG_EVTCHN_SEND; if (sched_info_on()) - r |= KVM_XEN_HVM_CONFIG_RUNSTATE; + r |= KVM_XEN_HVM_CONFIG_RUNSTATE | + KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG; break; #endif case KVM_CAP_SYNC_REGS: @@ -4441,6 +4454,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r |= KVM_X86_DISABLE_EXITS_MWAIT; break; case KVM_CAP_X86_SMM: + if (!IS_ENABLED(CONFIG_KVM_SMM)) + break; + /* SMBASE is usually relocated above 1M on modern chipsets, * and SMM handlers might indeed rely on 4G segment limits, * so do not report SMM to be available if real mode is @@ -4481,7 +4497,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0; break; case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: - r = kvm_x86_ops.enable_direct_tlbflush != NULL; + r = kvm_x86_ops.enable_l2_tlb_flush != NULL; break; case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: r = kvm_x86_ops.nested_ops->enable_evmcs != NULL; @@ -4897,13 +4913,6 @@ static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) return 0; } -static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu) -{ - kvm_make_request(KVM_REQ_SMI, vcpu); - - return 0; -} - static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, struct kvm_tpr_access_ctl *tac) { @@ -5039,8 +5048,10 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, process_nmi(vcpu); +#ifdef CONFIG_KVM_SMM if (kvm_check_request(KVM_REQ_SMI, vcpu)) process_smi(vcpu); +#endif /* * KVM's ABI only allows for one exception to be migrated. Luckily, @@ -5068,16 +5079,15 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, ex->pending && ex->has_payload) kvm_deliver_exception_payload(vcpu, ex); + memset(events, 0, sizeof(*events)); + /* * The API doesn't provide the instruction length for software * exceptions, so don't report them. As long as the guest RIP * isn't advanced, we should expect to encounter the exception * again. */ - if (kvm_exception_is_soft(ex->vector)) { - events->exception.injected = 0; - events->exception.pending = 0; - } else { + if (!kvm_exception_is_soft(ex->vector)) { events->exception.injected = ex->injected; events->exception.pending = ex->pending; /* @@ -5097,20 +5107,20 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->interrupt.injected = vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft; events->interrupt.nr = vcpu->arch.interrupt.nr; - events->interrupt.soft = 0; events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); events->nmi.injected = vcpu->arch.nmi_injected; events->nmi.pending = vcpu->arch.nmi_pending != 0; events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu); - events->nmi.pad = 0; - events->sipi_vector = 0; /* never valid when reporting to user space */ + /* events->sipi_vector is never valid when reporting to user space */ +#ifdef CONFIG_KVM_SMM events->smi.smm = is_smm(vcpu); events->smi.pending = vcpu->arch.smi_pending; events->smi.smm_inside_nmi = !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK); +#endif events->smi.latched_init = kvm_lapic_latched_init(vcpu); events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING @@ -5122,12 +5132,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu); events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; } - - memset(&events->reserved, 0, sizeof(events->reserved)); } -static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm); - static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { @@ -5200,6 +5206,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.apic->sipi_vector = events->sipi_vector; if (events->flags & KVM_VCPUEVENT_VALID_SMM) { +#ifdef CONFIG_KVM_SMM if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) { kvm_leave_nested(vcpu); kvm_smm_changed(vcpu, events->smi.smm); @@ -5214,6 +5221,12 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; } +#else + if (events->smi.smm || events->smi.pending || + events->smi.smm_inside_nmi) + return -EINVAL; +#endif + if (lapic_in_kernel(vcpu)) { if (events->smi.latched_init) set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); @@ -5497,10 +5510,10 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, } return r; case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: - if (!kvm_x86_ops.enable_direct_tlbflush) + if (!kvm_x86_ops.enable_l2_tlb_flush) return -ENOTTY; - return static_call(kvm_x86_enable_direct_tlbflush)(vcpu); + return static_call(kvm_x86_enable_l2_tlb_flush)(vcpu); case KVM_CAP_HYPERV_ENFORCE_CPUID: return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]); @@ -5580,7 +5593,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SMI: { - r = kvm_vcpu_ioctl_smi(vcpu); + r = kvm_inject_smi(vcpu); break; } case KVM_SET_CPUID: { @@ -6239,9 +6252,7 @@ split_irqchip_unlock: break; case KVM_CAP_X86_USER_SPACE_MSR: r = -EINVAL; - if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL | - KVM_MSR_EXIT_REASON_UNKNOWN | - KVM_MSR_EXIT_REASON_FILTER)) + if (cap->args[0] & ~KVM_MSR_EXIT_REASON_VALID_MASK) break; kvm->arch.user_space_msr_mask = cap->args[0]; r = 0; @@ -6418,7 +6429,7 @@ static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, if (!user_range->nmsrs) return 0; - if (user_range->flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) + if (user_range->flags & ~KVM_MSR_FILTER_RANGE_VALID_MASK) return -EINVAL; if (!user_range->flags) @@ -6452,7 +6463,7 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, int r = 0; u32 i; - if (filter->flags & ~KVM_MSR_FILTER_DEFAULT_DENY) + if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK) return -EINVAL; for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) @@ -7125,8 +7136,8 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) return handled; } -static void kvm_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) +void kvm_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) { static_call(kvm_x86_set_segment)(vcpu, var, seg); } @@ -7162,16 +7173,6 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, } EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read); - gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception) -{ - struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - - u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; - access |= PFERR_FETCH_MASK; - return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); -} - gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { @@ -7284,15 +7285,6 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt, return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); } -static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt, - unsigned long addr, void *val, unsigned int bytes) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes); - - return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE; -} - static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, struct kvm_vcpu *vcpu, u64 access, struct x86_exception *exception) @@ -8084,26 +8076,6 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); } -static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, - u32 msr_index, u64 data) -{ - return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); -} - -static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - - return vcpu->arch.smbase; -} - -static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - - vcpu->arch.smbase = smbase; -} - static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc) { @@ -8178,18 +8150,13 @@ static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) return emul_to_vcpu(ctxt)->arch.hflags; } -static void emulator_exiting_smm(struct x86_emulate_ctxt *ctxt) +#ifndef CONFIG_KVM_SMM +static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) { - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - - kvm_smm_changed(vcpu, false); -} - -static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt, - const char *smstate) -{ - return static_call(kvm_x86_leave_smm)(emul_to_vcpu(ctxt), smstate); + WARN_ON_ONCE(1); + return X86EMUL_UNHANDLEABLE; } +#endif static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt) { @@ -8215,7 +8182,6 @@ static const struct x86_emulate_ops emulate_ops = { .write_gpr = emulator_write_gpr, .read_std = emulator_read_std, .write_std = emulator_write_std, - .read_phys = kvm_read_guest_phys_system, .fetch = kvm_fetch_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, @@ -8235,11 +8201,8 @@ static const struct x86_emulate_ops emulate_ops = { .cpl = emulator_get_cpl, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, - .get_smbase = emulator_get_smbase, - .set_smbase = emulator_set_smbase, .set_msr_with_filter = emulator_set_msr_with_filter, .get_msr_with_filter = emulator_get_msr_with_filter, - .set_msr = emulator_set_msr, .get_msr = emulator_get_msr, .check_pmc = emulator_check_pmc, .read_pmc = emulator_read_pmc, @@ -8254,7 +8217,6 @@ static const struct x86_emulate_ops emulate_ops = { .guest_has_rdpid = emulator_guest_has_rdpid, .set_nmi_mask = emulator_set_nmi_mask, .get_hflags = emulator_get_hflags, - .exiting_smm = emulator_exiting_smm, .leave_smm = emulator_leave_smm, .triple_fault = emulator_triple_fault, .set_xcr = emulator_set_xcr, @@ -8327,8 +8289,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK); - BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); - BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); ctxt->interruptibility = 0; ctxt->have_exception = false; @@ -8587,29 +8547,6 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, static int complete_emulated_mmio(struct kvm_vcpu *vcpu); static int complete_emulated_pio(struct kvm_vcpu *vcpu); -static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) -{ - trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm); - - if (entering_smm) { - vcpu->arch.hflags |= HF_SMM_MASK; - } else { - vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK); - - /* Process a latched INIT or SMI, if any. */ - kvm_make_request(KVM_REQ_EVENT, vcpu); - - /* - * Even if KVM_SET_SREGS2 loaded PDPTRs out of band, - * on SMM exit we still need to reload them from - * guest memory - */ - vcpu->arch.pdptrs_from_userspace = false; - } - - kvm_mmu_reset_context(vcpu); -} - static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, unsigned long *db) { @@ -8841,7 +8778,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, write_fault_to_spt, emulation_type)) return 1; - if (ctxt->have_exception) { + + if (ctxt->have_exception && + !(emulation_type & EMULTYPE_SKIP)) { /* * #UD should result in just EMULATION_FAILED, and trap-like * exception should not be encountered during decode. @@ -9105,9 +9044,11 @@ static void tsc_khz_changed(void *data) struct cpufreq_freqs *freq = data; unsigned long khz = 0; + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_CONSTANT_TSC)); + if (data) khz = freq->new; - else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + else khz = cpufreq_quick_get(raw_smp_processor_id()); if (!khz) khz = tsc_khz; @@ -9128,8 +9069,10 @@ static void kvm_hyperv_tsc_notifier(void) hyperv_stop_tsc_emulation(); /* TSC frequency always matches when on Hyper-V */ - for_each_present_cpu(cpu) - per_cpu(cpu_tsc_khz, cpu) = tsc_khz; + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + for_each_present_cpu(cpu) + per_cpu(cpu_tsc_khz, cpu) = tsc_khz; + } kvm_caps.max_guest_tsc_khz = tsc_khz; list_for_each_entry(kvm, &vm_list, vm_list) { @@ -9266,10 +9209,10 @@ static void kvm_timer_init(void) } cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); - } - cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online", - kvmclock_cpu_online, kvmclock_cpu_down_prep); + cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online", + kvmclock_cpu_online, kvmclock_cpu_down_prep); + } } #ifdef CONFIG_X86_64 @@ -9429,10 +9372,11 @@ void kvm_arch_exit(void) #endif kvm_lapic_exit(); - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); - cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE); + cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE); + } #ifdef CONFIG_X86_64 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); irq_work_sync(&pvclock_irq_work); @@ -9999,6 +9943,7 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, * in order to make progress and get back here for another iteration. * The kvm_x86_ops hooks communicate this by returning -EBUSY. */ +#ifdef CONFIG_KVM_SMM if (vcpu->arch.smi_pending) { r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY; if (r < 0) @@ -10011,6 +9956,7 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, } else static_call(kvm_x86_enable_smi_window)(vcpu); } +#endif if (vcpu->arch.nmi_pending) { r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY; @@ -10086,246 +10032,6 @@ static void process_nmi(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } -static u32 enter_smm_get_segment_flags(struct kvm_segment *seg) -{ - u32 flags = 0; - flags |= seg->g << 23; - flags |= seg->db << 22; - flags |= seg->l << 21; - flags |= seg->avl << 20; - flags |= seg->present << 15; - flags |= seg->dpl << 13; - flags |= seg->s << 12; - flags |= seg->type << 8; - return flags; -} - -static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n) -{ - struct kvm_segment seg; - int offset; - - kvm_get_segment(vcpu, &seg, n); - put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector); - - if (n < 3) - offset = 0x7f84 + n * 12; - else - offset = 0x7f2c + (n - 3) * 12; - - put_smstate(u32, buf, offset + 8, seg.base); - put_smstate(u32, buf, offset + 4, seg.limit); - put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg)); -} - -#ifdef CONFIG_X86_64 -static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) -{ - struct kvm_segment seg; - int offset; - u16 flags; - - kvm_get_segment(vcpu, &seg, n); - offset = 0x7e00 + n * 16; - - flags = enter_smm_get_segment_flags(&seg) >> 8; - put_smstate(u16, buf, offset, seg.selector); - put_smstate(u16, buf, offset + 2, flags); - put_smstate(u32, buf, offset + 4, seg.limit); - put_smstate(u64, buf, offset + 8, seg.base); -} -#endif - -static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf) -{ - struct desc_ptr dt; - struct kvm_segment seg; - unsigned long val; - int i; - - put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu)); - put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu)); - put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu)); - put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu)); - - for (i = 0; i < 8; i++) - put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read_raw(vcpu, i)); - - kvm_get_dr(vcpu, 6, &val); - put_smstate(u32, buf, 0x7fcc, (u32)val); - kvm_get_dr(vcpu, 7, &val); - put_smstate(u32, buf, 0x7fc8, (u32)val); - - kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); - put_smstate(u32, buf, 0x7fc4, seg.selector); - put_smstate(u32, buf, 0x7f64, seg.base); - put_smstate(u32, buf, 0x7f60, seg.limit); - put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg)); - - kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); - put_smstate(u32, buf, 0x7fc0, seg.selector); - put_smstate(u32, buf, 0x7f80, seg.base); - put_smstate(u32, buf, 0x7f7c, seg.limit); - put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg)); - - static_call(kvm_x86_get_gdt)(vcpu, &dt); - put_smstate(u32, buf, 0x7f74, dt.address); - put_smstate(u32, buf, 0x7f70, dt.size); - - static_call(kvm_x86_get_idt)(vcpu, &dt); - put_smstate(u32, buf, 0x7f58, dt.address); - put_smstate(u32, buf, 0x7f54, dt.size); - - for (i = 0; i < 6; i++) - enter_smm_save_seg_32(vcpu, buf, i); - - put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu)); - - /* revision id */ - put_smstate(u32, buf, 0x7efc, 0x00020000); - put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase); -} - -#ifdef CONFIG_X86_64 -static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) -{ - struct desc_ptr dt; - struct kvm_segment seg; - unsigned long val; - int i; - - for (i = 0; i < 16; i++) - put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read_raw(vcpu, i)); - - put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu)); - put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu)); - - kvm_get_dr(vcpu, 6, &val); - put_smstate(u64, buf, 0x7f68, val); - kvm_get_dr(vcpu, 7, &val); - put_smstate(u64, buf, 0x7f60, val); - - put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu)); - put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu)); - put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu)); - - put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase); - - /* revision id */ - put_smstate(u32, buf, 0x7efc, 0x00020064); - - put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer); - - kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); - put_smstate(u16, buf, 0x7e90, seg.selector); - put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8); - put_smstate(u32, buf, 0x7e94, seg.limit); - put_smstate(u64, buf, 0x7e98, seg.base); - - static_call(kvm_x86_get_idt)(vcpu, &dt); - put_smstate(u32, buf, 0x7e84, dt.size); - put_smstate(u64, buf, 0x7e88, dt.address); - - kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); - put_smstate(u16, buf, 0x7e70, seg.selector); - put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8); - put_smstate(u32, buf, 0x7e74, seg.limit); - put_smstate(u64, buf, 0x7e78, seg.base); - - static_call(kvm_x86_get_gdt)(vcpu, &dt); - put_smstate(u32, buf, 0x7e64, dt.size); - put_smstate(u64, buf, 0x7e68, dt.address); - - for (i = 0; i < 6; i++) - enter_smm_save_seg_64(vcpu, buf, i); -} -#endif - -static void enter_smm(struct kvm_vcpu *vcpu) -{ - struct kvm_segment cs, ds; - struct desc_ptr dt; - unsigned long cr0; - char buf[512]; - - memset(buf, 0, 512); -#ifdef CONFIG_X86_64 - if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) - enter_smm_save_state_64(vcpu, buf); - else -#endif - enter_smm_save_state_32(vcpu, buf); - - /* - * Give enter_smm() a chance to make ISA-specific changes to the vCPU - * state (e.g. leave guest mode) after we've saved the state into the - * SMM state-save area. - */ - static_call(kvm_x86_enter_smm)(vcpu, buf); - - kvm_smm_changed(vcpu, true); - kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); - - if (static_call(kvm_x86_get_nmi_mask)(vcpu)) - vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; - else - static_call(kvm_x86_set_nmi_mask)(vcpu, true); - - kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); - kvm_rip_write(vcpu, 0x8000); - - cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG); - static_call(kvm_x86_set_cr0)(vcpu, cr0); - vcpu->arch.cr0 = cr0; - - static_call(kvm_x86_set_cr4)(vcpu, 0); - - /* Undocumented: IDT limit is set to zero on entry to SMM. */ - dt.address = dt.size = 0; - static_call(kvm_x86_set_idt)(vcpu, &dt); - - kvm_set_dr(vcpu, 7, DR7_FIXED_1); - - cs.selector = (vcpu->arch.smbase >> 4) & 0xffff; - cs.base = vcpu->arch.smbase; - - ds.selector = 0; - ds.base = 0; - - cs.limit = ds.limit = 0xffffffff; - cs.type = ds.type = 0x3; - cs.dpl = ds.dpl = 0; - cs.db = ds.db = 0; - cs.s = ds.s = 1; - cs.l = ds.l = 0; - cs.g = ds.g = 1; - cs.avl = ds.avl = 0; - cs.present = ds.present = 1; - cs.unusable = ds.unusable = 0; - cs.padding = ds.padding = 0; - - kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); - kvm_set_segment(vcpu, &ds, VCPU_SREG_DS); - kvm_set_segment(vcpu, &ds, VCPU_SREG_ES); - kvm_set_segment(vcpu, &ds, VCPU_SREG_FS); - kvm_set_segment(vcpu, &ds, VCPU_SREG_GS); - kvm_set_segment(vcpu, &ds, VCPU_SREG_SS); - -#ifdef CONFIG_X86_64 - if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) - static_call(kvm_x86_set_efer)(vcpu, 0); -#endif - - kvm_update_cpuid_runtime(vcpu); - kvm_mmu_reset_context(vcpu); -} - -static void process_smi(struct kvm_vcpu *vcpu) -{ - vcpu->arch.smi_pending = true; - kvm_make_request(KVM_REQ_EVENT, vcpu); -} - void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, unsigned long *vcpu_bitmap) { @@ -10516,20 +10222,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; - /* Forbid vmenter if vcpu dirty ring is soft-full */ - if (unlikely(vcpu->kvm->dirty_ring_size && - kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) { - vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL; - trace_kvm_dirty_ring_exit(vcpu); - r = 0; - goto out; - } - if (kvm_request_pending(vcpu)) { if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) { r = -EIO; goto out; } + + if (kvm_dirty_ring_check_request(vcpu)) { + r = 0; + goto out; + } + if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) { r = 0; @@ -10553,14 +10256,27 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_mmu_sync_roots(vcpu); if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu)) kvm_mmu_load_pgd(vcpu); - if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) { + + /* + * Note, the order matters here, as flushing "all" TLB entries + * also flushes the "current" TLB entries, i.e. servicing the + * flush "all" will clear any request to flush "current". + */ + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) kvm_vcpu_flush_tlb_all(vcpu); - /* Flushing all ASIDs flushes the current ASID... */ - kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); - } kvm_service_local_tlb_flush_requests(vcpu); + /* + * Fall back to a "full" guest flush if Hyper-V's precise + * flushing fails. Note, Hyper-V's flushing is per-vCPU, but + * the flushes are considered "remote" and not "local" because + * the requests can be initiated from other vCPUs. + */ + if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu) && + kvm_hv_vcpu_flush_tlb(vcpu)) + kvm_vcpu_flush_tlb_guest(vcpu); + if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; @@ -10585,8 +10301,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) record_steal_time(vcpu); +#ifdef CONFIG_KVM_SMM if (kvm_check_request(KVM_REQ_SMI, vcpu)) process_smi(vcpu); +#endif if (kvm_check_request(KVM_REQ_NMI, vcpu)) process_nmi(vcpu); if (kvm_check_request(KVM_REQ_PMU, vcpu)) @@ -11834,7 +11552,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; - kvm_gpc_init(&vcpu->arch.pv_time); + kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN); if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -11900,6 +11618,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; kvm_async_pf_hash_reset(vcpu); + + vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap; kvm_pmu_init(vcpu); vcpu->arch.pending_external_vector = -1; @@ -12334,7 +12054,6 @@ bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) { return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; } -EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) { @@ -12909,10 +12628,12 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) static_call(kvm_x86_nmi_allowed)(vcpu, false))) return true; +#ifdef CONFIG_KVM_SMM if (kvm_test_request(KVM_REQ_SMI, vcpu) || (vcpu->arch.smi_pending && static_call(kvm_x86_smi_allowed)(vcpu, false))) return true; +#endif if (kvm_arch_interrupt_allowed(vcpu) && (kvm_cpu_has_interrupt(vcpu) || @@ -12953,7 +12674,9 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) return true; if (kvm_test_request(KVM_REQ_NMI, vcpu) || +#ifdef CONFIG_KVM_SMM kvm_test_request(KVM_REQ_SMI, vcpu) || +#endif kvm_test_request(KVM_REQ_EVENT, vcpu)) return true; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 829d3134c1eb..9de72586f406 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -27,6 +27,7 @@ struct kvm_caps { u64 supported_mce_cap; u64 supported_xcr0; u64 supported_xss; + u64 supported_perf_cap; }; void kvm_spurious_fault(void); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index f3098c0e386a..d7af40240248 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -42,13 +42,12 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) int idx = srcu_read_lock(&kvm->srcu); if (gfn == GPA_INVALID) { - kvm_gpc_deactivate(kvm, gpc); + kvm_gpc_deactivate(gpc); goto out; } do { - ret = kvm_gpc_activate(kvm, gpc, NULL, KVM_HOST_USES_PFN, gpa, - PAGE_SIZE); + ret = kvm_gpc_activate(gpc, gpa, PAGE_SIZE); if (ret) goto out; @@ -170,112 +169,45 @@ static void kvm_xen_init_timer(struct kvm_vcpu *vcpu) vcpu->arch.xen.timer.function = xen_timer_callback; } -static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) +static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) { struct kvm_vcpu_xen *vx = &v->arch.xen; - u64 now = get_kvmclock_ns(v->kvm); - u64 delta_ns = now - vx->runstate_entry_time; - u64 run_delay = current->sched_info.run_delay; - - if (unlikely(!vx->runstate_entry_time)) - vx->current_runstate = RUNSTATE_offline; - - /* - * Time waiting for the scheduler isn't "stolen" if the - * vCPU wasn't running anyway. - */ - if (vx->current_runstate == RUNSTATE_running) { - u64 steal_ns = run_delay - vx->last_steal; - - delta_ns -= steal_ns; - - vx->runstate_times[RUNSTATE_runnable] += steal_ns; - } - vx->last_steal = run_delay; - - vx->runstate_times[vx->current_runstate] += delta_ns; - vx->current_runstate = state; - vx->runstate_entry_time = now; -} - -void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) -{ - struct kvm_vcpu_xen *vx = &v->arch.xen; - struct gfn_to_pfn_cache *gpc = &vx->runstate_cache; - uint64_t *user_times; + struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache; + struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache; + size_t user_len, user_len1, user_len2; + struct vcpu_runstate_info rs; unsigned long flags; - size_t user_len; - int *user_state; - - kvm_xen_update_runstate(v, state); - - if (!vx->runstate_cache.active) - return; - - if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) - user_len = sizeof(struct vcpu_runstate_info); - else - user_len = sizeof(struct compat_vcpu_runstate_info); - - read_lock_irqsave(&gpc->lock, flags); - while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, - user_len)) { - read_unlock_irqrestore(&gpc->lock, flags); - - /* When invoked from kvm_sched_out() we cannot sleep */ - if (state == RUNSTATE_runnable) - return; - - if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, user_len)) - return; - - read_lock_irqsave(&gpc->lock, flags); - } + size_t times_ofs; + uint8_t *update_bit = NULL; + uint64_t entry_time; + uint64_t *rs_times; + int *rs_state; /* * The only difference between 32-bit and 64-bit versions of the - * runstate struct us the alignment of uint64_t in 32-bit, which + * runstate struct is the alignment of uint64_t in 32-bit, which * means that the 64-bit version has an additional 4 bytes of - * padding after the first field 'state'. - * - * So we use 'int __user *user_state' to point to the state field, - * and 'uint64_t __user *user_times' for runstate_entry_time. So - * the actual array of time[] in each state starts at user_times[1]. + * padding after the first field 'state'. Let's be really really + * paranoid about that, and matching it with our internal data + * structures that we memcpy into it... */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0); BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0); BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c); #ifdef CONFIG_X86_64 + /* + * The 64-bit structure has 4 bytes of padding before 'state_entry_time' + * so each subsequent field is shifted by 4, and it's 4 bytes longer. + */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) != offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4); BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) != offsetof(struct compat_vcpu_runstate_info, time) + 4); + BUILD_BUG_ON(sizeof(struct vcpu_runstate_info) != 0x2c + 4); #endif - - user_state = gpc->khva; - - if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) - user_times = gpc->khva + offsetof(struct vcpu_runstate_info, - state_entry_time); - else - user_times = gpc->khva + offsetof(struct compat_vcpu_runstate_info, - state_entry_time); - /* - * First write the updated state_entry_time at the appropriate - * location determined by 'offset'. - */ - BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) != - sizeof(user_times[0])); - BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) != - sizeof(user_times[0])); - - user_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE; - smp_wmb(); - - /* - * Next, write the new runstate. This is in the *same* place - * for 32-bit and 64-bit guests, asserted here for paranoia. + * The state field is in the same place at the start of both structs, + * and is the same size (int) as vx->current_runstate. */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != offsetof(struct compat_vcpu_runstate_info, state)); @@ -284,34 +216,238 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) != sizeof(vx->current_runstate)); - *user_state = vx->current_runstate; + /* + * The state_entry_time field is 64 bits in both versions, and the + * XEN_RUNSTATE_UPDATE flag is in the top bit, which given that x86 + * is little-endian means that it's in the last *byte* of the word. + * That detail is important later. + */ + BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) != + sizeof(uint64_t)); + BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) != + sizeof(uint64_t)); + BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80); /* - * Write the actual runstate times immediately after the - * runstate_entry_time. + * The time array is four 64-bit quantities in both versions, matching + * the vx->runstate_times and immediately following state_entry_time. */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) != - offsetof(struct vcpu_runstate_info, time) - sizeof(u64)); + offsetof(struct vcpu_runstate_info, time) - sizeof(uint64_t)); BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) != - offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64)); + offsetof(struct compat_vcpu_runstate_info, time) - sizeof(uint64_t)); BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != sizeof_field(struct compat_vcpu_runstate_info, time)); BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != sizeof(vx->runstate_times)); - memcpy(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times)); - smp_wmb(); + if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) { + user_len = sizeof(struct vcpu_runstate_info); + times_ofs = offsetof(struct vcpu_runstate_info, + state_entry_time); + } else { + user_len = sizeof(struct compat_vcpu_runstate_info); + times_ofs = offsetof(struct compat_vcpu_runstate_info, + state_entry_time); + } + + /* + * There are basically no alignment constraints. The guest can set it + * up so it crosses from one page to the next, and at arbitrary byte + * alignment (and the 32-bit ABI doesn't align the 64-bit integers + * anyway, even if the overall struct had been 64-bit aligned). + */ + if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) { + user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK); + user_len2 = user_len - user_len1; + } else { + user_len1 = user_len; + user_len2 = 0; + } + BUG_ON(user_len1 + user_len2 != user_len); + + retry: + /* + * Attempt to obtain the GPC lock on *both* (if there are two) + * gfn_to_pfn caches that cover the region. + */ + read_lock_irqsave(&gpc1->lock, flags); + while (!kvm_gpc_check(gpc1, user_len1)) { + read_unlock_irqrestore(&gpc1->lock, flags); + + /* When invoked from kvm_sched_out() we cannot sleep */ + if (atomic) + return; + + if (kvm_gpc_refresh(gpc1, user_len1)) + return; + + read_lock_irqsave(&gpc1->lock, flags); + } + + if (likely(!user_len2)) { + /* + * Set up three pointers directly to the runstate_info + * struct in the guest (via the GPC). + * + * • @rs_state → state field + * • @rs_times → state_entry_time field. + * • @update_bit → last byte of state_entry_time, which + * contains the XEN_RUNSTATE_UPDATE bit. + */ + rs_state = gpc1->khva; + rs_times = gpc1->khva + times_ofs; + if (v->kvm->arch.xen.runstate_update_flag) + update_bit = ((void *)(&rs_times[1])) - 1; + } else { + /* + * The guest's runstate_info is split across two pages and we + * need to hold and validate both GPCs simultaneously. We can + * declare a lock ordering GPC1 > GPC2 because nothing else + * takes them more than one at a time. + */ + read_lock(&gpc2->lock); + + if (!kvm_gpc_check(gpc2, user_len2)) { + read_unlock(&gpc2->lock); + read_unlock_irqrestore(&gpc1->lock, flags); + + /* When invoked from kvm_sched_out() we cannot sleep */ + if (atomic) + return; + + /* + * Use kvm_gpc_activate() here because if the runstate + * area was configured in 32-bit mode and only extends + * to the second page now because the guest changed to + * 64-bit mode, the second GPC won't have been set up. + */ + if (kvm_gpc_activate(gpc2, gpc1->gpa + user_len1, + user_len2)) + return; + + /* + * We dropped the lock on GPC1 so we have to go all the + * way back and revalidate that too. + */ + goto retry; + } + + /* + * In this case, the runstate_info struct will be assembled on + * the kernel stack (compat or not as appropriate) and will + * be copied to GPC1/GPC2 with a dual memcpy. Set up the three + * rs pointers accordingly. + */ + rs_times = &rs.state_entry_time; + + /* + * The rs_state pointer points to the start of what we'll + * copy to the guest, which in the case of a compat guest + * is the 32-bit field that the compiler thinks is padding. + */ + rs_state = ((void *)rs_times) - times_ofs; + + /* + * The update_bit is still directly in the guest memory, + * via one GPC or the other. + */ + if (v->kvm->arch.xen.runstate_update_flag) { + if (user_len1 >= times_ofs + sizeof(uint64_t)) + update_bit = gpc1->khva + times_ofs + + sizeof(uint64_t) - 1; + else + update_bit = gpc2->khva + times_ofs + + sizeof(uint64_t) - 1 - user_len1; + } + +#ifdef CONFIG_X86_64 + /* + * Don't leak kernel memory through the padding in the 64-bit + * version of the struct. + */ + memset(&rs, 0, offsetof(struct vcpu_runstate_info, state_entry_time)); +#endif + } + + /* + * First, set the XEN_RUNSTATE_UPDATE bit in the top bit of the + * state_entry_time field, directly in the guest. We need to set + * that (and write-barrier) before writing to the rest of the + * structure, and clear it last. Just as Xen does, we address the + * single *byte* in which it resides because it might be in a + * different cache line to the rest of the 64-bit word, due to + * the (lack of) alignment constraints. + */ + entry_time = vx->runstate_entry_time; + if (update_bit) { + entry_time |= XEN_RUNSTATE_UPDATE; + *update_bit = (vx->runstate_entry_time | XEN_RUNSTATE_UPDATE) >> 56; + smp_wmb(); + } /* - * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's - * runstate_entry_time field. + * Now assemble the actual structure, either on our kernel stack + * or directly in the guest according to how the rs_state and + * rs_times pointers were set up above. */ - user_times[0] &= ~XEN_RUNSTATE_UPDATE; + *rs_state = vx->current_runstate; + rs_times[0] = entry_time; + memcpy(rs_times + 1, vx->runstate_times, sizeof(vx->runstate_times)); + + /* For the split case, we have to then copy it to the guest. */ + if (user_len2) { + memcpy(gpc1->khva, rs_state, user_len1); + memcpy(gpc2->khva, ((void *)rs_state) + user_len1, user_len2); + } smp_wmb(); - read_unlock_irqrestore(&gpc->lock, flags); + /* Finally, clear the XEN_RUNSTATE_UPDATE bit. */ + if (update_bit) { + entry_time &= ~XEN_RUNSTATE_UPDATE; + *update_bit = entry_time >> 56; + smp_wmb(); + } - mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT); + if (user_len2) + read_unlock(&gpc2->lock); + + read_unlock_irqrestore(&gpc1->lock, flags); + + mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT); + if (user_len2) + mark_page_dirty_in_slot(v->kvm, gpc2->memslot, gpc2->gpa >> PAGE_SHIFT); +} + +void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) +{ + struct kvm_vcpu_xen *vx = &v->arch.xen; + u64 now = get_kvmclock_ns(v->kvm); + u64 delta_ns = now - vx->runstate_entry_time; + u64 run_delay = current->sched_info.run_delay; + + if (unlikely(!vx->runstate_entry_time)) + vx->current_runstate = RUNSTATE_offline; + + /* + * Time waiting for the scheduler isn't "stolen" if the + * vCPU wasn't running anyway. + */ + if (vx->current_runstate == RUNSTATE_running) { + u64 steal_ns = run_delay - vx->last_steal; + + delta_ns -= steal_ns; + + vx->runstate_times[RUNSTATE_runnable] += steal_ns; + } + vx->last_steal = run_delay; + + vx->runstate_times[vx->current_runstate] += delta_ns; + vx->current_runstate = state; + vx->runstate_entry_time = now; + + if (vx->runstate_cache.active) + kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable); } static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v) @@ -352,12 +488,10 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v) * little more honest about it. */ read_lock_irqsave(&gpc->lock, flags); - while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, - sizeof(struct vcpu_info))) { + while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { read_unlock_irqrestore(&gpc->lock, flags); - if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, - sizeof(struct vcpu_info))) + if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) return; read_lock_irqsave(&gpc->lock, flags); @@ -417,8 +551,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending)); read_lock_irqsave(&gpc->lock, flags); - while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, - sizeof(struct vcpu_info))) { + while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { read_unlock_irqrestore(&gpc->lock, flags); /* @@ -432,8 +565,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) if (in_atomic() || !task_is_running(current)) return 1; - if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, - sizeof(struct vcpu_info))) { + if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) { /* * If this failed, userspace has screwed up the * vcpu_info mapping. No interrupts for you. @@ -493,6 +625,17 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) r = 0; break; + case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG: + if (!sched_info_on()) { + r = -EOPNOTSUPP; + break; + } + mutex_lock(&kvm->lock); + kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag; + mutex_unlock(&kvm->lock); + r = 0; + break; + default: break; } @@ -530,6 +673,15 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) r = 0; break; + case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG: + if (!sched_info_on()) { + r = -EOPNOTSUPP; + break; + } + data->u.runstate_update_flag = kvm->arch.xen.runstate_update_flag; + r = 0; + break; + default: break; } @@ -554,15 +706,13 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) offsetof(struct compat_vcpu_info, time)); if (data->u.gpa == GPA_INVALID) { - kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); r = 0; break; } - r = kvm_gpc_activate(vcpu->kvm, - &vcpu->arch.xen.vcpu_info_cache, NULL, - KVM_HOST_USES_PFN, data->u.gpa, - sizeof(struct vcpu_info)); + r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache, + data->u.gpa, sizeof(struct vcpu_info)); if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -570,37 +720,65 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: if (data->u.gpa == GPA_INVALID) { - kvm_gpc_deactivate(vcpu->kvm, - &vcpu->arch.xen.vcpu_time_info_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache); r = 0; break; } - r = kvm_gpc_activate(vcpu->kvm, - &vcpu->arch.xen.vcpu_time_info_cache, - NULL, KVM_HOST_USES_PFN, data->u.gpa, + r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_time_info_cache, + data->u.gpa, sizeof(struct pvclock_vcpu_time_info)); if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); break; - case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: + case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: { + size_t sz, sz1, sz2; + if (!sched_info_on()) { r = -EOPNOTSUPP; break; } if (data->u.gpa == GPA_INVALID) { - kvm_gpc_deactivate(vcpu->kvm, - &vcpu->arch.xen.runstate_cache); r = 0; + deactivate_out: + kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache); break; } - r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache, - NULL, KVM_HOST_USES_PFN, data->u.gpa, - sizeof(struct vcpu_runstate_info)); - break; + /* + * If the guest switches to 64-bit mode after setting the runstate + * address, that's actually OK. kvm_xen_update_runstate_guest() + * will cope. + */ + if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode) + sz = sizeof(struct vcpu_runstate_info); + else + sz = sizeof(struct compat_vcpu_runstate_info); + + /* How much fits in the (first) page? */ + sz1 = PAGE_SIZE - (data->u.gpa & ~PAGE_MASK); + r = kvm_gpc_activate(&vcpu->arch.xen.runstate_cache, + data->u.gpa, sz1); + if (r) + goto deactivate_out; + + /* Either map the second page, or deactivate the second GPC */ + if (sz1 >= sz) { + kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache); + } else { + sz2 = sz - sz1; + BUG_ON((data->u.gpa + sz1) & ~PAGE_MASK); + r = kvm_gpc_activate(&vcpu->arch.xen.runstate2_cache, + data->u.gpa + sz1, sz2); + if (r) + goto deactivate_out; + } + kvm_xen_update_runstate_guest(vcpu, false); + break; + } case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT: if (!sched_info_on()) { r = -EOPNOTSUPP; @@ -693,6 +871,8 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) if (data->u.runstate.state <= RUNSTATE_offline) kvm_xen_update_runstate(vcpu, data->u.runstate.state); + else if (vcpu->arch.xen.runstate_cache.active) + kvm_xen_update_runstate_guest(vcpu, false); r = 0; break; @@ -972,9 +1152,9 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, bool ret = true; int idx, i; - read_lock_irqsave(&gpc->lock, flags); idx = srcu_read_lock(&kvm->srcu); - if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) + read_lock_irqsave(&gpc->lock, flags); + if (!kvm_gpc_check(gpc, PAGE_SIZE)) goto out_rcu; ret = false; @@ -994,8 +1174,8 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, } out_rcu: - srcu_read_unlock(&kvm->srcu, idx); read_unlock_irqrestore(&gpc->lock, flags); + srcu_read_unlock(&kvm->srcu, idx); return ret; } @@ -1008,20 +1188,45 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, evtchn_port_t port, *ports; gpa_t gpa; - if (!longmode || !lapic_in_kernel(vcpu) || + if (!lapic_in_kernel(vcpu) || !(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND)) return false; idx = srcu_read_lock(&vcpu->kvm->srcu); gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL); srcu_read_unlock(&vcpu->kvm->srcu, idx); - - if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &sched_poll, - sizeof(sched_poll))) { + if (!gpa) { *r = -EFAULT; return true; } + if (IS_ENABLED(CONFIG_64BIT) && !longmode) { + struct compat_sched_poll sp32; + + /* Sanity check that the compat struct definition is correct */ + BUILD_BUG_ON(sizeof(sp32) != 16); + + if (kvm_vcpu_read_guest(vcpu, gpa, &sp32, sizeof(sp32))) { + *r = -EFAULT; + return true; + } + + /* + * This is a 32-bit pointer to an array of evtchn_port_t which + * are uint32_t, so once it's converted no further compat + * handling is needed. + */ + sched_poll.ports = (void *)(unsigned long)(sp32.ports); + sched_poll.nr_ports = sp32.nr_ports; + sched_poll.timeout = sp32.timeout; + } else { + if (kvm_vcpu_read_guest(vcpu, gpa, &sched_poll, + sizeof(sched_poll))) { + *r = -EFAULT; + return true; + } + } + if (unlikely(sched_poll.nr_ports > 1)) { /* Xen (unofficially) limits number of pollers to 128 */ if (sched_poll.nr_ports > 128) { @@ -1256,7 +1461,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) } #endif cpl = static_call(kvm_x86_get_cpl)(vcpu); - trace_kvm_xen_hypercall(input, params[0], params[1], params[2], + trace_kvm_xen_hypercall(cpl, input, params[0], params[1], params[2], params[3], params[4], params[5]); /* @@ -1371,7 +1576,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) idx = srcu_read_lock(&kvm->srcu); read_lock_irqsave(&gpc->lock, flags); - if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) + if (!kvm_gpc_check(gpc, PAGE_SIZE)) goto out_rcu; if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { @@ -1405,7 +1610,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) gpc = &vcpu->arch.xen.vcpu_info_cache; read_lock_irqsave(&gpc->lock, flags); - if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, sizeof(struct vcpu_info))) { + if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { /* * Could not access the vcpu_info. Set the bit in-kernel * and prod the vCPU to deliver it for itself. @@ -1503,7 +1708,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm) break; idx = srcu_read_lock(&kvm->srcu); - rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, PAGE_SIZE); + rc = kvm_gpc_refresh(gpc, PAGE_SIZE); srcu_read_unlock(&kvm->srcu, idx); } while(!rc); @@ -1833,9 +2038,14 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0); - kvm_gpc_init(&vcpu->arch.xen.runstate_cache); - kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache); - kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache); + kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm, NULL, + KVM_HOST_USES_PFN); + kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm, NULL, + KVM_HOST_USES_PFN); + kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm, NULL, + KVM_HOST_USES_PFN); + kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm, NULL, + KVM_HOST_USES_PFN); } void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) @@ -1843,9 +2053,10 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) if (kvm_xen_timer_enabled(vcpu)) kvm_xen_stop_timer(vcpu); - kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate_cache); - kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); - kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache); del_timer_sync(&vcpu->arch.xen.poll_timer); } @@ -1853,7 +2064,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) void kvm_xen_init_vm(struct kvm *kvm) { idr_init(&kvm->arch.xen.evtchn_ports); - kvm_gpc_init(&kvm->arch.xen.shinfo_cache); + kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm, NULL, KVM_HOST_USES_PFN); } void kvm_xen_destroy_vm(struct kvm *kvm) @@ -1861,7 +2072,7 @@ void kvm_xen_destroy_vm(struct kvm *kvm) struct evtchnfd *evtchnfd; int i; - kvm_gpc_deactivate(kvm, &kvm->arch.xen.shinfo_cache); + kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) { if (!evtchnfd->deliver.port.port) diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index 532a535a9e99..ea33d80a0c51 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -143,11 +143,11 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu); #include <asm/xen/interface.h> #include <xen/interface/vcpu.h> -void kvm_xen_update_runstate_guest(struct kvm_vcpu *vcpu, int state); +void kvm_xen_update_runstate(struct kvm_vcpu *vcpu, int state); static inline void kvm_xen_runstate_set_running(struct kvm_vcpu *vcpu) { - kvm_xen_update_runstate_guest(vcpu, RUNSTATE_running); + kvm_xen_update_runstate(vcpu, RUNSTATE_running); } static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu) @@ -162,7 +162,7 @@ static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu) if (WARN_ON_ONCE(!vcpu->preempted)) return; - kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable); + kvm_xen_update_runstate(vcpu, RUNSTATE_runnable); } /* 32-bit compatibility definitions, also used natively in 32-bit build */ @@ -207,4 +207,11 @@ struct compat_vcpu_runstate_info { uint64_t time[4]; } __attribute__((packed)); +struct compat_sched_poll { + /* This is actually a guest virtual address which points to ports. */ + uint32_t ports; + unsigned int nr_ports; + uint64_t timeout; +}; + #endif /* __ARCH_X86_KVM_XEN_H__ */ |