diff options
52 files changed, 1302 insertions, 680 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 4d0542c5206b..a4482cce4bae 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -199,8 +199,8 @@ Type: vm ioctl Parameters: vcpu id (apic id on x86) Returns: vcpu fd on success, -1 on error -This API adds a vcpu to a virtual machine. The vcpu id is a small integer -in the range [0, max_vcpus). +This API adds a vcpu to a virtual machine. No more than max_vcpus may be added. +The vcpu id is an integer in the range [0, max_vcpu_id). The recommended max_vcpus value can be retrieved using the KVM_CAP_NR_VCPUS of the KVM_CHECK_EXTENSION ioctl() at run-time. @@ -212,6 +212,12 @@ cpus max. If the KVM_CAP_MAX_VCPUS does not exist, you should assume that max_vcpus is same as the value returned from KVM_CAP_NR_VCPUS. +The maximum possible value for max_vcpu_id can be retrieved using the +KVM_CAP_MAX_VCPU_ID of the KVM_CHECK_EXTENSION ioctl() at run-time. + +If the KVM_CAP_MAX_VCPU_ID does not exist, you should assume that max_vcpu_id +is the same as the value returned from KVM_CAP_MAX_VCPUS. + On powerpc using book3s_hv mode, the vcpus are mapped onto virtual threads in one or more virtual CPU cores. (This is because the hardware requires all the hardware threads in a CPU core to be in the @@ -3788,6 +3794,14 @@ a KVM_EXIT_IOAPIC_EOI vmexit will be reported to userspace. Fails if VCPU has already been created, or if the irqchip is already in the kernel (i.e. KVM_CREATE_IRQCHIP has already been called). +7.6 KVM_CAP_S390_RI + +Architectures: s390 +Parameters: none + +Allows use of runtime-instrumentation introduced with zEC12 processor. +Will return -EINVAL if the machine does not support runtime-instrumentation. +Will return -EBUSY if a VCPU has already been created. 8. Other capabilities. ---------------------- diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt index 8478de1c0192..6b0e115301c8 100644 --- a/Documentation/virtual/kvm/devices/s390_flic.txt +++ b/Documentation/virtual/kvm/devices/s390_flic.txt @@ -74,7 +74,7 @@ struct kvm_s390_io_adapter { KVM_DEV_FLIC_ADAPTER_MODIFY Modifies attributes of an existing I/O adapter interrupt source. Takes - a kvm_s390_io_adapter_req specifiying the adapter and the operation: + a kvm_s390_io_adapter_req specifying the adapter and the operation: struct kvm_s390_io_adapter_req { __u32 id; diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index da44be9db4fa..ef0b276d97fc 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -47,6 +47,7 @@ #include <linux/highmem.h> #include <asm/cacheflush.h> #include <asm/pgalloc.h> +#include <asm/stage2_pgtable.h> int create_hyp_mappings(void *from, void *to); int create_hyp_io_mappings(void *from, void *to, phys_addr_t); @@ -105,14 +106,16 @@ static inline void kvm_clean_pte(pte_t *pte) clean_pte_table(pte); } -static inline void kvm_set_s2pte_writable(pte_t *pte) +static inline pte_t kvm_s2pte_mkwrite(pte_t pte) { - pte_val(*pte) |= L_PTE_S2_RDWR; + pte_val(pte) |= L_PTE_S2_RDWR; + return pte; } -static inline void kvm_set_s2pmd_writable(pmd_t *pmd) +static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd) { - pmd_val(*pmd) |= L_PMD_S2_RDWR; + pmd_val(pmd) |= L_PMD_S2_RDWR; + return pmd; } static inline void kvm_set_s2pte_readonly(pte_t *pte) @@ -135,22 +138,6 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd) return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY; } - -/* Open coded p*d_addr_end that can deal with 64bit addresses */ -#define kvm_pgd_addr_end(addr, end) \ -({ u64 __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \ - (__boundary - 1 < (end) - 1)? __boundary: (end); \ -}) - -#define kvm_pud_addr_end(addr,end) (end) - -#define kvm_pmd_addr_end(addr, end) \ -({ u64 __boundary = ((addr) + PMD_SIZE) & PMD_MASK; \ - (__boundary - 1 < (end) - 1)? __boundary: (end); \ -}) - -#define kvm_pgd_index(addr) pgd_index(addr) - static inline bool kvm_page_empty(void *ptr) { struct page *ptr_page = virt_to_page(ptr); @@ -159,19 +146,11 @@ static inline bool kvm_page_empty(void *ptr) #define kvm_pte_table_empty(kvm, ptep) kvm_page_empty(ptep) #define kvm_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp) -#define kvm_pud_table_empty(kvm, pudp) (0) - -#define KVM_PREALLOC_LEVEL 0 +#define kvm_pud_table_empty(kvm, pudp) false -static inline void *kvm_get_hwpgd(struct kvm *kvm) -{ - return kvm->arch.pgd; -} - -static inline unsigned int kvm_get_hwpgd_size(void) -{ - return PTRS_PER_S2_PGD * sizeof(pgd_t); -} +#define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) +#define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp) +#define hyp_pud_table_empty(pudp) false struct kvm; diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h new file mode 100644 index 000000000000..460d616bb2d6 --- /dev/null +++ b/arch/arm/include/asm/stage2_pgtable.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2016 - ARM Ltd + * + * stage2 page table helpers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef __ARM_S2_PGTABLE_H_ +#define __ARM_S2_PGTABLE_H_ + +#define stage2_pgd_none(pgd) pgd_none(pgd) +#define stage2_pgd_clear(pgd) pgd_clear(pgd) +#define stage2_pgd_present(pgd) pgd_present(pgd) +#define stage2_pgd_populate(pgd, pud) pgd_populate(NULL, pgd, pud) +#define stage2_pud_offset(pgd, address) pud_offset(pgd, address) +#define stage2_pud_free(pud) pud_free(NULL, pud) + +#define stage2_pud_none(pud) pud_none(pud) +#define stage2_pud_clear(pud) pud_clear(pud) +#define stage2_pud_present(pud) pud_present(pud) +#define stage2_pud_populate(pud, pmd) pud_populate(NULL, pud, pmd) +#define stage2_pmd_offset(pud, address) pmd_offset(pud, address) +#define stage2_pmd_free(pmd) pmd_free(NULL, pmd) + +#define stage2_pud_huge(pud) pud_huge(pud) + +/* Open coded p*d_addr_end that can deal with 64bit addresses */ +static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t boundary = (addr + PGDIR_SIZE) & PGDIR_MASK; + + return (boundary - 1 < end - 1) ? boundary : end; +} + +#define stage2_pud_addr_end(addr, end) (end) + +static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t boundary = (addr + PMD_SIZE) & PMD_MASK; + + return (boundary - 1 < end - 1) ? boundary : end; +} + +#define stage2_pgd_index(addr) pgd_index(addr) + +#define stage2_pte_table_empty(ptep) kvm_page_empty(ptep) +#define stage2_pmd_table_empty(pmdp) kvm_page_empty(pmdp) +#define stage2_pud_table_empty(pudp) false + +#endif /* __ARM_S2_PGTABLE_H_ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index dded1b763c16..be4b6394a062 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -448,7 +448,7 @@ static void update_vttbr(struct kvm *kvm) kvm_next_vmid &= (1 << kvm_vmid_bits) - 1; /* update vttbr to be used with the new vmid */ - pgd_phys = virt_to_phys(kvm_get_hwpgd(kvm)); + pgd_phys = virt_to_phys(kvm->arch.pgd); BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK); vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits); kvm->arch.vttbr = pgd_phys | vmid; diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 58dbd5c439df..783e5ff0b32e 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -43,11 +43,9 @@ static unsigned long hyp_idmap_start; static unsigned long hyp_idmap_end; static phys_addr_t hyp_idmap_vector; +#define S2_PGD_SIZE (PTRS_PER_S2_PGD * sizeof(pgd_t)) #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) -#define kvm_pmd_huge(_x) (pmd_huge(_x) || pmd_trans_huge(_x)) -#define kvm_pud_huge(_x) pud_huge(_x) - #define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0) #define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1) @@ -69,14 +67,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) { - /* - * This function also gets called when dealing with HYP page - * tables. As HYP doesn't have an associated struct kvm (and - * the HYP page tables are fairly static), we don't do - * anything there. - */ - if (kvm) - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } /* @@ -115,7 +106,7 @@ static bool kvm_is_device_pfn(unsigned long pfn) */ static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) { - if (!kvm_pmd_huge(*pmd)) + if (!pmd_thp_or_huge(*pmd)) return; pmd_clear(pmd); @@ -155,29 +146,29 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) return p; } -static void clear_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr) +static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr) { - pud_t *pud_table __maybe_unused = pud_offset(pgd, 0); - pgd_clear(pgd); + pud_t *pud_table __maybe_unused = stage2_pud_offset(pgd, 0UL); + stage2_pgd_clear(pgd); kvm_tlb_flush_vmid_ipa(kvm, addr); - pud_free(NULL, pud_table); + stage2_pud_free(pud_table); put_page(virt_to_page(pgd)); } -static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) +static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) { - pmd_t *pmd_table = pmd_offset(pud, 0); - VM_BUG_ON(pud_huge(*pud)); - pud_clear(pud); + pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(pud, 0); + VM_BUG_ON(stage2_pud_huge(*pud)); + stage2_pud_clear(pud); kvm_tlb_flush_vmid_ipa(kvm, addr); - pmd_free(NULL, pmd_table); + stage2_pmd_free(pmd_table); put_page(virt_to_page(pud)); } -static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) +static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) { pte_t *pte_table = pte_offset_kernel(pmd, 0); - VM_BUG_ON(kvm_pmd_huge(*pmd)); + VM_BUG_ON(pmd_thp_or_huge(*pmd)); pmd_clear(pmd); kvm_tlb_flush_vmid_ipa(kvm, addr); pte_free_kernel(NULL, pte_table); @@ -204,7 +195,7 @@ static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure * the IO subsystem will never hit in the cache. */ -static void unmap_ptes(struct kvm *kvm, pmd_t *pmd, +static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr, phys_addr_t end) { phys_addr_t start_addr = addr; @@ -226,21 +217,21 @@ static void unmap_ptes(struct kvm *kvm, pmd_t *pmd, } } while (pte++, addr += PAGE_SIZE, addr != end); - if (kvm_pte_table_empty(kvm, start_pte)) - clear_pmd_entry(kvm, pmd, start_addr); + if (stage2_pte_table_empty(start_pte)) + clear_stage2_pmd_entry(kvm, pmd, start_addr); } -static void unmap_pmds(struct kvm *kvm, pud_t *pud, +static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud, phys_addr_t addr, phys_addr_t end) { phys_addr_t next, start_addr = addr; pmd_t *pmd, *start_pmd; - start_pmd = pmd = pmd_offset(pud, addr); + start_pmd = pmd = stage2_pmd_offset(pud, addr); do { - next = kvm_pmd_addr_end(addr, end); + next = stage2_pmd_addr_end(addr, end); if (!pmd_none(*pmd)) { - if (kvm_pmd_huge(*pmd)) { + if (pmd_thp_or_huge(*pmd)) { pmd_t old_pmd = *pmd; pmd_clear(pmd); @@ -250,57 +241,64 @@ static void unmap_pmds(struct kvm *kvm, pud_t *pud, put_page(virt_to_page(pmd)); } else { - unmap_ptes(kvm, pmd, addr, next); + unmap_stage2_ptes(kvm, pmd, addr, next); } } } while (pmd++, addr = next, addr != end); - if (kvm_pmd_table_empty(kvm, start_pmd)) - clear_pud_entry(kvm, pud, start_addr); + if (stage2_pmd_table_empty(start_pmd)) + clear_stage2_pud_entry(kvm, pud, start_addr); } -static void unmap_puds(struct kvm *kvm, pgd_t *pgd, +static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr, phys_addr_t end) { phys_addr_t next, start_addr = addr; pud_t *pud, *start_pud; - start_pud = pud = pud_offset(pgd, addr); + start_pud = pud = stage2_pud_offset(pgd, addr); do { - next = kvm_pud_addr_end(addr, end); - if (!pud_none(*pud)) { - if (pud_huge(*pud)) { + next = stage2_pud_addr_end(addr, end); + if (!stage2_pud_none(*pud)) { + if (stage2_pud_huge(*pud)) { pud_t old_pud = *pud; - pud_clear(pud); + stage2_pud_clear(pud); kvm_tlb_flush_vmid_ipa(kvm, addr); - kvm_flush_dcache_pud(old_pud); - put_page(virt_to_page(pud)); } else { - unmap_pmds(kvm, pud, addr, next); + unmap_stage2_pmds(kvm, pud, addr, next); } } } while (pud++, addr = next, addr != end); - if (kvm_pud_table_empty(kvm, start_pud)) - clear_pgd_entry(kvm, pgd, start_addr); + if (stage2_pud_table_empty(start_pud)) + clear_stage2_pgd_entry(kvm, pgd, start_addr); } - -static void unmap_range(struct kvm *kvm, pgd_t *pgdp, - phys_addr_t start, u64 size) +/** + * unmap_stage2_range -- Clear stage2 page table entries to unmap a range + * @kvm: The VM pointer + * @start: The intermediate physical base address of the range to unmap + * @size: The size of the area to unmap + * + * Clear a range of stage-2 mappings, lowering the various ref-counts. Must + * be called while holding mmu_lock (unless for freeing the stage2 pgd before + * destroying the VM), otherwise another faulting VCPU may come in and mess + * with things behind our backs. + */ +static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) { pgd_t *pgd; phys_addr_t addr = start, end = start + size; phys_addr_t next; - pgd = pgdp + kvm_pgd_index(addr); + pgd = kvm->arch.pgd + stage2_pgd_index(addr); do { - next = kvm_pgd_addr_end(addr, end); - if (!pgd_none(*pgd)) - unmap_puds(kvm, pgd, addr, next); + next = stage2_pgd_addr_end(addr, end); + if (!stage2_pgd_none(*pgd)) + unmap_stage2_puds(kvm, pgd, addr, next); } while (pgd++, addr = next, addr != end); } @@ -322,11 +320,11 @@ static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud, pmd_t *pmd; phys_addr_t next; - pmd = pmd_offset(pud, addr); + pmd = stage2_pmd_offset(pud, addr); do { - next = kvm_pmd_addr_end(addr, end); + next = stage2_pmd_addr_end(addr, end); if (!pmd_none(*pmd)) { - if (kvm_pmd_huge(*pmd)) + if (pmd_thp_or_huge(*pmd)) kvm_flush_dcache_pmd(*pmd); else stage2_flush_ptes(kvm, pmd, addr, next); @@ -340,11 +338,11 @@ static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd, pud_t *pud; phys_addr_t next; - pud = pud_offset(pgd, addr); + pud = stage2_pud_offset(pgd, addr); do { - next = kvm_pud_addr_end(addr, end); - if (!pud_none(*pud)) { - if (pud_huge(*pud)) + next = stage2_pud_addr_end(addr, end); + if (!stage2_pud_none(*pud)) { + if (stage2_pud_huge(*pud)) kvm_flush_dcache_pud(*pud); else stage2_flush_pmds(kvm, pud, addr, next); @@ -360,9 +358,9 @@ static void stage2_flush_memslot(struct kvm *kvm, phys_addr_t next; pgd_t *pgd; - pgd = kvm->arch.pgd + kvm_pgd_index(addr); + pgd = kvm->arch.pgd + stage2_pgd_index(addr); do { - next = kvm_pgd_addr_end(addr, end); + next = stage2_pgd_addr_end(addr, end); stage2_flush_puds(kvm, pgd, addr, next); } while (pgd++, addr = next, addr != end); } @@ -391,6 +389,100 @@ static void stage2_flush_vm(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, idx); } +static void clear_hyp_pgd_entry(pgd_t *pgd) +{ + pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL); + pgd_clear(pgd); + pud_free(NULL, pud_table); + put_page(virt_to_page(pgd)); +} + +static void clear_hyp_pud_entry(pud_t *pud) +{ + pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0); + VM_BUG_ON(pud_huge(*pud)); + pud_clear(pud); + pmd_free(NULL, pmd_table); + put_page(virt_to_page(pud)); +} + +static void clear_hyp_pmd_entry(pmd_t *pmd) +{ + pte_t *pte_table = pte_offset_kernel(pmd, 0); + VM_BUG_ON(pmd_thp_or_huge(*pmd)); + pmd_clear(pmd); + pte_free_kernel(NULL, pte_table); + put_page(virt_to_page(pmd)); +} + +static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ + pte_t *pte, *start_pte; + + start_pte = pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte)) { + kvm_set_pte(pte, __pte(0)); + put_page(virt_to_page(pte)); + } + } while (pte++, addr += PAGE_SIZE, addr != end); + + if (hyp_pte_table_empty(start_pte)) + clear_hyp_pmd_entry(pmd); +} + +static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t next; + pmd_t *pmd, *start_pmd; + + start_pmd = pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + /* Hyp doesn't use huge pmds */ + if (!pmd_none(*pmd)) + unmap_hyp_ptes(pmd, addr, next); + } while (pmd++, addr = next, addr != end); + + if (hyp_pmd_table_empty(start_pmd)) + clear_hyp_pud_entry(pud); +} + +static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t next; + pud_t *pud, *start_pud; + + start_pud = pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + /* Hyp doesn't use huge puds */ + if (!pud_none(*pud)) + unmap_hyp_pmds(pud, addr, next); + } while (pud++, addr = next, addr != end); + + if (hyp_pud_table_empty(start_pud)) + clear_hyp_pgd_entry(pgd); +} + +static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size) +{ + pgd_t *pgd; + phys_addr_t addr = start, end = start + size; + phys_addr_t next; + + /* + * We don't unmap anything from HYP, except at the hyp tear down. + * Hence, we don't have to invalidate the TLBs here. + */ + pgd = pgdp + pgd_index(addr); + do { + next = pgd_addr_end(addr, end); + if (!pgd_none(*pgd)) + unmap_hyp_puds(pgd, addr, next); + } while (pgd++, addr = next, addr != end); +} + /** * free_boot_hyp_pgd - free HYP boot page tables * @@ -401,14 +493,14 @@ void free_boot_hyp_pgd(void) mutex_lock(&kvm_hyp_pgd_mutex); if (boot_hyp_pgd) { - unmap_range(NULL, boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE); - unmap_range(NULL, boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); + unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE); + unmap_hyp_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order); boot_hyp_pgd = NULL; } if (hyp_pgd) - unmap_range(NULL, hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); + unmap_hyp_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); mutex_unlock(&kvm_hyp_pgd_mutex); } @@ -433,9 +525,9 @@ void free_hyp_pgds(void) if (hyp_pgd) { for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE) - unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); + unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE) - unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); + unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); free_pages((unsigned long)hyp_pgd, hyp_pgd_order); hyp_pgd = NULL; @@ -645,20 +737,6 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr) __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE); } -/* Free the HW pgd, one page at a time */ -static void kvm_free_hwpgd(void *hwpgd) -{ - free_pages_exact(hwpgd, kvm_get_hwpgd_size()); -} - -/* Allocate the HW PGD, making sure that each page gets its own refcount */ -static void *kvm_alloc_hwpgd(void) -{ - unsigned int size = kvm_get_hwpgd_size(); - - return alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO); -} - /** * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation. * @kvm: The KVM struct pointer for the VM. @@ -673,81 +751,22 @@ static void *kvm_alloc_hwpgd(void) int kvm_alloc_stage2_pgd(struct kvm *kvm) { pgd_t *pgd; - void *hwpgd; if (kvm->arch.pgd != NULL) { kvm_err("kvm_arch already initialized?\n"); return -EINVAL; } - hwpgd = kvm_alloc_hwpgd(); - if (!hwpgd) + /* Allocate the HW PGD, making sure that each page gets its own refcount */ + pgd = alloc_pages_exact(S2_PGD_SIZE, GFP_KERNEL | __GFP_ZERO); + if (!pgd) return -ENOMEM; - /* When the kernel uses more levels of page tables than the - * guest, we allocate a fake PGD and pre-populate it to point - * to the next-level page table, which will be the real - * initial page table pointed to by the VTTBR. - * - * When KVM_PREALLOC_LEVEL==2, we allocate a single page for - * the PMD and the kernel will use folded pud. - * When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD - * pages. - */ - if (KVM_PREALLOC_LEVEL > 0) { - int i; - - /* - * Allocate fake pgd for the page table manipulation macros to - * work. This is not used by the hardware and we have no - * alignment requirement for this allocation. - */ - pgd = kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t), - GFP_KERNEL | __GFP_ZERO); - - if (!pgd) { - kvm_free_hwpgd(hwpgd); - return -ENOMEM; - } - - /* Plug the HW PGD into the fake one. */ - for (i = 0; i < PTRS_PER_S2_PGD; i++) { - if (KVM_PREALLOC_LEVEL == 1) - pgd_populate(NULL, pgd + i, - (pud_t *)hwpgd + i * PTRS_PER_PUD); - else if (KVM_PREALLOC_LEVEL == 2) - pud_populate(NULL, pud_offset(pgd, 0) + i, - (pmd_t *)hwpgd + i * PTRS_PER_PMD); - } - } else { - /* - * Allocate actual first-level Stage-2 page table used by the - * hardware for Stage-2 page table walks. - */ - pgd = (pgd_t *)hwpgd; - } - kvm_clean_pgd(pgd); kvm->arch.pgd = pgd; return 0; } -/** - * unmap_stage2_range -- Clear stage2 page table entries to unmap a range - * @kvm: The VM pointer - * @start: The intermediate physical base address of the range to unmap - * @size: The size of the area to unmap - * - * Clear a range of stage-2 mappings, lowering the various ref-counts. Must - * be called while holding mmu_lock (unless for freeing the stage2 pgd before - * destroying the VM), otherwise another faulting VCPU may come in and mess - * with things behind our backs. - */ -static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) -{ - unmap_range(kvm, kvm->arch.pgd, start, size); -} - static void stage2_unmap_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { @@ -830,10 +849,8 @@ void kvm_free_stage2_pgd(struct kvm *kvm) return; unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE); - kvm_free_hwpgd(kvm_get_hwpgd(kvm)); - if (KVM_PREALLOC_LEVEL > 0) - kfree(kvm->arch.pgd); - + /* Free the HW pgd, one page at a time */ + free_pages_exact(kvm->arch.pgd, S2_PGD_SIZE); kvm->arch.pgd = NULL; } @@ -843,16 +860,16 @@ static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache pgd_t *pgd; pud_t *pud; - pgd = kvm->arch.pgd + kvm_pgd_index(addr); - if (WARN_ON(pgd_none(*pgd))) { + pgd = kvm->arch.pgd + stage2_pgd_index(addr); + if (WARN_ON(stage2_pgd_none(*pgd))) { if (!cache) return NULL; pud = mmu_memory_cache_alloc(cache); - pgd_populate(NULL, pgd, pud); + stage2_pgd_populate(pgd, pud); get_page(virt_to_page(pgd)); } - return pud_offset(pgd, addr); + return stage2_pud_offset(pgd, addr); } static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, @@ -862,15 +879,15 @@ static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache pmd_t *pmd; pud = stage2_get_pud(kvm, cache, addr); - if (pud_none(*pud)) { + if (stage2_pud_none(*pud)) { if (!cache) return NULL; pmd = mmu_memory_cache_alloc(cache); - pud_populate(NULL, pud, pmd); + stage2_pud_populate(pud, pmd); get_page(virt_to_page(pud)); } - return pmd_offset(pud, addr); + return stage2_pmd_offset(pud, addr); } static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache @@ -893,11 +910,14 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd)); old_pmd = *pmd; - kvm_set_pmd(pmd, *new_pmd); - if (pmd_present(old_pmd)) + if (pmd_present(old_pmd)) { + pmd_clear(pmd); kvm_tlb_flush_vmid_ipa(kvm, addr); - else + } else { get_page(virt_to_page(pmd)); + } + + kvm_set_pmd(pmd, *new_pmd); return 0; } @@ -946,15 +966,38 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, /* Create 2nd stage page table mapping - Level 3 */ old_pte = *pte; - kvm_set_pte(pte, *new_pte); - if (pte_present(old_pte)) + if (pte_present(old_pte)) { + kvm_set_pte(pte, __pte(0)); kvm_tlb_flush_vmid_ipa(kvm, addr); - else + } else { get_page(virt_to_page(pte)); + } + kvm_set_pte(pte, *new_pte); return 0; } +#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +static int stage2_ptep_test_and_clear_young(pte_t *pte) +{ + if (pte_young(*pte)) { + *pte = pte_mkold(*pte); + return 1; + } + return 0; +} +#else +static int stage2_ptep_test_and_clear_young(pte_t *pte) +{ + return __ptep_test_and_clear_young(pte); +} +#endif + +static int stage2_pmdp_test_and_clear_young(pmd_t *pmd) +{ + return stage2_ptep_test_and_clear_young((pte_t *)pmd); +} + /** * kvm_phys_addr_ioremap - map a device range to guest IPA * @@ -978,7 +1021,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE); if (writable) - kvm_set_s2pte_writable(&pte); + pte = kvm_s2pte_mkwrite(pte); ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES, KVM_NR_MEM_OBJS); @@ -1078,12 +1121,12 @@ static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) pmd_t *pmd; phys_addr_t next; - pmd = pmd_offset(pud, addr); + pmd = stage2_pmd_offset(pud, addr); do { - next = kvm_pmd_addr_end(addr, end); + next = stage2_pmd_addr_end(addr, end); if (!pmd_none(*pmd)) { - if (kvm_pmd_huge(*pmd)) { + if (pmd_thp_or_huge(*pmd)) { if (!kvm_s2pmd_readonly(pmd)) kvm_set_s2pmd_readonly(pmd); } else { @@ -1106,12 +1149,12 @@ static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) pud_t *pud; phys_addr_t next; - pud = pud_offset(pgd, addr); + pud = stage2_pud_offset(pgd, addr); do { - next = kvm_pud_addr_end(addr, end); - if (!pud_none(*pud)) { + next = stage2_pud_addr_end(addr, end); + if (!stage2_pud_none(*pud)) { /* TODO:PUD not supported, revisit later if supported */ - BUG_ON(kvm_pud_huge(*pud)); + BUG_ON(stage2_pud_huge(*pud)); stage2_wp_pmds(pud, addr, next); } } while (pud++, addr = next, addr != end); @@ -1128,7 +1171,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) pgd_t *pgd; phys_addr_t next; - pgd = kvm->arch.pgd + kvm_pgd_index(addr); + pgd = kvm->arch.pgd + stage2_pgd_index(addr); do { /* * Release kvm_mmu_lock periodically if the memory region is @@ -1140,8 +1183,8 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) if (need_resched() || spin_needbreak(&kvm->mmu_lock)) cond_resched_lock(&kvm->mmu_lock); - next = kvm_pgd_addr_end(addr, end); - if (pgd_present(*pgd)) + next = stage2_pgd_addr_end(addr, end); + if (stage2_pgd_present(*pgd)) stage2_wp_puds(pgd, addr, next); } while (pgd++, addr = next, addr != end); } @@ -1320,7 +1363,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, pmd_t new_pmd = pfn_pmd(pfn, mem_type); new_pmd = pmd_mkhuge(new_pmd); if (writable) { - kvm_set_s2pmd_writable(&new_pmd); + new_pmd = kvm_s2pmd_mkwrite(new_pmd); kvm_set_pfn_dirty(pfn); } coherent_cache_guest_page(vcpu, pfn, PMD_SIZE, fault_ipa_uncached); @@ -1329,7 +1372,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, pte_t new_pte = pfn_pte(pfn, mem_type); if (writable) { - kvm_set_s2pte_writable(&new_pte); + new_pte = kvm_s2pte_mkwrite(new_pte); kvm_set_pfn_dirty(pfn); mark_page_dirty(kvm, gfn); } @@ -1348,6 +1391,8 @@ out_unlock: * Resolve the access fault by making the page young again. * Note that because the faulting entry is guaranteed not to be * cached in the TLB, we don't need to invalidate anything. + * Only the HW Access Flag updates are supported for Stage 2 (no DBM), + * so there is no need for atomic (pte|pmd)_mkyoung operations. */ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) { @@ -1364,7 +1409,7 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) if (!pmd || pmd_none(*pmd)) /* Nothing there */ goto out; - if (kvm_pmd_huge(*pmd)) { /* THP, HugeTLB */ + if (pmd_thp_or_huge(*pmd)) { /* THP, HugeTLB */ *pmd = pmd_mkyoung(*pmd); pfn = pmd_pfn(*pmd); pfn_valid = true; @@ -1588,25 +1633,14 @@ static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) if (!pmd || pmd_none(*pmd)) /* Nothing there */ return 0; - if (kvm_pmd_huge(*pmd)) { /* THP, HugeTLB */ - if (pmd_young(*pmd)) { - *pmd = pmd_mkold(*pmd); - return 1; - } - - return 0; - } + if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */ + return stage2_pmdp_test_and_clear_young(pmd); pte = pte_offset_kernel(pmd, gpa); if (pte_none(*pte)) return 0; - if (pte_young(*pte)) { - *pte = pte_mkold(*pte); /* Just a page... */ - return 1; - } - - return 0; + return stage2_ptep_test_and_clear_young(pte); } static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) @@ -1618,7 +1652,7 @@ static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) if (!pmd || pmd_none(*pmd)) /* Nothing there */ return 0; - if (kvm_pmd_huge(*pmd)) /* THP, HugeTLB */ + if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */ return pmd_young(*pmd); pte = pte_offset_kernel(pmd, gpa); diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 3f29887995bc..ffde15fed3e1 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -96,32 +96,37 @@ SCTLR_EL2_SA | SCTLR_EL2_I) /* TCR_EL2 Registers bits */ -#define TCR_EL2_RES1 ((1 << 31) | (1 << 23)) -#define TCR_EL2_TBI (1 << 20) -#define TCR_EL2_PS (7 << 16) -#define TCR_EL2_PS_40B (2 << 16) -#define TCR_EL2_TG0 (1 << 14) -#define TCR_EL2_SH0 (3 << 12) -#define TCR_EL2_ORGN0 (3 << 10) -#define TCR_EL2_IRGN0 (3 << 8) -#define TCR_EL2_T0SZ 0x3f -#define TCR_EL2_MASK (TCR_EL2_TG0 | TCR_EL2_SH0 | \ - TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ) +#define TCR_EL2_RES1 ((1 << 31) | (1 << 23)) +#define TCR_EL2_TBI (1 << 20) +#define TCR_EL2_PS_SHIFT 16 +#define TCR_EL2_PS_MASK (7 << TCR_EL2_PS_SHIFT) +#define TCR_EL2_PS_40B (2 << TCR_EL2_PS_SHIFT) +#define TCR_EL2_TG0_MASK TCR_TG0_MASK +#define TCR_EL2_SH0_MASK TCR_SH0_MASK +#define TCR_EL2_ORGN0_MASK TCR_ORGN0_MASK +#define TCR_EL2_IRGN0_MASK TCR_IRGN0_MASK +#define TCR_EL2_T0SZ_MASK 0x3f +#define TCR_EL2_MASK (TCR_EL2_TG0_MASK | TCR_EL2_SH0_MASK | \ + TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK | TCR_EL2_T0SZ_MASK) /* VTCR_EL2 Registers bits */ #define VTCR_EL2_RES1 (1 << 31) -#define VTCR_EL2_PS_MASK (7 << 16) -#define VTCR_EL2_TG0_MASK (1 << 14) -#define VTCR_EL2_TG0_4K (0 << 14) -#define VTCR_EL2_TG0_64K (1 << 14) -#define VTCR_EL2_SH0_MASK (3 << 12) -#define VTCR_EL2_SH0_INNER (3 << 12) -#define VTCR_EL2_ORGN0_MASK (3 << 10) -#define VTCR_EL2_ORGN0_WBWA (1 << 10) -#define VTCR_EL2_IRGN0_MASK (3 << 8) -#define VTCR_EL2_IRGN0_WBWA (1 << 8) -#define VTCR_EL2_SL0_MASK (3 << 6) -#define VTCR_EL2_SL0_LVL1 (1 << 6) +#define VTCR_EL2_HD (1 << 22) +#define VTCR_EL2_HA (1 << 21) +#define VTCR_EL2_PS_MASK TCR_EL2_PS_MASK +#define VTCR_EL2_TG0_MASK TCR_TG0_MASK +#define VTCR_EL2_TG0_4K TCR_TG0_4K +#define VTCR_EL2_TG0_16K TCR_TG0_16K +#define VTCR_EL2_TG0_64K TCR_TG0_64K +#define VTCR_EL2_SH0_MASK TCR_SH0_MASK +#define VTCR_EL2_SH0_INNER TCR_SH0_INNER +#define VTCR_EL2_ORGN0_MASK TCR_ORGN0_MASK +#define VTCR_EL2_ORGN0_WBWA TCR_ORGN0_WBWA +#define VTCR_EL2_IRGN0_MASK TCR_IRGN0_MASK +#define VTCR_EL2_IRGN0_WBWA TCR_IRGN0_WBWA +#define VTCR_EL2_SL0_SHIFT 6 +#define VTCR_EL2_SL0_MASK (3 << VTCR_EL2_SL0_SHIFT) +#define VTCR_EL2_SL0_LVL1 (1 << VTCR_EL2_SL0_SHIFT) #define VTCR_EL2_T0SZ_MASK 0x3f #define VTCR_EL2_T0SZ_40B 24 #define VTCR_EL2_VS_SHIFT 19 @@ -137,35 +142,45 @@ * (see hyp-init.S). * * Note that when using 4K pages, we concatenate two first level page tables - * together. + * together. With 16K pages, we concatenate 16 first level page tables. * * The magic numbers used for VTTBR_X in this patch can be found in Tables * D4-23 and D4-25 in ARM DDI 0487A.b. */ + +#define VTCR_EL2_T0SZ_IPA VTCR_EL2_T0SZ_40B +#define VTCR_EL2_COMMON_BITS (VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \ + VTCR_EL2_IRGN0_WBWA | VTCR_EL2_RES1) + #ifdef CONFIG_ARM64_64K_PAGES /* * Stage2 translation configuration: - * 40bits input (T0SZ = 24) * 64kB pages (TG0 = 1) * 2 level page tables (SL = 1) */ -#define VTCR_EL2_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \ - VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \ - VTCR_EL2_SL0_LVL1 | VTCR_EL2_RES1) -#define VTTBR_X (38 - VTCR_EL2_T0SZ_40B) -#else +#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SL0_LVL1) +#define VTTBR_X_TGRAN_MAGIC 38 +#elif defined(CONFIG_ARM64_16K_PAGES) +/* + * Stage2 translation configuration: + * 16kB pages (TG0 = 2) + * 2 level page tables (SL = 1) + */ +#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_16K | VTCR_EL2_SL0_LVL1) +#define VTTBR_X_TGRAN_MAGIC 42 +#else /* 4K */ /* * Stage2 translation configuration: - * 40bits input (T0SZ = 24) * 4kB pages (TG0 = 0) * 3 level page tables (SL = 1) */ -#define VTCR_EL2_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \ - VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \ - VTCR_EL2_SL0_LVL1 | VTCR_EL2_RES1) -#define VTTBR_X (37 - VTCR_EL2_T0SZ_40B) +#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SL0_LVL1) +#define VTTBR_X_TGRAN_MAGIC 37 #endif +#define VTCR_EL2_FLAGS (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN_FLAGS) +#define VTTBR_X (VTTBR_X_TGRAN_MAGIC - VTCR_EL2_T0SZ_IPA) + #define VTTBR_BADDR_SHIFT (VTTBR_X - 1) #define VTTBR_BADDR_MASK (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT) #define VTTBR_VMID_SHIFT (UL(48)) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 22732a5e3119..844fe5d5ff44 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -45,18 +45,6 @@ */ #define TRAMPOLINE_VA (HYP_PAGE_OFFSET_MASK & PAGE_MASK) -/* - * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation - * levels in addition to the PGD and potentially the PUD which are - * pre-allocated (we pre-allocate the fake PGD and the PUD when the Stage-2 - * tables use one level of tables less than the kernel. - */ -#ifdef CONFIG_ARM64_64K_PAGES -#define KVM_MMU_CACHE_MIN_PAGES 1 -#else -#define KVM_MMU_CACHE_MIN_PAGES 2 -#endif - #ifdef __ASSEMBLY__ #include <asm/alternative.h> @@ -91,6 +79,8 @@ alternative_endif #define KVM_PHYS_SIZE (1UL << KVM_PHYS_SHIFT) #define KVM_PHYS_MASK (KVM_PHYS_SIZE - 1UL) +#include <asm/stage2_pgtable.h> + int create_hyp_mappings(void *from, void *to); int create_hyp_io_mappings(void *from, void *to, phys_addr_t); void free_boot_hyp_pgd(void); @@ -121,19 +111,32 @@ static inline void kvm_clean_pmd_entry(pmd_t *pmd) {} static inline void kvm_clean_pte(pte_t *pte) {} static inline void kvm_clean_pte_entry(pte_t *pte) {} -static inline void kvm_set_s2pte_writable(pte_t *pte) +static inline pte_t kvm_s2pte_mkwrite(pte_t pte) { - pte_val(*pte) |= PTE_S2_RDWR; + pte_val(pte) |= PTE_S2_RDWR; + return pte; } -static inline void kvm_set_s2pmd_writable(pmd_t *pmd) +static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd) { - pmd_val(*pmd) |= PMD_S2_RDWR; + pmd_val(pmd) |= PMD_S2_RDWR; + return pmd; } static inline void kvm_set_s2pte_readonly(pte_t *pte) { - pte_val(*pte) = (pte_val(*pte) & ~PTE_S2_RDWR) | PTE_S2_RDONLY; + pteval_t pteval; + unsigned long tmp; + + asm volatile("// kvm_set_s2pte_readonly\n" + " prfm pstl1strm, %2\n" + "1: ldxr %0, %2\n" + " and %0, %0, %3 // clear PTE_S2_RDWR\n" + " orr %0, %0, %4 // set PTE_S2_RDONLY\n" + " stxr %w1, %0, %2\n" + " cbnz %w1, 1b\n" + : "=&r" (pteval), "=&r" (tmp), "+Q" (pte_val(*pte)) + : "L" (~PTE_S2_RDWR), "L" (PTE_S2_RDONLY)); } static inline bool kvm_s2pte_readonly(pte_t *pte) @@ -143,69 +146,12 @@ static inline bool kvm_s2pte_readonly(pte_t *pte) static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) { - pmd_val(*pmd) = (pmd_val(*pmd) & ~PMD_S2_RDWR) | PMD_S2_RDONLY; + kvm_set_s2pte_readonly((pte_t *)pmd); } static inline bool kvm_s2pmd_readonly(pmd_t *pmd) { - return (pmd_val(*pmd) & PMD_S2_RDWR) == PMD_S2_RDONLY; -} - - -#define kvm_pgd_addr_end(addr, end) pgd_addr_end(addr, end) -#define kvm_pud_addr_end(addr, end) pud_addr_end(addr, end) -#define kvm_pmd_addr_end(addr, end) pmd_addr_end(addr, end) - -/* - * In the case where PGDIR_SHIFT is larger than KVM_PHYS_SHIFT, we can address - * the entire IPA input range with a single pgd entry, and we would only need - * one pgd entry. Note that in this case, the pgd is actually not used by - * the MMU for Stage-2 translations, but is merely a fake pgd used as a data - * structure for the kernel pgtable macros to work. - */ -#if PGDIR_SHIFT > KVM_PHYS_SHIFT -#define PTRS_PER_S2_PGD_SHIFT 0 -#else -#define PTRS_PER_S2_PGD_SHIFT (KVM_PHYS_SHIFT - PGDIR_SHIFT) -#endif -#define PTRS_PER_S2_PGD (1 << PTRS_PER_S2_PGD_SHIFT) - -#define kvm_pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1)) - -/* - * If we are concatenating first level stage-2 page tables, we would have less - * than or equal to 16 pointers in the fake PGD, because that's what the - * architecture allows. In this case, (4 - CONFIG_PGTABLE_LEVELS) - * represents the first level for the host, and we add 1 to go to the next - * level (which uses contatenation) for the stage-2 tables. - */ -#if PTRS_PER_S2_PGD <= 16 -#define KVM_PREALLOC_LEVEL (4 - CONFIG_PGTABLE_LEVELS + 1) -#else -#define KVM_PREALLOC_LEVEL (0) -#endif - -static inline void *kvm_get_hwpgd(struct kvm *kvm) -{ - pgd_t *pgd = kvm->arch.pgd; - pud_t *pud; - - if (KVM_PREALLOC_LEVEL == 0) - return pgd; - - pud = pud_offset(pgd, 0); - if (KVM_PREALLOC_LEVEL == 1) - return pud; - - BUG_ON(KVM_PREALLOC_LEVEL != 2); - return pmd_offset(pud, 0); -} - -static inline unsigned int kvm_get_hwpgd_size(void) -{ - if (KVM_PREALLOC_LEVEL > 0) - return PTRS_PER_S2_PGD * PAGE_SIZE; - return PTRS_PER_S2_PGD * sizeof(pgd_t); + return kvm_s2pte_readonly((pte_t *)pmd); } static inline bool kvm_page_empty(void *ptr) @@ -214,23 +160,20 @@ static inline bool kvm_page_empty(void *ptr) return page_count(ptr_page) == 1; } -#define kvm_pte_table_empty(kvm, ptep) kvm_page_empty(ptep) +#define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) #ifdef __PAGETABLE_PMD_FOLDED -#define kvm_pmd_table_empty(kvm, pmdp) (0) +#define hyp_pmd_table_empty(pmdp) (0) #else -#define kvm_pmd_table_empty(kvm, pmdp) \ - (kvm_page_empty(pmdp) && (!(kvm) || KVM_PREALLOC_LEVEL < 2)) +#define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp) #endif #ifdef __PAGETABLE_PUD_FOLDED -#define kvm_pud_table_empty(kvm, pudp) (0) +#define hyp_pud_table_empty(pudp) (0) #else -#define kvm_pud_table_empty(kvm, pudp) \ - (kvm_page_empty(pudp) && (!(kvm) || KVM_PREALLOC_LEVEL < 1)) +#define hyp_pud_table_empty(pudp) kvm_page_empty(pudp) #endif - struct kvm; #define kvm_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l)) diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 5c25b831273d..936f1732727c 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -208,23 +208,69 @@ #define TCR_T1SZ(x) ((UL(64) - (x)) << TCR_T1SZ_OFFSET) #define TCR_TxSZ(x) (TCR_T0SZ(x) | TCR_T1SZ(x)) #define TCR_TxSZ_WIDTH 6 -#define TCR_IRGN_NC ((UL(0) << 8) | (UL(0) << 24)) -#define TCR_IRGN_WBWA ((UL(1) << 8) | (UL(1) << 24)) -#define TCR_IRGN_WT ((UL(2) << 8) | (UL(2) << 24)) -#define TCR_IRGN_WBnWA ((UL(3) << 8) | (UL(3) << 24)) -#define TCR_IRGN_MASK ((UL(3) << 8) | (UL(3) << 24)) -#define TCR_ORGN_NC ((UL(0) << 10) | (UL(0) << 26)) -#define TCR_ORGN_WBWA ((UL(1) << 10) | (UL(1) << 26)) -#define TCR_ORGN_WT ((UL(2) << 10) | (UL(2) << 26)) -#define TCR_ORGN_WBnWA ((UL(3) << 10) | (UL(3) << 26)) -#define TCR_ORGN_MASK ((UL(3) << 10) | (UL(3) << 26)) -#define TCR_SHARED ((UL(3) << 12) | (UL(3) << 28)) -#define TCR_TG0_4K (UL(0) << 14) -#define TCR_TG0_64K (UL(1) << 14) -#define TCR_TG0_16K (UL(2) << 14) -#define TCR_TG1_16K (UL(1) << 30) -#define TCR_TG1_4K (UL(2) << 30) -#define TCR_TG1_64K (UL(3) << 30) + +#define TCR_IRGN0_SHIFT 8 +#define TCR_IRGN0_MASK (UL(3) << TCR_IRGN0_SHIFT) +#define TCR_IRGN0_NC (UL(0) << TCR_IRGN0_SHIFT) +#define TCR_IRGN0_WBWA (UL(1) << TCR_IRGN0_SHIFT) +#define TCR_IRGN0_WT (UL(2) << TCR_IRGN0_SHIFT) +#define TCR_IRGN0_WBnWA (UL(3) << TCR_IRGN0_SHIFT) + +#define TCR_IRGN1_SHIFT 24 +#define TCR_IRGN1_MASK (UL(3) << TCR_IRGN1_SHIFT) +#define TCR_IRGN1_NC (UL(0) << TCR_IRGN1_SHIFT) +#define TCR_IRGN1_WBWA (UL(1) << TCR_IRGN1_SHIFT) +#define TCR_IRGN1_WT (UL(2) << TCR_IRGN1_SHIFT) +#define TCR_IRGN1_WBnWA (UL(3) << TCR_IRGN1_SHIFT) + +#define TCR_IRGN_NC (TCR_IRGN0_NC | TCR_IRGN1_NC) +#define TCR_IRGN_WBWA (TCR_IRGN0_WBWA | TCR_IRGN1_WBWA) +#define TCR_IRGN_WT (TCR_IRGN0_WT | TCR_IRGN1_WT) +#define TCR_IRGN_WBnWA (TCR_IRGN0_WBnWA | TCR_IRGN1_WBnWA) +#define TCR_IRGN_MASK (TCR_IRGN0_MASK | TCR_IRGN1_MASK) + + +#define TCR_ORGN0_SHIFT 10 +#define TCR_ORGN0_MASK (UL(3) << TCR_ORGN0_SHIFT) +#define TCR_ORGN0_NC (UL(0) << TCR_ORGN0_SHIFT) +#define TCR_ORGN0_WBWA (UL(1) << TCR_ORGN0_SHIFT) +#define TCR_ORGN0_WT (UL(2) << TCR_ORGN0_SHIFT) +#define TCR_ORGN0_WBnWA (UL(3) << TCR_ORGN0_SHIFT) + +#define TCR_ORGN1_SHIFT 26 +#define TCR_ORGN1_MASK (UL(3) << TCR_ORGN1_SHIFT) +#define TCR_ORGN1_NC (UL(0) << TCR_ORGN1_SHIFT) +#define TCR_ORGN1_WBWA (UL(1) << TCR_ORGN1_SHIFT) +#define TCR_ORGN1_WT (UL(2) << TCR_ORGN1_SHIFT) +#define TCR_ORGN1_WBnWA (UL(3) << TCR_ORGN1_SHIFT) + +#define TCR_ORGN_NC (TCR_ORGN0_NC | TCR_ORGN1_NC) +#define TCR_ORGN_WBWA (TCR_ORGN0_WBWA | TCR_ORGN1_WBWA) +#define TCR_ORGN_WT (TCR_ORGN0_WT | TCR_ORGN1_WT) +#define TCR_ORGN_WBnWA (TCR_ORGN0_WBnWA | TCR_ORGN1_WBnWA) +#define TCR_ORGN_MASK (TCR_ORGN0_MASK | TCR_ORGN1_MASK) + +#define TCR_SH0_SHIFT 12 +#define TCR_SH0_MASK (UL(3) << TCR_SH0_SHIFT) +#define TCR_SH0_INNER (UL(3) << TCR_SH0_SHIFT) + +#define TCR_SH1_SHIFT 28 +#define TCR_SH1_MASK (UL(3) << TCR_SH1_SHIFT) +#define TCR_SH1_INNER (UL(3) << TCR_SH1_SHIFT) +#define TCR_SHARED (TCR_SH0_INNER | TCR_SH1_INNER) + +#define TCR_TG0_SHIFT 14 +#define TCR_TG0_MASK (UL(3) << TCR_TG0_SHIFT) +#define TCR_TG0_4K (UL(0) << TCR_TG0_SHIFT) +#define TCR_TG0_64K (UL(1) << TCR_TG0_SHIFT) +#define TCR_TG0_16K (UL(2) << TCR_TG0_SHIFT) + +#define TCR_TG1_SHIFT 30 +#define TCR_TG1_MASK (UL(3) << TCR_TG1_SHIFT) +#define TCR_TG1_16K (UL(1) << TCR_TG1_SHIFT) +#define TCR_TG1_4K (UL(2) << TCR_TG1_SHIFT) +#define TCR_TG1_64K (UL(3) << TCR_TG1_SHIFT) + #define TCR_ASID16 (UL(1) << 36) #define TCR_TBI0 (UL(1) << 37) #define TCR_HA (UL(1) << 39) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 989fef16d461..f1d5afdb12db 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -290,6 +290,8 @@ static inline pgprot_t mk_sect_prot(pgprot_t prot) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) #define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK)) +#define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd)) + #define __HAVE_ARCH_PMD_WRITE #define pmd_write(pmd) pte_write(pmd_pte(pmd)) @@ -530,14 +532,12 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) * Atomic pte/pmd modifications. */ #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pte_t *ptep) +static inline int __ptep_test_and_clear_young(pte_t *ptep) { pteval_t pteval; unsigned int tmp, res; - asm volatile("// ptep_test_and_clear_young\n" + asm volatile("// __ptep_test_and_clear_young\n" " prfm pstl1strm, %2\n" "1: ldxr %0, %2\n" " ubfx %w3, %w0, %5, #1 // extract PTE_AF (young)\n" @@ -550,6 +550,13 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, return res; } +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep) +{ + return __ptep_test_and_clear_young(ptep); +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, diff --git a/arch/arm64/include/asm/stage2_pgtable-nopmd.h b/arch/arm64/include/asm/stage2_pgtable-nopmd.h new file mode 100644 index 000000000000..2656a0fd05a6 --- /dev/null +++ b/arch/arm64/include/asm/stage2_pgtable-nopmd.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2016 - ARM Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef __ARM64_S2_PGTABLE_NOPMD_H_ +#define __ARM64_S2_PGTABLE_NOPMD_H_ + +#include <asm/stage2_pgtable-nopud.h> + +#define __S2_PGTABLE_PMD_FOLDED + +#define S2_PMD_SHIFT S2_PUD_SHIFT +#define S2_PTRS_PER_PMD 1 +#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT) +#define S2_PMD_MASK (~(S2_PMD_SIZE-1)) + +#define stage2_pud_none(pud) (0) +#define stage2_pud_present(pud) (1) +#define stage2_pud_clear(pud) do { } while (0) +#define stage2_pud_populate(pud, pmd) do { } while (0) +#define stage2_pmd_offset(pud, address) ((pmd_t *)(pud)) + +#define stage2_pmd_free(pmd) do { } while (0) + +#define stage2_pmd_addr_end(addr, end) (end) + +#define stage2_pud_huge(pud) (0) +#define stage2_pmd_table_empty(pmdp) (0) + +#endif diff --git a/arch/arm64/include/asm/stage2_pgtable-nopud.h b/arch/arm64/include/asm/stage2_pgtable-nopud.h new file mode 100644 index 000000000000..5ee87b54ebf3 --- /dev/null +++ b/arch/arm64/include/asm/stage2_pgtable-nopud.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2016 - ARM Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef __ARM64_S2_PGTABLE_NOPUD_H_ +#define __ARM64_S2_PGTABLE_NOPUD_H_ + +#define __S2_PGTABLE_PUD_FOLDED + +#define S2_PUD_SHIFT S2_PGDIR_SHIFT +#define S2_PTRS_PER_PUD 1 +#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT) +#define S2_PUD_MASK (~(S2_PUD_SIZE-1)) + +#define stage2_pgd_none(pgd) (0) +#define stage2_pgd_present(pgd) (1) +#define stage2_pgd_clear(pgd) do { } while (0) +#define stage2_pgd_populate(pgd, pud) do { } while (0) + +#define stage2_pud_offset(pgd, address) ((pud_t *)(pgd)) + +#define stage2_pud_free(x) do { } while (0) + +#define stage2_pud_addr_end(addr, end) (end) +#define stage2_pud_table_empty(pmdp) (0) + +#endif diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h new file mode 100644 index 000000000000..8b68099348e5 --- /dev/null +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -0,0 +1,142 @@ +/* + * Copyright (C) 2016 - ARM Ltd + * + * stage2 page table helpers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef __ARM64_S2_PGTABLE_H_ +#define __ARM64_S2_PGTABLE_H_ + +#include <asm/pgtable.h> + +/* + * The hardware supports concatenation of up to 16 tables at stage2 entry level + * and we use the feature whenever possible. + * + * Now, the minimum number of bits resolved at any level is (PAGE_SHIFT - 3). + * On arm64, the smallest PAGE_SIZE supported is 4k, which means + * (PAGE_SHIFT - 3) > 4 holds for all page sizes. + * This implies, the total number of page table levels at stage2 expected + * by the hardware is actually the number of levels required for (KVM_PHYS_SHIFT - 4) + * in normal translations(e.g, stage1), since we cannot have another level in + * the range (KVM_PHYS_SHIFT, KVM_PHYS_SHIFT - 4). + */ +#define STAGE2_PGTABLE_LEVELS ARM64_HW_PGTABLE_LEVELS(KVM_PHYS_SHIFT - 4) + +/* + * With all the supported VA_BITs and 40bit guest IPA, the following condition + * is always true: + * + * STAGE2_PGTABLE_LEVELS <= CONFIG_PGTABLE_LEVELS + * + * We base our stage-2 page table walker helpers on this assumption and + * fall back to using the host version of the helper wherever possible. + * i.e, if a particular level is not folded (e.g, PUD) at stage2, we fall back + * to using the host version, since it is guaranteed it is not folded at host. + * + * If the condition breaks in the future, we can rearrange the host level + * definitions and reuse them for stage2. Till then... + */ +#if STAGE2_PGTABLE_LEVELS > CONFIG_PGTABLE_LEVELS +#error "Unsupported combination of guest IPA and host VA_BITS." +#endif + +/* S2_PGDIR_SHIFT is the size mapped by top-level stage2 entry */ +#define S2_PGDIR_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - STAGE2_PGTABLE_LEVELS) +#define S2_PGDIR_SIZE (_AC(1, UL) << S2_PGDIR_SHIFT) +#define S2_PGDIR_MASK (~(S2_PGDIR_SIZE - 1)) + +/* + * The number of PTRS across all concatenated stage2 tables given by the + * number of bits resolved at the initial level. + */ +#define PTRS_PER_S2_PGD (1 << (KVM_PHYS_SHIFT - S2_PGDIR_SHIFT)) + +/* + * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation + * levels in addition to the PGD. + */ +#define KVM_MMU_CACHE_MIN_PAGES (STAGE2_PGTABLE_LEVELS - 1) + + +#if STAGE2_PGTABLE_LEVELS > 3 + +#define S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1) +#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT) +#define S2_PUD_MASK (~(S2_PUD_SIZE - 1)) + +#define stage2_pgd_none(pgd) pgd_none(pgd) +#define stage2_pgd_clear(pgd) pgd_clear(pgd) +#define stage2_pgd_present(pgd) pgd_present(pgd) +#define stage2_pgd_populate(pgd, pud) pgd_populate(NULL, pgd, pud) +#define stage2_pud_offset(pgd, address) pud_offset(pgd, address) +#define stage2_pud_free(pud) pud_free(NULL, pud) + +#define stage2_pud_table_empty(pudp) kvm_page_empty(pudp) + +static inline phys_addr_t stage2_pud_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK; + + return (boundary - 1 < end - 1) ? boundary : end; +} + +#endif /* STAGE2_PGTABLE_LEVELS > 3 */ + + +#if STAGE2_PGTABLE_LEVELS > 2 + +#define S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2) +#define S2_PMD_SIZE (_AC(1, UL) << S2_PMD_SHIFT) +#define S2_PMD_MASK (~(S2_PMD_SIZE - 1)) + +#define stage2_pud_none(pud) pud_none(pud) +#define stage2_pud_clear(pud) pud_clear(pud) +#define stage2_pud_present(pud) pud_present(pud) +#define stage2_pud_populate(pud, pmd) pud_populate(NULL, pud, pmd) +#define stage2_pmd_offset(pud, address) pmd_offset(pud, address) +#define stage2_pmd_free(pmd) pmd_free(NULL, pmd) + +#define stage2_pud_huge(pud) pud_huge(pud) +#define stage2_pmd_table_empty(pmdp) kvm_page_empty(pmdp) + +static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK; + + return (boundary - 1 < end - 1) ? boundary : end; +} + +#endif /* STAGE2_PGTABLE_LEVELS > 2 */ + +#define stage2_pte_table_empty(ptep) kvm_page_empty(ptep) + +#if STAGE2_PGTABLE_LEVELS == 2 +#include <asm/stage2_pgtable-nopmd.h> +#elif STAGE2_PGTABLE_LEVELS == 3 +#include <asm/stage2_pgtable-nopud.h> +#endif + + +#define stage2_pgd_index(addr) (((addr) >> S2_PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1)) + +static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t boundary = (addr + S2_PGDIR_SIZE) & S2_PGDIR_MASK; + + return (boundary - 1 < end - 1) ? boundary : end; +} + +#endif /* __ARM64_S2_PGTABLE_H_ */ diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index de7450df7629..aa2e34e99582 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -22,7 +22,6 @@ config KVM_ARM_VGIC_V3 config KVM bool "Kernel-based Virtual Machine (KVM) support" depends on OF - depends on !ARM64_16K_PAGES select MMU_NOTIFIER select PREEMPT_NOTIFIERS select ANON_INODES diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c index bcbe761a5a3d..b81f4091c909 100644 --- a/arch/arm64/kvm/hyp/s2-setup.c +++ b/arch/arm64/kvm/hyp/s2-setup.c @@ -66,6 +66,14 @@ u32 __hyp_text __init_stage2_translation(void) val |= 64 - (parange > 40 ? 40 : parange); /* + * Check the availability of Hardware Access Flag / Dirty Bit + * Management in ID_AA64MMFR1_EL1 and enable the feature in VTCR_EL2. + */ + tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_HADBS_SHIFT) & 0xf; + if (IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && tmp) + val |= VTCR_EL2_HA; + + /* * Read the VMIDBits bits from ID_AA64MMFR1_EL1 and set the VS * bit in VTCR_EL2. */ diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index f6b12790716c..942b8f6bf35b 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -747,7 +747,7 @@ extern enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu, uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu); void kvm_mips_write_count(struct kvm_vcpu *vcpu, uint32_t count); -void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare); +void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack); void kvm_mips_init_count(struct kvm_vcpu *vcpu); int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl); int kvm_mips_set_count_resume(struct kvm_vcpu *vcpu, s64 count_resume); diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index b37954cc880d..b8b7860ec1a8 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -302,12 +302,31 @@ static inline ktime_t kvm_mips_count_time(struct kvm_vcpu *vcpu) */ static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now) { - ktime_t expires; + struct mips_coproc *cop0 = vcpu->arch.cop0; + ktime_t expires, threshold; + uint32_t count, compare; int running; - /* Is the hrtimer pending? */ + /* Calculate the biased and scaled guest CP0_Count */ + count = vcpu->arch.count_bias + kvm_mips_ktime_to_count(vcpu, now); + compare = kvm_read_c0_guest_compare(cop0); + + /* + * Find whether CP0_Count has reached the closest timer interrupt. If + * not, we shouldn't inject it. + */ + if ((int32_t)(count - compare) < 0) + return count; + + /* + * The CP0_Count we're going to return has already reached the closest + * timer interrupt. Quickly check if it really is a new interrupt by + * looking at whether the interval until the hrtimer expiry time is + * less than 1/4 of the timer period. + */ expires = hrtimer_get_expires(&vcpu->arch.comparecount_timer); - if (ktime_compare(now, expires) >= 0) { + threshold = ktime_add_ns(now, vcpu->arch.count_period / 4); + if (ktime_before(expires, threshold)) { /* * Cancel it while we handle it so there's no chance of * interference with the timeout handler. @@ -329,8 +348,7 @@ static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now) } } - /* Return the biased and scaled guest CP0_Count */ - return vcpu->arch.count_bias + kvm_mips_ktime_to_count(vcpu, now); + return count; } /** @@ -420,32 +438,6 @@ static void kvm_mips_resume_hrtimer(struct kvm_vcpu *vcpu, } /** - * kvm_mips_update_hrtimer() - Update next expiry time of hrtimer. - * @vcpu: Virtual CPU. - * - * Recalculates and updates the expiry time of the hrtimer. This can be used - * after timer parameters have been altered which do not depend on the time that - * the change occurs (in those cases kvm_mips_freeze_hrtimer() and - * kvm_mips_resume_hrtimer() are used directly). - * - * It is guaranteed that no timer interrupts will be lost in the process. - * - * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running). - */ -static void kvm_mips_update_hrtimer(struct kvm_vcpu *vcpu) -{ - ktime_t now; - uint32_t count; - - /* - * freeze_hrtimer takes care of a timer interrupts <= count, and - * resume_hrtimer the hrtimer takes care of a timer interrupts > count. - */ - now = kvm_mips_freeze_hrtimer(vcpu, &count); - kvm_mips_resume_hrtimer(vcpu, now, count); -} - -/** * kvm_mips_write_count() - Modify the count and update timer. * @vcpu: Virtual CPU. * @count: Guest CP0_Count value to set. @@ -540,23 +532,42 @@ int kvm_mips_set_count_hz(struct kvm_vcpu *vcpu, s64 count_hz) * kvm_mips_write_compare() - Modify compare and update timer. * @vcpu: Virtual CPU. * @compare: New CP0_Compare value. + * @ack: Whether to acknowledge timer interrupt. * * Update CP0_Compare to a new value and update the timeout. + * If @ack, atomically acknowledge any pending timer interrupt, otherwise ensure + * any pending timer interrupt is preserved. */ -void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare) +void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack) { struct mips_coproc *cop0 = vcpu->arch.cop0; + int dc; + u32 old_compare = kvm_read_c0_guest_compare(cop0); + ktime_t now; + uint32_t count; /* if unchanged, must just be an ack */ - if (kvm_read_c0_guest_compare(cop0) == compare) + if (old_compare == compare) { + if (!ack) + return; + kvm_mips_callbacks->dequeue_timer_int(vcpu); + kvm_write_c0_guest_compare(cop0, compare); return; + } + + /* freeze_hrtimer() takes care of timer interrupts <= count */ + dc = kvm_mips_count_disabled(vcpu); + if (!dc) + now = kvm_mips_freeze_hrtimer(vcpu, &count); + + if (ack) + kvm_mips_callbacks->dequeue_timer_int(vcpu); - /* Update compare */ kvm_write_c0_guest_compare(cop0, compare); - /* Update timeout if count enabled */ - if (!kvm_mips_count_disabled(vcpu)) - kvm_mips_update_hrtimer(vcpu); + /* resume_hrtimer() takes care of timer interrupts > count */ + if (!dc) + kvm_mips_resume_hrtimer(vcpu, now, count); } /** @@ -1095,9 +1106,9 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, /* If we are writing to COMPARE */ /* Clear pending timer interrupt, if any */ - kvm_mips_callbacks->dequeue_timer_int(vcpu); kvm_mips_write_compare(vcpu, - vcpu->arch.gprs[rt]); + vcpu->arch.gprs[rt], + true); } else if ((rd == MIPS_CP0_STATUS) && (sel == 0)) { unsigned int old_val, val, change; diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 70ef1a43c114..23b209463238 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1079,7 +1079,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_COALESCED_MMIO_PAGE_OFFSET; break; case KVM_CAP_MIPS_FPU: - r = !!cpu_has_fpu; + /* We don't handle systems with inconsistent cpu_has_fpu */ + r = !!raw_cpu_has_fpu; break; case KVM_CAP_MIPS_MSA: /* @@ -1555,8 +1556,10 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu) /* Disable MSA & FPU */ disable_msa(); - if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) + if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) { clear_c0_status(ST0_CU1 | ST0_FR); + disable_fpu_hazard(); + } vcpu->arch.fpu_inuse &= ~(KVM_MIPS_FPU_FPU | KVM_MIPS_FPU_MSA); } else if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) { set_c0_status(ST0_CU1); @@ -1567,6 +1570,7 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu) /* Disable FPU */ clear_c0_status(ST0_CU1 | ST0_FR); + disable_fpu_hazard(); } preempt_enable(); } diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index e0e1d0a611fc..60e4ad0016e7 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -268,6 +268,7 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, int even; struct kvm *kvm = vcpu->kvm; const int flush_dcache_mask = 0; + int ret; if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) { kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr); @@ -299,14 +300,18 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, pfn1 = kvm->arch.guest_pmap[gfn]; } - entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu)); entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) | (1 << 2) | (0x1 << 1); entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) | (1 << 2) | (0x1 << 1); - return kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1, - flush_dcache_mask); + preempt_disable(); + entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu)); + ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1, + flush_dcache_mask); + preempt_enable(); + + return ret; } EXPORT_SYMBOL_GPL(kvm_mips_handle_kseg0_tlb_fault); @@ -361,6 +366,7 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; struct kvm *kvm = vcpu->kvm; kvm_pfn_t pfn0, pfn1; + int ret; if ((tlb->tlb_hi & VPN2_MASK) == 0) { pfn0 = 0; @@ -387,9 +393,6 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, *hpa1 = pfn1 << PAGE_SHIFT; /* Get attributes from the Guest TLB */ - entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ? - kvm_mips_get_kernel_asid(vcpu) : - kvm_mips_get_user_asid(vcpu)); entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) | (tlb->tlb_lo0 & MIPS3_PG_D) | (tlb->tlb_lo0 & MIPS3_PG_V); entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) | @@ -398,8 +401,15 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc, tlb->tlb_lo0, tlb->tlb_lo1); - return kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1, - tlb->tlb_mask); + preempt_disable(); + entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ? + kvm_mips_get_kernel_asid(vcpu) : + kvm_mips_get_user_asid(vcpu)); + ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1, + tlb->tlb_mask); + preempt_enable(); + + return ret; } EXPORT_SYMBOL_GPL(kvm_mips_handle_mapped_seg_tlb_fault); diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index c4038d2a724c..caa5ea1038a0 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -546,7 +546,7 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, kvm_mips_write_count(vcpu, v); break; case KVM_REG_MIPS_CP0_COMPARE: - kvm_mips_write_compare(vcpu, v); + kvm_mips_write_compare(vcpu, v, false); break; case KVM_REG_MIPS_CP0_CAUSE: /* diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index d7b343170453..a07645c17818 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -40,6 +40,9 @@ #define KVM_MAX_VCORES NR_CPUS #define KVM_USER_MEM_SLOTS 512 +#include <asm/cputhreads.h> +#define KVM_MAX_VCPU_ID (threads_per_subcore * KVM_MAX_VCORES) + #ifdef CONFIG_KVM_MMIO #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #endif diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 6da41fab70fb..9282ccf1d136 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -544,10 +544,6 @@ struct kvm_vcpu_arch { struct kvm_s390_local_interrupt local_int; struct hrtimer ckc_timer; struct kvm_s390_pgm_info pgm; - union { - struct cpuid cpu_id; - u64 stidp_data; - }; struct gmap *gmap; struct kvm_guestdbg_info_arch guestdbg; unsigned long pfault_token; @@ -605,7 +601,7 @@ struct kvm_s390_cpu_model { __u64 fac_mask[S390_ARCH_FAC_LIST_SIZE_U64]; /* facility list requested by guest (in dma page) */ __u64 *fac_list; - struct cpuid cpu_id; + u64 cpuid; unsigned short ibc; }; diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index bab456be9a4f..994a66c09e8f 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -69,6 +69,7 @@ struct sclp_info { unsigned int max_cores; unsigned long hsa_size; unsigned long facilities; + unsigned int hmfai; }; extern struct sclp_info sclp; diff --git a/arch/s390/include/asm/sigp.h b/arch/s390/include/asm/sigp.h index ec60cf7fa0a2..1c8f33fca356 100644 --- a/arch/s390/include/asm/sigp.h +++ b/arch/s390/include/asm/sigp.h @@ -27,6 +27,7 @@ /* SIGP cpu status bits */ +#define SIGP_STATUS_INVALID_ORDER 0x00000002UL #define SIGP_STATUS_CHECK_STOP 0x00000010UL #define SIGP_STATUS_STOPPED 0x00000040UL #define SIGP_STATUS_EXT_CALL_PENDING 0x00000080UL diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 668c087513e5..c597201a5ca9 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -118,9 +118,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { }; /* upper facilities limit for kvm */ -unsigned long kvm_s390_fac_list_mask[] = { - 0xffe6fffbfcfdfc40UL, - 0x005e800000000000UL, +unsigned long kvm_s390_fac_list_mask[16] = { + 0xffe6000000000000UL, + 0x005e000000000000UL, }; unsigned long kvm_s390_fac_list_mask_size(void) @@ -638,6 +638,7 @@ static int kvm_s390_get_tod(struct kvm *kvm, struct kvm_device_attr *attr) static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr) { struct kvm_s390_vm_cpu_processor *proc; + u16 lowest_ibc, unblocked_ibc; int ret = 0; mutex_lock(&kvm->lock); @@ -652,9 +653,17 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr) } if (!copy_from_user(proc, (void __user *)attr->addr, sizeof(*proc))) { - memcpy(&kvm->arch.model.cpu_id, &proc->cpuid, - sizeof(struct cpuid)); - kvm->arch.model.ibc = proc->ibc; + kvm->arch.model.cpuid = proc->cpuid; + lowest_ibc = sclp.ibc >> 16 & 0xfff; + unblocked_ibc = sclp.ibc & 0xfff; + if (lowest_ibc) { + if (proc->ibc > unblocked_ibc) + kvm->arch.model.ibc = unblocked_ibc; + else if (proc->ibc < lowest_ibc) + kvm->arch.model.ibc = lowest_ibc; + else + kvm->arch.model.ibc = proc->ibc; + } memcpy(kvm->arch.model.fac_list, proc->fac_list, S390_ARCH_FAC_LIST_SIZE_BYTE); } else @@ -687,7 +696,7 @@ static int kvm_s390_get_processor(struct kvm *kvm, struct kvm_device_attr *attr) ret = -ENOMEM; goto out; } - memcpy(&proc->cpuid, &kvm->arch.model.cpu_id, sizeof(struct cpuid)); + proc->cpuid = kvm->arch.model.cpuid; proc->ibc = kvm->arch.model.ibc; memcpy(&proc->fac_list, kvm->arch.model.fac_list, S390_ARCH_FAC_LIST_SIZE_BYTE); @@ -1081,10 +1090,13 @@ static void kvm_s390_set_crycb_format(struct kvm *kvm) kvm->arch.crypto.crycbd |= CRYCB_FORMAT1; } -static void kvm_s390_get_cpu_id(struct cpuid *cpu_id) +static u64 kvm_s390_get_initial_cpuid(void) { - get_cpu_id(cpu_id); - cpu_id->version = 0xff; + struct cpuid cpuid; + + get_cpu_id(&cpuid); + cpuid.version = 0xff; + return *((u64 *) &cpuid); } static void kvm_s390_crypto_init(struct kvm *kvm) @@ -1175,7 +1187,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) memcpy(kvm->arch.model.fac_list, kvm->arch.model.fac_mask, S390_ARCH_FAC_LIST_SIZE_BYTE); - kvm_s390_get_cpu_id(&kvm->arch.model.cpu_id); + kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid(); kvm->arch.model.ibc = sclp.ibc & 0x0fff; kvm_s390_crypto_init(kvm); @@ -1624,7 +1636,6 @@ static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu) { struct kvm_s390_cpu_model *model = &vcpu->kvm->arch.model; - vcpu->arch.cpu_id = model->cpu_id; vcpu->arch.sie_block->ibc = model->ibc; if (test_kvm_facility(vcpu->kvm, 7)) vcpu->arch.sie_block->fac = (u32)(u64) model->fac_list; @@ -1645,11 +1656,14 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) kvm_s390_vcpu_setup_model(vcpu); - vcpu->arch.sie_block->ecb = 6; + vcpu->arch.sie_block->ecb = 0x02; + if (test_kvm_facility(vcpu->kvm, 9)) + vcpu->arch.sie_block->ecb |= 0x04; if (test_kvm_facility(vcpu->kvm, 50) && test_kvm_facility(vcpu->kvm, 73)) vcpu->arch.sie_block->ecb |= 0x10; - vcpu->arch.sie_block->ecb2 = 8; + if (test_kvm_facility(vcpu->kvm, 8)) + vcpu->arch.sie_block->ecb2 |= 0x08; vcpu->arch.sie_block->eca = 0xC1002000U; if (sclp.has_siif) vcpu->arch.sie_block->eca |= 1; @@ -2971,13 +2985,26 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, return; } +static inline unsigned long nonhyp_mask(int i) +{ + unsigned int nonhyp_fai = (sclp.hmfai << i * 2) >> 30; + + return 0x0000ffffffffffffUL >> (nonhyp_fai << 4); +} + static int __init kvm_s390_init(void) { + int i; + if (!sclp.has_sief2) { pr_info("SIE not available\n"); return -ENODEV; } + for (i = 0; i < 16; i++) + kvm_s390_fac_list_mask[i] |= + S390_lowcore.stfle_fac_list[i] & nonhyp_mask(i); + return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); } diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 0a1591d3d25d..95916fa7c670 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -439,7 +439,7 @@ static int handle_lpswe(struct kvm_vcpu *vcpu) static int handle_stidp(struct kvm_vcpu *vcpu) { - u64 stidp_data = vcpu->arch.stidp_data; + u64 stidp_data = vcpu->kvm->arch.model.cpuid; u64 operand2; int rc; ar_t ar; @@ -670,8 +670,9 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) if (vcpu->run->s.regs.gprs[reg1] & PFMF_RESERVED) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - /* Only provide non-quiescing support if the host supports it */ - if (vcpu->run->s.regs.gprs[reg1] & PFMF_NQ && !test_facility(14)) + /* Only provide non-quiescing support if enabled for the guest */ + if (vcpu->run->s.regs.gprs[reg1] & PFMF_NQ && + !test_kvm_facility(vcpu->kvm, 14)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); /* No support for conditional-SSKE */ @@ -744,7 +745,7 @@ static int handle_essa(struct kvm_vcpu *vcpu) { /* entries expected to be 1FF */ int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; - unsigned long *cbrlo, cbrle; + unsigned long *cbrlo; struct gmap *gmap; int i; @@ -765,17 +766,9 @@ static int handle_essa(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */ cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo); down_read(&gmap->mm->mmap_sem); - for (i = 0; i < entries; ++i) { - cbrle = cbrlo[i]; - if (unlikely(cbrle & ~PAGE_MASK || cbrle < 2 * PAGE_SIZE)) - /* invalid entry */ - break; - /* try to free backing */ - __gmap_zap(gmap, cbrle); - } + for (i = 0; i < entries; ++i) + __gmap_zap(gmap, cbrlo[i]); up_read(&gmap->mm->mmap_sem); - if (i < entries) - return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); return 0; } diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 77c22d685c7a..28ea0cab1f1b 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -240,6 +240,12 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu, struct kvm_s390_local_interrupt *li; int rc; + if (!test_kvm_facility(vcpu->kvm, 9)) { + *reg &= 0xffffffff00000000UL; + *reg |= SIGP_STATUS_INVALID_ORDER; + return SIGP_CC_STATUS_STORED; + } + li = &dst_vcpu->arch.local_int; if (atomic_read(li->cpuflags) & CPUSTAT_RUNNING) { /* running */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b7e394485a5f..c66e26280707 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -562,7 +562,6 @@ struct kvm_vcpu_arch { struct { u64 msr_val; u64 last_steal; - u64 accum_steal; struct gfn_to_hva_cache stime; struct kvm_steal_time steal; } st; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index cd54147cb365..739c0c594022 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -216,9 +216,9 @@ struct kvm_cpuid_entry2 { __u32 padding[3]; }; -#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) -#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) -#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX (1 << 0) +#define KVM_CPUID_FLAG_STATEFUL_FUNC (1 << 1) +#define KVM_CPUID_FLAG_STATE_READ_NEXT (1 << 2) /* for KVM_SET_CPUID2 */ struct kvm_cpuid2 { diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 54ead79e444b..dfb4c6476877 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -382,9 +382,6 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u32 i, nr_ioapic_pins; int idx; - /* kvm->irq_routing must be read after clearing - * KVM_SCAN_IOAPIC. */ - smp_mb(); idx = srcu_read_lock(&kvm->irq_srcu); table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); nr_ioapic_pins = min_t(u32, table->nr_rt_entries, diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1ff4dbb73fb7..850335a71d9f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1909,18 +1909,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, * since it has been deleted from active_mmu_pages but still can be found * at hast list. * - * for_each_gfn_indirect_valid_sp has skipped that kind of page and - * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), has skipped - * all the obsolete pages. + * for_each_gfn_valid_sp() has skipped that kind of pages. */ -#define for_each_gfn_sp(_kvm, _sp, _gfn) \ +#define for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ - if ((_sp)->gfn != (_gfn)) {} else + if ((_sp)->gfn != (_gfn) || is_obsolete_sp((_kvm), (_sp)) \ + || (_sp)->role.invalid) {} else #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ - for_each_gfn_sp(_kvm, _sp, _gfn) \ - if ((_sp)->role.direct || (_sp)->role.invalid) {} else + for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ + if ((_sp)->role.direct) {} else /* @sp->gfn should be write-protected at the call site */ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -1961,6 +1960,11 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } static void mmu_audit_disable(void) { } #endif +static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); +} + static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { @@ -2105,11 +2109,6 @@ static void clear_sp_write_flooding_count(u64 *spte) __clear_sp_write_flooding_count(sp); } -static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); -} - static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gaddr, @@ -2136,10 +2135,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - for_each_gfn_sp(vcpu->kvm, sp, gfn) { - if (is_obsolete_sp(vcpu->kvm, sp)) - continue; - + for_each_gfn_valid_sp(vcpu->kvm, sp, gfn) { if (!need_sync && sp->unsync) need_sync = true; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ee1c8a93871c..ab4a387b98b1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5046,8 +5046,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vmx_set_cr0(vcpu, cr0); /* enter rmode */ vmx->vcpu.arch.cr0 = cr0; + vmx_set_cr0(vcpu, cr0); /* enter rmode */ vmx_set_cr4(vcpu, 0); vmx_set_efer(vcpu, 0); vmx_fpu_activate(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9b7798c7b210..6c774cdf553c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2002,22 +2002,8 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) vcpu->arch.pv_time_enabled = false; } -static void accumulate_steal_time(struct kvm_vcpu *vcpu) -{ - u64 delta; - - if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) - return; - - delta = current->sched_info.run_delay - vcpu->arch.st.last_steal; - vcpu->arch.st.last_steal = current->sched_info.run_delay; - vcpu->arch.st.accum_steal = delta; -} - static void record_steal_time(struct kvm_vcpu *vcpu) { - accumulate_steal_time(vcpu); - if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -2025,9 +2011,26 @@ static void record_steal_time(struct kvm_vcpu *vcpu) &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) return; - vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal; - vcpu->arch.st.steal.version += 2; - vcpu->arch.st.accum_steal = 0; + if (vcpu->arch.st.steal.version & 1) + vcpu->arch.st.steal.version += 1; /* first time write, random junk */ + + vcpu->arch.st.steal.version += 1; + + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); + + smp_wmb(); + + vcpu->arch.st.steal.steal += current->sched_info.run_delay - + vcpu->arch.st.last_steal; + vcpu->arch.st.last_steal = current->sched_info.run_delay; + + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); + + smp_wmb(); + + vcpu->arch.st.steal.version += 1; kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); @@ -8355,19 +8358,21 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); +bool kvm_arch_has_irq_bypass(void) +{ + return kvm_x86_ops->update_pi_irte != NULL; +} + int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, struct irq_bypass_producer *prod) { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); - if (kvm_x86_ops->update_pi_irte) { - irqfd->producer = prod; - return kvm_x86_ops->update_pi_irte(irqfd->kvm, - prod->irq, irqfd->gsi, 1); - } + irqfd->producer = prod; - return -EINVAL; + return kvm_x86_ops->update_pi_irte(irqfd->kvm, + prod->irq, irqfd->gsi, 1); } void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, @@ -8377,11 +8382,6 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); - if (!kvm_x86_ops->update_pi_irte) { - WARN_ON(irqfd->producer != NULL); - return; - } - WARN_ON(irqfd->producer != prod); irqfd->producer = NULL; diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 5152b3898155..4814446a0024 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -468,11 +468,11 @@ static struct cyclecounter cyclecounter = { .mask = CLOCKSOURCE_MASK(56), }; -static struct timecounter timecounter; +static struct arch_timer_kvm_info arch_timer_kvm_info; -struct timecounter *arch_timer_get_timecounter(void) +struct arch_timer_kvm_info *arch_timer_get_kvm_info(void) { - return &timecounter; + return &arch_timer_kvm_info; } static void __init arch_counter_register(unsigned type) @@ -500,7 +500,8 @@ static void __init arch_counter_register(unsigned type) clocksource_register_hz(&clocksource_counter, arch_timer_rate); cyclecounter.mult = clocksource_counter.mult; cyclecounter.shift = clocksource_counter.shift; - timecounter_init(&timecounter, &cyclecounter, start_count); + timecounter_init(&arch_timer_kvm_info.timecounter, + &cyclecounter, start_count); /* 56 bits minimum, so we assume worst case rollover */ sched_clock_register(arch_timer_read_counter, 56, arch_timer_rate); @@ -744,6 +745,8 @@ static void __init arch_timer_init(void) arch_timer_register(); arch_timer_common_init(); + + arch_timer_kvm_info.virtual_irq = arch_timer_ppi[VIRT_PPI]; } static void __init arch_timer_of_init(struct device_node *np) diff --git a/drivers/irqchip/irq-gic-common.c b/drivers/irqchip/irq-gic-common.c index f174ce0ca361..2e9443be2b14 100644 --- a/drivers/irqchip/irq-gic-common.c +++ b/drivers/irqchip/irq-gic-common.c @@ -21,6 +21,19 @@ #include "irq-gic-common.h" +static const struct gic_kvm_info *gic_kvm_info; + +const struct gic_kvm_info *gic_get_kvm_info(void) +{ + return gic_kvm_info; +} + +void gic_set_kvm_info(const struct gic_kvm_info *info) +{ + BUG_ON(gic_kvm_info != NULL); + gic_kvm_info = info; +} + void gic_enable_quirks(u32 iidr, const struct gic_quirk *quirks, void *data) { diff --git a/drivers/irqchip/irq-gic-common.h b/drivers/irqchip/irq-gic-common.h index fff697db8e22..205e5fddf6da 100644 --- a/drivers/irqchip/irq-gic-common.h +++ b/drivers/irqchip/irq-gic-common.h @@ -19,6 +19,7 @@ #include <linux/of.h> #include <linux/irqdomain.h> +#include <linux/irqchip/arm-gic-common.h> struct gic_quirk { const char *desc; @@ -35,4 +36,6 @@ void gic_cpu_config(void __iomem *base, void (*sync_access)(void)); void gic_enable_quirks(u32 iidr, const struct gic_quirk *quirks, void *data); +void gic_set_kvm_info(const struct gic_kvm_info *info); + #endif /* _IRQ_GIC_COMMON_H */ diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 5b7d3c2129d8..05a856073714 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -15,6 +15,8 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#define pr_fmt(fmt) "GICv3: " fmt + #include <linux/acpi.h> #include <linux/cpu.h> #include <linux/cpu_pm.h> @@ -28,6 +30,7 @@ #include <linux/slab.h> #include <linux/irqchip.h> +#include <linux/irqchip/arm-gic-common.h> #include <linux/irqchip/arm-gic-v3.h> #include <asm/cputype.h> @@ -56,6 +59,8 @@ struct gic_chip_data { static struct gic_chip_data gic_data __read_mostly; static struct static_key supports_deactivate = STATIC_KEY_INIT_TRUE; +static struct gic_kvm_info gic_v3_kvm_info; + #define gic_data_rdist() (this_cpu_ptr(gic_data.rdists.rdist)) #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base) #define gic_data_rdist_sgi_base() (gic_data_rdist_rd_base() + SZ_64K) @@ -901,6 +906,30 @@ static int __init gic_validate_dist_version(void __iomem *dist_base) return 0; } +static void __init gic_of_setup_kvm_info(struct device_node *node) +{ + int ret; + struct resource r; + u32 gicv_idx; + + gic_v3_kvm_info.type = GIC_V3; + + gic_v3_kvm_info.maint_irq = irq_of_parse_and_map(node, 0); + if (!gic_v3_kvm_info.maint_irq) + return; + + if (of_property_read_u32(node, "#redistributor-regions", + &gicv_idx)) + gicv_idx = 1; + + gicv_idx += 3; /* Also skip GICD, GICC, GICH */ + ret = of_address_to_resource(node, gicv_idx, &r); + if (!ret) + gic_v3_kvm_info.vcpu = r; + + gic_set_kvm_info(&gic_v3_kvm_info); +} + static int __init gic_of_init(struct device_node *node, struct device_node *parent) { void __iomem *dist_base; @@ -952,8 +981,10 @@ static int __init gic_of_init(struct device_node *node, struct device_node *pare err = gic_init_bases(dist_base, rdist_regs, nr_redist_regions, redist_stride, &node->fwnode); - if (!err) + if (!err) { + gic_of_setup_kvm_info(node); return 0; + } out_unmap_rdist: for (i = 0; i < nr_redist_regions; i++) @@ -968,19 +999,25 @@ out_unmap_dist: IRQCHIP_DECLARE(gic_v3, "arm,gic-v3", gic_of_init); #ifdef CONFIG_ACPI -static void __iomem *dist_base; -static struct redist_region *redist_regs __initdata; -static u32 nr_redist_regions __initdata; -static bool single_redist; +static struct +{ + void __iomem *dist_base; + struct redist_region *redist_regs; + u32 nr_redist_regions; + bool single_redist; + u32 maint_irq; + int maint_irq_mode; + phys_addr_t vcpu_base; +} acpi_data __initdata; static void __init gic_acpi_register_redist(phys_addr_t phys_base, void __iomem *redist_base) { static int count = 0; - redist_regs[count].phys_base = phys_base; - redist_regs[count].redist_base = redist_base; - redist_regs[count].single_redist = single_redist; + acpi_data.redist_regs[count].phys_base = phys_base; + acpi_data.redist_regs[count].redist_base = redist_base; + acpi_data.redist_regs[count].single_redist = acpi_data.single_redist; count++; } @@ -1008,7 +1045,7 @@ gic_acpi_parse_madt_gicc(struct acpi_subtable_header *header, { struct acpi_madt_generic_interrupt *gicc = (struct acpi_madt_generic_interrupt *)header; - u32 reg = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK; + u32 reg = readl_relaxed(acpi_data.dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK; u32 size = reg == GIC_PIDR2_ARCH_GICv4 ? SZ_64K * 4 : SZ_64K * 2; void __iomem *redist_base; @@ -1025,7 +1062,7 @@ static int __init gic_acpi_collect_gicr_base(void) acpi_tbl_entry_handler redist_parser; enum acpi_madt_type type; - if (single_redist) { + if (acpi_data.single_redist) { type = ACPI_MADT_TYPE_GENERIC_INTERRUPT; redist_parser = gic_acpi_parse_madt_gicc; } else { @@ -1076,14 +1113,14 @@ static int __init gic_acpi_count_gicr_regions(void) count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR, gic_acpi_match_gicr, 0); if (count > 0) { - single_redist = false; + acpi_data.single_redist = false; return count; } count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT, gic_acpi_match_gicc, 0); if (count > 0) - single_redist = true; + acpi_data.single_redist = true; return count; } @@ -1103,36 +1140,117 @@ static bool __init acpi_validate_gic_table(struct acpi_subtable_header *header, if (count <= 0) return false; - nr_redist_regions = count; + acpi_data.nr_redist_regions = count; return true; } +static int __init gic_acpi_parse_virt_madt_gicc(struct acpi_subtable_header *header, + const unsigned long end) +{ + struct acpi_madt_generic_interrupt *gicc = + (struct acpi_madt_generic_interrupt *)header; + int maint_irq_mode; + static int first_madt = true; + + /* Skip unusable CPUs */ + if (!(gicc->flags & ACPI_MADT_ENABLED)) + return 0; + + maint_irq_mode = (gicc->flags & ACPI_MADT_VGIC_IRQ_MODE) ? + ACPI_EDGE_SENSITIVE : ACPI_LEVEL_SENSITIVE; + + if (first_madt) { + first_madt = false; + + acpi_data.maint_irq = gicc->vgic_interrupt; + acpi_data.maint_irq_mode = maint_irq_mode; + acpi_data.vcpu_base = gicc->gicv_base_address; + + return 0; + } + + /* + * The maintenance interrupt and GICV should be the same for every CPU + */ + if ((acpi_data.maint_irq != gicc->vgic_interrupt) || + (acpi_data.maint_irq_mode != maint_irq_mode) || + (acpi_data.vcpu_base != gicc->gicv_base_address)) + return -EINVAL; + + return 0; +} + +static bool __init gic_acpi_collect_virt_info(void) +{ + int count; + + count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT, + gic_acpi_parse_virt_madt_gicc, 0); + + return (count > 0); +} + #define ACPI_GICV3_DIST_MEM_SIZE (SZ_64K) +#define ACPI_GICV2_VCTRL_MEM_SIZE (SZ_4K) +#define ACPI_GICV2_VCPU_MEM_SIZE (SZ_8K) + +static void __init gic_acpi_setup_kvm_info(void) +{ + int irq; + + if (!gic_acpi_collect_virt_info()) { + pr_warn("Unable to get hardware information used for virtualization\n"); + return; + } + + gic_v3_kvm_info.type = GIC_V3; + + irq = acpi_register_gsi(NULL, acpi_data.maint_irq, + acpi_data.maint_irq_mode, + ACPI_ACTIVE_HIGH); + if (irq <= 0) + return; + + gic_v3_kvm_info.maint_irq = irq; + + if (acpi_data.vcpu_base) { + struct resource *vcpu = &gic_v3_kvm_info.vcpu; + + vcpu->flags = IORESOURCE_MEM; + vcpu->start = acpi_data.vcpu_base; + vcpu->end = vcpu->start + ACPI_GICV2_VCPU_MEM_SIZE - 1; + } + + gic_set_kvm_info(&gic_v3_kvm_info); +} static int __init gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end) { struct acpi_madt_generic_distributor *dist; struct fwnode_handle *domain_handle; + size_t size; int i, err; /* Get distributor base address */ dist = (struct acpi_madt_generic_distributor *)header; - dist_base = ioremap(dist->base_address, ACPI_GICV3_DIST_MEM_SIZE); - if (!dist_base) { + acpi_data.dist_base = ioremap(dist->base_address, + ACPI_GICV3_DIST_MEM_SIZE); + if (!acpi_data.dist_base) { pr_err("Unable to map GICD registers\n"); return -ENOMEM; } - err = gic_validate_dist_version(dist_base); + err = gic_validate_dist_version(acpi_data.dist_base); if (err) { - pr_err("No distributor detected at @%p, giving up", dist_base); + pr_err("No distributor detected at @%p, giving up", + acpi_data.dist_base); goto out_dist_unmap; } - redist_regs = kzalloc(sizeof(*redist_regs) * nr_redist_regions, - GFP_KERNEL); - if (!redist_regs) { + size = sizeof(*acpi_data.redist_regs) * acpi_data.nr_redist_regions; + acpi_data.redist_regs = kzalloc(size, GFP_KERNEL); + if (!acpi_data.redist_regs) { err = -ENOMEM; goto out_dist_unmap; } @@ -1141,29 +1259,31 @@ gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end) if (err) goto out_redist_unmap; - domain_handle = irq_domain_alloc_fwnode(dist_base); + domain_handle = irq_domain_alloc_fwnode(acpi_data.dist_base); if (!domain_handle) { err = -ENOMEM; goto out_redist_unmap; } - err = gic_init_bases(dist_base, redist_regs, nr_redist_regions, 0, - domain_handle); + err = gic_init_bases(acpi_data.dist_base, acpi_data.redist_regs, + acpi_data.nr_redist_regions, 0, domain_handle); if (err) goto out_fwhandle_free; acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, domain_handle); + gic_acpi_setup_kvm_info(); + return 0; out_fwhandle_free: irq_domain_free_fwnode(domain_handle); out_redist_unmap: - for (i = 0; i < nr_redist_regions; i++) - if (redist_regs[i].redist_base) - iounmap(redist_regs[i].redist_base); - kfree(redist_regs); + for (i = 0; i < acpi_data.nr_redist_regions; i++) + if (acpi_data.redist_regs[i].redist_base) + iounmap(acpi_data.redist_regs[i].redist_base); + kfree(acpi_data.redist_regs); out_dist_unmap: - iounmap(dist_base); + iounmap(acpi_data.dist_base); return err; } IRQCHIP_ACPI_DECLARE(gic_v3, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR, diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 282344b95ec2..3f1d9fd3a462 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -102,6 +102,8 @@ static struct static_key supports_deactivate = STATIC_KEY_INIT_TRUE; static struct gic_chip_data gic_data[CONFIG_ARM_GIC_MAX_NR] __read_mostly; +static struct gic_kvm_info gic_v2_kvm_info; + #ifdef CONFIG_GIC_NON_BANKED static void __iomem *gic_get_percpu_base(union gic_base *base) { @@ -1189,6 +1191,29 @@ static bool gic_check_eoimode(struct device_node *node, void __iomem **base) return true; } +static void __init gic_of_setup_kvm_info(struct device_node *node) +{ + int ret; + struct resource *vctrl_res = &gic_v2_kvm_info.vctrl; + struct resource *vcpu_res = &gic_v2_kvm_info.vcpu; + + gic_v2_kvm_info.type = GIC_V2; + + gic_v2_kvm_info.maint_irq = irq_of_parse_and_map(node, 0); + if (!gic_v2_kvm_info.maint_irq) + return; + + ret = of_address_to_resource(node, 2, vctrl_res); + if (ret) + return; + + ret = of_address_to_resource(node, 3, vcpu_res); + if (ret) + return; + + gic_set_kvm_info(&gic_v2_kvm_info); +} + int __init gic_of_init(struct device_node *node, struct device_node *parent) { @@ -1218,8 +1243,10 @@ gic_of_init(struct device_node *node, struct device_node *parent) __gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset, &node->fwnode); - if (!gic_cnt) + if (!gic_cnt) { gic_init_physaddr(node); + gic_of_setup_kvm_info(node); + } if (parent) { irq = irq_of_parse_and_map(node, 0); @@ -1245,7 +1272,14 @@ IRQCHIP_DECLARE(pl390, "arm,pl390", gic_of_init); #endif #ifdef CONFIG_ACPI -static phys_addr_t cpu_phy_base __initdata; +static struct +{ + phys_addr_t cpu_phys_base; + u32 maint_irq; + int maint_irq_mode; + phys_addr_t vctrl_base; + phys_addr_t vcpu_base; +} acpi_data __initdata; static int __init gic_acpi_parse_madt_cpu(struct acpi_subtable_header *header, @@ -1265,10 +1299,16 @@ gic_acpi_parse_madt_cpu(struct acpi_subtable_header *header, * All CPU interface addresses have to be the same. */ gic_cpu_base = processor->base_address; - if (cpu_base_assigned && gic_cpu_base != cpu_phy_base) + if (cpu_base_assigned && gic_cpu_base != acpi_data.cpu_phys_base) return -EINVAL; - cpu_phy_base = gic_cpu_base; + acpi_data.cpu_phys_base = gic_cpu_base; + acpi_data.maint_irq = processor->vgic_interrupt; + acpi_data.maint_irq_mode = (processor->flags & ACPI_MADT_VGIC_IRQ_MODE) ? + ACPI_EDGE_SENSITIVE : ACPI_LEVEL_SENSITIVE; + acpi_data.vctrl_base = processor->gich_base_address; + acpi_data.vcpu_base = processor->gicv_base_address; + cpu_base_assigned = 1; return 0; } @@ -1299,6 +1339,41 @@ static bool __init gic_validate_dist(struct acpi_subtable_header *header, #define ACPI_GICV2_DIST_MEM_SIZE (SZ_4K) #define ACPI_GIC_CPU_IF_MEM_SIZE (SZ_8K) +#define ACPI_GICV2_VCTRL_MEM_SIZE (SZ_4K) +#define ACPI_GICV2_VCPU_MEM_SIZE (SZ_8K) + +static void __init gic_acpi_setup_kvm_info(void) +{ + int irq; + struct resource *vctrl_res = &gic_v2_kvm_info.vctrl; + struct resource *vcpu_res = &gic_v2_kvm_info.vcpu; + + gic_v2_kvm_info.type = GIC_V2; + + if (!acpi_data.vctrl_base) + return; + + vctrl_res->flags = IORESOURCE_MEM; + vctrl_res->start = acpi_data.vctrl_base; + vctrl_res->end = vctrl_res->start + ACPI_GICV2_VCTRL_MEM_SIZE - 1; + + if (!acpi_data.vcpu_base) + return; + + vcpu_res->flags = IORESOURCE_MEM; + vcpu_res->start = acpi_data.vcpu_base; + vcpu_res->end = vcpu_res->start + ACPI_GICV2_VCPU_MEM_SIZE - 1; + + irq = acpi_register_gsi(NULL, acpi_data.maint_irq, + acpi_data.maint_irq_mode, + ACPI_ACTIVE_HIGH); + if (irq <= 0) + return; + + gic_v2_kvm_info.maint_irq = irq; + + gic_set_kvm_info(&gic_v2_kvm_info); +} static int __init gic_v2_acpi_init(struct acpi_subtable_header *header, const unsigned long end) @@ -1316,7 +1391,7 @@ static int __init gic_v2_acpi_init(struct acpi_subtable_header *header, return -EINVAL; } - cpu_base = ioremap(cpu_phy_base, ACPI_GIC_CPU_IF_MEM_SIZE); + cpu_base = ioremap(acpi_data.cpu_phys_base, ACPI_GIC_CPU_IF_MEM_SIZE); if (!cpu_base) { pr_err("Unable to map GICC registers\n"); return -ENOMEM; @@ -1356,6 +1431,8 @@ static int __init gic_v2_acpi_init(struct acpi_subtable_header *header, if (IS_ENABLED(CONFIG_ARM_GIC_V2M)) gicv2m_init(NULL, gic_data[0].domain); + gic_acpi_setup_kvm_info(); + return 0; } IRQCHIP_ACPI_DECLARE(gic_v2, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR, diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index 6804354c42bd..0ac520dd1b21 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -49,7 +49,9 @@ struct read_info_sccb { u8 _pad_117[119 - 117]; /* 117-118 */ u8 fac119; /* 119 */ u16 hcpua; /* 120-121 */ - u8 _pad_122[4096 - 122]; /* 122-4095 */ + u8 _pad_122[124 - 122]; /* 122-123 */ + u32 hmfai; /* 124-127 */ + u8 _pad_128[4096 - 128]; /* 128-4095 */ } __packed __aligned(PAGE_SIZE); static char sccb_early[PAGE_SIZE] __aligned(PAGE_SIZE) __initdata; @@ -155,6 +157,8 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb) sclp.mtid = (sccb->fac42 & 0x80) ? (sccb->fac42 & 31) : 0; sclp.mtid_cp = (sccb->fac42 & 0x80) ? (sccb->fac43 & 31) : 0; sclp.mtid_prev = (sccb->fac42 & 0x80) ? (sccb->fac66 & 31) : 0; + + sclp.hmfai = sccb->hmfai; } /* diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h index 25d0914481a2..caedb74c9210 100644 --- a/include/clocksource/arm_arch_timer.h +++ b/include/clocksource/arm_arch_timer.h @@ -49,11 +49,16 @@ enum arch_timer_reg { #define ARCH_TIMER_EVT_STREAM_FREQ 10000 /* 100us */ +struct arch_timer_kvm_info { + struct timecounter timecounter; + int virtual_irq; +}; + #ifdef CONFIG_ARM_ARCH_TIMER extern u32 arch_timer_get_rate(void); extern u64 (*arch_timer_read_counter)(void); -extern struct timecounter *arch_timer_get_timecounter(void); +extern struct arch_timer_kvm_info *arch_timer_get_kvm_info(void); #else @@ -67,11 +72,6 @@ static inline u64 arch_timer_read_counter(void) return 0; } -static inline struct timecounter *arch_timer_get_timecounter(void) -{ - return NULL; -} - #endif #endif diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 281caf847fad..be6037aa703d 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -25,6 +25,7 @@ #include <linux/spinlock.h> #include <linux/types.h> #include <kvm/iodev.h> +#include <linux/irqchip/arm-gic-common.h> #define VGIC_NR_IRQS_LEGACY 256 #define VGIC_NR_SGIS 16 @@ -353,15 +354,15 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, struct irq_phys_map *map); #define vgic_initialized(k) (!!((k)->arch.vgic.nr_cpus)) #define vgic_ready(k) ((k)->arch.vgic.ready) -int vgic_v2_probe(struct device_node *vgic_node, +int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info, const struct vgic_ops **ops, const struct vgic_params **params); #ifdef CONFIG_KVM_ARM_VGIC_V3 -int vgic_v3_probe(struct device_node *vgic_node, +int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info, const struct vgic_ops **ops, const struct vgic_params **params); #else -static inline int vgic_v3_probe(struct device_node *vgic_node, +static inline int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info, const struct vgic_ops **ops, const struct vgic_params **params) { diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h index 1551b5b2f4c2..f0f5d2671509 100644 --- a/include/linux/irqbypass.h +++ b/include/linux/irqbypass.h @@ -34,7 +34,7 @@ struct irq_bypass_consumer; /** * struct irq_bypass_producer - IRQ bypass producer definition * @node: IRQ bypass manager private list management - * @token: opaque token to match between producer and consumer + * @token: opaque token to match between producer and consumer (non-NULL) * @irq: Linux IRQ number for the producer device * @add_consumer: Connect the IRQ producer to an IRQ consumer (optional) * @del_consumer: Disconnect the IRQ producer from an IRQ consumer (optional) @@ -60,7 +60,7 @@ struct irq_bypass_producer { /** * struct irq_bypass_consumer - IRQ bypass consumer definition * @node: IRQ bypass manager private list management - * @token: opaque token to match between producer and consumer + * @token: opaque token to match between producer and consumer (non-NULL) * @add_producer: Connect the IRQ consumer to an IRQ producer * @del_producer: Disconnect the IRQ consumer from an IRQ producer * @stop: Perform any quiesce operations necessary prior to add/del (optional) diff --git a/include/linux/irqchip/arm-gic-common.h b/include/linux/irqchip/arm-gic-common.h new file mode 100644 index 000000000000..c647b0547bcd --- /dev/null +++ b/include/linux/irqchip/arm-gic-common.h @@ -0,0 +1,34 @@ +/* + * include/linux/irqchip/arm-gic-common.h + * + * Copyright (C) 2016 ARM Limited, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef __LINUX_IRQCHIP_ARM_GIC_COMMON_H +#define __LINUX_IRQCHIP_ARM_GIC_COMMON_H + +#include <linux/types.h> +#include <linux/ioport.h> + +enum gic_type { + GIC_V2, + GIC_V3, +}; + +struct gic_kvm_info { + /* GIC type */ + enum gic_type type; + /* Virtual CPU interface */ + struct resource vcpu; + /* Interrupt number */ + unsigned int maint_irq; + /* Virtual control interface */ + struct resource vctrl; +}; + +const struct gic_kvm_info *gic_get_kvm_info(void); + +#endif /* __LINUX_IRQCHIP_ARM_GIC_COMMON_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5276fe0916fc..92a0229044fb 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -35,6 +35,10 @@ #include <asm/kvm_host.h> +#ifndef KVM_MAX_VCPU_ID +#define KVM_MAX_VCPU_ID KVM_MAX_VCPUS +#endif + /* * The bit 16 ~ bit 31 of kvm_memory_region::flags are internally used * in kvm, other bits are visible for userspace which are defined in @@ -447,12 +451,13 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) { - struct kvm_vcpu *vcpu; + struct kvm_vcpu *vcpu = NULL; int i; - if (id < 0 || id >= KVM_MAX_VCPUS) + if (id < 0) return NULL; - vcpu = kvm_get_vcpu(kvm, id); + if (id < KVM_MAX_VCPUS) + vcpu = kvm_get_vcpu(kvm, id); if (vcpu && vcpu->vcpu_id == id) return vcpu; kvm_for_each_vcpu(i, vcpu, kvm) @@ -1091,6 +1096,11 @@ static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; } static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) { + /* + * Ensure the rest of the request is published to kvm_check_request's + * caller. Paired with the smp_mb__after_atomic in kvm_check_request. + */ + smp_wmb(); set_bit(req, &vcpu->requests); } @@ -1098,6 +1108,12 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) { if (test_bit(req, &vcpu->requests)) { clear_bit(req, &vcpu->requests); + + /* + * Ensure the rest of the request is visible to kvm_check_request's + * caller. Paired with the smp_wmb in kvm_make_request. + */ + smp_mb__after_atomic(); return true; } else { return false; @@ -1169,6 +1185,7 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS +bool kvm_arch_has_irq_bypass(void); int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *, struct irq_bypass_producer *); void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *, diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index a7f1f8032ec1..05ebf475104c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -865,6 +865,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_SPAPR_TCE_64 125 #define KVM_CAP_ARM_PMU_V3 126 #define KVM_CAP_VCPU_ATTRIBUTES 127 +#define KVM_CAP_MAX_VCPU_ID 128 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 9aaa35dd9144..409db3304471 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -17,7 +17,6 @@ */ #include <linux/cpu.h> -#include <linux/of_irq.h> #include <linux/kvm.h> #include <linux/kvm_host.h> #include <linux/interrupt.h> @@ -438,45 +437,29 @@ static struct notifier_block kvm_timer_cpu_nb = { .notifier_call = kvm_timer_cpu_notify, }; -static const struct of_device_id arch_timer_of_match[] = { - { .compatible = "arm,armv7-timer", }, - { .compatible = "arm,armv8-timer", }, - {}, -}; - int kvm_timer_hyp_init(void) { - struct device_node *np; - unsigned int ppi; + struct arch_timer_kvm_info *info; int err; - timecounter = arch_timer_get_timecounter(); - if (!timecounter) - return -ENODEV; + info = arch_timer_get_kvm_info(); + timecounter = &info->timecounter; - np = of_find_matching_node(NULL, arch_timer_of_match); - if (!np) { - kvm_err("kvm_arch_timer: can't find DT node\n"); + if (info->virtual_irq <= 0) { + kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n", + info->virtual_irq); return -ENODEV; } + host_vtimer_irq = info->virtual_irq; - ppi = irq_of_parse_and_map(np, 2); - if (!ppi) { - kvm_err("kvm_arch_timer: no virtual timer interrupt\n"); - err = -EINVAL; - goto out; - } - - err = request_percpu_irq(ppi, kvm_arch_timer_handler, + err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler, "kvm guest timer", kvm_get_running_vcpus()); if (err) { kvm_err("kvm_arch_timer: can't request interrupt %d (%d)\n", - ppi, err); + host_vtimer_irq, err); goto out; } - host_vtimer_irq = ppi; - err = __register_cpu_notifier(&kvm_timer_cpu_nb); if (err) { kvm_err("Cannot register timer CPU notifier\n"); @@ -489,14 +472,13 @@ int kvm_timer_hyp_init(void) goto out_free; } - kvm_info("%s IRQ%d\n", np->name, ppi); + kvm_info("virtual timer IRQ%d\n", host_vtimer_irq); on_each_cpu(kvm_timer_init_interrupt, NULL, 1); goto out; out_free: - free_percpu_irq(ppi, kvm_get_running_vcpus()); + free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus()); out: - of_node_put(np); return err; } diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c index 67ec334ce1d0..7e826c9b2b0a 100644 --- a/virt/kvm/arm/vgic-v2.c +++ b/virt/kvm/arm/vgic-v2.c @@ -20,9 +20,6 @@ #include <linux/kvm_host.h> #include <linux/interrupt.h> #include <linux/io.h> -#include <linux/of.h> -#include <linux/of_address.h> -#include <linux/of_irq.h> #include <linux/irqchip/arm-gic.h> @@ -186,38 +183,39 @@ static void vgic_cpu_init_lrs(void *params) } /** - * vgic_v2_probe - probe for a GICv2 compatible interrupt controller in DT - * @node: pointer to the DT node - * @ops: address of a pointer to the GICv2 operations - * @params: address of a pointer to HW-specific parameters + * vgic_v2_probe - probe for a GICv2 compatible interrupt controller + * @gic_kvm_info: pointer to the GIC description + * @ops: address of a pointer to the GICv2 operations + * @params: address of a pointer to HW-specific parameters * * Returns 0 if a GICv2 has been found, with the low level operations * in *ops and the HW parameters in *params. Returns an error code * otherwise. */ -int vgic_v2_probe(struct device_node *vgic_node, - const struct vgic_ops **ops, - const struct vgic_params **params) +int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info, + const struct vgic_ops **ops, + const struct vgic_params **params) { int ret; - struct resource vctrl_res; - struct resource vcpu_res; struct vgic_params *vgic = &vgic_v2_params; + const struct resource *vctrl_res = &gic_kvm_info->vctrl; + const struct resource *vcpu_res = &gic_kvm_info->vcpu; - vgic->maint_irq = irq_of_parse_and_map(vgic_node, 0); - if (!vgic->maint_irq) { - kvm_err("error getting vgic maintenance irq from DT\n"); + if (!gic_kvm_info->maint_irq) { + kvm_err("error getting vgic maintenance irq\n"); ret = -ENXIO; goto out; } + vgic->maint_irq = gic_kvm_info->maint_irq; - ret = of_address_to_resource(vgic_node, 2, &vctrl_res); - if (ret) { - kvm_err("Cannot obtain GICH resource\n"); + if (!gic_kvm_info->vctrl.start) { + kvm_err("GICH not present in the firmware table\n"); + ret = -ENXIO; goto out; } - vgic->vctrl_base = of_iomap(vgic_node, 2); + vgic->vctrl_base = ioremap(gic_kvm_info->vctrl.start, + resource_size(&gic_kvm_info->vctrl)); if (!vgic->vctrl_base) { kvm_err("Cannot ioremap GICH\n"); ret = -ENOMEM; @@ -228,29 +226,23 @@ int vgic_v2_probe(struct device_node *vgic_node, vgic->nr_lr = (vgic->nr_lr & 0x3f) + 1; ret = create_hyp_io_mappings(vgic->vctrl_base, - vgic->vctrl_base + resource_size(&vctrl_res), - vctrl_res.start); + vgic->vctrl_base + resource_size(vctrl_res), + vctrl_res->start); if (ret) { kvm_err("Cannot map VCTRL into hyp\n"); goto out_unmap; } - if (of_address_to_resource(vgic_node, 3, &vcpu_res)) { - kvm_err("Cannot obtain GICV resource\n"); - ret = -ENXIO; - goto out_unmap; - } - - if (!PAGE_ALIGNED(vcpu_res.start)) { + if (!PAGE_ALIGNED(vcpu_res->start)) { kvm_err("GICV physical address 0x%llx not page aligned\n", - (unsigned long long)vcpu_res.start); + (unsigned long long)vcpu_res->start); ret = -ENXIO; goto out_unmap; } - if (!PAGE_ALIGNED(resource_size(&vcpu_res))) { + if (!PAGE_ALIGNED(resource_size(vcpu_res))) { kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n", - (unsigned long long)resource_size(&vcpu_res), + (unsigned long long)resource_size(vcpu_res), PAGE_SIZE); ret = -ENXIO; goto out_unmap; @@ -259,10 +251,10 @@ int vgic_v2_probe(struct device_node *vgic_node, vgic->can_emulate_gicv2 = true; kvm_register_device_ops(&kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2); - vgic->vcpu_base = vcpu_res.start; + vgic->vcpu_base = vcpu_res->start; - kvm_info("%s@%llx IRQ%d\n", vgic_node->name, - vctrl_res.start, vgic->maint_irq); + kvm_info("GICH base=0x%llx, GICV base=0x%llx, IRQ=%d\n", + gic_kvm_info->vctrl.start, vgic->vcpu_base, vgic->maint_irq); vgic->type = VGIC_V2; vgic->max_gic_vcpus = VGIC_V2_MAX_CPUS; @@ -276,6 +268,5 @@ int vgic_v2_probe(struct device_node *vgic_node, out_unmap: iounmap(vgic->vctrl_base); out: - of_node_put(vgic_node); return ret; } diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c index 999bdc6d9d9f..c02a1b1cf855 100644 --- a/virt/kvm/arm/vgic-v3.c +++ b/virt/kvm/arm/vgic-v3.c @@ -20,11 +20,9 @@ #include <linux/kvm_host.h> #include <linux/interrupt.h> #include <linux/io.h> -#include <linux/of.h> -#include <linux/of_address.h> -#include <linux/of_irq.h> #include <linux/irqchip/arm-gic-v3.h> +#include <linux/irqchip/arm-gic-common.h> #include <asm/kvm_emulate.h> #include <asm/kvm_arm.h> @@ -222,30 +220,24 @@ static void vgic_cpu_init_lrs(void *params) } /** - * vgic_v3_probe - probe for a GICv3 compatible interrupt controller in DT - * @node: pointer to the DT node - * @ops: address of a pointer to the GICv3 operations - * @params: address of a pointer to HW-specific parameters + * vgic_v3_probe - probe for a GICv3 compatible interrupt controller + * @gic_kvm_info: pointer to the GIC description + * @ops: address of a pointer to the GICv3 operations + * @params: address of a pointer to HW-specific parameters * * Returns 0 if a GICv3 has been found, with the low level operations * in *ops and the HW parameters in *params. Returns an error code * otherwise. */ -int vgic_v3_probe(struct device_node *vgic_node, +int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info, const struct vgic_ops **ops, const struct vgic_params **params) { int ret = 0; - u32 gicv_idx; - struct resource vcpu_res; struct vgic_params *vgic = &vgic_v3_params; + const struct resource *vcpu_res = &gic_kvm_info->vcpu; - vgic->maint_irq = irq_of_parse_and_map(vgic_node, 0); - if (!vgic->maint_irq) { - kvm_err("error getting vgic maintenance irq from DT\n"); - ret = -ENXIO; - goto out; - } + vgic->maint_irq = gic_kvm_info->maint_irq; ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2); @@ -256,24 +248,19 @@ int vgic_v3_probe(struct device_node *vgic_node, vgic->nr_lr = (ich_vtr_el2 & 0xf) + 1; vgic->can_emulate_gicv2 = false; - if (of_property_read_u32(vgic_node, "#redistributor-regions", &gicv_idx)) - gicv_idx = 1; - - gicv_idx += 3; /* Also skip GICD, GICC, GICH */ - if (of_address_to_resource(vgic_node, gicv_idx, &vcpu_res)) { + if (!vcpu_res->start) { kvm_info("GICv3: no GICV resource entry\n"); vgic->vcpu_base = 0; - } else if (!PAGE_ALIGNED(vcpu_res.start)) { + } else if (!PAGE_ALIGNED(vcpu_res->start)) { pr_warn("GICV physical address 0x%llx not page aligned\n", - (unsigned long long)vcpu_res.start); + (unsigned long long)vcpu_res->start); vgic->vcpu_base = 0; - } else if (!PAGE_ALIGNED(resource_size(&vcpu_res))) { + } else if (!PAGE_ALIGNED(resource_size(vcpu_res))) { pr_warn("GICV size 0x%llx not a multiple of page size 0x%lx\n", - (unsigned long long)resource_size(&vcpu_res), + (unsigned long long)resource_size(vcpu_res), PAGE_SIZE); - vgic->vcpu_base = 0; } else { - vgic->vcpu_base = vcpu_res.start; + vgic->vcpu_base = vcpu_res->start; vgic->can_emulate_gicv2 = true; kvm_register_device_ops(&kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2); @@ -286,15 +273,13 @@ int vgic_v3_probe(struct device_node *vgic_node, vgic->type = VGIC_V3; vgic->max_gic_vcpus = VGIC_V3_MAX_CPUS; - kvm_info("%s@%llx IRQ%d\n", vgic_node->name, - vcpu_res.start, vgic->maint_irq); + kvm_info("GICV base=0x%llx, IRQ=%d\n", + vgic->vcpu_base, vgic->maint_irq); on_each_cpu(vgic_cpu_init_lrs, vgic, 1); *ops = &vgic_v3_ops; *params = vgic; -out: - of_node_put(vgic_node); return ret; } diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 00429b392c61..60668a7f319a 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -21,9 +21,7 @@ #include <linux/kvm_host.h> #include <linux/interrupt.h> #include <linux/io.h> -#include <linux/of.h> -#include <linux/of_address.h> -#include <linux/of_irq.h> +#include <linux/irq.h> #include <linux/rculist.h> #include <linux/uaccess.h> @@ -33,6 +31,7 @@ #include <trace/events/kvm.h> #include <asm/kvm.h> #include <kvm/iodev.h> +#include <linux/irqchip/arm-gic-common.h> #define CREATE_TRACE_POINTS #include "trace.h" @@ -2389,33 +2388,38 @@ static struct notifier_block vgic_cpu_nb = { .notifier_call = vgic_cpu_notify, }; -static const struct of_device_id vgic_ids[] = { - { .compatible = "arm,cortex-a15-gic", .data = vgic_v2_probe, }, - { .compatible = "arm,cortex-a7-gic", .data = vgic_v2_probe, }, - { .compatible = "arm,gic-400", .data = vgic_v2_probe, }, - { .compatible = "arm,gic-v3", .data = vgic_v3_probe, }, - {}, -}; - -int kvm_vgic_hyp_init(void) +static int kvm_vgic_probe(void) { - const struct of_device_id *matched_id; - const int (*vgic_probe)(struct device_node *,const struct vgic_ops **, - const struct vgic_params **); - struct device_node *vgic_node; + const struct gic_kvm_info *gic_kvm_info; int ret; - vgic_node = of_find_matching_node_and_match(NULL, - vgic_ids, &matched_id); - if (!vgic_node) { - kvm_err("error: no compatible GIC node found\n"); + gic_kvm_info = gic_get_kvm_info(); + if (!gic_kvm_info) return -ENODEV; + + switch (gic_kvm_info->type) { + case GIC_V2: + ret = vgic_v2_probe(gic_kvm_info, &vgic_ops, &vgic); + break; + case GIC_V3: + ret = vgic_v3_probe(gic_kvm_info, &vgic_ops, &vgic); + break; + default: + ret = -ENODEV; } - vgic_probe = matched_id->data; - ret = vgic_probe(vgic_node, &vgic_ops, &vgic); - if (ret) + return ret; +} + +int kvm_vgic_hyp_init(void) +{ + int ret; + + ret = kvm_vgic_probe(); + if (ret) { + kvm_err("error: KVM vGIC probing failed\n"); return ret; + } ret = request_percpu_irq(vgic->maint_irq, vgic_maintenance_handler, "vgic", kvm_get_running_vcpus()); diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 46dbc0a7dfc1..e469b6012471 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -408,15 +408,17 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) */ fdput(f); #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS - irqfd->consumer.token = (void *)irqfd->eventfd; - irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer; - irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer; - irqfd->consumer.stop = kvm_arch_irq_bypass_stop; - irqfd->consumer.start = kvm_arch_irq_bypass_start; - ret = irq_bypass_register_consumer(&irqfd->consumer); - if (ret) - pr_info("irq bypass consumer (token %p) registration fails: %d\n", + if (kvm_arch_has_irq_bypass()) { + irqfd->consumer.token = (void *)irqfd->eventfd; + irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer; + irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer; + irqfd->consumer.stop = kvm_arch_irq_bypass_stop; + irqfd->consumer.start = kvm_arch_irq_bypass_start; + ret = irq_bypass_register_consumer(&irqfd->consumer); + if (ret) + pr_info("irq bypass consumer (token %p) registration fails: %d\n", irqfd->consumer.token, ret); + } #endif return 0; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4fd482fb9260..ed3d9bb18a56 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2272,7 +2272,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) int r; struct kvm_vcpu *vcpu; - if (id >= KVM_MAX_VCPUS) + if (id >= KVM_MAX_VCPU_ID) return -EINVAL; vcpu = kvm_arch_vcpu_create(kvm, id); @@ -2746,6 +2746,8 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_MULTI_ADDRESS_SPACE: return KVM_ADDRESS_SPACE_NUM; #endif + case KVM_CAP_MAX_VCPU_ID: + return KVM_MAX_VCPU_ID; default: break; } diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index 09a03b5a21ff..52abac4bb6a2 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -89,6 +89,9 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer) struct irq_bypass_producer *tmp; struct irq_bypass_consumer *consumer; + if (!producer->token) + return -EINVAL; + might_sleep(); if (!try_module_get(THIS_MODULE)) @@ -136,6 +139,9 @@ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) struct irq_bypass_producer *tmp; struct irq_bypass_consumer *consumer; + if (!producer->token) + return; + might_sleep(); if (!try_module_get(THIS_MODULE)) @@ -177,7 +183,8 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer) struct irq_bypass_consumer *tmp; struct irq_bypass_producer *producer; - if (!consumer->add_producer || !consumer->del_producer) + if (!consumer->token || + !consumer->add_producer || !consumer->del_producer) return -EINVAL; might_sleep(); @@ -227,6 +234,9 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) struct irq_bypass_consumer *tmp; struct irq_bypass_producer *producer; + if (!consumer->token) + return; + might_sleep(); if (!try_module_get(THIS_MODULE)) |