diff options
Diffstat (limited to 'arch/arm/kvm')
-rw-r--r-- | arch/arm/kvm/Kconfig | 3 | ||||
-rw-r--r-- | arch/arm/kvm/Makefile | 1 | ||||
-rw-r--r-- | arch/arm/kvm/arm.c | 68 | ||||
-rw-r--r-- | arch/arm/kvm/coproc.c | 70 | ||||
-rw-r--r-- | arch/arm/kvm/coproc.h | 6 | ||||
-rw-r--r-- | arch/arm/kvm/coproc_a15.c | 2 | ||||
-rw-r--r-- | arch/arm/kvm/coproc_a7.c | 2 | ||||
-rw-r--r-- | arch/arm/kvm/handle_exit.c | 8 | ||||
-rw-r--r-- | arch/arm/kvm/interrupts.S | 11 | ||||
-rw-r--r-- | arch/arm/kvm/mmu.c | 435 | ||||
-rw-r--r-- | arch/arm/kvm/psci.c | 17 | ||||
-rw-r--r-- | arch/arm/kvm/trace.h | 50 |
12 files changed, 550 insertions, 123 deletions
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 466bd299b1a8..338ace78ed18 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -21,8 +21,11 @@ config KVM select PREEMPT_NOTIFIERS select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT + select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_MMIO select KVM_ARM_HOST + select KVM_GENERIC_DIRTYLOG_READ_PROTECT + select SRCU depends on ARM_VIRT_EXT && ARM_LPAE ---help--- Support hosting virtualized guest machines. You will also diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index f7057ed045b6..443b8bea43e9 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -22,4 +22,5 @@ obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic.o obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2.o +obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2-emul.o obj-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 2d6d91001062..07e7eb1d7ab6 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -132,6 +132,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) /* Mark the initial VMID generation invalid */ kvm->arch.vmid_gen = 0; + /* The maximum number of VCPUs is limited by the host's GIC model */ + kvm->arch.max_vcpus = kvm_vgic_get_max_vcpus(); + return ret; out_free_stage2_pgd: kvm_free_stage2_pgd(kvm); @@ -218,6 +221,11 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) goto out; } + if (id >= kvm->arch.max_vcpus) { + err = -EINVAL; + goto out; + } + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu) { err = -ENOMEM; @@ -241,9 +249,8 @@ out: return ERR_PTR(err); } -int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - return 0; } void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) @@ -281,15 +288,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vcpu->cpu = cpu; vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state); - /* - * Check whether this vcpu requires the cache to be flushed on - * this physical CPU. This is a consequence of doing dcache - * operations by set/way on this vcpu. We do it here to be in - * a non-preemptible section. - */ - if (cpumask_test_and_clear_cpu(cpu, &vcpu->arch.require_dcache_flush)) - flush_cache_all(); /* We'd really want v7_flush_dcache_all() */ - kvm_arm_set_running_vcpu(vcpu); } @@ -541,7 +539,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) ret = kvm_call_hyp(__kvm_vcpu_run, vcpu); vcpu->mode = OUTSIDE_GUEST_MODE; - vcpu->arch.last_pcpu = smp_processor_id(); kvm_guest_exit(); trace_kvm_exit(*vcpu_pc(vcpu)); /* @@ -787,9 +784,39 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } +/** + * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot + * @kvm: kvm instance + * @log: slot id and address to which we copy the log + * + * Steps 1-4 below provide general overview of dirty page logging. See + * kvm_get_dirty_log_protect() function description for additional details. + * + * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we + * always flush the TLB (step 4) even if previous step failed and the dirty + * bitmap may be corrupt. Regardless of previous outcome the KVM logging API + * does not preclude user space subsequent dirty log read. Flushing TLB ensures + * writes will be marked dirty for next log read. + * + * 1. Take a snapshot of the bit and clear it if needed. + * 2. Write protect the corresponding page. + * 3. Copy the snapshot to the userspace. + * 4. Flush TLB's if needed. + */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - return -EINVAL; + bool is_dirty = false; + int r; + + mutex_lock(&kvm->slots_lock); + + r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); + + if (is_dirty) + kvm_flush_remote_tlbs(kvm); + + mutex_unlock(&kvm->slots_lock); + return r; } static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, @@ -821,7 +848,7 @@ long kvm_arch_vm_ioctl(struct file *filp, switch (ioctl) { case KVM_CREATE_IRQCHIP: { if (vgic_present) - return kvm_vgic_create(kvm); + return kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); else return -ENXIO; } @@ -1045,6 +1072,19 @@ static void check_kvm_target_cpu(void *ret) *(int *)ret = kvm_target_cpu(); } +struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) +{ + struct kvm_vcpu *vcpu; + int i; + + mpidr &= MPIDR_HWID_BITMASK; + kvm_for_each_vcpu(i, vcpu, kvm) { + if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu)) + return vcpu; + } + return NULL; +} + /** * Initialize Hyp-mode and memory mappings on all CPUs. */ diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index 7928dbdf2102..f3d88dc388bc 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -189,82 +189,40 @@ static bool access_l2ectlr(struct kvm_vcpu *vcpu, return true; } -/* See note at ARM ARM B1.14.4 */ +/* + * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). + */ static bool access_dcsw(struct kvm_vcpu *vcpu, const struct coproc_params *p, const struct coproc_reg *r) { - unsigned long val; - int cpu; - if (!p->is_write) return read_from_write_only(vcpu, p); - cpu = get_cpu(); - - cpumask_setall(&vcpu->arch.require_dcache_flush); - cpumask_clear_cpu(cpu, &vcpu->arch.require_dcache_flush); - - /* If we were already preempted, take the long way around */ - if (cpu != vcpu->arch.last_pcpu) { - flush_cache_all(); - goto done; - } - - val = *vcpu_reg(vcpu, p->Rt1); - - switch (p->CRm) { - case 6: /* Upgrade DCISW to DCCISW, as per HCR.SWIO */ - case 14: /* DCCISW */ - asm volatile("mcr p15, 0, %0, c7, c14, 2" : : "r" (val)); - break; - - case 10: /* DCCSW */ - asm volatile("mcr p15, 0, %0, c7, c10, 2" : : "r" (val)); - break; - } - -done: - put_cpu(); - + kvm_set_way_flush(vcpu); return true; } /* * Generic accessor for VM registers. Only called as long as HCR_TVM - * is set. + * is set. If the guest enables the MMU, we stop trapping the VM + * sys_regs and leave it in complete control of the caches. + * + * Used by the cpu-specific code. */ -static bool access_vm_reg(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) +bool access_vm_reg(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + const struct coproc_reg *r) { + bool was_enabled = vcpu_has_cache_enabled(vcpu); + BUG_ON(!p->is_write); vcpu->arch.cp15[r->reg] = *vcpu_reg(vcpu, p->Rt1); if (p->is_64bit) vcpu->arch.cp15[r->reg + 1] = *vcpu_reg(vcpu, p->Rt2); - return true; -} - -/* - * SCTLR accessor. Only called as long as HCR_TVM is set. If the - * guest enables the MMU, we stop trapping the VM sys_regs and leave - * it in complete control of the caches. - * - * Used by the cpu-specific code. - */ -bool access_sctlr(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) -{ - access_vm_reg(vcpu, p, r); - - if (vcpu_has_cache_enabled(vcpu)) { /* MMU+Caches enabled? */ - vcpu->arch.hcr &= ~HCR_TVM; - stage2_flush_vm(vcpu->kvm); - } - + kvm_toggle_cache(vcpu, was_enabled); return true; } diff --git a/arch/arm/kvm/coproc.h b/arch/arm/kvm/coproc.h index 1a44bbe39643..88d24a3a9778 100644 --- a/arch/arm/kvm/coproc.h +++ b/arch/arm/kvm/coproc.h @@ -153,8 +153,8 @@ static inline int cmp_reg(const struct coproc_reg *i1, #define is64 .is_64 = true #define is32 .is_64 = false -bool access_sctlr(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r); +bool access_vm_reg(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + const struct coproc_reg *r); #endif /* __ARM_KVM_COPROC_LOCAL_H__ */ diff --git a/arch/arm/kvm/coproc_a15.c b/arch/arm/kvm/coproc_a15.c index e6f4ae48bda9..a7136757d373 100644 --- a/arch/arm/kvm/coproc_a15.c +++ b/arch/arm/kvm/coproc_a15.c @@ -34,7 +34,7 @@ static const struct coproc_reg a15_regs[] = { /* SCTLR: swapped by interrupt.S. */ { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32, - access_sctlr, reset_val, c1_SCTLR, 0x00C50078 }, + access_vm_reg, reset_val, c1_SCTLR, 0x00C50078 }, }; static struct kvm_coproc_target_table a15_target_table = { diff --git a/arch/arm/kvm/coproc_a7.c b/arch/arm/kvm/coproc_a7.c index 17fc7cd479d3..b19e46d1b2c0 100644 --- a/arch/arm/kvm/coproc_a7.c +++ b/arch/arm/kvm/coproc_a7.c @@ -37,7 +37,7 @@ static const struct coproc_reg a7_regs[] = { /* SCTLR: swapped by interrupt.S. */ { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32, - access_sctlr, reset_val, c1_SCTLR, 0x00C50878 }, + access_vm_reg, reset_val, c1_SCTLR, 0x00C50878 }, }; static struct kvm_coproc_target_table a7_target_table = { diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c index a96a8043277c..95f12b2ccdcb 100644 --- a/arch/arm/kvm/handle_exit.c +++ b/arch/arm/kvm/handle_exit.c @@ -87,11 +87,13 @@ static int handle_dabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run) */ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) { - trace_kvm_wfi(*vcpu_pc(vcpu)); - if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE) + if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE) { + trace_kvm_wfx(*vcpu_pc(vcpu), true); kvm_vcpu_on_spin(vcpu); - else + } else { + trace_kvm_wfx(*vcpu_pc(vcpu), false); kvm_vcpu_block(vcpu); + } kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 01dcb0e752d9..79caf79b304a 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -66,6 +66,17 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +/** + * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs + * + * Reuses __kvm_tlb_flush_vmid_ipa() for ARMv7, without passing address + * parameter + */ + +ENTRY(__kvm_tlb_flush_vmid) + b __kvm_tlb_flush_vmid_ipa +ENDPROC(__kvm_tlb_flush_vmid) + /******************************************************************** * Flush TLBs and instruction caches of all CPUs inside the inner-shareable * domain, for all VMIDs diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 1dc9778a00af..3e6859bc3e11 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -45,6 +45,26 @@ static phys_addr_t hyp_idmap_vector; #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) #define kvm_pmd_huge(_x) (pmd_huge(_x) || pmd_trans_huge(_x)) +#define kvm_pud_huge(_x) pud_huge(_x) + +#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0) +#define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1) + +static bool memslot_is_logging(struct kvm_memory_slot *memslot) +{ + return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); +} + +/** + * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8 + * @kvm: pointer to kvm structure. + * + * Interface to HYP function to flush all VM TLB entries + */ +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) { @@ -58,6 +78,45 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +/* + * D-Cache management functions. They take the page table entries by + * value, as they are flushing the cache using the kernel mapping (or + * kmap on 32bit). + */ +static void kvm_flush_dcache_pte(pte_t pte) +{ + __kvm_flush_dcache_pte(pte); +} + +static void kvm_flush_dcache_pmd(pmd_t pmd) +{ + __kvm_flush_dcache_pmd(pmd); +} + +static void kvm_flush_dcache_pud(pud_t pud) +{ + __kvm_flush_dcache_pud(pud); +} + +/** + * stage2_dissolve_pmd() - clear and flush huge PMD entry + * @kvm: pointer to kvm structure. + * @addr: IPA + * @pmd: pmd pointer for IPA + * + * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all + * pages in the range dirty. + */ +static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) +{ + if (!kvm_pmd_huge(*pmd)) + return; + + pmd_clear(pmd); + kvm_tlb_flush_vmid_ipa(kvm, addr); + put_page(virt_to_page(pmd)); +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { @@ -119,6 +178,26 @@ static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) put_page(virt_to_page(pmd)); } +/* + * Unmapping vs dcache management: + * + * If a guest maps certain memory pages as uncached, all writes will + * bypass the data cache and go directly to RAM. However, the CPUs + * can still speculate reads (not writes) and fill cache lines with + * data. + * + * Those cache lines will be *clean* cache lines though, so a + * clean+invalidate operation is equivalent to an invalidate + * operation, because no cache lines are marked dirty. + * + * Those clean cache lines could be filled prior to an uncached write + * by the guest, and the cache coherent IO subsystem would therefore + * end up writing old data to disk. + * + * This is why right after unmapping a page/section and invalidating + * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure + * the IO subsystem will never hit in the cache. + */ static void unmap_ptes(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr, phys_addr_t end) { @@ -128,9 +207,16 @@ static void unmap_ptes(struct kvm *kvm, pmd_t *pmd, start_pte = pte = pte_offset_kernel(pmd, addr); do { if (!pte_none(*pte)) { + pte_t old_pte = *pte; + kvm_set_pte(pte, __pte(0)); - put_page(virt_to_page(pte)); kvm_tlb_flush_vmid_ipa(kvm, addr); + + /* No need to invalidate the cache for device mappings */ + if ((pte_val(old_pte) & PAGE_S2_DEVICE) != PAGE_S2_DEVICE) + kvm_flush_dcache_pte(old_pte); + + put_page(virt_to_page(pte)); } } while (pte++, addr += PAGE_SIZE, addr != end); @@ -149,8 +235,13 @@ static void unmap_pmds(struct kvm *kvm, pud_t *pud, next = kvm_pmd_addr_end(addr, end); if (!pmd_none(*pmd)) { if (kvm_pmd_huge(*pmd)) { + pmd_t old_pmd = *pmd; + pmd_clear(pmd); kvm_tlb_flush_vmid_ipa(kvm, addr); + + kvm_flush_dcache_pmd(old_pmd); + put_page(virt_to_page(pmd)); } else { unmap_ptes(kvm, pmd, addr, next); @@ -173,8 +264,13 @@ static void unmap_puds(struct kvm *kvm, pgd_t *pgd, next = kvm_pud_addr_end(addr, end); if (!pud_none(*pud)) { if (pud_huge(*pud)) { + pud_t old_pud = *pud; + pud_clear(pud); kvm_tlb_flush_vmid_ipa(kvm, addr); + + kvm_flush_dcache_pud(old_pud); + put_page(virt_to_page(pud)); } else { unmap_pmds(kvm, pud, addr, next); @@ -209,10 +305,9 @@ static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd, pte = pte_offset_kernel(pmd, addr); do { - if (!pte_none(*pte)) { - hva_t hva = gfn_to_hva(kvm, addr >> PAGE_SHIFT); - kvm_flush_dcache_to_poc((void*)hva, PAGE_SIZE); - } + if (!pte_none(*pte) && + (pte_val(*pte) & PAGE_S2_DEVICE) != PAGE_S2_DEVICE) + kvm_flush_dcache_pte(*pte); } while (pte++, addr += PAGE_SIZE, addr != end); } @@ -226,12 +321,10 @@ static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud, do { next = kvm_pmd_addr_end(addr, end); if (!pmd_none(*pmd)) { - if (kvm_pmd_huge(*pmd)) { - hva_t hva = gfn_to_hva(kvm, addr >> PAGE_SHIFT); - kvm_flush_dcache_to_poc((void*)hva, PMD_SIZE); - } else { + if (kvm_pmd_huge(*pmd)) + kvm_flush_dcache_pmd(*pmd); + else stage2_flush_ptes(kvm, pmd, addr, next); - } } } while (pmd++, addr = next, addr != end); } @@ -246,12 +339,10 @@ static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd, do { next = kvm_pud_addr_end(addr, end); if (!pud_none(*pud)) { - if (pud_huge(*pud)) { - hva_t hva = gfn_to_hva(kvm, addr >> PAGE_SHIFT); - kvm_flush_dcache_to_poc((void*)hva, PUD_SIZE); - } else { + if (pud_huge(*pud)) + kvm_flush_dcache_pud(*pud); + else stage2_flush_pmds(kvm, pud, addr, next); - } } } while (pud++, addr = next, addr != end); } @@ -278,7 +369,7 @@ static void stage2_flush_memslot(struct kvm *kvm, * Go through the stage 2 page tables and invalidate any cache lines * backing memory already mapped to the VM. */ -void stage2_flush_vm(struct kvm *kvm) +static void stage2_flush_vm(struct kvm *kvm) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; @@ -767,10 +858,15 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache } static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pte_t *new_pte, bool iomap) + phys_addr_t addr, const pte_t *new_pte, + unsigned long flags) { pmd_t *pmd; pte_t *pte, old_pte; + bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP; + bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE; + + VM_BUG_ON(logging_active && !cache); /* Create stage-2 page table mapping - Levels 0 and 1 */ pmd = stage2_get_pmd(kvm, cache, addr); @@ -782,6 +878,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, return 0; } + /* + * While dirty page logging - dissolve huge PMD, then continue on to + * allocate page. + */ + if (logging_active) + stage2_dissolve_pmd(kvm, addr, pmd); + /* Create stage-2 page mappings - Level 2 */ if (pmd_none(*pmd)) { if (!cache) @@ -838,7 +941,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, if (ret) goto out; spin_lock(&kvm->mmu_lock); - ret = stage2_set_pte(kvm, &cache, addr, &pte, true); + ret = stage2_set_pte(kvm, &cache, addr, &pte, + KVM_S2PTE_FLAG_IS_IOMAP); spin_unlock(&kvm->mmu_lock); if (ret) goto out; @@ -905,6 +1009,171 @@ static bool kvm_is_device_pfn(unsigned long pfn) return !pfn_valid(pfn); } +/** + * stage2_wp_ptes - write protect PMD range + * @pmd: pointer to pmd entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte)) { + if (!kvm_s2pte_readonly(pte)) + kvm_set_s2pte_readonly(pte); + } + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +/** + * stage2_wp_pmds - write protect PUD range + * @pud: pointer to pud entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) +{ + pmd_t *pmd; + phys_addr_t next; + + pmd = pmd_offset(pud, addr); + + do { + next = kvm_pmd_addr_end(addr, end); + if (!pmd_none(*pmd)) { + if (kvm_pmd_huge(*pmd)) { + if (!kvm_s2pmd_readonly(pmd)) + kvm_set_s2pmd_readonly(pmd); + } else { + stage2_wp_ptes(pmd, addr, next); + } + } + } while (pmd++, addr = next, addr != end); +} + +/** + * stage2_wp_puds - write protect PGD range + * @pgd: pointer to pgd entry + * @addr: range start address + * @end: range end address + * + * Process PUD entries, for a huge PUD we cause a panic. + */ +static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) +{ + pud_t *pud; + phys_addr_t next; + + pud = pud_offset(pgd, addr); + do { + next = kvm_pud_addr_end(addr, end); + if (!pud_none(*pud)) { + /* TODO:PUD not supported, revisit later if supported */ + BUG_ON(kvm_pud_huge(*pud)); + stage2_wp_pmds(pud, addr, next); + } + } while (pud++, addr = next, addr != end); +} + +/** + * stage2_wp_range() - write protect stage2 memory region range + * @kvm: The KVM pointer + * @addr: Start address of range + * @end: End address of range + */ +static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) +{ + pgd_t *pgd; + phys_addr_t next; + + pgd = kvm->arch.pgd + pgd_index(addr); + do { + /* + * Release kvm_mmu_lock periodically if the memory region is + * large. Otherwise, we may see kernel panics with + * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR, + * CONFIG_LOCKDEP. Additionally, holding the lock too long + * will also starve other vCPUs. + */ + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) + cond_resched_lock(&kvm->mmu_lock); + + next = kvm_pgd_addr_end(addr, end); + if (pgd_present(*pgd)) + stage2_wp_puds(pgd, addr, next); + } while (pgd++, addr = next, addr != end); +} + +/** + * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot + * @kvm: The KVM pointer + * @slot: The memory slot to write protect + * + * Called to start logging dirty pages after memory region + * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns + * all present PMD and PTEs are write protected in the memory region. + * Afterwards read of dirty page log can be called. + * + * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, + * serializing operations for VM memory regions. + */ +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) +{ + struct kvm_memory_slot *memslot = id_to_memslot(kvm->memslots, slot); + phys_addr_t start = memslot->base_gfn << PAGE_SHIFT; + phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; + + spin_lock(&kvm->mmu_lock); + stage2_wp_range(kvm, start, end); + spin_unlock(&kvm->mmu_lock); + kvm_flush_remote_tlbs(kvm); +} + +/** + * kvm_mmu_write_protect_pt_masked() - write protect dirty pages + * @kvm: The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory + * slot to be write protected + * + * Walks bits set in mask write protects the associated pte's. Caller must + * acquire kvm_mmu_lock. + */ +static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + phys_addr_t base_gfn = slot->base_gfn + gfn_offset; + phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; + phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; + + stage2_wp_range(kvm, start, end); +} + +/* + * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected + * dirty pages. + * + * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to + * enable dirty logging for them. + */ +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); +} + +static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn, + unsigned long size, bool uncached) +{ + __coherent_cache_guest_page(vcpu, pfn, size, uncached); +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_memory_slot *memslot, unsigned long hva, unsigned long fault_status) @@ -919,6 +1188,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, pfn_t pfn; pgprot_t mem_type = PAGE_S2; bool fault_ipa_uncached; + bool logging_active = memslot_is_logging(memslot); + unsigned long flags = 0; write_fault = kvm_is_write_fault(vcpu); if (fault_status == FSC_PERM && !write_fault) { @@ -935,7 +1206,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; } - if (is_vm_hugetlb_page(vma)) { + if (is_vm_hugetlb_page(vma) && !logging_active) { hugetlb = true; gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; } else { @@ -976,12 +1247,30 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (is_error_pfn(pfn)) return -EFAULT; - if (kvm_is_device_pfn(pfn)) + if (kvm_is_device_pfn(pfn)) { mem_type = PAGE_S2_DEVICE; + flags |= KVM_S2PTE_FLAG_IS_IOMAP; + } else if (logging_active) { + /* + * Faults on pages in a memslot with logging enabled + * should not be mapped with huge pages (it introduces churn + * and performance degradation), so force a pte mapping. + */ + force_pte = true; + flags |= KVM_S2_FLAG_LOGGING_ACTIVE; + + /* + * Only actually map the page as writable if this was a write + * fault. + */ + if (!write_fault) + writable = false; + } spin_lock(&kvm->mmu_lock); if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; + if (!hugetlb && !force_pte) hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa); @@ -994,22 +1283,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, kvm_set_s2pmd_writable(&new_pmd); kvm_set_pfn_dirty(pfn); } - coherent_cache_guest_page(vcpu, hva & PMD_MASK, PMD_SIZE, - fault_ipa_uncached); + coherent_cache_guest_page(vcpu, pfn, PMD_SIZE, fault_ipa_uncached); ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); } else { pte_t new_pte = pfn_pte(pfn, mem_type); + if (writable) { kvm_set_s2pte_writable(&new_pte); kvm_set_pfn_dirty(pfn); + mark_page_dirty(kvm, gfn); } - coherent_cache_guest_page(vcpu, hva, PAGE_SIZE, - fault_ipa_uncached); - ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, - pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE)); + coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE, fault_ipa_uncached); + ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); } - out_unlock: spin_unlock(&kvm->mmu_lock); kvm_release_pfn_clean(pfn); @@ -1159,7 +1446,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) { pte_t *pte = (pte_t *)data; - stage2_set_pte(kvm, NULL, gpa, pte, false); + /* + * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE + * flag clear because MMU notifiers will have unmapped a huge PMD before + * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and + * therefore stage2_set_pte() never needs to clear out a huge PMD + * through this calling path. + */ + stage2_set_pte(kvm, NULL, gpa, pte, 0); } @@ -1292,6 +1586,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { + /* + * At this point memslot has been committed and there is an + * allocated dirty_bitmap[], dirty pages will be be tracked while the + * memory slot is write protected. + */ + if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) + kvm_mmu_wp_memory_region(kvm, mem->slot); } int kvm_arch_prepare_memory_region(struct kvm *kvm, @@ -1304,7 +1605,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, bool writable = !(mem->flags & KVM_MEM_READONLY); int ret = 0; - if (change != KVM_MR_CREATE && change != KVM_MR_MOVE) + if (change != KVM_MR_CREATE && change != KVM_MR_MOVE && + change != KVM_MR_FLAGS_ONLY) return 0; /* @@ -1355,6 +1657,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) + vm_start - vma->vm_start; + /* IO region dirty page logging not allowed */ + if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) + return -EINVAL; + ret = kvm_phys_addr_ioremap(kvm, gpa, pa, vm_end - vm_start, writable); @@ -1364,6 +1670,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, hva = vm_end; } while (hva < reg_end); + if (change == KVM_MR_FLAGS_ONLY) + return ret; + spin_lock(&kvm->mmu_lock); if (ret) unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size); @@ -1411,3 +1720,71 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, unmap_stage2_range(kvm, gpa, size); spin_unlock(&kvm->mmu_lock); } + +/* + * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). + * + * Main problems: + * - S/W ops are local to a CPU (not broadcast) + * - We have line migration behind our back (speculation) + * - System caches don't support S/W at all (damn!) + * + * In the face of the above, the best we can do is to try and convert + * S/W ops to VA ops. Because the guest is not allowed to infer the + * S/W to PA mapping, it can only use S/W to nuke the whole cache, + * which is a rather good thing for us. + * + * Also, it is only used when turning caches on/off ("The expected + * usage of the cache maintenance instructions that operate by set/way + * is associated with the cache maintenance instructions associated + * with the powerdown and powerup of caches, if this is required by + * the implementation."). + * + * We use the following policy: + * + * - If we trap a S/W operation, we enable VM trapping to detect + * caches being turned on/off, and do a full clean. + * + * - We flush the caches on both caches being turned on and off. + * + * - Once the caches are enabled, we stop trapping VM ops. + */ +void kvm_set_way_flush(struct kvm_vcpu *vcpu) +{ + unsigned long hcr = vcpu_get_hcr(vcpu); + + /* + * If this is the first time we do a S/W operation + * (i.e. HCR_TVM not set) flush the whole memory, and set the + * VM trapping. + * + * Otherwise, rely on the VM trapping to wait for the MMU + + * Caches to be turned off. At that point, we'll be able to + * clean the caches again. + */ + if (!(hcr & HCR_TVM)) { + trace_kvm_set_way_flush(*vcpu_pc(vcpu), + vcpu_has_cache_enabled(vcpu)); + stage2_flush_vm(vcpu->kvm); + vcpu_set_hcr(vcpu, hcr | HCR_TVM); + } +} + +void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled) +{ + bool now_enabled = vcpu_has_cache_enabled(vcpu); + + /* + * If switching the MMU+caches on, need to invalidate the caches. + * If switching it off, need to clean the caches. + * Clean + invalidate does the trick always. + */ + if (now_enabled != was_enabled) + stage2_flush_vm(vcpu->kvm); + + /* Caches are now on, stop trapping VM ops (until a S/W op) */ + if (now_enabled) + vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) & ~HCR_TVM); + + trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled); +} diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c index 58cb3248d277..02fa8eff6ae1 100644 --- a/arch/arm/kvm/psci.c +++ b/arch/arm/kvm/psci.c @@ -22,6 +22,7 @@ #include <asm/cputype.h> #include <asm/kvm_emulate.h> #include <asm/kvm_psci.h> +#include <asm/kvm_host.h> /* * This is an implementation of the Power State Coordination Interface @@ -66,25 +67,17 @@ static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) { struct kvm *kvm = source_vcpu->kvm; - struct kvm_vcpu *vcpu = NULL, *tmp; + struct kvm_vcpu *vcpu = NULL; wait_queue_head_t *wq; unsigned long cpu_id; unsigned long context_id; - unsigned long mpidr; phys_addr_t target_pc; - int i; - cpu_id = *vcpu_reg(source_vcpu, 1); + cpu_id = *vcpu_reg(source_vcpu, 1) & MPIDR_HWID_BITMASK; if (vcpu_mode_is_32bit(source_vcpu)) cpu_id &= ~((u32) 0); - kvm_for_each_vcpu(i, tmp, kvm) { - mpidr = kvm_vcpu_get_mpidr(tmp); - if ((mpidr & MPIDR_HWID_BITMASK) == (cpu_id & MPIDR_HWID_BITMASK)) { - vcpu = tmp; - break; - } - } + vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id); /* * Make sure the caller requested a valid CPU and that the CPU is @@ -155,7 +148,7 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) * then ON else OFF */ kvm_for_each_vcpu(i, tmp, kvm) { - mpidr = kvm_vcpu_get_mpidr(tmp); + mpidr = kvm_vcpu_get_mpidr_aff(tmp); if (((mpidr & target_affinity_mask) == target_affinity) && !tmp->arch.pause) { return PSCI_0_2_AFFINITY_LEVEL_ON; diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h index b1d640f78623..881874b1a036 100644 --- a/arch/arm/kvm/trace.h +++ b/arch/arm/kvm/trace.h @@ -140,19 +140,22 @@ TRACE_EVENT(kvm_emulate_cp15_imp, __entry->CRm, __entry->Op2) ); -TRACE_EVENT(kvm_wfi, - TP_PROTO(unsigned long vcpu_pc), - TP_ARGS(vcpu_pc), +TRACE_EVENT(kvm_wfx, + TP_PROTO(unsigned long vcpu_pc, bool is_wfe), + TP_ARGS(vcpu_pc, is_wfe), TP_STRUCT__entry( __field( unsigned long, vcpu_pc ) + __field( bool, is_wfe ) ), TP_fast_assign( __entry->vcpu_pc = vcpu_pc; + __entry->is_wfe = is_wfe; ), - TP_printk("guest executed wfi at: 0x%08lx", __entry->vcpu_pc) + TP_printk("guest executed wf%c at: 0x%08lx", + __entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc) ); TRACE_EVENT(kvm_unmap_hva, @@ -223,6 +226,45 @@ TRACE_EVENT(kvm_hvc, __entry->vcpu_pc, __entry->r0, __entry->imm) ); +TRACE_EVENT(kvm_set_way_flush, + TP_PROTO(unsigned long vcpu_pc, bool cache), + TP_ARGS(vcpu_pc, cache), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + __field( bool, cache ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->cache = cache; + ), + + TP_printk("S/W flush at 0x%016lx (cache %s)", + __entry->vcpu_pc, __entry->cache ? "on" : "off") +); + +TRACE_EVENT(kvm_toggle_cache, + TP_PROTO(unsigned long vcpu_pc, bool was, bool now), + TP_ARGS(vcpu_pc, was, now), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + __field( bool, was ) + __field( bool, now ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->was = was; + __entry->now = now; + ), + + TP_printk("VM op at 0x%016lx (cache was %s, now %s)", + __entry->vcpu_pc, __entry->was ? "on" : "off", + __entry->now ? "on" : "off") +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH |