diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-13 20:55:09 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-13 20:55:09 +0300 |
commit | b9085bcbf5f43adf60533f9b635b2e7faeed0fe9 (patch) | |
tree | e397abf5682a45c096e75b3d0fa99c8e228425fc /arch/arm | |
parent | c7d7b98671552abade78834c522b7308bda73c0d (diff) | |
parent | 6557bada461afeaa920a189fae2cff7c8fdce39f (diff) | |
download | linux-b9085bcbf5f43adf60533f9b635b2e7faeed0fe9.tar.xz |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM update from Paolo Bonzini:
"Fairly small update, but there are some interesting new features.
Common:
Optional support for adding a small amount of polling on each HLT
instruction executed in the guest (or equivalent for other
architectures). This can improve latency up to 50% on some
scenarios (e.g. O_DSYNC writes or TCP_RR netperf tests). This
also has to be enabled manually for now, but the plan is to
auto-tune this in the future.
ARM/ARM64:
The highlights are support for GICv3 emulation and dirty page
tracking
s390:
Several optimizations and bugfixes. Also a first: a feature
exposed by KVM (UUID and long guest name in /proc/sysinfo) before
it is available in IBM's hypervisor! :)
MIPS:
Bugfixes.
x86:
Support for PML (page modification logging, a new feature in
Broadwell Xeons that speeds up dirty page tracking), nested
virtualization improvements (nested APICv---a nice optimization),
usual round of emulation fixes.
There is also a new option to reduce latency of the TSC deadline
timer in the guest; this needs to be tuned manually.
Some commits are common between this pull and Catalin's; I see you
have already included his tree.
Powerpc:
Nothing yet.
The KVM/PPC changes will come in through the PPC maintainers,
because I haven't received them yet and I might end up being
offline for some part of next week"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (130 commits)
KVM: ia64: drop kvm.h from installed user headers
KVM: x86: fix build with !CONFIG_SMP
KVM: x86: emulate: correct page fault error code for NoWrite instructions
KVM: Disable compat ioctl for s390
KVM: s390: add cpu model support
KVM: s390: use facilities and cpu_id per KVM
KVM: s390/CPACF: Choose crypto control block format
s390/kernel: Update /proc/sysinfo file with Extended Name and UUID
KVM: s390: reenable LPP facility
KVM: s390: floating irqs: fix user triggerable endless loop
kvm: add halt_poll_ns module parameter
kvm: remove KVM_MMIO_SIZE
KVM: MIPS: Don't leak FPU/DSP to guest
KVM: MIPS: Disable HTW while in guest
KVM: nVMX: Enable nested posted interrupt processing
KVM: nVMX: Enable nested virtual interrupt delivery
KVM: nVMX: Enable nested apic register virtualization
KVM: nVMX: Make nested control MSRs per-cpu
KVM: nVMX: Enable nested virtualize x2apic mode
KVM: nVMX: Prepare for using hardware MSR bitmap
...
Diffstat (limited to 'arch/arm')
-rw-r--r-- | arch/arm/include/asm/kvm_asm.h | 1 | ||||
-rw-r--r-- | arch/arm/include/asm/kvm_emulate.h | 5 | ||||
-rw-r--r-- | arch/arm/include/asm/kvm_host.h | 6 | ||||
-rw-r--r-- | arch/arm/include/asm/kvm_mmio.h | 1 | ||||
-rw-r--r-- | arch/arm/include/asm/kvm_mmu.h | 21 | ||||
-rw-r--r-- | arch/arm/include/asm/pgtable-3level.h | 1 | ||||
-rw-r--r-- | arch/arm/include/uapi/asm/kvm.h | 2 | ||||
-rw-r--r-- | arch/arm/kvm/Kconfig | 2 | ||||
-rw-r--r-- | arch/arm/kvm/Makefile | 1 | ||||
-rw-r--r-- | arch/arm/kvm/arm.c | 58 | ||||
-rw-r--r-- | arch/arm/kvm/handle_exit.c | 8 | ||||
-rw-r--r-- | arch/arm/kvm/interrupts.S | 11 | ||||
-rw-r--r-- | arch/arm/kvm/mmu.c | 271 | ||||
-rw-r--r-- | arch/arm/kvm/psci.c | 17 | ||||
-rw-r--r-- | arch/arm/kvm/trace.h | 11 |
15 files changed, 382 insertions, 34 deletions
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 3a67bec72d0c..25410b2d8bc1 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -96,6 +96,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index 7b0152321b20..a9c80a2ea1a7 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -23,6 +23,7 @@ #include <asm/kvm_asm.h> #include <asm/kvm_mmio.h> #include <asm/kvm_arm.h> +#include <asm/cputype.h> unsigned long *vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num); unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu); @@ -177,9 +178,9 @@ static inline u32 kvm_vcpu_hvc_get_imm(struct kvm_vcpu *vcpu) return kvm_vcpu_get_hsr(vcpu) & HSR_HVC_IMM_MASK; } -static inline unsigned long kvm_vcpu_get_mpidr(struct kvm_vcpu *vcpu) +static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu) { - return vcpu->arch.cp15[c0_MPIDR]; + return vcpu->arch.cp15[c0_MPIDR] & MPIDR_HWID_BITMASK; } static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 04b4ea0b550a..41008cd7c53f 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -68,6 +68,7 @@ struct kvm_arch { /* Interrupt controller */ struct vgic_dist vgic; + int max_vcpus; }; #define KVM_NR_MEM_OBJS 40 @@ -144,6 +145,7 @@ struct kvm_vm_stat { }; struct kvm_vcpu_stat { + u32 halt_successful_poll; u32 halt_wakeup; }; @@ -231,6 +233,10 @@ static inline void vgic_arch_setup(const struct vgic_params *vgic) int kvm_perf_init(void); int kvm_perf_teardown(void); +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); + +struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); + static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} diff --git a/arch/arm/include/asm/kvm_mmio.h b/arch/arm/include/asm/kvm_mmio.h index adcc0d7d3175..3f83db2f6cf0 100644 --- a/arch/arm/include/asm/kvm_mmio.h +++ b/arch/arm/include/asm/kvm_mmio.h @@ -37,6 +37,7 @@ struct kvm_exit_mmio { u8 data[8]; u32 len; bool is_write; + void *private; }; static inline void kvm_prepare_mmio(struct kvm_run *run, diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 1bca8f8af442..37ca2a4c6f09 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -115,6 +115,27 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= L_PMD_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ + pte_val(*pte) = (pte_val(*pte) & ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY; +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ + return (pte_val(*pte) & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) +{ + pmd_val(*pmd) = (pmd_val(*pmd) & ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY; +} + +static inline bool kvm_s2pmd_readonly(pmd_t *pmd) +{ + return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY; +} + + /* Open coded p*d_addr_end that can deal with 64bit addresses */ #define kvm_pgd_addr_end(addr, end) \ ({ u64 __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \ diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 423a5ac09d3a..a745a2a53853 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -129,6 +129,7 @@ #define L_PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[1] */ #define L_PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */ +#define L_PMD_S2_RDONLY (_AT(pmdval_t, 1) << 6) /* HAP[1] */ #define L_PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ /* diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index 09ee408c1a67..0db25bc32864 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -175,6 +175,8 @@ struct kvm_arch_memory_slot { #define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0 #define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT) #define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3 +#define KVM_DEV_ARM_VGIC_GRP_CTRL 4 +#define KVM_DEV_ARM_VGIC_CTRL_INIT 0 /* KVM_IRQ_LINE irq field index values */ #define KVM_ARM_IRQ_TYPE_SHIFT 24 diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 3afee5f40f4f..338ace78ed18 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -21,8 +21,10 @@ config KVM select PREEMPT_NOTIFIERS select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT + select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_MMIO select KVM_ARM_HOST + select KVM_GENERIC_DIRTYLOG_READ_PROTECT select SRCU depends on ARM_VIRT_EXT && ARM_LPAE ---help--- diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index f7057ed045b6..443b8bea43e9 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -22,4 +22,5 @@ obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic.o obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2.o +obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2-emul.o obj-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 0b0d58a905c4..07e7eb1d7ab6 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -132,6 +132,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) /* Mark the initial VMID generation invalid */ kvm->arch.vmid_gen = 0; + /* The maximum number of VCPUs is limited by the host's GIC model */ + kvm->arch.max_vcpus = kvm_vgic_get_max_vcpus(); + return ret; out_free_stage2_pgd: kvm_free_stage2_pgd(kvm); @@ -218,6 +221,11 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) goto out; } + if (id >= kvm->arch.max_vcpus) { + err = -EINVAL; + goto out; + } + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu) { err = -ENOMEM; @@ -241,9 +249,8 @@ out: return ERR_PTR(err); } -int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - return 0; } void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) @@ -777,9 +784,39 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } +/** + * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot + * @kvm: kvm instance + * @log: slot id and address to which we copy the log + * + * Steps 1-4 below provide general overview of dirty page logging. See + * kvm_get_dirty_log_protect() function description for additional details. + * + * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we + * always flush the TLB (step 4) even if previous step failed and the dirty + * bitmap may be corrupt. Regardless of previous outcome the KVM logging API + * does not preclude user space subsequent dirty log read. Flushing TLB ensures + * writes will be marked dirty for next log read. + * + * 1. Take a snapshot of the bit and clear it if needed. + * 2. Write protect the corresponding page. + * 3. Copy the snapshot to the userspace. + * 4. Flush TLB's if needed. + */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - return -EINVAL; + bool is_dirty = false; + int r; + + mutex_lock(&kvm->slots_lock); + + r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); + + if (is_dirty) + kvm_flush_remote_tlbs(kvm); + + mutex_unlock(&kvm->slots_lock); + return r; } static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, @@ -811,7 +848,7 @@ long kvm_arch_vm_ioctl(struct file *filp, switch (ioctl) { case KVM_CREATE_IRQCHIP: { if (vgic_present) - return kvm_vgic_create(kvm); + return kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); else return -ENXIO; } @@ -1035,6 +1072,19 @@ static void check_kvm_target_cpu(void *ret) *(int *)ret = kvm_target_cpu(); } +struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) +{ + struct kvm_vcpu *vcpu; + int i; + + mpidr &= MPIDR_HWID_BITMASK; + kvm_for_each_vcpu(i, vcpu, kvm) { + if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu)) + return vcpu; + } + return NULL; +} + /** * Initialize Hyp-mode and memory mappings on all CPUs. */ diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c index a96a8043277c..95f12b2ccdcb 100644 --- a/arch/arm/kvm/handle_exit.c +++ b/arch/arm/kvm/handle_exit.c @@ -87,11 +87,13 @@ static int handle_dabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run) */ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) { - trace_kvm_wfi(*vcpu_pc(vcpu)); - if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE) + if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE) { + trace_kvm_wfx(*vcpu_pc(vcpu), true); kvm_vcpu_on_spin(vcpu); - else + } else { + trace_kvm_wfx(*vcpu_pc(vcpu), false); kvm_vcpu_block(vcpu); + } kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 01dcb0e752d9..79caf79b304a 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -66,6 +66,17 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +/** + * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs + * + * Reuses __kvm_tlb_flush_vmid_ipa() for ARMv7, without passing address + * parameter + */ + +ENTRY(__kvm_tlb_flush_vmid) + b __kvm_tlb_flush_vmid_ipa +ENDPROC(__kvm_tlb_flush_vmid) + /******************************************************************** * Flush TLBs and instruction caches of all CPUs inside the inner-shareable * domain, for all VMIDs diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 136662547ca6..3e6859bc3e11 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -45,6 +45,26 @@ static phys_addr_t hyp_idmap_vector; #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) #define kvm_pmd_huge(_x) (pmd_huge(_x) || pmd_trans_huge(_x)) +#define kvm_pud_huge(_x) pud_huge(_x) + +#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0) +#define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1) + +static bool memslot_is_logging(struct kvm_memory_slot *memslot) +{ + return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); +} + +/** + * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8 + * @kvm: pointer to kvm structure. + * + * Interface to HYP function to flush all VM TLB entries + */ +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) { @@ -78,6 +98,25 @@ static void kvm_flush_dcache_pud(pud_t pud) __kvm_flush_dcache_pud(pud); } +/** + * stage2_dissolve_pmd() - clear and flush huge PMD entry + * @kvm: pointer to kvm structure. + * @addr: IPA + * @pmd: pmd pointer for IPA + * + * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all + * pages in the range dirty. + */ +static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) +{ + if (!kvm_pmd_huge(*pmd)) + return; + + pmd_clear(pmd); + kvm_tlb_flush_vmid_ipa(kvm, addr); + put_page(virt_to_page(pmd)); +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { @@ -819,10 +858,15 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache } static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pte_t *new_pte, bool iomap) + phys_addr_t addr, const pte_t *new_pte, + unsigned long flags) { pmd_t *pmd; pte_t *pte, old_pte; + bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP; + bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE; + + VM_BUG_ON(logging_active && !cache); /* Create stage-2 page table mapping - Levels 0 and 1 */ pmd = stage2_get_pmd(kvm, cache, addr); @@ -834,6 +878,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, return 0; } + /* + * While dirty page logging - dissolve huge PMD, then continue on to + * allocate page. + */ + if (logging_active) + stage2_dissolve_pmd(kvm, addr, pmd); + /* Create stage-2 page mappings - Level 2 */ if (pmd_none(*pmd)) { if (!cache) @@ -890,7 +941,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, if (ret) goto out; spin_lock(&kvm->mmu_lock); - ret = stage2_set_pte(kvm, &cache, addr, &pte, true); + ret = stage2_set_pte(kvm, &cache, addr, &pte, + KVM_S2PTE_FLAG_IS_IOMAP); spin_unlock(&kvm->mmu_lock); if (ret) goto out; @@ -957,6 +1009,165 @@ static bool kvm_is_device_pfn(unsigned long pfn) return !pfn_valid(pfn); } +/** + * stage2_wp_ptes - write protect PMD range + * @pmd: pointer to pmd entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte)) { + if (!kvm_s2pte_readonly(pte)) + kvm_set_s2pte_readonly(pte); + } + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +/** + * stage2_wp_pmds - write protect PUD range + * @pud: pointer to pud entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) +{ + pmd_t *pmd; + phys_addr_t next; + + pmd = pmd_offset(pud, addr); + + do { + next = kvm_pmd_addr_end(addr, end); + if (!pmd_none(*pmd)) { + if (kvm_pmd_huge(*pmd)) { + if (!kvm_s2pmd_readonly(pmd)) + kvm_set_s2pmd_readonly(pmd); + } else { + stage2_wp_ptes(pmd, addr, next); + } + } + } while (pmd++, addr = next, addr != end); +} + +/** + * stage2_wp_puds - write protect PGD range + * @pgd: pointer to pgd entry + * @addr: range start address + * @end: range end address + * + * Process PUD entries, for a huge PUD we cause a panic. + */ +static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) +{ + pud_t *pud; + phys_addr_t next; + + pud = pud_offset(pgd, addr); + do { + next = kvm_pud_addr_end(addr, end); + if (!pud_none(*pud)) { + /* TODO:PUD not supported, revisit later if supported */ + BUG_ON(kvm_pud_huge(*pud)); + stage2_wp_pmds(pud, addr, next); + } + } while (pud++, addr = next, addr != end); +} + +/** + * stage2_wp_range() - write protect stage2 memory region range + * @kvm: The KVM pointer + * @addr: Start address of range + * @end: End address of range + */ +static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) +{ + pgd_t *pgd; + phys_addr_t next; + + pgd = kvm->arch.pgd + pgd_index(addr); + do { + /* + * Release kvm_mmu_lock periodically if the memory region is + * large. Otherwise, we may see kernel panics with + * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR, + * CONFIG_LOCKDEP. Additionally, holding the lock too long + * will also starve other vCPUs. + */ + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) + cond_resched_lock(&kvm->mmu_lock); + + next = kvm_pgd_addr_end(addr, end); + if (pgd_present(*pgd)) + stage2_wp_puds(pgd, addr, next); + } while (pgd++, addr = next, addr != end); +} + +/** + * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot + * @kvm: The KVM pointer + * @slot: The memory slot to write protect + * + * Called to start logging dirty pages after memory region + * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns + * all present PMD and PTEs are write protected in the memory region. + * Afterwards read of dirty page log can be called. + * + * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, + * serializing operations for VM memory regions. + */ +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) +{ + struct kvm_memory_slot *memslot = id_to_memslot(kvm->memslots, slot); + phys_addr_t start = memslot->base_gfn << PAGE_SHIFT; + phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; + + spin_lock(&kvm->mmu_lock); + stage2_wp_range(kvm, start, end); + spin_unlock(&kvm->mmu_lock); + kvm_flush_remote_tlbs(kvm); +} + +/** + * kvm_mmu_write_protect_pt_masked() - write protect dirty pages + * @kvm: The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory + * slot to be write protected + * + * Walks bits set in mask write protects the associated pte's. Caller must + * acquire kvm_mmu_lock. + */ +static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + phys_addr_t base_gfn = slot->base_gfn + gfn_offset; + phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; + phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; + + stage2_wp_range(kvm, start, end); +} + +/* + * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected + * dirty pages. + * + * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to + * enable dirty logging for them. + */ +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); +} + static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn, unsigned long size, bool uncached) { @@ -977,6 +1188,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, pfn_t pfn; pgprot_t mem_type = PAGE_S2; bool fault_ipa_uncached; + bool logging_active = memslot_is_logging(memslot); + unsigned long flags = 0; write_fault = kvm_is_write_fault(vcpu); if (fault_status == FSC_PERM && !write_fault) { @@ -993,7 +1206,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; } - if (is_vm_hugetlb_page(vma)) { + if (is_vm_hugetlb_page(vma) && !logging_active) { hugetlb = true; gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; } else { @@ -1034,12 +1247,30 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (is_error_pfn(pfn)) return -EFAULT; - if (kvm_is_device_pfn(pfn)) + if (kvm_is_device_pfn(pfn)) { mem_type = PAGE_S2_DEVICE; + flags |= KVM_S2PTE_FLAG_IS_IOMAP; + } else if (logging_active) { + /* + * Faults on pages in a memslot with logging enabled + * should not be mapped with huge pages (it introduces churn + * and performance degradation), so force a pte mapping. + */ + force_pte = true; + flags |= KVM_S2_FLAG_LOGGING_ACTIVE; + + /* + * Only actually map the page as writable if this was a write + * fault. + */ + if (!write_fault) + writable = false; + } spin_lock(&kvm->mmu_lock); if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; + if (!hugetlb && !force_pte) hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa); @@ -1056,16 +1287,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); } else { pte_t new_pte = pfn_pte(pfn, mem_type); + if (writable) { kvm_set_s2pte_writable(&new_pte); kvm_set_pfn_dirty(pfn); + mark_page_dirty(kvm, gfn); } coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE, fault_ipa_uncached); - ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, - pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE)); + ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); } - out_unlock: spin_unlock(&kvm->mmu_lock); kvm_release_pfn_clean(pfn); @@ -1215,7 +1446,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) { pte_t *pte = (pte_t *)data; - stage2_set_pte(kvm, NULL, gpa, pte, false); + /* + * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE + * flag clear because MMU notifiers will have unmapped a huge PMD before + * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and + * therefore stage2_set_pte() never needs to clear out a huge PMD + * through this calling path. + */ + stage2_set_pte(kvm, NULL, gpa, pte, 0); } @@ -1348,6 +1586,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { + /* + * At this point memslot has been committed and there is an + * allocated dirty_bitmap[], dirty pages will be be tracked while the + * memory slot is write protected. + */ + if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) + kvm_mmu_wp_memory_region(kvm, mem->slot); } int kvm_arch_prepare_memory_region(struct kvm *kvm, @@ -1360,7 +1605,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, bool writable = !(mem->flags & KVM_MEM_READONLY); int ret = 0; - if (change != KVM_MR_CREATE && change != KVM_MR_MOVE) + if (change != KVM_MR_CREATE && change != KVM_MR_MOVE && + change != KVM_MR_FLAGS_ONLY) return 0; /* @@ -1411,6 +1657,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) + vm_start - vma->vm_start; + /* IO region dirty page logging not allowed */ + if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) + return -EINVAL; + ret = kvm_phys_addr_ioremap(kvm, gpa, pa, vm_end - vm_start, writable); @@ -1420,6 +1670,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, hva = vm_end; } while (hva < reg_end); + if (change == KVM_MR_FLAGS_ONLY) + return ret; + spin_lock(&kvm->mmu_lock); if (ret) unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size); diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c index 58cb3248d277..02fa8eff6ae1 100644 --- a/arch/arm/kvm/psci.c +++ b/arch/arm/kvm/psci.c @@ -22,6 +22,7 @@ #include <asm/cputype.h> #include <asm/kvm_emulate.h> #include <asm/kvm_psci.h> +#include <asm/kvm_host.h> /* * This is an implementation of the Power State Coordination Interface @@ -66,25 +67,17 @@ static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) { struct kvm *kvm = source_vcpu->kvm; - struct kvm_vcpu *vcpu = NULL, *tmp; + struct kvm_vcpu *vcpu = NULL; wait_queue_head_t *wq; unsigned long cpu_id; unsigned long context_id; - unsigned long mpidr; phys_addr_t target_pc; - int i; - cpu_id = *vcpu_reg(source_vcpu, 1); + cpu_id = *vcpu_reg(source_vcpu, 1) & MPIDR_HWID_BITMASK; if (vcpu_mode_is_32bit(source_vcpu)) cpu_id &= ~((u32) 0); - kvm_for_each_vcpu(i, tmp, kvm) { - mpidr = kvm_vcpu_get_mpidr(tmp); - if ((mpidr & MPIDR_HWID_BITMASK) == (cpu_id & MPIDR_HWID_BITMASK)) { - vcpu = tmp; - break; - } - } + vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id); /* * Make sure the caller requested a valid CPU and that the CPU is @@ -155,7 +148,7 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) * then ON else OFF */ kvm_for_each_vcpu(i, tmp, kvm) { - mpidr = kvm_vcpu_get_mpidr(tmp); + mpidr = kvm_vcpu_get_mpidr_aff(tmp); if (((mpidr & target_affinity_mask) == target_affinity) && !tmp->arch.pause) { return PSCI_0_2_AFFINITY_LEVEL_ON; diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h index b6a6e7102201..881874b1a036 100644 --- a/arch/arm/kvm/trace.h +++ b/arch/arm/kvm/trace.h @@ -140,19 +140,22 @@ TRACE_EVENT(kvm_emulate_cp15_imp, __entry->CRm, __entry->Op2) ); -TRACE_EVENT(kvm_wfi, - TP_PROTO(unsigned long vcpu_pc), - TP_ARGS(vcpu_pc), +TRACE_EVENT(kvm_wfx, + TP_PROTO(unsigned long vcpu_pc, bool is_wfe), + TP_ARGS(vcpu_pc, is_wfe), TP_STRUCT__entry( __field( unsigned long, vcpu_pc ) + __field( bool, is_wfe ) ), TP_fast_assign( __entry->vcpu_pc = vcpu_pc; + __entry->is_wfe = is_wfe; ), - TP_printk("guest executed wfi at: 0x%08lx", __entry->vcpu_pc) + TP_printk("guest executed wf%c at: 0x%08lx", + __entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc) ); TRACE_EVENT(kvm_unmap_hva, |