diff options
Diffstat (limited to 'arch/arm/kvm')
-rw-r--r-- | arch/arm/kvm/Kconfig | 33 | ||||
-rw-r--r-- | arch/arm/kvm/Makefile | 10 | ||||
-rw-r--r-- | arch/arm/kvm/arm.c | 269 | ||||
-rw-r--r-- | arch/arm/kvm/coproc.c | 160 | ||||
-rw-r--r-- | arch/arm/kvm/coproc.h | 6 | ||||
-rw-r--r-- | arch/arm/kvm/coproc_a15.c | 2 | ||||
-rw-r--r-- | arch/arm/kvm/coproc_a7.c | 2 | ||||
-rw-r--r-- | arch/arm/kvm/guest.c | 64 | ||||
-rw-r--r-- | arch/arm/kvm/handle_exit.c | 10 | ||||
-rw-r--r-- | arch/arm/kvm/init.S | 11 | ||||
-rw-r--r-- | arch/arm/kvm/interrupts.S | 20 | ||||
-rw-r--r-- | arch/arm/kvm/interrupts_head.S | 63 | ||||
-rw-r--r-- | arch/arm/kvm/mmio.c | 67 | ||||
-rw-r--r-- | arch/arm/kvm/mmu.c | 1206 | ||||
-rw-r--r-- | arch/arm/kvm/psci.c | 35 | ||||
-rw-r--r-- | arch/arm/kvm/trace.h | 108 |
16 files changed, 1534 insertions, 532 deletions
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 4be5bb150bdd..f1f79d104309 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -18,15 +18,21 @@ if VIRTUALIZATION config KVM bool "Kernel-based Virtual Machine (KVM) support" + depends on MMU && OF select PREEMPT_NOTIFIERS select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT + select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_MMIO select KVM_ARM_HOST - depends on ARM_VIRT_EXT && ARM_LPAE && !CPU_BIG_ENDIAN + select KVM_GENERIC_DIRTYLOG_READ_PROTECT + select SRCU + select MMU_NOTIFIER + select HAVE_KVM_EVENTFD + select HAVE_KVM_IRQFD + depends on ARM_VIRT_EXT && ARM_LPAE && ARM_ARCH_TIMER ---help--- - Support hosting virtualized guest machines. You will also - need to select one or more of the processor modules below. + Support hosting virtualized guest machines. This module provides access to the hardware capabilities through a character device node named /dev/kvm. @@ -34,10 +40,7 @@ config KVM If unsure, say N. config KVM_ARM_HOST - bool "KVM host support for ARM cpus." - depends on KVM - depends on MMU - select MMU_NOTIFIER + bool ---help--- Provides host support for ARM processors. @@ -52,20 +55,4 @@ config KVM_ARM_MAX_VCPUS large, so only choose a reasonable number that you expect to actually use. -config KVM_ARM_VGIC - bool "KVM support for Virtual GIC" - depends on KVM_ARM_HOST && OF - select HAVE_KVM_IRQCHIP - default y - ---help--- - Adds support for a hardware assisted, in-kernel GIC emulation. - -config KVM_ARM_TIMER - bool "KVM support for Architected Timers" - depends on KVM_ARM_VGIC && ARM_ARCH_TIMER - select HAVE_KVM_IRQCHIP - default y - ---help--- - Adds support for the Architected Timers in virtual machines - endif # VIRTUALIZATION diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index 789bca9e64a7..139e46c08b6e 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -7,7 +7,7 @@ ifeq ($(plus_virt),+virt) plus_virt_def := -DREQUIRES_VIRT=1 endif -ccflags-y += -Ivirt/kvm -Iarch/arm/kvm +ccflags-y += -Iarch/arm/kvm CFLAGS_arm.o := -I. $(plus_virt_def) CFLAGS_mmu.o := -I. @@ -15,10 +15,12 @@ AFLAGS_init.o := -Wa,-march=armv7-a$(plus_virt) AFLAGS_interrupts.o := -Wa,-march=armv7-a$(plus_virt) KVM := ../../../virt/kvm -kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o +kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o obj-y += kvm-arm.o init.o interrupts.o obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o -obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic.o -obj-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o +obj-y += $(KVM)/arm/vgic.o +obj-y += $(KVM)/arm/vgic-v2.o +obj-y += $(KVM)/arm/vgic-v2-emul.o +obj-y += $(KVM)/arm/arch_timer.o diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3c82b37c0f9e..6f536451ab78 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -61,8 +61,6 @@ static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); static u8 kvm_next_vmid; static DEFINE_SPINLOCK(kvm_vmid_lock); -static bool vgic_present; - static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu) { BUG_ON(preemptible()); @@ -82,12 +80,12 @@ struct kvm_vcpu *kvm_arm_get_running_vcpu(void) /** * kvm_arm_get_running_vcpus - get the per-CPU array of currently running vcpus. */ -struct kvm_vcpu __percpu **kvm_get_running_vcpus(void) +struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void) { return &kvm_arm_running_vcpu; } -int kvm_arch_hardware_enable(void *garbage) +int kvm_arch_hardware_enable(void) { return 0; } @@ -97,27 +95,16 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; } -void kvm_arch_hardware_disable(void *garbage) -{ -} - int kvm_arch_hardware_setup(void) { return 0; } -void kvm_arch_hardware_unsetup(void) -{ -} - void kvm_arch_check_processor_compat(void *rtn) { *(int *)rtn = 0; } -void kvm_arch_sync_events(struct kvm *kvm) -{ -} /** * kvm_arch_init_vm - initializes a VM data structure @@ -143,6 +130,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) /* Mark the initial VMID generation invalid */ kvm->arch.vmid_gen = 0; + /* The maximum number of VCPUs is limited by the host's GIC model */ + kvm->arch.max_vcpus = kvm_vgic_get_max_vcpus(); + return ret; out_free_stage2_pgd: kvm_free_stage2_pgd(kvm); @@ -155,16 +145,6 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) -{ -} - -int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, - unsigned long npages) -{ - return 0; -} /** * kvm_arch_destroy_vm - destroy the VM data structure @@ -182,15 +162,17 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm->vcpus[i] = NULL; } } + + kvm_vgic_destroy(kvm); } -int kvm_dev_ioctl_check_extension(long ext) +int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r; switch (ext) { case KVM_CAP_IRQCHIP: - r = vgic_present; - break; + case KVM_CAP_IRQFD: + case KVM_CAP_IOEVENTFD: case KVM_CAP_DEVICE_CTRL: case KVM_CAP_USER_MEMORY: case KVM_CAP_SYNC_MMU: @@ -198,6 +180,8 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_ONE_REG: case KVM_CAP_ARM_PSCI: case KVM_CAP_ARM_PSCI_0_2: + case KVM_CAP_READONLY_MEM: + case KVM_CAP_MP_STATE: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -225,39 +209,22 @@ long kvm_arch_dev_ioctl(struct file *filp, return -EINVAL; } -void kvm_arch_memslots_updated(struct kvm *kvm) -{ -} - -int kvm_arch_prepare_memory_region(struct kvm *kvm, - struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem, - enum kvm_mr_change change) -{ - return 0; -} - -void kvm_arch_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old, - enum kvm_mr_change change) -{ -} - -void kvm_arch_flush_shadow_all(struct kvm *kvm) -{ -} - -void kvm_arch_flush_shadow_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ -} struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { int err; struct kvm_vcpu *vcpu; + if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) { + err = -EBUSY; + goto out; + } + + if (id >= kvm->arch.max_vcpus) { + err = -EINVAL; + goto out; + } + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu) { err = -ENOMEM; @@ -281,15 +248,15 @@ out: return ERR_PTR(err); } -int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - return 0; } void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { kvm_mmu_free_memory_caches(vcpu); kvm_timer_vcpu_terminate(vcpu); + kvm_vgic_vcpu_destroy(vcpu); kmem_cache_free(kvm_vcpu_cache, vcpu); } @@ -300,20 +267,14 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { - return 0; + return kvm_timer_should_fire(vcpu); } int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { - int ret; - /* Force users to call KVM_ARM_VCPU_INIT */ vcpu->arch.target = -1; - - /* Set up VGIC */ - ret = kvm_vgic_vcpu_init(vcpu); - if (ret) - return ret; + bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); /* Set up the timer */ kvm_timer_vcpu_init(vcpu); @@ -321,24 +282,11 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) return 0; } -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) -{ -} - void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { vcpu->cpu = cpu; vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state); - /* - * Check whether this vcpu requires the cache to be flushed on - * this physical CPU. This is a consequence of doing dcache - * operations by set/way on this vcpu. We do it here to be in - * a non-preemptible section. - */ - if (cpumask_test_and_clear_cpu(cpu, &vcpu->arch.require_dcache_flush)) - flush_cache_all(); /* We'd really want v7_flush_dcache_all() */ - kvm_arm_set_running_vcpu(vcpu); } @@ -364,13 +312,29 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - return -EINVAL; + if (vcpu->arch.pause) + mp_state->mp_state = KVM_MP_STATE_STOPPED; + else + mp_state->mp_state = KVM_MP_STATE_RUNNABLE; + + return 0; } int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - return -EINVAL; + switch (mp_state->mp_state) { + case KVM_MP_STATE_RUNNABLE: + vcpu->arch.pause = false; + break; + case KVM_MP_STATE_STOPPED: + vcpu->arch.pause = true; + break; + default: + return -EINVAL; + } + + return 0; } /** @@ -464,16 +428,17 @@ static void update_vttbr(struct kvm *kvm) kvm_next_vmid++; /* update vttbr to be used with the new vmid */ - pgd_phys = virt_to_phys(kvm->arch.pgd); + pgd_phys = virt_to_phys(kvm_get_hwpgd(kvm)); + BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK); vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK; - kvm->arch.vttbr = pgd_phys & VTTBR_BADDR_MASK; - kvm->arch.vttbr |= vmid; + kvm->arch.vttbr = pgd_phys | vmid; spin_unlock(&kvm_vmid_lock); } static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) { + struct kvm *kvm = vcpu->kvm; int ret; if (likely(vcpu->arch.has_run_once)) @@ -482,18 +447,31 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) vcpu->arch.has_run_once = true; /* - * Initialize the VGIC before running a vcpu the first time on - * this VM. + * Map the VGIC hardware resources before running a vcpu the first + * time on this VM. */ - if (unlikely(!vgic_initialized(vcpu->kvm))) { - ret = kvm_vgic_init(vcpu->kvm); + if (unlikely(!vgic_ready(kvm))) { + ret = kvm_vgic_map_resources(kvm); if (ret) return ret; } + /* + * Enable the arch timers only if we have an in-kernel VGIC + * and it has been properly initialized, since we cannot handle + * interrupts from the virtual timer with a userspace gic. + */ + if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) + kvm_timer_enable(kvm); + return 0; } +bool kvm_arch_intc_initialized(struct kvm *kvm) +{ + return vgic_initialized(kvm); +} + static void vcpu_pause(struct kvm_vcpu *vcpu) { wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); @@ -581,9 +559,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) ret = kvm_call_hyp(__kvm_vcpu_run, vcpu); vcpu->mode = OUTSIDE_GUEST_MODE; - vcpu->arch.last_pcpu = smp_processor_id(); kvm_guest_exit(); - trace_kvm_exit(*vcpu_pc(vcpu)); + trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); /* * We may have taken a host interrupt in HYP mode (ie * while executing the guest). This interrupt is still @@ -704,6 +681,48 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, return -EINVAL; } +static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, + const struct kvm_vcpu_init *init) +{ + unsigned int i; + int phys_target = kvm_target_cpu(); + + if (init->target != phys_target) + return -EINVAL; + + /* + * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must + * use the same target. + */ + if (vcpu->arch.target != -1 && vcpu->arch.target != init->target) + return -EINVAL; + + /* -ENOENT for unknown features, -EINVAL for invalid combinations. */ + for (i = 0; i < sizeof(init->features) * 8; i++) { + bool set = (init->features[i / 32] & (1 << (i % 32))); + + if (set && i >= KVM_VCPU_MAX_FEATURES) + return -ENOENT; + + /* + * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must + * use the same feature set. + */ + if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES && + test_bit(i, vcpu->arch.features) != set) + return -EINVAL; + + if (set) + set_bit(i, vcpu->arch.features); + } + + vcpu->arch.target = phys_target; + + /* Now we know what it is, we can reset it. */ + return kvm_reset_vcpu(vcpu); +} + + static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) { @@ -714,10 +733,21 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, return ret; /* + * Ensure a rebooted VM will fault in RAM pages and detect if the + * guest MMU is turned off and flush the caches as needed. + */ + if (vcpu->arch.has_run_once) + stage2_unmap_vm(vcpu->kvm); + + vcpu_reset_hcr(vcpu); + + /* * Handle the "start in power-off" case by marking the VCPU as paused. */ - if (__test_and_clear_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features)) + if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features)) vcpu->arch.pause = true; + else + vcpu->arch.pause = false; return 0; } @@ -774,9 +804,39 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } +/** + * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot + * @kvm: kvm instance + * @log: slot id and address to which we copy the log + * + * Steps 1-4 below provide general overview of dirty page logging. See + * kvm_get_dirty_log_protect() function description for additional details. + * + * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we + * always flush the TLB (step 4) even if previous step failed and the dirty + * bitmap may be corrupt. Regardless of previous outcome the KVM logging API + * does not preclude user space subsequent dirty log read. Flushing TLB ensures + * writes will be marked dirty for next log read. + * + * 1. Take a snapshot of the bit and clear it if needed. + * 2. Write protect the corresponding page. + * 3. Copy the snapshot to the userspace. + * 4. Flush TLB's if needed. + */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - return -EINVAL; + bool is_dirty = false; + int r; + + mutex_lock(&kvm->slots_lock); + + r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); + + if (is_dirty) + kvm_flush_remote_tlbs(kvm); + + mutex_unlock(&kvm->slots_lock); + return r; } static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, @@ -791,8 +851,6 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, switch (dev_id) { case KVM_ARM_DEVICE_VGIC_V2: - if (!vgic_present) - return -ENXIO; return kvm_vgic_addr(kvm, type, &dev_addr->addr, true); default: return -ENODEV; @@ -807,10 +865,7 @@ long kvm_arch_vm_ioctl(struct file *filp, switch (ioctl) { case KVM_CREATE_IRQCHIP: { - if (vgic_present) - return kvm_vgic_create(kvm); - else - return -ENXIO; + return kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); } case KVM_ARM_SET_DEVICE_ADDR: { struct kvm_arm_device_addr dev_addr; @@ -863,7 +918,8 @@ static int hyp_init_cpu_notify(struct notifier_block *self, switch (action) { case CPU_STARTING: case CPU_STARTING_FROZEN: - cpu_init_hyp_mode(NULL); + if (__hyp_get_vectors() == hyp_default_vectors) + cpu_init_hyp_mode(NULL); break; } @@ -994,10 +1050,6 @@ static int init_hyp_mode(void) if (err) goto out_free_context; -#ifdef CONFIG_KVM_ARM_VGIC - vgic_present = true; -#endif - /* * Init HYP architected timer support */ @@ -1031,6 +1083,19 @@ static void check_kvm_target_cpu(void *ret) *(int *)ret = kvm_target_cpu(); } +struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) +{ + struct kvm_vcpu *vcpu; + int i; + + mpidr &= MPIDR_HWID_BITMASK; + kvm_for_each_vcpu(i, vcpu, kvm) { + if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu)) + return vcpu; + } + return NULL; +} + /** * Initialize Hyp-mode and memory mappings on all CPUs. */ diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index c58a35116f63..f3d88dc388bc 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -44,6 +44,31 @@ static u32 cache_levels; /* CSSELR values; used to index KVM_REG_ARM_DEMUX_ID_CCSIDR */ #define CSSELR_MAX 12 +/* + * kvm_vcpu_arch.cp15 holds cp15 registers as an array of u32, but some + * of cp15 registers can be viewed either as couple of two u32 registers + * or one u64 register. Current u64 register encoding is that least + * significant u32 word is followed by most significant u32 word. + */ +static inline void vcpu_cp15_reg64_set(struct kvm_vcpu *vcpu, + const struct coproc_reg *r, + u64 val) +{ + vcpu->arch.cp15[r->reg] = val & 0xffffffff; + vcpu->arch.cp15[r->reg + 1] = val >> 32; +} + +static inline u64 vcpu_cp15_reg64_get(struct kvm_vcpu *vcpu, + const struct coproc_reg *r) +{ + u64 val; + + val = vcpu->arch.cp15[r->reg + 1]; + val = val << 32; + val = val | vcpu->arch.cp15[r->reg]; + return val; +} + int kvm_handle_cp10_id(struct kvm_vcpu *vcpu, struct kvm_run *run) { kvm_inject_undefined(vcpu); @@ -164,82 +189,40 @@ static bool access_l2ectlr(struct kvm_vcpu *vcpu, return true; } -/* See note at ARM ARM B1.14.4 */ +/* + * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). + */ static bool access_dcsw(struct kvm_vcpu *vcpu, const struct coproc_params *p, const struct coproc_reg *r) { - unsigned long val; - int cpu; - if (!p->is_write) return read_from_write_only(vcpu, p); - cpu = get_cpu(); - - cpumask_setall(&vcpu->arch.require_dcache_flush); - cpumask_clear_cpu(cpu, &vcpu->arch.require_dcache_flush); - - /* If we were already preempted, take the long way around */ - if (cpu != vcpu->arch.last_pcpu) { - flush_cache_all(); - goto done; - } - - val = *vcpu_reg(vcpu, p->Rt1); - - switch (p->CRm) { - case 6: /* Upgrade DCISW to DCCISW, as per HCR.SWIO */ - case 14: /* DCCISW */ - asm volatile("mcr p15, 0, %0, c7, c14, 2" : : "r" (val)); - break; - - case 10: /* DCCSW */ - asm volatile("mcr p15, 0, %0, c7, c10, 2" : : "r" (val)); - break; - } - -done: - put_cpu(); - + kvm_set_way_flush(vcpu); return true; } /* * Generic accessor for VM registers. Only called as long as HCR_TVM - * is set. + * is set. If the guest enables the MMU, we stop trapping the VM + * sys_regs and leave it in complete control of the caches. + * + * Used by the cpu-specific code. */ -static bool access_vm_reg(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) +bool access_vm_reg(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + const struct coproc_reg *r) { + bool was_enabled = vcpu_has_cache_enabled(vcpu); + BUG_ON(!p->is_write); vcpu->arch.cp15[r->reg] = *vcpu_reg(vcpu, p->Rt1); if (p->is_64bit) vcpu->arch.cp15[r->reg + 1] = *vcpu_reg(vcpu, p->Rt2); - return true; -} - -/* - * SCTLR accessor. Only called as long as HCR_TVM is set. If the - * guest enables the MMU, we stop trapping the VM sys_regs and leave - * it in complete control of the caches. - * - * Used by the cpu-specific code. - */ -bool access_sctlr(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) -{ - access_vm_reg(vcpu, p, r); - - if (vcpu_has_cache_enabled(vcpu)) { /* MMU+Caches enabled? */ - vcpu->arch.hcr &= ~HCR_TVM; - stage2_flush_vm(vcpu->kvm); - } - + kvm_toggle_cache(vcpu, was_enabled); return true; } @@ -682,17 +665,23 @@ static struct coproc_reg invariant_cp15[] = { { CRn( 0), CRm( 0), Op1( 1), Op2( 7), is32, NULL, get_AIDR }, }; +/* + * Reads a register value from a userspace address to a kernel + * variable. Make sure that register size matches sizeof(*__val). + */ static int reg_from_user(void *val, const void __user *uaddr, u64 id) { - /* This Just Works because we are little endian. */ if (copy_from_user(val, uaddr, KVM_REG_SIZE(id)) != 0) return -EFAULT; return 0; } +/* + * Writes a register value to a userspace address from a kernel variable. + * Make sure that register size matches sizeof(*__val). + */ static int reg_to_user(void __user *uaddr, const void *val, u64 id) { - /* This Just Works because we are little endian. */ if (copy_to_user(uaddr, val, KVM_REG_SIZE(id)) != 0) return -EFAULT; return 0; @@ -702,6 +691,7 @@ static int get_invariant_cp15(u64 id, void __user *uaddr) { struct coproc_params params; const struct coproc_reg *r; + int ret; if (!index_to_params(id, ¶ms)) return -ENOENT; @@ -710,7 +700,15 @@ static int get_invariant_cp15(u64 id, void __user *uaddr) if (!r) return -ENOENT; - return reg_to_user(uaddr, &r->val, id); + ret = -ENOENT; + if (KVM_REG_SIZE(id) == 4) { + u32 val = r->val; + + ret = reg_to_user(uaddr, &val, id); + } else if (KVM_REG_SIZE(id) == 8) { + ret = reg_to_user(uaddr, &r->val, id); + } + return ret; } static int set_invariant_cp15(u64 id, void __user *uaddr) @@ -718,7 +716,7 @@ static int set_invariant_cp15(u64 id, void __user *uaddr) struct coproc_params params; const struct coproc_reg *r; int err; - u64 val = 0; /* Make sure high bits are 0 for 32-bit regs */ + u64 val; if (!index_to_params(id, ¶ms)) return -ENOENT; @@ -726,7 +724,16 @@ static int set_invariant_cp15(u64 id, void __user *uaddr) if (!r) return -ENOENT; - err = reg_from_user(&val, uaddr, id); + err = -ENOENT; + if (KVM_REG_SIZE(id) == 4) { + u32 val32; + + err = reg_from_user(&val32, uaddr, id); + if (!err) + val = val32; + } else if (KVM_REG_SIZE(id) == 8) { + err = reg_from_user(&val, uaddr, id); + } if (err) return err; @@ -742,7 +749,7 @@ static bool is_valid_cache(u32 val) u32 level, ctype; if (val >= CSSELR_MAX) - return -ENOENT; + return false; /* Bottom bit is Instruction or Data bit. Next 3 bits are level. */ level = (val >> 1); @@ -1004,6 +1011,7 @@ int kvm_arm_coproc_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { const struct coproc_reg *r; void __user *uaddr = (void __user *)(long)reg->addr; + int ret; if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) return demux_c15_get(reg->id, uaddr); @@ -1015,14 +1023,24 @@ int kvm_arm_coproc_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if (!r) return get_invariant_cp15(reg->id, uaddr); - /* Note: copies two regs if size is 64 bit. */ - return reg_to_user(uaddr, &vcpu->arch.cp15[r->reg], reg->id); + ret = -ENOENT; + if (KVM_REG_SIZE(reg->id) == 8) { + u64 val; + + val = vcpu_cp15_reg64_get(vcpu, r); + ret = reg_to_user(uaddr, &val, reg->id); + } else if (KVM_REG_SIZE(reg->id) == 4) { + ret = reg_to_user(uaddr, &vcpu->arch.cp15[r->reg], reg->id); + } + + return ret; } int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { const struct coproc_reg *r; void __user *uaddr = (void __user *)(long)reg->addr; + int ret; if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) return demux_c15_set(reg->id, uaddr); @@ -1034,8 +1052,18 @@ int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if (!r) return set_invariant_cp15(reg->id, uaddr); - /* Note: copies two regs if size is 64 bit */ - return reg_from_user(&vcpu->arch.cp15[r->reg], uaddr, reg->id); + ret = -ENOENT; + if (KVM_REG_SIZE(reg->id) == 8) { + u64 val; + + ret = reg_from_user(&val, uaddr, reg->id); + if (!ret) + vcpu_cp15_reg64_set(vcpu, r, val); + } else if (KVM_REG_SIZE(reg->id) == 4) { + ret = reg_from_user(&vcpu->arch.cp15[r->reg], uaddr, reg->id); + } + + return ret; } static unsigned int num_demux_regs(void) diff --git a/arch/arm/kvm/coproc.h b/arch/arm/kvm/coproc.h index 1a44bbe39643..88d24a3a9778 100644 --- a/arch/arm/kvm/coproc.h +++ b/arch/arm/kvm/coproc.h @@ -153,8 +153,8 @@ static inline int cmp_reg(const struct coproc_reg *i1, #define is64 .is_64 = true #define is32 .is_64 = false -bool access_sctlr(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r); +bool access_vm_reg(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + const struct coproc_reg *r); #endif /* __ARM_KVM_COPROC_LOCAL_H__ */ diff --git a/arch/arm/kvm/coproc_a15.c b/arch/arm/kvm/coproc_a15.c index e6f4ae48bda9..a7136757d373 100644 --- a/arch/arm/kvm/coproc_a15.c +++ b/arch/arm/kvm/coproc_a15.c @@ -34,7 +34,7 @@ static const struct coproc_reg a15_regs[] = { /* SCTLR: swapped by interrupt.S. */ { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32, - access_sctlr, reset_val, c1_SCTLR, 0x00C50078 }, + access_vm_reg, reset_val, c1_SCTLR, 0x00C50078 }, }; static struct kvm_coproc_target_table a15_target_table = { diff --git a/arch/arm/kvm/coproc_a7.c b/arch/arm/kvm/coproc_a7.c index 17fc7cd479d3..b19e46d1b2c0 100644 --- a/arch/arm/kvm/coproc_a7.c +++ b/arch/arm/kvm/coproc_a7.c @@ -37,7 +37,7 @@ static const struct coproc_reg a7_regs[] = { /* SCTLR: swapped by interrupt.S. */ { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32, - access_sctlr, reset_val, c1_SCTLR, 0x00C50878 }, + access_vm_reg, reset_val, c1_SCTLR, 0x00C50878 }, }; static struct kvm_coproc_target_table a7_target_table = { diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c index b23a59c1c522..d503fbb787d3 100644 --- a/arch/arm/kvm/guest.c +++ b/arch/arm/kvm/guest.c @@ -38,7 +38,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { - vcpu->arch.hcr = HCR_GUEST_MASK; return 0; } @@ -110,32 +109,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return -EINVAL; } -#ifndef CONFIG_KVM_ARM_TIMER - -#define NUM_TIMER_REGS 0 - -static int copy_timer_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) -{ - return 0; -} - -static bool is_timer_reg(u64 index) -{ - return false; -} - -int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) -{ - return 0; -} - -u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid) -{ - return 0; -} - -#else - #define NUM_TIMER_REGS 3 static bool is_timer_reg(u64 index) @@ -163,8 +136,6 @@ static int copy_timer_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) return 0; } -#endif - static int set_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { void __user *uaddr = (void __user *)(long)reg->addr; @@ -173,7 +144,7 @@ static int set_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) ret = copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id)); if (ret != 0) - return ret; + return -EFAULT; return kvm_arm_timer_set_reg(vcpu, reg->id, val); } @@ -274,13 +245,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int __attribute_const__ kvm_target_cpu(void) { - unsigned long implementor = read_cpuid_implementor(); - unsigned long part_number = read_cpuid_part_number(); - - if (implementor != ARM_CPU_IMP_ARM) - return -EINVAL; - - switch (part_number) { + switch (read_cpuid_part()) { case ARM_CPU_PART_CORTEX_A7: return KVM_ARM_TARGET_CORTEX_A7; case ARM_CPU_PART_CORTEX_A15: @@ -290,31 +255,6 @@ int __attribute_const__ kvm_target_cpu(void) } } -int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, - const struct kvm_vcpu_init *init) -{ - unsigned int i; - - /* We can only cope with guest==host and only on A15/A7 (for now). */ - if (init->target != kvm_target_cpu()) - return -EINVAL; - - vcpu->arch.target = init->target; - bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); - - /* -ENOENT for unknown features, -EINVAL for invalid combinations. */ - for (i = 0; i < sizeof(init->features) * 8; i++) { - if (test_bit(i, (void *)init->features)) { - if (i >= KVM_VCPU_MAX_FEATURES) - return -ENOENT; - set_bit(i, vcpu->arch.features); - } - } - - /* Now we know what it is, we can reset it. */ - return kvm_reset_vcpu(vcpu); -} - int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init) { int target = kvm_target_cpu(); diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c index 4c979d466cc1..95f12b2ccdcb 100644 --- a/arch/arm/kvm/handle_exit.c +++ b/arch/arm/kvm/handle_exit.c @@ -87,11 +87,15 @@ static int handle_dabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run) */ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) { - trace_kvm_wfi(*vcpu_pc(vcpu)); - if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE) + if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE) { + trace_kvm_wfx(*vcpu_pc(vcpu), true); kvm_vcpu_on_spin(vcpu); - else + } else { + trace_kvm_wfx(*vcpu_pc(vcpu), false); kvm_vcpu_block(vcpu); + } + + kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); return 1; } diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S index 1b9844d369cc..3988e72d16ff 100644 --- a/arch/arm/kvm/init.S +++ b/arch/arm/kvm/init.S @@ -17,6 +17,7 @@ */ #include <linux/linkage.h> +#include <asm/assembler.h> #include <asm/unified.h> #include <asm/asm-offsets.h> #include <asm/kvm_asm.h> @@ -71,7 +72,7 @@ __do_hyp_init: bne phase2 @ Yes, second stage init @ Set the HTTBR to point to the hypervisor PGD pointer passed - mcrr p15, 4, r2, r3, c2 + mcrr p15, 4, rr_lo_hi(r2, r3), c2 @ Set the HTCR and VTCR to the same shareability and cacheability @ settings as the non-secure TTBCR and with T0SZ == 0. @@ -98,6 +99,10 @@ __do_hyp_init: mrc p15, 0, r0, c10, c2, 1 mcr p15, 4, r0, c10, c2, 1 + @ Invalidate the stale TLBs from Bootloader + mcr p15, 4, r0, c8, c7, 0 @ TLBIALLH + dsb ish + @ Set the HSCTLR to: @ - ARM/THUMB exceptions: Kernel config (Thumb-2 kernel) @ - Endianness: Kernel config @@ -134,10 +139,10 @@ phase2: ldr r0, =TRAMPOLINE_VA adr r1, target bfi r0, r1, #0, #PAGE_SHIFT - mov pc, r0 + ret r0 target: @ We're now in the trampoline code, switch page tables - mcrr p15, 4, r2, r3, c2 + mcrr p15, 4, rr_lo_hi(r2, r3), c2 isb @ Invalidate the old TLBs diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 0d68d4073068..79caf79b304a 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -52,7 +52,7 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) dsb ishst add r0, r0, #KVM_VTTBR ldrd r2, r3, [r0] - mcrr p15, 6, r2, r3, c2 @ Write VTTBR + mcrr p15, 6, rr_lo_hi(r2, r3), c2 @ Write VTTBR isb mcr p15, 0, r0, c8, c3, 0 @ TLBIALLIS (rt ignored) dsb ish @@ -66,6 +66,17 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +/** + * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs + * + * Reuses __kvm_tlb_flush_vmid_ipa() for ARMv7, without passing address + * parameter + */ + +ENTRY(__kvm_tlb_flush_vmid) + b __kvm_tlb_flush_vmid_ipa +ENDPROC(__kvm_tlb_flush_vmid) + /******************************************************************** * Flush TLBs and instruction caches of all CPUs inside the inner-shareable * domain, for all VMIDs @@ -135,7 +146,7 @@ ENTRY(__kvm_vcpu_run) ldr r1, [vcpu, #VCPU_KVM] add r1, r1, #KVM_VTTBR ldrd r2, r3, [r1] - mcrr p15, 6, r2, r3, c2 @ Write VTTBR + mcrr p15, 6, rr_lo_hi(r2, r3), c2 @ Write VTTBR @ We're all done, just restore the GPRs and go to the guest restore_guest_regs @@ -199,8 +210,13 @@ after_vfp_restore: restore_host_regs clrex @ Clear exclusive monitor +#ifndef CONFIG_CPU_ENDIAN_BE8 mov r0, r1 @ Return the return code mov r1, #0 @ Clear upper bits in return value +#else + @ r1 already has return code + mov r0, #0 @ Clear upper bits in return value +#endif /* CONFIG_CPU_ENDIAN_BE8 */ bx lr @ return to IOCTL /******************************************************************** diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S index 76af93025574..35e4a3a0c476 100644 --- a/arch/arm/kvm/interrupts_head.S +++ b/arch/arm/kvm/interrupts_head.S @@ -1,4 +1,5 @@ #include <linux/irqchip/arm-gic.h> +#include <asm/assembler.h> #define VCPU_USR_REG(_reg_nr) (VCPU_USR_REGS + (_reg_nr * 4)) #define VCPU_USR_SP (VCPU_USR_REG(13)) @@ -401,7 +402,6 @@ vcpu .req r0 @ vcpu pointer always in r0 * Assumes vcpu pointer in vcpu reg */ .macro save_vgic_state -#ifdef CONFIG_KVM_ARM_VGIC /* Get VGIC VCTRL base into r2 */ ldr r2, [vcpu, #VCPU_KVM] ldr r2, [r2, #KVM_VGIC_VCTRL] @@ -420,15 +420,30 @@ vcpu .req r0 @ vcpu pointer always in r0 ldr r8, [r2, #GICH_ELRSR0] ldr r9, [r2, #GICH_ELRSR1] ldr r10, [r2, #GICH_APR] - - str r3, [r11, #VGIC_CPU_HCR] - str r4, [r11, #VGIC_CPU_VMCR] - str r5, [r11, #VGIC_CPU_MISR] - str r6, [r11, #VGIC_CPU_EISR] - str r7, [r11, #(VGIC_CPU_EISR + 4)] - str r8, [r11, #VGIC_CPU_ELRSR] - str r9, [r11, #(VGIC_CPU_ELRSR + 4)] - str r10, [r11, #VGIC_CPU_APR] +ARM_BE8(rev r3, r3 ) +ARM_BE8(rev r4, r4 ) +ARM_BE8(rev r5, r5 ) +ARM_BE8(rev r6, r6 ) +ARM_BE8(rev r7, r7 ) +ARM_BE8(rev r8, r8 ) +ARM_BE8(rev r9, r9 ) +ARM_BE8(rev r10, r10 ) + + str r3, [r11, #VGIC_V2_CPU_HCR] + str r4, [r11, #VGIC_V2_CPU_VMCR] + str r5, [r11, #VGIC_V2_CPU_MISR] +#ifdef CONFIG_CPU_ENDIAN_BE8 + str r6, [r11, #(VGIC_V2_CPU_EISR + 4)] + str r7, [r11, #VGIC_V2_CPU_EISR] + str r8, [r11, #(VGIC_V2_CPU_ELRSR + 4)] + str r9, [r11, #VGIC_V2_CPU_ELRSR] +#else + str r6, [r11, #VGIC_V2_CPU_EISR] + str r7, [r11, #(VGIC_V2_CPU_EISR + 4)] + str r8, [r11, #VGIC_V2_CPU_ELRSR] + str r9, [r11, #(VGIC_V2_CPU_ELRSR + 4)] +#endif + str r10, [r11, #VGIC_V2_CPU_APR] /* Clear GICH_HCR */ mov r5, #0 @@ -436,14 +451,14 @@ vcpu .req r0 @ vcpu pointer always in r0 /* Save list registers */ add r2, r2, #GICH_LR0 - add r3, r11, #VGIC_CPU_LR + add r3, r11, #VGIC_V2_CPU_LR ldr r4, [r11, #VGIC_CPU_NR_LR] 1: ldr r6, [r2], #4 +ARM_BE8(rev r6, r6 ) str r6, [r3], #4 subs r4, r4, #1 bne 1b 2: -#endif .endm /* @@ -452,7 +467,6 @@ vcpu .req r0 @ vcpu pointer always in r0 * Assumes vcpu pointer in vcpu reg */ .macro restore_vgic_state -#ifdef CONFIG_KVM_ARM_VGIC /* Get VGIC VCTRL base into r2 */ ldr r2, [vcpu, #VCPU_KVM] ldr r2, [r2, #KVM_VGIC_VCTRL] @@ -463,9 +477,12 @@ vcpu .req r0 @ vcpu pointer always in r0 add r11, vcpu, #VCPU_VGIC_CPU /* We only restore a minimal set of registers */ - ldr r3, [r11, #VGIC_CPU_HCR] - ldr r4, [r11, #VGIC_CPU_VMCR] - ldr r8, [r11, #VGIC_CPU_APR] + ldr r3, [r11, #VGIC_V2_CPU_HCR] + ldr r4, [r11, #VGIC_V2_CPU_VMCR] + ldr r8, [r11, #VGIC_V2_CPU_APR] +ARM_BE8(rev r3, r3 ) +ARM_BE8(rev r4, r4 ) +ARM_BE8(rev r8, r8 ) str r3, [r2, #GICH_HCR] str r4, [r2, #GICH_VMCR] @@ -473,14 +490,14 @@ vcpu .req r0 @ vcpu pointer always in r0 /* Restore list registers */ add r2, r2, #GICH_LR0 - add r3, r11, #VGIC_CPU_LR + add r3, r11, #VGIC_V2_CPU_LR ldr r4, [r11, #VGIC_CPU_NR_LR] 1: ldr r6, [r3], #4 +ARM_BE8(rev r6, r6 ) str r6, [r2], #4 subs r4, r4, #1 bne 1b 2: -#endif .endm #define CNTHCTL_PL1PCTEN (1 << 0) @@ -494,7 +511,6 @@ vcpu .req r0 @ vcpu pointer always in r0 * Clobbers r2-r5 */ .macro save_timer_state -#ifdef CONFIG_KVM_ARM_TIMER ldr r4, [vcpu, #VCPU_KVM] ldr r2, [r4, #KVM_TIMER_ENABLED] cmp r2, #0 @@ -506,7 +522,7 @@ vcpu .req r0 @ vcpu pointer always in r0 mcr p15, 0, r2, c14, c3, 1 @ CNTV_CTL isb - mrrc p15, 3, r2, r3, c14 @ CNTV_CVAL + mrrc p15, 3, rr_lo_hi(r2, r3), c14 @ CNTV_CVAL ldr r4, =VCPU_TIMER_CNTV_CVAL add r5, vcpu, r4 strd r2, r3, [r5] @@ -516,7 +532,6 @@ vcpu .req r0 @ vcpu pointer always in r0 mcrr p15, 4, r2, r2, c14 @ CNTVOFF 1: -#endif @ Allow physical timer/counter access for the host mrc p15, 4, r2, c14, c1, 0 @ CNTHCTL orr r2, r2, #(CNTHCTL_PL1PCEN | CNTHCTL_PL1PCTEN) @@ -538,7 +553,6 @@ vcpu .req r0 @ vcpu pointer always in r0 bic r2, r2, #CNTHCTL_PL1PCEN mcr p15, 4, r2, c14, c1, 0 @ CNTHCTL -#ifdef CONFIG_KVM_ARM_TIMER ldr r4, [vcpu, #VCPU_KVM] ldr r2, [r4, #KVM_TIMER_ENABLED] cmp r2, #0 @@ -546,19 +560,18 @@ vcpu .req r0 @ vcpu pointer always in r0 ldr r2, [r4, #KVM_TIMER_CNTVOFF] ldr r3, [r4, #(KVM_TIMER_CNTVOFF + 4)] - mcrr p15, 4, r2, r3, c14 @ CNTVOFF + mcrr p15, 4, rr_lo_hi(r2, r3), c14 @ CNTVOFF ldr r4, =VCPU_TIMER_CNTV_CVAL add r5, vcpu, r4 ldrd r2, r3, [r5] - mcrr p15, 3, r2, r3, c14 @ CNTV_CVAL + mcrr p15, 3, rr_lo_hi(r2, r3), c14 @ CNTV_CVAL isb ldr r2, [vcpu, #VCPU_TIMER_CNTV_CTL] and r2, r2, #3 mcr p15, 0, r2, c14, c3, 1 @ CNTV_CTL 1: -#endif .endm .equ vmentry, 0 diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c index 4cb5a93182e9..974b1c606d04 100644 --- a/arch/arm/kvm/mmio.c +++ b/arch/arm/kvm/mmio.c @@ -121,12 +121,11 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) return 0; } -static int decode_hsr(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, - struct kvm_exit_mmio *mmio) +static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len) { unsigned long rt; - int len; - bool is_write, sign_extend; + int access_size; + bool sign_extend; if (kvm_vcpu_dabt_isextabt(vcpu)) { /* cache operation on I/O addr, tell guest unsupported */ @@ -140,17 +139,15 @@ static int decode_hsr(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return 1; } - len = kvm_vcpu_dabt_get_as(vcpu); - if (unlikely(len < 0)) - return len; + access_size = kvm_vcpu_dabt_get_as(vcpu); + if (unlikely(access_size < 0)) + return access_size; - is_write = kvm_vcpu_dabt_iswrite(vcpu); + *is_write = kvm_vcpu_dabt_iswrite(vcpu); sign_extend = kvm_vcpu_dabt_issext(vcpu); rt = kvm_vcpu_dabt_get_rd(vcpu); - mmio->is_write = is_write; - mmio->phys_addr = fault_ipa; - mmio->len = len; + *len = access_size; vcpu->arch.mmio_decode.sign_extend = sign_extend; vcpu->arch.mmio_decode.rt = rt; @@ -165,20 +162,20 @@ static int decode_hsr(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, phys_addr_t fault_ipa) { - struct kvm_exit_mmio mmio; unsigned long data; unsigned long rt; int ret; + bool is_write; + int len; + u8 data_buf[8]; /* - * Prepare MMIO operation. First stash it in a private - * structure that we can use for in-kernel emulation. If the - * kernel can't handle it, copy it into run->mmio and let user - * space do its magic. + * Prepare MMIO operation. First decode the syndrome data we get + * from the CPU. Then try if some in-kernel emulation feels + * responsible, otherwise let user space do its magic. */ - if (kvm_vcpu_dabt_isvalid(vcpu)) { - ret = decode_hsr(vcpu, fault_ipa, &mmio); + ret = decode_hsr(vcpu, &is_write, &len); if (ret) return ret; } else { @@ -187,19 +184,35 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, } rt = vcpu->arch.mmio_decode.rt; - data = vcpu_data_guest_to_host(vcpu, *vcpu_reg(vcpu, rt), mmio.len); - trace_kvm_mmio((mmio.is_write) ? KVM_TRACE_MMIO_WRITE : - KVM_TRACE_MMIO_READ_UNSATISFIED, - mmio.len, fault_ipa, - (mmio.is_write) ? data : 0); + if (is_write) { + data = vcpu_data_guest_to_host(vcpu, *vcpu_reg(vcpu, rt), len); - if (mmio.is_write) - mmio_write_buf(mmio.data, mmio.len, data); + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, data); + mmio_write_buf(data_buf, len, data); + + ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len, + data_buf); + } else { + trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len, + fault_ipa, 0); - if (vgic_handle_mmio(vcpu, run, &mmio)) + ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len, + data_buf); + } + + /* Now prepare kvm_run for the potential return to userland. */ + run->mmio.is_write = is_write; + run->mmio.phys_addr = fault_ipa; + run->mmio.len = len; + memcpy(run->mmio.data, data_buf, len); + + if (!ret) { + /* We handled the access successfully in the kernel. */ + kvm_handle_mmio_return(vcpu, run); return 1; + } - kvm_prepare_mmio(run, &mmio); + run->exit_reason = KVM_EXIT_MMIO; return 0; } diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 16f804938b8f..1d5accbd3dcf 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -35,16 +35,36 @@ extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[]; static pgd_t *boot_hyp_pgd; static pgd_t *hyp_pgd; +static pgd_t *merged_hyp_pgd; static DEFINE_MUTEX(kvm_hyp_pgd_mutex); -static void *init_bounce_page; static unsigned long hyp_idmap_start; static unsigned long hyp_idmap_end; static phys_addr_t hyp_idmap_vector; -#define pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) +#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) #define kvm_pmd_huge(_x) (pmd_huge(_x) || pmd_trans_huge(_x)) +#define kvm_pud_huge(_x) pud_huge(_x) + +#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0) +#define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1) + +static bool memslot_is_logging(struct kvm_memory_slot *memslot) +{ + return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); +} + +/** + * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8 + * @kvm: pointer to kvm structure. + * + * Interface to HYP function to flush all VM TLB entries + */ +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) { @@ -58,6 +78,45 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } +/* + * D-Cache management functions. They take the page table entries by + * value, as they are flushing the cache using the kernel mapping (or + * kmap on 32bit). + */ +static void kvm_flush_dcache_pte(pte_t pte) +{ + __kvm_flush_dcache_pte(pte); +} + +static void kvm_flush_dcache_pmd(pmd_t pmd) +{ + __kvm_flush_dcache_pmd(pmd); +} + +static void kvm_flush_dcache_pud(pud_t pud) +{ + __kvm_flush_dcache_pud(pud); +} + +/** + * stage2_dissolve_pmd() - clear and flush huge PMD entry + * @kvm: pointer to kvm structure. + * @addr: IPA + * @pmd: pmd pointer for IPA + * + * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all + * pages in the range dirty. + */ +static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) +{ + if (!kvm_pmd_huge(*pmd)) + return; + + pmd_clear(pmd); + kvm_tlb_flush_vmid_ipa(kvm, addr); + put_page(virt_to_page(pmd)); +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { @@ -90,104 +149,153 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) return p; } -static bool page_empty(void *ptr) +static void clear_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr) { - struct page *ptr_page = virt_to_page(ptr); - return page_count(ptr_page) == 1; + pud_t *pud_table __maybe_unused = pud_offset(pgd, 0); + pgd_clear(pgd); + kvm_tlb_flush_vmid_ipa(kvm, addr); + pud_free(NULL, pud_table); + put_page(virt_to_page(pgd)); } static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) { - if (pud_huge(*pud)) { - pud_clear(pud); - kvm_tlb_flush_vmid_ipa(kvm, addr); - } else { - pmd_t *pmd_table = pmd_offset(pud, 0); - pud_clear(pud); - kvm_tlb_flush_vmid_ipa(kvm, addr); - pmd_free(NULL, pmd_table); - } + pmd_t *pmd_table = pmd_offset(pud, 0); + VM_BUG_ON(pud_huge(*pud)); + pud_clear(pud); + kvm_tlb_flush_vmid_ipa(kvm, addr); + pmd_free(NULL, pmd_table); put_page(virt_to_page(pud)); } static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) { - if (kvm_pmd_huge(*pmd)) { - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(kvm, addr); - } else { - pte_t *pte_table = pte_offset_kernel(pmd, 0); - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(kvm, addr); - pte_free_kernel(NULL, pte_table); - } + pte_t *pte_table = pte_offset_kernel(pmd, 0); + VM_BUG_ON(kvm_pmd_huge(*pmd)); + pmd_clear(pmd); + kvm_tlb_flush_vmid_ipa(kvm, addr); + pte_free_kernel(NULL, pte_table); put_page(virt_to_page(pmd)); } -static void clear_pte_entry(struct kvm *kvm, pte_t *pte, phys_addr_t addr) +/* + * Unmapping vs dcache management: + * + * If a guest maps certain memory pages as uncached, all writes will + * bypass the data cache and go directly to RAM. However, the CPUs + * can still speculate reads (not writes) and fill cache lines with + * data. + * + * Those cache lines will be *clean* cache lines though, so a + * clean+invalidate operation is equivalent to an invalidate + * operation, because no cache lines are marked dirty. + * + * Those clean cache lines could be filled prior to an uncached write + * by the guest, and the cache coherent IO subsystem would therefore + * end up writing old data to disk. + * + * This is why right after unmapping a page/section and invalidating + * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure + * the IO subsystem will never hit in the cache. + */ +static void unmap_ptes(struct kvm *kvm, pmd_t *pmd, + phys_addr_t addr, phys_addr_t end) { - if (pte_present(*pte)) { - kvm_set_pte(pte, __pte(0)); - put_page(virt_to_page(pte)); - kvm_tlb_flush_vmid_ipa(kvm, addr); - } + phys_addr_t start_addr = addr; + pte_t *pte, *start_pte; + + start_pte = pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte)) { + pte_t old_pte = *pte; + + kvm_set_pte(pte, __pte(0)); + kvm_tlb_flush_vmid_ipa(kvm, addr); + + /* No need to invalidate the cache for device mappings */ + if ((pte_val(old_pte) & PAGE_S2_DEVICE) != PAGE_S2_DEVICE) + kvm_flush_dcache_pte(old_pte); + + put_page(virt_to_page(pte)); + } + } while (pte++, addr += PAGE_SIZE, addr != end); + + if (kvm_pte_table_empty(kvm, start_pte)) + clear_pmd_entry(kvm, pmd, start_addr); } -static void unmap_range(struct kvm *kvm, pgd_t *pgdp, - unsigned long long start, u64 size) +static void unmap_pmds(struct kvm *kvm, pud_t *pud, + phys_addr_t addr, phys_addr_t end) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - unsigned long long addr = start, end = start + size; - u64 next; + phys_addr_t next, start_addr = addr; + pmd_t *pmd, *start_pmd; - while (addr < end) { - pgd = pgdp + pgd_index(addr); - pud = pud_offset(pgd, addr); - pte = NULL; - if (pud_none(*pud)) { - addr = kvm_pud_addr_end(addr, end); - continue; - } + start_pmd = pmd = pmd_offset(pud, addr); + do { + next = kvm_pmd_addr_end(addr, end); + if (!pmd_none(*pmd)) { + if (kvm_pmd_huge(*pmd)) { + pmd_t old_pmd = *pmd; - if (pud_huge(*pud)) { - /* - * If we are dealing with a huge pud, just clear it and - * move on. - */ - clear_pud_entry(kvm, pud, addr); - addr = kvm_pud_addr_end(addr, end); - continue; - } + pmd_clear(pmd); + kvm_tlb_flush_vmid_ipa(kvm, addr); - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) { - addr = kvm_pmd_addr_end(addr, end); - continue; - } + kvm_flush_dcache_pmd(old_pmd); - if (!kvm_pmd_huge(*pmd)) { - pte = pte_offset_kernel(pmd, addr); - clear_pte_entry(kvm, pte, addr); - next = addr + PAGE_SIZE; + put_page(virt_to_page(pmd)); + } else { + unmap_ptes(kvm, pmd, addr, next); + } } + } while (pmd++, addr = next, addr != end); - /* - * If the pmd entry is to be cleared, walk back up the ladder - */ - if (kvm_pmd_huge(*pmd) || (pte && page_empty(pte))) { - clear_pmd_entry(kvm, pmd, addr); - next = kvm_pmd_addr_end(addr, end); - if (page_empty(pmd) && !page_empty(pud)) { - clear_pud_entry(kvm, pud, addr); - next = kvm_pud_addr_end(addr, end); + if (kvm_pmd_table_empty(kvm, start_pmd)) + clear_pud_entry(kvm, pud, start_addr); +} + +static void unmap_puds(struct kvm *kvm, pgd_t *pgd, + phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t next, start_addr = addr; + pud_t *pud, *start_pud; + + start_pud = pud = pud_offset(pgd, addr); + do { + next = kvm_pud_addr_end(addr, end); + if (!pud_none(*pud)) { + if (pud_huge(*pud)) { + pud_t old_pud = *pud; + + pud_clear(pud); + kvm_tlb_flush_vmid_ipa(kvm, addr); + + kvm_flush_dcache_pud(old_pud); + + put_page(virt_to_page(pud)); + } else { + unmap_pmds(kvm, pud, addr, next); } } + } while (pud++, addr = next, addr != end); - addr = next; - } + if (kvm_pud_table_empty(kvm, start_pud)) + clear_pgd_entry(kvm, pgd, start_addr); +} + + +static void unmap_range(struct kvm *kvm, pgd_t *pgdp, + phys_addr_t start, u64 size) +{ + pgd_t *pgd; + phys_addr_t addr = start, end = start + size; + phys_addr_t next; + + pgd = pgdp + kvm_pgd_index(addr); + do { + next = kvm_pgd_addr_end(addr, end); + if (!pgd_none(*pgd)) + unmap_puds(kvm, pgd, addr, next); + } while (pgd++, addr = next, addr != end); } static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd, @@ -197,10 +305,9 @@ static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd, pte = pte_offset_kernel(pmd, addr); do { - if (!pte_none(*pte)) { - hva_t hva = gfn_to_hva(kvm, addr >> PAGE_SHIFT); - kvm_flush_dcache_to_poc((void*)hva, PAGE_SIZE); - } + if (!pte_none(*pte) && + (pte_val(*pte) & PAGE_S2_DEVICE) != PAGE_S2_DEVICE) + kvm_flush_dcache_pte(*pte); } while (pte++, addr += PAGE_SIZE, addr != end); } @@ -214,12 +321,10 @@ static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud, do { next = kvm_pmd_addr_end(addr, end); if (!pmd_none(*pmd)) { - if (kvm_pmd_huge(*pmd)) { - hva_t hva = gfn_to_hva(kvm, addr >> PAGE_SHIFT); - kvm_flush_dcache_to_poc((void*)hva, PMD_SIZE); - } else { + if (kvm_pmd_huge(*pmd)) + kvm_flush_dcache_pmd(*pmd); + else stage2_flush_ptes(kvm, pmd, addr, next); - } } } while (pmd++, addr = next, addr != end); } @@ -234,12 +339,10 @@ static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd, do { next = kvm_pud_addr_end(addr, end); if (!pud_none(*pud)) { - if (pud_huge(*pud)) { - hva_t hva = gfn_to_hva(kvm, addr >> PAGE_SHIFT); - kvm_flush_dcache_to_poc((void*)hva, PUD_SIZE); - } else { + if (pud_huge(*pud)) + kvm_flush_dcache_pud(*pud); + else stage2_flush_pmds(kvm, pud, addr, next); - } } } while (pud++, addr = next, addr != end); } @@ -252,7 +355,7 @@ static void stage2_flush_memslot(struct kvm *kvm, phys_addr_t next; pgd_t *pgd; - pgd = kvm->arch.pgd + pgd_index(addr); + pgd = kvm->arch.pgd + kvm_pgd_index(addr); do { next = kvm_pgd_addr_end(addr, end); stage2_flush_puds(kvm, pgd, addr, next); @@ -266,7 +369,7 @@ static void stage2_flush_memslot(struct kvm *kvm, * Go through the stage 2 page tables and invalidate any cache lines * backing memory already mapped to the VM. */ -void stage2_flush_vm(struct kvm *kvm) +static void stage2_flush_vm(struct kvm *kvm) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; @@ -295,16 +398,13 @@ void free_boot_hyp_pgd(void) if (boot_hyp_pgd) { unmap_range(NULL, boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE); unmap_range(NULL, boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); - free_pages((unsigned long)boot_hyp_pgd, pgd_order); + free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order); boot_hyp_pgd = NULL; } if (hyp_pgd) unmap_range(NULL, hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); - free_page((unsigned long)init_bounce_page); - init_bounce_page = NULL; - mutex_unlock(&kvm_hyp_pgd_mutex); } @@ -332,9 +432,14 @@ void free_hyp_pgds(void) for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE) unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); - free_pages((unsigned long)hyp_pgd, pgd_order); + free_pages((unsigned long)hyp_pgd, hyp_pgd_order); hyp_pgd = NULL; } + if (merged_hyp_pgd) { + clear_page(merged_hyp_pgd); + free_page((unsigned long)merged_hyp_pgd); + merged_hyp_pgd = NULL; + } mutex_unlock(&kvm_hyp_pgd_mutex); } @@ -390,13 +495,46 @@ static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, return 0; } +static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start, + unsigned long end, unsigned long pfn, + pgprot_t prot) +{ + pud_t *pud; + pmd_t *pmd; + unsigned long addr, next; + int ret; + + addr = start; + do { + pud = pud_offset(pgd, addr); + + if (pud_none_or_clear_bad(pud)) { + pmd = pmd_alloc_one(NULL, addr); + if (!pmd) { + kvm_err("Cannot allocate Hyp pmd\n"); + return -ENOMEM; + } + pud_populate(NULL, pud, pmd); + get_page(virt_to_page(pud)); + kvm_flush_dcache_to_poc(pud, sizeof(*pud)); + } + + next = pud_addr_end(addr, end); + ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot); + if (ret) + return ret; + pfn += (next - addr) >> PAGE_SHIFT; + } while (addr = next, addr != end); + + return 0; +} + static int __create_hyp_mappings(pgd_t *pgdp, unsigned long start, unsigned long end, unsigned long pfn, pgprot_t prot) { pgd_t *pgd; pud_t *pud; - pmd_t *pmd; unsigned long addr, next; int err = 0; @@ -405,22 +543,21 @@ static int __create_hyp_mappings(pgd_t *pgdp, end = PAGE_ALIGN(end); do { pgd = pgdp + pgd_index(addr); - pud = pud_offset(pgd, addr); - if (pud_none_or_clear_bad(pud)) { - pmd = pmd_alloc_one(NULL, addr); - if (!pmd) { - kvm_err("Cannot allocate Hyp pmd\n"); + if (pgd_none(*pgd)) { + pud = pud_alloc_one(NULL, addr); + if (!pud) { + kvm_err("Cannot allocate Hyp pud\n"); err = -ENOMEM; goto out; } - pud_populate(NULL, pud, pmd); - get_page(virt_to_page(pud)); - kvm_flush_dcache_to_poc(pud, sizeof(*pud)); + pgd_populate(NULL, pgd, pud); + get_page(virt_to_page(pgd)); + kvm_flush_dcache_to_poc(pgd, sizeof(*pgd)); } next = pgd_addr_end(addr, end); - err = create_hyp_pmd_mappings(pud, addr, next, pfn, prot); + err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot); if (err) goto out; pfn += (next - addr) >> PAGE_SHIFT; @@ -497,6 +634,20 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr) __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE); } +/* Free the HW pgd, one page at a time */ +static void kvm_free_hwpgd(void *hwpgd) +{ + free_pages_exact(hwpgd, kvm_get_hwpgd_size()); +} + +/* Allocate the HW PGD, making sure that each page gets its own refcount */ +static void *kvm_alloc_hwpgd(void) +{ + unsigned int size = kvm_get_hwpgd_size(); + + return alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO); +} + /** * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation. * @kvm: The KVM struct pointer for the VM. @@ -511,20 +662,62 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr) int kvm_alloc_stage2_pgd(struct kvm *kvm) { pgd_t *pgd; + void *hwpgd; if (kvm->arch.pgd != NULL) { kvm_err("kvm_arch already initialized?\n"); return -EINVAL; } - pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, S2_PGD_ORDER); - if (!pgd) + hwpgd = kvm_alloc_hwpgd(); + if (!hwpgd) return -ENOMEM; - memset(pgd, 0, PTRS_PER_S2_PGD * sizeof(pgd_t)); + /* When the kernel uses more levels of page tables than the + * guest, we allocate a fake PGD and pre-populate it to point + * to the next-level page table, which will be the real + * initial page table pointed to by the VTTBR. + * + * When KVM_PREALLOC_LEVEL==2, we allocate a single page for + * the PMD and the kernel will use folded pud. + * When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD + * pages. + */ + if (KVM_PREALLOC_LEVEL > 0) { + int i; + + /* + * Allocate fake pgd for the page table manipulation macros to + * work. This is not used by the hardware and we have no + * alignment requirement for this allocation. + */ + pgd = (pgd_t *)kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t), + GFP_KERNEL | __GFP_ZERO); + + if (!pgd) { + kvm_free_hwpgd(hwpgd); + return -ENOMEM; + } + + /* Plug the HW PGD into the fake one. */ + for (i = 0; i < PTRS_PER_S2_PGD; i++) { + if (KVM_PREALLOC_LEVEL == 1) + pgd_populate(NULL, pgd + i, + (pud_t *)hwpgd + i * PTRS_PER_PUD); + else if (KVM_PREALLOC_LEVEL == 2) + pud_populate(NULL, pud_offset(pgd, 0) + i, + (pmd_t *)hwpgd + i * PTRS_PER_PMD); + } + } else { + /* + * Allocate actual first-level Stage-2 page table used by the + * hardware for Stage-2 page table walks. + */ + pgd = (pgd_t *)hwpgd; + } + kvm_clean_pgd(pgd); kvm->arch.pgd = pgd; - return 0; } @@ -544,6 +737,71 @@ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) unmap_range(kvm, kvm->arch.pgd, start, size); } +static void stage2_unmap_memslot(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + hva_t hva = memslot->userspace_addr; + phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; + phys_addr_t size = PAGE_SIZE * memslot->npages; + hva_t reg_end = hva + size; + + /* + * A memory region could potentially cover multiple VMAs, and any holes + * between them, so iterate over all of them to find out if we should + * unmap any of them. + * + * +--------------------------------------------+ + * +---------------+----------------+ +----------------+ + * | : VMA 1 | VMA 2 | | VMA 3 : | + * +---------------+----------------+ +----------------+ + * | memory region | + * +--------------------------------------------+ + */ + do { + struct vm_area_struct *vma = find_vma(current->mm, hva); + hva_t vm_start, vm_end; + + if (!vma || vma->vm_start >= reg_end) + break; + + /* + * Take the intersection of this VMA with the memory region + */ + vm_start = max(hva, vma->vm_start); + vm_end = min(reg_end, vma->vm_end); + + if (!(vma->vm_flags & VM_PFNMAP)) { + gpa_t gpa = addr + (vm_start - memslot->userspace_addr); + unmap_stage2_range(kvm, gpa, vm_end - vm_start); + } + hva = vm_end; + } while (hva < reg_end); +} + +/** + * stage2_unmap_vm - Unmap Stage-2 RAM mappings + * @kvm: The struct kvm pointer + * + * Go through the memregions and unmap any reguler RAM + * backing memory already mapped to the VM. + */ +void stage2_unmap_vm(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int idx; + + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, slots) + stage2_unmap_memslot(kvm, memslot); + + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); +} + /** * kvm_free_stage2_pgd - free all stage-2 tables * @kvm: The KVM struct pointer for the VM. @@ -561,19 +819,38 @@ void kvm_free_stage2_pgd(struct kvm *kvm) return; unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE); - free_pages((unsigned long)kvm->arch.pgd, S2_PGD_ORDER); + kvm_free_hwpgd(kvm_get_hwpgd(kvm)); + if (KVM_PREALLOC_LEVEL > 0) + kfree(kvm->arch.pgd); + kvm->arch.pgd = NULL; } -static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, +static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, phys_addr_t addr) { pgd_t *pgd; pud_t *pud; + + pgd = kvm->arch.pgd + kvm_pgd_index(addr); + if (WARN_ON(pgd_none(*pgd))) { + if (!cache) + return NULL; + pud = mmu_memory_cache_alloc(cache); + pgd_populate(NULL, pgd, pud); + get_page(virt_to_page(pgd)); + } + + return pud_offset(pgd, addr); +} + +static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + phys_addr_t addr) +{ + pud_t *pud; pmd_t *pmd; - pgd = kvm->arch.pgd + pgd_index(addr); - pud = pud_offset(pgd, addr); + pud = stage2_get_pud(kvm, cache, addr); if (pud_none(*pud)) { if (!cache) return NULL; @@ -614,12 +891,17 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache } static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pte_t *new_pte, bool iomap) + phys_addr_t addr, const pte_t *new_pte, + unsigned long flags) { pmd_t *pmd; pte_t *pte, old_pte; + bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP; + bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE; - /* Create stage-2 page table mapping - Level 1 */ + VM_BUG_ON(logging_active && !cache); + + /* Create stage-2 page table mapping - Levels 0 and 1 */ pmd = stage2_get_pmd(kvm, cache, addr); if (!pmd) { /* @@ -629,6 +911,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, return 0; } + /* + * While dirty page logging - dissolve huge PMD, then continue on to + * allocate page. + */ + if (logging_active) + stage2_dissolve_pmd(kvm, addr, pmd); + /* Create stage-2 page mappings - Level 2 */ if (pmd_none(*pmd)) { if (!cache) @@ -664,7 +953,7 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, * @size: The size of the mapping */ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, - phys_addr_t pa, unsigned long size) + phys_addr_t pa, unsigned long size, bool writable) { phys_addr_t addr, end; int ret = 0; @@ -677,11 +966,16 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) { pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE); - ret = mmu_topup_memory_cache(&cache, 2, 2); + if (writable) + kvm_set_s2pte_writable(&pte); + + ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES, + KVM_NR_MEM_OBJS); if (ret) goto out; spin_lock(&kvm->mmu_lock); - ret = stage2_set_pte(kvm, &cache, addr, &pte, true); + ret = stage2_set_pte(kvm, &cache, addr, &pte, + KVM_S2PTE_FLAG_IS_IOMAP); spin_unlock(&kvm->mmu_lock); if (ret) goto out; @@ -735,21 +1029,202 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } +static bool kvm_is_write_fault(struct kvm_vcpu *vcpu) +{ + if (kvm_vcpu_trap_is_iabt(vcpu)) + return false; + + return kvm_vcpu_dabt_iswrite(vcpu); +} + +static bool kvm_is_device_pfn(unsigned long pfn) +{ + return !pfn_valid(pfn); +} + +/** + * stage2_wp_ptes - write protect PMD range + * @pmd: pointer to pmd entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte)) { + if (!kvm_s2pte_readonly(pte)) + kvm_set_s2pte_readonly(pte); + } + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +/** + * stage2_wp_pmds - write protect PUD range + * @pud: pointer to pud entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) +{ + pmd_t *pmd; + phys_addr_t next; + + pmd = pmd_offset(pud, addr); + + do { + next = kvm_pmd_addr_end(addr, end); + if (!pmd_none(*pmd)) { + if (kvm_pmd_huge(*pmd)) { + if (!kvm_s2pmd_readonly(pmd)) + kvm_set_s2pmd_readonly(pmd); + } else { + stage2_wp_ptes(pmd, addr, next); + } + } + } while (pmd++, addr = next, addr != end); +} + +/** + * stage2_wp_puds - write protect PGD range + * @pgd: pointer to pgd entry + * @addr: range start address + * @end: range end address + * + * Process PUD entries, for a huge PUD we cause a panic. + */ +static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) +{ + pud_t *pud; + phys_addr_t next; + + pud = pud_offset(pgd, addr); + do { + next = kvm_pud_addr_end(addr, end); + if (!pud_none(*pud)) { + /* TODO:PUD not supported, revisit later if supported */ + BUG_ON(kvm_pud_huge(*pud)); + stage2_wp_pmds(pud, addr, next); + } + } while (pud++, addr = next, addr != end); +} + +/** + * stage2_wp_range() - write protect stage2 memory region range + * @kvm: The KVM pointer + * @addr: Start address of range + * @end: End address of range + */ +static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) +{ + pgd_t *pgd; + phys_addr_t next; + + pgd = kvm->arch.pgd + kvm_pgd_index(addr); + do { + /* + * Release kvm_mmu_lock periodically if the memory region is + * large. Otherwise, we may see kernel panics with + * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR, + * CONFIG_LOCKDEP. Additionally, holding the lock too long + * will also starve other vCPUs. + */ + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) + cond_resched_lock(&kvm->mmu_lock); + + next = kvm_pgd_addr_end(addr, end); + if (pgd_present(*pgd)) + stage2_wp_puds(pgd, addr, next); + } while (pgd++, addr = next, addr != end); +} + +/** + * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot + * @kvm: The KVM pointer + * @slot: The memory slot to write protect + * + * Called to start logging dirty pages after memory region + * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns + * all present PMD and PTEs are write protected in the memory region. + * Afterwards read of dirty page log can be called. + * + * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, + * serializing operations for VM memory regions. + */ +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) +{ + struct kvm_memory_slot *memslot = id_to_memslot(kvm->memslots, slot); + phys_addr_t start = memslot->base_gfn << PAGE_SHIFT; + phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; + + spin_lock(&kvm->mmu_lock); + stage2_wp_range(kvm, start, end); + spin_unlock(&kvm->mmu_lock); + kvm_flush_remote_tlbs(kvm); +} + +/** + * kvm_mmu_write_protect_pt_masked() - write protect dirty pages + * @kvm: The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory + * slot to be write protected + * + * Walks bits set in mask write protects the associated pte's. Caller must + * acquire kvm_mmu_lock. + */ +static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + phys_addr_t base_gfn = slot->base_gfn + gfn_offset; + phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; + phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; + + stage2_wp_range(kvm, start, end); +} + +/* + * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected + * dirty pages. + * + * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to + * enable dirty logging for them. + */ +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); +} + +static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn, + unsigned long size, bool uncached) +{ + __coherent_cache_guest_page(vcpu, pfn, size, uncached); +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, - struct kvm_memory_slot *memslot, + struct kvm_memory_slot *memslot, unsigned long hva, unsigned long fault_status) { int ret; bool write_fault, writable, hugetlb = false, force_pte = false; unsigned long mmu_seq; gfn_t gfn = fault_ipa >> PAGE_SHIFT; - unsigned long hva = gfn_to_hva(vcpu->kvm, gfn); struct kvm *kvm = vcpu->kvm; struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; + pgprot_t mem_type = PAGE_S2; + bool fault_ipa_uncached; + bool logging_active = memslot_is_logging(memslot); + unsigned long flags = 0; - write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); + write_fault = kvm_is_write_fault(vcpu); if (fault_status == FSC_PERM && !write_fault) { kvm_err("Unexpected L2 read permission error\n"); return -EFAULT; @@ -758,7 +1233,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, /* Let's check if we will get back a huge page backed by hugetlbfs */ down_read(¤t->mm->mmap_sem); vma = find_vma_intersection(current->mm, hva, hva + 1); - if (is_vm_hugetlb_page(vma)) { + if (unlikely(!vma)) { + kvm_err("Failed to find VMA for hva 0x%lx\n", hva); + up_read(¤t->mm->mmap_sem); + return -EFAULT; + } + + if (is_vm_hugetlb_page(vma) && !logging_active) { hugetlb = true; gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; } else { @@ -778,7 +1259,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, up_read(¤t->mm->mmap_sem); /* We need minimum second+third level pages */ - ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS); + ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES, + KVM_NR_MEM_OBJS); if (ret) return ret; @@ -798,38 +1280,103 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (is_error_pfn(pfn)) return -EFAULT; + if (kvm_is_device_pfn(pfn)) { + mem_type = PAGE_S2_DEVICE; + flags |= KVM_S2PTE_FLAG_IS_IOMAP; + } else if (logging_active) { + /* + * Faults on pages in a memslot with logging enabled + * should not be mapped with huge pages (it introduces churn + * and performance degradation), so force a pte mapping. + */ + force_pte = true; + flags |= KVM_S2_FLAG_LOGGING_ACTIVE; + + /* + * Only actually map the page as writable if this was a write + * fault. + */ + if (!write_fault) + writable = false; + } + spin_lock(&kvm->mmu_lock); if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; + if (!hugetlb && !force_pte) hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa); + fault_ipa_uncached = memslot->flags & KVM_MEMSLOT_INCOHERENT; + if (hugetlb) { - pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); + pmd_t new_pmd = pfn_pmd(pfn, mem_type); new_pmd = pmd_mkhuge(new_pmd); if (writable) { kvm_set_s2pmd_writable(&new_pmd); kvm_set_pfn_dirty(pfn); } - coherent_cache_guest_page(vcpu, hva & PMD_MASK, PMD_SIZE); + coherent_cache_guest_page(vcpu, pfn, PMD_SIZE, fault_ipa_uncached); ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); } else { - pte_t new_pte = pfn_pte(pfn, PAGE_S2); + pte_t new_pte = pfn_pte(pfn, mem_type); + if (writable) { kvm_set_s2pte_writable(&new_pte); kvm_set_pfn_dirty(pfn); + mark_page_dirty(kvm, gfn); } - coherent_cache_guest_page(vcpu, hva, PAGE_SIZE); - ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false); + coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE, fault_ipa_uncached); + ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); } - out_unlock: spin_unlock(&kvm->mmu_lock); + kvm_set_pfn_accessed(pfn); kvm_release_pfn_clean(pfn); return ret; } +/* + * Resolve the access fault by making the page young again. + * Note that because the faulting entry is guaranteed not to be + * cached in the TLB, we don't need to invalidate anything. + */ +static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) +{ + pmd_t *pmd; + pte_t *pte; + pfn_t pfn; + bool pfn_valid = false; + + trace_kvm_access_fault(fault_ipa); + + spin_lock(&vcpu->kvm->mmu_lock); + + pmd = stage2_get_pmd(vcpu->kvm, NULL, fault_ipa); + if (!pmd || pmd_none(*pmd)) /* Nothing there */ + goto out; + + if (kvm_pmd_huge(*pmd)) { /* THP, HugeTLB */ + *pmd = pmd_mkyoung(*pmd); + pfn = pmd_pfn(*pmd); + pfn_valid = true; + goto out; + } + + pte = pte_offset_kernel(pmd, fault_ipa); + if (pte_none(*pte)) /* Nothing there either */ + goto out; + + *pte = pte_mkyoung(*pte); /* Just a page... */ + pfn = pte_pfn(*pte); + pfn_valid = true; +out: + spin_unlock(&vcpu->kvm->mmu_lock); + if (pfn_valid) + kvm_set_pfn_accessed(pfn); +} + /** * kvm_handle_guest_abort - handles all 2nd stage aborts * @vcpu: the VCPU pointer @@ -847,7 +1394,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) unsigned long fault_status; phys_addr_t fault_ipa; struct kvm_memory_slot *memslot; - bool is_iabt; + unsigned long hva; + bool is_iabt, write_fault, writable; gfn_t gfn; int ret, idx; @@ -858,17 +1406,23 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_vcpu_get_hfar(vcpu), fault_ipa); /* Check the stage-2 fault is trans. fault or write fault */ - fault_status = kvm_vcpu_trap_get_fault(vcpu); - if (fault_status != FSC_FAULT && fault_status != FSC_PERM) { - kvm_err("Unsupported fault status: EC=%#x DFCS=%#lx\n", - kvm_vcpu_trap_get_class(vcpu), fault_status); + fault_status = kvm_vcpu_trap_get_fault_type(vcpu); + if (fault_status != FSC_FAULT && fault_status != FSC_PERM && + fault_status != FSC_ACCESS) { + kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", + kvm_vcpu_trap_get_class(vcpu), + (unsigned long)kvm_vcpu_trap_get_fault(vcpu), + (unsigned long)kvm_vcpu_get_hsr(vcpu)); return -EFAULT; } idx = srcu_read_lock(&vcpu->kvm->srcu); gfn = fault_ipa >> PAGE_SHIFT; - if (!kvm_is_visible_gfn(vcpu->kvm, gfn)) { + memslot = gfn_to_memslot(vcpu->kvm, gfn); + hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); + write_fault = kvm_is_write_fault(vcpu); + if (kvm_is_error_hva(hva) || (write_fault && !writable)) { if (is_iabt) { /* Prefetch Abort on I/O address */ kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu)); @@ -876,13 +1430,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) goto out_unlock; } - if (fault_status != FSC_FAULT) { - kvm_err("Unsupported fault status on io memory: %#lx\n", - fault_status); - ret = -EFAULT; - goto out_unlock; - } - /* * The IPA is reported as [MAX:12], so we need to * complement it with the bottom 12 bits from the @@ -894,9 +1441,16 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) goto out_unlock; } - memslot = gfn_to_memslot(vcpu->kvm, gfn); + /* Userspace should not be able to register out-of-bounds IPAs */ + VM_BUG_ON(fault_ipa >= KVM_PHYS_SIZE); + + if (fault_status == FSC_ACCESS) { + handle_access_fault(vcpu, fault_ipa); + ret = 1; + goto out_unlock; + } - ret = user_mem_abort(vcpu, fault_ipa, memslot, fault_status); + ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status); if (ret == 0) ret = 1; out_unlock: @@ -904,15 +1458,16 @@ out_unlock: return ret; } -static void handle_hva_to_gpa(struct kvm *kvm, - unsigned long start, - unsigned long end, - void (*handler)(struct kvm *kvm, - gpa_t gpa, void *data), - void *data) +static int handle_hva_to_gpa(struct kvm *kvm, + unsigned long start, + unsigned long end, + int (*handler)(struct kvm *kvm, + gpa_t gpa, void *data), + void *data) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; + int ret = 0; slots = kvm_memslots(kvm); @@ -936,14 +1491,17 @@ static void handle_hva_to_gpa(struct kvm *kvm, for (; gfn < gfn_end; ++gfn) { gpa_t gpa = gfn << PAGE_SHIFT; - handler(kvm, gpa, data); + ret |= handler(kvm, gpa, data); } } + + return ret; } -static void kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) +static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) { unmap_stage2_range(kvm, gpa, PAGE_SIZE); + return 0; } int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) @@ -969,11 +1527,19 @@ int kvm_unmap_hva_range(struct kvm *kvm, return 0; } -static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) +static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) { pte_t *pte = (pte_t *)data; - stage2_set_pte(kvm, NULL, gpa, pte, false); + /* + * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE + * flag clear because MMU notifiers will have unmapped a huge PMD before + * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and + * therefore stage2_set_pte() never needs to clear out a huge PMD + * through this calling path. + */ + stage2_set_pte(kvm, NULL, gpa, pte, 0); + return 0; } @@ -990,6 +1556,67 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); } +static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) +{ + pmd_t *pmd; + pte_t *pte; + + pmd = stage2_get_pmd(kvm, NULL, gpa); + if (!pmd || pmd_none(*pmd)) /* Nothing there */ + return 0; + + if (kvm_pmd_huge(*pmd)) { /* THP, HugeTLB */ + if (pmd_young(*pmd)) { + *pmd = pmd_mkold(*pmd); + return 1; + } + + return 0; + } + + pte = pte_offset_kernel(pmd, gpa); + if (pte_none(*pte)) + return 0; + + if (pte_young(*pte)) { + *pte = pte_mkold(*pte); /* Just a page... */ + return 1; + } + + return 0; +} + +static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) +{ + pmd_t *pmd; + pte_t *pte; + + pmd = stage2_get_pmd(kvm, NULL, gpa); + if (!pmd || pmd_none(*pmd)) /* Nothing there */ + return 0; + + if (kvm_pmd_huge(*pmd)) /* THP, HugeTLB */ + return pmd_young(*pmd); + + pte = pte_offset_kernel(pmd, gpa); + if (!pte_none(*pte)) /* Just a page... */ + return pte_young(*pte); + + return 0; +} + +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) +{ + trace_kvm_age_hva(start, end); + return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + trace_kvm_test_age_hva(hva); + return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL); +} + void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) { mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); @@ -997,12 +1624,18 @@ void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) phys_addr_t kvm_mmu_get_httbr(void) { - return virt_to_phys(hyp_pgd); + if (__kvm_cpu_uses_extended_idmap()) + return virt_to_phys(merged_hyp_pgd); + else + return virt_to_phys(hyp_pgd); } phys_addr_t kvm_mmu_get_boot_httbr(void) { - return virt_to_phys(boot_hyp_pgd); + if (__kvm_cpu_uses_extended_idmap()) + return virt_to_phys(merged_hyp_pgd); + else + return virt_to_phys(boot_hyp_pgd); } phys_addr_t kvm_get_idmap_vector(void) @@ -1018,42 +1651,14 @@ int kvm_mmu_init(void) hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end); hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init); - if ((hyp_idmap_start ^ hyp_idmap_end) & PAGE_MASK) { - /* - * Our init code is crossing a page boundary. Allocate - * a bounce page, copy the code over and use that. - */ - size_t len = __hyp_idmap_text_end - __hyp_idmap_text_start; - phys_addr_t phys_base; - - init_bounce_page = (void *)__get_free_page(GFP_KERNEL); - if (!init_bounce_page) { - kvm_err("Couldn't allocate HYP init bounce page\n"); - err = -ENOMEM; - goto out; - } - - memcpy(init_bounce_page, __hyp_idmap_text_start, len); - /* - * Warning: the code we just copied to the bounce page - * must be flushed to the point of coherency. - * Otherwise, the data may be sitting in L2, and HYP - * mode won't be able to observe it as it runs with - * caches off at that point. - */ - kvm_flush_dcache_to_poc(init_bounce_page, len); - - phys_base = kvm_virt_to_phys(init_bounce_page); - hyp_idmap_vector += phys_base - hyp_idmap_start; - hyp_idmap_start = phys_base; - hyp_idmap_end = phys_base + len; - - kvm_info("Using HYP init bounce page @%lx\n", - (unsigned long)phys_base); - } + /* + * We rely on the linker script to ensure at build time that the HYP + * init code does not cross a page boundary. + */ + BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); - hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, pgd_order); - boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, pgd_order); + hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order); + boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order); if (!hyp_pgd || !boot_hyp_pgd) { kvm_err("Hyp mode PGD not allocated\n"); @@ -1073,6 +1678,17 @@ int kvm_mmu_init(void) goto out; } + if (__kvm_cpu_uses_extended_idmap()) { + merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + if (!merged_hyp_pgd) { + kvm_err("Failed to allocate extra HYP pgd\n"); + goto out; + } + __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd, + hyp_idmap_start); + return 0; + } + /* Map the very same page at the trampoline VA */ err = __create_hyp_mappings(boot_hyp_pgd, TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE, @@ -1100,3 +1716,211 @@ out: free_hyp_pgds(); return err; } + +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old, + enum kvm_mr_change change) +{ + /* + * At this point memslot has been committed and there is an + * allocated dirty_bitmap[], dirty pages will be be tracked while the + * memory slot is write protected. + */ + if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) + kvm_mmu_wp_memory_region(kvm, mem->slot); +} + +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change) +{ + hva_t hva = mem->userspace_addr; + hva_t reg_end = hva + mem->memory_size; + bool writable = !(mem->flags & KVM_MEM_READONLY); + int ret = 0; + + if (change != KVM_MR_CREATE && change != KVM_MR_MOVE && + change != KVM_MR_FLAGS_ONLY) + return 0; + + /* + * Prevent userspace from creating a memory region outside of the IPA + * space addressable by the KVM guest IPA space. + */ + if (memslot->base_gfn + memslot->npages >= + (KVM_PHYS_SIZE >> PAGE_SHIFT)) + return -EFAULT; + + /* + * A memory region could potentially cover multiple VMAs, and any holes + * between them, so iterate over all of them to find out if we can map + * any of them right now. + * + * +--------------------------------------------+ + * +---------------+----------------+ +----------------+ + * | : VMA 1 | VMA 2 | | VMA 3 : | + * +---------------+----------------+ +----------------+ + * | memory region | + * +--------------------------------------------+ + */ + do { + struct vm_area_struct *vma = find_vma(current->mm, hva); + hva_t vm_start, vm_end; + + if (!vma || vma->vm_start >= reg_end) + break; + + /* + * Mapping a read-only VMA is only allowed if the + * memory region is configured as read-only. + */ + if (writable && !(vma->vm_flags & VM_WRITE)) { + ret = -EPERM; + break; + } + + /* + * Take the intersection of this VMA with the memory region + */ + vm_start = max(hva, vma->vm_start); + vm_end = min(reg_end, vma->vm_end); + + if (vma->vm_flags & VM_PFNMAP) { + gpa_t gpa = mem->guest_phys_addr + + (vm_start - mem->userspace_addr); + phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) + + vm_start - vma->vm_start; + + /* IO region dirty page logging not allowed */ + if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) + return -EINVAL; + + ret = kvm_phys_addr_ioremap(kvm, gpa, pa, + vm_end - vm_start, + writable); + if (ret) + break; + } + hva = vm_end; + } while (hva < reg_end); + + if (change == KVM_MR_FLAGS_ONLY) + return ret; + + spin_lock(&kvm->mmu_lock); + if (ret) + unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size); + else + stage2_flush_memslot(kvm, memslot); + spin_unlock(&kvm->mmu_lock); + return ret; +} + +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ +} + +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) +{ + /* + * Readonly memslots are not incoherent with the caches by definition, + * but in practice, they are used mostly to emulate ROMs or NOR flashes + * that the guest may consider devices and hence map as uncached. + * To prevent incoherency issues in these cases, tag all readonly + * regions as incoherent. + */ + if (slot->flags & KVM_MEM_READONLY) + slot->flags |= KVM_MEMSLOT_INCOHERENT; + return 0; +} + +void kvm_arch_memslots_updated(struct kvm *kvm) +{ +} + +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ +} + +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + gpa_t gpa = slot->base_gfn << PAGE_SHIFT; + phys_addr_t size = slot->npages << PAGE_SHIFT; + + spin_lock(&kvm->mmu_lock); + unmap_stage2_range(kvm, gpa, size); + spin_unlock(&kvm->mmu_lock); +} + +/* + * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). + * + * Main problems: + * - S/W ops are local to a CPU (not broadcast) + * - We have line migration behind our back (speculation) + * - System caches don't support S/W at all (damn!) + * + * In the face of the above, the best we can do is to try and convert + * S/W ops to VA ops. Because the guest is not allowed to infer the + * S/W to PA mapping, it can only use S/W to nuke the whole cache, + * which is a rather good thing for us. + * + * Also, it is only used when turning caches on/off ("The expected + * usage of the cache maintenance instructions that operate by set/way + * is associated with the cache maintenance instructions associated + * with the powerdown and powerup of caches, if this is required by + * the implementation."). + * + * We use the following policy: + * + * - If we trap a S/W operation, we enable VM trapping to detect + * caches being turned on/off, and do a full clean. + * + * - We flush the caches on both caches being turned on and off. + * + * - Once the caches are enabled, we stop trapping VM ops. + */ +void kvm_set_way_flush(struct kvm_vcpu *vcpu) +{ + unsigned long hcr = vcpu_get_hcr(vcpu); + + /* + * If this is the first time we do a S/W operation + * (i.e. HCR_TVM not set) flush the whole memory, and set the + * VM trapping. + * + * Otherwise, rely on the VM trapping to wait for the MMU + + * Caches to be turned off. At that point, we'll be able to + * clean the caches again. + */ + if (!(hcr & HCR_TVM)) { + trace_kvm_set_way_flush(*vcpu_pc(vcpu), + vcpu_has_cache_enabled(vcpu)); + stage2_flush_vm(vcpu->kvm); + vcpu_set_hcr(vcpu, hcr | HCR_TVM); + } +} + +void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled) +{ + bool now_enabled = vcpu_has_cache_enabled(vcpu); + + /* + * If switching the MMU+caches on, need to invalidate the caches. + * If switching it off, need to clean the caches. + * Clean + invalidate does the trick always. + */ + if (now_enabled != was_enabled) + stage2_flush_vm(vcpu->kvm); + + /* Caches are now on, stop trapping VM ops (until a S/W op) */ + if (now_enabled) + vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) & ~HCR_TVM); + + trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled); +} diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c index 09cf37737ee2..02fa8eff6ae1 100644 --- a/arch/arm/kvm/psci.c +++ b/arch/arm/kvm/psci.c @@ -15,12 +15,14 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <linux/preempt.h> #include <linux/kvm_host.h> #include <linux/wait.h> #include <asm/cputype.h> #include <asm/kvm_emulate.h> #include <asm/kvm_psci.h> +#include <asm/kvm_host.h> /* * This is an implementation of the Power State Coordination Interface @@ -65,25 +67,17 @@ static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) { struct kvm *kvm = source_vcpu->kvm; - struct kvm_vcpu *vcpu = NULL, *tmp; + struct kvm_vcpu *vcpu = NULL; wait_queue_head_t *wq; unsigned long cpu_id; unsigned long context_id; - unsigned long mpidr; phys_addr_t target_pc; - int i; - cpu_id = *vcpu_reg(source_vcpu, 1); + cpu_id = *vcpu_reg(source_vcpu, 1) & MPIDR_HWID_BITMASK; if (vcpu_mode_is_32bit(source_vcpu)) cpu_id &= ~((u32) 0); - kvm_for_each_vcpu(i, tmp, kvm) { - mpidr = kvm_vcpu_get_mpidr(tmp); - if ((mpidr & MPIDR_HWID_BITMASK) == (cpu_id & MPIDR_HWID_BITMASK)) { - vcpu = tmp; - break; - } - } + vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id); /* * Make sure the caller requested a valid CPU and that the CPU is @@ -154,7 +148,7 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) * then ON else OFF */ kvm_for_each_vcpu(i, tmp, kvm) { - mpidr = kvm_vcpu_get_mpidr(tmp); + mpidr = kvm_vcpu_get_mpidr_aff(tmp); if (((mpidr & target_affinity_mask) == target_affinity) && !tmp->arch.pause) { return PSCI_0_2_AFFINITY_LEVEL_ON; @@ -166,6 +160,23 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type) { + int i; + struct kvm_vcpu *tmp; + + /* + * The KVM ABI specifies that a system event exit may call KVM_RUN + * again and may perform shutdown/reboot at a later time that when the + * actual request is made. Since we are implementing PSCI and a + * caller of PSCI reboot and shutdown expects that the system shuts + * down or reboots immediately, let's make sure that VCPUs are not run + * after this call is handled and before the VCPUs have been + * re-initialized. + */ + kvm_for_each_vcpu(i, tmp, vcpu->kvm) { + tmp->arch.pause = true; + kvm_vcpu_kick(tmp); + } + memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event)); vcpu->run->system_event.type = type; vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h index b1d640f78623..0ec35392d208 100644 --- a/arch/arm/kvm/trace.h +++ b/arch/arm/kvm/trace.h @@ -25,18 +25,22 @@ TRACE_EVENT(kvm_entry, ); TRACE_EVENT(kvm_exit, - TP_PROTO(unsigned long vcpu_pc), - TP_ARGS(vcpu_pc), + TP_PROTO(unsigned int exit_reason, unsigned long vcpu_pc), + TP_ARGS(exit_reason, vcpu_pc), TP_STRUCT__entry( + __field( unsigned int, exit_reason ) __field( unsigned long, vcpu_pc ) ), TP_fast_assign( + __entry->exit_reason = exit_reason; __entry->vcpu_pc = vcpu_pc; ), - TP_printk("PC: 0x%08lx", __entry->vcpu_pc) + TP_printk("HSR_EC: 0x%04x, PC: 0x%08lx", + __entry->exit_reason, + __entry->vcpu_pc) ); TRACE_EVENT(kvm_guest_fault, @@ -64,6 +68,21 @@ TRACE_EVENT(kvm_guest_fault, __entry->hxfar, __entry->vcpu_pc) ); +TRACE_EVENT(kvm_access_fault, + TP_PROTO(unsigned long ipa), + TP_ARGS(ipa), + + TP_STRUCT__entry( + __field( unsigned long, ipa ) + ), + + TP_fast_assign( + __entry->ipa = ipa; + ), + + TP_printk("IPA: %lx", __entry->ipa) +); + TRACE_EVENT(kvm_irq_line, TP_PROTO(unsigned int type, int vcpu_idx, int irq_num, int level), TP_ARGS(type, vcpu_idx, irq_num, level), @@ -140,19 +159,22 @@ TRACE_EVENT(kvm_emulate_cp15_imp, __entry->CRm, __entry->Op2) ); -TRACE_EVENT(kvm_wfi, - TP_PROTO(unsigned long vcpu_pc), - TP_ARGS(vcpu_pc), +TRACE_EVENT(kvm_wfx, + TP_PROTO(unsigned long vcpu_pc, bool is_wfe), + TP_ARGS(vcpu_pc, is_wfe), TP_STRUCT__entry( __field( unsigned long, vcpu_pc ) + __field( bool, is_wfe ) ), TP_fast_assign( __entry->vcpu_pc = vcpu_pc; + __entry->is_wfe = is_wfe; ), - TP_printk("guest executed wfi at: 0x%08lx", __entry->vcpu_pc) + TP_printk("guest executed wf%c at: 0x%08lx", + __entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc) ); TRACE_EVENT(kvm_unmap_hva, @@ -203,6 +225,39 @@ TRACE_EVENT(kvm_set_spte_hva, TP_printk("mmu notifier set pte hva: %#08lx", __entry->hva) ); +TRACE_EVENT(kvm_age_hva, + TP_PROTO(unsigned long start, unsigned long end), + TP_ARGS(start, end), + + TP_STRUCT__entry( + __field( unsigned long, start ) + __field( unsigned long, end ) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + ), + + TP_printk("mmu notifier age hva: %#08lx -- %#08lx", + __entry->start, __entry->end) +); + +TRACE_EVENT(kvm_test_age_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("mmu notifier test age hva: %#08lx", __entry->hva) +); + TRACE_EVENT(kvm_hvc, TP_PROTO(unsigned long vcpu_pc, unsigned long r0, unsigned long imm), TP_ARGS(vcpu_pc, r0, imm), @@ -223,6 +278,45 @@ TRACE_EVENT(kvm_hvc, __entry->vcpu_pc, __entry->r0, __entry->imm) ); +TRACE_EVENT(kvm_set_way_flush, + TP_PROTO(unsigned long vcpu_pc, bool cache), + TP_ARGS(vcpu_pc, cache), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + __field( bool, cache ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->cache = cache; + ), + + TP_printk("S/W flush at 0x%016lx (cache %s)", + __entry->vcpu_pc, __entry->cache ? "on" : "off") +); + +TRACE_EVENT(kvm_toggle_cache, + TP_PROTO(unsigned long vcpu_pc, bool was, bool now), + TP_ARGS(vcpu_pc, was, now), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + __field( bool, was ) + __field( bool, now ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->was = was; + __entry->now = now; + ), + + TP_printk("VM op at 0x%016lx (cache was %s, now %s)", + __entry->vcpu_pc, __entry->was ? "on" : "off", + __entry->now ? "on" : "off") +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH |