diff options
Diffstat (limited to 'arch/arm/kvm')
-rw-r--r-- | arch/arm/kvm/Kconfig | 30 | ||||
-rw-r--r-- | arch/arm/kvm/Makefile | 12 | ||||
-rw-r--r-- | arch/arm/kvm/arm.c | 50 | ||||
-rw-r--r-- | arch/arm/kvm/guest.c | 18 | ||||
-rw-r--r-- | arch/arm/kvm/interrupts_head.S | 8 | ||||
-rw-r--r-- | arch/arm/kvm/mmio.c | 64 | ||||
-rw-r--r-- | arch/arm/kvm/mmu.c | 278 | ||||
-rw-r--r-- | arch/arm/kvm/trace.h | 58 |
8 files changed, 342 insertions, 176 deletions
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 338ace78ed18..f1f79d104309 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -18,6 +18,7 @@ if VIRTUALIZATION config KVM bool "Kernel-based Virtual Machine (KVM) support" + depends on MMU && OF select PREEMPT_NOTIFIERS select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT @@ -26,10 +27,12 @@ config KVM select KVM_ARM_HOST select KVM_GENERIC_DIRTYLOG_READ_PROTECT select SRCU - depends on ARM_VIRT_EXT && ARM_LPAE + select MMU_NOTIFIER + select HAVE_KVM_EVENTFD + select HAVE_KVM_IRQFD + depends on ARM_VIRT_EXT && ARM_LPAE && ARM_ARCH_TIMER ---help--- - Support hosting virtualized guest machines. You will also - need to select one or more of the processor modules below. + Support hosting virtualized guest machines. This module provides access to the hardware capabilities through a character device node named /dev/kvm. @@ -37,10 +40,7 @@ config KVM If unsure, say N. config KVM_ARM_HOST - bool "KVM host support for ARM cpus." - depends on KVM - depends on MMU - select MMU_NOTIFIER + bool ---help--- Provides host support for ARM processors. @@ -55,20 +55,4 @@ config KVM_ARM_MAX_VCPUS large, so only choose a reasonable number that you expect to actually use. -config KVM_ARM_VGIC - bool "KVM support for Virtual GIC" - depends on KVM_ARM_HOST && OF - select HAVE_KVM_IRQCHIP - default y - ---help--- - Adds support for a hardware assisted, in-kernel GIC emulation. - -config KVM_ARM_TIMER - bool "KVM support for Architected Timers" - depends on KVM_ARM_VGIC && ARM_ARCH_TIMER - select HAVE_KVM_IRQCHIP - default y - ---help--- - Adds support for the Architected Timers in virtual machines - endif # VIRTUALIZATION diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index 443b8bea43e9..139e46c08b6e 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -7,7 +7,7 @@ ifeq ($(plus_virt),+virt) plus_virt_def := -DREQUIRES_VIRT=1 endif -ccflags-y += -Ivirt/kvm -Iarch/arm/kvm +ccflags-y += -Iarch/arm/kvm CFLAGS_arm.o := -I. $(plus_virt_def) CFLAGS_mmu.o := -I. @@ -15,12 +15,12 @@ AFLAGS_init.o := -Wa,-march=armv7-a$(plus_virt) AFLAGS_interrupts.o := -Wa,-march=armv7-a$(plus_virt) KVM := ../../../virt/kvm -kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o +kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o obj-y += kvm-arm.o init.o interrupts.o obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o -obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic.o -obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2.o -obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2-emul.o -obj-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o +obj-y += $(KVM)/arm/vgic.o +obj-y += $(KVM)/arm/vgic-v2.o +obj-y += $(KVM)/arm/vgic-v2-emul.o +obj-y += $(KVM)/arm/arch_timer.o diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 07e7eb1d7ab6..d9631ecddd56 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -61,8 +61,6 @@ static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); static u8 kvm_next_vmid; static DEFINE_SPINLOCK(kvm_vmid_lock); -static bool vgic_present; - static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu) { BUG_ON(preemptible()); @@ -173,8 +171,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) int r; switch (ext) { case KVM_CAP_IRQCHIP: - r = vgic_present; - break; + case KVM_CAP_IRQFD: + case KVM_CAP_IOEVENTFD: case KVM_CAP_DEVICE_CTRL: case KVM_CAP_USER_MEMORY: case KVM_CAP_SYNC_MMU: @@ -183,6 +181,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_PSCI: case KVM_CAP_ARM_PSCI_0_2: case KVM_CAP_READONLY_MEM: + case KVM_CAP_MP_STATE: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -268,7 +267,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { - return 0; + return kvm_timer_should_fire(vcpu); } int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) @@ -313,13 +312,29 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - return -EINVAL; + if (vcpu->arch.pause) + mp_state->mp_state = KVM_MP_STATE_STOPPED; + else + mp_state->mp_state = KVM_MP_STATE_RUNNABLE; + + return 0; } int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - return -EINVAL; + switch (mp_state->mp_state) { + case KVM_MP_STATE_RUNNABLE: + vcpu->arch.pause = false; + break; + case KVM_MP_STATE_STOPPED: + vcpu->arch.pause = true; + break; + default: + return -EINVAL; + } + + return 0; } /** @@ -452,6 +467,11 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) return 0; } +bool kvm_arch_intc_initialized(struct kvm *kvm) +{ + return vgic_initialized(kvm); +} + static void vcpu_pause(struct kvm_vcpu *vcpu) { wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); @@ -540,7 +560,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) vcpu->mode = OUTSIDE_GUEST_MODE; kvm_guest_exit(); - trace_kvm_exit(*vcpu_pc(vcpu)); + trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); /* * We may have taken a host interrupt in HYP mode (ie * while executing the guest). This interrupt is still @@ -651,8 +671,7 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, if (!irqchip_in_kernel(kvm)) return -ENXIO; - if (irq_num < VGIC_NR_PRIVATE_IRQS || - irq_num > KVM_ARM_IRQ_GIC_MAX) + if (irq_num < VGIC_NR_PRIVATE_IRQS) return -EINVAL; return kvm_vgic_inject_irq(kvm, 0, irq_num, level); @@ -831,8 +850,6 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, switch (dev_id) { case KVM_ARM_DEVICE_VGIC_V2: - if (!vgic_present) - return -ENXIO; return kvm_vgic_addr(kvm, type, &dev_addr->addr, true); default: return -ENODEV; @@ -847,10 +864,7 @@ long kvm_arch_vm_ioctl(struct file *filp, switch (ioctl) { case KVM_CREATE_IRQCHIP: { - if (vgic_present) - return kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); - else - return -ENXIO; + return kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); } case KVM_ARM_SET_DEVICE_ADDR: { struct kvm_arm_device_addr dev_addr; @@ -1035,10 +1049,6 @@ static int init_hyp_mode(void) if (err) goto out_free_context; -#ifdef CONFIG_KVM_ARM_VGIC - vgic_present = true; -#endif - /* * Init HYP architected timer support */ diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c index 384bab67c462..d503fbb787d3 100644 --- a/arch/arm/kvm/guest.c +++ b/arch/arm/kvm/guest.c @@ -109,22 +109,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return -EINVAL; } -#ifndef CONFIG_KVM_ARM_TIMER - -#define NUM_TIMER_REGS 0 - -static int copy_timer_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) -{ - return 0; -} - -static bool is_timer_reg(u64 index) -{ - return false; -} - -#else - #define NUM_TIMER_REGS 3 static bool is_timer_reg(u64 index) @@ -152,8 +136,6 @@ static int copy_timer_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) return 0; } -#endif - static int set_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { void __user *uaddr = (void __user *)(long)reg->addr; diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S index 14d488388480..35e4a3a0c476 100644 --- a/arch/arm/kvm/interrupts_head.S +++ b/arch/arm/kvm/interrupts_head.S @@ -402,7 +402,6 @@ vcpu .req r0 @ vcpu pointer always in r0 * Assumes vcpu pointer in vcpu reg */ .macro save_vgic_state -#ifdef CONFIG_KVM_ARM_VGIC /* Get VGIC VCTRL base into r2 */ ldr r2, [vcpu, #VCPU_KVM] ldr r2, [r2, #KVM_VGIC_VCTRL] @@ -460,7 +459,6 @@ ARM_BE8(rev r6, r6 ) subs r4, r4, #1 bne 1b 2: -#endif .endm /* @@ -469,7 +467,6 @@ ARM_BE8(rev r6, r6 ) * Assumes vcpu pointer in vcpu reg */ .macro restore_vgic_state -#ifdef CONFIG_KVM_ARM_VGIC /* Get VGIC VCTRL base into r2 */ ldr r2, [vcpu, #VCPU_KVM] ldr r2, [r2, #KVM_VGIC_VCTRL] @@ -501,7 +498,6 @@ ARM_BE8(rev r6, r6 ) subs r4, r4, #1 bne 1b 2: -#endif .endm #define CNTHCTL_PL1PCTEN (1 << 0) @@ -515,7 +511,6 @@ ARM_BE8(rev r6, r6 ) * Clobbers r2-r5 */ .macro save_timer_state -#ifdef CONFIG_KVM_ARM_TIMER ldr r4, [vcpu, #VCPU_KVM] ldr r2, [r4, #KVM_TIMER_ENABLED] cmp r2, #0 @@ -537,7 +532,6 @@ ARM_BE8(rev r6, r6 ) mcrr p15, 4, r2, r2, c14 @ CNTVOFF 1: -#endif @ Allow physical timer/counter access for the host mrc p15, 4, r2, c14, c1, 0 @ CNTHCTL orr r2, r2, #(CNTHCTL_PL1PCEN | CNTHCTL_PL1PCTEN) @@ -559,7 +553,6 @@ ARM_BE8(rev r6, r6 ) bic r2, r2, #CNTHCTL_PL1PCEN mcr p15, 4, r2, c14, c1, 0 @ CNTHCTL -#ifdef CONFIG_KVM_ARM_TIMER ldr r4, [vcpu, #VCPU_KVM] ldr r2, [r4, #KVM_TIMER_ENABLED] cmp r2, #0 @@ -579,7 +572,6 @@ ARM_BE8(rev r6, r6 ) and r2, r2, #3 mcr p15, 0, r2, c14, c3, 1 @ CNTV_CTL 1: -#endif .endm .equ vmentry, 0 diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c index 5d3bfc0eb3f0..974b1c606d04 100644 --- a/arch/arm/kvm/mmio.c +++ b/arch/arm/kvm/mmio.c @@ -121,12 +121,11 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) return 0; } -static int decode_hsr(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, - struct kvm_exit_mmio *mmio) +static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len) { unsigned long rt; - int len; - bool is_write, sign_extend; + int access_size; + bool sign_extend; if (kvm_vcpu_dabt_isextabt(vcpu)) { /* cache operation on I/O addr, tell guest unsupported */ @@ -140,17 +139,15 @@ static int decode_hsr(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return 1; } - len = kvm_vcpu_dabt_get_as(vcpu); - if (unlikely(len < 0)) - return len; + access_size = kvm_vcpu_dabt_get_as(vcpu); + if (unlikely(access_size < 0)) + return access_size; - is_write = kvm_vcpu_dabt_iswrite(vcpu); + *is_write = kvm_vcpu_dabt_iswrite(vcpu); sign_extend = kvm_vcpu_dabt_issext(vcpu); rt = kvm_vcpu_dabt_get_rd(vcpu); - mmio->is_write = is_write; - mmio->phys_addr = fault_ipa; - mmio->len = len; + *len = access_size; vcpu->arch.mmio_decode.sign_extend = sign_extend; vcpu->arch.mmio_decode.rt = rt; @@ -165,20 +162,20 @@ static int decode_hsr(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, phys_addr_t fault_ipa) { - struct kvm_exit_mmio mmio; unsigned long data; unsigned long rt; int ret; + bool is_write; + int len; + u8 data_buf[8]; /* - * Prepare MMIO operation. First stash it in a private - * structure that we can use for in-kernel emulation. If the - * kernel can't handle it, copy it into run->mmio and let user - * space do its magic. + * Prepare MMIO operation. First decode the syndrome data we get + * from the CPU. Then try if some in-kernel emulation feels + * responsible, otherwise let user space do its magic. */ - if (kvm_vcpu_dabt_isvalid(vcpu)) { - ret = decode_hsr(vcpu, fault_ipa, &mmio); + ret = decode_hsr(vcpu, &is_write, &len); if (ret) return ret; } else { @@ -188,21 +185,34 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, rt = vcpu->arch.mmio_decode.rt; - if (mmio.is_write) { - data = vcpu_data_guest_to_host(vcpu, *vcpu_reg(vcpu, rt), - mmio.len); + if (is_write) { + data = vcpu_data_guest_to_host(vcpu, *vcpu_reg(vcpu, rt), len); + + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, data); + mmio_write_buf(data_buf, len, data); - trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, mmio.len, - fault_ipa, data); - mmio_write_buf(mmio.data, mmio.len, data); + ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len, + data_buf); } else { - trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, mmio.len, + trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len, fault_ipa, 0); + + ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len, + data_buf); } - if (vgic_handle_mmio(vcpu, run, &mmio)) + /* Now prepare kvm_run for the potential return to userland. */ + run->mmio.is_write = is_write; + run->mmio.phys_addr = fault_ipa; + run->mmio.len = len; + memcpy(run->mmio.data, data_buf, len); + + if (!ret) { + /* We handled the access successfully in the kernel. */ + kvm_handle_mmio_return(vcpu, run); return 1; + } - kvm_prepare_mmio(run, &mmio); + run->exit_reason = KVM_EXIT_MMIO; return 0; } diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 3e6859bc3e11..1d5accbd3dcf 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -35,9 +35,9 @@ extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[]; static pgd_t *boot_hyp_pgd; static pgd_t *hyp_pgd; +static pgd_t *merged_hyp_pgd; static DEFINE_MUTEX(kvm_hyp_pgd_mutex); -static void *init_bounce_page; static unsigned long hyp_idmap_start; static unsigned long hyp_idmap_end; static phys_addr_t hyp_idmap_vector; @@ -290,7 +290,7 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, phys_addr_t addr = start, end = start + size; phys_addr_t next; - pgd = pgdp + pgd_index(addr); + pgd = pgdp + kvm_pgd_index(addr); do { next = kvm_pgd_addr_end(addr, end); if (!pgd_none(*pgd)) @@ -355,7 +355,7 @@ static void stage2_flush_memslot(struct kvm *kvm, phys_addr_t next; pgd_t *pgd; - pgd = kvm->arch.pgd + pgd_index(addr); + pgd = kvm->arch.pgd + kvm_pgd_index(addr); do { next = kvm_pgd_addr_end(addr, end); stage2_flush_puds(kvm, pgd, addr, next); @@ -405,9 +405,6 @@ void free_boot_hyp_pgd(void) if (hyp_pgd) unmap_range(NULL, hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); - free_page((unsigned long)init_bounce_page); - init_bounce_page = NULL; - mutex_unlock(&kvm_hyp_pgd_mutex); } @@ -438,6 +435,11 @@ void free_hyp_pgds(void) free_pages((unsigned long)hyp_pgd, hyp_pgd_order); hyp_pgd = NULL; } + if (merged_hyp_pgd) { + clear_page(merged_hyp_pgd); + free_page((unsigned long)merged_hyp_pgd); + merged_hyp_pgd = NULL; + } mutex_unlock(&kvm_hyp_pgd_mutex); } @@ -632,6 +634,20 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr) __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE); } +/* Free the HW pgd, one page at a time */ +static void kvm_free_hwpgd(void *hwpgd) +{ + free_pages_exact(hwpgd, kvm_get_hwpgd_size()); +} + +/* Allocate the HW PGD, making sure that each page gets its own refcount */ +static void *kvm_alloc_hwpgd(void) +{ + unsigned int size = kvm_get_hwpgd_size(); + + return alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO); +} + /** * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation. * @kvm: The KVM struct pointer for the VM. @@ -645,15 +661,31 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr) */ int kvm_alloc_stage2_pgd(struct kvm *kvm) { - int ret; pgd_t *pgd; + void *hwpgd; if (kvm->arch.pgd != NULL) { kvm_err("kvm_arch already initialized?\n"); return -EINVAL; } + hwpgd = kvm_alloc_hwpgd(); + if (!hwpgd) + return -ENOMEM; + + /* When the kernel uses more levels of page tables than the + * guest, we allocate a fake PGD and pre-populate it to point + * to the next-level page table, which will be the real + * initial page table pointed to by the VTTBR. + * + * When KVM_PREALLOC_LEVEL==2, we allocate a single page for + * the PMD and the kernel will use folded pud. + * When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD + * pages. + */ if (KVM_PREALLOC_LEVEL > 0) { + int i; + /* * Allocate fake pgd for the page table manipulation macros to * work. This is not used by the hardware and we have no @@ -661,30 +693,32 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm) */ pgd = (pgd_t *)kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t), GFP_KERNEL | __GFP_ZERO); + + if (!pgd) { + kvm_free_hwpgd(hwpgd); + return -ENOMEM; + } + + /* Plug the HW PGD into the fake one. */ + for (i = 0; i < PTRS_PER_S2_PGD; i++) { + if (KVM_PREALLOC_LEVEL == 1) + pgd_populate(NULL, pgd + i, + (pud_t *)hwpgd + i * PTRS_PER_PUD); + else if (KVM_PREALLOC_LEVEL == 2) + pud_populate(NULL, pud_offset(pgd, 0) + i, + (pmd_t *)hwpgd + i * PTRS_PER_PMD); + } } else { /* * Allocate actual first-level Stage-2 page table used by the * hardware for Stage-2 page table walks. */ - pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, S2_PGD_ORDER); + pgd = (pgd_t *)hwpgd; } - if (!pgd) - return -ENOMEM; - - ret = kvm_prealloc_hwpgd(kvm, pgd); - if (ret) - goto out_err; - kvm_clean_pgd(pgd); kvm->arch.pgd = pgd; return 0; -out_err: - if (KVM_PREALLOC_LEVEL > 0) - kfree(pgd); - else - free_pages((unsigned long)pgd, S2_PGD_ORDER); - return ret; } /** @@ -785,11 +819,10 @@ void kvm_free_stage2_pgd(struct kvm *kvm) return; unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE); - kvm_free_hwpgd(kvm); + kvm_free_hwpgd(kvm_get_hwpgd(kvm)); if (KVM_PREALLOC_LEVEL > 0) kfree(kvm->arch.pgd); - else - free_pages((unsigned long)kvm->arch.pgd, S2_PGD_ORDER); + kvm->arch.pgd = NULL; } @@ -799,7 +832,7 @@ static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache pgd_t *pgd; pud_t *pud; - pgd = kvm->arch.pgd + pgd_index(addr); + pgd = kvm->arch.pgd + kvm_pgd_index(addr); if (WARN_ON(pgd_none(*pgd))) { if (!cache) return NULL; @@ -1089,7 +1122,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) pgd_t *pgd; phys_addr_t next; - pgd = kvm->arch.pgd + pgd_index(addr); + pgd = kvm->arch.pgd + kvm_pgd_index(addr); do { /* * Release kvm_mmu_lock periodically if the memory region is @@ -1299,10 +1332,51 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, out_unlock: spin_unlock(&kvm->mmu_lock); + kvm_set_pfn_accessed(pfn); kvm_release_pfn_clean(pfn); return ret; } +/* + * Resolve the access fault by making the page young again. + * Note that because the faulting entry is guaranteed not to be + * cached in the TLB, we don't need to invalidate anything. + */ +static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) +{ + pmd_t *pmd; + pte_t *pte; + pfn_t pfn; + bool pfn_valid = false; + + trace_kvm_access_fault(fault_ipa); + + spin_lock(&vcpu->kvm->mmu_lock); + + pmd = stage2_get_pmd(vcpu->kvm, NULL, fault_ipa); + if (!pmd || pmd_none(*pmd)) /* Nothing there */ + goto out; + + if (kvm_pmd_huge(*pmd)) { /* THP, HugeTLB */ + *pmd = pmd_mkyoung(*pmd); + pfn = pmd_pfn(*pmd); + pfn_valid = true; + goto out; + } + + pte = pte_offset_kernel(pmd, fault_ipa); + if (pte_none(*pte)) /* Nothing there either */ + goto out; + + *pte = pte_mkyoung(*pte); /* Just a page... */ + pfn = pte_pfn(*pte); + pfn_valid = true; +out: + spin_unlock(&vcpu->kvm->mmu_lock); + if (pfn_valid) + kvm_set_pfn_accessed(pfn); +} + /** * kvm_handle_guest_abort - handles all 2nd stage aborts * @vcpu: the VCPU pointer @@ -1333,7 +1407,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) /* Check the stage-2 fault is trans. fault or write fault */ fault_status = kvm_vcpu_trap_get_fault_type(vcpu); - if (fault_status != FSC_FAULT && fault_status != FSC_PERM) { + if (fault_status != FSC_FAULT && fault_status != FSC_PERM && + fault_status != FSC_ACCESS) { kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", kvm_vcpu_trap_get_class(vcpu), (unsigned long)kvm_vcpu_trap_get_fault(vcpu), @@ -1369,6 +1444,12 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) /* Userspace should not be able to register out-of-bounds IPAs */ VM_BUG_ON(fault_ipa >= KVM_PHYS_SIZE); + if (fault_status == FSC_ACCESS) { + handle_access_fault(vcpu, fault_ipa); + ret = 1; + goto out_unlock; + } + ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status); if (ret == 0) ret = 1; @@ -1377,15 +1458,16 @@ out_unlock: return ret; } -static void handle_hva_to_gpa(struct kvm *kvm, - unsigned long start, - unsigned long end, - void (*handler)(struct kvm *kvm, - gpa_t gpa, void *data), - void *data) +static int handle_hva_to_gpa(struct kvm *kvm, + unsigned long start, + unsigned long end, + int (*handler)(struct kvm *kvm, + gpa_t gpa, void *data), + void *data) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; + int ret = 0; slots = kvm_memslots(kvm); @@ -1409,14 +1491,17 @@ static void handle_hva_to_gpa(struct kvm *kvm, for (; gfn < gfn_end; ++gfn) { gpa_t gpa = gfn << PAGE_SHIFT; - handler(kvm, gpa, data); + ret |= handler(kvm, gpa, data); } } + + return ret; } -static void kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) +static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) { unmap_stage2_range(kvm, gpa, PAGE_SIZE); + return 0; } int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) @@ -1442,7 +1527,7 @@ int kvm_unmap_hva_range(struct kvm *kvm, return 0; } -static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) +static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) { pte_t *pte = (pte_t *)data; @@ -1454,6 +1539,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) * through this calling path. */ stage2_set_pte(kvm, NULL, gpa, pte, 0); + return 0; } @@ -1470,6 +1556,67 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); } +static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) +{ + pmd_t *pmd; + pte_t *pte; + + pmd = stage2_get_pmd(kvm, NULL, gpa); + if (!pmd || pmd_none(*pmd)) /* Nothing there */ + return 0; + + if (kvm_pmd_huge(*pmd)) { /* THP, HugeTLB */ + if (pmd_young(*pmd)) { + *pmd = pmd_mkold(*pmd); + return 1; + } + + return 0; + } + + pte = pte_offset_kernel(pmd, gpa); + if (pte_none(*pte)) + return 0; + + if (pte_young(*pte)) { + *pte = pte_mkold(*pte); /* Just a page... */ + return 1; + } + + return 0; +} + +static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) +{ + pmd_t *pmd; + pte_t *pte; + + pmd = stage2_get_pmd(kvm, NULL, gpa); + if (!pmd || pmd_none(*pmd)) /* Nothing there */ + return 0; + + if (kvm_pmd_huge(*pmd)) /* THP, HugeTLB */ + return pmd_young(*pmd); + + pte = pte_offset_kernel(pmd, gpa); + if (!pte_none(*pte)) /* Just a page... */ + return pte_young(*pte); + + return 0; +} + +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) +{ + trace_kvm_age_hva(start, end); + return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + trace_kvm_test_age_hva(hva); + return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL); +} + void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) { mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); @@ -1477,12 +1624,18 @@ void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) phys_addr_t kvm_mmu_get_httbr(void) { - return virt_to_phys(hyp_pgd); + if (__kvm_cpu_uses_extended_idmap()) + return virt_to_phys(merged_hyp_pgd); + else + return virt_to_phys(hyp_pgd); } phys_addr_t kvm_mmu_get_boot_httbr(void) { - return virt_to_phys(boot_hyp_pgd); + if (__kvm_cpu_uses_extended_idmap()) + return virt_to_phys(merged_hyp_pgd); + else + return virt_to_phys(boot_hyp_pgd); } phys_addr_t kvm_get_idmap_vector(void) @@ -1498,39 +1651,11 @@ int kvm_mmu_init(void) hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end); hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init); - if ((hyp_idmap_start ^ hyp_idmap_end) & PAGE_MASK) { - /* - * Our init code is crossing a page boundary. Allocate - * a bounce page, copy the code over and use that. - */ - size_t len = __hyp_idmap_text_end - __hyp_idmap_text_start; - phys_addr_t phys_base; - - init_bounce_page = (void *)__get_free_page(GFP_KERNEL); - if (!init_bounce_page) { - kvm_err("Couldn't allocate HYP init bounce page\n"); - err = -ENOMEM; - goto out; - } - - memcpy(init_bounce_page, __hyp_idmap_text_start, len); - /* - * Warning: the code we just copied to the bounce page - * must be flushed to the point of coherency. - * Otherwise, the data may be sitting in L2, and HYP - * mode won't be able to observe it as it runs with - * caches off at that point. - */ - kvm_flush_dcache_to_poc(init_bounce_page, len); - - phys_base = kvm_virt_to_phys(init_bounce_page); - hyp_idmap_vector += phys_base - hyp_idmap_start; - hyp_idmap_start = phys_base; - hyp_idmap_end = phys_base + len; - - kvm_info("Using HYP init bounce page @%lx\n", - (unsigned long)phys_base); - } + /* + * We rely on the linker script to ensure at build time that the HYP + * init code does not cross a page boundary. + */ + BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order); boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order); @@ -1553,6 +1678,17 @@ int kvm_mmu_init(void) goto out; } + if (__kvm_cpu_uses_extended_idmap()) { + merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + if (!merged_hyp_pgd) { + kvm_err("Failed to allocate extra HYP pgd\n"); + goto out; + } + __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd, + hyp_idmap_start); + return 0; + } + /* Map the very same page at the trampoline VA */ err = __create_hyp_mappings(boot_hyp_pgd, TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE, diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h index 881874b1a036..0ec35392d208 100644 --- a/arch/arm/kvm/trace.h +++ b/arch/arm/kvm/trace.h @@ -25,18 +25,22 @@ TRACE_EVENT(kvm_entry, ); TRACE_EVENT(kvm_exit, - TP_PROTO(unsigned long vcpu_pc), - TP_ARGS(vcpu_pc), + TP_PROTO(unsigned int exit_reason, unsigned long vcpu_pc), + TP_ARGS(exit_reason, vcpu_pc), TP_STRUCT__entry( + __field( unsigned int, exit_reason ) __field( unsigned long, vcpu_pc ) ), TP_fast_assign( + __entry->exit_reason = exit_reason; __entry->vcpu_pc = vcpu_pc; ), - TP_printk("PC: 0x%08lx", __entry->vcpu_pc) + TP_printk("HSR_EC: 0x%04x, PC: 0x%08lx", + __entry->exit_reason, + __entry->vcpu_pc) ); TRACE_EVENT(kvm_guest_fault, @@ -64,6 +68,21 @@ TRACE_EVENT(kvm_guest_fault, __entry->hxfar, __entry->vcpu_pc) ); +TRACE_EVENT(kvm_access_fault, + TP_PROTO(unsigned long ipa), + TP_ARGS(ipa), + + TP_STRUCT__entry( + __field( unsigned long, ipa ) + ), + + TP_fast_assign( + __entry->ipa = ipa; + ), + + TP_printk("IPA: %lx", __entry->ipa) +); + TRACE_EVENT(kvm_irq_line, TP_PROTO(unsigned int type, int vcpu_idx, int irq_num, int level), TP_ARGS(type, vcpu_idx, irq_num, level), @@ -206,6 +225,39 @@ TRACE_EVENT(kvm_set_spte_hva, TP_printk("mmu notifier set pte hva: %#08lx", __entry->hva) ); +TRACE_EVENT(kvm_age_hva, + TP_PROTO(unsigned long start, unsigned long end), + TP_ARGS(start, end), + + TP_STRUCT__entry( + __field( unsigned long, start ) + __field( unsigned long, end ) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + ), + + TP_printk("mmu notifier age hva: %#08lx -- %#08lx", + __entry->start, __entry->end) +); + +TRACE_EVENT(kvm_test_age_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("mmu notifier test age hva: %#08lx", __entry->hva) +); + TRACE_EVENT(kvm_hvc, TP_PROTO(unsigned long vcpu_pc, unsigned long r0, unsigned long imm), TP_ARGS(vcpu_pc, r0, imm), |