diff options
author | Paolo Bonzini <pbonzini@redhat.com> | 2018-12-19 22:33:55 +0300 |
---|---|---|
committer | Paolo Bonzini <pbonzini@redhat.com> | 2018-12-19 22:33:55 +0300 |
commit | 8c5e14f438b8cee47ff01fc1cadd15e3eed44b59 (patch) | |
tree | 8f5028e0fce8f3a73cb000b47e571f3788429d23 | |
parent | 5132411985e14b8dda69173abee8ea9ba942698f (diff) | |
parent | 8c33df1afd86c611da8473dc6fc5f3af3dabe984 (diff) | |
download | linux-8c5e14f438b8cee47ff01fc1cadd15e3eed44b59.tar.xz |
Merge tag 'kvmarm-for-v4.21' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD
KVM/arm updates for 4.21
- Large PUD support for HugeTLB
- Single-stepping fixes
- Improved tracing
- Various timer and vgic fixups
29 files changed, 595 insertions, 296 deletions
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 231e87ad45d5..35491af87985 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -23,6 +23,10 @@ #define ARM_EXIT_WITH_ABORT_BIT 31 #define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_ABORT_BIT)) +#define ARM_EXCEPTION_IS_TRAP(x) \ + (ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_PREF_ABORT || \ + ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_DATA_ABORT || \ + ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_HVC) #define ARM_ABORT_PENDING(x) !!((x) & (1U << ARM_EXIT_WITH_ABORT_BIT)) #define ARM_EXCEPTION_RESET 0 diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 5ca5d9af0c26..c5634c6ffcea 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -296,11 +296,6 @@ static inline void kvm_arm_init_debug(void) {} static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {} -static inline bool kvm_arm_handle_step_debug(struct kvm_vcpu *vcpu, - struct kvm_run *run) -{ - return false; -} int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 1098ffc3d54b..3a875fc1b63c 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -82,6 +82,67 @@ void kvm_clear_hyp_idmap(void); #define kvm_mk_pud(pmdp) __pud(__pa(pmdp) | PMD_TYPE_TABLE) #define kvm_mk_pgd(pudp) ({ BUILD_BUG(); 0; }) +#define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot) +#define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot) +#define kvm_pfn_pud(pfn, prot) (__pud(0)) + +#define kvm_pud_pfn(pud) ({ WARN_ON(1); 0; }) + + +#define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd) +/* No support for pud hugepages */ +#define kvm_pud_mkhuge(pud) ( {WARN_ON(1); pud; }) + +/* + * The following kvm_*pud*() functions are provided strictly to allow + * sharing code with arm64. They should never be called in practice. + */ +static inline void kvm_set_s2pud_readonly(pud_t *pud) +{ + WARN_ON(1); +} + +static inline bool kvm_s2pud_readonly(pud_t *pud) +{ + WARN_ON(1); + return false; +} + +static inline void kvm_set_pud(pud_t *pud, pud_t new_pud) +{ + WARN_ON(1); +} + +static inline pud_t kvm_s2pud_mkwrite(pud_t pud) +{ + WARN_ON(1); + return pud; +} + +static inline pud_t kvm_s2pud_mkexec(pud_t pud) +{ + WARN_ON(1); + return pud; +} + +static inline bool kvm_s2pud_exec(pud_t *pud) +{ + WARN_ON(1); + return false; +} + +static inline pud_t kvm_s2pud_mkyoung(pud_t pud) +{ + BUG(); + return pud; +} + +static inline bool kvm_s2pud_young(pud_t pud) +{ + WARN_ON(1); + return false; +} + static inline pte_t kvm_s2pte_mkwrite(pte_t pte) { pte_val(pte) |= L_PTE_S2_RDWR; diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h index f6a7ea805232..c4b1d4fb1797 100644 --- a/arch/arm/include/asm/stage2_pgtable.h +++ b/arch/arm/include/asm/stage2_pgtable.h @@ -68,4 +68,12 @@ stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) #define stage2_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp) #define stage2_pud_table_empty(kvm, pudp) false +static inline bool kvm_stage2_has_pud(struct kvm *kvm) +{ + return false; +} + +#define S2_PMD_MASK PMD_MASK +#define S2_PMD_SIZE PMD_SIZE + #endif /* __ARM_S2_PGTABLE_H_ */ diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index cb094e55dc5f..222c1635bc7a 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -602,8 +602,8 @@ static int emulate_cp15(struct kvm_vcpu *vcpu, } } else { /* If access function fails, it should complain. */ - kvm_err("Unsupported guest CP15 access at: %08lx\n", - *vcpu_pc(vcpu)); + kvm_err("Unsupported guest CP15 access at: %08lx [%08lx]\n", + *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); print_cp_instr(params); kvm_inject_undefined(vcpu); } diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 6f602af5263c..9c1a065b78ea 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -104,7 +104,7 @@ TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK | TCR_EL2_T0SZ_MASK) /* VTCR_EL2 Registers bits */ -#define VTCR_EL2_RES1 (1 << 31) +#define VTCR_EL2_RES1 (1U << 31) #define VTCR_EL2_HD (1 << 22) #define VTCR_EL2_HA (1 << 21) #define VTCR_EL2_PS_SHIFT TCR_EL2_PS_SHIFT @@ -320,10 +320,6 @@ #define PAR_TO_HPFAR(par) \ (((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8) -#define kvm_arm_exception_type \ - {0, "IRQ" }, \ - {1, "TRAP" } - #define ECN(x) { ESR_ELx_EC_##x, #x } #define kvm_arm_exception_class \ diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index aea01a09eb94..f5b79e995f40 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -25,6 +25,7 @@ #define ARM_EXIT_WITH_SERROR_BIT 31 #define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT)) +#define ARM_EXCEPTION_IS_TRAP(x) (ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_TRAP) #define ARM_SERROR_PENDING(x) !!((x) & (1U << ARM_EXIT_WITH_SERROR_BIT)) #define ARM_EXCEPTION_IRQ 0 @@ -34,6 +35,12 @@ /* The hyp-stub will return this for any kvm_call_hyp() call */ #define ARM_EXCEPTION_HYP_GONE HVC_STUB_ERR +#define kvm_arm_exception_type \ + {ARM_EXCEPTION_IRQ, "IRQ" }, \ + {ARM_EXCEPTION_EL1_SERROR, "SERROR" }, \ + {ARM_EXCEPTION_TRAP, "TRAP" }, \ + {ARM_EXCEPTION_HYP_GONE, "HYP_GONE" } + #ifndef __ASSEMBLY__ #include <linux/mm.h> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 21247870def7..506386a3edde 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -24,6 +24,7 @@ #include <linux/kvm_host.h> +#include <asm/debug-monitors.h> #include <asm/esr.h> #include <asm/kvm_arm.h> #include <asm/kvm_hyp.h> @@ -147,14 +148,6 @@ static inline bool kvm_condition_valid(const struct kvm_vcpu *vcpu) return true; } -static inline void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr) -{ - if (vcpu_mode_is_32bit(vcpu)) - kvm_skip_instr32(vcpu, is_wide_instr); - else - *vcpu_pc(vcpu) += 4; -} - static inline void vcpu_set_thumb(struct kvm_vcpu *vcpu) { *vcpu_cpsr(vcpu) |= PSR_AA32_T_BIT; @@ -424,4 +417,30 @@ static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu, return data; /* Leave LE untouched */ } +static inline void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr) +{ + if (vcpu_mode_is_32bit(vcpu)) + kvm_skip_instr32(vcpu, is_wide_instr); + else + *vcpu_pc(vcpu) += 4; + + /* advance the singlestep state machine */ + *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; +} + +/* + * Skip an instruction which has been emulated at hyp while most guest sysregs + * are live. + */ +static inline void __hyp_text __kvm_skip_instr(struct kvm_vcpu *vcpu) +{ + *vcpu_pc(vcpu) = read_sysreg_el2(elr); + vcpu->arch.ctxt.gp_regs.regs.pstate = read_sysreg_el2(spsr); + + kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + + write_sysreg_el2(vcpu->arch.ctxt.gp_regs.regs.pstate, spsr); + write_sysreg_el2(*vcpu_pc(vcpu), elr); +} + #endif /* __ARM64_KVM_EMULATE_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 52fbc823ff8c..c22e50ba3473 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -319,7 +319,7 @@ struct kvm_vcpu_arch { */ #define __vcpu_sys_reg(v,r) ((v)->arch.ctxt.sys_regs[(r)]) -u64 vcpu_read_sys_reg(struct kvm_vcpu *vcpu, int reg); +u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg); void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg); /* @@ -445,7 +445,6 @@ void kvm_arm_init_debug(void); void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); -bool kvm_arm_handle_step_debug(struct kvm_vcpu *vcpu, struct kvm_run *run); int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 658657367f2f..8af4b1befa42 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -184,6 +184,17 @@ void kvm_clear_hyp_idmap(void); #define kvm_mk_pgd(pudp) \ __pgd(__phys_to_pgd_val(__pa(pudp)) | PUD_TYPE_TABLE) +#define kvm_set_pud(pudp, pud) set_pud(pudp, pud) + +#define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot) +#define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot) +#define kvm_pfn_pud(pfn, prot) pfn_pud(pfn, prot) + +#define kvm_pud_pfn(pud) pud_pfn(pud) + +#define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd) +#define kvm_pud_mkhuge(pud) pud_mkhuge(pud) + static inline pte_t kvm_s2pte_mkwrite(pte_t pte) { pte_val(pte) |= PTE_S2_RDWR; @@ -196,6 +207,12 @@ static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd) return pmd; } +static inline pud_t kvm_s2pud_mkwrite(pud_t pud) +{ + pud_val(pud) |= PUD_S2_RDWR; + return pud; +} + static inline pte_t kvm_s2pte_mkexec(pte_t pte) { pte_val(pte) &= ~PTE_S2_XN; @@ -208,6 +225,12 @@ static inline pmd_t kvm_s2pmd_mkexec(pmd_t pmd) return pmd; } +static inline pud_t kvm_s2pud_mkexec(pud_t pud) +{ + pud_val(pud) &= ~PUD_S2_XN; + return pud; +} + static inline void kvm_set_s2pte_readonly(pte_t *ptep) { pteval_t old_pteval, pteval; @@ -246,6 +269,31 @@ static inline bool kvm_s2pmd_exec(pmd_t *pmdp) return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN); } +static inline void kvm_set_s2pud_readonly(pud_t *pudp) +{ + kvm_set_s2pte_readonly((pte_t *)pudp); +} + +static inline bool kvm_s2pud_readonly(pud_t *pudp) +{ + return kvm_s2pte_readonly((pte_t *)pudp); +} + +static inline bool kvm_s2pud_exec(pud_t *pudp) +{ + return !(READ_ONCE(pud_val(*pudp)) & PUD_S2_XN); +} + +static inline pud_t kvm_s2pud_mkyoung(pud_t pud) +{ + return pud_mkyoung(pud); +} + +static inline bool kvm_s2pud_young(pud_t pud) +{ + return pud_young(pud); +} + #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) #ifdef __PAGETABLE_PMD_FOLDED diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 1d7d8da2ef9b..6f1c187f1c86 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -193,6 +193,10 @@ #define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ #define PMD_S2_XN (_AT(pmdval_t, 2) << 53) /* XN[1:0] */ +#define PUD_S2_RDONLY (_AT(pudval_t, 1) << 6) /* HAP[2:1] */ +#define PUD_S2_RDWR (_AT(pudval_t, 3) << 6) /* HAP[2:1] */ +#define PUD_S2_XN (_AT(pudval_t, 2) << 53) /* XN[1:0] */ + /* * Memory Attribute override for Stage-2 (MemAttr[3:0]) */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 50b1ef8584c0..576128635f3c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -314,6 +314,11 @@ static inline pte_t pud_pte(pud_t pud) return __pte(pud_val(pud)); } +static inline pud_t pte_pud(pte_t pte) +{ + return __pud(pte_val(pte)); +} + static inline pmd_t pud_pmd(pud_t pud) { return __pmd(pud_val(pud)); @@ -381,8 +386,12 @@ static inline int pmd_protnone(pmd_t pmd) #define pfn_pmd(pfn,prot) __pmd(__phys_to_pmd_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) +#define pud_young(pud) pte_young(pud_pte(pud)) +#define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud))) #define pud_write(pud) pte_write(pud_pte(pud)) +#define pud_mkhuge(pud) (__pud(pud_val(pud) & ~PUD_TABLE_BIT)) + #define __pud_to_phys(pud) __pte_to_phys(pud_pte(pud)) #define __phys_to_pud_val(phys) __phys_to_pte_val(phys) #define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT) diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index d352f6df8d2c..5412fa40825e 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -30,16 +30,14 @@ #define pt_levels_pgdir_shift(lvls) ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (lvls)) /* - * The hardware supports concatenation of up to 16 tables at stage2 entry level - * and we use the feature whenever possible. + * The hardware supports concatenation of up to 16 tables at stage2 entry + * level and we use the feature whenever possible, which means we resolve 4 + * additional bits of address at the entry level. * - * Now, the minimum number of bits resolved at any level is (PAGE_SHIFT - 3). - * On arm64, the smallest PAGE_SIZE supported is 4k, which means - * (PAGE_SHIFT - 3) > 4 holds for all page sizes. - * This implies, the total number of page table levels at stage2 expected - * by the hardware is actually the number of levels required for (IPA_SHIFT - 4) - * in normal translations(e.g, stage1), since we cannot have another level in - * the range (IPA_SHIFT, IPA_SHIFT - 4). + * This implies, the total number of page table levels required for + * IPA_SHIFT at stage2 expected by the hardware can be calculated using + * the same logic used for the (non-collapsable) stage1 page tables but for + * (IPA_SHIFT - 4). */ #define stage2_pgtable_levels(ipa) ARM64_HW_PGTABLE_LEVELS((ipa) - 4) #define kvm_stage2_levels(kvm) VTCR_EL2_LVLS(kvm->arch.vtcr) diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 00d422336a45..f39801e4136c 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -236,24 +236,3 @@ void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) } } } - - -/* - * After successfully emulating an instruction, we might want to - * return to user space with a KVM_EXIT_DEBUG. We can only do this - * once the emulation is complete, though, so for userspace emulations - * we have to wait until we have re-entered KVM before calling this - * helper. - * - * Return true (and set exit_reason) to return to userspace or false - * if no further action is required. - */ -bool kvm_arm_handle_step_debug(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - run->exit_reason = KVM_EXIT_DEBUG; - run->debug.arch.hsr = ESR_ELx_EC_SOFTSTP_LOW << ESR_ELx_EC_SHIFT; - return true; - } - return false; -} diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 35a81bebd02b..b0643f9c4873 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -229,13 +229,6 @@ static int handle_trap_exceptions(struct kvm_vcpu *vcpu, struct kvm_run *run) handled = exit_handler(vcpu, run); } - /* - * kvm_arm_handle_step_debug() sets the exit_reason on the kvm_run - * structure if we need to return to userspace. - */ - if (handled > 0 && kvm_arm_handle_step_debug(vcpu, run)) - handled = 0; - return handled; } @@ -269,12 +262,7 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, case ARM_EXCEPTION_IRQ: return 1; case ARM_EXCEPTION_EL1_SERROR: - /* We may still need to return for single-step */ - if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS) - && kvm_arm_handle_step_debug(vcpu, run)) - return 0; - else - return 1; + return 1; case ARM_EXCEPTION_TRAP: return handle_trap_exceptions(vcpu, run); case ARM_EXCEPTION_HYP_GONE: diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 7cc175c88a37..4282f05771c1 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -305,33 +305,6 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu) return true; } -/* Skip an instruction which has been emulated. Returns true if - * execution can continue or false if we need to exit hyp mode because - * single-step was in effect. - */ -static bool __hyp_text __skip_instr(struct kvm_vcpu *vcpu) -{ - *vcpu_pc(vcpu) = read_sysreg_el2(elr); - - if (vcpu_mode_is_32bit(vcpu)) { - vcpu->arch.ctxt.gp_regs.regs.pstate = read_sysreg_el2(spsr); - kvm_skip_instr32(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); - write_sysreg_el2(vcpu->arch.ctxt.gp_regs.regs.pstate, spsr); - } else { - *vcpu_pc(vcpu) += 4; - } - - write_sysreg_el2(*vcpu_pc(vcpu), elr); - - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - vcpu->arch.fault.esr_el2 = - (ESR_ELx_EC_SOFTSTP_LOW << ESR_ELx_EC_SHIFT) | 0x22; - return false; - } else { - return true; - } -} - static bool __hyp_text __hyp_switch_fpsimd(struct kvm_vcpu *vcpu) { struct user_fpsimd_state *host_fpsimd = vcpu->arch.host_fpsimd_state; @@ -420,20 +393,12 @@ static bool __hyp_text fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) if (valid) { int ret = __vgic_v2_perform_cpuif_access(vcpu); - if (ret == 1 && __skip_instr(vcpu)) + if (ret == 1) return true; - if (ret == -1) { - /* Promote an illegal access to an - * SError. If we would be returning - * due to single-step clear the SS - * bit so handle_exit knows what to - * do after dealing with the error. - */ - if (!__skip_instr(vcpu)) - *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; + /* Promote an illegal access to an SError.*/ + if (ret == -1) *exit_code = ARM_EXCEPTION_EL1_SERROR; - } goto exit; } @@ -444,7 +409,7 @@ static bool __hyp_text fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_CP15_32)) { int ret = __vgic_v3_perform_cpuif_access(vcpu); - if (ret == 1 && __skip_instr(vcpu)) + if (ret == 1) return true; } diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c index 215c7c0eb3b0..9cbdd034a563 100644 --- a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c +++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c @@ -41,7 +41,7 @@ static bool __hyp_text __is_be(struct kvm_vcpu *vcpu) * Returns: * 1: GICV access successfully performed * 0: Not a GICV access - * -1: Illegal GICV access + * -1: Illegal GICV access successfully performed */ int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) { @@ -61,12 +61,16 @@ int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) return 0; /* Reject anything but a 32bit access */ - if (kvm_vcpu_dabt_get_as(vcpu) != sizeof(u32)) + if (kvm_vcpu_dabt_get_as(vcpu) != sizeof(u32)) { + __kvm_skip_instr(vcpu); return -1; + } /* Not aligned? Don't bother */ - if (fault_ipa & 3) + if (fault_ipa & 3) { + __kvm_skip_instr(vcpu); return -1; + } rd = kvm_vcpu_dabt_get_rd(vcpu); addr = hyp_symbol_addr(kvm_vgic_global_state)->vcpu_hyp_va; @@ -88,5 +92,7 @@ int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) vcpu_set_reg(vcpu, rd, data); } + __kvm_skip_instr(vcpu); + return 1; } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 22fbbdbece3c..a1f57fcb1bdb 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -76,7 +76,7 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu, return false; } -u64 vcpu_read_sys_reg(struct kvm_vcpu *vcpu, int reg) +u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) { if (!vcpu->arch.sysregs_loaded_on_cpu) goto immediate_read; @@ -1850,6 +1850,8 @@ static void perform_access(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *r) { + trace_kvm_sys_access(*vcpu_pc(vcpu), params, r); + /* * Not having an accessor means that we have configured a trap * that we don't know how to handle. This certainly qualifies @@ -1912,8 +1914,8 @@ static void unhandled_cp_access(struct kvm_vcpu *vcpu, WARN_ON(1); } - kvm_err("Unsupported guest CP%d access at: %08lx\n", - cp, *vcpu_pc(vcpu)); + kvm_err("Unsupported guest CP%d access at: %08lx [%08lx]\n", + cp, *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); print_sys_reg_instr(params); kvm_inject_undefined(vcpu); } @@ -2063,8 +2065,8 @@ static int emulate_sys_reg(struct kvm_vcpu *vcpu, if (likely(r)) { perform_access(vcpu, params, r); } else { - kvm_err("Unsupported guest sys_reg access at: %lx\n", - *vcpu_pc(vcpu)); + kvm_err("Unsupported guest sys_reg access at: %lx [%08lx]\n", + *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); print_sys_reg_instr(params); kvm_inject_undefined(vcpu); } diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index cd710f8b63e0..3b1bc7f01d0b 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -35,6 +35,9 @@ struct sys_reg_params { }; struct sys_reg_desc { + /* Sysreg string for debug */ + const char *name; + /* MRS/MSR instruction which accesses it. */ u8 Op0; u8 Op1; @@ -130,6 +133,7 @@ const struct sys_reg_desc *find_reg_by_id(u64 id, #define Op2(_x) .Op2 = _x #define SYS_DESC(reg) \ + .name = #reg, \ Op0(sys_reg_Op0(reg)), Op1(sys_reg_Op1(reg)), \ CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ Op2(sys_reg_Op2(reg)) diff --git a/arch/arm64/kvm/trace.h b/arch/arm64/kvm/trace.h index 3b82fb1ddd09..eab91ad0effb 100644 --- a/arch/arm64/kvm/trace.h +++ b/arch/arm64/kvm/trace.h @@ -3,6 +3,7 @@ #define _TRACE_ARM64_KVM_H #include <linux/tracepoint.h> +#include "sys_regs.h" #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm @@ -152,6 +153,40 @@ TRACE_EVENT(kvm_handle_sys_reg, TP_printk("HSR 0x%08lx", __entry->hsr) ); +TRACE_EVENT(kvm_sys_access, + TP_PROTO(unsigned long vcpu_pc, struct sys_reg_params *params, const struct sys_reg_desc *reg), + TP_ARGS(vcpu_pc, params, reg), + + TP_STRUCT__entry( + __field(unsigned long, vcpu_pc) + __field(bool, is_write) + __field(const char *, name) + __field(u8, Op0) + __field(u8, Op1) + __field(u8, CRn) + __field(u8, CRm) + __field(u8, Op2) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->is_write = params->is_write; + __entry->name = reg->name; + __entry->Op0 = reg->Op0; + __entry->Op0 = reg->Op0; + __entry->Op1 = reg->Op1; + __entry->CRn = reg->CRn; + __entry->CRm = reg->CRm; + __entry->Op2 = reg->Op2; + ), + + TP_printk("PC: %lx %s (%d,%d,%d,%d,%d) %s", + __entry->vcpu_pc, __entry->name ?: "UNKN", + __entry->Op0, __entry->Op1, __entry->CRn, + __entry->CRm, __entry->Op2, + __entry->is_write ? "write" : "read") +); + TRACE_EVENT(kvm_set_guest_debug, TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug), TP_ARGS(vcpu, guest_debug), diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index 6502feb9524b..33771352dcd6 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -21,7 +21,6 @@ #include <linux/clocksource.h> #include <linux/hrtimer.h> -#include <linux/workqueue.h> struct arch_timer_context { /* Registers: control register, timer value */ @@ -52,9 +51,6 @@ struct arch_timer_cpu { /* Background timer used when the guest is not running */ struct hrtimer bg_timer; - /* Work queued with the above timer expires */ - struct work_struct expired; - /* Physical timer emulation */ struct hrtimer phys_timer; diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 17cecc96f735..b07ac4614e1c 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -70,11 +70,9 @@ static void soft_timer_start(struct hrtimer *hrt, u64 ns) HRTIMER_MODE_ABS); } -static void soft_timer_cancel(struct hrtimer *hrt, struct work_struct *work) +static void soft_timer_cancel(struct hrtimer *hrt) { hrtimer_cancel(hrt); - if (work) - cancel_work_sync(work); } static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) @@ -102,23 +100,6 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) return IRQ_HANDLED; } -/* - * Work function for handling the backup timer that we schedule when a vcpu is - * no longer running, but had a timer programmed to fire in the future. - */ -static void kvm_timer_inject_irq_work(struct work_struct *work) -{ - struct kvm_vcpu *vcpu; - - vcpu = container_of(work, struct kvm_vcpu, arch.timer_cpu.expired); - - /* - * If the vcpu is blocked we want to wake it up so that it will see - * the timer has expired when entering the guest. - */ - kvm_vcpu_wake_up(vcpu); -} - static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) { u64 cval, now; @@ -188,7 +169,7 @@ static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt) return HRTIMER_RESTART; } - schedule_work(&timer->expired); + kvm_vcpu_wake_up(vcpu); return HRTIMER_NORESTART; } @@ -300,7 +281,7 @@ static void phys_timer_emulate(struct kvm_vcpu *vcpu) * then we also don't need a soft timer. */ if (kvm_timer_should_fire(ptimer) || !kvm_timer_irq_can_fire(ptimer)) { - soft_timer_cancel(&timer->phys_timer, NULL); + soft_timer_cancel(&timer->phys_timer); return; } @@ -426,7 +407,7 @@ void kvm_timer_unschedule(struct kvm_vcpu *vcpu) vtimer_restore_state(vcpu); - soft_timer_cancel(&timer->bg_timer, &timer->expired); + soft_timer_cancel(&timer->bg_timer); } static void set_cntvoff(u64 cntvoff) @@ -544,7 +525,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) * In any case, we re-schedule the hrtimer for the physical timer when * coming back to the VCPU thread in kvm_timer_vcpu_load(). */ - soft_timer_cancel(&timer->phys_timer, NULL); + soft_timer_cancel(&timer->phys_timer); /* * The kernel may decide to run userspace after calling vcpu_put, so @@ -637,7 +618,6 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) update_vtimer_cntvoff(vcpu, kvm_phys_timer_read()); vcpu_ptimer(vcpu)->cntvoff = 0; - INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); timer->bg_timer.function = kvm_bg_timer_expire; @@ -792,11 +772,8 @@ out_free_irq: void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - soft_timer_cancel(&timer->bg_timer, &timer->expired); - soft_timer_cancel(&timer->phys_timer, NULL); - kvm_vgic_unmap_phys_irq(vcpu, vtimer->irq.irq); + soft_timer_cancel(&timer->bg_timer); } static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index e91adf77d99a..87ecb5f3637d 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -66,7 +66,7 @@ static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu); static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); static u32 kvm_next_vmid; static unsigned int kvm_vmid_bits __read_mostly; -static DEFINE_RWLOCK(kvm_vmid_lock); +static DEFINE_SPINLOCK(kvm_vmid_lock); static bool vgic_present; @@ -484,7 +484,9 @@ void force_vm_exit(const cpumask_t *mask) */ static bool need_new_vmid_gen(struct kvm *kvm) { - return unlikely(kvm->arch.vmid_gen != atomic64_read(&kvm_vmid_gen)); + u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen); + smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */ + return unlikely(READ_ONCE(kvm->arch.vmid_gen) != current_vmid_gen); } /** @@ -499,16 +501,11 @@ static void update_vttbr(struct kvm *kvm) { phys_addr_t pgd_phys; u64 vmid, cnp = kvm_cpu_has_cnp() ? VTTBR_CNP_BIT : 0; - bool new_gen; - read_lock(&kvm_vmid_lock); - new_gen = need_new_vmid_gen(kvm); - read_unlock(&kvm_vmid_lock); - - if (!new_gen) + if (!need_new_vmid_gen(kvm)) return; - write_lock(&kvm_vmid_lock); + spin_lock(&kvm_vmid_lock); /* * We need to re-check the vmid_gen here to ensure that if another vcpu @@ -516,7 +513,7 @@ static void update_vttbr(struct kvm *kvm) * use the same vmid. */ if (!need_new_vmid_gen(kvm)) { - write_unlock(&kvm_vmid_lock); + spin_unlock(&kvm_vmid_lock); return; } @@ -539,7 +536,6 @@ static void update_vttbr(struct kvm *kvm) kvm_call_hyp(__kvm_flush_vm_context); } - kvm->arch.vmid_gen = atomic64_read(&kvm_vmid_gen); kvm->arch.vmid = kvm_next_vmid; kvm_next_vmid++; kvm_next_vmid &= (1 << kvm_vmid_bits) - 1; @@ -550,7 +546,10 @@ static void update_vttbr(struct kvm *kvm) vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits); kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid | cnp; - write_unlock(&kvm_vmid_lock); + smp_wmb(); + WRITE_ONCE(kvm->arch.vmid_gen, atomic64_read(&kvm_vmid_gen)); + + spin_unlock(&kvm_vmid_lock); } static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) @@ -674,8 +673,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) ret = kvm_handle_mmio_return(vcpu, vcpu->run); if (ret) return ret; - if (kvm_arm_handle_step_debug(vcpu, vcpu->run)) - return 0; } if (run->immediate_exit) diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c index 616e5a433ab0..9652c453480f 100644 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ b/virt/kvm/arm/hyp/vgic-v3-sr.c @@ -1012,8 +1012,10 @@ int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) esr = kvm_vcpu_get_hsr(vcpu); if (vcpu_mode_is_32bit(vcpu)) { - if (!kvm_condition_valid(vcpu)) + if (!kvm_condition_valid(vcpu)) { + __kvm_skip_instr(vcpu); return 1; + } sysreg = esr_cp15_to_sysreg(esr); } else { @@ -1123,6 +1125,8 @@ int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) rt = kvm_vcpu_sys_get_rt(vcpu); fn(vcpu, vmcr, rt); + __kvm_skip_instr(vcpu); + return 1; } diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c index dac7ceb1a677..08443a15e6be 100644 --- a/virt/kvm/arm/mmio.c +++ b/virt/kvm/arm/mmio.c @@ -117,6 +117,12 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) vcpu_set_reg(vcpu, vcpu->arch.mmio_decode.rt, data); } + /* + * The MMIO instruction is emulated and should not be re-executed + * in the guest. + */ + kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + return 0; } @@ -144,11 +150,6 @@ static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len) vcpu->arch.mmio_decode.sign_extend = sign_extend; vcpu->arch.mmio_decode.rt = rt; - /* - * The MMIO instruction is emulated and should not be re-executed - * in the guest. - */ - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); return 0; } diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 5eca48bdb1a6..dee3dbd98712 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -115,6 +115,25 @@ static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) put_page(virt_to_page(pmd)); } +/** + * stage2_dissolve_pud() - clear and flush huge PUD entry + * @kvm: pointer to kvm structure. + * @addr: IPA + * @pud: pud pointer for IPA + * + * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. Marks all + * pages in the range dirty. + */ +static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp) +{ + if (!stage2_pud_huge(kvm, *pudp)) + return; + + stage2_pud_clear(kvm, pudp); + kvm_tlb_flush_vmid_ipa(kvm, addr); + put_page(virt_to_page(pudp)); +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { @@ -607,7 +626,7 @@ static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, addr = start; do { pte = pte_offset_kernel(pmd, addr); - kvm_set_pte(pte, pfn_pte(pfn, prot)); + kvm_set_pte(pte, kvm_pfn_pte(pfn, prot)); get_page(virt_to_page(pte)); pfn++; } while (addr += PAGE_SIZE, addr != end); @@ -1022,7 +1041,7 @@ static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache pmd_t *pmd; pud = stage2_get_pud(kvm, cache, addr); - if (!pud) + if (!pud || stage2_pud_huge(kvm, *pud)) return NULL; if (stage2_pud_none(kvm, *pud)) { @@ -1083,29 +1102,103 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache return 0; } -static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr) +static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + phys_addr_t addr, const pud_t *new_pudp) { + pud_t *pudp, old_pud; + + pudp = stage2_get_pud(kvm, cache, addr); + VM_BUG_ON(!pudp); + + old_pud = *pudp; + + /* + * A large number of vcpus faulting on the same stage 2 entry, + * can lead to a refault due to the + * stage2_pud_clear()/tlb_flush(). Skip updating the page + * tables if there is no change. + */ + if (pud_val(old_pud) == pud_val(*new_pudp)) + return 0; + + if (stage2_pud_present(kvm, old_pud)) { + stage2_pud_clear(kvm, pudp); + kvm_tlb_flush_vmid_ipa(kvm, addr); + } else { + get_page(virt_to_page(pudp)); + } + + kvm_set_pud(pudp, *new_pudp); + return 0; +} + +/* + * stage2_get_leaf_entry - walk the stage2 VM page tables and return + * true if a valid and present leaf-entry is found. A pointer to the + * leaf-entry is returned in the appropriate level variable - pudpp, + * pmdpp, ptepp. + */ +static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr, + pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp) +{ + pud_t *pudp; pmd_t *pmdp; pte_t *ptep; - pmdp = stage2_get_pmd(kvm, NULL, addr); + *pudpp = NULL; + *pmdpp = NULL; + *ptepp = NULL; + + pudp = stage2_get_pud(kvm, NULL, addr); + if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp)) + return false; + + if (stage2_pud_huge(kvm, *pudp)) { + *pudpp = pudp; + return true; + } + + pmdp = stage2_pmd_offset(kvm, pudp, addr); if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp)) return false; - if (pmd_thp_or_huge(*pmdp)) - return kvm_s2pmd_exec(pmdp); + if (pmd_thp_or_huge(*pmdp)) { + *pmdpp = pmdp; + return true; + } ptep = pte_offset_kernel(pmdp, addr); if (!ptep || pte_none(*ptep) || !pte_present(*ptep)) return false; - return kvm_s2pte_exec(ptep); + *ptepp = ptep; + return true; +} + +static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr) +{ + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + bool found; + + found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep); + if (!found) + return false; + + if (pudp) + return kvm_s2pud_exec(pudp); + else if (pmdp) + return kvm_s2pmd_exec(pmdp); + else + return kvm_s2pte_exec(ptep); } static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, phys_addr_t addr, const pte_t *new_pte, unsigned long flags) { + pud_t *pud; pmd_t *pmd; pte_t *pte, old_pte; bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP; @@ -1114,7 +1207,31 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, VM_BUG_ON(logging_active && !cache); /* Create stage-2 page table mapping - Levels 0 and 1 */ - pmd = stage2_get_pmd(kvm, cache, addr); + pud = stage2_get_pud(kvm, cache, addr); + if (!pud) { + /* + * Ignore calls from kvm_set_spte_hva for unallocated + * address ranges. + */ + return 0; + } + + /* + * While dirty page logging - dissolve huge PUD, then continue + * on to allocate page. + */ + if (logging_active) + stage2_dissolve_pud(kvm, addr, pud); + + if (stage2_pud_none(kvm, *pud)) { + if (!cache) + return 0; /* ignore calls from kvm_set_spte_hva */ + pmd = mmu_memory_cache_alloc(cache); + stage2_pud_populate(kvm, pud, pmd); + get_page(virt_to_page(pud)); + } + + pmd = stage2_pmd_offset(kvm, pud, addr); if (!pmd) { /* * Ignore calls from kvm_set_spte_hva for unallocated @@ -1182,6 +1299,11 @@ static int stage2_pmdp_test_and_clear_young(pmd_t *pmd) return stage2_ptep_test_and_clear_young((pte_t *)pmd); } +static int stage2_pudp_test_and_clear_young(pud_t *pud) +{ + return stage2_ptep_test_and_clear_young((pte_t *)pud); +} + /** * kvm_phys_addr_ioremap - map a device range to guest IPA * @@ -1202,7 +1324,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, pfn = __phys_to_pfn(pa); for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) { - pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE); + pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE); if (writable) pte = kvm_s2pte_mkwrite(pte); @@ -1234,7 +1356,7 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap) struct page *page = pfn_to_page(pfn); /* - * PageTransCompoungMap() returns true for THP and + * PageTransCompoundMap() returns true for THP and * hugetlbfs. Make sure the adjustment is done only for THP * pages. */ @@ -1347,9 +1469,12 @@ static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, do { next = stage2_pud_addr_end(kvm, addr, end); if (!stage2_pud_none(kvm, *pud)) { - /* TODO:PUD not supported, revisit later if supported */ - BUG_ON(stage2_pud_huge(kvm, *pud)); - stage2_wp_pmds(kvm, pud, addr, next); + if (stage2_pud_huge(kvm, *pud)) { + if (!kvm_s2pud_readonly(pud)) + kvm_set_s2pud_readonly(pud); + } else { + stage2_wp_pmds(kvm, pud, addr, next); + } } } while (pud++, addr = next, addr != end); } @@ -1392,7 +1517,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) * * Called to start logging dirty pages after memory region * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns - * all present PMD and PTEs are write protected in the memory region. + * all present PUD, PMD and PTEs are write protected in the memory region. * Afterwards read of dirty page log can be called. * * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, @@ -1470,12 +1595,70 @@ static void kvm_send_hwpoison_signal(unsigned long address, send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); } +static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot, + unsigned long hva) +{ + gpa_t gpa_start, gpa_end; + hva_t uaddr_start, uaddr_end; + size_t size; + + size = memslot->npages * PAGE_SIZE; + + gpa_start = memslot->base_gfn << PAGE_SHIFT; + gpa_end = gpa_start + size; + + uaddr_start = memslot->userspace_addr; + uaddr_end = uaddr_start + size; + + /* + * Pages belonging to memslots that don't have the same alignment + * within a PMD for userspace and IPA cannot be mapped with stage-2 + * PMD entries, because we'll end up mapping the wrong pages. + * + * Consider a layout like the following: + * + * memslot->userspace_addr: + * +-----+--------------------+--------------------+---+ + * |abcde|fgh Stage-1 PMD | Stage-1 PMD tv|xyz| + * +-----+--------------------+--------------------+---+ + * + * memslot->base_gfn << PAGE_SIZE: + * +---+--------------------+--------------------+-----+ + * |abc|def Stage-2 PMD | Stage-2 PMD |tvxyz| + * +---+--------------------+--------------------+-----+ + * + * If we create those stage-2 PMDs, we'll end up with this incorrect + * mapping: + * d -> f + * e -> g + * f -> h + */ + if ((gpa_start & ~S2_PMD_MASK) != (uaddr_start & ~S2_PMD_MASK)) + return false; + + /* + * Next, let's make sure we're not trying to map anything not covered + * by the memslot. This means we have to prohibit PMD size mappings + * for the beginning and end of a non-PMD aligned and non-PMD sized + * memory slot (illustrated by the head and tail parts of the + * userspace view above containing pages 'abcde' and 'xyz', + * respectively). + * + * Note that it doesn't matter if we do the check using the + * userspace_addr or the base_gfn, as both are equally aligned (per + * the check above) and equally sized. + */ + return (hva & S2_PMD_MASK) >= uaddr_start && + (hva & S2_PMD_MASK) + S2_PMD_SIZE <= uaddr_end; +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_memory_slot *memslot, unsigned long hva, unsigned long fault_status) { int ret; - bool write_fault, exec_fault, writable, hugetlb = false, force_pte = false; + bool write_fault, writable, force_pte = false; + bool exec_fault, needs_exec; unsigned long mmu_seq; gfn_t gfn = fault_ipa >> PAGE_SHIFT; struct kvm *kvm = vcpu->kvm; @@ -1484,7 +1667,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, kvm_pfn_t pfn; pgprot_t mem_type = PAGE_S2; bool logging_active = memslot_is_logging(memslot); - unsigned long flags = 0; + unsigned long vma_pagesize, flags = 0; write_fault = kvm_is_write_fault(vcpu); exec_fault = kvm_vcpu_trap_is_iabt(vcpu); @@ -1495,6 +1678,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; } + if (!fault_supports_stage2_pmd_mappings(memslot, hva)) + force_pte = true; + + if (logging_active) + force_pte = true; + /* Let's check if we will get back a huge page backed by hugetlbfs */ down_read(¤t->mm->mmap_sem); vma = find_vma_intersection(current->mm, hva, hva + 1); @@ -1504,22 +1693,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; } - if (vma_kernel_pagesize(vma) == PMD_SIZE && !logging_active) { - hugetlb = true; - gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; - } else { - /* - * Pages belonging to memslots that don't have the same - * alignment for userspace and IPA cannot be mapped using - * block descriptors even if the pages belong to a THP for - * the process, because the stage-2 block descriptor will - * cover more than a single THP and we loose atomicity for - * unmapping, updates, and splits of the THP or other pages - * in the stage-2 block range. - */ - if ((memslot->userspace_addr & ~PMD_MASK) != - ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK)) - force_pte = true; + vma_pagesize = vma_kernel_pagesize(vma); + /* + * PUD level may not exist for a VM but PMD is guaranteed to + * exist. + */ + if ((vma_pagesize == PMD_SIZE || + (vma_pagesize == PUD_SIZE && kvm_stage2_has_pud(kvm))) && + !force_pte) { + gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT; } up_read(¤t->mm->mmap_sem); @@ -1558,7 +1740,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * should not be mapped with huge pages (it introduces churn * and performance degradation), so force a pte mapping. */ - force_pte = true; flags |= KVM_S2_FLAG_LOGGING_ACTIVE; /* @@ -1573,50 +1754,69 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (!hugetlb && !force_pte) - hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa); + if (vma_pagesize == PAGE_SIZE && !force_pte) { + /* + * Only PMD_SIZE transparent hugepages(THP) are + * currently supported. This code will need to be + * updated to support other THP sizes. + */ + if (transparent_hugepage_adjust(&pfn, &fault_ipa)) + vma_pagesize = PMD_SIZE; + } + + if (writable) + kvm_set_pfn_dirty(pfn); - if (hugetlb) { - pmd_t new_pmd = pfn_pmd(pfn, mem_type); - new_pmd = pmd_mkhuge(new_pmd); - if (writable) { - new_pmd = kvm_s2pmd_mkwrite(new_pmd); - kvm_set_pfn_dirty(pfn); - } + if (fault_status != FSC_PERM) + clean_dcache_guest_page(pfn, vma_pagesize); + + if (exec_fault) + invalidate_icache_guest_page(pfn, vma_pagesize); - if (fault_status != FSC_PERM) - clean_dcache_guest_page(pfn, PMD_SIZE); + /* + * If we took an execution fault we have made the + * icache/dcache coherent above and should now let the s2 + * mapping be executable. + * + * Write faults (!exec_fault && FSC_PERM) are orthogonal to + * execute permissions, and we preserve whatever we have. + */ + needs_exec = exec_fault || + (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa)); - if (exec_fault) { + if (vma_pagesize == PUD_SIZE) { + pud_t new_pud = kvm_pfn_pud(pfn, mem_type); + + new_pud = kvm_pud_mkhuge(new_pud); + if (writable) + new_pud = kvm_s2pud_mkwrite(new_pud); + + if (needs_exec) + new_pud = kvm_s2pud_mkexec(new_pud); + + ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud); + } else if (vma_pagesize == PMD_SIZE) { + pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type); + + new_pmd = kvm_pmd_mkhuge(new_pmd); + + if (writable) + new_pmd = kvm_s2pmd_mkwrite(new_pmd); + + if (needs_exec) new_pmd = kvm_s2pmd_mkexec(new_pmd); - invalidate_icache_guest_page(pfn, PMD_SIZE); - } else if (fault_status == FSC_PERM) { - /* Preserve execute if XN was already cleared */ - if (stage2_is_exec(kvm, fault_ipa)) - new_pmd = kvm_s2pmd_mkexec(new_pmd); - } ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); } else { - pte_t new_pte = pfn_pte(pfn, mem_type); + pte_t new_pte = kvm_pfn_pte(pfn, mem_type); if (writable) { new_pte = kvm_s2pte_mkwrite(new_pte); - kvm_set_pfn_dirty(pfn); mark_page_dirty(kvm, gfn); } - if (fault_status != FSC_PERM) - clean_dcache_guest_page(pfn, PAGE_SIZE); - - if (exec_fault) { + if (needs_exec) new_pte = kvm_s2pte_mkexec(new_pte); - invalidate_icache_guest_page(pfn, PAGE_SIZE); - } else if (fault_status == FSC_PERM) { - /* Preserve execute if XN was already cleared */ - if (stage2_is_exec(kvm, fault_ipa)) - new_pte = kvm_s2pte_mkexec(new_pte); - } ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); } @@ -1637,6 +1837,7 @@ out_unlock: */ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) { + pud_t *pud; pmd_t *pmd; pte_t *pte; kvm_pfn_t pfn; @@ -1646,24 +1847,23 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) spin_lock(&vcpu->kvm->mmu_lock); - pmd = stage2_get_pmd(vcpu->kvm, NULL, fault_ipa); - if (!pmd || pmd_none(*pmd)) /* Nothing there */ + if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte)) goto out; - if (pmd_thp_or_huge(*pmd)) { /* THP, HugeTLB */ + if (pud) { /* HugeTLB */ + *pud = kvm_s2pud_mkyoung(*pud); + pfn = kvm_pud_pfn(*pud); + pfn_valid = true; + } else if (pmd) { /* THP, HugeTLB */ *pmd = pmd_mkyoung(*pmd); pfn = pmd_pfn(*pmd); pfn_valid = true; - goto out; + } else { + *pte = pte_mkyoung(*pte); /* Just a page... */ + pfn = pte_pfn(*pte); + pfn_valid = true; } - pte = pte_offset_kernel(pmd, fault_ipa); - if (pte_none(*pte)) /* Nothing there either */ - goto out; - - *pte = pte_mkyoung(*pte); /* Just a page... */ - pfn = pte_pfn(*pte); - pfn_valid = true; out: spin_unlock(&vcpu->kvm->mmu_lock); if (pfn_valid) @@ -1865,48 +2065,44 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) * just like a translation fault and clean the cache to the PoC. */ clean_dcache_guest_page(pfn, PAGE_SIZE); - stage2_pte = pfn_pte(pfn, PAGE_S2); + stage2_pte = kvm_pfn_pte(pfn, PAGE_S2); handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); } static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { + pud_t *pud; pmd_t *pmd; pte_t *pte; - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); - pmd = stage2_get_pmd(kvm, NULL, gpa); - if (!pmd || pmd_none(*pmd)) /* Nothing there */ + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); + if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte)) return 0; - if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */ + if (pud) + return stage2_pudp_test_and_clear_young(pud); + else if (pmd) return stage2_pmdp_test_and_clear_young(pmd); - - pte = pte_offset_kernel(pmd, gpa); - if (pte_none(*pte)) - return 0; - - return stage2_ptep_test_and_clear_young(pte); + else + return stage2_ptep_test_and_clear_young(pte); } static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { + pud_t *pud; pmd_t *pmd; pte_t *pte; - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); - pmd = stage2_get_pmd(kvm, NULL, gpa); - if (!pmd || pmd_none(*pmd)) /* Nothing there */ + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); + if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte)) return 0; - if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */ + if (pud) + return kvm_s2pud_young(*pud); + else if (pmd) return pmd_young(*pmd); - - pte = pte_offset_kernel(pmd, gpa); - if (!pte_none(*pte)) /* Just a page... */ + else return pte_young(*pte); - - return 0; } int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h index 57b3edebbb40..3828beab93f2 100644 --- a/virt/kvm/arm/trace.h +++ b/virt/kvm/arm/trace.h @@ -26,25 +26,25 @@ TRACE_EVENT(kvm_entry, ); TRACE_EVENT(kvm_exit, - TP_PROTO(int idx, unsigned int exit_reason, unsigned long vcpu_pc), - TP_ARGS(idx, exit_reason, vcpu_pc), + TP_PROTO(int ret, unsigned int esr_ec, unsigned long vcpu_pc), + TP_ARGS(ret, esr_ec, vcpu_pc), TP_STRUCT__entry( - __field( int, idx ) - __field( unsigned int, exit_reason ) + __field( int, ret ) + __field( unsigned int, esr_ec ) __field( unsigned long, vcpu_pc ) ), TP_fast_assign( - __entry->idx = idx; - __entry->exit_reason = exit_reason; + __entry->ret = ARM_EXCEPTION_CODE(ret); + __entry->esr_ec = ARM_EXCEPTION_IS_TRAP(ret) ? esr_ec : 0; __entry->vcpu_pc = vcpu_pc; ), TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx", - __print_symbolic(__entry->idx, kvm_arm_exception_type), - __entry->exit_reason, - __print_symbolic(__entry->exit_reason, kvm_arm_exception_class), + __print_symbolic(__entry->ret, kvm_arm_exception_type), + __entry->esr_ec, + __print_symbolic(__entry->esr_ec, kvm_arm_exception_class), __entry->vcpu_pc) ); diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index f56ff1cf52ec..ceeda7e04a4d 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -313,36 +313,30 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, spin_lock_irqsave(&irq->irq_lock, flags); - /* - * If this virtual IRQ was written into a list register, we - * have to make sure the CPU that runs the VCPU thread has - * synced back the LR state to the struct vgic_irq. - * - * As long as the conditions below are true, we know the VCPU thread - * may be on its way back from the guest (we kicked the VCPU thread in - * vgic_change_active_prepare) and still has to sync back this IRQ, - * so we release and re-acquire the spin_lock to let the other thread - * sync back the IRQ. - * - * When accessing VGIC state from user space, requester_vcpu is - * NULL, which is fine, because we guarantee that no VCPUs are running - * when accessing VGIC state from user space so irq->vcpu->cpu is - * always -1. - */ - while (irq->vcpu && /* IRQ may have state in an LR somewhere */ - irq->vcpu != requester_vcpu && /* Current thread is not the VCPU thread */ - irq->vcpu->cpu != -1) /* VCPU thread is running */ - cond_resched_lock(&irq->irq_lock); - if (irq->hw) { vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu); } else { u32 model = vcpu->kvm->arch.vgic.vgic_model; + u8 active_source; irq->active = active; + + /* + * The GICv2 architecture indicates that the source CPUID for + * an SGI should be provided during an EOI which implies that + * the active state is stored somewhere, but at the same time + * this state is not architecturally exposed anywhere and we + * have no way of knowing the right source. + * + * This may lead to a VCPU not being able to receive + * additional instances of a particular SGI after migration + * for a GICv2 VM on some GIC implementations. Oh well. + */ + active_source = (requester_vcpu) ? requester_vcpu->vcpu_id : 0; + if (model == KVM_DEV_TYPE_ARM_VGIC_V2 && active && vgic_irq_is_sgi(irq->intid)) - irq->active_source = requester_vcpu->vcpu_id; + irq->active_source = active_source; } if (irq->active) @@ -368,14 +362,16 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, */ static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid) { - if (intid > VGIC_NR_PRIVATE_IRQS) + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || + intid > VGIC_NR_PRIVATE_IRQS) kvm_arm_halt_guest(vcpu->kvm); } /* See vgic_change_active_prepare */ static void vgic_change_active_finish(struct kvm_vcpu *vcpu, u32 intid) { - if (intid > VGIC_NR_PRIVATE_IRQS) + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || + intid > VGIC_NR_PRIVATE_IRQS) kvm_arm_resume_guest(vcpu->kvm); } diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 7cfdfbc910e0..a6b135491b6c 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -103,13 +103,13 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, { /* SGIs and PPIs */ if (intid <= VGIC_MAX_PRIVATE) { - intid = array_index_nospec(intid, VGIC_MAX_PRIVATE); + intid = array_index_nospec(intid, VGIC_MAX_PRIVATE + 1); return &vcpu->arch.vgic_cpu.private_irqs[intid]; } /* SPIs */ - if (intid <= VGIC_MAX_SPI) { - intid = array_index_nospec(intid, VGIC_MAX_SPI); + if (intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) { + intid = array_index_nospec(intid, kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS); return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS]; } @@ -908,6 +908,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) struct vgic_irq *irq; bool pending = false; unsigned long flags; + struct vgic_vmcr vmcr; if (!vcpu->kvm->arch.vgic.enabled) return false; @@ -915,11 +916,15 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last) return true; + vgic_get_vmcr(vcpu, &vmcr); + spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { spin_lock(&irq->irq_lock); - pending = irq_is_pending(irq) && irq->enabled; + pending = irq_is_pending(irq) && irq->enabled && + !irq->active && + irq->priority < vmcr.pmr; spin_unlock(&irq->irq_lock); if (pending) |