summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/virt/kvm/api.rst22
-rw-r--r--Documentation/virt/kvm/devices/vcpu.rst57
-rw-r--r--arch/arm64/include/asm/kvm_asm.h115
-rw-r--r--arch/arm64/include/asm/kvm_host.h37
-rw-r--r--arch/arm64/include/asm/kvm_hyp.h9
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h251
-rw-r--r--arch/arm64/include/asm/kvm_pgtable.h309
-rw-r--r--arch/arm64/include/asm/kvm_ptrauth.h6
-rw-r--r--arch/arm64/include/asm/pgtable-hwdef.h24
-rw-r--r--arch/arm64/include/asm/pgtable-prot.h19
-rw-r--r--arch/arm64/include/asm/stage2_pgtable.h215
-rw-r--r--arch/arm64/include/uapi/asm/kvm.h16
-rw-r--r--arch/arm64/kvm/Makefile2
-rw-r--r--arch/arm64/kvm/arm.c19
-rw-r--r--arch/arm64/kvm/hyp.S34
-rw-r--r--arch/arm64/kvm/hyp/Makefile2
-rw-r--r--arch/arm64/kvm/hyp/entry.S95
-rw-r--r--arch/arm64/kvm/hyp/hyp-entry.S76
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/switch.h15
-rw-r--r--arch/arm64/kvm/hyp/nvhe/Makefile2
-rw-r--r--arch/arm64/kvm/hyp/nvhe/host.S187
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-init.S67
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-main.c117
-rw-r--r--arch/arm64/kvm/hyp/nvhe/switch.c50
-rw-r--r--arch/arm64/kvm/hyp/nvhe/tlb.c2
-rw-r--r--arch/arm64/kvm/hyp/pgtable.c883
-rw-r--r--arch/arm64/kvm/hyp/vhe/switch.c30
-rw-r--r--arch/arm64/kvm/inject_fault.c1
-rw-r--r--arch/arm64/kvm/mmu.c1605
-rw-r--r--arch/arm64/kvm/pmu-emul.c195
-rw-r--r--arch/arm64/kvm/pvtime.c29
-rw-r--r--arch/arm64/kvm/reset.c40
-rw-r--r--arch/arm64/kvm/sys_regs.c5
-rw-r--r--arch/arm64/kvm/trace_arm.h16
-rw-r--r--arch/arm64/kvm/trace_handle_exit.h6
-rw-r--r--arch/arm64/kvm/vgic/vgic-debug.c24
-rw-r--r--arch/arm64/kvm/vgic/vgic-v3.c4
-rw-r--r--arch/x86/kvm/x86.c3
-rw-r--r--include/kvm/arm_pmu.h5
-rw-r--r--include/linux/arm-smccc.h74
-rw-r--r--include/linux/kvm_host.h31
-rw-r--r--include/uapi/linux/kvm.h1
42 files changed, 2348 insertions, 2352 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index eb3a1316f03e..d2b733dc7892 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6130,7 +6130,7 @@ HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx.
8.21 KVM_CAP_HYPERV_DIRECT_TLBFLUSH
-----------------------------------
-:Architecture: x86
+:Architectures: x86
This capability indicates that KVM running on top of Hyper-V hypervisor
enables Direct TLB flush for its guests meaning that TLB flush
@@ -6143,19 +6143,33 @@ in CPUID and only exposes Hyper-V identification. In this case, guest
thinks it's running on Hyper-V and only use Hyper-V hypercalls.
8.22 KVM_CAP_S390_VCPU_RESETS
+-----------------------------
-Architectures: s390
+:Architectures: s390
This capability indicates that the KVM_S390_NORMAL_RESET and
KVM_S390_CLEAR_RESET ioctls are available.
8.23 KVM_CAP_S390_PROTECTED
+---------------------------
-Architecture: s390
-
+:Architectures: s390
This capability indicates that the Ultravisor has been initialized and
KVM can therefore start protected VMs.
This capability governs the KVM_S390_PV_COMMAND ioctl and the
KVM_MP_STATE_LOAD MP_STATE. KVM_SET_MP_STATE can fail for protected
guests when the state change is invalid.
+
+8.24 KVM_CAP_STEAL_TIME
+-----------------------
+
+:Architectures: arm64, x86
+
+This capability indicates that KVM supports steal time accounting.
+When steal time accounting is supported it may be enabled with
+architecture-specific interfaces. This capability and the architecture-
+specific interfaces must be consistent, i.e. if one says the feature
+is supported, than the other should as well and vice versa. For arm64
+see Documentation/virt/kvm/devices/vcpu.rst "KVM_ARM_VCPU_PVTIME_CTRL".
+For x86 see Documentation/virt/kvm/msr.rst "MSR_KVM_STEAL_TIME".
diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst
index ca374d3fe085..da7c2ef7dafc 100644
--- a/Documentation/virt/kvm/devices/vcpu.rst
+++ b/Documentation/virt/kvm/devices/vcpu.rst
@@ -25,8 +25,10 @@ Returns:
======= ========================================================
-EBUSY The PMU overflow interrupt is already set
- -ENXIO The overflow interrupt not set when attempting to get it
- -ENODEV PMUv3 not supported
+ -EFAULT Error reading interrupt number
+ -ENXIO PMUv3 not supported or the overflow interrupt not set
+ when attempting to get it
+ -ENODEV KVM_ARM_VCPU_PMU_V3 feature missing from VCPU
-EINVAL Invalid PMU overflow interrupt number supplied or
trying to set the IRQ number without using an in-kernel
irqchip.
@@ -45,9 +47,10 @@ all vcpus, while as an SPI it must be a separate number per vcpu.
Returns:
======= ======================================================
+ -EEXIST Interrupt number already used
-ENODEV PMUv3 not supported or GIC not initialized
- -ENXIO PMUv3 not properly configured or in-kernel irqchip not
- configured as required prior to calling this attribute
+ -ENXIO PMUv3 not supported, missing VCPU feature or interrupt
+ number not set
-EBUSY PMUv3 already initialized
======= ======================================================
@@ -55,6 +58,52 @@ Request the initialization of the PMUv3. If using the PMUv3 with an in-kernel
virtual GIC implementation, this must be done after initializing the in-kernel
irqchip.
+1.3 ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_FILTER
+-----------------------------------------
+
+:Parameters: in kvm_device_attr.addr the address for a PMU event filter is a
+ pointer to a struct kvm_pmu_event_filter
+
+:Returns:
+
+ ======= ======================================================
+ -ENODEV: PMUv3 not supported or GIC not initialized
+ -ENXIO: PMUv3 not properly configured or in-kernel irqchip not
+ configured as required prior to calling this attribute
+ -EBUSY: PMUv3 already initialized
+ -EINVAL: Invalid filter range
+ ======= ======================================================
+
+Request the installation of a PMU event filter described as follows:
+
+struct kvm_pmu_event_filter {
+ __u16 base_event;
+ __u16 nevents;
+
+#define KVM_PMU_EVENT_ALLOW 0
+#define KVM_PMU_EVENT_DENY 1
+
+ __u8 action;
+ __u8 pad[3];
+};
+
+A filter range is defined as the range [@base_event, @base_event + @nevents),
+together with an @action (KVM_PMU_EVENT_ALLOW or KVM_PMU_EVENT_DENY). The
+first registered range defines the global policy (global ALLOW if the first
+@action is DENY, global DENY if the first @action is ALLOW). Multiple ranges
+can be programmed, and must fit within the event space defined by the PMU
+architecture (10 bits on ARMv8.0, 16 bits from ARMv8.1 onwards).
+
+Note: "Cancelling" a filter by registering the opposite action for the same
+range doesn't change the default action. For example, installing an ALLOW
+filter for event range [0:10) as the first filter and then applying a DENY
+action for the same range will leave the whole range as disabled.
+
+Restrictions: Event 0 (SW_INCR) is never filtered, as it doesn't count a
+hardware event. Filtering event 0x1E (CHAIN) has no effect either, as it
+isn't strictly speaking an event. Filtering the cycle counter is possible
+using event 0x11 (CPU_CYCLES).
+
2. GROUP: KVM_ARM_VCPU_TIMER_CTRL
=================================
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 3438e85e1df6..54387ccd1ab2 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -36,6 +36,30 @@
#define __SMCCC_WORKAROUND_1_SMC_SZ 36
+#define KVM_HOST_SMCCC_ID(id) \
+ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
+ ARM_SMCCC_SMC_64, \
+ ARM_SMCCC_OWNER_VENDOR_HYP, \
+ (id))
+
+#define KVM_HOST_SMCCC_FUNC(name) KVM_HOST_SMCCC_ID(__KVM_HOST_SMCCC_FUNC_##name)
+
+#define __KVM_HOST_SMCCC_FUNC___kvm_hyp_init 0
+#define __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run 1
+#define __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context 2
+#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa 3
+#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid 4
+#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_local_vmid 5
+#define __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff 6
+#define __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs 7
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_ich_vtr_el2 8
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr 9
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr 10
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs 11
+#define __KVM_HOST_SMCCC_FUNC___kvm_get_mdcr_el2 12
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs 13
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs 14
+
#ifndef __ASSEMBLY__
#include <linux/mm.h>
@@ -60,9 +84,6 @@
DECLARE_KVM_VHE_PER_CPU(type, sym); \
DECLARE_KVM_NVHE_PER_CPU(type, sym)
-#define CHOOSE_VHE_SYM(sym) sym
-#define CHOOSE_NVHE_SYM(sym) kvm_nvhe_sym(sym)
-
/*
* Compute pointer to a symbol defined in nVHE percpu region.
* Returns NULL if percpu memory has not been allocated yet.
@@ -77,7 +98,30 @@
base ? (typeof(CHOOSE_NVHE_SYM(sym))*)(base + off) : NULL; \
})
-#ifndef __KVM_NVHE_HYPERVISOR__
+#if defined(__KVM_NVHE_HYPERVISOR__)
+
+#define CHOOSE_NVHE_SYM(sym) sym
+#define CHOOSE_HYP_SYM(sym) CHOOSE_NVHE_SYM(sym)
+
+/* The nVHE hypervisor shouldn't even try to access VHE symbols */
+extern void *__nvhe_undefined_symbol;
+#define CHOOSE_VHE_SYM(sym) __nvhe_undefined_symbol
+#define this_cpu_ptr_hyp_sym(sym) (&__nvhe_undefined_symbol)
+#define per_cpu_ptr_hyp_sym(sym, cpu) (&__nvhe_undefined_symbol)
+
+#elif defined(__KVM_VHE_HYPERVISOR__)
+
+#define CHOOSE_VHE_SYM(sym) sym
+#define CHOOSE_HYP_SYM(sym) CHOOSE_VHE_SYM(sym)
+
+/* The VHE hypervisor shouldn't even try to access nVHE symbols */
+extern void *__vhe_undefined_symbol;
+#define CHOOSE_NVHE_SYM(sym) __vhe_undefined_symbol
+#define this_cpu_ptr_hyp_sym(sym) (&__vhe_undefined_symbol)
+#define per_cpu_ptr_hyp_sym(sym, cpu) (&__vhe_undefined_symbol)
+
+#else
+
/*
* BIG FAT WARNINGS:
*
@@ -92,18 +136,18 @@
#define CHOOSE_HYP_SYM(sym) (is_kernel_in_hyp_mode() \
? CHOOSE_VHE_SYM(sym) \
: CHOOSE_NVHE_SYM(sym))
+
#define this_cpu_ptr_hyp_sym(sym) (is_kernel_in_hyp_mode() \
? this_cpu_ptr(&sym) \
: this_cpu_ptr_nvhe_sym(sym))
+
#define per_cpu_ptr_hyp_sym(sym, cpu) (is_kernel_in_hyp_mode() \
? per_cpu_ptr(&sym, cpu) \
: per_cpu_ptr_nvhe_sym(sym, cpu))
-#else
-/* The nVHE hypervisor shouldn't even try to access anything */
-extern void *__nvhe_undefined_symbol;
-#define CHOOSE_HYP_SYM(sym) __nvhe_undefined_symbol
-#define this_cpu_ptr_hyp_sym(sym) (&__nvhe_undefined_symbol)
-#define per_cpu_ptr_hyp_sym(sym, cpu) (&__nvhe_undefined_symbol)
+
+#define CHOOSE_VHE_SYM(sym) sym
+#define CHOOSE_NVHE_SYM(sym) kvm_nvhe_sym(sym)
+
#endif
/* Translate a kernel address @ptr into its equivalent linear mapping */
@@ -121,8 +165,10 @@ struct kvm_vcpu;
struct kvm_s2_mmu;
DECLARE_KVM_NVHE_SYM(__kvm_hyp_init);
+DECLARE_KVM_NVHE_SYM(__kvm_hyp_host_vector);
DECLARE_KVM_HYP_SYM(__kvm_hyp_vector);
#define __kvm_hyp_init CHOOSE_NVHE_SYM(__kvm_hyp_init)
+#define __kvm_hyp_host_vector CHOOSE_NVHE_SYM(__kvm_hyp_host_vector)
#define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector)
extern unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
@@ -214,6 +260,16 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];
ldr \vcpu, [\ctxt, #HOST_CONTEXT_VCPU]
.endm
+.macro get_loaded_vcpu vcpu, ctxt
+ adr_this_cpu \ctxt, kvm_hyp_ctxt, \vcpu
+ ldr \vcpu, [\ctxt, #HOST_CONTEXT_VCPU]
+.endm
+
+.macro set_loaded_vcpu vcpu, ctxt, tmp
+ adr_this_cpu \ctxt, kvm_hyp_ctxt, \tmp
+ str \vcpu, [\ctxt, #HOST_CONTEXT_VCPU]
+.endm
+
/*
* KVM extable for unexpected exceptions.
* In the same format _asm_extable, but output to a different section so that
@@ -229,6 +285,45 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];
.popsection
.endm
+#define CPU_XREG_OFFSET(x) (CPU_USER_PT_REGS + 8*x)
+#define CPU_LR_OFFSET CPU_XREG_OFFSET(30)
+#define CPU_SP_EL0_OFFSET (CPU_LR_OFFSET + 8)
+
+/*
+ * We treat x18 as callee-saved as the host may use it as a platform
+ * register (e.g. for shadow call stack).
+ */
+.macro save_callee_saved_regs ctxt
+ str x18, [\ctxt, #CPU_XREG_OFFSET(18)]
+ stp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
+ stp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
+ stp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
+ stp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
+ stp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
+ stp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
+.endm
+
+.macro restore_callee_saved_regs ctxt
+ // We require \ctxt is not x18-x28
+ ldr x18, [\ctxt, #CPU_XREG_OFFSET(18)]
+ ldp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
+ ldp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
+ ldp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
+ ldp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
+ ldp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
+ ldp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
+.endm
+
+.macro save_sp_el0 ctxt, tmp
+ mrs \tmp, sp_el0
+ str \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
+.endm
+
+.macro restore_sp_el0 ctxt, tmp
+ ldr \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
+ msr sp_el0, \tmp
+.endm
+
#endif
#endif /* __ARM_KVM_ASM_H__ */
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 1247d1f30cb3..0aecbab6a7fb 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -11,6 +11,7 @@
#ifndef __ARM64_KVM_HOST_H__
#define __ARM64_KVM_HOST_H__
+#include <linux/arm-smccc.h>
#include <linux/bitmap.h>
#include <linux/types.h>
#include <linux/jump_label.h>
@@ -79,8 +80,8 @@ struct kvm_s2_mmu {
* for vEL1/EL0 with vHCR_EL2.VM == 0. In that case, we use the
* canonical stage-2 page tables.
*/
- pgd_t *pgd;
phys_addr_t pgd_phys;
+ struct kvm_pgtable *pgt;
/* The last vcpu id that ran on each physical CPU */
int __percpu *last_vcpu_ran;
@@ -110,6 +111,13 @@ struct kvm_arch {
* supported.
*/
bool return_nisv_io_abort_to_user;
+
+ /*
+ * VM-wide PMU filter, implemented as a bitmap and big enough for
+ * up to 2^10 events (ARMv8.0) or 2^16 events (ARMv8.1+).
+ */
+ unsigned long *pmu_filter;
+ unsigned int pmuver;
};
struct kvm_vcpu_fault_info {
@@ -262,8 +270,6 @@ struct kvm_host_data {
struct kvm_pmu_events pmu_events;
};
-typedef struct kvm_host_data kvm_host_data_t;
-
struct vcpu_reset_state {
unsigned long pc;
unsigned long r0;
@@ -368,7 +374,6 @@ struct kvm_vcpu_arch {
/* Guest PV state */
struct {
- u64 steal;
u64 last_steal;
gpa_t base;
} steal;
@@ -481,18 +486,15 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
void kvm_arm_halt_guest(struct kvm *kvm);
void kvm_arm_resume_guest(struct kvm *kvm);
-u64 __kvm_call_hyp(void *hypfn, ...);
-
-#define kvm_call_hyp_nvhe(f, ...) \
- do { \
- DECLARE_KVM_NVHE_SYM(f); \
- __kvm_call_hyp(kvm_ksym_ref_nvhe(f), ##__VA_ARGS__); \
- } while(0)
-
-#define kvm_call_hyp_nvhe_ret(f, ...) \
+#define kvm_call_hyp_nvhe(f, ...) \
({ \
- DECLARE_KVM_NVHE_SYM(f); \
- __kvm_call_hyp(kvm_ksym_ref_nvhe(f), ##__VA_ARGS__); \
+ struct arm_smccc_res res; \
+ \
+ arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(f), \
+ ##__VA_ARGS__, &res); \
+ WARN_ON(res.a0 != SMCCC_RET_SUCCESS); \
+ \
+ res.a1; \
})
/*
@@ -518,7 +520,7 @@ u64 __kvm_call_hyp(void *hypfn, ...);
ret = f(__VA_ARGS__); \
isb(); \
} else { \
- ret = kvm_call_hyp_nvhe_ret(f, ##__VA_ARGS__); \
+ ret = kvm_call_hyp_nvhe(f, ##__VA_ARGS__); \
} \
\
ret; \
@@ -544,6 +546,7 @@ long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu);
gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu);
void kvm_update_stolen_time(struct kvm_vcpu *vcpu);
+bool kvm_arm_pvtime_supported(void);
int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr);
int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu,
@@ -565,7 +568,7 @@ void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome);
struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
-DECLARE_KVM_HYP_PER_CPU(kvm_host_data_t, kvm_host_data);
+DECLARE_KVM_HYP_PER_CPU(struct kvm_host_data, kvm_host_data);
static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt)
{
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 46689e7db46c..6b664de5ec1f 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -12,6 +12,9 @@
#include <asm/alternative.h>
#include <asm/sysreg.h>
+DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
+DECLARE_PER_CPU(unsigned long, kvm_hyp_vector);
+
#define read_sysreg_elx(r,nvh,vh) \
({ \
u64 reg; \
@@ -87,11 +90,11 @@ void activate_traps_vhe_load(struct kvm_vcpu *vcpu);
void deactivate_traps_vhe_put(void);
#endif
-u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
+u64 __guest_enter(struct kvm_vcpu *vcpu);
-void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt);
+void __noreturn hyp_panic(void);
#ifdef __KVM_NVHE_HYPERVISOR__
-void __noreturn __hyp_do_panic(unsigned long, ...);
+void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par);
#endif
#endif /* __ARM64_KVM_HYP_H__ */
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index cff1cebc7590..331394306cce 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -44,16 +44,6 @@
* HYP_VA_MIN = 1 << (VA_BITS - 1)
* HYP_VA_MAX = HYP_VA_MIN + (1 << (VA_BITS - 1)) - 1
*
- * This of course assumes that the trampoline page exists within the
- * VA_BITS range. If it doesn't, then it means we're in the odd case
- * where the kernel idmap (as well as HYP) uses more levels than the
- * kernel runtime page tables (as seen when the kernel is configured
- * for 4k pages, 39bits VA, and yet memory lives just above that
- * limit, forcing the idmap to use 4 levels of page tables while the
- * kernel itself only uses 3). In this particular case, it doesn't
- * matter which side of VA_BITS we use, as we're guaranteed not to
- * conflict with anything.
- *
* When using VHE, there are no separate hyp mappings and all KVM
* functionality is already mapped as part of the main kernel
* mappings, and none of this applies in that case.
@@ -118,15 +108,10 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v)
#define kvm_phys_size(kvm) (_AC(1, ULL) << kvm_phys_shift(kvm))
#define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - _AC(1, ULL))
-static inline bool kvm_page_empty(void *ptr)
-{
- struct page *ptr_page = virt_to_page(ptr);
- return page_count(ptr_page) == 1;
-}
-
+#include <asm/kvm_pgtable.h>
#include <asm/stage2_pgtable.h>
-int create_hyp_mappings(void *from, void *to, pgprot_t prot);
+int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
void __iomem **kaddr,
void __iomem **haddr);
@@ -142,149 +127,9 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);
-void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
-
phys_addr_t kvm_mmu_get_httbr(void);
phys_addr_t kvm_get_idmap_vector(void);
int kvm_mmu_init(void);
-void kvm_clear_hyp_idmap(void);
-
-#define kvm_mk_pmd(ptep) \
- __pmd(__phys_to_pmd_val(__pa(ptep)) | PMD_TYPE_TABLE)
-#define kvm_mk_pud(pmdp) \
- __pud(__phys_to_pud_val(__pa(pmdp)) | PMD_TYPE_TABLE)
-#define kvm_mk_p4d(pmdp) \
- __p4d(__phys_to_p4d_val(__pa(pmdp)) | PUD_TYPE_TABLE)
-
-#define kvm_set_pud(pudp, pud) set_pud(pudp, pud)
-
-#define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot)
-#define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot)
-#define kvm_pfn_pud(pfn, prot) pfn_pud(pfn, prot)
-
-#define kvm_pud_pfn(pud) pud_pfn(pud)
-
-#define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd)
-#define kvm_pud_mkhuge(pud) pud_mkhuge(pud)
-
-static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
-{
- pte_val(pte) |= PTE_S2_RDWR;
- return pte;
-}
-
-static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd)
-{
- pmd_val(pmd) |= PMD_S2_RDWR;
- return pmd;
-}
-
-static inline pud_t kvm_s2pud_mkwrite(pud_t pud)
-{
- pud_val(pud) |= PUD_S2_RDWR;
- return pud;
-}
-
-static inline pte_t kvm_s2pte_mkexec(pte_t pte)
-{
- pte_val(pte) &= ~PTE_S2_XN;
- return pte;
-}
-
-static inline pmd_t kvm_s2pmd_mkexec(pmd_t pmd)
-{
- pmd_val(pmd) &= ~PMD_S2_XN;
- return pmd;
-}
-
-static inline pud_t kvm_s2pud_mkexec(pud_t pud)
-{
- pud_val(pud) &= ~PUD_S2_XN;
- return pud;
-}
-
-static inline void kvm_set_s2pte_readonly(pte_t *ptep)
-{
- pteval_t old_pteval, pteval;
-
- pteval = READ_ONCE(pte_val(*ptep));
- do {
- old_pteval = pteval;
- pteval &= ~PTE_S2_RDWR;
- pteval |= PTE_S2_RDONLY;
- pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
- } while (pteval != old_pteval);
-}
-
-static inline bool kvm_s2pte_readonly(pte_t *ptep)
-{
- return (READ_ONCE(pte_val(*ptep)) & PTE_S2_RDWR) == PTE_S2_RDONLY;
-}
-
-static inline bool kvm_s2pte_exec(pte_t *ptep)
-{
- return !(READ_ONCE(pte_val(*ptep)) & PTE_S2_XN);
-}
-
-static inline void kvm_set_s2pmd_readonly(pmd_t *pmdp)
-{
- kvm_set_s2pte_readonly((pte_t *)pmdp);
-}
-
-static inline bool kvm_s2pmd_readonly(pmd_t *pmdp)
-{
- return kvm_s2pte_readonly((pte_t *)pmdp);
-}
-
-static inline bool kvm_s2pmd_exec(pmd_t *pmdp)
-{
- return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN);
-}
-
-static inline void kvm_set_s2pud_readonly(pud_t *pudp)
-{
- kvm_set_s2pte_readonly((pte_t *)pudp);
-}
-
-static inline bool kvm_s2pud_readonly(pud_t *pudp)
-{
- return kvm_s2pte_readonly((pte_t *)pudp);
-}
-
-static inline bool kvm_s2pud_exec(pud_t *pudp)
-{
- return !(READ_ONCE(pud_val(*pudp)) & PUD_S2_XN);
-}
-
-static inline pud_t kvm_s2pud_mkyoung(pud_t pud)
-{
- return pud_mkyoung(pud);
-}
-
-static inline bool kvm_s2pud_young(pud_t pud)
-{
- return pud_young(pud);
-}
-
-#define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)
-
-#ifdef __PAGETABLE_PMD_FOLDED
-#define hyp_pmd_table_empty(pmdp) (0)
-#else
-#define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
-#endif
-
-#ifdef __PAGETABLE_PUD_FOLDED
-#define hyp_pud_table_empty(pudp) (0)
-#else
-#define hyp_pud_table_empty(pudp) kvm_page_empty(pudp)
-#endif
-
-#ifdef __PAGETABLE_P4D_FOLDED
-#define hyp_p4d_table_empty(p4dp) (0)
-#else
-#define hyp_p4d_table_empty(p4dp) kvm_page_empty(p4dp)
-#endif
struct kvm;
@@ -326,77 +171,9 @@ static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
}
}
-static inline void __kvm_flush_dcache_pte(pte_t pte)
-{
- if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
- struct page *page = pte_page(pte);
- kvm_flush_dcache_to_poc(page_address(page), PAGE_SIZE);
- }
-}
-
-static inline void __kvm_flush_dcache_pmd(pmd_t pmd)
-{
- if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
- struct page *page = pmd_page(pmd);
- kvm_flush_dcache_to_poc(page_address(page), PMD_SIZE);
- }
-}
-
-static inline void __kvm_flush_dcache_pud(pud_t pud)
-{
- if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
- struct page *page = pud_page(pud);
- kvm_flush_dcache_to_poc(page_address(page), PUD_SIZE);
- }
-}
-
void kvm_set_way_flush(struct kvm_vcpu *vcpu);
void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
-static inline bool __kvm_cpu_uses_extended_idmap(void)
-{
- return __cpu_uses_extended_idmap_level();
-}
-
-static inline unsigned long __kvm_idmap_ptrs_per_pgd(void)
-{
- return idmap_ptrs_per_pgd;
-}
-
-/*
- * Can't use pgd_populate here, because the extended idmap adds an extra level
- * above CONFIG_PGTABLE_LEVELS (which is 2 or 3 if we're using the extended
- * idmap), and pgd_populate is only available if CONFIG_PGTABLE_LEVELS = 4.
- */
-static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd,
- pgd_t *hyp_pgd,
- pgd_t *merged_hyp_pgd,
- unsigned long hyp_idmap_start)
-{
- int idmap_idx;
- u64 pgd_addr;
-
- /*
- * Use the first entry to access the HYP mappings. It is
- * guaranteed to be free, otherwise we wouldn't use an
- * extended idmap.
- */
- VM_BUG_ON(pgd_val(merged_hyp_pgd[0]));
- pgd_addr = __phys_to_pgd_val(__pa(hyp_pgd));
- merged_hyp_pgd[0] = __pgd(pgd_addr | PMD_TYPE_TABLE);
-
- /*
- * Create another extended level entry that points to the boot HYP map,
- * which contains an ID mapping of the HYP init code. We essentially
- * merge the boot and runtime HYP maps by doing so, but they don't
- * overlap anyway, so this is fine.
- */
- idmap_idx = hyp_idmap_start >> VA_BITS;
- VM_BUG_ON(pgd_val(merged_hyp_pgd[idmap_idx]));
- pgd_addr = __phys_to_pgd_val(__pa(boot_hyp_pgd));
- merged_hyp_pgd[idmap_idx] = __pgd(pgd_addr | PMD_TYPE_TABLE);
-}
-
static inline unsigned int kvm_get_vmid_bits(void)
{
int reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
@@ -479,30 +256,6 @@ static inline void *kvm_get_hyp_vector(void)
#define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr)
-/*
- * Get the magic number 'x' for VTTBR:BADDR of this KVM instance.
- * With v8.2 LVA extensions, 'x' should be a minimum of 6 with
- * 52bit IPS.
- */
-static inline int arm64_vttbr_x(u32 ipa_shift, u32 levels)
-{
- int x = ARM64_VTTBR_X(ipa_shift, levels);
-
- return (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && x < 6) ? 6 : x;
-}
-
-static inline u64 vttbr_baddr_mask(u32 ipa_shift, u32 levels)
-{
- unsigned int x = arm64_vttbr_x(ipa_shift, levels);
-
- return GENMASK_ULL(PHYS_MASK_SHIFT - 1, x);
-}
-
-static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm)
-{
- return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm));
-}
-
static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu)
{
struct kvm_vmid *vmid = &mmu->vmid;
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
new file mode 100644
index 000000000000..52ab38db04c7
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -0,0 +1,309 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Will Deacon <will@kernel.org>
+ */
+
+#ifndef __ARM64_KVM_PGTABLE_H__
+#define __ARM64_KVM_PGTABLE_H__
+
+#include <linux/bits.h>
+#include <linux/kvm_host.h>
+#include <linux/types.h>
+
+typedef u64 kvm_pte_t;
+
+/**
+ * struct kvm_pgtable - KVM page-table.
+ * @ia_bits: Maximum input address size, in bits.
+ * @start_level: Level at which the page-table walk starts.
+ * @pgd: Pointer to the first top-level entry of the page-table.
+ * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
+ */
+struct kvm_pgtable {
+ u32 ia_bits;
+ u32 start_level;
+ kvm_pte_t *pgd;
+
+ /* Stage-2 only */
+ struct kvm_s2_mmu *mmu;
+};
+
+/**
+ * enum kvm_pgtable_prot - Page-table permissions and attributes.
+ * @KVM_PGTABLE_PROT_X: Execute permission.
+ * @KVM_PGTABLE_PROT_W: Write permission.
+ * @KVM_PGTABLE_PROT_R: Read permission.
+ * @KVM_PGTABLE_PROT_DEVICE: Device attributes.
+ */
+enum kvm_pgtable_prot {
+ KVM_PGTABLE_PROT_X = BIT(0),
+ KVM_PGTABLE_PROT_W = BIT(1),
+ KVM_PGTABLE_PROT_R = BIT(2),
+
+ KVM_PGTABLE_PROT_DEVICE = BIT(3),
+};
+
+#define PAGE_HYP (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
+#define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X)
+#define PAGE_HYP_RO (KVM_PGTABLE_PROT_R)
+#define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE)
+
+/**
+ * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk.
+ * @KVM_PGTABLE_WALK_LEAF: Visit leaf entries, including invalid
+ * entries.
+ * @KVM_PGTABLE_WALK_TABLE_PRE: Visit table entries before their
+ * children.
+ * @KVM_PGTABLE_WALK_TABLE_POST: Visit table entries after their
+ * children.
+ */
+enum kvm_pgtable_walk_flags {
+ KVM_PGTABLE_WALK_LEAF = BIT(0),
+ KVM_PGTABLE_WALK_TABLE_PRE = BIT(1),
+ KVM_PGTABLE_WALK_TABLE_POST = BIT(2),
+};
+
+typedef int (*kvm_pgtable_visitor_fn_t)(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag,
+ void * const arg);
+
+/**
+ * struct kvm_pgtable_walker - Hook into a page-table walk.
+ * @cb: Callback function to invoke during the walk.
+ * @arg: Argument passed to the callback function.
+ * @flags: Bitwise-OR of flags to identify the entry types on which to
+ * invoke the callback function.
+ */
+struct kvm_pgtable_walker {
+ const kvm_pgtable_visitor_fn_t cb;
+ void * const arg;
+ const enum kvm_pgtable_walk_flags flags;
+};
+
+/**
+ * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table.
+ * @pgt: Uninitialised page-table structure to initialise.
+ * @va_bits: Maximum virtual address bits.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits);
+
+/**
+ * kvm_pgtable_hyp_destroy() - Destroy an unused hypervisor stage-1 page-table.
+ * @pgt: Page-table structure initialised by kvm_pgtable_hyp_init().
+ *
+ * The page-table is assumed to be unreachable by any hardware walkers prior
+ * to freeing and therefore no TLB invalidation is performed.
+ */
+void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt);
+
+/**
+ * kvm_pgtable_hyp_map() - Install a mapping in a hypervisor stage-1 page-table.
+ * @pgt: Page-table structure initialised by kvm_pgtable_hyp_init().
+ * @addr: Virtual address at which to place the mapping.
+ * @size: Size of the mapping.
+ * @phys: Physical address of the memory to map.
+ * @prot: Permissions and attributes for the mapping.
+ *
+ * The offset of @addr within a page is ignored, @size is rounded-up to
+ * the next page boundary and @phys is rounded-down to the previous page
+ * boundary.
+ *
+ * If device attributes are not explicitly requested in @prot, then the
+ * mapping will be normal, cacheable. Attempts to install a new mapping
+ * for a virtual address that is already mapped will be rejected with an
+ * error and a WARN().
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
+ enum kvm_pgtable_prot prot);
+
+/**
+ * kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table.
+ * @pgt: Uninitialised page-table structure to initialise.
+ * @kvm: KVM structure representing the guest virtual machine.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm);
+
+/**
+ * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ *
+ * The page-table is assumed to be unreachable by any hardware walkers prior
+ * to freeing and therefore no TLB invalidation is performed.
+ */
+void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
+
+/**
+ * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address at which to place the mapping.
+ * @size: Size of the mapping.
+ * @phys: Physical address of the memory to map.
+ * @prot: Permissions and attributes for the mapping.
+ * @mc: Cache of pre-allocated GFP_PGTABLE_USER memory from which to
+ * allocate page-table pages.
+ *
+ * The offset of @addr within a page is ignored, @size is rounded-up to
+ * the next page boundary and @phys is rounded-down to the previous page
+ * boundary.
+ *
+ * If device attributes are not explicitly requested in @prot, then the
+ * mapping will be normal, cacheable.
+ *
+ * Note that this function will both coalesce existing table entries and split
+ * existing block mappings, relying on page-faults to fault back areas outside
+ * of the new mapping lazily.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ u64 phys, enum kvm_pgtable_prot prot,
+ struct kvm_mmu_memory_cache *mc);
+
+/**
+ * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address from which to remove the mapping.
+ * @size: Size of the mapping.
+ *
+ * The offset of @addr within a page is ignored and @size is rounded-up to
+ * the next page boundary.
+ *
+ * TLB invalidation is performed for each page-table entry cleared during the
+ * unmapping operation and the reference count for the page-table page
+ * containing the cleared entry is decremented, with unreferenced pages being
+ * freed. Unmapping a cacheable page will ensure that it is clean to the PoC if
+ * FWB is not supported by the CPU.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);
+
+/**
+ * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range
+ * without TLB invalidation.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address from which to write-protect,
+ * @size: Size of the range.
+ *
+ * The offset of @addr within a page is ignored and @size is rounded-up to
+ * the next page boundary.
+ *
+ * Note that it is the caller's responsibility to invalidate the TLB after
+ * calling this function to ensure that the updated permissions are visible
+ * to the CPUs.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size);
+
+/**
+ * kvm_pgtable_stage2_mkyoung() - Set the access flag in a page-table entry.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address to identify the page-table entry.
+ *
+ * The offset of @addr within a page is ignored.
+ *
+ * If there is a valid, leaf page-table entry used to translate @addr, then
+ * set the access flag in that entry.
+ *
+ * Return: The old page-table entry prior to setting the flag, 0 on failure.
+ */
+kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr);
+
+/**
+ * kvm_pgtable_stage2_mkold() - Clear the access flag in a page-table entry.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address to identify the page-table entry.
+ *
+ * The offset of @addr within a page is ignored.
+ *
+ * If there is a valid, leaf page-table entry used to translate @addr, then
+ * clear the access flag in that entry.
+ *
+ * Note that it is the caller's responsibility to invalidate the TLB after
+ * calling this function to ensure that the updated permissions are visible
+ * to the CPUs.
+ *
+ * Return: The old page-table entry prior to clearing the flag, 0 on failure.
+ */
+kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
+
+/**
+ * kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a
+ * page-table entry.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address to identify the page-table entry.
+ * @prot: Additional permissions to grant for the mapping.
+ *
+ * The offset of @addr within a page is ignored.
+ *
+ * If there is a valid, leaf page-table entry used to translate @addr, then
+ * relax the permissions in that entry according to the read, write and
+ * execute permissions specified by @prot. No permissions are removed, and
+ * TLB invalidation is performed after updating the entry.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
+ enum kvm_pgtable_prot prot);
+
+/**
+ * kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the
+ * access flag set.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address to identify the page-table entry.
+ *
+ * The offset of @addr within a page is ignored.
+ *
+ * Return: True if the page-table entry has the access flag set, false otherwise.
+ */
+bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
+
+/**
+ * kvm_pgtable_stage2_flush_range() - Clean and invalidate data cache to Point
+ * of Coherency for guest stage-2 address
+ * range.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address from which to flush.
+ * @size: Size of the range.
+ *
+ * The offset of @addr within a page is ignored and @size is rounded-up to
+ * the next page boundary.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
+
+/**
+ * kvm_pgtable_walk() - Walk a page-table.
+ * @pgt: Page-table structure initialised by kvm_pgtable_*_init().
+ * @addr: Input address for the start of the walk.
+ * @size: Size of the range to walk.
+ * @walker: Walker callback description.
+ *
+ * The offset of @addr within a page is ignored and @size is rounded-up to
+ * the next page boundary.
+ *
+ * The walker will walk the page-table entries corresponding to the input
+ * address range specified, visiting entries according to the walker flags.
+ * Invalid entries are treated as leaf entries. Leaf entries are reloaded
+ * after invoking the walker callback, allowing the walker to descend into
+ * a newly installed table.
+ *
+ * Returning a negative error code from the walker callback function will
+ * terminate the walk immediately with the same error code.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ struct kvm_pgtable_walker *walker);
+
+#endif /* __ARM64_KVM_PGTABLE_H__ */
diff --git a/arch/arm64/include/asm/kvm_ptrauth.h b/arch/arm64/include/asm/kvm_ptrauth.h
index 0ddf98c3ba9f..0cd0965255d2 100644
--- a/arch/arm64/include/asm/kvm_ptrauth.h
+++ b/arch/arm64/include/asm/kvm_ptrauth.h
@@ -60,7 +60,7 @@
.endm
/*
- * Both ptrauth_switch_to_guest and ptrauth_switch_to_host macros will
+ * Both ptrauth_switch_to_guest and ptrauth_switch_to_hyp macros will
* check for the presence ARM64_HAS_ADDRESS_AUTH, which is defined as
* (ARM64_HAS_ADDRESS_AUTH_ARCH || ARM64_HAS_ADDRESS_AUTH_IMP_DEF) and
* then proceed ahead with the save/restore of Pointer Authentication
@@ -78,7 +78,7 @@ alternative_else_nop_endif
.L__skip_switch\@:
.endm
-.macro ptrauth_switch_to_host g_ctxt, h_ctxt, reg1, reg2, reg3
+.macro ptrauth_switch_to_hyp g_ctxt, h_ctxt, reg1, reg2, reg3
alternative_if_not ARM64_HAS_ADDRESS_AUTH
b .L__skip_switch\@
alternative_else_nop_endif
@@ -96,7 +96,7 @@ alternative_else_nop_endif
#else /* !CONFIG_ARM64_PTR_AUTH */
.macro ptrauth_switch_to_guest g_ctxt, reg1, reg2, reg3
.endm
-.macro ptrauth_switch_to_host g_ctxt, h_ctxt, reg1, reg2, reg3
+.macro ptrauth_switch_to_hyp g_ctxt, h_ctxt, reg1, reg2, reg3
.endm
#endif /* CONFIG_ARM64_PTR_AUTH */
#endif /* __ASSEMBLY__ */
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index d400a4d9aee2..2b5f822fb033 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -156,7 +156,6 @@
#define PTE_CONT (_AT(pteval_t, 1) << 52) /* Contiguous range */
#define PTE_PXN (_AT(pteval_t, 1) << 53) /* Privileged XN */
#define PTE_UXN (_AT(pteval_t, 1) << 54) /* User XN */
-#define PTE_HYP_XN (_AT(pteval_t, 1) << 54) /* HYP XN */
#define PTE_ADDR_LOW (((_AT(pteval_t, 1) << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
#ifdef CONFIG_ARM64_PA_BITS_52
@@ -173,34 +172,11 @@
#define PTE_ATTRINDX_MASK (_AT(pteval_t, 7) << 2)
/*
- * 2nd stage PTE definitions
- */
-#define PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[2:1] */
-#define PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */
-#define PTE_S2_XN (_AT(pteval_t, 2) << 53) /* XN[1:0] */
-#define PTE_S2_SW_RESVD (_AT(pteval_t, 15) << 55) /* Reserved for SW */
-
-#define PMD_S2_RDONLY (_AT(pmdval_t, 1) << 6) /* HAP[2:1] */
-#define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */
-#define PMD_S2_XN (_AT(pmdval_t, 2) << 53) /* XN[1:0] */
-#define PMD_S2_SW_RESVD (_AT(pmdval_t, 15) << 55) /* Reserved for SW */
-
-#define PUD_S2_RDONLY (_AT(pudval_t, 1) << 6) /* HAP[2:1] */
-#define PUD_S2_RDWR (_AT(pudval_t, 3) << 6) /* HAP[2:1] */
-#define PUD_S2_XN (_AT(pudval_t, 2) << 53) /* XN[1:0] */
-
-/*
* Memory Attribute override for Stage-2 (MemAttr[3:0])
*/
#define PTE_S2_MEMATTR(t) (_AT(pteval_t, (t)) << 2)
/*
- * EL2/HYP PTE/PMD definitions
- */
-#define PMD_HYP PMD_SECT_USER
-#define PTE_HYP PTE_USER
-
-/*
* Highest possible physical address supported.
*/
#define PHYS_MASK_SHIFT (CONFIG_ARM64_PA_BITS)
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index 4d867c6446c4..8f094c43072a 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -56,7 +56,6 @@ extern bool arm64_use_ng_mappings;
#define PROT_SECT_NORMAL_EXEC (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL))
#define _PAGE_DEFAULT (_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL))
-#define _HYP_PAGE_DEFAULT _PAGE_DEFAULT
#define PAGE_KERNEL __pgprot(PROT_NORMAL)
#define PAGE_KERNEL_RO __pgprot((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY)
@@ -64,11 +63,6 @@ extern bool arm64_use_ng_mappings;
#define PAGE_KERNEL_EXEC __pgprot(PROT_NORMAL & ~PTE_PXN)
#define PAGE_KERNEL_EXEC_CONT __pgprot((PROT_NORMAL & ~PTE_PXN) | PTE_CONT)
-#define PAGE_HYP __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN)
-#define PAGE_HYP_EXEC __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY)
-#define PAGE_HYP_RO __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN)
-#define PAGE_HYP_DEVICE __pgprot(_PROT_DEFAULT | PTE_ATTRINDX(MT_DEVICE_nGnRE) | PTE_HYP | PTE_HYP_XN)
-
#define PAGE_S2_MEMATTR(attr) \
({ \
u64 __val; \
@@ -79,19 +73,6 @@ extern bool arm64_use_ng_mappings;
__val; \
})
-#define PAGE_S2_XN \
- ({ \
- u64 __val; \
- if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC)) \
- __val = 0; \
- else \
- __val = PTE_S2_XN; \
- __val; \
- })
-
-#define PAGE_S2 __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(NORMAL) | PTE_S2_RDONLY | PAGE_S2_XN)
-#define PAGE_S2_DEVICE __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_S2_XN)
-
#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN)
/* shared+writable pages are clean by default, hence PTE_RDONLY|PTE_WRITE */
#define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE)
diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h
index 996bf98f0cab..fe341a6578c3 100644
--- a/arch/arm64/include/asm/stage2_pgtable.h
+++ b/arch/arm64/include/asm/stage2_pgtable.h
@@ -8,7 +8,6 @@
#ifndef __ARM64_S2_PGTABLE_H_
#define __ARM64_S2_PGTABLE_H_
-#include <linux/hugetlb.h>
#include <linux/pgtable.h>
/*
@@ -37,217 +36,12 @@
#define stage2_pgdir_mask(kvm) ~(stage2_pgdir_size(kvm) - 1)
/*
- * The number of PTRS across all concatenated stage2 tables given by the
- * number of bits resolved at the initial level.
- * If we force more levels than necessary, we may have (stage2_pgdir_shift > IPA),
- * in which case, stage2_pgd_ptrs will have one entry.
- */
-#define pgd_ptrs_shift(ipa, pgdir_shift) \
- ((ipa) > (pgdir_shift) ? ((ipa) - (pgdir_shift)) : 0)
-#define __s2_pgd_ptrs(ipa, lvls) \
- (1 << (pgd_ptrs_shift((ipa), pt_levels_pgdir_shift(lvls))))
-#define __s2_pgd_size(ipa, lvls) (__s2_pgd_ptrs((ipa), (lvls)) * sizeof(pgd_t))
-
-#define stage2_pgd_ptrs(kvm) __s2_pgd_ptrs(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
-#define stage2_pgd_size(kvm) __s2_pgd_size(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
-
-/*
* kvm_mmmu_cache_min_pages() is the number of pages required to install
* a stage-2 translation. We pre-allocate the entry level page table at
* the VM creation.
*/
#define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1)
-/* Stage2 PUD definitions when the level is present */
-static inline bool kvm_stage2_has_pud(struct kvm *kvm)
-{
- return (CONFIG_PGTABLE_LEVELS > 3) && (kvm_stage2_levels(kvm) > 3);
-}
-
-#define S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1)
-#define S2_PUD_SIZE (1UL << S2_PUD_SHIFT)
-#define S2_PUD_MASK (~(S2_PUD_SIZE - 1))
-
-#define stage2_pgd_none(kvm, pgd) pgd_none(pgd)
-#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd)
-#define stage2_pgd_present(kvm, pgd) pgd_present(pgd)
-#define stage2_pgd_populate(kvm, pgd, p4d) pgd_populate(NULL, pgd, p4d)
-
-static inline p4d_t *stage2_p4d_offset(struct kvm *kvm,
- pgd_t *pgd, unsigned long address)
-{
- return p4d_offset(pgd, address);
-}
-
-static inline void stage2_p4d_free(struct kvm *kvm, p4d_t *p4d)
-{
-}
-
-static inline bool stage2_p4d_table_empty(struct kvm *kvm, p4d_t *p4dp)
-{
- return false;
-}
-
-static inline phys_addr_t stage2_p4d_addr_end(struct kvm *kvm,
- phys_addr_t addr, phys_addr_t end)
-{
- return end;
-}
-
-static inline bool stage2_p4d_none(struct kvm *kvm, p4d_t p4d)
-{
- if (kvm_stage2_has_pud(kvm))
- return p4d_none(p4d);
- else
- return 0;
-}
-
-static inline void stage2_p4d_clear(struct kvm *kvm, p4d_t *p4dp)
-{
- if (kvm_stage2_has_pud(kvm))
- p4d_clear(p4dp);
-}
-
-static inline bool stage2_p4d_present(struct kvm *kvm, p4d_t p4d)
-{
- if (kvm_stage2_has_pud(kvm))
- return p4d_present(p4d);
- else
- return 1;
-}
-
-static inline void stage2_p4d_populate(struct kvm *kvm, p4d_t *p4d, pud_t *pud)
-{
- if (kvm_stage2_has_pud(kvm))
- p4d_populate(NULL, p4d, pud);
-}
-
-static inline pud_t *stage2_pud_offset(struct kvm *kvm,
- p4d_t *p4d, unsigned long address)
-{
- if (kvm_stage2_has_pud(kvm))
- return pud_offset(p4d, address);
- else
- return (pud_t *)p4d;
-}
-
-static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud)
-{
- if (kvm_stage2_has_pud(kvm))
- free_page((unsigned long)pud);
-}
-
-static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp)
-{
- if (kvm_stage2_has_pud(kvm))
- return kvm_page_empty(pudp);
- else
- return false;
-}
-
-static inline phys_addr_t
-stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
-{
- if (kvm_stage2_has_pud(kvm)) {
- phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK;
-
- return (boundary - 1 < end - 1) ? boundary : end;
- } else {
- return end;
- }
-}
-
-/* Stage2 PMD definitions when the level is present */
-static inline bool kvm_stage2_has_pmd(struct kvm *kvm)
-{
- return (CONFIG_PGTABLE_LEVELS > 2) && (kvm_stage2_levels(kvm) > 2);
-}
-
-#define S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2)
-#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT)
-#define S2_PMD_MASK (~(S2_PMD_SIZE - 1))
-
-static inline bool stage2_pud_none(struct kvm *kvm, pud_t pud)
-{
- if (kvm_stage2_has_pmd(kvm))
- return pud_none(pud);
- else
- return 0;
-}
-
-static inline void stage2_pud_clear(struct kvm *kvm, pud_t *pud)
-{
- if (kvm_stage2_has_pmd(kvm))
- pud_clear(pud);
-}
-
-static inline bool stage2_pud_present(struct kvm *kvm, pud_t pud)
-{
- if (kvm_stage2_has_pmd(kvm))
- return pud_present(pud);
- else
- return 1;
-}
-
-static inline void stage2_pud_populate(struct kvm *kvm, pud_t *pud, pmd_t *pmd)
-{
- if (kvm_stage2_has_pmd(kvm))
- pud_populate(NULL, pud, pmd);
-}
-
-static inline pmd_t *stage2_pmd_offset(struct kvm *kvm,
- pud_t *pud, unsigned long address)
-{
- if (kvm_stage2_has_pmd(kvm))
- return pmd_offset(pud, address);
- else
- return (pmd_t *)pud;
-}
-
-static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd)
-{
- if (kvm_stage2_has_pmd(kvm))
- free_page((unsigned long)pmd);
-}
-
-static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud)
-{
- if (kvm_stage2_has_pmd(kvm))
- return pud_huge(pud);
- else
- return 0;
-}
-
-static inline bool stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp)
-{
- if (kvm_stage2_has_pmd(kvm))
- return kvm_page_empty(pmdp);
- else
- return 0;
-}
-
-static inline phys_addr_t
-stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
-{
- if (kvm_stage2_has_pmd(kvm)) {
- phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK;
-
- return (boundary - 1 < end - 1) ? boundary : end;
- } else {
- return end;
- }
-}
-
-static inline bool stage2_pte_table_empty(struct kvm *kvm, pte_t *ptep)
-{
- return kvm_page_empty(ptep);
-}
-
-static inline unsigned long stage2_pgd_index(struct kvm *kvm, phys_addr_t addr)
-{
- return (((addr) >> stage2_pgdir_shift(kvm)) & (stage2_pgd_ptrs(kvm) - 1));
-}
-
static inline phys_addr_t
stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
{
@@ -256,13 +50,4 @@ stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
return (boundary - 1 < end - 1) ? boundary : end;
}
-/*
- * Level values for the ARMv8.4-TTL extension, mapping PUD/PMD/PTE and
- * the architectural page-table level.
- */
-#define S2_NO_LEVEL_HINT 0
-#define S2_PUD_LEVEL 1
-#define S2_PMD_LEVEL 2
-#define S2_PTE_LEVEL 3
-
#endif /* __ARM64_S2_PGTABLE_H_ */
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 7d804fd0a682..1c17c3a24411 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -159,6 +159,21 @@ struct kvm_sync_regs {
struct kvm_arch_memory_slot {
};
+/*
+ * PMU filter structure. Describe a range of events with a particular
+ * action. To be used with KVM_ARM_VCPU_PMU_V3_FILTER.
+ */
+struct kvm_pmu_event_filter {
+ __u16 base_event;
+ __u16 nevents;
+
+#define KVM_PMU_EVENT_ALLOW 0
+#define KVM_PMU_EVENT_DENY 1
+
+ __u8 action;
+ __u8 pad[3];
+};
+
/* for KVM_GET/SET_VCPU_EVENTS */
struct kvm_vcpu_events {
struct {
@@ -338,6 +353,7 @@ struct kvm_vcpu_events {
#define KVM_ARM_VCPU_PMU_V3_CTRL 0
#define KVM_ARM_VCPU_PMU_V3_IRQ 0
#define KVM_ARM_VCPU_PMU_V3_INIT 1
+#define KVM_ARM_VCPU_PMU_V3_FILTER 2
#define KVM_ARM_VCPU_TIMER_CTRL 1
#define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0
#define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 99977c1972cc..1504c81fbf5d 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -13,7 +13,7 @@ obj-$(CONFIG_KVM) += hyp/
kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
$(KVM)/vfio.o $(KVM)/irqchip.o \
arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
- inject_fault.o regmap.o va_layout.o hyp.o handle_exit.o \
+ inject_fault.o regmap.o va_layout.o handle_exit.o \
guest.o debug.o reset.o sys_regs.o \
vgic-sys-reg-v3.o fpsimd.o pmu.o \
aarch32.o arch_timer.o \
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index f8388da6f3c7..f56122eedffc 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -46,6 +46,8 @@
__asm__(".arch_extension virt");
#endif
+DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
+
static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
@@ -145,6 +147,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
{
int i;
+ bitmap_free(kvm->arch.pmu_filter);
+
kvm_vgic_destroy(kvm);
for (i = 0; i < KVM_MAX_VCPUS; ++i) {
@@ -206,6 +210,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
*/
r = 1;
break;
+ case KVM_CAP_STEAL_TIME:
+ r = kvm_arm_pvtime_supported();
+ break;
default:
r = kvm_arch_vm_ioctl_check_extension(kvm, ext);
break;
@@ -283,7 +290,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm)))
static_branch_dec(&userspace_irqchip_in_use);
- kvm_mmu_free_memory_caches(vcpu);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
kvm_timer_vcpu_terminate(vcpu);
kvm_pmu_vcpu_destroy(vcpu);
@@ -1309,6 +1316,7 @@ static void cpu_init_hyp_mode(void)
unsigned long hyp_stack_ptr;
unsigned long vector_ptr;
unsigned long tpidr_el2;
+ struct arm_smccc_res res;
/* Switch from the HYP stub to our own HYP init vector */
__hyp_set_vectors(kvm_get_idmap_vector());
@@ -1323,7 +1331,8 @@ static void cpu_init_hyp_mode(void)
pgd_ptr = kvm_mmu_get_httbr();
hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE;
- vector_ptr = (unsigned long)kvm_get_hyp_vector();
+ hyp_stack_ptr = kern_hyp_va(hyp_stack_ptr);
+ vector_ptr = (unsigned long)kern_hyp_va(kvm_ksym_ref(__kvm_hyp_host_vector));
/*
* Call initialization code, and switch to the full blown HYP code.
@@ -1332,7 +1341,9 @@ static void cpu_init_hyp_mode(void)
* cpus_have_const_cap() wrapper.
*/
BUG_ON(!system_capabilities_finalized());
- __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2);
+ arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init),
+ pgd_ptr, tpidr_el2, hyp_stack_ptr, vector_ptr, &res);
+ WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
/*
* Disabling SSBD on a non-VHE system requires us to enable SSBS
@@ -1356,6 +1367,8 @@ static void cpu_hyp_reinit(void)
cpu_hyp_reset();
+ *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)kvm_get_hyp_vector();
+
if (is_kernel_in_hyp_mode())
kvm_timer_init_vhe();
else
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
deleted file mode 100644
index 3c79a1124af2..000000000000
--- a/arch/arm64/kvm/hyp.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2012,2013 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- */
-
-#include <linux/linkage.h>
-
-#include <asm/alternative.h>
-#include <asm/assembler.h>
-#include <asm/cpufeature.h>
-
-/*
- * u64 __kvm_call_hyp(void *hypfn, ...);
- *
- * This is not really a variadic function in the classic C-way and care must
- * be taken when calling this to ensure parameters are passed in registers
- * only, since the stack will change between the caller and the callee.
- *
- * Call the function with the first argument containing a pointer to the
- * function you wish to call in Hyp mode, and subsequent arguments will be
- * passed as x0, x1, and x2 (a maximum of 3 arguments in addition to the
- * function pointer can be passed). The function being called must be mapped
- * in Hyp mode (see init_hyp_mode in arch/arm/kvm/arm.c). Return values are
- * passed in x0.
- *
- * A function pointer with a value less than 0xfff has a special meaning,
- * and is used to implement hyp stubs in the same way as in
- * arch/arm64/kernel/hyp_stub.S.
- */
-SYM_FUNC_START(__kvm_call_hyp)
- hvc #0
- ret
-SYM_FUNC_END(__kvm_call_hyp)
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index d898f0da5802..4a81eddabcd8 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -10,4 +10,4 @@ subdir-ccflags-y := -I$(incdir) \
-DDISABLE_BRANCH_PROFILING \
$(DISABLE_STACKLEAK_PLUGIN)
-obj-$(CONFIG_KVM) += vhe/ nvhe/ smccc_wa.o
+obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o smccc_wa.o
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index 76e7eaf4675e..b0afad7a99c6 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -7,7 +7,6 @@
#include <linux/linkage.h>
#include <asm/alternative.h>
-#include <asm/asm-offsets.h>
#include <asm/assembler.h>
#include <asm/fpsimdmacros.h>
#include <asm/kvm.h>
@@ -16,66 +15,28 @@
#include <asm/kvm_mmu.h>
#include <asm/kvm_ptrauth.h>
-#define CPU_XREG_OFFSET(x) (CPU_USER_PT_REGS + 8*x)
-#define CPU_SP_EL0_OFFSET (CPU_XREG_OFFSET(30) + 8)
-
.text
/*
- * We treat x18 as callee-saved as the host may use it as a platform
- * register (e.g. for shadow call stack).
- */
-.macro save_callee_saved_regs ctxt
- str x18, [\ctxt, #CPU_XREG_OFFSET(18)]
- stp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
- stp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
- stp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
- stp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
- stp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
- stp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
-.endm
-
-.macro restore_callee_saved_regs ctxt
- // We require \ctxt is not x18-x28
- ldr x18, [\ctxt, #CPU_XREG_OFFSET(18)]
- ldp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
- ldp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
- ldp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
- ldp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
- ldp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
- ldp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
-.endm
-
-.macro save_sp_el0 ctxt, tmp
- mrs \tmp, sp_el0
- str \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
-.endm
-
-.macro restore_sp_el0 ctxt, tmp
- ldr \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
- msr sp_el0, \tmp
-.endm
-
-/*
- * u64 __guest_enter(struct kvm_vcpu *vcpu,
- * struct kvm_cpu_context *host_ctxt);
+ * u64 __guest_enter(struct kvm_vcpu *vcpu);
*/
SYM_FUNC_START(__guest_enter)
// x0: vcpu
- // x1: host context
- // x2-x17: clobbered by macros
+ // x1-x17: clobbered by macros
// x29: guest context
- // Store the host regs
+ adr_this_cpu x1, kvm_hyp_ctxt, x2
+
+ // Store the hyp regs
save_callee_saved_regs x1
- // Save the host's sp_el0
+ // Save hyp's sp_el0
save_sp_el0 x1, x2
- // Now the host state is stored if we have a pending RAS SError it must
- // affect the host. If any asynchronous exception is pending we defer
- // the guest entry. The DSB isn't necessary before v8.2 as any SError
- // would be fatal.
+ // Now the hyp state is stored if we have a pending RAS SError it must
+ // affect the host or hyp. If any asynchronous exception is pending we
+ // defer the guest entry. The DSB isn't necessary before v8.2 as any
+ // SError would be fatal.
alternative_if ARM64_HAS_RAS_EXTN
dsb nshst
isb
@@ -86,6 +47,8 @@ alternative_else_nop_endif
ret
1:
+ set_loaded_vcpu x0, x1, x2
+
add x29, x0, #VCPU_CONTEXT
// Macro ptrauth_switch_to_guest format:
@@ -116,6 +79,26 @@ alternative_else_nop_endif
eret
sb
+SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL)
+ // x2-x29,lr: vcpu regs
+ // vcpu x0-x1 on the stack
+
+ // If the hyp context is loaded, go straight to hyp_panic
+ get_loaded_vcpu x0, x1
+ cbz x0, hyp_panic
+
+ // The hyp context is saved so make sure it is restored to allow
+ // hyp_panic to run at hyp and, subsequently, panic to run in the host.
+ // This makes use of __guest_exit to avoid duplication but sets the
+ // return address to tail call into hyp_panic. As a side effect, the
+ // current state is saved to the guest context but it will only be
+ // accurate if the guest had been completely restored.
+ adr_this_cpu x0, kvm_hyp_ctxt, x1
+ adr x1, hyp_panic
+ str x1, [x0, #CPU_XREG_OFFSET(30)]
+
+ get_vcpu_ptr x1, x0
+
SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
// x0: return code
// x1: vcpu
@@ -148,21 +131,23 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
// Store the guest's sp_el0
save_sp_el0 x1, x2
- get_host_ctxt x2, x3
+ adr_this_cpu x2, kvm_hyp_ctxt, x3
- // Macro ptrauth_switch_to_guest format:
- // ptrauth_switch_to_host(guest cxt, host cxt, tmp1, tmp2, tmp3)
+ // Macro ptrauth_switch_to_hyp format:
+ // ptrauth_switch_to_hyp(guest cxt, host cxt, tmp1, tmp2, tmp3)
// The below macro to save/restore keys is not implemented in C code
// as it may cause Pointer Authentication key signing mismatch errors
// when this feature is enabled for kernel code.
- ptrauth_switch_to_host x1, x2, x3, x4, x5
+ ptrauth_switch_to_hyp x1, x2, x3, x4, x5
- // Restore the hosts's sp_el0
+ // Restore hyp's sp_el0
restore_sp_el0 x2, x3
- // Now restore the host regs
+ // Now restore the hyp regs
restore_callee_saved_regs x2
+ set_loaded_vcpu xzr, x1, x2
+
alternative_if ARM64_HAS_RAS_EXTN
// If we have the RAS extensions we can consume a pending error
// without an unmask-SError and isb. The ESB-instruction consumed any
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 7ea277b82967..0a5b36eb54b3 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -12,7 +12,6 @@
#include <asm/cpufeature.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
-#include <asm/kvm_mmu.h>
#include <asm/mmu.h>
.macro save_caller_saved_regs_vect
@@ -41,20 +40,6 @@
.text
-.macro do_el2_call
- /*
- * Shuffle the parameters before calling the function
- * pointed to in x0. Assumes parameters in x[1,2,3].
- */
- str lr, [sp, #-16]!
- mov lr, x0
- mov x0, x1
- mov x1, x2
- mov x2, x3
- blr lr
- ldr lr, [sp], #16
-.endm
-
el1_sync: // Guest trapped into EL2
mrs x0, esr_el2
@@ -63,44 +48,6 @@ el1_sync: // Guest trapped into EL2
ccmp x0, #ESR_ELx_EC_HVC32, #4, ne
b.ne el1_trap
-#ifdef __KVM_NVHE_HYPERVISOR__
- mrs x1, vttbr_el2 // If vttbr is valid, the guest
- cbnz x1, el1_hvc_guest // called HVC
-
- /* Here, we're pretty sure the host called HVC. */
- ldp x0, x1, [sp], #16
-
- /* Check for a stub HVC call */
- cmp x0, #HVC_STUB_HCALL_NR
- b.hs 1f
-
- /*
- * Compute the idmap address of __kvm_handle_stub_hvc and
- * jump there. Since we use kimage_voffset, do not use the
- * HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead
- * (by loading it from the constant pool).
- *
- * Preserve x0-x4, which may contain stub parameters.
- */
- ldr x5, =__kvm_handle_stub_hvc
- ldr_l x6, kimage_voffset
-
- /* x5 = __pa(x5) */
- sub x5, x5, x6
- br x5
-
-1:
- /*
- * Perform the EL2 call
- */
- kern_hyp_va x0
- do_el2_call
-
- eret
- sb
-#endif /* __KVM_NVHE_HYPERVISOR__ */
-
-el1_hvc_guest:
/*
* Fastest possible path for ARM_SMCCC_ARCH_WORKAROUND_1.
* The workaround has already been applied on the host,
@@ -169,24 +116,7 @@ el2_error:
eret
sb
-#ifdef __KVM_NVHE_HYPERVISOR__
-SYM_FUNC_START(__hyp_do_panic)
- mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
- PSR_MODE_EL1h)
- msr spsr_el2, lr
- ldr lr, =panic
- msr elr_el2, lr
- eret
- sb
-SYM_FUNC_END(__hyp_do_panic)
-#endif
-
-SYM_CODE_START(__hyp_panic)
- get_host_ctxt x0, x1
- b hyp_panic
-SYM_CODE_END(__hyp_panic)
-
-.macro invalid_vector label, target = __hyp_panic
+.macro invalid_vector label, target = __guest_exit_panic
.align 2
SYM_CODE_START(\label)
b \target
@@ -198,7 +128,6 @@ SYM_CODE_END(\label)
invalid_vector el2t_irq_invalid
invalid_vector el2t_fiq_invalid
invalid_vector el2t_error_invalid
- invalid_vector el2h_sync_invalid
invalid_vector el2h_irq_invalid
invalid_vector el2h_fiq_invalid
invalid_vector el1_fiq_invalid
@@ -228,10 +157,9 @@ check_preamble_length 661b, 662b
.macro invalid_vect target
.align 7
661:
- b \target
nop
+ stp x0, x1, [sp, #-16]!
662:
- ldp x0, x1, [sp], #16
b \target
check_preamble_length 661b, 662b
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 0d656914f421..eeac62b685a9 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -126,11 +126,6 @@ static inline void ___deactivate_traps(struct kvm_vcpu *vcpu)
}
}
-static inline void __activate_vm(struct kvm_s2_mmu *mmu)
-{
- __load_guest_stage2(mmu);
-}
-
static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
{
u64 par, tmp;
@@ -377,6 +372,8 @@ static inline bool esr_is_ptrauth_trap(u32 esr)
ctxt_sys_reg(ctxt, key ## KEYHI_EL1) = __val; \
} while(0)
+DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
+
static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu)
{
struct kvm_cpu_context *ctxt;
@@ -386,7 +383,7 @@ static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu)
!esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu)))
return false;
- ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ ctxt = this_cpu_ptr(&kvm_hyp_ctxt);
__ptrauth_save_key(ctxt, APIA);
__ptrauth_save_key(ctxt, APIB);
__ptrauth_save_key(ctxt, APDA);
@@ -481,14 +478,13 @@ exit:
static inline void __kvm_unexpected_el2_exception(void)
{
+ extern char __guest_exit_panic[];
unsigned long addr, fixup;
- struct kvm_cpu_context *host_ctxt;
struct exception_table_entry *entry, *end;
unsigned long elr_el2 = read_sysreg(elr_el2);
entry = hyp_symbol_addr(__start___kvm_ex_table);
end = hyp_symbol_addr(__stop___kvm_ex_table);
- host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
while (entry < end) {
addr = (unsigned long)&entry->insn + entry->insn;
@@ -503,7 +499,8 @@ static inline void __kvm_unexpected_el2_exception(void)
return;
}
- hyp_panic(host_ctxt);
+ /* Trigger a panic after restoring the hyp context. */
+ write_sysreg(__guest_exit_panic, elr_el2);
}
#endif /* __ARM64_KVM_HYP_SWITCH_H__ */
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index 2b27b60182f9..ddde15fe85f2 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -6,7 +6,7 @@
asflags-y := -D__KVM_NVHE_HYPERVISOR__
ccflags-y := -D__KVM_NVHE_HYPERVISOR__
-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o
+obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o hyp-main.o
obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
../fpsimd.o ../hyp-entry.o
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
new file mode 100644
index 000000000000..ff9a0f547b9f
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -0,0 +1,187 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 - Google Inc
+ * Author: Andrew Scull <ascull@google.com>
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/assembler.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_mmu.h>
+
+ .text
+
+SYM_FUNC_START(__host_exit)
+ stp x0, x1, [sp, #-16]!
+
+ get_host_ctxt x0, x1
+
+ ALTERNATIVE(nop, SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
+
+ /* Store the host regs x2 and x3 */
+ stp x2, x3, [x0, #CPU_XREG_OFFSET(2)]
+
+ /* Retrieve the host regs x0-x1 from the stack */
+ ldp x2, x3, [sp], #16 // x0, x1
+
+ /* Store the host regs x0-x1 and x4-x17 */
+ stp x2, x3, [x0, #CPU_XREG_OFFSET(0)]
+ stp x4, x5, [x0, #CPU_XREG_OFFSET(4)]
+ stp x6, x7, [x0, #CPU_XREG_OFFSET(6)]
+ stp x8, x9, [x0, #CPU_XREG_OFFSET(8)]
+ stp x10, x11, [x0, #CPU_XREG_OFFSET(10)]
+ stp x12, x13, [x0, #CPU_XREG_OFFSET(12)]
+ stp x14, x15, [x0, #CPU_XREG_OFFSET(14)]
+ stp x16, x17, [x0, #CPU_XREG_OFFSET(16)]
+
+ /* Store the host regs x18-x29, lr */
+ save_callee_saved_regs x0
+
+ /* Save the host context pointer in x29 across the function call */
+ mov x29, x0
+ bl handle_trap
+
+ /* Restore host regs x0-x17 */
+ ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)]
+ ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)]
+ ldp x4, x5, [x29, #CPU_XREG_OFFSET(4)]
+ ldp x6, x7, [x29, #CPU_XREG_OFFSET(6)]
+
+ /* x0-7 are use for panic arguments */
+__host_enter_for_panic:
+ ldp x8, x9, [x29, #CPU_XREG_OFFSET(8)]
+ ldp x10, x11, [x29, #CPU_XREG_OFFSET(10)]
+ ldp x12, x13, [x29, #CPU_XREG_OFFSET(12)]
+ ldp x14, x15, [x29, #CPU_XREG_OFFSET(14)]
+ ldp x16, x17, [x29, #CPU_XREG_OFFSET(16)]
+
+ /* Restore host regs x18-x29, lr */
+ restore_callee_saved_regs x29
+
+ /* Do not touch any register after this! */
+__host_enter_without_restoring:
+ eret
+ sb
+SYM_FUNC_END(__host_exit)
+
+/*
+ * void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par);
+ */
+SYM_FUNC_START(__hyp_do_panic)
+ /* Load the format arguments into x1-7 */
+ mov x6, x3
+ get_vcpu_ptr x7, x3
+
+ mrs x3, esr_el2
+ mrs x4, far_el2
+ mrs x5, hpfar_el2
+
+ /* Prepare and exit to the host's panic funciton. */
+ mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
+ PSR_MODE_EL1h)
+ msr spsr_el2, lr
+ ldr lr, =panic
+ msr elr_el2, lr
+
+ /*
+ * Set the panic format string and enter the host, conditionally
+ * restoring the host context.
+ */
+ cmp x0, xzr
+ ldr x0, =__hyp_panic_string
+ b.eq __host_enter_without_restoring
+ b __host_enter_for_panic
+SYM_FUNC_END(__hyp_do_panic)
+
+.macro host_el1_sync_vect
+ .align 7
+.L__vect_start\@:
+ stp x0, x1, [sp, #-16]!
+ mrs x0, esr_el2
+ lsr x0, x0, #ESR_ELx_EC_SHIFT
+ cmp x0, #ESR_ELx_EC_HVC64
+ ldp x0, x1, [sp], #16
+ b.ne __host_exit
+
+ /* Check for a stub HVC call */
+ cmp x0, #HVC_STUB_HCALL_NR
+ b.hs __host_exit
+
+ /*
+ * Compute the idmap address of __kvm_handle_stub_hvc and
+ * jump there. Since we use kimage_voffset, do not use the
+ * HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead
+ * (by loading it from the constant pool).
+ *
+ * Preserve x0-x4, which may contain stub parameters.
+ */
+ ldr x5, =__kvm_handle_stub_hvc
+ ldr_l x6, kimage_voffset
+
+ /* x5 = __pa(x5) */
+ sub x5, x5, x6
+ br x5
+.L__vect_end\@:
+.if ((.L__vect_end\@ - .L__vect_start\@) > 0x80)
+ .error "host_el1_sync_vect larger than vector entry"
+.endif
+.endm
+
+.macro invalid_host_el2_vect
+ .align 7
+ /* If a guest is loaded, panic out of it. */
+ stp x0, x1, [sp, #-16]!
+ get_loaded_vcpu x0, x1
+ cbnz x0, __guest_exit_panic
+ add sp, sp, #16
+
+ /*
+ * The panic may not be clean if the exception is taken before the host
+ * context has been saved by __host_exit or after the hyp context has
+ * been partially clobbered by __host_enter.
+ */
+ b hyp_panic
+.endm
+
+.macro invalid_host_el1_vect
+ .align 7
+ mov x0, xzr /* restore_host = false */
+ mrs x1, spsr_el2
+ mrs x2, elr_el2
+ mrs x3, par_el1
+ b __hyp_do_panic
+.endm
+
+/*
+ * The host vector does not use an ESB instruction in order to avoid consuming
+ * SErrors that should only be consumed by the host. Guest entry is deferred by
+ * __guest_enter if there are any pending asynchronous exceptions so hyp will
+ * always return to the host without having consumerd host SErrors.
+ *
+ * CONFIG_KVM_INDIRECT_VECTORS is not applied to the host vectors because the
+ * host knows about the EL2 vectors already, and there is no point in hiding
+ * them.
+ */
+ .align 11
+SYM_CODE_START(__kvm_hyp_host_vector)
+ invalid_host_el2_vect // Synchronous EL2t
+ invalid_host_el2_vect // IRQ EL2t
+ invalid_host_el2_vect // FIQ EL2t
+ invalid_host_el2_vect // Error EL2t
+
+ invalid_host_el2_vect // Synchronous EL2h
+ invalid_host_el2_vect // IRQ EL2h
+ invalid_host_el2_vect // FIQ EL2h
+ invalid_host_el2_vect // Error EL2h
+
+ host_el1_sync_vect // Synchronous 64-bit EL1
+ invalid_host_el1_vect // IRQ 64-bit EL1
+ invalid_host_el1_vect // FIQ 64-bit EL1
+ invalid_host_el1_vect // Error 64-bit EL1
+
+ invalid_host_el1_vect // Synchronous 32-bit EL1
+ invalid_host_el1_vect // IRQ 32-bit EL1
+ invalid_host_el1_vect // FIQ 32-bit EL1
+ invalid_host_el1_vect // Error 32-bit EL1
+SYM_CODE_END(__kvm_hyp_host_vector)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index d9434e90c06d..47224dc62c51 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -4,11 +4,13 @@
* Author: Marc Zyngier <marc.zyngier@arm.com>
*/
+#include <linux/arm-smccc.h>
#include <linux/linkage.h>
#include <asm/alternative.h>
#include <asm/assembler.h>
#include <asm/kvm_arm.h>
+#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
#include <asm/pgtable-hwdef.h>
#include <asm/sysreg.h>
@@ -44,27 +46,37 @@ __invalid:
b .
/*
- * x0: HYP pgd
- * x1: HYP stack
- * x2: HYP vectors
- * x3: per-CPU offset
+ * x0: SMCCC function ID
+ * x1: HYP pgd
+ * x2: per-CPU offset
+ * x3: HYP stack
+ * x4: HYP vectors
*/
__do_hyp_init:
/* Check for a stub HVC call */
cmp x0, #HVC_STUB_HCALL_NR
b.lo __kvm_handle_stub_hvc
- phys_to_ttbr x4, x0
+ /* Set tpidr_el2 for use by HYP to free a register */
+ msr tpidr_el2, x2
+
+ mov x2, #KVM_HOST_SMCCC_FUNC(__kvm_hyp_init)
+ cmp x0, x2
+ b.eq 1f
+ mov x0, #SMCCC_RET_NOT_SUPPORTED
+ eret
+
+1: phys_to_ttbr x0, x1
alternative_if ARM64_HAS_CNP
- orr x4, x4, #TTBR_CNP_BIT
+ orr x0, x0, #TTBR_CNP_BIT
alternative_else_nop_endif
- msr ttbr0_el2, x4
+ msr ttbr0_el2, x0
- mrs x4, tcr_el1
- mov_q x5, TCR_EL2_MASK
- and x4, x4, x5
- mov x5, #TCR_EL2_RES1
- orr x4, x4, x5
+ mrs x0, tcr_el1
+ mov_q x1, TCR_EL2_MASK
+ and x0, x0, x1
+ mov x1, #TCR_EL2_RES1
+ orr x0, x0, x1
/*
* The ID map may be configured to use an extended virtual address
@@ -80,18 +92,18 @@ alternative_else_nop_endif
*
* So use the same T0SZ value we use for the ID map.
*/
- ldr_l x5, idmap_t0sz
- bfi x4, x5, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH
+ ldr_l x1, idmap_t0sz
+ bfi x0, x1, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH
/*
* Set the PS bits in TCR_EL2.
*/
- tcr_compute_pa_size x4, #TCR_EL2_PS_SHIFT, x5, x6
+ tcr_compute_pa_size x0, #TCR_EL2_PS_SHIFT, x1, x2
- msr tcr_el2, x4
+ msr tcr_el2, x0
- mrs x4, mair_el1
- msr mair_el2, x4
+ mrs x0, mair_el1
+ msr mair_el2, x0
isb
/* Invalidate the stale TLBs from Bootloader */
@@ -103,25 +115,22 @@ alternative_else_nop_endif
* as well as the EE bit on BE. Drop the A flag since the compiler
* is allowed to generate unaligned accesses.
*/
- mov_q x4, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A))
-CPU_BE( orr x4, x4, #SCTLR_ELx_EE)
+ mov_q x0, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A))
+CPU_BE( orr x0, x0, #SCTLR_ELx_EE)
alternative_if ARM64_HAS_ADDRESS_AUTH
- mov_q x5, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \
+ mov_q x1, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \
SCTLR_ELx_ENDA | SCTLR_ELx_ENDB)
- orr x4, x4, x5
+ orr x0, x0, x1
alternative_else_nop_endif
- msr sctlr_el2, x4
+ msr sctlr_el2, x0
isb
/* Set the stack and new vectors */
- kern_hyp_va x1
- mov sp, x1
- msr vbar_el2, x2
-
- /* Set tpidr_el2 for use by HYP */
- msr tpidr_el2, x3
+ mov sp, x3
+ msr vbar_el2, x4
/* Hello, World! */
+ mov x0, #SMCCC_RET_SUCCESS
eret
SYM_CODE_END(__kvm_hyp_init)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
new file mode 100644
index 000000000000..e2eafe2c93af
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 - Google Inc
+ * Author: Andrew Scull <ascull@google.com>
+ */
+
+#include <hyp/switch.h>
+
+#include <asm/kvm_asm.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_host.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+
+#include <kvm/arm_hypercalls.h>
+
+static void handle_host_hcall(unsigned long func_id,
+ struct kvm_cpu_context *host_ctxt)
+{
+ unsigned long ret = 0;
+
+ switch (func_id) {
+ case KVM_HOST_SMCCC_FUNC(__kvm_vcpu_run): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *)r1;
+
+ ret = __kvm_vcpu_run(kern_hyp_va(vcpu));
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__kvm_flush_vm_context):
+ __kvm_flush_vm_context();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid_ipa): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
+ phys_addr_t ipa = host_ctxt->regs.regs[2];
+ int level = host_ctxt->regs.regs[3];
+
+ __kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level);
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
+
+ __kvm_tlb_flush_vmid(kern_hyp_va(mmu));
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_local_vmid): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
+
+ __kvm_tlb_flush_local_vmid(kern_hyp_va(mmu));
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__kvm_timer_set_cntvoff): {
+ u64 cntvoff = host_ctxt->regs.regs[1];
+
+ __kvm_timer_set_cntvoff(cntvoff);
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__kvm_enable_ssbs):
+ __kvm_enable_ssbs();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_get_ich_vtr_el2):
+ ret = __vgic_v3_get_ich_vtr_el2();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_read_vmcr):
+ ret = __vgic_v3_read_vmcr();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_write_vmcr): {
+ u32 vmcr = host_ctxt->regs.regs[1];
+
+ __vgic_v3_write_vmcr(vmcr);
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_init_lrs):
+ __vgic_v3_init_lrs();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__kvm_get_mdcr_el2):
+ ret = __kvm_get_mdcr_el2();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_save_aprs): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1;
+
+ __vgic_v3_save_aprs(kern_hyp_va(cpu_if));
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_restore_aprs): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1;
+
+ __vgic_v3_restore_aprs(kern_hyp_va(cpu_if));
+ break;
+ }
+ default:
+ /* Invalid host HVC. */
+ host_ctxt->regs.regs[0] = SMCCC_RET_NOT_SUPPORTED;
+ return;
+ }
+
+ host_ctxt->regs.regs[0] = SMCCC_RET_SUCCESS;
+ host_ctxt->regs.regs[1] = ret;
+}
+
+void handle_trap(struct kvm_cpu_context *host_ctxt)
+{
+ u64 esr = read_sysreg_el2(SYS_ESR);
+ unsigned long func_id;
+
+ if (ESR_ELx_EC(esr) != ESR_ELx_EC_HVC64)
+ hyp_panic();
+
+ func_id = host_ctxt->regs.regs[0];
+ handle_host_hcall(func_id, host_ctxt);
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 4472558cbdd9..a457a0306e03 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -27,11 +27,10 @@
#include <asm/processor.h>
#include <asm/thread_info.h>
-/* Non-VHE copy of the kernel symbol. */
-DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);
-
-/* Non-VHE instance of kvm_host_data. */
-DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data);
+/* Non-VHE specific context */
+DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
+DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
+DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
static void __activate_traps(struct kvm_vcpu *vcpu)
{
@@ -48,6 +47,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
}
write_sysreg(val, cptr_el2);
+ write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2);
if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt;
@@ -66,6 +66,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
static void __deactivate_traps(struct kvm_vcpu *vcpu)
{
+ extern char __kvm_hyp_host_vector[];
u64 mdcr_el2;
___deactivate_traps(vcpu);
@@ -97,9 +98,10 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
write_sysreg(mdcr_el2, mdcr_el2);
write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2);
write_sysreg(CPTR_EL2_DEFAULT, cptr_el2);
+ write_sysreg(__kvm_hyp_host_vector, vbar_el2);
}
-static void __deactivate_vm(struct kvm_vcpu *vcpu)
+static void __load_host_stage2(void)
{
write_sysreg(0, vttbr_el2);
}
@@ -179,8 +181,6 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
pmr_sync();
}
- vcpu = kern_hyp_va(vcpu);
-
host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
host_ctxt->__hyp_running_vcpu = vcpu;
guest_ctxt = &vcpu->arch.ctxt;
@@ -200,7 +200,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
__sysreg32_restore_state(vcpu);
__sysreg_restore_state_nvhe(guest_ctxt);
- __activate_vm(kern_hyp_va(vcpu->arch.hw_mmu));
+ __load_guest_stage2(kern_hyp_va(vcpu->arch.hw_mmu));
__activate_traps(vcpu);
__hyp_vgic_restore_state(vcpu);
@@ -210,7 +210,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
do {
/* Jump in the fire! */
- exit_code = __guest_enter(vcpu, host_ctxt);
+ exit_code = __guest_enter(vcpu);
/* And we're baaack! */
} while (fixup_guest_exit(vcpu, &exit_code));
@@ -221,7 +221,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
__hyp_vgic_save_state(vcpu);
__deactivate_traps(vcpu);
- __deactivate_vm(vcpu);
+ __load_host_stage2();
__sysreg_restore_state_nvhe(host_ctxt);
@@ -241,35 +241,31 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
if (system_uses_irq_prio_masking())
gic_write_pmr(GIC_PRIO_IRQOFF);
+ host_ctxt->__hyp_running_vcpu = NULL;
+
return exit_code;
}
-void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt)
+void __noreturn hyp_panic(void)
{
u64 spsr = read_sysreg_el2(SYS_SPSR);
u64 elr = read_sysreg_el2(SYS_ELR);
u64 par = read_sysreg(par_el1);
- struct kvm_vcpu *vcpu = host_ctxt->__hyp_running_vcpu;
- unsigned long str_va;
+ bool restore_host = true;
+ struct kvm_cpu_context *host_ctxt;
+ struct kvm_vcpu *vcpu;
- if (read_sysreg(vttbr_el2)) {
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ vcpu = host_ctxt->__hyp_running_vcpu;
+
+ if (vcpu) {
__timer_disable_traps(vcpu);
__deactivate_traps(vcpu);
- __deactivate_vm(vcpu);
+ __load_host_stage2();
__sysreg_restore_state_nvhe(host_ctxt);
}
- /*
- * Force the panic string to be loaded from the literal pool,
- * making sure it is a kernel address and not a PC-relative
- * reference.
- */
- asm volatile("ldr %0, =%1" : "=r" (str_va) : "S" (__hyp_panic_string));
-
- __hyp_do_panic(str_va,
- spsr, elr,
- read_sysreg(esr_el2), read_sysreg_el2(SYS_FAR),
- read_sysreg(hpfar_el2), par, vcpu);
+ __hyp_do_panic(restore_host, spsr, elr, par);
unreachable();
}
diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
index 69eae608d670..544bca3072b7 100644
--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -54,7 +54,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
dsb(ishst);
/* Switch to requested VMID */
- mmu = kern_hyp_va(mmu);
__tlb_switch_to_guest(mmu, &cxt);
/*
@@ -108,7 +107,6 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
dsb(ishst);
/* Switch to requested VMID */
- mmu = kern_hyp_va(mmu);
__tlb_switch_to_guest(mmu, &cxt);
__tlbi(vmalls12e1is);
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
new file mode 100644
index 000000000000..603d6b415337
--- /dev/null
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -0,0 +1,883 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Stand-alone page-table allocator for hyp stage-1 and guest stage-2.
+ * No bombay mix was harmed in the writing of this file.
+ *
+ * Copyright (C) 2020 Google LLC
+ * Author: Will Deacon <will@kernel.org>
+ */
+
+#include <linux/bitfield.h>
+#include <asm/kvm_pgtable.h>
+
+#define KVM_PGTABLE_MAX_LEVELS 4U
+
+#define KVM_PTE_VALID BIT(0)
+
+#define KVM_PTE_TYPE BIT(1)
+#define KVM_PTE_TYPE_BLOCK 0
+#define KVM_PTE_TYPE_PAGE 1
+#define KVM_PTE_TYPE_TABLE 1
+
+#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT)
+#define KVM_PTE_ADDR_51_48 GENMASK(15, 12)
+
+#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2)
+
+#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2)
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6)
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1
+#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8)
+#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3
+#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10)
+
+#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2)
+#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6)
+#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7)
+#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8)
+#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3
+#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10)
+
+#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51)
+
+#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54)
+
+#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
+
+struct kvm_pgtable_walk_data {
+ struct kvm_pgtable *pgt;
+ struct kvm_pgtable_walker *walker;
+
+ u64 addr;
+ u64 end;
+};
+
+static u64 kvm_granule_shift(u32 level)
+{
+ /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */
+ return ARM64_HW_PGTABLE_LEVEL_SHIFT(level);
+}
+
+static u64 kvm_granule_size(u32 level)
+{
+ return BIT(kvm_granule_shift(level));
+}
+
+static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
+{
+ u64 granule = kvm_granule_size(level);
+
+ /*
+ * Reject invalid block mappings and don't bother with 4TB mappings for
+ * 52-bit PAs.
+ */
+ if (level == 0 || (PAGE_SIZE != SZ_4K && level == 1))
+ return false;
+
+ if (granule > (end - addr))
+ return false;
+
+ return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
+}
+
+static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
+{
+ u64 shift = kvm_granule_shift(level);
+ u64 mask = BIT(PAGE_SHIFT - 3) - 1;
+
+ return (data->addr >> shift) & mask;
+}
+
+static u32 __kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
+{
+ u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */
+ u64 mask = BIT(pgt->ia_bits) - 1;
+
+ return (addr & mask) >> shift;
+}
+
+static u32 kvm_pgd_page_idx(struct kvm_pgtable_walk_data *data)
+{
+ return __kvm_pgd_page_idx(data->pgt, data->addr);
+}
+
+static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
+{
+ struct kvm_pgtable pgt = {
+ .ia_bits = ia_bits,
+ .start_level = start_level,
+ };
+
+ return __kvm_pgd_page_idx(&pgt, -1ULL) + 1;
+}
+
+static bool kvm_pte_valid(kvm_pte_t pte)
+{
+ return pte & KVM_PTE_VALID;
+}
+
+static bool kvm_pte_table(kvm_pte_t pte, u32 level)
+{
+ if (level == KVM_PGTABLE_MAX_LEVELS - 1)
+ return false;
+
+ if (!kvm_pte_valid(pte))
+ return false;
+
+ return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
+}
+
+static u64 kvm_pte_to_phys(kvm_pte_t pte)
+{
+ u64 pa = pte & KVM_PTE_ADDR_MASK;
+
+ if (PAGE_SHIFT == 16)
+ pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48;
+
+ return pa;
+}
+
+static kvm_pte_t kvm_phys_to_pte(u64 pa)
+{
+ kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
+
+ if (PAGE_SHIFT == 16)
+ pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
+
+ return pte;
+}
+
+static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte)
+{
+ return __va(kvm_pte_to_phys(pte));
+}
+
+static void kvm_set_invalid_pte(kvm_pte_t *ptep)
+{
+ kvm_pte_t pte = *ptep;
+ WRITE_ONCE(*ptep, pte & ~KVM_PTE_VALID);
+}
+
+static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp)
+{
+ kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(__pa(childp));
+
+ pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
+ pte |= KVM_PTE_VALID;
+
+ WARN_ON(kvm_pte_valid(old));
+ smp_store_release(ptep, pte);
+}
+
+static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr,
+ u32 level)
+{
+ kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(pa);
+ u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
+ KVM_PTE_TYPE_BLOCK;
+
+ pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
+ pte |= FIELD_PREP(KVM_PTE_TYPE, type);
+ pte |= KVM_PTE_VALID;
+
+ /* Tolerate KVM recreating the exact same mapping. */
+ if (kvm_pte_valid(old))
+ return old == pte;
+
+ smp_store_release(ptep, pte);
+ return true;
+}
+
+static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
+ u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag)
+{
+ struct kvm_pgtable_walker *walker = data->walker;
+ return walker->cb(addr, data->end, level, ptep, flag, walker->arg);
+}
+
+static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
+ kvm_pte_t *pgtable, u32 level);
+
+static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
+ kvm_pte_t *ptep, u32 level)
+{
+ int ret = 0;
+ u64 addr = data->addr;
+ kvm_pte_t *childp, pte = *ptep;
+ bool table = kvm_pte_table(pte, level);
+ enum kvm_pgtable_walk_flags flags = data->walker->flags;
+
+ if (table && (flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
+ ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+ KVM_PGTABLE_WALK_TABLE_PRE);
+ }
+
+ if (!table && (flags & KVM_PGTABLE_WALK_LEAF)) {
+ ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+ KVM_PGTABLE_WALK_LEAF);
+ pte = *ptep;
+ table = kvm_pte_table(pte, level);
+ }
+
+ if (ret)
+ goto out;
+
+ if (!table) {
+ data->addr += kvm_granule_size(level);
+ goto out;
+ }
+
+ childp = kvm_pte_follow(pte);
+ ret = __kvm_pgtable_walk(data, childp, level + 1);
+ if (ret)
+ goto out;
+
+ if (flags & KVM_PGTABLE_WALK_TABLE_POST) {
+ ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+ KVM_PGTABLE_WALK_TABLE_POST);
+ }
+
+out:
+ return ret;
+}
+
+static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
+ kvm_pte_t *pgtable, u32 level)
+{
+ u32 idx;
+ int ret = 0;
+
+ if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS))
+ return -EINVAL;
+
+ for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
+ kvm_pte_t *ptep = &pgtable[idx];
+
+ if (data->addr >= data->end)
+ break;
+
+ ret = __kvm_pgtable_visit(data, ptep, level);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data)
+{
+ u32 idx;
+ int ret = 0;
+ struct kvm_pgtable *pgt = data->pgt;
+ u64 limit = BIT(pgt->ia_bits);
+
+ if (data->addr > limit || data->end > limit)
+ return -ERANGE;
+
+ if (!pgt->pgd)
+ return -EINVAL;
+
+ for (idx = kvm_pgd_page_idx(data); data->addr < data->end; ++idx) {
+ kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE];
+
+ ret = __kvm_pgtable_walk(data, ptep, pgt->start_level);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ struct kvm_pgtable_walker *walker)
+{
+ struct kvm_pgtable_walk_data walk_data = {
+ .pgt = pgt,
+ .addr = ALIGN_DOWN(addr, PAGE_SIZE),
+ .end = PAGE_ALIGN(walk_data.addr + size),
+ .walker = walker,
+ };
+
+ return _kvm_pgtable_walk(&walk_data);
+}
+
+struct hyp_map_data {
+ u64 phys;
+ kvm_pte_t attr;
+};
+
+static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot,
+ struct hyp_map_data *data)
+{
+ bool device = prot & KVM_PGTABLE_PROT_DEVICE;
+ u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
+ kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype);
+ u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS;
+ u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW :
+ KVM_PTE_LEAF_ATTR_LO_S1_AP_RO;
+
+ if (!(prot & KVM_PGTABLE_PROT_R))
+ return -EINVAL;
+
+ if (prot & KVM_PGTABLE_PROT_X) {
+ if (prot & KVM_PGTABLE_PROT_W)
+ return -EINVAL;
+
+ if (device)
+ return -EINVAL;
+ } else {
+ attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
+ }
+
+ attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
+ attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
+ attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
+ data->attr = attr;
+ return 0;
+}
+
+static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep, struct hyp_map_data *data)
+{
+ u64 granule = kvm_granule_size(level), phys = data->phys;
+
+ if (!kvm_block_mapping_supported(addr, end, phys, level))
+ return false;
+
+ WARN_ON(!kvm_set_valid_leaf_pte(ptep, phys, data->attr, level));
+ data->phys += granule;
+ return true;
+}
+
+static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag, void * const arg)
+{
+ kvm_pte_t *childp;
+
+ if (hyp_map_walker_try_leaf(addr, end, level, ptep, arg))
+ return 0;
+
+ if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
+ return -EINVAL;
+
+ childp = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
+ if (!childp)
+ return -ENOMEM;
+
+ kvm_set_table_pte(ptep, childp);
+ return 0;
+}
+
+int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
+ enum kvm_pgtable_prot prot)
+{
+ int ret;
+ struct hyp_map_data map_data = {
+ .phys = ALIGN_DOWN(phys, PAGE_SIZE),
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = hyp_map_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ .arg = &map_data,
+ };
+
+ ret = hyp_map_set_prot_attr(prot, &map_data);
+ if (ret)
+ return ret;
+
+ ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+ dsb(ishst);
+ isb();
+ return ret;
+}
+
+int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits)
+{
+ u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits);
+
+ pgt->pgd = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
+ if (!pgt->pgd)
+ return -ENOMEM;
+
+ pgt->ia_bits = va_bits;
+ pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels;
+ pgt->mmu = NULL;
+ return 0;
+}
+
+static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag, void * const arg)
+{
+ free_page((unsigned long)kvm_pte_follow(*ptep));
+ return 0;
+}
+
+void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
+{
+ struct kvm_pgtable_walker walker = {
+ .cb = hyp_free_walker,
+ .flags = KVM_PGTABLE_WALK_TABLE_POST,
+ };
+
+ WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
+ free_page((unsigned long)pgt->pgd);
+ pgt->pgd = NULL;
+}
+
+struct stage2_map_data {
+ u64 phys;
+ kvm_pte_t attr;
+
+ kvm_pte_t *anchor;
+
+ struct kvm_s2_mmu *mmu;
+ struct kvm_mmu_memory_cache *memcache;
+};
+
+static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
+ struct stage2_map_data *data)
+{
+ bool device = prot & KVM_PGTABLE_PROT_DEVICE;
+ kvm_pte_t attr = device ? PAGE_S2_MEMATTR(DEVICE_nGnRE) :
+ PAGE_S2_MEMATTR(NORMAL);
+ u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
+
+ if (!(prot & KVM_PGTABLE_PROT_X))
+ attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
+ else if (device)
+ return -EINVAL;
+
+ if (prot & KVM_PGTABLE_PROT_R)
+ attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
+
+ if (prot & KVM_PGTABLE_PROT_W)
+ attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
+
+ attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
+ attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
+ data->attr = attr;
+ return 0;
+}
+
+static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep,
+ struct stage2_map_data *data)
+{
+ u64 granule = kvm_granule_size(level), phys = data->phys;
+
+ if (!kvm_block_mapping_supported(addr, end, phys, level))
+ return false;
+
+ if (kvm_set_valid_leaf_pte(ptep, phys, data->attr, level))
+ goto out;
+
+ /* There's an existing valid leaf entry, so perform break-before-make */
+ kvm_set_invalid_pte(ptep);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
+ kvm_set_valid_leaf_pte(ptep, phys, data->attr, level);
+out:
+ data->phys += granule;
+ return true;
+}
+
+static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep,
+ struct stage2_map_data *data)
+{
+ if (data->anchor)
+ return 0;
+
+ if (!kvm_block_mapping_supported(addr, end, data->phys, level))
+ return 0;
+
+ kvm_set_invalid_pte(ptep);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, 0);
+ data->anchor = ptep;
+ return 0;
+}
+
+static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ struct stage2_map_data *data)
+{
+ kvm_pte_t *childp, pte = *ptep;
+ struct page *page = virt_to_page(ptep);
+
+ if (data->anchor) {
+ if (kvm_pte_valid(pte))
+ put_page(page);
+
+ return 0;
+ }
+
+ if (stage2_map_walker_try_leaf(addr, end, level, ptep, data))
+ goto out_get_page;
+
+ if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
+ return -EINVAL;
+
+ if (!data->memcache)
+ return -ENOMEM;
+
+ childp = kvm_mmu_memory_cache_alloc(data->memcache);
+ if (!childp)
+ return -ENOMEM;
+
+ /*
+ * If we've run into an existing block mapping then replace it with
+ * a table. Accesses beyond 'end' that fall within the new table
+ * will be mapped lazily.
+ */
+ if (kvm_pte_valid(pte)) {
+ kvm_set_invalid_pte(ptep);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
+ put_page(page);
+ }
+
+ kvm_set_table_pte(ptep, childp);
+
+out_get_page:
+ get_page(page);
+ return 0;
+}
+
+static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep,
+ struct stage2_map_data *data)
+{
+ int ret = 0;
+
+ if (!data->anchor)
+ return 0;
+
+ free_page((unsigned long)kvm_pte_follow(*ptep));
+ put_page(virt_to_page(ptep));
+
+ if (data->anchor == ptep) {
+ data->anchor = NULL;
+ ret = stage2_map_walk_leaf(addr, end, level, ptep, data);
+ }
+
+ return ret;
+}
+
+/*
+ * This is a little fiddly, as we use all three of the walk flags. The idea
+ * is that the TABLE_PRE callback runs for table entries on the way down,
+ * looking for table entries which we could conceivably replace with a
+ * block entry for this mapping. If it finds one, then it sets the 'anchor'
+ * field in 'struct stage2_map_data' to point at the table entry, before
+ * clearing the entry to zero and descending into the now detached table.
+ *
+ * The behaviour of the LEAF callback then depends on whether or not the
+ * anchor has been set. If not, then we're not using a block mapping higher
+ * up the table and we perform the mapping at the existing leaves instead.
+ * If, on the other hand, the anchor _is_ set, then we drop references to
+ * all valid leaves so that the pages beneath the anchor can be freed.
+ *
+ * Finally, the TABLE_POST callback does nothing if the anchor has not
+ * been set, but otherwise frees the page-table pages while walking back up
+ * the page-table, installing the block entry when it revisits the anchor
+ * pointer and clearing the anchor to NULL.
+ */
+static int stage2_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag, void * const arg)
+{
+ struct stage2_map_data *data = arg;
+
+ switch (flag) {
+ case KVM_PGTABLE_WALK_TABLE_PRE:
+ return stage2_map_walk_table_pre(addr, end, level, ptep, data);
+ case KVM_PGTABLE_WALK_LEAF:
+ return stage2_map_walk_leaf(addr, end, level, ptep, data);
+ case KVM_PGTABLE_WALK_TABLE_POST:
+ return stage2_map_walk_table_post(addr, end, level, ptep, data);
+ }
+
+ return -EINVAL;
+}
+
+int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ u64 phys, enum kvm_pgtable_prot prot,
+ struct kvm_mmu_memory_cache *mc)
+{
+ int ret;
+ struct stage2_map_data map_data = {
+ .phys = ALIGN_DOWN(phys, PAGE_SIZE),
+ .mmu = pgt->mmu,
+ .memcache = mc,
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_map_walker,
+ .flags = KVM_PGTABLE_WALK_TABLE_PRE |
+ KVM_PGTABLE_WALK_LEAF |
+ KVM_PGTABLE_WALK_TABLE_POST,
+ .arg = &map_data,
+ };
+
+ ret = stage2_map_set_prot_attr(prot, &map_data);
+ if (ret)
+ return ret;
+
+ ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+ dsb(ishst);
+ return ret;
+}
+
+static void stage2_flush_dcache(void *addr, u64 size)
+{
+ if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+ return;
+
+ __flush_dcache_area(addr, size);
+}
+
+static bool stage2_pte_cacheable(kvm_pte_t pte)
+{
+ u64 memattr = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR, pte);
+ return memattr == PAGE_S2_MEMATTR(NORMAL);
+}
+
+static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag,
+ void * const arg)
+{
+ struct kvm_s2_mmu *mmu = arg;
+ kvm_pte_t pte = *ptep, *childp = NULL;
+ bool need_flush = false;
+
+ if (!kvm_pte_valid(pte))
+ return 0;
+
+ if (kvm_pte_table(pte, level)) {
+ childp = kvm_pte_follow(pte);
+
+ if (page_count(virt_to_page(childp)) != 1)
+ return 0;
+ } else if (stage2_pte_cacheable(pte)) {
+ need_flush = true;
+ }
+
+ /*
+ * This is similar to the map() path in that we unmap the entire
+ * block entry and rely on the remaining portions being faulted
+ * back lazily.
+ */
+ kvm_set_invalid_pte(ptep);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level);
+ put_page(virt_to_page(ptep));
+
+ if (need_flush) {
+ stage2_flush_dcache(kvm_pte_follow(pte),
+ kvm_granule_size(level));
+ }
+
+ if (childp)
+ free_page((unsigned long)childp);
+
+ return 0;
+}
+
+int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_unmap_walker,
+ .arg = pgt->mmu,
+ .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
+ };
+
+ return kvm_pgtable_walk(pgt, addr, size, &walker);
+}
+
+struct stage2_attr_data {
+ kvm_pte_t attr_set;
+ kvm_pte_t attr_clr;
+ kvm_pte_t pte;
+};
+
+static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag,
+ void * const arg)
+{
+ kvm_pte_t pte = *ptep;
+ struct stage2_attr_data *data = arg;
+
+ if (!kvm_pte_valid(pte))
+ return 0;
+
+ data->pte = pte;
+ pte &= ~data->attr_clr;
+ pte |= data->attr_set;
+
+ /*
+ * We may race with the CPU trying to set the access flag here,
+ * but worst-case the access flag update gets lost and will be
+ * set on the next access instead.
+ */
+ if (data->pte != pte)
+ WRITE_ONCE(*ptep, pte);
+
+ return 0;
+}
+
+static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
+ u64 size, kvm_pte_t attr_set,
+ kvm_pte_t attr_clr, kvm_pte_t *orig_pte)
+{
+ int ret;
+ kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
+ struct stage2_attr_data data = {
+ .attr_set = attr_set & attr_mask,
+ .attr_clr = attr_clr & attr_mask,
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_attr_walker,
+ .arg = &data,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ };
+
+ ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+ if (ret)
+ return ret;
+
+ if (orig_pte)
+ *orig_pte = data.pte;
+ return 0;
+}
+
+int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+ return stage2_update_leaf_attrs(pgt, addr, size, 0,
+ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, NULL);
+}
+
+kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
+{
+ kvm_pte_t pte = 0;
+ stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
+ &pte);
+ dsb(ishst);
+ return pte;
+}
+
+kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
+{
+ kvm_pte_t pte = 0;
+ stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF,
+ &pte);
+ /*
+ * "But where's the TLBI?!", you scream.
+ * "Over in the core code", I sigh.
+ *
+ * See the '->clear_flush_young()' callback on the KVM mmu notifier.
+ */
+ return pte;
+}
+
+bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
+{
+ kvm_pte_t pte = 0;
+ stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte);
+ return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
+}
+
+int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
+ enum kvm_pgtable_prot prot)
+{
+ int ret;
+ kvm_pte_t set = 0, clr = 0;
+
+ if (prot & KVM_PGTABLE_PROT_R)
+ set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
+
+ if (prot & KVM_PGTABLE_PROT_W)
+ set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
+
+ if (prot & KVM_PGTABLE_PROT_X)
+ clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
+
+ ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, 0);
+ return ret;
+}
+
+static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag,
+ void * const arg)
+{
+ kvm_pte_t pte = *ptep;
+
+ if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pte))
+ return 0;
+
+ stage2_flush_dcache(kvm_pte_follow(pte), kvm_granule_size(level));
+ return 0;
+}
+
+int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_flush_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ };
+
+ if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+ return 0;
+
+ return kvm_pgtable_walk(pgt, addr, size, &walker);
+}
+
+int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm)
+{
+ size_t pgd_sz;
+ u64 vtcr = kvm->arch.vtcr;
+ u32 ia_bits = VTCR_EL2_IPA(vtcr);
+ u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
+ u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
+
+ pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
+ pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL | __GFP_ZERO);
+ if (!pgt->pgd)
+ return -ENOMEM;
+
+ pgt->ia_bits = ia_bits;
+ pgt->start_level = start_level;
+ pgt->mmu = &kvm->arch.mmu;
+
+ /* Ensure zeroed PGD pages are visible to the hardware walker */
+ dsb(ishst);
+ return 0;
+}
+
+static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag,
+ void * const arg)
+{
+ kvm_pte_t pte = *ptep;
+
+ if (!kvm_pte_valid(pte))
+ return 0;
+
+ put_page(virt_to_page(ptep));
+
+ if (kvm_pte_table(pte, level))
+ free_page((unsigned long)kvm_pte_follow(pte));
+
+ return 0;
+}
+
+void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+{
+ size_t pgd_sz;
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_free_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF |
+ KVM_PGTABLE_WALK_TABLE_POST,
+ };
+
+ WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
+ pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
+ free_pages_exact(pgt->pgd, pgd_sz);
+ pgt->pgd = NULL;
+}
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index a8d407532798..fe69de16dadc 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -28,8 +28,10 @@
const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n";
-/* VHE instance of kvm_host_data. */
-DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data);
+/* VHE specific context */
+DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
+DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
+DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
static void __activate_traps(struct kvm_vcpu *vcpu)
{
@@ -62,7 +64,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
write_sysreg(val, cpacr_el1);
- write_sysreg(kvm_get_hyp_vector(), vbar_el1);
+ write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1);
}
NOKPROBE_SYMBOL(__activate_traps);
@@ -123,12 +125,12 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
* HCR_EL2.TGE.
*
* We have already configured the guest's stage 1 translation in
- * kvm_vcpu_load_sysregs_vhe above. We must now call __activate_vm
- * before __activate_traps, because __activate_vm configures
- * stage 2 translation, and __activate_traps clear HCR_EL2.TGE
- * (among other things).
+ * kvm_vcpu_load_sysregs_vhe above. We must now call
+ * __load_guest_stage2 before __activate_traps, because
+ * __load_guest_stage2 configures stage 2 translation, and
+ * __activate_traps clear HCR_EL2.TGE (among other things).
*/
- __activate_vm(vcpu->arch.hw_mmu);
+ __load_guest_stage2(vcpu->arch.hw_mmu);
__activate_traps(vcpu);
sysreg_restore_guest_state_vhe(guest_ctxt);
@@ -136,7 +138,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
do {
/* Jump in the fire! */
- exit_code = __guest_enter(vcpu, host_ctxt);
+ exit_code = __guest_enter(vcpu);
/* And we're baaack! */
} while (fixup_guest_exit(vcpu, &exit_code));
@@ -191,10 +193,12 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
return ret;
}
-static void __hyp_call_panic(u64 spsr, u64 elr, u64 par,
- struct kvm_cpu_context *host_ctxt)
+static void __hyp_call_panic(u64 spsr, u64 elr, u64 par)
{
+ struct kvm_cpu_context *host_ctxt;
struct kvm_vcpu *vcpu;
+
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
vcpu = host_ctxt->__hyp_running_vcpu;
__deactivate_traps(vcpu);
@@ -207,13 +211,13 @@ static void __hyp_call_panic(u64 spsr, u64 elr, u64 par,
}
NOKPROBE_SYMBOL(__hyp_call_panic);
-void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt)
+void __noreturn hyp_panic(void)
{
u64 spsr = read_sysreg_el2(SYS_SPSR);
u64 elr = read_sysreg_el2(SYS_ELR);
u64 par = read_sysreg(par_el1);
- __hyp_call_panic(spsr, elr, par, host_ctxt);
+ __hyp_call_panic(spsr, elr, par);
unreachable();
}
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index ebfdfc27b2bd..34a96ab244fa 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -202,6 +202,7 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
/**
* kvm_inject_undefined - inject an undefined instruction into the guest
+ * @vcpu: The vCPU in which to inject the exception
*
* It is assumed that this code is called from the VCPU thread and that the
* VCPU therefore is not currently executing guest code.
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index ba00bcc0c884..c5c26a9cb85b 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -14,6 +14,7 @@
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
#include <asm/kvm_ras.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
@@ -21,9 +22,7 @@
#include "trace.h"
-static pgd_t *boot_hyp_pgd;
-static pgd_t *hyp_pgd;
-static pgd_t *merged_hyp_pgd;
+static struct kvm_pgtable *hyp_pgtable;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
static unsigned long hyp_idmap_start;
@@ -32,16 +31,42 @@ static phys_addr_t hyp_idmap_vector;
static unsigned long io_map_base;
-#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
-#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0)
-#define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1)
-
-static bool is_iomap(unsigned long flags)
+/*
+ * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
+ * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
+ * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
+ * long will also starve other vCPUs. We have to also make sure that the page
+ * tables are not freed while we released the lock.
+ */
+static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
+ phys_addr_t end,
+ int (*fn)(struct kvm_pgtable *, u64, u64),
+ bool resched)
{
- return flags & KVM_S2PTE_FLAG_IS_IOMAP;
+ int ret;
+ u64 next;
+
+ do {
+ struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
+ if (!pgt)
+ return -EINVAL;
+
+ next = stage2_pgd_addr_end(kvm, addr, end);
+ ret = fn(pgt, addr, next - addr);
+ if (ret)
+ break;
+
+ if (resched && next != end)
+ cond_resched_lock(&kvm->mmu_lock);
+ } while (addr = next, addr != end);
+
+ return ret;
}
+#define stage2_apply_range_resched(kvm, addr, end, fn) \
+ stage2_apply_range(kvm, addr, end, fn, true)
+
static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
@@ -58,154 +83,11 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
}
-static void kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
- int level)
-{
- kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ipa, level);
-}
-
-/*
- * D-Cache management functions. They take the page table entries by
- * value, as they are flushing the cache using the kernel mapping (or
- * kmap on 32bit).
- */
-static void kvm_flush_dcache_pte(pte_t pte)
-{
- __kvm_flush_dcache_pte(pte);
-}
-
-static void kvm_flush_dcache_pmd(pmd_t pmd)
-{
- __kvm_flush_dcache_pmd(pmd);
-}
-
-static void kvm_flush_dcache_pud(pud_t pud)
-{
- __kvm_flush_dcache_pud(pud);
-}
-
static bool kvm_is_device_pfn(unsigned long pfn)
{
return !pfn_valid(pfn);
}
-/**
- * stage2_dissolve_pmd() - clear and flush huge PMD entry
- * @mmu: pointer to mmu structure to operate on
- * @addr: IPA
- * @pmd: pmd pointer for IPA
- *
- * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
- */
-static void stage2_dissolve_pmd(struct kvm_s2_mmu *mmu, phys_addr_t addr, pmd_t *pmd)
-{
- if (!pmd_thp_or_huge(*pmd))
- return;
-
- pmd_clear(pmd);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
- put_page(virt_to_page(pmd));
-}
-
-/**
- * stage2_dissolve_pud() - clear and flush huge PUD entry
- * @mmu: pointer to mmu structure to operate on
- * @addr: IPA
- * @pud: pud pointer for IPA
- *
- * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
- */
-static void stage2_dissolve_pud(struct kvm_s2_mmu *mmu, phys_addr_t addr, pud_t *pudp)
-{
- struct kvm *kvm = mmu->kvm;
-
- if (!stage2_pud_huge(kvm, *pudp))
- return;
-
- stage2_pud_clear(kvm, pudp);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
- put_page(virt_to_page(pudp));
-}
-
-static void clear_stage2_pgd_entry(struct kvm_s2_mmu *mmu, pgd_t *pgd, phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL);
- stage2_pgd_clear(kvm, pgd);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
- stage2_p4d_free(kvm, p4d_table);
- put_page(virt_to_page(pgd));
-}
-
-static void clear_stage2_p4d_entry(struct kvm_s2_mmu *mmu, p4d_t *p4d, phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0);
- stage2_p4d_clear(kvm, p4d);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
- stage2_pud_free(kvm, pud_table);
- put_page(virt_to_page(p4d));
-}
-
-static void clear_stage2_pud_entry(struct kvm_s2_mmu *mmu, pud_t *pud, phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
-
- VM_BUG_ON(stage2_pud_huge(kvm, *pud));
- stage2_pud_clear(kvm, pud);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
- stage2_pmd_free(kvm, pmd_table);
- put_page(virt_to_page(pud));
-}
-
-static void clear_stage2_pmd_entry(struct kvm_s2_mmu *mmu, pmd_t *pmd, phys_addr_t addr)
-{
- pte_t *pte_table = pte_offset_kernel(pmd, 0);
- VM_BUG_ON(pmd_thp_or_huge(*pmd));
- pmd_clear(pmd);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
- free_page((unsigned long)pte_table);
- put_page(virt_to_page(pmd));
-}
-
-static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
-{
- WRITE_ONCE(*ptep, new_pte);
- dsb(ishst);
-}
-
-static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
-{
- WRITE_ONCE(*pmdp, new_pmd);
- dsb(ishst);
-}
-
-static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
-{
- kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
-}
-
-static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
-{
- WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
- dsb(ishst);
-}
-
-static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp)
-{
- WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp));
- dsb(ishst);
-}
-
-static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
-{
-#ifndef __PAGETABLE_P4D_FOLDED
- WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp));
- dsb(ishst);
-#endif
-}
-
/*
* Unmapping vs dcache management:
*
@@ -223,120 +105,19 @@ static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
* end up writing old data to disk.
*
* This is why right after unmapping a page/section and invalidating
- * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
- * the IO subsystem will never hit in the cache.
+ * the corresponding TLBs, we flush to make sure the IO subsystem will
+ * never hit in the cache.
*
* This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
* we then fully enforce cacheability of RAM, no matter what the guest
* does.
*/
-static void unmap_stage2_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
- phys_addr_t addr, phys_addr_t end)
-{
- phys_addr_t start_addr = addr;
- pte_t *pte, *start_pte;
-
- start_pte = pte = pte_offset_kernel(pmd, addr);
- do {
- if (!pte_none(*pte)) {
- pte_t old_pte = *pte;
-
- kvm_set_pte(pte, __pte(0));
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
-
- /* No need to invalidate the cache for device mappings */
- if (!kvm_is_device_pfn(pte_pfn(old_pte)))
- kvm_flush_dcache_pte(old_pte);
-
- put_page(virt_to_page(pte));
- }
- } while (pte++, addr += PAGE_SIZE, addr != end);
-
- if (stage2_pte_table_empty(mmu->kvm, start_pte))
- clear_stage2_pmd_entry(mmu, pmd, start_addr);
-}
-
-static void unmap_stage2_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- phys_addr_t next, start_addr = addr;
- pmd_t *pmd, *start_pmd;
-
- start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
- do {
- next = stage2_pmd_addr_end(kvm, addr, end);
- if (!pmd_none(*pmd)) {
- if (pmd_thp_or_huge(*pmd)) {
- pmd_t old_pmd = *pmd;
-
- pmd_clear(pmd);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
-
- kvm_flush_dcache_pmd(old_pmd);
-
- put_page(virt_to_page(pmd));
- } else {
- unmap_stage2_ptes(mmu, pmd, addr, next);
- }
- }
- } while (pmd++, addr = next, addr != end);
-
- if (stage2_pmd_table_empty(kvm, start_pmd))
- clear_stage2_pud_entry(mmu, pud, start_addr);
-}
-
-static void unmap_stage2_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- phys_addr_t next, start_addr = addr;
- pud_t *pud, *start_pud;
-
- start_pud = pud = stage2_pud_offset(kvm, p4d, addr);
- do {
- next = stage2_pud_addr_end(kvm, addr, end);
- if (!stage2_pud_none(kvm, *pud)) {
- if (stage2_pud_huge(kvm, *pud)) {
- pud_t old_pud = *pud;
-
- stage2_pud_clear(kvm, pud);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
- kvm_flush_dcache_pud(old_pud);
- put_page(virt_to_page(pud));
- } else {
- unmap_stage2_pmds(mmu, pud, addr, next);
- }
- }
- } while (pud++, addr = next, addr != end);
-
- if (stage2_pud_table_empty(kvm, start_pud))
- clear_stage2_p4d_entry(mmu, p4d, start_addr);
-}
-
-static void unmap_stage2_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- phys_addr_t next, start_addr = addr;
- p4d_t *p4d, *start_p4d;
-
- start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr);
- do {
- next = stage2_p4d_addr_end(kvm, addr, end);
- if (!stage2_p4d_none(kvm, *p4d))
- unmap_stage2_puds(mmu, p4d, addr, next);
- } while (p4d++, addr = next, addr != end);
-
- if (stage2_p4d_table_empty(kvm, start_p4d))
- clear_stage2_pgd_entry(mmu, pgd, start_addr);
-}
-
/**
* unmap_stage2_range -- Clear stage2 page table entries to unmap a range
- * @kvm: The VM pointer
+ * @mmu: The KVM stage-2 MMU pointer
* @start: The intermediate physical base address of the range to unmap
* @size: The size of the area to unmap
+ * @may_block: Whether or not we are permitted to block
*
* Clear a range of stage-2 mappings, lowering the various ref-counts. Must
* be called while holding mmu_lock (unless for freeing the stage2 pgd before
@@ -347,32 +128,12 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
bool may_block)
{
struct kvm *kvm = mmu->kvm;
- pgd_t *pgd;
- phys_addr_t addr = start, end = start + size;
- phys_addr_t next;
+ phys_addr_t end = start + size;
assert_spin_locked(&kvm->mmu_lock);
WARN_ON(size & ~PAGE_MASK);
-
- pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
- do {
- /*
- * Make sure the page table is still active, as another thread
- * could have possibly freed the page table, while we released
- * the lock.
- */
- if (!READ_ONCE(mmu->pgd))
- break;
- next = stage2_pgd_addr_end(kvm, addr, end);
- if (!stage2_pgd_none(kvm, *pgd))
- unmap_stage2_p4ds(mmu, pgd, addr, next);
- /*
- * If the range is too large, release the kvm->mmu_lock
- * to prevent starvation and lockup detector warnings.
- */
- if (may_block && next != end)
- cond_resched_lock(&kvm->mmu_lock);
- } while (pgd++, addr = next, addr != end);
+ WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
+ may_block));
}
static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
@@ -380,89 +141,13 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si
__unmap_stage2_range(mmu, start, size, true);
}
-static void stage2_flush_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
- phys_addr_t addr, phys_addr_t end)
-{
- pte_t *pte;
-
- pte = pte_offset_kernel(pmd, addr);
- do {
- if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
- kvm_flush_dcache_pte(*pte);
- } while (pte++, addr += PAGE_SIZE, addr != end);
-}
-
-static void stage2_flush_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- pmd_t *pmd;
- phys_addr_t next;
-
- pmd = stage2_pmd_offset(kvm, pud, addr);
- do {
- next = stage2_pmd_addr_end(kvm, addr, end);
- if (!pmd_none(*pmd)) {
- if (pmd_thp_or_huge(*pmd))
- kvm_flush_dcache_pmd(*pmd);
- else
- stage2_flush_ptes(mmu, pmd, addr, next);
- }
- } while (pmd++, addr = next, addr != end);
-}
-
-static void stage2_flush_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pud;
- phys_addr_t next;
-
- pud = stage2_pud_offset(kvm, p4d, addr);
- do {
- next = stage2_pud_addr_end(kvm, addr, end);
- if (!stage2_pud_none(kvm, *pud)) {
- if (stage2_pud_huge(kvm, *pud))
- kvm_flush_dcache_pud(*pud);
- else
- stage2_flush_pmds(mmu, pud, addr, next);
- }
- } while (pud++, addr = next, addr != end);
-}
-
-static void stage2_flush_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- p4d_t *p4d;
- phys_addr_t next;
-
- p4d = stage2_p4d_offset(kvm, pgd, addr);
- do {
- next = stage2_p4d_addr_end(kvm, addr, end);
- if (!stage2_p4d_none(kvm, *p4d))
- stage2_flush_puds(mmu, p4d, addr, next);
- } while (p4d++, addr = next, addr != end);
-}
-
static void stage2_flush_memslot(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
- struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
- phys_addr_t next;
- pgd_t *pgd;
- pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
- do {
- next = stage2_pgd_addr_end(kvm, addr, end);
- if (!stage2_pgd_none(kvm, *pgd))
- stage2_flush_p4ds(mmu, pgd, addr, next);
-
- if (next != end)
- cond_resched_lock(&kvm->mmu_lock);
- } while (pgd++, addr = next, addr != end);
+ stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
}
/**
@@ -489,338 +174,28 @@ static void stage2_flush_vm(struct kvm *kvm)
srcu_read_unlock(&kvm->srcu, idx);
}
-static void clear_hyp_pgd_entry(pgd_t *pgd)
-{
- p4d_t *p4d_table __maybe_unused = p4d_offset(pgd, 0UL);
- pgd_clear(pgd);
- p4d_free(NULL, p4d_table);
- put_page(virt_to_page(pgd));
-}
-
-static void clear_hyp_p4d_entry(p4d_t *p4d)
-{
- pud_t *pud_table __maybe_unused = pud_offset(p4d, 0UL);
- VM_BUG_ON(p4d_huge(*p4d));
- p4d_clear(p4d);
- pud_free(NULL, pud_table);
- put_page(virt_to_page(p4d));
-}
-
-static void clear_hyp_pud_entry(pud_t *pud)
-{
- pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
- VM_BUG_ON(pud_huge(*pud));
- pud_clear(pud);
- pmd_free(NULL, pmd_table);
- put_page(virt_to_page(pud));
-}
-
-static void clear_hyp_pmd_entry(pmd_t *pmd)
-{
- pte_t *pte_table = pte_offset_kernel(pmd, 0);
- VM_BUG_ON(pmd_thp_or_huge(*pmd));
- pmd_clear(pmd);
- pte_free_kernel(NULL, pte_table);
- put_page(virt_to_page(pmd));
-}
-
-static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
-{
- pte_t *pte, *start_pte;
-
- start_pte = pte = pte_offset_kernel(pmd, addr);
- do {
- if (!pte_none(*pte)) {
- kvm_set_pte(pte, __pte(0));
- put_page(virt_to_page(pte));
- }
- } while (pte++, addr += PAGE_SIZE, addr != end);
-
- if (hyp_pte_table_empty(start_pte))
- clear_hyp_pmd_entry(pmd);
-}
-
-static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
-{
- phys_addr_t next;
- pmd_t *pmd, *start_pmd;
-
- start_pmd = pmd = pmd_offset(pud, addr);
- do {
- next = pmd_addr_end(addr, end);
- /* Hyp doesn't use huge pmds */
- if (!pmd_none(*pmd))
- unmap_hyp_ptes(pmd, addr, next);
- } while (pmd++, addr = next, addr != end);
-
- if (hyp_pmd_table_empty(start_pmd))
- clear_hyp_pud_entry(pud);
-}
-
-static void unmap_hyp_puds(p4d_t *p4d, phys_addr_t addr, phys_addr_t end)
-{
- phys_addr_t next;
- pud_t *pud, *start_pud;
-
- start_pud = pud = pud_offset(p4d, addr);
- do {
- next = pud_addr_end(addr, end);
- /* Hyp doesn't use huge puds */
- if (!pud_none(*pud))
- unmap_hyp_pmds(pud, addr, next);
- } while (pud++, addr = next, addr != end);
-
- if (hyp_pud_table_empty(start_pud))
- clear_hyp_p4d_entry(p4d);
-}
-
-static void unmap_hyp_p4ds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
-{
- phys_addr_t next;
- p4d_t *p4d, *start_p4d;
-
- start_p4d = p4d = p4d_offset(pgd, addr);
- do {
- next = p4d_addr_end(addr, end);
- /* Hyp doesn't use huge p4ds */
- if (!p4d_none(*p4d))
- unmap_hyp_puds(p4d, addr, next);
- } while (p4d++, addr = next, addr != end);
-
- if (hyp_p4d_table_empty(start_p4d))
- clear_hyp_pgd_entry(pgd);
-}
-
-static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
-{
- return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
-}
-
-static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
- phys_addr_t start, u64 size)
-{
- pgd_t *pgd;
- phys_addr_t addr = start, end = start + size;
- phys_addr_t next;
-
- /*
- * We don't unmap anything from HYP, except at the hyp tear down.
- * Hence, we don't have to invalidate the TLBs here.
- */
- pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
- do {
- next = pgd_addr_end(addr, end);
- if (!pgd_none(*pgd))
- unmap_hyp_p4ds(pgd, addr, next);
- } while (pgd++, addr = next, addr != end);
-}
-
-static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
-{
- __unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
-}
-
-static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
-{
- __unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
-}
-
/**
* free_hyp_pgds - free Hyp-mode page tables
- *
- * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
- * therefore contains either mappings in the kernel memory area (above
- * PAGE_OFFSET), or device mappings in the idmap range.
- *
- * boot_hyp_pgd should only map the idmap range, and is only used in
- * the extended idmap case.
*/
void free_hyp_pgds(void)
{
- pgd_t *id_pgd;
-
mutex_lock(&kvm_hyp_pgd_mutex);
-
- id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
-
- if (id_pgd) {
- /* In case we never called hyp_mmu_init() */
- if (!io_map_base)
- io_map_base = hyp_idmap_start;
- unmap_hyp_idmap_range(id_pgd, io_map_base,
- hyp_idmap_start + PAGE_SIZE - io_map_base);
- }
-
- if (boot_hyp_pgd) {
- free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
- boot_hyp_pgd = NULL;
- }
-
- if (hyp_pgd) {
- unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
- (uintptr_t)high_memory - PAGE_OFFSET);
-
- free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
- hyp_pgd = NULL;
- }
- if (merged_hyp_pgd) {
- clear_page(merged_hyp_pgd);
- free_page((unsigned long)merged_hyp_pgd);
- merged_hyp_pgd = NULL;
+ if (hyp_pgtable) {
+ kvm_pgtable_hyp_destroy(hyp_pgtable);
+ kfree(hyp_pgtable);
}
-
mutex_unlock(&kvm_hyp_pgd_mutex);
}
-static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
- unsigned long end, unsigned long pfn,
- pgprot_t prot)
+static int __create_hyp_mappings(unsigned long start, unsigned long size,
+ unsigned long phys, enum kvm_pgtable_prot prot)
{
- pte_t *pte;
- unsigned long addr;
-
- addr = start;
- do {
- pte = pte_offset_kernel(pmd, addr);
- kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
- get_page(virt_to_page(pte));
- pfn++;
- } while (addr += PAGE_SIZE, addr != end);
-}
-
-static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
- unsigned long end, unsigned long pfn,
- pgprot_t prot)
-{
- pmd_t *pmd;
- pte_t *pte;
- unsigned long addr, next;
-
- addr = start;
- do {
- pmd = pmd_offset(pud, addr);
-
- BUG_ON(pmd_sect(*pmd));
-
- if (pmd_none(*pmd)) {
- pte = pte_alloc_one_kernel(NULL);
- if (!pte) {
- kvm_err("Cannot allocate Hyp pte\n");
- return -ENOMEM;
- }
- kvm_pmd_populate(pmd, pte);
- get_page(virt_to_page(pmd));
- }
-
- next = pmd_addr_end(addr, end);
-
- create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
- pfn += (next - addr) >> PAGE_SHIFT;
- } while (addr = next, addr != end);
-
- return 0;
-}
-
-static int create_hyp_pud_mappings(p4d_t *p4d, unsigned long start,
- unsigned long end, unsigned long pfn,
- pgprot_t prot)
-{
- pud_t *pud;
- pmd_t *pmd;
- unsigned long addr, next;
- int ret;
-
- addr = start;
- do {
- pud = pud_offset(p4d, addr);
-
- if (pud_none_or_clear_bad(pud)) {
- pmd = pmd_alloc_one(NULL, addr);
- if (!pmd) {
- kvm_err("Cannot allocate Hyp pmd\n");
- return -ENOMEM;
- }
- kvm_pud_populate(pud, pmd);
- get_page(virt_to_page(pud));
- }
-
- next = pud_addr_end(addr, end);
- ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
- if (ret)
- return ret;
- pfn += (next - addr) >> PAGE_SHIFT;
- } while (addr = next, addr != end);
-
- return 0;
-}
-
-static int create_hyp_p4d_mappings(pgd_t *pgd, unsigned long start,
- unsigned long end, unsigned long pfn,
- pgprot_t prot)
-{
- p4d_t *p4d;
- pud_t *pud;
- unsigned long addr, next;
- int ret;
-
- addr = start;
- do {
- p4d = p4d_offset(pgd, addr);
-
- if (p4d_none(*p4d)) {
- pud = pud_alloc_one(NULL, addr);
- if (!pud) {
- kvm_err("Cannot allocate Hyp pud\n");
- return -ENOMEM;
- }
- kvm_p4d_populate(p4d, pud);
- get_page(virt_to_page(p4d));
- }
-
- next = p4d_addr_end(addr, end);
- ret = create_hyp_pud_mappings(p4d, addr, next, pfn, prot);
- if (ret)
- return ret;
- pfn += (next - addr) >> PAGE_SHIFT;
- } while (addr = next, addr != end);
-
- return 0;
-}
-
-static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
- unsigned long start, unsigned long end,
- unsigned long pfn, pgprot_t prot)
-{
- pgd_t *pgd;
- p4d_t *p4d;
- unsigned long addr, next;
- int err = 0;
+ int err;
mutex_lock(&kvm_hyp_pgd_mutex);
- addr = start & PAGE_MASK;
- end = PAGE_ALIGN(end);
- do {
- pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
-
- if (pgd_none(*pgd)) {
- p4d = p4d_alloc_one(NULL, addr);
- if (!p4d) {
- kvm_err("Cannot allocate Hyp p4d\n");
- err = -ENOMEM;
- goto out;
- }
- kvm_pgd_populate(pgd, p4d);
- get_page(virt_to_page(pgd));
- }
-
- next = pgd_addr_end(addr, end);
- err = create_hyp_p4d_mappings(pgd, addr, next, pfn, prot);
- if (err)
- goto out;
- pfn += (next - addr) >> PAGE_SHIFT;
- } while (addr = next, addr != end);
-out:
+ err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
mutex_unlock(&kvm_hyp_pgd_mutex);
+
return err;
}
@@ -845,7 +220,7 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
* in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
* physical pages.
*/
-int create_hyp_mappings(void *from, void *to, pgprot_t prot)
+int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
{
phys_addr_t phys_addr;
unsigned long virt_addr;
@@ -862,9 +237,7 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot)
int err;
phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
- err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
- virt_addr, virt_addr + PAGE_SIZE,
- __phys_to_pfn(phys_addr),
+ err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
prot);
if (err)
return err;
@@ -874,9 +247,9 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot)
}
static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
- unsigned long *haddr, pgprot_t prot)
+ unsigned long *haddr,
+ enum kvm_pgtable_prot prot)
{
- pgd_t *pgd = hyp_pgd;
unsigned long base;
int ret = 0;
@@ -908,17 +281,11 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
if (ret)
goto out;
- if (__kvm_cpu_uses_extended_idmap())
- pgd = boot_hyp_pgd;
-
- ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
- base, base + size,
- __phys_to_pfn(phys_addr), prot);
+ ret = __create_hyp_mappings(base, size, phys_addr, prot);
if (ret)
goto out;
*haddr = base + offset_in_page(phys_addr);
-
out:
return ret;
}
@@ -989,47 +356,48 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
* @kvm: The pointer to the KVM structure
* @mmu: The pointer to the s2 MMU structure
*
- * Allocates only the stage-2 HW PGD level table(s) of size defined by
- * stage2_pgd_size(mmu->kvm).
- *
+ * Allocates only the stage-2 HW PGD level table(s).
* Note we don't need locking here as this is only called when the VM is
* created, which can only be done once.
*/
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
{
- phys_addr_t pgd_phys;
- pgd_t *pgd;
- int cpu;
+ int cpu, err;
+ struct kvm_pgtable *pgt;
- if (mmu->pgd != NULL) {
+ if (mmu->pgt != NULL) {
kvm_err("kvm_arch already initialized?\n");
return -EINVAL;
}
- /* Allocate the HW PGD, making sure that each page gets its own refcount */
- pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
- if (!pgd)
+ pgt = kzalloc(sizeof(*pgt), GFP_KERNEL);
+ if (!pgt)
return -ENOMEM;
- pgd_phys = virt_to_phys(pgd);
- if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
- return -EINVAL;
+ err = kvm_pgtable_stage2_init(pgt, kvm);
+ if (err)
+ goto out_free_pgtable;
mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
if (!mmu->last_vcpu_ran) {
- free_pages_exact(pgd, stage2_pgd_size(kvm));
- return -ENOMEM;
+ err = -ENOMEM;
+ goto out_destroy_pgtable;
}
for_each_possible_cpu(cpu)
*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
mmu->kvm = kvm;
- mmu->pgd = pgd;
- mmu->pgd_phys = pgd_phys;
+ mmu->pgt = pgt;
+ mmu->pgd_phys = __pa(pgt->pgd);
mmu->vmid.vmid_gen = 0;
-
return 0;
+
+out_destroy_pgtable:
+ kvm_pgtable_stage2_destroy(pgt);
+out_free_pgtable:
+ kfree(pgt);
+ return err;
}
static void stage2_unmap_memslot(struct kvm *kvm,
@@ -1102,363 +470,21 @@ void stage2_unmap_vm(struct kvm *kvm)
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
{
struct kvm *kvm = mmu->kvm;
- void *pgd = NULL;
+ struct kvm_pgtable *pgt = NULL;
spin_lock(&kvm->mmu_lock);
- if (mmu->pgd) {
- unmap_stage2_range(mmu, 0, kvm_phys_size(kvm));
- pgd = READ_ONCE(mmu->pgd);
- mmu->pgd = NULL;
- }
- spin_unlock(&kvm->mmu_lock);
-
- /* Free the HW pgd, one page at a time */
- if (pgd) {
- free_pages_exact(pgd, stage2_pgd_size(kvm));
+ pgt = mmu->pgt;
+ if (pgt) {
+ mmu->pgd_phys = 0;
+ mmu->pgt = NULL;
free_percpu(mmu->last_vcpu_ran);
}
-}
-
-static p4d_t *stage2_get_p4d(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- pgd_t *pgd;
- p4d_t *p4d;
-
- pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
- if (stage2_pgd_none(kvm, *pgd)) {
- if (!cache)
- return NULL;
- p4d = kvm_mmu_memory_cache_alloc(cache);
- stage2_pgd_populate(kvm, pgd, p4d);
- get_page(virt_to_page(pgd));
- }
-
- return stage2_p4d_offset(kvm, pgd, addr);
-}
-
-static pud_t *stage2_get_pud(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- p4d_t *p4d;
- pud_t *pud;
-
- p4d = stage2_get_p4d(mmu, cache, addr);
- if (stage2_p4d_none(kvm, *p4d)) {
- if (!cache)
- return NULL;
- pud = kvm_mmu_memory_cache_alloc(cache);
- stage2_p4d_populate(kvm, p4d, pud);
- get_page(virt_to_page(p4d));
- }
-
- return stage2_pud_offset(kvm, p4d, addr);
-}
-
-static pmd_t *stage2_get_pmd(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pud;
- pmd_t *pmd;
-
- pud = stage2_get_pud(mmu, cache, addr);
- if (!pud || stage2_pud_huge(kvm, *pud))
- return NULL;
-
- if (stage2_pud_none(kvm, *pud)) {
- if (!cache)
- return NULL;
- pmd = kvm_mmu_memory_cache_alloc(cache);
- stage2_pud_populate(kvm, pud, pmd);
- get_page(virt_to_page(pud));
- }
-
- return stage2_pmd_offset(kvm, pud, addr);
-}
-
-static int stage2_set_pmd_huge(struct kvm_s2_mmu *mmu,
- struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr, const pmd_t *new_pmd)
-{
- pmd_t *pmd, old_pmd;
-
-retry:
- pmd = stage2_get_pmd(mmu, cache, addr);
- VM_BUG_ON(!pmd);
-
- old_pmd = *pmd;
- /*
- * Multiple vcpus faulting on the same PMD entry, can
- * lead to them sequentially updating the PMD with the
- * same value. Following the break-before-make
- * (pmd_clear() followed by tlb_flush()) process can
- * hinder forward progress due to refaults generated
- * on missing translations.
- *
- * Skip updating the page table if the entry is
- * unchanged.
- */
- if (pmd_val(old_pmd) == pmd_val(*new_pmd))
- return 0;
-
- if (pmd_present(old_pmd)) {
- /*
- * If we already have PTE level mapping for this block,
- * we must unmap it to avoid inconsistent TLB state and
- * leaking the table page. We could end up in this situation
- * if the memory slot was marked for dirty logging and was
- * reverted, leaving PTE level mappings for the pages accessed
- * during the period. So, unmap the PTE level mapping for this
- * block and retry, as we could have released the upper level
- * table in the process.
- *
- * Normal THP split/merge follows mmu_notifier callbacks and do
- * get handled accordingly.
- */
- if (!pmd_thp_or_huge(old_pmd)) {
- unmap_stage2_range(mmu, addr & S2_PMD_MASK, S2_PMD_SIZE);
- goto retry;
- }
- /*
- * Mapping in huge pages should only happen through a
- * fault. If a page is merged into a transparent huge
- * page, the individual subpages of that huge page
- * should be unmapped through MMU notifiers before we
- * get here.
- *
- * Merging of CompoundPages is not supported; they
- * should become splitting first, unmapped, merged,
- * and mapped back in on-demand.
- */
- WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
- pmd_clear(pmd);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
- } else {
- get_page(virt_to_page(pmd));
- }
-
- kvm_set_pmd(pmd, *new_pmd);
- return 0;
-}
-
-static int stage2_set_pud_huge(struct kvm_s2_mmu *mmu,
- struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr, const pud_t *new_pudp)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pudp, old_pud;
-
-retry:
- pudp = stage2_get_pud(mmu, cache, addr);
- VM_BUG_ON(!pudp);
-
- old_pud = *pudp;
-
- /*
- * A large number of vcpus faulting on the same stage 2 entry,
- * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
- * Skip updating the page tables if there is no change.
- */
- if (pud_val(old_pud) == pud_val(*new_pudp))
- return 0;
-
- if (stage2_pud_present(kvm, old_pud)) {
- /*
- * If we already have table level mapping for this block, unmap
- * the range for this block and retry.
- */
- if (!stage2_pud_huge(kvm, old_pud)) {
- unmap_stage2_range(mmu, addr & S2_PUD_MASK, S2_PUD_SIZE);
- goto retry;
- }
-
- WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
- stage2_pud_clear(kvm, pudp);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
- } else {
- get_page(virt_to_page(pudp));
- }
-
- kvm_set_pud(pudp, *new_pudp);
- return 0;
-}
-
-/*
- * stage2_get_leaf_entry - walk the stage2 VM page tables and return
- * true if a valid and present leaf-entry is found. A pointer to the
- * leaf-entry is returned in the appropriate level variable - pudpp,
- * pmdpp, ptepp.
- */
-static bool stage2_get_leaf_entry(struct kvm_s2_mmu *mmu, phys_addr_t addr,
- pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pudp;
- pmd_t *pmdp;
- pte_t *ptep;
-
- *pudpp = NULL;
- *pmdpp = NULL;
- *ptepp = NULL;
-
- pudp = stage2_get_pud(mmu, NULL, addr);
- if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
- return false;
-
- if (stage2_pud_huge(kvm, *pudp)) {
- *pudpp = pudp;
- return true;
- }
-
- pmdp = stage2_pmd_offset(kvm, pudp, addr);
- if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
- return false;
-
- if (pmd_thp_or_huge(*pmdp)) {
- *pmdpp = pmdp;
- return true;
- }
-
- ptep = pte_offset_kernel(pmdp, addr);
- if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
- return false;
-
- *ptepp = ptep;
- return true;
-}
-
-static bool stage2_is_exec(struct kvm_s2_mmu *mmu, phys_addr_t addr, unsigned long sz)
-{
- pud_t *pudp;
- pmd_t *pmdp;
- pte_t *ptep;
- bool found;
-
- found = stage2_get_leaf_entry(mmu, addr, &pudp, &pmdp, &ptep);
- if (!found)
- return false;
-
- if (pudp)
- return sz <= PUD_SIZE && kvm_s2pud_exec(pudp);
- else if (pmdp)
- return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp);
- else
- return sz == PAGE_SIZE && kvm_s2pte_exec(ptep);
-}
-
-static int stage2_set_pte(struct kvm_s2_mmu *mmu,
- struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr, const pte_t *new_pte,
- unsigned long flags)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte, old_pte;
- bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
- bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
-
- VM_BUG_ON(logging_active && !cache);
-
- /* Create stage-2 page table mapping - Levels 0 and 1 */
- pud = stage2_get_pud(mmu, cache, addr);
- if (!pud) {
- /*
- * Ignore calls from kvm_set_spte_hva for unallocated
- * address ranges.
- */
- return 0;
- }
-
- /*
- * While dirty page logging - dissolve huge PUD, then continue
- * on to allocate page.
- */
- if (logging_active)
- stage2_dissolve_pud(mmu, addr, pud);
-
- if (stage2_pud_none(kvm, *pud)) {
- if (!cache)
- return 0; /* ignore calls from kvm_set_spte_hva */
- pmd = kvm_mmu_memory_cache_alloc(cache);
- stage2_pud_populate(kvm, pud, pmd);
- get_page(virt_to_page(pud));
- }
-
- pmd = stage2_pmd_offset(kvm, pud, addr);
- if (!pmd) {
- /*
- * Ignore calls from kvm_set_spte_hva for unallocated
- * address ranges.
- */
- return 0;
- }
-
- /*
- * While dirty page logging - dissolve huge PMD, then continue on to
- * allocate page.
- */
- if (logging_active)
- stage2_dissolve_pmd(mmu, addr, pmd);
-
- /* Create stage-2 page mappings - Level 2 */
- if (pmd_none(*pmd)) {
- if (!cache)
- return 0; /* ignore calls from kvm_set_spte_hva */
- pte = kvm_mmu_memory_cache_alloc(cache);
- kvm_pmd_populate(pmd, pte);
- get_page(virt_to_page(pmd));
- }
-
- pte = pte_offset_kernel(pmd, addr);
-
- if (iomap && pte_present(*pte))
- return -EFAULT;
-
- /* Create 2nd stage page table mapping - Level 3 */
- old_pte = *pte;
- if (pte_present(old_pte)) {
- /* Skip page table update if there is no change */
- if (pte_val(old_pte) == pte_val(*new_pte))
- return 0;
+ spin_unlock(&kvm->mmu_lock);
- kvm_set_pte(pte, __pte(0));
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
- } else {
- get_page(virt_to_page(pte));
+ if (pgt) {
+ kvm_pgtable_stage2_destroy(pgt);
+ kfree(pgt);
}
-
- kvm_set_pte(pte, *new_pte);
- return 0;
-}
-
-#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-static int stage2_ptep_test_and_clear_young(pte_t *pte)
-{
- if (pte_young(*pte)) {
- *pte = pte_mkold(*pte);
- return 1;
- }
- return 0;
-}
-#else
-static int stage2_ptep_test_and_clear_young(pte_t *pte)
-{
- return __ptep_test_and_clear_young(pte);
-}
-#endif
-
-static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
-{
- return stage2_ptep_test_and_clear_young((pte_t *)pmd);
-}
-
-static int stage2_pudp_test_and_clear_young(pud_t *pud)
-{
- return stage2_ptep_test_and_clear_young((pte_t *)pud);
}
/**
@@ -1468,169 +494,52 @@ static int stage2_pudp_test_and_clear_young(pud_t *pud)
* @guest_ipa: The IPA at which to insert the mapping
* @pa: The physical address of the device
* @size: The size of the mapping
+ * @writable: Whether or not to create a writable mapping
*/
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
phys_addr_t pa, unsigned long size, bool writable)
{
- phys_addr_t addr, end;
+ phys_addr_t addr;
int ret = 0;
- unsigned long pfn;
struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
+ struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
+ enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
+ KVM_PGTABLE_PROT_R |
+ (writable ? KVM_PGTABLE_PROT_W : 0);
- end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
- pfn = __phys_to_pfn(pa);
-
- for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
- pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);
-
- if (writable)
- pte = kvm_s2pte_mkwrite(pte);
+ size += offset_in_page(guest_ipa);
+ guest_ipa &= PAGE_MASK;
+ for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
ret = kvm_mmu_topup_memory_cache(&cache,
kvm_mmu_cache_min_pages(kvm));
if (ret)
- goto out;
+ break;
+
spin_lock(&kvm->mmu_lock);
- ret = stage2_set_pte(&kvm->arch.mmu, &cache, addr, &pte,
- KVM_S2PTE_FLAG_IS_IOMAP);
+ ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
+ &cache);
spin_unlock(&kvm->mmu_lock);
if (ret)
- goto out;
+ break;
- pfn++;
+ pa += PAGE_SIZE;
}
-out:
kvm_mmu_free_memory_cache(&cache);
return ret;
}
/**
- * stage2_wp_ptes - write protect PMD range
- * @pmd: pointer to pmd entry
- * @addr: range start address
- * @end: range end address
- */
-static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
-{
- pte_t *pte;
-
- pte = pte_offset_kernel(pmd, addr);
- do {
- if (!pte_none(*pte)) {
- if (!kvm_s2pte_readonly(pte))
- kvm_set_s2pte_readonly(pte);
- }
- } while (pte++, addr += PAGE_SIZE, addr != end);
-}
-
-/**
- * stage2_wp_pmds - write protect PUD range
- * kvm: kvm instance for the VM
- * @pud: pointer to pud entry
- * @addr: range start address
- * @end: range end address
- */
-static void stage2_wp_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- pmd_t *pmd;
- phys_addr_t next;
-
- pmd = stage2_pmd_offset(kvm, pud, addr);
-
- do {
- next = stage2_pmd_addr_end(kvm, addr, end);
- if (!pmd_none(*pmd)) {
- if (pmd_thp_or_huge(*pmd)) {
- if (!kvm_s2pmd_readonly(pmd))
- kvm_set_s2pmd_readonly(pmd);
- } else {
- stage2_wp_ptes(pmd, addr, next);
- }
- }
- } while (pmd++, addr = next, addr != end);
-}
-
-/**
- * stage2_wp_puds - write protect P4D range
- * @p4d: pointer to p4d entry
- * @addr: range start address
- * @end: range end address
- */
-static void stage2_wp_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pud;
- phys_addr_t next;
-
- pud = stage2_pud_offset(kvm, p4d, addr);
- do {
- next = stage2_pud_addr_end(kvm, addr, end);
- if (!stage2_pud_none(kvm, *pud)) {
- if (stage2_pud_huge(kvm, *pud)) {
- if (!kvm_s2pud_readonly(pud))
- kvm_set_s2pud_readonly(pud);
- } else {
- stage2_wp_pmds(mmu, pud, addr, next);
- }
- }
- } while (pud++, addr = next, addr != end);
-}
-
-/**
- * stage2_wp_p4ds - write protect PGD range
- * @pgd: pointer to pgd entry
- * @addr: range start address
- * @end: range end address
- */
-static void stage2_wp_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- p4d_t *p4d;
- phys_addr_t next;
-
- p4d = stage2_p4d_offset(kvm, pgd, addr);
- do {
- next = stage2_p4d_addr_end(kvm, addr, end);
- if (!stage2_p4d_none(kvm, *p4d))
- stage2_wp_puds(mmu, p4d, addr, next);
- } while (p4d++, addr = next, addr != end);
-}
-
-/**
* stage2_wp_range() - write protect stage2 memory region range
- * @kvm: The KVM pointer
+ * @mmu: The KVM stage-2 MMU pointer
* @addr: Start address of range
* @end: End address of range
*/
static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
struct kvm *kvm = mmu->kvm;
- pgd_t *pgd;
- phys_addr_t next;
-
- pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
- do {
- /*
- * Release kvm_mmu_lock periodically if the memory region is
- * large. Otherwise, we may see kernel panics with
- * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
- * CONFIG_LOCKDEP. Additionally, holding the lock too long
- * will also starve other vCPUs. We have to also make sure
- * that the page tables are not freed while we released
- * the lock.
- */
- cond_resched_lock(&kvm->mmu_lock);
- if (!READ_ONCE(mmu->pgd))
- break;
- next = stage2_pgd_addr_end(kvm, addr, end);
- if (stage2_pgd_present(kvm, *pgd))
- stage2_wp_p4ds(mmu, pgd, addr, next);
- } while (pgd++, addr = next, addr != end);
+ stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
}
/**
@@ -1835,18 +744,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
{
int ret;
bool write_fault, writable, force_pte = false;
- bool exec_fault, needs_exec;
+ bool exec_fault;
+ bool device = false;
unsigned long mmu_seq;
- gfn_t gfn = fault_ipa >> PAGE_SHIFT;
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
struct vm_area_struct *vma;
short vma_shift;
+ gfn_t gfn;
kvm_pfn_t pfn;
- pgprot_t mem_type = PAGE_S2;
bool logging_active = memslot_is_logging(memslot);
- unsigned long vma_pagesize, flags = 0;
- struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;
+ unsigned long vma_pagesize;
+ enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
+ struct kvm_pgtable *pgt;
write_fault = kvm_is_write_fault(vcpu);
exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
@@ -1871,30 +781,41 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
else
vma_shift = PAGE_SHIFT;
- vma_pagesize = 1ULL << vma_shift;
if (logging_active ||
- (vma->vm_flags & VM_PFNMAP) ||
- !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
+ (vma->vm_flags & VM_PFNMAP)) {
force_pte = true;
- vma_pagesize = PAGE_SIZE;
+ vma_shift = PAGE_SHIFT;
}
- /*
- * The stage2 has a minimum of 2 level table (For arm64 see
- * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
- * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
- * As for PUD huge maps, we must make sure that we have at least
- * 3 levels, i.e, PMD is not folded.
- */
- if (vma_pagesize == PMD_SIZE ||
- (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
- gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
+ if (vma_shift == PUD_SHIFT &&
+ !fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
+ vma_shift = PMD_SHIFT;
+
+ if (vma_shift == PMD_SHIFT &&
+ !fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
+ force_pte = true;
+ vma_shift = PAGE_SHIFT;
+ }
+
+ vma_pagesize = 1UL << vma_shift;
+ if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
+ fault_ipa &= ~(vma_pagesize - 1);
+
+ gfn = fault_ipa >> PAGE_SHIFT;
mmap_read_unlock(current->mm);
- /* We need minimum second+third level pages */
- ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm));
- if (ret)
- return ret;
+ /*
+ * Permission faults just need to update the existing leaf entry,
+ * and so normally don't require allocations from the memcache. The
+ * only exception to this is when dirty logging is enabled at runtime
+ * and a write fault needs to collapse a block entry into a table.
+ */
+ if (fault_status != FSC_PERM || (logging_active && write_fault)) {
+ ret = kvm_mmu_topup_memory_cache(memcache,
+ kvm_mmu_cache_min_pages(kvm));
+ if (ret)
+ return ret;
+ }
mmu_seq = vcpu->kvm->mmu_notifier_seq;
/*
@@ -1917,28 +838,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
if (kvm_is_device_pfn(pfn)) {
- mem_type = PAGE_S2_DEVICE;
- flags |= KVM_S2PTE_FLAG_IS_IOMAP;
- } else if (logging_active) {
- /*
- * Faults on pages in a memslot with logging enabled
- * should not be mapped with huge pages (it introduces churn
- * and performance degradation), so force a pte mapping.
- */
- flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
-
+ device = true;
+ } else if (logging_active && !write_fault) {
/*
* Only actually map the page as writable if this was a write
* fault.
*/
- if (!write_fault)
- writable = false;
+ writable = false;
}
- if (exec_fault && is_iomap(flags))
+ if (exec_fault && device)
return -ENOEXEC;
spin_lock(&kvm->mmu_lock);
+ pgt = vcpu->arch.hw_mmu->pgt;
if (mmu_notifier_retry(kvm, mmu_seq))
goto out_unlock;
@@ -1949,62 +862,31 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (vma_pagesize == PAGE_SIZE && !force_pte)
vma_pagesize = transparent_hugepage_adjust(memslot, hva,
&pfn, &fault_ipa);
- if (writable)
+ if (writable) {
+ prot |= KVM_PGTABLE_PROT_W;
kvm_set_pfn_dirty(pfn);
+ mark_page_dirty(kvm, gfn);
+ }
- if (fault_status != FSC_PERM && !is_iomap(flags))
+ if (fault_status != FSC_PERM && !device)
clean_dcache_guest_page(pfn, vma_pagesize);
- if (exec_fault)
+ if (exec_fault) {
+ prot |= KVM_PGTABLE_PROT_X;
invalidate_icache_guest_page(pfn, vma_pagesize);
+ }
- /*
- * If we took an execution fault we have made the
- * icache/dcache coherent above and should now let the s2
- * mapping be executable.
- *
- * Write faults (!exec_fault && FSC_PERM) are orthogonal to
- * execute permissions, and we preserve whatever we have.
- */
- needs_exec = exec_fault ||
- (fault_status == FSC_PERM &&
- stage2_is_exec(mmu, fault_ipa, vma_pagesize));
-
- if (vma_pagesize == PUD_SIZE) {
- pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
-
- new_pud = kvm_pud_mkhuge(new_pud);
- if (writable)
- new_pud = kvm_s2pud_mkwrite(new_pud);
-
- if (needs_exec)
- new_pud = kvm_s2pud_mkexec(new_pud);
-
- ret = stage2_set_pud_huge(mmu, memcache, fault_ipa, &new_pud);
- } else if (vma_pagesize == PMD_SIZE) {
- pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
-
- new_pmd = kvm_pmd_mkhuge(new_pmd);
-
- if (writable)
- new_pmd = kvm_s2pmd_mkwrite(new_pmd);
-
- if (needs_exec)
- new_pmd = kvm_s2pmd_mkexec(new_pmd);
+ if (device)
+ prot |= KVM_PGTABLE_PROT_DEVICE;
+ else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
+ prot |= KVM_PGTABLE_PROT_X;
- ret = stage2_set_pmd_huge(mmu, memcache, fault_ipa, &new_pmd);
+ if (fault_status == FSC_PERM && !(logging_active && writable)) {
+ ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
} else {
- pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
-
- if (writable) {
- new_pte = kvm_s2pte_mkwrite(new_pte);
- mark_page_dirty(kvm, gfn);
- }
-
- if (needs_exec)
- new_pte = kvm_s2pte_mkexec(new_pte);
-
- ret = stage2_set_pte(mmu, memcache, fault_ipa, &new_pte, flags);
+ ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
+ __pfn_to_phys(pfn), prot,
+ memcache);
}
out_unlock:
@@ -2014,46 +896,23 @@ out_unlock:
return ret;
}
-/*
- * Resolve the access fault by making the page young again.
- * Note that because the faulting entry is guaranteed not to be
- * cached in the TLB, we don't need to invalidate anything.
- * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
- * so there is no need for atomic (pte|pmd)_mkyoung operations.
- */
+/* Resolve the access fault by making the page young again. */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- kvm_pfn_t pfn;
- bool pfn_valid = false;
+ pte_t pte;
+ kvm_pte_t kpte;
+ struct kvm_s2_mmu *mmu;
trace_kvm_access_fault(fault_ipa);
spin_lock(&vcpu->kvm->mmu_lock);
-
- if (!stage2_get_leaf_entry(vcpu->arch.hw_mmu, fault_ipa, &pud, &pmd, &pte))
- goto out;
-
- if (pud) { /* HugeTLB */
- *pud = kvm_s2pud_mkyoung(*pud);
- pfn = kvm_pud_pfn(*pud);
- pfn_valid = true;
- } else if (pmd) { /* THP, HugeTLB */
- *pmd = pmd_mkyoung(*pmd);
- pfn = pmd_pfn(*pmd);
- pfn_valid = true;
- } else {
- *pte = pte_mkyoung(*pte); /* Just a page... */
- pfn = pte_pfn(*pte);
- pfn_valid = true;
- }
-
-out:
+ mmu = vcpu->arch.hw_mmu;
+ kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
spin_unlock(&vcpu->kvm->mmu_lock);
- if (pfn_valid)
- kvm_set_pfn_accessed(pfn);
+
+ pte = __pte(kpte);
+ if (pte_valid(pte))
+ kvm_set_pfn_accessed(pte_pfn(pte));
}
/**
@@ -2224,7 +1083,7 @@ static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *dat
int kvm_unmap_hva_range(struct kvm *kvm,
unsigned long start, unsigned long end, unsigned flags)
{
- if (!kvm->arch.mmu.pgd)
+ if (!kvm->arch.mmu.pgt)
return 0;
trace_kvm_unmap_hva_range(start, end);
@@ -2234,28 +1093,27 @@ int kvm_unmap_hva_range(struct kvm *kvm,
static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
- pte_t *pte = (pte_t *)data;
+ kvm_pfn_t *pfn = (kvm_pfn_t *)data;
WARN_ON(size != PAGE_SIZE);
+
/*
- * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
- * flag clear because MMU notifiers will have unmapped a huge PMD before
- * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
- * therefore stage2_set_pte() never needs to clear out a huge PMD
- * through this calling path.
+ * The MMU notifiers will have unmapped a huge PMD before calling
+ * ->change_pte() (which in turn calls kvm_set_spte_hva()) and
+ * therefore we never need to clear out a huge PMD through this
+ * calling path and a memcache is not required.
*/
- stage2_set_pte(&kvm->arch.mmu, NULL, gpa, pte, 0);
+ kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE,
+ __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL);
return 0;
}
-
int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
unsigned long end = hva + PAGE_SIZE;
kvm_pfn_t pfn = pte_pfn(pte);
- pte_t stage2_pte;
- if (!kvm->arch.mmu.pgd)
+ if (!kvm->arch.mmu.pgt)
return 0;
trace_kvm_set_spte_hva(hva);
@@ -2265,51 +1123,30 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
* just like a translation fault and clean the cache to the PoC.
*/
clean_dcache_guest_page(pfn, PAGE_SIZE);
- stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
- handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
-
+ handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn);
return 0;
}
static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
+ pte_t pte;
+ kvm_pte_t kpte;
WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
- if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
- return 0;
-
- if (pud)
- return stage2_pudp_test_and_clear_young(pud);
- else if (pmd)
- return stage2_pmdp_test_and_clear_young(pmd);
- else
- return stage2_ptep_test_and_clear_young(pte);
+ kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa);
+ pte = __pte(kpte);
+ return pte_valid(pte) && pte_young(pte);
}
static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
- if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
- return 0;
-
- if (pud)
- return kvm_s2pud_young(*pud);
- else if (pmd)
- return pmd_young(*pmd);
- else
- return pte_young(*pte);
+ return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa);
}
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
{
- if (!kvm->arch.mmu.pgd)
+ if (!kvm->arch.mmu.pgt)
return 0;
trace_kvm_age_hva(start, end);
return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
@@ -2317,24 +1154,16 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
{
- if (!kvm->arch.mmu.pgd)
+ if (!kvm->arch.mmu.pgt)
return 0;
trace_kvm_test_age_hva(hva);
return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
kvm_test_age_hva_handler, NULL);
}
-void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
-{
- kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
-}
-
phys_addr_t kvm_mmu_get_httbr(void)
{
- if (__kvm_cpu_uses_extended_idmap())
- return virt_to_phys(merged_hyp_pgd);
- else
- return virt_to_phys(hyp_pgd);
+ return __pa(hyp_pgtable->pgd);
}
phys_addr_t kvm_get_idmap_vector(void)
@@ -2342,15 +1171,11 @@ phys_addr_t kvm_get_idmap_vector(void)
return hyp_idmap_vector;
}
-static int kvm_map_idmap_text(pgd_t *pgd)
+static int kvm_map_idmap_text(void)
{
- int err;
-
- /* Create the idmap in the boot page tables */
- err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
- hyp_idmap_start, hyp_idmap_end,
- __phys_to_pfn(hyp_idmap_start),
- PAGE_HYP_EXEC);
+ unsigned long size = hyp_idmap_end - hyp_idmap_start;
+ int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
+ PAGE_HYP_EXEC);
if (err)
kvm_err("Failed to idmap %lx-%lx\n",
hyp_idmap_start, hyp_idmap_end);
@@ -2361,6 +1186,7 @@ static int kvm_map_idmap_text(pgd_t *pgd)
int kvm_mmu_init(void)
{
int err;
+ u32 hyp_va_bits;
hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
@@ -2374,6 +1200,8 @@ int kvm_mmu_init(void)
*/
BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
+ hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
+ kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
kvm_debug("HYP VA range: %lx:%lx\n",
kern_hyp_va(PAGE_OFFSET),
@@ -2391,43 +1219,30 @@ int kvm_mmu_init(void)
goto out;
}
- hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
- if (!hyp_pgd) {
- kvm_err("Hyp mode PGD not allocated\n");
+ hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
+ if (!hyp_pgtable) {
+ kvm_err("Hyp mode page-table not allocated\n");
err = -ENOMEM;
goto out;
}
- if (__kvm_cpu_uses_extended_idmap()) {
- boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- hyp_pgd_order);
- if (!boot_hyp_pgd) {
- kvm_err("Hyp boot PGD not allocated\n");
- err = -ENOMEM;
- goto out;
- }
-
- err = kvm_map_idmap_text(boot_hyp_pgd);
- if (err)
- goto out;
+ err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits);
+ if (err)
+ goto out_free_pgtable;
- merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
- if (!merged_hyp_pgd) {
- kvm_err("Failed to allocate extra HYP pgd\n");
- goto out;
- }
- __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
- hyp_idmap_start);
- } else {
- err = kvm_map_idmap_text(hyp_pgd);
- if (err)
- goto out;
- }
+ err = kvm_map_idmap_text();
+ if (err)
+ goto out_destroy_pgtable;
io_map_base = hyp_idmap_start;
return 0;
+
+out_destroy_pgtable:
+ kvm_pgtable_hyp_destroy(hyp_pgtable);
+out_free_pgtable:
+ kfree(hyp_pgtable);
+ hyp_pgtable = NULL;
out:
- free_hyp_pgds();
return err;
}
@@ -2531,7 +1346,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
spin_lock(&kvm->mmu_lock);
if (ret)
unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size);
- else
+ else if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
stage2_flush_memslot(kvm, memslot);
spin_unlock(&kvm->mmu_lock);
out:
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index f0d0312c0a55..ee13c5eecd3d 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -20,6 +20,21 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc);
#define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1
+static u32 kvm_pmu_event_mask(struct kvm *kvm)
+{
+ switch (kvm->arch.pmuver) {
+ case 1: /* ARMv8.0 */
+ return GENMASK(9, 0);
+ case 4: /* ARMv8.1 */
+ case 5: /* ARMv8.4 */
+ case 6: /* ARMv8.5 */
+ return GENMASK(15, 0);
+ default: /* Shouldn't be here, just for sanity */
+ WARN_ONCE(1, "Unknown PMU version %d\n", kvm->arch.pmuver);
+ return 0;
+ }
+}
+
/**
* kvm_pmu_idx_is_64bit - determine if select_idx is a 64bit counter
* @vcpu: The vcpu pointer
@@ -100,7 +115,7 @@ static bool kvm_pmu_idx_has_chain_evtype(struct kvm_vcpu *vcpu, u64 select_idx)
return false;
reg = PMEVTYPER0_EL0 + select_idx;
- eventsel = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_EVENT;
+ eventsel = __vcpu_sys_reg(vcpu, reg) & kvm_pmu_event_mask(vcpu->kvm);
return eventsel == ARMV8_PMUV3_PERFCTR_CHAIN;
}
@@ -495,7 +510,7 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val)
/* PMSWINC only applies to ... SW_INC! */
type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i);
- type &= ARMV8_PMU_EVTYPE_EVENT;
+ type &= kvm_pmu_event_mask(vcpu->kvm);
if (type != ARMV8_PMUV3_PERFCTR_SW_INCR)
continue;
@@ -578,11 +593,21 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
data = __vcpu_sys_reg(vcpu, reg);
kvm_pmu_stop_counter(vcpu, pmc);
- eventsel = data & ARMV8_PMU_EVTYPE_EVENT;
+ if (pmc->idx == ARMV8_PMU_CYCLE_IDX)
+ eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
+ else
+ eventsel = data & kvm_pmu_event_mask(vcpu->kvm);
- /* Software increment event does't need to be backed by a perf event */
- if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR &&
- pmc->idx != ARMV8_PMU_CYCLE_IDX)
+ /* Software increment event doesn't need to be backed by a perf event */
+ if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR)
+ return;
+
+ /*
+ * If we have a filter in place and that the event isn't allowed, do
+ * not install a perf event either.
+ */
+ if (vcpu->kvm->arch.pmu_filter &&
+ !test_bit(eventsel, vcpu->kvm->arch.pmu_filter))
return;
memset(&attr, 0, sizeof(struct perf_event_attr));
@@ -594,8 +619,7 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0;
attr.exclude_hv = 1; /* Don't count EL2 events */
attr.exclude_host = 1; /* Don't count host events */
- attr.config = (pmc->idx == ARMV8_PMU_CYCLE_IDX) ?
- ARMV8_PMUV3_PERFCTR_CPU_CYCLES : eventsel;
+ attr.config = eventsel;
counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
@@ -679,17 +703,95 @@ static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx)
void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
u64 select_idx)
{
- u64 reg, event_type = data & ARMV8_PMU_EVTYPE_MASK;
+ u64 reg, mask;
+
+ mask = ARMV8_PMU_EVTYPE_MASK;
+ mask &= ~ARMV8_PMU_EVTYPE_EVENT;
+ mask |= kvm_pmu_event_mask(vcpu->kvm);
reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + select_idx;
- __vcpu_sys_reg(vcpu, reg) = event_type;
+ __vcpu_sys_reg(vcpu, reg) = data & mask;
kvm_pmu_update_pmc_chained(vcpu, select_idx);
kvm_pmu_create_perf_event(vcpu, select_idx);
}
+static int kvm_pmu_probe_pmuver(void)
+{
+ struct perf_event_attr attr = { };
+ struct perf_event *event;
+ struct arm_pmu *pmu;
+ int pmuver = 0xf;
+
+ /*
+ * Create a dummy event that only counts user cycles. As we'll never
+ * leave this function with the event being live, it will never
+ * count anything. But it allows us to probe some of the PMU
+ * details. Yes, this is terrible.
+ */
+ attr.type = PERF_TYPE_RAW;
+ attr.size = sizeof(attr);
+ attr.pinned = 1;
+ attr.disabled = 0;
+ attr.exclude_user = 0;
+ attr.exclude_kernel = 1;
+ attr.exclude_hv = 1;
+ attr.exclude_host = 1;
+ attr.config = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
+ attr.sample_period = GENMASK(63, 0);
+
+ event = perf_event_create_kernel_counter(&attr, -1, current,
+ kvm_pmu_perf_overflow, &attr);
+
+ if (IS_ERR(event)) {
+ pr_err_once("kvm: pmu event creation failed %ld\n",
+ PTR_ERR(event));
+ return 0xf;
+ }
+
+ if (event->pmu) {
+ pmu = to_arm_pmu(event->pmu);
+ if (pmu->pmuver)
+ pmuver = pmu->pmuver;
+ }
+
+ perf_event_disable(event);
+ perf_event_release_kernel(event);
+
+ return pmuver;
+}
+
+u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
+{
+ unsigned long *bmap = vcpu->kvm->arch.pmu_filter;
+ u64 val, mask = 0;
+ int base, i;
+
+ if (!pmceid1) {
+ val = read_sysreg(pmceid0_el0);
+ base = 0;
+ } else {
+ val = read_sysreg(pmceid1_el0);
+ base = 32;
+ }
+
+ if (!bmap)
+ return val;
+
+ for (i = 0; i < 32; i += 8) {
+ u64 byte;
+
+ byte = bitmap_get_value8(bmap, base + i);
+ mask |= byte << i;
+ byte = bitmap_get_value8(bmap, 0x4000 + base + i);
+ mask |= byte << (32 + i);
+ }
+
+ return val & mask;
+}
+
bool kvm_arm_support_pmu_v3(void)
{
/*
@@ -735,15 +837,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
{
- if (!kvm_arm_support_pmu_v3())
- return -ENODEV;
-
- if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
- return -ENXIO;
-
- if (vcpu->arch.pmu.created)
- return -EBUSY;
-
if (irqchip_in_kernel(vcpu->kvm)) {
int ret;
@@ -796,6 +889,19 @@ static bool pmu_irq_is_valid(struct kvm *kvm, int irq)
int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
{
+ if (!kvm_arm_support_pmu_v3() ||
+ !test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
+ return -ENODEV;
+
+ if (vcpu->arch.pmu.created)
+ return -EBUSY;
+
+ if (!vcpu->kvm->arch.pmuver)
+ vcpu->kvm->arch.pmuver = kvm_pmu_probe_pmuver();
+
+ if (vcpu->kvm->arch.pmuver == 0xf)
+ return -ENODEV;
+
switch (attr->attr) {
case KVM_ARM_VCPU_PMU_V3_IRQ: {
int __user *uaddr = (int __user *)(long)attr->addr;
@@ -804,9 +910,6 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
if (!irqchip_in_kernel(vcpu->kvm))
return -EINVAL;
- if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
- return -ENODEV;
-
if (get_user(irq, uaddr))
return -EFAULT;
@@ -824,6 +927,53 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
vcpu->arch.pmu.irq_num = irq;
return 0;
}
+ case KVM_ARM_VCPU_PMU_V3_FILTER: {
+ struct kvm_pmu_event_filter __user *uaddr;
+ struct kvm_pmu_event_filter filter;
+ int nr_events;
+
+ nr_events = kvm_pmu_event_mask(vcpu->kvm) + 1;
+
+ uaddr = (struct kvm_pmu_event_filter __user *)(long)attr->addr;
+
+ if (copy_from_user(&filter, uaddr, sizeof(filter)))
+ return -EFAULT;
+
+ if (((u32)filter.base_event + filter.nevents) > nr_events ||
+ (filter.action != KVM_PMU_EVENT_ALLOW &&
+ filter.action != KVM_PMU_EVENT_DENY))
+ return -EINVAL;
+
+ mutex_lock(&vcpu->kvm->lock);
+
+ if (!vcpu->kvm->arch.pmu_filter) {
+ vcpu->kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL);
+ if (!vcpu->kvm->arch.pmu_filter) {
+ mutex_unlock(&vcpu->kvm->lock);
+ return -ENOMEM;
+ }
+
+ /*
+ * The default depends on the first applied filter.
+ * If it allows events, the default is to deny.
+ * Conversely, if the first filter denies a set of
+ * events, the default is to allow.
+ */
+ if (filter.action == KVM_PMU_EVENT_ALLOW)
+ bitmap_zero(vcpu->kvm->arch.pmu_filter, nr_events);
+ else
+ bitmap_fill(vcpu->kvm->arch.pmu_filter, nr_events);
+ }
+
+ if (filter.action == KVM_PMU_EVENT_ALLOW)
+ bitmap_set(vcpu->kvm->arch.pmu_filter, filter.base_event, filter.nevents);
+ else
+ bitmap_clear(vcpu->kvm->arch.pmu_filter, filter.base_event, filter.nevents);
+
+ mutex_unlock(&vcpu->kvm->lock);
+
+ return 0;
+ }
case KVM_ARM_VCPU_PMU_V3_INIT:
return kvm_arm_pmu_v3_init(vcpu);
}
@@ -860,6 +1010,7 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
switch (attr->attr) {
case KVM_ARM_VCPU_PMU_V3_IRQ:
case KVM_ARM_VCPU_PMU_V3_INIT:
+ case KVM_ARM_VCPU_PMU_V3_FILTER:
if (kvm_arm_support_pmu_v3() &&
test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
return 0;
diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c
index f7b52ce1557e..920ac43077ad 100644
--- a/arch/arm64/kvm/pvtime.c
+++ b/arch/arm64/kvm/pvtime.c
@@ -13,25 +13,22 @@
void kvm_update_stolen_time(struct kvm_vcpu *vcpu)
{
struct kvm *kvm = vcpu->kvm;
- u64 steal;
- __le64 steal_le;
- u64 offset;
- int idx;
u64 base = vcpu->arch.steal.base;
+ u64 last_steal = vcpu->arch.steal.last_steal;
+ u64 offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time);
+ u64 steal = 0;
+ int idx;
if (base == GPA_INVALID)
return;
- /* Let's do the local bookkeeping */
- steal = vcpu->arch.steal.steal;
- steal += current->sched_info.run_delay - vcpu->arch.steal.last_steal;
- vcpu->arch.steal.last_steal = current->sched_info.run_delay;
- vcpu->arch.steal.steal = steal;
-
- steal_le = cpu_to_le64(steal);
idx = srcu_read_lock(&kvm->srcu);
- offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time);
- kvm_put_guest(kvm, base + offset, steal_le, u64);
+ if (!kvm_get_guest(kvm, base + offset, steal)) {
+ steal = le64_to_cpu(steal);
+ vcpu->arch.steal.last_steal = READ_ONCE(current->sched_info.run_delay);
+ steal += vcpu->arch.steal.last_steal - last_steal;
+ kvm_put_guest(kvm, base + offset, cpu_to_le64(steal));
+ }
srcu_read_unlock(&kvm->srcu, idx);
}
@@ -43,7 +40,8 @@ long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu)
switch (feature) {
case ARM_SMCCC_HV_PV_TIME_FEATURES:
case ARM_SMCCC_HV_PV_TIME_ST:
- val = SMCCC_RET_SUCCESS;
+ if (vcpu->arch.steal.base != GPA_INVALID)
+ val = SMCCC_RET_SUCCESS;
break;
}
@@ -64,7 +62,6 @@ gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu)
* Start counting stolen time from the time the guest requests
* the feature enabled.
*/
- vcpu->arch.steal.steal = 0;
vcpu->arch.steal.last_steal = current->sched_info.run_delay;
idx = srcu_read_lock(&kvm->srcu);
@@ -74,7 +71,7 @@ gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu)
return base;
}
-static bool kvm_arm_pvtime_supported(void)
+bool kvm_arm_pvtime_supported(void)
{
return !!sched_info_on();
}
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index f6e8b4a75cbb..f32490229a4c 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -335,7 +335,7 @@ u32 get_kvm_ipa_limit(void)
int kvm_set_ipa_limit(void)
{
- unsigned int ipa_max, pa_max, va_max, parange, tgran_2;
+ unsigned int parange, tgran_2;
u64 mmfr0;
mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
@@ -372,39 +372,11 @@ int kvm_set_ipa_limit(void)
break;
}
- pa_max = id_aa64mmfr0_parange_to_phys_shift(parange);
-
- /* Clamp the IPA limit to the PA size supported by the kernel */
- ipa_max = (pa_max > PHYS_MASK_SHIFT) ? PHYS_MASK_SHIFT : pa_max;
- /*
- * Since our stage2 table is dependent on the stage1 page table code,
- * we must always honor the following condition:
- *
- * Number of levels in Stage1 >= Number of levels in Stage2.
- *
- * So clamp the ipa limit further down to limit the number of levels.
- * Since we can concatenate upto 16 tables at entry level, we could
- * go upto 4bits above the maximum VA addressable with the current
- * number of levels.
- */
- va_max = PGDIR_SHIFT + PAGE_SHIFT - 3;
- va_max += 4;
-
- if (va_max < ipa_max)
- ipa_max = va_max;
-
- /*
- * If the final limit is lower than the real physical address
- * limit of the CPUs, report the reason.
- */
- if (ipa_max < pa_max)
- pr_info("kvm: Limiting the IPA size due to kernel %s Address limit\n",
- (va_max < pa_max) ? "Virtual" : "Physical");
-
- WARN(ipa_max < KVM_PHYS_SHIFT,
- "KVM IPA limit (%d bit) is smaller than default size\n", ipa_max);
- kvm_ipa_limit = ipa_max;
- kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit);
+ kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange);
+ WARN(kvm_ipa_limit < KVM_PHYS_SHIFT,
+ "KVM IPA Size Limit (%d bits) is smaller than default size\n",
+ kvm_ipa_limit);
+ kvm_info("IPA Size Limit: %d bits\n", kvm_ipa_limit);
return 0;
}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 7b8a8f6169d0..3c203cb8c103 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -769,10 +769,7 @@ static bool access_pmceid(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
if (pmu_access_el0_disabled(vcpu))
return false;
- if (!(p->Op2 & 1))
- pmceid = read_sysreg(pmceid0_el0);
- else
- pmceid = read_sysreg(pmceid1_el0);
+ pmceid = kvm_pmu_get_pmceid(vcpu, (p->Op2 & 1));
p->regval = pmceid;
diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h
index 4691053c5ee4..ff0444352bba 100644
--- a/arch/arm64/kvm/trace_arm.h
+++ b/arch/arm64/kvm/trace_arm.h
@@ -23,7 +23,7 @@ TRACE_EVENT(kvm_entry,
__entry->vcpu_pc = vcpu_pc;
),
- TP_printk("PC: 0x%08lx", __entry->vcpu_pc)
+ TP_printk("PC: 0x%016lx", __entry->vcpu_pc)
);
TRACE_EVENT(kvm_exit,
@@ -42,7 +42,7 @@ TRACE_EVENT(kvm_exit,
__entry->vcpu_pc = vcpu_pc;
),
- TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx",
+ TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%016lx",
__print_symbolic(__entry->ret, kvm_arm_exception_type),
__entry->esr_ec,
__print_symbolic(__entry->esr_ec, kvm_arm_exception_class),
@@ -69,7 +69,7 @@ TRACE_EVENT(kvm_guest_fault,
__entry->ipa = ipa;
),
- TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#08lx",
+ TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#016lx",
__entry->ipa, __entry->hsr,
__entry->hxfar, __entry->vcpu_pc)
);
@@ -131,7 +131,7 @@ TRACE_EVENT(kvm_mmio_emulate,
__entry->cpsr = cpsr;
),
- TP_printk("Emulate MMIO at: 0x%08lx (instr: %08lx, cpsr: %08lx)",
+ TP_printk("Emulate MMIO at: 0x%016lx (instr: %08lx, cpsr: %08lx)",
__entry->vcpu_pc, __entry->instr, __entry->cpsr)
);
@@ -149,7 +149,7 @@ TRACE_EVENT(kvm_unmap_hva_range,
__entry->end = end;
),
- TP_printk("mmu notifier unmap range: %#08lx -- %#08lx",
+ TP_printk("mmu notifier unmap range: %#016lx -- %#016lx",
__entry->start, __entry->end)
);
@@ -165,7 +165,7 @@ TRACE_EVENT(kvm_set_spte_hva,
__entry->hva = hva;
),
- TP_printk("mmu notifier set pte hva: %#08lx", __entry->hva)
+ TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva)
);
TRACE_EVENT(kvm_age_hva,
@@ -182,7 +182,7 @@ TRACE_EVENT(kvm_age_hva,
__entry->end = end;
),
- TP_printk("mmu notifier age hva: %#08lx -- %#08lx",
+ TP_printk("mmu notifier age hva: %#016lx -- %#016lx",
__entry->start, __entry->end)
);
@@ -198,7 +198,7 @@ TRACE_EVENT(kvm_test_age_hva,
__entry->hva = hva;
),
- TP_printk("mmu notifier test age hva: %#08lx", __entry->hva)
+ TP_printk("mmu notifier test age hva: %#016lx", __entry->hva)
);
TRACE_EVENT(kvm_set_way_flush,
diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h
index 2c56d1e0f5bd..8d78acc4fba7 100644
--- a/arch/arm64/kvm/trace_handle_exit.h
+++ b/arch/arm64/kvm/trace_handle_exit.h
@@ -22,7 +22,7 @@ TRACE_EVENT(kvm_wfx_arm64,
__entry->is_wfe = is_wfe;
),
- TP_printk("guest executed wf%c at: 0x%08lx",
+ TP_printk("guest executed wf%c at: 0x%016lx",
__entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc)
);
@@ -42,7 +42,7 @@ TRACE_EVENT(kvm_hvc_arm64,
__entry->imm = imm;
),
- TP_printk("HVC at 0x%08lx (r0: 0x%08lx, imm: 0x%lx)",
+ TP_printk("HVC at 0x%016lx (r0: 0x%016lx, imm: 0x%lx)",
__entry->vcpu_pc, __entry->r0, __entry->imm)
);
@@ -135,7 +135,7 @@ TRACE_EVENT(trap_reg,
__entry->write_value = write_value;
),
- TP_printk("%s %s reg %d (0x%08llx)", __entry->fn, __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value)
+ TP_printk("%s %s reg %d (0x%016llx)", __entry->fn, __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value)
);
TRACE_EVENT(kvm_handle_sys_reg,
diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c
index b13a9e3f99dd..f38c40a76251 100644
--- a/arch/arm64/kvm/vgic/vgic-debug.c
+++ b/arch/arm64/kvm/vgic/vgic-debug.c
@@ -260,34 +260,14 @@ static int vgic_debug_show(struct seq_file *s, void *v)
return 0;
}
-static const struct seq_operations vgic_debug_seq_ops = {
+static const struct seq_operations vgic_debug_sops = {
.start = vgic_debug_start,
.next = vgic_debug_next,
.stop = vgic_debug_stop,
.show = vgic_debug_show
};
-static int debug_open(struct inode *inode, struct file *file)
-{
- int ret;
- ret = seq_open(file, &vgic_debug_seq_ops);
- if (!ret) {
- struct seq_file *seq;
- /* seq_open will have modified file->private_data */
- seq = file->private_data;
- seq->private = inode->i_private;
- }
-
- return ret;
-};
-
-static const struct file_operations vgic_debug_fops = {
- .owner = THIS_MODULE,
- .open = debug_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release
-};
+DEFINE_SEQ_ATTRIBUTE(vgic_debug);
void vgic_debug_init(struct kvm *kvm)
{
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 76e2d85789ed..9cdf39a94a63 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -662,7 +662,7 @@ void vgic_v3_load(struct kvm_vcpu *vcpu)
if (likely(cpu_if->vgic_sre))
kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr);
- kvm_call_hyp(__vgic_v3_restore_aprs, kern_hyp_va(cpu_if));
+ kvm_call_hyp(__vgic_v3_restore_aprs, cpu_if);
if (has_vhe())
__vgic_v3_activate_traps(cpu_if);
@@ -686,7 +686,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu)
vgic_v3_vmcr_sync(vcpu);
- kvm_call_hyp(__vgic_v3_save_aprs, kern_hyp_va(cpu_if));
+ kvm_call_hyp(__vgic_v3_save_aprs, cpu_if);
if (has_vhe())
__vgic_v3_deactivate_traps(cpu_if);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d39d6cf1d473..75270229a8bf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3578,6 +3578,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SMALLER_MAXPHYADDR:
r = (int) allow_smaller_maxphyaddr;
break;
+ case KVM_CAP_STEAL_TIME:
+ r = sched_info_on();
+ break;
default:
break;
}
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index 6db030439e29..98cbfe885a53 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -34,6 +34,7 @@ struct kvm_pmu {
u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx);
void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val);
u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu);
+u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1);
void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu);
void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu);
void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu);
@@ -108,6 +109,10 @@ static inline int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
{
return 0;
}
+static inline u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
+{
+ return 0;
+}
#endif
#endif
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index 15c706fb0a37..885c9ffc835c 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -49,6 +49,7 @@
#define ARM_SMCCC_OWNER_OEM 3
#define ARM_SMCCC_OWNER_STANDARD 4
#define ARM_SMCCC_OWNER_STANDARD_HYP 5
+#define ARM_SMCCC_OWNER_VENDOR_HYP 6
#define ARM_SMCCC_OWNER_TRUSTED_APP 48
#define ARM_SMCCC_OWNER_TRUSTED_APP_END 49
#define ARM_SMCCC_OWNER_TRUSTED_OS 50
@@ -227,87 +228,67 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
#define __count_args(...) \
___count_args(__VA_ARGS__, 7, 6, 5, 4, 3, 2, 1, 0)
-#define __constraint_write_0 \
- "+r" (r0), "=&r" (r1), "=&r" (r2), "=&r" (r3)
-#define __constraint_write_1 \
- "+r" (r0), "+r" (r1), "=&r" (r2), "=&r" (r3)
-#define __constraint_write_2 \
- "+r" (r0), "+r" (r1), "+r" (r2), "=&r" (r3)
-#define __constraint_write_3 \
- "+r" (r0), "+r" (r1), "+r" (r2), "+r" (r3)
-#define __constraint_write_4 __constraint_write_3
-#define __constraint_write_5 __constraint_write_4
-#define __constraint_write_6 __constraint_write_5
-#define __constraint_write_7 __constraint_write_6
-
-#define __constraint_read_0
-#define __constraint_read_1
-#define __constraint_read_2
-#define __constraint_read_3
-#define __constraint_read_4 "r" (r4)
-#define __constraint_read_5 __constraint_read_4, "r" (r5)
-#define __constraint_read_6 __constraint_read_5, "r" (r6)
-#define __constraint_read_7 __constraint_read_6, "r" (r7)
+#define __constraint_read_0 "r" (arg0)
+#define __constraint_read_1 __constraint_read_0, "r" (arg1)
+#define __constraint_read_2 __constraint_read_1, "r" (arg2)
+#define __constraint_read_3 __constraint_read_2, "r" (arg3)
+#define __constraint_read_4 __constraint_read_3, "r" (arg4)
+#define __constraint_read_5 __constraint_read_4, "r" (arg5)
+#define __constraint_read_6 __constraint_read_5, "r" (arg6)
+#define __constraint_read_7 __constraint_read_6, "r" (arg7)
#define __declare_arg_0(a0, res) \
struct arm_smccc_res *___res = res; \
- register unsigned long r0 asm("r0") = (u32)a0; \
- register unsigned long r1 asm("r1"); \
- register unsigned long r2 asm("r2"); \
- register unsigned long r3 asm("r3")
+ register unsigned long arg0 asm("r0") = (u32)a0
#define __declare_arg_1(a0, a1, res) \
typeof(a1) __a1 = a1; \
struct arm_smccc_res *___res = res; \
- register unsigned long r0 asm("r0") = (u32)a0; \
- register unsigned long r1 asm("r1") = __a1; \
- register unsigned long r2 asm("r2"); \
- register unsigned long r3 asm("r3")
+ register unsigned long arg0 asm("r0") = (u32)a0; \
+ register typeof(a1) arg1 asm("r1") = __a1
#define __declare_arg_2(a0, a1, a2, res) \
typeof(a1) __a1 = a1; \
typeof(a2) __a2 = a2; \
struct arm_smccc_res *___res = res; \
- register unsigned long r0 asm("r0") = (u32)a0; \
- register unsigned long r1 asm("r1") = __a1; \
- register unsigned long r2 asm("r2") = __a2; \
- register unsigned long r3 asm("r3")
+ register unsigned long arg0 asm("r0") = (u32)a0; \
+ register typeof(a1) arg1 asm("r1") = __a1; \
+ register typeof(a2) arg2 asm("r2") = __a2
#define __declare_arg_3(a0, a1, a2, a3, res) \
typeof(a1) __a1 = a1; \
typeof(a2) __a2 = a2; \
typeof(a3) __a3 = a3; \
struct arm_smccc_res *___res = res; \
- register unsigned long r0 asm("r0") = (u32)a0; \
- register unsigned long r1 asm("r1") = __a1; \
- register unsigned long r2 asm("r2") = __a2; \
- register unsigned long r3 asm("r3") = __a3
+ register unsigned long arg0 asm("r0") = (u32)a0; \
+ register typeof(a1) arg1 asm("r1") = __a1; \
+ register typeof(a2) arg2 asm("r2") = __a2; \
+ register typeof(a3) arg3 asm("r3") = __a3
#define __declare_arg_4(a0, a1, a2, a3, a4, res) \
typeof(a4) __a4 = a4; \
__declare_arg_3(a0, a1, a2, a3, res); \
- register unsigned long r4 asm("r4") = __a4
+ register typeof(a4) arg4 asm("r4") = __a4
#define __declare_arg_5(a0, a1, a2, a3, a4, a5, res) \
typeof(a5) __a5 = a5; \
__declare_arg_4(a0, a1, a2, a3, a4, res); \
- register unsigned long r5 asm("r5") = __a5
+ register typeof(a5) arg5 asm("r5") = __a5
#define __declare_arg_6(a0, a1, a2, a3, a4, a5, a6, res) \
typeof(a6) __a6 = a6; \
__declare_arg_5(a0, a1, a2, a3, a4, a5, res); \
- register unsigned long r6 asm("r6") = __a6
+ register typeof(a6) arg6 asm("r6") = __a6
#define __declare_arg_7(a0, a1, a2, a3, a4, a5, a6, a7, res) \
typeof(a7) __a7 = a7; \
__declare_arg_6(a0, a1, a2, a3, a4, a5, a6, res); \
- register unsigned long r7 asm("r7") = __a7
+ register typeof(a7) arg7 asm("r7") = __a7
#define ___declare_args(count, ...) __declare_arg_ ## count(__VA_ARGS__)
#define __declare_args(count, ...) ___declare_args(count, __VA_ARGS__)
#define ___constraints(count) \
- : __constraint_write_ ## count \
: __constraint_read_ ## count \
: "memory"
#define __constraints(count) ___constraints(count)
@@ -319,8 +300,13 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
*/
#define __arm_smccc_1_1(inst, ...) \
do { \
+ register unsigned long r0 asm("r0"); \
+ register unsigned long r1 asm("r1"); \
+ register unsigned long r2 asm("r2"); \
+ register unsigned long r3 asm("r3"); \
__declare_args(__count_args(__VA_ARGS__), __VA_ARGS__); \
- asm volatile(inst "\n" \
+ asm volatile(inst "\n" : \
+ "=r" (r0), "=r" (r1), "=r" (r2), "=r" (r3) \
__constraints(__count_args(__VA_ARGS__))); \
if (___res) \
*___res = (typeof(*___res)){r0, r1, r2, r3}; \
@@ -366,7 +352,7 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
#define __fail_smccc_1_1(...) \
do { \
__declare_args(__count_args(__VA_ARGS__), __VA_ARGS__); \
- asm ("" __constraints(__count_args(__VA_ARGS__))); \
+ asm ("" : __constraints(__count_args(__VA_ARGS__))); \
if (___res) \
___res->a0 = SMCCC_RET_NOT_SUPPORTED; \
} while (0)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a23076765b4c..05e3c2fb3ef7 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -749,25 +749,46 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
gpa_t gpa, unsigned long len);
-#define __kvm_put_guest(kvm, gfn, offset, value, type) \
+#define __kvm_get_guest(kvm, gfn, offset, v) \
({ \
unsigned long __addr = gfn_to_hva(kvm, gfn); \
- type __user *__uaddr = (type __user *)(__addr + offset); \
+ typeof(v) __user *__uaddr = (typeof(__uaddr))(__addr + offset); \
int __ret = -EFAULT; \
\
if (!kvm_is_error_hva(__addr)) \
- __ret = put_user(value, __uaddr); \
+ __ret = get_user(v, __uaddr); \
+ __ret; \
+})
+
+#define kvm_get_guest(kvm, gpa, v) \
+({ \
+ gpa_t __gpa = gpa; \
+ struct kvm *__kvm = kvm; \
+ \
+ __kvm_get_guest(__kvm, __gpa >> PAGE_SHIFT, \
+ offset_in_page(__gpa), v); \
+})
+
+#define __kvm_put_guest(kvm, gfn, offset, v) \
+({ \
+ unsigned long __addr = gfn_to_hva(kvm, gfn); \
+ typeof(v) __user *__uaddr = (typeof(__uaddr))(__addr + offset); \
+ int __ret = -EFAULT; \
+ \
+ if (!kvm_is_error_hva(__addr)) \
+ __ret = put_user(v, __uaddr); \
if (!__ret) \
mark_page_dirty(kvm, gfn); \
__ret; \
})
-#define kvm_put_guest(kvm, gpa, value, type) \
+#define kvm_put_guest(kvm, gpa, v) \
({ \
gpa_t __gpa = gpa; \
struct kvm *__kvm = kvm; \
+ \
__kvm_put_guest(__kvm, __gpa >> PAGE_SHIFT, \
- offset_in_page(__gpa), (value), type); \
+ offset_in_page(__gpa), v); \
})
int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index f6d86033c4fa..3d8023474f2a 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1035,6 +1035,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_LAST_CPU 184
#define KVM_CAP_SMALLER_MAXPHYADDR 185
#define KVM_CAP_S390_DIAG318 186
+#define KVM_CAP_STEAL_TIME 187
#ifdef KVM_CAP_IRQ_ROUTING