author     Marc Zyngier <maz@kernel.org>  2026-04-08 14:25:39 +0300
committer  Marc Zyngier <maz@kernel.org>  2026-04-08 14:25:39 +0300
commit     83a3980750e3cc25cb7ded90f11c157eb3f9f428 (patch)
tree       2380e33d4cc63a5cfbb8fb3c2f2ead9e60afaef1
parent     73bb0bc2f439cdc80f3d980cd353c7f4b3115466 (diff)
parent     bc20692f528b2ac8226bafe5b1db9a1f8be96dbf (diff)
download   linux-83a3980750e3cc25cb7ded90f11c157eb3f9f428.tar.xz
Merge branch kvm-arm64/pkvm-protected-guest into kvmarm-master/next
* kvm-arm64/pkvm-protected-guest: (41 commits)
  : .
  : pKVM support for protected guests, implementing the very long
  : awaited support for anonymous memory, as the elusive guestmem
  : has failed to deliver on its promises despite a multi-year
  : effort. Patches courtesy of Will Deacon. From the initial cover
  : letter:
  :
  : "[...] this patch series implements support for protected guest
  : memory with pKVM, where pages are unmapped from the host as they are
  : faulted into the guest and can be shared back from the guest using pKVM
  : hypercalls. Protected guests are created using a new machine type
  : identifier and can be booted to a shell using the kvmtool patches
  : available at [2], which finally means that we are able to test the pVM
  : logic in pKVM. Since this is an incremental step towards full isolation
  : from the host (for example, the CPU register state and DMA accesses are
  : not yet isolated), creating a pVM requires a developer Kconfig option to
  : be enabled in addition to booting with 'kvm-arm.mode=protected' and
  : results in a kernel taint."
  : .
  KVM: arm64: Don't hold 'vm_table_lock' across guest page reclaim
  KVM: arm64: Allow get_pkvm_hyp_vm() to take a reference to a dying VM
  KVM: arm64: Prevent teardown finalisation of referenced 'hyp_vm'
  drivers/virt: pkvm: Add Kconfig dependency on DMA_RESTRICTED_POOL
  KVM: arm64: Rename PKVM_PAGE_STATE_MASK
  KVM: arm64: Extend pKVM page ownership selftests to cover guest hvcs
  KVM: arm64: Extend pKVM page ownership selftests to cover forced reclaim
  KVM: arm64: Register 'selftest_vm' in the VM table
  KVM: arm64: Extend pKVM page ownership selftests to cover guest donation
  KVM: arm64: Add some initial documentation for pKVM
  KVM: arm64: Allow userspace to create protected VMs when pKVM is enabled
  KVM: arm64: Implement the MEM_UNSHARE hypercall for protected VMs
  KVM: arm64: Implement the MEM_SHARE hypercall for protected VMs
  KVM: arm64: Add hvc handler at EL2 for hypercalls from protected VMs
  KVM: arm64: Return -EFAULT from VCPU_RUN on access to a poisoned pte
  KVM: arm64: Reclaim faulting page from pKVM in spurious fault handler
  KVM: arm64: Introduce hypercall to force reclaim of a protected page
  KVM: arm64: Annotate guest donations with handle and gfn in host stage-2
  KVM: arm64: Change 'pkvm_handle_t' to u16
  KVM: arm64: Introduce host_stage2_set_owner_metadata_locked()
  ...

Signed-off-by: Marc Zyngier <maz@kernel.org>
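For reference, the userspace-visible flow described in the cover letter boils
down to passing the new machine type identifier to KVM_CREATE_VM. Below is a
minimal, illustrative VMM-side sketch, assuming the KVM_VM_TYPE_ARM_PROTECTED
definition added to include/uapi/linux/kvm.h by this series and a host booted
with 'kvm-arm.mode=protected' plus the developer Kconfig option; the helper
name and error handling are not part of this series:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Illustrative only: create a pVM, returning the VM fd or -1 on error. */
    static int create_protected_vm(void)
    {
            int kvm_fd, vm_fd;

            kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
            if (kvm_fd < 0)
                    return -1;

            /*
             * IPA size 0 selects the default; the protected-VM machine type
             * is OR'd in alongside it, as with the other arm64 VM type bits.
             */
            vm_fd = ioctl(kvm_fd, KVM_CREATE_VM,
                          KVM_VM_TYPE_ARM_IPA_SIZE(0) | KVM_VM_TYPE_ARM_PROTECTED);
            if (vm_fd < 0)
                    perror("KVM_CREATE_VM (protected)");

            return vm_fd;
    }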
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt        4
-rw-r--r--  Documentation/virt/kvm/arm/index.rst                   1
-rw-r--r--  Documentation/virt/kvm/arm/pkvm.rst                  106
-rw-r--r--  arch/arm64/include/asm/kvm_asm.h                      31
-rw-r--r--  arch/arm64/include/asm/kvm_host.h                      9
-rw-r--r--  arch/arm64/include/asm/kvm_pgtable.h                  45
-rw-r--r--  arch/arm64/include/asm/kvm_pkvm.h                      4
-rw-r--r--  arch/arm64/include/asm/virt.h                          9
-rw-r--r--  arch/arm64/kvm/arm.c                                  12
-rw-r--r--  arch/arm64/kvm/hyp/include/nvhe/mem_protect.h         10
-rw-r--r--  arch/arm64/kvm/hyp/include/nvhe/memory.h              12
-rw-r--r--  arch/arm64/kvm/hyp/include/nvhe/pkvm.h                 7
-rw-r--r--  arch/arm64/kvm/hyp/include/nvhe/trap_handler.h         2
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/hyp-main.c                   184
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/mem_protect.c                587
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/pkvm.c                       239
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/switch.c                       1
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/sys_regs.c                     8
-rw-r--r--  arch/arm64/kvm/hyp/pgtable.c                          33
-rw-r--r--  arch/arm64/kvm/mmu.c                                 113
-rw-r--r--  arch/arm64/kvm/pkvm.c                                159
-rw-r--r--  arch/arm64/mm/fault.c                                 33
-rw-r--r--  drivers/virt/coco/pkvm-guest/Kconfig                   2
-rw-r--r--  include/uapi/linux/kvm.h                               5
24 files changed, 1384 insertions, 232 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 03a550630644..44854a67bc63 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3247,8 +3247,8 @@ Kernel parameters
for the host. To force nVHE on VHE hardware, add
"arm64_sw.hvhe=0 id_aa64mmfr1.vh=0" to the
command-line.
- "nested" is experimental and should be used with
- extreme caution.
+ "nested" and "protected" are experimental and should be
+ used with extreme caution.
kvm-arm.vgic_v3_group0_trap=
[KVM,ARM,EARLY] Trap guest accesses to GICv3 group-0
diff --git a/Documentation/virt/kvm/arm/index.rst b/Documentation/virt/kvm/arm/index.rst
index ec09881de4cf..0856b4942e05 100644
--- a/Documentation/virt/kvm/arm/index.rst
+++ b/Documentation/virt/kvm/arm/index.rst
@@ -10,6 +10,7 @@ ARM
fw-pseudo-registers
hyp-abi
hypercalls
+ pkvm
pvtime
ptp_kvm
vcpu-features
diff --git a/Documentation/virt/kvm/arm/pkvm.rst b/Documentation/virt/kvm/arm/pkvm.rst
new file mode 100644
index 000000000000..514992a79a83
--- /dev/null
+++ b/Documentation/virt/kvm/arm/pkvm.rst
@@ -0,0 +1,106 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====================
+Protected KVM (pKVM)
+====================
+
+**NOTE**: pKVM is currently an experimental, development feature and
+subject to breaking changes as new isolation features are implemented.
+Please reach out to the developers at kvmarm@lists.linux.dev if you have
+any questions.
+
+Overview
+========
+
+Booting a host kernel with '``kvm-arm.mode=protected``' enables
+"Protected KVM" (pKVM). During boot, pKVM installs a stage-2 identity
+map page-table for the host and uses it to isolate the hypervisor
+running at EL2 from the rest of the host running at EL1/0.
+
+pKVM permits creation of protected virtual machines (pVMs) by passing
+the ``KVM_VM_TYPE_ARM_PROTECTED`` machine type identifier to the
+``KVM_CREATE_VM`` ioctl(). The hypervisor isolates pVMs from the host by
+unmapping pages from the stage-2 identity map as they are accessed by a
+pVM. Hypercalls are provided for a pVM to share specific regions of its
+IPA space back with the host, allowing for communication with the VMM.
+A Linux guest must be configured with ``CONFIG_ARM_PKVM_GUEST=y`` in
+order to issue these hypercalls.
+
+See hypercalls.rst for more details.
+
+Isolation mechanisms
+====================
+
+pKVM relies on a number of mechanisms to isolate pVMs from the host:
+
+CPU memory isolation
+--------------------
+
+Status: Isolation of anonymous memory and metadata pages.
+
+Metadata pages (e.g. page-table pages and '``struct kvm_vcpu``' pages)
+are donated from the host to the hypervisor during pVM creation and
+are consequently unmapped from the stage-2 identity map until the pVM is
+destroyed.
+
+Similarly to regular KVM, pages are lazily mapped into the guest in
+response to stage-2 page faults handled by the host. However, when
+running a pVM, these pages are first pinned and then unmapped from the
+stage-2 identity map as part of the donation procedure. This gives rise
+to some user-visible differences when compared to non-protected VMs,
+largely due to the lack of MMU notifiers:
+
+* Memslots cannot be moved or deleted once the pVM has started running.
+* Read-only memslots and dirty logging are not supported.
+* With the exception of swap, file-backed pages cannot be mapped into a
+ pVM.
+* Donated pages are accounted against ``RLIMIT_MLOCK`` and so the VMM
+ must have a sufficient resource limit or be granted ``CAP_IPC_LOCK``.
+ The lack of a runtime reclaim mechanism means that memory locked for
+ a pVM will remain locked until the pVM is destroyed.
+* Changes to the VMM address space (e.g. a ``MAP_FIXED`` mmap() over a
+ mapping associated with a memslot) are not reflected in the guest and
+ may lead to loss of coherency.
+* Accessing pVM memory that has not been shared back will result in the
+ delivery of a SIGSEGV.
+* If a system call accesses pVM memory that has not been shared back
+ then it will either return ``-EFAULT`` or forcefully reclaim the
+ memory pages. Reclaimed memory is zeroed by the hypervisor and a
+ subsequent attempt to access it in the pVM will return ``-EFAULT``
+ from the ``VCPU_RUN`` ioctl().
+
+CPU state isolation
+-------------------
+
+Status: **Unimplemented.**
+
+DMA isolation using an IOMMU
+----------------------------
+
+Status: **Unimplemented.**
+
+Proxying of Trustzone services
+------------------------------
+
+Status: FF-A and PSCI calls from the host are proxied by the pKVM
+hypervisor.
+
+The FF-A proxy ensures that the host cannot share pVM or hypervisor
+memory with Trustzone as part of a "confused deputy" attack.
+
+The PSCI proxy ensures that CPUs always have the stage-2 identity map
+installed when they are executing in the host.
+
+Protected VM firmware (pvmfw)
+-----------------------------
+
+Status: **Unimplemented.**
+
+Resources
+=========
+
+Quentin Perret's KVM Forum 2022 talk entitled "Protected KVM on arm64: A
+technical deep dive" remains a good resource for learning more about
+pKVM, despite some of the details having changed in the meantime:
+
+https://www.youtube.com/watch?v=9npebeVFbFw
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 93371b6138d1..37414440cee7 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -51,7 +51,7 @@
#include <linux/mm.h>
enum __kvm_host_smccc_func {
- /* Hypercalls available only prior to pKVM finalisation */
+ /* Hypercalls that are unavailable once pKVM has finalised. */
/* __KVM_HOST_SMCCC_FUNC___kvm_hyp_init */
__KVM_HOST_SMCCC_FUNC___pkvm_init = __KVM_HOST_SMCCC_FUNC___kvm_hyp_init + 1,
__KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping,
@@ -60,16 +60,9 @@ enum __kvm_host_smccc_func {
__KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs,
__KVM_HOST_SMCCC_FUNC___vgic_v3_get_gic_config,
__KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize,
+ __KVM_HOST_SMCCC_FUNC_MIN_PKVM = __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize,
- /* Hypercalls available after pKVM finalisation */
- __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp,
- __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_hyp,
- __KVM_HOST_SMCCC_FUNC___pkvm_host_share_guest,
- __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_guest,
- __KVM_HOST_SMCCC_FUNC___pkvm_host_relax_perms_guest,
- __KVM_HOST_SMCCC_FUNC___pkvm_host_wrprotect_guest,
- __KVM_HOST_SMCCC_FUNC___pkvm_host_test_clear_young_guest,
- __KVM_HOST_SMCCC_FUNC___pkvm_host_mkyoung_guest,
+ /* Hypercalls that are always available and common to [nh]VHE/pKVM. */
__KVM_HOST_SMCCC_FUNC___kvm_adjust_pc,
__KVM_HOST_SMCCC_FUNC___kvm_vcpu_run,
__KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context,
@@ -83,11 +76,27 @@ enum __kvm_host_smccc_func {
__KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs,
__KVM_HOST_SMCCC_FUNC___vgic_v5_save_apr,
__KVM_HOST_SMCCC_FUNC___vgic_v5_restore_vmcr_apr,
+ __KVM_HOST_SMCCC_FUNC_MAX_NO_PKVM = __KVM_HOST_SMCCC_FUNC___vgic_v5_restore_vmcr_apr,
+
+ /* Hypercalls that are available only when pKVM has finalised. */
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_hyp,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_donate_guest,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_share_guest,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_guest,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_relax_perms_guest,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_wrprotect_guest,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_test_clear_young_guest,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_mkyoung_guest,
__KVM_HOST_SMCCC_FUNC___pkvm_reserve_vm,
__KVM_HOST_SMCCC_FUNC___pkvm_unreserve_vm,
__KVM_HOST_SMCCC_FUNC___pkvm_init_vm,
__KVM_HOST_SMCCC_FUNC___pkvm_init_vcpu,
- __KVM_HOST_SMCCC_FUNC___pkvm_teardown_vm,
+ __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_in_poison_fault,
+ __KVM_HOST_SMCCC_FUNC___pkvm_force_reclaim_guest_page,
+ __KVM_HOST_SMCCC_FUNC___pkvm_reclaim_dying_guest_page,
+ __KVM_HOST_SMCCC_FUNC___pkvm_start_teardown_vm,
+ __KVM_HOST_SMCCC_FUNC___pkvm_finalize_teardown_vm,
__KVM_HOST_SMCCC_FUNC___pkvm_vcpu_load,
__KVM_HOST_SMCCC_FUNC___pkvm_vcpu_put,
__KVM_HOST_SMCCC_FUNC___pkvm_tlb_flush_vmid,
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index f8c1e6920fd3..1a4aac11c969 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -251,7 +251,7 @@ struct kvm_smccc_features {
unsigned long vendor_hyp_bmap_2; /* Function numbers 64-127 */
};
-typedef unsigned int pkvm_handle_t;
+typedef u16 pkvm_handle_t;
struct kvm_protected_vm {
pkvm_handle_t handle;
@@ -259,6 +259,13 @@ struct kvm_protected_vm {
struct kvm_hyp_memcache stage2_teardown_mc;
bool is_protected;
bool is_created;
+
+ /*
+ * True when the guest is being torn down. When in this state, the
+ * guest's vCPUs can't be loaded anymore, but its pages can be
+ * reclaimed by the host.
+ */
+ bool is_dying;
};
struct kvm_mpidr_data {
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index c201168f2857..41a8687938eb 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -99,14 +99,30 @@ typedef u64 kvm_pte_t;
KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
KVM_PTE_LEAF_ATTR_HI_S2_XN)
-#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2)
-#define KVM_MAX_OWNER_ID 1
+/* pKVM invalid pte encodings */
+#define KVM_INVALID_PTE_TYPE_MASK GENMASK(63, 60)
+#define KVM_INVALID_PTE_ANNOT_MASK ~(KVM_PTE_VALID | \
+ KVM_INVALID_PTE_TYPE_MASK)
-/*
- * Used to indicate a pte for which a 'break-before-make' sequence is in
- * progress.
- */
-#define KVM_INVALID_PTE_LOCKED BIT(10)
+enum kvm_invalid_pte_type {
+ /*
+ * Used to indicate a pte for which a 'break-before-make'
+ * sequence is in progress.
+ */
+ KVM_INVALID_PTE_TYPE_LOCKED = 1,
+
+ /*
+ * pKVM has unmapped the page from the host due to a change of
+ * ownership.
+ */
+ KVM_HOST_INVALID_PTE_TYPE_DONATION,
+
+ /*
+ * The page has been forcefully reclaimed from the guest by the
+ * host.
+ */
+ KVM_GUEST_INVALID_PTE_TYPE_POISONED,
+};
static inline bool kvm_pte_valid(kvm_pte_t pte)
{
@@ -658,14 +674,18 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
void *mc, enum kvm_pgtable_walk_flags flags);
/**
- * kvm_pgtable_stage2_set_owner() - Unmap and annotate pages in the IPA space to
- * track ownership.
+ * kvm_pgtable_stage2_annotate() - Unmap and annotate pages in the IPA space
+ * to track ownership (and more).
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
* @addr: Base intermediate physical address to annotate.
* @size: Size of the annotated range.
* @mc: Cache of pre-allocated and zeroed memory from which to allocate
* page-table pages.
- * @owner_id: Unique identifier for the owner of the page.
+ * @type: The type of the annotation, determining its meaning and format.
+ * @annotation: A 59-bit value that will be stored in the page tables.
+ * @annotation[0] and @annotation[63:60] must be 0.
+ * @annotation[59:1] is stored in the page tables, along
+ * with @type.
*
* By default, all page-tables are owned by identifier 0. This function can be
* used to mark portions of the IPA space as owned by other entities. When a
@@ -674,8 +694,9 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
*
* Return: 0 on success, negative error code on failure.
*/
-int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
- void *mc, u8 owner_id);
+int kvm_pgtable_stage2_annotate(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ void *mc, enum kvm_invalid_pte_type type,
+ kvm_pte_t annotation);
/**
* kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table.
diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h
index 757076ad4ec9..2954b311128c 100644
--- a/arch/arm64/include/asm/kvm_pkvm.h
+++ b/arch/arm64/include/asm/kvm_pkvm.h
@@ -17,7 +17,7 @@
#define HYP_MEMBLOCK_REGIONS 128
-int pkvm_init_host_vm(struct kvm *kvm);
+int pkvm_init_host_vm(struct kvm *kvm, unsigned long type);
int pkvm_create_hyp_vm(struct kvm *kvm);
bool pkvm_hyp_vm_is_created(struct kvm *kvm);
void pkvm_destroy_hyp_vm(struct kvm *kvm);
@@ -40,8 +40,6 @@ static inline bool kvm_pkvm_ext_allowed(struct kvm *kvm, long ext)
case KVM_CAP_MAX_VCPU_ID:
case KVM_CAP_MSI_DEVID:
case KVM_CAP_ARM_VM_IPA_SIZE:
- case KVM_CAP_ARM_PMU_V3:
- case KVM_CAP_ARM_SVE:
case KVM_CAP_ARM_PTRAUTH_ADDRESS:
case KVM_CAP_ARM_PTRAUTH_GENERIC:
return true;
diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h
index b51ab6840f9c..b546703c3ab9 100644
--- a/arch/arm64/include/asm/virt.h
+++ b/arch/arm64/include/asm/virt.h
@@ -94,6 +94,15 @@ static inline bool is_pkvm_initialized(void)
static_branch_likely(&kvm_protected_mode_initialized);
}
+#ifdef CONFIG_KVM
+bool pkvm_force_reclaim_guest_page(phys_addr_t phys);
+#else
+static inline bool pkvm_force_reclaim_guest_page(phys_addr_t phys)
+{
+ return false;
+}
+#endif
+
/* Reports the availability of HYP mode */
static inline bool is_hyp_mode_available(void)
{
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index c4c810b01012..d7fd6880bd8d 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -208,6 +208,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
int ret;
+ if (type & ~KVM_VM_TYPE_ARM_MASK)
+ return -EINVAL;
+
mutex_init(&kvm->arch.config_lock);
#ifdef CONFIG_LOCKDEP
@@ -239,9 +242,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
* If any failures occur after this is successful, make sure to
* call __pkvm_unreserve_vm to unreserve the VM in hyp.
*/
- ret = pkvm_init_host_vm(kvm);
+ ret = pkvm_init_host_vm(kvm, type);
if (ret)
- goto err_free_cpumask;
+ goto err_uninit_mmu;
+ } else if (type & KVM_VM_TYPE_ARM_PROTECTED) {
+ ret = -EINVAL;
+ goto err_uninit_mmu;
}
kvm_vgic_early_init(kvm);
@@ -257,6 +263,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
return 0;
+err_uninit_mmu:
+ kvm_uninit_stage2_mmu(kvm);
err_free_cpumask:
free_cpumask_var(kvm->arch.supported_cpus);
err_unshare_kvm:
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
index f8a7b8c04c49..3cbfae0e3dda 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@ -27,16 +27,22 @@ extern struct host_mmu host_mmu;
enum pkvm_component_id {
PKVM_ID_HOST,
PKVM_ID_HYP,
- PKVM_ID_FFA,
+ PKVM_ID_GUEST,
};
int __pkvm_prot_finalize(void);
int __pkvm_host_share_hyp(u64 pfn);
+int __pkvm_guest_share_host(struct pkvm_hyp_vcpu *vcpu, u64 gfn);
+int __pkvm_guest_unshare_host(struct pkvm_hyp_vcpu *vcpu, u64 gfn);
int __pkvm_host_unshare_hyp(u64 pfn);
int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages);
int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages);
int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages);
int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages);
+int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu);
+int __pkvm_vcpu_in_poison_fault(struct pkvm_hyp_vcpu *hyp_vcpu);
+int __pkvm_host_force_reclaim_page_guest(phys_addr_t phys);
+int __pkvm_host_reclaim_page_guest(u64 gfn, struct pkvm_hyp_vm *vm);
int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu *vcpu,
enum kvm_pgtable_prot prot);
int __pkvm_host_unshare_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *hyp_vm);
@@ -68,6 +74,8 @@ static __always_inline void __load_host_stage2(void)
#ifdef CONFIG_NVHE_EL2_DEBUG
void pkvm_ownership_selftest(void *base);
+struct pkvm_hyp_vcpu *init_selftest_vm(void *virt);
+void teardown_selftest_vm(void);
#else
static inline void pkvm_ownership_selftest(void *base) { }
#endif
diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h
index dee1a406b0c2..b50712d47f6d 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/memory.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h
@@ -30,8 +30,14 @@ enum pkvm_page_state {
* struct hyp_page.
*/
PKVM_NOPAGE = BIT(0) | BIT(1),
+
+ /*
+ * 'Meta-states' which aren't encoded directly in the PTE's SW bits (or
+ * the hyp_vmemmap entry for the host)
+ */
+ PKVM_POISON = BIT(2),
};
-#define PKVM_PAGE_STATE_MASK (BIT(0) | BIT(1))
+#define PKVM_PAGE_STATE_VMEMMAP_MASK (BIT(0) | BIT(1))
#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1)
static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot,
@@ -108,12 +114,12 @@ static inline void set_host_state(struct hyp_page *p, enum pkvm_page_state state
static inline enum pkvm_page_state get_hyp_state(struct hyp_page *p)
{
- return p->__hyp_state_comp ^ PKVM_PAGE_STATE_MASK;
+ return p->__hyp_state_comp ^ PKVM_PAGE_STATE_VMEMMAP_MASK;
}
static inline void set_hyp_state(struct hyp_page *p, enum pkvm_page_state state)
{
- p->__hyp_state_comp = state ^ PKVM_PAGE_STATE_MASK;
+ p->__hyp_state_comp = state ^ PKVM_PAGE_STATE_VMEMMAP_MASK;
}
/*
diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
index 184ad7a39950..c904647d2f76 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
@@ -73,8 +73,12 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
unsigned long pgd_hva);
int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu,
unsigned long vcpu_hva);
-int __pkvm_teardown_vm(pkvm_handle_t handle);
+int __pkvm_reclaim_dying_guest_page(pkvm_handle_t handle, u64 gfn);
+int __pkvm_start_teardown_vm(pkvm_handle_t handle);
+int __pkvm_finalize_teardown_vm(pkvm_handle_t handle);
+
+struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle);
struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle,
unsigned int vcpu_idx);
void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu);
@@ -84,6 +88,7 @@ struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle);
struct pkvm_hyp_vm *get_np_pkvm_hyp_vm(pkvm_handle_t handle);
void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm);
+bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code);
bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code);
bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code);
void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h
index ba5382c12787..32d7b7746e8e 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h
@@ -16,4 +16,6 @@
__always_unused int ___check_reg_ ## reg; \
type name = (type)cpu_reg(ctxt, (reg))
+void inject_host_exception(u64 esr);
+
#endif /* __ARM64_KVM_NVHE_TRAP_HANDLER_H__ */
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 31f080307d95..fd391eda8fca 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -173,9 +173,6 @@ static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt)
DECLARE_REG(u64, hcr_el2, host_ctxt, 3);
struct pkvm_hyp_vcpu *hyp_vcpu;
- if (!is_protected_kvm_enabled())
- return;
-
hyp_vcpu = pkvm_load_hyp_vcpu(handle, vcpu_idx);
if (!hyp_vcpu)
return;
@@ -192,12 +189,8 @@ static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt)
static void handle___pkvm_vcpu_put(struct kvm_cpu_context *host_ctxt)
{
- struct pkvm_hyp_vcpu *hyp_vcpu;
+ struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
- if (!is_protected_kvm_enabled())
- return;
-
- hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
if (hyp_vcpu)
pkvm_put_hyp_vcpu(hyp_vcpu);
}
@@ -252,6 +245,26 @@ static int pkvm_refill_memcache(struct pkvm_hyp_vcpu *hyp_vcpu)
&host_vcpu->arch.pkvm_memcache);
}
+static void handle___pkvm_host_donate_guest(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u64, pfn, host_ctxt, 1);
+ DECLARE_REG(u64, gfn, host_ctxt, 2);
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+ int ret = -EINVAL;
+
+ hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
+ if (!hyp_vcpu || !pkvm_hyp_vcpu_is_protected(hyp_vcpu))
+ goto out;
+
+ ret = pkvm_refill_memcache(hyp_vcpu);
+ if (ret)
+ goto out;
+
+ ret = __pkvm_host_donate_guest(pfn, gfn, hyp_vcpu);
+out:
+ cpu_reg(host_ctxt, 1) = ret;
+}
+
static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(u64, pfn, host_ctxt, 1);
@@ -261,9 +274,6 @@ static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt)
struct pkvm_hyp_vcpu *hyp_vcpu;
int ret = -EINVAL;
- if (!is_protected_kvm_enabled())
- goto out;
-
hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu))
goto out;
@@ -285,9 +295,6 @@ static void handle___pkvm_host_unshare_guest(struct kvm_cpu_context *host_ctxt)
struct pkvm_hyp_vm *hyp_vm;
int ret = -EINVAL;
- if (!is_protected_kvm_enabled())
- goto out;
-
hyp_vm = get_np_pkvm_hyp_vm(handle);
if (!hyp_vm)
goto out;
@@ -305,9 +312,6 @@ static void handle___pkvm_host_relax_perms_guest(struct kvm_cpu_context *host_ct
struct pkvm_hyp_vcpu *hyp_vcpu;
int ret = -EINVAL;
- if (!is_protected_kvm_enabled())
- goto out;
-
hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu))
goto out;
@@ -325,9 +329,6 @@ static void handle___pkvm_host_wrprotect_guest(struct kvm_cpu_context *host_ctxt
struct pkvm_hyp_vm *hyp_vm;
int ret = -EINVAL;
- if (!is_protected_kvm_enabled())
- goto out;
-
hyp_vm = get_np_pkvm_hyp_vm(handle);
if (!hyp_vm)
goto out;
@@ -347,9 +348,6 @@ static void handle___pkvm_host_test_clear_young_guest(struct kvm_cpu_context *ho
struct pkvm_hyp_vm *hyp_vm;
int ret = -EINVAL;
- if (!is_protected_kvm_enabled())
- goto out;
-
hyp_vm = get_np_pkvm_hyp_vm(handle);
if (!hyp_vm)
goto out;
@@ -366,9 +364,6 @@ static void handle___pkvm_host_mkyoung_guest(struct kvm_cpu_context *host_ctxt)
struct pkvm_hyp_vcpu *hyp_vcpu;
int ret = -EINVAL;
- if (!is_protected_kvm_enabled())
- goto out;
-
hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu))
goto out;
@@ -428,12 +423,8 @@ static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt)
static void handle___pkvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
- struct pkvm_hyp_vm *hyp_vm;
+ struct pkvm_hyp_vm *hyp_vm = get_np_pkvm_hyp_vm(handle);
- if (!is_protected_kvm_enabled())
- return;
-
- hyp_vm = get_np_pkvm_hyp_vm(handle);
if (!hyp_vm)
return;
@@ -584,11 +575,42 @@ static void handle___pkvm_init_vcpu(struct kvm_cpu_context *host_ctxt)
cpu_reg(host_ctxt, 1) = __pkvm_init_vcpu(handle, host_vcpu, vcpu_hva);
}
-static void handle___pkvm_teardown_vm(struct kvm_cpu_context *host_ctxt)
+static void handle___pkvm_vcpu_in_poison_fault(struct kvm_cpu_context *host_ctxt)
+{
+ int ret;
+ struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
+
+ ret = hyp_vcpu ? __pkvm_vcpu_in_poison_fault(hyp_vcpu) : -EINVAL;
+ cpu_reg(host_ctxt, 1) = ret;
+}
+
+static void handle___pkvm_force_reclaim_guest_page(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(phys_addr_t, phys, host_ctxt, 1);
+
+ cpu_reg(host_ctxt, 1) = __pkvm_host_force_reclaim_page_guest(phys);
+}
+
+static void handle___pkvm_reclaim_dying_guest_page(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+ DECLARE_REG(u64, gfn, host_ctxt, 2);
+
+ cpu_reg(host_ctxt, 1) = __pkvm_reclaim_dying_guest_page(handle, gfn);
+}
+
+static void handle___pkvm_start_teardown_vm(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
- cpu_reg(host_ctxt, 1) = __pkvm_teardown_vm(handle);
+ cpu_reg(host_ctxt, 1) = __pkvm_start_teardown_vm(handle);
+}
+
+static void handle___pkvm_finalize_teardown_vm(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+
+ cpu_reg(host_ctxt, 1) = __pkvm_finalize_teardown_vm(handle);
}
static void handle___tracing_load(struct kvm_cpu_context *host_ctxt)
@@ -678,14 +700,6 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__vgic_v3_get_gic_config),
HANDLE_FUNC(__pkvm_prot_finalize),
- HANDLE_FUNC(__pkvm_host_share_hyp),
- HANDLE_FUNC(__pkvm_host_unshare_hyp),
- HANDLE_FUNC(__pkvm_host_share_guest),
- HANDLE_FUNC(__pkvm_host_unshare_guest),
- HANDLE_FUNC(__pkvm_host_relax_perms_guest),
- HANDLE_FUNC(__pkvm_host_wrprotect_guest),
- HANDLE_FUNC(__pkvm_host_test_clear_young_guest),
- HANDLE_FUNC(__pkvm_host_mkyoung_guest),
HANDLE_FUNC(__kvm_adjust_pc),
HANDLE_FUNC(__kvm_vcpu_run),
HANDLE_FUNC(__kvm_flush_vm_context),
@@ -699,11 +713,25 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs),
HANDLE_FUNC(__vgic_v5_save_apr),
HANDLE_FUNC(__vgic_v5_restore_vmcr_apr),
+
+ HANDLE_FUNC(__pkvm_host_share_hyp),
+ HANDLE_FUNC(__pkvm_host_unshare_hyp),
+ HANDLE_FUNC(__pkvm_host_donate_guest),
+ HANDLE_FUNC(__pkvm_host_share_guest),
+ HANDLE_FUNC(__pkvm_host_unshare_guest),
+ HANDLE_FUNC(__pkvm_host_relax_perms_guest),
+ HANDLE_FUNC(__pkvm_host_wrprotect_guest),
+ HANDLE_FUNC(__pkvm_host_test_clear_young_guest),
+ HANDLE_FUNC(__pkvm_host_mkyoung_guest),
HANDLE_FUNC(__pkvm_reserve_vm),
HANDLE_FUNC(__pkvm_unreserve_vm),
HANDLE_FUNC(__pkvm_init_vm),
HANDLE_FUNC(__pkvm_init_vcpu),
- HANDLE_FUNC(__pkvm_teardown_vm),
+ HANDLE_FUNC(__pkvm_vcpu_in_poison_fault),
+ HANDLE_FUNC(__pkvm_force_reclaim_guest_page),
+ HANDLE_FUNC(__pkvm_reclaim_dying_guest_page),
+ HANDLE_FUNC(__pkvm_start_teardown_vm),
+ HANDLE_FUNC(__pkvm_finalize_teardown_vm),
HANDLE_FUNC(__pkvm_vcpu_load),
HANDLE_FUNC(__pkvm_vcpu_put),
HANDLE_FUNC(__pkvm_tlb_flush_vmid),
@@ -720,7 +748,7 @@ static const hcall_t host_hcall[] = {
static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(unsigned long, id, host_ctxt, 0);
- unsigned long hcall_min = 0;
+ unsigned long hcall_min = 0, hcall_max = -1;
hcall_t hfn;
/*
@@ -732,14 +760,19 @@ static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
* basis. This is all fine, however, since __pkvm_prot_finalize
* returns -EPERM after the first call for a given CPU.
*/
- if (static_branch_unlikely(&kvm_protected_mode_initialized))
- hcall_min = __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize;
+ if (static_branch_unlikely(&kvm_protected_mode_initialized)) {
+ hcall_min = __KVM_HOST_SMCCC_FUNC_MIN_PKVM;
+ } else {
+ hcall_max = __KVM_HOST_SMCCC_FUNC_MAX_NO_PKVM;
+ }
id &= ~ARM_SMCCC_CALL_HINTS;
id -= KVM_HOST_SMCCC_ID(0);
- if (unlikely(id < hcall_min || id >= ARRAY_SIZE(host_hcall)))
+ if (unlikely(id < hcall_min || id > hcall_max ||
+ id >= ARRAY_SIZE(host_hcall))) {
goto inval;
+ }
hfn = host_hcall[id];
if (unlikely(!hfn))
@@ -777,43 +810,52 @@ static void handle_host_smc(struct kvm_cpu_context *host_ctxt)
kvm_skip_host_instr();
}
-/*
- * Inject an Undefined Instruction exception into the host.
- *
- * This is open-coded to allow control over PSTATE construction without
- * complicating the generic exception entry helpers.
- */
-static void inject_undef64(void)
+void inject_host_exception(u64 esr)
{
- u64 spsr_mask, vbar, sctlr, old_spsr, new_spsr, esr, offset;
-
- spsr_mask = PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT | PSR_DIT_BIT | PSR_PAN_BIT;
+ u64 sctlr, spsr_el1, spsr_el2, exc_offset = except_type_sync;
+ const u64 spsr_mask = PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT |
+ PSR_V_BIT | PSR_DIT_BIT | PSR_PAN_BIT;
- vbar = read_sysreg_el1(SYS_VBAR);
- sctlr = read_sysreg_el1(SYS_SCTLR);
- old_spsr = read_sysreg_el2(SYS_SPSR);
+ spsr_el1 = spsr_el2 = read_sysreg_el2(SYS_SPSR);
+ switch (spsr_el1 & (PSR_MODE_MASK | PSR_MODE32_BIT)) {
+ case PSR_MODE_EL0t:
+ exc_offset += LOWER_EL_AArch64_VECTOR;
+ break;
+ case PSR_MODE_EL0t | PSR_MODE32_BIT:
+ exc_offset += LOWER_EL_AArch32_VECTOR;
+ break;
+ default:
+ exc_offset += CURRENT_EL_SP_ELx_VECTOR;
+ }
- new_spsr = old_spsr & spsr_mask;
- new_spsr |= PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT;
- new_spsr |= PSR_MODE_EL1h;
+ spsr_el2 &= spsr_mask;
+ spsr_el2 |= PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT |
+ PSR_MODE_EL1h;
+ sctlr = read_sysreg_el1(SYS_SCTLR);
if (!(sctlr & SCTLR_EL1_SPAN))
- new_spsr |= PSR_PAN_BIT;
+ spsr_el2 |= PSR_PAN_BIT;
if (sctlr & SCTLR_ELx_DSSBS)
- new_spsr |= PSR_SSBS_BIT;
+ spsr_el2 |= PSR_SSBS_BIT;
if (system_supports_mte())
- new_spsr |= PSR_TCO_BIT;
+ spsr_el2 |= PSR_TCO_BIT;
- esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT) | ESR_ELx_IL;
- offset = CURRENT_EL_SP_ELx_VECTOR + except_type_sync;
+ if (esr_fsc_is_translation_fault(esr))
+ write_sysreg_el1(read_sysreg_el2(SYS_FAR), SYS_FAR);
write_sysreg_el1(esr, SYS_ESR);
write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR);
- write_sysreg_el1(old_spsr, SYS_SPSR);
- write_sysreg_el2(vbar + offset, SYS_ELR);
- write_sysreg_el2(new_spsr, SYS_SPSR);
+ write_sysreg_el1(spsr_el1, SYS_SPSR);
+ write_sysreg_el2(read_sysreg_el1(SYS_VBAR) + exc_offset, SYS_ELR);
+ write_sysreg_el2(spsr_el2, SYS_SPSR);
+}
+
+static void inject_host_undef64(void)
+{
+ inject_host_exception((ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT) |
+ ESR_ELx_IL);
}
static bool handle_host_mte(u64 esr)
@@ -836,7 +878,7 @@ static bool handle_host_mte(u64 esr)
return false;
}
- inject_undef64();
+ inject_host_undef64();
return true;
}
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index d815265bd374..28a471d1927c 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -18,6 +18,7 @@
#include <nvhe/memory.h>
#include <nvhe/mem_protect.h>
#include <nvhe/mm.h>
+#include <nvhe/trap_handler.h>
#define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_AS_S1 | KVM_PGTABLE_S2_IDMAP)
@@ -461,8 +462,15 @@ static bool range_is_memory(u64 start, u64 end)
static inline int __host_stage2_idmap(u64 start, u64 end,
enum kvm_pgtable_prot prot)
{
+ /*
+ * We don't make permission changes to the host idmap after
+ * initialisation, so we can squash -EAGAIN to save callers
+ * having to treat it like success in the case that they try to
+ * map something that is already mapped.
+ */
return kvm_pgtable_stage2_map(&host_mmu.pgt, start, end - start, start,
- prot, &host_s2_pool, 0);
+ prot, &host_s2_pool,
+ KVM_PGTABLE_WALK_IGNORE_EAGAIN);
}
/*
@@ -504,7 +512,7 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
return ret;
if (kvm_pte_valid(pte))
- return -EAGAIN;
+ return -EEXIST;
if (pte) {
WARN_ON(addr_is_memory(addr) &&
@@ -541,24 +549,99 @@ static void __host_update_page_state(phys_addr_t addr, u64 size, enum pkvm_page_
set_host_state(page, state);
}
-int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
+#define KVM_HOST_DONATION_PTE_OWNER_MASK GENMASK(3, 1)
+#define KVM_HOST_DONATION_PTE_EXTRA_MASK GENMASK(59, 4)
+static int host_stage2_set_owner_metadata_locked(phys_addr_t addr, u64 size,
+ u8 owner_id, u64 meta)
{
+ kvm_pte_t annotation;
int ret;
+ if (owner_id == PKVM_ID_HOST)
+ return -EINVAL;
+
if (!range_is_memory(addr, addr + size))
return -EPERM;
- ret = host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt,
- addr, size, &host_s2_pool, owner_id);
- if (ret)
- return ret;
+ if (!FIELD_FIT(KVM_HOST_DONATION_PTE_OWNER_MASK, owner_id))
+ return -EINVAL;
- /* Don't forget to update the vmemmap tracking for the host */
- if (owner_id == PKVM_ID_HOST)
- __host_update_page_state(addr, size, PKVM_PAGE_OWNED);
- else
+ if (!FIELD_FIT(KVM_HOST_DONATION_PTE_EXTRA_MASK, meta))
+ return -EINVAL;
+
+ annotation = FIELD_PREP(KVM_HOST_DONATION_PTE_OWNER_MASK, owner_id) |
+ FIELD_PREP(KVM_HOST_DONATION_PTE_EXTRA_MASK, meta);
+ ret = host_stage2_try(kvm_pgtable_stage2_annotate, &host_mmu.pgt,
+ addr, size, &host_s2_pool,
+ KVM_HOST_INVALID_PTE_TYPE_DONATION, annotation);
+ if (!ret)
__host_update_page_state(addr, size, PKVM_NOPAGE);
+ return ret;
+}
+
+int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
+{
+ int ret = -EINVAL;
+
+ switch (owner_id) {
+ case PKVM_ID_HOST:
+ if (!range_is_memory(addr, addr + size))
+ return -EPERM;
+
+ ret = host_stage2_idmap_locked(addr, size, PKVM_HOST_MEM_PROT);
+ if (!ret)
+ __host_update_page_state(addr, size, PKVM_PAGE_OWNED);
+ break;
+ case PKVM_ID_HYP:
+ ret = host_stage2_set_owner_metadata_locked(addr, size,
+ owner_id, 0);
+ break;
+ }
+
+ return ret;
+}
+
+#define KVM_HOST_PTE_OWNER_GUEST_HANDLE_MASK GENMASK(15, 0)
+/* We need 40 bits for the GFN to cover a 52-bit IPA with 4k pages and LPA2 */
+#define KVM_HOST_PTE_OWNER_GUEST_GFN_MASK GENMASK(55, 16)
+static u64 host_stage2_encode_gfn_meta(struct pkvm_hyp_vm *vm, u64 gfn)
+{
+ pkvm_handle_t handle = vm->kvm.arch.pkvm.handle;
+
+ BUILD_BUG_ON((pkvm_handle_t)-1 > KVM_HOST_PTE_OWNER_GUEST_HANDLE_MASK);
+ WARN_ON(!FIELD_FIT(KVM_HOST_PTE_OWNER_GUEST_GFN_MASK, gfn));
+
+ return FIELD_PREP(KVM_HOST_PTE_OWNER_GUEST_HANDLE_MASK, handle) |
+ FIELD_PREP(KVM_HOST_PTE_OWNER_GUEST_GFN_MASK, gfn);
+}
+
+static int host_stage2_decode_gfn_meta(kvm_pte_t pte, struct pkvm_hyp_vm **vm,
+ u64 *gfn)
+{
+ pkvm_handle_t handle;
+ u64 meta;
+
+ if (WARN_ON(kvm_pte_valid(pte)))
+ return -EINVAL;
+
+ if (FIELD_GET(KVM_INVALID_PTE_TYPE_MASK, pte) !=
+ KVM_HOST_INVALID_PTE_TYPE_DONATION) {
+ return -EINVAL;
+ }
+
+ if (FIELD_GET(KVM_HOST_DONATION_PTE_OWNER_MASK, pte) != PKVM_ID_GUEST)
+ return -EPERM;
+
+ meta = FIELD_GET(KVM_HOST_DONATION_PTE_EXTRA_MASK, pte);
+ handle = FIELD_GET(KVM_HOST_PTE_OWNER_GUEST_HANDLE_MASK, meta);
+ *vm = get_vm_by_handle(handle);
+ if (!*vm) {
+ /* We probably raced with teardown; try again */
+ return -EAGAIN;
+ }
+
+ *gfn = FIELD_GET(KVM_HOST_PTE_OWNER_GUEST_GFN_MASK, meta);
return 0;
}
@@ -605,11 +688,43 @@ unlock:
return ret;
}
+static void host_inject_mem_abort(struct kvm_cpu_context *host_ctxt)
+{
+ u64 ec, esr, spsr;
+
+ esr = read_sysreg_el2(SYS_ESR);
+ spsr = read_sysreg_el2(SYS_SPSR);
+
+ /* Repaint the ESR to report a same-level fault if taken from EL1 */
+ if ((spsr & PSR_MODE_MASK) != PSR_MODE_EL0t) {
+ ec = ESR_ELx_EC(esr);
+ if (ec == ESR_ELx_EC_DABT_LOW)
+ ec = ESR_ELx_EC_DABT_CUR;
+ else if (ec == ESR_ELx_EC_IABT_LOW)
+ ec = ESR_ELx_EC_IABT_CUR;
+ else
+ WARN_ON(1);
+ esr &= ~ESR_ELx_EC_MASK;
+ esr |= ec << ESR_ELx_EC_SHIFT;
+ }
+
+ /*
+ * Since S1PTW should only ever be set for stage-2 faults, we're pretty
+ * much guaranteed that it won't be set in ESR_EL1 by the hardware. So,
+ * let's use that bit to allow the host abort handler to differentiate
+ * this abort from normal userspace faults.
+ *
+ * Note: although S1PTW is RES0 at EL1, it is guaranteed by the
+ * architecture to be backed by flops, so it should be safe to use.
+ */
+ esr |= ESR_ELx_S1PTW;
+ inject_host_exception(esr);
+}
+
void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
{
struct kvm_vcpu_fault_info fault;
u64 esr, addr;
- int ret = 0;
esr = read_sysreg_el2(SYS_ESR);
if (!__get_fault_info(esr, &fault)) {
@@ -628,8 +743,16 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
BUG_ON(!(fault.hpfar_el2 & HPFAR_EL2_NS));
addr = FIELD_GET(HPFAR_EL2_FIPA, fault.hpfar_el2) << 12;
- ret = host_stage2_idmap(addr);
- BUG_ON(ret && ret != -EAGAIN);
+ switch (host_stage2_idmap(addr)) {
+ case -EPERM:
+ host_inject_mem_abort(host_ctxt);
+ fallthrough;
+ case -EEXIST:
+ case 0:
+ break;
+ default:
+ BUG();
+ }
}
struct check_walk_data {
@@ -707,8 +830,20 @@ static int __hyp_check_page_state_range(phys_addr_t phys, u64 size, enum pkvm_pa
return 0;
}
+static bool guest_pte_is_poisoned(kvm_pte_t pte)
+{
+ if (kvm_pte_valid(pte))
+ return false;
+
+ return FIELD_GET(KVM_INVALID_PTE_TYPE_MASK, pte) ==
+ KVM_GUEST_INVALID_PTE_TYPE_POISONED;
+}
+
static enum pkvm_page_state guest_get_page_state(kvm_pte_t pte, u64 addr)
{
+ if (guest_pte_is_poisoned(pte))
+ return PKVM_POISON;
+
if (!kvm_pte_valid(pte))
return PKVM_NOPAGE;
@@ -727,6 +862,77 @@ static int __guest_check_page_state_range(struct pkvm_hyp_vm *vm, u64 addr,
return check_page_state_range(&vm->pgt, addr, size, &d);
}
+static int get_valid_guest_pte(struct pkvm_hyp_vm *vm, u64 ipa, kvm_pte_t *ptep, u64 *physp)
+{
+ kvm_pte_t pte;
+ u64 phys;
+ s8 level;
+ int ret;
+
+ ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level);
+ if (ret)
+ return ret;
+ if (guest_pte_is_poisoned(pte))
+ return -EHWPOISON;
+ if (!kvm_pte_valid(pte))
+ return -ENOENT;
+ if (level != KVM_PGTABLE_LAST_LEVEL)
+ return -E2BIG;
+
+ phys = kvm_pte_to_phys(pte);
+ ret = check_range_allowed_memory(phys, phys + PAGE_SIZE);
+ if (WARN_ON(ret))
+ return ret;
+
+ *ptep = pte;
+ *physp = phys;
+
+ return 0;
+}
+
+int __pkvm_vcpu_in_poison_fault(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+ struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu);
+ kvm_pte_t pte;
+ s8 level;
+ u64 ipa;
+ int ret;
+
+ switch (kvm_vcpu_trap_get_class(&hyp_vcpu->vcpu)) {
+ case ESR_ELx_EC_DABT_LOW:
+ case ESR_ELx_EC_IABT_LOW:
+ if (kvm_vcpu_trap_is_translation_fault(&hyp_vcpu->vcpu))
+ break;
+ fallthrough;
+ default:
+ return -EINVAL;
+ }
+
+ /*
+ * The host has the faulting IPA when it calls us from the guest
+ * fault handler but we retrieve it ourselves from the FAR so as
+ * to avoid exposing an "oracle" that could reveal data access
+ * patterns of the guest after initial donation of its pages.
+ */
+ ipa = kvm_vcpu_get_fault_ipa(&hyp_vcpu->vcpu);
+ ipa |= FAR_TO_FIPA_OFFSET(kvm_vcpu_get_hfar(&hyp_vcpu->vcpu));
+
+ guest_lock_component(vm);
+ ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level);
+ if (ret)
+ goto unlock;
+
+ if (level != KVM_PGTABLE_LAST_LEVEL) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ ret = guest_pte_is_poisoned(pte);
+unlock:
+ guest_unlock_component(vm);
+ return ret;
+}
+
int __pkvm_host_share_hyp(u64 pfn)
{
u64 phys = hyp_pfn_to_phys(pfn);
@@ -753,6 +959,72 @@ unlock:
return ret;
}
+int __pkvm_guest_share_host(struct pkvm_hyp_vcpu *vcpu, u64 gfn)
+{
+ struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
+ u64 phys, ipa = hyp_pfn_to_phys(gfn);
+ kvm_pte_t pte;
+ int ret;
+
+ host_lock_component();
+ guest_lock_component(vm);
+
+ ret = get_valid_guest_pte(vm, ipa, &pte, &phys);
+ if (ret)
+ goto unlock;
+
+ ret = -EPERM;
+ if (pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)) != PKVM_PAGE_OWNED)
+ goto unlock;
+ if (__host_check_page_state_range(phys, PAGE_SIZE, PKVM_NOPAGE))
+ goto unlock;
+
+ ret = 0;
+ WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, PAGE_SIZE, phys,
+ pkvm_mkstate(KVM_PGTABLE_PROT_RWX, PKVM_PAGE_SHARED_OWNED),
+ &vcpu->vcpu.arch.pkvm_memcache, 0));
+ WARN_ON(__host_set_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_BORROWED));
+unlock:
+ guest_unlock_component(vm);
+ host_unlock_component();
+
+ return ret;
+}
+
+int __pkvm_guest_unshare_host(struct pkvm_hyp_vcpu *vcpu, u64 gfn)
+{
+ struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
+ u64 meta, phys, ipa = hyp_pfn_to_phys(gfn);
+ kvm_pte_t pte;
+ int ret;
+
+ host_lock_component();
+ guest_lock_component(vm);
+
+ ret = get_valid_guest_pte(vm, ipa, &pte, &phys);
+ if (ret)
+ goto unlock;
+
+ ret = -EPERM;
+ if (pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)) != PKVM_PAGE_SHARED_OWNED)
+ goto unlock;
+ if (__host_check_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_BORROWED))
+ goto unlock;
+
+ ret = 0;
+ meta = host_stage2_encode_gfn_meta(vm, gfn);
+ WARN_ON(host_stage2_set_owner_metadata_locked(phys, PAGE_SIZE,
+ PKVM_ID_GUEST, meta));
+ WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, PAGE_SIZE, phys,
+ pkvm_mkstate(KVM_PGTABLE_PROT_RWX, PKVM_PAGE_OWNED),
+ &vcpu->vcpu.arch.pkvm_memcache, 0));
+unlock:
+ guest_unlock_component(vm);
+ host_unlock_component();
+
+ return ret;
+}
+
int __pkvm_host_unshare_hyp(u64 pfn)
{
u64 phys = hyp_pfn_to_phys(pfn);
@@ -960,6 +1232,176 @@ static int __guest_check_transition_size(u64 phys, u64 ipa, u64 nr_pages, u64 *s
return 0;
}
+static void hyp_poison_page(phys_addr_t phys)
+{
+ void *addr = hyp_fixmap_map(phys);
+
+ memset(addr, 0, PAGE_SIZE);
+ /*
+ * Prefer kvm_flush_dcache_to_poc() over __clean_dcache_guest_page()
+ * here as the latter may elide the CMO under the assumption that FWB
+ * will be enabled on CPUs that support it. This is incorrect for the
+ * host stage-2 and would otherwise lead to a malicious host potentially
+ * being able to read the contents of newly reclaimed guest pages.
+ */
+ kvm_flush_dcache_to_poc(addr, PAGE_SIZE);
+ hyp_fixmap_unmap();
+}
+
+static int host_stage2_get_guest_info(phys_addr_t phys, struct pkvm_hyp_vm **vm,
+ u64 *gfn)
+{
+ enum pkvm_page_state state;
+ kvm_pte_t pte;
+ s8 level;
+ int ret;
+
+ if (!addr_is_memory(phys))
+ return -EFAULT;
+
+ state = get_host_state(hyp_phys_to_page(phys));
+ switch (state) {
+ case PKVM_PAGE_OWNED:
+ case PKVM_PAGE_SHARED_OWNED:
+ case PKVM_PAGE_SHARED_BORROWED:
+ /* The access should no longer fault; try again. */
+ return -EAGAIN;
+ case PKVM_NOPAGE:
+ break;
+ default:
+ return -EPERM;
+ }
+
+ ret = kvm_pgtable_get_leaf(&host_mmu.pgt, phys, &pte, &level);
+ if (ret)
+ return ret;
+
+ if (WARN_ON(level != KVM_PGTABLE_LAST_LEVEL))
+ return -EINVAL;
+
+ return host_stage2_decode_gfn_meta(pte, vm, gfn);
+}
+
+int __pkvm_host_force_reclaim_page_guest(phys_addr_t phys)
+{
+ struct pkvm_hyp_vm *vm;
+ u64 gfn, ipa, pa;
+ kvm_pte_t pte;
+ int ret;
+
+ phys &= PAGE_MASK;
+
+ hyp_spin_lock(&vm_table_lock);
+ host_lock_component();
+
+ ret = host_stage2_get_guest_info(phys, &vm, &gfn);
+ if (ret)
+ goto unlock_host;
+
+ ipa = hyp_pfn_to_phys(gfn);
+ guest_lock_component(vm);
+ ret = get_valid_guest_pte(vm, ipa, &pte, &pa);
+ if (ret)
+ goto unlock_guest;
+
+ WARN_ON(pa != phys);
+ if (guest_get_page_state(pte, ipa) != PKVM_PAGE_OWNED) {
+ ret = -EPERM;
+ goto unlock_guest;
+ }
+
+ /* We really shouldn't be allocating, so don't pass a memcache */
+ ret = kvm_pgtable_stage2_annotate(&vm->pgt, ipa, PAGE_SIZE, NULL,
+ KVM_GUEST_INVALID_PTE_TYPE_POISONED,
+ 0);
+ if (ret)
+ goto unlock_guest;
+
+ hyp_poison_page(phys);
+ WARN_ON(host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HOST));
+unlock_guest:
+ guest_unlock_component(vm);
+unlock_host:
+ host_unlock_component();
+ hyp_spin_unlock(&vm_table_lock);
+
+ return ret;
+}
+
+int __pkvm_host_reclaim_page_guest(u64 gfn, struct pkvm_hyp_vm *vm)
+{
+ u64 ipa = hyp_pfn_to_phys(gfn);
+ kvm_pte_t pte;
+ u64 phys;
+ int ret;
+
+ host_lock_component();
+ guest_lock_component(vm);
+
+ ret = get_valid_guest_pte(vm, ipa, &pte, &phys);
+ if (ret)
+ goto unlock;
+
+ switch (guest_get_page_state(pte, ipa)) {
+ case PKVM_PAGE_OWNED:
+ WARN_ON(__host_check_page_state_range(phys, PAGE_SIZE, PKVM_NOPAGE));
+ hyp_poison_page(phys);
+ break;
+ case PKVM_PAGE_SHARED_OWNED:
+ WARN_ON(__host_check_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_BORROWED));
+ break;
+ default:
+ ret = -EPERM;
+ goto unlock;
+ }
+
+ WARN_ON(kvm_pgtable_stage2_unmap(&vm->pgt, ipa, PAGE_SIZE));
+ WARN_ON(host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HOST));
+
+unlock:
+ guest_unlock_component(vm);
+ host_unlock_component();
+
+ /*
+ * -EHWPOISON implies that the page was forcefully reclaimed already
+ * so return success for the GUP pin to be dropped.
+ */
+ return ret && ret != -EHWPOISON ? ret : 0;
+}
+
+int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu)
+{
+ struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
+ u64 phys = hyp_pfn_to_phys(pfn);
+ u64 ipa = hyp_pfn_to_phys(gfn);
+ u64 meta;
+ int ret;
+
+ host_lock_component();
+ guest_lock_component(vm);
+
+ ret = __host_check_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_OWNED);
+ if (ret)
+ goto unlock;
+
+ ret = __guest_check_page_state_range(vm, ipa, PAGE_SIZE, PKVM_NOPAGE);
+ if (ret)
+ goto unlock;
+
+ meta = host_stage2_encode_gfn_meta(vm, gfn);
+ WARN_ON(host_stage2_set_owner_metadata_locked(phys, PAGE_SIZE,
+ PKVM_ID_GUEST, meta));
+ WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, PAGE_SIZE, phys,
+ pkvm_mkstate(KVM_PGTABLE_PROT_RWX, PKVM_PAGE_OWNED),
+ &vcpu->vcpu.arch.pkvm_memcache, 0));
+
+unlock:
+ guest_unlock_component(vm);
+ host_unlock_component();
+
+ return ret;
+}
+
int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu *vcpu,
enum kvm_pgtable_prot prot)
{
@@ -1206,53 +1648,18 @@ struct pkvm_expected_state {
static struct pkvm_expected_state selftest_state;
static struct hyp_page *selftest_page;
-
-static struct pkvm_hyp_vm selftest_vm = {
- .kvm = {
- .arch = {
- .mmu = {
- .arch = &selftest_vm.kvm.arch,
- .pgt = &selftest_vm.pgt,
- },
- },
- },
-};
-
-static struct pkvm_hyp_vcpu selftest_vcpu = {
- .vcpu = {
- .arch = {
- .hw_mmu = &selftest_vm.kvm.arch.mmu,
- },
- .kvm = &selftest_vm.kvm,
- },
-};
-
-static void init_selftest_vm(void *virt)
-{
- struct hyp_page *p = hyp_virt_to_page(virt);
- int i;
-
- selftest_vm.kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr;
- WARN_ON(kvm_guest_prepare_stage2(&selftest_vm, virt));
-
- for (i = 0; i < pkvm_selftest_pages(); i++) {
- if (p[i].refcount)
- continue;
- p[i].refcount = 1;
- hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i]));
- }
-}
+static struct pkvm_hyp_vcpu *selftest_vcpu;
static u64 selftest_ipa(void)
{
- return BIT(selftest_vm.pgt.ia_bits - 1);
+ return BIT(selftest_vcpu->vcpu.arch.hw_mmu->pgt->ia_bits - 1);
}
static void assert_page_state(void)
{
void *virt = hyp_page_to_virt(selftest_page);
u64 size = PAGE_SIZE << selftest_page->order;
- struct pkvm_hyp_vcpu *vcpu = &selftest_vcpu;
+ struct pkvm_hyp_vcpu *vcpu = selftest_vcpu;
u64 phys = hyp_virt_to_phys(virt);
u64 ipa[2] = { selftest_ipa(), selftest_ipa() + PAGE_SIZE };
struct pkvm_hyp_vm *vm;
@@ -1267,10 +1674,10 @@ static void assert_page_state(void)
WARN_ON(__hyp_check_page_state_range(phys, size, selftest_state.hyp));
hyp_unlock_component();
- guest_lock_component(&selftest_vm);
+ guest_lock_component(vm);
WARN_ON(__guest_check_page_state_range(vm, ipa[0], size, selftest_state.guest[0]));
WARN_ON(__guest_check_page_state_range(vm, ipa[1], size, selftest_state.guest[1]));
- guest_unlock_component(&selftest_vm);
+ guest_unlock_component(vm);
}
#define assert_transition_res(res, fn, ...) \
@@ -1283,14 +1690,15 @@ void pkvm_ownership_selftest(void *base)
{
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_RWX;
void *virt = hyp_alloc_pages(&host_s2_pool, 0);
- struct pkvm_hyp_vcpu *vcpu = &selftest_vcpu;
- struct pkvm_hyp_vm *vm = &selftest_vm;
+ struct pkvm_hyp_vcpu *vcpu;
u64 phys, size, pfn, gfn;
+ struct pkvm_hyp_vm *vm;
WARN_ON(!virt);
selftest_page = hyp_virt_to_page(virt);
selftest_page->refcount = 0;
- init_selftest_vm(base);
+ selftest_vcpu = vcpu = init_selftest_vm(base);
+ vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
size = PAGE_SIZE << selftest_page->order;
phys = hyp_virt_to_phys(virt);
@@ -1309,6 +1717,7 @@ void pkvm_ownership_selftest(void *base)
assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size);
assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
selftest_state.host = PKVM_PAGE_OWNED;
selftest_state.hyp = PKVM_NOPAGE;
@@ -1328,6 +1737,7 @@ void pkvm_ownership_selftest(void *base)
assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
assert_transition_res(0, hyp_pin_shared_mem, virt, virt + size);
assert_transition_res(0, hyp_pin_shared_mem, virt, virt + size);
@@ -1340,6 +1750,7 @@ void pkvm_ownership_selftest(void *base)
assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
hyp_unpin_shared_mem(virt, virt + size);
assert_page_state();
@@ -1359,6 +1770,7 @@ void pkvm_ownership_selftest(void *base)
assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size);
selftest_state.host = PKVM_PAGE_OWNED;
@@ -1375,6 +1787,7 @@ void pkvm_ownership_selftest(void *base)
assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn);
assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size);
selftest_state.guest[1] = PKVM_PAGE_SHARED_BORROWED;
@@ -1389,9 +1802,69 @@ void pkvm_ownership_selftest(void *base)
assert_transition_res(0, __pkvm_host_unshare_guest, gfn + 1, 1, vm);
selftest_state.host = PKVM_NOPAGE;
+ selftest_state.guest[0] = PKVM_PAGE_OWNED;
+ assert_transition_res(0, __pkvm_host_donate_guest, pfn, gfn, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn + 1, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn + 1, 1, vcpu, prot);
+ assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
+
+ selftest_state.host = PKVM_PAGE_SHARED_BORROWED;
+ selftest_state.guest[0] = PKVM_PAGE_SHARED_OWNED;
+ assert_transition_res(0, __pkvm_guest_share_host, vcpu, gfn);
+ assert_transition_res(-EPERM, __pkvm_guest_share_host, vcpu, gfn);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn + 1, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn + 1, 1, vcpu, prot);
+ assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
+
+ selftest_state.host = PKVM_NOPAGE;
+ selftest_state.guest[0] = PKVM_PAGE_OWNED;
+ assert_transition_res(0, __pkvm_guest_unshare_host, vcpu, gfn);
+ assert_transition_res(-EPERM, __pkvm_guest_unshare_host, vcpu, gfn);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn + 1, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn + 1, 1, vcpu, prot);
+ assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
+
+ selftest_state.host = PKVM_PAGE_OWNED;
+ selftest_state.guest[0] = PKVM_POISON;
+ assert_transition_res(0, __pkvm_host_force_reclaim_page_guest, phys);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-EHWPOISON, __pkvm_guest_share_host, vcpu, gfn);
+ assert_transition_res(-EHWPOISON, __pkvm_guest_unshare_host, vcpu, gfn);
+
+ selftest_state.host = PKVM_NOPAGE;
+ selftest_state.guest[1] = PKVM_PAGE_OWNED;
+ assert_transition_res(0, __pkvm_host_donate_guest, pfn, gfn + 1, vcpu);
+
+ selftest_state.host = PKVM_PAGE_OWNED;
+ selftest_state.guest[1] = PKVM_NOPAGE;
+ assert_transition_res(0, __pkvm_host_reclaim_page_guest, gfn + 1, vm);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+
+ selftest_state.host = PKVM_NOPAGE;
selftest_state.hyp = PKVM_PAGE_OWNED;
assert_transition_res(0, __pkvm_host_donate_hyp, pfn, 1);
+ teardown_selftest_vm();
selftest_page->refcount = 1;
hyp_put_page(&host_s2_pool, virt);
}
diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c
index 2f029bfe4755..7ed96d64d611 100644
--- a/arch/arm64/kvm/hyp/nvhe/pkvm.c
+++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c
@@ -4,6 +4,8 @@
* Author: Fuad Tabba <tabba@google.com>
*/
+#include <kvm/arm_hypercalls.h>
+
#include <linux/kvm_host.h>
#include <linux/mm.h>
@@ -222,6 +224,7 @@ static struct pkvm_hyp_vm **vm_table;
void pkvm_hyp_vm_table_init(void *tbl)
{
+ BUILD_BUG_ON((u64)HANDLE_OFFSET + KVM_MAX_PVMS > (pkvm_handle_t)-1);
WARN_ON(vm_table);
vm_table = tbl;
}
@@ -229,10 +232,12 @@ void pkvm_hyp_vm_table_init(void *tbl)
/*
* Return the hyp vm structure corresponding to the handle.
*/
-static struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle)
+struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle)
{
unsigned int idx = vm_handle_to_idx(handle);
+ hyp_assert_lock_held(&vm_table_lock);
+
if (unlikely(idx >= KVM_MAX_PVMS))
return NULL;
@@ -255,7 +260,10 @@ struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle,
hyp_spin_lock(&vm_table_lock);
hyp_vm = get_vm_by_handle(handle);
- if (!hyp_vm || hyp_vm->kvm.created_vcpus <= vcpu_idx)
+ if (!hyp_vm || hyp_vm->kvm.arch.pkvm.is_dying)
+ goto unlock;
+
+ if (hyp_vm->kvm.created_vcpus <= vcpu_idx)
goto unlock;
hyp_vcpu = hyp_vm->vcpus[vcpu_idx];
@@ -719,6 +727,55 @@ void __pkvm_unreserve_vm(pkvm_handle_t handle)
hyp_spin_unlock(&vm_table_lock);
}
+#ifdef CONFIG_NVHE_EL2_DEBUG
+static struct pkvm_hyp_vm selftest_vm = {
+ .kvm = {
+ .arch = {
+ .mmu = {
+ .arch = &selftest_vm.kvm.arch,
+ .pgt = &selftest_vm.pgt,
+ },
+ },
+ },
+};
+
+static struct pkvm_hyp_vcpu selftest_vcpu = {
+ .vcpu = {
+ .arch = {
+ .hw_mmu = &selftest_vm.kvm.arch.mmu,
+ },
+ .kvm = &selftest_vm.kvm,
+ },
+};
+
+struct pkvm_hyp_vcpu *init_selftest_vm(void *virt)
+{
+ struct hyp_page *p = hyp_virt_to_page(virt);
+ int i;
+
+ selftest_vm.kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr;
+ WARN_ON(kvm_guest_prepare_stage2(&selftest_vm, virt));
+
+ for (i = 0; i < pkvm_selftest_pages(); i++) {
+ if (p[i].refcount)
+ continue;
+ p[i].refcount = 1;
+ hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i]));
+ }
+
+ selftest_vm.kvm.arch.pkvm.handle = __pkvm_reserve_vm();
+ insert_vm_table_entry(selftest_vm.kvm.arch.pkvm.handle, &selftest_vm);
+ return &selftest_vcpu;
+}
+
+void teardown_selftest_vm(void)
+{
+ hyp_spin_lock(&vm_table_lock);
+ remove_vm_table_entry(selftest_vm.kvm.arch.pkvm.handle);
+ hyp_spin_unlock(&vm_table_lock);
+}
+#endif /* CONFIG_NVHE_EL2_DEBUG */
+
/*
* Initialize the hypervisor copy of the VM state using host-donated memory.
*
@@ -859,7 +916,54 @@ teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size)
unmap_donated_memory_noclear(addr, size);
}
-int __pkvm_teardown_vm(pkvm_handle_t handle)
+int __pkvm_reclaim_dying_guest_page(pkvm_handle_t handle, u64 gfn)
+{
+ struct pkvm_hyp_vm *hyp_vm = get_pkvm_hyp_vm(handle);
+ int ret = -EINVAL;
+
+ if (!hyp_vm)
+ return ret;
+
+ if (hyp_vm->kvm.arch.pkvm.is_dying)
+ ret = __pkvm_host_reclaim_page_guest(gfn, hyp_vm);
+
+ put_pkvm_hyp_vm(hyp_vm);
+ return ret;
+}
+
+static struct pkvm_hyp_vm *get_pkvm_unref_hyp_vm_locked(pkvm_handle_t handle)
+{
+ struct pkvm_hyp_vm *hyp_vm;
+
+ hyp_assert_lock_held(&vm_table_lock);
+
+ hyp_vm = get_vm_by_handle(handle);
+ if (!hyp_vm || hyp_page_count(hyp_vm))
+ return NULL;
+
+ return hyp_vm;
+}
+
+int __pkvm_start_teardown_vm(pkvm_handle_t handle)
+{
+ struct pkvm_hyp_vm *hyp_vm;
+ int ret = 0;
+
+ hyp_spin_lock(&vm_table_lock);
+ hyp_vm = get_pkvm_unref_hyp_vm_locked(handle);
+ if (!hyp_vm || hyp_vm->kvm.arch.pkvm.is_dying) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ hyp_vm->kvm.arch.pkvm.is_dying = true;
+unlock:
+ hyp_spin_unlock(&vm_table_lock);
+
+ return ret;
+}
+
+int __pkvm_finalize_teardown_vm(pkvm_handle_t handle)
{
struct kvm_hyp_memcache *mc, *stage2_mc;
struct pkvm_hyp_vm *hyp_vm;
@@ -869,14 +973,9 @@ int __pkvm_teardown_vm(pkvm_handle_t handle)
int err;
hyp_spin_lock(&vm_table_lock);
- hyp_vm = get_vm_by_handle(handle);
- if (!hyp_vm) {
- err = -ENOENT;
- goto err_unlock;
- }
-
- if (WARN_ON(hyp_page_count(hyp_vm))) {
- err = -EBUSY;
+ hyp_vm = get_pkvm_unref_hyp_vm_locked(handle);
+ if (!hyp_vm || !hyp_vm->kvm.arch.pkvm.is_dying) {
+ err = -EINVAL;
goto err_unlock;
}
@@ -922,3 +1021,121 @@ err_unlock:
hyp_spin_unlock(&vm_table_lock);
return err;
}
+
+static u64 __pkvm_memshare_page_req(struct kvm_vcpu *vcpu, u64 ipa)
+{
+ u64 elr;
+
+ /* Fake up a data abort (level 3 translation fault on write) */
+ vcpu->arch.fault.esr_el2 = (ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT) |
+ ESR_ELx_WNR | ESR_ELx_FSC_FAULT |
+ FIELD_PREP(ESR_ELx_FSC_LEVEL, 3);
+
+ /* Shuffle the IPA around into the HPFAR */
+ vcpu->arch.fault.hpfar_el2 = (HPFAR_EL2_NS | (ipa >> 8)) & HPFAR_MASK;
+
+ /* This is a virtual address. 0's good. Let's go with 0. */
+ vcpu->arch.fault.far_el2 = 0;
+
+ /* Rewind the ELR so we return to the HVC once the IPA is mapped */
+ elr = read_sysreg(elr_el2);
+ elr -= 4;
+ write_sysreg(elr, elr_el2);
+
+ return ARM_EXCEPTION_TRAP;
+}
+
+static bool pkvm_memshare_call(u64 *ret, struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+ u64 ipa = smccc_get_arg1(vcpu);
+
+ if (!PAGE_ALIGNED(ipa))
+ goto out_guest;
+
+ hyp_vcpu = container_of(vcpu, struct pkvm_hyp_vcpu, vcpu);
+ switch (__pkvm_guest_share_host(hyp_vcpu, hyp_phys_to_pfn(ipa))) {
+ case 0:
+ ret[0] = SMCCC_RET_SUCCESS;
+ goto out_guest;
+ case -ENOENT:
+ /*
+ * Convert the exception into a data abort so that the page
+ * being shared is mapped into the guest next time.
+ */
+ *exit_code = __pkvm_memshare_page_req(vcpu, ipa);
+ goto out_host;
+ }
+
+out_guest:
+ return true;
+out_host:
+ return false;
+}
+
+static void pkvm_memunshare_call(u64 *ret, struct kvm_vcpu *vcpu)
+{
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+ u64 ipa = smccc_get_arg1(vcpu);
+
+ if (!PAGE_ALIGNED(ipa))
+ return;
+
+ hyp_vcpu = container_of(vcpu, struct pkvm_hyp_vcpu, vcpu);
+ if (!__pkvm_guest_unshare_host(hyp_vcpu, hyp_phys_to_pfn(ipa)))
+ ret[0] = SMCCC_RET_SUCCESS;
+}
+
+/*
+ * Handler for protected VM HVC calls.
+ *
+ * Returns true if the hypervisor has handled the exit (and control
+ * should return to the guest) or false if it hasn't (and the handling
+ * should be performed by the host).
+ */
+bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ u64 val[4] = { SMCCC_RET_INVALID_PARAMETER };
+ bool handled = true;
+
+ switch (smccc_get_function(vcpu)) {
+ case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
+ val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES);
+ val[0] |= BIT(ARM_SMCCC_KVM_FUNC_HYP_MEMINFO);
+ val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_SHARE);
+ val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_UNSHARE);
+ break;
+ case ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID:
+ if (smccc_get_arg1(vcpu) ||
+ smccc_get_arg2(vcpu) ||
+ smccc_get_arg3(vcpu)) {
+ break;
+ }
+
+ val[0] = PAGE_SIZE;
+ break;
+ case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID:
+ if (smccc_get_arg2(vcpu) ||
+ smccc_get_arg3(vcpu)) {
+ break;
+ }
+
+ handled = pkvm_memshare_call(val, vcpu, exit_code);
+ break;
+ case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID:
+ if (smccc_get_arg2(vcpu) ||
+ smccc_get_arg3(vcpu)) {
+ break;
+ }
+
+ pkvm_memunshare_call(val, vcpu);
+ break;
+ default:
+ /* Punt everything else back to the host, for now. */
+ handled = false;
+ }
+
+ if (handled)
+ smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]);
+ return handled;
+}
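
For reference, a protected guest could exercise the MEM_SHARE path handled above roughly as follows. This is an illustrative sketch rather than code from this series; it assumes the generic SMCCC invocation helpers and the function ID checked by the handler:

	#include <linux/arm-smccc.h>

	/*
	 * Share one page of guest memory back with the host. The handler
	 * above expects a page-aligned IPA in arg1 and zero in args 2 and 3.
	 */
	static int pkvm_guest_share_page(phys_addr_t ipa)
	{
		struct arm_smccc_res res;

		arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID,
				     ipa, 0, 0, &res);

		return res.a0 == SMCCC_RET_SUCCESS ? 0 : -EINVAL;
	}

An unaligned IPA or non-zero extra arguments get SMCCC_RET_INVALID_PARAMETER back, matching the default retval set in kvm_handle_pvm_hvc64().
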
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 923351f70ff8..8d1df3d33595 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -205,6 +205,7 @@ static const exit_handler_fn hyp_exit_handlers[] = {
static const exit_handler_fn pvm_exit_handlers[] = {
[0 ... ESR_ELx_EC_MAX] = NULL,
+ [ESR_ELx_EC_HVC64] = kvm_handle_pvm_hvc64,
[ESR_ELx_EC_SYS64] = kvm_handle_pvm_sys64,
[ESR_ELx_EC_SVE] = kvm_handle_pvm_restricted,
[ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd,
diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
index b40fd01ebf32..b48e66ac0b49 100644
--- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c
+++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
@@ -400,6 +400,14 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
/* Cache maintenance by set/way operations are restricted. */
/* Debug and Trace Registers are restricted. */
+ RAZ_WI(SYS_DBGBVRn_EL1(0)),
+ RAZ_WI(SYS_DBGBCRn_EL1(0)),
+ RAZ_WI(SYS_DBGWVRn_EL1(0)),
+ RAZ_WI(SYS_DBGWCRn_EL1(0)),
+ RAZ_WI(SYS_MDSCR_EL1),
+ RAZ_WI(SYS_OSLAR_EL1),
+ RAZ_WI(SYS_OSLSR_EL1),
+ RAZ_WI(SYS_OSDLR_EL1),
/* Group 1 ID registers */
HOST_HANDLED(SYS_REVIDR_EL1),
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 9b480f947da2..84c7a1df845d 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -114,11 +114,6 @@ static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, s8 level)
return pte;
}
-static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id)
-{
- return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
-}
-
static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data,
const struct kvm_pgtable_visit_ctx *ctx,
enum kvm_pgtable_walk_flags visit)
@@ -581,7 +576,7 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
struct stage2_map_data {
const u64 phys;
kvm_pte_t attr;
- u8 owner_id;
+ kvm_pte_t pte_annot;
kvm_pte_t *anchor;
kvm_pte_t *childp;
@@ -798,7 +793,11 @@ static bool stage2_pte_is_counted(kvm_pte_t pte)
static bool stage2_pte_is_locked(kvm_pte_t pte)
{
- return !kvm_pte_valid(pte) && (pte & KVM_INVALID_PTE_LOCKED);
+ if (kvm_pte_valid(pte))
+ return false;
+
+ return FIELD_GET(KVM_INVALID_PTE_TYPE_MASK, pte) ==
+ KVM_INVALID_PTE_TYPE_LOCKED;
}
static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
@@ -829,6 +828,7 @@ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx,
struct kvm_s2_mmu *mmu)
{
struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
+ kvm_pte_t locked_pte;
if (stage2_pte_is_locked(ctx->old)) {
/*
@@ -839,7 +839,9 @@ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx,
return false;
}
- if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED))
+ locked_pte = FIELD_PREP(KVM_INVALID_PTE_TYPE_MASK,
+ KVM_INVALID_PTE_TYPE_LOCKED);
+ if (!stage2_try_set_pte(ctx, locked_pte))
return false;
if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) {
@@ -964,7 +966,7 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
if (!data->annotation)
new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
else
- new = kvm_init_invalid_leaf_owner(data->owner_id);
+ new = data->pte_annot;
/*
* Skip updating the PTE if we are trying to recreate the exact
@@ -1118,16 +1120,18 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
return ret;
}
-int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
- void *mc, u8 owner_id)
+int kvm_pgtable_stage2_annotate(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ void *mc, enum kvm_invalid_pte_type type,
+ kvm_pte_t pte_annot)
{
int ret;
struct stage2_map_data map_data = {
.mmu = pgt->mmu,
.memcache = mc,
- .owner_id = owner_id,
.force_pte = true,
.annotation = true,
+ .pte_annot = pte_annot |
+ FIELD_PREP(KVM_INVALID_PTE_TYPE_MASK, type),
};
struct kvm_pgtable_walker walker = {
.cb = stage2_map_walker,
@@ -1136,7 +1140,10 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
.arg = &map_data,
};
- if (owner_id > KVM_MAX_OWNER_ID)
+ if (pte_annot & ~KVM_INVALID_PTE_ANNOT_MASK)
+ return -EINVAL;
+
+ if (!type || type == KVM_INVALID_PTE_TYPE_LOCKED)
return -EINVAL;
ret = kvm_pgtable_walk(pgt, addr, size, &walker);
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 03e1f389339c..c2e12d256e43 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -340,6 +340,9 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start,
u64 size, bool may_block)
{
+ if (kvm_vm_is_protected(kvm_s2_mmu_to_kvm(mmu)))
+ return;
+
__unmap_stage2_range(mmu, start, size, may_block);
}
@@ -878,9 +881,6 @@ static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
u64 mmfr0, mmfr1;
u32 phys_shift;
- if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
- return -EINVAL;
-
phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
if (is_protected_kvm_enabled()) {
phys_shift = kvm_ipa_limit;
@@ -1659,6 +1659,75 @@ struct kvm_s2_fault_vma_info {
bool map_non_cacheable;
};
+static int pkvm_mem_abort(const struct kvm_s2_fault_desc *s2fd)
+{
+ unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
+ struct kvm_vcpu *vcpu = s2fd->vcpu;
+ struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
+ struct mm_struct *mm = current->mm;
+ struct kvm *kvm = vcpu->kvm;
+ void *hyp_memcache;
+ struct page *page;
+ int ret;
+
+ hyp_memcache = get_mmu_memcache(vcpu);
+ ret = topup_mmu_memcache(vcpu, hyp_memcache);
+ if (ret)
+ return -ENOMEM;
+
+ ret = account_locked_vm(mm, 1, true);
+ if (ret)
+ return ret;
+
+ mmap_read_lock(mm);
+ ret = pin_user_pages(s2fd->hva, 1, flags, &page);
+ mmap_read_unlock(mm);
+
+ if (ret == -EHWPOISON) {
+ kvm_send_hwpoison_signal(s2fd->hva, PAGE_SHIFT);
+ ret = 0;
+ goto dec_account;
+ } else if (ret != 1) {
+ ret = -EFAULT;
+ goto dec_account;
+ } else if (!folio_test_swapbacked(page_folio(page))) {
+ /*
+ * We really can't deal with page-cache pages returned by GUP
+ * because (a) we may trigger writeback of a page for which we
+ * no longer have access and (b) page_mkclean() won't find the
+ * stage-2 mapping in the rmap so we can get out-of-whack with
+ * the filesystem when marking the page dirty during unpinning
+ * (see cc5095747edf ("ext4: don't BUG if someone dirty pages
+ * without asking ext4 first")).
+ *
+ * Ideally we'd just restrict ourselves to anonymous pages, but
+ * we also want to allow memfd (i.e. shmem) pages, so check for
+ * pages backed by swap in the knowledge that the GUP pin will
+ * prevent try_to_unmap() from succeeding.
+ */
+ ret = -EIO;
+ goto unpin;
+ }
+
+ write_lock(&kvm->mmu_lock);
+ ret = pkvm_pgtable_stage2_map(pgt, s2fd->fault_ipa, PAGE_SIZE,
+ page_to_phys(page), KVM_PGTABLE_PROT_RWX,
+ hyp_memcache, 0);
+ write_unlock(&kvm->mmu_lock);
+ if (ret) {
+ if (ret == -EAGAIN)
+ ret = 0;
+ goto unpin;
+ }
+
+ return 0;
+unpin:
+ unpin_user_pages(&page, 1);
+dec_account:
+ account_locked_vm(mm, 1, false);
+ return ret;
+}
+
static short kvm_s2_resolve_vma_size(const struct kvm_s2_fault_desc *s2fd,
struct kvm_s2_fault_vma_info *s2vi,
struct vm_area_struct *vma)
@@ -2285,9 +2354,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
goto out_unlock;
}
- VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
- !write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu));
-
const struct kvm_s2_fault_desc s2fd = {
.vcpu = vcpu,
.fault_ipa = fault_ipa,
@@ -2296,10 +2362,18 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
.hva = hva,
};
- if (kvm_slot_has_gmem(memslot))
- ret = gmem_abort(&s2fd);
- else
- ret = user_mem_abort(&s2fd);
+ if (kvm_vm_is_protected(vcpu->kvm)) {
+ ret = pkvm_mem_abort(&s2fd);
+ } else {
+ VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
+ !write_fault &&
+ !kvm_vcpu_trap_is_exec_fault(vcpu));
+
+ if (kvm_slot_has_gmem(memslot))
+ ret = gmem_abort(&s2fd);
+ else
+ ret = user_mem_abort(&s2fd);
+ }
if (ret == 0)
ret = 1;
@@ -2313,7 +2387,7 @@ out_unlock:
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- if (!kvm->arch.mmu.pgt)
+ if (!kvm->arch.mmu.pgt || kvm_vm_is_protected(kvm))
return false;
__unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
@@ -2328,7 +2402,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
u64 size = (range->end - range->start) << PAGE_SHIFT;
- if (!kvm->arch.mmu.pgt)
+ if (!kvm->arch.mmu.pgt || kvm_vm_is_protected(kvm))
return false;
return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
@@ -2344,7 +2418,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
u64 size = (range->end - range->start) << PAGE_SHIFT;
- if (!kvm->arch.mmu.pgt)
+ if (!kvm->arch.mmu.pgt || kvm_vm_is_protected(kvm))
return false;
return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
@@ -2501,6 +2575,19 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
hva_t hva, reg_end;
int ret = 0;
+ if (kvm_vm_is_protected(kvm)) {
+ /* Cannot modify memslots once a pVM has run. */
+ if (pkvm_hyp_vm_is_created(kvm) &&
+ (change == KVM_MR_DELETE || change == KVM_MR_MOVE)) {
+ return -EPERM;
+ }
+
+ if (new &&
+ new->flags & (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)) {
+ return -EPERM;
+ }
+ }
+
if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
change != KVM_MR_FLAGS_ONLY)
return 0;
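
From the VMM's side, the checks above mean that dirty logging and read-only slots are refused for protected VMs, and slots can no longer be deleted or moved once the hyp VM has been created. A hypothetical userspace snippet showing the failure mode (vm_fd and backing are placeholders):

	/*
	 * Hypothetical example: this slot is rejected with -EPERM on a
	 * protected VM because of the KVM_MEM_READONLY flag.
	 */
	struct kvm_userspace_memory_region region = {
		.slot			= 0,
		.flags			= KVM_MEM_READONLY,
		.guest_phys_addr	= 0x80000000UL,
		.memory_size		= 0x200000UL,
		.userspace_addr		= (__u64)backing,	/* mmap()ed buffer */
	};

	if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region) < 0)
		perror("KVM_SET_USER_MEMORY_REGION");
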
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index d7a0f69a9982..053e4f733e4b 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -88,7 +88,7 @@ void __init kvm_hyp_reserve(void)
static void __pkvm_destroy_hyp_vm(struct kvm *kvm)
{
if (pkvm_hyp_vm_is_created(kvm)) {
- WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm,
+ WARN_ON(kvm_call_hyp_nvhe(__pkvm_finalize_teardown_vm,
kvm->arch.pkvm.handle));
} else if (kvm->arch.pkvm.handle) {
/*
@@ -192,10 +192,16 @@ int pkvm_create_hyp_vm(struct kvm *kvm)
{
int ret = 0;
+ /*
+ * Synchronise with kvm_arch_prepare_memory_region(), as we
+ * prevent memslot modifications on a pVM that has been run.
+ */
+ mutex_lock(&kvm->slots_lock);
mutex_lock(&kvm->arch.config_lock);
if (!pkvm_hyp_vm_is_created(kvm))
ret = __pkvm_create_hyp_vm(kvm);
mutex_unlock(&kvm->arch.config_lock);
+ mutex_unlock(&kvm->slots_lock);
return ret;
}
@@ -219,9 +225,10 @@ void pkvm_destroy_hyp_vm(struct kvm *kvm)
mutex_unlock(&kvm->arch.config_lock);
}
-int pkvm_init_host_vm(struct kvm *kvm)
+int pkvm_init_host_vm(struct kvm *kvm, unsigned long type)
{
int ret;
+ bool protected = type & KVM_VM_TYPE_ARM_PROTECTED;
if (pkvm_hyp_vm_is_created(kvm))
return -EINVAL;
@@ -236,6 +243,11 @@ int pkvm_init_host_vm(struct kvm *kvm)
return ret;
kvm->arch.pkvm.handle = ret;
+ kvm->arch.pkvm.is_protected = protected;
+ if (protected) {
+ pr_warn_once("kvm: protected VMs are experimental and for development only, tainting kernel\n");
+ add_taint(TAINT_USER, LOCKDEP_STILL_OK);
+ }
return 0;
}
@@ -322,15 +334,38 @@ int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
return 0;
}
-static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 end)
+static int __pkvm_pgtable_stage2_reclaim(struct kvm_pgtable *pgt, u64 start, u64 end)
{
struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
pkvm_handle_t handle = kvm->arch.pkvm.handle;
struct pkvm_mapping *mapping;
int ret;
- if (!handle)
- return 0;
+ for_each_mapping_in_range_safe(pgt, start, end, mapping) {
+ struct page *page;
+
+ ret = kvm_call_hyp_nvhe(__pkvm_reclaim_dying_guest_page,
+ handle, mapping->gfn);
+ if (WARN_ON(ret))
+ continue;
+
+ page = pfn_to_page(mapping->pfn);
+ WARN_ON_ONCE(mapping->nr_pages != 1);
+ unpin_user_pages_dirty_lock(&page, 1, true);
+ account_locked_vm(current->mm, 1, false);
+ pkvm_mapping_remove(mapping, &pgt->pkvm_mappings);
+ kfree(mapping);
+ }
+
+ return 0;
+}
+
+static int __pkvm_pgtable_stage2_unshare(struct kvm_pgtable *pgt, u64 start, u64 end)
+{
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
+ pkvm_handle_t handle = kvm->arch.pkvm.handle;
+ struct pkvm_mapping *mapping;
+ int ret;
for_each_mapping_in_range_safe(pgt, start, end, mapping) {
ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn,
@@ -347,7 +382,21 @@ static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 e
void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
u64 addr, u64 size)
{
- __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size);
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
+ pkvm_handle_t handle = kvm->arch.pkvm.handle;
+
+ if (!handle)
+ return;
+
+ if (pkvm_hyp_vm_is_created(kvm) && !kvm->arch.pkvm.is_dying) {
+ WARN_ON(kvm_call_hyp_nvhe(__pkvm_start_teardown_vm, handle));
+ kvm->arch.pkvm.is_dying = true;
+ }
+
+ if (kvm_vm_is_protected(kvm))
+ __pkvm_pgtable_stage2_reclaim(pgt, addr, addr + size);
+ else
+ __pkvm_pgtable_stage2_unshare(pgt, addr, addr + size);
}
void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt)
@@ -365,31 +414,58 @@ int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
struct kvm_hyp_memcache *cache = mc;
u64 gfn = addr >> PAGE_SHIFT;
u64 pfn = phys >> PAGE_SHIFT;
+ u64 end = addr + size;
int ret;
- if (size != PAGE_SIZE && size != PMD_SIZE)
- return -EINVAL;
-
lockdep_assert_held_write(&kvm->mmu_lock);
+ mapping = pkvm_mapping_iter_first(&pgt->pkvm_mappings, addr, end - 1);
- /*
- * Calling stage2_map() on top of existing mappings is either happening because of a race
- * with another vCPU, or because we're changing between page and block mappings. As per
- * user_mem_abort(), same-size permission faults are handled in the relax_perms() path.
- */
- mapping = pkvm_mapping_iter_first(&pgt->pkvm_mappings, addr, addr + size - 1);
- if (mapping) {
- if (size == (mapping->nr_pages * PAGE_SIZE))
- return -EAGAIN;
-
- /* Remove _any_ pkvm_mapping overlapping with the range, bigger or smaller. */
- ret = __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size);
- if (ret)
- return ret;
- mapping = NULL;
+ if (kvm_vm_is_protected(kvm)) {
+ /* Protected VMs are mapped using RWX page-granular mappings */
+ if (WARN_ON_ONCE(size != PAGE_SIZE))
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(prot != KVM_PGTABLE_PROT_RWX))
+ return -EINVAL;
+
+ /*
+ * We either raced with another vCPU or the guest PTE
+ * has been poisoned by an erroneous host access.
+ */
+ if (mapping) {
+ ret = kvm_call_hyp_nvhe(__pkvm_vcpu_in_poison_fault);
+ return ret ? -EFAULT : -EAGAIN;
+ }
+
+ ret = kvm_call_hyp_nvhe(__pkvm_host_donate_guest, pfn, gfn);
+ } else {
+ if (WARN_ON_ONCE(size != PAGE_SIZE && size != PMD_SIZE))
+ return -EINVAL;
+
+ /*
+ * We either raced with another vCPU or we're changing between
+ * page and block mappings. As per user_mem_abort(), same-size
+ * permission faults are handled in the relax_perms() path.
+ */
+ if (mapping) {
+ if (size == (mapping->nr_pages * PAGE_SIZE))
+ return -EAGAIN;
+
+ /*
+ * Remove _any_ pkvm_mapping overlapping with the range,
+ * bigger or smaller.
+ */
+ ret = __pkvm_pgtable_stage2_unshare(pgt, addr, end);
+ if (ret)
+ return ret;
+
+ mapping = NULL;
+ }
+
+ ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn,
+ size / PAGE_SIZE, prot);
}
- ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn, size / PAGE_SIZE, prot);
if (WARN_ON(ret))
return ret;
@@ -404,9 +480,14 @@ int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
int pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
- lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(pgt->mmu)->mmu_lock);
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
- return __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size);
+ if (WARN_ON(kvm_vm_is_protected(kvm)))
+ return -EPERM;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ return __pkvm_pgtable_stage2_unshare(pgt, addr, addr + size);
}
int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
@@ -416,6 +497,9 @@ int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
struct pkvm_mapping *mapping;
int ret = 0;
+ if (WARN_ON(kvm_vm_is_protected(kvm)))
+ return -EPERM;
+
lockdep_assert_held(&kvm->mmu_lock);
for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) {
ret = kvm_call_hyp_nvhe(__pkvm_host_wrprotect_guest, handle, mapping->gfn,
@@ -447,6 +531,9 @@ bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64
struct pkvm_mapping *mapping;
bool young = false;
+ if (WARN_ON(kvm_vm_is_protected(kvm)))
+ return false;
+
lockdep_assert_held(&kvm->mmu_lock);
for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping)
young |= kvm_call_hyp_nvhe(__pkvm_host_test_clear_young_guest, handle, mapping->gfn,
@@ -458,12 +545,18 @@ bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64
int pkvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot,
enum kvm_pgtable_walk_flags flags)
{
+ if (WARN_ON(kvm_vm_is_protected(kvm_s2_mmu_to_kvm(pgt->mmu))))
+ return -EPERM;
+
return kvm_call_hyp_nvhe(__pkvm_host_relax_perms_guest, addr >> PAGE_SHIFT, prot);
}
void pkvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr,
enum kvm_pgtable_walk_flags flags)
{
+ if (WARN_ON(kvm_vm_is_protected(kvm_s2_mmu_to_kvm(pgt->mmu))))
+ return;
+
WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_mkyoung_guest, addr >> PAGE_SHIFT));
}
@@ -485,3 +578,15 @@ int pkvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
WARN_ON_ONCE(1);
return -EINVAL;
}
+
+/*
+ * Forcefully reclaim a page from the guest, zeroing its contents and
+ * poisoning the stage-2 pte so that pages can no longer be mapped at
+ * the same IPA. The page remains pinned until the guest is destroyed.
+ */
+bool pkvm_force_reclaim_guest_page(phys_addr_t phys)
+{
+ int ret = kvm_call_hyp_nvhe(__pkvm_force_reclaim_guest_page, phys);
+
+ return !ret || ret == -EAGAIN;
+}
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index be9dab2c7d6a..7eacc7b45c1f 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -43,6 +43,7 @@
#include <asm/system_misc.h>
#include <asm/tlbflush.h>
#include <asm/traps.h>
+#include <asm/virt.h>
struct fault_info {
int (*fn)(unsigned long far, unsigned long esr,
@@ -269,6 +270,15 @@ static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr
return false;
}
+static bool is_pkvm_stage2_abort(unsigned int esr)
+{
+ /*
+ * S1PTW should only ever be set in ESR_EL1 if the pkvm hypervisor
+ * injected a stage-2 abort -- see host_inject_mem_abort().
+ */
+ return is_pkvm_initialized() && (esr & ESR_ELx_S1PTW);
+}
+
static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
unsigned long esr,
struct pt_regs *regs)
@@ -289,8 +299,14 @@ static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
* If we now have a valid translation, treat the translation fault as
* spurious.
*/
- if (!(par & SYS_PAR_EL1_F))
+ if (!(par & SYS_PAR_EL1_F)) {
+ if (is_pkvm_stage2_abort(esr)) {
+ par &= SYS_PAR_EL1_PA;
+ return pkvm_force_reclaim_guest_page(par);
+ }
+
return true;
+ }
/*
* If we got a different type of fault from the AT instruction,
@@ -376,9 +392,11 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr,
if (!is_el1_instruction_abort(esr) && fixup_exception(regs, esr))
return;
- if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs),
- "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr))
+ if (is_spurious_el1_translation_fault(addr, esr, regs)) {
+ WARN_RATELIMIT(!is_pkvm_stage2_abort(esr),
+ "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr);
return;
+ }
if (is_el1_mte_sync_tag_check_fault(esr)) {
do_tag_recovery(addr, esr, regs);
@@ -395,6 +413,8 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr,
msg = "read from unreadable memory";
} else if (addr < PAGE_SIZE) {
msg = "NULL pointer dereference";
+ } else if (is_pkvm_stage2_abort(esr)) {
+ msg = "access to hypervisor-protected memory";
} else {
if (esr_fsc_is_translation_fault(esr) &&
kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
@@ -621,6 +641,13 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
addr, esr, regs);
}
+ if (is_pkvm_stage2_abort(esr)) {
+ if (!user_mode(regs))
+ goto no_context;
+ arm64_force_sig_fault(SIGSEGV, SEGV_ACCERR, far, "stage-2 fault");
+ return 0;
+ }
+
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
if (!(mm_flags & FAULT_FLAG_USER))
diff --git a/drivers/virt/coco/pkvm-guest/Kconfig b/drivers/virt/coco/pkvm-guest/Kconfig
index d2f344f1f98f..928b8e1668cc 100644
--- a/drivers/virt/coco/pkvm-guest/Kconfig
+++ b/drivers/virt/coco/pkvm-guest/Kconfig
@@ -1,6 +1,6 @@
config ARM_PKVM_GUEST
bool "Arm pKVM protected guest driver"
- depends on ARM64
+ depends on ARM64 && DMA_RESTRICTED_POOL
help
Protected guests running under the pKVM hypervisor on arm64
are isolated from the host and must issue hypercalls to enable
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d0c0c8605976..ae659566e44f 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -703,6 +703,11 @@ struct kvm_enable_cap {
#define KVM_VM_TYPE_ARM_IPA_SIZE_MASK 0xffULL
#define KVM_VM_TYPE_ARM_IPA_SIZE(x) \
((x) & KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
+
+#define KVM_VM_TYPE_ARM_PROTECTED (1UL << 31)
+#define KVM_VM_TYPE_ARM_MASK (KVM_VM_TYPE_ARM_IPA_SIZE_MASK | \
+ KVM_VM_TYPE_ARM_PROTECTED)
+
/*
* ioctls for /dev/kvm fds:
*/
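
With the new type bit, a VMM would request a protected VM along these lines. This is an illustrative sketch, not part of the series; error handling is omitted and the default IPA size is used:

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	int create_protected_vm(void)
	{
		int kvm_fd = open("/dev/kvm", O_RDWR);
		unsigned long type = KVM_VM_TYPE_ARM_PROTECTED;

		/*
		 * Expected to fail unless the kernel is running in
		 * protected (pKVM) mode with pVM support enabled.
		 */
		return ioctl(kvm_fd, KVM_CREATE_VM, type);
	}
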