author     Paolo Bonzini <pbonzini@redhat.com>  2026-04-13 12:49:54 +0300
committer  Paolo Bonzini <pbonzini@redhat.com>  2026-04-13 12:49:54 +0300
commit     e74c3a8891c05f88eeb87121de7e12dc95766a4a (patch)
tree       3016a634244192f7d3e520ddeaf38cfc9d0470ec
parent     05578316ca7246b45fb7d9eaf81308c0e0f3ee10 (diff)
parent     94b4ae79ebb42a8a6f2124b4d4b033b15a98e4f9 (diff)
download   linux-e74c3a8891c05f88eeb87121de7e12dc95766a4a.tar.xz
Merge tag 'kvmarm-7.1' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD
KVM/arm64 updates for 7.1

* New features:

  - Add support for tracing in the standalone EL2 hypervisor code, which
    should help both debugging and performance analysis. This comes with
    a full infrastructure for 'remote' trace buffers that can be exposed
    by non-kernel entities such as firmware.

  - Add support for GICv5 Per Processor Interrupts (PPIs), as the
    starting point for supporting the new GIC architecture in KVM.

  - Finally add support for pKVM protected guests, with anonymous memory
    being used as a backing store. About time!

* Improvements and bug fixes:

  - Rework the dreaded user_mem_abort() function to make it more
    maintainable, reducing the amount of state being exposed to the
    various helpers and rendering a substantial amount of state
    immutable.

  - Expand the Stage-2 page table dumper to support NV shadow page
    tables on a per-VM basis.

  - Tidy up the pKVM PSCI proxy code to be slightly less hard to follow.

  - Fix both SPE and TRBE in non-VHE configurations so that they do not
    generate spurious, out-of-context table walks that ultimately lead
    to very bad HW lockups.

  - A small set of patches fixing the Stage-2 MMU freeing in error
    cases.

  - Tighten up the accepted SMC immediate value to be only #0 for host
    SMCCC calls.

  - The usual cleanups and other selftest churn.
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt  4
-rw-r--r--  Documentation/trace/index.rst  11
-rw-r--r--  Documentation/trace/remotes.rst  66
-rw-r--r--  Documentation/virt/kvm/api.rst  6
-rw-r--r--  Documentation/virt/kvm/arm/index.rst  1
-rw-r--r--  Documentation/virt/kvm/arm/pkvm.rst  106
-rw-r--r--  Documentation/virt/kvm/devices/arm-vgic-v5.rst  50
-rw-r--r--  Documentation/virt/kvm/devices/index.rst  1
-rw-r--r--  Documentation/virt/kvm/devices/vcpu.rst  5
-rw-r--r--  arch/arm64/include/asm/el2_setup.h  4
-rw-r--r--  arch/arm64/include/asm/kvm_asm.h  44
-rw-r--r--  arch/arm64/include/asm/kvm_define_hypevents.h  16
-rw-r--r--  arch/arm64/include/asm/kvm_host.h  50
-rw-r--r--  arch/arm64/include/asm/kvm_hyp.h  14
-rw-r--r--  arch/arm64/include/asm/kvm_hypevents.h  60
-rw-r--r--  arch/arm64/include/asm/kvm_hyptrace.h  26
-rw-r--r--  arch/arm64/include/asm/kvm_mmu.h  4
-rw-r--r--  arch/arm64/include/asm/kvm_pgtable.h  45
-rw-r--r--  arch/arm64/include/asm/kvm_pkvm.h  4
-rw-r--r--  arch/arm64/include/asm/sysreg.h  13
-rw-r--r--  arch/arm64/include/asm/virt.h  9
-rw-r--r--  arch/arm64/include/asm/vncr_mapping.h  3
-rw-r--r--  arch/arm64/include/uapi/asm/kvm.h  1
-rw-r--r--  arch/arm64/kernel/cpufeature.c  1
-rw-r--r--  arch/arm64/kernel/hyp-stub.S  1
-rw-r--r--  arch/arm64/kernel/image-vars.h  4
-rw-r--r--  arch/arm64/kernel/vmlinux.lds.S  18
-rw-r--r--  arch/arm64/kvm/Kconfig  64
-rw-r--r--  arch/arm64/kvm/Makefile  2
-rw-r--r--  arch/arm64/kvm/arch_timer.c  102
-rw-r--r--  arch/arm64/kvm/arm.c  69
-rw-r--r--  arch/arm64/kvm/config.c  127
-rw-r--r--  arch/arm64/kvm/emulate-nested.c  68
-rw-r--r--  arch/arm64/kvm/handle_exit.c  2
-rw-r--r--  arch/arm64/kvm/hyp/include/hyp/switch.h  27
-rw-r--r--  arch/arm64/kvm/hyp/include/nvhe/arm-smccc.h  23
-rw-r--r--  arch/arm64/kvm/hyp/include/nvhe/clock.h  16
-rw-r--r--  arch/arm64/kvm/hyp/include/nvhe/define_events.h  14
-rw-r--r--  arch/arm64/kvm/hyp/include/nvhe/mem_protect.h  12
-rw-r--r--  arch/arm64/kvm/hyp/include/nvhe/memory.h  12
-rw-r--r--  arch/arm64/kvm/hyp/include/nvhe/pkvm.h  7
-rw-r--r--  arch/arm64/kvm/hyp/include/nvhe/trace.h  70
-rw-r--r--  arch/arm64/kvm/hyp/include/nvhe/trap_handler.h  2
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/Makefile  8
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/clock.c  65
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/debug-sr.c  116
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/events.c  25
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/ffa.c  28
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/host.S  13
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/hyp-init.S  41
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/hyp-main.c  294
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/hyp.lds.S  6
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/mem_protect.c  587
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/mm.c  4
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/pkvm.c  239
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/psci-relay.c  45
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/setup.c  4
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/stacktrace.c  6
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/switch.c  23
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/sys_regs.c  18
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/trace.c  306
-rw-r--r--  arch/arm64/kvm/hyp/pgtable.c  33
-rw-r--r--  arch/arm64/kvm/hyp/vgic-v5-sr.c  166
-rw-r--r--  arch/arm64/kvm/hyp/vhe/Makefile  2
-rw-r--r--  arch/arm64/kvm/hyp_trace.c  442
-rw-r--r--  arch/arm64/kvm/hyp_trace.h  11
-rw-r--r--  arch/arm64/kvm/mmu.c  624
-rw-r--r--  arch/arm64/kvm/nested.c  11
-rw-r--r--  arch/arm64/kvm/pkvm.c  159
-rw-r--r--  arch/arm64/kvm/pmu-emul.c  20
-rw-r--r--  arch/arm64/kvm/ptdump.c  79
-rw-r--r--  arch/arm64/kvm/stacktrace.c  8
-rw-r--r--  arch/arm64/kvm/sys_regs.c  188
-rw-r--r--  arch/arm64/kvm/vgic/vgic-init.c  228
-rw-r--r--  arch/arm64/kvm/vgic/vgic-kvm-device.c  107
-rw-r--r--  arch/arm64/kvm/vgic/vgic-mmio.c  40
-rw-r--r--  arch/arm64/kvm/vgic/vgic-v3.c  2
-rw-r--r--  arch/arm64/kvm/vgic/vgic-v5.c  499
-rw-r--r--  arch/arm64/kvm/vgic/vgic.c  173
-rw-r--r--  arch/arm64/kvm/vgic/vgic.h  53
-rw-r--r--  arch/arm64/mm/fault.c  33
-rw-r--r--  arch/arm64/tools/sysreg  480
-rw-r--r--  drivers/irqchip/irq-gic-v5.c  18
-rw-r--r--  drivers/virt/coco/pkvm-guest/Kconfig  2
-rw-r--r--  fs/tracefs/inode.c  1
-rw-r--r--  include/kvm/arm_arch_timer.h  8
-rw-r--r--  include/kvm/arm_pmu.h  5
-rw-r--r--  include/kvm/arm_vgic.h  191
-rw-r--r--  include/linux/irqchip/arm-gic-v5.h  27
-rw-r--r--  include/linux/kvm_host.h  1
-rw-r--r--  include/linux/ring_buffer.h  58
-rw-r--r--  include/linux/ring_buffer_types.h  41
-rw-r--r--  include/linux/simple_ring_buffer.h  65
-rw-r--r--  include/linux/trace_remote.h  48
-rw-r--r--  include/linux/trace_remote_event.h  33
-rw-r--r--  include/trace/define_remote_events.h  73
-rw-r--r--  include/uapi/linux/kvm.h  7
-rw-r--r--  include/uapi/linux/trace_mmap.h  8
-rw-r--r--  kernel/trace/Kconfig  14
-rw-r--r--  kernel/trace/Makefile  59
-rw-r--r--  kernel/trace/remote_test.c  261
-rw-r--r--  kernel/trace/remote_test_events.h  10
-rw-r--r--  kernel/trace/ring_buffer.c  354
-rw-r--r--  kernel/trace/simple_ring_buffer.c  517
-rw-r--r--  kernel/trace/trace.c  4
-rw-r--r--  kernel/trace/trace.h  7
-rw-r--r--  kernel/trace/trace_remote.c  1384
-rw-r--r--  tools/arch/arm64/include/uapi/asm/kvm.h  1
-rw-r--r--  tools/include/uapi/linux/kvm.h  2
-rw-r--r--  tools/testing/selftests/ftrace/test.d/remotes/buffer_size.tc  25
-rw-r--r--  tools/testing/selftests/ftrace/test.d/remotes/functions  99
-rw-r--r--  tools/testing/selftests/ftrace/test.d/remotes/hotplug.tc  88
-rw-r--r--  tools/testing/selftests/ftrace/test.d/remotes/hypervisor/buffer_size.tc  11
-rw-r--r--  tools/testing/selftests/ftrace/test.d/remotes/hypervisor/hotplug.tc  11
-rw-r--r--  tools/testing/selftests/ftrace/test.d/remotes/hypervisor/reset.tc  11
-rw-r--r--  tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace.tc  11
-rw-r--r--  tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace_pipe.tc  11
-rw-r--r--  tools/testing/selftests/ftrace/test.d/remotes/hypervisor/unloading.tc  11
-rw-r--r--  tools/testing/selftests/ftrace/test.d/remotes/reset.tc  90
-rw-r--r--  tools/testing/selftests/ftrace/test.d/remotes/trace.tc  102
-rw-r--r--  tools/testing/selftests/ftrace/test.d/remotes/trace_pipe.tc  102
-rw-r--r--  tools/testing/selftests/ftrace/test.d/remotes/unloading.tc  41
-rw-r--r--  tools/testing/selftests/kvm/Makefile.kvm  3
-rw-r--r--  tools/testing/selftests/kvm/arm64/at.c  14
-rw-r--r--  tools/testing/selftests/kvm/arm64/no-vgic-v3.c  177
-rw-r--r--  tools/testing/selftests/kvm/arm64/no-vgic.c  297
-rw-r--r--  tools/testing/selftests/kvm/arm64/set_id_regs.c  52
-rw-r--r--  tools/testing/selftests/kvm/arm64/vgic_v5.c  228
-rw-r--r--  tools/testing/selftests/kvm/include/arm64/gic_v5.h  150
129 files changed, 10017 insertions, 1086 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 03a550630644..44854a67bc63 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3247,8 +3247,8 @@ Kernel parameters
for the host. To force nVHE on VHE hardware, add
"arm64_sw.hvhe=0 id_aa64mmfr1.vh=0" to the
command-line.
- "nested" is experimental and should be used with
- extreme caution.
+ "nested" and "protected" are experimental and should be
+ used with extreme caution.
kvm-arm.vgic_v3_group0_trap=
[KVM,ARM,EARLY] Trap guest accesses to GICv3 group-0
diff --git a/Documentation/trace/index.rst b/Documentation/trace/index.rst
index 338bc4d7cfab..036db96864d2 100644
--- a/Documentation/trace/index.rst
+++ b/Documentation/trace/index.rst
@@ -91,6 +91,17 @@ interactions.
user_events
uprobetracer
+Remote Tracing
+--------------
+
+This section covers the framework for reading compatible ring buffers written
+by entities outside of the kernel (most likely firmware or a hypervisor).
+
+.. toctree::
+ :maxdepth: 1
+
+ remotes
+
Additional Resources
--------------------
diff --git a/Documentation/trace/remotes.rst b/Documentation/trace/remotes.rst
new file mode 100644
index 000000000000..1f9d764f69aa
--- /dev/null
+++ b/Documentation/trace/remotes.rst
@@ -0,0 +1,66 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+Tracing Remotes
+===============
+
+:Author: Vincent Donnefort <vdonnefort@google.com>
+
+Overview
+========
+Firmware and hypervisors are black boxes to the kernel. Having a way to see
+what they are doing can be useful for debugging both. This is where remote
+tracing buffers come in. A remote tracing buffer is a ring buffer written by
+the firmware or hypervisor into memory that is mapped into the host kernel.
+This is similar to how user space memory-maps the kernel ring buffer, but in
+this case the kernel is acting like user space and the firmware or hypervisor
+is the "kernel" side. With a trace remote ring buffer, the firmware and
+hypervisor can record events which the host kernel can read and expose to
+user space.
+
+Register a remote
+=================
+A remote must provide a set of callbacks, `struct trace_remote_callbacks`,
+whose description can be found below. Those callbacks allow Tracefs to enable
+and disable tracing and events, to load and unload a tracing buffer (a set of
+ring-buffers) and to swap a reader page with the head page, which enables
+consuming reads.
+
+.. kernel-doc:: include/linux/trace_remote.h
+
+Once registered, an instance will appear for this remote in the Tracefs
+directory **remotes/**. Buffers can then be read using the usual Tracefs files
+**trace_pipe** and **trace**.
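+
+For example, assuming a remote named ``hypervisor`` has been registered (the
+name matches the hypervisor remote exercised by the ftrace selftests) and
+Tracefs is mounted at /sys/kernel/tracing, its buffer can be consumed with::
+
+  # cat /sys/kernel/tracing/remotes/hypervisor/trace_pipe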
+
+Declare a remote event
+======================
+Macros are provided to ease the declaration of remote events, in a similar
+fashion to in-kernel events. A declaration must provide an ID, a description of
+the event arguments and how to print the event:
+
+.. code-block:: c
+
+ REMOTE_EVENT(foo, EVENT_FOO_ID,
+ RE_STRUCT(
+ re_field(u64, bar)
+ ),
+ RE_PRINTK("bar=%lld", __entry->bar)
+ );
+
+Then those events must be declared in a C file with the following:
+
+.. code-block:: c
+
+ #define REMOTE_EVENT_INCLUDE_FILE foo_events.h
+ #include <trace/define_remote_events.h>
+
+This will provide a `struct remote_event remote_event_foo` that can be given to
+`trace_remote_register`.
+
+Registered events appear in the remote directory under **events/**.
+
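+For illustration, a remote might tie these pieces together as follows. This is
+a sketch only: the exact prototype of `trace_remote_register` and its callback
+argument are defined in include/linux/trace_remote.h, and the argument list
+shown here is an assumption, not the authoritative API.
+
+.. code-block:: c
+
+ /* Hypothetical glue code for a remote named "foo". */
+ static struct remote_event *foo_events[] = {
+ 	&remote_event_foo,
+ };
+
+ static int __init foo_remote_init(void)
+ {
+ 	/* foo_callbacks is a struct trace_remote_callbacks, assumed defined. */
+ 	return trace_remote_register("foo", &foo_callbacks, NULL,
+ 				     foo_events, ARRAY_SIZE(foo_events));
+ }
+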
+Simple ring-buffer
+==================
+A simple implementation for a ring-buffer writer can be found in
+kernel/trace/simple_ring_buffer.c.
+
+.. kernel-doc:: include/linux/simple_ring_buffer.h
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 032516783e96..03d87d9b97d9 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -907,10 +907,12 @@ The irq_type field has the following values:
- KVM_ARM_IRQ_TYPE_CPU:
out-of-kernel GIC: irq_id 0 is IRQ, irq_id 1 is FIQ
- KVM_ARM_IRQ_TYPE_SPI:
- in-kernel GIC: SPI, irq_id between 32 and 1019 (incl.)
+ in-kernel GICv2/GICv3: SPI, irq_id between 32 and 1019 (incl.)
(the vcpu_index field is ignored)
+ in-kernel GICv5: SPI, irq_id between 0 and 65535 (incl.)
- KVM_ARM_IRQ_TYPE_PPI:
- in-kernel GIC: PPI, irq_id between 16 and 31 (incl.)
+ in-kernel GICv2/GICv3: PPI, irq_id between 16 and 31 (incl.)
+ in-kernel GICv5: PPI, irq_id between 0 and 127 (incl.)
(The irq_id field thus corresponds nicely to the IRQ ID in the ARM GIC specs)
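+
+For illustration, a VMM could encode a GICv5 PPI for vcpu_index 0 roughly as
+follows (a sketch using the uapi shift/mask macros; PPI 23 is the architected
+PMU overflow interrupt on GICv5):
+
+.. code-block:: c
+
+ struct kvm_irq_level irq_level = {
+ 	.irq   = (KVM_ARM_IRQ_TYPE_PPI << KVM_ARM_IRQ_TYPE_SHIFT) |
+ 		 (0 << KVM_ARM_IRQ_VCPU_SHIFT) |	/* vcpu_index 0 */
+ 		 23,					/* PPI 23 */
+ 	.level = 1,
+ };
+
+ ioctl(vm_fd, KVM_IRQ_LINE, &irq_level);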
diff --git a/Documentation/virt/kvm/arm/index.rst b/Documentation/virt/kvm/arm/index.rst
index ec09881de4cf..0856b4942e05 100644
--- a/Documentation/virt/kvm/arm/index.rst
+++ b/Documentation/virt/kvm/arm/index.rst
@@ -10,6 +10,7 @@ ARM
fw-pseudo-registers
hyp-abi
hypercalls
+ pkvm
pvtime
ptp_kvm
vcpu-features
diff --git a/Documentation/virt/kvm/arm/pkvm.rst b/Documentation/virt/kvm/arm/pkvm.rst
new file mode 100644
index 000000000000..514992a79a83
--- /dev/null
+++ b/Documentation/virt/kvm/arm/pkvm.rst
@@ -0,0 +1,106 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====================
+Protected KVM (pKVM)
+====================
+
+**NOTE**: pKVM is currently an experimental, development feature and
+subject to breaking changes as new isolation features are implemented.
+Please reach out to the developers at kvmarm@lists.linux.dev if you have
+any questions.
+
+Overview
+========
+
+Booting a host kernel with '``kvm-arm.mode=protected``' enables
+"Protected KVM" (pKVM). During boot, pKVM installs a stage-2 identity
+map page-table for the host and uses it to isolate the hypervisor
+running at EL2 from the rest of the host running at EL1/0.
+
+pKVM permits creation of protected virtual machines (pVMs) by passing
+the ``KVM_VM_TYPE_ARM_PROTECTED`` machine type identifier to the
+``KVM_CREATE_VM`` ioctl(). The hypervisor isolates pVMs from the host by
+unmapping pages from the stage-2 identity map as they are accessed by a
+pVM. Hypercalls are provided for a pVM to share specific regions of its
+IPA space back with the host, allowing for communication with the VMM.
+A Linux guest must be configured with ``CONFIG_ARM_PKVM_GUEST=y`` in
+order to issue these hypercalls.
+
+See hypercalls.rst for more details.
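+
+For illustration, a VMM might request a pVM as follows (a minimal sketch;
+error handling and the rest of the usual VM setup are omitted, and the
+ioctl() is expected to fail on hosts not booted in protected mode):
+
+.. code-block:: c
+
+ #include <fcntl.h>
+ #include <sys/ioctl.h>
+ #include <linux/kvm.h>
+
+ int create_pvm(void)
+ {
+ 	int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
+
+ 	if (kvm < 0)
+ 		return -1;
+
+ 	/* Ask for a protected VM rather than a regular one. */
+ 	return ioctl(kvm, KVM_CREATE_VM, KVM_VM_TYPE_ARM_PROTECTED);
+ }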
+
+Isolation mechanisms
+====================
+
+pKVM relies on a number of mechanisms to isolate pVMs from the host:
+
+CPU memory isolation
+--------------------
+
+Status: Isolation of anonymous memory and metadata pages.
+
+Metadata pages (e.g. page-table pages and '``struct kvm_vcpu``' pages)
+are donated from the host to the hypervisor during pVM creation and
+are consequently unmapped from the stage-2 identity map until the pVM is
+destroyed.
+
+Similarly to regular KVM, pages are lazily mapped into the guest in
+response to stage-2 page faults handled by the host. However, when
+running a pVM, these pages are first pinned and then unmapped from the
+stage-2 identity map as part of the donation procedure. This gives rise
+to some user-visible differences when compared to non-protected VMs,
+largely due to the lack of MMU notifiers:
+
+* Memslots cannot be moved or deleted once the pVM has started running.
+* Read-only memslots and dirty logging are not supported.
+* With the exception of swap, file-backed pages cannot be mapped into a
+ pVM.
+* Donated pages are accounted against ``RLIMIT_MLOCK`` and so the VMM
+ must have a sufficient resource limit or be granted ``CAP_IPC_LOCK``.
+ The lack of a runtime reclaim mechanism means that memory locked for
+ a pVM will remain locked until the pVM is destroyed.
+* Changes to the VMM address space (e.g. a ``MAP_FIXED`` mmap() over a
+ mapping associated with a memslot) are not reflected in the guest and
+ may lead to loss of coherency.
+* Accessing pVM memory that has not been shared back will result in the
+ delivery of a SIGSEGV.
+* If a system call accesses pVM memory that has not been shared back
+ then it will either return ``-EFAULT`` or forcefully reclaim the
+ memory pages. Reclaimed memory is zeroed by the hypervisor and a
+ subsequent attempt to access it in the pVM will return ``-EFAULT``
+ from the ``VCPU_RUN`` ioctl().
+
+CPU state isolation
+-------------------
+
+Status: **Unimplemented.**
+
+DMA isolation using an IOMMU
+----------------------------
+
+Status: **Unimplemented.**
+
+Proxying of Trustzone services
+------------------------------
+
+Status: FF-A and PSCI calls from the host are proxied by the pKVM
+hypervisor.
+
+The FF-A proxy ensures that the host cannot share pVM or hypervisor
+memory with Trustzone as part of a "confused deputy" attack.
+
+The PSCI proxy ensures that CPUs always have the stage-2 identity map
+installed when they are executing in the host.
+
+Protected VM firmware (pvmfw)
+-----------------------------
+
+Status: **Unimplemented.**
+
+Resources
+=========
+
+Quentin Perret's KVM Forum 2022 talk entitled "Protected KVM on arm64: A
+technical deep dive" remains a good resource for learning more about
+pKVM, despite some of the details having changed in the meantime:
+
+https://www.youtube.com/watch?v=9npebeVFbFw
diff --git a/Documentation/virt/kvm/devices/arm-vgic-v5.rst b/Documentation/virt/kvm/devices/arm-vgic-v5.rst
new file mode 100644
index 000000000000..29335ea823fc
--- /dev/null
+++ b/Documentation/virt/kvm/devices/arm-vgic-v5.rst
@@ -0,0 +1,50 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====================================================
+ARM Virtual Generic Interrupt Controller v5 (VGICv5)
+====================================================
+
+
+Device types supported:
+ - KVM_DEV_TYPE_ARM_VGIC_V5 ARM Generic Interrupt Controller v5.0
+
+Only one VGIC instance may be instantiated through this API. The created VGIC
+will act as the VM interrupt controller, requiring emulated user-space devices
+to inject interrupts to the VGIC instead of directly to CPUs.
+
+Creating a guest GICv5 device requires a GICv5 host. The current VGICv5
+device only supports PPI interrupts. These can either be injected from
+emulated in-kernel devices (such as the arch timer or PMU), or via the
+KVM_IRQ_LINE ioctl.
+
+Groups:
+ KVM_DEV_ARM_VGIC_GRP_CTRL
+ Attributes:
+
+ KVM_DEV_ARM_VGIC_CTRL_INIT
+ request the initialization of the VGIC, no additional parameter in
+ kvm_device_attr.addr. Must be called after all VCPUs have been created.
+
+ KVM_DEV_ARM_VGIC_USERSPACE_PPIS
+ request the mask of userspace-drivable PPIs. Only a subset of the PPIs can
+ be directly driven from userspace with GICv5, and the returned mask
+ informs userspace which of them it is allowed to drive via KVM_IRQ_LINE.
+
+ Userspace must allocate a __u64[2] and point kvm_device_attr.addr at it.
+ When this call returns, the provided memory will be populated with the
+ userspace PPI mask. The lower __u64 contains the mask for the lower 64
+ PPIs, with the remaining 64 being in the second __u64.
+
+ This is a read-only attribute and cannot be set. Attempts to set it are
+ rejected.
+
+ Errors:
+
+ ======= ========================================================
+ -ENXIO VGIC not properly configured as required prior to calling
+ this attribute
+ -ENODEV no online VCPU
+ -ENOMEM memory shortage when allocating vgic internal data
+ -EFAULT invalid guest RAM access
+ -EBUSY one or more VCPUs are running
+ ======= ========================================================
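+
+For illustration, userspace might read the mask as follows (a minimal sketch;
+``vgic_fd`` is assumed to be the fd returned by KVM_CREATE_DEVICE for the
+VGICv5 device):
+
+.. code-block:: c
+
+ #include <sys/ioctl.h>
+ #include <linux/kvm.h>
+
+ int get_userspace_ppi_mask(int vgic_fd, __u64 mask[2])
+ {
+ 	struct kvm_device_attr attr = {
+ 		.group = KVM_DEV_ARM_VGIC_GRP_CTRL,
+ 		.attr  = KVM_DEV_ARM_VGIC_USERSPACE_PPIS,
+ 		.addr  = (__u64)(unsigned long)mask,
+ 	};
+
+ 	/* PPIs 0-63 land in mask[0], PPIs 64-127 in mask[1]. */
+ 	return ioctl(vgic_fd, KVM_GET_DEVICE_ATTR, &attr);
+ }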
diff --git a/Documentation/virt/kvm/devices/index.rst b/Documentation/virt/kvm/devices/index.rst
index 192cda7405c8..70845aba38f4 100644
--- a/Documentation/virt/kvm/devices/index.rst
+++ b/Documentation/virt/kvm/devices/index.rst
@@ -10,6 +10,7 @@ Devices
arm-vgic-its
arm-vgic
arm-vgic-v3
+ arm-vgic-v5
mpic
s390_flic
vcpu
diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst
index 60bf205cb373..5e3805820010 100644
--- a/Documentation/virt/kvm/devices/vcpu.rst
+++ b/Documentation/virt/kvm/devices/vcpu.rst
@@ -37,7 +37,8 @@ Returns:
A value describing the PMUv3 (Performance Monitor Unit v3) overflow interrupt
number for this vcpu. This interrupt could be a PPI or SPI, but the interrupt
type must be the same for each vcpu. As a PPI, the interrupt number is the same for
-all vcpus, while as an SPI it must be a separate number per vcpu.
+all vcpus, while as an SPI it must be a separate number per vcpu. For
+GICv5-based guests, the architected PPI (23) must be used.
1.2 ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_INIT
---------------------------------------
@@ -50,7 +51,7 @@ Returns:
-EEXIST Interrupt number already used
-ENODEV PMUv3 not supported or GIC not initialized
-ENXIO PMUv3 not supported, missing VCPU feature or interrupt
- number not set
+ number not set (non-GICv5 guests only)
-EBUSY PMUv3 already initialized
======= ======================================================
diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
index 85f4c1615472..144a757e937f 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -50,7 +50,6 @@
* effectively VHE-only or not.
*/
msr_hcr_el2 x0 // Setup HCR_EL2 as nVHE
- isb
mov x1, #1 // Write something to FAR_EL1
msr far_el1, x1
isb
@@ -64,7 +63,6 @@
.LnE2H0_\@:
orr x0, x0, #HCR_E2H
msr_hcr_el2 x0
- isb
.LnVHE_\@:
.endm
@@ -248,6 +246,8 @@
ICH_HFGWTR_EL2_ICC_CR0_EL1 | \
ICH_HFGWTR_EL2_ICC_APR_EL1)
msr_s SYS_ICH_HFGWTR_EL2, x0 // Disable reg write traps
+ mov x0, #(ICH_VCTLR_EL2_En)
+ msr_s SYS_ICH_VCTLR_EL2, x0 // Enable vHPPI selection
.Lskip_gicv5_\@:
.endm
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index a1ad12c72ebf..37414440cee7 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -51,7 +51,7 @@
#include <linux/mm.h>
enum __kvm_host_smccc_func {
- /* Hypercalls available only prior to pKVM finalisation */
+ /* Hypercalls that are unavailable once pKVM has finalised. */
/* __KVM_HOST_SMCCC_FUNC___kvm_hyp_init */
__KVM_HOST_SMCCC_FUNC___pkvm_init = __KVM_HOST_SMCCC_FUNC___kvm_hyp_init + 1,
__KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping,
@@ -60,16 +60,9 @@ enum __kvm_host_smccc_func {
__KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs,
__KVM_HOST_SMCCC_FUNC___vgic_v3_get_gic_config,
__KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize,
+ __KVM_HOST_SMCCC_FUNC_MIN_PKVM = __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize,
- /* Hypercalls available after pKVM finalisation */
- __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp,
- __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_hyp,
- __KVM_HOST_SMCCC_FUNC___pkvm_host_share_guest,
- __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_guest,
- __KVM_HOST_SMCCC_FUNC___pkvm_host_relax_perms_guest,
- __KVM_HOST_SMCCC_FUNC___pkvm_host_wrprotect_guest,
- __KVM_HOST_SMCCC_FUNC___pkvm_host_test_clear_young_guest,
- __KVM_HOST_SMCCC_FUNC___pkvm_host_mkyoung_guest,
+ /* Hypercalls that are always available and common to [nh]VHE/pKVM. */
__KVM_HOST_SMCCC_FUNC___kvm_adjust_pc,
__KVM_HOST_SMCCC_FUNC___kvm_vcpu_run,
__KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context,
@@ -81,14 +74,40 @@ enum __kvm_host_smccc_func {
__KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff,
__KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs,
__KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs,
+ __KVM_HOST_SMCCC_FUNC___vgic_v5_save_apr,
+ __KVM_HOST_SMCCC_FUNC___vgic_v5_restore_vmcr_apr,
+ __KVM_HOST_SMCCC_FUNC_MAX_NO_PKVM = __KVM_HOST_SMCCC_FUNC___vgic_v5_restore_vmcr_apr,
+
+ /* Hypercalls that are available only when pKVM has finalised. */
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_hyp,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_donate_guest,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_share_guest,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_guest,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_relax_perms_guest,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_wrprotect_guest,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_test_clear_young_guest,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_mkyoung_guest,
__KVM_HOST_SMCCC_FUNC___pkvm_reserve_vm,
__KVM_HOST_SMCCC_FUNC___pkvm_unreserve_vm,
__KVM_HOST_SMCCC_FUNC___pkvm_init_vm,
__KVM_HOST_SMCCC_FUNC___pkvm_init_vcpu,
- __KVM_HOST_SMCCC_FUNC___pkvm_teardown_vm,
+ __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_in_poison_fault,
+ __KVM_HOST_SMCCC_FUNC___pkvm_force_reclaim_guest_page,
+ __KVM_HOST_SMCCC_FUNC___pkvm_reclaim_dying_guest_page,
+ __KVM_HOST_SMCCC_FUNC___pkvm_start_teardown_vm,
+ __KVM_HOST_SMCCC_FUNC___pkvm_finalize_teardown_vm,
__KVM_HOST_SMCCC_FUNC___pkvm_vcpu_load,
__KVM_HOST_SMCCC_FUNC___pkvm_vcpu_put,
__KVM_HOST_SMCCC_FUNC___pkvm_tlb_flush_vmid,
+ __KVM_HOST_SMCCC_FUNC___tracing_load,
+ __KVM_HOST_SMCCC_FUNC___tracing_unload,
+ __KVM_HOST_SMCCC_FUNC___tracing_enable,
+ __KVM_HOST_SMCCC_FUNC___tracing_swap_reader,
+ __KVM_HOST_SMCCC_FUNC___tracing_update_clock,
+ __KVM_HOST_SMCCC_FUNC___tracing_reset,
+ __KVM_HOST_SMCCC_FUNC___tracing_enable_event,
+ __KVM_HOST_SMCCC_FUNC___tracing_write_event,
};
#define DECLARE_KVM_VHE_SYM(sym) extern char sym[]
@@ -291,7 +310,8 @@ asmlinkage void __noreturn hyp_panic_bad_stack(void);
asmlinkage void kvm_unexpected_el2_exception(void);
struct kvm_cpu_context;
void handle_trap(struct kvm_cpu_context *host_ctxt);
-asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on);
+asmlinkage void __noreturn __kvm_host_psci_cpu_on_entry(void);
+asmlinkage void __noreturn __kvm_host_psci_cpu_resume_entry(void);
void __noreturn __pkvm_init_finalise(void);
void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc);
void kvm_patch_vector_branch(struct alt_instr *alt,
diff --git a/arch/arm64/include/asm/kvm_define_hypevents.h b/arch/arm64/include/asm/kvm_define_hypevents.h
new file mode 100644
index 000000000000..77d6790252a6
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_define_hypevents.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define REMOTE_EVENT_INCLUDE_FILE arch/arm64/include/asm/kvm_hypevents.h
+
+#define REMOTE_EVENT_SECTION "_hyp_events"
+
+#define HE_STRUCT(__args) __args
+#define HE_PRINTK(__args...) __args
+#define he_field re_field
+
+#define HYP_EVENT(__name, __proto, __struct, __assign, __printk) \
+ REMOTE_EVENT(__name, 0, RE_STRUCT(__struct), RE_PRINTK(__printk))
+
+#define HYP_EVENT_MULTI_READ
+#include <trace/define_remote_events.h>
+#undef HYP_EVENT_MULTI_READ
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 70cb9cfd760a..851f6171751c 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -217,6 +217,10 @@ struct kvm_s2_mmu {
*/
bool nested_stage2_enabled;
+#ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS
+ struct dentry *shadow_pt_debugfs_dentry;
+#endif
+
/*
* true when this MMU needs to be unmapped before being used for a new
* purpose.
@@ -247,7 +251,7 @@ struct kvm_smccc_features {
unsigned long vendor_hyp_bmap_2; /* Function numbers 64-127 */
};
-typedef unsigned int pkvm_handle_t;
+typedef u16 pkvm_handle_t;
struct kvm_protected_vm {
pkvm_handle_t handle;
@@ -255,6 +259,13 @@ struct kvm_protected_vm {
struct kvm_hyp_memcache stage2_teardown_mc;
bool is_protected;
bool is_created;
+
+ /*
+ * True when the guest is being torn down. When in this state, the
+ * guest's vCPUs can't be loaded anymore, but its pages can be
+ * reclaimed by the host.
+ */
+ bool is_dying;
};
struct kvm_mpidr_data {
@@ -287,6 +298,9 @@ enum fgt_group_id {
HDFGRTR2_GROUP,
HDFGWTR2_GROUP = HDFGRTR2_GROUP,
HFGITR2_GROUP,
+ ICH_HFGRTR_GROUP,
+ ICH_HFGWTR_GROUP = ICH_HFGRTR_GROUP,
+ ICH_HFGITR_GROUP,
/* Must be last */
__NR_FGT_GROUP_IDS__
@@ -405,6 +419,11 @@ struct kvm_arch {
* the associated pKVM instance in the hypervisor.
*/
struct kvm_protected_vm pkvm;
+
+#ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS
+ /* Nested virtualization info */
+ struct dentry *debugfs_nv_dentry;
+#endif
};
struct kvm_vcpu_fault_info {
@@ -620,6 +639,10 @@ enum vcpu_sysreg {
VNCR(ICH_HCR_EL2),
VNCR(ICH_VMCR_EL2),
+ VNCR(ICH_HFGRTR_EL2),
+ VNCR(ICH_HFGWTR_EL2),
+ VNCR(ICH_HFGITR_EL2),
+
NR_SYS_REGS /* Nothing after this line! */
};
@@ -675,6 +698,9 @@ extern struct fgt_masks hfgwtr2_masks;
extern struct fgt_masks hfgitr2_masks;
extern struct fgt_masks hdfgrtr2_masks;
extern struct fgt_masks hdfgwtr2_masks;
+extern struct fgt_masks ich_hfgrtr_masks;
+extern struct fgt_masks ich_hfgwtr_masks;
+extern struct fgt_masks ich_hfgitr_masks;
extern struct fgt_masks kvm_nvhe_sym(hfgrtr_masks);
extern struct fgt_masks kvm_nvhe_sym(hfgwtr_masks);
@@ -687,6 +713,9 @@ extern struct fgt_masks kvm_nvhe_sym(hfgwtr2_masks);
extern struct fgt_masks kvm_nvhe_sym(hfgitr2_masks);
extern struct fgt_masks kvm_nvhe_sym(hdfgrtr2_masks);
extern struct fgt_masks kvm_nvhe_sym(hdfgwtr2_masks);
+extern struct fgt_masks kvm_nvhe_sym(ich_hfgrtr_masks);
+extern struct fgt_masks kvm_nvhe_sym(ich_hfgwtr_masks);
+extern struct fgt_masks kvm_nvhe_sym(ich_hfgitr_masks);
struct kvm_cpu_context {
struct user_pt_regs regs; /* sp = sp_el0 */
@@ -768,8 +797,10 @@ struct kvm_host_data {
struct kvm_guest_debug_arch regs;
/* Statistical profiling extension */
u64 pmscr_el1;
+ u64 pmblimitr_el1;
/* Self-hosted trace */
u64 trfcr_el1;
+ u64 trblimitr_el1;
/* Values of trap registers for the host before guest entry. */
u64 mdcr_el2;
u64 brbcr_el1;
@@ -787,6 +818,14 @@ struct kvm_host_data {
/* Last vgic_irq part of the AP list recorded in an LR */
struct vgic_irq *last_lr_irq;
+
+ /* PPI state tracking for GICv5-based guests */
+ struct {
+ DECLARE_BITMAP(pendr, VGIC_V5_NR_PRIVATE_IRQS);
+
+ /* The saved state of the regs when leaving the guest */
+ DECLARE_BITMAP(activer_exit, VGIC_V5_NR_PRIVATE_IRQS);
+ } vgic_v5_ppi_state;
};
struct kvm_host_psci_config {
@@ -923,6 +962,9 @@ struct kvm_vcpu_arch {
/* Per-vcpu TLB for VNCR_EL2 -- NULL when !NV */
struct vncr_tlb *vncr_tlb;
+
+ /* Hyp-readable copy of kvm_vcpu::pid */
+ pid_t pid;
};
/*
@@ -1659,6 +1701,11 @@ static __always_inline enum fgt_group_id __fgt_reg_to_group_id(enum vcpu_sysreg
case HDFGRTR2_EL2:
case HDFGWTR2_EL2:
return HDFGRTR2_GROUP;
+ case ICH_HFGRTR_EL2:
+ case ICH_HFGWTR_EL2:
+ return ICH_HFGRTR_GROUP;
+ case ICH_HFGITR_EL2:
+ return ICH_HFGITR_GROUP;
default:
BUILD_BUG_ON(1);
}
@@ -1673,6 +1720,7 @@ static __always_inline enum fgt_group_id __fgt_reg_to_group_id(enum vcpu_sysreg
case HDFGWTR_EL2: \
case HFGWTR2_EL2: \
case HDFGWTR2_EL2: \
+ case ICH_HFGWTR_EL2: \
p = &(vcpu)->arch.fgt[id].w; \
break; \
default: \
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 76ce2b94bd97..8d06b62e7188 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -87,6 +87,15 @@ void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if);
void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if);
int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu);
+/* GICv5 */
+void __vgic_v5_save_apr(struct vgic_v5_cpu_if *cpu_if);
+void __vgic_v5_restore_vmcr_apr(struct vgic_v5_cpu_if *cpu_if);
+/* No hypercalls for the following */
+void __vgic_v5_save_ppi_state(struct vgic_v5_cpu_if *cpu_if);
+void __vgic_v5_restore_ppi_state(struct vgic_v5_cpu_if *cpu_if);
+void __vgic_v5_save_state(struct vgic_v5_cpu_if *cpu_if);
+void __vgic_v5_restore_state(struct vgic_v5_cpu_if *cpu_if);
+
#ifdef __KVM_NVHE_HYPERVISOR__
void __timer_enable_traps(struct kvm_vcpu *vcpu);
void __timer_disable_traps(struct kvm_vcpu *vcpu);
@@ -129,13 +138,13 @@ void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr,
#ifdef __KVM_NVHE_HYPERVISOR__
void __pkvm_init_switch_pgd(phys_addr_t pgd, unsigned long sp,
void (*fn)(void));
-int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
- unsigned long *per_cpu_base, u32 hyp_va_bits);
+int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long *per_cpu_base, u32 hyp_va_bits);
void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt);
#endif
extern u64 kvm_nvhe_sym(id_aa64pfr0_el1_sys_val);
extern u64 kvm_nvhe_sym(id_aa64pfr1_el1_sys_val);
+extern u64 kvm_nvhe_sym(id_aa64pfr2_el1_sys_val);
extern u64 kvm_nvhe_sym(id_aa64isar0_el1_sys_val);
extern u64 kvm_nvhe_sym(id_aa64isar1_el1_sys_val);
extern u64 kvm_nvhe_sym(id_aa64isar2_el1_sys_val);
@@ -147,5 +156,6 @@ extern u64 kvm_nvhe_sym(id_aa64smfr0_el1_sys_val);
extern unsigned long kvm_nvhe_sym(__icache_flags);
extern unsigned int kvm_nvhe_sym(kvm_arm_vmid_bits);
extern unsigned int kvm_nvhe_sym(kvm_host_sve_max_vl);
+extern unsigned long kvm_nvhe_sym(hyp_nr_cpus);
#endif /* __ARM64_KVM_HYP_H__ */
diff --git a/arch/arm64/include/asm/kvm_hypevents.h b/arch/arm64/include/asm/kvm_hypevents.h
new file mode 100644
index 000000000000..743c49bd878f
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_hypevents.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#if !defined(__ARM64_KVM_HYPEVENTS_H_) || defined(HYP_EVENT_MULTI_READ)
+#define __ARM64_KVM_HYPEVENTS_H_
+
+#ifdef __KVM_NVHE_HYPERVISOR__
+#include <nvhe/trace.h>
+#endif
+
+#ifndef __HYP_ENTER_EXIT_REASON
+#define __HYP_ENTER_EXIT_REASON
+enum hyp_enter_exit_reason {
+ HYP_REASON_SMC,
+ HYP_REASON_HVC,
+ HYP_REASON_PSCI,
+ HYP_REASON_HOST_ABORT,
+ HYP_REASON_GUEST_EXIT,
+ HYP_REASON_ERET_HOST,
+ HYP_REASON_ERET_GUEST,
+ HYP_REASON_UNKNOWN /* Must be last */
+};
+#endif
+
+HYP_EVENT(hyp_enter,
+ HE_PROTO(struct kvm_cpu_context *host_ctxt, u8 reason),
+ HE_STRUCT(
+ he_field(u8, reason)
+ he_field(pid_t, vcpu)
+ ),
+ HE_ASSIGN(
+ __entry->reason = reason;
+ __entry->vcpu = __tracing_get_vcpu_pid(host_ctxt);
+ ),
+ HE_PRINTK("reason=%s vcpu=%d", __hyp_enter_exit_reason_str(__entry->reason), __entry->vcpu)
+);
+
+HYP_EVENT(hyp_exit,
+ HE_PROTO(struct kvm_cpu_context *host_ctxt, u8 reason),
+ HE_STRUCT(
+ he_field(u8, reason)
+ he_field(pid_t, vcpu)
+ ),
+ HE_ASSIGN(
+ __entry->reason = reason;
+ __entry->vcpu = __tracing_get_vcpu_pid(host_ctxt);
+ ),
+ HE_PRINTK("reason=%s vcpu=%d", __hyp_enter_exit_reason_str(__entry->reason), __entry->vcpu)
+);
+
+HYP_EVENT(selftest,
+ HE_PROTO(u64 id),
+ HE_STRUCT(
+ he_field(u64, id)
+ ),
+ HE_ASSIGN(
+ __entry->id = id;
+ ),
+ HE_PRINTK("id=%llu", __entry->id)
+);
+#endif
diff --git a/arch/arm64/include/asm/kvm_hyptrace.h b/arch/arm64/include/asm/kvm_hyptrace.h
new file mode 100644
index 000000000000..de133b735f72
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_hyptrace.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __ARM64_KVM_HYPTRACE_H_
+#define __ARM64_KVM_HYPTRACE_H_
+
+#include <linux/ring_buffer.h>
+
+struct hyp_trace_desc {
+ unsigned long bpages_backing_start;
+ size_t bpages_backing_size;
+ struct trace_buffer_desc trace_buffer_desc;
+};
+
+struct hyp_event_id {
+ unsigned short id;
+ atomic_t enabled;
+};
+
+extern struct remote_event __hyp_events_start[];
+extern struct remote_event __hyp_events_end[];
+
+/* hyp_event section used by the hypervisor */
+extern struct hyp_event_id __hyp_event_ids_start[];
+extern struct hyp_event_id __hyp_event_ids_end[];
+
+#endif
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index d968aca0461a..01e9c72d6aa7 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -393,8 +393,12 @@ static inline bool kvm_supports_cacheable_pfnmap(void)
#ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS
void kvm_s2_ptdump_create_debugfs(struct kvm *kvm);
+void kvm_nested_s2_ptdump_create_debugfs(struct kvm_s2_mmu *mmu);
+void kvm_nested_s2_ptdump_remove_debugfs(struct kvm_s2_mmu *mmu);
#else
static inline void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) {}
+static inline void kvm_nested_s2_ptdump_create_debugfs(struct kvm_s2_mmu *mmu) {}
+static inline void kvm_nested_s2_ptdump_remove_debugfs(struct kvm_s2_mmu *mmu) {}
#endif /* CONFIG_PTDUMP_STAGE2_DEBUGFS */
#endif /* __ASSEMBLER__ */
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index c201168f2857..41a8687938eb 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -99,14 +99,30 @@ typedef u64 kvm_pte_t;
KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
KVM_PTE_LEAF_ATTR_HI_S2_XN)
-#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2)
-#define KVM_MAX_OWNER_ID 1
+/* pKVM invalid pte encodings */
+#define KVM_INVALID_PTE_TYPE_MASK GENMASK(63, 60)
+#define KVM_INVALID_PTE_ANNOT_MASK ~(KVM_PTE_VALID | \
+ KVM_INVALID_PTE_TYPE_MASK)
-/*
- * Used to indicate a pte for which a 'break-before-make' sequence is in
- * progress.
- */
-#define KVM_INVALID_PTE_LOCKED BIT(10)
+enum kvm_invalid_pte_type {
+ /*
+ * Used to indicate a pte for which a 'break-before-make'
+ * sequence is in progress.
+ */
+ KVM_INVALID_PTE_TYPE_LOCKED = 1,
+
+ /*
+ * pKVM has unmapped the page from the host due to a change of
+ * ownership.
+ */
+ KVM_HOST_INVALID_PTE_TYPE_DONATION,
+
+ /*
+ * The page has been forcefully reclaimed from the guest by the
+ * host.
+ */
+ KVM_GUEST_INVALID_PTE_TYPE_POISONED,
+};
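+
+/*
+ * Illustrative only: given the masks above, one plausible way to pack a
+ * type and annotation pair into an invalid pte. The actual encoding
+ * helper lives in the stage-2 page-table code and may differ.
+ */
+static inline kvm_pte_t kvm_invalid_pte_sketch(enum kvm_invalid_pte_type type,
+					       kvm_pte_t annotation)
+{
+	return FIELD_PREP(KVM_INVALID_PTE_TYPE_MASK, type) |
+	       (annotation & KVM_INVALID_PTE_ANNOT_MASK);
+}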
static inline bool kvm_pte_valid(kvm_pte_t pte)
{
@@ -658,14 +674,18 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
void *mc, enum kvm_pgtable_walk_flags flags);
/**
- * kvm_pgtable_stage2_set_owner() - Unmap and annotate pages in the IPA space to
- * track ownership.
+ * kvm_pgtable_stage2_annotate() - Unmap and annotate pages in the IPA space
+ * to track ownership (and more).
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
* @addr: Base intermediate physical address to annotate.
* @size: Size of the annotated range.
* @mc: Cache of pre-allocated and zeroed memory from which to allocate
* page-table pages.
- * @owner_id: Unique identifier for the owner of the page.
+ * @type: The type of the annotation, determining its meaning and format.
+ * @annotation: A 59-bit value that will be stored in the page tables.
+ * @annotation[0] and @annotation[63:60] must be 0.
+ * @annotation[59:1] is stored in the page tables, along
+ * with @type.
*
* By default, all page-tables are owned by identifier 0. This function can be
* used to mark portions of the IPA space as owned by other entities. When a
@@ -674,8 +694,9 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
*
* Return: 0 on success, negative error code on failure.
*/
-int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
- void *mc, u8 owner_id);
+int kvm_pgtable_stage2_annotate(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ void *mc, enum kvm_invalid_pte_type type,
+ kvm_pte_t annotation);
/**
* kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table.
diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h
index 757076ad4ec9..2954b311128c 100644
--- a/arch/arm64/include/asm/kvm_pkvm.h
+++ b/arch/arm64/include/asm/kvm_pkvm.h
@@ -17,7 +17,7 @@
#define HYP_MEMBLOCK_REGIONS 128
-int pkvm_init_host_vm(struct kvm *kvm);
+int pkvm_init_host_vm(struct kvm *kvm, unsigned long type);
int pkvm_create_hyp_vm(struct kvm *kvm);
bool pkvm_hyp_vm_is_created(struct kvm *kvm);
void pkvm_destroy_hyp_vm(struct kvm *kvm);
@@ -40,8 +40,6 @@ static inline bool kvm_pkvm_ext_allowed(struct kvm *kvm, long ext)
case KVM_CAP_MAX_VCPU_ID:
case KVM_CAP_MSI_DEVID:
case KVM_CAP_ARM_VM_IPA_SIZE:
- case KVM_CAP_ARM_PMU_V3:
- case KVM_CAP_ARM_SVE:
case KVM_CAP_ARM_PTRAUTH_ADDRESS:
case KVM_CAP_ARM_PTRAUTH_GENERIC:
return true;
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index f4436ecc630c..736561480f36 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -1052,6 +1052,7 @@
#define GICV5_OP_GIC_CDPRI sys_insn(1, 0, 12, 1, 2)
#define GICV5_OP_GIC_CDRCFG sys_insn(1, 0, 12, 1, 5)
#define GICV5_OP_GICR_CDIA sys_insn(1, 0, 12, 3, 0)
+#define GICV5_OP_GICR_CDNMIA sys_insn(1, 0, 12, 3, 1)
/* Definitions for GIC CDAFF */
#define GICV5_GIC_CDAFF_IAFFID_MASK GENMASK_ULL(47, 32)
@@ -1098,6 +1099,12 @@
#define GICV5_GIC_CDIA_TYPE_MASK GENMASK_ULL(31, 29)
#define GICV5_GIC_CDIA_ID_MASK GENMASK_ULL(23, 0)
+/* Definitions for GICR CDNMIA */
+#define GICV5_GICR_CDNMIA_VALID_MASK BIT_ULL(32)
+#define GICV5_GICR_CDNMIA_VALID(r) FIELD_GET(GICV5_GICR_CDNMIA_VALID_MASK, r)
+#define GICV5_GICR_CDNMIA_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GICR_CDNMIA_ID_MASK GENMASK_ULL(23, 0)
+
#define gicr_insn(insn) read_sysreg_s(GICV5_OP_GICR_##insn)
#define gic_insn(v, insn) write_sysreg_s(v, GICV5_OP_GIC_##insn)
@@ -1114,11 +1121,9 @@
.macro msr_hcr_el2, reg
#if IS_ENABLED(CONFIG_AMPERE_ERRATUM_AC04_CPU_23)
dsb nsh
- msr hcr_el2, \reg
- isb
-#else
- msr hcr_el2, \reg
#endif
+ msr hcr_el2, \reg
+ isb // Required by AMPERE_ERRATUM_AC04_CPU_23
.endm
#else
diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h
index b51ab6840f9c..b546703c3ab9 100644
--- a/arch/arm64/include/asm/virt.h
+++ b/arch/arm64/include/asm/virt.h
@@ -94,6 +94,15 @@ static inline bool is_pkvm_initialized(void)
static_branch_likely(&kvm_protected_mode_initialized);
}
+#ifdef CONFIG_KVM
+bool pkvm_force_reclaim_guest_page(phys_addr_t phys);
+#else
+static inline bool pkvm_force_reclaim_guest_page(phys_addr_t phys)
+{
+ return false;
+}
+#endif
+
/* Reports the availability of HYP mode */
static inline bool is_hyp_mode_available(void)
{
diff --git a/arch/arm64/include/asm/vncr_mapping.h b/arch/arm64/include/asm/vncr_mapping.h
index c2485a862e69..14366d35ce82 100644
--- a/arch/arm64/include/asm/vncr_mapping.h
+++ b/arch/arm64/include/asm/vncr_mapping.h
@@ -108,5 +108,8 @@
#define VNCR_MPAMVPM5_EL2 0x968
#define VNCR_MPAMVPM6_EL2 0x970
#define VNCR_MPAMVPM7_EL2 0x978
+#define VNCR_ICH_HFGITR_EL2 0xB10
+#define VNCR_ICH_HFGRTR_EL2 0xB18
+#define VNCR_ICH_HFGWTR_EL2 0xB20
#endif /* __ARM64_VNCR_MAPPING_H__ */
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index a792a599b9d6..1c13bfa2d38a 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -428,6 +428,7 @@ enum {
#define KVM_DEV_ARM_ITS_RESTORE_TABLES 2
#define KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES 3
#define KVM_DEV_ARM_ITS_CTRL_RESET 4
+#define KVM_DEV_ARM_VGIC_USERSPACE_PPIS 5
/* Device Control API on vcpu fd */
#define KVM_ARM_VCPU_PMU_V3_CTRL 0
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 32c2dbcc0c64..1bfaa96881da 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -325,6 +325,7 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
static const struct arm64_ftr_bits ftr_id_aa64pfr2[] = {
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_FPMR_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_GCIE_SHIFT, 4, ID_AA64PFR2_EL1_GCIE_NI),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_MTEFAR_SHIFT, 4, ID_AA64PFR2_EL1_MTEFAR_NI),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_MTESTOREONLY_SHIFT, 4, ID_AA64PFR2_EL1_MTESTOREONLY_NI),
ARM64_FTR_END,
diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S
index 085bc9972f6b..634ddc904244 100644
--- a/arch/arm64/kernel/hyp-stub.S
+++ b/arch/arm64/kernel/hyp-stub.S
@@ -103,7 +103,6 @@ SYM_CODE_START_LOCAL(__finalise_el2)
// Engage the VHE magic!
mov_q x0, HCR_HOST_VHE_FLAGS
msr_hcr_el2 x0
- isb
// Use the EL1 allocated stack, per-cpu offset
mrs x0, sp_el1
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index d7b0d12b1015..d4c7d45ae6bc 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -138,6 +138,10 @@ KVM_NVHE_ALIAS(__hyp_data_start);
KVM_NVHE_ALIAS(__hyp_data_end);
KVM_NVHE_ALIAS(__hyp_rodata_start);
KVM_NVHE_ALIAS(__hyp_rodata_end);
+#ifdef CONFIG_NVHE_EL2_TRACING
+KVM_NVHE_ALIAS(__hyp_event_ids_start);
+KVM_NVHE_ALIAS(__hyp_event_ids_end);
+#endif
/* pKVM static key */
KVM_NVHE_ALIAS(kvm_protected_mode_initialized);
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 2d1e75263f03..e1ac876200a3 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -13,12 +13,23 @@
*(__kvm_ex_table) \
__stop___kvm_ex_table = .;
+#ifdef CONFIG_NVHE_EL2_TRACING
+#define HYPERVISOR_EVENT_IDS \
+ . = ALIGN(PAGE_SIZE); \
+ __hyp_event_ids_start = .; \
+ *(HYP_SECTION_NAME(.event_ids)) \
+ __hyp_event_ids_end = .;
+#else
+#define HYPERVISOR_EVENT_IDS
+#endif
+
#define HYPERVISOR_RODATA_SECTIONS \
HYP_SECTION_NAME(.rodata) : { \
. = ALIGN(PAGE_SIZE); \
__hyp_rodata_start = .; \
*(HYP_SECTION_NAME(.data..ro_after_init)) \
*(HYP_SECTION_NAME(.rodata)) \
+ HYPERVISOR_EVENT_IDS \
. = ALIGN(PAGE_SIZE); \
__hyp_rodata_end = .; \
}
@@ -308,6 +319,13 @@ SECTIONS
HYPERVISOR_DATA_SECTION
+#ifdef CONFIG_NVHE_EL2_TRACING
+ .data.hyp_events : {
+ __hyp_events_start = .;
+ *(SORT(_hyp_events.*))
+ __hyp_events_end = .;
+ }
+#endif
/*
* Data written with the MMU off but read with the MMU on requires
* cache lines to be invalidated, discarding up to a Cache Writeback
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 7d1f22fd490b..449154f9a485 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -42,9 +42,27 @@ menuconfig KVM
If unsure, say N.
+if KVM
+
+config PTDUMP_STAGE2_DEBUGFS
+ bool "Present the stage-2 pagetables to debugfs"
+ depends on DEBUG_KERNEL
+ depends on DEBUG_FS
+ depends on ARCH_HAS_PTDUMP
+ select PTDUMP
+ default n
+ help
+ Say Y here if you want to show the stage-2 kernel pagetables
+ layout in a debugfs file. This information is only useful for kernel developers
+ who are working in architecture specific areas of the kernel.
+ It is probably not a good idea to enable this feature in a production
+ kernel.
+
+ If in doubt, say N.
+
config NVHE_EL2_DEBUG
bool "Debug mode for non-VHE EL2 object"
- depends on KVM
+ default n
help
Say Y here to enable the debug mode for the non-VHE KVM EL2 object.
Failure reports will BUG() in the hypervisor. This is intended for
@@ -52,11 +70,30 @@ config NVHE_EL2_DEBUG
If unsure, say N.
-config PROTECTED_NVHE_STACKTRACE
- bool "Protected KVM hypervisor stacktraces"
- depends on NVHE_EL2_DEBUG
+if NVHE_EL2_DEBUG
+
+config NVHE_EL2_TRACING
+ bool
+ depends on TRACING && FTRACE
+ select TRACE_REMOTE
+ default y
+
+config PKVM_DISABLE_STAGE2_ON_PANIC
+ bool "Disable the host stage-2 on panic"
default n
help
+ Relax the host stage-2 on hypervisor panic to allow the kernel to
+ unwind and symbolize the hypervisor stacktrace. This however
+ compromises system security. This is intended for local EL2
+ hypervisor development.
+
+ If unsure, say N.
+
+config PKVM_STACKTRACE
+ bool "Protected KVM hypervisor stacktraces"
+ depends on PKVM_DISABLE_STAGE2_ON_PANIC
+ default y
+ help
Say Y here to enable pKVM hypervisor stacktraces on hyp_panic()
If using protected nVHE mode, but cannot afford the associated
@@ -65,21 +102,6 @@ config PROTECTED_NVHE_STACKTRACE
If unsure, or not using protected nVHE (pKVM), say N.
-config PTDUMP_STAGE2_DEBUGFS
- bool "Present the stage-2 pagetables to debugfs"
- depends on KVM
- depends on DEBUG_KERNEL
- depends on DEBUG_FS
- depends on ARCH_HAS_PTDUMP
- select PTDUMP
- default n
- help
- Say Y here if you want to show the stage-2 kernel pagetables
- layout in a debugfs file. This information is only useful for kernel developers
- who are working in architecture specific areas of the kernel.
- It is probably not a good idea to enable this feature in a production
- kernel.
-
- If in doubt, say N.
-
+endif # NVHE_EL2_DEBUG
+endif # KVM
endif # VIRTUALIZATION
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 3ebc0570345c..59612d2f277c 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -30,6 +30,8 @@ kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o
kvm-$(CONFIG_ARM64_PTR_AUTH) += pauth.o
kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o
+kvm-$(CONFIG_NVHE_EL2_TRACING) += hyp_trace.o
+
always-y := hyp_constants.h hyp-constants.s
define rule_gen_hyp_constants
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 600f250753b4..cbea4d9ee955 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -56,6 +56,12 @@ static struct irq_ops arch_timer_irq_ops = {
.get_input_level = kvm_arch_timer_get_input_level,
};
+static struct irq_ops arch_timer_irq_ops_vgic_v5 = {
+ .get_input_level = kvm_arch_timer_get_input_level,
+ .queue_irq_unlock = vgic_v5_ppi_queue_irq_unlock,
+ .set_direct_injection = vgic_v5_set_ppi_dvi,
+};
+
static int nr_timers(struct kvm_vcpu *vcpu)
{
if (!vcpu_has_nv(vcpu))
@@ -447,6 +453,17 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
if (userspace_irqchip(vcpu->kvm))
return;
+ /* Skip injecting on GICv5 for directly injected (DVI'd) timers */
+ if (vgic_is_v5(vcpu->kvm)) {
+ struct timer_map map;
+
+ get_timer_map(vcpu, &map);
+
+ if (map.direct_ptimer == timer_ctx ||
+ map.direct_vtimer == timer_ctx)
+ return;
+ }
+
kvm_vgic_inject_irq(vcpu->kvm, vcpu,
timer_irq(timer_ctx),
timer_ctx->irq.level,
@@ -674,6 +691,7 @@ static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx)
phys_active = kvm_vgic_map_is_active(vcpu, timer_irq(ctx));
phys_active |= ctx->irq.level;
+ phys_active |= vgic_is_v5(vcpu->kvm);
set_timer_irq_phys_active(ctx, phys_active);
}
@@ -740,13 +758,11 @@ static void kvm_timer_vcpu_load_nested_switch(struct kvm_vcpu *vcpu,
ret = kvm_vgic_map_phys_irq(vcpu,
map->direct_vtimer->host_timer_irq,
- timer_irq(map->direct_vtimer),
- &arch_timer_irq_ops);
+ timer_irq(map->direct_vtimer));
WARN_ON_ONCE(ret);
ret = kvm_vgic_map_phys_irq(vcpu,
map->direct_ptimer->host_timer_irq,
- timer_irq(map->direct_ptimer),
- &arch_timer_irq_ops);
+ timer_irq(map->direct_ptimer));
WARN_ON_ONCE(ret);
}
}
@@ -864,7 +880,8 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
get_timer_map(vcpu, &map);
if (static_branch_likely(&has_gic_active_state)) {
- if (vcpu_has_nv(vcpu))
+ /* We don't do NV on GICv5, yet */
+ if (vcpu_has_nv(vcpu) && !vgic_is_v5(vcpu->kvm))
kvm_timer_vcpu_load_nested_switch(vcpu, &map);
kvm_timer_vcpu_load_gic(map.direct_vtimer);
@@ -934,6 +951,12 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
if (kvm_vcpu_is_blocking(vcpu))
kvm_timer_blocking(vcpu);
+
+ if (vgic_is_v5(vcpu->kvm)) {
+ set_timer_irq_phys_active(map.direct_vtimer, false);
+ if (map.direct_ptimer)
+ set_timer_irq_phys_active(map.direct_ptimer, false);
+ }
}
void kvm_timer_sync_nested(struct kvm_vcpu *vcpu)
@@ -1097,10 +1120,19 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
HRTIMER_MODE_ABS_HARD);
}
+/*
+ * This is always called during kvm_arch_init_vm, but will also be
+ * called from kvm_vgic_create if we have a vGICv5.
+ */
void kvm_timer_init_vm(struct kvm *kvm)
{
+ /*
+ * Set up the default PPIs - note that we adjust them based on
+ * the model of the GIC as GICv5 uses a different way to
+ * describing interrupts.
+ */
for (int i = 0; i < NR_KVM_TIMERS; i++)
- kvm->arch.timer_data.ppi[i] = default_ppi[i];
+ kvm->arch.timer_data.ppi[i] = get_vgic_ppi(kvm, default_ppi[i]);
}
void kvm_timer_cpu_up(void)
@@ -1269,7 +1301,15 @@ static int timer_irq_set_irqchip_state(struct irq_data *d,
static void timer_irq_eoi(struct irq_data *d)
{
- if (!irqd_is_forwarded_to_vcpu(d))
+ /*
+ * On a GICv5 host, we still need to call EOI on the parent for
+ * PPIs. The host driver already handles irqs which are forwarded to
+ * vcpus, and skips the GIC CDDI while still doing the GIC CDEOI. This
+ * is required to emulate the EOIMode=1 on GICv5 hardware. Failure to
+ * call EOI unsurprisingly results in *BAD* lock-ups.
+ */
+ if (!irqd_is_forwarded_to_vcpu(d) ||
+ kvm_vgic_global_state.type == VGIC_V5)
irq_chip_eoi_parent(d);
}
@@ -1333,7 +1373,8 @@ static int kvm_irq_init(struct arch_timer_kvm_info *info)
host_vtimer_irq = info->virtual_irq;
kvm_irq_fixup_flags(host_vtimer_irq, &host_vtimer_irq_flags);
- if (kvm_vgic_global_state.no_hw_deactivation) {
+ if (kvm_vgic_global_state.no_hw_deactivation ||
+ kvm_vgic_global_state.type == VGIC_V5) {
struct fwnode_handle *fwnode;
struct irq_data *data;
@@ -1351,7 +1392,8 @@ static int kvm_irq_init(struct arch_timer_kvm_info *info)
return -ENOMEM;
}
- arch_timer_irq_ops.flags |= VGIC_IRQ_SW_RESAMPLE;
+ if (kvm_vgic_global_state.no_hw_deactivation)
+ arch_timer_irq_ops.flags |= VGIC_IRQ_SW_RESAMPLE;
WARN_ON(irq_domain_push_irq(domain, host_vtimer_irq,
(void *)TIMER_VTIMER));
}
@@ -1501,11 +1543,18 @@ static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu)
if (kvm_vgic_set_owner(vcpu, irq, ctx))
break;
+ /* With GICv5, the default PPI is what you get -- nothing else */
+ if (vgic_is_v5(vcpu->kvm) && irq != get_vgic_ppi(vcpu->kvm, default_ppi[i]))
+ break;
+
/*
- * We know by construction that we only have PPIs, so
- * all values are less than 32.
+ * We know by construction that we only have PPIs, so all values
+ * are less than 32 for non-GICv5 VGICs. On GICv5, they are
+ * architecturally defined to be under 32 too. However, we mask
+ * off most of the bits as we might be presented with a GICv5
+ * style PPI where the type is encoded in the top-bits.
*/
- ppis |= BIT(irq);
+ ppis |= BIT(irq & 0x1f);
}
valid = hweight32(ppis) == nr_timers(vcpu);
@@ -1543,6 +1592,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
{
struct arch_timer_cpu *timer = vcpu_timer(vcpu);
struct timer_map map;
+ struct irq_ops *ops;
int ret;
if (timer->enabled)
@@ -1563,20 +1613,22 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
get_timer_map(vcpu, &map);
+ ops = vgic_is_v5(vcpu->kvm) ? &arch_timer_irq_ops_vgic_v5 :
+ &arch_timer_irq_ops;
+
+ for (int i = 0; i < nr_timers(vcpu); i++)
+ kvm_vgic_set_irq_ops(vcpu, timer_irq(vcpu_get_timer(vcpu, i)), ops);
+
ret = kvm_vgic_map_phys_irq(vcpu,
map.direct_vtimer->host_timer_irq,
- timer_irq(map.direct_vtimer),
- &arch_timer_irq_ops);
+ timer_irq(map.direct_vtimer));
if (ret)
return ret;
- if (map.direct_ptimer) {
+ if (map.direct_ptimer)
ret = kvm_vgic_map_phys_irq(vcpu,
map.direct_ptimer->host_timer_irq,
- timer_irq(map.direct_ptimer),
- &arch_timer_irq_ops);
- }
-
+ timer_irq(map.direct_ptimer));
if (ret)
return ret;
@@ -1603,15 +1655,14 @@ int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
if (get_user(irq, uaddr))
return -EFAULT;
- if (!(irq_is_ppi(irq)))
+ if (!(irq_is_ppi(vcpu->kvm, irq)))
return -EINVAL;
- mutex_lock(&vcpu->kvm->arch.config_lock);
+ guard(mutex)(&vcpu->kvm->arch.config_lock);
if (test_bit(KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE,
&vcpu->kvm->arch.flags)) {
- ret = -EBUSY;
- goto out;
+ return -EBUSY;
}
switch (attr->attr) {
@@ -1628,8 +1679,7 @@ int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
idx = TIMER_HPTIMER;
break;
default:
- ret = -ENXIO;
- goto out;
+ return -ENXIO;
}
/*
@@ -1639,8 +1689,6 @@ int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
*/
vcpu->kvm->arch.timer_data.ppi[idx] = irq;
-out:
- mutex_unlock(&vcpu->kvm->arch.config_lock);
return ret;
}
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 410ffd41fd73..176cbe8baad3 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -24,6 +24,7 @@
#define CREATE_TRACE_POINTS
#include "trace_arm.h"
+#include "hyp_trace.h"
#include <linux/uaccess.h>
#include <asm/ptrace.h>
@@ -35,6 +36,7 @@
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_nested.h>
#include <asm/kvm_pkvm.h>
@@ -45,6 +47,9 @@
#include <kvm/arm_hypercalls.h>
#include <kvm/arm_pmu.h>
#include <kvm/arm_psci.h>
+#include <kvm/arm_vgic.h>
+
+#include <linux/irqchip/arm-gic-v5.h>
#include "sys_regs.h"
@@ -203,6 +208,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
int ret;
+ if (type & ~KVM_VM_TYPE_ARM_MASK)
+ return -EINVAL;
+
mutex_init(&kvm->arch.config_lock);
#ifdef CONFIG_LOCKDEP
@@ -234,9 +242,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
* If any failures occur after this is successful, make sure to
* call __pkvm_unreserve_vm to unreserve the VM in hyp.
*/
- ret = pkvm_init_host_vm(kvm);
+ ret = pkvm_init_host_vm(kvm, type);
if (ret)
- goto err_free_cpumask;
+ goto err_uninit_mmu;
+ } else if (type & KVM_VM_TYPE_ARM_PROTECTED) {
+ ret = -EINVAL;
+ goto err_uninit_mmu;
}
kvm_vgic_early_init(kvm);
@@ -252,6 +263,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
return 0;
+err_uninit_mmu:
+ kvm_uninit_stage2_mmu(kvm);
err_free_cpumask:
free_cpumask_var(kvm->arch.supported_cpus);
err_unshare_kvm:
@@ -301,6 +314,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
if (is_protected_kvm_enabled())
pkvm_destroy_hyp_vm(kvm);
+ kvm_uninit_stage2_mmu(kvm);
kvm_destroy_mpidr_data(kvm);
kfree(kvm->arch.sysreg_masks);
@@ -613,6 +627,9 @@ static bool kvm_vcpu_should_clear_twi(struct kvm_vcpu *vcpu)
if (unlikely(kvm_wfi_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK))
return kvm_wfi_trap_policy == KVM_WFX_NOTRAP;
+ if (vgic_is_v5(vcpu->kvm))
+ return single_task_running();
+
return single_task_running() &&
vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 &&
(atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count) ||
@@ -705,6 +722,8 @@ nommu:
if (!cpumask_test_cpu(cpu, vcpu->kvm->arch.supported_cpus))
vcpu_set_on_unsupported_cpu(vcpu);
+
+ vcpu->arch.pid = pid_nr(vcpu->pid);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -934,6 +953,10 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
return ret;
}
+ ret = vgic_v5_finalize_ppi_state(kvm);
+ if (ret)
+ return ret;
+
if (is_protected_kvm_enabled()) {
ret = pkvm_create_hyp_vm(kvm);
if (ret)
@@ -1439,10 +1462,11 @@ static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
bool line_status)
{
- u32 irq = irq_level->irq;
unsigned int irq_type, vcpu_id, irq_num;
struct kvm_vcpu *vcpu = NULL;
bool level = irq_level->level;
+ u32 irq = irq_level->irq;
+ unsigned long *mask;
irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK;
vcpu_id = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK;
@@ -1472,16 +1496,37 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
if (!vcpu)
return -EINVAL;
- if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS)
+ if (vgic_is_v5(kvm)) {
+ if (irq_num >= VGIC_V5_NR_PRIVATE_IRQS)
+ return -EINVAL;
+
+ /*
+ * Only allow PPIs that are explicitly exposed to
+	 * userspace to be driven via KVM_IRQ_LINE
+ */
+ mask = kvm->arch.vgic.gicv5_vm.userspace_ppis;
+ if (!test_bit(irq_num, mask))
+ return -EINVAL;
+
+ /* Build a GICv5-style IntID here */
+ irq_num = vgic_v5_make_ppi(irq_num);
+ } else if (irq_num < VGIC_NR_SGIS ||
+ irq_num >= VGIC_NR_PRIVATE_IRQS) {
return -EINVAL;
+ }
return kvm_vgic_inject_irq(kvm, vcpu, irq_num, level, NULL);
case KVM_ARM_IRQ_TYPE_SPI:
if (!irqchip_in_kernel(kvm))
return -ENXIO;
- if (irq_num < VGIC_NR_PRIVATE_IRQS)
- return -EINVAL;
+ if (vgic_is_v5(kvm)) {
+ /* Build a GICv5-style IntID here */
+ irq_num = vgic_v5_make_spi(irq_num);
+ } else {
+ if (irq_num < VGIC_NR_PRIVATE_IRQS)
+ return -EINVAL;
+ }
return kvm_vgic_inject_irq(kvm, NULL, irq_num, level, NULL);
}
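For GICv5, the raw PPI/SPI number from userspace is rewritten into a typed INTID before injection via vgic_v5_make_ppi()/vgic_v5_make_spi(). A sketch of the idea with a placeholder field layout (the real encoding is defined by the GICv5 architecture, not by these constants):

#include <linux/bitfield.h>

#define DEMO_INTID_TYPE		GENMASK(31, 29)	/* placeholder position */
#define DEMO_INTID_ID		GENMASK(23, 0)	/* placeholder width */
#define DEMO_INTID_TYPE_PPI	1		/* placeholder value */
#define DEMO_INTID_TYPE_SPI	3		/* placeholder value */

static inline u32 demo_make_intid(u32 type, u32 id)
{
	return FIELD_PREP(DEMO_INTID_TYPE, type) |
	       FIELD_PREP(DEMO_INTID_ID, id);
}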
@@ -2414,6 +2459,10 @@ static int __init init_subsystems(void)
kvm_register_perf_callbacks();
+ err = kvm_hyp_trace_init();
+ if (err)
+ kvm_err("Failed to initialize Hyp tracing\n");
+
out:
if (err)
hyp_cpu_pm_exit();
@@ -2465,7 +2514,7 @@ static int __init do_pkvm_init(u32 hyp_va_bits)
preempt_disable();
cpu_hyp_init_context();
ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size,
- num_possible_cpus(), kern_hyp_va(per_cpu_base),
+ kern_hyp_va(per_cpu_base),
hyp_va_bits);
cpu_hyp_init_features();
@@ -2507,6 +2556,7 @@ static void kvm_hyp_init_symbols(void)
{
kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = get_hyp_id_aa64pfr0_el1();
kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
+ kvm_nvhe_sym(id_aa64pfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR2_EL1);
kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1);
kvm_nvhe_sym(id_aa64isar1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1);
kvm_nvhe_sym(id_aa64isar2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1);
@@ -2529,6 +2579,9 @@ static void kvm_hyp_init_symbols(void)
kvm_nvhe_sym(hfgitr2_masks) = hfgitr2_masks;
kvm_nvhe_sym(hdfgrtr2_masks)= hdfgrtr2_masks;
kvm_nvhe_sym(hdfgwtr2_masks)= hdfgwtr2_masks;
+ kvm_nvhe_sym(ich_hfgrtr_masks) = ich_hfgrtr_masks;
+ kvm_nvhe_sym(ich_hfgwtr_masks) = ich_hfgwtr_masks;
+ kvm_nvhe_sym(ich_hfgitr_masks) = ich_hfgitr_masks;
/*
* Flush entire BSS since part of its data containing init symbols is read
@@ -2674,6 +2727,8 @@ static int __init init_hyp_mode(void)
kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu] = (unsigned long)page_addr;
}
+ kvm_nvhe_sym(hyp_nr_cpus) = num_possible_cpus();
+
/*
* Map the Hyp-code called directly from the host
*/
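Passing hyp_nr_cpus by writing the hyp symbol directly (rather than as an HVC argument, as before) relies on the kvm_nvhe_sym() name-mangling convention: nVHE objects are compiled with a __kvm_nvhe_ prefix, and the kernel may write them freely before EL2 takes ownership. A minimal model of the pattern (the prefix is the real convention; the demo names are invented):

#define demo_nvhe_sym(sym)	__kvm_nvhe_##sym

/* Defined in the hyp object as 'hyp_nr_cpus', seen here mangled. */
extern unsigned long demo_nvhe_sym(hyp_nr_cpus);

static void demo_set_hyp_nr_cpus(unsigned long n)
{
	/* Safe only while the hypervisor has not yet been installed. */
	demo_nvhe_sym(hyp_nr_cpus) = n;
}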
diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c
index d9f553cbf9df..f35b8dddd7c1 100644
--- a/arch/arm64/kvm/config.c
+++ b/arch/arm64/kvm/config.c
@@ -225,6 +225,7 @@ struct reg_feat_map_desc {
#define FEAT_MTPMU ID_AA64DFR0_EL1, MTPMU, IMP
#define FEAT_HCX ID_AA64MMFR1_EL1, HCX, IMP
#define FEAT_S2PIE ID_AA64MMFR3_EL1, S2PIE, IMP
+#define FEAT_GCIE ID_AA64PFR2_EL1, GCIE, IMP
static bool not_feat_aa64el3(struct kvm *kvm)
{
@@ -1277,6 +1278,58 @@ static const struct reg_bits_to_feat_map vtcr_el2_feat_map[] = {
static const DECLARE_FEAT_MAP(vtcr_el2_desc, VTCR_EL2,
vtcr_el2_feat_map, FEAT_AA64EL2);
+static const struct reg_bits_to_feat_map ich_hfgrtr_feat_map[] = {
+ NEEDS_FEAT(ICH_HFGRTR_EL2_ICC_APR_EL1 |
+ ICH_HFGRTR_EL2_ICC_IDRn_EL1 |
+ ICH_HFGRTR_EL2_ICC_CR0_EL1 |
+ ICH_HFGRTR_EL2_ICC_HPPIR_EL1 |
+ ICH_HFGRTR_EL2_ICC_PCR_EL1 |
+ ICH_HFGRTR_EL2_ICC_ICSR_EL1 |
+ ICH_HFGRTR_EL2_ICC_IAFFIDR_EL1 |
+ ICH_HFGRTR_EL2_ICC_PPI_HMRn_EL1 |
+ ICH_HFGRTR_EL2_ICC_PPI_ENABLERn_EL1 |
+ ICH_HFGRTR_EL2_ICC_PPI_PENDRn_EL1 |
+ ICH_HFGRTR_EL2_ICC_PPI_PRIORITYRn_EL1 |
+ ICH_HFGRTR_EL2_ICC_PPI_ACTIVERn_EL1,
+ FEAT_GCIE),
+};
+
+static const DECLARE_FEAT_MAP_FGT(ich_hfgrtr_desc, ich_hfgrtr_masks,
+ ich_hfgrtr_feat_map, FEAT_GCIE);
+
+static const struct reg_bits_to_feat_map ich_hfgwtr_feat_map[] = {
+ NEEDS_FEAT(ICH_HFGWTR_EL2_ICC_APR_EL1 |
+ ICH_HFGWTR_EL2_ICC_CR0_EL1 |
+ ICH_HFGWTR_EL2_ICC_PCR_EL1 |
+ ICH_HFGWTR_EL2_ICC_ICSR_EL1 |
+ ICH_HFGWTR_EL2_ICC_PPI_ENABLERn_EL1 |
+ ICH_HFGWTR_EL2_ICC_PPI_PENDRn_EL1 |
+ ICH_HFGWTR_EL2_ICC_PPI_PRIORITYRn_EL1 |
+ ICH_HFGWTR_EL2_ICC_PPI_ACTIVERn_EL1,
+ FEAT_GCIE),
+};
+
+static const DECLARE_FEAT_MAP_FGT(ich_hfgwtr_desc, ich_hfgwtr_masks,
+ ich_hfgwtr_feat_map, FEAT_GCIE);
+
+static const struct reg_bits_to_feat_map ich_hfgitr_feat_map[] = {
+ NEEDS_FEAT(ICH_HFGITR_EL2_GICCDEN |
+ ICH_HFGITR_EL2_GICCDDIS |
+ ICH_HFGITR_EL2_GICCDPRI |
+ ICH_HFGITR_EL2_GICCDAFF |
+ ICH_HFGITR_EL2_GICCDPEND |
+ ICH_HFGITR_EL2_GICCDRCFG |
+ ICH_HFGITR_EL2_GICCDHM |
+ ICH_HFGITR_EL2_GICCDEOI |
+ ICH_HFGITR_EL2_GICCDDI |
+ ICH_HFGITR_EL2_GICRCDIA |
+ ICH_HFGITR_EL2_GICRCDNMIA,
+ FEAT_GCIE),
+};
+
+static const DECLARE_FEAT_MAP_FGT(ich_hfgitr_desc, ich_hfgitr_masks,
+ ich_hfgitr_feat_map, FEAT_GCIE);
+
static void __init check_feat_map(const struct reg_bits_to_feat_map *map,
int map_size, u64 resx, const char *str)
{
@@ -1328,6 +1381,9 @@ void __init check_feature_map(void)
check_reg_desc(&sctlr_el2_desc);
check_reg_desc(&mdcr_el2_desc);
check_reg_desc(&vtcr_el2_desc);
+ check_reg_desc(&ich_hfgrtr_desc);
+ check_reg_desc(&ich_hfgwtr_desc);
+ check_reg_desc(&ich_hfgitr_desc);
}
static bool idreg_feat_match(struct kvm *kvm, const struct reg_bits_to_feat_map *map)
@@ -1460,6 +1516,13 @@ void compute_fgu(struct kvm *kvm, enum fgt_group_id fgt)
val |= compute_fgu_bits(kvm, &hdfgrtr2_desc);
val |= compute_fgu_bits(kvm, &hdfgwtr2_desc);
break;
+ case ICH_HFGRTR_GROUP:
+ val |= compute_fgu_bits(kvm, &ich_hfgrtr_desc);
+ val |= compute_fgu_bits(kvm, &ich_hfgwtr_desc);
+ break;
+ case ICH_HFGITR_GROUP:
+ val |= compute_fgu_bits(kvm, &ich_hfgitr_desc);
+ break;
default:
BUG();
}
@@ -1531,6 +1594,15 @@ struct resx get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg)
case VTCR_EL2:
resx = compute_reg_resx_bits(kvm, &vtcr_el2_desc, 0, 0);
break;
+ case ICH_HFGRTR_EL2:
+ resx = compute_reg_resx_bits(kvm, &ich_hfgrtr_desc, 0, 0);
+ break;
+ case ICH_HFGWTR_EL2:
+ resx = compute_reg_resx_bits(kvm, &ich_hfgwtr_desc, 0, 0);
+ break;
+ case ICH_HFGITR_EL2:
+ resx = compute_reg_resx_bits(kvm, &ich_hfgitr_desc, 0, 0);
+ break;
default:
WARN_ON_ONCE(1);
resx = (typeof(resx)){};
@@ -1565,6 +1637,12 @@ static __always_inline struct fgt_masks *__fgt_reg_to_masks(enum vcpu_sysreg reg
return &hdfgrtr2_masks;
case HDFGWTR2_EL2:
return &hdfgwtr2_masks;
+ case ICH_HFGRTR_EL2:
+ return &ich_hfgrtr_masks;
+ case ICH_HFGWTR_EL2:
+ return &ich_hfgwtr_masks;
+ case ICH_HFGITR_EL2:
+ return &ich_hfgitr_masks;
default:
BUILD_BUG_ON(1);
}
@@ -1585,8 +1663,8 @@ static __always_inline void __compute_fgt(struct kvm_vcpu *vcpu, enum vcpu_sysre
clear |= ~nested & m->nmask;
}
- val |= set;
- val &= ~clear;
+ val |= set | m->res1;
+ val &= ~(clear | m->res0);
*vcpu_fgt(vcpu, reg) = val;
}
@@ -1606,6 +1684,32 @@ static void __compute_hdfgwtr(struct kvm_vcpu *vcpu)
*vcpu_fgt(vcpu, HDFGWTR_EL2) |= HDFGWTR_EL2_MDSCR_EL1;
}
+static void __compute_ich_hfgrtr(struct kvm_vcpu *vcpu)
+{
+ __compute_fgt(vcpu, ICH_HFGRTR_EL2);
+
+ /*
+ * ICC_IAFFIDR_EL1 *always* needs to be trapped when running a guest.
+ *
+ * We also trap accesses to ICC_IDR0_EL1 to allow us to completely hide
+ * FEAT_GCIE_LEGACY from the guest, and to (potentially) present fewer
+ * ID bits than the host supports.
+ */
+ *vcpu_fgt(vcpu, ICH_HFGRTR_EL2) &= ~(ICH_HFGRTR_EL2_ICC_IAFFIDR_EL1 |
+ ICH_HFGRTR_EL2_ICC_IDRn_EL1);
+}
+
+static void __compute_ich_hfgwtr(struct kvm_vcpu *vcpu)
+{
+ __compute_fgt(vcpu, ICH_HFGWTR_EL2);
+
+ /*
+	 * We present a different subset of PPIs to the guest from what
+	 * exists in real hardware. We only trap writes, not reads.
+ */
+ *vcpu_fgt(vcpu, ICH_HFGWTR_EL2) &= ~(ICH_HFGWTR_EL2_ICC_PPI_ENABLERn_EL1);
+}
+
void kvm_vcpu_load_fgt(struct kvm_vcpu *vcpu)
{
if (!cpus_have_final_cap(ARM64_HAS_FGT))
@@ -1618,12 +1722,17 @@ void kvm_vcpu_load_fgt(struct kvm_vcpu *vcpu)
__compute_hdfgwtr(vcpu);
__compute_fgt(vcpu, HAFGRTR_EL2);
- if (!cpus_have_final_cap(ARM64_HAS_FGT2))
- return;
+ if (cpus_have_final_cap(ARM64_HAS_FGT2)) {
+ __compute_fgt(vcpu, HFGRTR2_EL2);
+ __compute_fgt(vcpu, HFGWTR2_EL2);
+ __compute_fgt(vcpu, HFGITR2_EL2);
+ __compute_fgt(vcpu, HDFGRTR2_EL2);
+ __compute_fgt(vcpu, HDFGWTR2_EL2);
+ }
- __compute_fgt(vcpu, HFGRTR2_EL2);
- __compute_fgt(vcpu, HFGWTR2_EL2);
- __compute_fgt(vcpu, HFGITR2_EL2);
- __compute_fgt(vcpu, HDFGRTR2_EL2);
- __compute_fgt(vcpu, HDFGWTR2_EL2);
+ if (cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) {
+ __compute_ich_hfgrtr(vcpu);
+ __compute_ich_hfgwtr(vcpu);
+ __compute_fgt(vcpu, ICH_HFGITR_EL2);
+ }
}
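The two-line change in __compute_fgt() guarantees that whatever the trap computation produced, the final value honours the register's reserved bits. A minimal model of the clamping, assuming a masks structure shaped like struct fgt_masks:

struct demo_masks {
	u64 res0;	/* bits architecturally required to be 0 */
	u64 res1;	/* bits architecturally required to be 1 */
};

static u64 demo_clamp_fgt(u64 val, const struct demo_masks *m)
{
	val |= m->res1;		/* force RES1 bits on */
	val &= ~m->res0;	/* force RES0 bits off */
	return val;
}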
diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c
index 22d497554c94..dba7ced74ca5 100644
--- a/arch/arm64/kvm/emulate-nested.c
+++ b/arch/arm64/kvm/emulate-nested.c
@@ -2053,6 +2053,60 @@ static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = {
SR_FGT(SYS_AMEVCNTR0_EL0(2), HAFGRTR, AMEVCNTR02_EL0, 1),
SR_FGT(SYS_AMEVCNTR0_EL0(1), HAFGRTR, AMEVCNTR01_EL0, 1),
SR_FGT(SYS_AMEVCNTR0_EL0(0), HAFGRTR, AMEVCNTR00_EL0, 1),
+
+ /*
+ * ICH_HFGRTR_EL2 & ICH_HFGWTR_EL2
+ */
+ SR_FGT(SYS_ICC_APR_EL1, ICH_HFGRTR, ICC_APR_EL1, 0),
+ SR_FGT(SYS_ICC_IDR0_EL1, ICH_HFGRTR, ICC_IDRn_EL1, 0),
+ SR_FGT(SYS_ICC_CR0_EL1, ICH_HFGRTR, ICC_CR0_EL1, 0),
+ SR_FGT(SYS_ICC_HPPIR_EL1, ICH_HFGRTR, ICC_HPPIR_EL1, 0),
+ SR_FGT(SYS_ICC_PCR_EL1, ICH_HFGRTR, ICC_PCR_EL1, 0),
+ SR_FGT(SYS_ICC_ICSR_EL1, ICH_HFGRTR, ICC_ICSR_EL1, 0),
+ SR_FGT(SYS_ICC_IAFFIDR_EL1, ICH_HFGRTR, ICC_IAFFIDR_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_HMR0_EL1, ICH_HFGRTR, ICC_PPI_HMRn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_HMR1_EL1, ICH_HFGRTR, ICC_PPI_HMRn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_ENABLER0_EL1, ICH_HFGRTR, ICC_PPI_ENABLERn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_ENABLER1_EL1, ICH_HFGRTR, ICC_PPI_ENABLERn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_CPENDR0_EL1, ICH_HFGRTR, ICC_PPI_PENDRn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_CPENDR1_EL1, ICH_HFGRTR, ICC_PPI_PENDRn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_SPENDR0_EL1, ICH_HFGRTR, ICC_PPI_PENDRn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_SPENDR1_EL1, ICH_HFGRTR, ICC_PPI_PENDRn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_PRIORITYR0_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_PRIORITYR1_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_PRIORITYR2_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_PRIORITYR3_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_PRIORITYR4_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_PRIORITYR5_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_PRIORITYR6_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_PRIORITYR7_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_PRIORITYR8_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_PRIORITYR9_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_PRIORITYR10_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_PRIORITYR11_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_PRIORITYR12_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_PRIORITYR13_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_PRIORITYR14_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_PRIORITYR15_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_CACTIVER0_EL1, ICH_HFGRTR, ICC_PPI_ACTIVERn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_CACTIVER1_EL1, ICH_HFGRTR, ICC_PPI_ACTIVERn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_SACTIVER0_EL1, ICH_HFGRTR, ICC_PPI_ACTIVERn_EL1, 0),
+ SR_FGT(SYS_ICC_PPI_SACTIVER1_EL1, ICH_HFGRTR, ICC_PPI_ACTIVERn_EL1, 0),
+
+ /*
+ * ICH_HFGITR_EL2
+ */
+ SR_FGT(GICV5_OP_GIC_CDEN, ICH_HFGITR, GICCDEN, 0),
+ SR_FGT(GICV5_OP_GIC_CDDIS, ICH_HFGITR, GICCDDIS, 0),
+ SR_FGT(GICV5_OP_GIC_CDPRI, ICH_HFGITR, GICCDPRI, 0),
+ SR_FGT(GICV5_OP_GIC_CDAFF, ICH_HFGITR, GICCDAFF, 0),
+ SR_FGT(GICV5_OP_GIC_CDPEND, ICH_HFGITR, GICCDPEND, 0),
+ SR_FGT(GICV5_OP_GIC_CDRCFG, ICH_HFGITR, GICCDRCFG, 0),
+ SR_FGT(GICV5_OP_GIC_CDHM, ICH_HFGITR, GICCDHM, 0),
+ SR_FGT(GICV5_OP_GIC_CDEOI, ICH_HFGITR, GICCDEOI, 0),
+ SR_FGT(GICV5_OP_GIC_CDDI, ICH_HFGITR, GICCDDI, 0),
+ SR_FGT(GICV5_OP_GICR_CDIA, ICH_HFGITR, GICRCDIA, 0),
+ SR_FGT(GICV5_OP_GICR_CDNMIA, ICH_HFGITR, GICRCDNMIA, 0),
};
/*
@@ -2127,6 +2181,9 @@ FGT_MASKS(hfgwtr2_masks, HFGWTR2_EL2);
FGT_MASKS(hfgitr2_masks, HFGITR2_EL2);
FGT_MASKS(hdfgrtr2_masks, HDFGRTR2_EL2);
FGT_MASKS(hdfgwtr2_masks, HDFGWTR2_EL2);
+FGT_MASKS(ich_hfgrtr_masks, ICH_HFGRTR_EL2);
+FGT_MASKS(ich_hfgwtr_masks, ICH_HFGWTR_EL2);
+FGT_MASKS(ich_hfgitr_masks, ICH_HFGITR_EL2);
static __init bool aggregate_fgt(union trap_config tc)
{
@@ -2162,6 +2219,14 @@ static __init bool aggregate_fgt(union trap_config tc)
rmasks = &hfgitr2_masks;
wmasks = NULL;
break;
+ case ICH_HFGRTR_GROUP:
+ rmasks = &ich_hfgrtr_masks;
+ wmasks = &ich_hfgwtr_masks;
+ break;
+ case ICH_HFGITR_GROUP:
+ rmasks = &ich_hfgitr_masks;
+ wmasks = NULL;
+ break;
}
rresx = rmasks->res0 | rmasks->res1;
@@ -2232,6 +2297,9 @@ static __init int check_all_fgt_masks(int ret)
&hfgitr2_masks,
&hdfgrtr2_masks,
&hdfgwtr2_masks,
+ &ich_hfgrtr_masks,
+ &ich_hfgwtr_masks,
+ &ich_hfgitr_masks,
};
int err = 0;
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index cc7d5d1709cb..54aedf93c78b 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -539,7 +539,7 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
/* All hyp bugs, including warnings, are treated as fatal. */
if (!is_protected_kvm_enabled() ||
- IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) {
+ IS_ENABLED(CONFIG_PKVM_DISABLE_STAGE2_ON_PANIC)) {
struct bug_entry *bug = find_bug(elr_in_kimg);
if (bug)
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 2597e8bda867..ae04fd680d1e 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -233,6 +233,18 @@ static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu)
__activate_fgt(hctxt, vcpu, HDFGWTR2_EL2);
}
+static inline void __activate_traps_ich_hfgxtr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt);
+
+ if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF))
+ return;
+
+ __activate_fgt(hctxt, vcpu, ICH_HFGRTR_EL2);
+ __activate_fgt(hctxt, vcpu, ICH_HFGWTR_EL2);
+ __activate_fgt(hctxt, vcpu, ICH_HFGITR_EL2);
+}
+
#define __deactivate_fgt(htcxt, vcpu, reg) \
do { \
write_sysreg_s(ctxt_sys_reg(hctxt, reg), \
@@ -265,6 +277,19 @@ static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu)
__deactivate_fgt(hctxt, vcpu, HDFGWTR2_EL2);
}
+static inline void __deactivate_traps_ich_hfgxtr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt);
+
+ if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF))
+ return;
+
+ __deactivate_fgt(hctxt, vcpu, ICH_HFGRTR_EL2);
+ __deactivate_fgt(hctxt, vcpu, ICH_HFGWTR_EL2);
+ __deactivate_fgt(hctxt, vcpu, ICH_HFGITR_EL2);
+}
+
static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu)
{
u64 r = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1;
@@ -328,6 +353,7 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
}
__activate_traps_hfgxtr(vcpu);
+ __activate_traps_ich_hfgxtr(vcpu);
__activate_traps_mpam(vcpu);
}
@@ -345,6 +371,7 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
write_sysreg_s(ctxt_sys_reg(hctxt, HCRX_EL2), SYS_HCRX_EL2);
__deactivate_traps_hfgxtr(vcpu);
+ __deactivate_traps_ich_hfgxtr(vcpu);
__deactivate_traps_mpam();
}
diff --git a/arch/arm64/kvm/hyp/include/nvhe/arm-smccc.h b/arch/arm64/kvm/hyp/include/nvhe/arm-smccc.h
new file mode 100644
index 000000000000..1258bc84477f
--- /dev/null
+++ b/arch/arm64/kvm/hyp/include/nvhe/arm-smccc.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __ARM64_KVM_HYP_NVHE_ARM_SMCCC_H__
+#define __ARM64_KVM_HYP_NVHE_ARM_SMCCC_H__
+
+#include <asm/kvm_hypevents.h>
+
+#include <linux/arm-smccc.h>
+
+#define hyp_smccc_1_1_smc(...) \
+ do { \
+ trace_hyp_exit(NULL, HYP_REASON_SMC); \
+ arm_smccc_1_1_smc(__VA_ARGS__); \
+ trace_hyp_enter(NULL, HYP_REASON_SMC); \
+ } while (0)
+
+#define hyp_smccc_1_2_smc(...) \
+ do { \
+ trace_hyp_exit(NULL, HYP_REASON_SMC); \
+ arm_smccc_1_2_smc(__VA_ARGS__); \
+ trace_hyp_enter(NULL, HYP_REASON_SMC); \
+ } while (0)
+
+#endif /* __ARM64_KVM_HYP_NVHE_ARM_SMCCC_H__ */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/clock.h b/arch/arm64/kvm/hyp/include/nvhe/clock.h
new file mode 100644
index 000000000000..9f429f5c0664
--- /dev/null
+++ b/arch/arm64/kvm/hyp/include/nvhe/clock.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ARM64_KVM_HYP_NVHE_CLOCK_H
+#define __ARM64_KVM_HYP_NVHE_CLOCK_H
+#include <linux/types.h>
+
+#include <asm/kvm_hyp.h>
+
+#ifdef CONFIG_NVHE_EL2_TRACING
+void trace_clock_update(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc);
+u64 trace_clock(void);
+#else
+static inline void
+trace_clock_update(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc) { }
+static inline u64 trace_clock(void) { return 0; }
+#endif
+#endif
diff --git a/arch/arm64/kvm/hyp/include/nvhe/define_events.h b/arch/arm64/kvm/hyp/include/nvhe/define_events.h
new file mode 100644
index 000000000000..776d4c6cb702
--- /dev/null
+++ b/arch/arm64/kvm/hyp/include/nvhe/define_events.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#undef HYP_EVENT
+#define HYP_EVENT(__name, __proto, __struct, __assign, __printk) \
+ struct hyp_event_id hyp_event_id_##__name \
+ __section(".hyp.event_ids."#__name) = { \
+ .enabled = ATOMIC_INIT(0), \
+ }
+
+#define HYP_EVENT_MULTI_READ
+#include <asm/kvm_hypevents.h>
+#undef HYP_EVENT_MULTI_READ
+
+#undef HYP_EVENT
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
index 5f9d56754e39..3cbfae0e3dda 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@ -27,18 +27,22 @@ extern struct host_mmu host_mmu;
enum pkvm_component_id {
PKVM_ID_HOST,
PKVM_ID_HYP,
- PKVM_ID_FFA,
+ PKVM_ID_GUEST,
};
-extern unsigned long hyp_nr_cpus;
-
int __pkvm_prot_finalize(void);
int __pkvm_host_share_hyp(u64 pfn);
+int __pkvm_guest_share_host(struct pkvm_hyp_vcpu *vcpu, u64 gfn);
+int __pkvm_guest_unshare_host(struct pkvm_hyp_vcpu *vcpu, u64 gfn);
int __pkvm_host_unshare_hyp(u64 pfn);
int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages);
int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages);
int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages);
int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages);
+int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu);
+int __pkvm_vcpu_in_poison_fault(struct pkvm_hyp_vcpu *hyp_vcpu);
+int __pkvm_host_force_reclaim_page_guest(phys_addr_t phys);
+int __pkvm_host_reclaim_page_guest(u64 gfn, struct pkvm_hyp_vm *vm);
int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu *vcpu,
enum kvm_pgtable_prot prot);
int __pkvm_host_unshare_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *hyp_vm);
@@ -70,6 +74,8 @@ static __always_inline void __load_host_stage2(void)
#ifdef CONFIG_NVHE_EL2_DEBUG
void pkvm_ownership_selftest(void *base);
+struct pkvm_hyp_vcpu *init_selftest_vm(void *virt);
+void teardown_selftest_vm(void);
#else
static inline void pkvm_ownership_selftest(void *base) { }
#endif
diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h
index dee1a406b0c2..b50712d47f6d 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/memory.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h
@@ -30,8 +30,14 @@ enum pkvm_page_state {
* struct hyp_page.
*/
PKVM_NOPAGE = BIT(0) | BIT(1),
+
+ /*
+ * 'Meta-states' which aren't encoded directly in the PTE's SW bits (or
+ * the hyp_vmemmap entry for the host)
+ */
+ PKVM_POISON = BIT(2),
};
-#define PKVM_PAGE_STATE_MASK (BIT(0) | BIT(1))
+#define PKVM_PAGE_STATE_VMEMMAP_MASK (BIT(0) | BIT(1))
#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1)
static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot,
@@ -108,12 +114,12 @@ static inline void set_host_state(struct hyp_page *p, enum pkvm_page_state state
static inline enum pkvm_page_state get_hyp_state(struct hyp_page *p)
{
- return p->__hyp_state_comp ^ PKVM_PAGE_STATE_MASK;
+ return p->__hyp_state_comp ^ PKVM_PAGE_STATE_VMEMMAP_MASK;
}
static inline void set_hyp_state(struct hyp_page *p, enum pkvm_page_state state)
{
- p->__hyp_state_comp = state ^ PKVM_PAGE_STATE_MASK;
+ p->__hyp_state_comp = state ^ PKVM_PAGE_STATE_VMEMMAP_MASK;
}
/*
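The rename makes it explicit that only the two-bit ownership state is encoded in the PTE software bits (or the vmemmap entry), while PKVM_POISON is a meta-state tracked elsewhere. A sketch of packing a two-bit state into SW bits, with illustrative bit positions standing in for KVM_PGTABLE_PROT_SW0/SW1:

#include <linux/bits.h>

#define DEMO_SW0	BIT(55)	/* illustrative position */
#define DEMO_SW1	BIT(56)	/* illustrative position */

static u64 demo_pte_set_state(u64 pte, unsigned int state)
{
	pte &= ~(DEMO_SW0 | DEMO_SW1);
	if (state & BIT(0))
		pte |= DEMO_SW0;
	if (state & BIT(1))
		pte |= DEMO_SW1;
	return pte;	/* meta-states like PKVM_POISON never land here */
}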
diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
index 184ad7a39950..c904647d2f76 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
@@ -73,8 +73,12 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
unsigned long pgd_hva);
int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu,
unsigned long vcpu_hva);
-int __pkvm_teardown_vm(pkvm_handle_t handle);
+int __pkvm_reclaim_dying_guest_page(pkvm_handle_t handle, u64 gfn);
+int __pkvm_start_teardown_vm(pkvm_handle_t handle);
+int __pkvm_finalize_teardown_vm(pkvm_handle_t handle);
+
+struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle);
struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle,
unsigned int vcpu_idx);
void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu);
@@ -84,6 +88,7 @@ struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle);
struct pkvm_hyp_vm *get_np_pkvm_hyp_vm(pkvm_handle_t handle);
void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm);
+bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code);
bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code);
bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code);
void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/kvm/hyp/include/nvhe/trace.h b/arch/arm64/kvm/hyp/include/nvhe/trace.h
new file mode 100644
index 000000000000..8813ff250f8e
--- /dev/null
+++ b/arch/arm64/kvm/hyp/include/nvhe/trace.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __ARM64_KVM_HYP_NVHE_TRACE_H
+#define __ARM64_KVM_HYP_NVHE_TRACE_H
+
+#include <linux/trace_remote_event.h>
+
+#include <asm/kvm_hyptrace.h>
+
+static inline pid_t __tracing_get_vcpu_pid(struct kvm_cpu_context *host_ctxt)
+{
+ struct kvm_vcpu *vcpu;
+
+ if (!host_ctxt)
+ host_ctxt = host_data_ptr(host_ctxt);
+
+ vcpu = host_ctxt->__hyp_running_vcpu;
+
+ return vcpu ? vcpu->arch.pid : 0;
+}
+
+#define HE_PROTO(__args...) __args
+#define HE_ASSIGN(__args...) __args
+#define HE_STRUCT RE_STRUCT
+#define he_field re_field
+
+#ifdef CONFIG_NVHE_EL2_TRACING
+
+#define HYP_EVENT(__name, __proto, __struct, __assign, __printk) \
+ REMOTE_EVENT_FORMAT(__name, __struct); \
+ extern struct hyp_event_id hyp_event_id_##__name; \
+ static __always_inline void trace_##__name(__proto) \
+ { \
+ struct remote_event_format_##__name *__entry; \
+ size_t length = sizeof(*__entry); \
+ \
+ if (!atomic_read(&hyp_event_id_##__name.enabled)) \
+ return; \
+ __entry = tracing_reserve_entry(length); \
+ if (!__entry) \
+ return; \
+ __entry->hdr.id = hyp_event_id_##__name.id; \
+ __assign \
+ tracing_commit_entry(); \
+ }
+
+void *tracing_reserve_entry(unsigned long length);
+void tracing_commit_entry(void);
+
+int __tracing_load(unsigned long desc_va, size_t desc_size);
+void __tracing_unload(void);
+int __tracing_enable(bool enable);
+int __tracing_swap_reader(unsigned int cpu);
+void __tracing_update_clock(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc);
+int __tracing_reset(unsigned int cpu);
+int __tracing_enable_event(unsigned short id, bool enable);
+#else
+static inline void *tracing_reserve_entry(unsigned long length) { return NULL; }
+static inline void tracing_commit_entry(void) { }
+#define HYP_EVENT(__name, __proto, __struct, __assign, __printk) \
+ static inline void trace_##__name(__proto) {}
+
+static inline int __tracing_load(unsigned long desc_va, size_t desc_size) { return -ENODEV; }
+static inline void __tracing_unload(void) { }
+static inline int __tracing_enable(bool enable) { return -ENODEV; }
+static inline int __tracing_swap_reader(unsigned int cpu) { return -ENODEV; }
+static inline void __tracing_update_clock(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc) { }
+static inline int __tracing_reset(unsigned int cpu) { return -ENODEV; }
+static inline int __tracing_enable_event(unsigned short id, bool enable) { return -ENODEV; }
+#endif
+#endif
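Each HYP_EVENT() expansion above produces a remote-event format plus an inline trace_<name>() that checks the enable flag, reserves ring-buffer space, fills the entry, and commits it. A hypothetical event written against these macros might look like this (demo_wfi and its field are invented, and an HE_PRINTK helper is assumed to exist alongside the HE_* macros shown):

HYP_EVENT(demo_wfi,
	HE_PROTO(u64 vcpu_id),
	HE_STRUCT(
		he_field(u64, vcpu_id)
	),
	HE_ASSIGN(
		__entry->vcpu_id = vcpu_id;
	),
	HE_PRINTK("vcpu_id=%llu", __entry->vcpu_id)
);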
diff --git a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h
index ba5382c12787..32d7b7746e8e 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h
@@ -16,4 +16,6 @@
__always_unused int ___check_reg_ ## reg; \
type name = (type)cpu_reg(ctxt, (reg))
+void inject_host_exception(u64 esr);
+
#endif /* __ARM64_KVM_NVHE_TRAP_HANDLER_H__ */
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index a244ec25f8c5..62cdfbff7562 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -17,7 +17,7 @@ ccflags-y += -fno-stack-protector \
hostprogs := gen-hyprel
HOST_EXTRACFLAGS += -I$(objtree)/include
-lib-objs := clear_page.o copy_page.o memcpy.o memset.o
+lib-objs := clear_page.o copy_page.o memcpy.o memset.o tishift.o
lib-objs := $(addprefix ../../../lib/, $(lib-objs))
CFLAGS_switch.nvhe.o += -Wno-override-init
@@ -26,11 +26,15 @@ hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o
hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \
cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o
hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
- ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o
+ ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o ../vgic-v5-sr.o
hyp-obj-y += ../../../kernel/smccc-call.o
hyp-obj-$(CONFIG_LIST_HARDENED) += list_debug.o
+hyp-obj-$(CONFIG_NVHE_EL2_TRACING) += clock.o trace.o events.o
hyp-obj-y += $(lib-objs)
+# Path to simple_ring_buffer.c
+CFLAGS_trace.nvhe.o += -I$(srctree)/kernel/trace/
+
##
## Build rules for compiling nVHE hyp code
## Output of this folder is `kvm_nvhe.o`, a partially linked object
diff --git a/arch/arm64/kvm/hyp/nvhe/clock.c b/arch/arm64/kvm/hyp/nvhe/clock.c
new file mode 100644
index 000000000000..32fc4313fe43
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/clock.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 Google LLC
+ * Author: Vincent Donnefort <vdonnefort@google.com>
+ */
+
+#include <nvhe/clock.h>
+
+#include <asm/arch_timer.h>
+#include <asm/div64.h>
+
+static struct clock_data {
+ struct {
+ u32 mult;
+ u32 shift;
+ u64 epoch_ns;
+ u64 epoch_cyc;
+ u64 cyc_overflow64;
+ } data[2];
+ u64 cur;
+} trace_clock_data;
+
+static u64 __clock_mult_uint128(u64 cyc, u32 mult, u32 shift)
+{
+ __uint128_t ns = (__uint128_t)cyc * mult;
+
+ ns >>= shift;
+
+ return (u64)ns;
+}
+
+/* Does not guarantee that no reader is still using the bank being modified. */
+void trace_clock_update(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc)
+{
+ struct clock_data *clock = &trace_clock_data;
+ u64 bank = clock->cur ^ 1;
+
+ clock->data[bank].mult = mult;
+ clock->data[bank].shift = shift;
+ clock->data[bank].epoch_ns = epoch_ns;
+ clock->data[bank].epoch_cyc = epoch_cyc;
+ clock->data[bank].cyc_overflow64 = ULONG_MAX / mult;
+
+ smp_store_release(&clock->cur, bank);
+}
+
+/* Uses untrusted data supplied by the host. */
+u64 trace_clock(void)
+{
+ struct clock_data *clock = &trace_clock_data;
+ u64 bank = smp_load_acquire(&clock->cur);
+ u64 cyc, ns;
+
+ cyc = __arch_counter_get_cntvct() - clock->data[bank].epoch_cyc;
+
+ if (likely(cyc < clock->data[bank].cyc_overflow64)) {
+ ns = cyc * clock->data[bank].mult;
+ ns >>= clock->data[bank].shift;
+ } else {
+ ns = __clock_mult_uint128(cyc, clock->data[bank].mult,
+ clock->data[bank].shift);
+ }
+
+ return (u64)ns + clock->data[bank].epoch_ns;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
index 2a1c0f49792b..f8904391c125 100644
--- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c
+++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
@@ -14,20 +14,20 @@
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
-static void __debug_save_spe(u64 *pmscr_el1)
+static void __debug_save_spe(void)
{
- u64 reg;
+ u64 *pmscr_el1, *pmblimitr_el1;
- /* Clear pmscr in case of early return */
- *pmscr_el1 = 0;
+ pmscr_el1 = host_data_ptr(host_debug_state.pmscr_el1);
+ pmblimitr_el1 = host_data_ptr(host_debug_state.pmblimitr_el1);
/*
* At this point, we know that this CPU implements
* SPE and is available to the host.
	 * Check if the host is actually using it.
*/
- reg = read_sysreg_s(SYS_PMBLIMITR_EL1);
- if (!(reg & BIT(PMBLIMITR_EL1_E_SHIFT)))
+ *pmblimitr_el1 = read_sysreg_s(SYS_PMBLIMITR_EL1);
+ if (!(*pmblimitr_el1 & BIT(PMBLIMITR_EL1_E_SHIFT)))
return;
/* Yes; save the control register and disable data generation */
@@ -37,18 +37,29 @@ static void __debug_save_spe(u64 *pmscr_el1)
/* Now drain all buffered data to memory */
psb_csync();
+ dsb(nsh);
+
+ /* And disable the profiling buffer */
+ write_sysreg_s(0, SYS_PMBLIMITR_EL1);
+ isb();
}
-static void __debug_restore_spe(u64 pmscr_el1)
+static void __debug_restore_spe(void)
{
- if (!pmscr_el1)
+ u64 pmblimitr_el1 = *host_data_ptr(host_debug_state.pmblimitr_el1);
+
+ if (!(pmblimitr_el1 & BIT(PMBLIMITR_EL1_E_SHIFT)))
return;
/* The host page table is installed, but not yet synchronised */
isb();
+ /* Re-enable the profiling buffer. */
+ write_sysreg_s(pmblimitr_el1, SYS_PMBLIMITR_EL1);
+ isb();
+
/* Re-enable data generation */
- write_sysreg_el1(pmscr_el1, SYS_PMSCR);
+ write_sysreg_el1(*host_data_ptr(host_debug_state.pmscr_el1), SYS_PMSCR);
}
static void __trace_do_switch(u64 *saved_trfcr, u64 new_trfcr)
@@ -57,12 +68,54 @@ static void __trace_do_switch(u64 *saved_trfcr, u64 new_trfcr)
write_sysreg_el1(new_trfcr, SYS_TRFCR);
}
-static bool __trace_needs_drain(void)
+static void __trace_drain_and_disable(void)
{
- if (is_protected_kvm_enabled() && host_data_test_flag(HAS_TRBE))
- return read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_EL1_E;
+ u64 *trblimitr_el1 = host_data_ptr(host_debug_state.trblimitr_el1);
+ bool needs_drain = is_protected_kvm_enabled() ?
+ host_data_test_flag(HAS_TRBE) :
+ host_data_test_flag(TRBE_ENABLED);
+
+ if (!needs_drain) {
+ *trblimitr_el1 = 0;
+ return;
+ }
+
+ *trblimitr_el1 = read_sysreg_s(SYS_TRBLIMITR_EL1);
+ if (*trblimitr_el1 & TRBLIMITR_EL1_E) {
+ /*
+ * The host has enabled the Trace Buffer Unit so we have
+ * to beat the CPU with a stick until it stops accessing
+ * memory.
+ */
- return host_data_test_flag(TRBE_ENABLED);
+ /* First, ensure that our prior write to TRFCR has stuck. */
+ isb();
+
+ /* Now synchronise with the trace and drain the buffer. */
+ tsb_csync();
+ dsb(nsh);
+
+ /*
+ * With no more trace being generated, we can disable the
+ * Trace Buffer Unit.
+ */
+ write_sysreg_s(0, SYS_TRBLIMITR_EL1);
+ if (cpus_have_final_cap(ARM64_WORKAROUND_2064142)) {
+ /*
+ * Some CPUs are so good, we have to drain 'em
+ * twice.
+ */
+ tsb_csync();
+ dsb(nsh);
+ }
+
+ /*
+ * Ensure that the Trace Buffer Unit is disabled before
+ * we start mucking with the stage-2 and trap
+ * configuration.
+ */
+ isb();
+ }
}
static bool __trace_needs_switch(void)
@@ -79,21 +132,34 @@ static void __trace_switch_to_guest(void)
__trace_do_switch(host_data_ptr(host_debug_state.trfcr_el1),
*host_data_ptr(trfcr_while_in_guest));
-
- if (__trace_needs_drain()) {
- isb();
- tsb_csync();
- }
+ __trace_drain_and_disable();
}
static void __trace_switch_to_host(void)
{
+ u64 trblimitr_el1 = *host_data_ptr(host_debug_state.trblimitr_el1);
+
+ if (trblimitr_el1 & TRBLIMITR_EL1_E) {
+ /* Re-enable the Trace Buffer Unit for the host. */
+ write_sysreg_s(trblimitr_el1, SYS_TRBLIMITR_EL1);
+ isb();
+ if (cpus_have_final_cap(ARM64_WORKAROUND_2038923)) {
+ /*
+ * Make sure the unit is re-enabled before we
+ * poke TRFCR.
+ */
+ isb();
+ }
+ }
+
__trace_do_switch(host_data_ptr(trfcr_while_in_guest),
*host_data_ptr(host_debug_state.trfcr_el1));
}
-static void __debug_save_brbe(u64 *brbcr_el1)
+static void __debug_save_brbe(void)
{
+ u64 *brbcr_el1 = host_data_ptr(host_debug_state.brbcr_el1);
+
*brbcr_el1 = 0;
/* Check if the BRBE is enabled */
@@ -109,8 +175,10 @@ static void __debug_save_brbe(u64 *brbcr_el1)
write_sysreg_el1(0, SYS_BRBCR);
}
-static void __debug_restore_brbe(u64 brbcr_el1)
+static void __debug_restore_brbe(void)
{
+ u64 brbcr_el1 = *host_data_ptr(host_debug_state.brbcr_el1);
+
if (!brbcr_el1)
return;
@@ -122,11 +190,11 @@ void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu)
{
/* Disable and flush SPE data generation */
if (host_data_test_flag(HAS_SPE))
- __debug_save_spe(host_data_ptr(host_debug_state.pmscr_el1));
+ __debug_save_spe();
/* Disable BRBE branch records */
if (host_data_test_flag(HAS_BRBE))
- __debug_save_brbe(host_data_ptr(host_debug_state.brbcr_el1));
+ __debug_save_brbe();
if (__trace_needs_switch())
__trace_switch_to_guest();
@@ -140,9 +208,9 @@ void __debug_switch_to_guest(struct kvm_vcpu *vcpu)
void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu)
{
if (host_data_test_flag(HAS_SPE))
- __debug_restore_spe(*host_data_ptr(host_debug_state.pmscr_el1));
+ __debug_restore_spe();
if (host_data_test_flag(HAS_BRBE))
- __debug_restore_brbe(*host_data_ptr(host_debug_state.brbcr_el1));
+ __debug_restore_brbe();
if (__trace_needs_switch())
__trace_switch_to_host();
}
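The common shape of these conversions: rather than threading register snapshots through function arguments, each unit's control value is stashed in the per-CPU host debug state on guest entry and replayed on return to the host. A minimal model with invented names:

#include <linux/percpu.h>

struct demo_debug_state {
	u64 ctrl;			/* snapshot of a HW control register */
};

static DEFINE_PER_CPU(struct demo_debug_state, demo_dbg);

static void demo_save_ctrl(u64 hw_val)
{
	__this_cpu_write(demo_dbg.ctrl, hw_val);	/* then disable the unit */
}

static u64 demo_restore_ctrl(void)
{
	return __this_cpu_read(demo_dbg.ctrl);		/* replay for the host */
}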
diff --git a/arch/arm64/kvm/hyp/nvhe/events.c b/arch/arm64/kvm/hyp/nvhe/events.c
new file mode 100644
index 000000000000..add9383aadb5
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/events.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Google LLC
+ * Author: Vincent Donnefort <vdonnefort@google.com>
+ */
+
+#include <nvhe/mm.h>
+#include <nvhe/trace.h>
+
+#include <nvhe/define_events.h>
+
+int __tracing_enable_event(unsigned short id, bool enable)
+{
+ struct hyp_event_id *event_id = &__hyp_event_ids_start[id];
+ atomic_t *enabled;
+
+ if (event_id >= __hyp_event_ids_end)
+ return -EINVAL;
+
+ enabled = hyp_fixmap_map(__hyp_pa(&event_id->enabled));
+ atomic_set(enabled, enable);
+ hyp_fixmap_unmap();
+
+ return 0;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c
index 94161ea1cd60..1af722771178 100644
--- a/arch/arm64/kvm/hyp/nvhe/ffa.c
+++ b/arch/arm64/kvm/hyp/nvhe/ffa.c
@@ -26,10 +26,10 @@
* the duration and are therefore serialised.
*/
-#include <linux/arm-smccc.h>
#include <linux/arm_ffa.h>
#include <asm/kvm_pkvm.h>
+#include <nvhe/arm-smccc.h>
#include <nvhe/ffa.h>
#include <nvhe/mem_protect.h>
#include <nvhe/memory.h>
@@ -147,7 +147,7 @@ static int ffa_map_hyp_buffers(u64 ffa_page_count)
{
struct arm_smccc_1_2_regs res;
- arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
.a0 = FFA_FN64_RXTX_MAP,
.a1 = hyp_virt_to_phys(hyp_buffers.tx),
.a2 = hyp_virt_to_phys(hyp_buffers.rx),
@@ -161,7 +161,7 @@ static int ffa_unmap_hyp_buffers(void)
{
struct arm_smccc_1_2_regs res;
- arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
.a0 = FFA_RXTX_UNMAP,
.a1 = HOST_FFA_ID,
}, &res);
@@ -172,7 +172,7 @@ static int ffa_unmap_hyp_buffers(void)
static void ffa_mem_frag_tx(struct arm_smccc_1_2_regs *res, u32 handle_lo,
u32 handle_hi, u32 fraglen, u32 endpoint_id)
{
- arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
.a0 = FFA_MEM_FRAG_TX,
.a1 = handle_lo,
.a2 = handle_hi,
@@ -184,7 +184,7 @@ static void ffa_mem_frag_tx(struct arm_smccc_1_2_regs *res, u32 handle_lo,
static void ffa_mem_frag_rx(struct arm_smccc_1_2_regs *res, u32 handle_lo,
u32 handle_hi, u32 fragoff)
{
- arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
.a0 = FFA_MEM_FRAG_RX,
.a1 = handle_lo,
.a2 = handle_hi,
@@ -196,7 +196,7 @@ static void ffa_mem_frag_rx(struct arm_smccc_1_2_regs *res, u32 handle_lo,
static void ffa_mem_xfer(struct arm_smccc_1_2_regs *res, u64 func_id, u32 len,
u32 fraglen)
{
- arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
.a0 = func_id,
.a1 = len,
.a2 = fraglen,
@@ -206,7 +206,7 @@ static void ffa_mem_xfer(struct arm_smccc_1_2_regs *res, u64 func_id, u32 len,
static void ffa_mem_reclaim(struct arm_smccc_1_2_regs *res, u32 handle_lo,
u32 handle_hi, u32 flags)
{
- arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
.a0 = FFA_MEM_RECLAIM,
.a1 = handle_lo,
.a2 = handle_hi,
@@ -216,7 +216,7 @@ static void ffa_mem_reclaim(struct arm_smccc_1_2_regs *res, u32 handle_lo,
static void ffa_retrieve_req(struct arm_smccc_1_2_regs *res, u32 len)
{
- arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
.a0 = FFA_FN64_MEM_RETRIEVE_REQ,
.a1 = len,
.a2 = len,
@@ -225,7 +225,7 @@ static void ffa_retrieve_req(struct arm_smccc_1_2_regs *res, u32 len)
static void ffa_rx_release(struct arm_smccc_1_2_regs *res)
{
- arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
.a0 = FFA_RX_RELEASE,
}, res);
}
@@ -728,7 +728,7 @@ static int hyp_ffa_post_init(void)
size_t min_rxtx_sz;
struct arm_smccc_1_2_regs res;
- arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){
.a0 = FFA_ID_GET,
}, &res);
if (res.a0 != FFA_SUCCESS)
@@ -737,7 +737,7 @@ static int hyp_ffa_post_init(void)
if (res.a2 != HOST_FFA_ID)
return -EINVAL;
- arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){
.a0 = FFA_FEATURES,
.a1 = FFA_FN64_RXTX_MAP,
}, &res);
@@ -788,7 +788,7 @@ static void do_ffa_version(struct arm_smccc_1_2_regs *res,
* first if TEE supports it.
*/
if (FFA_MINOR_VERSION(ffa_req_version) < FFA_MINOR_VERSION(hyp_ffa_version)) {
- arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
.a0 = FFA_VERSION,
.a1 = ffa_req_version,
}, res);
@@ -824,7 +824,7 @@ static void do_ffa_part_get(struct arm_smccc_1_2_regs *res,
goto out_unlock;
}
- arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
.a0 = FFA_PARTITION_INFO_GET,
.a1 = uuid0,
.a2 = uuid1,
@@ -939,7 +939,7 @@ int hyp_ffa_init(void *pages)
if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2)
return 0;
- arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
.a0 = FFA_VERSION,
.a1 = FFA_VERSION_1_2,
}, &res);
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
index eef15b374abb..f337770ec459 100644
--- a/arch/arm64/kvm/hyp/nvhe/host.S
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -120,12 +120,11 @@ SYM_FUNC_START(__hyp_do_panic)
mov x29, x0
-#ifdef CONFIG_NVHE_EL2_DEBUG
+#ifdef CONFIG_PKVM_DISABLE_STAGE2_ON_PANIC
/* Ensure host stage-2 is disabled */
mrs x0, hcr_el2
bic x0, x0, #HCR_VM
msr_hcr_el2 x0
- isb
tlbi vmalls12e1
dsb nsh
#endif
@@ -291,13 +290,3 @@ SYM_CODE_START(__kvm_hyp_host_forward_smc)
ret
SYM_CODE_END(__kvm_hyp_host_forward_smc)
-
-/*
- * kvm_host_psci_cpu_entry is called through br instruction, which requires
- * bti j instruction as compilers (gcc and llvm) doesn't insert bti j for external
- * functions, but bti c instead.
- */
-SYM_CODE_START(kvm_host_psci_cpu_entry)
- bti j
- b __kvm_host_psci_cpu_entry
-SYM_CODE_END(kvm_host_psci_cpu_entry)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index 0d42eedc7167..64296b31da73 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -173,9 +173,8 @@ SYM_CODE_END(___kvm_hyp_init)
* x0: struct kvm_nvhe_init_params PA
*/
SYM_CODE_START(kvm_hyp_cpu_entry)
- mov x1, #1 // is_cpu_on = true
+ ldr x29, =__kvm_host_psci_cpu_on_entry
b __kvm_hyp_init_cpu
-SYM_CODE_END(kvm_hyp_cpu_entry)
/*
* PSCI CPU_SUSPEND / SYSTEM_SUSPEND entry point
@@ -183,32 +182,17 @@ SYM_CODE_END(kvm_hyp_cpu_entry)
* x0: struct kvm_nvhe_init_params PA
*/
SYM_CODE_START(kvm_hyp_cpu_resume)
- mov x1, #0 // is_cpu_on = false
- b __kvm_hyp_init_cpu
-SYM_CODE_END(kvm_hyp_cpu_resume)
+ ldr x29, =__kvm_host_psci_cpu_resume_entry
-/*
- * Common code for CPU entry points. Initializes EL2 state and
- * installs the hypervisor before handing over to a C handler.
- *
- * x0: struct kvm_nvhe_init_params PA
- * x1: bool is_cpu_on
- */
-SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu)
+SYM_INNER_LABEL(__kvm_hyp_init_cpu, SYM_L_LOCAL)
mov x28, x0 // Stash arguments
- mov x29, x1
/* Check that the core was booted in EL2. */
mrs x0, CurrentEL
cmp x0, #CurrentEL_EL2
- b.eq 2f
-
- /* The core booted in EL1. KVM cannot be initialized on it. */
-1: wfe
- wfi
- b 1b
+ b.ne 1f
-2: msr SPsel, #1 // We want to use SP_EL{1,2}
+ msr SPsel, #1 // We want to use SP_EL2
init_el2_hcr 0
@@ -218,11 +202,16 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu)
mov x0, x28
bl ___kvm_hyp_init // Clobbers x0..x2
- /* Leave idmap. */
- mov x0, x29
- ldr x1, =kvm_host_psci_cpu_entry
- br x1
-SYM_CODE_END(__kvm_hyp_init_cpu)
+ /* Leave idmap -- using BLR is OK, LR is restored from host context */
+ blr x29
+
+ // The core booted in EL1, or the C code unexpectedly returned.
+ // Either way, KVM cannot be initialized on it.
+1: wfe
+ wfi
+ b 1b
+SYM_CODE_END(kvm_hyp_cpu_resume)
+SYM_CODE_END(kvm_hyp_cpu_entry)
SYM_CODE_START(__kvm_handle_stub_hvc)
/*
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index e7790097db93..73f2e0221e70 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -12,12 +12,14 @@
#include <asm/kvm_emulate.h>
#include <asm/kvm_host.h>
#include <asm/kvm_hyp.h>
+#include <asm/kvm_hypevents.h>
#include <asm/kvm_mmu.h>
#include <nvhe/ffa.h>
#include <nvhe/mem_protect.h>
#include <nvhe/mm.h>
#include <nvhe/pkvm.h>
+#include <nvhe/trace.h>
#include <nvhe/trap_handler.h>
DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
@@ -136,6 +138,8 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2;
hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3 = host_vcpu->arch.vgic_cpu.vgic_v3;
+
+ hyp_vcpu->vcpu.arch.pid = host_vcpu->arch.pid;
}
static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
@@ -169,9 +173,6 @@ static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt)
DECLARE_REG(u64, hcr_el2, host_ctxt, 3);
struct pkvm_hyp_vcpu *hyp_vcpu;
- if (!is_protected_kvm_enabled())
- return;
-
hyp_vcpu = pkvm_load_hyp_vcpu(handle, vcpu_idx);
if (!hyp_vcpu)
return;
@@ -188,12 +189,8 @@ static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt)
static void handle___pkvm_vcpu_put(struct kvm_cpu_context *host_ctxt)
{
- struct pkvm_hyp_vcpu *hyp_vcpu;
+ struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
- if (!is_protected_kvm_enabled())
- return;
-
- hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
if (hyp_vcpu)
pkvm_put_hyp_vcpu(hyp_vcpu);
}
@@ -248,6 +245,26 @@ static int pkvm_refill_memcache(struct pkvm_hyp_vcpu *hyp_vcpu)
&host_vcpu->arch.pkvm_memcache);
}
+static void handle___pkvm_host_donate_guest(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u64, pfn, host_ctxt, 1);
+ DECLARE_REG(u64, gfn, host_ctxt, 2);
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+ int ret = -EINVAL;
+
+ hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
+ if (!hyp_vcpu || !pkvm_hyp_vcpu_is_protected(hyp_vcpu))
+ goto out;
+
+ ret = pkvm_refill_memcache(hyp_vcpu);
+ if (ret)
+ goto out;
+
+ ret = __pkvm_host_donate_guest(pfn, gfn, hyp_vcpu);
+out:
+ cpu_reg(host_ctxt, 1) = ret;
+}
+
static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(u64, pfn, host_ctxt, 1);
@@ -257,9 +274,6 @@ static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt)
struct pkvm_hyp_vcpu *hyp_vcpu;
int ret = -EINVAL;
- if (!is_protected_kvm_enabled())
- goto out;
-
hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu))
goto out;
@@ -281,9 +295,6 @@ static void handle___pkvm_host_unshare_guest(struct kvm_cpu_context *host_ctxt)
struct pkvm_hyp_vm *hyp_vm;
int ret = -EINVAL;
- if (!is_protected_kvm_enabled())
- goto out;
-
hyp_vm = get_np_pkvm_hyp_vm(handle);
if (!hyp_vm)
goto out;
@@ -301,9 +312,6 @@ static void handle___pkvm_host_relax_perms_guest(struct kvm_cpu_context *host_ct
struct pkvm_hyp_vcpu *hyp_vcpu;
int ret = -EINVAL;
- if (!is_protected_kvm_enabled())
- goto out;
-
hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu))
goto out;
@@ -321,9 +329,6 @@ static void handle___pkvm_host_wrprotect_guest(struct kvm_cpu_context *host_ctxt
struct pkvm_hyp_vm *hyp_vm;
int ret = -EINVAL;
- if (!is_protected_kvm_enabled())
- goto out;
-
hyp_vm = get_np_pkvm_hyp_vm(handle);
if (!hyp_vm)
goto out;
@@ -343,9 +348,6 @@ static void handle___pkvm_host_test_clear_young_guest(struct kvm_cpu_context *ho
struct pkvm_hyp_vm *hyp_vm;
int ret = -EINVAL;
- if (!is_protected_kvm_enabled())
- goto out;
-
hyp_vm = get_np_pkvm_hyp_vm(handle);
if (!hyp_vm)
goto out;
@@ -362,9 +364,6 @@ static void handle___pkvm_host_mkyoung_guest(struct kvm_cpu_context *host_ctxt)
struct pkvm_hyp_vcpu *hyp_vcpu;
int ret = -EINVAL;
- if (!is_protected_kvm_enabled())
- goto out;
-
hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu))
goto out;
@@ -424,12 +423,8 @@ static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt)
static void handle___pkvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
- struct pkvm_hyp_vm *hyp_vm;
+ struct pkvm_hyp_vm *hyp_vm = get_np_pkvm_hyp_vm(handle);
- if (!is_protected_kvm_enabled())
- return;
-
- hyp_vm = get_np_pkvm_hyp_vm(handle);
if (!hyp_vm)
return;
@@ -486,17 +481,15 @@ static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(phys_addr_t, phys, host_ctxt, 1);
DECLARE_REG(unsigned long, size, host_ctxt, 2);
- DECLARE_REG(unsigned long, nr_cpus, host_ctxt, 3);
- DECLARE_REG(unsigned long *, per_cpu_base, host_ctxt, 4);
- DECLARE_REG(u32, hyp_va_bits, host_ctxt, 5);
+ DECLARE_REG(unsigned long *, per_cpu_base, host_ctxt, 3);
+ DECLARE_REG(u32, hyp_va_bits, host_ctxt, 4);
/*
* __pkvm_init() will return only if an error occurred, otherwise it
* will tail-call in __pkvm_init_finalise() which will have to deal
* with the host context directly.
*/
- cpu_reg(host_ctxt, 1) = __pkvm_init(phys, size, nr_cpus, per_cpu_base,
- hyp_va_bits);
+ cpu_reg(host_ctxt, 1) = __pkvm_init(phys, size, per_cpu_base, hyp_va_bits);
}
static void handle___pkvm_cpu_set_vector(struct kvm_cpu_context *host_ctxt)
@@ -582,11 +575,115 @@ static void handle___pkvm_init_vcpu(struct kvm_cpu_context *host_ctxt)
cpu_reg(host_ctxt, 1) = __pkvm_init_vcpu(handle, host_vcpu, vcpu_hva);
}
-static void handle___pkvm_teardown_vm(struct kvm_cpu_context *host_ctxt)
+static void handle___pkvm_vcpu_in_poison_fault(struct kvm_cpu_context *host_ctxt)
+{
+ int ret;
+ struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
+
+ ret = hyp_vcpu ? __pkvm_vcpu_in_poison_fault(hyp_vcpu) : -EINVAL;
+ cpu_reg(host_ctxt, 1) = ret;
+}
+
+static void handle___pkvm_force_reclaim_guest_page(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(phys_addr_t, phys, host_ctxt, 1);
+
+ cpu_reg(host_ctxt, 1) = __pkvm_host_force_reclaim_page_guest(phys);
+}
+
+static void handle___pkvm_reclaim_dying_guest_page(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+ DECLARE_REG(u64, gfn, host_ctxt, 2);
+
+ cpu_reg(host_ctxt, 1) = __pkvm_reclaim_dying_guest_page(handle, gfn);
+}
+
+static void handle___pkvm_start_teardown_vm(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
- cpu_reg(host_ctxt, 1) = __pkvm_teardown_vm(handle);
+ cpu_reg(host_ctxt, 1) = __pkvm_start_teardown_vm(handle);
+}
+
+static void handle___pkvm_finalize_teardown_vm(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+
+ cpu_reg(host_ctxt, 1) = __pkvm_finalize_teardown_vm(handle);
+}
+
+static void handle___tracing_load(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(unsigned long, desc_hva, host_ctxt, 1);
+ DECLARE_REG(size_t, desc_size, host_ctxt, 2);
+
+ cpu_reg(host_ctxt, 1) = __tracing_load(desc_hva, desc_size);
+}
+
+static void handle___tracing_unload(struct kvm_cpu_context *host_ctxt)
+{
+ __tracing_unload();
+}
+
+static void handle___tracing_enable(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(bool, enable, host_ctxt, 1);
+
+ cpu_reg(host_ctxt, 1) = __tracing_enable(enable);
+}
+
+static void handle___tracing_swap_reader(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(unsigned int, cpu, host_ctxt, 1);
+
+ cpu_reg(host_ctxt, 1) = __tracing_swap_reader(cpu);
+}
+
+static void handle___tracing_update_clock(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u32, mult, host_ctxt, 1);
+ DECLARE_REG(u32, shift, host_ctxt, 2);
+ DECLARE_REG(u64, epoch_ns, host_ctxt, 3);
+ DECLARE_REG(u64, epoch_cyc, host_ctxt, 4);
+
+ __tracing_update_clock(mult, shift, epoch_ns, epoch_cyc);
+}
+
+static void handle___tracing_reset(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(unsigned int, cpu, host_ctxt, 1);
+
+ cpu_reg(host_ctxt, 1) = __tracing_reset(cpu);
+}
+
+static void handle___tracing_enable_event(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(unsigned short, id, host_ctxt, 1);
+ DECLARE_REG(bool, enable, host_ctxt, 2);
+
+ cpu_reg(host_ctxt, 1) = __tracing_enable_event(id, enable);
+}
+
+static void handle___tracing_write_event(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u64, id, host_ctxt, 1);
+
+ trace_selftest(id);
+}
+
+static void handle___vgic_v5_save_apr(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct vgic_v5_cpu_if *, cpu_if, host_ctxt, 1);
+
+ __vgic_v5_save_apr(kern_hyp_va(cpu_if));
+}
+
+static void handle___vgic_v5_restore_vmcr_apr(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct vgic_v5_cpu_if *, cpu_if, host_ctxt, 1);
+
+ __vgic_v5_restore_vmcr_apr(kern_hyp_va(cpu_if));
}
typedef void (*hcall_t)(struct kvm_cpu_context *);
@@ -603,14 +700,6 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__vgic_v3_get_gic_config),
HANDLE_FUNC(__pkvm_prot_finalize),
- HANDLE_FUNC(__pkvm_host_share_hyp),
- HANDLE_FUNC(__pkvm_host_unshare_hyp),
- HANDLE_FUNC(__pkvm_host_share_guest),
- HANDLE_FUNC(__pkvm_host_unshare_guest),
- HANDLE_FUNC(__pkvm_host_relax_perms_guest),
- HANDLE_FUNC(__pkvm_host_wrprotect_guest),
- HANDLE_FUNC(__pkvm_host_test_clear_young_guest),
- HANDLE_FUNC(__pkvm_host_mkyoung_guest),
HANDLE_FUNC(__kvm_adjust_pc),
HANDLE_FUNC(__kvm_vcpu_run),
HANDLE_FUNC(__kvm_flush_vm_context),
@@ -622,20 +711,44 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__kvm_timer_set_cntvoff),
HANDLE_FUNC(__vgic_v3_save_aprs),
HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs),
+ HANDLE_FUNC(__vgic_v5_save_apr),
+ HANDLE_FUNC(__vgic_v5_restore_vmcr_apr),
+
+ HANDLE_FUNC(__pkvm_host_share_hyp),
+ HANDLE_FUNC(__pkvm_host_unshare_hyp),
+ HANDLE_FUNC(__pkvm_host_donate_guest),
+ HANDLE_FUNC(__pkvm_host_share_guest),
+ HANDLE_FUNC(__pkvm_host_unshare_guest),
+ HANDLE_FUNC(__pkvm_host_relax_perms_guest),
+ HANDLE_FUNC(__pkvm_host_wrprotect_guest),
+ HANDLE_FUNC(__pkvm_host_test_clear_young_guest),
+ HANDLE_FUNC(__pkvm_host_mkyoung_guest),
HANDLE_FUNC(__pkvm_reserve_vm),
HANDLE_FUNC(__pkvm_unreserve_vm),
HANDLE_FUNC(__pkvm_init_vm),
HANDLE_FUNC(__pkvm_init_vcpu),
- HANDLE_FUNC(__pkvm_teardown_vm),
+ HANDLE_FUNC(__pkvm_vcpu_in_poison_fault),
+ HANDLE_FUNC(__pkvm_force_reclaim_guest_page),
+ HANDLE_FUNC(__pkvm_reclaim_dying_guest_page),
+ HANDLE_FUNC(__pkvm_start_teardown_vm),
+ HANDLE_FUNC(__pkvm_finalize_teardown_vm),
HANDLE_FUNC(__pkvm_vcpu_load),
HANDLE_FUNC(__pkvm_vcpu_put),
HANDLE_FUNC(__pkvm_tlb_flush_vmid),
+ HANDLE_FUNC(__tracing_load),
+ HANDLE_FUNC(__tracing_unload),
+ HANDLE_FUNC(__tracing_enable),
+ HANDLE_FUNC(__tracing_swap_reader),
+ HANDLE_FUNC(__tracing_update_clock),
+ HANDLE_FUNC(__tracing_reset),
+ HANDLE_FUNC(__tracing_enable_event),
+ HANDLE_FUNC(__tracing_write_event),
};
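Reordering the table so that all pKVM-privileged calls form one contiguous block lets the dispatcher enforce a simple [min, max] window instead of a single lower bound: before finalisation only IDs up to MAX_NO_PKVM are reachable, afterwards only IDs from MIN_PKVM upwards. A minimal model of the filtering (the constants are illustrative, not the real enum values):

static bool demo_hcall_allowed(unsigned long id, bool pkvm_finalized)
{
	const unsigned long min_pkvm = 16;	/* illustrative */
	const unsigned long max_no_pkvm = 40;	/* illustrative */

	if (pkvm_finalized)
		return id >= min_pkvm;	/* early-init calls now rejected */

	return id <= max_no_pkvm;	/* pKVM-only calls not yet valid */
}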
static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(unsigned long, id, host_ctxt, 0);
- unsigned long hcall_min = 0;
+ unsigned long hcall_min = 0, hcall_max = -1;
hcall_t hfn;
/*
@@ -647,14 +760,19 @@ static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
* basis. This is all fine, however, since __pkvm_prot_finalize
* returns -EPERM after the first call for a given CPU.
*/
- if (static_branch_unlikely(&kvm_protected_mode_initialized))
- hcall_min = __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize;
+ if (static_branch_unlikely(&kvm_protected_mode_initialized)) {
+ hcall_min = __KVM_HOST_SMCCC_FUNC_MIN_PKVM;
+ } else {
+ hcall_max = __KVM_HOST_SMCCC_FUNC_MAX_NO_PKVM;
+ }
id &= ~ARM_SMCCC_CALL_HINTS;
id -= KVM_HOST_SMCCC_ID(0);
- if (unlikely(id < hcall_min || id >= ARRAY_SIZE(host_hcall)))
+ if (unlikely(id < hcall_min || id > hcall_max ||
+ id >= ARRAY_SIZE(host_hcall))) {
goto inval;
+ }
hfn = host_hcall[id];
if (unlikely(!hfn))
@@ -670,14 +788,22 @@ inval:
static void default_host_smc_handler(struct kvm_cpu_context *host_ctxt)
{
+ trace_hyp_exit(host_ctxt, HYP_REASON_SMC);
__kvm_hyp_host_forward_smc(host_ctxt);
+ trace_hyp_enter(host_ctxt, HYP_REASON_SMC);
}
static void handle_host_smc(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(u64, func_id, host_ctxt, 0);
+ u64 esr = read_sysreg_el2(SYS_ESR);
bool handled;
+ if (esr & ESR_ELx_xVC_IMM_MASK) {
+ cpu_reg(host_ctxt, 0) = SMCCC_RET_NOT_SUPPORTED;
+ goto exit_skip_instr;
+ }
+
func_id &= ~ARM_SMCCC_CALL_HINTS;
handled = kvm_host_psci_handler(host_ctxt, func_id);
@@ -686,47 +812,57 @@ static void handle_host_smc(struct kvm_cpu_context *host_ctxt)
if (!handled)
default_host_smc_handler(host_ctxt);
+exit_skip_instr:
/* SMC was trapped, move ELR past the current PC. */
kvm_skip_host_instr();
}
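The new ESR check implements the "SMC immediate must be #0" tightening from this series: for an EL1 SMC trap, ESR_ELx.ISS[15:0] carries the instruction's immediate, so any non-zero value is bounced with SMCCC_RET_NOT_SUPPORTED before PSCI/FF-A dispatch even gets a look. As a sketch:

#include <linux/bits.h>

static bool demo_smc_imm_is_zero(u64 esr)
{
	return !(esr & GENMASK(15, 0));	/* the xVC immediate field */
}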
-/*
- * Inject an Undefined Instruction exception into the host.
- *
- * This is open-coded to allow control over PSTATE construction without
- * complicating the generic exception entry helpers.
- */
-static void inject_undef64(void)
+void inject_host_exception(u64 esr)
{
- u64 spsr_mask, vbar, sctlr, old_spsr, new_spsr, esr, offset;
+ u64 sctlr, spsr_el1, spsr_el2, exc_offset = except_type_sync;
+ const u64 spsr_mask = PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT |
+ PSR_V_BIT | PSR_DIT_BIT | PSR_PAN_BIT;
- spsr_mask = PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT | PSR_DIT_BIT | PSR_PAN_BIT;
-
- vbar = read_sysreg_el1(SYS_VBAR);
- sctlr = read_sysreg_el1(SYS_SCTLR);
- old_spsr = read_sysreg_el2(SYS_SPSR);
+ spsr_el1 = spsr_el2 = read_sysreg_el2(SYS_SPSR);
+ switch (spsr_el1 & (PSR_MODE_MASK | PSR_MODE32_BIT)) {
+ case PSR_MODE_EL0t:
+ exc_offset += LOWER_EL_AArch64_VECTOR;
+ break;
+ case PSR_MODE_EL0t | PSR_MODE32_BIT:
+ exc_offset += LOWER_EL_AArch32_VECTOR;
+ break;
+ default:
+ exc_offset += CURRENT_EL_SP_ELx_VECTOR;
+ }
- new_spsr = old_spsr & spsr_mask;
- new_spsr |= PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT;
- new_spsr |= PSR_MODE_EL1h;
+ spsr_el2 &= spsr_mask;
+ spsr_el2 |= PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT |
+ PSR_MODE_EL1h;
+ sctlr = read_sysreg_el1(SYS_SCTLR);
if (!(sctlr & SCTLR_EL1_SPAN))
- new_spsr |= PSR_PAN_BIT;
+ spsr_el2 |= PSR_PAN_BIT;
if (sctlr & SCTLR_ELx_DSSBS)
- new_spsr |= PSR_SSBS_BIT;
+ spsr_el2 |= PSR_SSBS_BIT;
if (system_supports_mte())
- new_spsr |= PSR_TCO_BIT;
+ spsr_el2 |= PSR_TCO_BIT;
- esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT) | ESR_ELx_IL;
- offset = CURRENT_EL_SP_ELx_VECTOR + except_type_sync;
+ if (esr_fsc_is_translation_fault(esr))
+ write_sysreg_el1(read_sysreg_el2(SYS_FAR), SYS_FAR);
write_sysreg_el1(esr, SYS_ESR);
write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR);
- write_sysreg_el1(old_spsr, SYS_SPSR);
- write_sysreg_el2(vbar + offset, SYS_ELR);
- write_sysreg_el2(new_spsr, SYS_SPSR);
+ write_sysreg_el1(spsr_el1, SYS_SPSR);
+ write_sysreg_el2(read_sysreg_el1(SYS_VBAR) + exc_offset, SYS_ELR);
+ write_sysreg_el2(spsr_el2, SYS_SPSR);
+}
+
+static void inject_host_undef64(void)
+{
+ inject_host_exception((ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT) |
+ ESR_ELx_IL);
}
static bool handle_host_mte(u64 esr)
@@ -749,7 +885,7 @@ static bool handle_host_mte(u64 esr)
return false;
}
- inject_undef64();
+ inject_host_undef64();
return true;
}
@@ -757,15 +893,19 @@ void handle_trap(struct kvm_cpu_context *host_ctxt)
{
u64 esr = read_sysreg_el2(SYS_ESR);
+
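+ /*
+ * Each way into EL2 is traced with its entry reason; the matching
+ * trace_hyp_exit() fires below, on the return to the host.
+ */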
switch (ESR_ELx_EC(esr)) {
case ESR_ELx_EC_HVC64:
+ trace_hyp_enter(host_ctxt, HYP_REASON_HVC);
handle_host_hcall(host_ctxt);
break;
case ESR_ELx_EC_SMC64:
+ trace_hyp_enter(host_ctxt, HYP_REASON_SMC);
handle_host_smc(host_ctxt);
break;
case ESR_ELx_EC_IABT_LOW:
case ESR_ELx_EC_DABT_LOW:
+ trace_hyp_enter(host_ctxt, HYP_REASON_HOST_ABORT);
handle_host_mem_abort(host_ctxt);
break;
case ESR_ELx_EC_SYS64:
@@ -775,4 +915,6 @@ void handle_trap(struct kvm_cpu_context *host_ctxt)
default:
BUG();
}
+
+ trace_hyp_exit(host_ctxt, HYP_REASON_ERET_HOST);
}
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
index d724f6d69302..7a02837203d1 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
@@ -16,6 +16,12 @@ SECTIONS {
HYP_SECTION(.text)
HYP_SECTION(.data..ro_after_init)
HYP_SECTION(.rodata)
+#ifdef CONFIG_NVHE_EL2_TRACING
+ . = ALIGN(PAGE_SIZE);
+ BEGIN_HYP_SECTION(.event_ids)
+ *(SORT(.hyp.event_ids.*))
+ END_HYP_SECTION
+#endif
/*
* .hyp..data..percpu needs to be page aligned to maintain the same
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index d815265bd374..28a471d1927c 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -18,6 +18,7 @@
#include <nvhe/memory.h>
#include <nvhe/mem_protect.h>
#include <nvhe/mm.h>
+#include <nvhe/trap_handler.h>
#define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_AS_S1 | KVM_PGTABLE_S2_IDMAP)
@@ -461,8 +462,15 @@ static bool range_is_memory(u64 start, u64 end)
static inline int __host_stage2_idmap(u64 start, u64 end,
enum kvm_pgtable_prot prot)
{
+ /*
+ * We don't make permission changes to the host idmap after
+ * initialisation, so squash -EAGAIN here rather than making every
+ * caller treat it as success when mapping something that is already
+ * mapped.
+ */
return kvm_pgtable_stage2_map(&host_mmu.pgt, start, end - start, start,
- prot, &host_s2_pool, 0);
+ prot, &host_s2_pool,
+ KVM_PGTABLE_WALK_IGNORE_EAGAIN);
}
/*
@@ -504,7 +512,7 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
return ret;
if (kvm_pte_valid(pte))
- return -EAGAIN;
+ return -EEXIST;
if (pte) {
WARN_ON(addr_is_memory(addr) &&
@@ -541,24 +549,99 @@ static void __host_update_page_state(phys_addr_t addr, u64 size, enum pkvm_page_
set_host_state(page, state);
}
-int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
+#define KVM_HOST_DONATION_PTE_OWNER_MASK GENMASK(3, 1)
+#define KVM_HOST_DONATION_PTE_EXTRA_MASK GENMASK(59, 4)
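+/*
+ * Donation annotations live in invalid stage-2 PTEs: bits [3:1] carry
+ * the owner ID and bits [59:4] owner-specific metadata, with the
+ * invalid-PTE type field marking the entry as a donation.
+ */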
+static int host_stage2_set_owner_metadata_locked(phys_addr_t addr, u64 size,
+ u8 owner_id, u64 meta)
{
+ kvm_pte_t annotation;
int ret;
+ if (owner_id == PKVM_ID_HOST)
+ return -EINVAL;
+
if (!range_is_memory(addr, addr + size))
return -EPERM;
- ret = host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt,
- addr, size, &host_s2_pool, owner_id);
- if (ret)
- return ret;
+ if (!FIELD_FIT(KVM_HOST_DONATION_PTE_OWNER_MASK, owner_id))
+ return -EINVAL;
- /* Don't forget to update the vmemmap tracking for the host */
- if (owner_id == PKVM_ID_HOST)
- __host_update_page_state(addr, size, PKVM_PAGE_OWNED);
- else
+ if (!FIELD_FIT(KVM_HOST_DONATION_PTE_EXTRA_MASK, meta))
+ return -EINVAL;
+
+ annotation = FIELD_PREP(KVM_HOST_DONATION_PTE_OWNER_MASK, owner_id) |
+ FIELD_PREP(KVM_HOST_DONATION_PTE_EXTRA_MASK, meta);
+ ret = host_stage2_try(kvm_pgtable_stage2_annotate, &host_mmu.pgt,
+ addr, size, &host_s2_pool,
+ KVM_HOST_INVALID_PTE_TYPE_DONATION, annotation);
+ if (!ret)
__host_update_page_state(addr, size, PKVM_NOPAGE);
+ return ret;
+}
+
+int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
+{
+ int ret = -EINVAL;
+
+ switch (owner_id) {
+ case PKVM_ID_HOST:
+ if (!range_is_memory(addr, addr + size))
+ return -EPERM;
+
+ ret = host_stage2_idmap_locked(addr, size, PKVM_HOST_MEM_PROT);
+ if (!ret)
+ __host_update_page_state(addr, size, PKVM_PAGE_OWNED);
+ break;
+ case PKVM_ID_HYP:
+ ret = host_stage2_set_owner_metadata_locked(addr, size,
+ owner_id, 0);
+ break;
+ }
+
+ return ret;
+}
+
+#define KVM_HOST_PTE_OWNER_GUEST_HANDLE_MASK GENMASK(15, 0)
+/* We need 40 bits for the GFN to cover a 52-bit IPA with 4k pages and LPA2 */
+#define KVM_HOST_PTE_OWNER_GUEST_GFN_MASK GENMASK(55, 16)
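+/*
+ * Metadata for pages donated to a guest: the VM handle occupies the low
+ * 16 bits and the GFN the next 40, enough to point back into a 52-bit
+ * IPA space.
+ */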
+static u64 host_stage2_encode_gfn_meta(struct pkvm_hyp_vm *vm, u64 gfn)
+{
+ pkvm_handle_t handle = vm->kvm.arch.pkvm.handle;
+
+ BUILD_BUG_ON((pkvm_handle_t)-1 > KVM_HOST_PTE_OWNER_GUEST_HANDLE_MASK);
+ WARN_ON(!FIELD_FIT(KVM_HOST_PTE_OWNER_GUEST_GFN_MASK, gfn));
+
+ return FIELD_PREP(KVM_HOST_PTE_OWNER_GUEST_HANDLE_MASK, handle) |
+ FIELD_PREP(KVM_HOST_PTE_OWNER_GUEST_GFN_MASK, gfn);
+}
+
+static int host_stage2_decode_gfn_meta(kvm_pte_t pte, struct pkvm_hyp_vm **vm,
+ u64 *gfn)
+{
+ pkvm_handle_t handle;
+ u64 meta;
+
+ if (WARN_ON(kvm_pte_valid(pte)))
+ return -EINVAL;
+
+ if (FIELD_GET(KVM_INVALID_PTE_TYPE_MASK, pte) !=
+ KVM_HOST_INVALID_PTE_TYPE_DONATION) {
+ return -EINVAL;
+ }
+
+ if (FIELD_GET(KVM_HOST_DONATION_PTE_OWNER_MASK, pte) != PKVM_ID_GUEST)
+ return -EPERM;
+
+ meta = FIELD_GET(KVM_HOST_DONATION_PTE_EXTRA_MASK, pte);
+ handle = FIELD_GET(KVM_HOST_PTE_OWNER_GUEST_HANDLE_MASK, meta);
+ *vm = get_vm_by_handle(handle);
+ if (!*vm) {
+ /* We probably raced with teardown; try again */
+ return -EAGAIN;
+ }
+
+ *gfn = FIELD_GET(KVM_HOST_PTE_OWNER_GUEST_GFN_MASK, meta);
return 0;
}
@@ -605,11 +688,43 @@ unlock:
return ret;
}
+static void host_inject_mem_abort(struct kvm_cpu_context *host_ctxt)
+{
+ u64 ec, esr, spsr;
+
+ esr = read_sysreg_el2(SYS_ESR);
+ spsr = read_sysreg_el2(SYS_SPSR);
+
+ /* Repaint the ESR to report a same-level fault if taken from EL1 */
+ if ((spsr & PSR_MODE_MASK) != PSR_MODE_EL0t) {
+ ec = ESR_ELx_EC(esr);
+ if (ec == ESR_ELx_EC_DABT_LOW)
+ ec = ESR_ELx_EC_DABT_CUR;
+ else if (ec == ESR_ELx_EC_IABT_LOW)
+ ec = ESR_ELx_EC_IABT_CUR;
+ else
+ WARN_ON(1);
+ esr &= ~ESR_ELx_EC_MASK;
+ esr |= ec << ESR_ELx_EC_SHIFT;
+ }
+
+ /*
+ * Since S1PTW should only ever be set for stage-2 faults, we're pretty
+ * much guaranteed that it won't be set in ESR_EL1 by the hardware. So,
+ * let's use that bit to allow the host abort handler to differentiate
+ * this abort from normal userspace faults.
+ *
+ * Note: although S1PTW is RES0 at EL1, the architecture guarantees the
+ * bit to be backed by flip-flops (i.e. real, writable state), so it is
+ * safe to use.
+ */
+ esr |= ESR_ELx_S1PTW;
+ inject_host_exception(esr);
+}
+
void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
{
struct kvm_vcpu_fault_info fault;
u64 esr, addr;
- int ret = 0;
esr = read_sysreg_el2(SYS_ESR);
if (!__get_fault_info(esr, &fault)) {
@@ -628,8 +743,16 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
BUG_ON(!(fault.hpfar_el2 & HPFAR_EL2_NS));
addr = FIELD_GET(HPFAR_EL2_FIPA, fault.hpfar_el2) << 12;
- ret = host_stage2_idmap(addr);
- BUG_ON(ret && ret != -EAGAIN);
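+ /*
+ * -EPERM: the host touched memory it doesn't own, so forward a
+ * synchronous abort. -EEXIST: another CPU beat us to the mapping,
+ * which is as good as success.
+ */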
+ switch (host_stage2_idmap(addr)) {
+ case -EPERM:
+ host_inject_mem_abort(host_ctxt);
+ fallthrough;
+ case -EEXIST:
+ case 0:
+ break;
+ default:
+ BUG();
+ }
}
struct check_walk_data {
@@ -707,8 +830,20 @@ static int __hyp_check_page_state_range(phys_addr_t phys, u64 size, enum pkvm_pa
return 0;
}
+static bool guest_pte_is_poisoned(kvm_pte_t pte)
+{
+ if (kvm_pte_valid(pte))
+ return false;
+
+ return FIELD_GET(KVM_INVALID_PTE_TYPE_MASK, pte) ==
+ KVM_GUEST_INVALID_PTE_TYPE_POISONED;
+}
+
static enum pkvm_page_state guest_get_page_state(kvm_pte_t pte, u64 addr)
{
+ if (guest_pte_is_poisoned(pte))
+ return PKVM_POISON;
+
if (!kvm_pte_valid(pte))
return PKVM_NOPAGE;
@@ -727,6 +862,77 @@ static int __guest_check_page_state_range(struct pkvm_hyp_vm *vm, u64 addr,
return check_page_state_range(&vm->pgt, addr, size, &d);
}
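+
+/*
+ * Fetch the leaf PTE for @ipa and check that it is something we can
+ * work with: poisoned, unmapped and block mappings each fail with a
+ * distinct error code so callers can tell the cases apart.
+ */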
+static int get_valid_guest_pte(struct pkvm_hyp_vm *vm, u64 ipa, kvm_pte_t *ptep, u64 *physp)
+{
+ kvm_pte_t pte;
+ u64 phys;
+ s8 level;
+ int ret;
+
+ ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level);
+ if (ret)
+ return ret;
+ if (guest_pte_is_poisoned(pte))
+ return -EHWPOISON;
+ if (!kvm_pte_valid(pte))
+ return -ENOENT;
+ if (level != KVM_PGTABLE_LAST_LEVEL)
+ return -E2BIG;
+
+ phys = kvm_pte_to_phys(pte);
+ ret = check_range_allowed_memory(phys, phys + PAGE_SIZE);
+ if (WARN_ON(ret))
+ return ret;
+
+ *ptep = pte;
+ *physp = phys;
+
+ return 0;
+}
+
+int __pkvm_vcpu_in_poison_fault(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+ struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu);
+ kvm_pte_t pte;
+ s8 level;
+ u64 ipa;
+ int ret;
+
+ switch (kvm_vcpu_trap_get_class(&hyp_vcpu->vcpu)) {
+ case ESR_ELx_EC_DABT_LOW:
+ case ESR_ELx_EC_IABT_LOW:
+ if (kvm_vcpu_trap_is_translation_fault(&hyp_vcpu->vcpu))
+ break;
+ fallthrough;
+ default:
+ return -EINVAL;
+ }
+
+ /*
+ * The host already knows the faulting IPA when it calls us from the
+ * guest fault handler, but we recompute it from the fault registers
+ * ourselves so that this hypercall cannot serve as an "oracle" for
+ * probing arbitrary IPAs and inferring the guest's data access
+ * patterns after the initial donation of its pages.
+ */
+ ipa = kvm_vcpu_get_fault_ipa(&hyp_vcpu->vcpu);
+ ipa |= FAR_TO_FIPA_OFFSET(kvm_vcpu_get_hfar(&hyp_vcpu->vcpu));
+
+ guest_lock_component(vm);
+ ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level);
+ if (ret)
+ goto unlock;
+
+ if (level != KVM_PGTABLE_LAST_LEVEL) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ ret = guest_pte_is_poisoned(pte);
+unlock:
+ guest_unlock_component(vm);
+ return ret;
+}
+
int __pkvm_host_share_hyp(u64 pfn)
{
u64 phys = hyp_pfn_to_phys(pfn);
@@ -753,6 +959,72 @@ unlock:
return ret;
}
+int __pkvm_guest_share_host(struct pkvm_hyp_vcpu *vcpu, u64 gfn)
+{
+ struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
+ u64 phys, ipa = hyp_pfn_to_phys(gfn);
+ kvm_pte_t pte;
+ int ret;
+
+ host_lock_component();
+ guest_lock_component(vm);
+
+ ret = get_valid_guest_pte(vm, ipa, &pte, &phys);
+ if (ret)
+ goto unlock;
+
+ ret = -EPERM;
+ if (pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)) != PKVM_PAGE_OWNED)
+ goto unlock;
+ if (__host_check_page_state_range(phys, PAGE_SIZE, PKVM_NOPAGE))
+ goto unlock;
+
+ ret = 0;
+ WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, PAGE_SIZE, phys,
+ pkvm_mkstate(KVM_PGTABLE_PROT_RWX, PKVM_PAGE_SHARED_OWNED),
+ &vcpu->vcpu.arch.pkvm_memcache, 0));
+ WARN_ON(__host_set_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_BORROWED));
+unlock:
+ guest_unlock_component(vm);
+ host_unlock_component();
+
+ return ret;
+}
+
+int __pkvm_guest_unshare_host(struct pkvm_hyp_vcpu *vcpu, u64 gfn)
+{
+ struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
+ u64 meta, phys, ipa = hyp_pfn_to_phys(gfn);
+ kvm_pte_t pte;
+ int ret;
+
+ host_lock_component();
+ guest_lock_component(vm);
+
+ ret = get_valid_guest_pte(vm, ipa, &pte, &phys);
+ if (ret)
+ goto unlock;
+
+ ret = -EPERM;
+ if (pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)) != PKVM_PAGE_SHARED_OWNED)
+ goto unlock;
+ if (__host_check_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_BORROWED))
+ goto unlock;
+
+ ret = 0;
+ meta = host_stage2_encode_gfn_meta(vm, gfn);
+ WARN_ON(host_stage2_set_owner_metadata_locked(phys, PAGE_SIZE,
+ PKVM_ID_GUEST, meta));
+ WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, PAGE_SIZE, phys,
+ pkvm_mkstate(KVM_PGTABLE_PROT_RWX, PKVM_PAGE_OWNED),
+ &vcpu->vcpu.arch.pkvm_memcache, 0));
+unlock:
+ guest_unlock_component(vm);
+ host_unlock_component();
+
+ return ret;
+}
+
int __pkvm_host_unshare_hyp(u64 pfn)
{
u64 phys = hyp_pfn_to_phys(pfn);
@@ -960,6 +1232,176 @@ static int __guest_check_transition_size(u64 phys, u64 ipa, u64 nr_pages, u64 *s
return 0;
}
+static void hyp_poison_page(phys_addr_t phys)
+{
+ void *addr = hyp_fixmap_map(phys);
+
+ memset(addr, 0, PAGE_SIZE);
+ /*
+ * Prefer kvm_flush_dcache_to_poc() over __clean_dcache_guest_page()
+ * here as the latter may elide the CMO under the assumption that FWB
+ * will be enabled on CPUs that support it. This is incorrect for the
+ * host stage-2 and would otherwise lead to a malicious host potentially
+ * being able to read the contents of newly reclaimed guest pages.
+ */
+ kvm_flush_dcache_to_poc(addr, PAGE_SIZE);
+ hyp_fixmap_unmap();
+}
+
+static int host_stage2_get_guest_info(phys_addr_t phys, struct pkvm_hyp_vm **vm,
+ u64 *gfn)
+{
+ enum pkvm_page_state state;
+ kvm_pte_t pte;
+ s8 level;
+ int ret;
+
+ if (!addr_is_memory(phys))
+ return -EFAULT;
+
+ state = get_host_state(hyp_phys_to_page(phys));
+ switch (state) {
+ case PKVM_PAGE_OWNED:
+ case PKVM_PAGE_SHARED_OWNED:
+ case PKVM_PAGE_SHARED_BORROWED:
+ /* The access should no longer fault; try again. */
+ return -EAGAIN;
+ case PKVM_NOPAGE:
+ break;
+ default:
+ return -EPERM;
+ }
+
+ ret = kvm_pgtable_get_leaf(&host_mmu.pgt, phys, &pte, &level);
+ if (ret)
+ return ret;
+
+ if (WARN_ON(level != KVM_PGTABLE_LAST_LEVEL))
+ return -EINVAL;
+
+ return host_stage2_decode_gfn_meta(pte, vm, gfn);
+}
+
+int __pkvm_host_force_reclaim_page_guest(phys_addr_t phys)
+{
+ struct pkvm_hyp_vm *vm;
+ u64 gfn, ipa, pa;
+ kvm_pte_t pte;
+ int ret;
+
+ phys &= PAGE_MASK;
+
+ hyp_spin_lock(&vm_table_lock);
+ host_lock_component();
+
+ ret = host_stage2_get_guest_info(phys, &vm, &gfn);
+ if (ret)
+ goto unlock_host;
+
+ ipa = hyp_pfn_to_phys(gfn);
+ guest_lock_component(vm);
+ ret = get_valid_guest_pte(vm, ipa, &pte, &pa);
+ if (ret)
+ goto unlock_guest;
+
+ WARN_ON(pa != phys);
+ if (guest_get_page_state(pte, ipa) != PKVM_PAGE_OWNED) {
+ ret = -EPERM;
+ goto unlock_guest;
+ }
+
+ /* We really shouldn't be allocating, so don't pass a memcache */
+ ret = kvm_pgtable_stage2_annotate(&vm->pgt, ipa, PAGE_SIZE, NULL,
+ KVM_GUEST_INVALID_PTE_TYPE_POISONED,
+ 0);
+ if (ret)
+ goto unlock_guest;
+
+ hyp_poison_page(phys);
+ WARN_ON(host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HOST));
+unlock_guest:
+ guest_unlock_component(vm);
+unlock_host:
+ host_unlock_component();
+ hyp_spin_unlock(&vm_table_lock);
+
+ return ret;
+}
+
+int __pkvm_host_reclaim_page_guest(u64 gfn, struct pkvm_hyp_vm *vm)
+{
+ u64 ipa = hyp_pfn_to_phys(gfn);
+ kvm_pte_t pte;
+ u64 phys;
+ int ret;
+
+ host_lock_component();
+ guest_lock_component(vm);
+
+ ret = get_valid_guest_pte(vm, ipa, &pte, &phys);
+ if (ret)
+ goto unlock;
+
+ switch (guest_get_page_state(pte, ipa)) {
+ case PKVM_PAGE_OWNED:
+ WARN_ON(__host_check_page_state_range(phys, PAGE_SIZE, PKVM_NOPAGE));
+ hyp_poison_page(phys);
+ break;
+ case PKVM_PAGE_SHARED_OWNED:
+ WARN_ON(__host_check_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_BORROWED));
+ break;
+ default:
+ ret = -EPERM;
+ goto unlock;
+ }
+
+ WARN_ON(kvm_pgtable_stage2_unmap(&vm->pgt, ipa, PAGE_SIZE));
+ WARN_ON(host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HOST));
+
+unlock:
+ guest_unlock_component(vm);
+ host_unlock_component();
+
+ /*
+ * -EHWPOISON implies that the page has already been forcefully
+ * reclaimed, so report success so that the caller drops its GUP pin.
+ */
+ return ret && ret != -EHWPOISON ? ret : 0;
+}
+
+int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu)
+{
+ struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
+ u64 phys = hyp_pfn_to_phys(pfn);
+ u64 ipa = hyp_pfn_to_phys(gfn);
+ u64 meta;
+ int ret;
+
+ host_lock_component();
+ guest_lock_component(vm);
+
+ ret = __host_check_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_OWNED);
+ if (ret)
+ goto unlock;
+
+ ret = __guest_check_page_state_range(vm, ipa, PAGE_SIZE, PKVM_NOPAGE);
+ if (ret)
+ goto unlock;
+
+ meta = host_stage2_encode_gfn_meta(vm, gfn);
+ WARN_ON(host_stage2_set_owner_metadata_locked(phys, PAGE_SIZE,
+ PKVM_ID_GUEST, meta));
+ WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, PAGE_SIZE, phys,
+ pkvm_mkstate(KVM_PGTABLE_PROT_RWX, PKVM_PAGE_OWNED),
+ &vcpu->vcpu.arch.pkvm_memcache, 0));
+
+unlock:
+ guest_unlock_component(vm);
+ host_unlock_component();
+
+ return ret;
+}
+
int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu *vcpu,
enum kvm_pgtable_prot prot)
{
@@ -1206,53 +1648,18 @@ struct pkvm_expected_state {
static struct pkvm_expected_state selftest_state;
static struct hyp_page *selftest_page;
-
-static struct pkvm_hyp_vm selftest_vm = {
- .kvm = {
- .arch = {
- .mmu = {
- .arch = &selftest_vm.kvm.arch,
- .pgt = &selftest_vm.pgt,
- },
- },
- },
-};
-
-static struct pkvm_hyp_vcpu selftest_vcpu = {
- .vcpu = {
- .arch = {
- .hw_mmu = &selftest_vm.kvm.arch.mmu,
- },
- .kvm = &selftest_vm.kvm,
- },
-};
-
-static void init_selftest_vm(void *virt)
-{
- struct hyp_page *p = hyp_virt_to_page(virt);
- int i;
-
- selftest_vm.kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr;
- WARN_ON(kvm_guest_prepare_stage2(&selftest_vm, virt));
-
- for (i = 0; i < pkvm_selftest_pages(); i++) {
- if (p[i].refcount)
- continue;
- p[i].refcount = 1;
- hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i]));
- }
-}
+static struct pkvm_hyp_vcpu *selftest_vcpu;
static u64 selftest_ipa(void)
{
- return BIT(selftest_vm.pgt.ia_bits - 1);
+ return BIT(selftest_vcpu->vcpu.arch.hw_mmu->pgt->ia_bits - 1);
}
static void assert_page_state(void)
{
void *virt = hyp_page_to_virt(selftest_page);
u64 size = PAGE_SIZE << selftest_page->order;
- struct pkvm_hyp_vcpu *vcpu = &selftest_vcpu;
+ struct pkvm_hyp_vcpu *vcpu = selftest_vcpu;
u64 phys = hyp_virt_to_phys(virt);
u64 ipa[2] = { selftest_ipa(), selftest_ipa() + PAGE_SIZE };
struct pkvm_hyp_vm *vm;
@@ -1267,10 +1674,10 @@ static void assert_page_state(void)
WARN_ON(__hyp_check_page_state_range(phys, size, selftest_state.hyp));
hyp_unlock_component();
- guest_lock_component(&selftest_vm);
+ guest_lock_component(vm);
WARN_ON(__guest_check_page_state_range(vm, ipa[0], size, selftest_state.guest[0]));
WARN_ON(__guest_check_page_state_range(vm, ipa[1], size, selftest_state.guest[1]));
- guest_unlock_component(&selftest_vm);
+ guest_unlock_component(vm);
}
#define assert_transition_res(res, fn, ...) \
@@ -1283,14 +1690,15 @@ void pkvm_ownership_selftest(void *base)
{
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_RWX;
void *virt = hyp_alloc_pages(&host_s2_pool, 0);
- struct pkvm_hyp_vcpu *vcpu = &selftest_vcpu;
- struct pkvm_hyp_vm *vm = &selftest_vm;
+ struct pkvm_hyp_vcpu *vcpu;
u64 phys, size, pfn, gfn;
+ struct pkvm_hyp_vm *vm;
WARN_ON(!virt);
selftest_page = hyp_virt_to_page(virt);
selftest_page->refcount = 0;
- init_selftest_vm(base);
+ selftest_vcpu = vcpu = init_selftest_vm(base);
+ vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
size = PAGE_SIZE << selftest_page->order;
phys = hyp_virt_to_phys(virt);
@@ -1309,6 +1717,7 @@ void pkvm_ownership_selftest(void *base)
assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size);
assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
selftest_state.host = PKVM_PAGE_OWNED;
selftest_state.hyp = PKVM_NOPAGE;
@@ -1328,6 +1737,7 @@ void pkvm_ownership_selftest(void *base)
assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
assert_transition_res(0, hyp_pin_shared_mem, virt, virt + size);
assert_transition_res(0, hyp_pin_shared_mem, virt, virt + size);
@@ -1340,6 +1750,7 @@ void pkvm_ownership_selftest(void *base)
assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
hyp_unpin_shared_mem(virt, virt + size);
assert_page_state();
@@ -1359,6 +1770,7 @@ void pkvm_ownership_selftest(void *base)
assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size);
selftest_state.host = PKVM_PAGE_OWNED;
@@ -1375,6 +1787,7 @@ void pkvm_ownership_selftest(void *base)
assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn);
assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size);
selftest_state.guest[1] = PKVM_PAGE_SHARED_BORROWED;
@@ -1389,9 +1802,69 @@ void pkvm_ownership_selftest(void *base)
assert_transition_res(0, __pkvm_host_unshare_guest, gfn + 1, 1, vm);
selftest_state.host = PKVM_NOPAGE;
+ selftest_state.guest[0] = PKVM_PAGE_OWNED;
+ assert_transition_res(0, __pkvm_host_donate_guest, pfn, gfn, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn + 1, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn + 1, 1, vcpu, prot);
+ assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
+
+ selftest_state.host = PKVM_PAGE_SHARED_BORROWED;
+ selftest_state.guest[0] = PKVM_PAGE_SHARED_OWNED;
+ assert_transition_res(0, __pkvm_guest_share_host, vcpu, gfn);
+ assert_transition_res(-EPERM, __pkvm_guest_share_host, vcpu, gfn);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn + 1, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn + 1, 1, vcpu, prot);
+ assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
+
+ selftest_state.host = PKVM_NOPAGE;
+ selftest_state.guest[0] = PKVM_PAGE_OWNED;
+ assert_transition_res(0, __pkvm_guest_unshare_host, vcpu, gfn);
+ assert_transition_res(-EPERM, __pkvm_guest_unshare_host, vcpu, gfn);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn + 1, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn + 1, 1, vcpu, prot);
+ assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
+
+ selftest_state.host = PKVM_PAGE_OWNED;
+ selftest_state.guest[0] = PKVM_POISON;
+ assert_transition_res(0, __pkvm_host_force_reclaim_page_guest, phys);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-EHWPOISON, __pkvm_guest_share_host, vcpu, gfn);
+ assert_transition_res(-EHWPOISON, __pkvm_guest_unshare_host, vcpu, gfn);
+
+ selftest_state.host = PKVM_NOPAGE;
+ selftest_state.guest[1] = PKVM_PAGE_OWNED;
+ assert_transition_res(0, __pkvm_host_donate_guest, pfn, gfn + 1, vcpu);
+
+ selftest_state.host = PKVM_PAGE_OWNED;
+ selftest_state.guest[1] = PKVM_NOPAGE;
+ assert_transition_res(0, __pkvm_host_reclaim_page_guest, gfn + 1, vm);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+
+ selftest_state.host = PKVM_NOPAGE;
selftest_state.hyp = PKVM_PAGE_OWNED;
assert_transition_res(0, __pkvm_host_donate_hyp, pfn, 1);
+ teardown_selftest_vm();
selftest_page->refcount = 1;
hyp_put_page(&host_s2_pool, virt);
}
diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c
index 218976287d3f..c98cec0c150f 100644
--- a/arch/arm64/kvm/hyp/nvhe/mm.c
+++ b/arch/arm64/kvm/hyp/nvhe/mm.c
@@ -244,7 +244,7 @@ static void *fixmap_map_slot(struct hyp_fixmap_slot *slot, phys_addr_t phys)
void *hyp_fixmap_map(phys_addr_t phys)
{
- return fixmap_map_slot(this_cpu_ptr(&fixmap_slots), phys);
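+ /* Preserve the sub-page offset of @phys in the returned VA */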
+ return fixmap_map_slot(this_cpu_ptr(&fixmap_slots), phys) + offset_in_page(phys);
}
static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
@@ -366,7 +366,7 @@ void *hyp_fixblock_map(phys_addr_t phys, size_t *size)
#ifdef HAS_FIXBLOCK
*size = PMD_SIZE;
hyp_spin_lock(&hyp_fixblock_lock);
- return fixmap_map_slot(&hyp_fixblock_slot, phys);
+ return fixmap_map_slot(&hyp_fixblock_slot, phys) + offset_in_page(phys);
#else
*size = PAGE_SIZE;
return hyp_fixmap_map(phys);
diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c
index 2f029bfe4755..7ed96d64d611 100644
--- a/arch/arm64/kvm/hyp/nvhe/pkvm.c
+++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c
@@ -4,6 +4,8 @@
* Author: Fuad Tabba <tabba@google.com>
*/
+#include <kvm/arm_hypercalls.h>
+
#include <linux/kvm_host.h>
#include <linux/mm.h>
@@ -222,6 +224,7 @@ static struct pkvm_hyp_vm **vm_table;
void pkvm_hyp_vm_table_init(void *tbl)
{
+ BUILD_BUG_ON((u64)HANDLE_OFFSET + KVM_MAX_PVMS > (pkvm_handle_t)-1);
WARN_ON(vm_table);
vm_table = tbl;
}
@@ -229,10 +232,12 @@ void pkvm_hyp_vm_table_init(void *tbl)
/*
* Return the hyp vm structure corresponding to the handle.
*/
-static struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle)
+struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle)
{
unsigned int idx = vm_handle_to_idx(handle);
+ hyp_assert_lock_held(&vm_table_lock);
+
if (unlikely(idx >= KVM_MAX_PVMS))
return NULL;
@@ -255,7 +260,10 @@ struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle,
hyp_spin_lock(&vm_table_lock);
hyp_vm = get_vm_by_handle(handle);
- if (!hyp_vm || hyp_vm->kvm.created_vcpus <= vcpu_idx)
+ if (!hyp_vm || hyp_vm->kvm.arch.pkvm.is_dying)
+ goto unlock;
+
+ if (hyp_vm->kvm.created_vcpus <= vcpu_idx)
goto unlock;
hyp_vcpu = hyp_vm->vcpus[vcpu_idx];
@@ -719,6 +727,55 @@ void __pkvm_unreserve_vm(pkvm_handle_t handle)
hyp_spin_unlock(&vm_table_lock);
}
+#ifdef CONFIG_NVHE_EL2_DEBUG
+static struct pkvm_hyp_vm selftest_vm = {
+ .kvm = {
+ .arch = {
+ .mmu = {
+ .arch = &selftest_vm.kvm.arch,
+ .pgt = &selftest_vm.pgt,
+ },
+ },
+ },
+};
+
+static struct pkvm_hyp_vcpu selftest_vcpu = {
+ .vcpu = {
+ .arch = {
+ .hw_mmu = &selftest_vm.kvm.arch.mmu,
+ },
+ .kvm = &selftest_vm.kvm,
+ },
+};
+
+struct pkvm_hyp_vcpu *init_selftest_vm(void *virt)
+{
+ struct hyp_page *p = hyp_virt_to_page(virt);
+ int i;
+
+ selftest_vm.kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr;
+ WARN_ON(kvm_guest_prepare_stage2(&selftest_vm, virt));
+
+ for (i = 0; i < pkvm_selftest_pages(); i++) {
+ if (p[i].refcount)
+ continue;
+ p[i].refcount = 1;
+ hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i]));
+ }
+
+ selftest_vm.kvm.arch.pkvm.handle = __pkvm_reserve_vm();
+ insert_vm_table_entry(selftest_vm.kvm.arch.pkvm.handle, &selftest_vm);
+ return &selftest_vcpu;
+}
+
+void teardown_selftest_vm(void)
+{
+ hyp_spin_lock(&vm_table_lock);
+ remove_vm_table_entry(selftest_vm.kvm.arch.pkvm.handle);
+ hyp_spin_unlock(&vm_table_lock);
+}
+#endif /* CONFIG_NVHE_EL2_DEBUG */
+
/*
* Initialize the hypervisor copy of the VM state using host-donated memory.
*
@@ -859,7 +916,54 @@ teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size)
unmap_donated_memory_noclear(addr, size);
}
-int __pkvm_teardown_vm(pkvm_handle_t handle)
+int __pkvm_reclaim_dying_guest_page(pkvm_handle_t handle, u64 gfn)
+{
+ struct pkvm_hyp_vm *hyp_vm = get_pkvm_hyp_vm(handle);
+ int ret = -EINVAL;
+
+ if (!hyp_vm)
+ return ret;
+
+ if (hyp_vm->kvm.arch.pkvm.is_dying)
+ ret = __pkvm_host_reclaim_page_guest(gfn, hyp_vm);
+
+ put_pkvm_hyp_vm(hyp_vm);
+ return ret;
+}
+
+static struct pkvm_hyp_vm *get_pkvm_unref_hyp_vm_locked(pkvm_handle_t handle)
+{
+ struct pkvm_hyp_vm *hyp_vm;
+
+ hyp_assert_lock_held(&vm_table_lock);
+
+ hyp_vm = get_vm_by_handle(handle);
+ if (!hyp_vm || hyp_page_count(hyp_vm))
+ return NULL;
+
+ return hyp_vm;
+}
+
+int __pkvm_start_teardown_vm(pkvm_handle_t handle)
+{
+ struct pkvm_hyp_vm *hyp_vm;
+ int ret = 0;
+
+ hyp_spin_lock(&vm_table_lock);
+ hyp_vm = get_pkvm_unref_hyp_vm_locked(handle);
+ if (!hyp_vm || hyp_vm->kvm.arch.pkvm.is_dying) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ hyp_vm->kvm.arch.pkvm.is_dying = true;
+unlock:
+ hyp_spin_unlock(&vm_table_lock);
+
+ return ret;
+}
+
+int __pkvm_finalize_teardown_vm(pkvm_handle_t handle)
{
struct kvm_hyp_memcache *mc, *stage2_mc;
struct pkvm_hyp_vm *hyp_vm;
@@ -869,14 +973,9 @@ int __pkvm_teardown_vm(pkvm_handle_t handle)
int err;
hyp_spin_lock(&vm_table_lock);
- hyp_vm = get_vm_by_handle(handle);
- if (!hyp_vm) {
- err = -ENOENT;
- goto err_unlock;
- }
-
- if (WARN_ON(hyp_page_count(hyp_vm))) {
- err = -EBUSY;
+ hyp_vm = get_pkvm_unref_hyp_vm_locked(handle);
+ if (!hyp_vm || !hyp_vm->kvm.arch.pkvm.is_dying) {
+ err = -EINVAL;
goto err_unlock;
}
@@ -922,3 +1021,121 @@ err_unlock:
hyp_spin_unlock(&vm_table_lock);
return err;
}
+
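+/*
+ * Make the exit look like a stage-2 data abort on @ipa so that the host
+ * maps the missing page, and rewind ELR so the guest retries the HVC
+ * once the mapping is in place.
+ */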
+static u64 __pkvm_memshare_page_req(struct kvm_vcpu *vcpu, u64 ipa)
+{
+ u64 elr;
+
+ /* Fake up a data abort (level 3 translation fault on write) */
+ vcpu->arch.fault.esr_el2 = (ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT) |
+ ESR_ELx_WNR | ESR_ELx_FSC_FAULT |
+ FIELD_PREP(ESR_ELx_FSC_LEVEL, 3);
+
+ /* HPFAR_EL2.FIPA holds IPA[51:12] at bits [43:4], hence the shift */
+ vcpu->arch.fault.hpfar_el2 = (HPFAR_EL2_NS | (ipa >> 8)) & HPFAR_MASK;
+
+ /* This is a virtual address. 0's good. Let's go with 0. */
+ vcpu->arch.fault.far_el2 = 0;
+
+ /* Rewind the ELR so we return to the HVC once the IPA is mapped */
+ elr = read_sysreg(elr_el2);
+ elr -= 4;
+ write_sysreg(elr, elr_el2);
+
+ return ARM_EXCEPTION_TRAP;
+}
+
+static bool pkvm_memshare_call(u64 *ret, struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+ u64 ipa = smccc_get_arg1(vcpu);
+
+ if (!PAGE_ALIGNED(ipa))
+ goto out_guest;
+
+ hyp_vcpu = container_of(vcpu, struct pkvm_hyp_vcpu, vcpu);
+ switch (__pkvm_guest_share_host(hyp_vcpu, hyp_phys_to_pfn(ipa))) {
+ case 0:
+ ret[0] = SMCCC_RET_SUCCESS;
+ goto out_guest;
+ case -ENOENT:
+ /*
+ * Convert the exception into a data abort so that the page
+ * being shared is mapped into the guest next time.
+ */
+ *exit_code = __pkvm_memshare_page_req(vcpu, ipa);
+ goto out_host;
+ }
+
+out_guest:
+ return true;
+out_host:
+ return false;
+}
+
+static void pkvm_memunshare_call(u64 *ret, struct kvm_vcpu *vcpu)
+{
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+ u64 ipa = smccc_get_arg1(vcpu);
+
+ if (!PAGE_ALIGNED(ipa))
+ return;
+
+ hyp_vcpu = container_of(vcpu, struct pkvm_hyp_vcpu, vcpu);
+ if (!__pkvm_guest_unshare_host(hyp_vcpu, hyp_phys_to_pfn(ipa)))
+ ret[0] = SMCCC_RET_SUCCESS;
+}
+
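+/*
+ * From a protected guest these are plain SMCCC vendor-hyp calls. A
+ * minimal guest-side sketch (illustration only, not part of this patch):
+ *
+ *	struct arm_smccc_res res;
+ *
+ *	arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID,
+ *			     ipa, 0, 0, &res);
+ *	WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
+ */
+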
+/*
+ * Handler for protected VM HVC calls.
+ *
+ * Returns true if the hypervisor has handled the exit (and control
+ * should return to the guest) or false if it hasn't (and the handling
+ * should be performed by the host).
+ */
+bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ u64 val[4] = { SMCCC_RET_INVALID_PARAMETER };
+ bool handled = true;
+
+ switch (smccc_get_function(vcpu)) {
+ case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
+ val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES);
+ val[0] |= BIT(ARM_SMCCC_KVM_FUNC_HYP_MEMINFO);
+ val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_SHARE);
+ val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_UNSHARE);
+ break;
+ case ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID:
+ if (smccc_get_arg1(vcpu) ||
+ smccc_get_arg2(vcpu) ||
+ smccc_get_arg3(vcpu)) {
+ break;
+ }
+
+ val[0] = PAGE_SIZE;
+ break;
+ case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID:
+ if (smccc_get_arg2(vcpu) ||
+ smccc_get_arg3(vcpu)) {
+ break;
+ }
+
+ handled = pkvm_memshare_call(val, vcpu, exit_code);
+ break;
+ case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID:
+ if (smccc_get_arg2(vcpu) ||
+ smccc_get_arg3(vcpu)) {
+ break;
+ }
+
+ pkvm_memunshare_call(val, vcpu);
+ break;
+ default:
+ /* Punt everything else back to the host, for now. */
+ handled = false;
+ }
+
+ if (handled)
+ smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]);
+ return handled;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
index c3e196fb8b18..e20db999e328 100644
--- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c
+++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
@@ -6,11 +6,12 @@
#include <asm/kvm_asm.h>
#include <asm/kvm_hyp.h>
+#include <asm/kvm_hypevents.h>
#include <asm/kvm_mmu.h>
-#include <linux/arm-smccc.h>
#include <linux/kvm_host.h>
#include <uapi/linux/psci.h>
+#include <nvhe/arm-smccc.h>
#include <nvhe/memory.h>
#include <nvhe/trap_handler.h>
@@ -65,7 +66,7 @@ static unsigned long psci_call(unsigned long fn, unsigned long arg0,
{
struct arm_smccc_res res;
- arm_smccc_1_1_smc(fn, arg0, arg1, arg2, &res);
+ hyp_smccc_1_1_smc(fn, arg0, arg1, arg2, &res);
return res.a0;
}
@@ -200,30 +201,42 @@ static int psci_system_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt)
__hyp_pa(init_params), 0);
}
-asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on)
+static void __noreturn __kvm_host_psci_cpu_entry(unsigned long pc, unsigned long r0)
{
- struct psci_boot_args *boot_args;
- struct kvm_cpu_context *host_ctxt;
-
- host_ctxt = host_data_ptr(host_ctxt);
+ struct kvm_cpu_context *host_ctxt = host_data_ptr(host_ctxt);
- if (is_cpu_on)
- boot_args = this_cpu_ptr(&cpu_on_args);
- else
- boot_args = this_cpu_ptr(&suspend_args);
+ trace_hyp_enter(host_ctxt, HYP_REASON_PSCI);
- cpu_reg(host_ctxt, 0) = boot_args->r0;
- write_sysreg_el2(boot_args->pc, SYS_ELR);
-
- if (is_cpu_on)
- release_boot_args(boot_args);
+ cpu_reg(host_ctxt, 0) = r0;
+ write_sysreg_el2(pc, SYS_ELR);
write_sysreg_el1(INIT_SCTLR_EL1_MMU_OFF, SYS_SCTLR);
write_sysreg(INIT_PSTATE_EL1, SPSR_EL2);
+ trace_hyp_exit(host_ctxt, HYP_REASON_PSCI);
__host_enter(host_ctxt);
}
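+
+/*
+ * CPU_ON arguments are consumed exactly once, so snapshot them and
+ * release the slot before entering the host. Suspend arguments persist
+ * across suspend/resume cycles and can be read in place.
+ */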
+asmlinkage void __noreturn __kvm_host_psci_cpu_on_entry(void)
+{
+ struct psci_boot_args *boot_args = this_cpu_ptr(&cpu_on_args);
+ unsigned long pc, r0;
+
+ pc = READ_ONCE(boot_args->pc);
+ r0 = READ_ONCE(boot_args->r0);
+
+ release_boot_args(boot_args);
+
+ __kvm_host_psci_cpu_entry(pc, r0);
+}
+
+asmlinkage void __noreturn __kvm_host_psci_cpu_resume_entry(void)
+{
+ struct psci_boot_args *boot_args = this_cpu_ptr(&suspend_args);
+
+ __kvm_host_psci_cpu_entry(boot_args->pc, boot_args->r0);
+}
+
static unsigned long psci_0_1_handler(u64 func_id, struct kvm_cpu_context *host_ctxt)
{
if (is_psci_0_1(cpu_off, func_id) || is_psci_0_1(migrate, func_id))
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
index 90bd014e952f..d8e5b563fd3d 100644
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -341,8 +341,7 @@ out:
__host_enter(host_ctxt);
}
-int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
- unsigned long *per_cpu_base, u32 hyp_va_bits)
+int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long *per_cpu_base, u32 hyp_va_bits)
{
struct kvm_nvhe_init_params *params;
void *virt = hyp_phys_to_virt(phys);
@@ -355,7 +354,6 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
return -EINVAL;
hyp_spin_lock_init(&pkvm_pgd_lock);
- hyp_nr_cpus = nr_cpus;
ret = divide_memory_pool(virt, size);
if (ret)
diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c
index 5b6eeab1a774..7c832d60d22b 100644
--- a/arch/arm64/kvm/hyp/nvhe/stacktrace.c
+++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c
@@ -34,7 +34,7 @@ static void hyp_prepare_backtrace(unsigned long fp, unsigned long pc)
stacktrace_info->pc = pc;
}
-#ifdef CONFIG_PROTECTED_NVHE_STACKTRACE
+#ifdef CONFIG_PKVM_STACKTRACE
#include <asm/stacktrace/nvhe.h>
DEFINE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)], pkvm_stacktrace);
@@ -134,11 +134,11 @@ static void pkvm_save_backtrace(unsigned long fp, unsigned long pc)
unwind(&state, pkvm_save_backtrace_entry, &idx);
}
-#else /* !CONFIG_PROTECTED_NVHE_STACKTRACE */
+#else /* !CONFIG_PKVM_STACKTRACE */
static void pkvm_save_backtrace(unsigned long fp, unsigned long pc)
{
}
-#endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */
+#endif /* CONFIG_PKVM_STACKTRACE */
/*
* kvm_nvhe_prepare_backtrace - prepare to dump the nVHE backtrace
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 779089e42681..8d1df3d33595 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -7,7 +7,6 @@
#include <hyp/switch.h>
#include <hyp/sysreg-sr.h>
-#include <linux/arm-smccc.h>
#include <linux/kvm_host.h>
#include <linux/types.h>
#include <linux/jump_label.h>
@@ -21,6 +20,7 @@
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
+#include <asm/kvm_hypevents.h>
#include <asm/kvm_mmu.h>
#include <asm/fpsimd.h>
#include <asm/debug-monitors.h>
@@ -44,6 +44,9 @@ struct fgt_masks hfgwtr2_masks;
struct fgt_masks hfgitr2_masks;
struct fgt_masks hdfgrtr2_masks;
struct fgt_masks hdfgwtr2_masks;
+struct fgt_masks ich_hfgrtr_masks;
+struct fgt_masks ich_hfgwtr_masks;
+struct fgt_masks ich_hfgitr_masks;
extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc);
@@ -110,6 +113,12 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
/* Save VGICv3 state on non-VHE systems */
static void __hyp_vgic_save_state(struct kvm_vcpu *vcpu)
{
+ if (vgic_is_v5(kern_hyp_va(vcpu->kvm))) {
+ __vgic_v5_save_state(&vcpu->arch.vgic_cpu.vgic_v5);
+ __vgic_v5_save_ppi_state(&vcpu->arch.vgic_cpu.vgic_v5);
+ return;
+ }
+
if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) {
__vgic_v3_save_state(&vcpu->arch.vgic_cpu.vgic_v3);
__vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3);
@@ -119,6 +128,12 @@ static void __hyp_vgic_save_state(struct kvm_vcpu *vcpu)
/* Restore VGICv3 state on non-VHE systems */
static void __hyp_vgic_restore_state(struct kvm_vcpu *vcpu)
{
+ if (vgic_is_v5(kern_hyp_va(vcpu->kvm))) {
+ __vgic_v5_restore_state(&vcpu->arch.vgic_cpu.vgic_v5);
+ __vgic_v5_restore_ppi_state(&vcpu->arch.vgic_cpu.vgic_v5);
+ return;
+ }
+
if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) {
__vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3);
__vgic_v3_restore_state(&vcpu->arch.vgic_cpu.vgic_v3);
@@ -190,6 +205,7 @@ static const exit_handler_fn hyp_exit_handlers[] = {
static const exit_handler_fn pvm_exit_handlers[] = {
[0 ... ESR_ELx_EC_MAX] = NULL,
+ [ESR_ELx_EC_HVC64] = kvm_handle_pvm_hvc64,
[ESR_ELx_EC_SYS64] = kvm_handle_pvm_sys64,
[ESR_ELx_EC_SVE] = kvm_handle_pvm_restricted,
[ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd,
@@ -278,7 +294,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
* We're about to restore some new MMU state. Make sure
* ongoing page-table walks that have started before we
* trapped to EL2 have completed. This also synchronises the
- * above disabling of BRBE, SPE and TRBE.
+ * above disabling of BRBE.
*
* See DDI0487I.a D8.1.5 "Out-of-context translation regimes",
* rule R_LFHQG and subsequent information statements.
@@ -308,10 +324,13 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
__debug_switch_to_guest(vcpu);
do {
+ trace_hyp_exit(host_ctxt, HYP_REASON_ERET_GUEST);
+
/* Jump in the fire! */
exit_code = __guest_enter(vcpu);
/* And we're baaack! */
+ trace_hyp_enter(host_ctxt, HYP_REASON_GUEST_EXIT);
} while (fixup_guest_exit(vcpu, &exit_code));
__sysreg_save_state_nvhe(guest_ctxt);
diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
index 06d28621722e..8c3fbb413a06 100644
--- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c
+++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
@@ -20,6 +20,7 @@
*/
u64 id_aa64pfr0_el1_sys_val;
u64 id_aa64pfr1_el1_sys_val;
+u64 id_aa64pfr2_el1_sys_val;
u64 id_aa64isar0_el1_sys_val;
u64 id_aa64isar1_el1_sys_val;
u64 id_aa64isar2_el1_sys_val;
@@ -108,6 +109,11 @@ static const struct pvm_ftr_bits pvmid_aa64pfr1[] = {
FEAT_END
};
+static const struct pvm_ftr_bits pvmid_aa64pfr2[] = {
+ MAX_FEAT(ID_AA64PFR2_EL1, GCIE, NI),
+ FEAT_END
+};
+
static const struct pvm_ftr_bits pvmid_aa64mmfr0[] = {
MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, PARANGE, 40),
MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, ASIDBITS, 16),
@@ -221,6 +227,8 @@ static u64 pvm_calc_id_reg(const struct kvm_vcpu *vcpu, u32 id)
return get_restricted_features(vcpu, id_aa64pfr0_el1_sys_val, pvmid_aa64pfr0);
case SYS_ID_AA64PFR1_EL1:
return get_restricted_features(vcpu, id_aa64pfr1_el1_sys_val, pvmid_aa64pfr1);
+ case SYS_ID_AA64PFR2_EL1:
+ return get_restricted_features(vcpu, id_aa64pfr2_el1_sys_val, pvmid_aa64pfr2);
case SYS_ID_AA64ISAR0_EL1:
return id_aa64isar0_el1_sys_val;
case SYS_ID_AA64ISAR1_EL1:
@@ -392,6 +400,14 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
/* Cache maintenance by set/way operations are restricted. */
/* Debug and Trace Registers are restricted. */
+ RAZ_WI(SYS_DBGBVRn_EL1(0)),
+ RAZ_WI(SYS_DBGBCRn_EL1(0)),
+ RAZ_WI(SYS_DBGWVRn_EL1(0)),
+ RAZ_WI(SYS_DBGWCRn_EL1(0)),
+ RAZ_WI(SYS_MDSCR_EL1),
+ RAZ_WI(SYS_OSLAR_EL1),
+ RAZ_WI(SYS_OSLSR_EL1),
+ RAZ_WI(SYS_OSDLR_EL1),
/* Group 1 ID registers */
HOST_HANDLED(SYS_REVIDR_EL1),
@@ -431,7 +447,7 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
/* CRm=4 */
AARCH64(SYS_ID_AA64PFR0_EL1),
AARCH64(SYS_ID_AA64PFR1_EL1),
- ID_UNALLOCATED(4,2),
+ AARCH64(SYS_ID_AA64PFR2_EL1),
ID_UNALLOCATED(4,3),
AARCH64(SYS_ID_AA64ZFR0_EL1),
ID_UNALLOCATED(4,5),
diff --git a/arch/arm64/kvm/hyp/nvhe/trace.c b/arch/arm64/kvm/hyp/nvhe/trace.c
new file mode 100644
index 000000000000..a6ca27b18e15
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/trace.c
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Google LLC
+ * Author: Vincent Donnefort <vdonnefort@google.com>
+ */
+
+#include <nvhe/clock.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/mm.h>
+#include <nvhe/trace.h>
+
+#include <asm/percpu.h>
+#include <asm/kvm_mmu.h>
+#include <asm/local.h>
+
+#include "simple_ring_buffer.c"
+
+static DEFINE_PER_CPU(struct simple_rb_per_cpu, __simple_rbs);
+
+static struct hyp_trace_buffer {
+ struct simple_rb_per_cpu __percpu *simple_rbs;
+ void *bpages_backing_start;
+ size_t bpages_backing_size;
+ hyp_spinlock_t lock;
+} trace_buffer = {
+ .simple_rbs = &__simple_rbs,
+ .lock = __HYP_SPIN_LOCK_UNLOCKED,
+};
+
+static bool hyp_trace_buffer_loaded(struct hyp_trace_buffer *trace_buffer)
+{
+ return trace_buffer->bpages_backing_size > 0;
+}
+
+void *tracing_reserve_entry(unsigned long length)
+{
+ return simple_ring_buffer_reserve(this_cpu_ptr(trace_buffer.simple_rbs), length,
+ trace_clock());
+}
+
+void tracing_commit_entry(void)
+{
+ simple_ring_buffer_commit(this_cpu_ptr(trace_buffer.simple_rbs));
+}
+
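+/*
+ * When pKVM is enabled, tracing memory is donated to the hypervisor so
+ * that the host cannot touch it while it is in use; otherwise host
+ * memory is used in place.
+ */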
+static int __admit_host_mem(void *start, u64 size)
+{
+ if (!PAGE_ALIGNED(start) || !PAGE_ALIGNED(size) || !size)
+ return -EINVAL;
+
+ if (!is_protected_kvm_enabled())
+ return 0;
+
+ return __pkvm_host_donate_hyp(hyp_virt_to_pfn(start), size >> PAGE_SHIFT);
+}
+
+static void __release_host_mem(void *start, u64 size)
+{
+ if (!is_protected_kvm_enabled())
+ return;
+
+ WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(start), size >> PAGE_SHIFT));
+}
+
+static int hyp_trace_buffer_load_bpage_backing(struct hyp_trace_buffer *trace_buffer,
+ struct hyp_trace_desc *desc)
+{
+ void *start = (void *)kern_hyp_va(desc->bpages_backing_start);
+ size_t size = desc->bpages_backing_size;
+ int ret;
+
+ ret = __admit_host_mem(start, size);
+ if (ret)
+ return ret;
+
+ memset(start, 0, size);
+
+ trace_buffer->bpages_backing_start = start;
+ trace_buffer->bpages_backing_size = size;
+
+ return 0;
+}
+
+static void hyp_trace_buffer_unload_bpage_backing(struct hyp_trace_buffer *trace_buffer)
+{
+ void *start = trace_buffer->bpages_backing_start;
+ size_t size = trace_buffer->bpages_backing_size;
+
+ if (!size)
+ return;
+
+ memset(start, 0, size);
+
+ __release_host_mem(start, size);
+
+ trace_buffer->bpages_backing_start = 0;
+ trace_buffer->bpages_backing_size = 0;
+}
+
+static void *__pin_shared_page(unsigned long kern_va)
+{
+ void *va = kern_hyp_va((void *)kern_va);
+
+ if (!is_protected_kvm_enabled())
+ return va;
+
+ return hyp_pin_shared_mem(va, va + PAGE_SIZE) ? NULL : va;
+}
+
+static void __unpin_shared_page(void *va)
+{
+ if (!is_protected_kvm_enabled())
+ return;
+
+ hyp_unpin_shared_mem(va, va + PAGE_SIZE);
+}
+
+static void hyp_trace_buffer_unload(struct hyp_trace_buffer *trace_buffer)
+{
+ int cpu;
+
+ hyp_assert_lock_held(&trace_buffer->lock);
+
+ if (!hyp_trace_buffer_loaded(trace_buffer))
+ return;
+
+ for (cpu = 0; cpu < hyp_nr_cpus; cpu++)
+ simple_ring_buffer_unload_mm(per_cpu_ptr(trace_buffer->simple_rbs, cpu),
+ __unpin_shared_page);
+
+ hyp_trace_buffer_unload_bpage_backing(trace_buffer);
+}
+
+static int hyp_trace_buffer_load(struct hyp_trace_buffer *trace_buffer,
+ struct hyp_trace_desc *desc)
+{
+ struct simple_buffer_page *bpages;
+ struct ring_buffer_desc *rb_desc;
+ int ret, cpu;
+
+ hyp_assert_lock_held(&trace_buffer->lock);
+
+ if (hyp_trace_buffer_loaded(trace_buffer))
+ return -EINVAL;
+
+ ret = hyp_trace_buffer_load_bpage_backing(trace_buffer, desc);
+ if (ret)
+ return ret;
+
+ bpages = trace_buffer->bpages_backing_start;
+ for_each_ring_buffer_desc(rb_desc, cpu, &desc->trace_buffer_desc) {
+ ret = simple_ring_buffer_init_mm(per_cpu_ptr(trace_buffer->simple_rbs, cpu),
+ bpages, rb_desc, __pin_shared_page,
+ __unpin_shared_page);
+ if (ret)
+ break;
+
+ bpages += rb_desc->nr_page_va;
+ }
+
+ if (ret)
+ hyp_trace_buffer_unload(trace_buffer);
+
+ return ret;
+}
+
+static bool hyp_trace_desc_validate(struct hyp_trace_desc *desc, size_t desc_size)
+{
+ struct ring_buffer_desc *rb_desc;
+ unsigned int cpu;
+ size_t nr_bpages;
+ void *desc_end;
+
+ /*
+ * Both desc_size and bpages_backing_size are untrusted host-provided
+ * values. We rely on __pkvm_host_donate_hyp() to enforce their validity.
+ */
+ desc_end = (void *)desc + desc_size;
+ nr_bpages = desc->bpages_backing_size / sizeof(struct simple_buffer_page);
+
+ for_each_ring_buffer_desc(rb_desc, cpu, &desc->trace_buffer_desc) {
+ /* Can we read nr_page_va? */
+ if ((void *)rb_desc + struct_size(rb_desc, page_va, 0) > desc_end)
+ return false;
+
+ /* Overflow desc? */
+ if ((void *)rb_desc + struct_size(rb_desc, page_va, rb_desc->nr_page_va) > desc_end)
+ return false;
+
+ /* Overflow bpages backing memory? */
+ if (nr_bpages < rb_desc->nr_page_va)
+ return false;
+
+ if (cpu >= hyp_nr_cpus)
+ return false;
+
+ if (cpu != rb_desc->cpu)
+ return false;
+
+ nr_bpages -= rb_desc->nr_page_va;
+ }
+
+ return true;
+}
+
+int __tracing_load(unsigned long desc_hva, size_t desc_size)
+{
+ struct hyp_trace_desc *desc = (struct hyp_trace_desc *)kern_hyp_va(desc_hva);
+ int ret;
+
+ ret = __admit_host_mem(desc, desc_size);
+ if (ret)
+ return ret;
+
+ if (!hyp_trace_desc_validate(desc, desc_size)) {
+ ret = -EINVAL;
+ goto err_release_desc;
+ }
+
+ hyp_spin_lock(&trace_buffer.lock);
+
+ ret = hyp_trace_buffer_load(&trace_buffer, desc);
+
+ hyp_spin_unlock(&trace_buffer.lock);
+
+err_release_desc:
+ __release_host_mem(desc, desc_size);
+ return ret;
+}
+
+void __tracing_unload(void)
+{
+ hyp_spin_lock(&trace_buffer.lock);
+ hyp_trace_buffer_unload(&trace_buffer);
+ hyp_spin_unlock(&trace_buffer.lock);
+}
+
+int __tracing_enable(bool enable)
+{
+ int cpu, ret = enable ? -EINVAL : 0;
+
+ hyp_spin_lock(&trace_buffer.lock);
+
+ if (!hyp_trace_buffer_loaded(&trace_buffer))
+ goto unlock;
+
+ for (cpu = 0; cpu < hyp_nr_cpus; cpu++)
+ simple_ring_buffer_enable_tracing(per_cpu_ptr(trace_buffer.simple_rbs, cpu),
+ enable);
+
+ ret = 0;
+
+unlock:
+ hyp_spin_unlock(&trace_buffer.lock);
+
+ return ret;
+}
+
+int __tracing_swap_reader(unsigned int cpu)
+{
+ int ret = -ENODEV;
+
+ if (cpu >= hyp_nr_cpus)
+ return -EINVAL;
+
+ hyp_spin_lock(&trace_buffer.lock);
+
+ if (hyp_trace_buffer_loaded(&trace_buffer))
+ ret = simple_ring_buffer_swap_reader_page(
+ per_cpu_ptr(trace_buffer.simple_rbs, cpu));
+
+ hyp_spin_unlock(&trace_buffer.lock);
+
+ return ret;
+}
+
+void __tracing_update_clock(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc)
+{
+ int cpu;
+
+ /* Once no CPU is mid-write, all of them observe the current clock bank... */
+ for (cpu = 0; cpu < hyp_nr_cpus; cpu++) {
+ struct simple_rb_per_cpu *simple_rb = per_cpu_ptr(trace_buffer.simple_rbs, cpu);
+
+ while (READ_ONCE(simple_rb->status) == SIMPLE_RB_WRITING)
+ ;
+ }
+
+ /* ...so the spare bank can be overwritten and swapped in. */
+ trace_clock_update(mult, shift, epoch_ns, epoch_cyc);
+}
+
+int __tracing_reset(unsigned int cpu)
+{
+ int ret = -ENODEV;
+
+ if (cpu >= hyp_nr_cpus)
+ return -EINVAL;
+
+ hyp_spin_lock(&trace_buffer.lock);
+
+ if (hyp_trace_buffer_loaded(&trace_buffer))
+ ret = simple_ring_buffer_reset(per_cpu_ptr(trace_buffer.simple_rbs, cpu));
+
+ hyp_spin_unlock(&trace_buffer.lock);
+
+ return ret;
+}
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 9b480f947da2..84c7a1df845d 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -114,11 +114,6 @@ static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, s8 level)
return pte;
}
-static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id)
-{
- return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
-}
-
static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data,
const struct kvm_pgtable_visit_ctx *ctx,
enum kvm_pgtable_walk_flags visit)
@@ -581,7 +576,7 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
struct stage2_map_data {
const u64 phys;
kvm_pte_t attr;
- u8 owner_id;
+ kvm_pte_t pte_annot;
kvm_pte_t *anchor;
kvm_pte_t *childp;
@@ -798,7 +793,11 @@ static bool stage2_pte_is_counted(kvm_pte_t pte)
static bool stage2_pte_is_locked(kvm_pte_t pte)
{
- return !kvm_pte_valid(pte) && (pte & KVM_INVALID_PTE_LOCKED);
+ if (kvm_pte_valid(pte))
+ return false;
+
+ return FIELD_GET(KVM_INVALID_PTE_TYPE_MASK, pte) ==
+ KVM_INVALID_PTE_TYPE_LOCKED;
}
static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
@@ -829,6 +828,7 @@ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx,
struct kvm_s2_mmu *mmu)
{
struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
+ kvm_pte_t locked_pte;
if (stage2_pte_is_locked(ctx->old)) {
/*
@@ -839,7 +839,9 @@ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx,
return false;
}
- if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED))
+ locked_pte = FIELD_PREP(KVM_INVALID_PTE_TYPE_MASK,
+ KVM_INVALID_PTE_TYPE_LOCKED);
+ if (!stage2_try_set_pte(ctx, locked_pte))
return false;
if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) {
@@ -964,7 +966,7 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
if (!data->annotation)
new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
else
- new = kvm_init_invalid_leaf_owner(data->owner_id);
+ new = data->pte_annot;
/*
* Skip updating the PTE if we are trying to recreate the exact
@@ -1118,16 +1120,18 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
return ret;
}
-int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
- void *mc, u8 owner_id)
+int kvm_pgtable_stage2_annotate(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ void *mc, enum kvm_invalid_pte_type type,
+ kvm_pte_t pte_annot)
{
int ret;
struct stage2_map_data map_data = {
.mmu = pgt->mmu,
.memcache = mc,
- .owner_id = owner_id,
.force_pte = true,
.annotation = true,
+ .pte_annot = pte_annot |
+ FIELD_PREP(KVM_INVALID_PTE_TYPE_MASK, type),
};
struct kvm_pgtable_walker walker = {
.cb = stage2_map_walker,
@@ -1136,7 +1140,10 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
.arg = &map_data,
};
- if (owner_id > KVM_MAX_OWNER_ID)
+ if (pte_annot & ~KVM_INVALID_PTE_ANNOT_MASK)
+ return -EINVAL;
+
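+ /*
+ * Type 0 is indistinguishable from an empty PTE, and LOCKED is
+ * reserved for the walker's break-before-make machinery.
+ */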
+ if (!type || type == KVM_INVALID_PTE_TYPE_LOCKED)
return -EINVAL;
ret = kvm_pgtable_walk(pgt, addr, size, &walker);
diff --git a/arch/arm64/kvm/hyp/vgic-v5-sr.c b/arch/arm64/kvm/hyp/vgic-v5-sr.c
new file mode 100644
index 000000000000..47e6bcd43702
--- /dev/null
+++ b/arch/arm64/kvm/hyp/vgic-v5-sr.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025, 2026 - Arm Ltd
+ */
+
+#include <linux/irqchip/arm-gic-v5.h>
+
+#include <asm/kvm_hyp.h>
+
+void __vgic_v5_save_apr(struct vgic_v5_cpu_if *cpu_if)
+{
+ cpu_if->vgic_apr = read_sysreg_s(SYS_ICH_APR_EL2);
+}
+
+static void __vgic_v5_compat_mode_disable(void)
+{
+ sysreg_clear_set_s(SYS_ICH_VCTLR_EL2, ICH_VCTLR_EL2_V3, 0);
+ isb();
+}
+
+void __vgic_v5_restore_vmcr_apr(struct vgic_v5_cpu_if *cpu_if)
+{
+ __vgic_v5_compat_mode_disable();
+
+ write_sysreg_s(cpu_if->vgic_vmcr, SYS_ICH_VMCR_EL2);
+ write_sysreg_s(cpu_if->vgic_apr, SYS_ICH_APR_EL2);
+}
+
+void __vgic_v5_save_ppi_state(struct vgic_v5_cpu_if *cpu_if)
+{
+ /*
+ * The following code assumes that the bitmap storage we have for PPIs
+ * is either 64 bits (architected PPIs only) or 128 bits (architected &
+ * impdef PPIs).
+ */
+ BUILD_BUG_ON(VGIC_V5_NR_PRIVATE_IRQS % 64);
+
+ bitmap_write(host_data_ptr(vgic_v5_ppi_state)->activer_exit,
+ read_sysreg_s(SYS_ICH_PPI_ACTIVER0_EL2), 0, 64);
+ bitmap_write(host_data_ptr(vgic_v5_ppi_state)->pendr,
+ read_sysreg_s(SYS_ICH_PPI_PENDR0_EL2), 0, 64);
+
+ cpu_if->vgic_ppi_priorityr[0] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR0_EL2);
+ cpu_if->vgic_ppi_priorityr[1] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR1_EL2);
+ cpu_if->vgic_ppi_priorityr[2] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR2_EL2);
+ cpu_if->vgic_ppi_priorityr[3] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR3_EL2);
+ cpu_if->vgic_ppi_priorityr[4] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR4_EL2);
+ cpu_if->vgic_ppi_priorityr[5] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR5_EL2);
+ cpu_if->vgic_ppi_priorityr[6] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR6_EL2);
+ cpu_if->vgic_ppi_priorityr[7] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR7_EL2);
+
+ if (VGIC_V5_NR_PRIVATE_IRQS == 128) {
+ bitmap_write(host_data_ptr(vgic_v5_ppi_state)->activer_exit,
+ read_sysreg_s(SYS_ICH_PPI_ACTIVER1_EL2), 64, 64);
+ bitmap_write(host_data_ptr(vgic_v5_ppi_state)->pendr,
+ read_sysreg_s(SYS_ICH_PPI_PENDR1_EL2), 64, 64);
+
+ cpu_if->vgic_ppi_priorityr[8] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR8_EL2);
+ cpu_if->vgic_ppi_priorityr[9] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR9_EL2);
+ cpu_if->vgic_ppi_priorityr[10] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR10_EL2);
+ cpu_if->vgic_ppi_priorityr[11] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR11_EL2);
+ cpu_if->vgic_ppi_priorityr[12] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR12_EL2);
+ cpu_if->vgic_ppi_priorityr[13] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR13_EL2);
+ cpu_if->vgic_ppi_priorityr[14] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR14_EL2);
+ cpu_if->vgic_ppi_priorityr[15] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR15_EL2);
+ }
+
+ /* Now that we are done, disable DVI */
+ write_sysreg_s(0, SYS_ICH_PPI_DVIR0_EL2);
+ write_sysreg_s(0, SYS_ICH_PPI_DVIR1_EL2);
+}
+
+void __vgic_v5_restore_ppi_state(struct vgic_v5_cpu_if *cpu_if)
+{
+ DECLARE_BITMAP(pendr, VGIC_V5_NR_PRIVATE_IRQS);
+
+ /* We assume 64 or 128 PPIs - see above comment */
+ BUILD_BUG_ON(VGIC_V5_NR_PRIVATE_IRQS % 64);
+
+ /* Enable DVI so that the guest's interrupt config takes over */
+ write_sysreg_s(bitmap_read(cpu_if->vgic_ppi_dvir, 0, 64),
+ SYS_ICH_PPI_DVIR0_EL2);
+
+ write_sysreg_s(bitmap_read(cpu_if->vgic_ppi_activer, 0, 64),
+ SYS_ICH_PPI_ACTIVER0_EL2);
+ write_sysreg_s(bitmap_read(cpu_if->vgic_ppi_enabler, 0, 64),
+ SYS_ICH_PPI_ENABLER0_EL2);
+
+ /* Update the pending state of the NON-DVI'd PPIs, only */
+ bitmap_andnot(pendr, host_data_ptr(vgic_v5_ppi_state)->pendr,
+ cpu_if->vgic_ppi_dvir, VGIC_V5_NR_PRIVATE_IRQS);
+ write_sysreg_s(bitmap_read(pendr, 0, 64), SYS_ICH_PPI_PENDR0_EL2);
+
+ write_sysreg_s(cpu_if->vgic_ppi_priorityr[0],
+ SYS_ICH_PPI_PRIORITYR0_EL2);
+ write_sysreg_s(cpu_if->vgic_ppi_priorityr[1],
+ SYS_ICH_PPI_PRIORITYR1_EL2);
+ write_sysreg_s(cpu_if->vgic_ppi_priorityr[2],
+ SYS_ICH_PPI_PRIORITYR2_EL2);
+ write_sysreg_s(cpu_if->vgic_ppi_priorityr[3],
+ SYS_ICH_PPI_PRIORITYR3_EL2);
+ write_sysreg_s(cpu_if->vgic_ppi_priorityr[4],
+ SYS_ICH_PPI_PRIORITYR4_EL2);
+ write_sysreg_s(cpu_if->vgic_ppi_priorityr[5],
+ SYS_ICH_PPI_PRIORITYR5_EL2);
+ write_sysreg_s(cpu_if->vgic_ppi_priorityr[6],
+ SYS_ICH_PPI_PRIORITYR6_EL2);
+ write_sysreg_s(cpu_if->vgic_ppi_priorityr[7],
+ SYS_ICH_PPI_PRIORITYR7_EL2);
+
+ if (VGIC_V5_NR_PRIVATE_IRQS == 128) {
+ /* Enable DVI so that the guest's interrupt config takes over */
+ write_sysreg_s(bitmap_read(cpu_if->vgic_ppi_dvir, 64, 64),
+ SYS_ICH_PPI_DVIR1_EL2);
+
+ write_sysreg_s(bitmap_read(cpu_if->vgic_ppi_activer, 64, 64),
+ SYS_ICH_PPI_ACTIVER1_EL2);
+ write_sysreg_s(bitmap_read(cpu_if->vgic_ppi_enabler, 64, 64),
+ SYS_ICH_PPI_ENABLER1_EL2);
+ write_sysreg_s(bitmap_read(pendr, 64, 64),
+ SYS_ICH_PPI_PENDR1_EL2);
+
+ write_sysreg_s(cpu_if->vgic_ppi_priorityr[8],
+ SYS_ICH_PPI_PRIORITYR8_EL2);
+ write_sysreg_s(cpu_if->vgic_ppi_priorityr[9],
+ SYS_ICH_PPI_PRIORITYR9_EL2);
+ write_sysreg_s(cpu_if->vgic_ppi_priorityr[10],
+ SYS_ICH_PPI_PRIORITYR10_EL2);
+ write_sysreg_s(cpu_if->vgic_ppi_priorityr[11],
+ SYS_ICH_PPI_PRIORITYR11_EL2);
+ write_sysreg_s(cpu_if->vgic_ppi_priorityr[12],
+ SYS_ICH_PPI_PRIORITYR12_EL2);
+ write_sysreg_s(cpu_if->vgic_ppi_priorityr[13],
+ SYS_ICH_PPI_PRIORITYR13_EL2);
+ write_sysreg_s(cpu_if->vgic_ppi_priorityr[14],
+ SYS_ICH_PPI_PRIORITYR14_EL2);
+ write_sysreg_s(cpu_if->vgic_ppi_priorityr[15],
+ SYS_ICH_PPI_PRIORITYR15_EL2);
+ } else {
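+ /* Only the 64 architected PPIs are in use; reset the impdef half */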
+ write_sysreg_s(0, SYS_ICH_PPI_DVIR1_EL2);
+
+ write_sysreg_s(0, SYS_ICH_PPI_ACTIVER1_EL2);
+ write_sysreg_s(0, SYS_ICH_PPI_ENABLER1_EL2);
+ write_sysreg_s(0, SYS_ICH_PPI_PENDR1_EL2);
+
+ write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR8_EL2);
+ write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR9_EL2);
+ write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR10_EL2);
+ write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR11_EL2);
+ write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR12_EL2);
+ write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR13_EL2);
+ write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR14_EL2);
+ write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR15_EL2);
+ }
+}
+
+void __vgic_v5_save_state(struct vgic_v5_cpu_if *cpu_if)
+{
+ cpu_if->vgic_vmcr = read_sysreg_s(SYS_ICH_VMCR_EL2);
+ cpu_if->vgic_icsr = read_sysreg_s(SYS_ICC_ICSR_EL1);
+}
+
+void __vgic_v5_restore_state(struct vgic_v5_cpu_if *cpu_if)
+{
+ write_sysreg_s(cpu_if->vgic_icsr, SYS_ICC_ICSR_EL1);
+}
diff --git a/arch/arm64/kvm/hyp/vhe/Makefile b/arch/arm64/kvm/hyp/vhe/Makefile
index afc4aed9231a..9695328bbd96 100644
--- a/arch/arm64/kvm/hyp/vhe/Makefile
+++ b/arch/arm64/kvm/hyp/vhe/Makefile
@@ -10,4 +10,4 @@ CFLAGS_switch.o += -Wno-override-init
obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o
obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
- ../fpsimd.o ../hyp-entry.o ../exception.o
+ ../fpsimd.o ../hyp-entry.o ../exception.o ../vgic-v5-sr.o
diff --git a/arch/arm64/kvm/hyp_trace.c b/arch/arm64/kvm/hyp_trace.c
new file mode 100644
index 000000000000..8b7f2bf2fba8
--- /dev/null
+++ b/arch/arm64/kvm/hyp_trace.c
@@ -0,0 +1,442 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Google LLC
+ * Author: Vincent Donnefort <vdonnefort@google.com>
+ */
+
+#include <linux/cpumask.h>
+#include <linux/trace_remote.h>
+#include <linux/tracefs.h>
+#include <linux/simple_ring_buffer.h>
+
+#include <asm/arch_timer.h>
+#include <asm/kvm_host.h>
+#include <asm/kvm_hyptrace.h>
+#include <asm/kvm_mmu.h>
+
+#include "hyp_trace.h"
+
+/* Same 10min as used by the clocksource code when the counter width exceeds 32 bits */
+#define CLOCK_MAX_CONVERSION_S 600
+/*
+ * Time to give for the clock init. Long enough to get a good mult/shift
+ * estimation. Short enough to not delay the tracing start too much.
+ */
+#define CLOCK_INIT_MS 100
+/*
+ * Time between clock checks. Must be small enough to catch clock deviation when
+ * it is still tiny.
+ */
+#define CLOCK_UPDATE_MS 500
+
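+/*
+ * Correlate the hypervisor's view of the counter with the kernel boot clock
+ * so that hypervisor trace timestamps can be expressed in boot time.
+ */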
+static struct hyp_trace_clock {
+ u64 cycles;
+ u64 cyc_overflow64;
+ u64 boot;
+ u32 mult;
+ u32 shift;
+ struct delayed_work work;
+ struct completion ready;
+ struct mutex lock;
+ bool running;
+} hyp_clock;
+
+static void __hyp_clock_work(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct hyp_trace_clock *hyp_clock;
+ struct system_time_snapshot snap;
+ u64 rate, delta_cycles;
+ u64 boot, delta_boot;
+
+ hyp_clock = container_of(dwork, struct hyp_trace_clock, work);
+
+ ktime_get_snapshot(&snap);
+ boot = ktime_to_ns(snap.boot);
+
+ delta_boot = boot - hyp_clock->boot;
+ delta_cycles = snap.cycles - hyp_clock->cycles;
+
+ /* Compare hyp clock with the kernel boot clock */
+ if (hyp_clock->mult) {
+ u64 err, cur = delta_cycles;
+
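+ /* Fall back to 128-bit math when the mult would overflow 64 bits */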
+ if (WARN_ON_ONCE(cur >= hyp_clock->cyc_overflow64)) {
+ __uint128_t tmp = (__uint128_t)cur * hyp_clock->mult;
+
+ cur = tmp >> hyp_clock->shift;
+ } else {
+ cur *= hyp_clock->mult;
+ cur >>= hyp_clock->shift;
+ }
+ cur += hyp_clock->boot;
+
+ err = abs_diff(cur, boot);
+ /* No deviation, only update epoch if necessary */
+ if (!err) {
+ if (delta_cycles >= (hyp_clock->cyc_overflow64 >> 1))
+ goto fast_forward;
+
+ goto resched;
+ }
+
+ /* Warn if the error is above tracing precision (1us) */
+ if (err > NSEC_PER_USEC)
+ pr_warn_ratelimited("hyp trace clock off by %lluus\n",
+ err / NSEC_PER_USEC);
+ }
+
+ rate = div64_u64(delta_cycles * NSEC_PER_SEC, delta_boot);
+
+ clocks_calc_mult_shift(&hyp_clock->mult, &hyp_clock->shift,
+ rate, NSEC_PER_SEC, CLOCK_MAX_CONVERSION_S);
+
+ /* Add a comfortable 50% margin */
+ hyp_clock->cyc_overflow64 = (U64_MAX / hyp_clock->mult) >> 1;
+
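+/* Re-base the epoch on this snapshot and propagate it to the hypervisor */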
+fast_forward:
+ hyp_clock->cycles = snap.cycles;
+ hyp_clock->boot = boot;
+ kvm_call_hyp_nvhe(__tracing_update_clock, hyp_clock->mult,
+ hyp_clock->shift, hyp_clock->boot, hyp_clock->cycles);
+ complete(&hyp_clock->ready);
+
+resched:
+ schedule_delayed_work(&hyp_clock->work,
+ msecs_to_jiffies(CLOCK_UPDATE_MS));
+}
+
+static void hyp_trace_clock_enable(struct hyp_trace_clock *hyp_clock, bool enable)
+{
+ struct system_time_snapshot snap;
+
+ if (hyp_clock->running == enable)
+ return;
+
+ if (!enable) {
+ cancel_delayed_work_sync(&hyp_clock->work);
+ hyp_clock->running = false;
+ return;
+ }
+
+ ktime_get_snapshot(&snap);
+
+ hyp_clock->boot = ktime_to_ns(snap.boot);
+ hyp_clock->cycles = snap.cycles;
+ hyp_clock->mult = 0;
+
+ init_completion(&hyp_clock->ready);
+ INIT_DELAYED_WORK(&hyp_clock->work, __hyp_clock_work);
+ schedule_delayed_work(&hyp_clock->work, msecs_to_jiffies(CLOCK_INIT_MS));
+ wait_for_completion(&hyp_clock->ready);
+ hyp_clock->running = true;
+}
+
+/* Access to this struct within the trace_remote_callbacks is protected by the trace_remote lock */
+static struct hyp_trace_buffer {
+ struct hyp_trace_desc *desc;
+ size_t desc_size;
+} trace_buffer;
+
+static int __map_hyp(void *start, size_t size)
+{
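+ /*
+ * Under pKVM, the hypervisor is expected to install its own mapping
+ * when the buffer is loaded; the pages are shared with it separately.
+ */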
+ if (is_protected_kvm_enabled())
+ return 0;
+
+ return create_hyp_mappings(start, start + size, PAGE_HYP);
+}
+
+static int __share_page(unsigned long va)
+{
+ return kvm_share_hyp((void *)va, (void *)va + 1);
+}
+
+static void __unshare_page(unsigned long va)
+{
+ kvm_unshare_hyp((void *)va, (void *)va + 1);
+}
+
+static int hyp_trace_buffer_alloc_bpages_backing(struct hyp_trace_buffer *trace_buffer, size_t size)
+{
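+ /* One simple_buffer_page per data page, plus one for the reader page */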
+ int nr_bpages = (PAGE_ALIGN(size) / PAGE_SIZE) + 1;
+ size_t backing_size;
+ void *start;
+
+ backing_size = PAGE_ALIGN(sizeof(struct simple_buffer_page) * nr_bpages *
+ num_possible_cpus());
+
+ start = alloc_pages_exact(backing_size, GFP_KERNEL_ACCOUNT);
+ if (!start)
+ return -ENOMEM;
+
+ trace_buffer->desc->bpages_backing_start = (unsigned long)start;
+ trace_buffer->desc->bpages_backing_size = backing_size;
+
+ return __map_hyp(start, backing_size);
+}
+
+static void hyp_trace_buffer_free_bpages_backing(struct hyp_trace_buffer *trace_buffer)
+{
+ free_pages_exact((void *)trace_buffer->desc->bpages_backing_start,
+ trace_buffer->desc->bpages_backing_size);
+}
+
+static void hyp_trace_buffer_unshare_hyp(struct hyp_trace_buffer *trace_buffer, int last_cpu)
+{
+ struct ring_buffer_desc *rb_desc;
+ int cpu, p;
+
+ for_each_ring_buffer_desc(rb_desc, cpu, &trace_buffer->desc->trace_buffer_desc) {
+ if (cpu > last_cpu)
+ break;
+
+ __unshare_page(rb_desc->meta_va);
+ for (p = 0; p < rb_desc->nr_page_va; p++)
+ __unshare_page(rb_desc->page_va[p]);
+ }
+}
+
+static int hyp_trace_buffer_share_hyp(struct hyp_trace_buffer *trace_buffer)
+{
+ struct ring_buffer_desc *rb_desc;
+ int cpu, p, ret = 0;
+
+ for_each_ring_buffer_desc(rb_desc, cpu, &trace_buffer->desc->trace_buffer_desc) {
+ ret = __share_page(rb_desc->meta_va);
+ if (ret)
+ break;
+
+ for (p = 0; p < rb_desc->nr_page_va; p++) {
+ ret = __share_page(rb_desc->page_va[p]);
+ if (ret)
+ break;
+ }
+
+ if (ret) {
+ for (p--; p >= 0; p--)
+ __unshare_page(rb_desc->page_va[p]);
+ __unshare_page(rb_desc->meta_va);
+ break;
+ }
+ }
+
+ if (ret)
+ hyp_trace_buffer_unshare_hyp(trace_buffer, cpu - 1);
+
+ return ret;
+}
+
+static struct trace_buffer_desc *hyp_trace_load(unsigned long size, void *priv)
+{
+ struct hyp_trace_buffer *trace_buffer = priv;
+ struct hyp_trace_desc *desc;
+ size_t desc_size;
+ int ret;
+
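+ /*
+ * Allocate the descriptor, the bpage backing storage and the per-CPU
+ * ring-buffer pages, share everything with the hypervisor, then hand
+ * the descriptor over via __tracing_load.
+ */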
+ if (WARN_ON(trace_buffer->desc))
+ return ERR_PTR(-EINVAL);
+
+ desc_size = trace_buffer_desc_size(size, num_possible_cpus());
+ if (desc_size == SIZE_MAX)
+ return ERR_PTR(-E2BIG);
+
+ desc_size = PAGE_ALIGN(desc_size);
+ desc = (struct hyp_trace_desc *)alloc_pages_exact(desc_size, GFP_KERNEL);
+ if (!desc)
+ return ERR_PTR(-ENOMEM);
+
+ ret = __map_hyp(desc, desc_size);
+ if (ret)
+ goto err_free_desc;
+
+ trace_buffer->desc = desc;
+ trace_buffer->desc_size = desc_size;
+
+ ret = hyp_trace_buffer_alloc_bpages_backing(trace_buffer, size);
+ if (ret)
+ goto err_free_desc;
+
+ ret = trace_remote_alloc_buffer(&desc->trace_buffer_desc, desc_size, size,
+ cpu_possible_mask);
+ if (ret)
+ goto err_free_backing;
+
+ ret = hyp_trace_buffer_share_hyp(trace_buffer);
+ if (ret)
+ goto err_free_buffer;
+
+ ret = kvm_call_hyp_nvhe(__tracing_load, (unsigned long)desc, desc_size);
+ if (ret)
+ goto err_unload_pages;
+
+ return &desc->trace_buffer_desc;
+
+err_unload_pages:
+ hyp_trace_buffer_unshare_hyp(trace_buffer, INT_MAX);
+
+err_free_buffer:
+ trace_remote_free_buffer(&desc->trace_buffer_desc);
+
+err_free_backing:
+ hyp_trace_buffer_free_bpages_backing(trace_buffer);
+
+err_free_desc:
+ free_pages_exact(desc, desc_size);
+ trace_buffer->desc = NULL;
+
+ return ERR_PTR(ret);
+}
+
+static void hyp_trace_unload(struct trace_buffer_desc *desc, void *priv)
+{
+ struct hyp_trace_buffer *trace_buffer = priv;
+
+ if (WARN_ON(desc != &trace_buffer->desc->trace_buffer_desc))
+ return;
+
+ kvm_call_hyp_nvhe(__tracing_unload);
+ hyp_trace_buffer_unshare_hyp(trace_buffer, INT_MAX);
+ trace_remote_free_buffer(desc);
+ hyp_trace_buffer_free_bpages_backing(trace_buffer);
+ free_pages_exact(trace_buffer->desc, trace_buffer->desc_size);
+ trace_buffer->desc = NULL;
+}
+
+static int hyp_trace_enable_tracing(bool enable, void *priv)
+{
+ hyp_trace_clock_enable(&hyp_clock, enable);
+
+ return kvm_call_hyp_nvhe(__tracing_enable, enable);
+}
+
+static int hyp_trace_swap_reader_page(unsigned int cpu, void *priv)
+{
+ return kvm_call_hyp_nvhe(__tracing_swap_reader, cpu);
+}
+
+static int hyp_trace_reset(unsigned int cpu, void *priv)
+{
+ return kvm_call_hyp_nvhe(__tracing_reset, cpu);
+}
+
+static int hyp_trace_enable_event(unsigned short id, bool enable, void *priv)
+{
+ struct hyp_event_id *event_id = lm_alias(&__hyp_event_ids_start[id]);
+ struct page *page;
+ atomic_t *enabled;
+ void *map;
+
+ if (is_protected_kvm_enabled())
+ return kvm_call_hyp_nvhe(__tracing_enable_event, id, enable);
+
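+ /*
+ * The event flag lives in the hyp section, which the kernel maps
+ * read-only; flip it through a temporary writable alias.
+ */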
+ enabled = &event_id->enabled;
+ page = virt_to_page(enabled);
+ map = vmap(&page, 1, VM_MAP, PAGE_KERNEL);
+ if (!map)
+ return -ENOMEM;
+
+ enabled = map + offset_in_page(enabled);
+ atomic_set(enabled, enable);
+
+ vunmap(map);
+
+ return 0;
+}
+
+static int hyp_trace_clock_show(struct seq_file *m, void *v)
+{
+ seq_puts(m, "[boot]\n");
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(hyp_trace_clock);
+
+static ssize_t hyp_trace_write_event_write(struct file *f, const char __user *ubuf,
+ size_t cnt, loff_t *pos)
+{
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ kvm_call_hyp_nvhe(__tracing_write_event, val);
+
+ return cnt;
+}
+
+static const struct file_operations hyp_trace_write_event_fops = {
+ .write = hyp_trace_write_event_write,
+};
+
+static int hyp_trace_init_tracefs(struct dentry *d, void *priv)
+{
+ if (!tracefs_create_file("write_event", 0200, d, NULL, &hyp_trace_write_event_fops))
+ return -ENOMEM;
+
+ return tracefs_create_file("trace_clock", 0440, d, NULL, &hyp_trace_clock_fops) ?
+ 0 : -ENOMEM;
+}
+
+static struct trace_remote_callbacks trace_remote_callbacks = {
+ .init = hyp_trace_init_tracefs,
+ .load_trace_buffer = hyp_trace_load,
+ .unload_trace_buffer = hyp_trace_unload,
+ .enable_tracing = hyp_trace_enable_tracing,
+ .swap_reader_page = hyp_trace_swap_reader_page,
+ .reset = hyp_trace_reset,
+ .enable_event = hyp_trace_enable_event,
+};
+
+static const char *__hyp_enter_exit_reason_str(u8 reason);
+
+#include <asm/kvm_define_hypevents.h>
+
+static const char *__hyp_enter_exit_reason_str(u8 reason)
+{
+ static const char strs[][12] = {
+ "smc",
+ "hvc",
+ "psci",
+ "host_abort",
+ "guest_exit",
+ "eret_host",
+ "eret_guest",
+ "unknown",
+ };
+
+ return strs[min(reason, HYP_REASON_UNKNOWN)];
+}
+
+static void __init hyp_trace_init_events(void)
+{
+ struct hyp_event_id *hyp_event_id = __hyp_event_ids_start;
+ struct remote_event *event = __hyp_events_start;
+ int id = 0;
+
+ /* Events on both sides of the hypervisor are sorted the same way */
+ for (; event < __hyp_events_end; event++, hyp_event_id++, id++)
+ event->id = hyp_event_id->id = id;
+}
+
+int __init kvm_hyp_trace_init(void)
+{
+ int cpu;
+
+ if (is_kernel_in_hyp_mode())
+ return 0;
+
+ for_each_possible_cpu(cpu) {
+ const struct arch_timer_erratum_workaround *wa =
+ per_cpu(timer_unstable_counter_workaround, cpu);
+
+ if (IS_ENABLED(CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND) &&
+ wa && wa->read_cntvct_el0) {
+ pr_warn("hyp trace can't handle CNTVCT workaround '%s'\n", wa->desc);
+ return -EOPNOTSUPP;
+ }
+ }
+
+ hyp_trace_init_events();
+
+ return trace_remote_register("hypervisor", &trace_remote_callbacks, &trace_buffer,
+ __hyp_events_start, __hyp_events_end - __hyp_events_start);
+}
diff --git a/arch/arm64/kvm/hyp_trace.h b/arch/arm64/kvm/hyp_trace.h
new file mode 100644
index 000000000000..c991b1ec65f1
--- /dev/null
+++ b/arch/arm64/kvm/hyp_trace.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ARM64_KVM_HYP_TRACE_H__
+#define __ARM64_KVM_HYP_TRACE_H__
+
+#ifdef CONFIG_NVHE_EL2_TRACING
+int kvm_hyp_trace_init(void);
+#else
+static inline int kvm_hyp_trace_init(void) { return 0; }
+#endif
+#endif
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 17d64a1e11e5..d089c107d9b7 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -340,6 +340,9 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start,
u64 size, bool may_block)
{
+ if (kvm_vm_is_protected(kvm_s2_mmu_to_kvm(mmu)))
+ return;
+
__unmap_stage2_range(mmu, start, size, may_block);
}
@@ -878,9 +881,6 @@ static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
u64 mmfr0, mmfr1;
u32 phys_shift;
- if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
- return -EINVAL;
-
phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
if (is_protected_kvm_enabled()) {
phys_shift = kvm_ipa_limit;
@@ -1013,6 +1013,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
out_destroy_pgtable:
kvm_stage2_destroy(pgt);
+ mmu->pgt = NULL;
out_free_pgtable:
kfree(pgt);
return err;
@@ -1400,10 +1401,10 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
*/
static long
transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long hva, kvm_pfn_t *pfnp,
- phys_addr_t *ipap)
+ unsigned long hva, kvm_pfn_t *pfnp, gfn_t *gfnp)
{
kvm_pfn_t pfn = *pfnp;
+ gfn_t gfn = *gfnp;
/*
* Make sure the adjustment is done only for THP pages. Also make
@@ -1419,7 +1420,8 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
if (sz < PMD_SIZE)
return PAGE_SIZE;
- *ipap &= PMD_MASK;
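+ /* Align the gfn and pfn down to a PMD-sized block boundary */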
+ gfn &= ~(PTRS_PER_PMD - 1);
+ *gfnp = gfn;
pfn &= ~(PTRS_PER_PMD - 1);
*pfnp = pfn;
@@ -1512,25 +1514,22 @@ static bool kvm_vma_is_cacheable(struct vm_area_struct *vma)
}
}
-static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache,
- void **memcache)
+static void *get_mmu_memcache(struct kvm_vcpu *vcpu)
{
- int min_pages;
-
if (!is_protected_kvm_enabled())
- *memcache = &vcpu->arch.mmu_page_cache;
+ return &vcpu->arch.mmu_page_cache;
else
- *memcache = &vcpu->arch.pkvm_memcache;
-
- if (!topup_memcache)
- return 0;
+ return &vcpu->arch.pkvm_memcache;
+}
- min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
+static int topup_mmu_memcache(struct kvm_vcpu *vcpu, void *memcache)
+{
+ int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
if (!is_protected_kvm_enabled())
- return kvm_mmu_topup_memory_cache(*memcache, min_pages);
+ return kvm_mmu_topup_memory_cache(memcache, min_pages);
- return topup_hyp_memcache(*memcache, min_pages);
+ return topup_hyp_memcache(memcache, min_pages);
}
/*
@@ -1543,54 +1542,63 @@ static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache,
* TLB invalidation from the guest and used to limit the invalidation scope if a
* TTL hint or a range isn't provided.
*/
-static void adjust_nested_fault_perms(struct kvm_s2_trans *nested,
- enum kvm_pgtable_prot *prot,
- bool *writable)
+static enum kvm_pgtable_prot adjust_nested_fault_perms(struct kvm_s2_trans *nested,
+ enum kvm_pgtable_prot prot)
{
- *writable &= kvm_s2_trans_writable(nested);
+ if (!kvm_s2_trans_writable(nested))
+ prot &= ~KVM_PGTABLE_PROT_W;
if (!kvm_s2_trans_readable(nested))
- *prot &= ~KVM_PGTABLE_PROT_R;
+ prot &= ~KVM_PGTABLE_PROT_R;
- *prot |= kvm_encode_nested_level(nested);
+ return prot | kvm_encode_nested_level(nested);
}
-static void adjust_nested_exec_perms(struct kvm *kvm,
- struct kvm_s2_trans *nested,
- enum kvm_pgtable_prot *prot)
+static enum kvm_pgtable_prot adjust_nested_exec_perms(struct kvm *kvm,
+ struct kvm_s2_trans *nested,
+ enum kvm_pgtable_prot prot)
{
if (!kvm_s2_trans_exec_el0(kvm, nested))
- *prot &= ~KVM_PGTABLE_PROT_UX;
+ prot &= ~KVM_PGTABLE_PROT_UX;
if (!kvm_s2_trans_exec_el1(kvm, nested))
- *prot &= ~KVM_PGTABLE_PROT_PX;
+ prot &= ~KVM_PGTABLE_PROT_PX;
+
+ return prot;
}
-static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
- struct kvm_s2_trans *nested,
- struct kvm_memory_slot *memslot, bool is_perm)
+struct kvm_s2_fault_desc {
+ struct kvm_vcpu *vcpu;
+ phys_addr_t fault_ipa;
+ struct kvm_s2_trans *nested;
+ struct kvm_memory_slot *memslot;
+ unsigned long hva;
+};
+
+static int gmem_abort(const struct kvm_s2_fault_desc *s2fd)
{
- bool write_fault, exec_fault, writable;
+ bool write_fault, exec_fault;
enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
- struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
+ struct kvm_pgtable *pgt = s2fd->vcpu->arch.hw_mmu->pgt;
unsigned long mmu_seq;
struct page *page;
- struct kvm *kvm = vcpu->kvm;
+ struct kvm *kvm = s2fd->vcpu->kvm;
void *memcache;
kvm_pfn_t pfn;
gfn_t gfn;
int ret;
- ret = prepare_mmu_memcache(vcpu, true, &memcache);
+ memcache = get_mmu_memcache(s2fd->vcpu);
+ ret = topup_mmu_memcache(s2fd->vcpu, memcache);
if (ret)
return ret;
- if (nested)
- gfn = kvm_s2_trans_output(nested) >> PAGE_SHIFT;
+ if (s2fd->nested)
+ gfn = kvm_s2_trans_output(s2fd->nested) >> PAGE_SHIFT;
else
- gfn = fault_ipa >> PAGE_SHIFT;
+ gfn = s2fd->fault_ipa >> PAGE_SHIFT;
- write_fault = kvm_is_write_fault(vcpu);
- exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
+ write_fault = kvm_is_write_fault(s2fd->vcpu);
+ exec_fault = kvm_vcpu_trap_is_exec_fault(s2fd->vcpu);
VM_WARN_ON_ONCE(write_fault && exec_fault);
@@ -1598,26 +1606,24 @@ static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
/* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
smp_rmb();
- ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, &page, NULL);
+ ret = kvm_gmem_get_pfn(kvm, s2fd->memslot, gfn, &pfn, &page, NULL);
if (ret) {
- kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE,
+ kvm_prepare_memory_fault_exit(s2fd->vcpu, s2fd->fault_ipa, PAGE_SIZE,
write_fault, exec_fault, false);
return ret;
}
- writable = !(memslot->flags & KVM_MEM_READONLY);
-
- if (nested)
- adjust_nested_fault_perms(nested, &prot, &writable);
-
- if (writable)
+ if (!(s2fd->memslot->flags & KVM_MEM_READONLY))
prot |= KVM_PGTABLE_PROT_W;
+ if (s2fd->nested)
+ prot = adjust_nested_fault_perms(s2fd->nested, prot);
+
if (exec_fault || cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
prot |= KVM_PGTABLE_PROT_X;
- if (nested)
- adjust_nested_exec_perms(kvm, nested, &prot);
+ if (s2fd->nested)
+ prot = adjust_nested_exec_perms(kvm, s2fd->nested, prot);
kvm_fault_lock(kvm);
if (mmu_invalidate_retry(kvm, mmu_seq)) {
@@ -1625,85 +1631,122 @@ static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
goto out_unlock;
}
- ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, PAGE_SIZE,
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, s2fd->fault_ipa, PAGE_SIZE,
__pfn_to_phys(pfn), prot,
memcache, flags);
out_unlock:
- kvm_release_faultin_page(kvm, page, !!ret, writable);
+ kvm_release_faultin_page(kvm, page, !!ret, prot & KVM_PGTABLE_PROT_W);
kvm_fault_unlock(kvm);
- if (writable && !ret)
- mark_page_dirty_in_slot(kvm, memslot, gfn);
+ if ((prot & KVM_PGTABLE_PROT_W) && !ret)
+ mark_page_dirty_in_slot(kvm, s2fd->memslot, gfn);
return ret != -EAGAIN ? ret : 0;
}
-static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
- struct kvm_s2_trans *nested,
- struct kvm_memory_slot *memslot, unsigned long hva,
- bool fault_is_perm)
+struct kvm_s2_fault_vma_info {
+ unsigned long mmu_seq;
+ long vma_pagesize;
+ vm_flags_t vm_flags;
+ unsigned long max_map_size;
+ struct page *page;
+ kvm_pfn_t pfn;
+ gfn_t gfn;
+ bool device;
+ bool mte_allowed;
+ bool is_vma_cacheable;
+ bool map_writable;
+ bool map_non_cacheable;
+};
+
+static int pkvm_mem_abort(const struct kvm_s2_fault_desc *s2fd)
{
- int ret = 0;
- bool topup_memcache;
- bool write_fault, writable;
- bool exec_fault, mte_allowed, is_vma_cacheable;
- bool s2_force_noncacheable = false, vfio_allow_any_uc = false;
- unsigned long mmu_seq;
- phys_addr_t ipa = fault_ipa;
+ unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
+ struct kvm_vcpu *vcpu = s2fd->vcpu;
+ struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
+ struct mm_struct *mm = current->mm;
struct kvm *kvm = vcpu->kvm;
- struct vm_area_struct *vma;
- short vma_shift;
- void *memcache;
- gfn_t gfn;
- kvm_pfn_t pfn;
- bool logging_active = memslot_is_logging(memslot);
- bool force_pte = logging_active;
- long vma_pagesize, fault_granule;
- enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
- struct kvm_pgtable *pgt;
+ void *hyp_memcache;
struct page *page;
- vm_flags_t vm_flags;
- enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;
+ int ret;
- if (fault_is_perm)
- fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
- write_fault = kvm_is_write_fault(vcpu);
- exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
- VM_WARN_ON_ONCE(write_fault && exec_fault);
+ hyp_memcache = get_mmu_memcache(vcpu);
+ ret = topup_mmu_memcache(vcpu, hyp_memcache);
+ if (ret)
+ return -ENOMEM;
- /*
- * Permission faults just need to update the existing leaf entry,
- * and so normally don't require allocations from the memcache. The
- * only exception to this is when dirty logging is enabled at runtime
- * and a write fault needs to collapse a block entry into a table.
- */
- topup_memcache = !fault_is_perm || (logging_active && write_fault);
- ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache);
+ ret = account_locked_vm(mm, 1, true);
if (ret)
return ret;
- /*
- * Let's check if we will get back a huge page backed by hugetlbfs, or
- * get block mapping for device MMIO region.
- */
- mmap_read_lock(current->mm);
- vma = vma_lookup(current->mm, hva);
- if (unlikely(!vma)) {
- kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
- mmap_read_unlock(current->mm);
- return -EFAULT;
+ mmap_read_lock(mm);
+ ret = pin_user_pages(s2fd->hva, 1, flags, &page);
+ mmap_read_unlock(mm);
+
+ if (ret == -EHWPOISON) {
+ kvm_send_hwpoison_signal(s2fd->hva, PAGE_SHIFT);
+ ret = 0;
+ goto dec_account;
+ } else if (ret != 1) {
+ ret = -EFAULT;
+ goto dec_account;
+ } else if (!folio_test_swapbacked(page_folio(page))) {
+ /*
+ * We really can't deal with page-cache pages returned by GUP
+ * because (a) we may trigger writeback of a page for which we
+ * no longer have access and (b) page_mkclean() won't find the
+ * stage-2 mapping in the rmap so we can get out-of-whack with
+ * the filesystem when marking the page dirty during unpinning
+ * (see cc5095747edf ("ext4: don't BUG if someone dirty pages
+ * without asking ext4 first")).
+ *
+ * Ideally we'd just restrict ourselves to anonymous pages, but
+ * we also want to allow memfd (i.e. shmem) pages, so check for
+ * pages backed by swap in the knowledge that the GUP pin will
+ * prevent try_to_unmap() from succeeding.
+ */
+ ret = -EIO;
+ goto unpin;
}
- if (force_pte)
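+ /*
+ * Donate the pinned page to the protected guest; pVM mappings are
+ * page-granular and RWX (see pkvm_pgtable_stage2_map()).
+ */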
+ write_lock(&kvm->mmu_lock);
+ ret = pkvm_pgtable_stage2_map(pgt, s2fd->fault_ipa, PAGE_SIZE,
+ page_to_phys(page), KVM_PGTABLE_PROT_RWX,
+ hyp_memcache, 0);
+ write_unlock(&kvm->mmu_lock);
+ if (ret) {
+ if (ret == -EAGAIN)
+ ret = 0;
+ goto unpin;
+ }
+
+ return 0;
+unpin:
+ unpin_user_pages(&page, 1);
+dec_account:
+ account_locked_vm(mm, 1, false);
+ return ret;
+}
+
+static short kvm_s2_resolve_vma_size(const struct kvm_s2_fault_desc *s2fd,
+ struct kvm_s2_fault_vma_info *s2vi,
+ struct vm_area_struct *vma)
+{
+ short vma_shift;
+
+ if (memslot_is_logging(s2fd->memslot)) {
+ s2vi->max_map_size = PAGE_SIZE;
vma_shift = PAGE_SHIFT;
- else
- vma_shift = get_vma_page_shift(vma, hva);
+ } else {
+ s2vi->max_map_size = PUD_SIZE;
+ vma_shift = get_vma_page_shift(vma, s2fd->hva);
+ }
switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED
case PUD_SHIFT:
- if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
+ if (fault_supports_stage2_huge_mapping(s2fd->memslot, s2fd->hva, PUD_SIZE))
break;
fallthrough;
#endif
@@ -1711,12 +1754,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
vma_shift = PMD_SHIFT;
fallthrough;
case PMD_SHIFT:
- if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
+ if (fault_supports_stage2_huge_mapping(s2fd->memslot, s2fd->hva, PMD_SIZE))
break;
fallthrough;
case CONT_PTE_SHIFT:
vma_shift = PAGE_SHIFT;
- force_pte = true;
+ s2vi->max_map_size = PAGE_SIZE;
fallthrough;
case PAGE_SHIFT:
break;
@@ -1724,21 +1767,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
}
- vma_pagesize = 1UL << vma_shift;
-
- if (nested) {
+ if (s2fd->nested) {
unsigned long max_map_size;
- max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE;
-
- ipa = kvm_s2_trans_output(nested);
+ max_map_size = min(s2vi->max_map_size, PUD_SIZE);
/*
* If we're about to create a shadow stage 2 entry, then we
* can only create a block mapping if the guest stage 2 page
* table uses at least as big a mapping.
*/
- max_map_size = min(kvm_s2_trans_size(nested), max_map_size);
+ max_map_size = min(kvm_s2_trans_size(s2fd->nested), max_map_size);
/*
* Be careful that if the mapping size falls between
@@ -1749,30 +1788,46 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE)
max_map_size = PAGE_SIZE;
- force_pte = (max_map_size == PAGE_SIZE);
- vma_pagesize = min_t(long, vma_pagesize, max_map_size);
- vma_shift = __ffs(vma_pagesize);
+ s2vi->max_map_size = max_map_size;
+ vma_shift = min_t(short, vma_shift, __ffs(max_map_size));
}
+ return vma_shift;
+}
+
+static bool kvm_s2_fault_is_perm(const struct kvm_s2_fault_desc *s2fd)
+{
+ return kvm_vcpu_trap_is_permission_fault(s2fd->vcpu);
+}
+
+static int kvm_s2_fault_get_vma_info(const struct kvm_s2_fault_desc *s2fd,
+ struct kvm_s2_fault_vma_info *s2vi)
+{
+ struct vm_area_struct *vma;
+ struct kvm *kvm = s2fd->vcpu->kvm;
+
+ mmap_read_lock(current->mm);
+ vma = vma_lookup(current->mm, s2fd->hva);
+ if (unlikely(!vma)) {
+ kvm_err("Failed to find VMA for hva 0x%lx\n", s2fd->hva);
+ mmap_read_unlock(current->mm);
+ return -EFAULT;
+ }
+
+ s2vi->vma_pagesize = BIT(kvm_s2_resolve_vma_size(s2fd, s2vi, vma));
+
/*
* Both the canonical IPA and fault IPA must be aligned to the
* mapping size to ensure we find the right PFN and lay down the
* mapping in the right place.
*/
- fault_ipa = ALIGN_DOWN(fault_ipa, vma_pagesize);
- ipa = ALIGN_DOWN(ipa, vma_pagesize);
-
- gfn = ipa >> PAGE_SHIFT;
- mte_allowed = kvm_vma_mte_allowed(vma);
-
- vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
+ s2vi->gfn = ALIGN_DOWN(s2fd->fault_ipa, s2vi->vma_pagesize) >> PAGE_SHIFT;
- vm_flags = vma->vm_flags;
+ s2vi->mte_allowed = kvm_vma_mte_allowed(vma);
- is_vma_cacheable = kvm_vma_is_cacheable(vma);
+ s2vi->vm_flags = vma->vm_flags;
- /* Don't use the VMA after the unlock -- it may have vanished */
- vma = NULL;
+ s2vi->is_vma_cacheable = kvm_vma_is_cacheable(vma);
/*
* Read mmu_invalidate_seq so that KVM can detect if the results of
@@ -1782,24 +1837,50 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
* with the smp_wmb() in kvm_mmu_invalidate_end().
*/
- mmu_seq = kvm->mmu_invalidate_seq;
+ s2vi->mmu_seq = kvm->mmu_invalidate_seq;
mmap_read_unlock(current->mm);
- pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
- &writable, &page);
- if (pfn == KVM_PFN_ERR_HWPOISON) {
- kvm_send_hwpoison_signal(hva, vma_shift);
- return 0;
- }
- if (is_error_noslot_pfn(pfn))
+ return 0;
+}
+
+static gfn_t get_canonical_gfn(const struct kvm_s2_fault_desc *s2fd,
+ const struct kvm_s2_fault_vma_info *s2vi)
+{
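+ /*
+ * With nested virt, the pfn lookup and dirty tracking use the
+ * canonical IPA from the guest's own stage-2, not the shadow
+ * stage-2 fault IPA.
+ */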
+ phys_addr_t ipa;
+
+ if (!s2fd->nested)
+ return s2vi->gfn;
+
+ ipa = kvm_s2_trans_output(s2fd->nested);
+ return ALIGN_DOWN(ipa, s2vi->vma_pagesize) >> PAGE_SHIFT;
+}
+
+static int kvm_s2_fault_pin_pfn(const struct kvm_s2_fault_desc *s2fd,
+ struct kvm_s2_fault_vma_info *s2vi)
+{
+ int ret;
+
+ ret = kvm_s2_fault_get_vma_info(s2fd, s2vi);
+ if (ret)
+ return ret;
+
+ s2vi->pfn = __kvm_faultin_pfn(s2fd->memslot, get_canonical_gfn(s2fd, s2vi),
+ kvm_is_write_fault(s2fd->vcpu) ? FOLL_WRITE : 0,
+ &s2vi->map_writable, &s2vi->page);
+ if (unlikely(is_error_noslot_pfn(s2vi->pfn))) {
+ if (s2vi->pfn == KVM_PFN_ERR_HWPOISON) {
+ kvm_send_hwpoison_signal(s2fd->hva, __ffs(s2vi->vma_pagesize));
+ return 0;
+ }
return -EFAULT;
+ }
/*
* Check if this is non-struct page memory PFN, and cannot support
* CMOs. It could potentially be unsafe to access as cacheable.
*/
- if (vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(pfn)) {
- if (is_vma_cacheable) {
+ if (s2vi->vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(s2vi->pfn)) {
+ if (s2vi->is_vma_cacheable) {
/*
* Whilst the VMA owner expects cacheable mapping to this
* PFN, hardware also has to support the FWB and CACHE DIC
@@ -1812,8 +1893,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* S2FWB and CACHE DIC are mandatory to avoid the need for
* cache maintenance.
*/
- if (!kvm_supports_cacheable_pfnmap())
- ret = -EFAULT;
+ if (!kvm_supports_cacheable_pfnmap()) {
+ kvm_release_faultin_page(s2fd->vcpu->kvm, s2vi->page, true, false);
+ return -EFAULT;
+ }
} else {
/*
* If the page was identified as device early by looking at
@@ -1825,21 +1908,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* In both cases, we don't let transparent_hugepage_adjust()
* change things at the last minute.
*/
- s2_force_noncacheable = true;
+ s2vi->map_non_cacheable = true;
}
- } else if (logging_active && !write_fault) {
- /*
- * Only actually map the page as writable if this was a write
- * fault.
- */
- writable = false;
+
+ s2vi->device = true;
}
- if (exec_fault && s2_force_noncacheable)
- ret = -ENOEXEC;
+ return 1;
+}
- if (ret)
- goto out_put_page;
+static int kvm_s2_fault_compute_prot(const struct kvm_s2_fault_desc *s2fd,
+ const struct kvm_s2_fault_vma_info *s2vi,
+ enum kvm_pgtable_prot *prot)
+{
+ struct kvm *kvm = s2fd->vcpu->kvm;
+
+ if (kvm_vcpu_trap_is_exec_fault(s2fd->vcpu) && s2vi->map_non_cacheable)
+ return -ENOEXEC;
/*
* Guest performs atomic/exclusive operations on memory with unsupported
@@ -1847,99 +1932,167 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* and trigger the exception here. Since the memslot is valid, inject
* the fault back to the guest.
*/
- if (esr_fsc_is_excl_atomic_fault(kvm_vcpu_get_esr(vcpu))) {
- kvm_inject_dabt_excl_atomic(vcpu, kvm_vcpu_get_hfar(vcpu));
- ret = 1;
- goto out_put_page;
+ if (esr_fsc_is_excl_atomic_fault(kvm_vcpu_get_esr(s2fd->vcpu))) {
+ kvm_inject_dabt_excl_atomic(s2fd->vcpu, kvm_vcpu_get_hfar(s2fd->vcpu));
+ return 1;
}
- if (nested)
- adjust_nested_fault_perms(nested, &prot, &writable);
+ *prot = KVM_PGTABLE_PROT_R;
+
+ if (s2vi->map_writable && (s2vi->device ||
+ !memslot_is_logging(s2fd->memslot) ||
+ kvm_is_write_fault(s2fd->vcpu)))
+ *prot |= KVM_PGTABLE_PROT_W;
+
+ if (s2fd->nested)
+ *prot = adjust_nested_fault_perms(s2fd->nested, *prot);
+
+ if (kvm_vcpu_trap_is_exec_fault(s2fd->vcpu))
+ *prot |= KVM_PGTABLE_PROT_X;
+
+ if (s2vi->map_non_cacheable)
+ *prot |= (s2vi->vm_flags & VM_ALLOW_ANY_UNCACHED) ?
+ KVM_PGTABLE_PROT_NORMAL_NC : KVM_PGTABLE_PROT_DEVICE;
+ else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
+ *prot |= KVM_PGTABLE_PROT_X;
+
+ if (s2fd->nested)
+ *prot = adjust_nested_exec_perms(kvm, s2fd->nested, *prot);
+
+ if (!kvm_s2_fault_is_perm(s2fd) && !s2vi->map_non_cacheable && kvm_has_mte(kvm)) {
+ /* Check the VMM hasn't introduced a new disallowed VMA */
+ if (!s2vi->mte_allowed)
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int kvm_s2_fault_map(const struct kvm_s2_fault_desc *s2fd,
+ const struct kvm_s2_fault_vma_info *s2vi,
+ enum kvm_pgtable_prot prot,
+ void *memcache)
+{
+ enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;
+ bool writable = prot & KVM_PGTABLE_PROT_W;
+ struct kvm *kvm = s2fd->vcpu->kvm;
+ struct kvm_pgtable *pgt;
+ long perm_fault_granule;
+ long mapping_size;
+ kvm_pfn_t pfn;
+ gfn_t gfn;
+ int ret;
kvm_fault_lock(kvm);
- pgt = vcpu->arch.hw_mmu->pgt;
- if (mmu_invalidate_retry(kvm, mmu_seq)) {
- ret = -EAGAIN;
+ pgt = s2fd->vcpu->arch.hw_mmu->pgt;
+ ret = -EAGAIN;
+ if (mmu_invalidate_retry(kvm, s2vi->mmu_seq))
goto out_unlock;
- }
+
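+ /* A granule of 0 flags a non-permission fault for the checks below */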
+ perm_fault_granule = (kvm_s2_fault_is_perm(s2fd) ?
+ kvm_vcpu_trap_get_perm_fault_granule(s2fd->vcpu) : 0);
+ mapping_size = s2vi->vma_pagesize;
+ pfn = s2vi->pfn;
+ gfn = s2vi->gfn;
/*
* If we are not forced to use page mapping, check if we are
* backed by a THP and thus use block mapping if possible.
*/
- if (vma_pagesize == PAGE_SIZE && !(force_pte || s2_force_noncacheable)) {
- if (fault_is_perm && fault_granule > PAGE_SIZE)
- vma_pagesize = fault_granule;
- else
- vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
- hva, &pfn,
- &fault_ipa);
-
- if (vma_pagesize < 0) {
- ret = vma_pagesize;
- goto out_unlock;
- }
- }
-
- if (!fault_is_perm && !s2_force_noncacheable && kvm_has_mte(kvm)) {
- /* Check the VMM hasn't introduced a new disallowed VMA */
- if (mte_allowed) {
- sanitise_mte_tags(kvm, pfn, vma_pagesize);
+ if (mapping_size == PAGE_SIZE &&
+ !(s2vi->max_map_size == PAGE_SIZE || s2vi->map_non_cacheable)) {
+ if (perm_fault_granule > PAGE_SIZE) {
+ mapping_size = perm_fault_granule;
} else {
- ret = -EFAULT;
- goto out_unlock;
+ mapping_size = transparent_hugepage_adjust(kvm, s2fd->memslot,
+ s2fd->hva, &pfn,
+ &gfn);
+ if (mapping_size < 0) {
+ ret = mapping_size;
+ goto out_unlock;
+ }
}
}
- if (writable)
- prot |= KVM_PGTABLE_PROT_W;
-
- if (exec_fault)
- prot |= KVM_PGTABLE_PROT_X;
-
- if (s2_force_noncacheable) {
- if (vfio_allow_any_uc)
- prot |= KVM_PGTABLE_PROT_NORMAL_NC;
- else
- prot |= KVM_PGTABLE_PROT_DEVICE;
- } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) {
- prot |= KVM_PGTABLE_PROT_X;
- }
-
- if (nested)
- adjust_nested_exec_perms(kvm, nested, &prot);
+ if (!perm_fault_granule && !s2vi->map_non_cacheable && kvm_has_mte(kvm))
+ sanitise_mte_tags(kvm, pfn, mapping_size);
/*
* Under the premise of getting a FSC_PERM fault, we just need to relax
- * permissions only if vma_pagesize equals fault_granule. Otherwise,
+ * permissions only if mapping_size equals perm_fault_granule. Otherwise,
* kvm_pgtable_stage2_map() should be called to change block size.
*/
- if (fault_is_perm && vma_pagesize == fault_granule) {
+ if (mapping_size == perm_fault_granule) {
/*
* Drop the SW bits in favour of those stored in the
* PTE, which will be preserved.
*/
prot &= ~KVM_NV_GUEST_MAP_SZ;
- ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags);
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, gfn_to_gpa(gfn),
+ prot, flags);
} else {
- ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
- __pfn_to_phys(pfn), prot,
- memcache, flags);
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, gfn_to_gpa(gfn), mapping_size,
+ __pfn_to_phys(pfn), prot,
+ memcache, flags);
}
out_unlock:
- kvm_release_faultin_page(kvm, page, !!ret, writable);
+ kvm_release_faultin_page(kvm, s2vi->page, !!ret, writable);
kvm_fault_unlock(kvm);
- /* Mark the page dirty only if the fault is handled successfully */
- if (writable && !ret)
- mark_page_dirty_in_slot(kvm, memslot, gfn);
+ /*
+ * Mark the page dirty only if the fault is handled successfully,
+ * making sure we adjust the canonical IPA if the mapping size has
+ * been updated (via a THP upgrade, for example).
+ */
+ if (writable && !ret) {
+ phys_addr_t ipa = gfn_to_gpa(get_canonical_gfn(s2fd, s2vi));
+ ipa &= ~(mapping_size - 1);
+ mark_page_dirty_in_slot(kvm, s2fd->memslot, gpa_to_gfn(ipa));
+ }
- return ret != -EAGAIN ? ret : 0;
+ if (ret != -EAGAIN)
+ return ret;
+ return 0;
+}
-out_put_page:
- kvm_release_page_unused(page);
- return ret;
+static int user_mem_abort(const struct kvm_s2_fault_desc *s2fd)
+{
+ bool perm_fault = kvm_vcpu_trap_is_permission_fault(s2fd->vcpu);
+ struct kvm_s2_fault_vma_info s2vi = {};
+ enum kvm_pgtable_prot prot;
+ void *memcache;
+ int ret;
+
+ /*
+ * Permission faults just need to update the existing leaf entry,
+ * and so normally don't require allocations from the memcache. The
+ * only exception to this is when dirty logging is enabled at runtime
+ * and a write fault needs to collapse a block entry into a table.
+ */
+ memcache = get_mmu_memcache(s2fd->vcpu);
+ if (!perm_fault || (memslot_is_logging(s2fd->memslot) &&
+ kvm_is_write_fault(s2fd->vcpu))) {
+ ret = topup_mmu_memcache(s2fd->vcpu, memcache);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Let's check if we will get back a huge page backed by hugetlbfs, or
+ * get block mapping for device MMIO region.
+ */
+ ret = kvm_s2_fault_pin_pfn(s2fd, &s2vi);
+ if (ret != 1)
+ return ret;
+
+ ret = kvm_s2_fault_compute_prot(s2fd, &s2vi, &prot);
+ if (ret) {
+ kvm_release_page_unused(s2vi.page);
+ return ret;
+ }
+
+ return kvm_s2_fault_map(s2fd, &s2vi, prot, memcache);
}
/* Resolve the access fault by making the page young again. */
@@ -2202,15 +2355,27 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
goto out_unlock;
}
- VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
- !write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu));
+ const struct kvm_s2_fault_desc s2fd = {
+ .vcpu = vcpu,
+ .fault_ipa = fault_ipa,
+ .nested = nested,
+ .memslot = memslot,
+ .hva = hva,
+ };
+
+ if (kvm_vm_is_protected(vcpu->kvm)) {
+ ret = pkvm_mem_abort(&s2fd);
+ } else {
+ VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
+ !write_fault &&
+ !kvm_vcpu_trap_is_exec_fault(vcpu));
+
+ if (kvm_slot_has_gmem(memslot))
+ ret = gmem_abort(&s2fd);
+ else
+ ret = user_mem_abort(&s2fd);
+ }
- if (kvm_slot_has_gmem(memslot))
- ret = gmem_abort(vcpu, fault_ipa, nested, memslot,
- esr_fsc_is_permission_fault(esr));
- else
- ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
- esr_fsc_is_permission_fault(esr));
if (ret == 0)
ret = 1;
out:
@@ -2223,7 +2388,7 @@ out_unlock:
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- if (!kvm->arch.mmu.pgt)
+ if (!kvm->arch.mmu.pgt || kvm_vm_is_protected(kvm))
return false;
__unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
@@ -2238,7 +2403,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
u64 size = (range->end - range->start) << PAGE_SHIFT;
- if (!kvm->arch.mmu.pgt)
+ if (!kvm->arch.mmu.pgt || kvm_vm_is_protected(kvm))
return false;
return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
@@ -2254,7 +2419,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
u64 size = (range->end - range->start) << PAGE_SHIFT;
- if (!kvm->arch.mmu.pgt)
+ if (!kvm->arch.mmu.pgt || kvm_vm_is_protected(kvm))
return false;
return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
@@ -2411,6 +2576,19 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
hva_t hva, reg_end;
int ret = 0;
+ if (kvm_vm_is_protected(kvm)) {
+ /* Cannot modify memslots once a pVM has run. */
+ if (pkvm_hyp_vm_is_created(kvm) &&
+ (change == KVM_MR_DELETE || change == KVM_MR_MOVE)) {
+ return -EPERM;
+ }
+
+ if (new &&
+ new->flags & (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)) {
+ return -EPERM;
+ }
+ }
+
if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
change != KVM_MR_FLAGS_ONLY)
return 0;
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 2c43097248b2..883b6c1008fb 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -735,8 +735,10 @@ static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu)
kvm->arch.nested_mmus_next = (i + 1) % kvm->arch.nested_mmus_size;
/* Make sure we don't forget to do the laundry */
- if (kvm_s2_mmu_valid(s2_mmu))
+ if (kvm_s2_mmu_valid(s2_mmu)) {
+ kvm_nested_s2_ptdump_remove_debugfs(s2_mmu);
s2_mmu->pending_unmap = true;
+ }
/*
* The virtual VMID (modulo CnP) will be used as a key when matching
@@ -750,6 +752,8 @@ static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu)
s2_mmu->tlb_vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
s2_mmu->nested_stage2_enabled = vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_VM;
+ kvm_nested_s2_ptdump_create_debugfs(s2_mmu);
+
out:
atomic_inc(&s2_mmu->refcnt);
@@ -1558,6 +1562,11 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
ID_AA64PFR1_EL1_MTE);
break;
+ case SYS_ID_AA64PFR2_EL1:
+ /* GICv5 is not yet supported for NV */
+ val &= ~ID_AA64PFR2_EL1_GCIE;
+ break;
+
case SYS_ID_AA64MMFR0_EL1:
/* Hide ExS, Secure Memory */
val &= ~(ID_AA64MMFR0_EL1_EXS |
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index d7a0f69a9982..053e4f733e4b 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -88,7 +88,7 @@ void __init kvm_hyp_reserve(void)
static void __pkvm_destroy_hyp_vm(struct kvm *kvm)
{
if (pkvm_hyp_vm_is_created(kvm)) {
- WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm,
+ WARN_ON(kvm_call_hyp_nvhe(__pkvm_finalize_teardown_vm,
kvm->arch.pkvm.handle));
} else if (kvm->arch.pkvm.handle) {
/*
@@ -192,10 +192,16 @@ int pkvm_create_hyp_vm(struct kvm *kvm)
{
int ret = 0;
+ /*
+ * Synchronise with kvm_arch_prepare_memory_region(), as we
+ * prevent memslot modifications on a pVM that has been run.
+ */
+ mutex_lock(&kvm->slots_lock);
mutex_lock(&kvm->arch.config_lock);
if (!pkvm_hyp_vm_is_created(kvm))
ret = __pkvm_create_hyp_vm(kvm);
mutex_unlock(&kvm->arch.config_lock);
+ mutex_unlock(&kvm->slots_lock);
return ret;
}
@@ -219,9 +225,10 @@ void pkvm_destroy_hyp_vm(struct kvm *kvm)
mutex_unlock(&kvm->arch.config_lock);
}
-int pkvm_init_host_vm(struct kvm *kvm)
+int pkvm_init_host_vm(struct kvm *kvm, unsigned long type)
{
int ret;
+ bool protected = type & KVM_VM_TYPE_ARM_PROTECTED;
if (pkvm_hyp_vm_is_created(kvm))
return -EINVAL;
@@ -236,6 +243,11 @@ int pkvm_init_host_vm(struct kvm *kvm)
return ret;
kvm->arch.pkvm.handle = ret;
+ kvm->arch.pkvm.is_protected = protected;
+ if (protected) {
+ pr_warn_once("kvm: protected VMs are experimental and for development only, tainting kernel\n");
+ add_taint(TAINT_USER, LOCKDEP_STILL_OK);
+ }
return 0;
}
@@ -322,15 +334,38 @@ int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
return 0;
}
-static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 end)
+static int __pkvm_pgtable_stage2_reclaim(struct kvm_pgtable *pgt, u64 start, u64 end)
{
struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
pkvm_handle_t handle = kvm->arch.pkvm.handle;
struct pkvm_mapping *mapping;
int ret;
- if (!handle)
- return 0;
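+ /*
+ * Tear each page out of the dying guest: the hypervisor scrubs it
+ * before returning it to the host, at which point the GUP pin and
+ * the locked-vm accounting can be dropped.
+ */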
+ for_each_mapping_in_range_safe(pgt, start, end, mapping) {
+ struct page *page;
+
+ ret = kvm_call_hyp_nvhe(__pkvm_reclaim_dying_guest_page,
+ handle, mapping->gfn);
+ if (WARN_ON(ret))
+ continue;
+
+ page = pfn_to_page(mapping->pfn);
+ WARN_ON_ONCE(mapping->nr_pages != 1);
+ unpin_user_pages_dirty_lock(&page, 1, true);
+ account_locked_vm(current->mm, 1, false);
+ pkvm_mapping_remove(mapping, &pgt->pkvm_mappings);
+ kfree(mapping);
+ }
+
+ return 0;
+}
+
+static int __pkvm_pgtable_stage2_unshare(struct kvm_pgtable *pgt, u64 start, u64 end)
+{
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
+ pkvm_handle_t handle = kvm->arch.pkvm.handle;
+ struct pkvm_mapping *mapping;
+ int ret;
for_each_mapping_in_range_safe(pgt, start, end, mapping) {
ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn,
@@ -347,7 +382,21 @@ static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 e
void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
u64 addr, u64 size)
{
- __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size);
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
+ pkvm_handle_t handle = kvm->arch.pkvm.handle;
+
+ if (!handle)
+ return;
+
+ if (pkvm_hyp_vm_is_created(kvm) && !kvm->arch.pkvm.is_dying) {
+ WARN_ON(kvm_call_hyp_nvhe(__pkvm_start_teardown_vm, handle));
+ kvm->arch.pkvm.is_dying = true;
+ }
+
+ if (kvm_vm_is_protected(kvm))
+ __pkvm_pgtable_stage2_reclaim(pgt, addr, addr + size);
+ else
+ __pkvm_pgtable_stage2_unshare(pgt, addr, addr + size);
}
void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt)
@@ -365,31 +414,58 @@ int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
struct kvm_hyp_memcache *cache = mc;
u64 gfn = addr >> PAGE_SHIFT;
u64 pfn = phys >> PAGE_SHIFT;
+ u64 end = addr + size;
int ret;
- if (size != PAGE_SIZE && size != PMD_SIZE)
- return -EINVAL;
-
lockdep_assert_held_write(&kvm->mmu_lock);
+ mapping = pkvm_mapping_iter_first(&pgt->pkvm_mappings, addr, end - 1);
- /*
- * Calling stage2_map() on top of existing mappings is either happening because of a race
- * with another vCPU, or because we're changing between page and block mappings. As per
- * user_mem_abort(), same-size permission faults are handled in the relax_perms() path.
- */
- mapping = pkvm_mapping_iter_first(&pgt->pkvm_mappings, addr, addr + size - 1);
- if (mapping) {
- if (size == (mapping->nr_pages * PAGE_SIZE))
- return -EAGAIN;
-
- /* Remove _any_ pkvm_mapping overlapping with the range, bigger or smaller. */
- ret = __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size);
- if (ret)
- return ret;
- mapping = NULL;
+ if (kvm_vm_is_protected(kvm)) {
+ /* Protected VMs are mapped using RWX page-granular mappings */
+ if (WARN_ON_ONCE(size != PAGE_SIZE))
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(prot != KVM_PGTABLE_PROT_RWX))
+ return -EINVAL;
+
+ /*
+ * We either raced with another vCPU or the guest PTE
+ * has been poisoned by an erroneous host access.
+ */
+ if (mapping) {
+ ret = kvm_call_hyp_nvhe(__pkvm_vcpu_in_poison_fault);
+ return ret ? -EFAULT : -EAGAIN;
+ }
+
+ ret = kvm_call_hyp_nvhe(__pkvm_host_donate_guest, pfn, gfn);
+ } else {
+ if (WARN_ON_ONCE(size != PAGE_SIZE && size != PMD_SIZE))
+ return -EINVAL;
+
+ /*
+ * We either raced with another vCPU or we're changing between
+ * page and block mappings. As per user_mem_abort(), same-size
+ * permission faults are handled in the relax_perms() path.
+ */
+ if (mapping) {
+ if (size == (mapping->nr_pages * PAGE_SIZE))
+ return -EAGAIN;
+
+ /*
+ * Remove _any_ pkvm_mapping overlapping with the range,
+ * bigger or smaller.
+ */
+ ret = __pkvm_pgtable_stage2_unshare(pgt, addr, end);
+ if (ret)
+ return ret;
+
+ mapping = NULL;
+ }
+
+ ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn,
+ size / PAGE_SIZE, prot);
}
- ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn, size / PAGE_SIZE, prot);
if (WARN_ON(ret))
return ret;
@@ -404,9 +480,14 @@ int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
int pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
- lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(pgt->mmu)->mmu_lock);
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
- return __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size);
+ if (WARN_ON(kvm_vm_is_protected(kvm)))
+ return -EPERM;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ return __pkvm_pgtable_stage2_unshare(pgt, addr, addr + size);
}
int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
@@ -416,6 +497,9 @@ int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
struct pkvm_mapping *mapping;
int ret = 0;
+ if (WARN_ON(kvm_vm_is_protected(kvm)))
+ return -EPERM;
+
lockdep_assert_held(&kvm->mmu_lock);
for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) {
ret = kvm_call_hyp_nvhe(__pkvm_host_wrprotect_guest, handle, mapping->gfn,
@@ -447,6 +531,9 @@ bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64
struct pkvm_mapping *mapping;
bool young = false;
+ if (WARN_ON(kvm_vm_is_protected(kvm)))
+ return false;
+
lockdep_assert_held(&kvm->mmu_lock);
for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping)
young |= kvm_call_hyp_nvhe(__pkvm_host_test_clear_young_guest, handle, mapping->gfn,
@@ -458,12 +545,18 @@ bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64
int pkvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot,
enum kvm_pgtable_walk_flags flags)
{
+ if (WARN_ON(kvm_vm_is_protected(kvm_s2_mmu_to_kvm(pgt->mmu))))
+ return -EPERM;
+
return kvm_call_hyp_nvhe(__pkvm_host_relax_perms_guest, addr >> PAGE_SHIFT, prot);
}
void pkvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr,
enum kvm_pgtable_walk_flags flags)
{
+ if (WARN_ON(kvm_vm_is_protected(kvm_s2_mmu_to_kvm(pgt->mmu))))
+ return;
+
WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_mkyoung_guest, addr >> PAGE_SHIFT));
}
@@ -485,3 +578,15 @@ int pkvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
WARN_ON_ONCE(1);
return -EINVAL;
}
+
+/*
+ * Forcefully reclaim a page from the guest, zeroing its contents and
+ * poisoning the stage-2 pte so that no page can be mapped at the same
+ * IPA again. The page remains pinned until the guest is destroyed.
+ */
+bool pkvm_force_reclaim_guest_page(phys_addr_t phys)
+{
+ int ret = kvm_call_hyp_nvhe(__pkvm_force_reclaim_guest_page, phys);
+
+ return !ret || ret == -EAGAIN;
+}
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 93cc9bbb5cec..e1860acae641 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -939,7 +939,8 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
* number against the dimensions of the vgic and make sure
* it's valid.
*/
- if (!irq_is_ppi(irq) && !vgic_valid_spi(vcpu->kvm, irq))
+ if (!irq_is_ppi(vcpu->kvm, irq) &&
+ !vgic_valid_spi(vcpu->kvm, irq))
return -EINVAL;
} else if (kvm_arm_pmu_irq_initialized(vcpu)) {
return -EINVAL;
@@ -961,8 +962,13 @@ static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
if (!vgic_initialized(vcpu->kvm))
return -ENODEV;
- if (!kvm_arm_pmu_irq_initialized(vcpu))
- return -ENXIO;
+ if (!kvm_arm_pmu_irq_initialized(vcpu)) {
+ if (!vgic_is_v5(vcpu->kvm))
+ return -ENXIO;
+
+ /* Use the architected irq number for GICv5. */
+ vcpu->arch.pmu.irq_num = KVM_ARMV8_PMU_GICV5_IRQ;
+ }
ret = kvm_vgic_set_owner(vcpu, vcpu->arch.pmu.irq_num,
&vcpu->arch.pmu);
@@ -987,11 +993,15 @@ static bool pmu_irq_is_valid(struct kvm *kvm, int irq)
unsigned long i;
struct kvm_vcpu *vcpu;
+ /* On GICv5, the PMUIRQ is architecturally mandated to be PPI 23 */
+ if (vgic_is_v5(kvm) && irq != KVM_ARMV8_PMU_GICV5_IRQ)
+ return false;
+
kvm_for_each_vcpu(i, vcpu, kvm) {
if (!kvm_arm_pmu_irq_initialized(vcpu))
continue;
- if (irq_is_ppi(irq)) {
+ if (irq_is_ppi(vcpu->kvm, irq)) {
if (vcpu->arch.pmu.irq_num != irq)
return false;
} else {
@@ -1142,7 +1152,7 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
return -EFAULT;
/* The PMU overflow interrupt can be a PPI or a valid SPI. */
- if (!(irq_is_ppi(irq) || irq_is_spi(irq)))
+ if (!(irq_is_ppi(vcpu->kvm, irq) || irq_is_spi(vcpu->kvm, irq)))
return -EINVAL;
if (!pmu_irq_is_valid(kvm, irq))
diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c
index 6a8836207a79..c9140e22abcf 100644
--- a/arch/arm64/kvm/ptdump.c
+++ b/arch/arm64/kvm/ptdump.c
@@ -10,19 +10,20 @@
#include <linux/kvm_host.h>
#include <linux/seq_file.h>
+#include <asm/cpufeature.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/ptdump.h>
#define MARKERS_LEN 2
#define KVM_PGTABLE_MAX_LEVELS (KVM_PGTABLE_LAST_LEVEL + 1)
+#define S2FNAMESZ sizeof("0x0123456789abcdef-0x0123456789abcdef-s2-disabled")
struct kvm_ptdump_guest_state {
- struct kvm *kvm;
+ struct kvm_s2_mmu *mmu;
struct ptdump_pg_state parser_state;
struct addr_marker ipa_marker[MARKERS_LEN];
struct ptdump_pg_level level[KVM_PGTABLE_MAX_LEVELS];
- struct ptdump_range range[MARKERS_LEN];
};
static const struct ptdump_prot_bits stage2_pte_bits[] = {
@@ -112,10 +113,9 @@ static int kvm_ptdump_build_levels(struct ptdump_pg_level *level, u32 start_lvl)
return 0;
}
-static struct kvm_ptdump_guest_state *kvm_ptdump_parser_create(struct kvm *kvm)
+static struct kvm_ptdump_guest_state *kvm_ptdump_parser_create(struct kvm_s2_mmu *mmu)
{
struct kvm_ptdump_guest_state *st;
- struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
struct kvm_pgtable *pgtable = mmu->pgt;
int ret;
@@ -131,17 +131,8 @@ static struct kvm_ptdump_guest_state *kvm_ptdump_parser_create(struct kvm *kvm)
st->ipa_marker[0].name = "Guest IPA";
st->ipa_marker[1].start_address = BIT(pgtable->ia_bits);
- st->range[0].end = BIT(pgtable->ia_bits);
-
- st->kvm = kvm;
- st->parser_state = (struct ptdump_pg_state) {
- .marker = &st->ipa_marker[0],
- .level = -1,
- .pg_level = &st->level[0],
- .ptdump.range = &st->range[0],
- .start_address = 0,
- };
+ st->mmu = mmu;
return st;
}
@@ -149,16 +140,20 @@ static int kvm_ptdump_guest_show(struct seq_file *m, void *unused)
{
int ret;
struct kvm_ptdump_guest_state *st = m->private;
- struct kvm *kvm = st->kvm;
- struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
- struct ptdump_pg_state *parser_state = &st->parser_state;
+ struct kvm_s2_mmu *mmu = st->mmu;
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
struct kvm_pgtable_walker walker = (struct kvm_pgtable_walker) {
.cb = kvm_ptdump_visitor,
- .arg = parser_state,
+ .arg = &st->parser_state,
.flags = KVM_PGTABLE_WALK_LEAF,
};
- parser_state->seq = m;
+ st->parser_state = (struct ptdump_pg_state) {
+ .marker = &st->ipa_marker[0],
+ .level = -1,
+ .pg_level = &st->level[0],
+ .seq = m,
+ };
write_lock(&kvm->mmu_lock);
ret = kvm_pgtable_walk(mmu->pgt, 0, BIT(mmu->pgt->ia_bits), &walker);
@@ -169,14 +164,15 @@ static int kvm_ptdump_guest_show(struct seq_file *m, void *unused)
static int kvm_ptdump_guest_open(struct inode *m, struct file *file)
{
- struct kvm *kvm = m->i_private;
+ struct kvm_s2_mmu *mmu = m->i_private;
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
struct kvm_ptdump_guest_state *st;
int ret;
if (!kvm_get_kvm_safe(kvm))
return -ENOENT;
- st = kvm_ptdump_parser_create(kvm);
+ st = kvm_ptdump_parser_create(mmu);
if (IS_ERR(st)) {
ret = PTR_ERR(st);
goto err_with_kvm_ref;
@@ -194,7 +190,7 @@ err_with_kvm_ref:
static int kvm_ptdump_guest_close(struct inode *m, struct file *file)
{
- struct kvm *kvm = m->i_private;
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(m->i_private);
void *st = ((struct seq_file *)file->private_data)->private;
kfree(st);
@@ -229,14 +225,15 @@ static int kvm_pgtable_levels_show(struct seq_file *m, void *unused)
static int kvm_pgtable_debugfs_open(struct inode *m, struct file *file,
int (*show)(struct seq_file *, void *))
{
- struct kvm *kvm = m->i_private;
+ struct kvm_s2_mmu *mmu = m->i_private;
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
struct kvm_pgtable *pgtable;
int ret;
if (!kvm_get_kvm_safe(kvm))
return -ENOENT;
- pgtable = kvm->arch.mmu.pgt;
+ pgtable = mmu->pgt;
ret = single_open(file, show, pgtable);
if (ret < 0)
@@ -256,7 +253,7 @@ static int kvm_pgtable_levels_open(struct inode *m, struct file *file)
static int kvm_pgtable_debugfs_close(struct inode *m, struct file *file)
{
- struct kvm *kvm = m->i_private;
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(m->i_private);
kvm_put_kvm(kvm);
return single_release(m, file);
@@ -276,12 +273,36 @@ static const struct file_operations kvm_pgtable_levels_fops = {
.release = kvm_pgtable_debugfs_close,
};
+void kvm_nested_s2_ptdump_create_debugfs(struct kvm_s2_mmu *mmu)
+{
+ struct dentry *dent;
+ char file_name[S2FNAMESZ];
+
+ snprintf(file_name, sizeof(file_name), "0x%016llx-0x%016llx-s2-%sabled",
+ mmu->tlb_vttbr,
+ mmu->tlb_vtcr,
+ mmu->nested_stage2_enabled ? "en" : "dis");
+
+ dent = debugfs_create_file(file_name, 0400,
+ mmu->arch->debugfs_nv_dentry, mmu,
+ &kvm_ptdump_guest_fops);
+
+ mmu->shadow_pt_debugfs_dentry = dent;
+}
+
+void kvm_nested_s2_ptdump_remove_debugfs(struct kvm_s2_mmu *mmu)
+{
+ debugfs_remove(mmu->shadow_pt_debugfs_dentry);
+}
+
void kvm_s2_ptdump_create_debugfs(struct kvm *kvm)
{
debugfs_create_file("stage2_page_tables", 0400, kvm->debugfs_dentry,
- kvm, &kvm_ptdump_guest_fops);
- debugfs_create_file("ipa_range", 0400, kvm->debugfs_dentry, kvm,
- &kvm_pgtable_range_fops);
+ &kvm->arch.mmu, &kvm_ptdump_guest_fops);
+ debugfs_create_file("ipa_range", 0400, kvm->debugfs_dentry,
+ &kvm->arch.mmu, &kvm_pgtable_range_fops);
debugfs_create_file("stage2_levels", 0400, kvm->debugfs_dentry,
- kvm, &kvm_pgtable_levels_fops);
+ &kvm->arch.mmu, &kvm_pgtable_levels_fops);
+ if (cpus_have_final_cap(ARM64_HAS_NESTED_VIRT))
+ kvm->arch.debugfs_nv_dentry = debugfs_create_dir("nested", kvm->debugfs_dentry);
}
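As an aside on the naming scheme above: each shadow MMU gets a debugfs file keyed by its VTTBR, VTCR and enable state, and S2FNAMESZ is sized for the longer "disabled" spelling. A standalone sketch of the name generation (the VTTBR/VTCR values are made up; only the S2FNAMESZ definition mirrors the kernel's):

#include <stdio.h>

#define S2FNAMESZ sizeof("0x0123456789abcdef-0x0123456789abcdef-s2-disabled")

int main(void)
{
	char name[S2FNAMESZ];
	unsigned long long vttbr = 0x8000d34d0000ull;	/* hypothetical */
	unsigned long long vtcr  = 0x80043594ull;	/* hypothetical */
	int enabled = 0;

	/* Mirrors the snprintf() in kvm_nested_s2_ptdump_create_debugfs() */
	snprintf(name, sizeof(name), "0x%016llx-0x%016llx-s2-%sabled",
		 vttbr, vtcr, enabled ? "en" : "dis");
	puts(name);	/* 0x00008000d34d0000-0x0000000080043594-s2-disabled */
	return 0;
}

The "disabled" case fills the buffer exactly (49 characters plus the NUL), so the sizeof-based constant leaves no slack but also truncates nothing.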
diff --git a/arch/arm64/kvm/stacktrace.c b/arch/arm64/kvm/stacktrace.c
index af5eec681127..9724c320126b 100644
--- a/arch/arm64/kvm/stacktrace.c
+++ b/arch/arm64/kvm/stacktrace.c
@@ -197,7 +197,7 @@ static void hyp_dump_backtrace(unsigned long hyp_offset)
kvm_nvhe_dump_backtrace_end();
}
-#ifdef CONFIG_PROTECTED_NVHE_STACKTRACE
+#ifdef CONFIG_PKVM_STACKTRACE
DECLARE_KVM_NVHE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)],
pkvm_stacktrace);
@@ -225,12 +225,12 @@ static void pkvm_dump_backtrace(unsigned long hyp_offset)
kvm_nvhe_dump_backtrace_entry((void *)hyp_offset, stacktrace[i]);
kvm_nvhe_dump_backtrace_end();
}
-#else /* !CONFIG_PROTECTED_NVHE_STACKTRACE */
+#else /* !CONFIG_PKVM_STACKTRACE */
static void pkvm_dump_backtrace(unsigned long hyp_offset)
{
- kvm_err("Cannot dump pKVM nVHE stacktrace: !CONFIG_PROTECTED_NVHE_STACKTRACE\n");
+ kvm_err("Cannot dump pKVM nVHE stacktrace: !CONFIG_PKVM_STACKTRACE\n");
}
-#endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */
+#endif /* CONFIG_PKVM_STACKTRACE */
/*
* kvm_nvhe_dump_backtrace - Dump KVM nVHE hypervisor backtrace.
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 1b4cacb6e918..6a96cb7ba9a3 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -681,6 +681,91 @@ static bool access_gic_dir(struct kvm_vcpu *vcpu,
return true;
}
+static bool access_gicv5_idr0(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ return undef_access(vcpu, p, r);
+
+ /*
+ * Expose KVM's priority- and ID-bits to the guest, but not GCIE_LEGACY.
+ *
+	 * Note: for GICv5 we mimic the way that the num_pri_bits and
+ * num_id_bits fields are used with GICv3:
+ * - num_pri_bits stores the actual number of priority bits, whereas the
+ * register field stores num_pri_bits - 1.
+ * - num_id_bits stores the raw field value, which is 0b0000 for 16 bits
+ * and 0b0001 for 24 bits.
+ */
+ p->regval = FIELD_PREP(ICC_IDR0_EL1_PRI_BITS, vcpu->arch.vgic_cpu.num_pri_bits - 1) |
+ FIELD_PREP(ICC_IDR0_EL1_ID_BITS, vcpu->arch.vgic_cpu.num_id_bits);
+
+ return true;
+}
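A standalone sketch of the encoding rules spelled out above (the ID_BITS field sits at bits [3:0] per the sysreg table later in this series; the PRI_BITS position below is a placeholder, not the architected one):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* ID_BITS occupies bits [3:0]; the PRI_BITS position is a placeholder. */
#define IDR0_ID_BITS_SHIFT	0
#define IDR0_PRI_BITS_SHIFT	11

static uint64_t encode_idr0(unsigned int num_pri_bits, unsigned int id_bits_raw)
{
	/* The register holds num_pri_bits - 1, but ID_BITS is stored raw. */
	return ((uint64_t)(num_pri_bits - 1) << IDR0_PRI_BITS_SHIFT) |
	       ((uint64_t)id_bits_raw << IDR0_ID_BITS_SHIFT);
}

int main(void)
{
	/* KVM's GICv5 defaults: 5 priority bits, 16-bit ID space (raw 0b0000) */
	uint64_t regval = encode_idr0(5, 0);

	assert(((regval >> IDR0_PRI_BITS_SHIFT) & 0x7) == 4);
	printf("ICC_IDR0_EL1 = %#llx\n", (unsigned long long)regval);
	return 0;
}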
+
+static bool access_gicv5_iaffid(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ return undef_access(vcpu, p, r);
+
+ /*
+ * For GICv5 VMs, the IAFFID value is the same as the VPE ID. The VPE ID
+ * is the same as the VCPU's ID.
+ */
+ p->regval = FIELD_PREP(ICC_IAFFIDR_EL1_IAFFID, vcpu->vcpu_id);
+
+ return true;
+}
+
+static bool access_gicv5_ppi_enabler(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ unsigned long *mask = vcpu->kvm->arch.vgic.gicv5_vm.vgic_ppi_mask;
+ struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
+ int i;
+
+ /* We never expect to get here with a read! */
+ if (WARN_ON_ONCE(!p->is_write))
+ return undef_access(vcpu, p, r);
+
+ /*
+ * If we're only handling architected PPIs and the guest writes to the
+ * enable for the non-architected PPIs, we just return as there's
+ * nothing to do at all. We don't even allocate the storage for them in
+ * this case.
+ */
+ if (VGIC_V5_NR_PRIVATE_IRQS == 64 && p->Op2 % 2)
+ return true;
+
+ /*
+	 * Merge the raw guest write into our bitmap at an offset of either 0
+	 * or 64, then AND it with our PPI mask.
+ */
+ bitmap_write(cpu_if->vgic_ppi_enabler, p->regval, 64 * (p->Op2 % 2), 64);
+ bitmap_and(cpu_if->vgic_ppi_enabler, cpu_if->vgic_ppi_enabler, mask,
+ VGIC_V5_NR_PRIVATE_IRQS);
+
+ /*
+ * Sync the change in enable states to the vgic_irqs. We consider all
+ * PPIs as we don't expose many to the guest.
+ */
+ for_each_set_bit(i, mask, VGIC_V5_NR_PRIVATE_IRQS) {
+ u32 intid = vgic_v5_make_ppi(i);
+ struct vgic_irq *irq;
+
+ irq = vgic_get_vcpu_irq(vcpu, intid);
+
+ scoped_guard(raw_spinlock_irqsave, &irq->irq_lock)
+ irq->enabled = test_bit(i, cpu_if->vgic_ppi_enabler);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ return true;
+}
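A minimal model of the enabler write path above, using plain 64-bit words in place of the kernel bitmap API (the mask values are made up; word 0 stands for ICC_PPI_ENABLER0_EL1, word 1 for ENABLER1):

#include <stdint.h>
#include <stdio.h>

/*
 * Net effect of bitmap_write() + bitmap_and() in the handler above,
 * modelled per 64-bit word: word 0 is ICC_PPI_ENABLER0_EL1 (Op2 even),
 * word 1 is ENABLER1 (Op2 odd).
 */
static void ppi_enabler_write(uint64_t enabler[2], const uint64_t mask[2],
			      unsigned int op2, uint64_t guest_val)
{
	enabler[op2 % 2] = guest_val & mask[op2 % 2];
}

int main(void)
{
	uint64_t enabler[2] = { 0, 0 };
	const uint64_t mask[2] = { 0x3f, 0 };	/* pretend only PPIs 0-5 exist */

	/* Guest tries to enable everything via ENABLER0... */
	ppi_enabler_write(enabler, mask, 0, ~0ull);
	printf("enabler0 = %#llx\n", (unsigned long long)enabler[0]); /* 0x3f */
	return 0;
}

Because the implemented-PPI mask is re-applied on every write, guest attempts to enable unexposed PPIs are silently dropped, matching the bitmap_and() in the handler.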
+
static bool trap_raz_wi(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
@@ -1758,6 +1843,7 @@ static u8 pmuver_to_perfmon(u8 pmuver)
static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val);
static u64 sanitise_id_aa64pfr1_el1(const struct kvm_vcpu *vcpu, u64 val);
+static u64 sanitise_id_aa64pfr2_el1(const struct kvm_vcpu *vcpu, u64 val);
static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val);
/* Read a sanitised cpufeature ID register by sys_reg_desc */
@@ -1783,10 +1869,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
val = sanitise_id_aa64pfr1_el1(vcpu, val);
break;
case SYS_ID_AA64PFR2_EL1:
- val &= ID_AA64PFR2_EL1_FPMR |
- (kvm_has_mte(vcpu->kvm) ?
- ID_AA64PFR2_EL1_MTEFAR | ID_AA64PFR2_EL1_MTESTOREONLY :
- 0);
+ val = sanitise_id_aa64pfr2_el1(vcpu, val);
break;
case SYS_ID_AA64ISAR1_EL1:
if (!vcpu_has_ptrauth(vcpu))
@@ -1985,7 +2068,7 @@ static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val)
val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV3, IMP);
}
- if (vgic_is_v3(vcpu->kvm)) {
+ if (vgic_host_has_gicv3()) {
val &= ~ID_AA64PFR0_EL1_GIC_MASK;
val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP);
}
@@ -2027,6 +2110,23 @@ static u64 sanitise_id_aa64pfr1_el1(const struct kvm_vcpu *vcpu, u64 val)
return val;
}
+static u64 sanitise_id_aa64pfr2_el1(const struct kvm_vcpu *vcpu, u64 val)
+{
+ val &= ID_AA64PFR2_EL1_FPMR |
+ ID_AA64PFR2_EL1_MTEFAR |
+ ID_AA64PFR2_EL1_MTESTOREONLY;
+
+ if (!kvm_has_mte(vcpu->kvm)) {
+ val &= ~ID_AA64PFR2_EL1_MTEFAR;
+ val &= ~ID_AA64PFR2_EL1_MTESTOREONLY;
+ }
+
+ if (vgic_host_has_gicv5())
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR2_EL1, GCIE, IMP);
+
+ return val;
+}
+
static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val)
{
val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8);
@@ -2177,14 +2277,6 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu,
(vcpu_has_nv(vcpu) && !FIELD_GET(ID_AA64PFR0_EL1_EL2, user_val)))
return -EINVAL;
- /*
- * If we are running on a GICv5 host and support FEAT_GCIE_LEGACY, then
- * we support GICv3. Fail attempts to do anything but set that to IMP.
- */
- if (vgic_is_v3_compat(vcpu->kvm) &&
- FIELD_GET(ID_AA64PFR0_EL1_GIC_MASK, user_val) != ID_AA64PFR0_EL1_GIC_IMP)
- return -EINVAL;
-
return set_id_reg(vcpu, rd, user_val);
}
@@ -2224,6 +2316,12 @@ static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu,
return set_id_reg(vcpu, rd, user_val);
}
+static int set_id_aa64pfr2_el1(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd, u64 user_val)
+{
+ return set_id_reg(vcpu, rd, user_val);
+}
+
/*
* Allow userspace to de-feature a stage-2 translation granule but prevent it
* from claiming the impossible.
@@ -3205,10 +3303,11 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_AA64PFR1_EL1_RES0 |
ID_AA64PFR1_EL1_MPAM_frac |
ID_AA64PFR1_EL1_MTE)),
- ID_WRITABLE(ID_AA64PFR2_EL1,
- ID_AA64PFR2_EL1_FPMR |
- ID_AA64PFR2_EL1_MTEFAR |
- ID_AA64PFR2_EL1_MTESTOREONLY),
+ ID_FILTERED(ID_AA64PFR2_EL1, id_aa64pfr2_el1,
+ (ID_AA64PFR2_EL1_FPMR |
+ ID_AA64PFR2_EL1_MTEFAR |
+ ID_AA64PFR2_EL1_MTESTOREONLY |
+ ID_AA64PFR2_EL1_GCIE)),
ID_UNALLOCATED(4,3),
ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0),
ID_HIDDEN(ID_AA64SMFR0_EL1),
@@ -3391,6 +3490,10 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access },
{ SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access },
{ SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_IDR0_EL1), access_gicv5_idr0 },
+ { SYS_DESC(SYS_ICC_IAFFIDR_EL1), access_gicv5_iaffid },
+ { SYS_DESC(SYS_ICC_PPI_ENABLER0_EL1), access_gicv5_ppi_enabler },
+ { SYS_DESC(SYS_ICC_PPI_ENABLER1_EL1), access_gicv5_ppi_enabler },
{ SYS_DESC(SYS_ICC_DIR_EL1), access_gic_dir },
{ SYS_DESC(SYS_ICC_RPR_EL1), undef_access },
{ SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi },
@@ -5647,6 +5750,8 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu)
compute_fgu(kvm, HFGRTR2_GROUP);
compute_fgu(kvm, HFGITR2_GROUP);
compute_fgu(kvm, HDFGRTR2_GROUP);
+ compute_fgu(kvm, ICH_HFGRTR_GROUP);
+ compute_fgu(kvm, ICH_HFGITR_GROUP);
set_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags);
out:
@@ -5667,23 +5772,58 @@ int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu)
guard(mutex)(&kvm->arch.config_lock);
+ if (vcpu_has_nv(vcpu)) {
+ int ret = kvm_init_nv_sysregs(vcpu);
+ if (ret)
+ return ret;
+ }
+
+ if (kvm_vm_has_ran_once(kvm))
+ return 0;
+
/*
* This hacks into the ID registers, so only perform it when the
* first vcpu runs, or the kvm_set_vm_id_reg() helper will scream.
*/
- if (!irqchip_in_kernel(kvm) && !kvm_vm_has_ran_once(kvm)) {
+ if (!irqchip_in_kernel(kvm)) {
u64 val;
val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC;
kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, val);
+ val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR2_EL1) & ~ID_AA64PFR2_EL1_GCIE;
+ kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR2_EL1, val);
val = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC;
kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, val);
- }
-
- if (vcpu_has_nv(vcpu)) {
- int ret = kvm_init_nv_sysregs(vcpu);
- if (ret)
- return ret;
+ } else {
+ /*
+ * Certain userspace software - QEMU - samples the system
+ * register state without creating an irqchip, then blindly
+ * restores the state prior to running the final guest. This
+ * means that it restores the virtualization & emulation
+ * capabilities of the host system, rather than something that
+ * reflects the final guest state. Moreover, it checks that the
+ * state was "correctly" restored (i.e., verbatim), bailing if
+ * it isn't, so masking off invalid state isn't an option.
+ *
+ * On GICv5 hardware that supports FEAT_GCIE_LEGACY we can run
+ * both GICv3- and GICv5-based guests. Therefore, we initially
+ * present both ID_AA64PFR0.GIC and ID_AA64PFR2.GCIE as IMP to
+ * reflect that userspace can create EITHER a vGICv3 or a
+ * vGICv5. This is an architecturally invalid combination, of
+ * course. Once an in-kernel GIC is created, the sysreg state is
+ * updated to reflect the actual, valid configuration.
+ *
+ * Setting both the GIC and GCIE features to IMP unsurprisingly
+ * results in guests falling over, and hence we need to fix up
+	 * this mess in KVM. Before the guest runs for the first time,
+	 * we once again ensure that the GIC and GCIE fields accurately
+	 * reflect the actual hardware the guest should see.
+ *
+ * This hack allows legacy QEMU-based GICv3 guests to run
+ * unmodified on compatible GICv5 hosts, and avoids the inverse
+ * problem for GICv5-based guests in the future.
+ */
+ kvm_vgic_finalize_idregs(kvm);
}
return 0;
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index e9b8b5fc480c..933983bb2005 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -66,12 +66,11 @@ static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type);
* or through the generic KVM_CREATE_DEVICE API ioctl.
* irqchip_in_kernel() tells you if this function succeeded or not.
* @kvm: kvm struct pointer
- * @type: KVM_DEV_TYPE_ARM_VGIC_V[23]
+ * @type: KVM_DEV_TYPE_ARM_VGIC_V[235]
*/
int kvm_vgic_create(struct kvm *kvm, u32 type)
{
struct kvm_vcpu *vcpu;
- u64 aa64pfr0, pfr1;
unsigned long i;
int ret;
@@ -132,8 +131,11 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
kvm->max_vcpus = VGIC_V2_MAX_CPUS;
- else
+ else if (type == KVM_DEV_TYPE_ARM_VGIC_V3)
kvm->max_vcpus = VGIC_V3_MAX_CPUS;
+ else if (type == KVM_DEV_TYPE_ARM_VGIC_V5)
+ kvm->max_vcpus = min(VGIC_V5_MAX_CPUS,
+ kvm_vgic_global_state.max_gic_vcpus);
if (atomic_read(&kvm->online_vcpus) > kvm->max_vcpus) {
ret = -E2BIG;
@@ -145,19 +147,20 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
kvm->arch.vgic.implementation_rev = KVM_VGIC_IMP_REV_LATEST;
kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
- aa64pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC;
- pfr1 = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC;
-
- if (type == KVM_DEV_TYPE_ARM_VGIC_V2) {
+ switch (type) {
+ case KVM_DEV_TYPE_ARM_VGIC_V2:
kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
- } else {
+ break;
+ case KVM_DEV_TYPE_ARM_VGIC_V3:
INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions);
- aa64pfr0 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP);
- pfr1 |= SYS_FIELD_PREP_ENUM(ID_PFR1_EL1, GIC, GICv3);
+ break;
}
- kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, aa64pfr0);
- kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, pfr1);
+ /*
+ * We've now created the GIC. Update the system register state
+ * to accurately reflect what we've created.
+ */
+ kvm_vgic_finalize_idregs(kvm);
kvm_for_each_vcpu(i, vcpu, kvm) {
ret = vgic_allocate_private_irqs_locked(vcpu, type);
@@ -179,6 +182,15 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
if (type == KVM_DEV_TYPE_ARM_VGIC_V3)
kvm->arch.vgic.nassgicap = system_supports_direct_sgis();
+ /*
+ * We now know that we have a GICv5. The Arch Timer PPI interrupts may
+	 * have been initialised at this stage, but if so that was done assuming
+	 * an older GIC, meaning that the IntIDs won't be correct. We init them
+	 * again, and this time they will be correct.
+ */
+ if (type == KVM_DEV_TYPE_ARM_VGIC_V5)
+ kvm_timer_init_vm(kvm);
+
out_unlock:
mutex_unlock(&kvm->arch.config_lock);
kvm_unlock_all_vcpus(kvm);
@@ -259,9 +271,65 @@ int kvm_vgic_vcpu_nv_init(struct kvm_vcpu *vcpu)
return ret;
}
+static void vgic_allocate_private_irq(struct kvm_vcpu *vcpu, int i, u32 type)
+{
+ struct vgic_irq *irq = &vcpu->arch.vgic_cpu.private_irqs[i];
+
+ INIT_LIST_HEAD(&irq->ap_list);
+ raw_spin_lock_init(&irq->irq_lock);
+ irq->vcpu = NULL;
+ irq->target_vcpu = vcpu;
+ refcount_set(&irq->refcount, 0);
+
+ irq->intid = i;
+ if (vgic_irq_is_sgi(i)) {
+ /* SGIs */
+ irq->enabled = 1;
+ irq->config = VGIC_CONFIG_EDGE;
+ } else {
+ /* PPIs */
+ irq->config = VGIC_CONFIG_LEVEL;
+ }
+
+ switch (type) {
+ case KVM_DEV_TYPE_ARM_VGIC_V3:
+ irq->group = 1;
+ irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
+ break;
+ case KVM_DEV_TYPE_ARM_VGIC_V2:
+ irq->group = 0;
+ irq->targets = BIT(vcpu->vcpu_id);
+ break;
+ }
+}
+
+static void vgic_v5_allocate_private_irq(struct kvm_vcpu *vcpu, int i, u32 type)
+{
+ struct vgic_irq *irq = &vcpu->arch.vgic_cpu.private_irqs[i];
+ u32 intid = vgic_v5_make_ppi(i);
+
+ INIT_LIST_HEAD(&irq->ap_list);
+ raw_spin_lock_init(&irq->irq_lock);
+ irq->vcpu = NULL;
+ irq->target_vcpu = vcpu;
+ refcount_set(&irq->refcount, 0);
+
+ irq->intid = intid;
+
+	/* The only edge-triggered architected PPI is the SW_PPI */
+ if (i == GICV5_ARCH_PPI_SW_PPI)
+ irq->config = VGIC_CONFIG_EDGE;
+ else
+ irq->config = VGIC_CONFIG_LEVEL;
+
+ /* Register the GICv5-specific PPI ops */
+ vgic_v5_set_ppi_ops(vcpu, intid);
+}
+
static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ u32 num_private_irqs;
int i;
lockdep_assert_held(&vcpu->kvm->arch.config_lock);
@@ -269,8 +337,13 @@ static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type)
if (vgic_cpu->private_irqs)
return 0;
+ if (vgic_is_v5(vcpu->kvm))
+ num_private_irqs = VGIC_V5_NR_PRIVATE_IRQS;
+ else
+ num_private_irqs = VGIC_NR_PRIVATE_IRQS;
+
vgic_cpu->private_irqs = kzalloc_objs(struct vgic_irq,
- VGIC_NR_PRIVATE_IRQS,
+ num_private_irqs,
GFP_KERNEL_ACCOUNT);
if (!vgic_cpu->private_irqs)
@@ -280,34 +353,11 @@ static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type)
* Enable and configure all SGIs to be edge-triggered and
* configure all PPIs as level-triggered.
*/
- for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
- struct vgic_irq *irq = &vgic_cpu->private_irqs[i];
-
- INIT_LIST_HEAD(&irq->ap_list);
- raw_spin_lock_init(&irq->irq_lock);
- irq->intid = i;
- irq->vcpu = NULL;
- irq->target_vcpu = vcpu;
- refcount_set(&irq->refcount, 0);
- if (vgic_irq_is_sgi(i)) {
- /* SGIs */
- irq->enabled = 1;
- irq->config = VGIC_CONFIG_EDGE;
- } else {
- /* PPIs */
- irq->config = VGIC_CONFIG_LEVEL;
- }
-
- switch (type) {
- case KVM_DEV_TYPE_ARM_VGIC_V3:
- irq->group = 1;
- irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
- break;
- case KVM_DEV_TYPE_ARM_VGIC_V2:
- irq->group = 0;
- irq->targets = BIT(vcpu->vcpu_id);
- break;
- }
+ for (i = 0; i < num_private_irqs; i++) {
+ if (vgic_is_v5(vcpu->kvm))
+ vgic_v5_allocate_private_irq(vcpu, i, type);
+ else
+ vgic_allocate_private_irq(vcpu, i, type);
}
return 0;
@@ -366,7 +416,11 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
static void kvm_vgic_vcpu_reset(struct kvm_vcpu *vcpu)
{
- if (kvm_vgic_global_state.type == VGIC_V2)
+ const struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+ if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V5)
+ vgic_v5_reset(vcpu);
+ else if (kvm_vgic_global_state.type == VGIC_V2)
vgic_v2_reset(vcpu);
else
vgic_v3_reset(vcpu);
@@ -397,22 +451,28 @@ int vgic_init(struct kvm *kvm)
if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus))
return -EBUSY;
- /* freeze the number of spis */
- if (!dist->nr_spis)
- dist->nr_spis = VGIC_NR_IRQS_LEGACY - VGIC_NR_PRIVATE_IRQS;
+ if (!vgic_is_v5(kvm)) {
+ /* freeze the number of spis */
+ if (!dist->nr_spis)
+ dist->nr_spis = VGIC_NR_IRQS_LEGACY - VGIC_NR_PRIVATE_IRQS;
- ret = kvm_vgic_dist_init(kvm, dist->nr_spis);
- if (ret)
- goto out;
+ ret = kvm_vgic_dist_init(kvm, dist->nr_spis);
+ if (ret)
+ return ret;
- /*
- * Ensure vPEs are allocated if direct IRQ injection (e.g. vSGIs,
- * vLPIs) is supported.
- */
- if (vgic_supports_direct_irqs(kvm)) {
- ret = vgic_v4_init(kvm);
+ /*
+ * Ensure vPEs are allocated if direct IRQ injection (e.g. vSGIs,
+ * vLPIs) is supported.
+ */
+ if (vgic_supports_direct_irqs(kvm)) {
+ ret = vgic_v4_init(kvm);
+ if (ret)
+ return ret;
+ }
+ } else {
+ ret = vgic_v5_init(kvm);
if (ret)
- goto out;
+ return ret;
}
kvm_for_each_vcpu(idx, vcpu, kvm)
@@ -420,12 +480,12 @@ int vgic_init(struct kvm *kvm)
ret = kvm_vgic_setup_default_irq_routing(kvm);
if (ret)
- goto out;
+ return ret;
vgic_debug_init(kvm);
dist->initialized = true;
-out:
- return ret;
+
+ return 0;
}
static void kvm_vgic_dist_destroy(struct kvm *kvm)
@@ -569,6 +629,7 @@ int vgic_lazy_init(struct kvm *kvm)
int kvm_vgic_map_resources(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
+ bool needs_dist = true;
enum vgic_type type;
gpa_t dist_base;
int ret = 0;
@@ -587,21 +648,29 @@ int kvm_vgic_map_resources(struct kvm *kvm)
if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) {
ret = vgic_v2_map_resources(kvm);
type = VGIC_V2;
- } else {
+ } else if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
ret = vgic_v3_map_resources(kvm);
type = VGIC_V3;
+ } else {
+ ret = vgic_v5_map_resources(kvm);
+ type = VGIC_V5;
+ needs_dist = false;
}
if (ret)
goto out;
- dist_base = dist->vgic_dist_base;
- mutex_unlock(&kvm->arch.config_lock);
+ if (needs_dist) {
+ dist_base = dist->vgic_dist_base;
+ mutex_unlock(&kvm->arch.config_lock);
- ret = vgic_register_dist_iodev(kvm, dist_base, type);
- if (ret) {
- kvm_err("Unable to register VGIC dist MMIO regions\n");
- goto out_slots;
+ ret = vgic_register_dist_iodev(kvm, dist_base, type);
+ if (ret) {
+ kvm_err("Unable to register VGIC dist MMIO regions\n");
+ goto out_slots;
+ }
+ } else {
+ mutex_unlock(&kvm->arch.config_lock);
}
smp_store_release(&dist->ready, true);
@@ -617,6 +686,35 @@ out_slots:
return ret;
}
+void kvm_vgic_finalize_idregs(struct kvm *kvm)
+{
+ u32 type = kvm->arch.vgic.vgic_model;
+ u64 aa64pfr0, aa64pfr2, pfr1;
+
+ aa64pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC;
+ aa64pfr2 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR2_EL1) & ~ID_AA64PFR2_EL1_GCIE;
+ pfr1 = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC;
+
+ switch (type) {
+ case KVM_DEV_TYPE_ARM_VGIC_V2:
+ break;
+ case KVM_DEV_TYPE_ARM_VGIC_V3:
+ aa64pfr0 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP);
+ if (kvm_supports_32bit_el0())
+ pfr1 |= SYS_FIELD_PREP_ENUM(ID_PFR1_EL1, GIC, GICv3);
+ break;
+ case KVM_DEV_TYPE_ARM_VGIC_V5:
+ aa64pfr2 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR2_EL1, GCIE, IMP);
+ break;
+ default:
+ WARN_ONCE(1, "Unknown VGIC type!!!\n");
+ }
+
+ kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, aa64pfr0);
+ kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR2_EL1, aa64pfr2);
+ kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, pfr1);
+}
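To make the fixup concrete, here is a minimal model of the ID-register dance (a sketch, not kernel code; the booleans stand in for ID_AA64PFR0_EL1.GIC and ID_AA64PFR2_EL1.GCIE):

#include <stdint.h>
#include <stdio.h>

enum vgic_type_model { VGIC_NONE, VGIC_V3_MODEL, VGIC_V5_MODEL };

struct idregs { int pfr0_gic; int pfr2_gcie; };

/* Clear both GIC fields, then re-advertise only the created flavour. */
static void finalize_idregs(struct idregs *id, enum vgic_type_model t)
{
	id->pfr0_gic = 0;
	id->pfr2_gcie = 0;

	if (t == VGIC_V3_MODEL)
		id->pfr0_gic = 1;	/* IMP */
	else if (t == VGIC_V5_MODEL)
		id->pfr2_gcie = 1;	/* IMP */
}

int main(void)
{
	/* On a FEAT_GCIE_LEGACY host, both fields start out as IMP... */
	struct idregs id = { .pfr0_gic = 1, .pfr2_gcie = 1 };

	/* ...until a vGICv3 is created, which drops GCIE. */
	finalize_idregs(&id, VGIC_V3_MODEL);
	printf("GIC=%d GCIE=%d\n", id.pfr0_gic, id.pfr2_gcie);	/* 1 0 */
	return 0;
}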
+
/* GENERIC PROBE */
void kvm_vgic_cpu_up(void)
diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c
index 3d1a776b716d..a96c77dccf35 100644
--- a/arch/arm64/kvm/vgic/vgic-kvm-device.c
+++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c
@@ -336,6 +336,10 @@ int kvm_register_vgic_device(unsigned long type)
break;
ret = kvm_vgic_register_its_device();
break;
+ case KVM_DEV_TYPE_ARM_VGIC_V5:
+ ret = kvm_register_device_ops(&kvm_arm_vgic_v5_ops,
+ KVM_DEV_TYPE_ARM_VGIC_V5);
+ break;
}
return ret;
@@ -639,7 +643,7 @@ static int vgic_v3_set_attr(struct kvm_device *dev,
if (vgic_initialized(dev->kvm))
return -EBUSY;
- if (!irq_is_ppi(val))
+ if (!irq_is_ppi(dev->kvm, val))
return -EINVAL;
dev->kvm->arch.vgic.mi_intid = val;
@@ -715,3 +719,104 @@ struct kvm_device_ops kvm_arm_vgic_v3_ops = {
.get_attr = vgic_v3_get_attr,
.has_attr = vgic_v3_has_attr,
};
+
+static int vgic_v5_get_userspace_ppis(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ struct vgic_v5_vm *gicv5_vm = &dev->kvm->arch.vgic.gicv5_vm;
+ u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+ int ret;
+
+ guard(mutex)(&dev->kvm->arch.config_lock);
+
+ /*
+	 * We support either 64 or 128 PPIs. In the former case, we need to
+ * return 0s for the second 64 bits as we have no storage backing those.
+ */
+ ret = put_user(bitmap_read(gicv5_vm->userspace_ppis, 0, 64), uaddr);
+ if (ret)
+ return ret;
+ uaddr++;
+
+ if (VGIC_V5_NR_PRIVATE_IRQS == 128)
+		ret = put_user(bitmap_read(gicv5_vm->userspace_ppis, 64, 64), uaddr);
+ else
+ ret = put_user(0, uaddr);
+
+ return ret;
+}
+
+static int vgic_v5_set_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_ADDR:
+ case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
+ case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
+ return -ENXIO;
+ case KVM_DEV_ARM_VGIC_GRP_CTRL:
+ switch (attr->attr) {
+ case KVM_DEV_ARM_VGIC_CTRL_INIT:
+ return vgic_set_common_attr(dev, attr);
+ case KVM_DEV_ARM_VGIC_USERSPACE_PPIS:
+ default:
+ return -ENXIO;
+ }
+ default:
+ return -ENXIO;
+ }
+
+}
+
+static int vgic_v5_get_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_ADDR:
+ case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
+ case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
+ return -ENXIO;
+ case KVM_DEV_ARM_VGIC_GRP_CTRL:
+ switch (attr->attr) {
+ case KVM_DEV_ARM_VGIC_CTRL_INIT:
+ return vgic_get_common_attr(dev, attr);
+ case KVM_DEV_ARM_VGIC_USERSPACE_PPIS:
+ return vgic_v5_get_userspace_ppis(dev, attr);
+ default:
+ return -ENXIO;
+ }
+ default:
+ return -ENXIO;
+ }
+}
+
+static int vgic_v5_has_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_ADDR:
+ case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
+ case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
+ return -ENXIO;
+ case KVM_DEV_ARM_VGIC_GRP_CTRL:
+ switch (attr->attr) {
+ case KVM_DEV_ARM_VGIC_CTRL_INIT:
+ return 0;
+ case KVM_DEV_ARM_VGIC_USERSPACE_PPIS:
+ return 0;
+ default:
+ return -ENXIO;
+ }
+ default:
+ return -ENXIO;
+ }
+}
+
+struct kvm_device_ops kvm_arm_vgic_v5_ops = {
+ .name = "kvm-arm-vgic-v5",
+ .create = vgic_create,
+ .destroy = vgic_destroy,
+ .set_attr = vgic_v5_set_attr,
+ .get_attr = vgic_v5_get_attr,
+ .has_attr = vgic_v5_has_attr,
+};
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c
index a573b1f0c6cb..74d76dec9730 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio.c
@@ -842,18 +842,46 @@ vgic_find_mmio_region(const struct vgic_register_region *regions,
void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
{
- if (kvm_vgic_global_state.type == VGIC_V2)
- vgic_v2_set_vmcr(vcpu, vmcr);
- else
+ const struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+ switch (dist->vgic_model) {
+ case KVM_DEV_TYPE_ARM_VGIC_V5:
+ vgic_v5_set_vmcr(vcpu, vmcr);
+ break;
+ case KVM_DEV_TYPE_ARM_VGIC_V3:
vgic_v3_set_vmcr(vcpu, vmcr);
+ break;
+ case KVM_DEV_TYPE_ARM_VGIC_V2:
+ if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ vgic_v3_set_vmcr(vcpu, vmcr);
+ else
+ vgic_v2_set_vmcr(vcpu, vmcr);
+ break;
+ default:
+ BUG();
+ }
}
void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
{
- if (kvm_vgic_global_state.type == VGIC_V2)
- vgic_v2_get_vmcr(vcpu, vmcr);
- else
+ const struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+ switch (dist->vgic_model) {
+ case KVM_DEV_TYPE_ARM_VGIC_V5:
+ vgic_v5_get_vmcr(vcpu, vmcr);
+ break;
+ case KVM_DEV_TYPE_ARM_VGIC_V3:
vgic_v3_get_vmcr(vcpu, vmcr);
+ break;
+ case KVM_DEV_TYPE_ARM_VGIC_V2:
+ if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ vgic_v3_get_vmcr(vcpu, vmcr);
+ else
+ vgic_v2_get_vmcr(vcpu, vmcr);
+ break;
+ default:
+ BUG();
+ }
}
/*
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 6a355eca1934..9e841e7afd4a 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -499,7 +499,7 @@ void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu)
{
struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
- if (!vgic_is_v3(vcpu->kvm))
+ if (!vgic_host_has_gicv3())
return;
/* Hide GICv3 sysreg if necessary */
diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c
index 331651087e2c..fdd39ea7f83e 100644
--- a/arch/arm64/kvm/vgic/vgic-v5.c
+++ b/arch/arm64/kvm/vgic/vgic-v5.c
@@ -1,28 +1,52 @@
// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025, 2026 Arm Ltd.
+ */
#include <kvm/arm_vgic.h>
+
+#include <linux/bitops.h>
#include <linux/irqchip/arm-vgic-info.h>
#include "vgic.h"
+static struct vgic_v5_ppi_caps ppi_caps;
+
+/*
+ * Not all PPIs are guaranteed to be implemented for GICv5. Determine which
+ * ones are, and generate a mask.
+ */
+static void vgic_v5_get_implemented_ppis(void)
+{
+ if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF))
+ return;
+
+ /*
+ * If we have KVM, we have EL2, which means that we have support for the
+ * EL1 and EL2 Physical & Virtual timers.
+ */
+ __assign_bit(GICV5_ARCH_PPI_CNTHP, ppi_caps.impl_ppi_mask, 1);
+ __assign_bit(GICV5_ARCH_PPI_CNTV, ppi_caps.impl_ppi_mask, 1);
+ __assign_bit(GICV5_ARCH_PPI_CNTHV, ppi_caps.impl_ppi_mask, 1);
+ __assign_bit(GICV5_ARCH_PPI_CNTP, ppi_caps.impl_ppi_mask, 1);
+
+ /* The SW_PPI should be available */
+ __assign_bit(GICV5_ARCH_PPI_SW_PPI, ppi_caps.impl_ppi_mask, 1);
+
+ /* The PMUIRQ is available if we have the PMU */
+ __assign_bit(GICV5_ARCH_PPI_PMUIRQ, ppi_caps.impl_ppi_mask, system_supports_pmuv3());
+}
+
/*
* Probe for a vGICv5 compatible interrupt controller, returning 0 on success.
- * Currently only supports GICv3-based VMs on a GICv5 host, and hence only
- * registers a VGIC_V3 device.
*/
int vgic_v5_probe(const struct gic_kvm_info *info)
{
+ bool v5_registered = false;
u64 ich_vtr_el2;
int ret;
- if (!cpus_have_final_cap(ARM64_HAS_GICV5_LEGACY))
- return -ENODEV;
-
kvm_vgic_global_state.type = VGIC_V5;
- kvm_vgic_global_state.has_gcie_v3_compat = true;
-
- /* We only support v3 compat mode - use vGICv3 limits */
- kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS;
kvm_vgic_global_state.vcpu_base = 0;
kvm_vgic_global_state.vctrl_base = NULL;
@@ -30,6 +54,38 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
kvm_vgic_global_state.has_gicv4 = false;
kvm_vgic_global_state.has_gicv4_1 = false;
+ /*
+ * GICv5 is currently not supported in Protected mode. Skip the
+	 * registration of GICv5 completely to make sure userspace cannot
+	 * create a GICv5-based guest.
+ */
+ if (is_protected_kvm_enabled()) {
+ kvm_info("GICv5-based guests are not supported with pKVM\n");
+ goto skip_v5;
+ }
+
+ kvm_vgic_global_state.max_gic_vcpus = VGIC_V5_MAX_CPUS;
+
+ vgic_v5_get_implemented_ppis();
+
+ ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V5);
+ if (ret) {
+ kvm_err("Cannot register GICv5 KVM device.\n");
+ goto skip_v5;
+ }
+
+ v5_registered = true;
+ kvm_info("GCIE system register CPU interface\n");
+
+skip_v5:
+ /* If we don't support the GICv3 compat mode we're done. */
+ if (!cpus_have_final_cap(ARM64_HAS_GICV5_LEGACY)) {
+ if (!v5_registered)
+ return -ENODEV;
+ return 0;
+ }
+
+ kvm_vgic_global_state.has_gcie_v3_compat = true;
ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config);
kvm_vgic_global_state.ich_vtr_el2 = (u32)ich_vtr_el2;
@@ -45,6 +101,10 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
return ret;
}
+ /* We potentially limit the max VCPUs further than we need to here */
+ kvm_vgic_global_state.max_gic_vcpus = min(VGIC_V3_MAX_CPUS,
+ VGIC_V5_MAX_CPUS);
+
static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif);
kvm_info("GCIE legacy system register CPU interface\n");
@@ -52,3 +112,424 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
return 0;
}
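The registration outcomes of the probe path above can be summarised with a small model (a sketch that ignores the kvm_register_vgic_device() failure path):

#include <errno.h>
#include <stdio.h>

/* Model of the registration outcome in vgic_v5_probe() above. */
static int probe_model(int pkvm, int gcie_legacy, int *v5, int *compat)
{
	*v5 = !pkvm;		/* GICv5 guests are unsupported under pKVM */
	*compat = gcie_legacy;	/* GICv3 compat needs FEAT_GCIE_LEGACY */

	return (*v5 || *compat) ? 0 : -ENODEV;
}

int main(void)
{
	int v5, compat;

	/* pKVM host without GCIE_LEGACY: nothing to register */
	printf("%d\n", probe_model(1, 0, &v5, &compat));	/* -19 */

	/* bare-metal GICv5 host with the compat feature: both modes */
	printf("%d v5=%d compat=%d\n",
	       probe_model(0, 1, &v5, &compat), v5, compat);	/* 0 1 1 */
	return 0;
}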
+
+void vgic_v5_reset(struct kvm_vcpu *vcpu)
+{
+ /*
+ * We always present 16-bits of ID space to the guest, irrespective of
+ * the host allowing more.
+ */
+ vcpu->arch.vgic_cpu.num_id_bits = ICC_IDR0_EL1_ID_BITS_16BITS;
+
+ /*
+	 * The GICv5 architecture only supports 5 bits of priority in the
+ * CPUIF (but potentially fewer in the IRS).
+ */
+ vcpu->arch.vgic_cpu.num_pri_bits = 5;
+}
+
+int vgic_v5_init(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long idx;
+
+ if (vgic_initialized(kvm))
+ return 0;
+
+ kvm_for_each_vcpu(idx, vcpu, kvm) {
+ if (vcpu_has_nv(vcpu)) {
+ kvm_err("Nested GICv5 VMs are currently unsupported\n");
+ return -EINVAL;
+ }
+ }
+
+ /* We only allow userspace to drive the SW_PPI, if it is implemented. */
+ bitmap_zero(kvm->arch.vgic.gicv5_vm.userspace_ppis,
+ VGIC_V5_NR_PRIVATE_IRQS);
+ __assign_bit(GICV5_ARCH_PPI_SW_PPI,
+ kvm->arch.vgic.gicv5_vm.userspace_ppis,
+ VGIC_V5_NR_PRIVATE_IRQS);
+ bitmap_and(kvm->arch.vgic.gicv5_vm.userspace_ppis,
+ kvm->arch.vgic.gicv5_vm.userspace_ppis,
+ ppi_caps.impl_ppi_mask, VGIC_V5_NR_PRIVATE_IRQS);
+
+ return 0;
+}
+
+int vgic_v5_map_resources(struct kvm *kvm)
+{
+ if (!vgic_initialized(kvm))
+ return -EBUSY;
+
+ return 0;
+}
+
+int vgic_v5_finalize_ppi_state(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu0;
+ int i;
+
+ if (!vgic_is_v5(kvm))
+ return 0;
+
+ guard(mutex)(&kvm->arch.config_lock);
+
+ /*
+ * If SW_PPI has been advertised, then we know we already
+ * initialised the whole thing, and we can return early. Yes,
+ * this is pretty hackish as far as state tracking goes...
+ */
+ if (test_bit(GICV5_ARCH_PPI_SW_PPI, kvm->arch.vgic.gicv5_vm.vgic_ppi_mask))
+ return 0;
+
+ /* The PPI state for all VCPUs should be the same. Pick the first. */
+ vcpu0 = kvm_get_vcpu(kvm, 0);
+
+ bitmap_zero(kvm->arch.vgic.gicv5_vm.vgic_ppi_mask, VGIC_V5_NR_PRIVATE_IRQS);
+ bitmap_zero(kvm->arch.vgic.gicv5_vm.vgic_ppi_hmr, VGIC_V5_NR_PRIVATE_IRQS);
+
+ for_each_set_bit(i, ppi_caps.impl_ppi_mask, VGIC_V5_NR_PRIVATE_IRQS) {
+ const u32 intid = vgic_v5_make_ppi(i);
+ struct vgic_irq *irq;
+
+ irq = vgic_get_vcpu_irq(vcpu0, intid);
+
+		/* Only expose PPIs that have an owner, plus the SW_PPI */
+ scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) {
+ if (irq->owner || i == GICV5_ARCH_PPI_SW_PPI) {
+ __assign_bit(i, kvm->arch.vgic.gicv5_vm.vgic_ppi_mask, 1);
+ __assign_bit(i, kvm->arch.vgic.gicv5_vm.vgic_ppi_hmr,
+ irq->config == VGIC_CONFIG_LEVEL);
+ }
+ }
+
+ vgic_put_irq(vcpu0->kvm, irq);
+ }
+
+ return 0;
+}
+
+static u32 vgic_v5_get_effective_priority_mask(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
+ u32 highest_ap, priority_mask, apr;
+
+ /*
+ * If the guest's CPU has not opted to receive interrupts, then the
+ * effective running priority is the highest priority. Just return 0
+ * (the highest priority).
+ */
+ if (!FIELD_GET(FEAT_GCIE_ICH_VMCR_EL2_EN, cpu_if->vgic_vmcr))
+ return 0;
+
+ /*
+ * Counting the number of trailing zeros gives the current active
+ * priority. Explicitly use the 32-bit version here as we have 32
+ * priorities. 32 then means that there are no active priorities.
+ */
+ apr = cpu_if->vgic_apr;
+ highest_ap = apr ? __builtin_ctz(apr) : 32;
+
+ /*
+ * An interrupt is of sufficient priority if it is equal to or
+ * greater than the priority mask. Add 1 to the priority mask
+ * (i.e., lower priority) to match the APR logic before taking
+ * the min. This gives us the lowest priority that is masked.
+ */
+ priority_mask = FIELD_GET(FEAT_GCIE_ICH_VMCR_EL2_VPMR, cpu_if->vgic_vmcr);
+
+ return min(highest_ap, priority_mask + 1);
+}
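A worked example of the combination above (a standalone model; in the kernel, vgic_v5_has_pending_ppi() then treats an interrupt as deliverable when its priority value is strictly below this mask):

#include <stdint.h>
#include <stdio.h>

/*
 * Model of vgic_v5_get_effective_priority_mask(): the lowest set bit
 * of APR is the running priority, VPMR + 1 is the first masked
 * priority, and the effective mask is the smaller of the two.
 */
static unsigned int effective_mask(uint32_t apr, unsigned int vpmr, int en)
{
	unsigned int highest_ap = apr ? (unsigned int)__builtin_ctz(apr) : 32;

	if (!en)	/* delivery disabled: nothing can be signalled */
		return 0;

	return highest_ap < vpmr + 1 ? highest_ap : vpmr + 1;
}

int main(void)
{
	/* Priority 4 active, VPMR admits 0-7: only priorities 0-3 can preempt */
	printf("%u\n", effective_mask(1u << 4, 7, 1));	/* 4 */
	/* Nothing active, VPMR admits 0-7 */
	printf("%u\n", effective_mask(0, 7, 1));	/* 8 */
	return 0;
}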
+
+/*
+ * For GICv5, the PPIs are mostly directly managed by the hardware. We (the
+ * hypervisor) handle the pending, active, enable state save/restore, but don't
+ * need the PPIs to be queued on a per-VCPU AP list. Therefore, sanity check the
+ * state, unlock, and return.
+ */
+bool vgic_v5_ppi_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
+ unsigned long flags)
+ __releases(&irq->irq_lock)
+{
+ struct kvm_vcpu *vcpu;
+
+ lockdep_assert_held(&irq->irq_lock);
+
+ if (WARN_ON_ONCE(!__irq_is_ppi(KVM_DEV_TYPE_ARM_VGIC_V5, irq->intid)))
+ goto out_unlock_fail;
+
+ vcpu = irq->target_vcpu;
+ if (WARN_ON_ONCE(!vcpu))
+ goto out_unlock_fail;
+
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+ /* Directly kick the target VCPU to make sure it sees the IRQ */
+ kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+ kvm_vcpu_kick(vcpu);
+
+ return true;
+
+out_unlock_fail:
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+ return false;
+}
+
+/*
+ * Sets/clears the corresponding bit in the ICH_PPI_DVIR register.
+ */
+void vgic_v5_set_ppi_dvi(struct kvm_vcpu *vcpu, struct vgic_irq *irq, bool dvi)
+{
+ struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
+ u32 ppi;
+
+ lockdep_assert_held(&irq->irq_lock);
+
+ ppi = vgic_v5_get_hwirq_id(irq->intid);
+ __assign_bit(ppi, cpu_if->vgic_ppi_dvir, dvi);
+}
+
+static struct irq_ops vgic_v5_ppi_irq_ops = {
+ .queue_irq_unlock = vgic_v5_ppi_queue_irq_unlock,
+ .set_direct_injection = vgic_v5_set_ppi_dvi,
+};
+
+void vgic_v5_set_ppi_ops(struct kvm_vcpu *vcpu, u32 vintid)
+{
+ kvm_vgic_set_irq_ops(vcpu, vintid, &vgic_v5_ppi_irq_ops);
+}
+
+/*
+ * Sync back the PPI priorities to the vgic_irq shadow state for any interrupts
+ * exposed to the guest (skipping all others).
+ */
+static void vgic_v5_sync_ppi_priorities(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
+ u64 priorityr;
+ int i;
+
+ /*
+ * We have up to 16 PPI Priority regs, but only have a few interrupts
+ * that the guest is allowed to use. Limit our sync of PPI priorities to
+ * those actually exposed to the guest by first iterating over the mask
+ * of exposed PPIs.
+ */
+ for_each_set_bit(i, vcpu->kvm->arch.vgic.gicv5_vm.vgic_ppi_mask, VGIC_V5_NR_PRIVATE_IRQS) {
+ u32 intid = vgic_v5_make_ppi(i);
+ struct vgic_irq *irq;
+ int pri_idx, pri_reg, pri_bit;
+ u8 priority;
+
+ /*
+ * Determine which priority register and the field within it to
+ * extract.
+ */
+ pri_reg = i / 8;
+ pri_idx = i % 8;
+ pri_bit = pri_idx * 8;
+
+ priorityr = cpu_if->vgic_ppi_priorityr[pri_reg];
+ priority = field_get(GENMASK(pri_bit + 4, pri_bit), priorityr);
+
+ irq = vgic_get_vcpu_irq(vcpu, intid);
+
+ scoped_guard(raw_spinlock_irqsave, &irq->irq_lock)
+ irq->priority = priority;
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+}
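As a concrete instance of the indexing above, PPI 13 lands in priority register 1, byte 5, i.e. bits [44:40] of that register. A standalone check:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int i = 13;			/* example PPI number */
	unsigned int pri_reg = i / 8;		/* -> PRIORITYR register 1 */
	unsigned int pri_bit = (i % 8) * 8;	/* -> low bit 40 */
	uint64_t priorityr = 0x1full << pri_bit;	/* priority 0x1f */
	uint8_t priority = (priorityr >> pri_bit) & 0x1f;	/* 5-bit field */

	printf("PPI %u: reg %u, bits [%u:%u], priority %#x\n",
	       i, pri_reg, pri_bit + 4, pri_bit, priority);
	return 0;
}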
+
+bool vgic_v5_has_pending_ppi(struct kvm_vcpu *vcpu)
+{
+ unsigned int priority_mask;
+ int i;
+
+ priority_mask = vgic_v5_get_effective_priority_mask(vcpu);
+
+ /*
+ * If the combined priority mask is 0, nothing can be signalled! In the
+ * case where the guest has disabled interrupt delivery for the vcpu
+ * (via ICV_CR0_EL1.EN->ICH_VMCR_EL2.EN), we calculate the priority mask
+ * as 0 too (the highest possible priority).
+ */
+ if (!priority_mask)
+ return false;
+
+ for_each_set_bit(i, vcpu->kvm->arch.vgic.gicv5_vm.vgic_ppi_mask, VGIC_V5_NR_PRIVATE_IRQS) {
+ u32 intid = vgic_v5_make_ppi(i);
+ bool has_pending = false;
+ struct vgic_irq *irq;
+
+ irq = vgic_get_vcpu_irq(vcpu, intid);
+
+ scoped_guard(raw_spinlock_irqsave, &irq->irq_lock)
+ if (irq->enabled && irq->priority < priority_mask)
+ has_pending = irq->hw ? vgic_get_phys_line_level(irq) : irq_is_pending(irq);
+
+ vgic_put_irq(vcpu->kvm, irq);
+
+ if (has_pending)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Detect any PPI state changes, and propagate the state to KVM's
+ * shadow structures.
+ */
+void vgic_v5_fold_ppi_state(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
+ unsigned long *activer, *pendr;
+ int i;
+
+ activer = host_data_ptr(vgic_v5_ppi_state)->activer_exit;
+ pendr = host_data_ptr(vgic_v5_ppi_state)->pendr;
+
+ for_each_set_bit(i, vcpu->kvm->arch.vgic.gicv5_vm.vgic_ppi_mask,
+ VGIC_V5_NR_PRIVATE_IRQS) {
+ u32 intid = vgic_v5_make_ppi(i);
+ struct vgic_irq *irq;
+
+ irq = vgic_get_vcpu_irq(vcpu, intid);
+
+ scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) {
+ irq->active = test_bit(i, activer);
+
+ /* This is an OR to avoid losing incoming edges! */
+ if (irq->config == VGIC_CONFIG_EDGE)
+ irq->pending_latch |= test_bit(i, pendr);
+ }
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ /*
+ * Re-inject the exit state as entry state next time!
+ *
+ * Note that the write of the Enable state is trapped, and hence there
+	 * is nothing to explicitly sync back here as we already have the latest
+ * copy by definition.
+ */
+ bitmap_copy(cpu_if->vgic_ppi_activer, activer, VGIC_V5_NR_PRIVATE_IRQS);
+}
+
+void vgic_v5_flush_ppi_state(struct kvm_vcpu *vcpu)
+{
+ DECLARE_BITMAP(pendr, VGIC_V5_NR_PRIVATE_IRQS);
+ int i;
+
+ /*
+ * Time to enter the guest - we first need to build the guest's
+ * ICC_PPI_PENDRx_EL1, however.
+ */
+ bitmap_zero(pendr, VGIC_V5_NR_PRIVATE_IRQS);
+ for_each_set_bit(i, vcpu->kvm->arch.vgic.gicv5_vm.vgic_ppi_mask,
+ VGIC_V5_NR_PRIVATE_IRQS) {
+ u32 intid = vgic_v5_make_ppi(i);
+ struct vgic_irq *irq;
+
+ irq = vgic_get_vcpu_irq(vcpu, intid);
+
+ scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) {
+ __assign_bit(i, pendr, irq_is_pending(irq));
+ if (irq->config == VGIC_CONFIG_EDGE)
+ irq->pending_latch = false;
+ }
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ /*
+ * Copy the shadow state to the pending reg that will be written to the
+ * ICH_PPI_PENDRx_EL2 regs. While the guest is running we track any
+ * incoming changes to the pending state in the vgic_irq structures. The
+ * incoming changes are merged with the outgoing changes on the return
+ * path.
+ */
+ bitmap_copy(host_data_ptr(vgic_v5_ppi_state)->pendr, pendr,
+ VGIC_V5_NR_PRIVATE_IRQS);
+}
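The fold/flush pair implements a merge of shadow and hardware pending state for edge PPIs: flush consumes the latch into the PENDR image on guest entry, and fold ORs the exit-time PENDR back into the latch. A minimal model:

#include <stdbool.h>
#include <stdio.h>

struct ppi_model { bool pending_latch; bool hw_pendr; };

static void flush(struct ppi_model *p)
{
	p->hw_pendr = p->pending_latch;	/* present pending state to HW */
	p->pending_latch = false;	/* ...and consume the latch */
}

static void fold(struct ppi_model *p, bool pendr_at_exit)
{
	p->pending_latch |= pendr_at_exit;	/* OR: never lose an edge */
}

int main(void)
{
	struct ppi_model p = { .pending_latch = true };

	flush(&p);		/* guest entry: HW sees the edge */
	fold(&p, false);	/* guest consumed it: nothing re-latched */
	printf("latched=%d\n", p.pending_latch);	/* 0 */
	return 0;
}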
+
+void vgic_v5_load(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
+
+ /*
+ * On the WFI path, vgic_load is called a second time. The first is when
+ * scheduling in the vcpu thread again, and the second is when leaving
+ * WFI. Skip the second instance as it serves no purpose and just
+ * restores the same state again.
+ */
+ if (cpu_if->gicv5_vpe.resident)
+ return;
+
+ kvm_call_hyp(__vgic_v5_restore_vmcr_apr, cpu_if);
+
+ cpu_if->gicv5_vpe.resident = true;
+}
+
+void vgic_v5_put(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
+
+ /*
+	 * Do nothing if we're not resident. This can happen in the WFI path,
+	 * where we do a vgic_put when entering WFI and again later when
+ * descheduling the thread. We risk losing VMCR state if we sync it
+ * twice, so instead return early in this case.
+ */
+ if (!cpu_if->gicv5_vpe.resident)
+ return;
+
+ kvm_call_hyp(__vgic_v5_save_apr, cpu_if);
+
+ cpu_if->gicv5_vpe.resident = false;
+
+ /* The shadow priority is only updated on entering WFI */
+ if (vcpu_get_flag(vcpu, IN_WFI))
+ vgic_v5_sync_ppi_priorities(vcpu);
+}
+
+void vgic_v5_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+ struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
+ u64 vmcr = cpu_if->vgic_vmcr;
+
+ vmcrp->en = FIELD_GET(FEAT_GCIE_ICH_VMCR_EL2_EN, vmcr);
+ vmcrp->pmr = FIELD_GET(FEAT_GCIE_ICH_VMCR_EL2_VPMR, vmcr);
+}
+
+void vgic_v5_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+ struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
+ u64 vmcr;
+
+ vmcr = FIELD_PREP(FEAT_GCIE_ICH_VMCR_EL2_VPMR, vmcrp->pmr) |
+ FIELD_PREP(FEAT_GCIE_ICH_VMCR_EL2_EN, vmcrp->en);
+
+ cpu_if->vgic_vmcr = vmcr;
+}
+
+void vgic_v5_restore_state(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
+
+ __vgic_v5_restore_state(cpu_if);
+ __vgic_v5_restore_ppi_state(cpu_if);
+ dsb(sy);
+}
+
+void vgic_v5_save_state(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
+
+ __vgic_v5_save_state(cpu_if);
+ __vgic_v5_save_ppi_state(cpu_if);
+ dsb(sy);
+}
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index e22b79cfff96..1e9fe8764584 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -86,6 +86,10 @@ static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
*/
struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid)
{
+ /* Non-private IRQs are not yet implemented for GICv5 */
+ if (vgic_is_v5(kvm))
+ return NULL;
+
/* SPIs */
if (intid >= VGIC_NR_PRIVATE_IRQS &&
intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) {
@@ -94,7 +98,7 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid)
}
/* LPIs */
- if (intid >= VGIC_MIN_LPI)
+ if (irq_is_lpi(kvm, intid))
return vgic_get_lpi(kvm, intid);
return NULL;
@@ -105,6 +109,18 @@ struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid)
if (WARN_ON(!vcpu))
return NULL;
+ if (vgic_is_v5(vcpu->kvm)) {
+ u32 int_num, hwirq_id;
+
+ if (!__irq_is_ppi(KVM_DEV_TYPE_ARM_VGIC_V5, intid))
+ return NULL;
+
+ hwirq_id = FIELD_GET(GICV5_HWIRQ_ID, intid);
+ int_num = array_index_nospec(hwirq_id, VGIC_V5_NR_PRIVATE_IRQS);
+
+ return &vcpu->arch.vgic_cpu.private_irqs[int_num];
+ }
+
/* SGIs and PPIs */
if (intid < VGIC_NR_PRIVATE_IRQS) {
intid = array_index_nospec(intid, VGIC_NR_PRIVATE_IRQS);
@@ -123,7 +139,7 @@ static void vgic_release_lpi_locked(struct vgic_dist *dist, struct vgic_irq *irq
static __must_check bool __vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
{
- if (irq->intid < VGIC_MIN_LPI)
+ if (!irq_is_lpi(kvm, irq->intid))
return false;
return refcount_dec_and_test(&irq->refcount);
@@ -148,7 +164,7 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
* Acquire/release it early on lockdep kernels to make locking issues
* in rare release paths a bit more obvious.
*/
- if (IS_ENABLED(CONFIG_LOCKDEP) && irq->intid >= VGIC_MIN_LPI) {
+ if (IS_ENABLED(CONFIG_LOCKDEP) && irq_is_lpi(kvm, irq->intid)) {
guard(spinlock_irqsave)(&dist->lpi_xa.xa_lock);
}
@@ -186,7 +202,7 @@ void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
- if (irq->intid >= VGIC_MIN_LPI) {
+ if (irq_is_lpi(vcpu->kvm, irq->intid)) {
raw_spin_lock(&irq->irq_lock);
list_del(&irq->ap_list);
irq->vcpu = NULL;
@@ -404,6 +420,9 @@ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
lockdep_assert_held(&irq->irq_lock);
+ if (irq->ops && irq->ops->queue_irq_unlock)
+ return irq->ops->queue_irq_unlock(kvm, irq, flags);
+
retry:
vcpu = vgic_target_oracle(irq);
if (irq->vcpu || !vcpu) {
@@ -521,12 +540,12 @@ int kvm_vgic_inject_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
if (ret)
return ret;
- if (!vcpu && intid < VGIC_NR_PRIVATE_IRQS)
+ if (!vcpu && irq_is_private(kvm, intid))
return -EINVAL;
trace_vgic_update_irq_pending(vcpu ? vcpu->vcpu_idx : 0, intid, level);
- if (intid < VGIC_NR_PRIVATE_IRQS)
+ if (irq_is_private(kvm, intid))
irq = vgic_get_vcpu_irq(vcpu, intid);
else
irq = vgic_get_irq(kvm, intid);
@@ -553,10 +572,27 @@ int kvm_vgic_inject_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
return 0;
}
+void kvm_vgic_set_irq_ops(struct kvm_vcpu *vcpu, u32 vintid,
+ struct irq_ops *ops)
+{
+ struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, vintid);
+
+ BUG_ON(!irq);
+
+ scoped_guard(raw_spinlock_irqsave, &irq->irq_lock)
+ irq->ops = ops;
+
+ vgic_put_irq(vcpu->kvm, irq);
+}
+
+void kvm_vgic_clear_irq_ops(struct kvm_vcpu *vcpu, u32 vintid)
+{
+ kvm_vgic_set_irq_ops(vcpu, vintid, NULL);
+}
+
/* @irq->irq_lock must be held */
static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
- unsigned int host_irq,
- struct irq_ops *ops)
+ unsigned int host_irq)
{
struct irq_desc *desc;
struct irq_data *data;
@@ -576,20 +612,25 @@ static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
irq->hw = true;
irq->host_irq = host_irq;
irq->hwintid = data->hwirq;
- irq->ops = ops;
+
+ if (irq->ops && irq->ops->set_direct_injection)
+ irq->ops->set_direct_injection(vcpu, irq, true);
+
return 0;
}
/* @irq->irq_lock must be held */
static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq)
{
+ if (irq->ops && irq->ops->set_direct_injection)
+ irq->ops->set_direct_injection(irq->target_vcpu, irq, false);
+
irq->hw = false;
irq->hwintid = 0;
- irq->ops = NULL;
}
int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
- u32 vintid, struct irq_ops *ops)
+ u32 vintid)
{
struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, vintid);
unsigned long flags;
@@ -598,7 +639,7 @@ int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
BUG_ON(!irq);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
- ret = kvm_vgic_map_irq(vcpu, irq, host_irq, ops);
+ ret = kvm_vgic_map_irq(vcpu, irq, host_irq);
raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
vgic_put_irq(vcpu->kvm, irq);
@@ -685,7 +726,7 @@ int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner)
return -EAGAIN;
/* SGIs and LPIs cannot be wired up to any device */
- if (!irq_is_ppi(intid) && !vgic_valid_spi(vcpu->kvm, intid))
+ if (!irq_is_ppi(vcpu->kvm, intid) && !vgic_valid_spi(vcpu->kvm, intid))
return -EINVAL;
irq = vgic_get_vcpu_irq(vcpu, intid);
@@ -812,8 +853,13 @@ retry:
vgic_release_deleted_lpis(vcpu->kvm);
}
-static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
+static void vgic_fold_state(struct kvm_vcpu *vcpu)
{
+ if (vgic_is_v5(vcpu->kvm)) {
+ vgic_v5_fold_ppi_state(vcpu);
+ return;
+ }
+
if (!*host_data_ptr(last_lr_irq))
return;
@@ -1002,7 +1048,10 @@ static inline bool can_access_vgic_from_kernel(void)
static inline void vgic_save_state(struct kvm_vcpu *vcpu)
{
- if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ /* No switch statement here. See comment in vgic_restore_state() */
+ if (vgic_is_v5(vcpu->kvm))
+ vgic_v5_save_state(vcpu);
+ else if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
vgic_v2_save_state(vcpu);
else
__vgic_v3_save_state(&vcpu->arch.vgic_cpu.vgic_v3);
@@ -1011,20 +1060,24 @@ static inline void vgic_save_state(struct kvm_vcpu *vcpu)
/* Sync back the hardware VGIC state into our emulation after a guest's run. */
void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
{
- /* If nesting, emulate the HW effect from L0 to L1 */
- if (vgic_state_is_nested(vcpu)) {
- vgic_v3_sync_nested(vcpu);
- return;
- }
+ if (vgic_is_v3(vcpu->kvm)) {
+ /* If nesting, emulate the HW effect from L0 to L1 */
+ if (vgic_state_is_nested(vcpu)) {
+ vgic_v3_sync_nested(vcpu);
+ return;
+ }
- if (vcpu_has_nv(vcpu))
- vgic_v3_nested_update_mi(vcpu);
+ if (vcpu_has_nv(vcpu))
+ vgic_v3_nested_update_mi(vcpu);
+ }
if (can_access_vgic_from_kernel())
vgic_save_state(vcpu);
- vgic_fold_lr_state(vcpu);
- vgic_prune_ap_list(vcpu);
+ vgic_fold_state(vcpu);
+
+ if (!vgic_is_v5(vcpu->kvm))
+ vgic_prune_ap_list(vcpu);
}
/* Sync interrupts that were deactivated through a DIR trap */
@@ -1040,12 +1093,34 @@ void kvm_vgic_process_async_update(struct kvm_vcpu *vcpu)
static inline void vgic_restore_state(struct kvm_vcpu *vcpu)
{
- if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ /*
+ * As nice as it would be to restructure this code into a switch
+ * statement as can be found elsewhere, the logic quickly gets ugly.
+ *
+ * __vgic_v3_restore_state() is doing a lot of heavy lifting here. It is
+ * required for GICv3-on-GICv3, GICv2-on-GICv3, GICv3-on-GICv5, and the
+ * no-in-kernel-irqchip case on GICv3 hardware. Hence, adding a switch
+ * here results in much more complex code.
+ */
+ if (vgic_is_v5(vcpu->kvm))
+ vgic_v5_restore_state(vcpu);
+ else if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
vgic_v2_restore_state(vcpu);
else
__vgic_v3_restore_state(&vcpu->arch.vgic_cpu.vgic_v3);
}
+static void vgic_flush_state(struct kvm_vcpu *vcpu)
+{
+ if (vgic_is_v5(vcpu->kvm)) {
+ vgic_v5_flush_ppi_state(vcpu);
+ return;
+ }
+
+ scoped_guard(raw_spinlock, &vcpu->arch.vgic_cpu.ap_list_lock)
+ vgic_flush_lr_state(vcpu);
+}
+
/* Flush our emulation state into the GIC hardware before entering the guest. */
void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
{
@@ -1082,42 +1157,69 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
- scoped_guard(raw_spinlock, &vcpu->arch.vgic_cpu.ap_list_lock)
- vgic_flush_lr_state(vcpu);
+ vgic_flush_state(vcpu);
if (can_access_vgic_from_kernel())
vgic_restore_state(vcpu);
- if (vgic_supports_direct_irqs(vcpu->kvm))
+ if (vgic_supports_direct_irqs(vcpu->kvm) && kvm_vgic_global_state.has_gicv4)
vgic_v4_commit(vcpu);
}
void kvm_vgic_load(struct kvm_vcpu *vcpu)
{
+ const struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) {
if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
__vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3);
return;
}
- if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
- vgic_v2_load(vcpu);
- else
+ switch (dist->vgic_model) {
+ case KVM_DEV_TYPE_ARM_VGIC_V5:
+ vgic_v5_load(vcpu);
+ break;
+ case KVM_DEV_TYPE_ARM_VGIC_V3:
vgic_v3_load(vcpu);
+ break;
+ case KVM_DEV_TYPE_ARM_VGIC_V2:
+ if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ vgic_v3_load(vcpu);
+ else
+ vgic_v2_load(vcpu);
+ break;
+ default:
+ BUG();
+ }
}
void kvm_vgic_put(struct kvm_vcpu *vcpu)
{
+ const struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) {
if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
__vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3);
return;
}
- if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
- vgic_v2_put(vcpu);
- else
+ switch (dist->vgic_model) {
+ case KVM_DEV_TYPE_ARM_VGIC_V5:
+ vgic_v5_put(vcpu);
+ break;
+ case KVM_DEV_TYPE_ARM_VGIC_V3:
vgic_v3_put(vcpu);
+ break;
+ case KVM_DEV_TYPE_ARM_VGIC_V2:
+ if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ vgic_v3_put(vcpu);
+ else
+ vgic_v2_put(vcpu);
+ break;
+ default:
+ BUG();
+ }
}
int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
@@ -1128,6 +1230,9 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
unsigned long flags;
struct vgic_vmcr vmcr;
+ if (vgic_is_v5(vcpu->kvm))
+ return vgic_v5_has_pending_ppi(vcpu);
+
if (!vcpu->kvm->arch.vgic.enabled)
return false;
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index c9b3bb07e483..9d941241c8a2 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -187,6 +187,7 @@ static inline u64 vgic_ich_hcr_trap_bits(void)
* registers regardless of the hardware backed GIC used.
*/
struct vgic_vmcr {
+ u32 en; /* GICv5-specific */
u32 grpen0;
u32 grpen1;
@@ -363,6 +364,19 @@ void vgic_debug_init(struct kvm *kvm);
void vgic_debug_destroy(struct kvm *kvm);
int vgic_v5_probe(const struct gic_kvm_info *info);
+void vgic_v5_reset(struct kvm_vcpu *vcpu);
+int vgic_v5_init(struct kvm *kvm);
+int vgic_v5_map_resources(struct kvm *kvm);
+void vgic_v5_set_ppi_ops(struct kvm_vcpu *vcpu, u32 vintid);
+bool vgic_v5_has_pending_ppi(struct kvm_vcpu *vcpu);
+void vgic_v5_flush_ppi_state(struct kvm_vcpu *vcpu);
+void vgic_v5_fold_ppi_state(struct kvm_vcpu *vcpu);
+void vgic_v5_load(struct kvm_vcpu *vcpu);
+void vgic_v5_put(struct kvm_vcpu *vcpu);
+void vgic_v5_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v5_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v5_restore_state(struct kvm_vcpu *vcpu);
+void vgic_v5_save_state(struct kvm_vcpu *vcpu);
static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu)
{
@@ -425,15 +439,6 @@ void vgic_its_invalidate_all_caches(struct kvm *kvm);
int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq);
int vgic_its_invall(struct kvm_vcpu *vcpu);
-bool system_supports_direct_sgis(void);
-bool vgic_supports_direct_msis(struct kvm *kvm);
-bool vgic_supports_direct_sgis(struct kvm *kvm);
-
-static inline bool vgic_supports_direct_irqs(struct kvm *kvm)
-{
- return vgic_supports_direct_msis(kvm) || vgic_supports_direct_sgis(kvm);
-}
-
int vgic_v4_init(struct kvm *kvm);
void vgic_v4_teardown(struct kvm *kvm);
void vgic_v4_configure_vsgis(struct kvm *kvm);
@@ -447,6 +452,11 @@ static inline bool kvm_has_gicv3(struct kvm *kvm)
return kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP);
}
+static inline bool kvm_has_gicv5(struct kvm *kvm)
+{
+ return kvm_has_feat(kvm, ID_AA64PFR2_EL1, GCIE, IMP);
+}
+
void vgic_v3_flush_nested(struct kvm_vcpu *vcpu);
void vgic_v3_sync_nested(struct kvm_vcpu *vcpu);
void vgic_v3_load_nested(struct kvm_vcpu *vcpu);
@@ -454,15 +464,32 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu);
void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu);
void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu);
-static inline bool vgic_is_v3_compat(struct kvm *kvm)
+static inline bool vgic_host_has_gicv3(void)
{
- return cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF) &&
+ /*
+ * Either the host is a native GICv3, or it is GICv5 with
+ * FEAT_GCIE_LEGACY.
+ */
+ return kvm_vgic_global_state.type == VGIC_V3 ||
kvm_vgic_global_state.has_gcie_v3_compat;
}
-static inline bool vgic_is_v3(struct kvm *kvm)
+static inline bool vgic_host_has_gicv5(void)
+{
+ return kvm_vgic_global_state.type == VGIC_V5;
+}
+
+bool system_supports_direct_sgis(void);
+bool vgic_supports_direct_msis(struct kvm *kvm);
+bool vgic_supports_direct_sgis(struct kvm *kvm);
+
+static inline bool vgic_supports_direct_irqs(struct kvm *kvm)
{
- return kvm_vgic_global_state.type == VGIC_V3 || vgic_is_v3_compat(kvm);
+ /* GICv5 always supports direct IRQs */
+ if (vgic_is_v5(kvm))
+ return true;
+
+ return vgic_supports_direct_msis(kvm) || vgic_supports_direct_sgis(kvm);
}
int vgic_its_debug_init(struct kvm_device *dev);
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index be9dab2c7d6a..7eacc7b45c1f 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -43,6 +43,7 @@
#include <asm/system_misc.h>
#include <asm/tlbflush.h>
#include <asm/traps.h>
+#include <asm/virt.h>
struct fault_info {
int (*fn)(unsigned long far, unsigned long esr,
@@ -269,6 +270,15 @@ static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr
return false;
}
+static bool is_pkvm_stage2_abort(unsigned int esr)
+{
+ /*
+ * S1PTW should only ever be set in ESR_EL1 if the pkvm hypervisor
+ * injected a stage-2 abort -- see host_inject_mem_abort().
+ */
+ return is_pkvm_initialized() && (esr & ESR_ELx_S1PTW);
+}
+
static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
unsigned long esr,
struct pt_regs *regs)
@@ -289,8 +299,14 @@ static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
* If we now have a valid translation, treat the translation fault as
* spurious.
*/
- if (!(par & SYS_PAR_EL1_F))
+ if (!(par & SYS_PAR_EL1_F)) {
+ if (is_pkvm_stage2_abort(esr)) {
+ par &= SYS_PAR_EL1_PA;
+ return pkvm_force_reclaim_guest_page(par);
+ }
+
return true;
+ }
/*
* If we got a different type of fault from the AT instruction,
@@ -376,9 +392,11 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr,
if (!is_el1_instruction_abort(esr) && fixup_exception(regs, esr))
return;
- if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs),
- "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr))
+ if (is_spurious_el1_translation_fault(addr, esr, regs)) {
+ WARN_RATELIMIT(!is_pkvm_stage2_abort(esr),
+ "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr);
return;
+ }
if (is_el1_mte_sync_tag_check_fault(esr)) {
do_tag_recovery(addr, esr, regs);
@@ -395,6 +413,8 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr,
msg = "read from unreadable memory";
} else if (addr < PAGE_SIZE) {
msg = "NULL pointer dereference";
+ } else if (is_pkvm_stage2_abort(esr)) {
+ msg = "access to hypervisor-protected memory";
} else {
if (esr_fsc_is_translation_fault(esr) &&
kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
@@ -621,6 +641,13 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
addr, esr, regs);
}
+ if (is_pkvm_stage2_abort(esr)) {
+ if (!user_mode(regs))
+ goto no_context;
+ arm64_force_sig_fault(SIGSEGV, SEGV_ACCERR, far, "stage-2 fault");
+ return 0;
+ }
+
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
if (!(mm_flags & FAULT_FLAG_USER))
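
To make the fault routing above concrete: once pKVM is initialized, an ESR with the S1PTW bit set is taken to mean a hypervisor-injected stage-2 abort, and all three paths (spurious-fault detection, the kernel fault message, and user signal delivery) branch on that single predicate. A standalone sketch of the predicate, assuming only that ESR_ELx_S1PTW is bit 7 of the ESR as in the arm64 headers:

#include <stdbool.h>
#include <stdint.h>

#define ESR_ELx_S1PTW	(UINT64_C(1) << 7)

/* Mirrors is_pkvm_stage2_abort(), with the pKVM state passed explicitly. */
static bool example_is_pkvm_stage2_abort(bool pkvm_initialized, uint64_t esr)
{
	return pkvm_initialized && (esr & ESR_ELx_S1PTW);
}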
diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg
index 9d1c21108057..3b57cb692c5b 100644
--- a/arch/arm64/tools/sysreg
+++ b/arch/arm64/tools/sysreg
@@ -3243,6 +3243,14 @@ UnsignedEnum 3:0 ID_BITS
EndEnum
EndSysreg
+Sysreg ICC_HPPIR_EL1 3 0 12 10 3
+Res0 63:33
+Field 32 HPPIV
+Field 31:29 TYPE
+Res0 28:24
+Field 23:0 ID
+EndSysreg
+
Sysreg ICC_ICSR_EL1 3 0 12 10 4
Res0 63:48
Field 47:32 IAFFID
@@ -3257,6 +3265,11 @@ Field 1 Enabled
Field 0 F
EndSysreg
+Sysreg ICC_IAFFIDR_EL1 3 0 12 10 5
+Res0 63:16
+Field 15:0 IAFFID
+EndSysreg
+
SysregFields ICC_PPI_ENABLERx_EL1
Field 63 EN63
Field 62 EN62
@@ -3663,6 +3676,42 @@ Res0 14:12
Field 11:0 AFFINITY
EndSysreg
+Sysreg ICC_APR_EL1 3 1 12 0 0
+Res0 63:32
+Field 31 P31
+Field 30 P30
+Field 29 P29
+Field 28 P28
+Field 27 P27
+Field 26 P26
+Field 25 P25
+Field 24 P24
+Field 23 P23
+Field 22 P22
+Field 21 P21
+Field 20 P20
+Field 19 P19
+Field 18 P18
+Field 17 P17
+Field 16 P16
+Field 15 P15
+Field 14 P14
+Field 13 P13
+Field 12 P12
+Field 11 P11
+Field 10 P10
+Field 9 P9
+Field 8 P8
+Field 7 P7
+Field 6 P6
+Field 5 P5
+Field 4 P4
+Field 3 P3
+Field 2 P2
+Field 1 P1
+Field 0 P0
+EndSysreg
+
Sysreg ICC_CR0_EL1 3 1 12 0 1
Res0 63:39
Field 38 PID
@@ -4687,6 +4736,42 @@ Field 31:16 PhyPARTID29
Field 15:0 PhyPARTID28
EndSysreg
+Sysreg ICH_APR_EL2 3 4 12 8 4
+Res0 63:32
+Field 31 P31
+Field 30 P30
+Field 29 P29
+Field 28 P28
+Field 27 P27
+Field 26 P26
+Field 25 P25
+Field 24 P24
+Field 23 P23
+Field 22 P22
+Field 21 P21
+Field 20 P20
+Field 19 P19
+Field 18 P18
+Field 17 P17
+Field 16 P16
+Field 15 P15
+Field 14 P14
+Field 13 P13
+Field 12 P12
+Field 11 P11
+Field 10 P10
+Field 9 P9
+Field 8 P8
+Field 7 P7
+Field 6 P6
+Field 5 P5
+Field 4 P4
+Field 3 P3
+Field 2 P2
+Field 1 P1
+Field 0 P0
+EndSysreg
+
Sysreg ICH_HFGRTR_EL2 3 4 12 9 4
Res0 63:21
Field 20 ICC_PPI_ACTIVERn_EL1
@@ -4735,6 +4820,306 @@ Field 1 GICCDDIS
Field 0 GICCDEN
EndSysreg
+SysregFields ICH_PPI_DVIRx_EL2
+Field 63 DVI63
+Field 62 DVI62
+Field 61 DVI61
+Field 60 DVI60
+Field 59 DVI59
+Field 58 DVI58
+Field 57 DVI57
+Field 56 DVI56
+Field 55 DVI55
+Field 54 DVI54
+Field 53 DVI53
+Field 52 DVI52
+Field 51 DVI51
+Field 50 DVI50
+Field 49 DVI49
+Field 48 DVI48
+Field 47 DVI47
+Field 46 DVI46
+Field 45 DVI45
+Field 44 DVI44
+Field 43 DVI43
+Field 42 DVI42
+Field 41 DVI41
+Field 40 DVI40
+Field 39 DVI39
+Field 38 DVI38
+Field 37 DVI37
+Field 36 DVI36
+Field 35 DVI35
+Field 34 DVI34
+Field 33 DVI33
+Field 32 DVI32
+Field 31 DVI31
+Field 30 DVI30
+Field 29 DVI29
+Field 28 DVI28
+Field 27 DVI27
+Field 26 DVI26
+Field 25 DVI25
+Field 24 DVI24
+Field 23 DVI23
+Field 22 DVI22
+Field 21 DVI21
+Field 20 DVI20
+Field 19 DVI19
+Field 18 DVI18
+Field 17 DVI17
+Field 16 DVI16
+Field 15 DVI15
+Field 14 DVI14
+Field 13 DVI13
+Field 12 DVI12
+Field 11 DVI11
+Field 10 DVI10
+Field 9 DVI9
+Field 8 DVI8
+Field 7 DVI7
+Field 6 DVI6
+Field 5 DVI5
+Field 4 DVI4
+Field 3 DVI3
+Field 2 DVI2
+Field 1 DVI1
+Field 0 DVI0
+EndSysregFields
+
+Sysreg ICH_PPI_DVIR0_EL2 3 4 12 10 0
+Fields ICH_PPI_DVIRx_EL2
+EndSysreg
+
+Sysreg ICH_PPI_DVIR1_EL2 3 4 12 10 1
+Fields ICH_PPI_DVIRx_EL2
+EndSysreg
+
+SysregFields ICH_PPI_ENABLERx_EL2
+Field 63 EN63
+Field 62 EN62
+Field 61 EN61
+Field 60 EN60
+Field 59 EN59
+Field 58 EN58
+Field 57 EN57
+Field 56 EN56
+Field 55 EN55
+Field 54 EN54
+Field 53 EN53
+Field 52 EN52
+Field 51 EN51
+Field 50 EN50
+Field 49 EN49
+Field 48 EN48
+Field 47 EN47
+Field 46 EN46
+Field 45 EN45
+Field 44 EN44
+Field 43 EN43
+Field 42 EN42
+Field 41 EN41
+Field 40 EN40
+Field 39 EN39
+Field 38 EN38
+Field 37 EN37
+Field 36 EN36
+Field 35 EN35
+Field 34 EN34
+Field 33 EN33
+Field 32 EN32
+Field 31 EN31
+Field 30 EN30
+Field 29 EN29
+Field 28 EN28
+Field 27 EN27
+Field 26 EN26
+Field 25 EN25
+Field 24 EN24
+Field 23 EN23
+Field 22 EN22
+Field 21 EN21
+Field 20 EN20
+Field 19 EN19
+Field 18 EN18
+Field 17 EN17
+Field 16 EN16
+Field 15 EN15
+Field 14 EN14
+Field 13 EN13
+Field 12 EN12
+Field 11 EN11
+Field 10 EN10
+Field 9 EN9
+Field 8 EN8
+Field 7 EN7
+Field 6 EN6
+Field 5 EN5
+Field 4 EN4
+Field 3 EN3
+Field 2 EN2
+Field 1 EN1
+Field 0 EN0
+EndSysregFields
+
+Sysreg ICH_PPI_ENABLER0_EL2 3 4 12 10 2
+Fields ICH_PPI_ENABLERx_EL2
+EndSysreg
+
+Sysreg ICH_PPI_ENABLER1_EL2 3 4 12 10 3
+Fields ICH_PPI_ENABLERx_EL2
+EndSysreg
+
+SysregFields ICH_PPI_PENDRx_EL2
+Field 63 PEND63
+Field 62 PEND62
+Field 61 PEND61
+Field 60 PEND60
+Field 59 PEND59
+Field 58 PEND58
+Field 57 PEND57
+Field 56 PEND56
+Field 55 PEND55
+Field 54 PEND54
+Field 53 PEND53
+Field 52 PEND52
+Field 51 PEND51
+Field 50 PEND50
+Field 49 PEND49
+Field 48 PEND48
+Field 47 PEND47
+Field 46 PEND46
+Field 45 PEND45
+Field 44 PEND44
+Field 43 PEND43
+Field 42 PEND42
+Field 41 PEND41
+Field 40 PEND40
+Field 39 PEND39
+Field 38 PEND38
+Field 37 PEND37
+Field 36 PEND36
+Field 35 PEND35
+Field 34 PEND34
+Field 33 PEND33
+Field 32 PEND32
+Field 31 PEND31
+Field 30 PEND30
+Field 29 PEND29
+Field 28 PEND28
+Field 27 PEND27
+Field 26 PEND26
+Field 25 PEND25
+Field 24 PEND24
+Field 23 PEND23
+Field 22 PEND22
+Field 21 PEND21
+Field 20 PEND20
+Field 19 PEND19
+Field 18 PEND18
+Field 17 PEND17
+Field 16 PEND16
+Field 15 PEND15
+Field 14 PEND14
+Field 13 PEND13
+Field 12 PEND12
+Field 11 PEND11
+Field 10 PEND10
+Field 9 PEND9
+Field 8 PEND8
+Field 7 PEND7
+Field 6 PEND6
+Field 5 PEND5
+Field 4 PEND4
+Field 3 PEND3
+Field 2 PEND2
+Field 1 PEND1
+Field 0 PEND0
+EndSysregFields
+
+Sysreg ICH_PPI_PENDR0_EL2 3 4 12 10 4
+Fields ICH_PPI_PENDRx_EL2
+EndSysreg
+
+Sysreg ICH_PPI_PENDR1_EL2 3 4 12 10 5
+Fields ICH_PPI_PENDRx_EL2
+EndSysreg
+
+SysregFields ICH_PPI_ACTIVERx_EL2
+Field 63 ACTIVE63
+Field 62 ACTIVE62
+Field 61 ACTIVE61
+Field 60 ACTIVE60
+Field 59 ACTIVE59
+Field 58 ACTIVE58
+Field 57 ACTIVE57
+Field 56 ACTIVE56
+Field 55 ACTIVE55
+Field 54 ACTIVE54
+Field 53 ACTIVE53
+Field 52 ACTIVE52
+Field 51 ACTIVE51
+Field 50 ACTIVE50
+Field 49 ACTIVE49
+Field 48 ACTIVE48
+Field 47 ACTIVE47
+Field 46 ACTIVE46
+Field 45 ACTIVE45
+Field 44 ACTIVE44
+Field 43 ACTIVE43
+Field 42 ACTIVE42
+Field 41 ACTIVE41
+Field 40 ACTIVE40
+Field 39 ACTIVE39
+Field 38 ACTIVE38
+Field 37 ACTIVE37
+Field 36 ACTIVE36
+Field 35 ACTIVE35
+Field 34 ACTIVE34
+Field 33 ACTIVE33
+Field 32 ACTIVE32
+Field 31 ACTIVE31
+Field 30 ACTIVE30
+Field 29 ACTIVE29
+Field 28 ACTIVE28
+Field 27 ACTIVE27
+Field 26 ACTIVE26
+Field 25 ACTIVE25
+Field 24 ACTIVE24
+Field 23 ACTIVE23
+Field 22 ACTIVE22
+Field 21 ACTIVE21
+Field 20 ACTIVE20
+Field 19 ACTIVE19
+Field 18 ACTIVE18
+Field 17 ACTIVE17
+Field 16 ACTIVE16
+Field 15 ACTIVE15
+Field 14 ACTIVE14
+Field 13 ACTIVE13
+Field 12 ACTIVE12
+Field 11 ACTIVE11
+Field 10 ACTIVE10
+Field 9 ACTIVE9
+Field 8 ACTIVE8
+Field 7 ACTIVE7
+Field 6 ACTIVE6
+Field 5 ACTIVE5
+Field 4 ACTIVE4
+Field 3 ACTIVE3
+Field 2 ACTIVE2
+Field 1 ACTIVE1
+Field 0 ACTIVE0
+EndSysregFields
+
+Sysreg ICH_PPI_ACTIVER0_EL2 3 4 12 10 6
+Fields ICH_PPI_ACTIVERx_EL2
+EndSysreg
+
+Sysreg ICH_PPI_ACTIVER1_EL2 3 4 12 10 7
+Fields ICH_PPI_ACTIVERx_EL2
+EndSysreg
+
Sysreg ICH_HCR_EL2 3 4 12 11 0
Res0 63:32
Field 31:27 EOIcount
@@ -4789,6 +5174,18 @@ Field 1 V3
Field 0 En
EndSysreg
+Sysreg ICH_CONTEXTR_EL2 3 4 12 11 6
+Field 63 V
+Field 62 F
+Field 61 IRICHPPIDIS
+Field 60 DB
+Field 59:55 DBPM
+Res0 54:48
+Field 47:32 VPE
+Res0 31:16
+Field 15:0 VM
+EndSysreg
+
Sysreg ICH_VMCR_EL2 3 4 12 11 7
Prefix FEAT_GCIE
Res0 63:32
@@ -4810,6 +5207,89 @@ Field 1 VENG1
Field 0 VENG0
EndSysreg
+SysregFields ICH_PPI_PRIORITYRx_EL2
+Res0 63:61
+Field 60:56 Priority7
+Res0 55:53
+Field 52:48 Priority6
+Res0 47:45
+Field 44:40 Priority5
+Res0 39:37
+Field 36:32 Priority4
+Res0 31:29
+Field 28:24 Priority3
+Res0 23:21
+Field 20:16 Priority2
+Res0 15:13
+Field 12:8 Priority1
+Res0 7:5
+Field 4:0 Priority0
+EndSysregFields
+
+Sysreg ICH_PPI_PRIORITYR0_EL2 3 4 12 14 0
+Fields ICH_PPI_PRIORITYRx_EL2
+EndSysreg
+
+Sysreg ICH_PPI_PRIORITYR1_EL2 3 4 12 14 1
+Fields ICH_PPI_PRIORITYRx_EL2
+EndSysreg
+
+Sysreg ICH_PPI_PRIORITYR2_EL2 3 4 12 14 2
+Fields ICH_PPI_PRIORITYRx_EL2
+EndSysreg
+
+Sysreg ICH_PPI_PRIORITYR3_EL2 3 4 12 14 3
+Fields ICH_PPI_PRIORITYRx_EL2
+EndSysreg
+
+Sysreg ICH_PPI_PRIORITYR4_EL2 3 4 12 14 4
+Fields ICH_PPI_PRIORITYRx_EL2
+EndSysreg
+
+Sysreg ICH_PPI_PRIORITYR5_EL2 3 4 12 14 5
+Fields ICH_PPI_PRIORITYRx_EL2
+EndSysreg
+
+Sysreg ICH_PPI_PRIORITYR6_EL2 3 4 12 14 6
+Fields ICH_PPI_PRIORITYRx_EL2
+EndSysreg
+
+Sysreg ICH_PPI_PRIORITYR7_EL2 3 4 12 14 7
+Fields ICH_PPI_PRIORITYRx_EL2
+EndSysreg
+
+Sysreg ICH_PPI_PRIORITYR8_EL2 3 4 12 15 0
+Fields ICH_PPI_PRIORITYRx_EL2
+EndSysreg
+
+Sysreg ICH_PPI_PRIORITYR9_EL2 3 4 12 15 1
+Fields ICH_PPI_PRIORITYRx_EL2
+EndSysreg
+
+Sysreg ICH_PPI_PRIORITYR10_EL2 3 4 12 15 2
+Fields ICH_PPI_PRIORITYRx_EL2
+EndSysreg
+
+Sysreg ICH_PPI_PRIORITYR11_EL2 3 4 12 15 3
+Fields ICH_PPI_PRIORITYRx_EL2
+EndSysreg
+
+Sysreg ICH_PPI_PRIORITYR12_EL2 3 4 12 15 4
+Fields ICH_PPI_PRIORITYRx_EL2
+EndSysreg
+
+Sysreg ICH_PPI_PRIORITYR13_EL2 3 4 12 15 5
+Fields ICH_PPI_PRIORITYRx_EL2
+EndSysreg
+
+Sysreg ICH_PPI_PRIORITYR14_EL2 3 4 12 15 6
+Fields ICH_PPI_PRIORITYRx_EL2
+EndSysreg
+
+Sysreg ICH_PPI_PRIORITYR15_EL2 3 4 12 15 7
+Fields ICH_PPI_PRIORITYRx_EL2
+EndSysreg
+
Sysreg CONTEXTIDR_EL2 3 4 13 0 1
Fields CONTEXTIDR_ELx
EndSysreg
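
The ICH_PPI_PRIORITYRx_EL2 definitions above pack eight 5-bit priority fields into each 64-bit register at an 8-bit stride (Priority0 at bits 4:0, Priority1 at 12:8, and so on), so the sixteen registers cover 128 PPIs. A standalone sketch of the index arithmetic implied by that layout; the helper names are illustrative, not from the patch:

#include <stdint.h>

/* Eight priorities per register; each field is 5 bits at an 8-bit stride. */
static unsigned int ppi_priorityr_index(unsigned int ppi)
{
	return ppi / 8;			/* which ICH_PPI_PRIORITYRn_EL2 */
}

static uint8_t ppi_priority_extract(uint64_t priorityr, unsigned int ppi)
{
	return (priorityr >> ((ppi % 8) * 8)) & 0x1f;
}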
diff --git a/drivers/irqchip/irq-gic-v5.c b/drivers/irqchip/irq-gic-v5.c
index 405a5eee847b..6b0903be8ebf 100644
--- a/drivers/irqchip/irq-gic-v5.c
+++ b/drivers/irqchip/irq-gic-v5.c
@@ -511,6 +511,23 @@ static bool gicv5_ppi_irq_is_level(irq_hw_number_t hwirq)
return !!(read_ppi_sysreg_s(hwirq, PPI_HM) & bit);
}
+static int gicv5_ppi_irq_set_type(struct irq_data *d, unsigned int type)
+{
+ /*
+ * GICv5's PPIs do not have a configurable trigger or handling
+ * mode. Check that the attempt to set a type matches what the
+ * hardware reports in the HMR, and error on a mismatch.
+ */
+
+ if (type & IRQ_TYPE_EDGE_BOTH && gicv5_ppi_irq_is_level(d->hwirq))
+ return -EINVAL;
+
+ if (type & IRQ_TYPE_LEVEL_MASK && !gicv5_ppi_irq_is_level(d->hwirq))
+ return -EINVAL;
+
+ return 0;
+}
+
static int gicv5_ppi_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu)
{
if (vcpu)
@@ -526,6 +543,7 @@ static const struct irq_chip gicv5_ppi_irq_chip = {
.irq_mask = gicv5_ppi_irq_mask,
.irq_unmask = gicv5_ppi_irq_unmask,
.irq_eoi = gicv5_ppi_irq_eoi,
+ .irq_set_type = gicv5_ppi_irq_set_type,
.irq_get_irqchip_state = gicv5_ppi_irq_get_irqchip_state,
.irq_set_irqchip_state = gicv5_ppi_irq_set_irqchip_state,
.irq_set_vcpu_affinity = gicv5_ppi_irq_set_vcpu_affinity,
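
gicv5_ppi_irq_set_type() above never programs anything: the handling mode is fixed by the hardware and reported through the HMR, so the callback merely rejects requests that contradict it. A hedged sketch of the same check in isolation, using the IRQ_TYPE_* values from include/linux/irq.h:

#define IRQ_TYPE_EDGE_BOTH	0x3	/* rising | falling */
#define IRQ_TYPE_LEVEL_MASK	0xc	/* high | low */

/* Returns 0 when the requested type agrees with the fixed handling mode. */
static int example_check_ppi_type(unsigned int type, bool hw_is_level)
{
	if ((type & IRQ_TYPE_EDGE_BOTH) && hw_is_level)
		return -1;		/* -EINVAL in the real callback */
	if ((type & IRQ_TYPE_LEVEL_MASK) && !hw_is_level)
		return -1;
	return 0;
}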
diff --git a/drivers/virt/coco/pkvm-guest/Kconfig b/drivers/virt/coco/pkvm-guest/Kconfig
index d2f344f1f98f..928b8e1668cc 100644
--- a/drivers/virt/coco/pkvm-guest/Kconfig
+++ b/drivers/virt/coco/pkvm-guest/Kconfig
@@ -1,6 +1,6 @@
config ARM_PKVM_GUEST
bool "Arm pKVM protected guest driver"
- depends on ARM64
+ depends on ARM64 && DMA_RESTRICTED_POOL
help
Protected guests running under the pKVM hypervisor on arm64
are isolated from the host and must issue hypercalls to enable
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 51c00c8fa175..edfdf139cb02 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -664,6 +664,7 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
fsnotify_create(d_inode(dentry->d_parent), dentry);
return tracefs_end_creating(dentry);
}
+EXPORT_SYMBOL_GPL(tracefs_create_file);
static struct dentry *__create_dir(const char *name, struct dentry *parent,
const struct inode_operations *ops)
diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index 7310841f4512..bf8cc9589bd0 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -10,6 +10,8 @@
#include <linux/clocksource.h>
#include <linux/hrtimer.h>
+#include <linux/irqchip/arm-gic-v5.h>
+
enum kvm_arch_timers {
TIMER_PTIMER,
TIMER_VTIMER,
@@ -47,7 +49,7 @@ struct arch_timer_vm_data {
u64 poffset;
/* The PPI for each timer, global to the VM */
- u8 ppi[NR_KVM_TIMERS];
+ u32 ppi[NR_KVM_TIMERS];
};
struct arch_timer_context {
@@ -130,6 +132,10 @@ void kvm_timer_init_vhe(void);
#define timer_vm_data(ctx) (&(timer_context_to_vcpu(ctx)->kvm->arch.timer_data))
#define timer_irq(ctx) (timer_vm_data(ctx)->ppi[arch_timer_ctx_index(ctx)])
+#define get_vgic_ppi(k, i) (((k)->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V5) ? \
+ (i) : (FIELD_PREP(GICV5_HWIRQ_ID, i) | \
+ FIELD_PREP(GICV5_HWIRQ_TYPE, GICV5_HWIRQ_TYPE_PPI)))
+
u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu,
enum kvm_arch_timers tmr,
enum kvm_arch_timer_regs treg);
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index 96754b51b411..0a36a3d5c894 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -12,6 +12,9 @@
#define KVM_ARMV8_PMU_MAX_COUNTERS 32
+/* PPI #23 - architecturally specified for GICv5 */
+#define KVM_ARMV8_PMU_GICV5_IRQ 0x20000017
+
#if IS_ENABLED(CONFIG_HW_PERF_EVENTS) && IS_ENABLED(CONFIG_KVM)
struct kvm_pmc {
u8 idx; /* index into the pmu->pmc array */
@@ -38,7 +41,7 @@ struct arm_pmu_entry {
};
bool kvm_supports_guest_pmuv3(void);
-#define kvm_arm_pmu_irq_initialized(v) ((v)->arch.pmu.irq_num >= VGIC_NR_SGIS)
+#define kvm_arm_pmu_irq_initialized(v) ((v)->arch.pmu.irq_num != 0)
u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx);
void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val);
void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu, u64 select_idx, u64 val);
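
The constant 0x20000017 above follows from the GICv5 INTID encoding visible elsewhere in this series (see the ICC_HPPIR_EL1 definition: TYPE at bits 31:29, ID at bits 23:0). Assuming the PPI type encodes as 0x1, PPI #23 (0x17, GICV5_ARCH_PPI_PMUIRQ) becomes (0x1 << 29) | 0x17. A one-line check as a standalone sketch under those assumptions:

#include <assert.h>
#include <stdint.h>

#define HWIRQ_TYPE_SHIFT	29		/* TYPE at bits 31:29 */
#define HWIRQ_TYPE_PPI		UINT32_C(0x1)	/* assumed PPI encoding */
#define ARCH_PPI_PMUIRQ		0x17		/* PPI #23 */

int main(void)
{
	uint32_t intid = (HWIRQ_TYPE_PPI << HWIRQ_TYPE_SHIFT) | ARCH_PPI_PMUIRQ;

	assert(intid == 0x20000017);	/* KVM_ARMV8_PMU_GICV5_IRQ */
	return 0;
}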
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index f2eafc65bbf4..1388dc6028a9 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -19,7 +19,9 @@
#include <linux/jump_label.h>
#include <linux/irqchip/arm-gic-v4.h>
+#include <linux/irqchip/arm-gic-v5.h>
+#define VGIC_V5_MAX_CPUS 512
#define VGIC_V3_MAX_CPUS 512
#define VGIC_V2_MAX_CPUS 8
#define VGIC_NR_IRQS_LEGACY 256
@@ -31,9 +33,96 @@
#define VGIC_MIN_LPI 8192
#define KVM_IRQCHIP_NUM_PINS (1020 - 32)
-#define irq_is_ppi(irq) ((irq) >= VGIC_NR_SGIS && (irq) < VGIC_NR_PRIVATE_IRQS)
-#define irq_is_spi(irq) ((irq) >= VGIC_NR_PRIVATE_IRQS && \
- (irq) <= VGIC_MAX_SPI)
+/*
+ * GICv5 supports 128 PPIs, but only the first 64 are architected. We only
+ * support the timers and PMU in KVM, both of which are architected. Rather than
+ * handling twice the state, we instead opt to only support the architected set
+ * in KVM for now. At a future stage, this can be bumped up to 128, if required.
+ */
+#define VGIC_V5_NR_PRIVATE_IRQS 64
+
+#define is_v5_type(t, i) (FIELD_GET(GICV5_HWIRQ_TYPE, (i)) == (t))
+
+#define __irq_is_sgi(t, i) \
+ ({ \
+ bool __ret; \
+ \
+ switch (t) { \
+ case KVM_DEV_TYPE_ARM_VGIC_V5: \
+ __ret = false; \
+ break; \
+ default: \
+ __ret = (i) < VGIC_NR_SGIS; \
+ } \
+ \
+ __ret; \
+ })
+
+#define __irq_is_ppi(t, i) \
+ ({ \
+ bool __ret; \
+ \
+ switch (t) { \
+ case KVM_DEV_TYPE_ARM_VGIC_V5: \
+ __ret = is_v5_type(GICV5_HWIRQ_TYPE_PPI, (i)); \
+ break; \
+ default: \
+ __ret = (i) >= VGIC_NR_SGIS; \
+ __ret &= (i) < VGIC_NR_PRIVATE_IRQS; \
+ } \
+ \
+ __ret; \
+ })
+
+#define __irq_is_spi(t, i) \
+ ({ \
+ bool __ret; \
+ \
+ switch (t) { \
+ case KVM_DEV_TYPE_ARM_VGIC_V5: \
+ __ret = is_v5_type(GICV5_HWIRQ_TYPE_SPI, (i)); \
+ break; \
+ default: \
+ __ret = (i) <= VGIC_MAX_SPI; \
+ __ret &= (i) >= VGIC_NR_PRIVATE_IRQS; \
+ } \
+ \
+ __ret; \
+ })
+
+#define __irq_is_lpi(t, i) \
+ ({ \
+ bool __ret; \
+ \
+ switch (t) { \
+ case KVM_DEV_TYPE_ARM_VGIC_V5: \
+ __ret = is_v5_type(GICV5_HWIRQ_TYPE_LPI, (i)); \
+ break; \
+ default: \
+ __ret = (i) >= 8192; \
+ } \
+ \
+ __ret; \
+ })
+
+#define irq_is_sgi(k, i) __irq_is_sgi((k)->arch.vgic.vgic_model, i)
+#define irq_is_ppi(k, i) __irq_is_ppi((k)->arch.vgic.vgic_model, i)
+#define irq_is_spi(k, i) __irq_is_spi((k)->arch.vgic.vgic_model, i)
+#define irq_is_lpi(k, i) __irq_is_lpi((k)->arch.vgic.vgic_model, i)
+
+#define irq_is_private(k, i) (irq_is_ppi(k, i) || irq_is_sgi(k, i))
+
+#define vgic_v5_get_hwirq_id(x) FIELD_GET(GICV5_HWIRQ_ID, (x))
+#define vgic_v5_set_hwirq_id(x) FIELD_PREP(GICV5_HWIRQ_ID, (x))
+
+#define __vgic_v5_set_type(t) (FIELD_PREP(GICV5_HWIRQ_TYPE, GICV5_HWIRQ_TYPE_##t))
+#define vgic_v5_make_ppi(x) (__vgic_v5_set_type(PPI) | vgic_v5_set_hwirq_id(x))
+#define vgic_v5_make_spi(x) (__vgic_v5_set_type(SPI) | vgic_v5_set_hwirq_id(x))
+#define vgic_v5_make_lpi(x) (__vgic_v5_set_type(LPI) | vgic_v5_set_hwirq_id(x))
+
+#define __vgic_is_v(k, v) ((k)->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V##v)
+#define vgic_is_v3(k) (__vgic_is_v(k, 3))
+#define vgic_is_v5(k) (__vgic_is_v(k, 5))
enum vgic_type {
VGIC_V2, /* Good ol' GICv2 */
@@ -101,6 +190,8 @@ enum vgic_irq_config {
VGIC_CONFIG_LEVEL
};
+struct vgic_irq;
+
/*
 * Per-irq ops overriding some common behaviours.
*
@@ -119,6 +210,19 @@ struct irq_ops {
* peaking into the physical GIC.
*/
bool (*get_input_level)(int vintid);
+
+ /*
+ * Function pointer to override the queuing of an IRQ.
+ */
+ bool (*queue_irq_unlock)(struct kvm *kvm, struct vgic_irq *irq,
+ unsigned long flags) __releases(&irq->irq_lock);
+
+ /*
+ * Callback function pointer to either enable or disable direct
+ * injection for a mapped interrupt.
+ */
+ void (*set_direct_injection)(struct kvm_vcpu *vcpu,
+ struct vgic_irq *irq, bool direct);
};
struct vgic_irq {
@@ -238,6 +342,26 @@ struct vgic_redist_region {
struct list_head list;
};
+struct vgic_v5_vm {
+ /*
+ * We only expose a subset of PPIs to the guest. This subset is a
+ * combination of the PPIs that are actually implemented and what we
+ * actually choose to expose.
+ */
+ DECLARE_BITMAP(vgic_ppi_mask, VGIC_V5_NR_PRIVATE_IRQS);
+
+ /* A mask of the PPIs that are exposed for userspace to drive. */
+ DECLARE_BITMAP(userspace_ppis, VGIC_V5_NR_PRIVATE_IRQS);
+
+ /*
+ * The HMR itself is handled by the hardware, but we still need to have
+ * a mask that we can use when merging in pending state (only the state
+ * of edge PPIs is merged back in from the guest, and the HMR provides a
+ * convenient way to do that).
+ */
+ DECLARE_BITMAP(vgic_ppi_hmr, VGIC_V5_NR_PRIVATE_IRQS);
+};
+
struct vgic_dist {
bool in_kernel;
bool ready;
@@ -310,6 +434,11 @@ struct vgic_dist {
* else.
*/
struct its_vm its_vm;
+
+ /*
+ * GICv5 per-VM data.
+ */
+ struct vgic_v5_vm gicv5_vm;
};
struct vgic_v2_cpu_if {
@@ -340,11 +469,40 @@ struct vgic_v3_cpu_if {
unsigned int used_lrs;
};
+struct vgic_v5_cpu_if {
+ u64 vgic_apr;
+ u64 vgic_vmcr;
+
+ /* PPI register state */
+ DECLARE_BITMAP(vgic_ppi_dvir, VGIC_V5_NR_PRIVATE_IRQS);
+ DECLARE_BITMAP(vgic_ppi_activer, VGIC_V5_NR_PRIVATE_IRQS);
+ DECLARE_BITMAP(vgic_ppi_enabler, VGIC_V5_NR_PRIVATE_IRQS);
+ /* We have one byte (of which 5 bits are used) per PPI for priority */
+ u64 vgic_ppi_priorityr[VGIC_V5_NR_PRIVATE_IRQS / 8];
+
+ /*
+ * The ICSR is re-used across host and guest, and hence it needs to be
+ * saved/restored. Only one copy is required as the host should block
+ * preemption between executing GIC CDRCFG and accessing the
+ * ICC_ICSR_EL1. A guest, of course, can never guarantee this, and hence
+ * it is the hyp's responsibility to keep the state consistent.
+ */
+ u64 vgic_icsr;
+
+ struct gicv5_vpe gicv5_vpe;
+};
+
+/* What PPI capabilities does a GICv5 host have */
+struct vgic_v5_ppi_caps {
+ DECLARE_BITMAP(impl_ppi_mask, VGIC_V5_NR_PRIVATE_IRQS);
+};
+
struct vgic_cpu {
/* CPU vif control registers for world switch */
union {
struct vgic_v2_cpu_if vgic_v2;
struct vgic_v3_cpu_if vgic_v3;
+ struct vgic_v5_cpu_if vgic_v5;
};
struct vgic_irq *private_irqs;
@@ -392,13 +550,17 @@ int kvm_vgic_create(struct kvm *kvm, u32 type);
void kvm_vgic_destroy(struct kvm *kvm);
void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
int kvm_vgic_map_resources(struct kvm *kvm);
+void kvm_vgic_finalize_idregs(struct kvm *kvm);
int kvm_vgic_hyp_init(void);
void kvm_vgic_init_cpu_hardware(void);
int kvm_vgic_inject_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
unsigned int intid, bool level, void *owner);
+void kvm_vgic_set_irq_ops(struct kvm_vcpu *vcpu, u32 vintid,
+ struct irq_ops *ops);
+void kvm_vgic_clear_irq_ops(struct kvm_vcpu *vcpu, u32 vintid);
int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
- u32 vintid, struct irq_ops *ops);
+ u32 vintid);
int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid);
int kvm_vgic_get_map(struct kvm_vcpu *vcpu, unsigned int vintid);
bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid);
@@ -414,8 +576,20 @@ u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu);
#define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel))
#define vgic_initialized(k) ((k)->arch.vgic.initialized)
-#define vgic_valid_spi(k, i) (((i) >= VGIC_NR_PRIVATE_IRQS) && \
- ((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS))
+#define vgic_valid_spi(k, i) \
+ ({ \
+ bool __ret = irq_is_spi(k, i); \
+ \
+ switch ((k)->arch.vgic.vgic_model) { \
+ case KVM_DEV_TYPE_ARM_VGIC_V5: \
+ __ret &= FIELD_GET(GICV5_HWIRQ_ID, i) < (k)->arch.vgic.nr_spis; \
+ break; \
+ default: \
+ __ret &= (i) < ((k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS); \
+ } \
+ \
+ __ret; \
+ })
bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu);
void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
@@ -455,6 +629,11 @@ int vgic_v4_load(struct kvm_vcpu *vcpu);
void vgic_v4_commit(struct kvm_vcpu *vcpu);
int vgic_v4_put(struct kvm_vcpu *vcpu);
+int vgic_v5_finalize_ppi_state(struct kvm *kvm);
+bool vgic_v5_ppi_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
+ unsigned long flags);
+void vgic_v5_set_ppi_dvi(struct kvm_vcpu *vcpu, struct vgic_irq *irq, bool dvi);
+
bool vgic_state_is_nested(struct kvm_vcpu *vcpu);
/* CPU HP callbacks */
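
The irq_is_*() rework above replaces plain range checks with a dispatch on the VM's vgic_model: for v2/v3 an INTID's class is still a range, for v5 it is the TYPE field of the encoded INTID. A side-by-side sketch of the two regimes in standalone C, with the ranges taken from the hunk and the v5 TYPE position assumed to be bits 31:29:

#include <stdbool.h>
#include <stdint.h>

#define VGIC_NR_SGIS		16
#define VGIC_NR_PRIVATE_IRQS	32

/* v2/v3: PPIs are INTIDs 16..31. */
static bool legacy_irq_is_ppi(uint32_t intid)
{
	return intid >= VGIC_NR_SGIS && intid < VGIC_NR_PRIVATE_IRQS;
}

/* v5: the class lives in the TYPE field; there are no SGIs at all. */
static bool v5_irq_is_ppi(uint32_t intid)
{
	return ((intid >> 29) & 0x7) == 0x1;	/* GICV5_HWIRQ_TYPE_PPI, assumed 0x1 */
}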
diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h
index b78488df6c98..40d2fce68294 100644
--- a/include/linux/irqchip/arm-gic-v5.h
+++ b/include/linux/irqchip/arm-gic-v5.h
@@ -25,6 +25,28 @@
#define GICV5_HWIRQ_TYPE_SPI UL(0x3)
/*
+ * Architected PPIs
+ */
+#define GICV5_ARCH_PPI_S_DB_PPI 0x0
+#define GICV5_ARCH_PPI_RL_DB_PPI 0x1
+#define GICV5_ARCH_PPI_NS_DB_PPI 0x2
+#define GICV5_ARCH_PPI_SW_PPI 0x3
+#define GICV5_ARCH_PPI_HACDBSIRQ 0xf
+#define GICV5_ARCH_PPI_CNTHVS 0x13
+#define GICV5_ARCH_PPI_CNTHPS 0x14
+#define GICV5_ARCH_PPI_PMBIRQ 0x15
+#define GICV5_ARCH_PPI_COMMIRQ 0x16
+#define GICV5_ARCH_PPI_PMUIRQ 0x17
+#define GICV5_ARCH_PPI_CTIIRQ 0x18
+#define GICV5_ARCH_PPI_GICMNT 0x19
+#define GICV5_ARCH_PPI_CNTHP 0x1a
+#define GICV5_ARCH_PPI_CNTV 0x1b
+#define GICV5_ARCH_PPI_CNTHV 0x1c
+#define GICV5_ARCH_PPI_CNTPS 0x1d
+#define GICV5_ARCH_PPI_CNTP 0x1e
+#define GICV5_ARCH_PPI_TRBIRQ 0x1f
+
+/*
* Tables attributes
*/
#define GICV5_NO_READ_ALLOC 0b0
@@ -365,6 +387,11 @@ int gicv5_spi_irq_set_type(struct irq_data *d, unsigned int type);
int gicv5_irs_iste_alloc(u32 lpi);
void gicv5_irs_syncr(void);
+/* Embedded in kvm.arch */
+struct gicv5_vpe {
+ bool resident;
+};
+
struct gicv5_its_devtab_cfg {
union {
struct {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6b76e7a6f4c2..779d9ed85cbf 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2366,6 +2366,7 @@ void kvm_unregister_device_ops(u32 type);
extern struct kvm_device_ops kvm_mpic_ops;
extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
extern struct kvm_device_ops kvm_arm_vgic_v3_ops;
+extern struct kvm_device_ops kvm_arm_vgic_v5_ops;
#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index d862fa610270..994f52b34344 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -251,4 +251,62 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
void ring_buffer_map_dup(struct trace_buffer *buffer, int cpu);
int ring_buffer_unmap(struct trace_buffer *buffer, int cpu);
int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu);
+
+struct ring_buffer_desc {
+ int cpu;
+ unsigned int nr_page_va; /* excludes the meta page */
+ unsigned long meta_va;
+ unsigned long page_va[] __counted_by(nr_page_va);
+};
+
+struct trace_buffer_desc {
+ int nr_cpus;
+ size_t struct_len;
+ char __data[]; /* list of ring_buffer_desc */
+};
+
+static inline struct ring_buffer_desc *__next_ring_buffer_desc(struct ring_buffer_desc *desc)
+{
+ size_t len = struct_size(desc, page_va, desc->nr_page_va);
+
+ return (struct ring_buffer_desc *)((void *)desc + len);
+}
+
+static inline struct ring_buffer_desc *__first_ring_buffer_desc(struct trace_buffer_desc *desc)
+{
+ return (struct ring_buffer_desc *)(&desc->__data[0]);
+}
+
+static inline size_t trace_buffer_desc_size(size_t buffer_size, unsigned int nr_cpus)
+{
+ unsigned int nr_pages = max(DIV_ROUND_UP(buffer_size, PAGE_SIZE), 2UL) + 1;
+ struct ring_buffer_desc *rbdesc;
+
+ return size_add(offsetof(struct trace_buffer_desc, __data),
+ size_mul(nr_cpus, struct_size(rbdesc, page_va, nr_pages)));
+}
+
+#define for_each_ring_buffer_desc(__pdesc, __cpu, __trace_pdesc) \
+ for (__pdesc = __first_ring_buffer_desc(__trace_pdesc), __cpu = 0; \
+ (__cpu) < (__trace_pdesc)->nr_cpus; \
+ (__cpu)++, __pdesc = __next_ring_buffer_desc(__pdesc))
+
+struct ring_buffer_remote {
+ struct trace_buffer_desc *desc;
+ int (*swap_reader_page)(unsigned int cpu, void *priv);
+ int (*reset)(unsigned int cpu, void *priv);
+ void *priv;
+};
+
+int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu);
+
+struct trace_buffer *
+__ring_buffer_alloc_remote(struct ring_buffer_remote *remote,
+ struct lock_class_key *key);
+
+#define ring_buffer_alloc_remote(remote) \
+({ \
+ static struct lock_class_key __key; \
+ __ring_buffer_alloc_remote(remote, &__key); \
+})
#endif /* _LINUX_RING_BUFFER_H */
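
The descriptor layout above is one flat blob: a trace_buffer_desc header followed by one variably-sized ring_buffer_desc per CPU, which is why traversal hops by struct_size() rather than indexing an array. Per the trace_buffer_desc_size() arithmetic, a 16 KiB buffer on 4 KiB pages needs max(4, 2) + 1 = 5 page slots per CPU. A standalone sketch of the walk, mirroring (not reusing) the header's types:

#include <stddef.h>

/* Standalone mirror of struct ring_buffer_desc, for illustration only. */
struct rb_desc {
	int		cpu;
	unsigned int	nr_page_va;	/* excludes the meta page */
	unsigned long	meta_va;
	unsigned long	page_va[];
};

static struct rb_desc *next_rb_desc(struct rb_desc *d)
{
	size_t len = sizeof(*d) + d->nr_page_va * sizeof(d->page_va[0]);

	return (struct rb_desc *)((char *)d + len);
}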
diff --git a/include/linux/ring_buffer_types.h b/include/linux/ring_buffer_types.h
new file mode 100644
index 000000000000..54577021a49d
--- /dev/null
+++ b/include/linux/ring_buffer_types.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_RING_BUFFER_TYPES_H
+#define _LINUX_RING_BUFFER_TYPES_H
+
+#include <asm/local.h>
+
+#define TS_SHIFT 27
+#define TS_MASK ((1ULL << TS_SHIFT) - 1)
+#define TS_DELTA_TEST (~TS_MASK)
+
+/*
+ * We need to fit the time_stamp delta into 27 bits.
+ */
+static inline bool test_time_stamp(u64 delta)
+{
+ return !!(delta & TS_DELTA_TEST);
+}
+
+#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
+
+#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
+#define RB_ALIGNMENT 4U
+#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
+#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
+
+#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
+# define RB_FORCE_8BYTE_ALIGNMENT 0
+# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
+#else
+# define RB_FORCE_8BYTE_ALIGNMENT 1
+# define RB_ARCH_ALIGNMENT 8U
+#endif
+
+#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT)
+
+struct buffer_data_page {
+ u64 time_stamp; /* page time stamp */
+ local_t commit; /* write committed index */
+ unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */
+};
+#endif
diff --git a/include/linux/simple_ring_buffer.h b/include/linux/simple_ring_buffer.h
new file mode 100644
index 000000000000..21aec556293e
--- /dev/null
+++ b/include/linux/simple_ring_buffer.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SIMPLE_RING_BUFFER_H
+#define _LINUX_SIMPLE_RING_BUFFER_H
+
+#include <linux/list.h>
+#include <linux/ring_buffer.h>
+#include <linux/ring_buffer_types.h>
+#include <linux/types.h>
+
+/*
+ * Ideally these structs would stay private, but the caller needs to know
+ * the allocation size for simple_ring_buffer_init().
+ */
+struct simple_buffer_page {
+ struct list_head link;
+ struct buffer_data_page *page;
+ u64 entries;
+ u32 write;
+ u32 id;
+};
+
+struct simple_rb_per_cpu {
+ struct simple_buffer_page *tail_page;
+ struct simple_buffer_page *reader_page;
+ struct simple_buffer_page *head_page;
+ struct simple_buffer_page *bpages;
+ struct trace_buffer_meta *meta;
+ u32 nr_pages;
+
+#define SIMPLE_RB_UNAVAILABLE 0
+#define SIMPLE_RB_READY 1
+#define SIMPLE_RB_WRITING 2
+ u32 status;
+
+ u64 last_overrun;
+ u64 write_stamp;
+
+ struct simple_rb_cbs *cbs;
+};
+
+int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages,
+ const struct ring_buffer_desc *desc);
+
+void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer);
+
+void *simple_ring_buffer_reserve(struct simple_rb_per_cpu *cpu_buffer, unsigned long length,
+ u64 timestamp);
+
+void simple_ring_buffer_commit(struct simple_rb_per_cpu *cpu_buffer);
+
+int simple_ring_buffer_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable);
+
+int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer);
+
+int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer);
+
+int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer,
+ struct simple_buffer_page *bpages,
+ const struct ring_buffer_desc *desc,
+ void *(*load_page)(unsigned long va),
+ void (*unload_page)(void *va));
+
+void simple_ring_buffer_unload_mm(struct simple_rb_per_cpu *cpu_buffer,
+ void (*unload_page)(void *));
+#endif
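
The write side of this API is a two-step reserve/commit protocol, which remote_test.c below exercises end to end. A condensed sketch of a single event write, with error handling elided and the event struct invented for the example:

/* Sketch: write one event into a simple ring-buffer (illustrative only). */
struct my_event {
	unsigned short	id;
	u64		val;
};

static int example_write_one(struct simple_rb_per_cpu *cpu_buffer, u64 ts, u64 val)
{
	struct my_event *evt;

	evt = simple_ring_buffer_reserve(cpu_buffer, sizeof(*evt), ts);
	if (!evt)
		return -ENODEV;

	evt->id = 1;	/* hypothetical event id */
	evt->val = val;

	simple_ring_buffer_commit(cpu_buffer);
	return 0;
}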
diff --git a/include/linux/trace_remote.h b/include/linux/trace_remote.h
new file mode 100644
index 000000000000..fcd1d46ea466
--- /dev/null
+++ b/include/linux/trace_remote.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_TRACE_REMOTE_H
+#define _LINUX_TRACE_REMOTE_H
+
+#include <linux/dcache.h>
+#include <linux/ring_buffer.h>
+#include <linux/trace_remote_event.h>
+
+/**
+ * struct trace_remote_callbacks - Callbacks used by Tracefs to control the remote
+ * @init: Called once the remote has been registered. Allows the
+ * caller to extend the Tracefs remote directory
+ * @load_trace_buffer: Called before Tracefs accesses the trace buffer for the first
+ * time. Must return a &trace_buffer_desc
+ * (most likely filled with trace_remote_alloc_buffer())
+ * @unload_trace_buffer:
+ * Called once Tracefs has no use for the trace buffer
+ * (most likely call trace_remote_free_buffer())
+ * @enable_tracing: Called on Tracefs tracing_on. It is expected from the
+ * remote to allow writing.
+ * @swap_reader_page:	Called when Tracefs consumes a new page from a
+ *			ring-buffer. It is expected from the remote to isolate a
+ *			new reader-page from the @cpu ring-buffer.
+ * @reset:		Called on `echo 0 > trace`. It is expected from the
+ *			remote to reset all ring-buffer pages.
+ * @enable_event: Called on events/event_name/enable. It is expected from
+ * the remote to allow the writing event @id.
+ */
+struct trace_remote_callbacks {
+ int (*init)(struct dentry *d, void *priv);
+ struct trace_buffer_desc *(*load_trace_buffer)(unsigned long size, void *priv);
+ void (*unload_trace_buffer)(struct trace_buffer_desc *desc, void *priv);
+ int (*enable_tracing)(bool enable, void *priv);
+ int (*swap_reader_page)(unsigned int cpu, void *priv);
+ int (*reset)(unsigned int cpu, void *priv);
+ int (*enable_event)(unsigned short id, bool enable, void *priv);
+};
+
+int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv,
+ struct remote_event *events, size_t nr_events);
+
+int trace_remote_alloc_buffer(struct trace_buffer_desc *desc, size_t desc_size, size_t buffer_size,
+ const struct cpumask *cpumask);
+
+void trace_remote_free_buffer(struct trace_buffer_desc *desc);
+
+#endif
diff --git a/include/linux/trace_remote_event.h b/include/linux/trace_remote_event.h
new file mode 100644
index 000000000000..c8ae1e1f5e72
--- /dev/null
+++ b/include/linux/trace_remote_event.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_TRACE_REMOTE_EVENTS_H
+#define _LINUX_TRACE_REMOTE_EVENTS_H
+
+struct trace_remote;
+struct trace_event_fields;
+struct trace_seq;
+
+struct remote_event_hdr {
+ unsigned short id;
+};
+
+#define REMOTE_EVENT_NAME_MAX 30
+struct remote_event {
+ char name[REMOTE_EVENT_NAME_MAX];
+ unsigned short id;
+ bool enabled;
+ struct trace_remote *remote;
+ struct trace_event_fields *fields;
+ char *print_fmt;
+ void (*print)(void *evt, struct trace_seq *seq);
+};
+
+#define RE_STRUCT(__args...) __args
+#define re_field(__type, __field) __type __field;
+
+#define REMOTE_EVENT_FORMAT(__name, __struct) \
+ struct remote_event_format_##__name { \
+ struct remote_event_hdr hdr; \
+ __struct \
+ }
+#endif
diff --git a/include/trace/define_remote_events.h b/include/trace/define_remote_events.h
new file mode 100644
index 000000000000..676e803dc144
--- /dev/null
+++ b/include/trace/define_remote_events.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/trace_events.h>
+#include <linux/trace_remote_event.h>
+#include <linux/trace_seq.h>
+#include <linux/stringify.h>
+
+#define REMOTE_EVENT_INCLUDE(__file) __stringify(../../__file)
+
+#ifdef REMOTE_EVENT_SECTION
+# define __REMOTE_EVENT_SECTION(__name) __used __section(REMOTE_EVENT_SECTION"."#__name)
+#else
+# define __REMOTE_EVENT_SECTION(__name)
+#endif
+
+#define REMOTE_PRINTK_COUNT_ARGS(__args...) \
+ __COUNT_ARGS(, ##__args, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0)
+
+#define __remote_printk0() \
+ trace_seq_putc(seq, '\n')
+
+#define __remote_printk1(__fmt) \
+ trace_seq_puts(seq, " " __fmt "\n") \
+
+#define __remote_printk2(__fmt, __args...) \
+do { \
+ trace_seq_putc(seq, ' '); \
+ trace_seq_printf(seq, __fmt, __args); \
+ trace_seq_putc(seq, '\n'); \
+} while (0)
+
+/* Apply the appropriate trace_seq sequence according to the number of arguments */
+#define remote_printk(__args...) \
+ CONCATENATE(__remote_printk, REMOTE_PRINTK_COUNT_ARGS(__args))(__args)
+
+#define RE_PRINTK(__args...) __args
+
+#define REMOTE_EVENT(__name, __id, __struct, __printk) \
+ REMOTE_EVENT_FORMAT(__name, __struct); \
+ static void remote_event_print_##__name(void *evt, struct trace_seq *seq) \
+ { \
+ struct remote_event_format_##__name __maybe_unused *__entry = evt; \
+ trace_seq_puts(seq, #__name); \
+ remote_printk(__printk); \
+ }
+#include REMOTE_EVENT_INCLUDE(REMOTE_EVENT_INCLUDE_FILE)
+
+#undef REMOTE_EVENT
+#undef RE_PRINTK
+#undef re_field
+#define re_field(__type, __field) \
+ { \
+ .type = #__type, .name = #__field, \
+ .size = sizeof(__type), .align = __alignof__(__type), \
+ .is_signed = is_signed_type(__type), \
+ },
+#define __entry REC
+#define RE_PRINTK(__fmt, __args...) "\"" __fmt "\", " __stringify(__args)
+#define REMOTE_EVENT(__name, __id, __struct, __printk) \
+ static struct trace_event_fields remote_event_fields_##__name[] = { \
+ __struct \
+ {} \
+ }; \
+ static char remote_event_print_fmt_##__name[] = __printk; \
+ static struct remote_event __REMOTE_EVENT_SECTION(__name) \
+ remote_event_##__name = { \
+ .name = #__name, \
+ .id = __id, \
+ .fields = remote_event_fields_##__name, \
+ .print_fmt = remote_event_print_fmt_##__name, \
+ .print = remote_event_print_##__name, \
+ }
+#include REMOTE_EVENT_INCLUDE(REMOTE_EVENT_INCLUDE_FILE)
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 7fb4fea9d38e..1cc0c9463f71 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -704,6 +704,11 @@ struct kvm_enable_cap {
#define KVM_VM_TYPE_ARM_IPA_SIZE_MASK 0xffULL
#define KVM_VM_TYPE_ARM_IPA_SIZE(x) \
((x) & KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
+
+#define KVM_VM_TYPE_ARM_PROTECTED (1UL << 31)
+#define KVM_VM_TYPE_ARM_MASK (KVM_VM_TYPE_ARM_IPA_SIZE_MASK | \
+ KVM_VM_TYPE_ARM_PROTECTED)
+
/*
* ioctls for /dev/kvm fds:
*/
@@ -1227,6 +1232,8 @@ enum kvm_device_type {
#define KVM_DEV_TYPE_LOONGARCH_PCHPIC KVM_DEV_TYPE_LOONGARCH_PCHPIC
KVM_DEV_TYPE_LOONGARCH_DMSINTC,
#define KVM_DEV_TYPE_LOONGARCH_DMSINTC KVM_DEV_TYPE_LOONGARCH_DMSINTC
+ KVM_DEV_TYPE_ARM_VGIC_V5,
+#define KVM_DEV_TYPE_ARM_VGIC_V5 KVM_DEV_TYPE_ARM_VGIC_V5
KVM_DEV_TYPE_MAX,
diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
index c102ef35d11e..e8185889a1c8 100644
--- a/include/uapi/linux/trace_mmap.h
+++ b/include/uapi/linux/trace_mmap.h
@@ -17,8 +17,8 @@
* @entries: Number of entries in the ring-buffer.
* @overrun: Number of entries lost in the ring-buffer.
* @read: Number of entries that have been read.
- * @Reserved1: Internal use only.
- * @Reserved2: Internal use only.
+ * @pages_lost: Number of pages overwritten by the writer.
+ * @pages_touched: Number of pages written by the writer.
*/
struct trace_buffer_meta {
__u32 meta_page_size;
@@ -39,8 +39,8 @@ struct trace_buffer_meta {
__u64 overrun;
__u64 read;
- __u64 Reserved1;
- __u64 Reserved2;
+ __u64 pages_lost;
+ __u64 pages_touched;
};
#define TRACE_MMAP_IOCTL_GET_READER _IO('R', 0x20)
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 49de13cae428..e130da35808f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -1281,4 +1281,18 @@ config HIST_TRIGGERS_DEBUG
source "kernel/trace/rv/Kconfig"
+config TRACE_REMOTE
+ bool
+
+config SIMPLE_RING_BUFFER
+ bool
+
+config TRACE_REMOTE_TEST
+ tristate "Test module for remote tracing"
+ select TRACE_REMOTE
+ select SIMPLE_RING_BUFFER
+ help
+ This trace remote includes a ring-buffer writer implementation using
+	  "simple_ring_buffer". This is solely intended for testing.
+
endif # FTRACE
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 04096c21d06b..d662c1a64cd5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -128,4 +128,63 @@ obj-$(CONFIG_FPROBE_EVENTS) += trace_fprobe.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
obj-$(CONFIG_RV) += rv/
+obj-$(CONFIG_TRACE_REMOTE) += trace_remote.o
+obj-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o
+obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o
+
+#
+# simple_ring_buffer is used by the pKVM hypervisor which does not have access
+# to all kernel symbols. Fail the build if forbidden symbols are found.
+#
+# undefsyms_base produces the set of compiler- and tooling-generated symbols
+# that can safely be ignored for simple_ring_buffer.
+#
+filechk_undefsyms_base = \
+ echo '$(pound)include <linux/atomic.h>'; \
+ echo '$(pound)include <linux/string.h>'; \
+ echo '$(pound)include <asm/page.h>'; \
+ echo 'static char page[PAGE_SIZE] __aligned(PAGE_SIZE);'; \
+ echo 'void undefsyms_base(void *p, int n);'; \
+ echo 'void undefsyms_base(void *p, int n) {'; \
+ echo ' char buffer[256] = { 0 };'; \
+ echo ' u32 u = 0;'; \
+ echo ' memset((char * volatile)page, 8, PAGE_SIZE);'; \
+ echo ' memset((char * volatile)buffer, 8, sizeof(buffer));'; \
+ echo ' memcpy((void * volatile)p, buffer, sizeof(buffer));'; \
+ echo ' cmpxchg((u32 * volatile)&u, 0, 8);'; \
+ echo ' WARN_ON(n == 0xdeadbeef);'; \
+ echo '}'
+
+$(obj)/undefsyms_base.c: FORCE
+ $(call filechk,undefsyms_base)
+
+clean-files += undefsyms_base.c
+
+$(obj)/undefsyms_base.o: $(obj)/undefsyms_base.c
+
+targets += undefsyms_base.o
+
+# Mark undefsyms_base.o as KASAN-sanitized to avoid the build logic that
+# disables FORTIFY_SOURCE when KASAN is not enabled. undefsyms_base.o does not
+# automatically get KASAN flags because it is not linked into vmlinux.
+KASAN_SANITIZE_undefsyms_base.o := y
+
+UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __x86_indirect_thunk \
+ __msan simple_ring_buffer \
+ $(shell $(NM) -u $(obj)/undefsyms_base.o 2>/dev/null | awk '{print $$2}')
+
+quiet_cmd_check_undefined = NM $<
+ cmd_check_undefined = \
+ undefsyms=$$($(NM) -u $< | grep -v $(addprefix -e , $(UNDEFINED_ALLOWLIST)) || true); \
+ if [ -n "$$undefsyms" ]; then \
+ echo "Unexpected symbols in $<:" >&2; \
+ echo "$$undefsyms" >&2; \
+ false; \
+ fi
+
+$(obj)/%.o.checked: $(obj)/%.o $(obj)/undefsyms_base.o FORCE
+ $(call if_changed,check_undefined)
+
+always-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o.checked
+
libftrace-y := ftrace.o
diff --git a/kernel/trace/remote_test.c b/kernel/trace/remote_test.c
new file mode 100644
index 000000000000..6c1b7701ddae
--- /dev/null
+++ b/kernel/trace/remote_test.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 - Google LLC
+ * Author: Vincent Donnefort <vdonnefort@google.com>
+ */
+
+#include <linux/module.h>
+#include <linux/simple_ring_buffer.h>
+#include <linux/trace_remote.h>
+#include <linux/tracefs.h>
+#include <linux/types.h>
+
+#define REMOTE_EVENT_INCLUDE_FILE kernel/trace/remote_test_events.h
+#include <trace/define_remote_events.h>
+
+static DEFINE_PER_CPU(struct simple_rb_per_cpu *, simple_rbs);
+static struct trace_buffer_desc *remote_test_buffer_desc;
+
+/*
+ * The trace_remote lock already serializes accesses from the trace_remote_callbacks.
+ * However write_event can still race with load/unload.
+ */
+static DEFINE_MUTEX(simple_rbs_lock);
+
+static int remote_test_load_simple_rb(int cpu, struct ring_buffer_desc *rb_desc)
+{
+ struct simple_rb_per_cpu *cpu_buffer;
+ struct simple_buffer_page *bpages;
+ int ret = -ENOMEM;
+
+ cpu_buffer = kmalloc_obj(*cpu_buffer);
+ if (!cpu_buffer)
+ return ret;
+
+ bpages = kmalloc_objs(*bpages, rb_desc->nr_page_va);
+ if (!bpages)
+ goto err_free_cpu_buffer;
+
+ ret = simple_ring_buffer_init(cpu_buffer, bpages, rb_desc);
+ if (ret)
+ goto err_free_bpages;
+
+ scoped_guard(mutex, &simple_rbs_lock) {
+ WARN_ON(*per_cpu_ptr(&simple_rbs, cpu));
+ *per_cpu_ptr(&simple_rbs, cpu) = cpu_buffer;
+ }
+
+ return 0;
+
+err_free_bpages:
+ kfree(bpages);
+
+err_free_cpu_buffer:
+ kfree(cpu_buffer);
+
+ return ret;
+}
+
+static void remote_test_unload_simple_rb(int cpu)
+{
+ struct simple_rb_per_cpu *cpu_buffer = *per_cpu_ptr(&simple_rbs, cpu);
+ struct simple_buffer_page *bpages;
+
+ if (!cpu_buffer)
+ return;
+
+ guard(mutex)(&simple_rbs_lock);
+
+ bpages = cpu_buffer->bpages;
+ simple_ring_buffer_unload(cpu_buffer);
+ kfree(bpages);
+ kfree(cpu_buffer);
+ *per_cpu_ptr(&simple_rbs, cpu) = NULL;
+}
+
+static struct trace_buffer_desc *remote_test_load(unsigned long size, void *unused)
+{
+ struct ring_buffer_desc *rb_desc;
+ struct trace_buffer_desc *desc;
+ size_t desc_size;
+ int cpu, ret;
+
+ if (WARN_ON(remote_test_buffer_desc))
+ return ERR_PTR(-EINVAL);
+
+ desc_size = trace_buffer_desc_size(size, num_possible_cpus());
+ if (desc_size == SIZE_MAX) {
+ ret = -E2BIG;
+ goto err;
+ }
+
+ desc = kmalloc(desc_size, GFP_KERNEL);
+ if (!desc) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = trace_remote_alloc_buffer(desc, desc_size, size, cpu_possible_mask);
+ if (ret)
+ goto err_free_desc;
+
+ for_each_ring_buffer_desc(rb_desc, cpu, desc) {
+ ret = remote_test_load_simple_rb(rb_desc->cpu, rb_desc);
+ if (ret)
+ goto err_unload;
+ }
+
+ remote_test_buffer_desc = desc;
+
+ return remote_test_buffer_desc;
+
+err_unload:
+ for_each_ring_buffer_desc(rb_desc, cpu, remote_test_buffer_desc)
+ remote_test_unload_simple_rb(rb_desc->cpu);
+ trace_remote_free_buffer(remote_test_buffer_desc);
+
+err_free_desc:
+ kfree(desc);
+
+err:
+ return ERR_PTR(ret);
+}
+
+static void remote_test_unload(struct trace_buffer_desc *desc, void *unused)
+{
+ struct ring_buffer_desc *rb_desc;
+ int cpu;
+
+ if (WARN_ON(desc != remote_test_buffer_desc))
+ return;
+
+ for_each_ring_buffer_desc(rb_desc, cpu, desc)
+ remote_test_unload_simple_rb(rb_desc->cpu);
+
+ remote_test_buffer_desc = NULL;
+ trace_remote_free_buffer(desc);
+ kfree(desc);
+}
+
+static int remote_test_enable_tracing(bool enable, void *unused)
+{
+ struct ring_buffer_desc *rb_desc;
+ int cpu;
+
+ if (!remote_test_buffer_desc)
+ return -ENODEV;
+
+ for_each_ring_buffer_desc(rb_desc, cpu, remote_test_buffer_desc)
+ WARN_ON(simple_ring_buffer_enable_tracing(*per_cpu_ptr(&simple_rbs, rb_desc->cpu),
+ enable));
+ return 0;
+}
+
+static int remote_test_swap_reader_page(unsigned int cpu, void *unused)
+{
+ struct simple_rb_per_cpu *cpu_buffer;
+
+ if (cpu >= NR_CPUS)
+ return -EINVAL;
+
+ cpu_buffer = *per_cpu_ptr(&simple_rbs, cpu);
+ if (!cpu_buffer)
+ return -EINVAL;
+
+ return simple_ring_buffer_swap_reader_page(cpu_buffer);
+}
+
+static int remote_test_reset(unsigned int cpu, void *unused)
+{
+ struct simple_rb_per_cpu *cpu_buffer;
+
+ if (cpu >= NR_CPUS)
+ return -EINVAL;
+
+ cpu_buffer = *per_cpu_ptr(&simple_rbs, cpu);
+ if (!cpu_buffer)
+ return -EINVAL;
+
+ return simple_ring_buffer_reset(cpu_buffer);
+}
+
+static int remote_test_enable_event(unsigned short id, bool enable, void *unused)
+{
+ if (id != REMOTE_TEST_EVENT_ID)
+ return -EINVAL;
+
+ /*
+ * Let's just use the struct remote_event enabled field that is turned on and off by
+ * trace_remote. This is a bit racy but good enough for a simple test module.
+ */
+ return 0;
+}
+
+static ssize_t
+write_event_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *pos)
+{
+ struct remote_event_format_selftest *evt_test;
+ struct simple_rb_per_cpu *cpu_buffer;
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ guard(mutex)(&simple_rbs_lock);
+
+ if (!remote_event_selftest.enabled)
+ return -ENODEV;
+
+ guard(preempt)();
+
+ cpu_buffer = *this_cpu_ptr(&simple_rbs);
+ if (!cpu_buffer)
+ return -ENODEV;
+
+ evt_test = simple_ring_buffer_reserve(cpu_buffer,
+ sizeof(struct remote_event_format_selftest),
+ trace_clock_global());
+ if (!evt_test)
+ return -ENODEV;
+
+ evt_test->hdr.id = REMOTE_TEST_EVENT_ID;
+ evt_test->id = val;
+
+ simple_ring_buffer_commit(cpu_buffer);
+
+ return cnt;
+}
+
+static const struct file_operations write_event_fops = {
+ .write = write_event_write,
+};
+
+static int remote_test_init_tracefs(struct dentry *d, void *unused)
+{
+ return tracefs_create_file("write_event", 0200, d, NULL, &write_event_fops) ?
+ 0 : -ENOMEM;
+}
+
+static struct trace_remote_callbacks trace_remote_callbacks = {
+ .init = remote_test_init_tracefs,
+ .load_trace_buffer = remote_test_load,
+ .unload_trace_buffer = remote_test_unload,
+ .enable_tracing = remote_test_enable_tracing,
+ .swap_reader_page = remote_test_swap_reader_page,
+ .reset = remote_test_reset,
+ .enable_event = remote_test_enable_event,
+};
+
+static int __init remote_test_init(void)
+{
+ return trace_remote_register("test", &trace_remote_callbacks, NULL,
+ &remote_event_selftest, 1);
+}
+
+module_init(remote_test_init);
+
+MODULE_DESCRIPTION("Test module for the trace remote interface");
+MODULE_AUTHOR("Vincent Donnefort");
+MODULE_LICENSE("GPL");
diff --git a/kernel/trace/remote_test_events.h b/kernel/trace/remote_test_events.h
new file mode 100644
index 000000000000..26b93b3406fc
--- /dev/null
+++ b/kernel/trace/remote_test_events.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define REMOTE_TEST_EVENT_ID 1
+
+REMOTE_EVENT(selftest, REMOTE_TEST_EVENT_ID,
+ RE_STRUCT(
+ re_field(u64, id)
+ ),
+ RE_PRINTK("id=%llu", __entry->id)
+);
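
For reference, the first inclusion pass of define_remote_events.h turns the selftest definition above into approximately the following; the second pass then emits the fields table and the remote_event instance. This is an illustrative expansion, not literal preprocessor output:

struct remote_event_format_selftest {
	struct remote_event_hdr hdr;
	u64 id;
};

static void remote_event_print_selftest(void *evt, struct trace_seq *seq)
{
	struct remote_event_format_selftest *__entry = evt;

	trace_seq_puts(seq, "selftest");
	trace_seq_putc(seq, ' ');
	trace_seq_printf(seq, "id=%llu", __entry->id);
	trace_seq_putc(seq, '\n');
}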
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 170170bd83bd..d6bebb782efc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4,6 +4,7 @@
*
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
*/
+#include <linux/ring_buffer_types.h>
#include <linux/sched/isolation.h>
#include <linux/trace_recursion.h>
#include <linux/trace_events.h>
@@ -157,23 +158,6 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
/* Used for individual buffers (after the counter) */
#define RB_BUFFER_OFF (1 << 20)
-#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
-
-#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
-#define RB_ALIGNMENT 4U
-#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
-#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
-
-#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
-# define RB_FORCE_8BYTE_ALIGNMENT 0
-# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
-#else
-# define RB_FORCE_8BYTE_ALIGNMENT 1
-# define RB_ARCH_ALIGNMENT 8U
-#endif
-
-#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT)
-
/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -316,10 +300,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
#define for_each_online_buffer_cpu(buffer, cpu) \
for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask)
-#define TS_SHIFT 27
-#define TS_MASK ((1ULL << TS_SHIFT) - 1)
-#define TS_DELTA_TEST (~TS_MASK)
-
static u64 rb_event_time_stamp(struct ring_buffer_event *event)
{
u64 ts;
@@ -338,12 +318,6 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event)
#define RB_MISSED_MASK (3 << 30)
-struct buffer_data_page {
- u64 time_stamp; /* page time stamp */
- local_t commit; /* write committed index */
- unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */
-};
-
struct buffer_data_read_page {
unsigned order; /* order of the page */
struct buffer_data_page *data; /* actual data, stored in this page */
@@ -437,14 +411,6 @@ static struct buffer_data_page *alloc_cpu_data(int cpu, int order)
return dpage;
}
-/*
- * We need to fit the time_stamp delta into 27 bits.
- */
-static inline bool test_time_stamp(u64 delta)
-{
- return !!(delta & TS_DELTA_TEST);
-}
-
struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
@@ -555,10 +521,12 @@ struct ring_buffer_per_cpu {
unsigned int mapped;
unsigned int user_mapped; /* user space mapping */
struct mutex mapping_lock;
- unsigned long *subbuf_ids; /* ID to subbuf VA */
+ struct buffer_page **subbuf_ids; /* ID to subbuf VA */
struct trace_buffer_meta *meta_page;
struct ring_buffer_cpu_meta *ring_meta;
+ struct ring_buffer_remote *remote;
+
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
struct list_head new_pages; /* new pages to add */
@@ -581,6 +549,8 @@ struct trace_buffer {
struct ring_buffer_per_cpu **buffers;
+ struct ring_buffer_remote *remote;
+
struct hlist_node node;
u64 (*clock)(void);
@@ -636,7 +606,8 @@ int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq
trace_seq_printf(s, "\tfield: char data;\t"
"offset:%u;\tsize:%u;\tsigned:%u;\n",
(unsigned int)offsetof(typeof(field), data),
- (unsigned int)buffer->subbuf_size,
+ (unsigned int)(buffer ? buffer->subbuf_size :
+ PAGE_SIZE - BUF_PAGE_HDR_SIZE),
(unsigned int)is_signed_type(char));
return !trace_seq_has_overflowed(s);
@@ -2238,6 +2209,40 @@ static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer,
}
}
+static struct ring_buffer_desc *ring_buffer_desc(struct trace_buffer_desc *trace_desc, int cpu)
+{
+ struct ring_buffer_desc *desc, *end;
+ size_t len;
+ int i;
+
+ if (!trace_desc)
+ return NULL;
+
+ if (cpu >= trace_desc->nr_cpus)
+ return NULL;
+
+ end = (struct ring_buffer_desc *)((void *)trace_desc + trace_desc->struct_len);
+ desc = __first_ring_buffer_desc(trace_desc);
+ len = struct_size(desc, page_va, desc->nr_page_va);
+ desc = (struct ring_buffer_desc *)((void *)desc + (len * cpu));
+
+ if (desc < end && desc->cpu == cpu)
+ return desc;
+
+ /* Missing CPUs, need to linear search */
+ for_each_ring_buffer_desc(desc, i, trace_desc) {
+ if (desc->cpu == cpu)
+ return desc;
+ }
+
+ return NULL;
+}
+
+static void *ring_buffer_desc_page(struct ring_buffer_desc *desc, int page_id)
+{
+	return page_id >= desc->nr_page_va ? NULL : (void *)desc->page_va[page_id];
+}
+
static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
long nr_pages, struct list_head *pages)
{
@@ -2245,6 +2250,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_cpu_meta *meta = NULL;
struct buffer_page *bpage, *tmp;
bool user_thread = current->mm != NULL;
+ struct ring_buffer_desc *desc = NULL;
long i;
/*
@@ -2273,6 +2279,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
if (buffer->range_addr_start)
meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu);
+ if (buffer->remote) {
+ desc = ring_buffer_desc(buffer->remote->desc, cpu_buffer->cpu);
+ if (!desc || WARN_ON(desc->nr_page_va != (nr_pages + 1)))
+ return -EINVAL;
+ }
+
for (i = 0; i < nr_pages; i++) {
bpage = alloc_cpu_page(cpu_buffer->cpu);
@@ -2297,6 +2309,16 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
rb_meta_buffer_update(cpu_buffer, bpage);
bpage->range = 1;
bpage->id = i + 1;
+ } else if (desc) {
+ void *p = ring_buffer_desc_page(desc, i + 1);
+
+ if (WARN_ON(!p))
+ goto free_pages;
+
+ bpage->page = p;
+ bpage->range = 1; /* bpage->page can't be freed */
+ bpage->id = i + 1;
+ cpu_buffer->subbuf_ids[i + 1] = bpage;
} else {
int order = cpu_buffer->buffer->subbuf_order;
bpage->page = alloc_cpu_data(cpu_buffer->cpu, order);
@@ -2394,6 +2416,30 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
if (cpu_buffer->ring_meta->head_buffer)
rb_meta_buffer_update(cpu_buffer, bpage);
bpage->range = 1;
+ } else if (buffer->remote) {
+ struct ring_buffer_desc *desc = ring_buffer_desc(buffer->remote->desc, cpu);
+
+ if (!desc)
+ goto fail_free_reader;
+
+ cpu_buffer->remote = buffer->remote;
+ cpu_buffer->meta_page = (struct trace_buffer_meta *)(void *)desc->meta_va;
+ cpu_buffer->nr_pages = nr_pages;
+ cpu_buffer->subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1,
+ sizeof(*cpu_buffer->subbuf_ids), GFP_KERNEL);
+ if (!cpu_buffer->subbuf_ids)
+ goto fail_free_reader;
+
+ /* Remote buffers are read-only and immutable */
+ atomic_inc(&cpu_buffer->record_disabled);
+ atomic_inc(&cpu_buffer->resize_disabled);
+
+ bpage->page = ring_buffer_desc_page(desc, cpu_buffer->meta_page->reader.id);
+ if (!bpage->page)
+ goto fail_free_reader;
+
+ bpage->range = 1;
+ cpu_buffer->subbuf_ids[0] = bpage;
} else {
int order = cpu_buffer->buffer->subbuf_order;
bpage->page = alloc_cpu_data(cpu, order);
@@ -2453,6 +2499,9 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
irq_work_sync(&cpu_buffer->irq_work.work);
+ if (cpu_buffer->remote)
+ kfree(cpu_buffer->subbuf_ids);
+
free_buffer_page(cpu_buffer->reader_page);
if (head) {
@@ -2475,7 +2524,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
int order, unsigned long start,
unsigned long end,
unsigned long scratch_size,
- struct lock_class_key *key)
+ struct lock_class_key *key,
+ struct ring_buffer_remote *remote)
{
struct trace_buffer *buffer __free(kfree) = NULL;
long nr_pages;
@@ -2515,6 +2565,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
if (!buffer->buffers)
goto fail_free_cpumask;
+ cpu = raw_smp_processor_id();
+
/* If start/end are specified, then that overrides size */
if (start && end) {
unsigned long buffers_start;
@@ -2570,6 +2622,15 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
buffer->range_addr_end = end;
rb_range_meta_init(buffer, nr_pages, scratch_size);
+ } else if (remote) {
+ struct ring_buffer_desc *desc = ring_buffer_desc(remote->desc, cpu);
+
+ if (!desc)
+ goto fail_free_buffers;
+
+ buffer->remote = remote;
+ /* The writer is remote. This ring-buffer is read-only */
+ atomic_inc(&buffer->record_disabled);
+ nr_pages = desc->nr_page_va - 1;
+ if (nr_pages < 2)
+ goto fail_free_buffers;
} else {
/* need at least two pages */
@@ -2578,7 +2639,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
nr_pages = 2;
}
- cpu = raw_smp_processor_id();
cpumask_set_cpu(cpu, buffer->cpumask);
buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
if (!buffer->buffers[cpu])
@@ -2620,7 +2680,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
struct lock_class_key *key)
{
/* Default buffer page size - one system page */
- return alloc_buffer(size, flags, 0, 0, 0, 0, key);
+ return alloc_buffer(size, flags, 0, 0, 0, 0, key, NULL);
}
EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
@@ -2647,7 +2707,18 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flag
struct lock_class_key *key)
{
return alloc_buffer(size, flags, order, start, start + range_size,
- scratch_size, key);
+ scratch_size, key, NULL);
+}
+
+/**
+ * __ring_buffer_alloc_remote - allocate a new ring_buffer from a remote
+ * @remote: Contains a description of the ring-buffer pages and remote callbacks.
+ * @key: ring buffer reader_lock_key.
+ */
+struct trace_buffer *__ring_buffer_alloc_remote(struct ring_buffer_remote *remote,
+ struct lock_class_key *key)
+{
+ return alloc_buffer(0, 0, 0, 0, 0, 0, key, remote);
}
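
An illustrative reader-side hookup (editor's sketch, not part of the patch): the ring_buffer_remote field names and the callback prototypes are inferred from their call sites in this file, and the hyp_*() helpers are hypothetical stand-ins for whatever transport reaches the remote writer.

	static int hyp_swap_reader_page(int cpu, void *priv)
	{
		/* Ask the remote writer to swap the reader page for @cpu */
		return 0;
	}

	static void hyp_reset(int cpu, void *priv)
	{
		/* Ask the remote writer to reset the @cpu ring-buffer */
	}

	static struct trace_buffer *attach_remote_buffer(struct trace_buffer_desc *desc)
	{
		static struct ring_buffer_remote remote = {
			.swap_reader_page	= hyp_swap_reader_page,
			.reset			= hyp_reset,
		};

		remote.desc = desc;

		/* The returned buffer is read-only: record_disabled is set */
		return ring_buffer_alloc_remote(&remote);
	}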
void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size)
@@ -5274,10 +5345,61 @@ unsigned long ring_buffer_overruns(struct trace_buffer *buffer)
}
EXPORT_SYMBOL_GPL(ring_buffer_overruns);
+static bool rb_read_remote_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ local_set(&cpu_buffer->entries, READ_ONCE(cpu_buffer->meta_page->entries));
+ local_set(&cpu_buffer->overrun, READ_ONCE(cpu_buffer->meta_page->overrun));
+ local_set(&cpu_buffer->pages_touched, READ_ONCE(cpu_buffer->meta_page->pages_touched));
+ local_set(&cpu_buffer->pages_lost, READ_ONCE(cpu_buffer->meta_page->pages_lost));
+
+ return rb_num_of_entries(cpu_buffer);
+}
+
+static void rb_update_remote_head(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *next, *orig;
+ int retry = 3;
+
+ orig = next = cpu_buffer->head_page;
+ rb_inc_page(&next);
+
+ /* Run after the writer */
+ while (cpu_buffer->head_page->page->time_stamp > next->page->time_stamp) {
+ rb_inc_page(&next);
+
+ rb_list_head_clear(cpu_buffer->head_page->list.prev);
+ rb_inc_page(&cpu_buffer->head_page);
+ rb_set_list_to_head(cpu_buffer->head_page->list.prev);
+
+ if (cpu_buffer->head_page == orig) {
+ if (WARN_ON_ONCE(!(--retry)))
+ return;
+ }
+ }
+
+ orig = cpu_buffer->commit_page = cpu_buffer->head_page;
+ retry = 3;
+
+ while (cpu_buffer->commit_page->page->time_stamp < next->page->time_stamp) {
+ rb_inc_page(&next);
+ rb_inc_page(&cpu_buffer->commit_page);
+
+ if (cpu_buffer->commit_page == orig) {
+ if (WARN_ON_ONCE(!(--retry)))
+ return;
+ }
+ }
+}
+
static void rb_iter_reset(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+ if (cpu_buffer->remote) {
+ rb_read_remote_meta_page(cpu_buffer);
+ rb_update_remote_head(cpu_buffer);
+ }
+
/* Iterator usage is expected to have record disabled */
iter->head_page = cpu_buffer->reader_page;
iter->head = cpu_buffer->reader_page->read;
@@ -5428,7 +5550,65 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
}
static struct buffer_page *
-rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
+__rb_get_reader_page_from_remote(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *new_reader, *prev_reader, *prev_head, *new_head, *last;
+
+ if (!rb_read_remote_meta_page(cpu_buffer))
+ return NULL;
+
+ /* More to read on the reader page */
+ if (cpu_buffer->reader_page->read < rb_page_size(cpu_buffer->reader_page)) {
+ if (!cpu_buffer->reader_page->read)
+ cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
+ return cpu_buffer->reader_page;
+ }
+
+ prev_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id];
+
+ WARN_ON_ONCE(cpu_buffer->remote->swap_reader_page(cpu_buffer->cpu,
+ cpu_buffer->remote->priv));
+ /* nr_pages doesn't include the reader page */
+ if (WARN_ON_ONCE(cpu_buffer->meta_page->reader.id > cpu_buffer->nr_pages))
+ return NULL;
+
+ new_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id];
+
+ WARN_ON_ONCE(prev_reader == new_reader);
+
+ prev_head = new_reader; /* New reader was also the previous head */
+ new_head = prev_head;
+ rb_inc_page(&new_head);
+ last = prev_head;
+ rb_dec_page(&last);
+
+ /* Clear the old HEAD flag */
+ rb_list_head_clear(cpu_buffer->head_page->list.prev);
+
+ prev_reader->list.next = prev_head->list.next;
+ prev_reader->list.prev = prev_head->list.prev;
+
+ /* Swap prev_reader with new_reader */
+ last->list.next = &prev_reader->list;
+ new_head->list.prev = &prev_reader->list;
+
+ new_reader->list.prev = &new_reader->list;
+ new_reader->list.next = &new_head->list;
+
+ /* Reactivate the HEAD flag */
+ rb_set_list_to_head(&last->list);
+
+ cpu_buffer->head_page = new_head;
+ cpu_buffer->reader_page = new_reader;
+ cpu_buffer->pages = &new_head->list;
+ cpu_buffer->read_stamp = new_reader->page->time_stamp;
+ cpu_buffer->lost_events = cpu_buffer->meta_page->reader.lost_events;
+
+ return rb_page_size(cpu_buffer->reader_page) ? cpu_buffer->reader_page : NULL;
+}
+
+static struct buffer_page *
+__rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = NULL;
unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
@@ -5598,6 +5778,13 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
return reader;
}
+static struct buffer_page *
+rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return cpu_buffer->remote ? __rb_get_reader_page_from_remote(cpu_buffer) :
+ __rb_get_reader_page(cpu_buffer);
+}
+
static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
{
struct ring_buffer_event *event;
@@ -6154,6 +6341,8 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
meta->entries = local_read(&cpu_buffer->entries);
meta->overrun = local_read(&cpu_buffer->overrun);
meta->read = cpu_buffer->read;
+ meta->pages_lost = local_read(&cpu_buffer->pages_lost);
+ meta->pages_touched = local_read(&cpu_buffer->pages_touched);
/* Some archs do not have data cache coherency between kernel and user-space */
flush_kernel_vmap_range(cpu_buffer->meta_page, PAGE_SIZE);
@@ -6164,6 +6353,23 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *page;
+ if (cpu_buffer->remote) {
+ if (!cpu_buffer->remote->reset)
+ return;
+
+ cpu_buffer->remote->reset(cpu_buffer->cpu, cpu_buffer->remote->priv);
+ rb_read_remote_meta_page(cpu_buffer);
+
+ /* Read related values, not covered by the meta-page */
+ local_set(&cpu_buffer->pages_read, 0);
+ cpu_buffer->read = 0;
+ cpu_buffer->read_bytes = 0;
+ cpu_buffer->last_overrun = 0;
+ cpu_buffer->reader_page->read = 0;
+
+ return;
+ }
+
rb_head_page_deactivate(cpu_buffer);
cpu_buffer->head_page
@@ -6394,6 +6600,46 @@ bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu)
}
EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
+int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (cpu != RING_BUFFER_ALL_CPUS) {
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -EINVAL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ guard(raw_spinlock)(&cpu_buffer->reader_lock);
+ if (rb_read_remote_meta_page(cpu_buffer))
+ rb_wakeups(buffer, cpu_buffer);
+
+ return 0;
+ }
+
+ guard(cpus_read_lock)();
+
+ /*
+ * Make sure all the ring buffers are up to date before we start reading
+ * them.
+ */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ guard(raw_spinlock)(&cpu_buffer->reader_lock);
+ rb_read_remote_meta_page(cpu_buffer);
+ }
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ if (rb_num_of_entries(cpu_buffer))
+ rb_wakeups(buffer, cpu_buffer);
+ }
+
+ return 0;
+}
+
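
A minimal sketch of driving the polling API above from a periodic delayed work; my_trace_buffer is a placeholder (trace_remote.c, later in this patch, wires this up for real in __poll_remote()):

	static struct trace_buffer *my_trace_buffer;

	static void my_poll(struct work_struct *work)
	{
		struct delayed_work *dwork = to_delayed_work(work);

		ring_buffer_poll_remote(my_trace_buffer, RING_BUFFER_ALL_CPUS);
		schedule_delayed_work(dwork, msecs_to_jiffies(100));
	}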
#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
/**
* ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
@@ -6632,6 +6878,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
unsigned int commit;
unsigned int read;
u64 save_timestamp;
+ bool force_memcpy;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return -1;
@@ -6669,6 +6916,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
/* Check if any events were dropped */
missed_events = cpu_buffer->lost_events;
+ force_memcpy = cpu_buffer->mapped || cpu_buffer->remote;
+
/*
* If this page has been partially read or
* if len is not big enough to read the rest of the page or
@@ -6678,7 +6927,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
*/
if (read || (len < (commit - read)) ||
cpu_buffer->reader_page == cpu_buffer->commit_page ||
- cpu_buffer->mapped) {
+ force_memcpy) {
struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
unsigned int rpos = read;
unsigned int pos = 0;
@@ -7034,7 +7283,7 @@ static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
}
static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned long *subbuf_ids)
+ struct buffer_page **subbuf_ids)
{
struct trace_buffer_meta *meta = cpu_buffer->meta_page;
unsigned int nr_subbufs = cpu_buffer->nr_pages + 1;
@@ -7043,7 +7292,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
int id = 0;
id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, id);
- subbuf_ids[id++] = (unsigned long)cpu_buffer->reader_page->page;
+ subbuf_ids[id++] = cpu_buffer->reader_page;
cnt++;
first_subbuf = subbuf = rb_set_head_page(cpu_buffer);
@@ -7053,7 +7302,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
if (WARN_ON(id >= nr_subbufs))
break;
- subbuf_ids[id] = (unsigned long)subbuf->page;
+ subbuf_ids[id] = subbuf;
rb_inc_page(&subbuf);
id++;
@@ -7062,7 +7311,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
WARN_ON(cnt != nr_subbufs);
- /* install subbuf ID to kern VA translation */
+ /* install subbuf ID to bpage translation */
cpu_buffer->subbuf_ids = subbuf_ids;
meta->meta_struct_len = sizeof(*meta);
@@ -7218,13 +7467,15 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
}
while (p < nr_pages) {
+ struct buffer_page *subbuf;
struct page *page;
int off = 0;
if (WARN_ON_ONCE(s >= nr_subbufs))
return -EINVAL;
- page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]);
+ subbuf = cpu_buffer->subbuf_ids[s];
+ page = virt_to_page((void *)subbuf->page);
for (; off < (1 << (subbuf_order)); off++, page++) {
if (p >= nr_pages)
@@ -7251,10 +7502,11 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
struct vm_area_struct *vma)
{
struct ring_buffer_per_cpu *cpu_buffer;
- unsigned long flags, *subbuf_ids;
+ struct buffer_page **subbuf_ids;
+ unsigned long flags;
int err;
- if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ if (!cpumask_test_cpu(cpu, buffer->cpumask) || buffer->remote)
return -EINVAL;
cpu_buffer = buffer->buffers[cpu];
@@ -7275,7 +7527,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
if (err)
return err;
- /* subbuf_ids include the reader while nr_pages does not */
+ /* subbuf_ids includes the reader while nr_pages does not */
subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL);
if (!subbuf_ids) {
rb_free_meta_page(cpu_buffer);
diff --git a/kernel/trace/simple_ring_buffer.c b/kernel/trace/simple_ring_buffer.c
new file mode 100644
index 000000000000..02af2297ae5a
--- /dev/null
+++ b/kernel/trace/simple_ring_buffer.c
@@ -0,0 +1,517 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 - Google LLC
+ * Author: Vincent Donnefort <vdonnefort@google.com>
+ */
+
+#include <linux/atomic.h>
+#include <linux/simple_ring_buffer.h>
+
+#include <asm/barrier.h>
+#include <asm/local.h>
+
+enum simple_rb_link_type {
+ SIMPLE_RB_LINK_NORMAL = 0,
+ SIMPLE_RB_LINK_HEAD = 1,
+ SIMPLE_RB_LINK_HEAD_MOVING
+};
+
+#define SIMPLE_RB_LINK_MASK ~(SIMPLE_RB_LINK_HEAD | SIMPLE_RB_LINK_HEAD_MOVING)
+
+static void simple_bpage_set_head_link(struct simple_buffer_page *bpage)
+{
+ unsigned long link = (unsigned long)bpage->link.next;
+
+ link &= SIMPLE_RB_LINK_MASK;
+ link |= SIMPLE_RB_LINK_HEAD;
+
+ /*
+ * Paired with simple_rb_find_head() to order access between the head
+ * link and overrun. It ensures we always report an up-to-date value
+ * after swapping the reader page.
+ */
+ smp_store_release(&bpage->link.next, (struct list_head *)link);
+}
+
+static bool simple_bpage_unset_head_link(struct simple_buffer_page *bpage,
+ struct simple_buffer_page *dst,
+ enum simple_rb_link_type new_type)
+{
+ unsigned long *link = (unsigned long *)(&bpage->link.next);
+ unsigned long old = (*link & SIMPLE_RB_LINK_MASK) | SIMPLE_RB_LINK_HEAD;
+ unsigned long new = (unsigned long)(&dst->link) | new_type;
+
+ return try_cmpxchg(link, &old, new);
+}
+
+static void simple_bpage_set_normal_link(struct simple_buffer_page *bpage)
+{
+ unsigned long link = (unsigned long)bpage->link.next;
+
+ WRITE_ONCE(bpage->link.next, (struct list_head *)(link & SIMPLE_RB_LINK_MASK));
+}
+
+static struct simple_buffer_page *simple_bpage_from_link(struct list_head *link)
+{
+ unsigned long ptr = (unsigned long)link & SIMPLE_RB_LINK_MASK;
+
+ return container_of((struct list_head *)ptr, struct simple_buffer_page, link);
+}
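
The head flags live in the low bits of the ->next pointer, which are free because list_head allocations are word-aligned. A stand-alone user-space illustration of the tagging scheme (editor's sketch, independent of the kernel types above):

	#include <assert.h>
	#include <stdint.h>

	#define LINK_HEAD	1UL
	#define LINK_MOVING	2UL
	#define LINK_MASK	(~(LINK_HEAD | LINK_MOVING))

	int main(void)
	{
		uint64_t slot;			/* stands in for an 8-byte aligned list_head */
		uintptr_t link = (uintptr_t)&slot;

		assert(!(link & ~LINK_MASK));	/* alignment leaves the low bits free */

		link |= LINK_HEAD;		/* tag the link as pointing to the head */
		assert((link & ~LINK_MASK) == LINK_HEAD);

		/* Recover the untagged pointer, as simple_bpage_from_link() does */
		assert((void *)(link & LINK_MASK) == (void *)&slot);

		return 0;
	}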
+
+static struct simple_buffer_page *simple_bpage_next_page(struct simple_buffer_page *bpage)
+{
+ return simple_bpage_from_link(bpage->link.next);
+}
+
+static void simple_bpage_reset(struct simple_buffer_page *bpage)
+{
+ bpage->write = 0;
+ bpage->entries = 0;
+
+ local_set(&bpage->page->commit, 0);
+}
+
+static void simple_bpage_init(struct simple_buffer_page *bpage, void *page)
+{
+ INIT_LIST_HEAD(&bpage->link);
+ bpage->page = (struct buffer_data_page *)page;
+
+ simple_bpage_reset(bpage);
+}
+
+#define simple_rb_meta_inc(__meta, __inc) \
+ WRITE_ONCE((__meta), (__meta) + (__inc))
+
+static bool simple_rb_loaded(struct simple_rb_per_cpu *cpu_buffer)
+{
+ return !!cpu_buffer->bpages;
+}
+
+static int simple_rb_find_head(struct simple_rb_per_cpu *cpu_buffer)
+{
+ int retry = cpu_buffer->nr_pages * 2;
+ struct simple_buffer_page *head;
+
+ head = cpu_buffer->head_page;
+
+ while (retry--) {
+ unsigned long link;
+
+spin:
+ /* See smp_store_release in simple_bpage_set_head_link() */
+ link = (unsigned long)smp_load_acquire(&head->link.prev->next);
+
+ switch (link & ~SIMPLE_RB_LINK_MASK) {
+ /* Found the head */
+ case SIMPLE_RB_LINK_HEAD:
+ cpu_buffer->head_page = head;
+ return 0;
+ /* The writer is moving the head; spin, it won't be long */
+ case SIMPLE_RB_LINK_HEAD_MOVING:
+ goto spin;
+ }
+
+ head = simple_bpage_next_page(head);
+ }
+
+ return -EBUSY;
+}
+
+/**
+ * simple_ring_buffer_swap_reader_page - Swap ring-buffer head with the reader
+ * @cpu_buffer: A simple_rb_per_cpu
+ *
+ * This function enables consuming reads. It ensures the current head page
+ * will not be overwritten and can be safely read.
+ *
+ * Returns 0 on success, -ENODEV if @cpu_buffer was unloaded, -EBUSY if the
+ * head page couldn't be caught or -EINVAL if the reader page couldn't be
+ * installed.
+ */
+int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer)
+{
+ struct simple_buffer_page *last, *head, *reader;
+ unsigned long overrun;
+ int retry = 8;
+ int ret;
+
+ if (!simple_rb_loaded(cpu_buffer))
+ return -ENODEV;
+
+ reader = cpu_buffer->reader_page;
+
+ do {
+ /* Run after the writer to find the head */
+ ret = simple_rb_find_head(cpu_buffer);
+ if (ret)
+ return ret;
+
+ head = cpu_buffer->head_page;
+
+ /* Connect the reader page around the header page */
+ reader->link.next = head->link.next;
+ reader->link.prev = head->link.prev;
+
+ /* The last page before the head */
+ last = simple_bpage_from_link(head->link.prev);
+
+ /* The reader page points to the new header page */
+ simple_bpage_set_head_link(reader);
+
+ overrun = cpu_buffer->meta->overrun;
+ } while (!simple_bpage_unset_head_link(last, reader, SIMPLE_RB_LINK_NORMAL) && retry--);
+
+ if (retry < 0)
+ return -EINVAL;
+
+ cpu_buffer->head_page = simple_bpage_from_link(reader->link.next);
+ cpu_buffer->head_page->link.prev = &reader->link;
+ cpu_buffer->reader_page = head;
+ cpu_buffer->meta->reader.lost_events = overrun - cpu_buffer->last_overrun;
+ cpu_buffer->meta->reader.id = cpu_buffer->reader_page->id;
+ cpu_buffer->last_overrun = overrun;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_swap_reader_page);
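
On the writer side, a sketch of the handler that would service the reader's swap_reader_page() callback; the per-CPU array and the hypercall plumbing are hypothetical, only the API above is assumed:

	static struct simple_rb_per_cpu hyp_buffers[NR_CPUS];

	/* Hypothetical entry point reached from the reader's swap_reader_page() */
	static int handle_swap_reader_page(unsigned int cpu)
	{
		if (cpu >= NR_CPUS)
			return -EINVAL;

		return simple_ring_buffer_swap_reader_page(&hyp_buffers[cpu]);
	}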
+
+static struct simple_buffer_page *simple_rb_move_tail(struct simple_rb_per_cpu *cpu_buffer)
+{
+ struct simple_buffer_page *tail, *new_tail;
+
+ tail = cpu_buffer->tail_page;
+ new_tail = simple_bpage_next_page(tail);
+
+ if (simple_bpage_unset_head_link(tail, new_tail, SIMPLE_RB_LINK_HEAD_MOVING)) {
+ /*
+ * Oh no! We've caught the head. There is none anymore and
+ * swap_reader will spin until we set the new one. Overrun must
+ * be written first, to make sure we report the correct number
+ * of lost events.
+ */
+ simple_rb_meta_inc(cpu_buffer->meta->overrun, new_tail->entries);
+ simple_rb_meta_inc(cpu_buffer->meta->pages_lost, 1);
+
+ simple_bpage_set_head_link(new_tail);
+ simple_bpage_set_normal_link(tail);
+ }
+
+ simple_bpage_reset(new_tail);
+ cpu_buffer->tail_page = new_tail;
+
+ simple_rb_meta_inc(cpu_buffer->meta->pages_touched, 1);
+
+ return new_tail;
+}
+
+static unsigned long rb_event_size(unsigned long length)
+{
+ struct ring_buffer_event *event;
+
+ return length + RB_EVNT_HDR_SIZE + sizeof(event->array[0]);
+}
+
+static struct ring_buffer_event *
+rb_event_add_ts_extend(struct ring_buffer_event *event, u64 delta)
+{
+ event->type_len = RINGBUF_TYPE_TIME_EXTEND;
+ event->time_delta = delta & TS_MASK;
+ event->array[0] = delta >> TS_SHIFT;
+
+ return (struct ring_buffer_event *)((unsigned long)event + 8);
+}
+
+static struct ring_buffer_event *
+simple_rb_reserve_next(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, u64 timestamp)
+{
+ unsigned long ts_ext_size = 0, event_size = rb_event_size(length);
+ struct simple_buffer_page *tail = cpu_buffer->tail_page;
+ struct ring_buffer_event *event;
+ u32 write, prev_write;
+ u64 time_delta;
+
+ time_delta = timestamp - cpu_buffer->write_stamp;
+
+ if (test_time_stamp(time_delta))
+ ts_ext_size = 8;
+
+ prev_write = tail->write;
+ write = prev_write + event_size + ts_ext_size;
+
+ if (unlikely(write > (PAGE_SIZE - BUF_PAGE_HDR_SIZE)))
+ tail = simple_rb_move_tail(cpu_buffer);
+
+ if (!tail->entries) {
+ tail->page->time_stamp = timestamp;
+ time_delta = 0;
+ ts_ext_size = 0;
+ write = event_size;
+ prev_write = 0;
+ }
+
+ tail->write = write;
+ tail->entries++;
+
+ cpu_buffer->write_stamp = timestamp;
+
+ event = (struct ring_buffer_event *)(tail->page->data + prev_write);
+ if (ts_ext_size) {
+ event = rb_event_add_ts_extend(event, time_delta);
+ time_delta = 0;
+ }
+
+ event->type_len = 0;
+ event->time_delta = time_delta;
+ event->array[0] = event_size - RB_EVNT_HDR_SIZE;
+
+ return event;
+}
+
+/**
+ * simple_ring_buffer_reserve - Reserve an entry in @cpu_buffer
+ * @cpu_buffer: A simple_rb_per_cpu
+ * @length: Size of the entry in bytes
+ * @timestamp: Timestamp of the entry
+ *
+ * Returns the address where the entry data can be written, or NULL
+ */
+void *simple_ring_buffer_reserve(struct simple_rb_per_cpu *cpu_buffer, unsigned long length,
+ u64 timestamp)
+{
+ struct ring_buffer_event *rb_event;
+
+ if (cmpxchg(&cpu_buffer->status, SIMPLE_RB_READY, SIMPLE_RB_WRITING) != SIMPLE_RB_READY)
+ return NULL;
+
+ rb_event = simple_rb_reserve_next(cpu_buffer, length, timestamp);
+
+ return &rb_event->array[1];
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_reserve);
+
+/**
+ * simple_ring_buffer_commit - Commit the entry reserved with simple_ring_buffer_reserve()
+ * @cpu_buffer: The simple_rb_per_cpu where the entry has been reserved
+ */
+void simple_ring_buffer_commit(struct simple_rb_per_cpu *cpu_buffer)
+{
+ local_set(&cpu_buffer->tail_page->page->commit,
+ cpu_buffer->tail_page->write);
+ simple_rb_meta_inc(cpu_buffer->meta->entries, 1);
+
+ /*
+ * Paired with simple_rb_enable_tracing() to ensure data is
+ * written to the ring-buffer before teardown.
+ */
+ smp_store_release(&cpu_buffer->status, SIMPLE_RB_READY);
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_commit);
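
Writing an event is a reserve/commit pair. A minimal sketch, assuming a caller that owns a loaded simple_rb_per_cpu; the my_event layout is hypothetical and trace_clock() stands in for whatever clock the writer shares with its reader:

	struct my_event {
		u64 val;
	};

	static void emit_my_event(struct simple_rb_per_cpu *cpu_buffer, u64 val)
	{
		struct my_event *evt;

		evt = simple_ring_buffer_reserve(cpu_buffer, sizeof(*evt),
						 trace_clock());
		if (!evt)
			return; /* Unavailable, or the buffer is being torn down */

		evt->val = val;
		simple_ring_buffer_commit(cpu_buffer);
	}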
+
+static u32 simple_rb_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable)
+{
+ u32 prev_status;
+
+ if (enable)
+ return cmpxchg(&cpu_buffer->status, SIMPLE_RB_UNAVAILABLE, SIMPLE_RB_READY);
+
+ /* Wait for the buffer to be released */
+ do {
+ prev_status = cmpxchg_acquire(&cpu_buffer->status,
+ SIMPLE_RB_READY,
+ SIMPLE_RB_UNAVAILABLE);
+ } while (prev_status == SIMPLE_RB_WRITING);
+
+ return prev_status;
+}
+
+/**
+ * simple_ring_buffer_reset - Reset @cpu_buffer
+ * @cpu_buffer: A simple_rb_per_cpu
+ *
+ * This does not clear the data, it only resets counters and pointers.
+ *
+ * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded.
+ */
+int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer)
+{
+ struct simple_buffer_page *bpage;
+ u32 prev_status;
+ int ret;
+
+ if (!simple_rb_loaded(cpu_buffer))
+ return -ENODEV;
+
+ prev_status = simple_rb_enable_tracing(cpu_buffer, false);
+
+ ret = simple_rb_find_head(cpu_buffer);
+ if (ret)
+ return ret;
+
+ bpage = cpu_buffer->tail_page = cpu_buffer->head_page;
+ do {
+ simple_bpage_reset(bpage);
+ bpage = simple_bpage_next_page(bpage);
+ } while (bpage != cpu_buffer->head_page);
+
+ simple_bpage_reset(cpu_buffer->reader_page);
+
+ cpu_buffer->last_overrun = 0;
+ cpu_buffer->write_stamp = 0;
+
+ cpu_buffer->meta->reader.read = 0;
+ cpu_buffer->meta->reader.lost_events = 0;
+ cpu_buffer->meta->entries = 0;
+ cpu_buffer->meta->overrun = 0;
+ cpu_buffer->meta->read = 0;
+ cpu_buffer->meta->pages_lost = 0;
+ cpu_buffer->meta->pages_touched = 0;
+
+ if (prev_status == SIMPLE_RB_READY)
+ simple_rb_enable_tracing(cpu_buffer, true);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_reset);
+
+int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer,
+ struct simple_buffer_page *bpages,
+ const struct ring_buffer_desc *desc,
+ void *(*load_page)(unsigned long va),
+ void (*unload_page)(void *va))
+{
+ struct simple_buffer_page *bpage = bpages;
+ int ret = 0;
+ void *page;
+ int i;
+
+ /* At least one reader page and two pages in the ring-buffer */
+ if (desc->nr_page_va < 3)
+ return -EINVAL;
+
+ memset(cpu_buffer, 0, sizeof(*cpu_buffer));
+
+ cpu_buffer->meta = load_page(desc->meta_va);
+ if (!cpu_buffer->meta)
+ return -EINVAL;
+
+ memset(cpu_buffer->meta, 0, sizeof(*cpu_buffer->meta));
+ cpu_buffer->meta->meta_page_size = PAGE_SIZE;
+ cpu_buffer->meta->nr_subbufs = desc->nr_page_va;
+
+ /* The reader page is not part of the ring initially */
+ page = load_page(desc->page_va[0]);
+ if (!page) {
+ unload_page(cpu_buffer->meta);
+ return -EINVAL;
+ }
+
+ simple_bpage_init(bpage, page);
+ bpage->id = 0;
+
+ cpu_buffer->nr_pages = 1;
+
+ cpu_buffer->reader_page = bpage;
+ cpu_buffer->tail_page = bpage + 1;
+ cpu_buffer->head_page = bpage + 1;
+
+ for (i = 1; i < desc->nr_page_va; i++) {
+ page = load_page(desc->page_va[i]);
+ if (!page) {
+ ret = -EINVAL;
+ break;
+ }
+
+ simple_bpage_init(++bpage, page);
+
+ bpage->link.next = &(bpage + 1)->link;
+ bpage->link.prev = &(bpage - 1)->link;
+ bpage->id = i;
+
+ cpu_buffer->nr_pages = i + 1;
+ }
+
+ if (ret) {
+ for (i--; i >= 0; i--)
+ unload_page(bpages[i].page);
+ unload_page(cpu_buffer->meta);
+
+ return ret;
+ }
+
+ /* Close the ring */
+ bpage->link.next = &cpu_buffer->tail_page->link;
+ cpu_buffer->tail_page->link.prev = &bpage->link;
+
+ /* The last init'ed page points to the head page */
+ simple_bpage_set_head_link(bpage);
+
+ cpu_buffer->bpages = bpages;
+
+ return 0;
+}
+
+static void *__load_page(unsigned long page)
+{
+ return (void *)page;
+}
+
+static void __unload_page(void *page) { }
+
+/**
+ * simple_ring_buffer_init - Init @cpu_buffer based on @desc
+ * @cpu_buffer: A simple_rb_per_cpu buffer to init, allocated by the caller.
+ * @bpages: Array of simple_buffer_pages, with as many elements as @desc->nr_page_va
+ * @desc: A ring_buffer_desc
+ *
+ * Returns 0 on success or -EINVAL if the content of @desc is invalid
+ */
+int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages,
+ const struct ring_buffer_desc *desc)
+{
+ return simple_ring_buffer_init_mm(cpu_buffer, bpages, desc, __load_page, __unload_page);
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_init);
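
A sketch of loading one per-CPU buffer from a descriptor; per the kernel-doc above, the caller provides the simple_buffer_page array, sized from desc->nr_page_va:

	static struct simple_rb_per_cpu *load_cpu_buffer(const struct ring_buffer_desc *desc)
	{
		struct simple_rb_per_cpu *cpu_buffer;
		struct simple_buffer_page *bpages;

		cpu_buffer = kzalloc(sizeof(*cpu_buffer), GFP_KERNEL);
		if (!cpu_buffer)
			return NULL;

		bpages = kcalloc(desc->nr_page_va, sizeof(*bpages), GFP_KERNEL);
		if (!bpages) {
			kfree(cpu_buffer);
			return NULL;
		}

		if (simple_ring_buffer_init(cpu_buffer, bpages, desc)) {
			kfree(bpages);
			kfree(cpu_buffer);
			return NULL;
		}

		return cpu_buffer;
	}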
+
+void simple_ring_buffer_unload_mm(struct simple_rb_per_cpu *cpu_buffer,
+ void (*unload_page)(void *))
+{
+ int p;
+
+ if (!simple_rb_loaded(cpu_buffer))
+ return;
+
+ simple_rb_enable_tracing(cpu_buffer, false);
+
+ unload_page(cpu_buffer->meta);
+ for (p = 0; p < cpu_buffer->nr_pages; p++)
+ unload_page(cpu_buffer->bpages[p].page);
+
+ cpu_buffer->bpages = NULL;
+}
+
+/**
+ * simple_ring_buffer_unload - Prepare @cpu_buffer for deletion
+ * @cpu_buffer: A simple_rb_per_cpu that will be deleted.
+ */
+void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer)
+{
+ simple_ring_buffer_unload_mm(cpu_buffer, __unload_page);
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_unload);
+
+/**
+ * simple_ring_buffer_enable_tracing - Enable or disable writing to @cpu_buffer
+ * @cpu_buffer: A simple_rb_per_cpu
+ * @enable: true to enable tracing, false to disable it
+ *
+ * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded
+ */
+int simple_ring_buffer_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable)
+{
+ if (!simple_rb_loaded(cpu_buffer))
+ return -ENODEV;
+
+ simple_rb_enable_tracing(cpu_buffer, enable);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_enable_tracing);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a626211ceb9a..4189ec9df6a5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3856,7 +3856,7 @@ static int s_show(struct seq_file *m, void *v)
* Should be used after trace_array_get(), trace_types_lock
* ensures that i_cdev was already initialized.
*/
-static inline int tracing_get_cpu(struct inode *inode)
+int tracing_get_cpu(struct inode *inode)
{
if (inode->i_cdev) /* See trace_create_cpu_file() */
return (long)inode->i_cdev - 1;
@@ -8606,7 +8606,7 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
return tr->percpu_dir;
}
-static struct dentry *
+struct dentry *
trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent,
void *data, long cpu, const struct file_operations *fops)
{
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b8f3804586a0..fbbd0b2ee76f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -689,6 +689,13 @@ struct dentry *trace_create_file(const char *name,
struct dentry *parent,
void *data,
const struct file_operations *fops);
+struct dentry *trace_create_cpu_file(const char *name,
+ umode_t mode,
+ struct dentry *parent,
+ void *data,
+ long cpu,
+ const struct file_operations *fops);
+int tracing_get_cpu(struct inode *inode);
/**
diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
new file mode 100644
index 000000000000..d6c3f94d67cd
--- /dev/null
+++ b/kernel/trace/trace_remote.c
@@ -0,0 +1,1384 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 - Google LLC
+ * Author: Vincent Donnefort <vdonnefort@google.com>
+ */
+
+#include <linux/kstrtox.h>
+#include <linux/lockdep.h>
+#include <linux/mutex.h>
+#include <linux/tracefs.h>
+#include <linux/trace_remote.h>
+#include <linux/trace_seq.h>
+#include <linux/types.h>
+
+#include "trace.h"
+
+#define TRACEFS_DIR "remotes"
+#define TRACEFS_MODE_WRITE 0640
+#define TRACEFS_MODE_READ 0440
+
+enum tri_type {
+ TRI_CONSUMING,
+ TRI_NONCONSUMING,
+};
+
+struct trace_remote_iterator {
+ struct trace_remote *remote;
+ struct trace_seq seq;
+ struct delayed_work poll_work;
+ unsigned long lost_events;
+ u64 ts;
+ struct ring_buffer_iter *rb_iter;
+ struct ring_buffer_iter **rb_iters;
+ struct remote_event_hdr *evt;
+ int cpu;
+ int evt_cpu;
+ loff_t pos;
+ enum tri_type type;
+};
+
+struct trace_remote {
+ struct trace_remote_callbacks *cbs;
+ void *priv;
+ struct trace_buffer *trace_buffer;
+ struct trace_buffer_desc *trace_buffer_desc;
+ struct dentry *dentry;
+ struct eventfs_inode *eventfs;
+ struct remote_event *events;
+ unsigned long nr_events;
+ unsigned long trace_buffer_size;
+ struct ring_buffer_remote rb_remote;
+ struct mutex lock;
+ struct rw_semaphore reader_lock;
+ struct rw_semaphore *pcpu_reader_locks;
+ unsigned int nr_readers;
+ unsigned int poll_ms;
+ bool tracing_on;
+};
+
+static bool trace_remote_loaded(struct trace_remote *remote)
+{
+ return !!remote->trace_buffer;
+}
+
+static int trace_remote_load(struct trace_remote *remote)
+{
+ struct ring_buffer_remote *rb_remote = &remote->rb_remote;
+ struct trace_buffer_desc *desc;
+
+ lockdep_assert_held(&remote->lock);
+
+ if (trace_remote_loaded(remote))
+ return 0;
+
+ desc = remote->cbs->load_trace_buffer(remote->trace_buffer_size, remote->priv);
+ if (IS_ERR(desc))
+ return PTR_ERR(desc);
+
+ rb_remote->desc = desc;
+ rb_remote->swap_reader_page = remote->cbs->swap_reader_page;
+ rb_remote->priv = remote->priv;
+ rb_remote->reset = remote->cbs->reset;
+ remote->trace_buffer = ring_buffer_alloc_remote(rb_remote);
+ if (!remote->trace_buffer) {
+ remote->cbs->unload_trace_buffer(desc, remote->priv);
+ return -ENOMEM;
+ }
+
+ remote->trace_buffer_desc = desc;
+
+ return 0;
+}
+
+static void trace_remote_try_unload(struct trace_remote *remote)
+{
+ lockdep_assert_held(&remote->lock);
+
+ if (!trace_remote_loaded(remote))
+ return;
+
+ /* The buffer is still being read or remains writable */
+ if (remote->nr_readers || remote->tracing_on)
+ return;
+
+ /* The buffer has readable data */
+ if (!ring_buffer_empty(remote->trace_buffer))
+ return;
+
+ ring_buffer_free(remote->trace_buffer);
+ remote->trace_buffer = NULL;
+ remote->cbs->unload_trace_buffer(remote->trace_buffer_desc, remote->priv);
+}
+
+static int trace_remote_enable_tracing(struct trace_remote *remote)
+{
+ int ret;
+
+ lockdep_assert_held(&remote->lock);
+
+ if (remote->tracing_on)
+ return 0;
+
+ ret = trace_remote_load(remote);
+ if (ret)
+ return ret;
+
+ ret = remote->cbs->enable_tracing(true, remote->priv);
+ if (ret) {
+ trace_remote_try_unload(remote);
+ return ret;
+ }
+
+ remote->tracing_on = true;
+
+ return 0;
+}
+
+static int trace_remote_disable_tracing(struct trace_remote *remote)
+{
+ int ret;
+
+ lockdep_assert_held(&remote->lock);
+
+ if (!remote->tracing_on)
+ return 0;
+
+ ret = remote->cbs->enable_tracing(false, remote->priv);
+ if (ret)
+ return ret;
+
+ ring_buffer_poll_remote(remote->trace_buffer, RING_BUFFER_ALL_CPUS);
+ remote->tracing_on = false;
+ trace_remote_try_unload(remote);
+
+ return 0;
+}
+
+static void trace_remote_reset(struct trace_remote *remote, int cpu)
+{
+ lockdep_assert_held(&remote->lock);
+
+ if (!trace_remote_loaded(remote))
+ return;
+
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ ring_buffer_reset(remote->trace_buffer);
+ else
+ ring_buffer_reset_cpu(remote->trace_buffer, cpu);
+
+ trace_remote_try_unload(remote);
+}
+
+static ssize_t
+tracing_on_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct seq_file *seq = filp->private_data;
+ struct trace_remote *remote = seq->private;
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ guard(mutex)(&remote->lock);
+
+ ret = val ? trace_remote_enable_tracing(remote) : trace_remote_disable_tracing(remote);
+ if (ret)
+ return ret;
+
+ return cnt;
+}
+
+static int tracing_on_show(struct seq_file *s, void *unused)
+{
+ struct trace_remote *remote = s->private;
+
+ seq_printf(s, "%d\n", remote->tracing_on);
+
+ return 0;
+}
+DEFINE_SHOW_STORE_ATTRIBUTE(tracing_on);
+
+static ssize_t buffer_size_kb_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct seq_file *seq = filp->private_data;
+ struct trace_remote *remote = seq->private;
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ /* KiB to Bytes */
+ if (!val || check_shl_overflow(val, 10, &val))
+ return -EINVAL;
+
+ guard(mutex)(&remote->lock);
+
+ if (trace_remote_loaded(remote))
+ return -EBUSY;
+
+ remote->trace_buffer_size = val;
+
+ return cnt;
+}
+
+static int buffer_size_kb_show(struct seq_file *s, void *unused)
+{
+ struct trace_remote *remote = s->private;
+
+ seq_printf(s, "%lu (%s)\n", remote->trace_buffer_size >> 10,
+ trace_remote_loaded(remote) ? "loaded" : "unloaded");
+
+ return 0;
+}
+DEFINE_SHOW_STORE_ATTRIBUTE(buffer_size_kb);
+
+static int trace_remote_get(struct trace_remote *remote, int cpu)
+{
+ int ret;
+
+ if (remote->nr_readers == UINT_MAX)
+ return -EBUSY;
+
+ ret = trace_remote_load(remote);
+ if (ret)
+ return ret;
+
+ if (cpu != RING_BUFFER_ALL_CPUS && !remote->pcpu_reader_locks) {
+ int lock_cpu;
+
+ remote->pcpu_reader_locks = kcalloc(nr_cpu_ids, sizeof(*remote->pcpu_reader_locks),
+ GFP_KERNEL);
+ if (!remote->pcpu_reader_locks) {
+ trace_remote_try_unload(remote);
+ return -ENOMEM;
+ }
+
+ for_each_possible_cpu(lock_cpu)
+ init_rwsem(&remote->pcpu_reader_locks[lock_cpu]);
+ }
+
+ remote->nr_readers++;
+
+ return 0;
+}
+
+static void trace_remote_put(struct trace_remote *remote)
+{
+ if (WARN_ON(!remote->nr_readers))
+ return;
+
+ remote->nr_readers--;
+ if (remote->nr_readers)
+ return;
+
+ kfree(remote->pcpu_reader_locks);
+ remote->pcpu_reader_locks = NULL;
+
+ trace_remote_try_unload(remote);
+}
+
+static bool trace_remote_has_cpu(struct trace_remote *remote, int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ return true;
+
+ return ring_buffer_poll_remote(remote->trace_buffer, cpu) == 0;
+}
+
+static void __poll_remote(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct trace_remote_iterator *iter;
+
+ iter = container_of(dwork, struct trace_remote_iterator, poll_work);
+ ring_buffer_poll_remote(iter->remote->trace_buffer, iter->cpu);
+ schedule_delayed_work(dwork, msecs_to_jiffies(iter->remote->poll_ms));
+}
+
+static void __free_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu)
+{
+ if (cpu != RING_BUFFER_ALL_CPUS) {
+ ring_buffer_read_finish(iter->rb_iter);
+ return;
+ }
+
+ for_each_possible_cpu(cpu) {
+ if (iter->rb_iters[cpu])
+ ring_buffer_read_finish(iter->rb_iters[cpu]);
+ }
+
+ kfree(iter->rb_iters);
+}
+
+static int __alloc_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu)
+{
+ if (cpu != RING_BUFFER_ALL_CPUS) {
+ iter->rb_iter = ring_buffer_read_start(iter->remote->trace_buffer, cpu, GFP_KERNEL);
+
+ return iter->rb_iter ? 0 : -ENOMEM;
+ }
+
+ iter->rb_iters = kcalloc(nr_cpu_ids, sizeof(*iter->rb_iters), GFP_KERNEL);
+ if (!iter->rb_iters)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ iter->rb_iters[cpu] = ring_buffer_read_start(iter->remote->trace_buffer, cpu,
+ GFP_KERNEL);
+ if (!iter->rb_iters[cpu]) {
+ /* This CPU isn't part of trace_buffer. Skip it */
+ if (!trace_remote_has_cpu(iter->remote, cpu))
+ continue;
+
+ __free_ring_buffer_iter(iter, RING_BUFFER_ALL_CPUS);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+static struct trace_remote_iterator *
+trace_remote_iter(struct trace_remote *remote, int cpu, enum tri_type type)
+{
+ struct trace_remote_iterator *iter = NULL;
+ int ret;
+
+ lockdep_assert_held(&remote->lock);
+
+ if (type == TRI_NONCONSUMING && !trace_remote_loaded(remote))
+ return NULL;
+
+ ret = trace_remote_get(remote, cpu);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (!trace_remote_has_cpu(remote, cpu)) {
+ ret = -ENODEV;
+ goto err;
+ }
+
+ iter = kzalloc_obj(*iter);
+ if (iter) {
+ iter->remote = remote;
+ iter->cpu = cpu;
+ iter->type = type;
+ trace_seq_init(&iter->seq);
+
+ switch (type) {
+ case TRI_CONSUMING:
+ ring_buffer_poll_remote(remote->trace_buffer, cpu);
+ INIT_DELAYED_WORK(&iter->poll_work, __poll_remote);
+ schedule_delayed_work(&iter->poll_work, msecs_to_jiffies(remote->poll_ms));
+ break;
+ case TRI_NONCONSUMING:
+ ret = __alloc_ring_buffer_iter(iter, cpu);
+ break;
+ }
+
+ if (ret)
+ goto err;
+
+ return iter;
+ }
+ ret = -ENOMEM;
+
+err:
+ kfree(iter);
+ trace_remote_put(remote);
+
+ return ERR_PTR(ret);
+}
+
+static void trace_remote_iter_free(struct trace_remote_iterator *iter)
+{
+ struct trace_remote *remote;
+
+ if (!iter)
+ return;
+
+ remote = iter->remote;
+
+ lockdep_assert_held(&remote->lock);
+
+ switch (iter->type) {
+ case TRI_CONSUMING:
+ cancel_delayed_work_sync(&iter->poll_work);
+ break;
+ case TRI_NONCONSUMING:
+ __free_ring_buffer_iter(iter, iter->cpu);
+ break;
+ }
+
+ kfree(iter);
+ trace_remote_put(remote);
+}
+
+static void trace_remote_iter_read_start(struct trace_remote_iterator *iter)
+{
+ struct trace_remote *remote = iter->remote;
+ int cpu = iter->cpu;
+
+ /* Acquire global reader lock */
+ if (cpu == RING_BUFFER_ALL_CPUS && iter->type == TRI_CONSUMING)
+ down_write(&remote->reader_lock);
+ else
+ down_read(&remote->reader_lock);
+
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ return;
+
+ /*
+ * No need for the remote lock here, iter holds a reference on
+ * remote->nr_readers
+ */
+
+ /* Get the per-CPU one */
+ if (WARN_ON_ONCE(!remote->pcpu_reader_locks))
+ return;
+
+ if (iter->type == TRI_CONSUMING)
+ down_write(&remote->pcpu_reader_locks[cpu]);
+ else
+ down_read(&remote->pcpu_reader_locks[cpu]);
+}
+
+static void trace_remote_iter_read_finished(struct trace_remote_iterator *iter)
+{
+ struct trace_remote *remote = iter->remote;
+ int cpu = iter->cpu;
+
+ /* Release per-CPU reader lock */
+ if (cpu != RING_BUFFER_ALL_CPUS) {
+ /*
+ * No need for the remote lock here, iter holds a reference on
+ * remote->nr_readers
+ */
+ if (iter->type == TRI_CONSUMING)
+ up_write(&remote->pcpu_reader_locks[cpu]);
+ else
+ up_read(&remote->pcpu_reader_locks[cpu]);
+ }
+
+ /* Release global reader lock */
+ if (cpu == RING_BUFFER_ALL_CPUS && iter->type == TRI_CONSUMING)
+ up_write(&remote->reader_lock);
+ else
+ up_read(&remote->reader_lock);
+}
+
+static struct ring_buffer_iter *__get_rb_iter(struct trace_remote_iterator *iter, int cpu)
+{
+ return iter->cpu != RING_BUFFER_ALL_CPUS ? iter->rb_iter : iter->rb_iters[cpu];
+}
+
+static struct ring_buffer_event *
+__peek_event(struct trace_remote_iterator *iter, int cpu, u64 *ts, unsigned long *lost_events)
+{
+ struct ring_buffer_event *rb_evt;
+ struct ring_buffer_iter *rb_iter;
+
+ switch (iter->type) {
+ case TRI_CONSUMING:
+ return ring_buffer_peek(iter->remote->trace_buffer, cpu, ts, lost_events);
+ case TRI_NONCONSUMING:
+ rb_iter = __get_rb_iter(iter, cpu);
+ if (!rb_iter)
+ return NULL;
+
+ rb_evt = ring_buffer_iter_peek(rb_iter, ts);
+ if (!rb_evt)
+ return NULL;
+
+ *lost_events = ring_buffer_iter_dropped(rb_iter);
+
+ return rb_evt;
+ }
+
+ return NULL;
+}
+
+static bool trace_remote_iter_read_event(struct trace_remote_iterator *iter)
+{
+ struct trace_buffer *trace_buffer = iter->remote->trace_buffer;
+ struct ring_buffer_event *rb_evt;
+ int cpu = iter->cpu;
+
+ if (cpu != RING_BUFFER_ALL_CPUS) {
+ if (ring_buffer_empty_cpu(trace_buffer, cpu))
+ return false;
+
+ rb_evt = __peek_event(iter, cpu, &iter->ts, &iter->lost_events);
+ if (!rb_evt)
+ return false;
+
+ iter->evt_cpu = cpu;
+ iter->evt = ring_buffer_event_data(rb_evt);
+ return true;
+ }
+
+ iter->ts = U64_MAX;
+ for_each_possible_cpu(cpu) {
+ unsigned long lost_events;
+ u64 ts;
+
+ if (ring_buffer_empty_cpu(trace_buffer, cpu))
+ continue;
+
+ rb_evt = __peek_event(iter, cpu, &ts, &lost_events);
+ if (!rb_evt)
+ continue;
+
+ if (ts >= iter->ts)
+ continue;
+
+ iter->ts = ts;
+ iter->evt_cpu = cpu;
+ iter->evt = ring_buffer_event_data(rb_evt);
+ iter->lost_events = lost_events;
+ }
+
+ return iter->ts != U64_MAX;
+}
+
+static void trace_remote_iter_move(struct trace_remote_iterator *iter)
+{
+ struct trace_buffer *trace_buffer = iter->remote->trace_buffer;
+
+ switch (iter->type) {
+ case TRI_CONSUMING:
+ ring_buffer_consume(trace_buffer, iter->evt_cpu, NULL, NULL);
+ break;
+ case TRI_NONCONSUMING:
+ ring_buffer_iter_advance(__get_rb_iter(iter, iter->evt_cpu));
+ break;
+ }
+}
+
+static struct remote_event *trace_remote_find_event(struct trace_remote *remote, unsigned short id);
+
+static int trace_remote_iter_print_event(struct trace_remote_iterator *iter)
+{
+ struct remote_event *evt;
+ unsigned long usecs_rem;
+ u64 ts = iter->ts;
+
+ if (iter->lost_events)
+ trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
+ iter->evt_cpu, iter->lost_events);
+
+ do_div(ts, 1000);
+ usecs_rem = do_div(ts, USEC_PER_SEC);
+
+ trace_seq_printf(&iter->seq, "[%03d]\t%5llu.%06lu: ", iter->evt_cpu,
+ ts, usecs_rem);
+
+ evt = trace_remote_find_event(iter->remote, iter->evt->id);
+ if (!evt)
+ trace_seq_printf(&iter->seq, "UNKNOWN id=%d\n", iter->evt->id);
+ else
+ evt->print(iter->evt, &iter->seq);
+
+ return trace_seq_has_overflowed(&iter->seq) ? -EOVERFLOW : 0;
+}
+
+static int trace_pipe_open(struct inode *inode, struct file *filp)
+{
+ struct trace_remote *remote = inode->i_private;
+ struct trace_remote_iterator *iter;
+ int cpu = tracing_get_cpu(inode);
+
+ guard(mutex)(&remote->lock);
+
+ iter = trace_remote_iter(remote, cpu, TRI_CONSUMING);
+ if (IS_ERR(iter))
+ return PTR_ERR(iter);
+
+ filp->private_data = iter;
+
+ return 0;
+}
+
+static int trace_pipe_release(struct inode *inode, struct file *filp)
+{
+ struct trace_remote_iterator *iter = filp->private_data;
+ struct trace_remote *remote = iter->remote;
+
+ guard(mutex)(&remote->lock);
+
+ trace_remote_iter_free(iter);
+
+ return 0;
+}
+
+static ssize_t trace_pipe_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_remote_iterator *iter = filp->private_data;
+ struct trace_buffer *trace_buffer = iter->remote->trace_buffer;
+ int ret;
+
+copy_to_user:
+ ret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+ if (ret != -EBUSY)
+ return ret;
+
+ trace_seq_init(&iter->seq);
+
+ ret = ring_buffer_wait(trace_buffer, iter->cpu, 0, NULL, NULL);
+ if (ret < 0)
+ return ret;
+
+ trace_remote_iter_read_start(iter);
+
+ while (trace_remote_iter_read_event(iter)) {
+ int prev_len = iter->seq.seq.len;
+
+ if (trace_remote_iter_print_event(iter)) {
+ iter->seq.seq.len = prev_len;
+ break;
+ }
+
+ trace_remote_iter_move(iter);
+ }
+
+ trace_remote_iter_read_finished(iter);
+
+ goto copy_to_user;
+}
+
+static const struct file_operations trace_pipe_fops = {
+ .open = trace_pipe_open,
+ .read = trace_pipe_read,
+ .release = trace_pipe_release,
+};
+
+static void *trace_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct trace_remote_iterator *iter = m->private;
+
+ ++*pos;
+
+ if (!iter || !trace_remote_iter_read_event(iter))
+ return NULL;
+
+ trace_remote_iter_move(iter);
+ iter->pos++;
+
+ return iter;
+}
+
+static void *trace_start(struct seq_file *m, loff_t *pos)
+{
+ struct trace_remote_iterator *iter = m->private;
+ loff_t i = 0;
+
+ if (!iter)
+ return NULL;
+
+ trace_remote_iter_read_start(iter);
+
+ if (!*pos) {
+ iter->pos = -1;
+ return trace_next(m, NULL, &i);
+ }
+
+ i = iter->pos;
+ while (i < *pos) {
+ iter = trace_next(m, NULL, &i);
+ if (!iter)
+ return NULL;
+ }
+
+ return iter;
+}
+
+static int trace_show(struct seq_file *m, void *v)
+{
+ struct trace_remote_iterator *iter = v;
+
+ trace_seq_init(&iter->seq);
+
+ if (trace_remote_iter_print_event(iter)) {
+ seq_printf(m, "[EVENT %d PRINT TOO BIG]\n", iter->evt->id);
+ return 0;
+ }
+
+ return trace_print_seq(m, &iter->seq);
+}
+
+static void trace_stop(struct seq_file *m, void *v)
+{
+ struct trace_remote_iterator *iter = m->private;
+
+ if (iter)
+ trace_remote_iter_read_finished(iter);
+}
+
+static const struct seq_operations trace_sops = {
+ .start = trace_start,
+ .next = trace_next,
+ .show = trace_show,
+ .stop = trace_stop,
+};
+
+static int trace_open(struct inode *inode, struct file *filp)
+{
+ struct trace_remote *remote = inode->i_private;
+ struct trace_remote_iterator *iter = NULL;
+ int cpu = tracing_get_cpu(inode);
+ int ret;
+
+ if (!(filp->f_mode & FMODE_READ))
+ return 0;
+
+ guard(mutex)(&remote->lock);
+
+ iter = trace_remote_iter(remote, cpu, TRI_NONCONSUMING);
+ if (IS_ERR(iter))
+ return PTR_ERR(iter);
+
+ ret = seq_open(filp, &trace_sops);
+ if (ret) {
+ trace_remote_iter_free(iter);
+ return ret;
+ }
+
+ ((struct seq_file *)filp->private_data)->private = (void *)iter;
+
+ return 0;
+}
+
+static int trace_release(struct inode *inode, struct file *filp)
+{
+ struct trace_remote_iterator *iter;
+
+ if (!(filp->f_mode & FMODE_READ))
+ return 0;
+
+ iter = ((struct seq_file *)filp->private_data)->private;
+ seq_release(inode, filp);
+
+ if (!iter)
+ return 0;
+
+ guard(mutex)(&iter->remote->lock);
+
+ trace_remote_iter_free(iter);
+
+ return 0;
+}
+
+static ssize_t trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct inode *inode = file_inode(filp);
+ struct trace_remote *remote = inode->i_private;
+ int cpu = tracing_get_cpu(inode);
+
+ guard(mutex)(&remote->lock);
+
+ trace_remote_reset(remote, cpu);
+
+ return cnt;
+}
+
+static const struct file_operations trace_fops = {
+ .open = trace_open,
+ .write = trace_write,
+ .read = seq_read,
+ .read_iter = seq_read_iter,
+ .release = trace_release,
+};
+
+static int trace_remote_init_tracefs(const char *name, struct trace_remote *remote)
+{
+ struct dentry *remote_d, *percpu_d, *d;
+ static struct dentry *root;
+ static DEFINE_MUTEX(lock);
+ bool root_inited = false;
+ int cpu;
+
+ guard(mutex)(&lock);
+
+ if (!root) {
+ root = tracefs_create_dir(TRACEFS_DIR, NULL);
+ if (!root) {
+ pr_err("Failed to create tracefs dir "TRACEFS_DIR"\n");
+ return -ENOMEM;
+ }
+ root_inited = true;
+ }
+
+ remote_d = tracefs_create_dir(name, root);
+ if (!remote_d) {
+ pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/\n", name);
+ goto err;
+ }
+
+ d = trace_create_file("tracing_on", TRACEFS_MODE_WRITE, remote_d, remote, &tracing_on_fops);
+ if (!d)
+ goto err;
+
+ d = trace_create_file("buffer_size_kb", TRACEFS_MODE_WRITE, remote_d, remote,
+ &buffer_size_kb_fops);
+ if (!d)
+ goto err;
+
+ d = trace_create_file("trace_pipe", TRACEFS_MODE_READ, remote_d, remote, &trace_pipe_fops);
+ if (!d)
+ goto err;
+
+ d = trace_create_file("trace", TRACEFS_MODE_WRITE, remote_d, remote, &trace_fops);
+ if (!d)
+ goto err;
+
+ percpu_d = tracefs_create_dir("per_cpu", remote_d);
+ if (!percpu_d) {
+ pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/per_cpu/\n", name);
+ goto err;
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct dentry *cpu_d;
+ char cpu_name[16];
+
+ snprintf(cpu_name, sizeof(cpu_name), "cpu%d", cpu);
+ cpu_d = tracefs_create_dir(cpu_name, percpu_d);
+ if (!cpu_d) {
+ pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/percpu/cpu%d\n",
+ name, cpu);
+ goto err;
+ }
+
+ d = trace_create_cpu_file("trace_pipe", TRACEFS_MODE_READ, cpu_d, remote, cpu,
+ &trace_pipe_fops);
+ if (!d)
+ goto err;
+
+ d = trace_create_cpu_file("trace", TRACEFS_MODE_WRITE, cpu_d, remote, cpu,
+ &trace_fops);
+ if (!d)
+ goto err;
+ }
+
+ remote->dentry = remote_d;
+
+ return 0;
+
+err:
+ if (root_inited) {
+ tracefs_remove(root);
+ root = NULL;
+ } else {
+ tracefs_remove(remote_d);
+ }
+
+ return -ENOMEM;
+}
+
+static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote,
+ struct remote_event *events, size_t nr_events);
+
+/**
+ * trace_remote_register() - Register a Tracefs remote
+ * @name: Name of the remote, used for the Tracefs remotes/ directory.
+ * @cbs: Set of callbacks used to control the remote.
+ * @priv: Private data, passed to each callback from @cbs.
+ * @events: Array of events. &remote_event.name and &remote_event.id must be
+ * filled by the caller.
+ * @nr_events: Number of events in the @events array.
+ *
+ * A trace remote is an entity outside of the kernel (most likely firmware or
+ * a hypervisor) capable of writing events into a tracefs-compatible
+ * ring-buffer, with the kernel acting as the reader.
+ *
+ * The registered remote will be found under the Tracefs directory
+ * remotes/<name>.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv,
+ struct remote_event *events, size_t nr_events)
+{
+ struct trace_remote *remote;
+ int ret;
+
+ remote = kzalloc_obj(*remote);
+ if (!remote)
+ return -ENOMEM;
+
+ remote->cbs = cbs;
+ remote->priv = priv;
+ remote->trace_buffer_size = 7 << 10;
+ remote->poll_ms = 100;
+ mutex_init(&remote->lock);
+ init_rwsem(&remote->reader_lock);
+
+ if (trace_remote_init_tracefs(name, remote)) {
+ kfree(remote);
+ return -ENOMEM;
+ }
+
+ ret = trace_remote_register_events(name, remote, events, nr_events);
+ if (ret) {
+ pr_err("Failed to register events for trace remote '%s' (%d)\n",
+ name, ret);
+ return ret;
+ }
+
+ ret = cbs->init ? cbs->init(remote->dentry, priv) : 0;
+ if (ret)
+ pr_err("Init failed for trace remote '%s' (%d)\n", name, ret);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(trace_remote_register);
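
Putting the pieces together, a hedged registration sketch: the callback prototypes are inferred from their call sites in this file, the my_*() bodies are placeholders for a real transport, and my_load_trace_buffer() is fleshed out after trace_remote_alloc_buffer() below:

	static struct trace_buffer_desc *my_load_trace_buffer(unsigned long size, void *priv);

	static void my_unload_trace_buffer(struct trace_buffer_desc *desc, void *priv)
	{
		trace_remote_free_buffer(desc);
		kfree(desc);
	}

	static int my_enable_tracing(bool enable, void *priv)
	{
		return 0;	/* Forward to the remote writer, e.g. over SMC/HVC */
	}

	static int my_swap_reader_page(int cpu, void *priv)
	{
		return 0;	/* Ask the remote writer to swap its reader page */
	}

	static int my_enable_event(unsigned short id, bool enable, void *priv)
	{
		return 0;
	}

	static struct trace_remote_callbacks my_cbs = {
		.load_trace_buffer	= my_load_trace_buffer,
		.unload_trace_buffer	= my_unload_trace_buffer,
		.enable_tracing		= my_enable_tracing,
		.swap_reader_page	= my_swap_reader_page,
		.enable_event		= my_enable_event,
	};

	/* name and id filled by the caller, as the kernel-doc above requires */
	static struct remote_event my_events[] = {
		{ .name = "my_event", .id = 1 },
	};

	static int __init my_remote_init(void)
	{
		return trace_remote_register("my_remote", &my_cbs, NULL,
					     my_events, ARRAY_SIZE(my_events));
	}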
+
+/**
+ * trace_remote_free_buffer() - Free trace buffer allocated with trace_remote_alloc_buffer()
+ * @desc: Descriptor of the per-CPU ring-buffers, originally filled by
+ * trace_remote_alloc_buffer()
+ *
+ * Most likely called from &trace_remote_callbacks.unload_trace_buffer.
+ */
+void trace_remote_free_buffer(struct trace_buffer_desc *desc)
+{
+ struct ring_buffer_desc *rb_desc;
+ int cpu;
+
+ for_each_ring_buffer_desc(rb_desc, cpu, desc) {
+ unsigned int id;
+
+ free_page(rb_desc->meta_va);
+
+ for (id = 0; id < rb_desc->nr_page_va; id++)
+ free_page(rb_desc->page_va[id]);
+ }
+}
+EXPORT_SYMBOL_GPL(trace_remote_free_buffer);
+
+/**
+ * trace_remote_alloc_buffer() - Dynamically allocate a trace buffer
+ * @desc: Uninitialized trace_buffer_desc
+ * @desc_size: Size of the trace_buffer_desc. Must be at least equal to
+ * trace_buffer_desc_size()
+ * @buffer_size: Size in bytes of each per-CPU ring-buffer
+ * @cpumask: CPUs to allocate a ring-buffer for
+ *
+ * Helper to dynamically allocate a set of pages (enough to cover @buffer_size)
+ * for each CPU from @cpumask and fill @desc. Most likely called from
+ * &trace_remote_callbacks.load_trace_buffer.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int trace_remote_alloc_buffer(struct trace_buffer_desc *desc, size_t desc_size, size_t buffer_size,
+ const struct cpumask *cpumask)
+{
+ unsigned int nr_pages = max(DIV_ROUND_UP(buffer_size, PAGE_SIZE), 2UL) + 1;
+ void *desc_end = (void *)desc + desc_size;
+ struct ring_buffer_desc *rb_desc;
+ int cpu, ret = -ENOMEM;
+
+ if (desc_size < struct_size(desc, __data, 0))
+ return -EINVAL;
+
+ desc->nr_cpus = 0;
+ desc->struct_len = struct_size(desc, __data, 0);
+
+ rb_desc = (struct ring_buffer_desc *)&desc->__data[0];
+
+ for_each_cpu(cpu, cpumask) {
+ unsigned int id;
+
+ if ((void *)rb_desc + struct_size(rb_desc, page_va, nr_pages) > desc_end) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ rb_desc->cpu = cpu;
+ rb_desc->nr_page_va = 0;
+ rb_desc->meta_va = (unsigned long)__get_free_page(GFP_KERNEL);
+ if (!rb_desc->meta_va)
+ goto err;
+
+ for (id = 0; id < nr_pages; id++) {
+ rb_desc->page_va[id] = (unsigned long)__get_free_page(GFP_KERNEL);
+ if (!rb_desc->page_va[id])
+ goto err;
+
+ rb_desc->nr_page_va++;
+ }
+ desc->nr_cpus++;
+ desc->struct_len += struct_size(rb_desc, page_va, rb_desc->nr_page_va);
+ rb_desc = __next_ring_buffer_desc(rb_desc);
+ }
+
+ return 0;
+
+err:
+ trace_remote_free_buffer(desc);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(trace_remote_alloc_buffer);
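
Fleshing out the my_load_trace_buffer() stub from the registration sketch with the helper above; the fixed desc_size is an assumption standing in for a trace_buffer_desc_size() computation, which isn't shown in this hunk:

	static struct trace_buffer_desc *my_load_trace_buffer(unsigned long size, void *priv)
	{
		struct trace_buffer_desc *desc;
		size_t desc_size = SZ_16K;	/* assumed >= trace_buffer_desc_size() */
		int ret;

		desc = kzalloc(desc_size, GFP_KERNEL);
		if (!desc)
			return ERR_PTR(-ENOMEM);

		ret = trace_remote_alloc_buffer(desc, desc_size, size, cpu_possible_mask);
		if (ret) {
			kfree(desc);
			return ERR_PTR(ret);
		}

		return desc;
	}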
+
+static int
+trace_remote_enable_event(struct trace_remote *remote, struct remote_event *evt, bool enable)
+{
+ int ret;
+
+ lockdep_assert_held(&remote->lock);
+
+ if (evt->enabled == enable)
+ return 0;
+
+ ret = remote->cbs->enable_event(evt->id, enable, remote->priv);
+ if (ret)
+ return ret;
+
+ evt->enabled = enable;
+
+ return 0;
+}
+
+static int remote_event_enable_show(struct seq_file *s, void *unused)
+{
+ struct remote_event *evt = s->private;
+
+ seq_printf(s, "%d\n", evt->enabled);
+
+ return 0;
+}
+
+static ssize_t remote_event_enable_write(struct file *filp, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct seq_file *seq = filp->private_data;
+ struct remote_event *evt = seq->private;
+ struct trace_remote *remote = evt->remote;
+ u8 enable;
+ int ret;
+
+ ret = kstrtou8_from_user(ubuf, count, 10, &enable);
+ if (ret)
+ return ret;
+
+ guard(mutex)(&remote->lock);
+
+ ret = trace_remote_enable_event(remote, evt, enable);
+ if (ret)
+ return ret;
+
+ return count;
+}
+DEFINE_SHOW_STORE_ATTRIBUTE(remote_event_enable);
+
+static int remote_event_id_show(struct seq_file *s, void *unused)
+{
+ struct remote_event *evt = s->private;
+
+ seq_printf(s, "%d\n", evt->id);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(remote_event_id);
+
+static int remote_event_format_show(struct seq_file *s, void *unused)
+{
+ size_t offset = sizeof(struct remote_event_hdr);
+ struct remote_event *evt = s->private;
+ struct trace_event_fields *field;
+
+ seq_printf(s, "name: %s\n", evt->name);
+ seq_printf(s, "ID: %d\n", evt->id);
+ seq_puts(s,
+ "format:\n\tfield:unsigned short common_type;\toffset:0;\tsize:2;\tsigned:0;\n\n");
+
+ field = &evt->fields[0];
+ while (field->name) {
+ seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%u;\tsigned:%d;\n",
+ field->type, field->name, offset, field->size,
+ field->is_signed);
+ offset += field->size;
+ field++;
+ }
+
+ if (field != &evt->fields[0])
+ seq_puts(s, "\n");
+
+ seq_printf(s, "print fmt: %s\n", evt->print_fmt);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(remote_event_format);
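
For reference, the format file produced by the function above would read as follows for a hypothetical one-field event (values illustrative; field offsets start at sizeof(struct remote_event_hdr)):

	name: my_event
	ID: 1
	format:
		field:unsigned short common_type;	offset:0;	size:2;	signed:0;

		field:u64 val;	offset:2;	size:8;	signed:0;

	print fmt: "val=%llu", REC->val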
+
+static int remote_event_callback(const char *name, umode_t *mode, void **data,
+ const struct file_operations **fops)
+{
+ if (!strcmp(name, "enable")) {
+ *mode = TRACEFS_MODE_WRITE;
+ *fops = &remote_event_enable_fops;
+ return 1;
+ }
+
+ if (!strcmp(name, "id")) {
+ *mode = TRACEFS_MODE_READ;
+ *fops = &remote_event_id_fops;
+ return 1;
+ }
+
+ if (!strcmp(name, "format")) {
+ *mode = TRACEFS_MODE_READ;
+ *fops = &remote_event_format_fops;
+ return 1;
+ }
+
+ return 0;
+}
+
+static ssize_t remote_events_dir_enable_write(struct file *filp, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct trace_remote *remote = file_inode(filp)->i_private;
+ int i, ret;
+ u8 enable;
+
+ ret = kstrtou8_from_user(ubuf, count, 10, &enable);
+ if (ret)
+ return ret;
+
+ guard(mutex)(&remote->lock);
+
+ for (i = 0; i < remote->nr_events; i++) {
+ struct remote_event *evt = &remote->events[i];
+
+ trace_remote_enable_event(remote, evt, enable);
+ }
+
+ return count;
+}
+
+static ssize_t remote_events_dir_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct trace_remote *remote = file_inode(filp)->i_private;
+ const char enabled_char[] = {'0', '1', 'X'};
+ char enabled_str[] = " \n";
+ int i, enabled = -1;
+
+ guard(mutex)(&remote->lock);
+
+ for (i = 0; i < remote->nr_events; i++) {
+ struct remote_event *evt = &remote->events[i];
+
+ if (enabled == -1) {
+ enabled = evt->enabled;
+ } else if (enabled != evt->enabled) {
+ enabled = 2;
+ break;
+ }
+ }
+
+ enabled_str[0] = enabled_char[enabled == -1 ? 0 : enabled];
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, enabled_str, 2);
+}
+
+static const struct file_operations remote_events_dir_enable_fops = {
+ .write = remote_events_dir_enable_write,
+ .read = remote_events_dir_enable_read,
+};
+
+static ssize_t
+remote_events_dir_header_page_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_seq *s;
+ int ret;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+
+ ring_buffer_print_page_header(NULL, s);
+ ret = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s));
+ kfree(s);
+
+ return ret;
+}
+
+static const struct file_operations remote_events_dir_header_page_fops = {
+ .read = remote_events_dir_header_page_read,
+};
+
+static ssize_t
+remote_events_dir_header_event_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_seq *s;
+ int ret;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+
+ ring_buffer_print_entry_header(s);
+ ret = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s));
+ kfree(s);
+
+ return ret;
+}
+
+static const struct file_operations remote_events_dir_header_event_fops = {
+ .read = remote_events_dir_header_event_read,
+};
+
+static int remote_events_dir_callback(const char *name, umode_t *mode, void **data,
+ const struct file_operations **fops)
+{
+ if (!strcmp(name, "enable")) {
+ *mode = TRACEFS_MODE_WRITE;
+ *fops = &remote_events_dir_enable_fops;
+ return 1;
+ }
+
+ if (!strcmp(name, "header_page")) {
+ *mode = TRACEFS_MODE_READ;
+ *fops = &remote_events_dir_header_page_fops;
+ return 1;
+ }
+
+ if (!strcmp(name, "header_event")) {
+ *mode = TRACEFS_MODE_READ;
+ *fops = &remote_events_dir_header_event_fops;
+ return 1;
+ }
+
+ return 0;
+}
+
+static int trace_remote_init_eventfs(const char *remote_name, struct trace_remote *remote,
+ struct remote_event *evt)
+{
+ struct eventfs_inode *eventfs = remote->eventfs;
+ static struct eventfs_entry dir_entries[] = {
+ {
+ .name = "enable",
+ .callback = remote_events_dir_callback,
+ }, {
+ .name = "header_page",
+ .callback = remote_events_dir_callback,
+ }, {
+ .name = "header_event",
+ .callback = remote_events_dir_callback,
+ }
+ };
+ static struct eventfs_entry entries[] = {
+ {
+ .name = "enable",
+ .callback = remote_event_callback,
+ }, {
+ .name = "id",
+ .callback = remote_event_callback,
+ }, {
+ .name = "format",
+ .callback = remote_event_callback,
+ }
+ };
+ bool eventfs_create = false;
+
+ if (!eventfs) {
+ eventfs = eventfs_create_events_dir("events", remote->dentry, dir_entries,
+ ARRAY_SIZE(dir_entries), remote);
+ if (IS_ERR(eventfs))
+ return PTR_ERR(eventfs);
+
+		/*
+		 * Create a hierarchy similar to local events, even though only
+		 * a single system is supported at the moment.
+		 */
+ eventfs = eventfs_create_dir(remote_name, eventfs, NULL, 0, NULL);
+ if (IS_ERR(eventfs))
+ return PTR_ERR(eventfs);
+
+ remote->eventfs = eventfs;
+ eventfs_create = true;
+ }
+
+ eventfs = eventfs_create_dir(evt->name, eventfs, entries, ARRAY_SIZE(entries), evt);
+ if (IS_ERR(eventfs)) {
+ if (eventfs_create) {
+ eventfs_remove_events_dir(remote->eventfs);
+ remote->eventfs = NULL;
+ }
+ return PTR_ERR(eventfs);
+ }
+
+ return 0;
+}
+
+static int trace_remote_attach_events(struct trace_remote *remote, struct remote_event *events,
+ size_t nr_events)
+{
+ int i;
+
+ for (i = 0; i < nr_events; i++) {
+ struct remote_event *evt = &events[i];
+
+ if (evt->remote)
+ return -EEXIST;
+
+ evt->remote = remote;
+
+ /* We need events to be sorted for efficient lookup */
+ if (i && evt->id <= events[i - 1].id)
+ return -EINVAL;
+ }
+
+ remote->events = events;
+ remote->nr_events = nr_events;
+
+ return 0;
+}
+
+static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote,
+ struct remote_event *events, size_t nr_events)
+{
+ int i, ret;
+
+ ret = trace_remote_attach_events(remote, events, nr_events);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < nr_events; i++) {
+ struct remote_event *evt = &events[i];
+
+ ret = trace_remote_init_eventfs(remote_name, remote, evt);
+ if (ret)
+			pr_warn("Failed to init eventfs for event '%s' (%d)\n",
+				evt->name, ret);
+ }
+
+ return 0;
+}
+
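+/*
+ * bsearch() comparison helper: the event ID is carried in the key
+ * pointer itself (cast to and from unsigned long) rather than in a
+ * pointed-to object, avoiding a temporary key structure.
+ */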
+static int __cmp_events(const void *key, const void *data)
+{
+ const struct remote_event *evt = data;
+ int id = (int)((long)key);
+
+ return id - (int)evt->id;
+}
+
+static struct remote_event *trace_remote_find_event(struct trace_remote *remote, unsigned short id)
+{
+ return bsearch((const void *)(unsigned long)id, remote->events, remote->nr_events,
+ sizeof(*remote->events), __cmp_events);
+}
diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h
index a792a599b9d6..1c13bfa2d38a 100644
--- a/tools/arch/arm64/include/uapi/asm/kvm.h
+++ b/tools/arch/arm64/include/uapi/asm/kvm.h
@@ -428,6 +428,7 @@ enum {
#define KVM_DEV_ARM_ITS_RESTORE_TABLES 2
#define KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES 3
#define KVM_DEV_ARM_ITS_CTRL_RESET 4
+#define KVM_DEV_ARM_VGIC_USERSPACE_PPIS 5
/* Device Control API on vcpu fd */
#define KVM_ARM_VCPU_PMU_V3_CTRL 0
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index 80364d4dbebb..d0c0c8605976 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -1224,6 +1224,8 @@ enum kvm_device_type {
#define KVM_DEV_TYPE_LOONGARCH_EIOINTC KVM_DEV_TYPE_LOONGARCH_EIOINTC
KVM_DEV_TYPE_LOONGARCH_PCHPIC,
#define KVM_DEV_TYPE_LOONGARCH_PCHPIC KVM_DEV_TYPE_LOONGARCH_PCHPIC
+ KVM_DEV_TYPE_ARM_VGIC_V5,
+#define KVM_DEV_TYPE_ARM_VGIC_V5 KVM_DEV_TYPE_ARM_VGIC_V5
KVM_DEV_TYPE_MAX,
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/buffer_size.tc b/tools/testing/selftests/ftrace/test.d/remotes/buffer_size.tc
new file mode 100644
index 000000000000..1a43280ffa97
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/remotes/buffer_size.tc
@@ -0,0 +1,25 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test trace remote buffer size
+# requires: remotes/test
+
+. $TEST_DIR/remotes/functions
+
+test_buffer_size()
+{
+ echo 0 > tracing_on
+ assert_unloaded
+
+ echo 4096 > buffer_size_kb
+ echo 1 > tracing_on
+ assert_loaded
+
+ echo 0 > tracing_on
+ echo 7 > buffer_size_kb
+}
+
+if [ -z "$SOURCE_REMOTE_TEST" ]; then
+ set -e
+ setup_remote_test
+ test_buffer_size
+fi
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/functions b/tools/testing/selftests/ftrace/test.d/remotes/functions
new file mode 100644
index 000000000000..05224fac3653
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/remotes/functions
@@ -0,0 +1,99 @@
+# SPDX-License-Identifier: GPL-2.0
+
+setup_remote()
+{
+ local name=$1
+
+ [ -e $TRACING_DIR/remotes/$name/write_event ] || exit_unresolved
+
+ cd remotes/$name/
+ echo 0 > tracing_on
+ clear_trace
+ echo 7 > buffer_size_kb
+ echo 0 > events/enable
+ echo 1 > events/$name/selftest/enable
+ echo 1 > tracing_on
+}
+
+setup_remote_test()
+{
+ [ -d $TRACING_DIR/remotes/test/ ] || modprobe remote_test || exit_unresolved
+
+ setup_remote "test"
+}
+
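+# buffer_size_kb reports the remote buffer state with a "(loaded)" /
+# "(unloaded)" suffix; the buffer is expected to be unloaded once
+# tracing is off and all events have been consumed or cleared.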
+assert_loaded()
+{
+ grep -q "(loaded)" buffer_size_kb || return 1
+}
+
+assert_unloaded()
+{
+ grep -q "(unloaded)" buffer_size_kb || return 1
+}
+
+reload_remote()
+{
+ echo 0 > tracing_on
+ clear_trace
+ assert_unloaded
+ echo 1 > tracing_on
+ assert_loaded
+}
+
+dump_trace_pipe()
+{
+ output=$(mktemp $TMPDIR/remote_test.XXXXXX)
+ cat trace_pipe > $output &
+ pid=$!
+ sleep 1
+ kill -1 $pid
+
+ echo $output
+}
+
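+# Verify a trace dump: lines are expected to look like
+#   [<cpu>]  <timestamp>: <event> id=<n>
+# with increasing timestamps and ids forming the contiguous sequence
+# start_id..end_id.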
+check_trace()
+{
+ start_id="$1"
+ end_id="$2"
+ file="$3"
+
+ # Ensure the file is not empty
+ test -n "$(head $file)"
+
+ prev_ts=0
+ id=0
+
+ # Only keep <timestamp> <id>
+ tmp=$(mktemp $TMPDIR/remote_test.XXXXXX)
+ sed -e 's/\[[0-9]*\]\s*\([0-9]*.[0-9]*\): [a-z]* id=\([0-9]*\)/\1 \2/' $file > $tmp
+
+ while IFS= read -r line; do
+ ts=$(echo $line | cut -d ' ' -f 1)
+ id=$(echo $line | cut -d ' ' -f 2)
+
+ test $(echo "$ts>$prev_ts" | bc) -eq 1
+ test $id -eq $start_id
+
+ prev_ts=$ts
+ start_id=$((start_id + 1))
+ done < $tmp
+
+ test $id -eq $end_id
+ rm $tmp
+}
+
+get_cpu_ids()
+{
+ sed -n 's/^processor\s*:\s*\([0-9]\+\).*/\1/p' /proc/cpuinfo
+}
+
+get_page_size()
+{
+ sed -ne 's/^.*data.*size:\([0-9][0-9]*\).*/\1/p' events/header_page
+}
+
+get_selftest_event_size()
+{
+ sed -ne 's/^.*field:.*;.*size:\([0-9][0-9]*\);.*/\1/p' events/*/selftest/format | awk '{s+=$1} END {print s}'
+}
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hotplug.tc b/tools/testing/selftests/ftrace/test.d/remotes/hotplug.tc
new file mode 100644
index 000000000000..145617eb8061
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/remotes/hotplug.tc
@@ -0,0 +1,88 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test trace remote read with an offline CPU
+# requires: remotes/test
+
+. $TEST_DIR/remotes/functions
+
+hotunplug_one_cpu()
+{
+ [ "$(get_cpu_ids | wc -l)" -ge 2 ] || return 1
+
+ for cpu in $(get_cpu_ids); do
+ echo 0 > /sys/devices/system/cpu/cpu$cpu/online || return 1
+ break
+ done
+
+ echo $cpu
+}
+
+# Check non-consuming and consuming read
+check_read()
+{
+ for i in $(seq 1 8); do
+ echo $i > write_event
+ done
+
+ check_trace 1 8 trace
+
+ output=$(dump_trace_pipe)
+ check_trace 1 8 $output
+ rm $output
+}
+
+test_hotplug()
+{
+ echo 0 > trace
+ assert_loaded
+
+ #
+ # Test a trace buffer containing an offline CPU
+ #
+
+ cpu=$(hotunplug_one_cpu) || exit_unsupported
+ trap "echo 1 > /sys/devices/system/cpu/cpu$cpu/online" EXIT
+
+ check_read
+
+ #
+ # Test a trace buffer with a missing CPU
+ #
+
+ reload_remote
+
+ check_read
+
+ #
+ # Test a trace buffer with a CPU added later
+ #
+
+ echo 1 > /sys/devices/system/cpu/cpu$cpu/online
+ trap "" EXIT
+ assert_loaded
+
+ check_read
+
+ # Test if the ring-buffer for the newly added CPU is both writable and
+ # readable
+ for i in $(seq 1 8); do
+ taskset -c $cpu echo $i > write_event
+ done
+
+ cd per_cpu/cpu$cpu/
+
+ check_trace 1 8 trace
+
+ output=$(dump_trace_pipe)
+ check_trace 1 8 $output
+ rm $output
+
+ cd -
+}
+
+if [ -z "$SOURCE_REMOTE_TEST" ]; then
+ set -e
+
+ setup_remote_test
+ test_hotplug
+fi
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/buffer_size.tc b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/buffer_size.tc
new file mode 100644
index 000000000000..64bf859d6406
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/buffer_size.tc
@@ -0,0 +1,11 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test hypervisor trace buffer size
+# requires: remotes/hypervisor/write_event
+
+SOURCE_REMOTE_TEST=1
+. $TEST_DIR/remotes/buffer_size.tc
+
+set -e
+setup_remote "hypervisor"
+test_buffer_size
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/hotplug.tc b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/hotplug.tc
new file mode 100644
index 000000000000..580ec32c8f81
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/hotplug.tc
@@ -0,0 +1,11 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test hypervisor trace read with an offline CPU
+# requires: remotes/hypervisor/write_event
+
+SOURCE_REMOTE_TEST=1
+. $TEST_DIR/remotes/hotplug.tc
+
+set -e
+setup_remote "hypervisor"
+test_hotplug
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/reset.tc b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/reset.tc
new file mode 100644
index 000000000000..7fe3b09b34e3
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/reset.tc
@@ -0,0 +1,11 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test hypervisor trace buffer reset
+# requires: remotes/hypervisor/write_event
+
+SOURCE_REMOTE_TEST=1
+. $TEST_DIR/remotes/reset.tc
+
+set -e
+setup_remote "hypervisor"
+test_reset
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace.tc b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace.tc
new file mode 100644
index 000000000000..b937c19ca7f9
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace.tc
@@ -0,0 +1,11 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test hypervisor non-consuming trace read
+# requires: remotes/hypervisor/write_event
+
+SOURCE_REMOTE_TEST=1
+. $TEST_DIR/remotes/trace.tc
+
+set -e
+setup_remote "hypervisor"
+test_trace
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace_pipe.tc b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace_pipe.tc
new file mode 100644
index 000000000000..66aa1b76c147
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace_pipe.tc
@@ -0,0 +1,11 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test hypervisor consuming trace read
+# requires: remotes/hypervisor/write_event
+
+SOURCE_REMOTE_TEST=1
+. $TEST_DIR/remotes/trace_pipe.tc
+
+set -e
+setup_remote "hypervisor"
+test_trace_pipe
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/unloading.tc b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/unloading.tc
new file mode 100644
index 000000000000..1dafde3414ab
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/unloading.tc
@@ -0,0 +1,11 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test hypervisor trace buffer unloading
+# requires: remotes/hypervisor/write_event
+
+SOURCE_REMOTE_TEST=1
+. $TEST_DIR/remotes/unloading.tc
+
+set -e
+setup_remote "hypervisor"
+test_unloading
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/reset.tc b/tools/testing/selftests/ftrace/test.d/remotes/reset.tc
new file mode 100644
index 000000000000..4d176349b2bc
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/remotes/reset.tc
@@ -0,0 +1,90 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test trace remote reset
+# requires: remotes/test
+
+. $TEST_DIR/remotes/functions
+
+check_reset()
+{
+ write_event_path="write_event"
+ taskset=""
+
+ clear_trace
+
+ # Is the buffer empty?
+ output=$(dump_trace_pipe)
+ test $(wc -l $output | cut -d ' ' -f1) -eq 0
+
+	if pwd | grep -q "per_cpu/cpu"; then
+		write_event_path="../../write_event"
+		cpu_id=$(pwd | sed -e 's/.*per_cpu\/cpu//')
+		taskset="taskset -c $cpu_id"
+	fi
+ rm $output
+
+ # Can we properly write a new event?
+ $taskset echo 7890 > $write_event_path
+ output=$(dump_trace_pipe)
+ test $(wc -l $output | cut -d ' ' -f1) -eq 1
+ grep -q "id=7890" $output
+ rm $output
+}
+
+test_global_interface()
+{
+
+ # Confidence check
+ echo 123456 > write_event
+ output=$(dump_trace_pipe)
+ grep -q "id=123456" $output
+ rm $output
+
+ # Reset single event
+ echo 1 > write_event
+ check_reset
+
+ # Reset lost events
+ for i in $(seq 1 10000); do
+ echo 1 > write_event
+ done
+ check_reset
+}
+
+test_percpu_interface()
+{
+ [ "$(get_cpu_ids | wc -l)" -ge 2 ] || return 0
+
+ for cpu in $(get_cpu_ids); do
+ taskset -c $cpu echo 1 > write_event
+ done
+
+ check_non_empty=0
+ for cpu in $(get_cpu_ids); do
+ cd per_cpu/cpu$cpu/
+
+ if [ $check_non_empty -eq 0 ]; then
+ check_reset
+ check_non_empty=1
+ else
+ # Check we have only reset 1 CPU
+ output=$(dump_trace_pipe)
+ test $(wc -l $output | cut -d ' ' -f1) -eq 1
+ rm $output
+ fi
+ cd -
+ done
+}
+
+test_reset()
+{
+ test_global_interface
+ test_percpu_interface
+}
+
+if [ -z "$SOURCE_REMOTE_TEST" ]; then
+ set -e
+ setup_remote_test
+ test_reset
+fi
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/trace.tc b/tools/testing/selftests/ftrace/test.d/remotes/trace.tc
new file mode 100644
index 000000000000..bc9377a70e8d
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/remotes/trace.tc
@@ -0,0 +1,102 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test trace remote non-consuming read
+# requires: remotes/test
+
+. $TEST_DIR/remotes/functions
+
+test_trace()
+{
+ echo 0 > tracing_on
+ assert_unloaded
+
+ echo 7 > buffer_size_kb
+ echo 1 > tracing_on
+ assert_loaded
+
+	# Simple test: Emit a few events and try to read them
+ for i in $(seq 1 8); do
+ echo $i > write_event
+ done
+
+ check_trace 1 8 trace
+
+ #
+ # Test interaction with consuming read
+ #
+
+ cat trace_pipe > /dev/null &
+ pid=$!
+
+ sleep 1
+ kill $pid
+
+ test $(wc -l < trace) -eq 0
+
+ for i in $(seq 16 32); do
+ echo $i > write_event
+ done
+
+ check_trace 16 32 trace
+
+ #
+ # Test interaction with reset
+ #
+
+ echo 0 > trace
+
+ test $(wc -l < trace) -eq 0
+
+ for i in $(seq 1 8); do
+ echo $i > write_event
+ done
+
+ check_trace 1 8 trace
+
+ #
+ # Test interaction with lost events
+ #
+
+ # Ensure the writer is not on the reader page by reloading the buffer
+ reload_remote
+
+ # Ensure ring-buffer overflow by emitting events from the same CPU
+ for cpu in $(get_cpu_ids); do
+ break
+ done
+
+ events_per_page=$(($(get_page_size) / $(get_selftest_event_size))) # Approx: does not take TS into account
+ nr_events=$(($events_per_page * 2))
+ for i in $(seq 1 $nr_events); do
+ taskset -c $cpu echo $i > write_event
+ done
+
+ id=$(sed -n -e '1s/\[[0-9]*\]\s*[0-9]*.[0-9]*: [a-z]* id=\([0-9]*\)/\1/p' trace)
+ test $id -ne 1
+
+ check_trace $id $nr_events trace
+
+ #
+ # Test per-CPU interface
+ #
+ echo 0 > trace
+
+ for cpu in $(get_cpu_ids) ; do
+ taskset -c $cpu echo $cpu > write_event
+ done
+
+ for cpu in $(get_cpu_ids); do
+ cd per_cpu/cpu$cpu/
+
+ check_trace $cpu $cpu trace
+
+ cd - > /dev/null
+ done
+}
+
+if [ -z "$SOURCE_REMOTE_TEST" ]; then
+ set -e
+
+ setup_remote_test
+ test_trace
+fi
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/trace_pipe.tc b/tools/testing/selftests/ftrace/test.d/remotes/trace_pipe.tc
new file mode 100644
index 000000000000..7f7b7b79c490
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/remotes/trace_pipe.tc
@@ -0,0 +1,102 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test trace remote consuming read
+# requires: remotes/test
+
+. $TEST_DIR/remotes/functions
+
+test_trace_pipe()
+{
+ echo 0 > tracing_on
+ assert_unloaded
+
+ # Emit events from the same CPU
+ for cpu in $(get_cpu_ids); do
+ break
+ done
+
+ #
+	# Simple test: Emit enough events to fill a few pages
+ #
+
+ echo 1024 > buffer_size_kb
+ echo 1 > tracing_on
+ assert_loaded
+
+ events_per_page=$(($(get_page_size) / $(get_selftest_event_size)))
+ nr_events=$(($events_per_page * 4))
+
+ output=$(mktemp $TMPDIR/remote_test.XXXXXX)
+
+ cat trace_pipe > $output &
+ pid=$!
+
+ for i in $(seq 1 $nr_events); do
+ taskset -c $cpu echo $i > write_event
+ done
+
+ echo 0 > tracing_on
+ sleep 1
+ kill $pid
+
+ check_trace 1 $nr_events $output
+
+ rm $output
+
+ #
+ # Test interaction with lost events
+ #
+
+ assert_unloaded
+ echo 7 > buffer_size_kb
+ echo 1 > tracing_on
+ assert_loaded
+
+ nr_events=$((events_per_page * 2))
+ for i in $(seq 1 $nr_events); do
+ taskset -c $cpu echo $i > write_event
+ done
+
+ output=$(dump_trace_pipe)
+
+ lost_events=$(sed -n -e '1s/CPU:.*\[LOST \([0-9]*\) EVENTS\]/\1/p' $output)
+ test -n "$lost_events"
+
+ id=$(sed -n -e '2s/\[[0-9]*\]\s*[0-9]*.[0-9]*: [a-z]* id=\([0-9]*\)/\1/p' $output)
+ test "$id" -eq $(($lost_events + 1))
+
+ # Drop [LOST EVENTS] line
+ sed -i '1d' $output
+
+ check_trace $id $nr_events $output
+
+ rm $output
+
+ #
+ # Test per-CPU interface
+ #
+
+ echo 0 > trace
+ echo 1 > tracing_on
+
+ for cpu in $(get_cpu_ids); do
+ taskset -c $cpu echo $cpu > write_event
+ done
+
+ for cpu in $(get_cpu_ids); do
+ cd per_cpu/cpu$cpu/
+ output=$(dump_trace_pipe)
+
+ check_trace $cpu $cpu $output
+
+ rm $output
+ cd - > /dev/null
+ done
+}
+
+if [ -z "$SOURCE_REMOTE_TEST" ]; then
+ set -e
+
+ setup_remote_test
+ test_trace_pipe
+fi
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/unloading.tc b/tools/testing/selftests/ftrace/test.d/remotes/unloading.tc
new file mode 100644
index 000000000000..cac2190183f6
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/remotes/unloading.tc
@@ -0,0 +1,41 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test trace remote unloading
+# requires: remotes/test
+
+. $TEST_DIR/remotes/functions
+
+test_unloading()
+{
+ # No reader, writing
+ assert_loaded
+
+ # No reader, no writing
+ echo 0 > tracing_on
+ assert_unloaded
+
+ # 1 reader, no writing
+ cat trace_pipe &
+ pid=$!
+ sleep 1
+ assert_loaded
+ kill $pid
+ assert_unloaded
+
+ # No reader, no writing, events
+ echo 1 > tracing_on
+ echo 1 > write_event
+ echo 0 > tracing_on
+ assert_loaded
+
+ # Test reset
+ clear_trace
+ assert_unloaded
+}
+
+if [ -z "$SOURCE_REMOTE_TEST" ]; then
+ set -e
+
+ setup_remote_test
+ test_unloading
+fi
diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 502c99258bd1..8ad649a8e936 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -177,8 +177,9 @@ TEST_GEN_PROGS_arm64 += arm64/vcpu_width_config
TEST_GEN_PROGS_arm64 += arm64/vgic_init
TEST_GEN_PROGS_arm64 += arm64/vgic_irq
TEST_GEN_PROGS_arm64 += arm64/vgic_lpi_stress
+TEST_GEN_PROGS_arm64 += arm64/vgic_v5
TEST_GEN_PROGS_arm64 += arm64/vpmu_counter_access
-TEST_GEN_PROGS_arm64 += arm64/no-vgic-v3
+TEST_GEN_PROGS_arm64 += arm64/no-vgic
TEST_GEN_PROGS_arm64 += arm64/idreg-idst
TEST_GEN_PROGS_arm64 += arm64/kvm-uuid
TEST_GEN_PROGS_arm64 += access_tracking_perf_test
diff --git a/tools/testing/selftests/kvm/arm64/at.c b/tools/testing/selftests/kvm/arm64/at.c
index c8ee6f520734..ce5d312ef6ba 100644
--- a/tools/testing/selftests/kvm/arm64/at.c
+++ b/tools/testing/selftests/kvm/arm64/at.c
@@ -13,7 +13,6 @@
enum {
CLEAR_ACCESS_FLAG,
- TEST_ACCESS_FLAG,
};
static u64 *ptep_hva;
@@ -49,7 +48,6 @@ do { \
GUEST_ASSERT_EQ(FIELD_GET(SYS_PAR_EL1_ATTR, par), MAIR_ATTR_NORMAL); \
GUEST_ASSERT_EQ(FIELD_GET(SYS_PAR_EL1_SH, par), PTE_SHARED >> 8); \
GUEST_ASSERT_EQ(par & SYS_PAR_EL1_PA, TEST_ADDR); \
- GUEST_SYNC(TEST_ACCESS_FLAG); \
} \
} while (0)
@@ -85,10 +83,6 @@ static void guest_code(void)
if (!SYS_FIELD_GET(ID_AA64MMFR1_EL1, HAFDBS, read_sysreg(id_aa64mmfr1_el1)))
GUEST_DONE();
- /*
- * KVM's software PTW makes the implementation choice that the AT
- * instruction sets the access flag.
- */
sysreg_clear_set(tcr_el1, 0, TCR_HA);
isb();
test_at(false);
@@ -102,8 +96,8 @@ static void handle_sync(struct kvm_vcpu *vcpu, struct ucall *uc)
case CLEAR_ACCESS_FLAG:
/*
* Delete + reinstall the memslot to invalidate stage-2
- * mappings of the stage-1 page tables, forcing KVM to
- * use the 'slow' AT emulation path.
+ * mappings of the stage-1 page tables, allowing KVM to
+ * potentially use the 'slow' AT emulation path.
*
* This and clearing the access flag from host userspace
* ensures that the access flag cannot be set speculatively
@@ -112,10 +106,6 @@ static void handle_sync(struct kvm_vcpu *vcpu, struct ucall *uc)
clear_bit(__ffs(PTE_AF), ptep_hva);
vm_mem_region_reload(vcpu->vm, vcpu->vm->memslots[MEM_REGION_PT]);
break;
- case TEST_ACCESS_FLAG:
- TEST_ASSERT(test_bit(__ffs(PTE_AF), ptep_hva),
- "Expected access flag to be set (desc: %lu)", *ptep_hva);
- break;
default:
TEST_FAIL("Unexpected SYNC arg: %lu", uc->args[1]);
}
diff --git a/tools/testing/selftests/kvm/arm64/no-vgic-v3.c b/tools/testing/selftests/kvm/arm64/no-vgic-v3.c
deleted file mode 100644
index 152c34776981..000000000000
--- a/tools/testing/selftests/kvm/arm64/no-vgic-v3.c
+++ /dev/null
@@ -1,177 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-// Check that, on a GICv3 system, not configuring GICv3 correctly
-// results in all of the sysregs generating an UNDEF exception.
-
-#include <test_util.h>
-#include <kvm_util.h>
-#include <processor.h>
-
-static volatile bool handled;
-
-#define __check_sr_read(r) \
- ({ \
- uint64_t val; \
- \
- handled = false; \
- dsb(sy); \
- val = read_sysreg_s(SYS_ ## r); \
- val; \
- })
-
-#define __check_sr_write(r) \
- do { \
- handled = false; \
- dsb(sy); \
- write_sysreg_s(0, SYS_ ## r); \
- isb(); \
- } while(0)
-
-/* Fatal checks */
-#define check_sr_read(r) \
- do { \
- __check_sr_read(r); \
- __GUEST_ASSERT(handled, #r " no read trap"); \
- } while(0)
-
-#define check_sr_write(r) \
- do { \
- __check_sr_write(r); \
- __GUEST_ASSERT(handled, #r " no write trap"); \
- } while(0)
-
-#define check_sr_rw(r) \
- do { \
- check_sr_read(r); \
- check_sr_write(r); \
- } while(0)
-
-static void guest_code(void)
-{
- uint64_t val;
-
- /*
- * Check that we advertise that ID_AA64PFR0_EL1.GIC == 0, having
- * hidden the feature at runtime without any other userspace action.
- */
- __GUEST_ASSERT(FIELD_GET(ID_AA64PFR0_EL1_GIC,
- read_sysreg(id_aa64pfr0_el1)) == 0,
- "GICv3 wrongly advertised");
-
- /*
- * Access all GICv3 registers, and fail if we don't get an UNDEF.
- * Note that we happily access all the APxRn registers without
- * checking their existance, as all we want to see is a failure.
- */
- check_sr_rw(ICC_PMR_EL1);
- check_sr_read(ICC_IAR0_EL1);
- check_sr_write(ICC_EOIR0_EL1);
- check_sr_rw(ICC_HPPIR0_EL1);
- check_sr_rw(ICC_BPR0_EL1);
- check_sr_rw(ICC_AP0R0_EL1);
- check_sr_rw(ICC_AP0R1_EL1);
- check_sr_rw(ICC_AP0R2_EL1);
- check_sr_rw(ICC_AP0R3_EL1);
- check_sr_rw(ICC_AP1R0_EL1);
- check_sr_rw(ICC_AP1R1_EL1);
- check_sr_rw(ICC_AP1R2_EL1);
- check_sr_rw(ICC_AP1R3_EL1);
- check_sr_write(ICC_DIR_EL1);
- check_sr_read(ICC_RPR_EL1);
- check_sr_write(ICC_SGI1R_EL1);
- check_sr_write(ICC_ASGI1R_EL1);
- check_sr_write(ICC_SGI0R_EL1);
- check_sr_read(ICC_IAR1_EL1);
- check_sr_write(ICC_EOIR1_EL1);
- check_sr_rw(ICC_HPPIR1_EL1);
- check_sr_rw(ICC_BPR1_EL1);
- check_sr_rw(ICC_CTLR_EL1);
- check_sr_rw(ICC_IGRPEN0_EL1);
- check_sr_rw(ICC_IGRPEN1_EL1);
-
- /*
- * ICC_SRE_EL1 may not be trappable, as ICC_SRE_EL2.Enable can
- * be RAO/WI. Engage in non-fatal accesses, starting with a
- * write of 0 to try and disable SRE, and let's see if it
- * sticks.
- */
- __check_sr_write(ICC_SRE_EL1);
- if (!handled)
- GUEST_PRINTF("ICC_SRE_EL1 write not trapping (OK)\n");
-
- val = __check_sr_read(ICC_SRE_EL1);
- if (!handled) {
- __GUEST_ASSERT((val & BIT(0)),
- "ICC_SRE_EL1 not trapped but ICC_SRE_EL1.SRE not set\n");
- GUEST_PRINTF("ICC_SRE_EL1 read not trapping (OK)\n");
- }
-
- GUEST_DONE();
-}
-
-static void guest_undef_handler(struct ex_regs *regs)
-{
- /* Success, we've gracefully exploded! */
- handled = true;
- regs->pc += 4;
-}
-
-static void test_run_vcpu(struct kvm_vcpu *vcpu)
-{
- struct ucall uc;
-
- do {
- vcpu_run(vcpu);
-
- switch (get_ucall(vcpu, &uc)) {
- case UCALL_ABORT:
- REPORT_GUEST_ASSERT(uc);
- break;
- case UCALL_PRINTF:
- printf("%s", uc.buffer);
- break;
- case UCALL_DONE:
- break;
- default:
- TEST_FAIL("Unknown ucall %lu", uc.cmd);
- }
- } while (uc.cmd != UCALL_DONE);
-}
-
-static void test_guest_no_gicv3(void)
-{
- struct kvm_vcpu *vcpu;
- struct kvm_vm *vm;
-
- /* Create a VM without a GICv3 */
- vm = vm_create_with_one_vcpu(&vcpu, guest_code);
-
- vm_init_descriptor_tables(vm);
- vcpu_init_descriptor_tables(vcpu);
-
- vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT,
- ESR_ELx_EC_UNKNOWN, guest_undef_handler);
-
- test_run_vcpu(vcpu);
-
- kvm_vm_free(vm);
-}
-
-int main(int argc, char *argv[])
-{
- struct kvm_vcpu *vcpu;
- struct kvm_vm *vm;
- uint64_t pfr0;
-
- test_disable_default_vgic();
-
- vm = vm_create_with_one_vcpu(&vcpu, NULL);
- pfr0 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1));
- __TEST_REQUIRE(FIELD_GET(ID_AA64PFR0_EL1_GIC, pfr0),
- "GICv3 not supported.");
- kvm_vm_free(vm);
-
- test_guest_no_gicv3();
-
- return 0;
-}
diff --git a/tools/testing/selftests/kvm/arm64/no-vgic.c b/tools/testing/selftests/kvm/arm64/no-vgic.c
new file mode 100644
index 000000000000..b14686ef17d1
--- /dev/null
+++ b/tools/testing/selftests/kvm/arm64/no-vgic.c
@@ -0,0 +1,297 @@
+// SPDX-License-Identifier: GPL-2.0
+
+// Check that, on a GICv3-capable system (GICv3 native, or GICv5 with
+// FEAT_GCIE_LEGACY), not configuring GICv3 correctly results in all
+// of the sysregs generating an UNDEF exception. Do the same for GICv5
+// on a GICv5 host.
+
+#include <test_util.h>
+#include <kvm_util.h>
+#include <processor.h>
+
+#include <arm64/gic_v5.h>
+
+static volatile bool handled;
+
+#define __check_sr_read(r) \
+ ({ \
+ uint64_t val; \
+ \
+ handled = false; \
+ dsb(sy); \
+ val = read_sysreg_s(SYS_ ## r); \
+ val; \
+ })
+
+#define __check_sr_write(r) \
+ do { \
+ handled = false; \
+ dsb(sy); \
+ write_sysreg_s(0, SYS_ ## r); \
+ isb(); \
+ } while (0)
+
+#define __check_gicv5_gicr_op(r) \
+ ({ \
+ uint64_t val; \
+ \
+ handled = false; \
+ dsb(sy); \
+ val = read_sysreg_s(GICV5_OP_GICR_ ## r); \
+ val; \
+ })
+
+#define __check_gicv5_gic_op(r) \
+ do { \
+ handled = false; \
+ dsb(sy); \
+ write_sysreg_s(0, GICV5_OP_GIC_ ## r); \
+ isb(); \
+ } while (0)
+
+/* Fatal checks */
+#define check_sr_read(r) \
+ do { \
+ __check_sr_read(r); \
+ __GUEST_ASSERT(handled, #r " no read trap"); \
+ } while (0)
+
+#define check_sr_write(r) \
+ do { \
+ __check_sr_write(r); \
+ __GUEST_ASSERT(handled, #r " no write trap"); \
+ } while (0)
+
+#define check_sr_rw(r) \
+ do { \
+ check_sr_read(r); \
+ check_sr_write(r); \
+ } while (0)
+
+#define check_gicv5_gicr_op(r) \
+ do { \
+ __check_gicv5_gicr_op(r); \
+ __GUEST_ASSERT(handled, #r " no read trap"); \
+ } while (0)
+
+#define check_gicv5_gic_op(r) \
+ do { \
+ __check_gicv5_gic_op(r); \
+ __GUEST_ASSERT(handled, #r " no write trap"); \
+ } while (0)
+
+static void guest_code_gicv3(void)
+{
+ uint64_t val;
+
+ /*
+ * Check that we advertise that ID_AA64PFR0_EL1.GIC == 0, having
+ * hidden the feature at runtime without any other userspace action.
+ */
+ __GUEST_ASSERT(FIELD_GET(ID_AA64PFR0_EL1_GIC,
+ read_sysreg(id_aa64pfr0_el1)) == 0,
+ "GICv3 wrongly advertised");
+
+ /*
+ * Access all GICv3 registers, and fail if we don't get an UNDEF.
+ * Note that we happily access all the APxRn registers without
+ * checking their existence, as all we want to see is a failure.
+ */
+ check_sr_rw(ICC_PMR_EL1);
+ check_sr_read(ICC_IAR0_EL1);
+ check_sr_write(ICC_EOIR0_EL1);
+ check_sr_rw(ICC_HPPIR0_EL1);
+ check_sr_rw(ICC_BPR0_EL1);
+ check_sr_rw(ICC_AP0R0_EL1);
+ check_sr_rw(ICC_AP0R1_EL1);
+ check_sr_rw(ICC_AP0R2_EL1);
+ check_sr_rw(ICC_AP0R3_EL1);
+ check_sr_rw(ICC_AP1R0_EL1);
+ check_sr_rw(ICC_AP1R1_EL1);
+ check_sr_rw(ICC_AP1R2_EL1);
+ check_sr_rw(ICC_AP1R3_EL1);
+ check_sr_write(ICC_DIR_EL1);
+ check_sr_read(ICC_RPR_EL1);
+ check_sr_write(ICC_SGI1R_EL1);
+ check_sr_write(ICC_ASGI1R_EL1);
+ check_sr_write(ICC_SGI0R_EL1);
+ check_sr_read(ICC_IAR1_EL1);
+ check_sr_write(ICC_EOIR1_EL1);
+ check_sr_rw(ICC_HPPIR1_EL1);
+ check_sr_rw(ICC_BPR1_EL1);
+ check_sr_rw(ICC_CTLR_EL1);
+ check_sr_rw(ICC_IGRPEN0_EL1);
+ check_sr_rw(ICC_IGRPEN1_EL1);
+
+ /*
+ * ICC_SRE_EL1 may not be trappable, as ICC_SRE_EL2.Enable can
+ * be RAO/WI. Engage in non-fatal accesses, starting with a
+ * write of 0 to try and disable SRE, and let's see if it
+ * sticks.
+ */
+ __check_sr_write(ICC_SRE_EL1);
+ if (!handled)
+ GUEST_PRINTF("ICC_SRE_EL1 write not trapping (OK)\n");
+
+ val = __check_sr_read(ICC_SRE_EL1);
+ if (!handled) {
+ __GUEST_ASSERT((val & BIT(0)),
+ "ICC_SRE_EL1 not trapped but ICC_SRE_EL1.SRE not set\n");
+ GUEST_PRINTF("ICC_SRE_EL1 read not trapping (OK)\n");
+ }
+
+ GUEST_DONE();
+}
+
+static void guest_code_gicv5(void)
+{
+ /*
+ * Check that we advertise that ID_AA64PFR2_EL1.GCIE == 0, having
+ * hidden the feature at runtime without any other userspace action.
+ */
+ __GUEST_ASSERT(FIELD_GET(ID_AA64PFR2_EL1_GCIE,
+ read_sysreg_s(SYS_ID_AA64PFR2_EL1)) == 0,
+ "GICv5 wrongly advertised");
+
+ /*
+ * Try all GICv5 instructions, and fail if we don't get an UNDEF.
+ */
+ check_gicv5_gic_op(CDAFF);
+ check_gicv5_gic_op(CDDI);
+ check_gicv5_gic_op(CDDIS);
+ check_gicv5_gic_op(CDEOI);
+ check_gicv5_gic_op(CDHM);
+ check_gicv5_gic_op(CDPEND);
+ check_gicv5_gic_op(CDPRI);
+ check_gicv5_gic_op(CDRCFG);
+ check_gicv5_gicr_op(CDIA);
+ check_gicv5_gicr_op(CDNMIA);
+
+	/* Check General System Register accesses */
+ check_sr_rw(ICC_APR_EL1);
+ check_sr_rw(ICC_CR0_EL1);
+ check_sr_read(ICC_HPPIR_EL1);
+ check_sr_read(ICC_IAFFIDR_EL1);
+ check_sr_rw(ICC_ICSR_EL1);
+ check_sr_read(ICC_IDR0_EL1);
+ check_sr_rw(ICC_PCR_EL1);
+
+	/* Check PPI System Register accesses */
+ check_sr_rw(ICC_PPI_CACTIVER0_EL1);
+ check_sr_rw(ICC_PPI_CACTIVER1_EL1);
+ check_sr_rw(ICC_PPI_SACTIVER0_EL1);
+ check_sr_rw(ICC_PPI_SACTIVER1_EL1);
+ check_sr_rw(ICC_PPI_CPENDR0_EL1);
+ check_sr_rw(ICC_PPI_CPENDR1_EL1);
+ check_sr_rw(ICC_PPI_SPENDR0_EL1);
+ check_sr_rw(ICC_PPI_SPENDR1_EL1);
+ check_sr_rw(ICC_PPI_ENABLER0_EL1);
+ check_sr_rw(ICC_PPI_ENABLER1_EL1);
+ check_sr_read(ICC_PPI_HMR0_EL1);
+ check_sr_read(ICC_PPI_HMR1_EL1);
+ check_sr_rw(ICC_PPI_PRIORITYR0_EL1);
+ check_sr_rw(ICC_PPI_PRIORITYR1_EL1);
+ check_sr_rw(ICC_PPI_PRIORITYR2_EL1);
+ check_sr_rw(ICC_PPI_PRIORITYR3_EL1);
+ check_sr_rw(ICC_PPI_PRIORITYR4_EL1);
+ check_sr_rw(ICC_PPI_PRIORITYR5_EL1);
+ check_sr_rw(ICC_PPI_PRIORITYR6_EL1);
+ check_sr_rw(ICC_PPI_PRIORITYR7_EL1);
+ check_sr_rw(ICC_PPI_PRIORITYR8_EL1);
+ check_sr_rw(ICC_PPI_PRIORITYR9_EL1);
+ check_sr_rw(ICC_PPI_PRIORITYR10_EL1);
+ check_sr_rw(ICC_PPI_PRIORITYR11_EL1);
+ check_sr_rw(ICC_PPI_PRIORITYR12_EL1);
+ check_sr_rw(ICC_PPI_PRIORITYR13_EL1);
+ check_sr_rw(ICC_PPI_PRIORITYR14_EL1);
+ check_sr_rw(ICC_PPI_PRIORITYR15_EL1);
+
+ GUEST_DONE();
+}
+
+static void guest_undef_handler(struct ex_regs *regs)
+{
+ /* Success, we've gracefully exploded! */
+ handled = true;
+ regs->pc += 4;
+}
+
+static void test_run_vcpu(struct kvm_vcpu *vcpu)
+{
+ struct ucall uc;
+
+ do {
+ vcpu_run(vcpu);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ break;
+ case UCALL_PRINTF:
+ printf("%s", uc.buffer);
+ break;
+ case UCALL_DONE:
+ break;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+ } while (uc.cmd != UCALL_DONE);
+}
+
+static void test_guest_no_vgic(void *guest_code)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+
+ /* Create a VM without a GIC */
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vcpu);
+
+ vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT,
+ ESR_ELx_EC_UNKNOWN, guest_undef_handler);
+
+ test_run_vcpu(vcpu);
+
+ kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ bool has_v3, has_v5;
+ uint64_t pfr;
+
+ test_disable_default_vgic();
+
+ vm = vm_create_with_one_vcpu(&vcpu, NULL);
+
+ pfr = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1));
+ has_v3 = !!FIELD_GET(ID_AA64PFR0_EL1_GIC, pfr);
+
+ pfr = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR2_EL1));
+ has_v5 = !!FIELD_GET(ID_AA64PFR2_EL1_GCIE, pfr);
+
+ kvm_vm_free(vm);
+
+ __TEST_REQUIRE(has_v3 || has_v5,
+ "Neither GICv3 nor GICv5 supported.");
+
+ if (has_v3) {
+ pr_info("Testing no-vgic-v3\n");
+ test_guest_no_vgic(guest_code_gicv3);
+ } else {
+ pr_info("No GICv3 support: skipping no-vgic-v3 test\n");
+ }
+
+ if (has_v5) {
+ pr_info("Testing no-vgic-v5\n");
+ test_guest_no_vgic(guest_code_gicv5);
+ } else {
+ pr_info("No GICv5 support: skipping no-vgic-v5 test\n");
+ }
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/arm64/set_id_regs.c b/tools/testing/selftests/kvm/arm64/set_id_regs.c
index 73de5be58bab..7899d557c70b 100644
--- a/tools/testing/selftests/kvm/arm64/set_id_regs.c
+++ b/tools/testing/selftests/kvm/arm64/set_id_regs.c
@@ -37,6 +37,9 @@ struct reg_ftr_bits {
* For FTR_LOWER_SAFE, safe_val is used as the minimal safe value.
*/
int64_t safe_val;
+
+ /* Allowed to be changed by the host after run */
+ bool mutable;
};
struct test_feature_reg {
@@ -44,7 +47,7 @@ struct test_feature_reg {
const struct reg_ftr_bits *ftr_bits;
};
-#define __REG_FTR_BITS(NAME, SIGNED, TYPE, SHIFT, MASK, SAFE_VAL) \
+#define __REG_FTR_BITS(NAME, SIGNED, TYPE, SHIFT, MASK, SAFE_VAL, MUT) \
{ \
.name = #NAME, \
.sign = SIGNED, \
@@ -52,15 +55,20 @@ struct test_feature_reg {
.shift = SHIFT, \
.mask = MASK, \
.safe_val = SAFE_VAL, \
+ .mutable = MUT, \
}
#define REG_FTR_BITS(type, reg, field, safe_val) \
__REG_FTR_BITS(reg##_##field, FTR_UNSIGNED, type, reg##_##field##_SHIFT, \
- reg##_##field##_MASK, safe_val)
+ reg##_##field##_MASK, safe_val, false)
+
+#define REG_FTR_BITS_MUTABLE(type, reg, field, safe_val) \
+ __REG_FTR_BITS(reg##_##field, FTR_UNSIGNED, type, reg##_##field##_SHIFT, \
+ reg##_##field##_MASK, safe_val, true)
#define S_REG_FTR_BITS(type, reg, field, safe_val) \
__REG_FTR_BITS(reg##_##field, FTR_SIGNED, type, reg##_##field##_SHIFT, \
- reg##_##field##_MASK, safe_val)
+ reg##_##field##_MASK, safe_val, false)
#define REG_FTR_END \
{ \
@@ -134,7 +142,8 @@ static const struct reg_ftr_bits ftr_id_aa64pfr0_el1[] = {
REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, CSV2, 0),
REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, DIT, 0),
REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, SEL2, 0),
- REG_FTR_BITS(FTR_EXACT, ID_AA64PFR0_EL1, GIC, 0),
+ /* GICv3 support will be forced at run time if available */
+ REG_FTR_BITS_MUTABLE(FTR_EXACT, ID_AA64PFR0_EL1, GIC, 0),
REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL3, 1),
REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL2, 1),
REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL1, 1),
@@ -634,12 +643,38 @@ static void test_user_set_mte_reg(struct kvm_vcpu *vcpu)
ksft_test_result_pass("ID_AA64PFR1_EL1.MTE_frac no longer 0xF\n");
}
+static uint64_t reset_mutable_bits(uint32_t id, uint64_t val)
+{
+ struct test_feature_reg *reg = NULL;
+
+ for (int i = 0; i < ARRAY_SIZE(test_regs); i++) {
+ if (test_regs[i].reg == id) {
+ reg = &test_regs[i];
+ break;
+ }
+ }
+
+ if (!reg)
+ return val;
+
+ for (const struct reg_ftr_bits *bits = reg->ftr_bits; bits->type != FTR_END; bits++) {
+ if (bits->mutable) {
+ val &= ~bits->mask;
+ val |= bits->safe_val << bits->shift;
+ }
+ }
+
+ return val;
+}
+
static void test_guest_reg_read(struct kvm_vcpu *vcpu)
{
bool done = false;
struct ucall uc;
while (!done) {
+ uint64_t val;
+
vcpu_run(vcpu);
switch (get_ucall(vcpu, &uc)) {
@@ -647,9 +682,11 @@ static void test_guest_reg_read(struct kvm_vcpu *vcpu)
REPORT_GUEST_ASSERT(uc);
break;
case UCALL_SYNC:
+ val = test_reg_vals[encoding_to_range_idx(uc.args[2])];
+ val = reset_mutable_bits(uc.args[2], val);
+
/* Make sure the written values are seen by guest */
- TEST_ASSERT_EQ(test_reg_vals[encoding_to_range_idx(uc.args[2])],
- uc.args[3]);
+ TEST_ASSERT_EQ(val, reset_mutable_bits(uc.args[2], uc.args[3]));
break;
case UCALL_DONE:
done = true;
@@ -740,7 +777,8 @@ static void test_assert_id_reg_unchanged(struct kvm_vcpu *vcpu, uint32_t encodin
uint64_t observed;
observed = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(encoding));
- TEST_ASSERT_EQ(test_reg_vals[idx], observed);
+ TEST_ASSERT_EQ(reset_mutable_bits(encoding, test_reg_vals[idx]),
+ reset_mutable_bits(encoding, observed));
}
static void test_reset_preserves_id_regs(struct kvm_vcpu *vcpu)
diff --git a/tools/testing/selftests/kvm/arm64/vgic_v5.c b/tools/testing/selftests/kvm/arm64/vgic_v5.c
new file mode 100644
index 000000000000..3ce6cf37a629
--- /dev/null
+++ b/tools/testing/selftests/kvm/arm64/vgic_v5.c
@@ -0,0 +1,228 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/kernel.h>
+#include <sys/syscall.h>
+#include <asm/kvm.h>
+#include <asm/kvm_para.h>
+
+#include <arm64/gic_v5.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vgic.h"
+
+#define NR_VCPUS 1
+
+struct vm_gic {
+ struct kvm_vm *vm;
+ int gic_fd;
+ uint32_t gic_dev_type;
+};
+
+static uint64_t max_phys_size;
+
+#define GUEST_CMD_IRQ_CDIA 10
+#define GUEST_CMD_IRQ_DIEOI 11
+#define GUEST_CMD_IS_AWAKE 12
+#define GUEST_CMD_IS_READY 13
+
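+/*
+ * IRQ handler exercising the GICv5 acknowledge flow: CDIA acknowledges
+ * the highest-priority pending interrupt, gsb_ack() orders the
+ * acknowledge against subsequent accesses, then CDDI deactivates the
+ * interrupt and CDEOI signals end-of-interrupt.
+ */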
+static void guest_irq_handler(struct ex_regs *regs)
+{
+ bool valid;
+ u32 hwirq;
+ u64 ia;
+ static int count;
+
+ /*
+ * We have pending interrupts. Should never actually enter WFI
+ * here!
+ */
+ wfi();
+ GUEST_SYNC(GUEST_CMD_IS_AWAKE);
+
+ ia = gicr_insn(CDIA);
+ valid = GICV5_GICR_CDIA_VALID(ia);
+
+ GUEST_SYNC(GUEST_CMD_IRQ_CDIA);
+
+ if (!valid)
+ return;
+
+ gsb_ack();
+ isb();
+
+ hwirq = FIELD_GET(GICV5_GICR_CDIA_INTID, ia);
+
+ gic_insn(hwirq, CDDI);
+ gic_insn(0, CDEOI);
+
+ GUEST_SYNC(GUEST_CMD_IRQ_DIEOI);
+
+ if (++count >= 2)
+ GUEST_DONE();
+
+ /* Ask for the next interrupt to be injected */
+ GUEST_SYNC(GUEST_CMD_IS_READY);
+}
+
+static void guest_code(void)
+{
+ local_irq_disable();
+
+ gicv5_cpu_enable_interrupts();
+ local_irq_enable();
+
+ /* Enable the SW_PPI (3) */
+ write_sysreg_s(BIT_ULL(3), SYS_ICC_PPI_ENABLER0_EL1);
+
+ /* Ask for the first interrupt to be injected */
+ GUEST_SYNC(GUEST_CMD_IS_READY);
+
+ /* Loop forever waiting for interrupts */
+ while (1);
+}
+
+/* We don't want to assert on vcpu_run() failure, hence this helper */
+static int run_vcpu(struct kvm_vcpu *vcpu)
+{
+ return __vcpu_run(vcpu) ? -errno : 0;
+}
+
+static void vm_gic_destroy(struct vm_gic *v)
+{
+ close(v->gic_fd);
+ kvm_vm_free(v->vm);
+}
+
+static void test_vgic_v5_ppis(uint32_t gic_dev_type)
+{
+ struct kvm_vcpu *vcpus[NR_VCPUS];
+ struct ucall uc;
+ u64 user_ppis[2];
+ struct vm_gic v;
+ int ret, i;
+
+ v.gic_dev_type = gic_dev_type;
+ v.vm = __vm_create(VM_SHAPE_DEFAULT, NR_VCPUS, 0);
+
+ v.gic_fd = kvm_create_device(v.vm, gic_dev_type);
+
+ for (i = 0; i < NR_VCPUS; i++)
+ vcpus[i] = vm_vcpu_add(v.vm, i, guest_code);
+
+ vm_init_descriptor_tables(v.vm);
+ vm_install_exception_handler(v.vm, VECTOR_IRQ_CURRENT, guest_irq_handler);
+
+ for (i = 0; i < NR_VCPUS; i++)
+ vcpu_init_descriptor_tables(vcpus[i]);
+
+ kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL,
+ KVM_DEV_ARM_VGIC_CTRL_INIT, NULL);
+
+ /* Read out the PPIs that user space is allowed to drive. */
+ kvm_device_attr_get(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL,
+ KVM_DEV_ARM_VGIC_USERSPACE_PPIS, &user_ppis);
+
+ /* We should always be able to drive the SW_PPI. */
+ TEST_ASSERT(user_ppis[0] & BIT(GICV5_ARCH_PPI_SW_PPI),
+ "SW_PPI is not drivable by userspace");
+
+ while (1) {
+ ret = run_vcpu(vcpus[0]);
+
+ switch (get_ucall(vcpus[0], &uc)) {
+ case UCALL_SYNC:
+ /*
+ * The guest is ready for the next level change. Set
+ * high if ready, and lower if it has been consumed.
+ */
+ if (uc.args[1] == GUEST_CMD_IS_READY ||
+ uc.args[1] == GUEST_CMD_IRQ_DIEOI) {
+ u64 irq;
+ bool level = uc.args[1] == GUEST_CMD_IRQ_DIEOI ? 0 : 1;
+
+ irq = FIELD_PREP(KVM_ARM_IRQ_NUM_MASK, 3);
+ irq |= KVM_ARM_IRQ_TYPE_PPI << KVM_ARM_IRQ_TYPE_SHIFT;
+
+ _kvm_irq_line(v.vm, irq, level);
+ } else if (uc.args[1] == GUEST_CMD_IS_AWAKE) {
+ pr_info("Guest skipping WFI due to pending IRQ\n");
+ } else if (uc.args[1] == GUEST_CMD_IRQ_CDIA) {
+ pr_info("Guest acknowledged IRQ\n");
+ }
+
+ continue;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ break;
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+ }
+
+done:
+ TEST_ASSERT(ret == 0, "Failed to test GICv5 PPIs");
+
+ vm_gic_destroy(&v);
+}
+
+/*
+ * Returns 0 if it's possible to create a GIC device of the given type (V5).
+ */
+int test_kvm_device(uint32_t gic_dev_type)
+{
+ struct kvm_vcpu *vcpus[NR_VCPUS];
+ struct vm_gic v;
+ int ret;
+
+ v.vm = vm_create_with_vcpus(NR_VCPUS, guest_code, vcpus);
+
+	/* Try to create a non-existent KVM device */
+ ret = __kvm_test_create_device(v.vm, 0);
+ TEST_ASSERT(ret && errno == ENODEV, "unsupported device");
+
+ /* trial mode */
+ ret = __kvm_test_create_device(v.vm, gic_dev_type);
+ if (ret)
+ return ret;
+ v.gic_fd = kvm_create_device(v.vm, gic_dev_type);
+
+ ret = __kvm_create_device(v.vm, gic_dev_type);
+ TEST_ASSERT(ret < 0 && errno == EEXIST, "create GIC device twice");
+
+ vm_gic_destroy(&v);
+
+ return 0;
+}
+
+void run_tests(uint32_t gic_dev_type)
+{
+ pr_info("Test VGICv5 PPIs\n");
+ test_vgic_v5_ppis(gic_dev_type);
+}
+
+int main(int ac, char **av)
+{
+ int ret;
+ int pa_bits;
+
+ test_disable_default_vgic();
+
+ pa_bits = vm_guest_mode_params[VM_MODE_DEFAULT].pa_bits;
+ max_phys_size = 1ULL << pa_bits;
+
+ ret = test_kvm_device(KVM_DEV_TYPE_ARM_VGIC_V5);
+ if (ret) {
+		pr_info("No GICv5 support; not running GICv5 tests.\n");
+ exit(KSFT_SKIP);
+ }
+
+ pr_info("Running VGIC_V5 tests.\n");
+ run_tests(KVM_DEV_TYPE_ARM_VGIC_V5);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/include/arm64/gic_v5.h b/tools/testing/selftests/kvm/include/arm64/gic_v5.h
new file mode 100644
index 000000000000..eb523d9277cf
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/arm64/gic_v5.h
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __SELFTESTS_GIC_V5_H
+#define __SELFTESTS_GIC_V5_H
+
+#include <asm/barrier.h>
+#include <asm/sysreg.h>
+
+#include <linux/bitfield.h>
+
+#include "processor.h"
+
+/*
+ * Definitions for GICv5 instructions for the Current Domain
+ */
+#define GICV5_OP_GIC_CDAFF sys_insn(1, 0, 12, 1, 3)
+#define GICV5_OP_GIC_CDDI sys_insn(1, 0, 12, 2, 0)
+#define GICV5_OP_GIC_CDDIS sys_insn(1, 0, 12, 1, 0)
+#define GICV5_OP_GIC_CDHM sys_insn(1, 0, 12, 2, 1)
+#define GICV5_OP_GIC_CDEN sys_insn(1, 0, 12, 1, 1)
+#define GICV5_OP_GIC_CDEOI sys_insn(1, 0, 12, 1, 7)
+#define GICV5_OP_GIC_CDPEND sys_insn(1, 0, 12, 1, 4)
+#define GICV5_OP_GIC_CDPRI sys_insn(1, 0, 12, 1, 2)
+#define GICV5_OP_GIC_CDRCFG sys_insn(1, 0, 12, 1, 5)
+#define GICV5_OP_GICR_CDIA sys_insn(1, 0, 12, 3, 0)
+#define GICV5_OP_GICR_CDNMIA sys_insn(1, 0, 12, 3, 1)
+
+/* Definitions for GIC CDAFF */
+#define GICV5_GIC_CDAFF_IAFFID_MASK GENMASK_ULL(47, 32)
+#define GICV5_GIC_CDAFF_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDAFF_IRM_MASK BIT_ULL(28)
+#define GICV5_GIC_CDAFF_ID_MASK GENMASK_ULL(23, 0)
+
+/* Definitions for GIC CDDI */
+#define GICV5_GIC_CDDI_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDDI_ID_MASK GENMASK_ULL(23, 0)
+
+/* Definitions for GIC CDDIS */
+#define GICV5_GIC_CDDIS_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDDIS_TYPE(r) FIELD_GET(GICV5_GIC_CDDIS_TYPE_MASK, r)
+#define GICV5_GIC_CDDIS_ID_MASK GENMASK_ULL(23, 0)
+#define GICV5_GIC_CDDIS_ID(r) FIELD_GET(GICV5_GIC_CDDIS_ID_MASK, r)
+
+/* Definitions for GIC CDEN */
+#define GICV5_GIC_CDEN_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDEN_ID_MASK GENMASK_ULL(23, 0)
+
+/* Definitions for GIC CDHM */
+#define GICV5_GIC_CDHM_HM_MASK BIT_ULL(32)
+#define GICV5_GIC_CDHM_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDHM_ID_MASK GENMASK_ULL(23, 0)
+
+/* Definitions for GIC CDPEND */
+#define GICV5_GIC_CDPEND_PENDING_MASK BIT_ULL(32)
+#define GICV5_GIC_CDPEND_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDPEND_ID_MASK GENMASK_ULL(23, 0)
+
+/* Definitions for GIC CDPRI */
+#define GICV5_GIC_CDPRI_PRIORITY_MASK GENMASK_ULL(39, 35)
+#define GICV5_GIC_CDPRI_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDPRI_ID_MASK GENMASK_ULL(23, 0)
+
+/* Definitions for GIC CDRCFG */
+#define GICV5_GIC_CDRCFG_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDRCFG_ID_MASK GENMASK_ULL(23, 0)
+
+/* Definitions for GICR CDIA */
+#define GICV5_GICR_CDIA_VALID_MASK BIT_ULL(32)
+#define GICV5_GICR_CDIA_VALID(r) FIELD_GET(GICV5_GICR_CDIA_VALID_MASK, r)
+#define GICV5_GICR_CDIA_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GICR_CDIA_ID_MASK GENMASK_ULL(23, 0)
+#define GICV5_GICR_CDIA_INTID GENMASK_ULL(31, 0)
+
+/* Definitions for GICR CDNMIA */
+#define GICV5_GICR_CDNMIA_VALID_MASK BIT_ULL(32)
+#define GICV5_GICR_CDNMIA_VALID(r) FIELD_GET(GICV5_GICR_CDNMIA_VALID_MASK, r)
+#define GICV5_GICR_CDNMIA_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GICR_CDNMIA_ID_MASK GENMASK_ULL(23, 0)
+
+#define gicr_insn(insn) read_sysreg_s(GICV5_OP_GICR_##insn)
+#define gic_insn(v, insn) write_sysreg_s(v, GICV5_OP_GIC_##insn)
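+
+/*
+ * Example (mirroring the vgic_v5 selftest): "ia = gicr_insn(CDIA);"
+ * acknowledges the highest-priority pending interrupt, while
+ * "gic_insn(intid, CDDI);" deactivates the given INTID.
+ */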
+
+#define __GIC_BARRIER_INSN(op0, op1, CRn, CRm, op2, Rt) \
+ __emit_inst(0xd5000000 | \
+ sys_insn((op0), (op1), (CRn), (CRm), (op2)) | \
+ ((Rt) & 0x1f))
+
+#define GSB_SYS_BARRIER_INSN __GIC_BARRIER_INSN(1, 0, 12, 0, 0, 31)
+#define GSB_ACK_BARRIER_INSN __GIC_BARRIER_INSN(1, 0, 12, 0, 1, 31)
+
+#define gsb_ack() asm volatile(GSB_ACK_BARRIER_INSN : : : "memory")
+#define gsb_sys() asm volatile(GSB_SYS_BARRIER_INSN : : : "memory")
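+
+/*
+ * Both barriers are SYS instructions with Rt = 31 (XZR), emitted via
+ * __emit_inst(), presumably so the header builds with toolchains that
+ * do not know the GICv5 barrier mnemonics.
+ */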
+
+#define REPEAT_BYTE(x) ((~0ul / 0xff) * (x))
+
+#define GICV5_IRQ_DEFAULT_PRI 0b10000
+
+#define GICV5_ARCH_PPI_SW_PPI 0x3
+
+void gicv5_ppi_priority_init(void)
+{
+ write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR0_EL1);
+ write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR1_EL1);
+ write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR2_EL1);
+ write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR3_EL1);
+ write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR4_EL1);
+ write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR5_EL1);
+ write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR6_EL1);
+ write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR7_EL1);
+ write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR8_EL1);
+ write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR9_EL1);
+ write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR10_EL1);
+ write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR11_EL1);
+ write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR12_EL1);
+ write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR13_EL1);
+ write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR14_EL1);
+ write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR15_EL1);
+
+	/*
+	 * Context synchronization is required to make sure the effects of
+	 * the system register writes above are visible.
+	 */
+ isb();
+}
+
+void gicv5_cpu_disable_interrupts(void)
+{
+ u64 cr0;
+
+ cr0 = FIELD_PREP(ICC_CR0_EL1_EN, 0);
+ write_sysreg_s(cr0, SYS_ICC_CR0_EL1);
+}
+
+void gicv5_cpu_enable_interrupts(void)
+{
+ u64 cr0, pcr;
+
+ write_sysreg_s(0, SYS_ICC_PPI_ENABLER0_EL1);
+ write_sysreg_s(0, SYS_ICC_PPI_ENABLER1_EL1);
+
+ gicv5_ppi_priority_init();
+
+ pcr = FIELD_PREP(ICC_PCR_EL1_PRIORITY, GICV5_IRQ_DEFAULT_PRI);
+ write_sysreg_s(pcr, SYS_ICC_PCR_EL1);
+
+ cr0 = FIELD_PREP(ICC_CR0_EL1_EN, 1);
+ write_sysreg_s(cr0, SYS_ICC_CR0_EL1);
+}
+
+#endif