summaryrefslogtreecommitdiff
path: root/arch/x86/kvm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-03-24 21:58:57 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2022-03-24 21:58:57 +0300
commit1ebdbeb03efe89f01f15df038a589077df3d21f5 (patch)
tree06b6b7bb565668d136c060c5104481e48cbf71e2 /arch/x86/kvm
parentefee6c79298fd823c569d501d041de85caa102a6 (diff)
parentc9b8fecddb5bb4b67e351bbaeaa648a6f7456912 (diff)
downloadlinux-1ebdbeb03efe89f01f15df038a589077df3d21f5.tar.xz
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Paolo Bonzini: "ARM: - Proper emulation of the OSLock feature of the debug architecture - Scalibility improvements for the MMU lock when dirty logging is on - New VMID allocator, which will eventually help with SVA in VMs - Better support for PMUs in heterogenous systems - PSCI 1.1 support, enabling support for SYSTEM_RESET2 - Implement CONFIG_DEBUG_LIST at EL2 - Make CONFIG_ARM64_ERRATUM_2077057 default y - Reduce the overhead of VM exit when no interrupt is pending - Remove traces of 32bit ARM host support from the documentation - Updated vgic selftests - Various cleanups, doc updates and spelling fixes RISC-V: - Prevent KVM_COMPAT from being selected - Optimize __kvm_riscv_switch_to() implementation - RISC-V SBI v0.3 support s390: - memop selftest - fix SCK locking - adapter interruptions virtualization for secure guests - add Claudio Imbrenda as maintainer - first step to do proper storage key checking x86: - Continue switching kvm_x86_ops to static_call(); introduce static_call_cond() and __static_call_ret0 when applicable. - Cleanup unused arguments in several functions - Synthesize AMD 0x80000021 leaf - Fixes and optimization for Hyper-V sparse-bank hypercalls - Implement Hyper-V's enlightened MSR bitmap for nested SVM - Remove MMU auditing - Eager splitting of page tables (new aka "TDP" MMU only) when dirty page tracking is enabled - Cleanup the implementation of the guest PGD cache - Preparation for the implementation of Intel IPI virtualization - Fix some segment descriptor checks in the emulator - Allow AMD AVIC support on systems with physical APIC ID above 255 - Better API to disable virtualization quirks - Fixes and optimizations for the zapping of page tables: - Zap roots in two passes, avoiding RCU read-side critical sections that last too long for very large guests backed by 4 KiB SPTEs. - Zap invalid and defunct roots asynchronously via concurrency-managed work queue. - Allowing yielding when zapping TDP MMU roots in response to the root's last reference being put. - Batch more TLB flushes with an RCU trick. Whoever frees the paging structure now holds RCU as a proxy for all vCPUs running in the guest, i.e. to prolongs the grace period on their behalf. It then kicks the the vCPUs out of guest mode before doing rcu_read_unlock(). Generic: - Introduce __vcalloc and use it for very large allocations that need memcg accounting" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (246 commits) KVM: use kvcalloc for array allocations KVM: x86: Introduce KVM_CAP_DISABLE_QUIRKS2 kvm: x86: Require const tsc for RT KVM: x86: synthesize CPUID leaf 0x80000021h if useful KVM: x86: add support for CPUID leaf 0x80000021 KVM: x86: do not use KVM_X86_OP_OPTIONAL_RET0 for get_mt_mask Revert "KVM: x86/mmu: Zap only TDP MMU leafs in kvm_zap_gfn_range()" kvm: x86/mmu: Flush TLB before zap_gfn_range releases RCU KVM: arm64: fix typos in comments KVM: arm64: Generalise VM features into a set of flags KVM: s390: selftests: Add error memop tests KVM: s390: selftests: Add more copy memop tests KVM: s390: selftests: Add named stages for memop test KVM: s390: selftests: Add macro as abstraction for MEM_OP KVM: s390: selftests: Split memop tests KVM: s390x: fix SCK locking RISC-V: KVM: Implement SBI HSM suspend call RISC-V: KVM: Add common kvm_riscv_vcpu_wfi() function RISC-V: Add SBI HSM suspend related defines RISC-V: KVM: Implement SBI v0.3 SRST extension ...
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/Kconfig7
-rw-r--r--arch/x86/kvm/cpuid.c59
-rw-r--r--arch/x86/kvm/emulate.c71
-rw-r--r--arch/x86/kvm/hyperv.c242
-rw-r--r--arch/x86/kvm/hyperv.h6
-rw-r--r--arch/x86/kvm/i8259.c8
-rw-r--r--arch/x86/kvm/ioapic.c6
-rw-r--r--arch/x86/kvm/kvm_onhyperv.c14
-rw-r--r--arch/x86/kvm/kvm_onhyperv.h14
-rw-r--r--arch/x86/kvm/lapic.c227
-rw-r--r--arch/x86/kvm/lapic.h17
-rw-r--r--arch/x86/kvm/mmu.h44
-rw-r--r--arch/x86/kvm/mmu/mmu.c502
-rw-r--r--arch/x86/kvm/mmu/mmu_audit.c303
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h15
-rw-r--r--arch/x86/kvm/mmu/mmutrace.h23
-rw-r--r--arch/x86/kvm/mmu/page_track.c7
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h4
-rw-r--r--arch/x86/kvm/mmu/spte.c72
-rw-r--r--arch/x86/kvm/mmu/spte.h129
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c14
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h25
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c986
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h31
-rw-r--r--arch/x86/kvm/svm/avic.c181
-rw-r--r--arch/x86/kvm/svm/hyperv.h35
-rw-r--r--arch/x86/kvm/svm/nested.c51
-rw-r--r--arch/x86/kvm/svm/pmu.c2
-rw-r--r--arch/x86/kvm/svm/sev.c121
-rw-r--r--arch/x86/kvm/svm/svm.c134
-rw-r--r--arch/x86/kvm/svm/svm.h69
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.h25
-rw-r--r--arch/x86/kvm/trace.h20
-rw-r--r--arch/x86/kvm/vmx/nested.c21
-rw-r--r--arch/x86/kvm/vmx/nested.h3
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c5
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c6
-rw-r--r--arch/x86/kvm/vmx/posted_intr.h4
-rw-r--r--arch/x86/kvm/vmx/vmx.c116
-rw-r--r--arch/x86/kvm/x86.c285
-rw-r--r--arch/x86/kvm/x86.h2
-rw-r--r--arch/x86/kvm/xen.c4
42 files changed, 2234 insertions, 1676 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 2b1548da00eb..e3cbd7706136 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -126,13 +126,6 @@ config KVM_XEN
If in doubt, say "N".
-config KVM_MMU_AUDIT
- bool "Audit KVM MMU"
- depends on KVM && TRACEPOINTS
- help
- This option adds a R/W kVM module parameter 'mmu_audit', which allows
- auditing of KVM MMU events at runtime.
-
config KVM_EXTERNAL_WRITE_TRACKING
bool
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 8bf541b24b4b..a00cd97b2623 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -715,9 +715,30 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
entry = &array->entries[array->nent++];
+ memset(entry, 0, sizeof(*entry));
entry->function = function;
entry->index = index;
- entry->flags = 0;
+ switch (function & 0xC0000000) {
+ case 0x40000000:
+ /* Hypervisor leaves are always synthesized by __do_cpuid_func. */
+ return entry;
+
+ case 0x80000000:
+ /*
+ * 0x80000021 is sometimes synthesized by __do_cpuid_func, which
+ * would result in out-of-bounds calls to do_host_cpuid.
+ */
+ {
+ static int max_cpuid_80000000;
+ if (!READ_ONCE(max_cpuid_80000000))
+ WRITE_ONCE(max_cpuid_80000000, cpuid_eax(0x80000000));
+ if (function > READ_ONCE(max_cpuid_80000000))
+ return entry;
+ }
+
+ default:
+ break;
+ }
cpuid_count(entry->function, entry->index,
&entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
@@ -1061,7 +1082,15 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->edx = 0;
break;
case 0x80000000:
- entry->eax = min(entry->eax, 0x8000001f);
+ entry->eax = min(entry->eax, 0x80000021);
+ /*
+ * Serializing LFENCE is reported in a multitude of ways,
+ * and NullSegClearsBase is not reported in CPUID on Zen2;
+ * help userspace by providing the CPUID leaf ourselves.
+ */
+ if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)
+ || !static_cpu_has_bug(X86_BUG_NULL_SEG))
+ entry->eax = max(entry->eax, 0x80000021);
break;
case 0x80000001:
cpuid_entry_override(entry, CPUID_8000_0001_EDX);
@@ -1132,6 +1161,27 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->ebx &= ~GENMASK(11, 6);
}
break;
+ case 0x80000020:
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ case 0x80000021:
+ entry->ebx = entry->ecx = entry->edx = 0;
+ /*
+ * Pass down these bits:
+ * EAX 0 NNDBP, Processor ignores nested data breakpoints
+ * EAX 2 LAS, LFENCE always serializing
+ * EAX 6 NSCB, Null selector clear base
+ *
+ * Other defined bits are for MSRs that KVM does not expose:
+ * EAX 3 SPCL, SMM page configuration lock
+ * EAX 13 PCMSR, Prefetch control MSR
+ */
+ entry->eax &= BIT(0) | BIT(2) | BIT(6);
+ if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC))
+ entry->eax |= BIT(2);
+ if (!static_cpu_has_bug(X86_BUG_NULL_SEG))
+ entry->eax |= BIT(6);
+ break;
/*Add support for Centaur's CPUID instruction*/
case 0xC0000000:
/*Just support up to 0xC0000004 now*/
@@ -1241,8 +1291,7 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
if (sanity_check_entries(entries, cpuid->nent, type))
return -EINVAL;
- array.entries = vzalloc(array_size(sizeof(struct kvm_cpuid_entry2),
- cpuid->nent));
+ array.entries = kvcalloc(sizeof(struct kvm_cpuid_entry2), cpuid->nent, GFP_KERNEL);
if (!array.entries)
return -ENOMEM;
@@ -1260,7 +1309,7 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
r = -EFAULT;
out_free:
- vfree(array.entries);
+ kvfree(array.entries);
return r;
}
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 50ca1db7247c..7ba4ec77feeb 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1623,11 +1623,6 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
goto exception;
}
- if (!seg_desc.p) {
- err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
- goto exception;
- }
-
dpl = seg_desc.dpl;
switch (seg) {
@@ -1643,14 +1638,34 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
if (!(seg_desc.type & 8))
goto exception;
- if (seg_desc.type & 4) {
- /* conforming */
- if (dpl > cpl)
- goto exception;
- } else {
- /* nonconforming */
- if (rpl > cpl || dpl != cpl)
+ if (transfer == X86_TRANSFER_RET) {
+ /* RET can never return to an inner privilege level. */
+ if (rpl < cpl)
goto exception;
+ /* Outer-privilege level return is not implemented */
+ if (rpl > cpl)
+ return X86EMUL_UNHANDLEABLE;
+ }
+ if (transfer == X86_TRANSFER_RET || transfer == X86_TRANSFER_TASK_SWITCH) {
+ if (seg_desc.type & 4) {
+ /* conforming */
+ if (dpl > rpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (dpl != rpl)
+ goto exception;
+ }
+ } else { /* X86_TRANSFER_CALL_JMP */
+ if (seg_desc.type & 4) {
+ /* conforming */
+ if (dpl > cpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (rpl > cpl || dpl != cpl)
+ goto exception;
+ }
}
/* in long-mode d/b must be clear if l is set */
if (seg_desc.d && seg_desc.l) {
@@ -1667,6 +1682,10 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
case VCPU_SREG_TR:
if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
goto exception;
+ if (!seg_desc.p) {
+ err_vec = NP_VECTOR;
+ goto exception;
+ }
old_desc = seg_desc;
seg_desc.type |= 2; /* busy */
ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
@@ -1691,6 +1710,11 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
break;
}
+ if (!seg_desc.p) {
+ err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+ goto exception;
+ }
+
if (seg_desc.s) {
/* mark segment as accessed */
if (!(seg_desc.type & 1)) {
@@ -2216,9 +2240,6 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
- /* Outer-privilege level return is not implemented */
- if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl)
- return X86EMUL_UNHANDLEABLE;
rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl,
X86_TRANSFER_RET,
&new_desc);
@@ -2615,8 +2636,7 @@ emulate_shutdown:
}
static void
-setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
- struct desc_struct *cs, struct desc_struct *ss)
+setup_syscalls_segments(struct desc_struct *cs, struct desc_struct *ss)
{
cs->l = 0; /* will be adjusted later */
set_desc_base(cs, 0); /* flat segment */
@@ -2705,7 +2725,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
if (!(efer & EFER_SCE))
return emulate_ud(ctxt);
- setup_syscalls_segments(ctxt, &cs, &ss);
+ setup_syscalls_segments(&cs, &ss);
ops->get_msr(ctxt, MSR_STAR, &msr_data);
msr_data >>= 32;
cs_sel = (u16)(msr_data & 0xfffc);
@@ -2773,7 +2793,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
if ((msr_data & 0xfffc) == 0x0)
return emulate_gp(ctxt, 0);
- setup_syscalls_segments(ctxt, &cs, &ss);
+ setup_syscalls_segments(&cs, &ss);
ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK;
ss_sel = cs_sel + 8;
@@ -2810,7 +2830,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
ctxt->mode == X86EMUL_MODE_VM86)
return emulate_gp(ctxt, 0);
- setup_syscalls_segments(ctxt, &cs, &ss);
+ setup_syscalls_segments(&cs, &ss);
if ((ctxt->rex_prefix & 0x8) != 0x0)
usermode = X86EMUL_MODE_PROT64;
@@ -3028,8 +3048,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
}
-static int task_switch_16(struct x86_emulate_ctxt *ctxt,
- u16 tss_selector, u16 old_tss_sel,
+static int task_switch_16(struct x86_emulate_ctxt *ctxt, u16 old_tss_sel,
ulong old_tss_base, struct desc_struct *new_desc)
{
struct tss_segment_16 tss_seg;
@@ -3167,8 +3186,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
return ret;
}
-static int task_switch_32(struct x86_emulate_ctxt *ctxt,
- u16 tss_selector, u16 old_tss_sel,
+static int task_switch_32(struct x86_emulate_ctxt *ctxt, u16 old_tss_sel,
ulong old_tss_base, struct desc_struct *new_desc)
{
struct tss_segment_32 tss_seg;
@@ -3276,10 +3294,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
old_tss_sel = 0xffff;
if (next_tss_desc.type & 8)
- ret = task_switch_32(ctxt, tss_selector, old_tss_sel,
- old_tss_base, &next_tss_desc);
+ ret = task_switch_32(ctxt, old_tss_sel, old_tss_base, &next_tss_desc);
else
- ret = task_switch_16(ctxt, tss_selector, old_tss_sel,
+ ret = task_switch_16(ctxt, old_tss_sel,
old_tss_base, &next_tss_desc);
if (ret != X86EMUL_CONTINUE)
return ret;
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 6e38a7d22e97..a32f54ab84a2 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -112,6 +112,9 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
if (!!auto_eoi_old == !!auto_eoi_new)
return;
+ if (!enable_apicv)
+ return;
+
down_write(&vcpu->kvm->arch.apicv_update_lock);
if (auto_eoi_new)
@@ -1710,32 +1713,47 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
return kvm_hv_get_msr(vcpu, msr, pdata, host);
}
-static __always_inline unsigned long *sparse_set_to_vcpu_mask(
- struct kvm *kvm, u64 *sparse_banks, u64 valid_bank_mask,
- u64 *vp_bitmap, unsigned long *vcpu_bitmap)
+static void sparse_set_to_vcpu_mask(struct kvm *kvm, u64 *sparse_banks,
+ u64 valid_bank_mask, unsigned long *vcpu_mask)
{
struct kvm_hv *hv = to_kvm_hv(kvm);
+ bool has_mismatch = atomic_read(&hv->num_mismatched_vp_indexes);
+ u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
struct kvm_vcpu *vcpu;
int bank, sbank = 0;
unsigned long i;
+ u64 *bitmap;
+
+ BUILD_BUG_ON(sizeof(vp_bitmap) >
+ sizeof(*vcpu_mask) * BITS_TO_LONGS(KVM_MAX_VCPUS));
+
+ /*
+ * If vp_index == vcpu_idx for all vCPUs, fill vcpu_mask directly, else
+ * fill a temporary buffer and manually test each vCPU's VP index.
+ */
+ if (likely(!has_mismatch))
+ bitmap = (u64 *)vcpu_mask;
+ else
+ bitmap = vp_bitmap;
- memset(vp_bitmap, 0,
- KVM_HV_MAX_SPARSE_VCPU_SET_BITS * sizeof(*vp_bitmap));
+ /*
+ * Each set of 64 VPs is packed into sparse_banks, with valid_bank_mask
+ * having a '1' for each bank that exists in sparse_banks. Sets must
+ * be in ascending order, i.e. bank0..bankN.
+ */
+ memset(bitmap, 0, sizeof(vp_bitmap));
for_each_set_bit(bank, (unsigned long *)&valid_bank_mask,
KVM_HV_MAX_SPARSE_VCPU_SET_BITS)
- vp_bitmap[bank] = sparse_banks[sbank++];
+ bitmap[bank] = sparse_banks[sbank++];
- if (likely(!atomic_read(&hv->num_mismatched_vp_indexes))) {
- /* for all vcpus vp_index == vcpu_idx */
- return (unsigned long *)vp_bitmap;
- }
+ if (likely(!has_mismatch))
+ return;
- bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
+ bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);
kvm_for_each_vcpu(i, vcpu, kvm) {
if (test_bit(kvm_hv_get_vpindex(vcpu), (unsigned long *)vp_bitmap))
- __set_bit(i, vcpu_bitmap);
+ __set_bit(i, vcpu_mask);
}
- return vcpu_bitmap;
}
struct kvm_hv_hcall {
@@ -1743,6 +1761,7 @@ struct kvm_hv_hcall {
u64 ingpa;
u64 outgpa;
u16 code;
+ u16 var_cnt;
u16 rep_cnt;
u16 rep_idx;
bool fast;
@@ -1750,22 +1769,60 @@ struct kvm_hv_hcall {
sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS];
};
-static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool ex)
+static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc,
+ int consumed_xmm_halves,
+ u64 *sparse_banks, gpa_t offset)
{
+ u16 var_cnt;
int i;
- gpa_t gpa;
+
+ if (hc->var_cnt > 64)
+ return -EINVAL;
+
+ /* Ignore banks that cannot possibly contain a legal VP index. */
+ var_cnt = min_t(u16, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS);
+
+ if (hc->fast) {
+ /*
+ * Each XMM holds two sparse banks, but do not count halves that
+ * have already been consumed for hypercall parameters.
+ */
+ if (hc->var_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - consumed_xmm_halves)
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ for (i = 0; i < var_cnt; i++) {
+ int j = i + consumed_xmm_halves;
+ if (j % 2)
+ sparse_banks[i] = sse128_hi(hc->xmm[j / 2]);
+ else
+ sparse_banks[i] = sse128_lo(hc->xmm[j / 2]);
+ }
+ return 0;
+ }
+
+ return kvm_read_guest(kvm, hc->ingpa + offset, sparse_banks,
+ var_cnt * sizeof(*sparse_banks));
+}
+
+static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
+{
struct kvm *kvm = vcpu->kvm;
struct hv_tlb_flush_ex flush_ex;
struct hv_tlb_flush flush;
- u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
- DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
- unsigned long *vcpu_mask;
+ DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
u64 valid_bank_mask;
- u64 sparse_banks[64];
- int sparse_banks_len;
+ u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
bool all_cpus;
- if (!ex) {
+ /*
+ * The Hyper-V TLFS doesn't allow more than 64 sparse banks, e.g. the
+ * valid mask is a u64. Fail the build if KVM's max allowed number of
+ * vCPUs (>4096) would exceed this limit, KVM will additional changes
+ * for Hyper-V support to avoid setting the guest up to fail.
+ */
+ BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > 64);
+
+ if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST ||
+ hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE) {
if (hc->fast) {
flush.address_space = hc->ingpa;
flush.flags = hc->outgpa;
@@ -1812,30 +1869,22 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
all_cpus = flush_ex.hv_vp_set.format !=
HV_GENERIC_SET_SPARSE_4K;
- sparse_banks_len = bitmap_weight((unsigned long *)&valid_bank_mask, 64);
+ if (hc->var_cnt != bitmap_weight((unsigned long *)&valid_bank_mask, 64))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
- if (!sparse_banks_len && !all_cpus)
+ if (all_cpus)
+ goto do_flush;
+
+ if (!hc->var_cnt)
goto ret_success;
- if (!all_cpus) {
- if (hc->fast) {
- if (sparse_banks_len > HV_HYPERCALL_MAX_XMM_REGISTERS - 1)
- return HV_STATUS_INVALID_HYPERCALL_INPUT;
- for (i = 0; i < sparse_banks_len; i += 2) {
- sparse_banks[i] = sse128_lo(hc->xmm[i / 2 + 1]);
- sparse_banks[i + 1] = sse128_hi(hc->xmm[i / 2 + 1]);
- }
- } else {
- gpa = hc->ingpa + offsetof(struct hv_tlb_flush_ex,
- hv_vp_set.bank_contents);
- if (unlikely(kvm_read_guest(kvm, gpa, sparse_banks,
- sparse_banks_len *
- sizeof(sparse_banks[0]))))
- return HV_STATUS_INVALID_HYPERCALL_INPUT;
- }
- }
+ if (kvm_get_sparse_vp_set(kvm, hc, 2, sparse_banks,
+ offsetof(struct hv_tlb_flush_ex,
+ hv_vp_set.bank_contents)))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
}
+do_flush:
/*
* vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
* analyze it here, flush TLB regardless of the specified address space.
@@ -1843,11 +1892,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
if (all_cpus) {
kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST);
} else {
- vcpu_mask = sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask,
- vp_bitmap, vcpu_bitmap);
+ sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);
- kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST,
- vcpu_mask);
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, vcpu_mask);
}
ret_success:
@@ -1875,21 +1922,18 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
}
}
-static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool ex)
+static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
{
struct kvm *kvm = vcpu->kvm;
struct hv_send_ipi_ex send_ipi_ex;
struct hv_send_ipi send_ipi;
- u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
- DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
- unsigned long *vcpu_mask;
+ DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
unsigned long valid_bank_mask;
- u64 sparse_banks[64];
- int sparse_banks_len;
+ u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
u32 vector;
bool all_cpus;
- if (!ex) {
+ if (hc->code == HVCALL_SEND_IPI) {
if (!hc->fast) {
if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi,
sizeof(send_ipi))))
@@ -1908,9 +1952,15 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
trace_kvm_hv_send_ipi(vector, sparse_banks[0]);
} else {
- if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi_ex,
- sizeof(send_ipi_ex))))
- return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ if (!hc->fast) {
+ if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi_ex,
+ sizeof(send_ipi_ex))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ } else {
+ send_ipi_ex.vector = (u32)hc->ingpa;
+ send_ipi_ex.vp_set.format = hc->outgpa;
+ send_ipi_ex.vp_set.valid_bank_mask = sse128_lo(hc->xmm[0]);
+ }
trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector,
send_ipi_ex.vp_set.format,
@@ -1918,22 +1968,20 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
vector = send_ipi_ex.vector;
valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask;
- sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) *
- sizeof(sparse_banks[0]);
-
all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
+ if (hc->var_cnt != bitmap_weight(&valid_bank_mask, 64))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
if (all_cpus)
goto check_and_send_ipi;
- if (!sparse_banks_len)
+ if (!hc->var_cnt)
goto ret_success;
- if (kvm_read_guest(kvm,
- hc->ingpa + offsetof(struct hv_send_ipi_ex,
- vp_set.bank_contents),
- sparse_banks,
- sparse_banks_len))
+ if (kvm_get_sparse_vp_set(kvm, hc, 1, sparse_banks,
+ offsetof(struct hv_send_ipi_ex,
+ vp_set.bank_contents)))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
}
@@ -1941,11 +1989,13 @@ check_and_send_ipi:
if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
- vcpu_mask = all_cpus ? NULL :
- sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask,
- vp_bitmap, vcpu_bitmap);
+ if (all_cpus) {
+ kvm_send_ipi_to_many(kvm, vector, NULL);
+ } else {
+ sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);
- kvm_send_ipi_to_many(kvm, vector, vcpu_mask);
+ kvm_send_ipi_to_many(kvm, vector, vcpu_mask);
+ }
ret_success:
return HV_STATUS_SUCCESS;
@@ -2017,11 +2067,6 @@ int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce)
return ret;
}
-bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id;
-}
-
static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
{
bool longmode;
@@ -2096,6 +2141,7 @@ static bool is_xmm_fast_hypercall(struct kvm_hv_hcall *hc)
case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
+ case HVCALL_SEND_IPI_EX:
return true;
}
@@ -2191,19 +2237,25 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
}
hc.code = hc.param & 0xffff;
+ hc.var_cnt = (hc.param & HV_HYPERCALL_VARHEAD_MASK) >> HV_HYPERCALL_VARHEAD_OFFSET;
hc.fast = !!(hc.param & HV_HYPERCALL_FAST_BIT);
hc.rep_cnt = (hc.param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff;
hc.rep_idx = (hc.param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff;
hc.rep = !!(hc.rep_cnt || hc.rep_idx);
- trace_kvm_hv_hypercall(hc.code, hc.fast, hc.rep_cnt, hc.rep_idx,
- hc.ingpa, hc.outgpa);
+ trace_kvm_hv_hypercall(hc.code, hc.fast, hc.var_cnt, hc.rep_cnt,
+ hc.rep_idx, hc.ingpa, hc.outgpa);
if (unlikely(!hv_check_hypercall_access(hv_vcpu, hc.code))) {
ret = HV_STATUS_ACCESS_DENIED;
goto hypercall_complete;
}
+ if (unlikely(hc.param & HV_HYPERCALL_RSVD_MASK)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ goto hypercall_complete;
+ }
+
if (hc.fast && is_xmm_fast_hypercall(&hc)) {
if (unlikely(hv_vcpu->enforce_cpuid &&
!(hv_vcpu->cpuid_cache.features_edx &
@@ -2217,14 +2269,14 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
switch (hc.code) {
case HVCALL_NOTIFY_LONG_SPIN_WAIT:
- if (unlikely(hc.rep)) {
+ if (unlikely(hc.rep || hc.var_cnt)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
kvm_vcpu_on_spin(vcpu, true);
break;
case HVCALL_SIGNAL_EVENT:
- if (unlikely(hc.rep)) {
+ if (unlikely(hc.rep || hc.var_cnt)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
@@ -2234,7 +2286,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
fallthrough; /* maybe userspace knows this conn_id */
case HVCALL_POST_MESSAGE:
/* don't bother userspace if it has no way to handle it */
- if (unlikely(hc.rep || !to_hv_synic(vcpu)->active)) {
+ if (unlikely(hc.rep || hc.var_cnt || !to_hv_synic(vcpu)->active)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
@@ -2247,46 +2299,43 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
kvm_hv_hypercall_complete_userspace;
return 0;
case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
- if (unlikely(!hc.rep_cnt || hc.rep_idx)) {
+ if (unlikely(hc.var_cnt)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_flush_tlb(vcpu, &hc, false);
- break;
- case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
- if (unlikely(hc.rep)) {
+ fallthrough;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
+ if (unlikely(!hc.rep_cnt || hc.rep_idx)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_flush_tlb(vcpu, &hc, false);
+ ret = kvm_hv_flush_tlb(vcpu, &hc);
break;
- case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
- if (unlikely(!hc.rep_cnt || hc.rep_idx)) {
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
+ if (unlikely(hc.var_cnt)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_flush_tlb(vcpu, &hc, true);
- break;
+ fallthrough;
case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
if (unlikely(hc.rep)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_flush_tlb(vcpu, &hc, true);
+ ret = kvm_hv_flush_tlb(vcpu, &hc);
break;
case HVCALL_SEND_IPI:
- if (unlikely(hc.rep)) {
+ if (unlikely(hc.var_cnt)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_send_ipi(vcpu, &hc, false);
- break;
+ fallthrough;
case HVCALL_SEND_IPI_EX:
- if (unlikely(hc.fast || hc.rep)) {
+ if (unlikely(hc.rep)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_send_ipi(vcpu, &hc, true);
+ ret = kvm_hv_send_ipi(vcpu, &hc);
break;
case HVCALL_POST_DEBUG_DATA:
case HVCALL_RETRIEVE_DEBUG_DATA:
@@ -2417,10 +2466,6 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
if (kvm_x86_ops.nested_ops->get_evmcs_version)
evmcs_ver = kvm_x86_ops.nested_ops->get_evmcs_version(vcpu);
- /* Skip NESTED_FEATURES if eVMCS is not supported */
- if (!evmcs_ver)
- --nent;
-
if (cpuid->nent < nent)
return -E2BIG;
@@ -2520,8 +2565,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
case HYPERV_CPUID_NESTED_FEATURES:
ent->eax = evmcs_ver;
- if (evmcs_ver)
- ent->eax |= HV_X64_NESTED_MSR_BITMAP;
+ ent->eax |= HV_X64_NESTED_MSR_BITMAP;
break;
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index ed1c4e546d04..e19c00ee9ab3 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -89,7 +89,11 @@ static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu)
int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host);
-bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu);
+static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id;
+}
+
int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
void kvm_hv_irq_routing_update(struct kvm *kvm);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 814064d06016..be99dc86293d 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -437,13 +437,13 @@ static u32 pic_ioport_read(void *opaque, u32 addr)
return ret;
}
-static void elcr_ioport_write(void *opaque, u32 addr, u32 val)
+static void elcr_ioport_write(void *opaque, u32 val)
{
struct kvm_kpic_state *s = opaque;
s->elcr = val & s->elcr_mask;
}
-static u32 elcr_ioport_read(void *opaque, u32 addr1)
+static u32 elcr_ioport_read(void *opaque)
{
struct kvm_kpic_state *s = opaque;
return s->elcr;
@@ -474,7 +474,7 @@ static int picdev_write(struct kvm_pic *s,
case 0x4d0:
case 0x4d1:
pic_lock(s);
- elcr_ioport_write(&s->pics[addr & 1], addr, data);
+ elcr_ioport_write(&s->pics[addr & 1], data);
pic_unlock(s);
break;
default:
@@ -505,7 +505,7 @@ static int picdev_read(struct kvm_pic *s,
case 0x4d0:
case 0x4d1:
pic_lock(s);
- *data = elcr_ioport_read(&s->pics[addr & 1], addr);
+ *data = elcr_ioport_read(&s->pics[addr & 1]);
pic_unlock(s);
break;
default:
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index decfa36b7891..765943d7cfa5 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -54,9 +54,7 @@ static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu,
int trigger_mode,
int pin);
-static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
- unsigned long addr,
- unsigned long length)
+static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic)
{
unsigned long result = 0;
@@ -593,7 +591,7 @@ static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
break;
case IOAPIC_REG_WINDOW:
- result = ioapic_read_indirect(ioapic, addr, len);
+ result = ioapic_read_indirect(ioapic);
break;
default:
diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c
index b469f45e3fe4..ee4f696a0782 100644
--- a/arch/x86/kvm/kvm_onhyperv.c
+++ b/arch/x86/kvm/kvm_onhyperv.c
@@ -92,3 +92,17 @@ int hv_remote_flush_tlb(struct kvm *kvm)
return hv_remote_flush_tlb_with_range(kvm, NULL);
}
EXPORT_SYMBOL_GPL(hv_remote_flush_tlb);
+
+void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
+{
+ struct kvm_arch *kvm_arch = &vcpu->kvm->arch;
+
+ if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) {
+ spin_lock(&kvm_arch->hv_root_tdp_lock);
+ vcpu->arch.hv_root_tdp = root_tdp;
+ if (root_tdp != kvm_arch->hv_root_tdp)
+ kvm_arch->hv_root_tdp = INVALID_PAGE;
+ spin_unlock(&kvm_arch->hv_root_tdp_lock);
+ }
+}
+EXPORT_SYMBOL_GPL(hv_track_root_tdp);
diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h
index 1c67abf2eba9..287e98ef9df3 100644
--- a/arch/x86/kvm/kvm_onhyperv.h
+++ b/arch/x86/kvm/kvm_onhyperv.h
@@ -10,19 +10,7 @@
int hv_remote_flush_tlb_with_range(struct kvm *kvm,
struct kvm_tlb_range *range);
int hv_remote_flush_tlb(struct kvm *kvm);
-
-static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
-{
- struct kvm_arch *kvm_arch = &vcpu->kvm->arch;
-
- if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) {
- spin_lock(&kvm_arch->hv_root_tdp_lock);
- vcpu->arch.hv_root_tdp = root_tdp;
- if (root_tdp != kvm_arch->hv_root_tdp)
- kvm_arch->hv_root_tdp = INVALID_PAGE;
- spin_unlock(&kvm_arch->hv_root_tdp_lock);
- }
-}
+void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp);
#else /* !CONFIG_HYPERV */
static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
{
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9322e6340a74..80a2020c4db4 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -68,6 +68,39 @@ static bool lapic_timer_advance_dynamic __read_mostly;
/* step-by-step approximation to mitigate fluctuation */
#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
+static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val)
+{
+ *((u32 *) (regs + reg_off)) = val;
+}
+
+static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
+{
+ __kvm_lapic_set_reg(apic->regs, reg_off, val);
+}
+
+static __always_inline u64 __kvm_lapic_get_reg64(char *regs, int reg)
+{
+ BUILD_BUG_ON(reg != APIC_ICR);
+ return *((u64 *) (regs + reg));
+}
+
+static __always_inline u64 kvm_lapic_get_reg64(struct kvm_lapic *apic, int reg)
+{
+ return __kvm_lapic_get_reg64(apic->regs, reg);
+}
+
+static __always_inline void __kvm_lapic_set_reg64(char *regs, int reg, u64 val)
+{
+ BUILD_BUG_ON(reg != APIC_ICR);
+ *((u64 *) (regs + reg)) = val;
+}
+
+static __always_inline void kvm_lapic_set_reg64(struct kvm_lapic *apic,
+ int reg, u64 val)
+{
+ __kvm_lapic_set_reg64(apic->regs, reg, val);
+}
+
static inline int apic_test_vector(int vec, void *bitmap)
{
return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -113,7 +146,8 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
{
- return pi_inject_timer && kvm_vcpu_apicv_active(vcpu);
+ return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
+ (kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm));
}
bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
@@ -491,8 +525,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
if (unlikely(vcpu->arch.apicv_active)) {
/* need to update RVI */
kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
- static_call(kvm_x86_hwapic_irr_update)(vcpu,
- apic_find_highest_irr(apic));
+ static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
} else {
apic->irr_pending = false;
kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
@@ -522,7 +555,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
* just set SVI.
*/
if (unlikely(vcpu->arch.apicv_active))
- static_call(kvm_x86_hwapic_isr_update)(vcpu, vec);
+ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, vec);
else {
++apic->isr_count;
BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
@@ -570,8 +603,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
* and must be left alone.
*/
if (unlikely(vcpu->arch.apicv_active))
- static_call(kvm_x86_hwapic_isr_update)(vcpu,
- apic_find_highest_isr(apic));
+ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
else {
--apic->isr_count;
BUG_ON(apic->isr_count < 0);
@@ -1276,6 +1308,9 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high)
{
struct kvm_lapic_irq irq;
+ /* KVM has no delay and should always clear the BUSY/PENDING flag. */
+ WARN_ON_ONCE(icr_low & APIC_ICR_BUSY);
+
irq.vector = icr_low & APIC_VECTOR_MASK;
irq.delivery_mode = icr_low & APIC_MODE_MASK;
irq.dest_mode = icr_low & APIC_DEST_MASK;
@@ -1292,6 +1327,7 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high)
kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL);
}
+EXPORT_SYMBOL_GPL(kvm_apic_send_ipi);
static u32 apic_get_tmcct(struct kvm_lapic *apic)
{
@@ -1375,8 +1411,8 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
#define APIC_REGS_MASK(first, count) \
(APIC_REG_MASK(first) * ((1ull << (count)) - 1))
-int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
- void *data)
+static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
+ void *data)
{
unsigned char alignment = offset & 0xf;
u32 result;
@@ -1394,7 +1430,6 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
APIC_REGS_MASK(APIC_IRR, APIC_ISR_NR) |
APIC_REG_MASK(APIC_ESR) |
APIC_REG_MASK(APIC_ICR) |
- APIC_REG_MASK(APIC_ICR2) |
APIC_REG_MASK(APIC_LVTT) |
APIC_REG_MASK(APIC_LVTTHMR) |
APIC_REG_MASK(APIC_LVTPC) |
@@ -1405,9 +1440,16 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
APIC_REG_MASK(APIC_TMCCT) |
APIC_REG_MASK(APIC_TDCR);
- /* ARBPRI is not valid on x2APIC */
+ /*
+ * ARBPRI and ICR2 are not valid in x2APIC mode. WARN if KVM reads ICR
+ * in x2APIC mode as it's an 8-byte register in x2APIC and needs to be
+ * manually handled by the caller.
+ */
if (!apic_x2apic_mode(apic))
- valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI);
+ valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI) |
+ APIC_REG_MASK(APIC_ICR2);
+ else
+ WARN_ON_ONCE(offset == APIC_ICR);
if (alignment + len > 4)
return 1;
@@ -1432,7 +1474,6 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
}
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_lapic_reg_read);
static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
{
@@ -1993,7 +2034,7 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
}
}
-int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
+static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
{
int ret = 0;
@@ -2052,16 +2093,18 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
break;
}
case APIC_ICR:
+ WARN_ON_ONCE(apic_x2apic_mode(apic));
+
/* No delay here, so we always clear the pending bit */
- val &= ~(1 << 12);
+ val &= ~APIC_ICR_BUSY;
kvm_apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2));
kvm_lapic_set_reg(apic, APIC_ICR, val);
break;
-
case APIC_ICR2:
- if (!apic_x2apic_mode(apic))
- val &= 0xff000000;
- kvm_lapic_set_reg(apic, APIC_ICR2, val);
+ if (apic_x2apic_mode(apic))
+ ret = 1;
+ else
+ kvm_lapic_set_reg(apic, APIC_ICR2, val & 0xff000000);
break;
case APIC_LVT0:
@@ -2121,10 +2164,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
break;
case APIC_SELF_IPI:
- if (apic_x2apic_mode(apic)) {
- kvm_lapic_reg_write(apic, APIC_ICR,
- APIC_DEST_SELF | (val & APIC_VECTOR_MASK));
- } else
+ if (apic_x2apic_mode(apic))
+ kvm_apic_send_ipi(apic, APIC_DEST_SELF | (val & APIC_VECTOR_MASK), 0);
+ else
ret = 1;
break;
default:
@@ -2132,11 +2174,15 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
break;
}
+ /*
+ * Recalculate APIC maps if necessary, e.g. if the software enable bit
+ * was toggled, the APIC ID changed, etc... The maps are marked dirty
+ * on relevant changes, i.e. this is a nop for most writes.
+ */
kvm_recalculate_apic_map(apic->vcpu->kvm);
return ret;
}
-EXPORT_SYMBOL_GPL(kvm_lapic_reg_write);
static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
gpa_t address, int len, const void *data)
@@ -2180,12 +2226,7 @@ EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
/* emulate APIC access in a trap manner */
void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
{
- u32 val = 0;
-
- /* hw has done the conditional check and inst decode */
- offset &= 0xff0;
-
- kvm_lapic_reg_read(vcpu->arch.apic, offset, 4, &val);
+ u32 val = kvm_lapic_get_reg(vcpu->arch.apic, offset);
/* TODO: optimize to just emulate side effect w/o one more write */
kvm_lapic_reg_write(vcpu->arch.apic, offset, val);
@@ -2242,10 +2283,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
- | (kvm_lapic_get_reg(apic, APIC_TASKPRI) & 4));
+ apic_set_tpr(vcpu->arch.apic, (cr8 & 0x0f) << 4);
}
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
@@ -2287,7 +2325,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE))
- static_call(kvm_x86_set_virtual_apic_mode)(vcpu);
+ static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu);
apic->base_address = apic->vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
@@ -2356,8 +2394,12 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
if (!apic_x2apic_mode(apic))
kvm_apic_set_ldr(apic, 0);
kvm_lapic_set_reg(apic, APIC_ESR, 0);
- kvm_lapic_set_reg(apic, APIC_ICR, 0);
- kvm_lapic_set_reg(apic, APIC_ICR2, 0);
+ if (!apic_x2apic_mode(apic)) {
+ kvm_lapic_set_reg(apic, APIC_ICR, 0);
+ kvm_lapic_set_reg(apic, APIC_ICR2, 0);
+ } else {
+ kvm_lapic_set_reg64(apic, APIC_ICR, 0);
+ }
kvm_lapic_set_reg(apic, APIC_TDCR, 0);
kvm_lapic_set_reg(apic, APIC_TMICT, 0);
for (i = 0; i < 8; i++) {
@@ -2373,9 +2415,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.pv_eoi.msr_val = 0;
apic_update_ppr(apic);
if (vcpu->arch.apicv_active) {
- static_call(kvm_x86_apicv_post_state_restore)(vcpu);
- static_call(kvm_x86_hwapic_irr_update)(vcpu, -1);
- static_call(kvm_x86_hwapic_isr_update)(vcpu, -1);
+ static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
+ static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1);
+ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, -1);
}
vcpu->arch.apic_arb_prio = 0;
@@ -2574,6 +2616,7 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
if (apic_x2apic_mode(vcpu->arch.apic)) {
u32 *id = (u32 *)(s->regs + APIC_ID);
u32 *ldr = (u32 *)(s->regs + APIC_LDR);
+ u64 icr;
if (vcpu->kvm->arch.x2apic_format) {
if (*id != vcpu->vcpu_id)
@@ -2585,9 +2628,21 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
*id <<= 24;
}
- /* In x2APIC mode, the LDR is fixed and based on the id */
- if (set)
+ /*
+ * In x2APIC mode, the LDR is fixed and based on the id. And
+ * ICR is internally a single 64-bit register, but needs to be
+ * split to ICR+ICR2 in userspace for backwards compatibility.
+ */
+ if (set) {
*ldr = kvm_apic_calc_x2apic_ldr(*id);
+
+ icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
+ (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
+ __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
+ } else {
+ icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
+ __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
+ }
}
return 0;
@@ -2638,11 +2693,9 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
kvm_apic_update_apicv(vcpu);
apic->highest_isr_cache = -1;
if (vcpu->arch.apicv_active) {
- static_call(kvm_x86_apicv_post_state_restore)(vcpu);
- static_call(kvm_x86_hwapic_irr_update)(vcpu,
- apic_find_highest_irr(apic));
- static_call(kvm_x86_hwapic_isr_update)(vcpu,
- apic_find_highest_isr(apic));
+ static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
+ static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
+ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
}
kvm_make_request(KVM_REQ_EVENT, vcpu);
if (ioapic_in_kernel(vcpu->kvm))
@@ -2779,73 +2832,85 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
return 0;
}
-int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
- u32 reg = (msr - APIC_BASE_MSR) << 4;
+ data &= ~APIC_ICR_BUSY;
- if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
- return 1;
+ kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
+ kvm_lapic_set_reg64(apic, APIC_ICR, data);
+ trace_kvm_apic_write(APIC_ICR, data);
+ return 0;
+}
+
+static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data)
+{
+ u32 low;
+
+ if (reg == APIC_ICR) {
+ *data = kvm_lapic_get_reg64(apic, APIC_ICR);
+ return 0;
+ }
- if (reg == APIC_ICR2)
+ if (kvm_lapic_reg_read(apic, reg, 4, &low))
return 1;
- /* if this is ICR write vector before command */
+ *data = low;
+
+ return 0;
+}
+
+static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data)
+{
+ /*
+ * ICR is a 64-bit register in x2APIC mode (and Hyper'v PV vAPIC) and
+ * can be written as such, all other registers remain accessible only
+ * through 32-bit reads/writes.
+ */
if (reg == APIC_ICR)
- kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
+ return kvm_x2apic_icr_write(apic, data);
+
return kvm_lapic_reg_write(apic, reg, (u32)data);
}
-int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
+ u32 reg = (msr - APIC_BASE_MSR) << 4;
if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
return 1;
- if (reg == APIC_DFR || reg == APIC_ICR2)
- return 1;
+ return kvm_lapic_msr_write(apic, reg, data);
+}
- if (kvm_lapic_reg_read(apic, reg, 4, &low))
+int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = (msr - APIC_BASE_MSR) << 4;
+
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
return 1;
- if (reg == APIC_ICR)
- kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high);
- *data = (((u64)high) << 32) | low;
+ if (reg == APIC_DFR)
+ return 1;
- return 0;
+ return kvm_lapic_msr_read(apic, reg, data);
}
int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
if (!lapic_in_kernel(vcpu))
return 1;
- /* if this is ICR write vector before command */
- if (reg == APIC_ICR)
- kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
- return kvm_lapic_reg_write(apic, reg, (u32)data);
+ return kvm_lapic_msr_write(vcpu->arch.apic, reg, data);
}
int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
- u32 low, high = 0;
-
if (!lapic_in_kernel(vcpu))
return 1;
- if (kvm_lapic_reg_read(apic, reg, 4, &low))
- return 1;
- if (reg == APIC_ICR)
- kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high);
-
- *data = (((u64)high) << 32) | low;
-
- return 0;
+ return kvm_lapic_msr_read(vcpu->arch.apic, reg, data);
}
int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len)
@@ -2933,7 +2998,7 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
/* evaluate pending_events before reading the vector */
smp_rmb();
sipi_vector = apic->sipi_vector;
- kvm_x86_ops.vcpu_deliver_sipi_vector(vcpu, sipi_vector);
+ static_call(kvm_x86_vcpu_deliver_sipi_vector)(vcpu, sipi_vector);
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 2b44e533fc8d..4e4f8a22754f 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -85,9 +85,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
void kvm_recalculate_apic_map(struct kvm *kvm);
void kvm_apic_set_version(struct kvm_vcpu *vcpu);
-int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val);
-int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
- void *data);
bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int shorthand, unsigned int dest, int dest_mode);
int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
@@ -121,6 +118,7 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
+int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data);
int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
@@ -153,19 +151,14 @@ static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic)
apic->irr_pending = true;
}
-static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off)
+static inline u32 __kvm_lapic_get_reg(char *regs, int reg_off)
{
- return *((u32 *) (apic->regs + reg_off));
+ return *((u32 *) (regs + reg_off));
}
-static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val)
-{
- *((u32 *) (regs + reg_off)) = val;
-}
-
-static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
+static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off)
{
- __kvm_lapic_set_reg(apic->regs, reg_off, val);
+ return __kvm_lapic_get_reg(apic->regs, reg_off);
}
DECLARE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e9fbb2c8bbe2..bf8dbc4bb12a 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -48,6 +48,7 @@
X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE)
#define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP)
+#define KVM_MMU_EFER_ROLE_BITS (EFER_LME | EFER_NX)
static __always_inline u64 rsvd_bits(int s, int e)
{
@@ -79,12 +80,13 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu);
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
- if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE))
+ if (likely(vcpu->arch.mmu->root.hpa != INVALID_PAGE))
return 0;
return kvm_mmu_load(vcpu);
@@ -106,7 +108,7 @@ static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu)
static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
{
- u64 root_hpa = vcpu->arch.mmu->root_hpa;
+ u64 root_hpa = vcpu->arch.mmu->root.hpa;
if (!VALID_PAGE(root_hpa))
return;
@@ -203,44 +205,6 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
}
/*
- * Currently, we have two sorts of write-protection, a) the first one
- * write-protects guest page to sync the guest modification, b) another one is
- * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences
- * between these two sorts are:
- * 1) the first case clears MMU-writable bit.
- * 2) the first case requires flushing tlb immediately avoiding corrupting
- * shadow page table between all vcpus so it should be in the protection of
- * mmu-lock. And the another case does not need to flush tlb until returning
- * the dirty bitmap to userspace since it only write-protects the page
- * logged in the bitmap, that means the page in the dirty bitmap is not
- * missed, so it can flush tlb out of mmu-lock.
- *
- * So, there is the problem: the first case can meet the corrupted tlb caused
- * by another case which write-protects pages but without flush tlb
- * immediately. In order to making the first case be aware this problem we let
- * it flush tlb if we try to write-protect a spte whose MMU-writable bit
- * is set, it works since another case never touches MMU-writable bit.
- *
- * Anyway, whenever a spte is updated (only permission and status bits are
- * changed) we need to check whether the spte with MMU-writable becomes
- * readonly, if that happens, we need to flush tlb. Fortunately,
- * mmu_spte_update() has already handled it perfectly.
- *
- * The rules to use MMU-writable and PT_WRITABLE_MASK:
- * - if we want to see if it has writable tlb entry or if the spte can be
- * writable on the mmu mapping, check MMU-writable, this is the most
- * case, otherwise
- * - if we fix page fault on the spte or do write-protection by dirty logging,
- * check PT_WRITABLE_MASK.
- *
- * TODO: introduce APIs to split these two cases.
- */
-static inline bool is_writable_pte(unsigned long pte)
-{
- return pte & PT_WRITABLE_MASK;
-}
-
-/*
* Check if a given access (described through the I/D, W/R and U/S bits of a
* page fault error code pfec) causes a permission fault with the given PTE
* access rights (in ACC_* format).
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 5628d0ba637e..51671cb34fb6 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -104,15 +104,6 @@ static int max_huge_page_level __read_mostly;
static int tdp_root_level __read_mostly;
static int max_tdp_level __read_mostly;
-enum {
- AUDIT_PRE_PAGE_FAULT,
- AUDIT_POST_PAGE_FAULT,
- AUDIT_PRE_PTE_WRITE,
- AUDIT_POST_PTE_WRITE,
- AUDIT_PRE_SYNC,
- AUDIT_POST_SYNC
-};
-
#ifdef MMU_DEBUG
bool dbg = 0;
module_param(dbg, bool, 0644);
@@ -190,8 +181,6 @@ struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;
static void mmu_spte_set(u64 *sptep, u64 spte);
-static union kvm_mmu_page_role
-kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
struct kvm_mmu_role_regs {
const unsigned long cr0;
@@ -529,6 +518,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
u64 old_spte = *sptep;
WARN_ON(!is_shadow_present_pte(new_spte));
+ check_spte_writable_invariants(new_spte);
if (!is_shadow_present_pte(old_spte)) {
mmu_spte_set(sptep, new_spte);
@@ -548,11 +538,9 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
/* Rules for using mmu_spte_update:
* Update the state bits, it means the mapped pfn is not changed.
*
- * Whenever we overwrite a writable spte with a read-only one we
- * should flush remote TLBs. Otherwise rmap_write_protect
- * will find a read-only spte, even though the writable spte
- * might be cached on a CPU's TLB, the return value indicates this
- * case.
+ * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
+ * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
+ * spte, even though the writable spte might be cached on a CPU's TLB.
*
* Returns true if the TLB needs to be flushed
*/
@@ -646,24 +634,6 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
return __get_spte_lockless(sptep);
}
-/* Restore an acc-track PTE back to a regular PTE */
-static u64 restore_acc_track_spte(u64 spte)
-{
- u64 new_spte = spte;
- u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
- & SHADOW_ACC_TRACK_SAVED_BITS_MASK;
-
- WARN_ON_ONCE(spte_ad_enabled(spte));
- WARN_ON_ONCE(!is_access_track_spte(spte));
-
- new_spte &= ~shadow_acc_track_mask;
- new_spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
- SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
- new_spte |= saved_bits;
-
- return new_spte;
-}
-
/* Returns the Accessed status of the PTE and resets it at the same time. */
static bool mmu_spte_age(u64 *sptep)
{
@@ -1229,9 +1199,8 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
return mmu_spte_update(sptep, spte);
}
-static bool __rmap_write_protect(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head,
- bool pt_protect)
+static bool rmap_write_protect(struct kvm_rmap_head *rmap_head,
+ bool pt_protect)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1311,7 +1280,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
while (mask) {
rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
PG_LEVEL_4K, slot);
- __rmap_write_protect(kvm, rmap_head, false);
+ rmap_write_protect(rmap_head, false);
/* clear the first set bit */
mask &= mask - 1;
@@ -1378,6 +1347,9 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
+ if (READ_ONCE(eager_page_split))
+ kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
+
kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
/* Cross two large pages? */
@@ -1410,7 +1382,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
if (kvm_memslots_have_rmaps(kvm)) {
for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
rmap_head = gfn_to_rmap(gfn, i, slot);
- write_protected |= __rmap_write_protect(kvm, rmap_head, true);
+ write_protected |= rmap_write_protect(rmap_head, true);
}
}
@@ -1421,7 +1393,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
return write_protected;
}
-static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
{
struct kvm_memory_slot *slot;
@@ -1921,13 +1893,6 @@ static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
return true;
}
-#ifdef CONFIG_KVM_MMU_AUDIT
-#include "mmu_audit.c"
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
-static void mmu_audit_disable(void) { }
-#endif
-
static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
if (sp->role.invalid)
@@ -2024,7 +1989,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu,
bool protected = false;
for_each_sp(pages, sp, parents, i)
- protected |= rmap_write_protect(vcpu, sp->gfn);
+ protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn);
if (protected) {
kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true);
@@ -2149,7 +2114,7 @@ trace_get_page:
hlist_add_head(&sp->hash_link, sp_list);
if (!direct) {
account_shadowed(vcpu->kvm, sp);
- if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn))
+ if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
}
trace_kvm_mmu_get_page(sp, true);
@@ -2179,7 +2144,7 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
* prev_root is currently only used for 64-bit hosts. So only
* the active root_hpa is valid here.
*/
- BUG_ON(root != vcpu->arch.mmu->root_hpa);
+ BUG_ON(root != vcpu->arch.mmu->root.hpa);
iterator->shadow_addr
= vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
@@ -2193,7 +2158,7 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
struct kvm_vcpu *vcpu, u64 addr)
{
- shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa,
+ shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa,
addr);
}
@@ -2307,7 +2272,7 @@ static int kvm_mmu_page_unlink_children(struct kvm *kvm,
return zapped;
}
-static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
+static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -2345,13 +2310,13 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
struct list_head *invalid_list,
int *nr_zapped)
{
- bool list_unstable;
+ bool list_unstable, zapped_root = false;
trace_kvm_mmu_prepare_zap_page(sp);
++kvm->stat.mmu_shadow_zapped;
*nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
*nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
- kvm_mmu_unlink_parents(kvm, sp);
+ kvm_mmu_unlink_parents(sp);
/* Zapping children means active_mmu_pages has become unstable. */
list_unstable = *nr_zapped;
@@ -2387,14 +2352,20 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
* in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also
* treats invalid shadow pages as being obsolete.
*/
- if (!is_obsolete_sp(kvm, sp))
- kvm_reload_remote_mmus(kvm);
+ zapped_root = !is_obsolete_sp(kvm, sp);
}
if (sp->lpage_disallowed)
unaccount_huge_nx_page(kvm, sp);
sp->role.invalid = 1;
+
+ /*
+ * Make the request to free obsolete roots after marking the root
+ * invalid, otherwise other vCPUs may not see it as invalid.
+ */
+ if (zapped_root)
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
return list_unstable;
}
@@ -3239,6 +3210,8 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
return;
sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
+ if (WARN_ON(!sp))
+ return;
if (is_tdp_mmu_page(sp))
kvm_tdp_mmu_put_root(kvm, sp, false);
@@ -3249,18 +3222,20 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
}
/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
-void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
ulong roots_to_free)
{
- struct kvm *kvm = vcpu->kvm;
int i;
LIST_HEAD(invalid_list);
- bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
+ bool free_active_root;
BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
/* Before acquiring the MMU lock, see if we need to do any real work. */
- if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
+ free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT)
+ && VALID_PAGE(mmu->root.hpa);
+
+ if (!free_active_root) {
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
VALID_PAGE(mmu->prev_roots[i].hpa))
@@ -3278,9 +3253,8 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
&invalid_list);
if (free_active_root) {
- if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
- (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
- mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list);
+ if (to_shadow_page(mmu->root.hpa)) {
+ mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
} else if (mmu->pae_root) {
for (i = 0; i < 4; ++i) {
if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
@@ -3291,8 +3265,8 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
mmu->pae_root[i] = INVALID_PAE_ROOT;
}
}
- mmu->root_hpa = INVALID_PAGE;
- mmu->root_pgd = 0;
+ mmu->root.hpa = INVALID_PAGE;
+ mmu->root.pgd = 0;
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
@@ -3300,7 +3274,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
-void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
{
unsigned long roots_to_free = 0;
hpa_t root_hpa;
@@ -3322,7 +3296,7 @@ void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
}
- kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
+ kvm_mmu_free_roots(kvm, mmu, roots_to_free);
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
@@ -3365,10 +3339,10 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
if (is_tdp_mmu_enabled(vcpu->kvm)) {
root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
- mmu->root_hpa = root;
+ mmu->root.hpa = root;
} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
- mmu->root_hpa = root;
+ mmu->root.hpa = root;
} else if (shadow_root_level == PT32E_ROOT_LEVEL) {
if (WARN_ON_ONCE(!mmu->pae_root)) {
r = -EIO;
@@ -3383,15 +3357,15 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
mmu->pae_root[i] = root | PT_PRESENT_MASK |
shadow_me_mask;
}
- mmu->root_hpa = __pa(mmu->pae_root);
+ mmu->root.hpa = __pa(mmu->pae_root);
} else {
WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
r = -EIO;
goto out_unlock;
}
- /* root_pgd is ignored for direct MMUs. */
- mmu->root_pgd = 0;
+ /* root.pgd is ignored for direct MMUs. */
+ mmu->root.pgd = 0;
out_unlock:
write_unlock(&vcpu->kvm->mmu_lock);
return r;
@@ -3504,7 +3478,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
if (mmu->root_level >= PT64_ROOT_4LEVEL) {
root = mmu_alloc_root(vcpu, root_gfn, 0,
mmu->shadow_root_level, false);
- mmu->root_hpa = root;
+ mmu->root.hpa = root;
goto set_root_pgd;
}
@@ -3554,14 +3528,14 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
}
if (mmu->shadow_root_level == PT64_ROOT_5LEVEL)
- mmu->root_hpa = __pa(mmu->pml5_root);
+ mmu->root.hpa = __pa(mmu->pml5_root);
else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
- mmu->root_hpa = __pa(mmu->pml4_root);
+ mmu->root.hpa = __pa(mmu->pml4_root);
else
- mmu->root_hpa = __pa(mmu->pae_root);
+ mmu->root.hpa = __pa(mmu->pae_root);
set_root_pgd:
- mmu->root_pgd = root_pgd;
+ mmu->root.pgd = root_pgd;
out_unlock:
write_unlock(&vcpu->kvm->mmu_lock);
@@ -3660,6 +3634,14 @@ static bool is_unsync_root(hpa_t root)
*/
smp_rmb();
sp = to_shadow_page(root);
+
+ /*
+ * PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the
+ * PDPTEs for a given PAE root need to be synchronized individually.
+ */
+ if (WARN_ON_ONCE(!sp))
+ return false;
+
if (sp->unsync || sp->unsync_children)
return true;
@@ -3674,30 +3656,25 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
if (vcpu->arch.mmu->direct_map)
return;
- if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
+ if (!VALID_PAGE(vcpu->arch.mmu->root.hpa))
return;
vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
- hpa_t root = vcpu->arch.mmu->root_hpa;
+ hpa_t root = vcpu->arch.mmu->root.hpa;
sp = to_shadow_page(root);
if (!is_unsync_root(root))
return;
write_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
-
mmu_sync_children(vcpu, sp, true);
-
- kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
write_unlock(&vcpu->kvm->mmu_lock);
return;
}
write_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu->pae_root[i];
@@ -3709,7 +3686,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
}
}
- kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
write_unlock(&vcpu->kvm->mmu_lock);
}
@@ -3723,7 +3699,7 @@ void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
/* sync prev_roots by simply freeing them */
- kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
+ kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free);
}
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
@@ -3982,7 +3958,7 @@ out_retry:
static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault, int mmu_seq)
{
- struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root_hpa);
+ struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa);
/* Special roots, e.g. pae_root, are not backed by shadow pages. */
if (sp && is_obsolete_sp(vcpu->kvm, sp))
@@ -3996,7 +3972,7 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
* previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
* to reload even if no vCPU is actively using the root.
*/
- if (!sp && kvm_test_request(KVM_REQ_MMU_RELOAD, vcpu))
+ if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
return true;
return fault->slot &&
@@ -4132,74 +4108,105 @@ static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
union kvm_mmu_page_role role)
{
return (role.direct || pgd == root->pgd) &&
- VALID_PAGE(root->hpa) && to_shadow_page(root->hpa) &&
+ VALID_PAGE(root->hpa) &&
role.word == to_shadow_page(root->hpa)->role.word;
}
/*
- * Find out if a previously cached root matching the new pgd/role is available.
- * The current root is also inserted into the cache.
- * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
- * returned.
- * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
- * false is returned. This root should now be freed by the caller.
+ * Find out if a previously cached root matching the new pgd/role is available,
+ * and insert the current root as the MRU in the cache.
+ * If a matching root is found, it is assigned to kvm_mmu->root and
+ * true is returned.
+ * If no match is found, kvm_mmu->root is left invalid, the LRU root is
+ * evicted to make room for the current root, and false is returned.
*/
-static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_pgd,
- union kvm_mmu_page_role new_role)
+static bool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu,
+ gpa_t new_pgd,
+ union kvm_mmu_page_role new_role)
{
uint i;
- struct kvm_mmu_root_info root;
- struct kvm_mmu *mmu = vcpu->arch.mmu;
-
- root.pgd = mmu->root_pgd;
- root.hpa = mmu->root_hpa;
- if (is_root_usable(&root, new_pgd, new_role))
+ if (is_root_usable(&mmu->root, new_pgd, new_role))
return true;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
- swap(root, mmu->prev_roots[i]);
-
- if (is_root_usable(&root, new_pgd, new_role))
- break;
+ /*
+ * The swaps end up rotating the cache like this:
+ * C 0 1 2 3 (on entry to the function)
+ * 0 C 1 2 3
+ * 1 C 0 2 3
+ * 2 C 0 1 3
+ * 3 C 0 1 2 (on exit from the loop)
+ */
+ swap(mmu->root, mmu->prev_roots[i]);
+ if (is_root_usable(&mmu->root, new_pgd, new_role))
+ return true;
}
- mmu->root_hpa = root.hpa;
- mmu->root_pgd = root.pgd;
-
- return i < KVM_MMU_NUM_PREV_ROOTS;
+ kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
+ return false;
}
-static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd,
- union kvm_mmu_page_role new_role)
+/*
+ * Find out if a previously cached root matching the new pgd/role is available.
+ * On entry, mmu->root is invalid.
+ * If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry
+ * of the cache becomes invalid, and true is returned.
+ * If no match is found, kvm_mmu->root is left invalid and false is returned.
+ */
+static bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu,
+ gpa_t new_pgd,
+ union kvm_mmu_page_role new_role)
{
- struct kvm_mmu *mmu = vcpu->arch.mmu;
+ uint i;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role))
+ goto hit;
+
+ return false;
+hit:
+ swap(mmu->root, mmu->prev_roots[i]);
+ /* Bubble up the remaining roots. */
+ for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++)
+ mmu->prev_roots[i] = mmu->prev_roots[i + 1];
+ mmu->prev_roots[i].hpa = INVALID_PAGE;
+ return true;
+}
+
+static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
+ gpa_t new_pgd, union kvm_mmu_page_role new_role)
+{
/*
- * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
+ * For now, limit the caching to 64-bit hosts+VMs in order to avoid
* having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
* later if necessary.
*/
- if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
- mmu->root_level >= PT64_ROOT_4LEVEL)
- return cached_root_available(vcpu, new_pgd, new_role);
+ if (VALID_PAGE(mmu->root.hpa) && !to_shadow_page(mmu->root.hpa))
+ kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
- return false;
+ if (VALID_PAGE(mmu->root.hpa))
+ return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role);
+ else
+ return cached_root_find_without_current(kvm, mmu, new_pgd, new_role);
}
-static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
- union kvm_mmu_page_role new_role)
+void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
{
- if (!fast_pgd_switch(vcpu, new_pgd, new_role)) {
- kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT);
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ union kvm_mmu_page_role new_role = mmu->mmu_role.base;
+
+ if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) {
+ /* kvm_mmu_ensure_valid_pgd will set up a new root. */
return;
}
/*
* It's possible that the cached previous root page is obsolete because
* of a change in the MMU generation number. However, changing the
- * generation number is accompanied by KVM_REQ_MMU_RELOAD, which will
- * free the root set here and allocate a new one.
+ * generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS,
+ * which will free the root set here and allocate a new one.
*/
kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
@@ -4222,12 +4229,7 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
*/
if (!new_role.direct)
__clear_sp_write_flooding_count(
- to_shadow_page(vcpu->arch.mmu->root_hpa));
-}
-
-void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
-{
- __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu));
+ to_shadow_page(vcpu->arch.mmu->root.hpa));
}
EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
@@ -4485,8 +4487,7 @@ static inline bool boot_cpu_is_amd(void)
* possible, however, kvm currently does not do execution-protection.
*/
static void
-reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
+reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
{
struct rsvd_bits_validate *shadow_zero_check;
int i;
@@ -4517,8 +4518,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
* is the shadow page table for intel nested guest.
*/
static void
-reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context, bool execonly)
+reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
{
__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
reserved_hpa_bits(), execonly,
@@ -4805,7 +4805,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->gva_to_gpa = paging32_gva_to_gpa;
reset_guest_paging_metadata(vcpu, context);
- reset_tdp_shadow_zero_bits_mask(vcpu, context);
+ reset_tdp_shadow_zero_bits_mask(context);
}
static union kvm_mmu_role
@@ -4899,9 +4899,8 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
new_role = kvm_calc_shadow_npt_root_page_role(vcpu, &regs);
- __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base);
-
shadow_mmu_init_context(vcpu, context, &regs, new_role);
+ kvm_mmu_new_pgd(vcpu, nested_cr3);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
@@ -4939,27 +4938,25 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
execonly, level);
- __kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base);
-
- if (new_role.as_u64 == context->mmu_role.as_u64)
- return;
-
- context->mmu_role.as_u64 = new_role.as_u64;
-
- context->shadow_root_level = level;
-
- context->ept_ad = accessed_dirty;
- context->page_fault = ept_page_fault;
- context->gva_to_gpa = ept_gva_to_gpa;
- context->sync_page = ept_sync_page;
- context->invlpg = ept_invlpg;
- context->root_level = level;
- context->direct_map = false;
+ if (new_role.as_u64 != context->mmu_role.as_u64) {
+ context->mmu_role.as_u64 = new_role.as_u64;
+
+ context->shadow_root_level = level;
+
+ context->ept_ad = accessed_dirty;
+ context->page_fault = ept_page_fault;
+ context->gva_to_gpa = ept_gva_to_gpa;
+ context->sync_page = ept_sync_page;
+ context->invlpg = ept_invlpg;
+ context->root_level = level;
+ context->direct_map = false;
+ update_permission_bitmask(context, true);
+ context->pkru_mask = 0;
+ reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
+ reset_ept_shadow_zero_bits_mask(context, execonly);
+ }
- update_permission_bitmask(context, true);
- context->pkru_mask = 0;
- reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
- reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
+ kvm_mmu_new_pgd(vcpu, new_eptp);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
@@ -5044,20 +5041,6 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_init_mmu);
-static union kvm_mmu_page_role
-kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
- union kvm_mmu_role role;
-
- if (tdp_enabled)
- role = kvm_calc_tdp_mmu_root_page_role(vcpu, &regs, true);
- else
- role = kvm_calc_shadow_mmu_root_page_role(vcpu, &regs, true);
-
- return role.base;
-}
-
void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
/*
@@ -5111,17 +5094,73 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
kvm_mmu_sync_roots(vcpu);
kvm_mmu_load_pgd(vcpu);
- static_call(kvm_x86_tlb_flush_current)(vcpu);
+
+ /*
+ * Flush any TLB entries for the new root, the provenance of the root
+ * is unknown. Even if KVM ensures there are no stale TLB entries
+ * for a freed root, in theory another hypervisor could have left
+ * stale entries. Flushing on alloc also allows KVM to skip the TLB
+ * flush when freeing a root (see kvm_tdp_mmu_put_root()).
+ */
+ static_call(kvm_x86_flush_tlb_current)(vcpu);
out:
return r;
}
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
- kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
- WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
- kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
- WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
+ struct kvm *kvm = vcpu->kvm;
+
+ kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
+ WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
+ kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+ WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+}
+
+static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa)
+{
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(root_hpa))
+ return false;
+
+ /*
+ * When freeing obsolete roots, treat roots as obsolete if they don't
+ * have an associated shadow page. This does mean KVM will get false
+ * positives and free roots that don't strictly need to be freed, but
+ * such false positives are relatively rare:
+ *
+ * (a) only PAE paging and nested NPT has roots without shadow pages
+ * (b) remote reloads due to a memslot update obsoletes _all_ roots
+ * (c) KVM doesn't track previous roots for PAE paging, and the guest
+ * is unlikely to zap an in-use PGD.
+ */
+ sp = to_shadow_page(root_hpa);
+ return !sp || is_obsolete_sp(kvm, sp);
+}
+
+static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu)
+{
+ unsigned long roots_to_free = 0;
+ int i;
+
+ if (is_obsolete_root(kvm, mmu->root.hpa))
+ roots_to_free |= KVM_MMU_ROOT_CURRENT;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ if (is_obsolete_root(kvm, mmu->root.hpa))
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+ }
+
+ if (roots_to_free)
+ kvm_mmu_free_roots(kvm, mmu, roots_to_free);
+}
+
+void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
+{
+ __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
+ __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
}
static bool need_remote_flush(u64 old, u64 new)
@@ -5271,7 +5310,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
++vcpu->kvm->stat.mmu_pte_write;
- kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
if (detect_write_misaligned(sp, gpa, bytes) ||
@@ -5296,7 +5334,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
}
}
kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
- kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
write_unlock(&vcpu->kvm->mmu_lock);
}
@@ -5306,7 +5343,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
int r, emulation_type = EMULTYPE_PF;
bool direct = vcpu->arch.mmu->direct_map;
- if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
+ if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
return RET_PF_RETRY;
r = RET_PF_INVALID;
@@ -5371,14 +5408,14 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
if (is_noncanonical_address(gva, vcpu))
return;
- static_call(kvm_x86_tlb_flush_gva)(vcpu, gva);
+ static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
}
if (!mmu->invlpg)
return;
if (root_hpa == INVALID_PAGE) {
- mmu->invlpg(vcpu, gva, mmu->root_hpa);
+ mmu->invlpg(vcpu, gva, mmu->root.hpa);
/*
* INVLPG is required to invalidate any global mappings for the VA,
@@ -5414,7 +5451,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
uint i;
if (pcid == kvm_get_active_pcid(vcpu)) {
- mmu->invlpg(vcpu, gva, mmu->root_hpa);
+ mmu->invlpg(vcpu, gva, mmu->root.hpa);
tlb_flush = true;
}
@@ -5427,7 +5464,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
}
if (tlb_flush)
- static_call(kvm_x86_tlb_flush_gva)(vcpu, gva);
+ static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
++vcpu->stat.invlpg;
@@ -5527,8 +5564,8 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
struct page *page;
int i;
- mmu->root_hpa = INVALID_PAGE;
- mmu->root_pgd = 0;
+ mmu->root.hpa = INVALID_PAGE;
+ mmu->root.pgd = 0;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
@@ -5648,9 +5685,13 @@ restart:
}
/*
- * Trigger a remote TLB flush before freeing the page tables to ensure
- * KVM is not in the middle of a lockless shadow page table walk, which
- * may reference the pages.
+ * Kick all vCPUs (via remote TLB flush) before freeing the page tables
+ * to ensure KVM is not in the middle of a lockless shadow page table
+ * walk, which may reference the pages. The remote TLB flush itself is
+ * not required and is simply a convenient way to kick vCPUs as needed.
+ * KVM performs a local TLB flush when allocating a new root (see
+ * kvm_mmu_load()), and the reload in the caller ensure no vCPUs are
+ * running with an obsolete MMU.
*/
kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
}
@@ -5680,11 +5721,11 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
*/
kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
- /* In order to ensure all threads see this change when
- * handling the MMU reload signal, this must happen in the
- * same critical section as kvm_reload_remote_mmus, and
- * before kvm_zap_obsolete_pages as kvm_zap_obsolete_pages
- * could drop the MMU lock and yield.
+ /*
+ * In order to ensure all vCPUs drop their soon-to-be invalid roots,
+ * invalidating TDP MMU roots must be done while holding mmu_lock for
+ * write and in the same critical section as making the reload request,
+ * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
*/
if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_invalidate_all_roots(kvm);
@@ -5697,17 +5738,22 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
* Note: we need to do this under the protection of mmu_lock,
* otherwise, vcpu would purge shadow page but miss tlb flush.
*/
- kvm_reload_remote_mmus(kvm);
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
kvm_zap_obsolete_pages(kvm);
write_unlock(&kvm->mmu_lock);
- if (is_tdp_mmu_enabled(kvm)) {
- read_lock(&kvm->mmu_lock);
+ /*
+ * Zap the invalidated TDP MMU roots, all SPTEs must be dropped before
+ * returning to the caller, e.g. if the zap is in response to a memslot
+ * deletion, mmu_notifier callbacks will be unable to reach the SPTEs
+ * associated with the deleted memslot once the update completes, and
+ * Deferring the zap until the final reference to the root is put would
+ * lead to use-after-free.
+ */
+ if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_zap_invalidated_roots(kvm);
- read_unlock(&kvm->mmu_lock);
- }
}
static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
@@ -5813,7 +5859,7 @@ static bool slot_rmap_write_protect(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
const struct kvm_memory_slot *slot)
{
- return __rmap_write_protect(kvm, rmap_head, false);
+ return rmap_write_protect(rmap_head, false);
}
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@ -5857,12 +5903,52 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
* will clear a separate software-only bit (MMU-writable) and skip the
* flush if-and-only-if this bit was already clear.
*
- * See DEFAULT_SPTE_MMU_WRITEABLE for more details.
+ * See is_writable_pte() for more details.
*/
if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
}
+/* Must be called with the mmu_lock held in write-mode. */
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ u64 start, u64 end,
+ int target_level)
+{
+ if (is_tdp_mmu_enabled(kvm))
+ kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end,
+ target_level, false);
+
+ /*
+ * A TLB flush is unnecessary at this point for the same resons as in
+ * kvm_mmu_slot_try_split_huge_pages().
+ */
+}
+
+void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ int target_level)
+{
+ u64 start = memslot->base_gfn;
+ u64 end = start + memslot->npages;
+
+ if (is_tdp_mmu_enabled(kvm)) {
+ read_lock(&kvm->mmu_lock);
+ kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
+ read_unlock(&kvm->mmu_lock);
+ }
+
+ /*
+ * No TLB flush is necessary here. KVM will flush TLBs after
+ * write-protecting and/or clearing dirty on the newly split SPTEs to
+ * ensure that guest writes are reflected in the dirty log before the
+ * ioctl to enable dirty logging on this memslot completes. Since the
+ * split SPTEs retain the write and dirty bits of the huge SPTE, it is
+ * safe for KVM to decide if a TLB flush is necessary based on the split
+ * SPTEs.
+ */
+}
+
static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
const struct kvm_memory_slot *slot)
@@ -6202,7 +6288,6 @@ void kvm_mmu_module_exit(void)
mmu_destroy_caches();
percpu_counter_destroy(&kvm_total_used_mmu_pages);
unregister_shrinker(&mmu_shrinker);
- mmu_audit_disable();
}
/*
@@ -6272,6 +6357,13 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
rcu_idx = srcu_read_lock(&kvm->srcu);
write_lock(&kvm->mmu_lock);
+ /*
+ * Zapping TDP MMU shadow pages, including the remote TLB flush, must
+ * be done under RCU protection, because the pages are freed via RCU
+ * callback.
+ */
+ rcu_read_lock();
+
ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
for ( ; to_zap; --to_zap) {
@@ -6296,12 +6388,18 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+ rcu_read_unlock();
+
cond_resched_rwlock_write(&kvm->mmu_lock);
flush = false;
+
+ rcu_read_lock();
}
}
kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+ rcu_read_unlock();
+
write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, rcu_idx);
}
diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c
deleted file mode 100644
index 9e7dcf999f08..000000000000
--- a/arch/x86/kvm/mmu/mmu_audit.c
+++ /dev/null
@@ -1,303 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * mmu_audit.c:
- *
- * Audit code for KVM MMU
- *
- * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * Authors:
- * Yaniv Kamay <yaniv@qumranet.com>
- * Avi Kivity <avi@qumranet.com>
- * Marcelo Tosatti <mtosatti@redhat.com>
- * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
- */
-
-#include <linux/ratelimit.h>
-
-static char const *audit_point_name[] = {
- "pre page fault",
- "post page fault",
- "pre pte write",
- "post pte write",
- "pre sync",
- "post sync"
-};
-
-#define audit_printk(kvm, fmt, args...) \
- printk(KERN_ERR "audit: (%s) error: " \
- fmt, audit_point_name[kvm->arch.audit_point], ##args)
-
-typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
-
-static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- inspect_spte_fn fn, int level)
-{
- int i;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- u64 *ent = sp->spt;
-
- fn(vcpu, ent + i, level);
-
- if (is_shadow_present_pte(ent[i]) &&
- !is_last_spte(ent[i], level)) {
- struct kvm_mmu_page *child;
-
- child = to_shadow_page(ent[i] & PT64_BASE_ADDR_MASK);
- __mmu_spte_walk(vcpu, child, fn, level - 1);
- }
- }
-}
-
-static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
-{
- int i;
- struct kvm_mmu_page *sp;
-
- if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
- return;
-
- if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
- hpa_t root = vcpu->arch.mmu->root_hpa;
-
- sp = to_shadow_page(root);
- __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu->root_level);
- return;
- }
-
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu->pae_root[i];
-
- if (IS_VALID_PAE_ROOT(root)) {
- root &= PT64_BASE_ADDR_MASK;
- sp = to_shadow_page(root);
- __mmu_spte_walk(vcpu, sp, fn, 2);
- }
- }
-
- return;
-}
-
-typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp);
-
-static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
-{
- struct kvm_mmu_page *sp;
-
- list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
- fn(kvm, sp);
-}
-
-static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- struct kvm_mmu_page *sp;
- gfn_t gfn;
- kvm_pfn_t pfn;
- hpa_t hpa;
-
- sp = sptep_to_sp(sptep);
-
- if (sp->unsync) {
- if (level != PG_LEVEL_4K) {
- audit_printk(vcpu->kvm, "unsync sp: %p "
- "level = %d\n", sp, level);
- return;
- }
- }
-
- if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
- return;
-
- gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
- pfn = kvm_vcpu_gfn_to_pfn_atomic(vcpu, gfn);
-
- if (is_error_pfn(pfn))
- return;
-
- hpa = pfn << PAGE_SHIFT;
- if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
- audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
- "ent %llxn", vcpu->arch.mmu->root_level, pfn,
- hpa, *sptep);
-}
-
-static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
-{
- static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
- struct kvm_rmap_head *rmap_head;
- struct kvm_mmu_page *rev_sp;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *slot;
- gfn_t gfn;
-
- rev_sp = sptep_to_sp(sptep);
- gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
-
- slots = kvm_memslots_for_spte_role(kvm, rev_sp->role);
- slot = __gfn_to_memslot(slots, gfn);
- if (!slot) {
- if (!__ratelimit(&ratelimit_state))
- return;
- audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
- audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
- (long int)(sptep - rev_sp->spt), rev_sp->gfn);
- dump_stack();
- return;
- }
-
- rmap_head = gfn_to_rmap(gfn, rev_sp->role.level, slot);
- if (!rmap_head->val) {
- if (!__ratelimit(&ratelimit_state))
- return;
- audit_printk(kvm, "no rmap for writable spte %llx\n",
- *sptep);
- dump_stack();
- }
-}
-
-static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level))
- inspect_spte_has_rmap(vcpu->kvm, sptep);
-}
-
-static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- struct kvm_mmu_page *sp = sptep_to_sp(sptep);
-
- if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
- audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
- "root.\n", sp);
-}
-
-static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- int i;
-
- if (sp->role.level != PG_LEVEL_4K)
- return;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (!is_shadow_present_pte(sp->spt[i]))
- continue;
-
- inspect_spte_has_rmap(kvm, sp->spt + i);
- }
-}
-
-static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- struct kvm_rmap_head *rmap_head;
- u64 *sptep;
- struct rmap_iterator iter;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *slot;
-
- if (sp->role.direct || sp->unsync || sp->role.invalid)
- return;
-
- slots = kvm_memslots_for_spte_role(kvm, sp->role);
- slot = __gfn_to_memslot(slots, sp->gfn);
- rmap_head = gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot);
-
- for_each_rmap_spte(rmap_head, &iter, sptep) {
- if (is_writable_pte(*sptep))
- audit_printk(kvm, "shadow page has writable "
- "mappings: gfn %llx role %x\n",
- sp->gfn, sp->role.word);
- }
-}
-
-static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- check_mappings_rmap(kvm, sp);
- audit_write_protection(kvm, sp);
-}
-
-static void audit_all_active_sps(struct kvm *kvm)
-{
- walk_all_active_sps(kvm, audit_sp);
-}
-
-static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- audit_sptes_have_rmaps(vcpu, sptep, level);
- audit_mappings(vcpu, sptep, level);
- audit_spte_after_sync(vcpu, sptep, level);
-}
-
-static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
-{
- mmu_spte_walk(vcpu, audit_spte);
-}
-
-static bool mmu_audit;
-static DEFINE_STATIC_KEY_FALSE(mmu_audit_key);
-
-static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
-{
- static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
-
- if (!__ratelimit(&ratelimit_state))
- return;
-
- vcpu->kvm->arch.audit_point = point;
- audit_all_active_sps(vcpu->kvm);
- audit_vcpu_spte(vcpu);
-}
-
-static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
-{
- if (static_branch_unlikely((&mmu_audit_key)))
- __kvm_mmu_audit(vcpu, point);
-}
-
-static void mmu_audit_enable(void)
-{
- if (mmu_audit)
- return;
-
- static_branch_inc(&mmu_audit_key);
- mmu_audit = true;
-}
-
-static void mmu_audit_disable(void)
-{
- if (!mmu_audit)
- return;
-
- static_branch_dec(&mmu_audit_key);
- mmu_audit = false;
-}
-
-static int mmu_audit_set(const char *val, const struct kernel_param *kp)
-{
- int ret;
- unsigned long enable;
-
- ret = kstrtoul(val, 10, &enable);
- if (ret < 0)
- return -EINVAL;
-
- switch (enable) {
- case 0:
- mmu_audit_disable();
- break;
- case 1:
- mmu_audit_enable();
- break;
- default:
- return -EINVAL;
- }
-
- return 0;
-}
-
-static const struct kernel_param_ops audit_param_ops = {
- .set = mmu_audit_set,
- .get = param_get_bool,
-};
-
-arch_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index da6166b5c377..1bff453f7cbe 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -30,6 +30,8 @@ extern bool dbg;
#define INVALID_PAE_ROOT 0
#define IS_VALID_PAE_ROOT(x) (!!(x))
+typedef u64 __rcu *tdp_ptep_t;
+
struct kvm_mmu_page {
/*
* Note, "link" through "spt" fit in a single 64 byte cache line on
@@ -59,8 +61,17 @@ struct kvm_mmu_page {
refcount_t tdp_mmu_root_count;
};
unsigned int unsync_children;
- struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
- DECLARE_BITMAP(unsync_child_bitmap, 512);
+ union {
+ struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
+ tdp_ptep_t ptep;
+ };
+ union {
+ DECLARE_BITMAP(unsync_child_bitmap, 512);
+ struct {
+ struct work_struct tdp_mmu_async_work;
+ void *tdp_mmu_async_data;
+ };
+ };
struct list_head lpage_disallowed_link;
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index de5e8e4e1aa7..12247b96af01 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -416,6 +416,29 @@ TRACE_EVENT(
)
);
+TRACE_EVENT(
+ kvm_mmu_split_huge_page,
+ TP_PROTO(u64 gfn, u64 spte, int level, int errno),
+ TP_ARGS(gfn, spte, level, errno),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, spte)
+ __field(int, level)
+ __field(int, errno)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = gfn;
+ __entry->spte = spte;
+ __entry->level = level;
+ __entry->errno = errno;
+ ),
+
+ TP_printk("gfn %llx spte %llx level %d errno %d",
+ __entry->gfn, __entry->spte, __entry->level, __entry->errno)
+);
+
#endif /* _TRACE_KVMMMU_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 68eb1fb548b6..2e09d1b6249f 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -47,8 +47,8 @@ int kvm_page_track_create_memslot(struct kvm *kvm,
continue;
slot->arch.gfn_track[i] =
- kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]),
- GFP_KERNEL_ACCOUNT);
+ __vcalloc(npages, sizeof(*slot->arch.gfn_track[i]),
+ GFP_KERNEL_ACCOUNT);
if (!slot->arch.gfn_track[i])
goto track_free;
}
@@ -75,7 +75,8 @@ int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot)
if (slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE])
return 0;
- gfn_track = kvcalloc(slot->npages, sizeof(*gfn_track), GFP_KERNEL_ACCOUNT);
+ gfn_track = __vcalloc(slot->npages, sizeof(*gfn_track),
+ GFP_KERNEL_ACCOUNT);
if (gfn_track == NULL)
return -ENOMEM;
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 5b5bdac97c7b..252c77805eb9 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -668,7 +668,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
if (FNAME(gpte_changed)(vcpu, gw, top_level))
goto out_gpte_changed;
- if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
+ if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
goto out_gpte_changed;
for (shadow_walk_init(&it, vcpu, fault->addr);
@@ -904,12 +904,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (is_page_fault_stale(vcpu, fault, mmu_seq))
goto out_unlock;
- kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
r = make_mmu_pages_available(vcpu);
if (r)
goto out_unlock;
r = FNAME(fetch)(vcpu, fault, &walker);
- kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
out_unlock:
write_unlock(&vcpu->kvm->mmu_lock);
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 73cfe62fdad1..4739b53c9734 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -192,6 +192,65 @@ out:
return wrprot;
}
+static u64 make_spte_executable(u64 spte)
+{
+ bool is_access_track = is_access_track_spte(spte);
+
+ if (is_access_track)
+ spte = restore_acc_track_spte(spte);
+
+ spte &= ~shadow_nx_mask;
+ spte |= shadow_x_mask;
+
+ if (is_access_track)
+ spte = mark_spte_for_access_track(spte);
+
+ return spte;
+}
+
+/*
+ * Construct an SPTE that maps a sub-page of the given huge page SPTE where
+ * `index` identifies which sub-page.
+ *
+ * This is used during huge page splitting to build the SPTEs that make up the
+ * new page table.
+ */
+u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index)
+{
+ u64 child_spte;
+ int child_level;
+
+ if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte)))
+ return 0;
+
+ if (WARN_ON_ONCE(!is_large_pte(huge_spte)))
+ return 0;
+
+ child_spte = huge_spte;
+ child_level = huge_level - 1;
+
+ /*
+ * The child_spte already has the base address of the huge page being
+ * split. So we just have to OR in the offset to the page at the next
+ * lower level for the given index.
+ */
+ child_spte |= (index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT;
+
+ if (child_level == PG_LEVEL_4K) {
+ child_spte &= ~PT_PAGE_SIZE_MASK;
+
+ /*
+ * When splitting to a 4K page, mark the page executable as the
+ * NX hugepage mitigation no longer applies.
+ */
+ if (is_nx_huge_page_enabled())
+ child_spte = make_spte_executable(child_spte);
+ }
+
+ return child_spte;
+}
+
+
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
{
u64 spte = SPTE_MMU_PRESENT_MASK;
@@ -250,14 +309,7 @@ u64 mark_spte_for_access_track(u64 spte)
if (is_access_track_spte(spte))
return spte;
- /*
- * Making an Access Tracking PTE will result in removal of write access
- * from the PTE. So, verify that we will be able to restore the write
- * access in the fast page fault path later on.
- */
- WARN_ONCE((spte & PT_WRITABLE_MASK) &&
- !spte_can_locklessly_be_made_writable(spte),
- "kvm: Writable SPTE is not locklessly dirty-trackable\n");
+ check_spte_writable_invariants(spte);
WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT),
@@ -368,8 +420,8 @@ void kvm_mmu_reset_all_pte_masks(void)
shadow_acc_track_mask = 0;
shadow_me_mask = sme_me_mask;
- shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITEABLE;
- shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITEABLE;
+ shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITABLE;
+ shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITABLE;
/*
* Set a reserved PA bit in MMIO SPTEs to generate page faults with
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index be6a007a4af3..73f12615416f 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -75,33 +75,13 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
/*
- * *_SPTE_HOST_WRITEABLE (aka Host-writable) indicates whether the host permits
- * writes to the guest page mapped by the SPTE. This bit is cleared on SPTEs
- * that map guest pages in read-only memslots and read-only VMAs.
- *
- * Invariants:
- * - If Host-writable is clear, PT_WRITABLE_MASK must be clear.
- *
- *
- * *_SPTE_MMU_WRITEABLE (aka MMU-writable) indicates whether the shadow MMU
- * allows writes to the guest page mapped by the SPTE. This bit is cleared when
- * the guest page mapped by the SPTE contains a page table that is being
- * monitored for shadow paging. In this case the SPTE can only be made writable
- * by unsyncing the shadow page under the mmu_lock.
- *
- * Invariants:
- * - If MMU-writable is clear, PT_WRITABLE_MASK must be clear.
- * - If MMU-writable is set, Host-writable must be set.
- *
- * If MMU-writable is set, PT_WRITABLE_MASK is normally set but can be cleared
- * to track writes for dirty logging. For such SPTEs, KVM will locklessly set
- * PT_WRITABLE_MASK upon the next write from the guest and record the write in
- * the dirty log (see fast_page_fault()).
+ * {DEFAULT,EPT}_SPTE_{HOST,MMU}_WRITABLE are used to keep track of why a given
+ * SPTE is write-protected. See is_writable_pte() for details.
*/
/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
-#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9)
-#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10)
+#define DEFAULT_SPTE_HOST_WRITABLE BIT_ULL(9)
+#define DEFAULT_SPTE_MMU_WRITABLE BIT_ULL(10)
/*
* Low ignored bits are at a premium for EPT, use high ignored bits, taking care
@@ -339,15 +319,86 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
__is_rsvd_bits_set(rsvd_check, spte, level);
}
-static inline bool spte_can_locklessly_be_made_writable(u64 spte)
+/*
+ * An shadow-present leaf SPTE may be non-writable for 3 possible reasons:
+ *
+ * 1. To intercept writes for dirty logging. KVM write-protects huge pages
+ * so that they can be split be split down into the dirty logging
+ * granularity (4KiB) whenever the guest writes to them. KVM also
+ * write-protects 4KiB pages so that writes can be recorded in the dirty log
+ * (e.g. if not using PML). SPTEs are write-protected for dirty logging
+ * during the VM-iotcls that enable dirty logging.
+ *
+ * 2. To intercept writes to guest page tables that KVM is shadowing. When a
+ * guest writes to its page table the corresponding shadow page table will
+ * be marked "unsync". That way KVM knows which shadow page tables need to
+ * be updated on the next TLB flush, INVLPG, etc. and which do not.
+ *
+ * 3. To prevent guest writes to read-only memory, such as for memory in a
+ * read-only memslot or guest memory backed by a read-only VMA. Writes to
+ * such pages are disallowed entirely.
+ *
+ * To keep track of why a given SPTE is write-protected, KVM uses 2
+ * software-only bits in the SPTE:
+ *
+ * shadow_mmu_writable_mask, aka MMU-writable -
+ * Cleared on SPTEs that KVM is currently write-protecting for shadow paging
+ * purposes (case 2 above).
+ *
+ * shadow_host_writable_mask, aka Host-writable -
+ * Cleared on SPTEs that are not host-writable (case 3 above)
+ *
+ * Note, not all possible combinations of PT_WRITABLE_MASK,
+ * shadow_mmu_writable_mask, and shadow_host_writable_mask are valid. A given
+ * SPTE can be in only one of the following states, which map to the
+ * aforementioned 3 cases:
+ *
+ * shadow_host_writable_mask | shadow_mmu_writable_mask | PT_WRITABLE_MASK
+ * ------------------------- | ------------------------ | ----------------
+ * 1 | 1 | 1 (writable)
+ * 1 | 1 | 0 (case 1)
+ * 1 | 0 | 0 (case 2)
+ * 0 | 0 | 0 (case 3)
+ *
+ * The valid combinations of these bits are checked by
+ * check_spte_writable_invariants() whenever an SPTE is modified.
+ *
+ * Clearing the MMU-writable bit is always done under the MMU lock and always
+ * accompanied by a TLB flush before dropping the lock to avoid corrupting the
+ * shadow page tables between vCPUs. Write-protecting an SPTE for dirty logging
+ * (which does not clear the MMU-writable bit), does not flush TLBs before
+ * dropping the lock, as it only needs to synchronize guest writes with the
+ * dirty bitmap.
+ *
+ * So, there is the problem: clearing the MMU-writable bit can encounter a
+ * write-protected SPTE while CPUs still have writable mappings for that SPTE
+ * cached in their TLB. To address this, KVM always flushes TLBs when
+ * write-protecting SPTEs if the MMU-writable bit is set on the old SPTE.
+ *
+ * The Host-writable bit is not modified on present SPTEs, it is only set or
+ * cleared when an SPTE is first faulted in from non-present and then remains
+ * immutable.
+ */
+static inline bool is_writable_pte(unsigned long pte)
{
- if (spte & shadow_mmu_writable_mask) {
- WARN_ON_ONCE(!(spte & shadow_host_writable_mask));
- return true;
- }
+ return pte & PT_WRITABLE_MASK;
+}
+
+/* Note: spte must be a shadow-present leaf SPTE. */
+static inline void check_spte_writable_invariants(u64 spte)
+{
+ if (spte & shadow_mmu_writable_mask)
+ WARN_ONCE(!(spte & shadow_host_writable_mask),
+ "kvm: MMU-writable SPTE is not Host-writable: %llx",
+ spte);
+ else
+ WARN_ONCE(is_writable_pte(spte),
+ "kvm: Writable SPTE is not MMU-writable: %llx", spte);
+}
- WARN_ON_ONCE(spte & PT_WRITABLE_MASK);
- return false;
+static inline bool spte_can_locklessly_be_made_writable(u64 spte)
+{
+ return spte & shadow_mmu_writable_mask;
}
static inline u64 get_mmio_spte_generation(u64 spte)
@@ -364,9 +415,25 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
u64 old_spte, bool prefetch, bool can_unsync,
bool host_writable, u64 *new_spte);
+u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index);
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
u64 mark_spte_for_access_track(u64 spte);
+
+/* Restore an acc-track PTE back to a regular PTE */
+static inline u64 restore_acc_track_spte(u64 spte)
+{
+ u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
+ & SHADOW_ACC_TRACK_SAVED_BITS_MASK;
+
+ spte &= ~shadow_acc_track_mask;
+ spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
+ spte |= saved_bits;
+
+ return spte;
+}
+
u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn);
void kvm_mmu_reset_all_pte_masks(void);
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index caa96c270b95..6d3b3e5a5533 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -12,7 +12,7 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
{
iter->sptep = iter->pt_path[iter->level - 1] +
SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
- iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
+ iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
}
static gfn_t round_gfn_for_level(gfn_t gfn, int level)
@@ -40,17 +40,19 @@ void tdp_iter_restart(struct tdp_iter *iter)
* Sets a TDP iterator to walk a pre-order traversal of the paging structure
* rooted at root_pt, starting with the walk to translate next_last_level_gfn.
*/
-void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
+void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
int min_level, gfn_t next_last_level_gfn)
{
+ int root_level = root->role.level;
+
WARN_ON(root_level < 1);
WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
iter->next_last_level_gfn = next_last_level_gfn;
iter->root_level = root_level;
iter->min_level = min_level;
- iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt;
- iter->as_id = kvm_mmu_page_as_id(sptep_to_sp(root_pt));
+ iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt;
+ iter->as_id = kvm_mmu_page_as_id(root);
tdp_iter_restart(iter);
}
@@ -87,7 +89,7 @@ static bool try_step_down(struct tdp_iter *iter)
* Reread the SPTE before stepping down to avoid traversing into page
* tables that are no longer linked from this entry.
*/
- iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
+ iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
child_pt = spte_to_child_pt(iter->old_spte, iter->level);
if (!child_pt)
@@ -121,7 +123,7 @@ static bool try_step_side(struct tdp_iter *iter)
iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
iter->next_last_level_gfn = iter->gfn;
iter->sptep++;
- iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
+ iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
return true;
}
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index e19cabbcb65c..b1eaf6ec0e0b 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -7,7 +7,20 @@
#include "mmu.h"
-typedef u64 __rcu *tdp_ptep_t;
+/*
+ * TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs)
+ * to be zapped while holding mmu_lock for read, and to allow TLB flushes to be
+ * batched without having to collect the list of zapped SPs. Flows that can
+ * remove SPs must service pending TLB flushes prior to dropping RCU protection.
+ */
+static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
+{
+ return READ_ONCE(*rcu_dereference(sptep));
+}
+static inline void kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 val)
+{
+ WRITE_ONCE(*rcu_dereference(sptep), val);
+}
/*
* A TDP iterator performs a pre-order walk over a TDP paging structure.
@@ -57,17 +70,17 @@ struct tdp_iter {
* Iterates over every SPTE mapping the GFN range [start, end) in a
* preorder traversal.
*/
-#define for_each_tdp_pte_min_level(iter, root, root_level, min_level, start, end) \
- for (tdp_iter_start(&iter, root, root_level, min_level, start); \
+#define for_each_tdp_pte_min_level(iter, root, min_level, start, end) \
+ for (tdp_iter_start(&iter, root, min_level, start); \
iter.valid && iter.gfn < end; \
tdp_iter_next(&iter))
-#define for_each_tdp_pte(iter, root, root_level, start, end) \
- for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end)
+#define for_each_tdp_pte(iter, root, start, end) \
+ for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end)
tdp_ptep_t spte_to_child_pt(u64 pte, int level);
-void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
+void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
int min_level, gfn_t next_last_level_gfn);
void tdp_iter_next(struct tdp_iter *iter);
void tdp_iter_restart(struct tdp_iter *iter);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index bc9e3553fba2..e7e7876251b3 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -25,17 +25,22 @@ bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
+ kvm->arch.tdp_mmu_zap_wq =
+ alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
return true;
}
-static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
+/* Arbitrarily returns true so that this may be used in if statements. */
+static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
bool shared)
{
if (shared)
lockdep_assert_held_read(&kvm->mmu_lock);
else
lockdep_assert_held_write(&kvm->mmu_lock);
+
+ return true;
}
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
@@ -43,20 +48,20 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
if (!kvm->arch.tdp_mmu_enabled)
return;
+ flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
+ destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
+
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
/*
* Ensure that all the outstanding RCU callbacks to free shadow pages
- * can run before the VM is torn down.
+ * can run before the VM is torn down. Work items on tdp_mmu_zap_wq
+ * can call kvm_tdp_mmu_put_root and create new callbacks.
*/
rcu_barrier();
}
-static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end, bool can_yield, bool flush,
- bool shared);
-
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
free_page((unsigned long)sp->spt);
@@ -79,6 +84,56 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
tdp_mmu_free_sp(sp);
}
+static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared);
+
+static void tdp_mmu_zap_root_work(struct work_struct *work)
+{
+ struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
+ tdp_mmu_async_work);
+ struct kvm *kvm = root->tdp_mmu_async_data;
+
+ read_lock(&kvm->mmu_lock);
+
+ /*
+ * A TLB flush is not necessary as KVM performs a local TLB flush when
+ * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
+ * to a different pCPU. Note, the local TLB flush on reuse also
+ * invalidates any paging-structure-cache entries, i.e. TLB entries for
+ * intermediate paging structures, that may be zapped, as such entries
+ * are associated with the ASID on both VMX and SVM.
+ */
+ tdp_mmu_zap_root(kvm, root, true);
+
+ /*
+ * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
+ * avoiding an infinite loop. By design, the root is reachable while
+ * it's being asynchronously zapped, thus a different task can put its
+ * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
+ * asynchronously zapped root is unavoidable.
+ */
+ kvm_tdp_mmu_put_root(kvm, root, true);
+
+ read_unlock(&kvm->mmu_lock);
+}
+
+static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
+{
+ root->tdp_mmu_async_data = kvm;
+ INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
+ queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
+}
+
+static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
+{
+ union kvm_mmu_page_role role = page->role;
+ role.invalid = true;
+
+ /* No need to use cmpxchg, only the invalid bit can change. */
+ role.word = xchg(&page->role.word, role.word);
+ return role.invalid;
+}
+
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
bool shared)
{
@@ -89,25 +144,63 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
WARN_ON(!root->tdp_mmu_page);
+ /*
+ * The root now has refcount=0. It is valid, but readers already
+ * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
+ * rejects it. This remains true for the rest of the execution
+ * of this function, because readers visit valid roots only
+ * (except for tdp_mmu_zap_root_work(), which however
+ * does not acquire any reference itself).
+ *
+ * Even though there are flows that need to visit all roots for
+ * correctness, they all take mmu_lock for write, so they cannot yet
+ * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
+ * since the root still has refcount=0.
+ *
+ * However, tdp_mmu_zap_root can yield, and writers do not expect to
+ * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
+ * So the root temporarily gets an extra reference, going to refcount=1
+ * while staying invalid. Readers still cannot acquire any reference;
+ * but writers are now allowed to run if tdp_mmu_zap_root yields and
+ * they might take an extra reference if they themselves yield.
+ * Therefore, when the reference is given back by the worker,
+ * there is no guarantee that the refcount is still 1. If not, whoever
+ * puts the last reference will free the page, but they will not have to
+ * zap the root because a root cannot go from invalid to valid.
+ */
+ if (!kvm_tdp_root_mark_invalid(root)) {
+ refcount_set(&root->tdp_mmu_root_count, 1);
+
+ /*
+ * Zapping the root in a worker is not just "nice to have";
+ * it is required because kvm_tdp_mmu_invalidate_all_roots()
+ * skips already-invalid roots. If kvm_tdp_mmu_put_root() did
+ * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
+ * might return with some roots not zapped yet.
+ */
+ tdp_mmu_schedule_zap_root(kvm, root);
+ return;
+ }
+
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
list_del_rcu(&root->link);
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
-
- zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);
-
call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
/*
- * Finds the next valid root after root (or the first valid root if root
- * is NULL), takes a reference on it, and returns that next root. If root
- * is not NULL, this thread should have already taken a reference on it, and
- * that reference will be dropped. If no valid root is found, this
- * function will return NULL.
+ * Returns the next root after @prev_root (or the first root if @prev_root is
+ * NULL). A reference to the returned root is acquired, and the reference to
+ * @prev_root is released (the caller obviously must hold a reference to
+ * @prev_root if it's non-NULL).
+ *
+ * If @only_valid is true, invalid roots are skipped.
+ *
+ * Returns NULL if the end of tdp_mmu_roots was reached.
*/
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
struct kvm_mmu_page *prev_root,
- bool shared)
+ bool shared, bool only_valid)
{
struct kvm_mmu_page *next_root;
@@ -121,9 +214,14 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
typeof(*next_root), link);
- while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
+ while (next_root) {
+ if ((!only_valid || !next_root->role.invalid) &&
+ kvm_tdp_mmu_get_root(next_root))
+ break;
+
next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
&next_root->link, typeof(*next_root), link);
+ }
rcu_read_unlock();
@@ -143,71 +241,91 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
* mode. In the unlikely event that this thread must free a root, the lock
* will be temporarily dropped and reacquired in write mode.
*/
-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
- for (_root = tdp_mmu_next_root(_kvm, NULL, _shared); \
- _root; \
- _root = tdp_mmu_next_root(_kvm, _root, _shared)) \
- if (kvm_mmu_page_as_id(_root) != _as_id) { \
- } else
-
-#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
- list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link, \
- lockdep_is_held_type(&kvm->mmu_lock, 0) || \
- lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock)) \
- if (kvm_mmu_page_as_id(_root) != _as_id) { \
+#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
+ for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \
+ _root; \
+ _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \
+ if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \
+ kvm_mmu_page_as_id(_root) != _as_id) { \
} else
-static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
- int level)
-{
- union kvm_mmu_page_role role;
+#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
+ __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
- role = vcpu->arch.mmu->mmu_role.base;
- role.level = level;
- role.direct = true;
- role.has_4_byte_gpte = false;
- role.access = ACC_ALL;
- role.ad_disabled = !shadow_accessed_mask;
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
+ __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
- return role;
-}
+/*
+ * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
+ * the implication being that any flow that holds mmu_lock for read is
+ * inherently yield-friendly and should use the yield-safe variant above.
+ * Holding mmu_lock for write obviates the need for RCU protection as the list
+ * is guaranteed to be stable.
+ */
+#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
+ list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
+ if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
+ kvm_mmu_page_as_id(_root) != _as_id) { \
+ } else
-static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
- int level)
+static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
struct kvm_mmu_page *sp;
sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
+
+ return sp;
+}
+
+static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
+ gfn_t gfn, union kvm_mmu_page_role role)
+{
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
- sp->role.word = page_role_for_level(vcpu, level).word;
+ sp->role = role;
sp->gfn = gfn;
+ sp->ptep = sptep;
sp->tdp_mmu_page = true;
trace_kvm_mmu_get_page(sp, true);
+}
- return sp;
+static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
+ struct tdp_iter *iter)
+{
+ struct kvm_mmu_page *parent_sp;
+ union kvm_mmu_page_role role;
+
+ parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
+
+ role = parent_sp->role;
+ role.level--;
+
+ tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
- union kvm_mmu_page_role role;
+ union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base;
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_page *root;
lockdep_assert_held_write(&kvm->mmu_lock);
- role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
-
- /* Check for an existing root before allocating a new one. */
+ /*
+ * Check for an existing root before allocating a new one. Note, the
+ * role check prevents consuming an invalid root.
+ */
for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
if (root->role.word == role.word &&
- kvm_tdp_mmu_get_root(kvm, root))
+ kvm_tdp_mmu_get_root(root))
goto out;
}
- root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
+ root = tdp_mmu_alloc_sp(vcpu);
+ tdp_mmu_init_sp(root, NULL, 0, role);
+
refcount_set(&root->tdp_mmu_root_count, 1);
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
@@ -252,25 +370,7 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
}
/**
- * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
- *
- * @kvm: kvm instance
- * @sp: the new page
- * @account_nx: This page replaces a NX large page and should be marked for
- * eventual reclaim.
- */
-static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
- bool account_nx)
-{
- spin_lock(&kvm->arch.tdp_mmu_pages_lock);
- list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
- if (account_nx)
- account_huge_nx_page(kvm, sp);
- spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
-}
-
-/**
- * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
+ * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
*
* @kvm: kvm instance
* @sp: the page to be removed
@@ -278,8 +378,8 @@ static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
* the MMU lock and the operation must synchronize with other
* threads that might be adding or removing pages.
*/
-static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
- bool shared)
+static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
+ bool shared)
{
if (shared)
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
@@ -295,7 +395,7 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
}
/**
- * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
+ * handle_removed_pt() - handle a page table removed from the TDP structure
*
* @kvm: kvm instance
* @pt: the page removed from the paging structure
@@ -311,8 +411,7 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
* this thread will be responsible for ensuring the page is freed. Hence the
* early rcu_dereferences in the function.
*/
-static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
- bool shared)
+static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
int level = sp->role.level;
@@ -321,7 +420,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
trace_kvm_mmu_prepare_zap_page(sp);
- tdp_mmu_unlink_page(kvm, sp, shared);
+ tdp_mmu_unlink_sp(kvm, sp, shared);
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
u64 *sptep = rcu_dereference(pt) + i;
@@ -372,9 +471,6 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
shared);
}
- kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
- KVM_PAGES_PER_HPAGE(level + 1));
-
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
@@ -435,6 +531,9 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
+ if (is_leaf)
+ check_spte_writable_invariants(new_spte);
+
/*
* The only times a SPTE should be changed from a non-present to
* non-present state is when an MMIO entry is installed/modified/
@@ -469,11 +568,13 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
/*
* Recursively handle child PTs if the change removed a subtree from
- * the paging structure.
+ * the paging structure. Note the WARN on the PFN changing without the
+ * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
+ * pages are kernel allocations and should never be migrated.
*/
- if (was_present && !was_leaf && (pfn_changed || !is_present))
- handle_removed_tdp_mmu_page(kvm,
- spte_to_child_pt(old_spte, level), shared);
+ if (was_present && !was_leaf &&
+ (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
+ handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
@@ -492,53 +593,72 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
* and handle the associated bookkeeping. Do not mark the page dirty
* in KVM's dirty bitmaps.
*
+ * If setting the SPTE fails because it has changed, iter->old_spte will be
+ * refreshed to the current value of the spte.
+ *
* @kvm: kvm instance
* @iter: a tdp_iter instance currently on the SPTE that should be set
* @new_spte: The value the SPTE should be set to
- * Returns: true if the SPTE was set, false if it was not. If false is returned,
- * this function will have no side-effects.
+ * Return:
+ * * 0 - If the SPTE was set.
+ * * -EBUSY - If the SPTE cannot be set. In this case this function will have
+ * no side-effects other than setting iter->old_spte to the last
+ * known value of the spte.
*/
-static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
- struct tdp_iter *iter,
- u64 new_spte)
+static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
{
- WARN_ON_ONCE(iter->yielded);
-
- lockdep_assert_held_read(&kvm->mmu_lock);
+ u64 *sptep = rcu_dereference(iter->sptep);
+ u64 old_spte;
/*
- * Do not change removed SPTEs. Only the thread that froze the SPTE
- * may modify it.
+ * The caller is responsible for ensuring the old SPTE is not a REMOVED
+ * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
+ * and pre-checking before inserting a new SPTE is advantageous as it
+ * avoids unnecessary work.
*/
- if (is_removed_spte(iter->old_spte))
- return false;
+ WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
+
+ lockdep_assert_held_read(&kvm->mmu_lock);
/*
* Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
* does not hold the mmu_lock.
*/
- if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
- new_spte) != iter->old_spte)
- return false;
+ old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
+ if (old_spte != iter->old_spte) {
+ /*
+ * The page table entry was modified by a different logical
+ * CPU. Refresh iter->old_spte with the current value so the
+ * caller operates on fresh data, e.g. if it retries
+ * tdp_mmu_set_spte_atomic().
+ */
+ iter->old_spte = old_spte;
+ return -EBUSY;
+ }
__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
new_spte, iter->level, true);
handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
- return true;
+ return 0;
}
-static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
- struct tdp_iter *iter)
+static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter)
{
+ int ret;
+
/*
* Freeze the SPTE by setting it to a special,
* non-present value. This will stop other threads from
* immediately installing a present entry in its place
* before the TLBs are flushed.
*/
- if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
- return false;
+ ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
+ if (ret)
+ return ret;
kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
KVM_PAGES_PER_HPAGE(iter->level));
@@ -551,17 +671,21 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* here since the SPTE is going from non-present
* to non-present.
*/
- WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
+ kvm_tdp_mmu_write_spte(iter->sptep, 0);
- return true;
+ return 0;
}
/*
* __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
- * @kvm: kvm instance
- * @iter: a tdp_iter instance currently on the SPTE that should be set
- * @new_spte: The value the SPTE should be set to
+ * @kvm: KVM instance
+ * @as_id: Address space ID, i.e. regular vs. SMM
+ * @sptep: Pointer to the SPTE
+ * @old_spte: The current value of the SPTE
+ * @new_spte: The new value that will be set for the SPTE
+ * @gfn: The base GFN that was (or will be) mapped by the SPTE
+ * @level: The level _containing_ the SPTE (its parent PT's level)
* @record_acc_track: Notify the MM subsystem of changes to the accessed state
* of the page. Should be set unless handling an MMU
* notifier for access tracking. Leaving record_acc_track
@@ -573,58 +697,65 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* Leaving record_dirty_log unset in that case prevents page
* writes from being double counted.
*/
-static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
- u64 new_spte, bool record_acc_track,
- bool record_dirty_log)
+static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
+ u64 old_spte, u64 new_spte, gfn_t gfn, int level,
+ bool record_acc_track, bool record_dirty_log)
{
- WARN_ON_ONCE(iter->yielded);
-
lockdep_assert_held_write(&kvm->mmu_lock);
/*
- * No thread should be using this function to set SPTEs to the
+ * No thread should be using this function to set SPTEs to or from the
* temporary removed SPTE value.
* If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
* should be used. If operating under the MMU lock in write mode, the
* use of the removed SPTE should not be necessary.
*/
- WARN_ON(is_removed_spte(iter->old_spte));
+ WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
- WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
+ kvm_tdp_mmu_write_spte(sptep, new_spte);
+
+ __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
- __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
- new_spte, iter->level, false);
if (record_acc_track)
- handle_changed_spte_acc_track(iter->old_spte, new_spte,
- iter->level);
+ handle_changed_spte_acc_track(old_spte, new_spte, level);
if (record_dirty_log)
- handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
- iter->old_spte, new_spte,
- iter->level);
+ handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
+ new_spte, level);
+}
+
+static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
+ u64 new_spte, bool record_acc_track,
+ bool record_dirty_log)
+{
+ WARN_ON_ONCE(iter->yielded);
+
+ __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte,
+ new_spte, iter->gfn, iter->level,
+ record_acc_track, record_dirty_log);
}
static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
u64 new_spte)
{
- __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
+ _tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}
static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
struct tdp_iter *iter,
u64 new_spte)
{
- __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
+ _tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}
static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
struct tdp_iter *iter,
u64 new_spte)
{
- __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
+ _tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
- for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
+ for_each_tdp_pte(_iter, _root, _start, _end)
#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
tdp_root_for_each_pte(_iter, _root, _start, _end) \
@@ -634,8 +765,7 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
else
#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
- for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \
- _mmu->shadow_root_level, _start, _end)
+ for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
/*
* Yield if the MMU lock is contended or this thread needs to return control
@@ -662,11 +792,11 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
return false;
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
- rcu_read_unlock();
-
if (flush)
kvm_flush_remote_tlbs(kvm);
+ rcu_read_unlock();
+
if (shared)
cond_resched_rwlock_read(&kvm->mmu_lock);
else
@@ -682,6 +812,99 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
return iter->yielded;
}
+static inline gfn_t tdp_mmu_max_gfn_host(void)
+{
+ /*
+ * Bound TDP MMU walks at host.MAXPHYADDR, guest accesses beyond that
+ * will hit a #PF(RSVD) and never hit an EPT Violation/Misconfig / #NPF,
+ * and so KVM will never install a SPTE for such addresses.
+ */
+ return 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+}
+
+static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared, int zap_level)
+{
+ struct tdp_iter iter;
+
+ gfn_t end = tdp_mmu_max_gfn_host();
+ gfn_t start = 0;
+
+ for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
+ continue;
+
+ if (!is_shadow_present_pte(iter.old_spte))
+ continue;
+
+ if (iter.level > zap_level)
+ continue;
+
+ if (!shared)
+ tdp_mmu_set_spte(kvm, &iter, 0);
+ else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
+ goto retry;
+ }
+}
+
+static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared)
+{
+
+ /*
+ * The root must have an elevated refcount so that it's reachable via
+ * mmu_notifier callbacks, which allows this path to yield and drop
+ * mmu_lock. When handling an unmap/release mmu_notifier command, KVM
+ * must drop all references to relevant pages prior to completing the
+ * callback. Dropping mmu_lock with an unreachable root would result
+ * in zapping SPTEs after a relevant mmu_notifier callback completes
+ * and lead to use-after-free as zapping a SPTE triggers "writeback" of
+ * dirty accessed bits to the SPTE's associated struct page.
+ */
+ WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
+
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
+ rcu_read_lock();
+
+ /*
+ * To avoid RCU stalls due to recursively removing huge swaths of SPs,
+ * split the zap into two passes. On the first pass, zap at the 1gb
+ * level, and then zap top-level SPs on the second pass. "1gb" is not
+ * arbitrary, as KVM must be able to zap a 1gb shadow page without
+ * inducing a stall to allow in-place replacement with a 1gb hugepage.
+ *
+ * Because zapping a SP recurses on its children, stepping down to
+ * PG_LEVEL_4K in the iterator itself is unnecessary.
+ */
+ __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
+ __tdp_mmu_zap_root(kvm, root, shared, root->role.level);
+
+ rcu_read_unlock();
+}
+
+bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ u64 old_spte;
+
+ /*
+ * This helper intentionally doesn't allow zapping a root shadow page,
+ * which doesn't have a parent page table and thus no associated entry.
+ */
+ if (WARN_ON_ONCE(!sp->ptep))
+ return false;
+
+ old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
+ if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
+ return false;
+
+ __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
+ sp->gfn, sp->role.level + 1, true, true);
+
+ return true;
+}
+
/*
* Tears down the mappings for the range of gfns, [start, end), and frees the
* non-root pages mapping GFNs strictly within that range. Returns true if
@@ -693,18 +916,11 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
* function cannot yield, it will not release the MMU lock or reschedule and
* the caller must ensure it does not supply too large a GFN range, or the
* operation can cause a soft lockup.
- *
- * If shared is true, this thread holds the MMU lock in read mode and must
- * account for the possibility that other threads are modifying the paging
- * structures concurrently. If shared is false, this thread should hold the
- * MMU lock in write mode.
*/
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end, bool can_yield, bool flush,
- bool shared)
+ gfn_t start, gfn_t end, bool can_yield, bool flush)
{
- gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
- bool zap_all = (start == 0 && end >= max_gfn_host);
+ bool zap_all = (start == 0 && end >= tdp_mmu_max_gfn_host());
struct tdp_iter iter;
/*
@@ -713,22 +929,15 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
*/
int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
- /*
- * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
- * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
- * and so KVM will never install a SPTE for such addresses.
- */
- end = min(end, max_gfn_host);
+ end = min(end, tdp_mmu_max_gfn_host());
- kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+ lockdep_assert_held_write(&kvm->mmu_lock);
rcu_read_lock();
- for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
- min_level, start, end) {
-retry:
+ for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
if (can_yield &&
- tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
+ tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
flush = false;
continue;
}
@@ -747,21 +956,21 @@ retry:
!is_last_spte(iter.old_spte, iter.level))
continue;
- if (!shared) {
- tdp_mmu_set_spte(kvm, &iter, 0);
- flush = true;
- } else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
- /*
- * The iter must explicitly re-read the SPTE because
- * the atomic cmpxchg failed.
- */
- iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
- goto retry;
- }
+ tdp_mmu_set_spte(kvm, &iter, 0);
+ flush = true;
}
+ /*
+ * Need to flush before releasing RCU. TODO: do it only if intermediate
+ * page tables were zapped; there is no need to flush under RCU protection
+ * if no 'struct kvm_mmu_page' is freed.
+ */
+ if (flush)
+ kvm_flush_remote_tlbs_with_address(kvm, start, end - start);
+
rcu_read_unlock();
- return flush;
+
+ return false;
}
/*
@@ -775,107 +984,57 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
{
struct kvm_mmu_page *root;
- for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
- flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
- false);
+ for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
+ flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
return flush;
}
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
- bool flush = false;
+ struct kvm_mmu_page *root;
int i;
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
- flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush);
-
- if (flush)
- kvm_flush_remote_tlbs(kvm);
-}
-
-static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
- struct kvm_mmu_page *prev_root)
-{
- struct kvm_mmu_page *next_root;
-
- if (prev_root)
- next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
- &prev_root->link,
- typeof(*prev_root), link);
- else
- next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
- typeof(*next_root), link);
-
- while (next_root && !(next_root->role.invalid &&
- refcount_read(&next_root->tdp_mmu_root_count)))
- next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
- &next_root->link,
- typeof(*next_root), link);
-
- return next_root;
+ /*
+ * Zap all roots, including invalid roots, as all SPTEs must be dropped
+ * before returning to the caller. Zap directly even if the root is
+ * also being zapped by a worker. Walking zapped top-level SPTEs isn't
+ * all that expensive and mmu_lock is already held, which means the
+ * worker has yielded, i.e. flushing the work instead of zapping here
+ * isn't guaranteed to be any faster.
+ *
+ * A TLB flush is unnecessary, KVM zaps everything if and only the VM
+ * is being destroyed or the userspace VMM has exited. In both cases,
+ * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
+ */
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ for_each_tdp_mmu_root_yield_safe(kvm, root, i)
+ tdp_mmu_zap_root(kvm, root, false);
+ }
}
/*
- * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each
- * invalidated root, they will not be freed until this function drops the
- * reference. Before dropping that reference, tear down the paging
- * structure so that whichever thread does drop the last reference
- * only has to do a trivial amount of work. Since the roots are invalid,
- * no new SPTEs should be created under them.
+ * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
+ * zap" completes.
*/
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
- struct kvm_mmu_page *next_root;
- struct kvm_mmu_page *root;
- bool flush = false;
-
- lockdep_assert_held_read(&kvm->mmu_lock);
-
- rcu_read_lock();
-
- root = next_invalidated_root(kvm, NULL);
-
- while (root) {
- next_root = next_invalidated_root(kvm, root);
-
- rcu_read_unlock();
-
- flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);
-
- /*
- * Put the reference acquired in
- * kvm_tdp_mmu_invalidate_roots
- */
- kvm_tdp_mmu_put_root(kvm, root, true);
-
- root = next_root;
-
- rcu_read_lock();
- }
-
- rcu_read_unlock();
-
- if (flush)
- kvm_flush_remote_tlbs(kvm);
+ flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
}
/*
- * Mark each TDP MMU root as invalid so that other threads
- * will drop their references and allow the root count to
- * go to 0.
+ * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
+ * is about to be zapped, e.g. in response to a memslots update. The actual
+ * zapping is performed asynchronously, so a reference is taken on all roots.
+ * Using a separate workqueue makes it easy to ensure that the destruction is
+ * performed before the "fast zap" completes, without keeping a separate list
+ * of invalidated roots; the list is effectively the list of work items in
+ * the workqueue.
*
- * Also take a reference on all roots so that this thread
- * can do the bulk of the work required to free the roots
- * once they are invalidated. Without this reference, a
- * vCPU thread might drop the last reference to a root and
- * get stuck with tearing down the entire paging structure.
- *
- * Roots which have a zero refcount should be skipped as
- * they're already being torn down.
- * Already invalid roots should be referenced again so that
- * they aren't freed before kvm_tdp_mmu_zap_all_fast is
- * done with them.
+ * Get a reference even if the root is already invalid, the asynchronous worker
+ * assumes it was gifted a reference to the root it processes. Because mmu_lock
+ * is held for write, it should be impossible to observe a root with zero refcount,
+ * i.e. the list of roots cannot be stale.
*
* This has essentially the same effect for the TDP MMU
* as updating mmu_valid_gen does for the shadow MMU.
@@ -885,9 +1044,13 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
struct kvm_mmu_page *root;
lockdep_assert_held_write(&kvm->mmu_lock);
- list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
- if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
+ list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
+ if (!root->role.invalid &&
+ !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
root->role.invalid = true;
+ tdp_mmu_schedule_zap_root(kvm, root);
+ }
+ }
}
/*
@@ -913,8 +1076,12 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
if (new_spte == iter->old_spte)
ret = RET_PF_SPURIOUS;
- else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
+ else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
return RET_PF_RETRY;
+ else if (is_shadow_present_pte(iter->old_spte) &&
+ !is_last_spte(iter->old_spte, iter->level))
+ kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
+ KVM_PAGES_PER_HPAGE(iter->level + 1));
/*
* If the page fault was caused by a write but the page is write
@@ -947,6 +1114,44 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
}
/*
+ * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
+ * provided page table.
+ *
+ * @kvm: kvm instance
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @sp: The new TDP page table to install.
+ * @account_nx: True if this page table is being installed to split a
+ * non-executable huge page.
+ * @shared: This operation is running under the MMU lock in read mode.
+ *
+ * Returns: 0 if the new page table was installed. Non-0 if the page table
+ * could not be installed (e.g. the atomic compare-exchange failed).
+ */
+static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool account_nx,
+ bool shared)
+{
+ u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
+ int ret = 0;
+
+ if (shared) {
+ ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
+ if (ret)
+ return ret;
+ } else {
+ tdp_mmu_set_spte(kvm, iter, spte);
+ }
+
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
+ if (account_nx)
+ account_huge_nx_page(kvm, sp);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+
+ return 0;
+}
+
+/*
* Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
* page tables and SPTEs to translate the faulting guest physical address.
*/
@@ -955,8 +1160,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
struct kvm_mmu *mmu = vcpu->arch.mmu;
struct tdp_iter iter;
struct kvm_mmu_page *sp;
- u64 *child_pt;
- u64 new_spte;
int ret;
kvm_mmu_hugepage_adjust(vcpu, fault);
@@ -979,7 +1182,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
*/
if (is_shadow_present_pte(iter.old_spte) &&
is_large_pte(iter.old_spte)) {
- if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
+ if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
break;
/*
@@ -987,10 +1190,13 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* because the new value informs the !present
* path below.
*/
- iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
}
if (!is_shadow_present_pte(iter.old_spte)) {
+ bool account_nx = fault->huge_page_disallowed &&
+ fault->req_level >= iter.level;
+
/*
* If SPTE has been frozen by another thread, just
* give up and retry, avoiding unnecessary page table
@@ -999,26 +1205,21 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
if (is_removed_spte(iter.old_spte))
break;
- sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);
- child_pt = sp->spt;
+ sp = tdp_mmu_alloc_sp(vcpu);
+ tdp_mmu_init_child_sp(sp, &iter);
- new_spte = make_nonleaf_spte(child_pt,
- !shadow_accessed_mask);
-
- if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) {
- tdp_mmu_link_page(vcpu->kvm, sp,
- fault->huge_page_disallowed &&
- fault->req_level >= iter.level);
-
- trace_kvm_mmu_get_page(sp, true);
- } else {
+ if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
tdp_mmu_free_sp(sp);
break;
}
}
}
- if (iter.level != fault->goal_level) {
+ /*
+ * Force the guest to retry the access if the upper level SPTEs aren't
+ * in place, or if the target leaf SPTE is frozen by another CPU.
+ */
+ if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
rcu_read_unlock();
return RET_PF_RETRY;
}
@@ -1032,13 +1233,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
bool flush)
{
- struct kvm_mmu_page *root;
-
- for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
- flush = zap_gfn_range(kvm, root, range->start, range->end,
- range->may_block, flush, false);
-
- return flush;
+ return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start,
+ range->end, range->may_block, flush);
}
typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
@@ -1052,18 +1248,18 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
struct tdp_iter iter;
bool ret = false;
- rcu_read_lock();
-
/*
* Don't support rescheduling, none of the MMU notifiers that funnel
* into this helper allow blocking; it'd be dead, wasteful code.
*/
for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
+ rcu_read_lock();
+
tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
ret |= handler(kvm, &iter, range);
- }
- rcu_read_unlock();
+ rcu_read_unlock();
+ }
return ret;
}
@@ -1155,13 +1351,12 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
*/
bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
-
- /* FIXME: return 'flush' instead of flushing here. */
- if (flush)
- kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
-
- return false;
+ /*
+ * No need to handle the remote TLB flush under RCU protection, the
+ * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
+ * shadow page. See the WARN on pfn_changed in __handle_changed_spte().
+ */
+ return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
}
/*
@@ -1180,8 +1375,7 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
- for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
- min_level, start, end) {
+ for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
@@ -1193,14 +1387,9 @@ retry:
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
- if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
- /*
- * The iter must explicitly re-read the SPTE because
- * the atomic cmpxchg failed.
- */
- iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
goto retry;
- }
+
spte_set = true;
}
@@ -1221,13 +1410,197 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
lockdep_assert_held_read(&kvm->mmu_lock);
- for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages, min_level);
return spte_set;
}
+static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
+{
+ struct kvm_mmu_page *sp;
+
+ gfp |= __GFP_ZERO;
+
+ sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
+ if (!sp)
+ return NULL;
+
+ sp->spt = (void *)__get_free_page(gfp);
+ if (!sp->spt) {
+ kmem_cache_free(mmu_page_header_cache, sp);
+ return NULL;
+ }
+
+ return sp;
+}
+
+static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
+ struct tdp_iter *iter,
+ bool shared)
+{
+ struct kvm_mmu_page *sp;
+
+ /*
+ * Since we are allocating while under the MMU lock we have to be
+ * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
+ * reclaim and to avoid making any filesystem callbacks (which can end
+ * up invoking KVM MMU notifiers, resulting in a deadlock).
+ *
+ * If this allocation fails we drop the lock and retry with reclaim
+ * allowed.
+ */
+ sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
+ if (sp)
+ return sp;
+
+ rcu_read_unlock();
+
+ if (shared)
+ read_unlock(&kvm->mmu_lock);
+ else
+ write_unlock(&kvm->mmu_lock);
+
+ iter->yielded = true;
+ sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
+
+ if (shared)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
+
+ rcu_read_lock();
+
+ return sp;
+}
+
+static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool shared)
+{
+ const u64 huge_spte = iter->old_spte;
+ const int level = iter->level;
+ int ret, i;
+
+ tdp_mmu_init_child_sp(sp, iter);
+
+ /*
+ * No need for atomics when writing to sp->spt since the page table has
+ * not been linked in yet and thus is not reachable from any other CPU.
+ */
+ for (i = 0; i < PT64_ENT_PER_PAGE; i++)
+ sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);
+
+ /*
+ * Replace the huge spte with a pointer to the populated lower level
+ * page table. Since we are making this change without a TLB flush vCPUs
+ * will see a mix of the split mappings and the original huge mapping,
+ * depending on what's currently in their TLB. This is fine from a
+ * correctness standpoint since the translation will be the same either
+ * way.
+ */
+ ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
+ if (ret)
+ goto out;
+
+ /*
+ * tdp_mmu_link_sp_atomic() will handle subtracting the huge page we
+ * are overwriting from the page stats. But we have to manually update
+ * the page stats with the new present child pages.
+ */
+ kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);
+
+out:
+ trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
+ return ret;
+}
+
+static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
+ struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end,
+ int target_level, bool shared)
+{
+ struct kvm_mmu_page *sp = NULL;
+ struct tdp_iter iter;
+ int ret = 0;
+
+ rcu_read_lock();
+
+ /*
+ * Traverse the page table splitting all huge pages above the target
+ * level into one lower level. For example, if we encounter a 1GB page
+ * we split it into 512 2MB pages.
+ *
+ * Since the TDP iterator uses a pre-order traversal, we are guaranteed
+ * to visit an SPTE before ever visiting its children, which means we
+ * will correctly recursively split huge pages that are more than one
+ * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
+ * and then splitting each of those to 512 4KB pages).
+ */
+ for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
+ continue;
+
+ if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
+ continue;
+
+ if (!sp) {
+ sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
+ if (!sp) {
+ ret = -ENOMEM;
+ trace_kvm_mmu_split_huge_page(iter.gfn,
+ iter.old_spte,
+ iter.level, ret);
+ break;
+ }
+
+ if (iter.yielded)
+ continue;
+ }
+
+ if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
+ goto retry;
+
+ sp = NULL;
+ }
+
+ rcu_read_unlock();
+
+ /*
+ * It's possible to exit the loop having never used the last sp if, for
+ * example, a vCPU doing HugePage NX splitting wins the race and
+ * installs its own sp in place of the last sp we tried to split.
+ */
+ if (sp)
+ tdp_mmu_free_sp(sp);
+
+ return ret;
+}
+
+
+/*
+ * Try to split all huge pages mapped by the TDP MMU down to the target level.
+ */
+void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end,
+ int target_level, bool shared)
+{
+ struct kvm_mmu_page *root;
+ int r = 0;
+
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
+ r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
+ if (r) {
+ kvm_tdp_mmu_put_root(kvm, root, shared);
+ break;
+ }
+ }
+}
+
/*
* Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
* AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
@@ -1249,6 +1622,9 @@ retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
+ if (!is_shadow_present_pte(iter.old_spte))
+ continue;
+
if (spte_ad_need_write_protect(iter.old_spte)) {
if (is_writable_pte(iter.old_spte))
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
@@ -1261,14 +1637,9 @@ retry:
continue;
}
- if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
- /*
- * The iter must explicitly re-read the SPTE because
- * the atomic cmpxchg failed.
- */
- iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
goto retry;
- }
+
spte_set = true;
}
@@ -1291,7 +1662,7 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
lockdep_assert_held_read(&kvm->mmu_lock);
- for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages);
@@ -1392,14 +1763,8 @@ retry:
continue;
/* Note, a successful atomic zap also does a remote TLB flush. */
- if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
- /*
- * The iter must explicitly re-read the SPTE because
- * the atomic cmpxchg failed.
- */
- iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ if (tdp_mmu_zap_spte_atomic(kvm, &iter))
goto retry;
- }
}
rcu_read_unlock();
@@ -1416,7 +1781,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
lockdep_assert_held_read(&kvm->mmu_lock);
- for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
zap_collapsible_spte_range(kvm, root, slot);
}
@@ -1436,8 +1801,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
- for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
- min_level, gfn, gfn + 1) {
+ for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
continue;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 3899004a5d91..5e5ef2576c81 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -7,12 +7,8 @@
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
-__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm,
- struct kvm_mmu_page *root)
+__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
{
- if (root->role.invalid)
- return false;
-
return refcount_inc_not_zero(&root->tdp_mmu_root_count);
}
@@ -26,24 +22,8 @@ static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
{
return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush);
}
-static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level + 1);
-
- /*
- * Don't allow yielding, as the caller may have a flush pending. Note,
- * if mmu_lock is held for write, zapping will never yield in this case,
- * but explicitly disallow it for safety. The TDP MMU does not yield
- * until it has made forward progress (steps sideways), and when zapping
- * a single shadow page that it's guaranteed to see (thus the mmu_lock
- * requirement), its "step sideways" will always step beyond the bounds
- * of the shadow page's gfn range and stop iterating before yielding.
- */
- lockdep_assert_held_write(&kvm->mmu_lock);
- return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp),
- sp->gfn, end, false, false);
-}
+bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
void kvm_tdp_mmu_zap_all(struct kvm *kvm);
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
@@ -71,6 +51,11 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn,
int min_level);
+void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end,
+ int target_level, bool shared);
+
static inline void kvm_tdp_mmu_walk_lockless_begin(void)
{
rcu_read_lock();
@@ -94,7 +79,7 @@ static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu
static inline bool is_tdp_mmu(struct kvm_mmu *mmu)
{
struct kvm_mmu_page *sp;
- hpa_t hpa = mmu->root_hpa;
+ hpa_t hpa = mmu->root.hpa;
if (WARN_ON(!VALID_PAGE(hpa)))
return false;
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index fb3e20791338..b37b353ec086 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -324,18 +324,18 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
switch (id) {
case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
/*
- * AVIC hardware handles the generation of
- * IPIs when the specified Message Type is Fixed
- * (also known as fixed delivery mode) and
- * the Trigger Mode is edge-triggered. The hardware
- * also supports self and broadcast delivery modes
- * specified via the Destination Shorthand(DSH)
- * field of the ICRL. Logical and physical APIC ID
- * formats are supported. All other IPI types cause
- * a #VMEXIT, which needs to emulated.
+ * Emulate IPIs that are not handled by AVIC hardware, which
+ * only virtualizes Fixed, Edge-Triggered INTRs. The exit is
+ * a trap, e.g. ICR holds the correct value and RIP has been
+ * advanced, KVM is responsible only for emulating the IPI.
+ * Sadly, hardware may sometimes leave the BUSY flag set, in
+ * which case KVM needs to emulate the ICR write as well in
+ * order to clear the BUSY flag.
*/
- kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
- kvm_lapic_reg_write(apic, APIC_ICR, icrl);
+ if (icrl & APIC_ICR_BUSY)
+ kvm_apic_write_nodecode(vcpu, APIC_ICR);
+ else
+ kvm_apic_send_ipi(apic, icrl, icrh);
break;
case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
/*
@@ -477,30 +477,28 @@ static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
svm->dfr_reg = dfr;
}
-static int avic_unaccel_trap_write(struct vcpu_svm *svm)
+static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = svm->vcpu.arch.apic;
- u32 offset = svm->vmcb->control.exit_info_1 &
+ u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 &
AVIC_UNACCEL_ACCESS_OFFSET_MASK;
switch (offset) {
case APIC_ID:
- if (avic_handle_apic_id_update(&svm->vcpu))
+ if (avic_handle_apic_id_update(vcpu))
return 0;
break;
case APIC_LDR:
- if (avic_handle_ldr_update(&svm->vcpu))
+ if (avic_handle_ldr_update(vcpu))
return 0;
break;
case APIC_DFR:
- avic_handle_dfr_update(&svm->vcpu);
+ avic_handle_dfr_update(vcpu);
break;
default:
break;
}
- kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
-
+ kvm_apic_write_nodecode(vcpu, offset);
return 1;
}
@@ -550,7 +548,7 @@ int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
if (trap) {
/* Handling Trap */
WARN_ONCE(!write, "svm: Handling trap read.\n");
- ret = avic_unaccel_trap_write(svm);
+ ret = avic_unaccel_trap_write(vcpu);
} else {
/* Handling Fault */
ret = kvm_emulate_instruction(vcpu, 0);
@@ -578,7 +576,7 @@ int avic_init_vcpu(struct vcpu_svm *svm)
return ret;
}
-void avic_post_state_restore(struct kvm_vcpu *vcpu)
+void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
{
if (avic_handle_apic_id_update(vcpu) != 0)
return;
@@ -586,20 +584,7 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu)
avic_handle_ldr_update(vcpu);
}
-void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
-{
- return;
-}
-
-void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
-{
-}
-
-void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
-{
-}
-
-static int svm_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
+static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
{
int ret = 0;
unsigned long flags;
@@ -631,48 +616,6 @@ out:
return ret;
}
-void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *vmcb = svm->vmcb01.ptr;
- bool activated = kvm_vcpu_apicv_active(vcpu);
-
- if (!enable_apicv)
- return;
-
- if (activated) {
- /**
- * During AVIC temporary deactivation, guest could update
- * APIC ID, DFR and LDR registers, which would not be trapped
- * by avic_unaccelerated_access_interception(). In this case,
- * we need to check and update the AVIC logical APIC ID table
- * accordingly before re-activating.
- */
- avic_post_state_restore(vcpu);
- vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
- } else {
- vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
- }
- vmcb_mark_dirty(vmcb, VMCB_AVIC);
-
- if (activated)
- avic_vcpu_load(vcpu, vcpu->cpu);
- else
- avic_vcpu_put(vcpu);
-
- svm_set_pi_irte_mode(vcpu, activated);
-}
-
-void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
-{
- return;
-}
-
-bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
-{
- return false;
-}
-
static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
{
unsigned long flags;
@@ -770,7 +713,7 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
}
/*
- * svm_update_pi_irte - set IRTE for Posted-Interrupts
+ * avic_pi_update_irte - set IRTE for Posted-Interrupts
*
* @kvm: kvm
* @host_irq: host irq of the interrupt
@@ -778,8 +721,8 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
* @set: set or unset PI
* returns 0 on success, < 0 on failure
*/
-int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
+int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_irq_routing_table *irq_rt;
@@ -879,7 +822,7 @@ out:
return ret;
}
-bool svm_check_apicv_inhibit_reasons(ulong bit)
+bool avic_check_apicv_inhibit_reasons(ulong bit)
{
ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
BIT(APICV_INHIBIT_REASON_ABSENT) |
@@ -924,20 +867,15 @@ out:
return ret;
}
-void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
u64 entry;
- /* ID = 0xff (broadcast), ID > 0xff (reserved) */
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
lockdep_assert_preemption_disabled();
- /*
- * Since the host physical APIC id is 8 bits,
- * we can support host APIC ID upto 255.
- */
- if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
+ if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
/*
@@ -961,7 +899,7 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
}
-void avic_vcpu_put(struct kvm_vcpu *vcpu)
+void __avic_vcpu_put(struct kvm_vcpu *vcpu)
{
u64 entry;
struct vcpu_svm *svm = to_svm(vcpu);
@@ -980,13 +918,63 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
}
+static void avic_vcpu_load(struct kvm_vcpu *vcpu)
+{
+ int cpu = get_cpu();
+
+ WARN_ON(cpu != vcpu->cpu);
+
+ __avic_vcpu_load(vcpu, cpu);
+
+ put_cpu();
+}
+
+static void avic_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ preempt_disable();
+
+ __avic_vcpu_put(vcpu);
+
+ preempt_enable();
+}
+
+void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+ bool activated = kvm_vcpu_apicv_active(vcpu);
+
+ if (!enable_apicv)
+ return;
+
+ if (activated) {
+ /**
+ * During AVIC temporary deactivation, guest could update
+ * APIC ID, DFR and LDR registers, which would not be trapped
+ * by avic_unaccelerated_access_interception(). In this case,
+ * we need to check and update the AVIC logical APIC ID table
+ * accordingly before re-activating.
+ */
+ avic_apicv_post_state_restore(vcpu);
+ vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+ } else {
+ vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
+ }
+ vmcb_mark_dirty(vmcb, VMCB_AVIC);
+
+ if (activated)
+ avic_vcpu_load(vcpu);
+ else
+ avic_vcpu_put(vcpu);
+
+ avic_set_pi_irte_mode(vcpu, activated);
+}
+
void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
{
if (!kvm_vcpu_apicv_active(vcpu))
return;
- preempt_disable();
-
/*
* Unload the AVIC when the vCPU is about to block, _before_
* the vCPU actually blocks.
@@ -1001,21 +989,12 @@ void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
* the cause of errata #1235).
*/
avic_vcpu_put(vcpu);
-
- preempt_enable();
}
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
- int cpu;
-
if (!kvm_vcpu_apicv_active(vcpu))
return;
- cpu = get_cpu();
- WARN_ON(cpu != vcpu->cpu);
-
- avic_vcpu_load(vcpu, cpu);
-
- put_cpu();
+ avic_vcpu_load(vcpu);
}
diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h
new file mode 100644
index 000000000000..7d6d97968fb9
--- /dev/null
+++ b/arch/x86/kvm/svm/hyperv.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Common Hyper-V on KVM and KVM on Hyper-V definitions (SVM).
+ */
+
+#ifndef __ARCH_X86_KVM_SVM_HYPERV_H__
+#define __ARCH_X86_KVM_SVM_HYPERV_H__
+
+#include <asm/mshyperv.h>
+
+#include "../hyperv.h"
+
+/*
+ * Hyper-V uses the software reserved 32 bytes in VMCB
+ * control area to expose SVM enlightenments to guests.
+ */
+struct hv_enlightenments {
+ struct __packed hv_enlightenments_control {
+ u32 nested_flush_hypercall:1;
+ u32 msr_bitmap:1;
+ u32 enlightened_npt_tlb: 1;
+ u32 reserved:29;
+ } __packed hv_enlightenments_control;
+ u32 hv_vp_id;
+ u64 hv_vm_id;
+ u64 partition_assist_page;
+ u64 reserved;
+} __packed;
+
+/*
+ * Hyper-V uses the software reserved clean bit in VMCB
+ */
+#define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW
+
+#endif /* __ARCH_X86_KVM_SVM_HYPERV_H__ */
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 39d280e7e80e..96bab464967f 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -28,6 +28,7 @@
#include "cpuid.h"
#include "lapic.h"
#include "svm.h"
+#include "hyperv.h"
#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
@@ -165,14 +166,30 @@ void recalc_intercepts(struct vcpu_svm *svm)
vmcb_set_intercept(c, INTERCEPT_VMSAVE);
}
+/*
+ * Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function
+ * is optimized in that it only merges the parts where KVM MSR permission bitmap
+ * may contain zero bits.
+ */
static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
+ struct hv_enlightenments *hve =
+ (struct hv_enlightenments *)svm->nested.ctl.reserved_sw;
+ int i;
+
/*
- * This function merges the msr permission bitmaps of kvm and the
- * nested vmcb. It is optimized in that it only merges the parts where
- * the kvm msr permission bitmap may contain zero bits
+ * MSR bitmap update can be skipped when:
+ * - MSR bitmap for L1 hasn't changed.
+ * - Nested hypervisor (L1) is attempting to launch the same L2 as
+ * before.
+ * - Nested hypervisor (L1) is using Hyper-V emulation interface and
+ * tells KVM (L0) there were no changes in MSR bitmap for L2.
*/
- int i;
+ if (!svm->nested.force_msr_bitmap_recalc &&
+ kvm_hv_hypercall_enabled(&svm->vcpu) &&
+ hve->hv_enlightenments_control.msr_bitmap &&
+ (svm->nested.ctl.clean & BIT(VMCB_HV_NESTED_ENLIGHTENMENTS)))
+ goto set_msrpm_base_pa;
if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
return true;
@@ -193,6 +210,9 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
svm->nested.msrpm[p] = svm->msrpm[p] | value;
}
+ svm->nested.force_msr_bitmap_recalc = false;
+
+set_msrpm_base_pa:
svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
return true;
@@ -298,7 +318,8 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu)
}
static
-void __nested_copy_vmcb_control_to_cache(struct vmcb_ctrl_area_cached *to,
+void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
+ struct vmcb_ctrl_area_cached *to,
struct vmcb_control_area *from)
{
unsigned int i;
@@ -331,12 +352,19 @@ void __nested_copy_vmcb_control_to_cache(struct vmcb_ctrl_area_cached *to,
to->asid = from->asid;
to->msrpm_base_pa &= ~0x0fffULL;
to->iopm_base_pa &= ~0x0fffULL;
+
+ /* Hyper-V extensions (Enlightened VMCB) */
+ if (kvm_hv_hypercall_enabled(vcpu)) {
+ to->clean = from->clean;
+ memcpy(to->reserved_sw, from->reserved_sw,
+ sizeof(struct hv_enlightenments));
+ }
}
void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
struct vmcb_control_area *control)
{
- __nested_copy_vmcb_control_to_cache(&svm->nested.ctl, control);
+ __nested_copy_vmcb_control_to_cache(&svm->vcpu, &svm->nested.ctl, control);
}
static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to,
@@ -464,14 +492,14 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
CC(!load_pdptrs(vcpu, cr3)))
return -EINVAL;
- if (!nested_npt)
- kvm_mmu_new_pgd(vcpu, cr3);
-
vcpu->arch.cr3 = cr3;
/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
kvm_init_mmu(vcpu);
+ if (!nested_npt)
+ kvm_mmu_new_pgd(vcpu, cr3);
+
return 0;
}
@@ -494,6 +522,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
new_vmcb12 = true;
svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa;
+ svm->nested.force_msr_bitmap_recalc = true;
}
if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
@@ -1302,6 +1331,7 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
dst->virt_ext = from->virt_ext;
dst->pause_filter_count = from->pause_filter_count;
dst->pause_filter_thresh = from->pause_filter_thresh;
+ /* 'clean' and 'reserved_sw' are not changed by KVM */
}
static int svm_get_nested_state(struct kvm_vcpu *vcpu,
@@ -1434,7 +1464,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
goto out_free;
ret = -EINVAL;
- __nested_copy_vmcb_control_to_cache(&ctl_cached, ctl);
+ __nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl);
if (!__nested_vmcb_check_controls(vcpu, &ctl_cached))
goto out_free;
@@ -1495,6 +1525,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
if (WARN_ON_ONCE(ret))
goto out_free;
+ svm->nested.force_msr_bitmap_recalc = true;
kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
ret = 0;
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 5aa45f13b16d..d4de52409335 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -101,7 +101,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
{
struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
- if (!enable_pmu)
+ if (!vcpu->kvm->arch.enable_pmu)
return NULL;
switch (msr) {
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 17b53457d866..75fa6dd268f0 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -258,6 +258,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
goto e_free;
INIT_LIST_HEAD(&sev->regions_list);
+ INIT_LIST_HEAD(&sev->mirror_vms);
return 0;
@@ -1623,9 +1624,12 @@ static void sev_unlock_vcpus_for_migration(struct kvm *kvm)
}
}
-static void sev_migrate_from(struct kvm_sev_info *dst,
- struct kvm_sev_info *src)
+static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
{
+ struct kvm_sev_info *dst = &to_kvm_svm(dst_kvm)->sev_info;
+ struct kvm_sev_info *src = &to_kvm_svm(src_kvm)->sev_info;
+ struct kvm_sev_info *mirror;
+
dst->active = true;
dst->asid = src->asid;
dst->handle = src->handle;
@@ -1639,6 +1643,30 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
src->enc_context_owner = NULL;
list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
+
+ /*
+ * If this VM has mirrors, "transfer" each mirror's refcount of the
+ * source to the destination (this KVM). The caller holds a reference
+ * to the source, so there's no danger of use-after-free.
+ */
+ list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms);
+ list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) {
+ kvm_get_kvm(dst_kvm);
+ kvm_put_kvm(src_kvm);
+ mirror->enc_context_owner = dst_kvm;
+ }
+
+ /*
+ * If this VM is a mirror, remove the old mirror from the owners list
+ * and add the new mirror to the list.
+ */
+ if (is_mirroring_enc_context(dst_kvm)) {
+ struct kvm_sev_info *owner_sev_info =
+ &to_kvm_svm(dst->enc_context_owner)->sev_info;
+
+ list_del(&src->mirror_entry);
+ list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms);
+ }
}
static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
@@ -1681,7 +1709,7 @@ static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
return 0;
}
-int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
+int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
{
struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_info *src_sev, *cg_cleanup_sev;
@@ -1708,15 +1736,6 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
src_sev = &to_kvm_svm(source_kvm)->sev_info;
- /*
- * VMs mirroring src's encryption context rely on it to keep the
- * ASID allocated, but below we are clearing src_sev->asid.
- */
- if (src_sev->num_mirrored_vms) {
- ret = -EBUSY;
- goto out_unlock;
- }
-
dst_sev->misc_cg = get_current_misc_cg();
cg_cleanup_sev = dst_sev;
if (dst_sev->misc_cg != src_sev->misc_cg) {
@@ -1738,7 +1757,8 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
if (ret)
goto out_source_vcpu;
}
- sev_migrate_from(dst_sev, src_sev);
+
+ sev_migrate_from(kvm, source_kvm);
kvm_vm_dead(source_kvm);
cg_cleanup_sev = src_sev;
ret = 0;
@@ -1761,7 +1781,7 @@ out_fput:
return ret;
}
-int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
+int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
{
struct kvm_sev_cmd sev_cmd;
int r;
@@ -1858,8 +1878,8 @@ out:
return r;
}
-int svm_register_enc_region(struct kvm *kvm,
- struct kvm_enc_region *range)
+int sev_mem_enc_register_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct enc_region *region;
@@ -1932,8 +1952,8 @@ static void __unregister_enc_region_locked(struct kvm *kvm,
kfree(region);
}
-int svm_unregister_enc_region(struct kvm *kvm,
- struct kvm_enc_region *range)
+int sev_mem_enc_unregister_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
{
struct enc_region *region;
int ret;
@@ -1972,7 +1992,7 @@ failed:
return ret;
}
-int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
+int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
{
struct file *source_kvm_file;
struct kvm *source_kvm;
@@ -2008,10 +2028,10 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
*/
source_sev = &to_kvm_svm(source_kvm)->sev_info;
kvm_get_kvm(source_kvm);
- source_sev->num_mirrored_vms++;
+ mirror_sev = &to_kvm_svm(kvm)->sev_info;
+ list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
/* Set enc_context_owner and copy its encryption context over */
- mirror_sev = &to_kvm_svm(kvm)->sev_info;
mirror_sev->enc_context_owner = source_kvm;
mirror_sev->active = true;
mirror_sev->asid = source_sev->asid;
@@ -2019,6 +2039,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
mirror_sev->es_active = source_sev->es_active;
mirror_sev->handle = source_sev->handle;
INIT_LIST_HEAD(&mirror_sev->regions_list);
+ INIT_LIST_HEAD(&mirror_sev->mirror_vms);
ret = 0;
/*
@@ -2041,19 +2062,17 @@ void sev_vm_destroy(struct kvm *kvm)
struct list_head *head = &sev->regions_list;
struct list_head *pos, *q;
- WARN_ON(sev->num_mirrored_vms);
-
if (!sev_guest(kvm))
return;
+ WARN_ON(!list_empty(&sev->mirror_vms));
+
/* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
if (is_mirroring_enc_context(kvm)) {
struct kvm *owner_kvm = sev->enc_context_owner;
- struct kvm_sev_info *owner_sev = &to_kvm_svm(owner_kvm)->sev_info;
mutex_lock(&owner_kvm->lock);
- if (!WARN_ON(!owner_sev->num_mirrored_vms))
- owner_sev->num_mirrored_vms--;
+ list_del(&sev->mirror_entry);
mutex_unlock(&owner_kvm->lock);
kvm_put_kvm(owner_kvm);
return;
@@ -2173,7 +2192,7 @@ out:
#endif
}
-void sev_hardware_teardown(void)
+void sev_hardware_unsetup(void)
{
if (!sev_enabled)
return;
@@ -2358,7 +2377,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
}
-static bool sev_es_validate_vmgexit(struct vcpu_svm *svm)
+static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu;
struct ghcb *ghcb;
@@ -2463,7 +2482,7 @@ static bool sev_es_validate_vmgexit(struct vcpu_svm *svm)
goto vmgexit_err;
}
- return true;
+ return 0;
vmgexit_err:
vcpu = &svm->vcpu;
@@ -2486,7 +2505,8 @@ vmgexit_err:
ghcb_set_sw_exit_info_1(ghcb, 2);
ghcb_set_sw_exit_info_2(ghcb, reason);
- return false;
+ /* Resume the guest to "return" the error code. */
+ return 1;
}
void sev_es_unmap_ghcb(struct vcpu_svm *svm)
@@ -2545,7 +2565,7 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu)
}
#define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE)
-static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
+static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
{
struct vmcb_control_area *control = &svm->vmcb->control;
struct ghcb *ghcb = svm->sev_es.ghcb;
@@ -2598,14 +2618,14 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
}
scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT);
if (!scratch_va)
- goto e_scratch;
+ return -ENOMEM;
if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) {
/* Unable to copy scratch area from guest */
pr_err("vmgexit: kvm_read_guest for scratch area failed\n");
kvfree(scratch_va);
- goto e_scratch;
+ return -EFAULT;
}
/*
@@ -2621,13 +2641,13 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
svm->sev_es.ghcb_sa = scratch_va;
svm->sev_es.ghcb_sa_len = len;
- return true;
+ return 0;
e_scratch:
ghcb_set_sw_exit_info_1(ghcb, 2);
ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_SCRATCH_AREA);
- return false;
+ return 1;
}
static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask,
@@ -2765,17 +2785,18 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
exit_code = ghcb_get_sw_exit_code(ghcb);
- if (!sev_es_validate_vmgexit(svm))
- return 1;
+ ret = sev_es_validate_vmgexit(svm);
+ if (ret)
+ return ret;
sev_es_sync_from_ghcb(svm);
ghcb_set_sw_exit_info_1(ghcb, 0);
ghcb_set_sw_exit_info_2(ghcb, 0);
- ret = 1;
switch (exit_code) {
case SVM_VMGEXIT_MMIO_READ:
- if (!setup_vmgexit_scratch(svm, true, control->exit_info_2))
+ ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
+ if (ret)
break;
ret = kvm_sev_es_mmio_read(vcpu,
@@ -2784,7 +2805,8 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
svm->sev_es.ghcb_sa);
break;
case SVM_VMGEXIT_MMIO_WRITE:
- if (!setup_vmgexit_scratch(svm, false, control->exit_info_2))
+ ret = setup_vmgexit_scratch(svm, false, control->exit_info_2);
+ if (ret)
break;
ret = kvm_sev_es_mmio_write(vcpu,
@@ -2817,6 +2839,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_INPUT);
}
+ ret = 1;
break;
}
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
@@ -2836,6 +2859,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
{
int count;
int bytes;
+ int r;
if (svm->vmcb->control.exit_info_2 > INT_MAX)
return -EINVAL;
@@ -2844,8 +2868,9 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
if (unlikely(check_mul_overflow(count, size, &bytes)))
return -EINVAL;
- if (!setup_vmgexit_scratch(svm, in, bytes))
- return 1;
+ r = setup_vmgexit_scratch(svm, in, bytes);
+ if (r)
+ return r;
return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa,
count, in);
@@ -2907,20 +2932,16 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm)
sev_enc_bit));
}
-void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu)
+void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa)
{
- struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
- struct vmcb_save_area *hostsa;
-
/*
* As an SEV-ES guest, hardware will restore the host state on VMEXIT,
- * of which one step is to perform a VMLOAD. Since hardware does not
- * perform a VMSAVE on VMRUN, the host savearea must be updated.
+ * of which one step is to perform a VMLOAD. KVM performs the
+ * corresponding VMSAVE in svm_prepare_guest_switch for both
+ * traditional and SEV-ES guests.
*/
- vmsave(__sme_page_pa(sd->save_area));
/* XCR0 is restored on VMEXIT, save the current host value */
- hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400);
hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
/* PKRU is restored on VMEXIT, save the current host value */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index fd3a00c892c7..0884c3414a1b 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -263,7 +263,7 @@ u32 svm_msrpm_offset(u32 msr)
return MSR_INVALID;
}
-#define MAX_INST_SIZE 15
+static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);
static int get_npt_level(void)
{
@@ -353,7 +353,7 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
}
-static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
+static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -401,7 +401,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
* raises a fault that is not intercepted. Still better than
* failing in all cases.
*/
- (void)skip_emulated_instruction(vcpu);
+ (void)svm_skip_emulated_instruction(vcpu);
rip = kvm_rip_read(vcpu);
svm->int3_rip = rip + svm->vmcb->save.cs.base;
svm->int3_injected = rip - old_rip;
@@ -668,6 +668,7 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
u32 msr, int read, int write)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
u8 bit_read, bit_write;
unsigned long tmp;
u32 offset;
@@ -698,7 +699,7 @@ static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
msrpm[offset] = tmp;
svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
-
+ svm->nested.force_msr_bitmap_recalc = true;
}
void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
@@ -873,11 +874,11 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
}
}
-static void svm_hardware_teardown(void)
+static void svm_hardware_unsetup(void)
{
int cpu;
- sev_hardware_teardown();
+ sev_hardware_unsetup();
for_each_possible_cpu(cpu)
svm_cpu_uninit(cpu);
@@ -1175,7 +1176,7 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
svm->vmcb = target_vmcb->ptr;
}
-static int svm_create_vcpu(struct kvm_vcpu *vcpu)
+static int svm_vcpu_create(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm;
struct page *vmcb01_page;
@@ -1246,7 +1247,7 @@ static void svm_clear_current_vmcb(struct vmcb *vmcb)
cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
}
-static void svm_free_vcpu(struct kvm_vcpu *vcpu)
+static void svm_vcpu_free(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1265,7 +1266,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
}
-static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
+static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
@@ -1280,10 +1281,12 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
* Save additional host state that will be restored on VMEXIT (sev-es)
* or subsequent vmload of host save area.
*/
+ vmsave(__sme_page_pa(sd->save_area));
if (sev_es_guest(vcpu->kvm)) {
- sev_es_prepare_guest_switch(svm, vcpu->cpu);
- } else {
- vmsave(__sme_page_pa(sd->save_area));
+ struct vmcb_save_area *hostsa;
+ hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400);
+
+ sev_es_prepare_switch_to_guest(hostsa);
}
if (tsc_scaling) {
@@ -1315,13 +1318,13 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
indirect_branch_prediction_barrier();
}
if (kvm_vcpu_apicv_active(vcpu))
- avic_vcpu_load(vcpu, cpu);
+ __avic_vcpu_load(vcpu, cpu);
}
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
{
if (kvm_vcpu_apicv_active(vcpu))
- avic_vcpu_put(vcpu);
+ __avic_vcpu_put(vcpu);
svm_prepare_host_switch(vcpu);
@@ -1529,6 +1532,15 @@ static int svm_get_cpl(struct kvm_vcpu *vcpu)
return save->cpl;
}
+static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+ struct kvm_segment cs;
+
+ svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ *db = cs.db;
+ *l = cs.l;
+}
+
static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1563,7 +1575,7 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}
-static void svm_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1647,7 +1659,7 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
unsigned long old_cr4 = vcpu->arch.cr4;
if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
- svm_flush_tlb(vcpu);
+ svm_flush_tlb_current(vcpu);
vcpu->arch.cr4 = cr4;
if (!npt_enabled) {
@@ -2269,7 +2281,7 @@ static int task_switch_interception(struct kvm_vcpu *vcpu)
int_type == SVM_EXITINTINFO_TYPE_SOFT ||
(int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
(int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
- if (!skip_emulated_instruction(vcpu))
+ if (!svm_skip_emulated_instruction(vcpu))
return 0;
}
@@ -3149,7 +3161,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
"excp_to:", save->last_excp_to);
}
-static bool svm_check_exit_valid(struct kvm_vcpu *vcpu, u64 exit_code)
+static bool svm_check_exit_valid(u64 exit_code)
{
return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
svm_exit_handlers[exit_code]);
@@ -3169,7 +3181,7 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
{
- if (!svm_check_exit_valid(vcpu, exit_code))
+ if (!svm_check_exit_valid(exit_code))
return svm_handle_invalid_exit(vcpu, exit_code);
#ifdef CONFIG_RETPOLINE
@@ -3204,7 +3216,7 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
*error_code = 0;
}
-static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_run *kvm_run = vcpu->run;
@@ -3301,7 +3313,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
++vcpu->stat.nmi_injections;
}
-static void svm_set_irq(struct kvm_vcpu *vcpu)
+static void svm_inject_irq(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3531,17 +3543,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
}
-static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
-{
- return 0;
-}
-
-static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
-{
- return 0;
-}
-
-void svm_flush_tlb(struct kvm_vcpu *vcpu)
+static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3915,11 +3917,6 @@ static int __init svm_check_processor_compat(void)
return 0;
}
-static bool svm_cpu_has_accelerated_tpr(void)
-{
- return false;
-}
-
/*
* The kvm parameter can be NULL (module initialization, or invocation before
* VM creation). Be sure to check the kvm parameter before using it.
@@ -4254,7 +4251,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
* by 0x400 (matches the offset of 'struct vmcb_save_area'
* within 'struct vmcb'). Note: HSAVE area may also be used by
* L1 hypervisor to save additional host context (e.g. KVM does
- * that, see svm_prepare_guest_switch()) which must be
+ * that, see svm_prepare_switch_to_guest()) which must be
* preserved.
*/
if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
@@ -4529,21 +4526,20 @@ static int svm_vm_init(struct kvm *kvm)
static struct kvm_x86_ops svm_x86_ops __initdata = {
.name = "kvm_amd",
- .hardware_unsetup = svm_hardware_teardown,
+ .hardware_unsetup = svm_hardware_unsetup,
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
- .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
.has_emulated_msr = svm_has_emulated_msr,
- .vcpu_create = svm_create_vcpu,
- .vcpu_free = svm_free_vcpu,
+ .vcpu_create = svm_vcpu_create,
+ .vcpu_free = svm_vcpu_free,
.vcpu_reset = svm_vcpu_reset,
.vm_size = sizeof(struct kvm_svm),
.vm_init = svm_vm_init,
.vm_destroy = svm_vm_destroy,
- .prepare_guest_switch = svm_prepare_guest_switch,
+ .prepare_switch_to_guest = svm_prepare_switch_to_guest,
.vcpu_load = svm_vcpu_load,
.vcpu_put = svm_vcpu_put,
.vcpu_blocking = avic_vcpu_blocking,
@@ -4557,9 +4553,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.get_segment = svm_get_segment,
.set_segment = svm_set_segment,
.get_cpl = svm_get_cpl,
- .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
+ .get_cs_db_l_bits = svm_get_cs_db_l_bits,
.set_cr0 = svm_set_cr0,
- .post_set_cr3 = svm_post_set_cr3,
+ .post_set_cr3 = sev_post_set_cr3,
.is_valid_cr4 = svm_is_valid_cr4,
.set_cr4 = svm_set_cr4,
.set_efer = svm_set_efer,
@@ -4574,21 +4570,21 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_rflags = svm_set_rflags,
.get_if_flag = svm_get_if_flag,
- .tlb_flush_all = svm_flush_tlb,
- .tlb_flush_current = svm_flush_tlb,
- .tlb_flush_gva = svm_flush_tlb_gva,
- .tlb_flush_guest = svm_flush_tlb,
+ .flush_tlb_all = svm_flush_tlb_current,
+ .flush_tlb_current = svm_flush_tlb_current,
+ .flush_tlb_gva = svm_flush_tlb_gva,
+ .flush_tlb_guest = svm_flush_tlb_current,
.vcpu_pre_run = svm_vcpu_pre_run,
- .run = svm_vcpu_run,
- .handle_exit = handle_exit,
- .skip_emulated_instruction = skip_emulated_instruction,
+ .vcpu_run = svm_vcpu_run,
+ .handle_exit = svm_handle_exit,
+ .skip_emulated_instruction = svm_skip_emulated_instruction,
.update_emulated_instruction = NULL,
.set_interrupt_shadow = svm_set_interrupt_shadow,
.get_interrupt_shadow = svm_get_interrupt_shadow,
.patch_hypercall = svm_patch_hypercall,
- .set_irq = svm_set_irq,
- .set_nmi = svm_inject_nmi,
+ .inject_irq = svm_inject_irq,
+ .inject_nmi = svm_inject_nmi,
.queue_exception = svm_queue_exception,
.cancel_injection = svm_cancel_injection,
.interrupt_allowed = svm_interrupt_allowed,
@@ -4598,18 +4594,11 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.enable_nmi_window = svm_enable_nmi_window,
.enable_irq_window = svm_enable_irq_window,
.update_cr8_intercept = svm_update_cr8_intercept,
- .set_virtual_apic_mode = svm_set_virtual_apic_mode,
- .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
- .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons,
- .load_eoi_exitmap = svm_load_eoi_exitmap,
- .hwapic_irr_update = svm_hwapic_irr_update,
- .hwapic_isr_update = svm_hwapic_isr_update,
- .apicv_post_state_restore = avic_post_state_restore,
-
- .set_tss_addr = svm_set_tss_addr,
- .set_identity_map_addr = svm_set_identity_map_addr,
- .get_mt_mask = svm_get_mt_mask,
+ .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
+ .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
+ .apicv_post_state_restore = avic_apicv_post_state_restore,
+ .get_mt_mask = svm_get_mt_mask,
.get_exit_info = svm_get_exit_info,
.vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
@@ -4634,8 +4623,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.nested_ops = &svm_nested_ops,
.deliver_interrupt = svm_deliver_interrupt,
- .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt,
- .update_pi_irte = svm_update_pi_irte,
+ .pi_update_irte = avic_pi_update_irte,
.setup_mce = svm_setup_mce,
.smi_allowed = svm_smi_allowed,
@@ -4643,12 +4631,12 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.leave_smm = svm_leave_smm,
.enable_smi_window = svm_enable_smi_window,
- .mem_enc_op = svm_mem_enc_op,
- .mem_enc_reg_region = svm_register_enc_region,
- .mem_enc_unreg_region = svm_unregister_enc_region,
+ .mem_enc_ioctl = sev_mem_enc_ioctl,
+ .mem_enc_register_region = sev_mem_enc_register_region,
+ .mem_enc_unregister_region = sev_mem_enc_unregister_region,
- .vm_copy_enc_context_from = svm_vm_copy_asid_from,
- .vm_move_enc_context_from = svm_vm_migrate_from,
+ .vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
+ .vm_move_enc_context_from = sev_vm_move_enc_context_from,
.can_emulate_instruction = svm_can_emulate_instruction,
@@ -4893,7 +4881,7 @@ static __init int svm_hardware_setup(void)
return 0;
err:
- svm_hardware_teardown();
+ svm_hardware_unsetup();
return r;
}
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index fa98d6844728..e37bb3508cfa 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -79,7 +79,8 @@ struct kvm_sev_info {
struct list_head regions_list; /* List of registered regions */
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
struct kvm *enc_context_owner; /* Owner of copied encryption context */
- unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */
+ struct list_head mirror_vms; /* List of VMs mirroring */
+ struct list_head mirror_entry; /* Use as a list entry of mirrors */
struct misc_cg *misc_cg; /* For misc cgroup accounting */
atomic_t migration_in_progress;
};
@@ -137,6 +138,8 @@ struct vmcb_ctrl_area_cached {
u32 event_inj_err;
u64 nested_cr3;
u64 virt_ext;
+ u32 clean;
+ u8 reserved_sw[32];
};
struct svm_nested_state {
@@ -163,6 +166,15 @@ struct svm_nested_state {
struct vmcb_save_area_cached save;
bool initialized;
+
+ /*
+ * Indicates whether MSR bitmap for L2 needs to be rebuilt due to
+ * changes in MSR bitmap for L1 or switching to a different L2. Note,
+ * this flag can only be used reliably in conjunction with a paravirt L1
+ * which informs L0 whether any changes to MSR bitmap for L2 were done
+ * on its side.
+ */
+ bool force_msr_bitmap_recalc;
};
struct vcpu_sev_es_state {
@@ -321,7 +333,7 @@ static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
/*
* Only the PDPTRs are loaded on demand into the shadow MMU. All other
- * fields are synchronized in handle_exit, because accessing the VMCB is cheap.
+ * fields are synchronized on VM-Exit, because accessing the VMCB is cheap.
*
* CR3 might be out of date in the VMCB but it is not marked dirty; instead,
* KVM_REQ_LOAD_MMU_PGD is always requested when the cached vcpu->arch.cr3
@@ -480,7 +492,6 @@ void svm_vcpu_free_msrpm(u32 *msrpm);
int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
-void svm_flush_tlb(struct kvm_vcpu *vcpu);
void disable_nmi_singlestep(struct vcpu_svm *svm);
bool svm_smi_blocked(struct kvm_vcpu *vcpu);
bool svm_nmi_blocked(struct kvm_vcpu *vcpu);
@@ -558,6 +569,17 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
/* avic.c */
+#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
+#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
+#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
+
+#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
+#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
+#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
+#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
+
+#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
+
int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
@@ -565,18 +587,17 @@ void avic_init_vmcb(struct vcpu_svm *svm);
int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
int avic_init_vcpu(struct vcpu_svm *svm);
-void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
-void avic_vcpu_put(struct kvm_vcpu *vcpu);
-void avic_post_state_restore(struct kvm_vcpu *vcpu);
-void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
-void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
-bool svm_check_apicv_inhibit_reasons(ulong bit);
-void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
-void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
-void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr);
-bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu);
-int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set);
+void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void __avic_vcpu_put(struct kvm_vcpu *vcpu);
+void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
+void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
+void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
+bool avic_check_apicv_inhibit_reasons(ulong bit);
+void avic_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
+void avic_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr);
+bool avic_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu);
+int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set);
void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
void avic_ring_doorbell(struct kvm_vcpu *vcpu);
@@ -590,17 +611,17 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu);
extern unsigned int max_sev_asid;
void sev_vm_destroy(struct kvm *kvm);
-int svm_mem_enc_op(struct kvm *kvm, void __user *argp);
-int svm_register_enc_region(struct kvm *kvm,
- struct kvm_enc_region *range);
-int svm_unregister_enc_region(struct kvm *kvm,
- struct kvm_enc_region *range);
-int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd);
-int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd);
+int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp);
+int sev_mem_enc_register_region(struct kvm *kvm,
+ struct kvm_enc_region *range);
+int sev_mem_enc_unregister_region(struct kvm *kvm,
+ struct kvm_enc_region *range);
+int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd);
+int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
void pre_sev_run(struct vcpu_svm *svm, int cpu);
void __init sev_set_cpu_caps(void);
void __init sev_hardware_setup(void);
-void sev_hardware_teardown(void);
+void sev_hardware_unsetup(void);
int sev_cpu_init(struct svm_cpu_data *sd);
void sev_free_vcpu(struct kvm_vcpu *vcpu);
int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
@@ -608,7 +629,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
void sev_es_init_vmcb(struct vcpu_svm *svm);
void sev_es_vcpu_reset(struct vcpu_svm *svm);
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
-void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu);
+void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa);
void sev_es_unmap_ghcb(struct vcpu_svm *svm);
/* vmenter.S */
diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
index 489ca56212c6..e2fc59380465 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.h
+++ b/arch/x86/kvm/svm/svm_onhyperv.h
@@ -7,35 +7,12 @@
#define __ARCH_X86_KVM_SVM_ONHYPERV_H__
#if IS_ENABLED(CONFIG_HYPERV)
-#include <asm/mshyperv.h>
-#include "hyperv.h"
#include "kvm_onhyperv.h"
+#include "svm/hyperv.h"
static struct kvm_x86_ops svm_x86_ops;
-/*
- * Hyper-V uses the software reserved 32 bytes in VMCB
- * control area to expose SVM enlightenments to guests.
- */
-struct hv_enlightenments {
- struct __packed hv_enlightenments_control {
- u32 nested_flush_hypercall:1;
- u32 msr_bitmap:1;
- u32 enlightened_npt_tlb: 1;
- u32 reserved:29;
- } __packed hv_enlightenments_control;
- u32 hv_vp_id;
- u64 hv_vm_id;
- u64 partition_assist_page;
- u64 reserved;
-} __packed;
-
-/*
- * Hyper-V uses the software reserved clean bit in VMCB
- */
-#define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW
-
int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu);
static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 92e6f6702f00..193f5ba930d1 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -64,9 +64,9 @@ TRACE_EVENT(kvm_hypercall,
* Tracepoint for hypercall.
*/
TRACE_EVENT(kvm_hv_hypercall,
- TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx,
- __u64 ingpa, __u64 outgpa),
- TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
+ TP_PROTO(__u16 code, bool fast, __u16 var_cnt, __u16 rep_cnt,
+ __u16 rep_idx, __u64 ingpa, __u64 outgpa),
+ TP_ARGS(code, fast, var_cnt, rep_cnt, rep_idx, ingpa, outgpa),
TP_STRUCT__entry(
__field( __u16, rep_cnt )
@@ -74,6 +74,7 @@ TRACE_EVENT(kvm_hv_hypercall,
__field( __u64, ingpa )
__field( __u64, outgpa )
__field( __u16, code )
+ __field( __u16, var_cnt )
__field( bool, fast )
),
@@ -83,13 +84,14 @@ TRACE_EVENT(kvm_hv_hypercall,
__entry->ingpa = ingpa;
__entry->outgpa = outgpa;
__entry->code = code;
+ __entry->var_cnt = var_cnt;
__entry->fast = fast;
),
- TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
+ TP_printk("code 0x%x %s var_cnt 0x%x rep_cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
__entry->code, __entry->fast ? "fast" : "slow",
- __entry->rep_cnt, __entry->rep_idx, __entry->ingpa,
- __entry->outgpa)
+ __entry->var_cnt, __entry->rep_cnt, __entry->rep_idx,
+ __entry->ingpa, __entry->outgpa)
);
TRACE_EVENT(kvm_hv_hypercall_done,
@@ -251,13 +253,13 @@ TRACE_EVENT(kvm_cpuid,
* Tracepoint for apic access.
*/
TRACE_EVENT(kvm_apic,
- TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val),
+ TP_PROTO(unsigned int rw, unsigned int reg, u64 val),
TP_ARGS(rw, reg, val),
TP_STRUCT__entry(
__field( unsigned int, rw )
__field( unsigned int, reg )
- __field( unsigned int, val )
+ __field( u64, val )
),
TP_fast_assign(
@@ -266,7 +268,7 @@ TRACE_EVENT(kvm_apic,
__entry->val = val;
),
- TP_printk("apic_%s %s = 0x%x",
+ TP_printk("apic_%s %s = 0x%llx",
__entry->rw ? "write" : "read",
__print_symbolic(__entry->reg, kvm_trace_symbol_apic),
__entry->val)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index dc822a1d403d..f18744f7ff82 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -320,7 +320,7 @@ static void free_nested(struct kvm_vcpu *vcpu)
kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
vmx->nested.pi_desc = NULL;
- kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+ kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
nested_release_evmcs(vcpu);
@@ -1125,15 +1125,15 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
return -EINVAL;
}
- if (!nested_ept)
- kvm_mmu_new_pgd(vcpu, cr3);
-
vcpu->arch.cr3 = cr3;
kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
kvm_init_mmu(vcpu);
+ if (!nested_ept)
+ kvm_mmu_new_pgd(vcpu, cr3);
+
return 0;
}
@@ -4802,7 +4802,8 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
return 0;
}
-void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
+void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu,
+ bool vcpu_has_perf_global_ctrl)
{
struct vcpu_vmx *vmx;
@@ -4810,7 +4811,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
return;
vmx = to_vmx(vcpu);
- if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) {
+ if (vcpu_has_perf_global_ctrl) {
vmx->nested.msrs.entry_ctls_high |=
VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
vmx->nested.msrs.exit_ctls_high |=
@@ -5011,7 +5012,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
vmx->nested.current_vmptr >> PAGE_SHIFT,
vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
- kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+ kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
vmx->nested.current_vmptr = INVALID_GPA;
}
@@ -5470,7 +5471,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
roots_to_free = 0;
- if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd,
+ if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd,
operand.eptp))
roots_to_free |= KVM_MMU_ROOT_CURRENT;
@@ -5490,7 +5491,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
}
if (roots_to_free)
- kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
+ kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
return nested_vmx_succeed(vcpu);
}
@@ -5579,7 +5580,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
* TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR.
*/
if (!enable_ept)
- kvm_mmu_free_guest_mode_roots(vcpu, &vcpu->arch.root_mmu);
+ kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu);
return nested_vmx_succeed(vcpu);
}
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index b69a80f43b37..c92cea0b8ccc 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -32,7 +32,8 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
u32 vmx_instruction_info, bool wr, int len, gva_t *ret);
-void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu);
+void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu,
+ bool vcpu_has_perf_global_ctrl);
void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu);
bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
int size);
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 9b26596099a1..0684e519181b 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -487,7 +487,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->reserved_bits = 0xffffffff00200000ull;
entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
- if (!entry || !enable_pmu)
+ if (!entry || !vcpu->kvm->arch.enable_pmu)
return;
eax.full = entry->eax;
edx.full = entry->edx;
@@ -541,7 +541,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
bitmap_set(pmu->all_valid_pmc_idx,
INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters);
- nested_vmx_pmu_entry_exit_ctls_update(vcpu);
+ nested_vmx_pmu_refresh(vcpu,
+ intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL));
if (intel_pmu_lbr_is_compatible(vcpu))
x86_perf_get_lbr(&lbr_desc->records);
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index aa1fe9085d77..3834bb30ce54 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -244,7 +244,7 @@ void vmx_pi_start_assignment(struct kvm *kvm)
}
/*
- * pi_update_irte - set IRTE for Posted-Interrupts
+ * vmx_pi_update_irte - set IRTE for Posted-Interrupts
*
* @kvm: kvm
* @host_irq: host irq of the interrupt
@@ -252,8 +252,8 @@ void vmx_pi_start_assignment(struct kvm *kvm)
* @set: set or unset PI
* returns 0 on success, < 0 on failure
*/
-int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
- bool set)
+int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_irq_routing_table *irq_rt;
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index eb14e76b84ef..9a45d5c9f116 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -97,8 +97,8 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
void pi_wakeup_handler(void);
void __init pi_init_cpu(int cpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
-int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
- bool set);
+int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set);
void vmx_pi_start_assignment(struct kvm *kvm);
#endif /* __KVM_X86_VMX_POSTED_INTR_H */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b730d799c26e..e8963f5af618 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -541,11 +541,6 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
return flexpriority_enabled && lapic_in_kernel(vcpu);
}
-static inline bool report_flexpriority(void)
-{
- return flexpriority_enabled;
-}
-
static int possible_passthrough_msr_slot(u32 msr)
{
u32 i;
@@ -645,10 +640,10 @@ static void __loaded_vmcs_clear(void *arg)
/*
* Ensure all writes to loaded_vmcs, including deleting it from its
- * current percpu list, complete before setting loaded_vmcs->vcpu to
- * -1, otherwise a different cpu can see vcpu == -1 first and add
- * loaded_vmcs to its percpu list before it's deleted from this cpu's
- * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
+ * current percpu list, complete before setting loaded_vmcs->cpu to
+ * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
+ * and add loaded_vmcs to its percpu list before it's deleted from this
+ * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
*/
smp_wmb();
@@ -2334,7 +2329,7 @@ fault:
return -EFAULT;
}
-static int hardware_enable(void)
+static int vmx_hardware_enable(void)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
@@ -2375,7 +2370,7 @@ static void vmclear_local_loaded_vmcss(void)
__loaded_vmcs_clear(v);
}
-static void hardware_disable(void)
+static void vmx_hardware_disable(void)
{
vmclear_local_loaded_vmcss();
@@ -2950,7 +2945,7 @@ static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
- u64 root_hpa = mmu->root_hpa;
+ u64 root_hpa = mmu->root.hpa;
/* No flush required if the current context is invalid. */
if (!VALID_PAGE(root_hpa))
@@ -3930,31 +3925,33 @@ static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
#ifdef CONFIG_SMP
if (vcpu->mode == IN_GUEST_MODE) {
/*
- * The vector of interrupt to be delivered to vcpu had
- * been set in PIR before this function.
+ * The vector of the virtual has already been set in the PIR.
+ * Send a notification event to deliver the virtual interrupt
+ * unless the vCPU is the currently running vCPU, i.e. the
+ * event is being sent from a fastpath VM-Exit handler, in
+ * which case the PIR will be synced to the vIRR before
+ * re-entering the guest.
*
- * Following cases will be reached in this block, and
- * we always send a notification event in all cases as
- * explained below.
+ * When the target is not the running vCPU, the following
+ * possibilities emerge:
*
- * Case 1: vcpu keeps in non-root mode. Sending a
- * notification event posts the interrupt to vcpu.
+ * Case 1: vCPU stays in non-root mode. Sending a notification
+ * event posts the interrupt to the vCPU.
*
- * Case 2: vcpu exits to root mode and is still
- * runnable. PIR will be synced to vIRR before the
- * next vcpu entry. Sending a notification event in
- * this case has no effect, as vcpu is not in root
- * mode.
+ * Case 2: vCPU exits to root mode and is still runnable. The
+ * PIR will be synced to the vIRR before re-entering the guest.
+ * Sending a notification event is ok as the host IRQ handler
+ * will ignore the spurious event.
*
- * Case 3: vcpu exits to root mode and is blocked.
- * vcpu_block() has already synced PIR to vIRR and
- * never blocks vcpu if vIRR is not cleared. Therefore,
- * a blocked vcpu here does not wait for any requested
- * interrupts in PIR, and sending a notification event
- * which has no effect is safe here.
+ * Case 3: vCPU exits to root mode and is blocked. vcpu_block()
+ * has already synced PIR to vIRR and never blocks the vCPU if
+ * the vIRR is not empty. Therefore, a blocked vCPU here does
+ * not wait for any requested interrupts in PIR, and sending a
+ * notification event also results in a benign, spurious event.
*/
- apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
+ if (vcpu != kvm_get_running_vcpu())
+ apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
return;
}
#endif
@@ -5177,7 +5174,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
if (!kvm_require_dr(vcpu, dr))
return 1;
- if (kvm_x86_ops.get_cpl(vcpu) > 0)
+ if (vmx_get_cpl(vcpu) > 0)
goto out;
dr7 = vmcs_readl(GUEST_DR7);
@@ -5310,9 +5307,16 @@ static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
static int handle_apic_write(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
- u32 offset = exit_qualification & 0xfff;
- /* APIC-write VM exit is trap-like and thus no need to adjust IP */
+ /*
+ * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and
+ * hardware has done any necessary aliasing, offset adjustments, etc...
+ * for the access. I.e. the correct value has already been written to
+ * the vAPIC page for the correct 16-byte chunk. KVM needs only to
+ * retrieve the register value and emulate the access.
+ */
+ u32 offset = exit_qualification & 0xff0;
+
kvm_apic_write_nodecode(vcpu, offset);
return 1;
}
@@ -6975,7 +6979,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
return vmx_exit_handlers_fastpath(vcpu);
}
-static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
+static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6986,7 +6990,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
free_loaded_vmcs(vmx->loaded_vmcs);
}
-static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
+static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
{
struct vmx_uret_msr *tsx_ctrl;
struct vcpu_vmx *vmx;
@@ -7348,11 +7352,11 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vmx_secondary_exec_control(vmx));
if (nested_vmx_allowed(vcpu))
- to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+ vmx->msr_ia32_feature_control_valid_bits |=
FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
else
- to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+ vmx->msr_ia32_feature_control_valid_bits &=
~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
@@ -7691,7 +7695,7 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
}
}
-static void hardware_unsetup(void)
+static void vmx_hardware_unsetup(void)
{
kvm_set_posted_intr_wakeup_handler(NULL);
@@ -7714,21 +7718,20 @@ static bool vmx_check_apicv_inhibit_reasons(ulong bit)
static struct kvm_x86_ops vmx_x86_ops __initdata = {
.name = "kvm_intel",
- .hardware_unsetup = hardware_unsetup,
+ .hardware_unsetup = vmx_hardware_unsetup,
- .hardware_enable = hardware_enable,
- .hardware_disable = hardware_disable,
- .cpu_has_accelerated_tpr = report_flexpriority,
+ .hardware_enable = vmx_hardware_enable,
+ .hardware_disable = vmx_hardware_disable,
.has_emulated_msr = vmx_has_emulated_msr,
.vm_size = sizeof(struct kvm_vmx),
.vm_init = vmx_vm_init,
- .vcpu_create = vmx_create_vcpu,
- .vcpu_free = vmx_free_vcpu,
+ .vcpu_create = vmx_vcpu_create,
+ .vcpu_free = vmx_vcpu_free,
.vcpu_reset = vmx_vcpu_reset,
- .prepare_guest_switch = vmx_prepare_switch_to_guest,
+ .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
@@ -7756,21 +7759,21 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.set_rflags = vmx_set_rflags,
.get_if_flag = vmx_get_if_flag,
- .tlb_flush_all = vmx_flush_tlb_all,
- .tlb_flush_current = vmx_flush_tlb_current,
- .tlb_flush_gva = vmx_flush_tlb_gva,
- .tlb_flush_guest = vmx_flush_tlb_guest,
+ .flush_tlb_all = vmx_flush_tlb_all,
+ .flush_tlb_current = vmx_flush_tlb_current,
+ .flush_tlb_gva = vmx_flush_tlb_gva,
+ .flush_tlb_guest = vmx_flush_tlb_guest,
.vcpu_pre_run = vmx_vcpu_pre_run,
- .run = vmx_vcpu_run,
+ .vcpu_run = vmx_vcpu_run,
.handle_exit = vmx_handle_exit,
.skip_emulated_instruction = vmx_skip_emulated_instruction,
.update_emulated_instruction = vmx_update_emulated_instruction,
.set_interrupt_shadow = vmx_set_interrupt_shadow,
.get_interrupt_shadow = vmx_get_interrupt_shadow,
.patch_hypercall = vmx_patch_hypercall,
- .set_irq = vmx_inject_irq,
- .set_nmi = vmx_inject_nmi,
+ .inject_irq = vmx_inject_irq,
+ .inject_nmi = vmx_inject_nmi,
.queue_exception = vmx_queue_exception,
.cancel_injection = vmx_cancel_injection,
.interrupt_allowed = vmx_interrupt_allowed,
@@ -7823,8 +7826,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.pmu_ops = &intel_pmu_ops,
.nested_ops = &vmx_nested_ops,
- .update_pi_irte = pi_update_irte,
- .start_assignment = vmx_pi_start_assignment,
+ .pi_update_irte = vmx_pi_update_irte,
+ .pi_start_assignment = vmx_pi_start_assignment,
#ifdef CONFIG_X86_64
.set_hv_timer = vmx_set_hv_timer,
@@ -8059,7 +8062,7 @@ static __init int hardware_setup(void)
vmx_set_cpu_caps();
r = alloc_kvm_area();
- if (r)
+ if (r && nested)
nested_vmx_hardware_unsetup();
kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
@@ -8139,7 +8142,6 @@ static int __init vmx_init(void)
ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
(ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
KVM_EVMCS_VERSION) {
- int cpu;
/* Check that we have assist pages on all online CPUs */
for_each_online_cpu(cpu) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6db3a506b402..02cf0a7e1d14 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -110,6 +110,8 @@ static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)
+#define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE
+
#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
@@ -126,16 +128,15 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
struct kvm_x86_ops kvm_x86_ops __read_mostly;
-EXPORT_SYMBOL_GPL(kvm_x86_ops);
#define KVM_X86_OP(func) \
DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \
*(((struct kvm_x86_ops *)0)->func));
-#define KVM_X86_OP_NULL KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
#include <asm/kvm-x86-ops.h>
EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
-EXPORT_STATIC_CALL_GPL(kvm_x86_tlb_flush_current);
static bool __read_mostly ignore_msrs = 0;
module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
@@ -194,6 +195,9 @@ bool __read_mostly enable_pmu = true;
EXPORT_SYMBOL_GPL(enable_pmu);
module_param(enable_pmu, bool, 0444);
+bool __read_mostly eager_page_split = true;
+module_param(eager_page_split, bool, 0644);
+
/*
* Restoring the host value for MSRs that are only consumed when running in
* usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
@@ -760,7 +764,7 @@ bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
if ((fault->error_code & PFERR_PRESENT_MASK) &&
!(fault->error_code & PFERR_RSVD_MASK))
kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
- fault_mmu->root_hpa);
+ fault_mmu->root.hpa);
fault_mmu->inject_page_fault(vcpu, fault);
return fault->nested_page_fault;
@@ -853,7 +857,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
* Shadow page roots need to be reconstructed instead.
*/
if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)))
- kvm_mmu_free_roots(vcpu, mmu, KVM_MMU_ROOT_CURRENT);
+ kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT);
memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
@@ -869,6 +873,13 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon
if ((cr0 ^ old_cr0) & X86_CR0_PG) {
kvm_clear_async_pf_completion_queue(vcpu);
kvm_async_pf_hash_reset(vcpu);
+
+ /*
+ * Clearing CR0.PG is defined to flush the TLB from the guest's
+ * perspective.
+ */
+ if (!(cr0 & X86_CR0_PG))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
}
if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
@@ -1067,28 +1078,43 @@ EXPORT_SYMBOL_GPL(kvm_is_valid_cr4);
void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
{
+ if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS)
+ kvm_mmu_reset_context(vcpu);
+
/*
- * If any role bit is changed, the MMU needs to be reset.
- *
- * If CR4.PCIDE is changed 1 -> 0, the guest TLB must be flushed.
* If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB
* according to the SDM; however, stale prev_roots could be reused
* incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we
- * free them all. KVM_REQ_MMU_RELOAD is fit for the both cases; it
- * is slow, but changing CR4.PCIDE is a rare case.
- *
- * If CR4.PGE is changed, the guest TLB must be flushed.
+ * free them all. This is *not* a superset of KVM_REQ_TLB_FLUSH_GUEST
+ * or KVM_REQ_TLB_FLUSH_CURRENT, because the hardware TLB is not flushed,
+ * so fall through.
+ */
+ if (!tdp_enabled &&
+ (cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE))
+ kvm_mmu_unload(vcpu);
+
+ /*
+ * The TLB has to be flushed for all PCIDs if any of the following
+ * (architecturally required) changes happen:
+ * - CR4.PCIDE is changed from 1 to 0
+ * - CR4.PGE is toggled
*
- * Note: resetting MMU is a superset of KVM_REQ_MMU_RELOAD and
- * KVM_REQ_MMU_RELOAD is a superset of KVM_REQ_TLB_FLUSH_GUEST, hence
- * the usage of "else if".
+ * This is a superset of KVM_REQ_TLB_FLUSH_CURRENT.
*/
- if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS)
- kvm_mmu_reset_context(vcpu);
- else if ((cr4 ^ old_cr4) & X86_CR4_PCIDE)
- kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
- else if ((cr4 ^ old_cr4) & X86_CR4_PGE)
+ if (((cr4 ^ old_cr4) & X86_CR4_PGE) ||
+ (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+
+ /*
+ * The TLB has to be flushed for the current PCID if any of the
+ * following (architecturally required) changes happen:
+ * - CR4.SMEP is changed from 0 to 1
+ * - CR4.PAE is toggled
+ */
+ else if (((cr4 ^ old_cr4) & X86_CR4_PAE) ||
+ ((cr4 & X86_CR4_SMEP) && !(old_cr4 & X86_CR4_SMEP)))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+
}
EXPORT_SYMBOL_GPL(kvm_post_set_cr4);
@@ -1166,7 +1192,7 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
- kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
+ kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
}
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
@@ -1656,8 +1682,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return r;
}
- /* Update reserved bits */
- if ((efer ^ old_efer) & EFER_NX)
+ if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS)
kvm_mmu_reset_context(vcpu);
return 0;
@@ -2026,17 +2051,10 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data
return 1;
if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
- ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
- ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
- ((u32)(data >> 32) != X2APIC_BROADCAST)) {
-
- data &= ~(1 << 12);
- kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32));
- kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
- kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data);
- trace_kvm_apic_write(APIC_ICR, (u32)data);
- return 0;
- }
+ ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
+ ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
+ ((u32)(data >> 32) != X2APIC_BROADCAST))
+ return kvm_x2apic_icr_write(vcpu->arch.apic, data);
return 1;
}
@@ -2413,7 +2431,7 @@ static inline u64 __scale_tsc(u64 ratio, u64 tsc)
return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
}
-u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio)
+u64 kvm_scale_tsc(u64 tsc, u64 ratio)
{
u64 _tsc = tsc;
@@ -2428,7 +2446,7 @@ static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
{
u64 tsc;
- tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
+ tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
return target_tsc - tsc;
}
@@ -2436,7 +2454,7 @@ static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
return vcpu->arch.l1_tsc_offset +
- kvm_scale_tsc(vcpu, host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
+ kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
}
EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
@@ -2639,7 +2657,7 @@ static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
{
if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
WARN_ON(adjustment < 0);
- adjustment = kvm_scale_tsc(vcpu, (u64) adjustment,
+ adjustment = kvm_scale_tsc((u64) adjustment,
vcpu->arch.l1_tsc_scaling_ratio);
adjust_tsc_offset_guest(vcpu, adjustment);
}
@@ -3059,7 +3077,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
/* With all the info we got, fill in the values */
if (kvm_has_tsc_control)
- tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz,
+ tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz,
v->arch.l1_tsc_scaling_ratio);
if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
@@ -3282,7 +3300,7 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- static_call(kvm_x86_tlb_flush_all)(vcpu);
+ static_call(kvm_x86_flush_tlb_all)(vcpu);
}
static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
@@ -3300,14 +3318,14 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
kvm_mmu_sync_prev_roots(vcpu);
}
- static_call(kvm_x86_tlb_flush_guest)(vcpu);
+ static_call(kvm_x86_flush_tlb_guest)(vcpu);
}
static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- static_call(kvm_x86_tlb_flush_current)(vcpu);
+ static_call(kvm_x86_flush_tlb_current)(vcpu);
}
/*
@@ -3869,7 +3887,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
ratio = vcpu->arch.tsc_scaling_ratio;
}
- msr_info->data = kvm_scale_tsc(vcpu, rdtsc(), ratio) + offset;
+ msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset;
break;
}
case MSR_MTRRcap:
@@ -4245,6 +4263,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
case KVM_CAP_VCPU_ATTRIBUTES:
case KVM_CAP_SYS_ATTRIBUTES:
+ case KVM_CAP_VAPIC:
case KVM_CAP_ENABLE_CAP:
r = 1;
break;
@@ -4286,9 +4305,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
*/
r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE);
break;
- case KVM_CAP_VAPIC:
- r = !static_call(kvm_x86_cpu_has_accelerated_tpr)();
- break;
case KVM_CAP_NR_VCPUS:
r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS);
break;
@@ -4343,7 +4359,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
if (r < sizeof(struct kvm_xsave))
r = sizeof(struct kvm_xsave);
break;
+ case KVM_CAP_PMU_CAPABILITY:
+ r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0;
+ break;
}
+ case KVM_CAP_DISABLE_QUIRKS2:
+ r = KVM_X86_VALID_QUIRKS;
+ break;
default:
break;
}
@@ -5145,7 +5167,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz &&
kvm->arch.last_tsc_offset == offset);
- tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
+ tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
ns = get_kvmclock_base_ns();
__kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
@@ -5890,6 +5912,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
return -EINVAL;
switch (cap->cap) {
+ case KVM_CAP_DISABLE_QUIRKS2:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_X86_VALID_QUIRKS)
+ break;
+ fallthrough;
case KVM_CAP_DISABLE_QUIRKS:
kvm->arch.disabled_quirks = cap->args[0];
r = 0;
@@ -5990,15 +6017,18 @@ split_irqchip_unlock:
#endif
case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
r = -EINVAL;
- if (kvm_x86_ops.vm_copy_enc_context_from)
- r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]);
- return r;
+ if (!kvm_x86_ops.vm_copy_enc_context_from)
+ break;
+
+ r = static_call(kvm_x86_vm_copy_enc_context_from)(kvm, cap->args[0]);
+ break;
case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
r = -EINVAL;
- if (kvm_x86_ops.vm_move_enc_context_from)
- r = kvm_x86_ops.vm_move_enc_context_from(
- kvm, cap->args[0]);
- return r;
+ if (!kvm_x86_ops.vm_move_enc_context_from)
+ break;
+
+ r = static_call(kvm_x86_vm_move_enc_context_from)(kvm, cap->args[0]);
+ break;
case KVM_CAP_EXIT_HYPERCALL:
if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
r = -EINVAL;
@@ -6014,6 +6044,18 @@ split_irqchip_unlock:
kvm->arch.exit_on_emulation_error = cap->args[0];
r = 0;
break;
+ case KVM_CAP_PMU_CAPABILITY:
+ r = -EINVAL;
+ if (!enable_pmu || (cap->args[0] & ~KVM_CAP_PMU_VALID_MASK))
+ break;
+
+ mutex_lock(&kvm->lock);
+ if (!kvm->created_vcpus) {
+ kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE);
+ r = 0;
+ }
+ mutex_unlock(&kvm->lock);
+ break;
default:
r = -EINVAL;
break;
@@ -6473,8 +6515,10 @@ set_pit2_out:
break;
case KVM_MEMORY_ENCRYPT_OP: {
r = -ENOTTY;
- if (kvm_x86_ops.mem_enc_op)
- r = static_call(kvm_x86_mem_enc_op)(kvm, argp);
+ if (!kvm_x86_ops.mem_enc_ioctl)
+ goto out;
+
+ r = static_call(kvm_x86_mem_enc_ioctl)(kvm, argp);
break;
}
case KVM_MEMORY_ENCRYPT_REG_REGION: {
@@ -6485,8 +6529,10 @@ set_pit2_out:
goto out;
r = -ENOTTY;
- if (kvm_x86_ops.mem_enc_reg_region)
- r = static_call(kvm_x86_mem_enc_reg_region)(kvm, &region);
+ if (!kvm_x86_ops.mem_enc_register_region)
+ goto out;
+
+ r = static_call(kvm_x86_mem_enc_register_region)(kvm, &region);
break;
}
case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
@@ -6497,8 +6543,10 @@ set_pit2_out:
goto out;
r = -ENOTTY;
- if (kvm_x86_ops.mem_enc_unreg_region)
- r = static_call(kvm_x86_mem_enc_unreg_region)(kvm, &region);
+ if (!kvm_x86_ops.mem_enc_unregister_region)
+ goto out;
+
+ r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, &region);
break;
}
case KVM_HYPERV_EVENTFD: {
@@ -8426,8 +8474,7 @@ writeback:
kvm_rip_write(vcpu, ctxt->eip);
if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
r = kvm_vcpu_do_singlestep(vcpu);
- if (kvm_x86_ops.update_emulated_instruction)
- static_call(kvm_x86_update_emulated_instruction)(vcpu);
+ static_call_cond(kvm_x86_update_emulated_instruction)(vcpu);
__kvm_set_rflags(vcpu, ctxt->eflags);
}
@@ -8826,6 +8873,12 @@ int kvm_arch_init(void *opaque)
goto out;
}
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n");
+ r = -EOPNOTSUPP;
+ goto out;
+ }
+
r = -ENOMEM;
x86_emulator_cache = kvm_alloc_emulator_cache();
@@ -8985,7 +9038,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
*
* @apicid - apicid of vcpu to be kicked.
*/
-static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
+static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid)
{
struct kvm_lapic_irq lapic_irq;
@@ -9104,7 +9157,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
break;
- kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
+ kvm_pv_kick_cpu_op(vcpu->kvm, a1);
kvm_sched_yield(vcpu, a1);
ret = 0;
break;
@@ -9268,10 +9321,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
*/
else if (!vcpu->arch.exception.pending) {
if (vcpu->arch.nmi_injected) {
- static_call(kvm_x86_set_nmi)(vcpu);
+ static_call(kvm_x86_inject_nmi)(vcpu);
can_inject = false;
} else if (vcpu->arch.interrupt.injected) {
- static_call(kvm_x86_set_irq)(vcpu);
+ static_call(kvm_x86_inject_irq)(vcpu);
can_inject = false;
}
}
@@ -9351,7 +9404,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
if (r) {
--vcpu->arch.nmi_pending;
vcpu->arch.nmi_injected = true;
- static_call(kvm_x86_set_nmi)(vcpu);
+ static_call(kvm_x86_inject_nmi)(vcpu);
can_inject = false;
WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0);
}
@@ -9365,7 +9418,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
goto out;
if (r) {
kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
- static_call(kvm_x86_set_irq)(vcpu);
+ static_call(kvm_x86_inject_irq)(vcpu);
WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
}
if (kvm_cpu_has_injectable_intr(vcpu))
@@ -9693,8 +9746,7 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
lockdep_assert_held_write(&kvm->arch.apicv_update_lock);
- if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
- !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit))
+ if (!static_call(kvm_x86_check_apicv_inhibit_reasons)(bit))
return;
old = new = kvm->arch.apicv_inhibit_reasons;
@@ -9727,10 +9779,12 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
} else
kvm->arch.apicv_inhibit_reasons = new;
}
-EXPORT_SYMBOL_GPL(__kvm_request_apicv_update);
void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
{
+ if (!enable_apicv)
+ return;
+
down_write(&kvm->arch.apicv_update_lock);
__kvm_request_apicv_update(kvm, activate, bit);
up_write(&kvm->arch.apicv_update_lock);
@@ -9769,11 +9823,11 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
bitmap_or((ulong *)eoi_exit_bitmap,
vcpu->arch.ioapic_handled_vectors,
to_hv_synic(vcpu)->vec_bitmap, 256);
- static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
+ static_call_cond(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
return;
}
- static_call(kvm_x86_load_eoi_exitmap)(
+ static_call_cond(kvm_x86_load_eoi_exitmap)(
vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
}
@@ -9796,10 +9850,7 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
if (!lapic_in_kernel(vcpu))
return;
- if (!kvm_x86_ops.set_apic_access_page_addr)
- return;
-
- static_call(kvm_x86_set_apic_access_page_addr)(vcpu);
+ static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
}
void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
@@ -9844,8 +9895,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto out;
}
}
- if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
- kvm_mmu_unload(vcpu);
+ if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
+ kvm_mmu_free_obsolete_roots(vcpu);
if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
__kvm_migrate_timers(vcpu);
if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
@@ -9990,7 +10041,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
preempt_disable();
- static_call(kvm_x86_prepare_guest_switch)(vcpu);
+ static_call(kvm_x86_prepare_switch_to_guest)(vcpu);
/*
* Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
@@ -10071,7 +10122,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
*/
WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu));
- exit_fastpath = static_call(kvm_x86_run)(vcpu);
+ exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
@@ -10373,10 +10424,7 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
/* Swap (qemu) user FPU context for the guest FPU context. */
static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
- /*
- * Exclude PKRU from restore as restored separately in
- * kvm_x86_ops.run().
- */
+ /* Exclude PKRU, it's restored separately immediately after VM-Exit. */
fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
trace_kvm_fpu(1);
}
@@ -10566,16 +10614,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
return 0;
}
-void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
-{
- struct kvm_segment cs;
-
- kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
- *db = cs.db;
- *l = cs.l;
-}
-EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
-
static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
struct desc_ptr dt;
@@ -11348,15 +11386,17 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
static_call(kvm_x86_update_exception_bitmap)(vcpu);
/*
- * Reset the MMU context if paging was enabled prior to INIT (which is
- * implied if CR0.PG=1 as CR0 will be '0' prior to RESET). Unlike the
- * standard CR0/CR4/EFER modification paths, only CR0.PG needs to be
- * checked because it is unconditionally cleared on INIT and all other
- * paging related bits are ignored if paging is disabled, i.e. CR0.WP,
- * CR4, and EFER changes are all irrelevant if CR0.PG was '0'.
+ * On the standard CR0/CR4/EFER modification paths, there are several
+ * complex conditions determining whether the MMU has to be reset and/or
+ * which PCIDs have to be flushed. However, CR0.WP and the paging-related
+ * bits in CR4 and EFER are irrelevant if CR0.PG was '0'; and a reset+flush
+ * is needed anyway if CR0.PG was '1' (which can only happen for INIT, as
+ * CR0 will be '0' prior to RESET). So we only need to check CR0.PG here.
*/
- if (old_cr0 & X86_CR0_PG)
+ if (old_cr0 & X86_CR0_PG) {
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
kvm_mmu_reset_context(vcpu);
+ }
/*
* Intel's SDM states that all TLB entries are flushed on INIT. AMD's
@@ -11614,6 +11654,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
kvm->arch.guest_can_read_msr_platform_info = true;
+ kvm->arch.enable_pmu = enable_pmu;
#if IS_ENABLED(CONFIG_HYPERV)
spin_lock_init(&kvm->arch.hv_root_tdp_lock);
@@ -11811,7 +11852,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages)
if (slot->arch.rmap[i])
continue;
- slot->arch.rmap[i] = kvcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
+ slot->arch.rmap[i] = __vcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
if (!slot->arch.rmap[i]) {
memslot_rmap_free(slot);
return -ENOMEM;
@@ -11848,7 +11889,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
lpages = __kvm_mmu_slot_lpages(slot, npages, level);
- linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
+ linfo = __vcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
if (!linfo)
goto out_free;
@@ -11998,6 +12039,9 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
if (kvm_dirty_log_manual_protect_and_init_set(kvm))
return;
+ if (READ_ONCE(eager_page_split))
+ kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K);
+
if (kvm_x86_ops.cpu_dirty_log_size) {
kvm_mmu_slot_leaf_clear_dirty(kvm, new);
kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
@@ -12042,8 +12086,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
return (is_guest_mode(vcpu) &&
- kvm_x86_ops.guest_apic_has_interrupt &&
- static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
+ static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
}
static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
@@ -12296,14 +12339,28 @@ static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu)
static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
{
- if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
+
+ if (!kvm_pv_async_pf_enabled(vcpu))
return false;
- if (!kvm_pv_async_pf_enabled(vcpu) ||
- (vcpu->arch.apf.send_user_only && static_call(kvm_x86_get_cpl)(vcpu) == 0))
+ if (vcpu->arch.apf.send_user_only &&
+ static_call(kvm_x86_get_cpl)(vcpu) == 0)
return false;
- return true;
+ if (is_guest_mode(vcpu)) {
+ /*
+ * L1 needs to opt into the special #PF vmexits that are
+ * used to deliver async page faults.
+ */
+ return vcpu->arch.apf.delivery_as_pf_vmexit;
+ } else {
+ /*
+ * Play it safe in case the guest temporarily disables paging.
+ * The real mode IDT in particular is unlikely to have a #PF
+ * exception setup.
+ */
+ return is_paging(vcpu);
+ }
}
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
@@ -12398,7 +12455,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
void kvm_arch_start_assignment(struct kvm *kvm)
{
if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
- static_call_cond(kvm_x86_start_assignment)(kvm);
+ static_call_cond(kvm_x86_pi_start_assignment)(kvm);
}
EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
@@ -12446,7 +12503,7 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
irqfd->producer = prod;
kvm_arch_start_assignment(irqfd->kvm);
- ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm,
+ ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm,
prod->irq, irqfd->gsi, 1);
if (ret)
@@ -12471,7 +12528,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
* when the irq is masked/disabled or the consumer side (KVM
* int this case doesn't want to receive the interrupts.
*/
- ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+ ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
if (ret)
printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
" fails: %d\n", irqfd->consumer.token, ret);
@@ -12482,7 +12539,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set)
{
- return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set);
+ return static_call(kvm_x86_pi_update_irte)(kvm, host_irq, guest_irq, set);
}
bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f11d945ac41f..588792f00334 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -302,6 +302,8 @@ extern int pi_inject_timer;
extern bool report_ignored_msrs;
+extern bool eager_page_split;
+
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 74be1fda58e3..4aa0f2b31665 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -732,7 +732,7 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
instructions[0] = 0xb8;
/* vmcall / vmmcall */
- kvm_x86_ops.patch_hypercall(vcpu, instructions + 5);
+ static_call(kvm_x86_patch_hypercall)(vcpu, instructions + 5);
/* ret */
instructions[8] = 0xc3;
@@ -867,7 +867,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
vcpu->run->exit_reason = KVM_EXIT_XEN;
vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
vcpu->run->xen.u.hcall.longmode = longmode;
- vcpu->run->xen.u.hcall.cpl = kvm_x86_ops.get_cpl(vcpu);
+ vcpu->run->xen.u.hcall.cpl = static_call(kvm_x86_get_cpl)(vcpu);
vcpu->run->xen.u.hcall.input = input;
vcpu->run->xen.u.hcall.params[0] = params[0];
vcpu->run->xen.u.hcall.params[1] = params[1];