From c74ad08f3333db2e44d3346b863f6d10d35e37dd Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 3 May 2021 15:54:43 +0300 Subject: KVM: nSVM: fix a few bugs in the vmcb02 caching logic * Define and use an invalid GPA (all ones) as the init value for the last and current nested vmcb physical addresses. * Reset the current vmcb12 gpa to the invalid value when leaving nested mode, similar to what is done on nested vmexit. * Reset the last seen vmcb12 address when disabling nested SVM, as it relies on vmcb02 fields which are freed at that point. Fixes: 4995a3685f1b ("KVM: SVM: Use a separate vmcb for the nested L2 guest") Signed-off-by: Maxim Levitsky Message-Id: <20210503125446.1353307-3-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index cbbcee0a84f9..848956bb3cf1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -113,6 +113,7 @@ #define VALID_PAGE(x) ((x) != INVALID_PAGE) #define UNMAPPED_GVA (~(gpa_t)0) +#define INVALID_GPA (~(gpa_t)0) /* KVM Hugepage definitions for x86 */ #define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G -- cgit v1.2.3 From c02027b5742b5aa804ef08a4a9db433295533046 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 14 Apr 2021 14:35:42 +0200 Subject: x86/kvm: Disable kvmclock on all CPUs on shutdown Currently, we disable kvmclock from the machine_shutdown() hook, and this only happens for the boot CPU. We need to disable it for all CPUs to guard against memory corruption, e.g. on restore from hibernate. Note, writing '0' to the kvmclock MSR doesn't clear the memory location; it just prevents the hypervisor from updating the location, so for the short while after the write, while the CPU is still alive, the clock remains usable and correct and we don't need to switch to some other clocksource.
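As a rough userspace analogy (not part of this series, using pthreads rather than any KVM interface, with all names invented for the sketch), the program below shows why the registration must be torn down on every CPU before the backing memory can be reused: a writer thread keeps updating the registered page until the "MSR" is cleared, just as the hypervisor keeps updating each vCPU's pvclock page until that vCPU writes 0.

/* Userspace analogy only: the "hypervisor" thread updates a registered clock
 * page until the "guest" clears the registration, which is the effect of the
 * MSR write to 0 described above. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static _Atomic uint64_t page;                   /* stands in for one per-CPU pvclock page */
static _Atomic(_Atomic uint64_t *) clock_page;  /* NULL == "MSR written with 0" */

static void *hypervisor(void *unused)
{
        _Atomic uint64_t *p;

        (void)unused;
        while ((p = atomic_load(&clock_page)) != NULL) {
                atomic_fetch_add(p, 1);         /* host keeps writing guest memory */
                usleep(1000);
        }
        return NULL;
}

int main(void)
{
        pthread_t hv;

        atomic_store(&clock_page, &page);       /* like writing (pa | enable) to the system-time MSR */
        pthread_create(&hv, NULL, hypervisor, NULL);
        usleep(10000);
        printf("clock still being updated: %llu\n",
               (unsigned long long)atomic_load(&page));

        atomic_store(&clock_page, NULL);        /* like writing 0: host stops touching the page */
        pthread_join(hv, NULL);                 /* only now is it safe to reuse the memory */
        return 0;
}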
Signed-off-by: Vitaly Kuznetsov Message-Id: <20210414123544.1060604-4-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_para.h | 4 ++-- arch/x86/kernel/kvm.c | 1 + arch/x86/kernel/kvmclock.c | 5 +---- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 338119852512..9c56e0defd45 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -7,8 +7,6 @@ #include #include -extern void kvmclock_init(void); - #ifdef CONFIG_KVM_GUEST bool kvm_check_and_clear_guest_paused(void); #else @@ -86,6 +84,8 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, } #ifdef CONFIG_KVM_GUEST +void kvmclock_init(void); +void kvmclock_disable(void); bool kvm_para_available(void); unsigned int kvm_arch_para_features(void); unsigned int kvm_arch_para_hints(void); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 9d5f96321c7f..25dd126a3325 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -459,6 +459,7 @@ static void kvm_guest_cpu_offline(void) wrmsrl(MSR_KVM_PV_EOI_EN, 0); kvm_pv_disable_apf(); apf_task_wake_all(); + kvmclock_disable(); } static int kvm_cpu_online(unsigned int cpu) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index d37ed4e1d033..081686df6cd8 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -220,11 +220,9 @@ static void kvm_crash_shutdown(struct pt_regs *regs) } #endif -static void kvm_shutdown(void) +void kvmclock_disable(void) { native_write_msr(msr_kvm_system_time, 0, 0); - kvm_disable_steal_time(); - native_machine_shutdown(); } static void __init kvmclock_init_mem(void) @@ -351,7 +349,6 @@ void __init kvmclock_init(void) #endif x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; - machine_ops.shutdown = kvm_shutdown; #ifdef CONFIG_KEXEC_CORE machine_ops.crash_shutdown = kvm_crash_shutdown; #endif -- cgit v1.2.3 From 3d6b84132d2a57b5a74100f6923a8feb679ac2ce Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 14 Apr 2021 14:35:43 +0200 Subject: x86/kvm: Disable all PV features on crash Crash shutdown handler only disables kvmclock and steal time, other PV features remain active so we risk corrupting memory or getting some side-effects in kdump kernel. Move crash handler to kvm.c and unify with CPU offline. 
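The resulting control flow is small enough to sketch in standalone C (stub functions replace the real MSR writes; the real code is the kvm_guest_cpu_offline() hunk in the diff below):

#include <stdbool.h>
#include <stdio.h>

/* Stubs standing in for the per-feature disable helpers in kvm.c. */
static void disable_steal_time(void) { puts("steal time off"); }
static void disable_pv_eoi(void)     { puts("PV EOI off"); }
static void disable_async_pf(void)   { puts("async PF off"); }
static void wake_apf_waiters(void)   { puts("wake async-PF waiters"); }
static void disable_kvmclock(void)   { puts("kvmclock off"); }

/*
 * One path for both cases: CPU hotplug/suspend (shutdown = false) and
 * crash/kexec (shutdown = true). Only the waiter wakeup differs, mirroring
 * the shutdown check in the patch.
 */
static void guest_cpu_offline(bool shutdown)
{
        disable_steal_time();
        disable_pv_eoi();
        disable_async_pf();
        if (!shutdown)
                wake_apf_waiters();
        disable_kvmclock();
}

int main(void)
{
        guest_cpu_offline(false);       /* hotplug / suspend */
        guest_cpu_offline(true);        /* crash shutdown */
        return 0;
}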
Signed-off-by: Vitaly Kuznetsov Message-Id: <20210414123544.1060604-5-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_para.h | 6 ------ arch/x86/kernel/kvm.c | 44 ++++++++++++++++++++++++++++++----------- arch/x86/kernel/kvmclock.c | 21 -------------------- 3 files changed, 32 insertions(+), 39 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 9c56e0defd45..69299878b200 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -92,7 +92,6 @@ unsigned int kvm_arch_para_hints(void); void kvm_async_pf_task_wait_schedule(u32 token); void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_apf_flags(void); -void kvm_disable_steal_time(void); bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token); DECLARE_STATIC_KEY_FALSE(kvm_async_pf_enabled); @@ -137,11 +136,6 @@ static inline u32 kvm_read_and_reset_apf_flags(void) return 0; } -static inline void kvm_disable_steal_time(void) -{ - return; -} - static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token) { return false; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 25dd126a3325..8eb91dc0f5a8 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -38,6 +38,7 @@ #include #include #include +#include #include DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); @@ -375,6 +376,14 @@ static void kvm_pv_disable_apf(void) pr_info("disable async PF for cpu %d\n", smp_processor_id()); } +static void kvm_disable_steal_time(void) +{ + if (!has_steal_clock) + return; + + wrmsr(MSR_KVM_STEAL_TIME, 0, 0); +} + static void kvm_pv_guest_cpu_reboot(void *unused) { /* @@ -417,14 +426,6 @@ static u64 kvm_steal_clock(int cpu) return steal; } -void kvm_disable_steal_time(void) -{ - if (!has_steal_clock) - return; - - wrmsr(MSR_KVM_STEAL_TIME, 0, 0); -} - static inline void __set_percpu_decrypted(void *ptr, unsigned long size) { early_set_memory_decrypted((unsigned long) ptr, size); @@ -452,13 +453,14 @@ static void __init sev_map_percpu_data(void) } } -static void kvm_guest_cpu_offline(void) +static void kvm_guest_cpu_offline(bool shutdown) { kvm_disable_steal_time(); if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) wrmsrl(MSR_KVM_PV_EOI_EN, 0); kvm_pv_disable_apf(); - apf_task_wake_all(); + if (!shutdown) + apf_task_wake_all(); kvmclock_disable(); } @@ -661,7 +663,7 @@ static int kvm_cpu_down_prepare(unsigned int cpu) unsigned long flags; local_irq_save(flags); - kvm_guest_cpu_offline(); + kvm_guest_cpu_offline(false); local_irq_restore(flags); return 0; } @@ -670,7 +672,7 @@ static int kvm_cpu_down_prepare(unsigned int cpu) static int kvm_suspend(void) { - kvm_guest_cpu_offline(); + kvm_guest_cpu_offline(false); return 0; } @@ -685,6 +687,20 @@ static struct syscore_ops kvm_syscore_ops = { .resume = kvm_resume, }; +/* + * After a PV feature is registered, the host will keep writing to the + * registered memory location. If the guest happens to shutdown, this memory + * won't be valid. In cases like kexec, in which you install a new kernel, this + * means a random memory location will be kept being written. 
+ */ +#ifdef CONFIG_KEXEC_CORE +static void kvm_crash_shutdown(struct pt_regs *regs) +{ + kvm_guest_cpu_offline(true); + native_machine_crash_shutdown(regs); +} +#endif + static void __init kvm_guest_init(void) { int i; @@ -727,6 +743,10 @@ static void __init kvm_guest_init(void) kvm_guest_cpu_init(); #endif +#ifdef CONFIG_KEXEC_CORE + machine_ops.crash_shutdown = kvm_crash_shutdown; +#endif + register_syscore_ops(&kvm_syscore_ops); /* diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 081686df6cd8..ad273e5861c1 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -20,7 +20,6 @@ #include #include #include -#include #include static int kvmclock __initdata = 1; @@ -203,23 +202,6 @@ static void kvm_setup_secondary_clock(void) } #endif -/* - * After the clock is registered, the host will keep writing to the - * registered memory location. If the guest happens to shutdown, this memory - * won't be valid. In cases like kexec, in which you install a new kernel, this - * means a random memory location will be kept being written. So before any - * kind of shutdown from our side, we unregister the clock by writing anything - * that does not have the 'enable' bit set in the msr - */ -#ifdef CONFIG_KEXEC_CORE -static void kvm_crash_shutdown(struct pt_regs *regs) -{ - native_write_msr(msr_kvm_system_time, 0, 0); - kvm_disable_steal_time(); - native_machine_crash_shutdown(regs); -} -#endif - void kvmclock_disable(void) { native_write_msr(msr_kvm_system_time, 0, 0); @@ -349,9 +331,6 @@ void __init kvmclock_init(void) #endif x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; -#ifdef CONFIG_KEXEC_CORE - machine_ops.crash_shutdown = kvm_crash_shutdown; -#endif kvm_get_preset_lpj(); /* -- cgit v1.2.3 From 70f094f4f01dc4d6f78ac6407f85627293a6553c Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 3 May 2021 17:08:52 +0200 Subject: KVM: nVMX: Properly pad 'struct kvm_vmx_nested_state_hdr' Eliminate the probably unwanted hole in 'struct kvm_vmx_nested_state_hdr': Pre-patch: struct kvm_vmx_nested_state_hdr { __u64 vmxon_pa; /* 0 8 */ __u64 vmcs12_pa; /* 8 8 */ struct { __u16 flags; /* 16 2 */ } smm; /* 16 2 */ /* XXX 2 bytes hole, try to pack */ __u32 flags; /* 20 4 */ __u64 preemption_timer_deadline; /* 24 8 */ }; Post-patch: struct kvm_vmx_nested_state_hdr { __u64 vmxon_pa; /* 0 8 */ __u64 vmcs12_pa; /* 8 8 */ struct { __u16 flags; /* 16 2 */ } smm; /* 16 2 */ __u16 pad; /* 18 2 */ __u32 flags; /* 20 4 */ __u64 preemption_timer_deadline; /* 24 8 */ }; Signed-off-by: Vitaly Kuznetsov Message-Id: <20210503150854.1144255-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/kvm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 5a3022c8af82..0662f644aad9 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -437,6 +437,8 @@ struct kvm_vmx_nested_state_hdr { __u16 flags; } smm; + __u16 pad; + __u32 flags; __u64 preemption_timer_deadline; }; -- cgit v1.2.3 From 5104d7ffcf24749939bea7fdb5378d186473f890 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 10:17:24 -0700 Subject: KVM: VMX: Disable preemption when probing user return MSRs Disable preemption when probing a user return MSR via RDSMR/WRMSR. 
If the MSR holds a different value per logical CPU, the WRMSR could corrupt the host's value if KVM is preempted between the RDMSR and WRMSR, and then rescheduled on a different CPU. Opportunistically land the helper in common x86, SVM will use the helper in a future commit. Fixes: 4be534102624 ("KVM: VMX: Initialize vmx->guest_msrs[] right after allocation") Cc: stable@vger.kernel.org Cc: Xiaoyao Li Signed-off-by: Sean Christopherson Message-Id: <20210504171734.1434054-6-seanjc@google.com> Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx/vmx.c | 5 +---- arch/x86/kvm/x86.c | 16 ++++++++++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 848956bb3cf1..e8dcbc632cf8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1777,6 +1777,7 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, unsigned long icr, int op_64_bit); void kvm_define_user_return_msr(unsigned index, u32 msr); +int kvm_probe_user_return_msr(u32 msr); int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4a625c748275..11ff9c3d95d5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6914,12 +6914,9 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) { u32 index = vmx_uret_msrs_list[i]; - u32 data_low, data_high; int j = vmx->nr_uret_msrs; - if (rdmsr_safe(index, &data_low, &data_high) < 0) - continue; - if (wrmsr_safe(index, data_low, data_high) < 0) + if (kvm_probe_user_return_msr(index)) continue; vmx->guest_uret_msrs[j].slot = i; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 102f116d9bb4..bd90c73c37b4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -339,6 +339,22 @@ static void kvm_on_user_return(struct user_return_notifier *urn) } } +int kvm_probe_user_return_msr(u32 msr) +{ + u64 val; + int ret; + + preempt_disable(); + ret = rdmsrl_safe(msr, &val); + if (ret) + goto out; + ret = wrmsrl_safe(msr, val); +out: + preempt_enable(); + return ret; +} +EXPORT_SYMBOL_GPL(kvm_probe_user_return_msr); + void kvm_define_user_return_msr(unsigned slot, u32 msr) { BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS); -- cgit v1.2.3 From 8ea8b8d6f869425e21f34e60bdbe7e47e6c9d6b9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 10:17:29 -0700 Subject: KVM: VMX: Use common x86's uret MSR list as the one true list Drop VMX's global list of user return MSRs now that VMX doesn't resort said list to isolate "active" MSRs, i.e. now that VMX's list and x86's list have the same MSRs in the same order. In addition to eliminating the redundant list, this will also allow moving more of the list management into common x86. 
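A compilable illustration (plain userspace C, not KVM code) of what sharing one list buys: any module can resolve an MSR to its slot with the same linear search, which is the shape of the kvm_find_user_return_msr() hunk further down.

#include <stdint.h>
#include <stdio.h>

#define MAX_NR_USER_RETURN_MSRS 16

static uint32_t uret_msrs_list[MAX_NR_USER_RETURN_MSRS];
static int nr_uret_msrs;

/* Same shape as the kernel helper: returns the slot index, or -1. */
static int find_user_return_msr(uint32_t msr)
{
        for (int i = 0; i < nr_uret_msrs; i++) {
                if (uret_msrs_list[i] == msr)
                        return i;
        }
        return -1;
}

int main(void)
{
        uret_msrs_list[nr_uret_msrs++] = 0xc0000080;    /* MSR_EFER */
        uret_msrs_list[nr_uret_msrs++] = 0xc0000103;    /* MSR_TSC_AUX */

        printf("EFER slot    = %d\n", find_user_return_msr(0xc0000080));
        printf("unknown slot = %d\n", find_user_return_msr(0x10));
        return 0;
}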
Signed-off-by: Sean Christopherson Message-Id: <20210504171734.1434054-11-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx/vmx.c | 97 +++++++++++++++++------------------------ arch/x86/kvm/x86.c | 12 +++++ 3 files changed, 53 insertions(+), 57 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e8dcbc632cf8..6b15f27f49d0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1777,6 +1777,7 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, unsigned long icr, int op_64_bit); void kvm_define_user_return_msr(unsigned index, u32 msr); +int kvm_find_user_return_msr(u32 msr); int kvm_probe_user_return_msr(u32 msr); int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index adefedac0e3b..39506704be96 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -454,26 +454,7 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) static unsigned long host_idt_base; -/* - * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm - * will emulate SYSCALL in legacy mode if the vendor string in guest - * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To - * support this emulation, MSR_STAR is included in the list for i386, - * but is never loaded into hardware. MSR_CSTAR is also never loaded - * into hardware and is here purely for emulation purposes. - */ -static u32 vmx_uret_msrs_list[] = { -#ifdef CONFIG_X86_64 - MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, -#endif - MSR_EFER, MSR_TSC_AUX, MSR_STAR, - MSR_IA32_TSX_CTRL, -}; - -/* - * Number of user return MSRs that are actually supported in hardware. - * vmx_uret_msrs_list is modified when KVM is loaded to drop unsupported MSRs. - */ +/* Number of user return MSRs that are actually supported in hardware. */ static int vmx_nr_uret_msrs; #if IS_ENABLED(CONFIG_HYPERV) @@ -703,22 +684,11 @@ static bool is_valid_passthrough_msr(u32 msr) return r; } -static inline int __vmx_find_uret_msr(u32 msr) -{ - int i; - - for (i = 0; i < vmx_nr_uret_msrs; ++i) { - if (vmx_uret_msrs_list[i] == msr) - return i; - } - return -1; -} - struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) { int i; - i = __vmx_find_uret_msr(msr); + i = kvm_find_user_return_msr(msr); if (i >= 0) return &vmx->guest_uret_msrs[i]; return NULL; @@ -1086,7 +1056,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx) return false; } - i = __vmx_find_uret_msr(MSR_EFER); + i = kvm_find_user_return_msr(MSR_EFER); if (i < 0) return false; @@ -6922,6 +6892,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) static int vmx_create_vcpu(struct kvm_vcpu *vcpu) { + struct vmx_uret_msr *tsx_ctrl; struct vcpu_vmx *vmx; int i, cpu, err; @@ -6946,29 +6917,25 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) for (i = 0; i < vmx_nr_uret_msrs; ++i) { vmx->guest_uret_msrs[i].data = 0; - - switch (vmx_uret_msrs_list[i]) { - case MSR_IA32_TSX_CTRL: - /* - * TSX_CTRL_CPUID_CLEAR is handled in the CPUID - * interception. Keep the host value unchanged to avoid - * changing CPUID bits under the host kernel's feet. - * - * hle=0, rtm=0, tsx_ctrl=1 can be found with some - * combinations of new kernel and old userspace. If - * those guests run on a tsx=off host, do allow guests - * to use TSX_CTRL, but do not change the value on the - * host so that TSX remains always disabled. 
- */ - if (boot_cpu_has(X86_FEATURE_RTM)) - vmx->guest_uret_msrs[i].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; - else - vmx->guest_uret_msrs[i].mask = 0; - break; - default: - vmx->guest_uret_msrs[i].mask = -1ull; - break; - } + vmx->guest_uret_msrs[i].mask = -1ull; + } + tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); + if (tsx_ctrl) { + /* + * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception. + * Keep the host value unchanged to avoid changing CPUID bits + * under the host kernel's feet. + * + * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations + * of new kernel and old userspace. If those guests run on a + * tsx=off host, do allow guests to use TSX_CTRL, but do not + * change the value on the host so that TSX remains always + * disabled. + */ + if (boot_cpu_has(X86_FEATURE_RTM)) + vmx->guest_uret_msrs[i].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; + else + vmx->guest_uret_msrs[i].mask = 0; } err = alloc_loaded_vmcs(&vmx->vmcs01); @@ -7829,6 +7796,22 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { static __init void vmx_setup_user_return_msrs(void) { + + /* + * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm + * will emulate SYSCALL in legacy mode if the vendor string in guest + * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To + * support this emulation, MSR_STAR is included in the list for i386, + * but is never loaded into hardware. MSR_CSTAR is also never loaded + * into hardware and is here purely for emulation purposes. + */ + const u32 vmx_uret_msrs_list[] = { + #ifdef CONFIG_X86_64 + MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, + #endif + MSR_EFER, MSR_TSC_AUX, MSR_STAR, + MSR_IA32_TSX_CTRL, + }; u32 msr; int i; @@ -7841,7 +7824,7 @@ static __init void vmx_setup_user_return_msrs(void) continue; kvm_define_user_return_msr(vmx_nr_uret_msrs, msr); - vmx_uret_msrs_list[vmx_nr_uret_msrs++] = msr; + vmx_nr_uret_msrs++; } } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0856636efc44..d514031ed25f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -364,6 +364,18 @@ void kvm_define_user_return_msr(unsigned slot, u32 msr) } EXPORT_SYMBOL_GPL(kvm_define_user_return_msr); +int kvm_find_user_return_msr(u32 msr) +{ + int i; + + for (i = 0; i < user_return_msrs_global.nr; ++i) { + if (user_return_msrs_global.msrs[i] == msr) + return i; + } + return -1; +} +EXPORT_SYMBOL_GPL(kvm_find_user_return_msr); + static void kvm_user_return_msr_cpu_online(void) { unsigned int cpu = smp_processor_id(); -- cgit v1.2.3 From 9cc39a5a43c05f8eda206bf9e144119820ecf5c8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 10:17:31 -0700 Subject: KVM: x86: Export the number of uret MSRs to vendor modules Split out and export the number of configured user return MSRs so that VMX can iterate over the set of MSRs without having to do its own tracking. Keep the list itself internal to x86 so that vendor code still has to go through the "official" APIs to add/modify entries. 
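The bookkeeping that later patches in the series settle on can be sketched in a few lines of standalone C (the probe is stubbed out and the names are invented for the sketch): callers hand over an MSR, the list and the exported counter stay in one place, and the returned slot is all a vendor module needs to remember.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_NR_USER_RETURN_MSRS 16

static uint32_t uret_msrs_list[MAX_NR_USER_RETURN_MSRS];
static unsigned int nr_uret_msrs;               /* the exported counter */

/* Stub for the RDMSR/WRMSR probe; 0 means the MSR exists on this host. */
static int probe_msr(uint32_t msr)
{
        (void)msr;
        return 0;
}

/* Probe and append in one place; returns the slot index, or -1 on failure. */
static int add_user_return_msr(uint32_t msr)
{
        assert(nr_uret_msrs < MAX_NR_USER_RETURN_MSRS);
        if (probe_msr(msr))
                return -1;
        uret_msrs_list[nr_uret_msrs] = msr;
        return (int)nr_uret_msrs++;
}

int main(void)
{
        printf("MSR_TSC_AUX slot = %d\n", add_user_return_msr(0xc0000103));
        printf("MSR_STAR slot    = %d\n", add_user_return_msr(0xc0000081));
        return 0;
}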
Signed-off-by: Sean Christopherson Message-Id: <20210504171734.1434054-13-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 29 +++++++++++++---------------- 2 files changed, 14 insertions(+), 16 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6b15f27f49d0..22505e74c3da 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1418,6 +1418,7 @@ struct kvm_arch_async_pf { bool direct_map; }; +extern u32 __read_mostly kvm_nr_uret_msrs; extern u64 __read_mostly host_efer; extern bool __read_mostly allow_smaller_maxphyaddr; extern struct kvm_x86_ops kvm_x86_ops; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d514031ed25f..5e1deed8ea5d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -184,11 +184,6 @@ module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); */ #define KVM_MAX_NR_USER_RETURN_MSRS 16 -struct kvm_user_return_msrs_global { - int nr; - u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS]; -}; - struct kvm_user_return_msrs { struct user_return_notifier urn; bool registered; @@ -198,7 +193,9 @@ struct kvm_user_return_msrs { } values[KVM_MAX_NR_USER_RETURN_MSRS]; }; -static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global; +u32 __read_mostly kvm_nr_uret_msrs; +EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs); +static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS]; static struct kvm_user_return_msrs __percpu *user_return_msrs; #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ @@ -330,10 +327,10 @@ static void kvm_on_user_return(struct user_return_notifier *urn) user_return_notifier_unregister(urn); } local_irq_restore(flags); - for (slot = 0; slot < user_return_msrs_global.nr; ++slot) { + for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) { values = &msrs->values[slot]; if (values->host != values->curr) { - wrmsrl(user_return_msrs_global.msrs[slot], values->host); + wrmsrl(kvm_uret_msrs_list[slot], values->host); values->curr = values->host; } } @@ -358,9 +355,9 @@ EXPORT_SYMBOL_GPL(kvm_probe_user_return_msr); void kvm_define_user_return_msr(unsigned slot, u32 msr) { BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS); - user_return_msrs_global.msrs[slot] = msr; - if (slot >= user_return_msrs_global.nr) - user_return_msrs_global.nr = slot + 1; + kvm_uret_msrs_list[slot] = msr; + if (slot >= kvm_nr_uret_msrs) + kvm_nr_uret_msrs = slot + 1; } EXPORT_SYMBOL_GPL(kvm_define_user_return_msr); @@ -368,8 +365,8 @@ int kvm_find_user_return_msr(u32 msr) { int i; - for (i = 0; i < user_return_msrs_global.nr; ++i) { - if (user_return_msrs_global.msrs[i] == msr) + for (i = 0; i < kvm_nr_uret_msrs; ++i) { + if (kvm_uret_msrs_list[i] == msr) return i; } return -1; @@ -383,8 +380,8 @@ static void kvm_user_return_msr_cpu_online(void) u64 value; int i; - for (i = 0; i < user_return_msrs_global.nr; ++i) { - rdmsrl_safe(user_return_msrs_global.msrs[i], &value); + for (i = 0; i < kvm_nr_uret_msrs; ++i) { + rdmsrl_safe(kvm_uret_msrs_list[i], &value); msrs->values[i].host = value; msrs->values[i].curr = value; } @@ -399,7 +396,7 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) value = (value & mask) | (msrs->values[slot].host & ~mask); if (value == msrs->values[slot].curr) return 0; - err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value); + err = wrmsrl_safe(kvm_uret_msrs_list[slot], value); if (err) return 1; -- cgit v1.2.3 From e5fda4bbadb053e3b5164476146cf43092785c0b Mon 
Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 10:17:32 -0700 Subject: KVM: x86: Move uret MSR slot management to common x86 Now that SVM and VMX both probe MSRs before "defining" user return slots for them, consolidate the code for probe+define into common x86 and eliminate the odd behavior of having the vendor code define the slot for a given MSR. Signed-off-by: Sean Christopherson Message-Id: <20210504171734.1434054-14-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +-- arch/x86/kvm/svm/svm.c | 5 +---- arch/x86/kvm/vmx/vmx.c | 19 ++++--------------- arch/x86/kvm/x86.c | 19 +++++++++++-------- 4 files changed, 17 insertions(+), 29 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 22505e74c3da..8155eaac22a7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1777,9 +1777,8 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit); -void kvm_define_user_return_msr(unsigned index, u32 msr); +int kvm_add_user_return_msr(u32 msr); int kvm_find_user_return_msr(u32 msr); -int kvm_probe_user_return_msr(u32 msr); int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0ba0a00f8dc6..d13b72bfb736 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -959,10 +959,7 @@ static __init int svm_hardware_setup(void) kvm_tsc_scaling_ratio_frac_bits = 32; } - if (!kvm_probe_user_return_msr(MSR_TSC_AUX)) { - tsc_aux_uret_slot = 0; - kvm_define_user_return_msr(tsc_aux_uret_slot, MSR_TSC_AUX); - } + tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); /* Check for pause filtering support */ if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index bd2187b4cc53..042823b49273 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -454,9 +454,6 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) static unsigned long host_idt_base; -/* Number of user return MSRs that are actually supported in hardware. 
*/ -static int vmx_nr_uret_msrs; - #if IS_ENABLED(CONFIG_HYPERV) static bool __read_mostly enlightened_vmcs = true; module_param(enlightened_vmcs, bool, 0444); @@ -1218,7 +1215,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) */ if (!vmx->guest_uret_msrs_loaded) { vmx->guest_uret_msrs_loaded = true; - for (i = 0; i < vmx_nr_uret_msrs; ++i) { + for (i = 0; i < kvm_nr_uret_msrs; ++i) { if (!vmx->guest_uret_msrs[i].load_into_hardware) continue; @@ -6921,7 +6918,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) goto free_vpid; } - for (i = 0; i < vmx_nr_uret_msrs; ++i) { + for (i = 0; i < kvm_nr_uret_msrs; ++i) { vmx->guest_uret_msrs[i].data = 0; vmx->guest_uret_msrs[i].mask = -1ull; } @@ -7810,20 +7807,12 @@ static __init void vmx_setup_user_return_msrs(void) MSR_EFER, MSR_TSC_AUX, MSR_STAR, MSR_IA32_TSX_CTRL, }; - u32 msr; int i; BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); - for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) { - msr = vmx_uret_msrs_list[i]; - - if (kvm_probe_user_return_msr(msr)) - continue; - - kvm_define_user_return_msr(vmx_nr_uret_msrs, msr); - vmx_nr_uret_msrs++; - } + for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) + kvm_add_user_return_msr(vmx_uret_msrs_list[i]); } static __init int hardware_setup(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5e1deed8ea5d..0a0eebf35dd5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -336,7 +336,7 @@ static void kvm_on_user_return(struct user_return_notifier *urn) } } -int kvm_probe_user_return_msr(u32 msr) +static int kvm_probe_user_return_msr(u32 msr) { u64 val; int ret; @@ -350,16 +350,18 @@ out: preempt_enable(); return ret; } -EXPORT_SYMBOL_GPL(kvm_probe_user_return_msr); -void kvm_define_user_return_msr(unsigned slot, u32 msr) +int kvm_add_user_return_msr(u32 msr) { - BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS); - kvm_uret_msrs_list[slot] = msr; - if (slot >= kvm_nr_uret_msrs) - kvm_nr_uret_msrs = slot + 1; + BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS); + + if (kvm_probe_user_return_msr(msr)) + return -1; + + kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr; + return kvm_nr_uret_msrs++; } -EXPORT_SYMBOL_GPL(kvm_define_user_return_msr); +EXPORT_SYMBOL_GPL(kvm_add_user_return_msr); int kvm_find_user_return_msr(u32 msr) { @@ -8132,6 +8134,7 @@ int kvm_arch_init(void *opaque) printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n"); goto out_free_x86_emulator_cache; } + kvm_nr_uret_msrs = 0; r = kvm_mmu_module_init(); if (r) -- cgit v1.2.3 From 61a05d444d2ca8d40add453a5f7058fbb1b57eca Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 10:17:33 -0700 Subject: KVM: x86: Tie Intel and AMD behavior for MSR_TSC_AUX to guest CPU model Squish the Intel and AMD emulation of MSR_TSC_AUX together and tie it to the guest CPU model instead of the host CPU behavior. While not strictly necessary to avoid guest breakage, emulating cross-vendor "architecture" will provide consistent behavior for the guest, e.g. WRMSR fault behavior won't change if the vCPU is migrated to a host with divergent behavior. Note, the "new" kvm_is_supported_user_return_msr() checks do not add new functionality on either SVM or VMX. On SVM, the equivalent was "tsc_aux_uret_slot < 0", and on VMX the check was buried in the vmx_find_uret_msr() call at the find_uret_msr label. 
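The cross-vendor write policy described above fits in a few lines; the standalone sketch below (guest_is_intel stands in for guest_cpuid_is_intel(), and the function name is invented) mirrors the check that the series adds to the common MSR_TSC_AUX path.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Intel guests fault on non-zero bits 63:32; AMD guests simply ignore them. */
static int set_tsc_aux(bool guest_is_intel, uint64_t data, uint32_t *tsc_aux)
{
        if (guest_is_intel && (data >> 32) != 0)
                return 1;                       /* inject #GP: reserved bits set */

        *tsc_aux = (uint32_t)data;              /* AMD behavior: upper bits dropped */
        return 0;
}

int main(void)
{
        uint32_t aux = 0;

        printf("intel, high bits set -> %d\n", set_tsc_aux(true,  (1ULL << 40) | 7, &aux));
        printf("amd,   high bits set -> %d\n", set_tsc_aux(false, (1ULL << 40) | 7, &aux));
        printf("stored value          = %u\n", aux);
        return 0;
}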
Signed-off-by: Sean Christopherson Message-Id: <20210504171734.1434054-15-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 5 +++++ arch/x86/kvm/svm/svm.c | 24 ------------------------ arch/x86/kvm/vmx/vmx.c | 15 --------------- arch/x86/kvm/x86.c | 36 ++++++++++++++++++++++++++++++++++++ 4 files changed, 41 insertions(+), 39 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8155eaac22a7..f85480b1d215 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1781,6 +1781,11 @@ int kvm_add_user_return_msr(u32 msr); int kvm_find_user_return_msr(u32 msr); int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); +static inline bool kvm_is_supported_user_return_msr(u32 msr) +{ + return kvm_find_user_return_msr(msr) >= 0; +} + u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d13b72bfb736..0922d8e173e6 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2663,12 +2663,6 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data |= (u64)svm->sysenter_esp_hi << 32; break; case MSR_TSC_AUX: - if (tsc_aux_uret_slot < 0) - return 1; - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && - !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) - return 1; msr_info->data = svm->tsc_aux; break; /* @@ -2885,24 +2879,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; break; case MSR_TSC_AUX: - if (tsc_aux_uret_slot < 0) - return 1; - - if (!msr->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && - !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) - return 1; - - /* - * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has - * incomplete and conflicting architectural behavior. Current - * AMD CPUs completely ignore bits 63:32, i.e. they aren't - * reserved and always read as zeros. Emulate AMD CPU behavior - * to avoid explosions if the vCPU is migrated from an AMD host - * to an Intel host. - */ - data = (u32)data; - /* * TSC_AUX is usually changed only during boot and never read * directly. 
Intercept TSC_AUX instead of exposing it to the diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 042823b49273..61f7e5221bf3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1981,12 +1981,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) else msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; break; - case MSR_TSC_AUX: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && - !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) - return 1; - goto find_uret_msr; case MSR_IA32_DEBUGCTLMSR: msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); break; @@ -2302,15 +2296,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) else vmx->pt_desc.guest.addr_a[index / 2] = data; break; - case MSR_TSC_AUX: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && - !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) - return 1; - /* Check reserved bit, higher 32 bits should be zero */ - if ((data >> 32) != 0) - return 1; - goto find_uret_msr; case MSR_IA32_PERF_CAPABILITIES: if (data && !vcpu_to_pmu(vcpu)->version) return 1; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0a0eebf35dd5..4efd2978ec08 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1642,6 +1642,30 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, * invokes 64-bit SYSENTER. */ data = get_canonical(data, vcpu_virt_addr_bits(vcpu)); + break; + case MSR_TSC_AUX: + if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) + return 1; + + if (!host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && + !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) + return 1; + + /* + * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has + * incomplete and conflicting architectural behavior. Current + * AMD CPUs completely ignore bits 63:32, i.e. they aren't + * reserved and always read as zeros. Enforce Intel's reserved + * bits check if and only if the guest CPU is Intel, and clear + * the bits in all other cases. This ensures cross-vendor + * migration will provide consistent behavior for the guest. + */ + if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0) + return 1; + + data = (u32)data; + break; } msr.data = data; @@ -1678,6 +1702,18 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) return KVM_MSR_RET_FILTERED; + switch (index) { + case MSR_TSC_AUX: + if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) + return 1; + + if (!host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && + !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) + return 1; + break; + } + msr.index = index; msr.host_initiated = host_initiated; -- cgit v1.2.3 From e8ea85fb280ec55674bca88ea7cd85f60d19567f Mon Sep 17 00:00:00 2001 From: Chenyi Qiang Date: Tue, 2 Feb 2021 17:04:32 +0800 Subject: KVM: X86: Add support for the emulation of DR6_BUS_LOCK bit Bus lock debug exception introduces a new bit DR6_BUS_LOCK (bit 11 of DR6) to indicate that bus lock #DB exception is generated. The set/clear of DR6_BUS_LOCK is similar to the DR6_RTM. The processor clears DR6_BUS_LOCK when the exception is generated. For all other #DB, the processor sets this bit to 1. Software #DB handler should set this bit before returning to the interrupted task. In VMM, to avoid breaking the CPUs without bus lock #DB exception support, activate the DR6_BUS_LOCK conditionally in DR6_FIXED_1 bits. 
When intercepting the #DB exception caused by bus locks, bit 11 of the exit qualification is set to identify it. The VMM should emulate the exception by clearing bit 11 of the guest DR6. Co-developed-by: Xiaoyao Li Signed-off-by: Xiaoyao Li Signed-off-by: Chenyi Qiang Message-Id: <20210202090433.13441-3-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/x86.c | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f85480b1d215..8836b3096217 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -200,6 +200,7 @@ enum x86_intercept_stage; #define KVM_NR_DB_REGS 4 +#define DR6_BUS_LOCK (1 << 11) #define DR6_BD (1 << 13) #define DR6_BS (1 << 14) #define DR6_BT (1 << 15) @@ -213,7 +214,7 @@ enum x86_intercept_stage; * DR6_ACTIVE_LOW is also used as the init/reset value for DR6. */ #define DR6_ACTIVE_LOW 0xffff0ff0 -#define DR6_VOLATILE 0x0001e00f +#define DR6_VOLATILE 0x0001e80f #define DR6_FIXED_1 (DR6_ACTIVE_LOW & ~DR6_VOLATILE) #define DR7_BP_EN_MASK 0x000000ff diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4efd2978ec08..9b1a7040eae3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1176,6 +1176,9 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM)) fixed |= DR6_RTM; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)) + fixed |= DR6_BUS_LOCK; return fixed; } -- cgit v1.2.3 From 03ca4589fabcc66b27e4cb8f8e95d64cf43badd0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 5 May 2021 13:42:21 -0700 Subject: KVM: x86: Prevent KVM SVM from loading on kernels with 5-level paging Disallow loading KVM SVM if 5-level paging is supported. In theory, NPT for L1 should simply work, but there are unknowns with respect to how the guest's MAXPHYADDR will be handled by hardware. Nested NPT is more problematic, as running an L1 VMM that is using 2-level page tables requires stacking single-entry PDP and PML4 tables in KVM's NPT for L2, as there are no equivalent entries in L1's NPT to shadow. Barring hardware magic, for 5-level paging, KVM would need to stack another layer to handle PML5. Opportunistically rename the lm_root pointer, which is used for the aforementioned stacking when shadowing 2-level L1 NPT, to pml4_root to call out that it's specifically for PML4.
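For the arithmetic behind "another layer": each paging level resolves 9 bits of the virtual address, so 5-level hardware adds a PML5 lookup above PML4. The toy program below is only the index math, not KVM's shadow-paging code.

#include <stdint.h>
#include <stdio.h>

#define LEVEL_BITS      9
#define LEVEL_MASK      ((1u << LEVEL_BITS) - 1)
/* Index of @addr at a given paging level: level 1 = PTE ... level 5 = PML5. */
#define PT_INDEX(addr, level) \
        ((unsigned int)(((addr) >> (12 + ((level) - 1) * LEVEL_BITS)) & LEVEL_MASK))

int main(void)
{
        uint64_t addr = 0x00ffab1234567000ULL;

        for (int level = 5; level >= 1; level--)
                printf("level %d index = %u\n", level, PT_INDEX(addr, level));

        /*
         * A 4-level guest never provides a level-5 entry, so when the host
         * walks five levels, KVM would have to synthesize that top layer
         * itself - the same kind of stacking the message above describes
         * for PDP/PML4 when shadowing 2-level NPT.
         */
        return 0;
}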
Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Message-Id: <20210505204221.1934471-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu/mmu.c | 20 ++++++++++---------- arch/x86/kvm/svm/svm.c | 5 +++++ 3 files changed, 16 insertions(+), 11 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8836b3096217..55efbacfc244 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -409,7 +409,7 @@ struct kvm_mmu { u32 pkru_mask; u64 *pae_root; - u64 *lm_root; + u64 *pml4_root; /* * check zero bits on shadow page table entries, these diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 4b3ee244ebe0..0144c40d09c7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3310,12 +3310,12 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) { pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; - if (WARN_ON_ONCE(!mmu->lm_root)) { + if (WARN_ON_ONCE(!mmu->pml4_root)) { r = -EIO; goto out_unlock; } - mmu->lm_root[0] = __pa(mmu->pae_root) | pm_mask; + mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask; } for (i = 0; i < 4; ++i) { @@ -3335,7 +3335,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) } if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) - mmu->root_hpa = __pa(mmu->lm_root); + mmu->root_hpa = __pa(mmu->pml4_root); else mmu->root_hpa = __pa(mmu->pae_root); @@ -3350,7 +3350,7 @@ out_unlock: static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; - u64 *lm_root, *pae_root; + u64 *pml4_root, *pae_root; /* * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP @@ -3369,14 +3369,14 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL)) return -EIO; - if (mmu->pae_root && mmu->lm_root) + if (mmu->pae_root && mmu->pml4_root) return 0; /* * The special roots should always be allocated in concert. Yell and * bail if KVM ends up in a state where only one of the roots is valid. 
*/ - if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->lm_root)) + if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root)) return -EIO; /* @@ -3387,14 +3387,14 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) if (!pae_root) return -ENOMEM; - lm_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!lm_root) { + pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!pml4_root) { free_page((unsigned long)pae_root); return -ENOMEM; } mmu->pae_root = pae_root; - mmu->lm_root = lm_root; + mmu->pml4_root = pml4_root; return 0; } @@ -5261,7 +5261,7 @@ static void free_mmu_pages(struct kvm_mmu *mmu) if (!tdp_enabled && mmu->pae_root) set_memory_encrypted((unsigned long)mmu->pae_root, 1); free_page((unsigned long)mmu->pae_root); - free_page((unsigned long)mmu->lm_root); + free_page((unsigned long)mmu->pml4_root); } static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0922d8e173e6..8124f51e9488 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -447,6 +447,11 @@ static int has_svm(void) return 0; } + if (pgtable_l5_enabled()) { + pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n"); + return 0; + } + return 1; } -- cgit v1.2.3 From e759959fe3b8313c81d6200be44cb8a644d845ea Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 27 Apr 2021 06:16:34 -0500 Subject: x86/sev-es: Rename sev-es.{ch} to sev.{ch} SEV-SNP builds upon the SEV-ES functionality while adding new hardware protection. Version 2 of the GHCB specification adds new NAE events that are SEV-SNP specific. Rename the sev-es.{ch} to sev.{ch} so that all SEV* functionality can be consolidated in one place. Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Acked-by: Joerg Roedel Link: https://lkml.kernel.org/r/20210427111636.1207-2-brijesh.singh@amd.com --- arch/x86/boot/compressed/Makefile | 6 +- arch/x86/boot/compressed/sev-es.c | 206 ------ arch/x86/boot/compressed/sev.c | 206 ++++++ arch/x86/include/asm/sev-es.h | 114 --- arch/x86/include/asm/sev.h | 114 +++ arch/x86/kernel/Makefile | 6 +- arch/x86/kernel/head64.c | 2 +- arch/x86/kernel/nmi.c | 2 +- arch/x86/kernel/sev-es-shared.c | 525 ------------- arch/x86/kernel/sev-es.c | 1461 ------------------------------------- arch/x86/kernel/sev-shared.c | 525 +++++++++++++ arch/x86/kernel/sev.c | 1461 +++++++++++++++++++++++++++++++++++++ arch/x86/mm/extable.c | 2 +- arch/x86/platform/efi/efi_64.c | 2 +- arch/x86/realmode/init.c | 2 +- 15 files changed, 2317 insertions(+), 2317 deletions(-) delete mode 100644 arch/x86/boot/compressed/sev-es.c create mode 100644 arch/x86/boot/compressed/sev.c delete mode 100644 arch/x86/include/asm/sev-es.h create mode 100644 arch/x86/include/asm/sev.h delete mode 100644 arch/x86/kernel/sev-es-shared.c delete mode 100644 arch/x86/kernel/sev-es.c create mode 100644 arch/x86/kernel/sev-shared.c create mode 100644 arch/x86/kernel/sev.c (limited to 'arch/x86/include') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 6e5522aebbbd..2a2975236c9e 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -48,10 +48,10 @@ KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no) KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h KBUILD_CFLAGS += $(CLANG_FLAGS) -# sev-es.c indirectly inludes inat-table.h which is generated during +# sev.c indirectly inludes inat-table.h which is generated during # compilation and stored 
in $(objtree). Add the directory to the includes so # that the compiler finds it even with out-of-tree builds (make O=/some/path). -CFLAGS_sev-es.o += -I$(objtree)/arch/x86/lib/ +CFLAGS_sev.o += -I$(objtree)/arch/x86/lib/ KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n @@ -93,7 +93,7 @@ ifdef CONFIG_X86_64 vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o vmlinux-objs-y += $(obj)/mem_encrypt.o vmlinux-objs-y += $(obj)/pgtable_64.o - vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev-es.o + vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o diff --git a/arch/x86/boot/compressed/sev-es.c b/arch/x86/boot/compressed/sev-es.c deleted file mode 100644 index 82041bd380e5..000000000000 --- a/arch/x86/boot/compressed/sev-es.c +++ /dev/null @@ -1,206 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * AMD Encrypted Register State Support - * - * Author: Joerg Roedel - */ - -/* - * misc.h needs to be first because it knows how to include the other kernel - * headers in the pre-decompression code in a way that does not break - * compilation. - */ -#include "misc.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "error.h" - -struct ghcb boot_ghcb_page __aligned(PAGE_SIZE); -struct ghcb *boot_ghcb; - -/* - * Copy a version of this function here - insn-eval.c can't be used in - * pre-decompression code. - */ -static bool insn_has_rep_prefix(struct insn *insn) -{ - insn_byte_t p; - int i; - - insn_get_prefixes(insn); - - for_each_insn_prefix(insn, i, p) { - if (p == 0xf2 || p == 0xf3) - return true; - } - - return false; -} - -/* - * Only a dummy for insn_get_seg_base() - Early boot-code is 64bit only and - * doesn't use segments. - */ -static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx) -{ - return 0UL; -} - -static inline u64 sev_es_rd_ghcb_msr(void) -{ - unsigned long low, high; - - asm volatile("rdmsr" : "=a" (low), "=d" (high) : - "c" (MSR_AMD64_SEV_ES_GHCB)); - - return ((high << 32) | low); -} - -static inline void sev_es_wr_ghcb_msr(u64 val) -{ - u32 low, high; - - low = val & 0xffffffffUL; - high = val >> 32; - - asm volatile("wrmsr" : : "c" (MSR_AMD64_SEV_ES_GHCB), - "a"(low), "d" (high) : "memory"); -} - -static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) -{ - char buffer[MAX_INSN_SIZE]; - int ret; - - memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); - - ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); - if (ret < 0) - return ES_DECODE_FAILED; - - return ES_OK; -} - -static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, - void *dst, char *buf, size_t size) -{ - memcpy(dst, buf, size); - - return ES_OK; -} - -static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, - void *src, char *buf, size_t size) -{ - memcpy(buf, src, size); - - return ES_OK; -} - -#undef __init -#undef __pa -#define __init -#define __pa(x) ((unsigned long)(x)) - -#define __BOOT_COMPRESSED - -/* Basic instruction decoding support needed */ -#include "../../lib/inat.c" -#include "../../lib/insn.c" - -/* Include code for early handlers */ -#include "../../kernel/sev-es-shared.c" - -static bool early_setup_sev_es(void) -{ - if (!sev_es_negotiate_protocol()) - sev_es_terminate(GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED); - - if (set_page_decrypted((unsigned long)&boot_ghcb_page)) - return false; - - /* Page is now mapped decrypted, clear it */ - memset(&boot_ghcb_page, 0, sizeof(boot_ghcb_page)); - - boot_ghcb = 
&boot_ghcb_page; - - /* Initialize lookup tables for the instruction decoder */ - inat_init_tables(); - - return true; -} - -void sev_es_shutdown_ghcb(void) -{ - if (!boot_ghcb) - return; - - if (!sev_es_check_cpu_features()) - error("SEV-ES CPU Features missing."); - - /* - * GHCB Page must be flushed from the cache and mapped encrypted again. - * Otherwise the running kernel will see strange cache effects when - * trying to use that page. - */ - if (set_page_encrypted((unsigned long)&boot_ghcb_page)) - error("Can't map GHCB page encrypted"); - - /* - * GHCB page is mapped encrypted again and flushed from the cache. - * Mark it non-present now to catch bugs when #VC exceptions trigger - * after this point. - */ - if (set_page_non_present((unsigned long)&boot_ghcb_page)) - error("Can't unmap GHCB page"); -} - -bool sev_es_check_ghcb_fault(unsigned long address) -{ - /* Check whether the fault was on the GHCB page */ - return ((address & PAGE_MASK) == (unsigned long)&boot_ghcb_page); -} - -void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) -{ - struct es_em_ctxt ctxt; - enum es_result result; - - if (!boot_ghcb && !early_setup_sev_es()) - sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); - - vc_ghcb_invalidate(boot_ghcb); - result = vc_init_em_ctxt(&ctxt, regs, exit_code); - if (result != ES_OK) - goto finish; - - switch (exit_code) { - case SVM_EXIT_RDTSC: - case SVM_EXIT_RDTSCP: - result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code); - break; - case SVM_EXIT_IOIO: - result = vc_handle_ioio(boot_ghcb, &ctxt); - break; - case SVM_EXIT_CPUID: - result = vc_handle_cpuid(boot_ghcb, &ctxt); - break; - default: - result = ES_UNSUPPORTED; - break; - } - -finish: - if (result == ES_OK) - vc_finish_insn(&ctxt); - else if (result != ES_RETRY) - sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); -} diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c new file mode 100644 index 000000000000..670e998fe930 --- /dev/null +++ b/arch/x86/boot/compressed/sev.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Encrypted Register State Support + * + * Author: Joerg Roedel + */ + +/* + * misc.h needs to be first because it knows how to include the other kernel + * headers in the pre-decompression code in a way that does not break + * compilation. + */ +#include "misc.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "error.h" + +struct ghcb boot_ghcb_page __aligned(PAGE_SIZE); +struct ghcb *boot_ghcb; + +/* + * Copy a version of this function here - insn-eval.c can't be used in + * pre-decompression code. + */ +static bool insn_has_rep_prefix(struct insn *insn) +{ + insn_byte_t p; + int i; + + insn_get_prefixes(insn); + + for_each_insn_prefix(insn, i, p) { + if (p == 0xf2 || p == 0xf3) + return true; + } + + return false; +} + +/* + * Only a dummy for insn_get_seg_base() - Early boot-code is 64bit only and + * doesn't use segments. 
+ */ +static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx) +{ + return 0UL; +} + +static inline u64 sev_es_rd_ghcb_msr(void) +{ + unsigned long low, high; + + asm volatile("rdmsr" : "=a" (low), "=d" (high) : + "c" (MSR_AMD64_SEV_ES_GHCB)); + + return ((high << 32) | low); +} + +static inline void sev_es_wr_ghcb_msr(u64 val) +{ + u32 low, high; + + low = val & 0xffffffffUL; + high = val >> 32; + + asm volatile("wrmsr" : : "c" (MSR_AMD64_SEV_ES_GHCB), + "a"(low), "d" (high) : "memory"); +} + +static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + int ret; + + memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); + + ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); + if (ret < 0) + return ES_DECODE_FAILED; + + return ES_OK; +} + +static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, + void *dst, char *buf, size_t size) +{ + memcpy(dst, buf, size); + + return ES_OK; +} + +static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, + void *src, char *buf, size_t size) +{ + memcpy(buf, src, size); + + return ES_OK; +} + +#undef __init +#undef __pa +#define __init +#define __pa(x) ((unsigned long)(x)) + +#define __BOOT_COMPRESSED + +/* Basic instruction decoding support needed */ +#include "../../lib/inat.c" +#include "../../lib/insn.c" + +/* Include code for early handlers */ +#include "../../kernel/sev-shared.c" + +static bool early_setup_sev_es(void) +{ + if (!sev_es_negotiate_protocol()) + sev_es_terminate(GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED); + + if (set_page_decrypted((unsigned long)&boot_ghcb_page)) + return false; + + /* Page is now mapped decrypted, clear it */ + memset(&boot_ghcb_page, 0, sizeof(boot_ghcb_page)); + + boot_ghcb = &boot_ghcb_page; + + /* Initialize lookup tables for the instruction decoder */ + inat_init_tables(); + + return true; +} + +void sev_es_shutdown_ghcb(void) +{ + if (!boot_ghcb) + return; + + if (!sev_es_check_cpu_features()) + error("SEV-ES CPU Features missing."); + + /* + * GHCB Page must be flushed from the cache and mapped encrypted again. + * Otherwise the running kernel will see strange cache effects when + * trying to use that page. + */ + if (set_page_encrypted((unsigned long)&boot_ghcb_page)) + error("Can't map GHCB page encrypted"); + + /* + * GHCB page is mapped encrypted again and flushed from the cache. + * Mark it non-present now to catch bugs when #VC exceptions trigger + * after this point. 
+ */ + if (set_page_non_present((unsigned long)&boot_ghcb_page)) + error("Can't unmap GHCB page"); +} + +bool sev_es_check_ghcb_fault(unsigned long address) +{ + /* Check whether the fault was on the GHCB page */ + return ((address & PAGE_MASK) == (unsigned long)&boot_ghcb_page); +} + +void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) +{ + struct es_em_ctxt ctxt; + enum es_result result; + + if (!boot_ghcb && !early_setup_sev_es()) + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + + vc_ghcb_invalidate(boot_ghcb); + result = vc_init_em_ctxt(&ctxt, regs, exit_code); + if (result != ES_OK) + goto finish; + + switch (exit_code) { + case SVM_EXIT_RDTSC: + case SVM_EXIT_RDTSCP: + result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code); + break; + case SVM_EXIT_IOIO: + result = vc_handle_ioio(boot_ghcb, &ctxt); + break; + case SVM_EXIT_CPUID: + result = vc_handle_cpuid(boot_ghcb, &ctxt); + break; + default: + result = ES_UNSUPPORTED; + break; + } + +finish: + if (result == ES_OK) + vc_finish_insn(&ctxt); + else if (result != ES_RETRY) + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); +} diff --git a/arch/x86/include/asm/sev-es.h b/arch/x86/include/asm/sev-es.h deleted file mode 100644 index cf1d957c7091..000000000000 --- a/arch/x86/include/asm/sev-es.h +++ /dev/null @@ -1,114 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * AMD Encrypted Register State Support - * - * Author: Joerg Roedel - */ - -#ifndef __ASM_ENCRYPTED_STATE_H -#define __ASM_ENCRYPTED_STATE_H - -#include -#include - -#define GHCB_SEV_INFO 0x001UL -#define GHCB_SEV_INFO_REQ 0x002UL -#define GHCB_INFO(v) ((v) & 0xfffUL) -#define GHCB_PROTO_MAX(v) (((v) >> 48) & 0xffffUL) -#define GHCB_PROTO_MIN(v) (((v) >> 32) & 0xffffUL) -#define GHCB_PROTO_OUR 0x0001UL -#define GHCB_SEV_CPUID_REQ 0x004UL -#define GHCB_CPUID_REQ_EAX 0 -#define GHCB_CPUID_REQ_EBX 1 -#define GHCB_CPUID_REQ_ECX 2 -#define GHCB_CPUID_REQ_EDX 3 -#define GHCB_CPUID_REQ(fn, reg) (GHCB_SEV_CPUID_REQ | \ - (((unsigned long)reg & 3) << 30) | \ - (((unsigned long)fn) << 32)) - -#define GHCB_PROTOCOL_MAX 0x0001UL -#define GHCB_DEFAULT_USAGE 0x0000UL - -#define GHCB_SEV_CPUID_RESP 0x005UL -#define GHCB_SEV_TERMINATE 0x100UL -#define GHCB_SEV_TERMINATE_REASON(reason_set, reason_val) \ - (((((u64)reason_set) & 0x7) << 12) | \ - ((((u64)reason_val) & 0xff) << 16)) -#define GHCB_SEV_ES_REASON_GENERAL_REQUEST 0 -#define GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED 1 - -#define GHCB_SEV_GHCB_RESP_CODE(v) ((v) & 0xfff) -#define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); } - -enum es_result { - ES_OK, /* All good */ - ES_UNSUPPORTED, /* Requested operation not supported */ - ES_VMM_ERROR, /* Unexpected state from the VMM */ - ES_DECODE_FAILED, /* Instruction decoding failed */ - ES_EXCEPTION, /* Instruction caused exception */ - ES_RETRY, /* Retry instruction emulation */ -}; - -struct es_fault_info { - unsigned long vector; - unsigned long error_code; - unsigned long cr2; -}; - -struct pt_regs; - -/* ES instruction emulation context */ -struct es_em_ctxt { - struct pt_regs *regs; - struct insn insn; - struct es_fault_info fi; -}; - -void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code); - -static inline u64 lower_bits(u64 val, unsigned int bits) -{ - u64 mask = (1ULL << bits) - 1; - - return (val & mask); -} - -struct real_mode_header; -enum stack_type; - -/* Early IDT entry points for #VC handler */ -extern void vc_no_ghcb(void); -extern void vc_boot_ghcb(void); -extern bool handle_vc_boot_ghcb(struct pt_regs *regs); - -#ifdef 
CONFIG_AMD_MEM_ENCRYPT -extern struct static_key_false sev_es_enable_key; -extern void __sev_es_ist_enter(struct pt_regs *regs); -extern void __sev_es_ist_exit(void); -static __always_inline void sev_es_ist_enter(struct pt_regs *regs) -{ - if (static_branch_unlikely(&sev_es_enable_key)) - __sev_es_ist_enter(regs); -} -static __always_inline void sev_es_ist_exit(void) -{ - if (static_branch_unlikely(&sev_es_enable_key)) - __sev_es_ist_exit(); -} -extern int sev_es_setup_ap_jump_table(struct real_mode_header *rmh); -extern void __sev_es_nmi_complete(void); -static __always_inline void sev_es_nmi_complete(void) -{ - if (static_branch_unlikely(&sev_es_enable_key)) - __sev_es_nmi_complete(); -} -extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd); -#else -static inline void sev_es_ist_enter(struct pt_regs *regs) { } -static inline void sev_es_ist_exit(void) { } -static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; } -static inline void sev_es_nmi_complete(void) { } -static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; } -#endif - -#endif diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h new file mode 100644 index 000000000000..cf1d957c7091 --- /dev/null +++ b/arch/x86/include/asm/sev.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AMD Encrypted Register State Support + * + * Author: Joerg Roedel + */ + +#ifndef __ASM_ENCRYPTED_STATE_H +#define __ASM_ENCRYPTED_STATE_H + +#include +#include + +#define GHCB_SEV_INFO 0x001UL +#define GHCB_SEV_INFO_REQ 0x002UL +#define GHCB_INFO(v) ((v) & 0xfffUL) +#define GHCB_PROTO_MAX(v) (((v) >> 48) & 0xffffUL) +#define GHCB_PROTO_MIN(v) (((v) >> 32) & 0xffffUL) +#define GHCB_PROTO_OUR 0x0001UL +#define GHCB_SEV_CPUID_REQ 0x004UL +#define GHCB_CPUID_REQ_EAX 0 +#define GHCB_CPUID_REQ_EBX 1 +#define GHCB_CPUID_REQ_ECX 2 +#define GHCB_CPUID_REQ_EDX 3 +#define GHCB_CPUID_REQ(fn, reg) (GHCB_SEV_CPUID_REQ | \ + (((unsigned long)reg & 3) << 30) | \ + (((unsigned long)fn) << 32)) + +#define GHCB_PROTOCOL_MAX 0x0001UL +#define GHCB_DEFAULT_USAGE 0x0000UL + +#define GHCB_SEV_CPUID_RESP 0x005UL +#define GHCB_SEV_TERMINATE 0x100UL +#define GHCB_SEV_TERMINATE_REASON(reason_set, reason_val) \ + (((((u64)reason_set) & 0x7) << 12) | \ + ((((u64)reason_val) & 0xff) << 16)) +#define GHCB_SEV_ES_REASON_GENERAL_REQUEST 0 +#define GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED 1 + +#define GHCB_SEV_GHCB_RESP_CODE(v) ((v) & 0xfff) +#define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); } + +enum es_result { + ES_OK, /* All good */ + ES_UNSUPPORTED, /* Requested operation not supported */ + ES_VMM_ERROR, /* Unexpected state from the VMM */ + ES_DECODE_FAILED, /* Instruction decoding failed */ + ES_EXCEPTION, /* Instruction caused exception */ + ES_RETRY, /* Retry instruction emulation */ +}; + +struct es_fault_info { + unsigned long vector; + unsigned long error_code; + unsigned long cr2; +}; + +struct pt_regs; + +/* ES instruction emulation context */ +struct es_em_ctxt { + struct pt_regs *regs; + struct insn insn; + struct es_fault_info fi; +}; + +void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code); + +static inline u64 lower_bits(u64 val, unsigned int bits) +{ + u64 mask = (1ULL << bits) - 1; + + return (val & mask); +} + +struct real_mode_header; +enum stack_type; + +/* Early IDT entry points for #VC handler */ +extern void vc_no_ghcb(void); +extern void vc_boot_ghcb(void); +extern bool handle_vc_boot_ghcb(struct pt_regs *regs); + +#ifdef CONFIG_AMD_MEM_ENCRYPT +extern struct static_key_false 
sev_es_enable_key; +extern void __sev_es_ist_enter(struct pt_regs *regs); +extern void __sev_es_ist_exit(void); +static __always_inline void sev_es_ist_enter(struct pt_regs *regs) +{ + if (static_branch_unlikely(&sev_es_enable_key)) + __sev_es_ist_enter(regs); +} +static __always_inline void sev_es_ist_exit(void) +{ + if (static_branch_unlikely(&sev_es_enable_key)) + __sev_es_ist_exit(); +} +extern int sev_es_setup_ap_jump_table(struct real_mode_header *rmh); +extern void __sev_es_nmi_complete(void); +static __always_inline void sev_es_nmi_complete(void) +{ + if (static_branch_unlikely(&sev_es_enable_key)) + __sev_es_nmi_complete(); +} +extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd); +#else +static inline void sev_es_ist_enter(struct pt_regs *regs) { } +static inline void sev_es_ist_exit(void) { } +static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; } +static inline void sev_es_nmi_complete(void) { } +static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; } +#endif + +#endif diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0704c2a94272..0f66682ac02a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -20,7 +20,7 @@ CFLAGS_REMOVE_kvmclock.o = -pg CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg CFLAGS_REMOVE_head64.o = -pg -CFLAGS_REMOVE_sev-es.o = -pg +CFLAGS_REMOVE_sev.o = -pg endif KASAN_SANITIZE_head$(BITS).o := n @@ -28,7 +28,7 @@ KASAN_SANITIZE_dumpstack.o := n KASAN_SANITIZE_dumpstack_$(BITS).o := n KASAN_SANITIZE_stacktrace.o := n KASAN_SANITIZE_paravirt.o := n -KASAN_SANITIZE_sev-es.o := n +KASAN_SANITIZE_sev.o := n # With some compiler versions the generated code results in boot hangs, caused # by several compilation units. To be safe, disable all instrumentation. @@ -148,7 +148,7 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o -obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev-es.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 18be44163a50..de01903c3735 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include /* * Manage page tables very early on. diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 2ef961cf4cfc..4bce802d25fb 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #define CREATE_TRACE_POINTS #include diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c deleted file mode 100644 index 0aa9f13efd57..000000000000 --- a/arch/x86/kernel/sev-es-shared.c +++ /dev/null @@ -1,525 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * AMD Encrypted Register State Support - * - * Author: Joerg Roedel - * - * This file is not compiled stand-alone. It contains code shared - * between the pre-decompression boot code and the running Linux kernel - * and is included directly into both code-bases. 
- */ - -#ifndef __BOOT_COMPRESSED -#define error(v) pr_err(v) -#define has_cpuflag(f) boot_cpu_has(f) -#endif - -static bool __init sev_es_check_cpu_features(void) -{ - if (!has_cpuflag(X86_FEATURE_RDRAND)) { - error("RDRAND instruction not supported - no trusted source of randomness available\n"); - return false; - } - - return true; -} - -static void __noreturn sev_es_terminate(unsigned int reason) -{ - u64 val = GHCB_SEV_TERMINATE; - - /* - * Tell the hypervisor what went wrong - only reason-set 0 is - * currently supported. - */ - val |= GHCB_SEV_TERMINATE_REASON(0, reason); - - /* Request Guest Termination from Hypvervisor */ - sev_es_wr_ghcb_msr(val); - VMGEXIT(); - - while (true) - asm volatile("hlt\n" : : : "memory"); -} - -static bool sev_es_negotiate_protocol(void) -{ - u64 val; - - /* Do the GHCB protocol version negotiation */ - sev_es_wr_ghcb_msr(GHCB_SEV_INFO_REQ); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - - if (GHCB_INFO(val) != GHCB_SEV_INFO) - return false; - - if (GHCB_PROTO_MAX(val) < GHCB_PROTO_OUR || - GHCB_PROTO_MIN(val) > GHCB_PROTO_OUR) - return false; - - return true; -} - -static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) -{ - memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); -} - -static bool vc_decoding_needed(unsigned long exit_code) -{ - /* Exceptions don't require to decode the instruction */ - return !(exit_code >= SVM_EXIT_EXCP_BASE && - exit_code <= SVM_EXIT_LAST_EXCP); -} - -static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt, - struct pt_regs *regs, - unsigned long exit_code) -{ - enum es_result ret = ES_OK; - - memset(ctxt, 0, sizeof(*ctxt)); - ctxt->regs = regs; - - if (vc_decoding_needed(exit_code)) - ret = vc_decode_insn(ctxt); - - return ret; -} - -static void vc_finish_insn(struct es_em_ctxt *ctxt) -{ - ctxt->regs->ip += ctxt->insn.length; -} - -static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, - struct es_em_ctxt *ctxt, - u64 exit_code, u64 exit_info_1, - u64 exit_info_2) -{ - enum es_result ret; - - /* Fill in protocol and format specifiers */ - ghcb->protocol_version = GHCB_PROTOCOL_MAX; - ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; - - ghcb_set_sw_exit_code(ghcb, exit_code); - ghcb_set_sw_exit_info_1(ghcb, exit_info_1); - ghcb_set_sw_exit_info_2(ghcb, exit_info_2); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); - - if ((ghcb->save.sw_exit_info_1 & 0xffffffff) == 1) { - u64 info = ghcb->save.sw_exit_info_2; - unsigned long v; - - info = ghcb->save.sw_exit_info_2; - v = info & SVM_EVTINJ_VEC_MASK; - - /* Check if exception information from hypervisor is sane. */ - if ((info & SVM_EVTINJ_VALID) && - ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && - ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { - ctxt->fi.vector = v; - if (info & SVM_EVTINJ_VALID_ERR) - ctxt->fi.error_code = info >> 32; - ret = ES_EXCEPTION; - } else { - ret = ES_VMM_ERROR; - } - } else { - ret = ES_OK; - } - - return ret; -} - -/* - * Boot VC Handler - This is the first VC handler during boot, there is no GHCB - * page yet, so it only supports the MSR based communication with the - * hypervisor and only the CPUID exit-code. 
- */ -void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) -{ - unsigned int fn = lower_bits(regs->ax, 32); - unsigned long val; - - /* Only CPUID is supported via MSR protocol */ - if (exit_code != SVM_EXIT_CPUID) - goto fail; - - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) - goto fail; - regs->ax = val >> 32; - - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EBX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) - goto fail; - regs->bx = val >> 32; - - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_ECX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) - goto fail; - regs->cx = val >> 32; - - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EDX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) - goto fail; - regs->dx = val >> 32; - - /* - * This is a VC handler and the #VC is only raised when SEV-ES is - * active, which means SEV must be active too. Do sanity checks on the - * CPUID results to make sure the hypervisor does not trick the kernel - * into the no-sev path. This could map sensitive data unencrypted and - * make it accessible to the hypervisor. - * - * In particular, check for: - * - Availability of CPUID leaf 0x8000001f - * - SEV CPUID bit. - * - * The hypervisor might still report the wrong C-bit position, but this - * can't be checked here. - */ - - if (fn == 0x80000000 && (regs->ax < 0x8000001f)) - /* SEV leaf check */ - goto fail; - else if ((fn == 0x8000001f && !(regs->ax & BIT(1)))) - /* SEV bit */ - goto fail; - - /* Skip over the CPUID two-byte opcode */ - regs->ip += 2; - - return; - -fail: - /* Terminate the guest */ - sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); -} - -static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, - void *src, char *buf, - unsigned int data_size, - unsigned int count, - bool backwards) -{ - int i, b = backwards ? -1 : 1; - enum es_result ret = ES_OK; - - for (i = 0; i < count; i++) { - void *s = src + (i * data_size * b); - char *d = buf + (i * data_size); - - ret = vc_read_mem(ctxt, s, d, data_size); - if (ret != ES_OK) - break; - } - - return ret; -} - -static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, - void *dst, char *buf, - unsigned int data_size, - unsigned int count, - bool backwards) -{ - int i, s = backwards ? 
-1 : 1; - enum es_result ret = ES_OK; - - for (i = 0; i < count; i++) { - void *d = dst + (i * data_size * s); - char *b = buf + (i * data_size); - - ret = vc_write_mem(ctxt, d, b, data_size); - if (ret != ES_OK) - break; - } - - return ret; -} - -#define IOIO_TYPE_STR BIT(2) -#define IOIO_TYPE_IN 1 -#define IOIO_TYPE_INS (IOIO_TYPE_IN | IOIO_TYPE_STR) -#define IOIO_TYPE_OUT 0 -#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR) - -#define IOIO_REP BIT(3) - -#define IOIO_ADDR_64 BIT(9) -#define IOIO_ADDR_32 BIT(8) -#define IOIO_ADDR_16 BIT(7) - -#define IOIO_DATA_32 BIT(6) -#define IOIO_DATA_16 BIT(5) -#define IOIO_DATA_8 BIT(4) - -#define IOIO_SEG_ES (0 << 10) -#define IOIO_SEG_DS (3 << 10) - -static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) -{ - struct insn *insn = &ctxt->insn; - *exitinfo = 0; - - switch (insn->opcode.bytes[0]) { - /* INS opcodes */ - case 0x6c: - case 0x6d: - *exitinfo |= IOIO_TYPE_INS; - *exitinfo |= IOIO_SEG_ES; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; - break; - - /* OUTS opcodes */ - case 0x6e: - case 0x6f: - *exitinfo |= IOIO_TYPE_OUTS; - *exitinfo |= IOIO_SEG_DS; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; - break; - - /* IN immediate opcodes */ - case 0xe4: - case 0xe5: - *exitinfo |= IOIO_TYPE_IN; - *exitinfo |= (u8)insn->immediate.value << 16; - break; - - /* OUT immediate opcodes */ - case 0xe6: - case 0xe7: - *exitinfo |= IOIO_TYPE_OUT; - *exitinfo |= (u8)insn->immediate.value << 16; - break; - - /* IN register opcodes */ - case 0xec: - case 0xed: - *exitinfo |= IOIO_TYPE_IN; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; - break; - - /* OUT register opcodes */ - case 0xee: - case 0xef: - *exitinfo |= IOIO_TYPE_OUT; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; - break; - - default: - return ES_DECODE_FAILED; - } - - switch (insn->opcode.bytes[0]) { - case 0x6c: - case 0x6e: - case 0xe4: - case 0xe6: - case 0xec: - case 0xee: - /* Single byte opcodes */ - *exitinfo |= IOIO_DATA_8; - break; - default: - /* Length determined by instruction parsing */ - *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16 - : IOIO_DATA_32; - } - switch (insn->addr_bytes) { - case 2: - *exitinfo |= IOIO_ADDR_16; - break; - case 4: - *exitinfo |= IOIO_ADDR_32; - break; - case 8: - *exitinfo |= IOIO_ADDR_64; - break; - } - - if (insn_has_rep_prefix(insn)) - *exitinfo |= IOIO_REP; - - return ES_OK; -} - -static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - struct pt_regs *regs = ctxt->regs; - u64 exit_info_1, exit_info_2; - enum es_result ret; - - ret = vc_ioio_exitinfo(ctxt, &exit_info_1); - if (ret != ES_OK) - return ret; - - if (exit_info_1 & IOIO_TYPE_STR) { - - /* (REP) INS/OUTS */ - - bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF); - unsigned int io_bytes, exit_bytes; - unsigned int ghcb_count, op_count; - unsigned long es_base; - u64 sw_scratch; - - /* - * For the string variants with rep prefix the amount of in/out - * operations per #VC exception is limited so that the kernel - * has a chance to take interrupts and re-schedule while the - * instruction is emulated. - */ - io_bytes = (exit_info_1 >> 4) & 0x7; - ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes; - - op_count = (exit_info_1 & IOIO_REP) ? 
regs->cx : 1; - exit_info_2 = min(op_count, ghcb_count); - exit_bytes = exit_info_2 * io_bytes; - - es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); - - /* Read bytes of OUTS into the shared buffer */ - if (!(exit_info_1 & IOIO_TYPE_IN)) { - ret = vc_insn_string_read(ctxt, - (void *)(es_base + regs->si), - ghcb->shared_buffer, io_bytes, - exit_info_2, df); - if (ret) - return ret; - } - - /* - * Issue an VMGEXIT to the HV to consume the bytes from the - * shared buffer or to have it write them into the shared buffer - * depending on the instruction: OUTS or INS. - */ - sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer); - ghcb_set_sw_scratch(ghcb, sw_scratch); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, - exit_info_1, exit_info_2); - if (ret != ES_OK) - return ret; - - /* Read bytes from shared buffer into the guest's destination. */ - if (exit_info_1 & IOIO_TYPE_IN) { - ret = vc_insn_string_write(ctxt, - (void *)(es_base + regs->di), - ghcb->shared_buffer, io_bytes, - exit_info_2, df); - if (ret) - return ret; - - if (df) - regs->di -= exit_bytes; - else - regs->di += exit_bytes; - } else { - if (df) - regs->si -= exit_bytes; - else - regs->si += exit_bytes; - } - - if (exit_info_1 & IOIO_REP) - regs->cx -= exit_info_2; - - ret = regs->cx ? ES_RETRY : ES_OK; - - } else { - - /* IN/OUT into/from rAX */ - - int bits = (exit_info_1 & 0x70) >> 1; - u64 rax = 0; - - if (!(exit_info_1 & IOIO_TYPE_IN)) - rax = lower_bits(regs->ax, bits); - - ghcb_set_rax(ghcb, rax); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0); - if (ret != ES_OK) - return ret; - - if (exit_info_1 & IOIO_TYPE_IN) { - if (!ghcb_rax_is_valid(ghcb)) - return ES_VMM_ERROR; - regs->ax = lower_bits(ghcb->save.rax, bits); - } - } - - return ret; -} - -static enum es_result vc_handle_cpuid(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct pt_regs *regs = ctxt->regs; - u32 cr4 = native_read_cr4(); - enum es_result ret; - - ghcb_set_rax(ghcb, regs->ax); - ghcb_set_rcx(ghcb, regs->cx); - - if (cr4 & X86_CR4_OSXSAVE) - /* Safe to read xcr0 */ - ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); - else - /* xgetbv will cause #GP - use reset value for xcr0 */ - ghcb_set_xcr0(ghcb, 1); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && - ghcb_rbx_is_valid(ghcb) && - ghcb_rcx_is_valid(ghcb) && - ghcb_rdx_is_valid(ghcb))) - return ES_VMM_ERROR; - - regs->ax = ghcb->save.rax; - regs->bx = ghcb->save.rbx; - regs->cx = ghcb->save.rcx; - regs->dx = ghcb->save.rdx; - - return ES_OK; -} - -static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, - struct es_em_ctxt *ctxt, - unsigned long exit_code) -{ - bool rdtscp = (exit_code == SVM_EXIT_RDTSCP); - enum es_result ret; - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) && - (!rdtscp || ghcb_rcx_is_valid(ghcb)))) - return ES_VMM_ERROR; - - ctxt->regs->ax = ghcb->save.rax; - ctxt->regs->dx = ghcb->save.rdx; - if (rdtscp) - ctxt->regs->cx = ghcb->save.rcx; - - return ES_OK; -} diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c deleted file mode 100644 index 73873b007838..000000000000 --- a/arch/x86/kernel/sev-es.c +++ /dev/null @@ -1,1461 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * AMD Memory Encryption Support - * - * Copyright (C) 2019 SUSE - * - * Author: Joerg Roedel - */ - -#define pr_fmt(fmt) "SEV-ES: " fmt - 
-#include /* For show_regs() */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define DR7_RESET_VALUE 0x400 - -/* For early boot hypervisor communication in SEV-ES enabled guests */ -static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); - -/* - * Needs to be in the .data section because we need it NULL before bss is - * cleared - */ -static struct ghcb __initdata *boot_ghcb; - -/* #VC handler runtime per-CPU data */ -struct sev_es_runtime_data { - struct ghcb ghcb_page; - - /* Physical storage for the per-CPU IST stack of the #VC handler */ - char ist_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE); - - /* - * Physical storage for the per-CPU fall-back stack of the #VC handler. - * The fall-back stack is used when it is not safe to switch back to the - * interrupted stack in the #VC entry code. - */ - char fallback_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE); - - /* - * Reserve one page per CPU as backup storage for the unencrypted GHCB. - * It is needed when an NMI happens while the #VC handler uses the real - * GHCB, and the NMI handler itself is causing another #VC exception. In - * that case the GHCB content of the first handler needs to be backed up - * and restored. - */ - struct ghcb backup_ghcb; - - /* - * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions. - * There is no need for it to be atomic, because nothing is written to - * the GHCB between the read and the write of ghcb_active. So it is safe - * to use it when a nested #VC exception happens before the write. - * - * This is necessary for example in the #VC->NMI->#VC case when the NMI - * happens while the first #VC handler uses the GHCB. When the NMI code - * raises a second #VC handler it might overwrite the contents of the - * GHCB written by the first handler. To avoid this the content of the - * GHCB is saved and restored when the GHCB is detected to be in use - * already. - */ - bool ghcb_active; - bool backup_ghcb_active; - - /* - * Cached DR7 value - write it on DR7 writes and return it on reads. - * That value will never make it to the real hardware DR7 as debugging - * is currently unsupported in SEV-ES guests. 
- */ - unsigned long dr7; -}; - -struct ghcb_state { - struct ghcb *ghcb; -}; - -static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); -DEFINE_STATIC_KEY_FALSE(sev_es_enable_key); - -/* Needed in vc_early_forward_exception */ -void do_early_exception(struct pt_regs *regs, int trapnr); - -static void __init setup_vc_stacks(int cpu) -{ - struct sev_es_runtime_data *data; - struct cpu_entry_area *cea; - unsigned long vaddr; - phys_addr_t pa; - - data = per_cpu(runtime_data, cpu); - cea = get_cpu_entry_area(cpu); - - /* Map #VC IST stack */ - vaddr = CEA_ESTACK_BOT(&cea->estacks, VC); - pa = __pa(data->ist_stack); - cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); - - /* Map VC fall-back stack */ - vaddr = CEA_ESTACK_BOT(&cea->estacks, VC2); - pa = __pa(data->fallback_stack); - cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); -} - -static __always_inline bool on_vc_stack(struct pt_regs *regs) -{ - unsigned long sp = regs->sp; - - /* User-mode RSP is not trusted */ - if (user_mode(regs)) - return false; - - /* SYSCALL gap still has user-mode RSP */ - if (ip_within_syscall_gap(regs)) - return false; - - return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); -} - -/* - * This function handles the case when an NMI is raised in the #VC - * exception handler entry code, before the #VC handler has switched off - * its IST stack. In this case, the IST entry for #VC must be adjusted, - * so that any nested #VC exception will not overwrite the stack - * contents of the interrupted #VC handler. - * - * The IST entry is adjusted unconditionally so that it can be also be - * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a - * nested sev_es_ist_exit() call may adjust back the IST entry too - * early. - * - * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run - * on the NMI IST stack, as they are only called from NMI handling code - * right now. - */ -void noinstr __sev_es_ist_enter(struct pt_regs *regs) -{ - unsigned long old_ist, new_ist; - - /* Read old IST entry */ - new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); - - /* - * If NMI happened while on the #VC IST stack, set the new IST - * value below regs->sp, so that the interrupted stack frame is - * not overwritten by subsequent #VC exceptions. - */ - if (on_vc_stack(regs)) - new_ist = regs->sp; - - /* - * Reserve additional 8 bytes and store old IST value so this - * adjustment can be unrolled in __sev_es_ist_exit(). 
- */ - new_ist -= sizeof(old_ist); - *(unsigned long *)new_ist = old_ist; - - /* Set new IST entry */ - this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist); -} - -void noinstr __sev_es_ist_exit(void) -{ - unsigned long ist; - - /* Read IST entry */ - ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); - - if (WARN_ON(ist == __this_cpu_ist_top_va(VC))) - return; - - /* Read back old IST entry and write it to the TSS */ - this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist); -} - -static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - if (unlikely(data->ghcb_active)) { - /* GHCB is already in use - save its contents */ - - if (unlikely(data->backup_ghcb_active)) - return NULL; - - /* Mark backup_ghcb active before writing to it */ - data->backup_ghcb_active = true; - - state->ghcb = &data->backup_ghcb; - - /* Backup GHCB content */ - *state->ghcb = *ghcb; - } else { - state->ghcb = NULL; - data->ghcb_active = true; - } - - return ghcb; -} - -static __always_inline void sev_es_put_ghcb(struct ghcb_state *state) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - if (state->ghcb) { - /* Restore GHCB from Backup */ - *ghcb = *state->ghcb; - data->backup_ghcb_active = false; - state->ghcb = NULL; - } else { - data->ghcb_active = false; - } -} - -/* Needed in vc_early_forward_exception */ -void do_early_exception(struct pt_regs *regs, int trapnr); - -static inline u64 sev_es_rd_ghcb_msr(void) -{ - return __rdmsr(MSR_AMD64_SEV_ES_GHCB); -} - -static __always_inline void sev_es_wr_ghcb_msr(u64 val) -{ - u32 low, high; - - low = (u32)(val); - high = (u32)(val >> 32); - - native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); -} - -static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, - unsigned char *buffer) -{ - return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); -} - -static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt) -{ - char buffer[MAX_INSN_SIZE]; - int res; - - res = insn_fetch_from_user_inatomic(ctxt->regs, buffer); - if (!res) { - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; - ctxt->fi.cr2 = ctxt->regs->ip; - return ES_EXCEPTION; - } - - if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, res)) - return ES_DECODE_FAILED; - - if (ctxt->insn.immediate.got) - return ES_OK; - else - return ES_DECODE_FAILED; -} - -static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt) -{ - char buffer[MAX_INSN_SIZE]; - int res, ret; - - res = vc_fetch_insn_kernel(ctxt, buffer); - if (res) { - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = X86_PF_INSTR; - ctxt->fi.cr2 = ctxt->regs->ip; - return ES_EXCEPTION; - } - - ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); - if (ret < 0) - return ES_DECODE_FAILED; - else - return ES_OK; -} - -static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) -{ - if (user_mode(ctxt->regs)) - return __vc_decode_user_insn(ctxt); - else - return __vc_decode_kern_insn(ctxt); -} - -static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, - char *dst, char *buf, size_t size) -{ - unsigned long error_code = X86_PF_PROT | X86_PF_WRITE; - char __user *target = (char __user *)dst; - u64 d8; - u32 d4; - u16 d2; - u8 d1; - - /* If instruction ran in kernel mode and the I/O buffer 
is in kernel space */ - if (!user_mode(ctxt->regs) && !access_ok(target, size)) { - memcpy(dst, buf, size); - return ES_OK; - } - - switch (size) { - case 1: - memcpy(&d1, buf, 1); - if (put_user(d1, target)) - goto fault; - break; - case 2: - memcpy(&d2, buf, 2); - if (put_user(d2, target)) - goto fault; - break; - case 4: - memcpy(&d4, buf, 4); - if (put_user(d4, target)) - goto fault; - break; - case 8: - memcpy(&d8, buf, 8); - if (put_user(d8, target)) - goto fault; - break; - default: - WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); - return ES_UNSUPPORTED; - } - - return ES_OK; - -fault: - if (user_mode(ctxt->regs)) - error_code |= X86_PF_USER; - - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = error_code; - ctxt->fi.cr2 = (unsigned long)dst; - - return ES_EXCEPTION; -} - -static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, - char *src, char *buf, size_t size) -{ - unsigned long error_code = X86_PF_PROT; - char __user *s = (char __user *)src; - u64 d8; - u32 d4; - u16 d2; - u8 d1; - - /* If instruction ran in kernel mode and the I/O buffer is in kernel space */ - if (!user_mode(ctxt->regs) && !access_ok(s, size)) { - memcpy(buf, src, size); - return ES_OK; - } - - switch (size) { - case 1: - if (get_user(d1, s)) - goto fault; - memcpy(buf, &d1, 1); - break; - case 2: - if (get_user(d2, s)) - goto fault; - memcpy(buf, &d2, 2); - break; - case 4: - if (get_user(d4, s)) - goto fault; - memcpy(buf, &d4, 4); - break; - case 8: - if (get_user(d8, s)) - goto fault; - memcpy(buf, &d8, 8); - break; - default: - WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); - return ES_UNSUPPORTED; - } - - return ES_OK; - -fault: - if (user_mode(ctxt->regs)) - error_code |= X86_PF_USER; - - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = error_code; - ctxt->fi.cr2 = (unsigned long)src; - - return ES_EXCEPTION; -} - -static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - unsigned long vaddr, phys_addr_t *paddr) -{ - unsigned long va = (unsigned long)vaddr; - unsigned int level; - phys_addr_t pa; - pgd_t *pgd; - pte_t *pte; - - pgd = __va(read_cr3_pa()); - pgd = &pgd[pgd_index(va)]; - pte = lookup_address_in_pgd(pgd, va, &level); - if (!pte) { - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.cr2 = vaddr; - ctxt->fi.error_code = 0; - - if (user_mode(ctxt->regs)) - ctxt->fi.error_code |= X86_PF_USER; - - return ES_EXCEPTION; - } - - if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC)) - /* Emulated MMIO to/from encrypted memory not supported */ - return ES_UNSUPPORTED; - - pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; - pa |= va & ~page_level_mask(level); - - *paddr = pa; - - return ES_OK; -} - -/* Include code shared with pre-decompression boot stage */ -#include "sev-es-shared.c" - -void noinstr __sev_es_nmi_complete(void) -{ - struct ghcb_state state; - struct ghcb *ghcb; - - ghcb = sev_es_get_ghcb(&state); - - vc_ghcb_invalidate(ghcb); - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE); - ghcb_set_sw_exit_info_1(ghcb, 0); - ghcb_set_sw_exit_info_2(ghcb, 0); - - sev_es_wr_ghcb_msr(__pa_nodebug(ghcb)); - VMGEXIT(); - - sev_es_put_ghcb(&state); -} - -static u64 get_jump_table_addr(void) -{ - struct ghcb_state state; - unsigned long flags; - struct ghcb *ghcb; - u64 ret = 0; - - local_irq_save(flags); - - ghcb = sev_es_get_ghcb(&state); - - vc_ghcb_invalidate(ghcb); - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE); - ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE); - ghcb_set_sw_exit_info_2(ghcb, 0); - - 
sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); - - if (ghcb_sw_exit_info_1_is_valid(ghcb) && - ghcb_sw_exit_info_2_is_valid(ghcb)) - ret = ghcb->save.sw_exit_info_2; - - sev_es_put_ghcb(&state); - - local_irq_restore(flags); - - return ret; -} - -int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) -{ - u16 startup_cs, startup_ip; - phys_addr_t jump_table_pa; - u64 jump_table_addr; - u16 __iomem *jump_table; - - jump_table_addr = get_jump_table_addr(); - - /* On UP guests there is no jump table so this is not a failure */ - if (!jump_table_addr) - return 0; - - /* Check if AP Jump Table is page-aligned */ - if (jump_table_addr & ~PAGE_MASK) - return -EINVAL; - - jump_table_pa = jump_table_addr & PAGE_MASK; - - startup_cs = (u16)(rmh->trampoline_start >> 4); - startup_ip = (u16)(rmh->sev_es_trampoline_start - - rmh->trampoline_start); - - jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE); - if (!jump_table) - return -EIO; - - writew(startup_ip, &jump_table[0]); - writew(startup_cs, &jump_table[1]); - - iounmap(jump_table); - - return 0; -} - -/* - * This is needed by the OVMF UEFI firmware which will use whatever it finds in - * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu - * runtime GHCBs used by the kernel are also mapped in the EFI page-table. - */ -int __init sev_es_efi_map_ghcbs(pgd_t *pgd) -{ - struct sev_es_runtime_data *data; - unsigned long address, pflags; - int cpu; - u64 pfn; - - if (!sev_es_active()) - return 0; - - pflags = _PAGE_NX | _PAGE_RW; - - for_each_possible_cpu(cpu) { - data = per_cpu(runtime_data, cpu); - - address = __pa(&data->ghcb_page); - pfn = address >> PAGE_SHIFT; - - if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags)) - return 1; - } - - return 0; -} - -static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - struct pt_regs *regs = ctxt->regs; - enum es_result ret; - u64 exit_info_1; - - /* Is it a WRMSR? */ - exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 1 : 0; - - ghcb_set_rcx(ghcb, regs->cx); - if (exit_info_1) { - ghcb_set_rax(ghcb, regs->ax); - ghcb_set_rdx(ghcb, regs->dx); - } - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0); - - if ((ret == ES_OK) && (!exit_info_1)) { - regs->ax = ghcb->save.rax; - regs->dx = ghcb->save.rdx; - } - - return ret; -} - -/* - * This function runs on the first #VC exception after the kernel - * switched to virtual addresses. - */ -static bool __init sev_es_setup_ghcb(void) -{ - /* First make sure the hypervisor talks a supported protocol. */ - if (!sev_es_negotiate_protocol()) - return false; - - /* - * Clear the boot_ghcb. The first exception comes in before the bss - * section is cleared. - */ - memset(&boot_ghcb_page, 0, PAGE_SIZE); - - /* Alright - Make the boot-ghcb public */ - boot_ghcb = &boot_ghcb_page; - - return true; -} - -#ifdef CONFIG_HOTPLUG_CPU -static void sev_es_ap_hlt_loop(void) -{ - struct ghcb_state state; - struct ghcb *ghcb; - - ghcb = sev_es_get_ghcb(&state); - - while (true) { - vc_ghcb_invalidate(ghcb); - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP); - ghcb_set_sw_exit_info_1(ghcb, 0); - ghcb_set_sw_exit_info_2(ghcb, 0); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); - - /* Wakeup signal? */ - if (ghcb_sw_exit_info_2_is_valid(ghcb) && - ghcb->save.sw_exit_info_2) - break; - } - - sev_es_put_ghcb(&state); -} - -/* - * Play_dead handler when running under SEV-ES. This is needed because - * the hypervisor can't deliver an SIPI request to restart the AP. 
- * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the - * hypervisor wakes it up again. - */ -static void sev_es_play_dead(void) -{ - play_dead_common(); - - /* IRQs now disabled */ - - sev_es_ap_hlt_loop(); - - /* - * If we get here, the VCPU was woken up again. Jump to CPU - * startup code to get it back online. - */ - start_cpu0(); -} -#else /* CONFIG_HOTPLUG_CPU */ -#define sev_es_play_dead native_play_dead -#endif /* CONFIG_HOTPLUG_CPU */ - -#ifdef CONFIG_SMP -static void __init sev_es_setup_play_dead(void) -{ - smp_ops.play_dead = sev_es_play_dead; -} -#else -static inline void sev_es_setup_play_dead(void) { } -#endif - -static void __init alloc_runtime_data(int cpu) -{ - struct sev_es_runtime_data *data; - - data = memblock_alloc(sizeof(*data), PAGE_SIZE); - if (!data) - panic("Can't allocate SEV-ES runtime data"); - - per_cpu(runtime_data, cpu) = data; -} - -static void __init init_ghcb(int cpu) -{ - struct sev_es_runtime_data *data; - int err; - - data = per_cpu(runtime_data, cpu); - - err = early_set_memory_decrypted((unsigned long)&data->ghcb_page, - sizeof(data->ghcb_page)); - if (err) - panic("Can't map GHCBs unencrypted"); - - memset(&data->ghcb_page, 0, sizeof(data->ghcb_page)); - - data->ghcb_active = false; - data->backup_ghcb_active = false; -} - -void __init sev_es_init_vc_handling(void) -{ - int cpu; - - BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE); - - if (!sev_es_active()) - return; - - if (!sev_es_check_cpu_features()) - panic("SEV-ES CPU Features missing"); - - /* Enable SEV-ES special handling */ - static_branch_enable(&sev_es_enable_key); - - /* Initialize per-cpu GHCB pages */ - for_each_possible_cpu(cpu) { - alloc_runtime_data(cpu); - init_ghcb(cpu); - setup_vc_stacks(cpu); - } - - sev_es_setup_play_dead(); - - /* Secondary CPUs use the runtime #VC handler */ - initial_vc_handler = (unsigned long)safe_stack_exc_vmm_communication; -} - -static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) -{ - int trapnr = ctxt->fi.vector; - - if (trapnr == X86_TRAP_PF) - native_write_cr2(ctxt->fi.cr2); - - ctxt->regs->orig_ax = ctxt->fi.error_code; - do_early_exception(ctxt->regs, trapnr); -} - -static long *vc_insn_get_reg(struct es_em_ctxt *ctxt) -{ - long *reg_array; - int offset; - - reg_array = (long *)ctxt->regs; - offset = insn_get_modrm_reg_off(&ctxt->insn, ctxt->regs); - - if (offset < 0) - return NULL; - - offset /= sizeof(long); - - return reg_array + offset; -} - -static long *vc_insn_get_rm(struct es_em_ctxt *ctxt) -{ - long *reg_array; - int offset; - - reg_array = (long *)ctxt->regs; - offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs); - - if (offset < 0) - return NULL; - - offset /= sizeof(long); - - return reg_array + offset; -} -static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - unsigned int bytes, bool read) -{ - u64 exit_code, exit_info_1, exit_info_2; - unsigned long ghcb_pa = __pa(ghcb); - enum es_result res; - phys_addr_t paddr; - void __user *ref; - - ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs); - if (ref == (void __user *)-1L) - return ES_UNSUPPORTED; - - exit_code = read ? 
SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE; - - res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr); - if (res != ES_OK) { - if (res == ES_EXCEPTION && !read) - ctxt->fi.error_code |= X86_PF_WRITE; - - return res; - } - - exit_info_1 = paddr; - /* Can never be greater than 8 */ - exit_info_2 = bytes; - - ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); - - return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); -} - -static enum es_result vc_handle_mmio_twobyte_ops(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct insn *insn = &ctxt->insn; - unsigned int bytes = 0; - enum es_result ret; - int sign_byte; - long *reg_data; - - switch (insn->opcode.bytes[1]) { - /* MMIO Read w/ zero-extension */ - case 0xb6: - bytes = 1; - fallthrough; - case 0xb7: - if (!bytes) - bytes = 2; - - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - /* Zero extend based on operand size */ - reg_data = vc_insn_get_reg(ctxt); - if (!reg_data) - return ES_DECODE_FAILED; - - memset(reg_data, 0, insn->opnd_bytes); - - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - - /* MMIO Read w/ sign-extension */ - case 0xbe: - bytes = 1; - fallthrough; - case 0xbf: - if (!bytes) - bytes = 2; - - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - /* Sign extend based on operand size */ - reg_data = vc_insn_get_reg(ctxt); - if (!reg_data) - return ES_DECODE_FAILED; - - if (bytes == 1) { - u8 *val = (u8 *)ghcb->shared_buffer; - - sign_byte = (*val & 0x80) ? 0xff : 0x00; - } else { - u16 *val = (u16 *)ghcb->shared_buffer; - - sign_byte = (*val & 0x8000) ? 0xff : 0x00; - } - memset(reg_data, sign_byte, insn->opnd_bytes); - - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - - default: - ret = ES_UNSUPPORTED; - } - - return ret; -} - -/* - * The MOVS instruction has two memory operands, which raises the - * problem that it is not known whether the access to the source or the - * destination caused the #VC exception (and hence whether an MMIO read - * or write operation needs to be emulated). - * - * Instead of playing games with walking page-tables and trying to guess - * whether the source or destination is an MMIO range, split the move - * into two operations, a read and a write with only one memory operand. - * This will cause a nested #VC exception on the MMIO address which can - * then be handled. - * - * This implementation has the benefit that it also supports MOVS where - * source _and_ destination are MMIO regions. - * - * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a - * rare operation. If it turns out to be a performance problem the split - * operations can be moved to memcpy_fromio() and memcpy_toio(). 
- */ -static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, - unsigned int bytes) -{ - unsigned long ds_base, es_base; - unsigned char *src, *dst; - unsigned char buffer[8]; - enum es_result ret; - bool rep; - int off; - - ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS); - es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); - - if (ds_base == -1L || es_base == -1L) { - ctxt->fi.vector = X86_TRAP_GP; - ctxt->fi.error_code = 0; - return ES_EXCEPTION; - } - - src = ds_base + (unsigned char *)ctxt->regs->si; - dst = es_base + (unsigned char *)ctxt->regs->di; - - ret = vc_read_mem(ctxt, src, buffer, bytes); - if (ret != ES_OK) - return ret; - - ret = vc_write_mem(ctxt, dst, buffer, bytes); - if (ret != ES_OK) - return ret; - - if (ctxt->regs->flags & X86_EFLAGS_DF) - off = -bytes; - else - off = bytes; - - ctxt->regs->si += off; - ctxt->regs->di += off; - - rep = insn_has_rep_prefix(&ctxt->insn); - if (rep) - ctxt->regs->cx -= 1; - - if (!rep || ctxt->regs->cx == 0) - return ES_OK; - else - return ES_RETRY; -} - -static enum es_result vc_handle_mmio(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct insn *insn = &ctxt->insn; - unsigned int bytes = 0; - enum es_result ret; - long *reg_data; - - switch (insn->opcode.bytes[0]) { - /* MMIO Write */ - case 0x88: - bytes = 1; - fallthrough; - case 0x89: - if (!bytes) - bytes = insn->opnd_bytes; - - reg_data = vc_insn_get_reg(ctxt); - if (!reg_data) - return ES_DECODE_FAILED; - - memcpy(ghcb->shared_buffer, reg_data, bytes); - - ret = vc_do_mmio(ghcb, ctxt, bytes, false); - break; - - case 0xc6: - bytes = 1; - fallthrough; - case 0xc7: - if (!bytes) - bytes = insn->opnd_bytes; - - memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); - - ret = vc_do_mmio(ghcb, ctxt, bytes, false); - break; - - /* MMIO Read */ - case 0x8a: - bytes = 1; - fallthrough; - case 0x8b: - if (!bytes) - bytes = insn->opnd_bytes; - - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - reg_data = vc_insn_get_reg(ctxt); - if (!reg_data) - return ES_DECODE_FAILED; - - /* Zero-extend for 32-bit operation */ - if (bytes == 4) - *reg_data = 0; - - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - - /* MOVS instruction */ - case 0xa4: - bytes = 1; - fallthrough; - case 0xa5: - if (!bytes) - bytes = insn->opnd_bytes; - - ret = vc_handle_mmio_movs(ctxt, bytes); - break; - /* Two-Byte Opcodes */ - case 0x0f: - ret = vc_handle_mmio_twobyte_ops(ghcb, ctxt); - break; - default: - ret = ES_UNSUPPORTED; - } - - return ret; -} - -static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct sev_es_runtime_data *data = this_cpu_read(runtime_data); - long val, *reg = vc_insn_get_rm(ctxt); - enum es_result ret; - - if (!reg) - return ES_DECODE_FAILED; - - val = *reg; - - /* Upper 32 bits must be written as zeroes */ - if (val >> 32) { - ctxt->fi.vector = X86_TRAP_GP; - ctxt->fi.error_code = 0; - return ES_EXCEPTION; - } - - /* Clear out other reserved bits and set bit 10 */ - val = (val & 0xffff23ffL) | BIT(10); - - /* Early non-zero writes to DR7 are not supported */ - if (!data && (val & ~DR7_RESET_VALUE)) - return ES_UNSUPPORTED; - - /* Using a value of 0 for ExitInfo1 means RAX holds the value */ - ghcb_set_rax(ghcb, val); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); - if (ret != ES_OK) - return ret; - - if (data) - data->dr7 = val; - - return ES_OK; -} - -static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct 
sev_es_runtime_data *data = this_cpu_read(runtime_data); - long *reg = vc_insn_get_rm(ctxt); - - if (!reg) - return ES_DECODE_FAILED; - - if (data) - *reg = data->dr7; - else - *reg = DR7_RESET_VALUE; - - return ES_OK; -} - -static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0); -} - -static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - enum es_result ret; - - ghcb_set_rcx(ghcb, ctxt->regs->cx); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb))) - return ES_VMM_ERROR; - - ctxt->regs->ax = ghcb->save.rax; - ctxt->regs->dx = ghcb->save.rdx; - - return ES_OK; -} - -static enum es_result vc_handle_monitor(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - /* - * Treat it as a NOP and do not leak a physical address to the - * hypervisor. - */ - return ES_OK; -} - -static enum es_result vc_handle_mwait(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - /* Treat the same as MONITOR/MONITORX */ - return ES_OK; -} - -static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - enum es_result ret; - - ghcb_set_rax(ghcb, ctxt->regs->ax); - ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0); - - if (x86_platform.hyper.sev_es_hcall_prepare) - x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0); - if (ret != ES_OK) - return ret; - - if (!ghcb_rax_is_valid(ghcb)) - return ES_VMM_ERROR; - - ctxt->regs->ax = ghcb->save.rax; - - /* - * Call sev_es_hcall_finish() after regs->ax is already set. - * This allows the hypervisor handler to overwrite it again if - * necessary. - */ - if (x86_platform.hyper.sev_es_hcall_finish && - !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs)) - return ES_VMM_ERROR; - - return ES_OK; -} - -static enum es_result vc_handle_trap_ac(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - /* - * Calling ecx_alignment_check() directly does not work, because it - * enables IRQs and the GHCB is active. Forward the exception and call - * it later from vc_forward_exception(). - */ - ctxt->fi.vector = X86_TRAP_AC; - ctxt->fi.error_code = 0; - return ES_EXCEPTION; -} - -static __always_inline void vc_handle_trap_db(struct pt_regs *regs) -{ - if (user_mode(regs)) - noist_exc_debug(regs); - else - exc_debug(regs); -} - -static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, - struct ghcb *ghcb, - unsigned long exit_code) -{ - enum es_result result; - - switch (exit_code) { - case SVM_EXIT_READ_DR7: - result = vc_handle_dr7_read(ghcb, ctxt); - break; - case SVM_EXIT_WRITE_DR7: - result = vc_handle_dr7_write(ghcb, ctxt); - break; - case SVM_EXIT_EXCP_BASE + X86_TRAP_AC: - result = vc_handle_trap_ac(ghcb, ctxt); - break; - case SVM_EXIT_RDTSC: - case SVM_EXIT_RDTSCP: - result = vc_handle_rdtsc(ghcb, ctxt, exit_code); - break; - case SVM_EXIT_RDPMC: - result = vc_handle_rdpmc(ghcb, ctxt); - break; - case SVM_EXIT_INVD: - pr_err_ratelimited("#VC exception for INVD??? 
Seriously???\n"); - result = ES_UNSUPPORTED; - break; - case SVM_EXIT_CPUID: - result = vc_handle_cpuid(ghcb, ctxt); - break; - case SVM_EXIT_IOIO: - result = vc_handle_ioio(ghcb, ctxt); - break; - case SVM_EXIT_MSR: - result = vc_handle_msr(ghcb, ctxt); - break; - case SVM_EXIT_VMMCALL: - result = vc_handle_vmmcall(ghcb, ctxt); - break; - case SVM_EXIT_WBINVD: - result = vc_handle_wbinvd(ghcb, ctxt); - break; - case SVM_EXIT_MONITOR: - result = vc_handle_monitor(ghcb, ctxt); - break; - case SVM_EXIT_MWAIT: - result = vc_handle_mwait(ghcb, ctxt); - break; - case SVM_EXIT_NPF: - result = vc_handle_mmio(ghcb, ctxt); - break; - default: - /* - * Unexpected #VC exception - */ - result = ES_UNSUPPORTED; - } - - return result; -} - -static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt) -{ - long error_code = ctxt->fi.error_code; - int trapnr = ctxt->fi.vector; - - ctxt->regs->orig_ax = ctxt->fi.error_code; - - switch (trapnr) { - case X86_TRAP_GP: - exc_general_protection(ctxt->regs, error_code); - break; - case X86_TRAP_UD: - exc_invalid_op(ctxt->regs); - break; - case X86_TRAP_AC: - exc_alignment_check(ctxt->regs, error_code); - break; - default: - pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n"); - BUG(); - } -} - -static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs) -{ - unsigned long sp = (unsigned long)regs; - - return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); -} - -/* - * Main #VC exception handler. It is called when the entry code was able to - * switch off the IST to a safe kernel stack. - * - * With the current implementation it is always possible to switch to a safe - * stack because #VC exceptions only happen at known places, like intercepted - * instructions or accesses to MMIO areas/IO ports. They can also happen with - * code instrumentation when the hypervisor intercepts #DB, but the critical - * paths are forbidden to be instrumented, so #DB exceptions currently also - * only happen in safe places. - */ -DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) -{ - struct sev_es_runtime_data *data = this_cpu_read(runtime_data); - irqentry_state_t irq_state; - struct ghcb_state state; - struct es_em_ctxt ctxt; - enum es_result result; - struct ghcb *ghcb; - - /* - * Handle #DB before calling into !noinstr code to avoid recursive #DB. - */ - if (error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB) { - vc_handle_trap_db(regs); - return; - } - - irq_state = irqentry_nmi_enter(regs); - lockdep_assert_irqs_disabled(); - instrumentation_begin(); - - /* - * This is invoked through an interrupt gate, so IRQs are disabled. The - * code below might walk page-tables for user or kernel addresses, so - * keep the IRQs disabled to protect us against concurrent TLB flushes. - */ - - ghcb = sev_es_get_ghcb(&state); - if (!ghcb) { - /* - * Mark GHCBs inactive so that panic() is able to print the - * message. - */ - data->ghcb_active = false; - data->backup_ghcb_active = false; - - panic("Unable to handle #VC exception! 
GHCB and Backup GHCB are already in use"); - } - - vc_ghcb_invalidate(ghcb); - result = vc_init_em_ctxt(&ctxt, regs, error_code); - - if (result == ES_OK) - result = vc_handle_exitcode(&ctxt, ghcb, error_code); - - sev_es_put_ghcb(&state); - - /* Done - now check the result */ - switch (result) { - case ES_OK: - vc_finish_insn(&ctxt); - break; - case ES_UNSUPPORTED: - pr_err_ratelimited("Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", - error_code, regs->ip); - goto fail; - case ES_VMM_ERROR: - pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", - error_code, regs->ip); - goto fail; - case ES_DECODE_FAILED: - pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", - error_code, regs->ip); - goto fail; - case ES_EXCEPTION: - vc_forward_exception(&ctxt); - break; - case ES_RETRY: - /* Nothing to do */ - break; - default: - pr_emerg("Unknown result in %s():%d\n", __func__, result); - /* - * Emulating the instruction which caused the #VC exception - * failed - can't continue so print debug information - */ - BUG(); - } - -out: - instrumentation_end(); - irqentry_nmi_exit(regs, irq_state); - - return; - -fail: - if (user_mode(regs)) { - /* - * Do not kill the machine if user-space triggered the - * exception. Send SIGBUS instead and let user-space deal with - * it. - */ - force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); - } else { - pr_emerg("PANIC: Unhandled #VC exception in kernel space (result=%d)\n", - result); - - /* Show some debug info */ - show_regs(regs); - - /* Ask hypervisor to sev_es_terminate */ - sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); - - /* If that fails and we get here - just panic */ - panic("Returned from Terminate-Request to Hypervisor\n"); - } - - goto out; -} - -/* This handler runs on the #VC fall-back stack. 
It can cause further #VC exceptions */ -DEFINE_IDTENTRY_VC_IST(exc_vmm_communication) -{ - instrumentation_begin(); - panic("Can't handle #VC exception from unsupported context\n"); - instrumentation_end(); -} - -DEFINE_IDTENTRY_VC(exc_vmm_communication) -{ - if (likely(!on_vc_fallback_stack(regs))) - safe_stack_exc_vmm_communication(regs, error_code); - else - ist_exc_vmm_communication(regs, error_code); -} - -bool __init handle_vc_boot_ghcb(struct pt_regs *regs) -{ - unsigned long exit_code = regs->orig_ax; - struct es_em_ctxt ctxt; - enum es_result result; - - /* Do initial setup or terminate the guest */ - if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb())) - sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); - - vc_ghcb_invalidate(boot_ghcb); - - result = vc_init_em_ctxt(&ctxt, regs, exit_code); - if (result == ES_OK) - result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code); - - /* Done - now check the result */ - switch (result) { - case ES_OK: - vc_finish_insn(&ctxt); - break; - case ES_UNSUPPORTED: - early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", - exit_code, regs->ip); - goto fail; - case ES_VMM_ERROR: - early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", - exit_code, regs->ip); - goto fail; - case ES_DECODE_FAILED: - early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", - exit_code, regs->ip); - goto fail; - case ES_EXCEPTION: - vc_early_forward_exception(&ctxt); - break; - case ES_RETRY: - /* Nothing to do */ - break; - default: - BUG(); - } - - return true; - -fail: - show_regs(regs); - - while (true) - halt(); -} diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c new file mode 100644 index 000000000000..0aa9f13efd57 --- /dev/null +++ b/arch/x86/kernel/sev-shared.c @@ -0,0 +1,525 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Encrypted Register State Support + * + * Author: Joerg Roedel + * + * This file is not compiled stand-alone. It contains code shared + * between the pre-decompression boot code and the running Linux kernel + * and is included directly into both code-bases. + */ + +#ifndef __BOOT_COMPRESSED +#define error(v) pr_err(v) +#define has_cpuflag(f) boot_cpu_has(f) +#endif + +static bool __init sev_es_check_cpu_features(void) +{ + if (!has_cpuflag(X86_FEATURE_RDRAND)) { + error("RDRAND instruction not supported - no trusted source of randomness available\n"); + return false; + } + + return true; +} + +static void __noreturn sev_es_terminate(unsigned int reason) +{ + u64 val = GHCB_SEV_TERMINATE; + + /* + * Tell the hypervisor what went wrong - only reason-set 0 is + * currently supported. 
+ */ + val |= GHCB_SEV_TERMINATE_REASON(0, reason); + + /* Request Guest Termination from Hypvervisor */ + sev_es_wr_ghcb_msr(val); + VMGEXIT(); + + while (true) + asm volatile("hlt\n" : : : "memory"); +} + +static bool sev_es_negotiate_protocol(void) +{ + u64 val; + + /* Do the GHCB protocol version negotiation */ + sev_es_wr_ghcb_msr(GHCB_SEV_INFO_REQ); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + + if (GHCB_INFO(val) != GHCB_SEV_INFO) + return false; + + if (GHCB_PROTO_MAX(val) < GHCB_PROTO_OUR || + GHCB_PROTO_MIN(val) > GHCB_PROTO_OUR) + return false; + + return true; +} + +static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) +{ + memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); +} + +static bool vc_decoding_needed(unsigned long exit_code) +{ + /* Exceptions don't require to decode the instruction */ + return !(exit_code >= SVM_EXIT_EXCP_BASE && + exit_code <= SVM_EXIT_LAST_EXCP); +} + +static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt, + struct pt_regs *regs, + unsigned long exit_code) +{ + enum es_result ret = ES_OK; + + memset(ctxt, 0, sizeof(*ctxt)); + ctxt->regs = regs; + + if (vc_decoding_needed(exit_code)) + ret = vc_decode_insn(ctxt); + + return ret; +} + +static void vc_finish_insn(struct es_em_ctxt *ctxt) +{ + ctxt->regs->ip += ctxt->insn.length; +} + +static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + u64 exit_code, u64 exit_info_1, + u64 exit_info_2) +{ + enum es_result ret; + + /* Fill in protocol and format specifiers */ + ghcb->protocol_version = GHCB_PROTOCOL_MAX; + ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; + + ghcb_set_sw_exit_code(ghcb, exit_code); + ghcb_set_sw_exit_info_1(ghcb, exit_info_1); + ghcb_set_sw_exit_info_2(ghcb, exit_info_2); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + if ((ghcb->save.sw_exit_info_1 & 0xffffffff) == 1) { + u64 info = ghcb->save.sw_exit_info_2; + unsigned long v; + + info = ghcb->save.sw_exit_info_2; + v = info & SVM_EVTINJ_VEC_MASK; + + /* Check if exception information from hypervisor is sane. */ + if ((info & SVM_EVTINJ_VALID) && + ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && + ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { + ctxt->fi.vector = v; + if (info & SVM_EVTINJ_VALID_ERR) + ctxt->fi.error_code = info >> 32; + ret = ES_EXCEPTION; + } else { + ret = ES_VMM_ERROR; + } + } else { + ret = ES_OK; + } + + return ret; +} + +/* + * Boot VC Handler - This is the first VC handler during boot, there is no GHCB + * page yet, so it only supports the MSR based communication with the + * hypervisor and only the CPUID exit-code. 
+ */ +void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) +{ + unsigned int fn = lower_bits(regs->ax, 32); + unsigned long val; + + /* Only CPUID is supported via MSR protocol */ + if (exit_code != SVM_EXIT_CPUID) + goto fail; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + goto fail; + regs->ax = val >> 32; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EBX)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + goto fail; + regs->bx = val >> 32; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_ECX)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + goto fail; + regs->cx = val >> 32; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EDX)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + goto fail; + regs->dx = val >> 32; + + /* + * This is a VC handler and the #VC is only raised when SEV-ES is + * active, which means SEV must be active too. Do sanity checks on the + * CPUID results to make sure the hypervisor does not trick the kernel + * into the no-sev path. This could map sensitive data unencrypted and + * make it accessible to the hypervisor. + * + * In particular, check for: + * - Availability of CPUID leaf 0x8000001f + * - SEV CPUID bit. + * + * The hypervisor might still report the wrong C-bit position, but this + * can't be checked here. + */ + + if (fn == 0x80000000 && (regs->ax < 0x8000001f)) + /* SEV leaf check */ + goto fail; + else if ((fn == 0x8000001f && !(regs->ax & BIT(1)))) + /* SEV bit */ + goto fail; + + /* Skip over the CPUID two-byte opcode */ + regs->ip += 2; + + return; + +fail: + /* Terminate the guest */ + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); +} + +static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, + void *src, char *buf, + unsigned int data_size, + unsigned int count, + bool backwards) +{ + int i, b = backwards ? -1 : 1; + enum es_result ret = ES_OK; + + for (i = 0; i < count; i++) { + void *s = src + (i * data_size * b); + char *d = buf + (i * data_size); + + ret = vc_read_mem(ctxt, s, d, data_size); + if (ret != ES_OK) + break; + } + + return ret; +} + +static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, + void *dst, char *buf, + unsigned int data_size, + unsigned int count, + bool backwards) +{ + int i, s = backwards ? 
-1 : 1; + enum es_result ret = ES_OK; + + for (i = 0; i < count; i++) { + void *d = dst + (i * data_size * s); + char *b = buf + (i * data_size); + + ret = vc_write_mem(ctxt, d, b, data_size); + if (ret != ES_OK) + break; + } + + return ret; +} + +#define IOIO_TYPE_STR BIT(2) +#define IOIO_TYPE_IN 1 +#define IOIO_TYPE_INS (IOIO_TYPE_IN | IOIO_TYPE_STR) +#define IOIO_TYPE_OUT 0 +#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR) + +#define IOIO_REP BIT(3) + +#define IOIO_ADDR_64 BIT(9) +#define IOIO_ADDR_32 BIT(8) +#define IOIO_ADDR_16 BIT(7) + +#define IOIO_DATA_32 BIT(6) +#define IOIO_DATA_16 BIT(5) +#define IOIO_DATA_8 BIT(4) + +#define IOIO_SEG_ES (0 << 10) +#define IOIO_SEG_DS (3 << 10) + +static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) +{ + struct insn *insn = &ctxt->insn; + *exitinfo = 0; + + switch (insn->opcode.bytes[0]) { + /* INS opcodes */ + case 0x6c: + case 0x6d: + *exitinfo |= IOIO_TYPE_INS; + *exitinfo |= IOIO_SEG_ES; + *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + break; + + /* OUTS opcodes */ + case 0x6e: + case 0x6f: + *exitinfo |= IOIO_TYPE_OUTS; + *exitinfo |= IOIO_SEG_DS; + *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + break; + + /* IN immediate opcodes */ + case 0xe4: + case 0xe5: + *exitinfo |= IOIO_TYPE_IN; + *exitinfo |= (u8)insn->immediate.value << 16; + break; + + /* OUT immediate opcodes */ + case 0xe6: + case 0xe7: + *exitinfo |= IOIO_TYPE_OUT; + *exitinfo |= (u8)insn->immediate.value << 16; + break; + + /* IN register opcodes */ + case 0xec: + case 0xed: + *exitinfo |= IOIO_TYPE_IN; + *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + break; + + /* OUT register opcodes */ + case 0xee: + case 0xef: + *exitinfo |= IOIO_TYPE_OUT; + *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + break; + + default: + return ES_DECODE_FAILED; + } + + switch (insn->opcode.bytes[0]) { + case 0x6c: + case 0x6e: + case 0xe4: + case 0xe6: + case 0xec: + case 0xee: + /* Single byte opcodes */ + *exitinfo |= IOIO_DATA_8; + break; + default: + /* Length determined by instruction parsing */ + *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16 + : IOIO_DATA_32; + } + switch (insn->addr_bytes) { + case 2: + *exitinfo |= IOIO_ADDR_16; + break; + case 4: + *exitinfo |= IOIO_ADDR_32; + break; + case 8: + *exitinfo |= IOIO_ADDR_64; + break; + } + + if (insn_has_rep_prefix(insn)) + *exitinfo |= IOIO_REP; + + return ES_OK; +} + +static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + u64 exit_info_1, exit_info_2; + enum es_result ret; + + ret = vc_ioio_exitinfo(ctxt, &exit_info_1); + if (ret != ES_OK) + return ret; + + if (exit_info_1 & IOIO_TYPE_STR) { + + /* (REP) INS/OUTS */ + + bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF); + unsigned int io_bytes, exit_bytes; + unsigned int ghcb_count, op_count; + unsigned long es_base; + u64 sw_scratch; + + /* + * For the string variants with rep prefix the amount of in/out + * operations per #VC exception is limited so that the kernel + * has a chance to take interrupts and re-schedule while the + * instruction is emulated. + */ + io_bytes = (exit_info_1 >> 4) & 0x7; + ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes; + + op_count = (exit_info_1 & IOIO_REP) ? 
regs->cx : 1; + exit_info_2 = min(op_count, ghcb_count); + exit_bytes = exit_info_2 * io_bytes; + + es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); + + /* Read bytes of OUTS into the shared buffer */ + if (!(exit_info_1 & IOIO_TYPE_IN)) { + ret = vc_insn_string_read(ctxt, + (void *)(es_base + regs->si), + ghcb->shared_buffer, io_bytes, + exit_info_2, df); + if (ret) + return ret; + } + + /* + * Issue an VMGEXIT to the HV to consume the bytes from the + * shared buffer or to have it write them into the shared buffer + * depending on the instruction: OUTS or INS. + */ + sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer); + ghcb_set_sw_scratch(ghcb, sw_scratch); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, + exit_info_1, exit_info_2); + if (ret != ES_OK) + return ret; + + /* Read bytes from shared buffer into the guest's destination. */ + if (exit_info_1 & IOIO_TYPE_IN) { + ret = vc_insn_string_write(ctxt, + (void *)(es_base + regs->di), + ghcb->shared_buffer, io_bytes, + exit_info_2, df); + if (ret) + return ret; + + if (df) + regs->di -= exit_bytes; + else + regs->di += exit_bytes; + } else { + if (df) + regs->si -= exit_bytes; + else + regs->si += exit_bytes; + } + + if (exit_info_1 & IOIO_REP) + regs->cx -= exit_info_2; + + ret = regs->cx ? ES_RETRY : ES_OK; + + } else { + + /* IN/OUT into/from rAX */ + + int bits = (exit_info_1 & 0x70) >> 1; + u64 rax = 0; + + if (!(exit_info_1 & IOIO_TYPE_IN)) + rax = lower_bits(regs->ax, bits); + + ghcb_set_rax(ghcb, rax); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0); + if (ret != ES_OK) + return ret; + + if (exit_info_1 & IOIO_TYPE_IN) { + if (!ghcb_rax_is_valid(ghcb)) + return ES_VMM_ERROR; + regs->ax = lower_bits(ghcb->save.rax, bits); + } + } + + return ret; +} + +static enum es_result vc_handle_cpuid(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + u32 cr4 = native_read_cr4(); + enum es_result ret; + + ghcb_set_rax(ghcb, regs->ax); + ghcb_set_rcx(ghcb, regs->cx); + + if (cr4 & X86_CR4_OSXSAVE) + /* Safe to read xcr0 */ + ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); + else + /* xgetbv will cause #GP - use reset value for xcr0 */ + ghcb_set_xcr0(ghcb, 1); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && + ghcb_rbx_is_valid(ghcb) && + ghcb_rcx_is_valid(ghcb) && + ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + regs->ax = ghcb->save.rax; + regs->bx = ghcb->save.rbx; + regs->cx = ghcb->save.rcx; + regs->dx = ghcb->save.rdx; + + return ES_OK; +} + +static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + unsigned long exit_code) +{ + bool rdtscp = (exit_code == SVM_EXIT_RDTSCP); + enum es_result ret; + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) && + (!rdtscp || ghcb_rcx_is_valid(ghcb)))) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + ctxt->regs->dx = ghcb->save.rdx; + if (rdtscp) + ctxt->regs->cx = ghcb->save.rcx; + + return ES_OK; +} diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c new file mode 100644 index 000000000000..9578c82832aa --- /dev/null +++ b/arch/x86/kernel/sev.c @@ -0,0 +1,1461 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2019 SUSE + * + * Author: Joerg Roedel + */ + +#define pr_fmt(fmt) "SEV-ES: " fmt + +#include /* For 
show_regs() */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DR7_RESET_VALUE 0x400 + +/* For early boot hypervisor communication in SEV-ES enabled guests */ +static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); + +/* + * Needs to be in the .data section because we need it NULL before bss is + * cleared + */ +static struct ghcb __initdata *boot_ghcb; + +/* #VC handler runtime per-CPU data */ +struct sev_es_runtime_data { + struct ghcb ghcb_page; + + /* Physical storage for the per-CPU IST stack of the #VC handler */ + char ist_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE); + + /* + * Physical storage for the per-CPU fall-back stack of the #VC handler. + * The fall-back stack is used when it is not safe to switch back to the + * interrupted stack in the #VC entry code. + */ + char fallback_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE); + + /* + * Reserve one page per CPU as backup storage for the unencrypted GHCB. + * It is needed when an NMI happens while the #VC handler uses the real + * GHCB, and the NMI handler itself is causing another #VC exception. In + * that case the GHCB content of the first handler needs to be backed up + * and restored. + */ + struct ghcb backup_ghcb; + + /* + * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions. + * There is no need for it to be atomic, because nothing is written to + * the GHCB between the read and the write of ghcb_active. So it is safe + * to use it when a nested #VC exception happens before the write. + * + * This is necessary for example in the #VC->NMI->#VC case when the NMI + * happens while the first #VC handler uses the GHCB. When the NMI code + * raises a second #VC handler it might overwrite the contents of the + * GHCB written by the first handler. To avoid this the content of the + * GHCB is saved and restored when the GHCB is detected to be in use + * already. + */ + bool ghcb_active; + bool backup_ghcb_active; + + /* + * Cached DR7 value - write it on DR7 writes and return it on reads. + * That value will never make it to the real hardware DR7 as debugging + * is currently unsupported in SEV-ES guests. 
+ */ + unsigned long dr7; +}; + +struct ghcb_state { + struct ghcb *ghcb; +}; + +static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); +DEFINE_STATIC_KEY_FALSE(sev_es_enable_key); + +/* Needed in vc_early_forward_exception */ +void do_early_exception(struct pt_regs *regs, int trapnr); + +static void __init setup_vc_stacks(int cpu) +{ + struct sev_es_runtime_data *data; + struct cpu_entry_area *cea; + unsigned long vaddr; + phys_addr_t pa; + + data = per_cpu(runtime_data, cpu); + cea = get_cpu_entry_area(cpu); + + /* Map #VC IST stack */ + vaddr = CEA_ESTACK_BOT(&cea->estacks, VC); + pa = __pa(data->ist_stack); + cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); + + /* Map VC fall-back stack */ + vaddr = CEA_ESTACK_BOT(&cea->estacks, VC2); + pa = __pa(data->fallback_stack); + cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); +} + +static __always_inline bool on_vc_stack(struct pt_regs *regs) +{ + unsigned long sp = regs->sp; + + /* User-mode RSP is not trusted */ + if (user_mode(regs)) + return false; + + /* SYSCALL gap still has user-mode RSP */ + if (ip_within_syscall_gap(regs)) + return false; + + return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); +} + +/* + * This function handles the case when an NMI is raised in the #VC + * exception handler entry code, before the #VC handler has switched off + * its IST stack. In this case, the IST entry for #VC must be adjusted, + * so that any nested #VC exception will not overwrite the stack + * contents of the interrupted #VC handler. + * + * The IST entry is adjusted unconditionally so that it can be also be + * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a + * nested sev_es_ist_exit() call may adjust back the IST entry too + * early. + * + * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run + * on the NMI IST stack, as they are only called from NMI handling code + * right now. + */ +void noinstr __sev_es_ist_enter(struct pt_regs *regs) +{ + unsigned long old_ist, new_ist; + + /* Read old IST entry */ + new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + + /* + * If NMI happened while on the #VC IST stack, set the new IST + * value below regs->sp, so that the interrupted stack frame is + * not overwritten by subsequent #VC exceptions. + */ + if (on_vc_stack(regs)) + new_ist = regs->sp; + + /* + * Reserve additional 8 bytes and store old IST value so this + * adjustment can be unrolled in __sev_es_ist_exit(). 
+ */ + new_ist -= sizeof(old_ist); + *(unsigned long *)new_ist = old_ist; + + /* Set new IST entry */ + this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist); +} + +void noinstr __sev_es_ist_exit(void) +{ + unsigned long ist; + + /* Read IST entry */ + ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + + if (WARN_ON(ist == __this_cpu_ist_top_va(VC))) + return; + + /* Read back old IST entry and write it to the TSS */ + this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist); +} + +static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (unlikely(data->ghcb_active)) { + /* GHCB is already in use - save its contents */ + + if (unlikely(data->backup_ghcb_active)) + return NULL; + + /* Mark backup_ghcb active before writing to it */ + data->backup_ghcb_active = true; + + state->ghcb = &data->backup_ghcb; + + /* Backup GHCB content */ + *state->ghcb = *ghcb; + } else { + state->ghcb = NULL; + data->ghcb_active = true; + } + + return ghcb; +} + +static __always_inline void sev_es_put_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (state->ghcb) { + /* Restore GHCB from Backup */ + *ghcb = *state->ghcb; + data->backup_ghcb_active = false; + state->ghcb = NULL; + } else { + data->ghcb_active = false; + } +} + +/* Needed in vc_early_forward_exception */ +void do_early_exception(struct pt_regs *regs, int trapnr); + +static inline u64 sev_es_rd_ghcb_msr(void) +{ + return __rdmsr(MSR_AMD64_SEV_ES_GHCB); +} + +static __always_inline void sev_es_wr_ghcb_msr(u64 val) +{ + u32 low, high; + + low = (u32)(val); + high = (u32)(val >> 32); + + native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); +} + +static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, + unsigned char *buffer) +{ + return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); +} + +static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + int res; + + res = insn_fetch_from_user_inatomic(ctxt->regs, buffer); + if (!res) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; + ctxt->fi.cr2 = ctxt->regs->ip; + return ES_EXCEPTION; + } + + if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, res)) + return ES_DECODE_FAILED; + + if (ctxt->insn.immediate.got) + return ES_OK; + else + return ES_DECODE_FAILED; +} + +static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + int res, ret; + + res = vc_fetch_insn_kernel(ctxt, buffer); + if (res) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR; + ctxt->fi.cr2 = ctxt->regs->ip; + return ES_EXCEPTION; + } + + ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); + if (ret < 0) + return ES_DECODE_FAILED; + else + return ES_OK; +} + +static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +{ + if (user_mode(ctxt->regs)) + return __vc_decode_user_insn(ctxt); + else + return __vc_decode_kern_insn(ctxt); +} + +static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, + char *dst, char *buf, size_t size) +{ + unsigned long error_code = X86_PF_PROT | X86_PF_WRITE; + char __user *target = (char __user *)dst; + u64 d8; + u32 d4; + u16 d2; + u8 d1; + + /* If instruction ran in kernel mode and the I/O buffer 
is in kernel space */ + if (!user_mode(ctxt->regs) && !access_ok(target, size)) { + memcpy(dst, buf, size); + return ES_OK; + } + + switch (size) { + case 1: + memcpy(&d1, buf, 1); + if (put_user(d1, target)) + goto fault; + break; + case 2: + memcpy(&d2, buf, 2); + if (put_user(d2, target)) + goto fault; + break; + case 4: + memcpy(&d4, buf, 4); + if (put_user(d4, target)) + goto fault; + break; + case 8: + memcpy(&d8, buf, 8); + if (put_user(d8, target)) + goto fault; + break; + default: + WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); + return ES_UNSUPPORTED; + } + + return ES_OK; + +fault: + if (user_mode(ctxt->regs)) + error_code |= X86_PF_USER; + + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = error_code; + ctxt->fi.cr2 = (unsigned long)dst; + + return ES_EXCEPTION; +} + +static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, + char *src, char *buf, size_t size) +{ + unsigned long error_code = X86_PF_PROT; + char __user *s = (char __user *)src; + u64 d8; + u32 d4; + u16 d2; + u8 d1; + + /* If instruction ran in kernel mode and the I/O buffer is in kernel space */ + if (!user_mode(ctxt->regs) && !access_ok(s, size)) { + memcpy(buf, src, size); + return ES_OK; + } + + switch (size) { + case 1: + if (get_user(d1, s)) + goto fault; + memcpy(buf, &d1, 1); + break; + case 2: + if (get_user(d2, s)) + goto fault; + memcpy(buf, &d2, 2); + break; + case 4: + if (get_user(d4, s)) + goto fault; + memcpy(buf, &d4, 4); + break; + case 8: + if (get_user(d8, s)) + goto fault; + memcpy(buf, &d8, 8); + break; + default: + WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); + return ES_UNSUPPORTED; + } + + return ES_OK; + +fault: + if (user_mode(ctxt->regs)) + error_code |= X86_PF_USER; + + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = error_code; + ctxt->fi.cr2 = (unsigned long)src; + + return ES_EXCEPTION; +} + +static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + unsigned long vaddr, phys_addr_t *paddr) +{ + unsigned long va = (unsigned long)vaddr; + unsigned int level; + phys_addr_t pa; + pgd_t *pgd; + pte_t *pte; + + pgd = __va(read_cr3_pa()); + pgd = &pgd[pgd_index(va)]; + pte = lookup_address_in_pgd(pgd, va, &level); + if (!pte) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.cr2 = vaddr; + ctxt->fi.error_code = 0; + + if (user_mode(ctxt->regs)) + ctxt->fi.error_code |= X86_PF_USER; + + return ES_EXCEPTION; + } + + if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC)) + /* Emulated MMIO to/from encrypted memory not supported */ + return ES_UNSUPPORTED; + + pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; + pa |= va & ~page_level_mask(level); + + *paddr = pa; + + return ES_OK; +} + +/* Include code shared with pre-decompression boot stage */ +#include "sev-shared.c" + +void noinstr __sev_es_nmi_complete(void) +{ + struct ghcb_state state; + struct ghcb *ghcb; + + ghcb = sev_es_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE); + ghcb_set_sw_exit_info_1(ghcb, 0); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa_nodebug(ghcb)); + VMGEXIT(); + + sev_es_put_ghcb(&state); +} + +static u64 get_jump_table_addr(void) +{ + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + u64 ret = 0; + + local_irq_save(flags); + + ghcb = sev_es_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE); + ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE); + ghcb_set_sw_exit_info_2(ghcb, 0); + + 
sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + if (ghcb_sw_exit_info_1_is_valid(ghcb) && + ghcb_sw_exit_info_2_is_valid(ghcb)) + ret = ghcb->save.sw_exit_info_2; + + sev_es_put_ghcb(&state); + + local_irq_restore(flags); + + return ret; +} + +int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) +{ + u16 startup_cs, startup_ip; + phys_addr_t jump_table_pa; + u64 jump_table_addr; + u16 __iomem *jump_table; + + jump_table_addr = get_jump_table_addr(); + + /* On UP guests there is no jump table so this is not a failure */ + if (!jump_table_addr) + return 0; + + /* Check if AP Jump Table is page-aligned */ + if (jump_table_addr & ~PAGE_MASK) + return -EINVAL; + + jump_table_pa = jump_table_addr & PAGE_MASK; + + startup_cs = (u16)(rmh->trampoline_start >> 4); + startup_ip = (u16)(rmh->sev_es_trampoline_start - + rmh->trampoline_start); + + jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE); + if (!jump_table) + return -EIO; + + writew(startup_ip, &jump_table[0]); + writew(startup_cs, &jump_table[1]); + + iounmap(jump_table); + + return 0; +} + +/* + * This is needed by the OVMF UEFI firmware which will use whatever it finds in + * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu + * runtime GHCBs used by the kernel are also mapped in the EFI page-table. + */ +int __init sev_es_efi_map_ghcbs(pgd_t *pgd) +{ + struct sev_es_runtime_data *data; + unsigned long address, pflags; + int cpu; + u64 pfn; + + if (!sev_es_active()) + return 0; + + pflags = _PAGE_NX | _PAGE_RW; + + for_each_possible_cpu(cpu) { + data = per_cpu(runtime_data, cpu); + + address = __pa(&data->ghcb_page); + pfn = address >> PAGE_SHIFT; + + if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags)) + return 1; + } + + return 0; +} + +static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + enum es_result ret; + u64 exit_info_1; + + /* Is it a WRMSR? */ + exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 1 : 0; + + ghcb_set_rcx(ghcb, regs->cx); + if (exit_info_1) { + ghcb_set_rax(ghcb, regs->ax); + ghcb_set_rdx(ghcb, regs->dx); + } + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0); + + if ((ret == ES_OK) && (!exit_info_1)) { + regs->ax = ghcb->save.rax; + regs->dx = ghcb->save.rdx; + } + + return ret; +} + +/* + * This function runs on the first #VC exception after the kernel + * switched to virtual addresses. + */ +static bool __init sev_es_setup_ghcb(void) +{ + /* First make sure the hypervisor talks a supported protocol. */ + if (!sev_es_negotiate_protocol()) + return false; + + /* + * Clear the boot_ghcb. The first exception comes in before the bss + * section is cleared. + */ + memset(&boot_ghcb_page, 0, PAGE_SIZE); + + /* Alright - Make the boot-ghcb public */ + boot_ghcb = &boot_ghcb_page; + + return true; +} + +#ifdef CONFIG_HOTPLUG_CPU +static void sev_es_ap_hlt_loop(void) +{ + struct ghcb_state state; + struct ghcb *ghcb; + + ghcb = sev_es_get_ghcb(&state); + + while (true) { + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP); + ghcb_set_sw_exit_info_1(ghcb, 0); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + /* Wakeup signal? */ + if (ghcb_sw_exit_info_2_is_valid(ghcb) && + ghcb->save.sw_exit_info_2) + break; + } + + sev_es_put_ghcb(&state); +} + +/* + * Play_dead handler when running under SEV-ES. This is needed because + * the hypervisor can't deliver an SIPI request to restart the AP. 
+ * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the + * hypervisor wakes it up again. + */ +static void sev_es_play_dead(void) +{ + play_dead_common(); + + /* IRQs now disabled */ + + sev_es_ap_hlt_loop(); + + /* + * If we get here, the VCPU was woken up again. Jump to CPU + * startup code to get it back online. + */ + start_cpu0(); +} +#else /* CONFIG_HOTPLUG_CPU */ +#define sev_es_play_dead native_play_dead +#endif /* CONFIG_HOTPLUG_CPU */ + +#ifdef CONFIG_SMP +static void __init sev_es_setup_play_dead(void) +{ + smp_ops.play_dead = sev_es_play_dead; +} +#else +static inline void sev_es_setup_play_dead(void) { } +#endif + +static void __init alloc_runtime_data(int cpu) +{ + struct sev_es_runtime_data *data; + + data = memblock_alloc(sizeof(*data), PAGE_SIZE); + if (!data) + panic("Can't allocate SEV-ES runtime data"); + + per_cpu(runtime_data, cpu) = data; +} + +static void __init init_ghcb(int cpu) +{ + struct sev_es_runtime_data *data; + int err; + + data = per_cpu(runtime_data, cpu); + + err = early_set_memory_decrypted((unsigned long)&data->ghcb_page, + sizeof(data->ghcb_page)); + if (err) + panic("Can't map GHCBs unencrypted"); + + memset(&data->ghcb_page, 0, sizeof(data->ghcb_page)); + + data->ghcb_active = false; + data->backup_ghcb_active = false; +} + +void __init sev_es_init_vc_handling(void) +{ + int cpu; + + BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE); + + if (!sev_es_active()) + return; + + if (!sev_es_check_cpu_features()) + panic("SEV-ES CPU Features missing"); + + /* Enable SEV-ES special handling */ + static_branch_enable(&sev_es_enable_key); + + /* Initialize per-cpu GHCB pages */ + for_each_possible_cpu(cpu) { + alloc_runtime_data(cpu); + init_ghcb(cpu); + setup_vc_stacks(cpu); + } + + sev_es_setup_play_dead(); + + /* Secondary CPUs use the runtime #VC handler */ + initial_vc_handler = (unsigned long)safe_stack_exc_vmm_communication; +} + +static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) +{ + int trapnr = ctxt->fi.vector; + + if (trapnr == X86_TRAP_PF) + native_write_cr2(ctxt->fi.cr2); + + ctxt->regs->orig_ax = ctxt->fi.error_code; + do_early_exception(ctxt->regs, trapnr); +} + +static long *vc_insn_get_reg(struct es_em_ctxt *ctxt) +{ + long *reg_array; + int offset; + + reg_array = (long *)ctxt->regs; + offset = insn_get_modrm_reg_off(&ctxt->insn, ctxt->regs); + + if (offset < 0) + return NULL; + + offset /= sizeof(long); + + return reg_array + offset; +} + +static long *vc_insn_get_rm(struct es_em_ctxt *ctxt) +{ + long *reg_array; + int offset; + + reg_array = (long *)ctxt->regs; + offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs); + + if (offset < 0) + return NULL; + + offset /= sizeof(long); + + return reg_array + offset; +} +static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + unsigned int bytes, bool read) +{ + u64 exit_code, exit_info_1, exit_info_2; + unsigned long ghcb_pa = __pa(ghcb); + enum es_result res; + phys_addr_t paddr; + void __user *ref; + + ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs); + if (ref == (void __user *)-1L) + return ES_UNSUPPORTED; + + exit_code = read ? 
SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE; + + res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr); + if (res != ES_OK) { + if (res == ES_EXCEPTION && !read) + ctxt->fi.error_code |= X86_PF_WRITE; + + return res; + } + + exit_info_1 = paddr; + /* Can never be greater than 8 */ + exit_info_2 = bytes; + + ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); + + return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); +} + +static enum es_result vc_handle_mmio_twobyte_ops(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct insn *insn = &ctxt->insn; + unsigned int bytes = 0; + enum es_result ret; + int sign_byte; + long *reg_data; + + switch (insn->opcode.bytes[1]) { + /* MMIO Read w/ zero-extension */ + case 0xb6: + bytes = 1; + fallthrough; + case 0xb7: + if (!bytes) + bytes = 2; + + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + /* Zero extend based on operand size */ + reg_data = vc_insn_get_reg(ctxt); + if (!reg_data) + return ES_DECODE_FAILED; + + memset(reg_data, 0, insn->opnd_bytes); + + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + + /* MMIO Read w/ sign-extension */ + case 0xbe: + bytes = 1; + fallthrough; + case 0xbf: + if (!bytes) + bytes = 2; + + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + /* Sign extend based on operand size */ + reg_data = vc_insn_get_reg(ctxt); + if (!reg_data) + return ES_DECODE_FAILED; + + if (bytes == 1) { + u8 *val = (u8 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x80) ? 0xff : 0x00; + } else { + u16 *val = (u16 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x8000) ? 0xff : 0x00; + } + memset(reg_data, sign_byte, insn->opnd_bytes); + + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + + default: + ret = ES_UNSUPPORTED; + } + + return ret; +} + +/* + * The MOVS instruction has two memory operands, which raises the + * problem that it is not known whether the access to the source or the + * destination caused the #VC exception (and hence whether an MMIO read + * or write operation needs to be emulated). + * + * Instead of playing games with walking page-tables and trying to guess + * whether the source or destination is an MMIO range, split the move + * into two operations, a read and a write with only one memory operand. + * This will cause a nested #VC exception on the MMIO address which can + * then be handled. + * + * This implementation has the benefit that it also supports MOVS where + * source _and_ destination are MMIO regions. + * + * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a + * rare operation. If it turns out to be a performance problem the split + * operations can be moved to memcpy_fromio() and memcpy_toio(). 
+ */ +static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, + unsigned int bytes) +{ + unsigned long ds_base, es_base; + unsigned char *src, *dst; + unsigned char buffer[8]; + enum es_result ret; + bool rep; + int off; + + ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS); + es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); + + if (ds_base == -1L || es_base == -1L) { + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; + } + + src = ds_base + (unsigned char *)ctxt->regs->si; + dst = es_base + (unsigned char *)ctxt->regs->di; + + ret = vc_read_mem(ctxt, src, buffer, bytes); + if (ret != ES_OK) + return ret; + + ret = vc_write_mem(ctxt, dst, buffer, bytes); + if (ret != ES_OK) + return ret; + + if (ctxt->regs->flags & X86_EFLAGS_DF) + off = -bytes; + else + off = bytes; + + ctxt->regs->si += off; + ctxt->regs->di += off; + + rep = insn_has_rep_prefix(&ctxt->insn); + if (rep) + ctxt->regs->cx -= 1; + + if (!rep || ctxt->regs->cx == 0) + return ES_OK; + else + return ES_RETRY; +} + +static enum es_result vc_handle_mmio(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct insn *insn = &ctxt->insn; + unsigned int bytes = 0; + enum es_result ret; + long *reg_data; + + switch (insn->opcode.bytes[0]) { + /* MMIO Write */ + case 0x88: + bytes = 1; + fallthrough; + case 0x89: + if (!bytes) + bytes = insn->opnd_bytes; + + reg_data = vc_insn_get_reg(ctxt); + if (!reg_data) + return ES_DECODE_FAILED; + + memcpy(ghcb->shared_buffer, reg_data, bytes); + + ret = vc_do_mmio(ghcb, ctxt, bytes, false); + break; + + case 0xc6: + bytes = 1; + fallthrough; + case 0xc7: + if (!bytes) + bytes = insn->opnd_bytes; + + memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); + + ret = vc_do_mmio(ghcb, ctxt, bytes, false); + break; + + /* MMIO Read */ + case 0x8a: + bytes = 1; + fallthrough; + case 0x8b: + if (!bytes) + bytes = insn->opnd_bytes; + + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + reg_data = vc_insn_get_reg(ctxt); + if (!reg_data) + return ES_DECODE_FAILED; + + /* Zero-extend for 32-bit operation */ + if (bytes == 4) + *reg_data = 0; + + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + + /* MOVS instruction */ + case 0xa4: + bytes = 1; + fallthrough; + case 0xa5: + if (!bytes) + bytes = insn->opnd_bytes; + + ret = vc_handle_mmio_movs(ctxt, bytes); + break; + /* Two-Byte Opcodes */ + case 0x0f: + ret = vc_handle_mmio_twobyte_ops(ghcb, ctxt); + break; + default: + ret = ES_UNSUPPORTED; + } + + return ret; +} + +static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + long val, *reg = vc_insn_get_rm(ctxt); + enum es_result ret; + + if (!reg) + return ES_DECODE_FAILED; + + val = *reg; + + /* Upper 32 bits must be written as zeroes */ + if (val >> 32) { + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; + } + + /* Clear out other reserved bits and set bit 10 */ + val = (val & 0xffff23ffL) | BIT(10); + + /* Early non-zero writes to DR7 are not supported */ + if (!data && (val & ~DR7_RESET_VALUE)) + return ES_UNSUPPORTED; + + /* Using a value of 0 for ExitInfo1 means RAX holds the value */ + ghcb_set_rax(ghcb, val); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); + if (ret != ES_OK) + return ret; + + if (data) + data->dr7 = val; + + return ES_OK; +} + +static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct 
sev_es_runtime_data *data = this_cpu_read(runtime_data); + long *reg = vc_insn_get_rm(ctxt); + + if (!reg) + return ES_DECODE_FAILED; + + if (data) + *reg = data->dr7; + else + *reg = DR7_RESET_VALUE; + + return ES_OK; +} + +static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0); +} + +static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + enum es_result ret; + + ghcb_set_rcx(ghcb, ctxt->regs->cx); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + ctxt->regs->dx = ghcb->save.rdx; + + return ES_OK; +} + +static enum es_result vc_handle_monitor(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* + * Treat it as a NOP and do not leak a physical address to the + * hypervisor. + */ + return ES_OK; +} + +static enum es_result vc_handle_mwait(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* Treat the same as MONITOR/MONITORX */ + return ES_OK; +} + +static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + enum es_result ret; + + ghcb_set_rax(ghcb, ctxt->regs->ax); + ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0); + + if (x86_platform.hyper.sev_es_hcall_prepare) + x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0); + if (ret != ES_OK) + return ret; + + if (!ghcb_rax_is_valid(ghcb)) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + + /* + * Call sev_es_hcall_finish() after regs->ax is already set. + * This allows the hypervisor handler to overwrite it again if + * necessary. + */ + if (x86_platform.hyper.sev_es_hcall_finish && + !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs)) + return ES_VMM_ERROR; + + return ES_OK; +} + +static enum es_result vc_handle_trap_ac(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* + * Calling ecx_alignment_check() directly does not work, because it + * enables IRQs and the GHCB is active. Forward the exception and call + * it later from vc_forward_exception(). + */ + ctxt->fi.vector = X86_TRAP_AC; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; +} + +static __always_inline void vc_handle_trap_db(struct pt_regs *regs) +{ + if (user_mode(regs)) + noist_exc_debug(regs); + else + exc_debug(regs); +} + +static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, + struct ghcb *ghcb, + unsigned long exit_code) +{ + enum es_result result; + + switch (exit_code) { + case SVM_EXIT_READ_DR7: + result = vc_handle_dr7_read(ghcb, ctxt); + break; + case SVM_EXIT_WRITE_DR7: + result = vc_handle_dr7_write(ghcb, ctxt); + break; + case SVM_EXIT_EXCP_BASE + X86_TRAP_AC: + result = vc_handle_trap_ac(ghcb, ctxt); + break; + case SVM_EXIT_RDTSC: + case SVM_EXIT_RDTSCP: + result = vc_handle_rdtsc(ghcb, ctxt, exit_code); + break; + case SVM_EXIT_RDPMC: + result = vc_handle_rdpmc(ghcb, ctxt); + break; + case SVM_EXIT_INVD: + pr_err_ratelimited("#VC exception for INVD??? 
Seriously???\n"); + result = ES_UNSUPPORTED; + break; + case SVM_EXIT_CPUID: + result = vc_handle_cpuid(ghcb, ctxt); + break; + case SVM_EXIT_IOIO: + result = vc_handle_ioio(ghcb, ctxt); + break; + case SVM_EXIT_MSR: + result = vc_handle_msr(ghcb, ctxt); + break; + case SVM_EXIT_VMMCALL: + result = vc_handle_vmmcall(ghcb, ctxt); + break; + case SVM_EXIT_WBINVD: + result = vc_handle_wbinvd(ghcb, ctxt); + break; + case SVM_EXIT_MONITOR: + result = vc_handle_monitor(ghcb, ctxt); + break; + case SVM_EXIT_MWAIT: + result = vc_handle_mwait(ghcb, ctxt); + break; + case SVM_EXIT_NPF: + result = vc_handle_mmio(ghcb, ctxt); + break; + default: + /* + * Unexpected #VC exception + */ + result = ES_UNSUPPORTED; + } + + return result; +} + +static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt) +{ + long error_code = ctxt->fi.error_code; + int trapnr = ctxt->fi.vector; + + ctxt->regs->orig_ax = ctxt->fi.error_code; + + switch (trapnr) { + case X86_TRAP_GP: + exc_general_protection(ctxt->regs, error_code); + break; + case X86_TRAP_UD: + exc_invalid_op(ctxt->regs); + break; + case X86_TRAP_AC: + exc_alignment_check(ctxt->regs, error_code); + break; + default: + pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n"); + BUG(); + } +} + +static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs) +{ + unsigned long sp = (unsigned long)regs; + + return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); +} + +/* + * Main #VC exception handler. It is called when the entry code was able to + * switch off the IST to a safe kernel stack. + * + * With the current implementation it is always possible to switch to a safe + * stack because #VC exceptions only happen at known places, like intercepted + * instructions or accesses to MMIO areas/IO ports. They can also happen with + * code instrumentation when the hypervisor intercepts #DB, but the critical + * paths are forbidden to be instrumented, so #DB exceptions currently also + * only happen in safe places. + */ +DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) +{ + struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + irqentry_state_t irq_state; + struct ghcb_state state; + struct es_em_ctxt ctxt; + enum es_result result; + struct ghcb *ghcb; + + /* + * Handle #DB before calling into !noinstr code to avoid recursive #DB. + */ + if (error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB) { + vc_handle_trap_db(regs); + return; + } + + irq_state = irqentry_nmi_enter(regs); + lockdep_assert_irqs_disabled(); + instrumentation_begin(); + + /* + * This is invoked through an interrupt gate, so IRQs are disabled. The + * code below might walk page-tables for user or kernel addresses, so + * keep the IRQs disabled to protect us against concurrent TLB flushes. + */ + + ghcb = sev_es_get_ghcb(&state); + if (!ghcb) { + /* + * Mark GHCBs inactive so that panic() is able to print the + * message. + */ + data->ghcb_active = false; + data->backup_ghcb_active = false; + + panic("Unable to handle #VC exception! 
GHCB and Backup GHCB are already in use"); + } + + vc_ghcb_invalidate(ghcb); + result = vc_init_em_ctxt(&ctxt, regs, error_code); + + if (result == ES_OK) + result = vc_handle_exitcode(&ctxt, ghcb, error_code); + + sev_es_put_ghcb(&state); + + /* Done - now check the result */ + switch (result) { + case ES_OK: + vc_finish_insn(&ctxt); + break; + case ES_UNSUPPORTED: + pr_err_ratelimited("Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", + error_code, regs->ip); + goto fail; + case ES_VMM_ERROR: + pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", + error_code, regs->ip); + goto fail; + case ES_DECODE_FAILED: + pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", + error_code, regs->ip); + goto fail; + case ES_EXCEPTION: + vc_forward_exception(&ctxt); + break; + case ES_RETRY: + /* Nothing to do */ + break; + default: + pr_emerg("Unknown result in %s():%d\n", __func__, result); + /* + * Emulating the instruction which caused the #VC exception + * failed - can't continue so print debug information + */ + BUG(); + } + +out: + instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); + + return; + +fail: + if (user_mode(regs)) { + /* + * Do not kill the machine if user-space triggered the + * exception. Send SIGBUS instead and let user-space deal with + * it. + */ + force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); + } else { + pr_emerg("PANIC: Unhandled #VC exception in kernel space (result=%d)\n", + result); + + /* Show some debug info */ + show_regs(regs); + + /* Ask hypervisor to sev_es_terminate */ + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + + /* If that fails and we get here - just panic */ + panic("Returned from Terminate-Request to Hypervisor\n"); + } + + goto out; +} + +/* This handler runs on the #VC fall-back stack. 
It can cause further #VC exceptions */ +DEFINE_IDTENTRY_VC_IST(exc_vmm_communication) +{ + instrumentation_begin(); + panic("Can't handle #VC exception from unsupported context\n"); + instrumentation_end(); +} + +DEFINE_IDTENTRY_VC(exc_vmm_communication) +{ + if (likely(!on_vc_fallback_stack(regs))) + safe_stack_exc_vmm_communication(regs, error_code); + else + ist_exc_vmm_communication(regs, error_code); +} + +bool __init handle_vc_boot_ghcb(struct pt_regs *regs) +{ + unsigned long exit_code = regs->orig_ax; + struct es_em_ctxt ctxt; + enum es_result result; + + /* Do initial setup or terminate the guest */ + if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb())) + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + + vc_ghcb_invalidate(boot_ghcb); + + result = vc_init_em_ctxt(&ctxt, regs, exit_code); + if (result == ES_OK) + result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code); + + /* Done - now check the result */ + switch (result) { + case ES_OK: + vc_finish_insn(&ctxt); + break; + case ES_UNSUPPORTED: + early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_VMM_ERROR: + early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_DECODE_FAILED: + early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_EXCEPTION: + vc_early_forward_exception(&ctxt); + break; + case ES_RETRY: + /* Nothing to do */ + break; + default: + BUG(); + } + + return true; + +fail: + show_regs(regs); + + while (true) + halt(); +} diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index b93d6cd08a7f..121921b2927c 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -5,7 +5,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index df7b5477fc4f..7515e78ef898 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -47,7 +47,7 @@ #include #include #include -#include +#include /* * We allocate runtime services regions top-down, starting from -4G, i.e. diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 1be71ef5e4c4..2e1c1bec0f9e 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include struct real_mode_header *real_mode_header; u32 *trampoline_cr4_features; -- cgit v1.2.3 From b81fc74d53d1248de6db3136dd6b29e5d5528021 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 27 Apr 2021 06:16:35 -0500 Subject: x86/sev: Move GHCB MSR protocol and NAE definitions in a common header The guest and the hypervisor contain separate macros to get and set the GHCB MSR protocol and NAE event fields. Consolidate the GHCB protocol definitions and helper macros in one place. Leave the supported protocol version define in separate files to keep the guest and hypervisor flexibility to support different GHCB version in the same release. There is no functional change intended. 
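To illustrate the consolidation, here is a minimal sketch (not part of this patch) of how the sev-common.h macros drive a single CPUID request over the GHCB MSR protocol, mirroring the do_vc_no_ghcb() flow shown earlier in sev-shared.c; the helper name ghcb_msr_cpuid_eax() is made up for the example and assumes the surrounding SEV-ES guest code:

/*
 * Hypothetical helper, modeled on do_vc_no_ghcb(): ask the hypervisor for the
 * EAX value of CPUID leaf @fn via the GHCB MSR protocol, or terminate the
 * guest if the response code is not a CPUID response.
 */
static u64 ghcb_msr_cpuid_eax(unsigned int fn)
{
	u64 val;

	/* Encode leaf number and register selector into the request MSR value */
	sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX));
	VMGEXIT();

	val = sev_es_rd_ghcb_msr();
	if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP)
		sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);

	/* The requested register value is returned in bits 63:32 */
	return val >> 32;
}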
Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Acked-by: Joerg Roedel Link: https://lkml.kernel.org/r/20210427111636.1207-3-brijesh.singh@amd.com --- arch/x86/include/asm/sev-common.h | 62 +++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/sev.h | 30 +++---------------- arch/x86/kernel/sev-shared.c | 20 ++++++------- arch/x86/kvm/svm/svm.h | 38 +++--------------------- 4 files changed, 80 insertions(+), 70 deletions(-) create mode 100644 arch/x86/include/asm/sev-common.h (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h new file mode 100644 index 000000000000..629c3df243f0 --- /dev/null +++ b/arch/x86/include/asm/sev-common.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AMD SEV header common between the guest and the hypervisor. + * + * Author: Brijesh Singh + */ + +#ifndef __ASM_X86_SEV_COMMON_H +#define __ASM_X86_SEV_COMMON_H + +#define GHCB_MSR_INFO_POS 0 +#define GHCB_MSR_INFO_MASK (BIT_ULL(12) - 1) + +#define GHCB_MSR_SEV_INFO_RESP 0x001 +#define GHCB_MSR_SEV_INFO_REQ 0x002 +#define GHCB_MSR_VER_MAX_POS 48 +#define GHCB_MSR_VER_MAX_MASK 0xffff +#define GHCB_MSR_VER_MIN_POS 32 +#define GHCB_MSR_VER_MIN_MASK 0xffff +#define GHCB_MSR_CBIT_POS 24 +#define GHCB_MSR_CBIT_MASK 0xff +#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \ + ((((_max) & GHCB_MSR_VER_MAX_MASK) << GHCB_MSR_VER_MAX_POS) | \ + (((_min) & GHCB_MSR_VER_MIN_MASK) << GHCB_MSR_VER_MIN_POS) | \ + (((_cbit) & GHCB_MSR_CBIT_MASK) << GHCB_MSR_CBIT_POS) | \ + GHCB_MSR_SEV_INFO_RESP) +#define GHCB_MSR_INFO(v) ((v) & 0xfffUL) +#define GHCB_MSR_PROTO_MAX(v) (((v) >> GHCB_MSR_VER_MAX_POS) & GHCB_MSR_VER_MAX_MASK) +#define GHCB_MSR_PROTO_MIN(v) (((v) >> GHCB_MSR_VER_MIN_POS) & GHCB_MSR_VER_MIN_MASK) + +#define GHCB_MSR_CPUID_REQ 0x004 +#define GHCB_MSR_CPUID_RESP 0x005 +#define GHCB_MSR_CPUID_FUNC_POS 32 +#define GHCB_MSR_CPUID_FUNC_MASK 0xffffffff +#define GHCB_MSR_CPUID_VALUE_POS 32 +#define GHCB_MSR_CPUID_VALUE_MASK 0xffffffff +#define GHCB_MSR_CPUID_REG_POS 30 +#define GHCB_MSR_CPUID_REG_MASK 0x3 +#define GHCB_CPUID_REQ_EAX 0 +#define GHCB_CPUID_REQ_EBX 1 +#define GHCB_CPUID_REQ_ECX 2 +#define GHCB_CPUID_REQ_EDX 3 +#define GHCB_CPUID_REQ(fn, reg) \ + (GHCB_MSR_CPUID_REQ | \ + (((unsigned long)reg & GHCB_MSR_CPUID_REG_MASK) << GHCB_MSR_CPUID_REG_POS) | \ + (((unsigned long)fn) << GHCB_MSR_CPUID_FUNC_POS)) + +#define GHCB_MSR_TERM_REQ 0x100 +#define GHCB_MSR_TERM_REASON_SET_POS 12 +#define GHCB_MSR_TERM_REASON_SET_MASK 0xf +#define GHCB_MSR_TERM_REASON_POS 16 +#define GHCB_MSR_TERM_REASON_MASK 0xff +#define GHCB_SEV_TERM_REASON(reason_set, reason_val) \ + (((((u64)reason_set) & GHCB_MSR_TERM_REASON_SET_MASK) << GHCB_MSR_TERM_REASON_SET_POS) | \ + ((((u64)reason_val) & GHCB_MSR_TERM_REASON_MASK) << GHCB_MSR_TERM_REASON_POS)) + +#define GHCB_SEV_ES_REASON_GENERAL_REQUEST 0 +#define GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED 1 + +#define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK) + +#endif diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index cf1d957c7091..fa5cd05d3b5b 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -10,34 +10,12 @@ #include #include +#include -#define GHCB_SEV_INFO 0x001UL -#define GHCB_SEV_INFO_REQ 0x002UL -#define GHCB_INFO(v) ((v) & 0xfffUL) -#define GHCB_PROTO_MAX(v) (((v) >> 48) & 0xffffUL) -#define GHCB_PROTO_MIN(v) (((v) >> 32) & 0xffffUL) -#define GHCB_PROTO_OUR 0x0001UL -#define GHCB_SEV_CPUID_REQ 0x004UL -#define GHCB_CPUID_REQ_EAX 0 -#define 
GHCB_CPUID_REQ_EBX 1 -#define GHCB_CPUID_REQ_ECX 2 -#define GHCB_CPUID_REQ_EDX 3 -#define GHCB_CPUID_REQ(fn, reg) (GHCB_SEV_CPUID_REQ | \ - (((unsigned long)reg & 3) << 30) | \ - (((unsigned long)fn) << 32)) +#define GHCB_PROTO_OUR 0x0001UL +#define GHCB_PROTOCOL_MAX 1ULL +#define GHCB_DEFAULT_USAGE 0ULL -#define GHCB_PROTOCOL_MAX 0x0001UL -#define GHCB_DEFAULT_USAGE 0x0000UL - -#define GHCB_SEV_CPUID_RESP 0x005UL -#define GHCB_SEV_TERMINATE 0x100UL -#define GHCB_SEV_TERMINATE_REASON(reason_set, reason_val) \ - (((((u64)reason_set) & 0x7) << 12) | \ - ((((u64)reason_val) & 0xff) << 16)) -#define GHCB_SEV_ES_REASON_GENERAL_REQUEST 0 -#define GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED 1 - -#define GHCB_SEV_GHCB_RESP_CODE(v) ((v) & 0xfff) #define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); } enum es_result { diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 0aa9f13efd57..6ec8b3bfd76e 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -26,13 +26,13 @@ static bool __init sev_es_check_cpu_features(void) static void __noreturn sev_es_terminate(unsigned int reason) { - u64 val = GHCB_SEV_TERMINATE; + u64 val = GHCB_MSR_TERM_REQ; /* * Tell the hypervisor what went wrong - only reason-set 0 is * currently supported. */ - val |= GHCB_SEV_TERMINATE_REASON(0, reason); + val |= GHCB_SEV_TERM_REASON(0, reason); /* Request Guest Termination from Hypvervisor */ sev_es_wr_ghcb_msr(val); @@ -47,15 +47,15 @@ static bool sev_es_negotiate_protocol(void) u64 val; /* Do the GHCB protocol version negotiation */ - sev_es_wr_ghcb_msr(GHCB_SEV_INFO_REQ); + sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ); VMGEXIT(); val = sev_es_rd_ghcb_msr(); - if (GHCB_INFO(val) != GHCB_SEV_INFO) + if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP) return false; - if (GHCB_PROTO_MAX(val) < GHCB_PROTO_OUR || - GHCB_PROTO_MIN(val) > GHCB_PROTO_OUR) + if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTO_OUR || + GHCB_MSR_PROTO_MIN(val) > GHCB_PROTO_OUR) return false; return true; @@ -153,28 +153,28 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX)); VMGEXIT(); val = sev_es_rd_ghcb_msr(); - if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) goto fail; regs->ax = val >> 32; sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EBX)); VMGEXIT(); val = sev_es_rd_ghcb_msr(); - if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) goto fail; regs->bx = val >> 32; sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_ECX)); VMGEXIT(); val = sev_es_rd_ghcb_msr(); - if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) goto fail; regs->cx = val >> 32; sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EDX)); VMGEXIT(); val = sev_es_rd_ghcb_msr(); - if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) goto fail; regs->dx = val >> 32; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 84b3133c2251..42f8a7b9048f 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -20,6 +20,7 @@ #include #include +#include #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) @@ -525,40 +526,9 @@ void svm_vcpu_unblocking(struct kvm_vcpu *vcpu); /* sev.c */ -#define GHCB_VERSION_MAX 1ULL -#define GHCB_VERSION_MIN 1ULL - -#define GHCB_MSR_INFO_POS 0 -#define GHCB_MSR_INFO_MASK (BIT_ULL(12) - 
1) - -#define GHCB_MSR_SEV_INFO_RESP 0x001 -#define GHCB_MSR_SEV_INFO_REQ 0x002 -#define GHCB_MSR_VER_MAX_POS 48 -#define GHCB_MSR_VER_MAX_MASK 0xffff -#define GHCB_MSR_VER_MIN_POS 32 -#define GHCB_MSR_VER_MIN_MASK 0xffff -#define GHCB_MSR_CBIT_POS 24 -#define GHCB_MSR_CBIT_MASK 0xff -#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \ - ((((_max) & GHCB_MSR_VER_MAX_MASK) << GHCB_MSR_VER_MAX_POS) | \ - (((_min) & GHCB_MSR_VER_MIN_MASK) << GHCB_MSR_VER_MIN_POS) | \ - (((_cbit) & GHCB_MSR_CBIT_MASK) << GHCB_MSR_CBIT_POS) | \ - GHCB_MSR_SEV_INFO_RESP) - -#define GHCB_MSR_CPUID_REQ 0x004 -#define GHCB_MSR_CPUID_RESP 0x005 -#define GHCB_MSR_CPUID_FUNC_POS 32 -#define GHCB_MSR_CPUID_FUNC_MASK 0xffffffff -#define GHCB_MSR_CPUID_VALUE_POS 32 -#define GHCB_MSR_CPUID_VALUE_MASK 0xffffffff -#define GHCB_MSR_CPUID_REG_POS 30 -#define GHCB_MSR_CPUID_REG_MASK 0x3 - -#define GHCB_MSR_TERM_REQ 0x100 -#define GHCB_MSR_TERM_REASON_SET_POS 12 -#define GHCB_MSR_TERM_REASON_SET_MASK 0xf -#define GHCB_MSR_TERM_REASON_POS 16 -#define GHCB_MSR_TERM_REASON_MASK 0xff +#define GHCB_VERSION_MAX 1ULL +#define GHCB_VERSION_MIN 1ULL + extern unsigned int max_sev_asid; -- cgit v1.2.3 From 059e5c321a65657877924256ea8ad9c0df257b45 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 27 Apr 2021 06:16:36 -0500 Subject: x86/msr: Rename MSR_K8_SYSCFG to MSR_AMD64_SYSCFG The SYSCFG MSR continued being updated beyond the K8 family; drop the K8 name from it. Suggested-by: Borislav Petkov Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Acked-by: Joerg Roedel Link: https://lkml.kernel.org/r/20210427111636.1207-4-brijesh.singh@amd.com --- Documentation/virt/kvm/amd-memory-encryption.rst | 2 +- Documentation/x86/amd-memory-encryption.rst | 6 +++--- arch/x86/include/asm/msr-index.h | 6 +++--- arch/x86/kernel/cpu/amd.c | 4 ++-- arch/x86/kernel/cpu/mtrr/cleanup.c | 2 +- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/kernel/mmconf-fam10h_64.c | 2 +- arch/x86/kvm/svm/svm.c | 4 ++-- arch/x86/kvm/x86.c | 2 +- arch/x86/mm/mem_encrypt_identity.c | 6 +++--- arch/x86/pci/amd_bus.c | 2 +- arch/x86/realmode/rm/trampoline_64.S | 4 ++-- drivers/edac/amd64_edac.c | 2 +- tools/arch/x86/include/asm/msr-index.h | 6 +++--- 14 files changed, 26 insertions(+), 26 deletions(-) (limited to 'arch/x86/include') diff --git a/Documentation/virt/kvm/amd-memory-encryption.rst b/Documentation/virt/kvm/amd-memory-encryption.rst index 5ec8a1902e15..5c081c8c7164 100644 --- a/Documentation/virt/kvm/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/amd-memory-encryption.rst @@ -22,7 +22,7 @@ to SEV:: [ecx]: Bits[31:0] Number of encrypted guests supported simultaneously -If support for SEV is present, MSR 0xc001_0010 (MSR_K8_SYSCFG) and MSR 0xc001_0015 +If support for SEV is present, MSR 0xc001_0010 (MSR_AMD64_SYSCFG) and MSR 0xc001_0015 (MSR_K7_HWCR) can be used to determine if it can be enabled:: 0xc001_0010: diff --git a/Documentation/x86/amd-memory-encryption.rst b/Documentation/x86/amd-memory-encryption.rst index c48d452d0718..a1940ebe7be5 100644 --- a/Documentation/x86/amd-memory-encryption.rst +++ b/Documentation/x86/amd-memory-encryption.rst @@ -53,7 +53,7 @@ CPUID function 0x8000001f reports information related to SME:: system physical addresses, not guest physical addresses) -If support for SME is present, MSR 0xc00100010 (MSR_K8_SYSCFG) can be used to +If support for SME is present, MSR 0xc00100010 (MSR_AMD64_SYSCFG) can be used to determine if SME is enabled and/or to enable memory encryption:: 0xc0010010: @@ -79,7 +79,7 @@ The state of 
SME in the Linux kernel can be documented as follows: The CPU supports SME (determined through CPUID instruction). - Enabled: - Supported and bit 23 of MSR_K8_SYSCFG is set. + Supported and bit 23 of MSR_AMD64_SYSCFG is set. - Active: Supported, Enabled and the Linux kernel is actively applying @@ -89,7 +89,7 @@ The state of SME in the Linux kernel can be documented as follows: SME can also be enabled and activated in the BIOS. If SME is enabled and activated in the BIOS, then all memory accesses will be encrypted and it will not be necessary to activate the Linux memory encryption support. If the BIOS -merely enables SME (sets bit 23 of the MSR_K8_SYSCFG), then Linux can activate +merely enables SME (sets bit 23 of the MSR_AMD64_SYSCFG), then Linux can activate memory encryption by default (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y) or by supplying mem_encrypt=on on the kernel command line. However, if BIOS does not enable SME, then Linux will not be able to activate memory encryption, even diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 742d89a00721..211ba3375ee9 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -537,9 +537,9 @@ /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d -#define MSR_K8_SYSCFG 0xc0010010 -#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23 -#define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT) +#define MSR_AMD64_SYSCFG 0xc0010010 +#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23 +#define MSR_AMD64_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT) #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 2d11384dc9ab..0adb0341cd7c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -593,8 +593,8 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) */ if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) { /* Check if memory encryption is enabled */ - rdmsrl(MSR_K8_SYSCFG, msr); - if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + rdmsrl(MSR_AMD64_SYSCFG, msr); + if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) goto clear_all; /* diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 0c3b372318b7..b5f43049fa5f 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -836,7 +836,7 @@ int __init amd_special_default_mtrr(void) if (boot_cpu_data.x86 < 0xf) return 0; /* In case some hypervisor doesn't pass SYSCFG through: */ - if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) + if (rdmsr_safe(MSR_AMD64_SYSCFG, &l, &h) < 0) return 0; /* * Memory between 4GB and top of mem is forced WB by this magic bit. 
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index b90f3f437765..558108296f3c 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -53,13 +53,13 @@ static inline void k8_check_syscfg_dram_mod_en(void) (boot_cpu_data.x86 >= 0x0f))) return; - rdmsr(MSR_K8_SYSCFG, lo, hi); + rdmsr(MSR_AMD64_SYSCFG, lo, hi); if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) { pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" " not cleared by BIOS, clearing this bit\n", smp_processor_id()); lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY; - mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi); + mtrr_wrmsr(MSR_AMD64_SYSCFG, lo, hi); } } diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index b5cb49e57df8..c94dec6a1834 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -95,7 +95,7 @@ static void get_fam10h_pci_mmconf_base(void) return; /* SYS_CFG */ - address = MSR_K8_SYSCFG; + address = MSR_AMD64_SYSCFG; rdmsrl(address, val); /* TOP_MEM2 is not enabled? */ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index b649f92287a2..433e8e4fb3a6 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -858,8 +858,8 @@ static __init void svm_adjust_mmio_mask(void) return; /* If memory encryption is not enabled, use existing mask */ - rdmsrl(MSR_K8_SYSCFG, msr); - if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + rdmsrl(MSR_AMD64_SYSCFG, msr); + if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) return; enc_bit = cpuid_ebx(0x8000001f) & 0x3f; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6eda2834fc05..853c40e89335 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3402,7 +3402,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_LASTBRANCHTOIP: case MSR_IA32_LASTINTFROMIP: case MSR_IA32_LASTINTTOIP: - case MSR_K8_SYSCFG: + case MSR_AMD64_SYSCFG: case MSR_K8_TSEG_ADDR: case MSR_K8_TSEG_MASK: case MSR_VM_HSAVE_PA: diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index 04aba7e80a36..a9639f663d25 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -529,7 +529,7 @@ void __init sme_enable(struct boot_params *bp) /* * No SME if Hypervisor bit is set. This check is here to * prevent a guest from trying to enable SME. For running as a - * KVM guest the MSR_K8_SYSCFG will be sufficient, but there + * KVM guest the MSR_AMD64_SYSCFG will be sufficient, but there * might be other hypervisors which emulate that MSR as non-zero * or even pass it through to the guest. * A malicious hypervisor can still trick a guest into this @@ -542,8 +542,8 @@ void __init sme_enable(struct boot_params *bp) return; /* For SME, check the SYSCFG MSR */ - msr = __rdmsr(MSR_K8_SYSCFG); - if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + msr = __rdmsr(MSR_AMD64_SYSCFG); + if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) return; } else { /* SEV state cannot be controlled by a command line option */ diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index ae744b6a0785..dd40d3fea74e 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -284,7 +284,7 @@ static int __init early_root_info_init(void) /* need to take out [4G, TOM2) for RAM*/ /* SYS_CFG */ - address = MSR_K8_SYSCFG; + address = MSR_AMD64_SYSCFG; rdmsrl(address, val); /* TOP_MEM2 is enabled? 
*/ if (val & (1<<21)) { diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index 84c5d1b33d10..cc8391f86cdb 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -123,9 +123,9 @@ SYM_CODE_START(startup_32) */ btl $TH_FLAGS_SME_ACTIVE_BIT, pa_tr_flags jnc .Ldone - movl $MSR_K8_SYSCFG, %ecx + movl $MSR_AMD64_SYSCFG, %ecx rdmsr - bts $MSR_K8_SYSCFG_MEM_ENCRYPT_BIT, %eax + bts $MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT, %eax jc .Ldone /* diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 9fa4dfc6ebee..f0d8f60acee1 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3083,7 +3083,7 @@ static void read_mc_regs(struct amd64_pvt *pvt) edac_dbg(0, " TOP_MEM: 0x%016llx\n", pvt->top_mem); /* Check first whether TOP_MEM2 is enabled: */ - rdmsrl(MSR_K8_SYSCFG, msr_val); + rdmsrl(MSR_AMD64_SYSCFG, msr_val); if (msr_val & BIT(21)) { rdmsrl(MSR_K8_TOP_MEM2, pvt->top_mem2); edac_dbg(0, " TOP_MEM2: 0x%016llx\n", pvt->top_mem2); diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 45029354e0a8..c60b09e7602f 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -533,9 +533,9 @@ /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d -#define MSR_K8_SYSCFG 0xc0010010 -#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23 -#define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT) +#define MSR_AMD64_SYSCFG 0xc0010010 +#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23 +#define MSR_AMD64_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT) #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 -- cgit v1.2.3 From 3743d55b289c203d8f77b7cd47c24926b9d186ae Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Sun, 25 Apr 2021 15:34:51 +0800 Subject: x86, sched: Fix the AMD CPPC maximum performance value on certain AMD Ryzen generations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some AMD Ryzen generations has different calculation method on maximum performance. 255 is not for all ASICs, some specific generations should use 166 as the maximum performance. Otherwise, it will report incorrect frequency value like below: ~ → lscpu | grep MHz CPU MHz: 3400.000 CPU max MHz: 7228.3198 CPU min MHz: 2200.0000 [ mingo: Tidied up whitespace use. ] [ Alexander Monakov : fix 225 -> 255 typo. ] Fixes: 41ea667227ba ("x86, sched: Calculate frequency invariance for AMD systems") Fixes: 3c55e94c0ade ("cpufreq: ACPI: Extend frequency tables to cover boost frequencies") Reported-by: Jason Bagavatsingham Fixed-by: Alexander Monakov Reviewed-by: Rafael J. 
From 3743d55b289c203d8f77b7cd47c24926b9d186ae Mon Sep 17 00:00:00 2001
From: Huang Rui
Date: Sun, 25 Apr 2021 15:34:51 +0800
Subject: x86, sched: Fix the AMD CPPC maximum performance value on certain
 AMD Ryzen generations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some AMD Ryzen generations use a different calculation method for the
maximum performance value: 255 is not correct for all ASICs, and some
specific generations should use 166 as the maximum performance instead.
Otherwise, an incorrect frequency is reported, for example:

  ~ → lscpu | grep MHz
  CPU MHz:          3400.000
  CPU max MHz:      7228.3198
  CPU min MHz:      2200.0000

[ mingo: Tidied up whitespace use. ]

[ Alexander Monakov : fix 225 -> 255 typo. ]

Fixes: 41ea667227ba ("x86, sched: Calculate frequency invariance for AMD systems")
Fixes: 3c55e94c0ade ("cpufreq: ACPI: Extend frequency tables to cover boost frequencies")
Reported-by: Jason Bagavatsingham
Fixed-by: Alexander Monakov
Reviewed-by: Rafael J. Wysocki
Signed-off-by: Huang Rui
Signed-off-by: Ingo Molnar
Tested-by: Jason Bagavatsingham
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20210425073451.2557394-1-ray.huang@amd.com
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=211791
Signed-off-by: Ingo Molnar
---
 arch/x86/include/asm/processor.h |  2 ++
 arch/x86/kernel/cpu/amd.c        | 16 ++++++++++++++++
 arch/x86/kernel/smpboot.c        |  2 +-
 drivers/cpufreq/acpi-cpufreq.c   |  6 +++++-
 4 files changed, 24 insertions(+), 2 deletions(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 154321d29050..556b2b17c3e2 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -787,8 +787,10 @@ DECLARE_PER_CPU(u64, msr_misc_features_shadow);
 
 #ifdef CONFIG_CPU_SUP_AMD
 extern u32 amd_get_nodes_per_socket(void);
+extern u32 amd_get_highest_perf(void);
 #else
 static inline u32 amd_get_nodes_per_socket(void)	{ return 0; }
+static inline u32 amd_get_highest_perf(void)		{ return 0; }
 #endif
 
 static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 2d11384dc9ab..6d7b3b3ea80b 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1165,3 +1165,19 @@ void set_dr_addr_mask(unsigned long mask, int dr)
 		break;
 	}
 }
+
+u32 amd_get_highest_perf(void)
+{
+	struct cpuinfo_x86 *c = &boot_cpu_data;
+
+	if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
+			       (c->x86_model >= 0x70 && c->x86_model < 0x80)))
+		return 166;
+
+	if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
+			       (c->x86_model >= 0x40 && c->x86_model < 0x70)))
+		return 166;
+
+	return 255;
+}
+EXPORT_SYMBOL_GPL(amd_get_highest_perf);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 0ad5214f598a..7770245cc7fa 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -2043,7 +2043,7 @@ static bool amd_set_max_freq_ratio(void)
 		return false;
 	}
 
-	highest_perf = perf_caps.highest_perf;
+	highest_perf = amd_get_highest_perf();
 	nominal_perf = perf_caps.nominal_perf;
 
 	if (!highest_perf || !nominal_perf) {
diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index d1bbc16fba4b..7e7450453714 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -646,7 +646,11 @@ static u64 get_max_boost_ratio(unsigned int cpu)
 		return 0;
 	}
 
-	highest_perf = perf_caps.highest_perf;
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+		highest_perf = amd_get_highest_perf();
+	else
+		highest_perf = perf_caps.highest_perf;
+
 	nominal_perf = perf_caps.nominal_perf;
 
 	if (!highest_perf || !nominal_perf) {
-- cgit v1.2.3
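The effect of the wrong ceiling is easiest to see in the ratio both callers compute, roughly max_freq ≈ base_freq * highest_perf / nominal_perf. The standalone sketch below is illustrative only: the nominal_perf and base clock are made-up example values, not read from hardware, and only the family/model ranges mirror the amd_get_highest_perf() hunk above.

#include <stdint.h>
#include <stdio.h>

/* Mirrors the family/model ranges added to amd_get_highest_perf() above. */
static uint32_t highest_perf_for(uint32_t family, uint32_t model)
{
	if (family == 0x17 && ((model >= 0x30 && model < 0x40) ||
			       (model >= 0x70 && model < 0x80)))
		return 166;

	if (family == 0x19 && ((model >= 0x20 && model < 0x30) ||
			       (model >= 0x40 && model < 0x70)))
		return 166;

	return 255;
}

int main(void)
{
	/* Illustrative CPPC numbers, chosen only to show the scale of the error. */
	uint64_t nominal_perf = 120;
	uint64_t base_khz = 3400000;	/* 3.4 GHz base clock */
	uint64_t old_perf = 255;	/* pre-patch assumption */
	uint64_t new_perf = highest_perf_for(0x19, 0x21);

	printf("assumed max (255): %llu kHz\n",
	       (unsigned long long)(base_khz * old_perf / nominal_perf));
	printf("fixed max   (%llu): %llu kHz\n",
	       (unsigned long long)new_perf,
	       (unsigned long long)(base_khz * new_perf / nominal_perf));
	return 0;
}

With highest_perf hard-wired to 255 the computed ceiling lands around 7.2 GHz, matching the bogus lscpu output in the changelog; with 166 it drops back to a plausible boost clock.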
From 3486d2c9be652a31033363bdd50391b0c8a8fe21 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov
Date: Thu, 13 May 2021 09:32:46 +0200
Subject: clocksource/drivers/hyper-v: Re-enable VDSO_CLOCKMODE_HVCLOCK on X86

Mohammed reports (https://bugzilla.kernel.org/show_bug.cgi?id=213029)
that commit e4ab4658f1cf ("clocksource/drivers/hyper-v: Handle vDSO
differences inline") broke the vDSO on x86. The problem appears to be
that VDSO_CLOCKMODE_HVCLOCK is an enumerator of 'enum vdso_clock_mode',
not a preprocessor define, so the '#ifdef VDSO_CLOCKMODE_HVCLOCK' branch
evaluates to false.

Use a dedicated HAVE_VDSO_CLOCKMODE_HVCLOCK define instead.

Fixes: e4ab4658f1cf ("clocksource/drivers/hyper-v: Handle vDSO differences inline")
Reported-by: Mohammed Gamal
Suggested-by: Thomas Gleixner
Signed-off-by: Vitaly Kuznetsov
Signed-off-by: Thomas Gleixner
Reviewed-by: Michael Kelley
Link: https://lore.kernel.org/r/20210513073246.1715070-1-vkuznets@redhat.com
---
 arch/x86/include/asm/vdso/clocksource.h | 2 ++
 drivers/clocksource/hyperv_timer.c      | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/vdso/clocksource.h b/arch/x86/include/asm/vdso/clocksource.h
index 119ac8612d89..136e5e57cfe1 100644
--- a/arch/x86/include/asm/vdso/clocksource.h
+++ b/arch/x86/include/asm/vdso/clocksource.h
@@ -7,4 +7,6 @@
 	VDSO_CLOCKMODE_PVCLOCK,	\
 	VDSO_CLOCKMODE_HVCLOCK
 
+#define HAVE_VDSO_CLOCKMODE_HVCLOCK
+
 #endif /* __ASM_VDSO_CLOCKSOURCE_H */
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
index 977fd05ac35f..d6ece7bbce89 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -419,7 +419,7 @@ static void resume_hv_clock_tsc(struct clocksource *arg)
 	hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr);
 }
 
-#ifdef VDSO_CLOCKMODE_HVCLOCK
+#ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK
 static int hv_cs_enable(struct clocksource *cs)
 {
 	vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK);
@@ -435,7 +435,7 @@ static struct clocksource hyperv_cs_tsc = {
 	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
 	.suspend= suspend_hv_clock_tsc,
 	.resume	= resume_hv_clock_tsc,
-#ifdef VDSO_CLOCKMODE_HVCLOCK
+#ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK
 	.enable = hv_cs_enable,
 	.vdso_clock_mode = VDSO_CLOCKMODE_HVCLOCK,
 #else
-- cgit v1.2.3
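The underlying pitfall is generic C: enumerators are invisible to the preprocessor, so '#ifdef' on an enum value is always false and silently compiles the guarded code out. The tiny standalone program below (not kernel code; all names are made up) demonstrates the behaviour that the extra HAVE_VDSO_CLOCKMODE_HVCLOCK define works around.

#include <stdio.h>

enum example_mode {
	MODE_A,
	MODE_B,		/* analogous to VDSO_CLOCKMODE_HVCLOCK */
};

#define HAVE_MODE_B	/* analogous to HAVE_VDSO_CLOCKMODE_HVCLOCK */

int main(void)
{
#ifdef MODE_B
	puts("never printed: the preprocessor does not see enumerators");
#endif
#ifdef HAVE_MODE_B
	puts("printed: guarded by a real #define");
#endif
	return 0;
}

Only the second message is printed, which is why the patch keeps a companion #define next to the enum rather than testing the enumerator itself.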