From 62b1fa69f31969e11d03529d3a80599ddda2d043 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 15 Nov 2024 22:52:40 +1300 Subject: KVM: Export hardware virtualization enabling/disabling functions To support TDX, KVM will need to enable TDX during KVM module loading time. Enabling TDX requires enabling hardware virtualization first so that all online CPUs (and the new CPU going online) are in post-VMXON state. KVM by default enables hardware virtualization but that is done in kvm_init(), which must be the last step after all initialization is done thus is too late for enabling TDX. Export functions to enable/disable hardware virtualization so that TDX code can use them to handle hardware virtualization enabling before kvm_init(). Signed-off-by: Kai Huang Message-ID: Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f34f4cfaa513..1e75fa114f34 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2571,4 +2571,12 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range); #endif +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING +int kvm_enable_virtualization(void); +void kvm_disable_virtualization(void); +#else +static inline int kvm_enable_virtualization(void) { return 0; } +static inline void kvm_disable_virtualization(void) { } +#endif + #endif -- cgit v1.2.3 From fcdbdf63431c9faf639bffc957ea2ce9b545432e Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 15 Nov 2024 22:52:41 +1300 Subject: KVM: VMX: Initialize TDX during KVM module load Before KVM can use TDX to create and run TDX guests, TDX needs to be initialized from two perspectives: 1) TDX module must be initialized properly to a working state; 2) A per-cpu TDX initialization, a.k.a the TDH.SYS.LP.INIT SEAMCALL must be done on any logical cpu before it can run any other TDX SEAMCALLs. 
The TDX host core-kernel provides two functions to do the above two respectively: tdx_enable() and tdx_cpu_enable(). There are two options in terms of when to initialize TDX: initialize TDX at KVM module loading time, or when creating the first TDX guest. Choose to initialize TDX during KVM module loading time: Initializing the TDX module is both memory and CPU time consuming: 1) the kernel needs to allocate a non-trivial amount (~1/256) of system memory as metadata used by the TDX module to track each TDX-usable memory page's status; 2) the TDX module needs to initialize this metadata, one entry for each TDX-usable memory page. Also, the kernel uses alloc_contig_pages() to allocate those metadata chunks, because they are large and need to be physically contiguous. alloc_contig_pages() can fail. If TDX is initialized when creating the first TDX guest, then there's a chance that KVM won't be able to run any TDX guests even though KVM _declares_ that it is able to support TDX. This isn't good for the user. On the other hand, initializing TDX at KVM module loading time can make sure KVM is providing a consistent view of whether KVM can support TDX to the user. Always only try to initialize TDX after VMX has been initialized. TDX is based on VMX, and if VMX fails to initialize then TDX is likely to be broken anyway. Also, in practice, supporting TDX will require part of VMX and common x86 infrastructure in working order, so TDX cannot be enabled alone w/o VMX support. There are two cases that can result in failure to initialize TDX: 1) TDX cannot be supported (e.g., because TDX is not supported or enabled by hardware, or the module is not loaded, or some dependency is missing in KVM's configuration); 2) Any unexpected error during TDX bring-up. For the first case only mark TDX as disabled but still allow the KVM module to be loaded. For the second case just fail to load the KVM module so that the user can be aware. Because TDX costs additional memory, don't enable TDX by default. 
Add a new module parameter 'tdx' (backed by the 'enable_tdx' variable) to allow the user to opt in. Note, the name tdx_init() has already been taken by the early boot code. Use tdx_bringup() for initializing TDX (and tdx_cleanup() since KVM doesn't actually tear down TDX). They don't match vt_init()/vt_exit(), vmx_init()/vmx_exit() etc., but it's not the end of the world. Also, once initialized, the TDX module cannot be disabled and enabled again w/o the TDX module runtime update, which isn't supported by the kernel. After TDX is enabled, nothing needs to be done when KVM disables hardware virtualization, e.g., when offlining a CPU, or during suspend/resume. TDX host core-kernel code internally tracks TDX status and can handle the "multiple enabling" scenario. Similar to KVM_AMD_SEV, add a new KVM_INTEL_TDX Kconfig to guide KVM TDX code. Make it depend on INTEL_TDX_HOST but not replace INTEL_TDX_HOST because in the longer term there's a use case that requires making SEAMCALLs w/o KVM as mentioned by Dan [1]. Link: https://lore.kernel.org/6723fc2070a96_60c3294dc@dwillia2-mobl3.amr.corp.intel.com.notmuch/ [1] Signed-off-by: Kai Huang Message-ID: <162f9dee05c729203b9ad6688db1ca2960b4b502.1731664295.git.kai.huang@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/tdx.h | 3 + arch/x86/include/asm/tdx_global_metadata.h | 44 ++++++++ arch/x86/kvm/Kconfig | 10 ++ arch/x86/kvm/Makefile | 1 + arch/x86/kvm/vmx/main.c | 9 ++ arch/x86/kvm/vmx/tdx.c | 162 ++++++++++++++++++++++++++++ arch/x86/kvm/vmx/tdx.h | 13 +++ arch/x86/virt/vmx/tdx/tdx.c | 14 +++ arch/x86/virt/vmx/tdx/tdx.h | 1 - arch/x86/virt/vmx/tdx/tdx_global_metadata.h | 44 -------- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 3 +- 12 files changed, 259 insertions(+), 46 deletions(-) create mode 100644 arch/x86/include/asm/tdx_global_metadata.h create mode 100644 arch/x86/kvm/vmx/tdx.c create mode 100644 arch/x86/kvm/vmx/tdx.h delete mode 100644 arch/x86/virt/vmx/tdx/tdx_global_metadata.h (limited to 'include') diff --git 
a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 75a91869453d..1a8c687603aa 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -34,6 +34,7 @@ #ifndef __ASSEMBLY__ #include +#include #include /* @@ -121,6 +122,7 @@ static inline u64 sc_retry(sc_func_t func, u64 fn, int tdx_cpu_enable(void); int tdx_enable(void); const char *tdx_dump_mce_info(struct mce *m); +const struct tdx_sys_info *tdx_get_sysinfo(void); int tdx_guest_keyid_alloc(void); void tdx_guest_keyid_free(unsigned int keyid); @@ -179,6 +181,7 @@ static inline void tdx_init(void) { } static inline int tdx_cpu_enable(void) { return -ENODEV; } static inline int tdx_enable(void) { return -ENODEV; } static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; } +static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; } #endif /* CONFIG_INTEL_TDX_HOST */ #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/tdx_global_metadata.h b/arch/x86/include/asm/tdx_global_metadata.h new file mode 100644 index 000000000000..060a2ad744bf --- /dev/null +++ b/arch/x86/include/asm/tdx_global_metadata.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Automatically generated TDX global metadata structures. 
*/ +#ifndef _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H +#define _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H + +#include + +struct tdx_sys_info_features { + u64 tdx_features0; +}; + +struct tdx_sys_info_tdmr { + u16 max_tdmrs; + u16 max_reserved_per_tdmr; + u16 pamt_4k_entry_size; + u16 pamt_2m_entry_size; + u16 pamt_1g_entry_size; +}; + +struct tdx_sys_info_td_ctrl { + u16 tdr_base_size; + u16 tdcs_base_size; + u16 tdvps_base_size; +}; + +struct tdx_sys_info_td_conf { + u64 attributes_fixed0; + u64 attributes_fixed1; + u64 xfam_fixed0; + u64 xfam_fixed1; + u16 num_cpuid_config; + u16 max_vcpus_per_td; + u64 cpuid_config_leaves[128]; + u64 cpuid_config_values[128][2]; +}; + +struct tdx_sys_info { + struct tdx_sys_info_features features; + struct tdx_sys_info_tdmr tdmr; + struct tdx_sys_info_td_ctrl td_ctrl; + struct tdx_sys_info_td_conf td_conf; +}; + +#endif diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ea2c4f21c1ca..fe8cbee6f614 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -128,6 +128,16 @@ config X86_SGX_KVM If unsure, say N. +config KVM_INTEL_TDX + bool "Intel Trust Domain Extensions (TDX) support" + default y + depends on INTEL_TDX_HOST + help + Provides support for launching Intel Trust Domain Extensions (TDX) + confidential VMs on Intel processors. + + If unsure, say N. 
+ config KVM_AMD tristate "KVM for AMD processors support" depends on KVM && (CPU_SUP_AMD || CPU_SUP_HYGON) diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f9dddb8cb466..a5d362c7b504 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -20,6 +20,7 @@ kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o kvm-intel-$(CONFIG_KVM_HYPERV) += vmx/hyperv.o vmx/hyperv_evmcs.o +kvm-intel-$(CONFIG_KVM_INTEL_TDX) += vmx/tdx.o kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 54cf95cb8d42..97c453187cc1 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -6,6 +6,7 @@ #include "nested.h" #include "pmu.h" #include "posted_intr.h" +#include "tdx.h" #define VMX_REQUIRED_APICV_INHIBITS \ (BIT(APICV_INHIBIT_REASON_DISABLED) | \ @@ -172,6 +173,7 @@ struct kvm_x86_init_ops vt_init_ops __initdata = { static void __exit vt_exit(void) { kvm_exit(); + tdx_cleanup(); vmx_exit(); } module_exit(vt_exit); @@ -184,6 +186,11 @@ static int __init vt_init(void) if (r) return r; + /* tdx_init() has been taken */ + r = tdx_bringup(); + if (r) + goto err_tdx_bringup; + /* * Common KVM initialization _must_ come last, after this, /dev/kvm is * exposed to userspace! 
@@ -196,6 +203,8 @@ static int __init vt_init(void) return 0; err_kvm_init: + tdx_cleanup(); +err_tdx_bringup: vmx_exit(); return r; } diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c new file mode 100644 index 000000000000..3c089ed3b843 --- /dev/null +++ b/arch/x86/kvm/vmx/tdx.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include "capabilities.h" +#include "tdx.h" + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +static bool enable_tdx __ro_after_init; +module_param_named(tdx, enable_tdx, bool, 0444); + +static enum cpuhp_state tdx_cpuhp_state; + +static int tdx_online_cpu(unsigned int cpu) +{ + unsigned long flags; + int r; + + /* Sanity check CPU is already in post-VMXON */ + WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE)); + + local_irq_save(flags); + r = tdx_cpu_enable(); + local_irq_restore(flags); + + return r; +} + +static void __do_tdx_cleanup(void) +{ + /* + * Once TDX module is initialized, it cannot be disabled and + * re-initialized again w/o runtime update (which isn't + * supported by kernel). Only need to remove the cpuhp here. + * The TDX host core code tracks TDX status and can handle + * 'multiple enabling' scenario. + */ + WARN_ON_ONCE(!tdx_cpuhp_state); + cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state); + tdx_cpuhp_state = 0; +} + +static void __tdx_cleanup(void) +{ + cpus_read_lock(); + __do_tdx_cleanup(); + cpus_read_unlock(); +} + +static int __init __do_tdx_bringup(void) +{ + int r; + + /* + * TDX-specific cpuhp callback to call tdx_cpu_enable() on all + * online CPUs before calling tdx_enable(), and on any new + * going-online CPU to make sure it is ready for TDX guest. 
+ */ + r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN, + "kvm/cpu/tdx:online", + tdx_online_cpu, NULL); + if (r < 0) + return r; + + tdx_cpuhp_state = r; + + r = tdx_enable(); + if (r) + __do_tdx_cleanup(); + + return r; +} + +static int __init __tdx_bringup(void) +{ + int r; + + /* + * Enabling TDX requires enabling hardware virtualization first, + * as making SEAMCALLs requires CPU being in post-VMXON state. + */ + r = kvm_enable_virtualization(); + if (r) + return r; + + cpus_read_lock(); + r = __do_tdx_bringup(); + cpus_read_unlock(); + + if (r) + goto tdx_bringup_err; + + /* + * Leave hardware virtualization enabled after TDX is enabled + * successfully. TDX CPU hotplug depends on this. + */ + return 0; +tdx_bringup_err: + kvm_disable_virtualization(); + return r; +} + +void tdx_cleanup(void) +{ + if (enable_tdx) { + __tdx_cleanup(); + kvm_disable_virtualization(); + } +} + +int __init tdx_bringup(void) +{ + int r; + + if (!enable_tdx) + return 0; + + if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) { + pr_err("tdx: no TDX private KeyIDs available\n"); + goto success_disable_tdx; + } + + if (!enable_virt_at_load) { + pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n"); + goto success_disable_tdx; + } + + /* + * Ideally KVM should probe whether TDX module has been loaded + * first and then try to bring it up. But TDX needs to use SEAMCALL + * to probe whether the module is loaded (there is no CPUID or MSR + * for that), and making SEAMCALL requires enabling virtualization + * first, just like the rest steps of bringing up TDX module. + * + * So, for simplicity do everything in __tdx_bringup(); the first + * SEAMCALL will return -ENODEV when the module is not loaded. The + * only complication is having to make sure that initialization + * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other + * cases. + */ + r = __tdx_bringup(); + if (r) { + /* + * Disable TDX only but don't fail to load module if + * the TDX module could not be loaded. 
No need to print + * message saying "module is not loaded" because it was + * printed when the first SEAMCALL failed. + */ + if (r == -ENODEV) + goto success_disable_tdx; + + enable_tdx = 0; + } + + return r; + +success_disable_tdx: + enable_tdx = 0; + return 0; +} diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h new file mode 100644 index 000000000000..9d4a0e8265bf --- /dev/null +++ b/arch/x86/kvm/vmx/tdx.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_TDX_H +#define __KVM_X86_VMX_TDX_H + +#ifdef CONFIG_KVM_INTEL_TDX +int tdx_bringup(void); +void tdx_cleanup(void); +#else +static inline int tdx_bringup(void) { return 0; } +static inline void tdx_cleanup(void) {} +#endif + +#endif diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 0122051af6b3..9f0c482c1a03 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -1462,6 +1462,20 @@ void __init tdx_init(void) check_tdx_erratum(); } +const struct tdx_sys_info *tdx_get_sysinfo(void) +{ + const struct tdx_sys_info *p = NULL; + + /* Make sure all fields in @tdx_sysinfo have been populated */ + mutex_lock(&tdx_module_lock); + if (tdx_module_status == TDX_MODULE_INITIALIZED) + p = (const struct tdx_sys_info *)&tdx_sysinfo; + mutex_unlock(&tdx_module_lock); + + return p; +} +EXPORT_SYMBOL_GPL(tdx_get_sysinfo); + int tdx_guest_keyid_alloc(void) { return ida_alloc_range(&tdx_guest_keyid_pool, tdx_guest_keyid_start, diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index 62cb7832c42d..da384387d4eb 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -3,7 +3,6 @@ #define _X86_VIRT_TDX_H #include -#include "tdx_global_metadata.h" /* * This file contains both macros and data structures defined by the TDX diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.h b/arch/x86/virt/vmx/tdx/tdx_global_metadata.h deleted file mode 100644 index 060a2ad744bf..000000000000 --- 
a/arch/x86/virt/vmx/tdx/tdx_global_metadata.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Automatically generated TDX global metadata structures. */ -#ifndef _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H -#define _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H - -#include - -struct tdx_sys_info_features { - u64 tdx_features0; -}; - -struct tdx_sys_info_tdmr { - u16 max_tdmrs; - u16 max_reserved_per_tdmr; - u16 pamt_4k_entry_size; - u16 pamt_2m_entry_size; - u16 pamt_1g_entry_size; -}; - -struct tdx_sys_info_td_ctrl { - u16 tdr_base_size; - u16 tdcs_base_size; - u16 tdvps_base_size; -}; - -struct tdx_sys_info_td_conf { - u64 attributes_fixed0; - u64 attributes_fixed1; - u64 xfam_fixed0; - u64 xfam_fixed1; - u16 num_cpuid_config; - u16 max_vcpus_per_td; - u64 cpuid_config_leaves[128]; - u64 cpuid_config_values[128][2]; -}; - -struct tdx_sys_info { - struct tdx_sys_info_features features; - struct tdx_sys_info_tdmr tdmr; - struct tdx_sys_info_td_ctrl td_ctrl; - struct tdx_sys_info_td_conf td_conf; -}; - -#endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1e75fa114f34..3bfe3140f444 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2284,6 +2284,7 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) } #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING +extern bool enable_virt_at_load; extern bool kvm_rebooting; #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6e40383fbe47..622b5a99078a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5464,8 +5464,9 @@ static struct miscdevice kvm_dev = { }; #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING -static bool enable_virt_at_load = true; +bool enable_virt_at_load = true; module_param(enable_virt_at_load, bool, 0444); +EXPORT_SYMBOL_GPL(enable_virt_at_load); __visible bool kvm_rebooting; EXPORT_SYMBOL_GPL(kvm_rebooting); -- cgit v1.2.3 From 7c035bea94074b19ed560a4f23a76c5a6c8e594f Mon Sep 17 00:00:00 
2001 From: Zhiming Hu Date: Wed, 19 Feb 2025 09:02:51 -0500 Subject: KVM: TDX: Register TDX host key IDs to cgroup misc controller TDX host key IDs (HKID) are limited resources in a machine, and the misc cgroup lets the machine owner track their usage and limits the possibility of abusing them outside the owner's control. The cgroup v2 miscellaneous subsystem was introduced to control the resource of AMD SEV & SEV-ES ASIDs. Likewise introduce HKIDs as a misc resource. Signed-off-by: Zhiming Hu Signed-off-by: Isaku Yamahata Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/tdx.h | 2 ++ arch/x86/kvm/vmx/tdx.c | 14 ++++++++++++++ arch/x86/kvm/vmx/tdx.h | 1 + arch/x86/virt/vmx/tdx/tdx.c | 6 ++++++ include/linux/misc_cgroup.h | 4 ++++ kernel/cgroup/misc.c | 4 ++++ 6 files changed, 31 insertions(+) (limited to 'include') diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 1a8c687603aa..2879fc518a32 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -125,6 +125,7 @@ const char *tdx_dump_mce_info(struct mce *m); const struct tdx_sys_info *tdx_get_sysinfo(void); int tdx_guest_keyid_alloc(void); +u32 tdx_get_nr_guest_keyids(void); void tdx_guest_keyid_free(unsigned int keyid); struct tdx_td { @@ -180,6 +181,7 @@ u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td); static inline void tdx_init(void) { } static inline int tdx_cpu_enable(void) { return -ENODEV; } static inline int tdx_enable(void) { return -ENODEV; } +static inline u32 tdx_get_nr_guest_keyids(void) { return 0; } static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; } static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; } #endif /* CONFIG_INTEL_TDX_HOST */ diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 2ac925ecccd5..01166cb8f2e6 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include 
"capabilities.h" #include "mmu.h" @@ -140,6 +141,9 @@ static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx) tdx_guest_keyid_free(kvm_tdx->hkid); kvm_tdx->hkid = -1; atomic_dec(&nr_configured_hkid); + misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); + put_misc_cg(kvm_tdx->misc_cg); + kvm_tdx->misc_cg = NULL; } static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx) @@ -675,6 +679,10 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, if (ret < 0) return ret; kvm_tdx->hkid = ret; + kvm_tdx->misc_cg = get_current_misc_cg(); + ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); + if (ret) + goto free_hkid; ret = -ENOMEM; @@ -1459,6 +1467,11 @@ static int __init __tdx_bringup(void) goto get_sysinfo_err; } + if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) { + r = -EINVAL; + goto get_sysinfo_err; + } + /* * Leave hardware virtualization enabled after TDX is enabled * successfully. TDX CPU hotplug depends on this. @@ -1475,6 +1488,7 @@ tdx_bringup_err: void tdx_cleanup(void) { if (enable_tdx) { + misc_cg_set_capacity(MISC_CG_RES_TDX, 0); __tdx_cleanup(); kvm_disable_virtualization(); } diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h index 6ec7ac1d91e3..0559126c8f9d 100644 --- a/arch/x86/kvm/vmx/tdx.h +++ b/arch/x86/kvm/vmx/tdx.h @@ -21,6 +21,7 @@ enum kvm_tdx_state { struct kvm_tdx { struct kvm kvm; + struct misc_cg *misc_cg; int hkid; enum kvm_tdx_state state; diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 9f0c482c1a03..3a272e9ff2ca 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -1476,6 +1476,12 @@ const struct tdx_sys_info *tdx_get_sysinfo(void) } EXPORT_SYMBOL_GPL(tdx_get_sysinfo); +u32 tdx_get_nr_guest_keyids(void) +{ + return tdx_nr_guest_keyids; +} +EXPORT_SYMBOL_GPL(tdx_get_nr_guest_keyids); + int tdx_guest_keyid_alloc(void) { return ida_alloc_range(&tdx_guest_keyid_pool, tdx_guest_keyid_start, diff --git 
a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h index 49eef10c8e59..8c0e4f4d71be 100644 --- a/include/linux/misc_cgroup.h +++ b/include/linux/misc_cgroup.h @@ -17,6 +17,10 @@ enum misc_res_type { MISC_CG_RES_SEV, /** @MISC_CG_RES_SEV_ES: AMD SEV-ES ASIDs resource */ MISC_CG_RES_SEV_ES, +#endif +#ifdef CONFIG_INTEL_TDX_HOST + /** @MISC_CG_RES_TDX: Intel TDX HKIDs resource */ + MISC_CG_RES_TDX, #endif /** @MISC_CG_RES_TYPES: count of enum misc_res_type constants */ MISC_CG_RES_TYPES diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index 0e26068995a6..264aad22c967 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -24,6 +24,10 @@ static const char *const misc_res_name[] = { /* AMD SEV-ES ASIDs resource */ "sev_es", #endif +#ifdef CONFIG_INTEL_TDX_HOST + /* Intel TDX HKIDs resource */ + "tdx", +#endif }; /* Root misc cgroup */ -- cgit v1.2.3 From c4a92f12cf35b83ce81757f6e5e8eb6223b87388 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Mon, 13 Jan 2025 11:08:41 +0800 Subject: KVM: Add parameter "kvm" to kvm_cpu_dirty_log_size() and its callers Add a parameter "kvm" to kvm_cpu_dirty_log_size() and down to its callers: kvm_dirty_ring_get_rsvd_entries(), kvm_dirty_ring_alloc(). This is a preparation to make cpu_dirty_log_size a per-VM value rather than a system-wide value. No functional changes expected. 
Signed-off-by: Yan Zhao Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- include/linux/kvm_dirty_ring.h | 11 ++++++----- virt/kvm/dirty_ring.c | 11 ++++++----- virt/kvm/kvm_main.c | 4 ++-- 4 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3c208291154c..a0c736c11091 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1311,7 +1311,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -int kvm_cpu_dirty_log_size(void) +int kvm_cpu_dirty_log_size(struct kvm *kvm) { return kvm_x86_ops.cpu_dirty_log_size; } diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h index 4862c98d80d3..da4d9b5f58f1 100644 --- a/include/linux/kvm_dirty_ring.h +++ b/include/linux/kvm_dirty_ring.h @@ -32,7 +32,7 @@ struct kvm_dirty_ring { * If CONFIG_HAVE_HVM_DIRTY_RING not defined, kvm_dirty_ring.o should * not be included as well, so define these nop functions for the arch. 
*/ -static inline u32 kvm_dirty_ring_get_rsvd_entries(void) +static inline u32 kvm_dirty_ring_get_rsvd_entries(struct kvm *kvm) { return 0; } @@ -42,7 +42,7 @@ static inline bool kvm_use_dirty_bitmap(struct kvm *kvm) return true; } -static inline int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, +static inline int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *ring, int index, u32 size) { return 0; @@ -71,11 +71,12 @@ static inline void kvm_dirty_ring_free(struct kvm_dirty_ring *ring) #else /* CONFIG_HAVE_KVM_DIRTY_RING */ -int kvm_cpu_dirty_log_size(void); +int kvm_cpu_dirty_log_size(struct kvm *kvm); bool kvm_use_dirty_bitmap(struct kvm *kvm); bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm); -u32 kvm_dirty_ring_get_rsvd_entries(void); -int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size); +u32 kvm_dirty_ring_get_rsvd_entries(struct kvm *kvm); +int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *ring, + int index, u32 size); /* * called with kvm->slots_lock held, returns the number of diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c index 7bc74969a819..d14ffc7513ee 100644 --- a/virt/kvm/dirty_ring.c +++ b/virt/kvm/dirty_ring.c @@ -11,14 +11,14 @@ #include #include "kvm_mm.h" -int __weak kvm_cpu_dirty_log_size(void) +int __weak kvm_cpu_dirty_log_size(struct kvm *kvm) { return 0; } -u32 kvm_dirty_ring_get_rsvd_entries(void) +u32 kvm_dirty_ring_get_rsvd_entries(struct kvm *kvm) { - return KVM_DIRTY_RING_RSVD_ENTRIES + kvm_cpu_dirty_log_size(); + return KVM_DIRTY_RING_RSVD_ENTRIES + kvm_cpu_dirty_log_size(kvm); } bool kvm_use_dirty_bitmap(struct kvm *kvm) @@ -74,14 +74,15 @@ static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask) KVM_MMU_UNLOCK(kvm); } -int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size) +int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *ring, + int index, u32 size) { ring->dirty_gfns = vzalloc(size); if 
(!ring->dirty_gfns) return -ENOMEM; ring->size = size / sizeof(struct kvm_dirty_gfn); - ring->soft_limit = ring->size - kvm_dirty_ring_get_rsvd_entries(); + ring->soft_limit = ring->size - kvm_dirty_ring_get_rsvd_entries(kvm); ring->dirty_index = 0; ring->reset_index = 0; ring->index = index; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 622b5a99078a..549537da3062 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4108,7 +4108,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id) goto vcpu_free_run_page; if (kvm->dirty_ring_size) { - r = kvm_dirty_ring_alloc(&vcpu->dirty_ring, + r = kvm_dirty_ring_alloc(kvm, &vcpu->dirty_ring, id, kvm->dirty_ring_size); if (r) goto arch_vcpu_destroy; @@ -4847,7 +4847,7 @@ static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size) return -EINVAL; /* Should be bigger to keep the reserved entries, or a page */ - if (size < kvm_dirty_ring_get_rsvd_entries() * + if (size < kvm_dirty_ring_get_rsvd_entries(kvm) * sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE) return -EINVAL; -- cgit v1.2.3 From 44428e4936022a7a31743017849b167e64f33a32 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Sat, 22 Feb 2025 09:42:18 +0800 Subject: KVM: x86: Move pv_unhalted check out of kvm_vcpu_has_events() Move pv_unhalted check out of kvm_vcpu_has_events(), check pv_unhalted explicitly when handling PV unhalt and expose kvm_vcpu_has_events(). kvm_vcpu_has_events() returns true if pv_unhalted is set, and pv_unhalted is only cleared on transitions to KVM_MP_STATE_RUNNABLE. If the guest initiates a spurious wakeup, pv_unhalted could be left set in perpetuity. Currently, this is not problematic because kvm_vcpu_has_events() is only called when handling PV unhalt. However, if kvm_vcpu_has_events() is used for other purposes in the future, it could return the unexpected results. Export kvm_vcpu_has_events() for its usage in broader contexts. 
Suggested-by: Sean Christopherson Signed-off-by: Binbin Wu Message-ID: <20250222014225.897298-3-binbin.wu@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 11 +++++------ include/linux/kvm_host.h | 1 + 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a792207a0dd1..3cae210ffaa4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11143,7 +11143,7 @@ static bool kvm_vcpu_running(struct kvm_vcpu *vcpu) !vcpu->arch.apf.halted); } -static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) +bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) { if (!list_empty_careful(&vcpu->async_pf.done)) return true; @@ -11152,9 +11152,6 @@ static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) kvm_apic_init_sipi_allowed(vcpu)) return true; - if (vcpu->arch.pv.pv_unhalted) - return true; - if (kvm_is_exception_pending(vcpu)) return true; @@ -11192,10 +11189,12 @@ static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) return false; } +EXPORT_SYMBOL_GPL(kvm_vcpu_has_events); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); + return kvm_vcpu_running(vcpu) || vcpu->arch.pv.pv_unhalted || + kvm_vcpu_has_events(vcpu); } /* Called within kvm->srcu read side. 
*/ @@ -11331,7 +11330,7 @@ static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) */ ++vcpu->stat.halt_exits; if (lapic_in_kernel(vcpu)) { - if (kvm_vcpu_has_events(vcpu)) + if (kvm_vcpu_has_events(vcpu) || vcpu->arch.pv.pv_unhalted) vcpu->arch.pv.pv_unhalted = false; else vcpu->arch.mp_state = state; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3bfe3140f444..ed1968f6f841 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1609,6 +1609,7 @@ void kvm_arch_disable_virtualization(void); int kvm_arch_enable_virtualization_cpu(void); void kvm_arch_disable_virtualization_cpu(void); #endif +bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From 79462faa2b2aa89db029af5e61df11b5bb6ef4e3 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Sat, 22 Feb 2025 09:42:23 +0800 Subject: KVM: TDX: Handle TDG.VP.VMCALL<ReportFatalError> Convert TDG.VP.VMCALL<ReportFatalError> to KVM_EXIT_SYSTEM_EVENT with a new type KVM_SYSTEM_EVENT_TDX_FATAL and forward it to userspace for handling. TD guest can use TDG.VP.VMCALL<ReportFatalError> to report the fatal error it has experienced. This hypercall is special because TD guest is requesting a termination with the error information, so KVM needs to forward the hypercall to userspace anyway. KVM doesn't do parsing or conversion, it just dumps the 16 general-purpose registers to userspace and lets userspace decide what to do. 
Signed-off-by: Binbin Wu Message-ID: <20250222014225.897298-8-binbin.wu@linux.intel.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 6 ++++++ arch/x86/kvm/vmx/tdx.c | 28 ++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 1 + 3 files changed, 35 insertions(+) (limited to 'include') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 2b52eb77e29c..8dd3151fb9e4 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6823,6 +6823,7 @@ should put the acknowledged interrupt vector into the 'epr' field. #define KVM_SYSTEM_EVENT_WAKEUP 4 #define KVM_SYSTEM_EVENT_SUSPEND 5 #define KVM_SYSTEM_EVENT_SEV_TERM 6 + #define KVM_SYSTEM_EVENT_TDX_FATAL 7 __u32 type; __u32 ndata; __u64 data[16]; @@ -6849,6 +6850,11 @@ Valid values for 'type' are: reset/shutdown of the VM. - KVM_SYSTEM_EVENT_SEV_TERM -- an AMD SEV guest requested termination. The guest physical address of the guest's GHCB is stored in `data[0]`. + - KVM_SYSTEM_EVENT_TDX_FATAL -- a TDX guest reported a fatal error state. + KVM doesn't do any parsing or conversion, it just dumps 16 general-purpose + registers to userspace, in ascending order of the 4-bit indices for x86-64 + general-purpose registers in instruction encoding, as defined in the Intel + SDM. - KVM_SYSTEM_EVENT_WAKEUP -- the exiting vCPU is in a suspended state and KVM has recognized a wakeup event. Userspace may honor this event by marking the exiting vCPU as runnable, or deny it and call KVM_RUN again. 
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index d7a6c41737ce..5f6c29d84180 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1104,11 +1104,39 @@ error: return 1; } +static int tdx_report_fatal_error(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u64 *regs = vcpu->run->system_event.data; + u64 *module_regs = &tdx->vp_enter_args.r8; + int index = VCPU_REGS_RAX; + + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL; + vcpu->run->system_event.ndata = 16; + + /* Dump 16 general-purpose registers to userspace in ascending order. */ + regs[index++] = tdx->vp_enter_ret; + regs[index++] = tdx->vp_enter_args.rcx; + regs[index++] = tdx->vp_enter_args.rdx; + regs[index++] = tdx->vp_enter_args.rbx; + regs[index++] = 0; + regs[index++] = 0; + regs[index++] = tdx->vp_enter_args.rsi; + regs[index] = tdx->vp_enter_args.rdi; + for (index = 0; index < 8; index++) + regs[VCPU_REGS_R8 + index] = module_regs[index]; + + return 0; +} + static int handle_tdvmcall(struct kvm_vcpu *vcpu) { switch (tdvmcall_leaf(vcpu)) { case TDVMCALL_MAP_GPA: return tdx_map_gpa(vcpu); + case TDVMCALL_REPORT_FATAL_ERROR: + return tdx_report_fatal_error(vcpu); default: break; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 45e6d8fca9b9..937400350317 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -375,6 +375,7 @@ struct kvm_run { #define KVM_SYSTEM_EVENT_WAKEUP 4 #define KVM_SYSTEM_EVENT_SUSPEND 5 #define KVM_SYSTEM_EVENT_SEV_TERM 6 +#define KVM_SYSTEM_EVENT_TDX_FATAL 7 __u32 type; __u32 ndata; union { -- cgit v1.2.3