diff options
Diffstat (limited to 'arch/x86/coco')
-rw-r--r-- | arch/x86/coco/core.c | 4 | ||||
-rw-r--r-- | arch/x86/coco/sev/Makefile | 7 | ||||
-rw-r--r-- | arch/x86/coco/sev/core.c | 932 | ||||
-rw-r--r-- | arch/x86/coco/sev/shared.c | 72 | ||||
-rw-r--r-- | arch/x86/coco/tdx/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/coco/tdx/debug.c | 69 | ||||
-rw-r--r-- | arch/x86/coco/tdx/tdx.c | 82 |
7 files changed, 1035 insertions, 133 deletions
diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index 0f81f70aca82..9a0ddda3aa69 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -65,7 +65,6 @@ static __maybe_unused __always_inline bool amd_cc_platform_vtom(enum cc_attr att * up under SME the trampoline area cannot be encrypted, whereas under SEV * the trampoline area must be encrypted. */ - static bool noinstr amd_cc_platform_has(enum cc_attr attr) { #ifdef CONFIG_AMD_MEM_ENCRYPT @@ -97,6 +96,9 @@ static bool noinstr amd_cc_platform_has(enum cc_attr attr) case CC_ATTR_GUEST_SEV_SNP: return sev_status & MSR_AMD64_SEV_SNP_ENABLED; + case CC_ATTR_GUEST_SNP_SECURE_TSC: + return sev_status & MSR_AMD64_SNP_SECURE_TSC; + case CC_ATTR_HOST_SEV_SNP: return cc_flags.host_sev_snp; diff --git a/arch/x86/coco/sev/Makefile b/arch/x86/coco/sev/Makefile index 4e375e7305ac..dcb06dc8b5ae 100644 --- a/arch/x86/coco/sev/Makefile +++ b/arch/x86/coco/sev/Makefile @@ -2,6 +2,10 @@ obj-y += core.o +# jump tables are emitted using absolute references in non-PIC code +# so they cannot be used in the early SEV startup code +CFLAGS_core.o += -fno-jump-tables + ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_core.o = -pg endif @@ -13,3 +17,6 @@ KCOV_INSTRUMENT_core.o := n # With some compiler versions the generated code results in boot hangs, caused # by several compilation units. To be safe, disable all instrumentation. KCSAN_SANITIZE := n + +# Clang 14 and older may fail to respect __no_sanitize_undefined when inlining +UBSAN_SANITIZE := n diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index c5b0148b8c0a..e2ee6bb3008f 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -25,6 +25,7 @@ #include <linux/psp-sev.h> #include <linux/dmi.h> #include <uapi/linux/sev-guest.h> +#include <crypto/gcm.h> #include <asm/init.h> #include <asm/cpu_entry_area.h> @@ -95,6 +96,15 @@ static u64 sev_hv_features __ro_after_init; /* Secrets page physical address from the CC blob */ static u64 secrets_pa __ro_after_init; +/* + * For Secure TSC guests, the BSP fetches TSC_INFO using SNP guest messaging and + * initializes snp_tsc_scale and snp_tsc_offset. These values are replicated + * across the APs VMSA fields (TSC_SCALE and TSC_OFFSET). + */ +static u64 snp_tsc_scale __ro_after_init; +static u64 snp_tsc_offset __ro_after_init; +static unsigned long snp_tsc_freq_khz __ro_after_init; + /* #VC handler runtime per-CPU data */ struct sev_es_runtime_data { struct ghcb ghcb_page; @@ -777,15 +787,10 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr, val = sev_es_rd_ghcb_msr(); - if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP, - "Wrong PSC response code: 0x%x\n", - (unsigned int)GHCB_RESP_CODE(val))) + if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) goto e_term; - if (WARN(GHCB_MSR_PSC_RESP_VAL(val), - "Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n", - op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared", - paddr, GHCB_MSR_PSC_RESP_VAL(val))) + if (GHCB_MSR_PSC_RESP_VAL(val)) goto e_term; /* Page validation must be performed after changing to private */ @@ -821,7 +826,7 @@ void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long padd early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE); } -void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, +void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned long npages) { /* @@ -954,6 +959,102 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end) set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); } +static int vmgexit_ap_control(u64 event, struct sev_es_save_area *vmsa, u32 apic_id) +{ + bool create = event != SVM_VMGEXIT_AP_DESTROY; + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + int ret = 0; + + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + + if (create) + ghcb_set_rax(ghcb, vmsa->sev_features); + + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION); + ghcb_set_sw_exit_info_1(ghcb, + ((u64)apic_id << 32) | + ((u64)snp_vmpl << 16) | + event); + ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa)); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + if (!ghcb_sw_exit_info_1_is_valid(ghcb) || + lower_32_bits(ghcb->save.sw_exit_info_1)) { + pr_err("SNP AP %s error\n", (create ? "CREATE" : "DESTROY")); + ret = -EINVAL; + } + + __sev_put_ghcb(&state); + + local_irq_restore(flags); + + return ret; +} + +static int snp_set_vmsa(void *va, void *caa, int apic_id, bool make_vmsa) +{ + int ret; + + if (snp_vmpl) { + struct svsm_call call = {}; + unsigned long flags; + + local_irq_save(flags); + + call.caa = this_cpu_read(svsm_caa); + call.rcx = __pa(va); + + if (make_vmsa) { + /* Protocol 0, Call ID 2 */ + call.rax = SVSM_CORE_CALL(SVSM_CORE_CREATE_VCPU); + call.rdx = __pa(caa); + call.r8 = apic_id; + } else { + /* Protocol 0, Call ID 3 */ + call.rax = SVSM_CORE_CALL(SVSM_CORE_DELETE_VCPU); + } + + ret = svsm_perform_call_protocol(&call); + + local_irq_restore(flags); + } else { + /* + * If the kernel runs at VMPL0, it can change the VMSA + * bit for a page using the RMPADJUST instruction. + * However, for the instruction to succeed it must + * target the permissions of a lesser privileged (higher + * numbered) VMPL level, so use VMPL1. + */ + u64 attrs = 1; + + if (make_vmsa) + attrs |= RMPADJUST_VMSA_PAGE_BIT; + + ret = rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs); + } + + return ret; +} + +static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa, int apic_id) +{ + int err; + + err = snp_set_vmsa(vmsa, NULL, apic_id, false); + if (err) + pr_err("clear VMSA page failed (%u), leaking page\n", err); + else + free_page((unsigned long)vmsa); +} + static void set_pte_enc(pte_t *kpte, int level, void *va) { struct pte_enc_desc d = { @@ -1000,7 +1101,8 @@ static void unshare_all_memory(void) data = per_cpu(runtime_data, cpu); ghcb = (unsigned long)&data->ghcb_page; - if (addr <= ghcb && ghcb <= addr + size) { + /* Handle the case of a huge page containing the GHCB page */ + if (addr <= ghcb && ghcb < addr + size) { skipped_addr = true; break; } @@ -1050,11 +1152,70 @@ void snp_kexec_begin(void) pr_warn("Failed to stop shared<->private conversions\n"); } +/* + * Shutdown all APs except the one handling kexec/kdump and clearing + * the VMSA tag on AP's VMSA pages as they are not being used as + * VMSA page anymore. + */ +static void shutdown_all_aps(void) +{ + struct sev_es_save_area *vmsa; + int apic_id, this_cpu, cpu; + + this_cpu = get_cpu(); + + /* + * APs are already in HLT loop when enc_kexec_finish() callback + * is invoked. + */ + for_each_present_cpu(cpu) { + vmsa = per_cpu(sev_vmsa, cpu); + + /* + * The BSP or offlined APs do not have guest allocated VMSA + * and there is no need to clear the VMSA tag for this page. + */ + if (!vmsa) + continue; + + /* + * Cannot clear the VMSA tag for the currently running vCPU. + */ + if (this_cpu == cpu) { + unsigned long pa; + struct page *p; + + pa = __pa(vmsa); + /* + * Mark the VMSA page of the running vCPU as offline + * so that is excluded and not touched by makedumpfile + * while generating vmcore during kdump. + */ + p = pfn_to_online_page(pa >> PAGE_SHIFT); + if (p) + __SetPageOffline(p); + continue; + } + + apic_id = cpuid_to_apicid[cpu]; + + /* + * Issue AP destroy to ensure AP gets kicked out of guest mode + * to allow using RMPADJUST to remove the VMSA tag on it's + * VMSA page. + */ + vmgexit_ap_control(SVM_VMGEXIT_AP_DESTROY, vmsa, apic_id); + snp_cleanup_vmsa(vmsa, apic_id); + } + + put_cpu(); +} + void snp_kexec_finish(void) { struct sev_es_runtime_data *data; + unsigned long size, addr; unsigned int level, cpu; - unsigned long size; struct ghcb *ghcb; pte_t *pte; @@ -1064,6 +1225,8 @@ void snp_kexec_finish(void) if (!IS_ENABLED(CONFIG_KEXEC_CORE)) return; + shutdown_all_aps(); + unshare_all_memory(); /* @@ -1080,54 +1243,11 @@ void snp_kexec_finish(void) ghcb = &data->ghcb_page; pte = lookup_address((unsigned long)ghcb, &level); size = page_level_size(level); - set_pte_enc(pte, level, (void *)ghcb); - snp_set_memory_private((unsigned long)ghcb, (size / PAGE_SIZE)); - } -} - -static int snp_set_vmsa(void *va, void *caa, int apic_id, bool make_vmsa) -{ - int ret; - - if (snp_vmpl) { - struct svsm_call call = {}; - unsigned long flags; - - local_irq_save(flags); - - call.caa = this_cpu_read(svsm_caa); - call.rcx = __pa(va); - - if (make_vmsa) { - /* Protocol 0, Call ID 2 */ - call.rax = SVSM_CORE_CALL(SVSM_CORE_CREATE_VCPU); - call.rdx = __pa(caa); - call.r8 = apic_id; - } else { - /* Protocol 0, Call ID 3 */ - call.rax = SVSM_CORE_CALL(SVSM_CORE_DELETE_VCPU); - } - - ret = svsm_perform_call_protocol(&call); - - local_irq_restore(flags); - } else { - /* - * If the kernel runs at VMPL0, it can change the VMSA - * bit for a page using the RMPADJUST instruction. - * However, for the instruction to succeed it must - * target the permissions of a lesser privileged (higher - * numbered) VMPL level, so use VMPL1. - */ - u64 attrs = 1; - - if (make_vmsa) - attrs |= RMPADJUST_VMSA_PAGE_BIT; - - ret = rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs); + /* Handle the case of a huge page containing the GHCB page */ + addr = (unsigned long)ghcb & page_level_mask(level); + set_pte_enc(pte, level, (void *)addr); + snp_set_memory_private(addr, (size / PAGE_SIZE)); } - - return ret; } #define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK) @@ -1161,24 +1281,10 @@ static void *snp_alloc_vmsa_page(int cpu) return page_address(p + 1); } -static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa, int apic_id) -{ - int err; - - err = snp_set_vmsa(vmsa, NULL, apic_id, false); - if (err) - pr_err("clear VMSA page failed (%u), leaking page\n", err); - else - free_page((unsigned long)vmsa); -} - static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip) { struct sev_es_save_area *cur_vmsa, *vmsa; - struct ghcb_state state; struct svsm_ca *caa; - unsigned long flags; - struct ghcb *ghcb; u8 sipi_vector; int cpu, ret; u64 cr4; @@ -1276,6 +1382,12 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip) vmsa->vmpl = snp_vmpl; vmsa->sev_features = sev_status >> 2; + /* Populate AP's TSC scale/offset to get accurate TSC values. */ + if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC)) { + vmsa->tsc_scale = snp_tsc_scale; + vmsa->tsc_offset = snp_tsc_offset; + } + /* Switch the page over to a VMSA page now that it is initialized */ ret = snp_set_vmsa(vmsa, caa, apic_id, true); if (ret) { @@ -1286,33 +1398,7 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip) } /* Issue VMGEXIT AP Creation NAE event */ - local_irq_save(flags); - - ghcb = __sev_get_ghcb(&state); - - vc_ghcb_invalidate(ghcb); - ghcb_set_rax(ghcb, vmsa->sev_features); - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION); - ghcb_set_sw_exit_info_1(ghcb, - ((u64)apic_id << 32) | - ((u64)snp_vmpl << 16) | - SVM_VMGEXIT_AP_CREATE); - ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa)); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); - - if (!ghcb_sw_exit_info_1_is_valid(ghcb) || - lower_32_bits(ghcb->save.sw_exit_info_1)) { - pr_err("SNP AP Creation error\n"); - ret = -EINVAL; - } - - __sev_put_ghcb(&state); - - local_irq_restore(flags); - - /* Perform cleanup if there was an error */ + ret = vmgexit_ap_control(SVM_VMGEXIT_AP_CREATE, vmsa, apic_id); if (ret) { snp_cleanup_vmsa(vmsa, apic_id); vmsa = NULL; @@ -1418,6 +1504,41 @@ static enum es_result __vc_handle_msr_caa(struct pt_regs *regs, bool write) return ES_OK; } +/* + * TSC related accesses should not exit to the hypervisor when a guest is + * executing with Secure TSC enabled, so special handling is required for + * accesses of MSR_IA32_TSC and MSR_AMD64_GUEST_TSC_FREQ. + */ +static enum es_result __vc_handle_secure_tsc_msrs(struct pt_regs *regs, bool write) +{ + u64 tsc; + + /* + * GUEST_TSC_FREQ should not be intercepted when Secure TSC is enabled. + * Terminate the SNP guest when the interception is enabled. + */ + if (regs->cx == MSR_AMD64_GUEST_TSC_FREQ) + return ES_VMM_ERROR; + + /* + * Writes: Writing to MSR_IA32_TSC can cause subsequent reads of the TSC + * to return undefined values, so ignore all writes. + * + * Reads: Reads of MSR_IA32_TSC should return the current TSC value, use + * the value returned by rdtsc_ordered(). + */ + if (write) { + WARN_ONCE(1, "TSC MSR writes are verboten!\n"); + return ES_OK; + } + + tsc = rdtsc_ordered(); + regs->ax = lower_32_bits(tsc); + regs->dx = upper_32_bits(tsc); + + return ES_OK; +} + static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { struct pt_regs *regs = ctxt->regs; @@ -1427,8 +1548,17 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) /* Is it a WRMSR? */ write = ctxt->insn.opcode.bytes[1] == 0x30; - if (regs->cx == MSR_SVSM_CAA) + switch (regs->cx) { + case MSR_SVSM_CAA: return __vc_handle_msr_caa(regs, write); + case MSR_IA32_TSC: + case MSR_AMD64_GUEST_TSC_FREQ: + if (sev_status & MSR_AMD64_SNP_SECURE_TSC) + return __vc_handle_secure_tsc_msrs(regs, write); + break; + default: + break; + } ghcb_set_rcx(ghcb, regs->cx); if (write) { @@ -1572,9 +1702,7 @@ static void __init alloc_runtime_data(int cpu) struct svsm_ca *caa; /* Allocate the SVSM CA page if an SVSM is present */ - caa = memblock_alloc(sizeof(*caa), PAGE_SIZE); - if (!caa) - panic("Can't allocate SVSM CA page\n"); + caa = memblock_alloc_or_panic(sizeof(*caa), PAGE_SIZE); per_cpu(svsm_caa, cpu) = caa; per_cpu(svsm_caa_pa, cpu) = __pa(caa); @@ -2362,7 +2490,7 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info) call.rcx = pa; ret = svsm_perform_call_protocol(&call); if (ret) - panic("Can't remap the SVSM CA, ret=%d, rax_out=0x%llx\n", ret, call.rax_out); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL); RIP_REL_REF(boot_svsm_caa) = (struct svsm_ca *)pa; RIP_REL_REF(boot_svsm_caa_pa) = pa; @@ -2508,8 +2636,8 @@ int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, } EXPORT_SYMBOL_GPL(snp_issue_svsm_attest_req); -int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_data *input, - struct snp_guest_request_ioctl *rio) +static int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_data *input, + struct snp_guest_request_ioctl *rio) { struct ghcb_state state; struct es_em_ctxt ctxt; @@ -2571,7 +2699,6 @@ e_restore_irq: return ret; } -EXPORT_SYMBOL_GPL(snp_issue_guest_request); static struct platform_device sev_guest_device = { .name = "sev-guest", @@ -2580,15 +2707,9 @@ static struct platform_device sev_guest_device = { static int __init snp_init_platform_device(void) { - struct sev_guest_platform_data data; - if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) return -ENODEV; - data.secrets_gpa = secrets_pa; - if (platform_device_add_data(&sev_guest_device, &data, sizeof(data))) - return -ENODEV; - if (platform_device_register(&sev_guest_device)) return -ENODEV; @@ -2667,3 +2788,590 @@ static int __init sev_sysfs_init(void) } arch_initcall(sev_sysfs_init); #endif // CONFIG_SYSFS + +static void free_shared_pages(void *buf, size_t sz) +{ + unsigned int npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; + int ret; + + if (!buf) + return; + + ret = set_memory_encrypted((unsigned long)buf, npages); + if (ret) { + WARN_ONCE(ret, "failed to restore encryption mask (leak it)\n"); + return; + } + + __free_pages(virt_to_page(buf), get_order(sz)); +} + +static void *alloc_shared_pages(size_t sz) +{ + unsigned int npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; + struct page *page; + int ret; + + page = alloc_pages(GFP_KERNEL_ACCOUNT, get_order(sz)); + if (!page) + return NULL; + + ret = set_memory_decrypted((unsigned long)page_address(page), npages); + if (ret) { + pr_err("failed to mark page shared, ret=%d\n", ret); + __free_pages(page, get_order(sz)); + return NULL; + } + + return page_address(page); +} + +static u8 *get_vmpck(int id, struct snp_secrets_page *secrets, u32 **seqno) +{ + u8 *key = NULL; + + switch (id) { + case 0: + *seqno = &secrets->os_area.msg_seqno_0; + key = secrets->vmpck0; + break; + case 1: + *seqno = &secrets->os_area.msg_seqno_1; + key = secrets->vmpck1; + break; + case 2: + *seqno = &secrets->os_area.msg_seqno_2; + key = secrets->vmpck2; + break; + case 3: + *seqno = &secrets->os_area.msg_seqno_3; + key = secrets->vmpck3; + break; + default: + break; + } + + return key; +} + +static struct aesgcm_ctx *snp_init_crypto(u8 *key, size_t keylen) +{ + struct aesgcm_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + if (aesgcm_expandkey(ctx, key, keylen, AUTHTAG_LEN)) { + pr_err("Crypto context initialization failed\n"); + kfree(ctx); + return NULL; + } + + return ctx; +} + +int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id) +{ + /* Adjust the default VMPCK key based on the executing VMPL level */ + if (vmpck_id == -1) + vmpck_id = snp_vmpl; + + mdesc->vmpck = get_vmpck(vmpck_id, mdesc->secrets, &mdesc->os_area_msg_seqno); + if (!mdesc->vmpck) { + pr_err("Invalid VMPCK%d communication key\n", vmpck_id); + return -EINVAL; + } + + /* Verify that VMPCK is not zero. */ + if (!memchr_inv(mdesc->vmpck, 0, VMPCK_KEY_LEN)) { + pr_err("Empty VMPCK%d communication key\n", vmpck_id); + return -EINVAL; + } + + mdesc->vmpck_id = vmpck_id; + + mdesc->ctx = snp_init_crypto(mdesc->vmpck, VMPCK_KEY_LEN); + if (!mdesc->ctx) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(snp_msg_init); + +struct snp_msg_desc *snp_msg_alloc(void) +{ + struct snp_msg_desc *mdesc; + void __iomem *mem; + + BUILD_BUG_ON(sizeof(struct snp_guest_msg) > PAGE_SIZE); + + mdesc = kzalloc(sizeof(struct snp_msg_desc), GFP_KERNEL); + if (!mdesc) + return ERR_PTR(-ENOMEM); + + mem = ioremap_encrypted(secrets_pa, PAGE_SIZE); + if (!mem) + goto e_free_mdesc; + + mdesc->secrets = (__force struct snp_secrets_page *)mem; + + /* Allocate the shared page used for the request and response message. */ + mdesc->request = alloc_shared_pages(sizeof(struct snp_guest_msg)); + if (!mdesc->request) + goto e_unmap; + + mdesc->response = alloc_shared_pages(sizeof(struct snp_guest_msg)); + if (!mdesc->response) + goto e_free_request; + + return mdesc; + +e_free_request: + free_shared_pages(mdesc->request, sizeof(struct snp_guest_msg)); +e_unmap: + iounmap(mem); +e_free_mdesc: + kfree(mdesc); + + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL_GPL(snp_msg_alloc); + +void snp_msg_free(struct snp_msg_desc *mdesc) +{ + if (!mdesc) + return; + + kfree(mdesc->ctx); + free_shared_pages(mdesc->response, sizeof(struct snp_guest_msg)); + free_shared_pages(mdesc->request, sizeof(struct snp_guest_msg)); + iounmap((__force void __iomem *)mdesc->secrets); + + memset(mdesc, 0, sizeof(*mdesc)); + kfree(mdesc); +} +EXPORT_SYMBOL_GPL(snp_msg_free); + +/* Mutex to serialize the shared buffer access and command handling. */ +static DEFINE_MUTEX(snp_cmd_mutex); + +/* + * If an error is received from the host or AMD Secure Processor (ASP) there + * are two options. Either retry the exact same encrypted request or discontinue + * using the VMPCK. + * + * This is because in the current encryption scheme GHCB v2 uses AES-GCM to + * encrypt the requests. The IV for this scheme is the sequence number. GCM + * cannot tolerate IV reuse. + * + * The ASP FW v1.51 only increments the sequence numbers on a successful + * guest<->ASP back and forth and only accepts messages at its exact sequence + * number. + * + * So if the sequence number were to be reused the encryption scheme is + * vulnerable. If the sequence number were incremented for a fresh IV the ASP + * will reject the request. + */ +static void snp_disable_vmpck(struct snp_msg_desc *mdesc) +{ + pr_alert("Disabling VMPCK%d communication key to prevent IV reuse.\n", + mdesc->vmpck_id); + memzero_explicit(mdesc->vmpck, VMPCK_KEY_LEN); + mdesc->vmpck = NULL; +} + +static inline u64 __snp_get_msg_seqno(struct snp_msg_desc *mdesc) +{ + u64 count; + + lockdep_assert_held(&snp_cmd_mutex); + + /* Read the current message sequence counter from secrets pages */ + count = *mdesc->os_area_msg_seqno; + + return count + 1; +} + +/* Return a non-zero on success */ +static u64 snp_get_msg_seqno(struct snp_msg_desc *mdesc) +{ + u64 count = __snp_get_msg_seqno(mdesc); + + /* + * The message sequence counter for the SNP guest request is a 64-bit + * value but the version 2 of GHCB specification defines a 32-bit storage + * for it. If the counter exceeds the 32-bit value then return zero. + * The caller should check the return value, but if the caller happens to + * not check the value and use it, then the firmware treats zero as an + * invalid number and will fail the message request. + */ + if (count >= UINT_MAX) { + pr_err("request message sequence counter overflow\n"); + return 0; + } + + return count; +} + +static void snp_inc_msg_seqno(struct snp_msg_desc *mdesc) +{ + /* + * The counter is also incremented by the PSP, so increment it by 2 + * and save in secrets page. + */ + *mdesc->os_area_msg_seqno += 2; +} + +static int verify_and_dec_payload(struct snp_msg_desc *mdesc, struct snp_guest_req *req) +{ + struct snp_guest_msg *resp_msg = &mdesc->secret_response; + struct snp_guest_msg *req_msg = &mdesc->secret_request; + struct snp_guest_msg_hdr *req_msg_hdr = &req_msg->hdr; + struct snp_guest_msg_hdr *resp_msg_hdr = &resp_msg->hdr; + struct aesgcm_ctx *ctx = mdesc->ctx; + u8 iv[GCM_AES_IV_SIZE] = {}; + + pr_debug("response [seqno %lld type %d version %d sz %d]\n", + resp_msg_hdr->msg_seqno, resp_msg_hdr->msg_type, resp_msg_hdr->msg_version, + resp_msg_hdr->msg_sz); + + /* Copy response from shared memory to encrypted memory. */ + memcpy(resp_msg, mdesc->response, sizeof(*resp_msg)); + + /* Verify that the sequence counter is incremented by 1 */ + if (unlikely(resp_msg_hdr->msg_seqno != (req_msg_hdr->msg_seqno + 1))) + return -EBADMSG; + + /* Verify response message type and version number. */ + if (resp_msg_hdr->msg_type != (req_msg_hdr->msg_type + 1) || + resp_msg_hdr->msg_version != req_msg_hdr->msg_version) + return -EBADMSG; + + /* + * If the message size is greater than our buffer length then return + * an error. + */ + if (unlikely((resp_msg_hdr->msg_sz + ctx->authsize) > req->resp_sz)) + return -EBADMSG; + + /* Decrypt the payload */ + memcpy(iv, &resp_msg_hdr->msg_seqno, min(sizeof(iv), sizeof(resp_msg_hdr->msg_seqno))); + if (!aesgcm_decrypt(ctx, req->resp_buf, resp_msg->payload, resp_msg_hdr->msg_sz, + &resp_msg_hdr->algo, AAD_LEN, iv, resp_msg_hdr->authtag)) + return -EBADMSG; + + return 0; +} + +static int enc_payload(struct snp_msg_desc *mdesc, u64 seqno, struct snp_guest_req *req) +{ + struct snp_guest_msg *msg = &mdesc->secret_request; + struct snp_guest_msg_hdr *hdr = &msg->hdr; + struct aesgcm_ctx *ctx = mdesc->ctx; + u8 iv[GCM_AES_IV_SIZE] = {}; + + memset(msg, 0, sizeof(*msg)); + + hdr->algo = SNP_AEAD_AES_256_GCM; + hdr->hdr_version = MSG_HDR_VER; + hdr->hdr_sz = sizeof(*hdr); + hdr->msg_type = req->msg_type; + hdr->msg_version = req->msg_version; + hdr->msg_seqno = seqno; + hdr->msg_vmpck = req->vmpck_id; + hdr->msg_sz = req->req_sz; + + /* Verify the sequence number is non-zero */ + if (!hdr->msg_seqno) + return -ENOSR; + + pr_debug("request [seqno %lld type %d version %d sz %d]\n", + hdr->msg_seqno, hdr->msg_type, hdr->msg_version, hdr->msg_sz); + + if (WARN_ON((req->req_sz + ctx->authsize) > sizeof(msg->payload))) + return -EBADMSG; + + memcpy(iv, &hdr->msg_seqno, min(sizeof(iv), sizeof(hdr->msg_seqno))); + aesgcm_encrypt(ctx, msg->payload, req->req_buf, req->req_sz, &hdr->algo, + AAD_LEN, iv, hdr->authtag); + + return 0; +} + +static int __handle_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req, + struct snp_guest_request_ioctl *rio) +{ + unsigned long req_start = jiffies; + unsigned int override_npages = 0; + u64 override_err = 0; + int rc; + +retry_request: + /* + * Call firmware to process the request. In this function the encrypted + * message enters shared memory with the host. So after this call the + * sequence number must be incremented or the VMPCK must be deleted to + * prevent reuse of the IV. + */ + rc = snp_issue_guest_request(req, &req->input, rio); + switch (rc) { + case -ENOSPC: + /* + * If the extended guest request fails due to having too + * small of a certificate data buffer, retry the same + * guest request without the extended data request in + * order to increment the sequence number and thus avoid + * IV reuse. + */ + override_npages = req->input.data_npages; + req->exit_code = SVM_VMGEXIT_GUEST_REQUEST; + + /* + * Override the error to inform callers the given extended + * request buffer size was too small and give the caller the + * required buffer size. + */ + override_err = SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN); + + /* + * If this call to the firmware succeeds, the sequence number can + * be incremented allowing for continued use of the VMPCK. If + * there is an error reflected in the return value, this value + * is checked further down and the result will be the deletion + * of the VMPCK and the error code being propagated back to the + * user as an ioctl() return code. + */ + goto retry_request; + + /* + * The host may return SNP_GUEST_VMM_ERR_BUSY if the request has been + * throttled. Retry in the driver to avoid returning and reusing the + * message sequence number on a different message. + */ + case -EAGAIN: + if (jiffies - req_start > SNP_REQ_MAX_RETRY_DURATION) { + rc = -ETIMEDOUT; + break; + } + schedule_timeout_killable(SNP_REQ_RETRY_DELAY); + goto retry_request; + } + + /* + * Increment the message sequence number. There is no harm in doing + * this now because decryption uses the value stored in the response + * structure and any failure will wipe the VMPCK, preventing further + * use anyway. + */ + snp_inc_msg_seqno(mdesc); + + if (override_err) { + rio->exitinfo2 = override_err; + + /* + * If an extended guest request was issued and the supplied certificate + * buffer was not large enough, a standard guest request was issued to + * prevent IV reuse. If the standard request was successful, return -EIO + * back to the caller as would have originally been returned. + */ + if (!rc && override_err == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN)) + rc = -EIO; + } + + if (override_npages) + req->input.data_npages = override_npages; + + return rc; +} + +int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req, + struct snp_guest_request_ioctl *rio) +{ + u64 seqno; + int rc; + + guard(mutex)(&snp_cmd_mutex); + + /* Check if the VMPCK is not empty */ + if (!mdesc->vmpck || !memchr_inv(mdesc->vmpck, 0, VMPCK_KEY_LEN)) { + pr_err_ratelimited("VMPCK is disabled\n"); + return -ENOTTY; + } + + /* Get message sequence and verify that its a non-zero */ + seqno = snp_get_msg_seqno(mdesc); + if (!seqno) + return -EIO; + + /* Clear shared memory's response for the host to populate. */ + memset(mdesc->response, 0, sizeof(struct snp_guest_msg)); + + /* Encrypt the userspace provided payload in mdesc->secret_request. */ + rc = enc_payload(mdesc, seqno, req); + if (rc) + return rc; + + /* + * Write the fully encrypted request to the shared unencrypted + * request page. + */ + memcpy(mdesc->request, &mdesc->secret_request, sizeof(mdesc->secret_request)); + + /* Initialize the input address for guest request */ + req->input.req_gpa = __pa(mdesc->request); + req->input.resp_gpa = __pa(mdesc->response); + req->input.data_gpa = req->certs_data ? __pa(req->certs_data) : 0; + + rc = __handle_guest_request(mdesc, req, rio); + if (rc) { + if (rc == -EIO && + rio->exitinfo2 == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN)) + return rc; + + pr_alert("Detected error from ASP request. rc: %d, exitinfo2: 0x%llx\n", + rc, rio->exitinfo2); + + snp_disable_vmpck(mdesc); + return rc; + } + + rc = verify_and_dec_payload(mdesc, req); + if (rc) { + pr_alert("Detected unexpected decode failure from ASP. rc: %d\n", rc); + snp_disable_vmpck(mdesc); + return rc; + } + + return 0; +} +EXPORT_SYMBOL_GPL(snp_send_guest_request); + +static int __init snp_get_tsc_info(void) +{ + struct snp_guest_request_ioctl *rio; + struct snp_tsc_info_resp *tsc_resp; + struct snp_tsc_info_req *tsc_req; + struct snp_msg_desc *mdesc; + struct snp_guest_req *req; + int rc = -ENOMEM; + + tsc_req = kzalloc(sizeof(*tsc_req), GFP_KERNEL); + if (!tsc_req) + return rc; + + /* + * The intermediate response buffer is used while decrypting the + * response payload. Make sure that it has enough space to cover + * the authtag. + */ + tsc_resp = kzalloc(sizeof(*tsc_resp) + AUTHTAG_LEN, GFP_KERNEL); + if (!tsc_resp) + goto e_free_tsc_req; + + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + goto e_free_tsc_resp; + + rio = kzalloc(sizeof(*rio), GFP_KERNEL); + if (!rio) + goto e_free_req; + + mdesc = snp_msg_alloc(); + if (IS_ERR_OR_NULL(mdesc)) + goto e_free_rio; + + rc = snp_msg_init(mdesc, snp_vmpl); + if (rc) + goto e_free_mdesc; + + req->msg_version = MSG_HDR_VER; + req->msg_type = SNP_MSG_TSC_INFO_REQ; + req->vmpck_id = snp_vmpl; + req->req_buf = tsc_req; + req->req_sz = sizeof(*tsc_req); + req->resp_buf = (void *)tsc_resp; + req->resp_sz = sizeof(*tsc_resp) + AUTHTAG_LEN; + req->exit_code = SVM_VMGEXIT_GUEST_REQUEST; + + rc = snp_send_guest_request(mdesc, req, rio); + if (rc) + goto e_request; + + pr_debug("%s: response status 0x%x scale 0x%llx offset 0x%llx factor 0x%x\n", + __func__, tsc_resp->status, tsc_resp->tsc_scale, tsc_resp->tsc_offset, + tsc_resp->tsc_factor); + + if (!tsc_resp->status) { + snp_tsc_scale = tsc_resp->tsc_scale; + snp_tsc_offset = tsc_resp->tsc_offset; + } else { + pr_err("Failed to get TSC info, response status 0x%x\n", tsc_resp->status); + rc = -EIO; + } + +e_request: + /* The response buffer contains sensitive data, explicitly clear it. */ + memzero_explicit(tsc_resp, sizeof(*tsc_resp) + AUTHTAG_LEN); +e_free_mdesc: + snp_msg_free(mdesc); +e_free_rio: + kfree(rio); +e_free_req: + kfree(req); + e_free_tsc_resp: + kfree(tsc_resp); +e_free_tsc_req: + kfree(tsc_req); + + return rc; +} + +void __init snp_secure_tsc_prepare(void) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC)) + return; + + if (snp_get_tsc_info()) { + pr_alert("Unable to retrieve Secure TSC info from ASP\n"); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SECURE_TSC); + } + + pr_debug("SecureTSC enabled"); +} + +static unsigned long securetsc_get_tsc_khz(void) +{ + return snp_tsc_freq_khz; +} + +void __init snp_secure_tsc_init(void) +{ + struct snp_secrets_page *secrets; + unsigned long tsc_freq_mhz; + void *mem; + + if (!cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC)) + return; + + mem = early_memremap_encrypted(secrets_pa, PAGE_SIZE); + if (!mem) { + pr_err("Unable to get TSC_FACTOR: failed to map the SNP secrets page.\n"); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SECURE_TSC); + } + + secrets = (__force struct snp_secrets_page *)mem; + + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + rdmsrl(MSR_AMD64_GUEST_TSC_FREQ, tsc_freq_mhz); + + /* Extract the GUEST TSC MHZ from BIT[17:0], rest is reserved space */ + tsc_freq_mhz &= GENMASK_ULL(17, 0); + + snp_tsc_freq_khz = SNP_SCALE_TSC_FREQ(tsc_freq_mhz * 1000, secrets->tsc_factor); + + x86_platform.calibrate_cpu = securetsc_get_tsc_khz; + x86_platform.calibrate_tsc = securetsc_get_tsc_khz; + + early_memunmap(mem, PAGE_SIZE); +} diff --git a/arch/x86/coco/sev/shared.c b/arch/x86/coco/sev/shared.c index 71de53194089..383afc41a718 100644 --- a/arch/x86/coco/sev/shared.c +++ b/arch/x86/coco/sev/shared.c @@ -498,7 +498,7 @@ static const struct snp_cpuid_table *snp_cpuid_get_table(void) * * Return: XSAVE area size on success, 0 otherwise. */ -static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) +static u32 __head snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); u64 xfeatures_found = 0; @@ -576,8 +576,9 @@ static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpui sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); } -static int snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - struct cpuid_leaf *leaf) +static int __head +snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + struct cpuid_leaf *leaf) { struct cpuid_leaf leaf_hv = *leaf; @@ -1140,6 +1141,16 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, bool rdtscp = (exit_code == SVM_EXIT_RDTSCP); enum es_result ret; + /* + * The hypervisor should not be intercepting RDTSC/RDTSCP when Secure + * TSC is enabled. A #VC exception will be generated if the RDTSC/RDTSCP + * instructions are being intercepted. If this should occur and Secure + * TSC is enabled, guest execution should be terminated as the guest + * cannot rely on the TSC value provided by the hypervisor. + */ + if (sev_status & MSR_AMD64_SNP_SECURE_TSC) + return ES_VMM_ERROR; + ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0); if (ret != ES_OK) return ret; @@ -1243,7 +1254,25 @@ static void svsm_pval_terminate(struct svsm_pvalidate_call *pc, int ret, u64 svs __pval_terminate(pfn, action, page_size, ret, svsm_ret); } -static void svsm_pval_4k_page(unsigned long paddr, bool validate) +static inline void sev_evict_cache(void *va, int npages) +{ + volatile u8 val __always_unused; + u8 *bytes = va; + int page_idx; + + /* + * For SEV guests, a read from the first/last cache-lines of a 4K page + * using the guest key is sufficient to cause a flush of all cache-lines + * associated with that 4K page without incurring all the overhead of a + * full CLFLUSH sequence. + */ + for (page_idx = 0; page_idx < npages; page_idx++) { + val = bytes[page_idx * PAGE_SIZE]; + val = bytes[page_idx * PAGE_SIZE + PAGE_SIZE - 1]; + } +} + +static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) { struct svsm_pvalidate_call *pc; struct svsm_call call = {}; @@ -1275,12 +1304,13 @@ static void svsm_pval_4k_page(unsigned long paddr, bool validate) ret = svsm_perform_call_protocol(&call); if (ret) - svsm_pval_terminate(pc, ret, call.rax_out); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); native_local_irq_restore(flags); } -static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, bool validate) +static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, + bool validate) { int ret; @@ -1293,8 +1323,15 @@ static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, bool val } else { ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); if (ret) - __pval_terminate(PHYS_PFN(paddr), validate, RMP_PG_SIZE_4K, ret, 0); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); } + + /* + * If validating memory (making it private) and affected by the + * cache-coherency vulnerability, perform the cache eviction mitigation. + */ + if (validate && !has_cpuflag(X86_FEATURE_COHERENCY_SFW_NO)) + sev_evict_cache((void *)vaddr, 1); } static void pval_pages(struct snp_psc_desc *desc) @@ -1479,10 +1516,31 @@ static void svsm_pval_pages(struct snp_psc_desc *desc) static void pvalidate_pages(struct snp_psc_desc *desc) { + struct psc_entry *e; + unsigned int i; + if (snp_vmpl) svsm_pval_pages(desc); else pval_pages(desc); + + /* + * If not affected by the cache-coherency vulnerability there is no need + * to perform the cache eviction mitigation. + */ + if (cpu_feature_enabled(X86_FEATURE_COHERENCY_SFW_NO)) + return; + + for (i = 0; i <= desc->hdr.end_entry; i++) { + e = &desc->entries[i]; + + /* + * If validating memory (making it private) perform the cache + * eviction mitigation. + */ + if (e->operation == SNP_PAGE_STATE_PRIVATE) + sev_evict_cache(pfn_to_kaddr(e->gfn), e->pagesize ? 512 : 1); + } } static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc) diff --git a/arch/x86/coco/tdx/Makefile b/arch/x86/coco/tdx/Makefile index 2c7dcbf1458b..b3c47d3700e2 100644 --- a/arch/x86/coco/tdx/Makefile +++ b/arch/x86/coco/tdx/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += tdx.o tdx-shared.o tdcall.o +obj-y += debug.o tdcall.o tdx.o tdx-shared.o diff --git a/arch/x86/coco/tdx/debug.c b/arch/x86/coco/tdx/debug.c new file mode 100644 index 000000000000..cef847c8bb67 --- /dev/null +++ b/arch/x86/coco/tdx/debug.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 + +#undef pr_fmt +#define pr_fmt(fmt) "tdx: " fmt + +#include <linux/array_size.h> +#include <linux/printk.h> +#include <asm/tdx.h> + +#define DEF_TDX_ATTR_NAME(_name) [TDX_ATTR_##_name##_BIT] = __stringify(_name) + +static __initdata const char *tdx_attributes[] = { + DEF_TDX_ATTR_NAME(DEBUG), + DEF_TDX_ATTR_NAME(HGS_PLUS_PROF), + DEF_TDX_ATTR_NAME(PERF_PROF), + DEF_TDX_ATTR_NAME(PMT_PROF), + DEF_TDX_ATTR_NAME(ICSSD), + DEF_TDX_ATTR_NAME(LASS), + DEF_TDX_ATTR_NAME(SEPT_VE_DISABLE), + DEF_TDX_ATTR_NAME(MIGRTABLE), + DEF_TDX_ATTR_NAME(PKS), + DEF_TDX_ATTR_NAME(KL), + DEF_TDX_ATTR_NAME(TPA), + DEF_TDX_ATTR_NAME(PERFMON), +}; + +#define DEF_TD_CTLS_NAME(_name) [TD_CTLS_##_name##_BIT] = __stringify(_name) + +static __initdata const char *tdcs_td_ctls[] = { + DEF_TD_CTLS_NAME(PENDING_VE_DISABLE), + DEF_TD_CTLS_NAME(ENUM_TOPOLOGY), + DEF_TD_CTLS_NAME(VIRT_CPUID2), + DEF_TD_CTLS_NAME(REDUCE_VE), + DEF_TD_CTLS_NAME(LOCK), +}; + +void __init tdx_dump_attributes(u64 td_attr) +{ + pr_info("Attributes:"); + + for (int i = 0; i < ARRAY_SIZE(tdx_attributes); i++) { + if (!tdx_attributes[i]) + continue; + if (td_attr & BIT(i)) + pr_cont(" %s", tdx_attributes[i]); + td_attr &= ~BIT(i); + } + + if (td_attr) + pr_cont(" unknown:%#llx", td_attr); + pr_cont("\n"); + +} + +void __init tdx_dump_td_ctls(u64 td_ctls) +{ + pr_info("TD_CTLS:"); + + for (int i = 0; i < ARRAY_SIZE(tdcs_td_ctls); i++) { + if (!tdcs_td_ctls[i]) + continue; + if (td_ctls & BIT(i)) + pr_cont(" %s", tdcs_td_ctls[i]); + td_ctls &= ~BIT(i); + } + if (td_ctls) + pr_cont(" unknown:%#llx", td_ctls); + pr_cont("\n"); +} diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 0d9b090b4880..edab6d6049be 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -14,6 +14,7 @@ #include <asm/ia32.h> #include <asm/insn.h> #include <asm/insn-eval.h> +#include <asm/paravirt_types.h> #include <asm/pgtable.h> #include <asm/set_memory.h> #include <asm/traps.h> @@ -32,9 +33,6 @@ #define VE_GET_PORT_NUM(e) ((e) >> 16) #define VE_IS_IO_STRING(e) ((e) & BIT(4)) -#define ATTR_DEBUG BIT(0) -#define ATTR_SEPT_VE_DISABLE BIT(28) - /* TDX Module call error codes */ #define TDCALL_RETURN_CODE(a) ((a) >> 32) #define TDCALL_INVALID_OPERAND 0xc0000100 @@ -170,11 +168,11 @@ static void __noreturn tdx_panic(const char *msg) /* Define register order according to the GHCI */ struct { u64 r14, r15, rbx, rdi, rsi, r8, r9, rdx; }; - char str[64]; + char bytes[64] __nonstring; } message; /* VMM assumes '\0' in byte 65, if the message took all 64 bytes */ - strtomem_pad(message.str, msg, '\0'); + strtomem_pad(message.bytes, msg, '\0'); args.r8 = message.r8; args.r9 = message.r9; @@ -200,14 +198,14 @@ static void __noreturn tdx_panic(const char *msg) * * TDX 1.0 does not allow the guest to disable SEPT #VE on its own. The VMM * controls if the guest will receive such #VE with TD attribute - * ATTR_SEPT_VE_DISABLE. + * TDX_ATTR_SEPT_VE_DISABLE. * * Newer TDX modules allow the guest to control if it wants to receive SEPT * violation #VEs. * * Check if the feature is available and disable SEPT #VE if possible. * - * If the TD is allowed to disable/enable SEPT #VEs, the ATTR_SEPT_VE_DISABLE + * If the TD is allowed to disable/enable SEPT #VEs, the TDX_ATTR_SEPT_VE_DISABLE * attribute is no longer reliable. It reflects the initial state of the * control for the TD, but it will not be updated if someone (e.g. bootloader) * changes it before the kernel starts. Kernel must check TDCS_TD_CTLS bit to @@ -216,14 +214,14 @@ static void __noreturn tdx_panic(const char *msg) static void disable_sept_ve(u64 td_attr) { const char *msg = "TD misconfiguration: SEPT #VE has to be disabled"; - bool debug = td_attr & ATTR_DEBUG; + bool debug = td_attr & TDX_ATTR_DEBUG; u64 config, controls; /* Is this TD allowed to disable SEPT #VE */ tdg_vm_rd(TDCS_CONFIG_FLAGS, &config); if (!(config & TDCS_CONFIG_FLEXIBLE_PENDING_VE)) { /* No SEPT #VE controls for the guest: check the attribute */ - if (td_attr & ATTR_SEPT_VE_DISABLE) + if (td_attr & TDX_ATTR_SEPT_VE_DISABLE) return; /* Relax SEPT_VE_DISABLE check for debug TD for backtraces */ @@ -274,6 +272,20 @@ static void enable_cpu_topology_enumeration(void) tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_ENUM_TOPOLOGY, TD_CTLS_ENUM_TOPOLOGY); } +static void reduce_unnecessary_ve(void) +{ + u64 err = tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_REDUCE_VE, TD_CTLS_REDUCE_VE); + + if (err == TDX_SUCCESS) + return; + + /* + * Enabling REDUCE_VE includes ENUM_TOPOLOGY. Only try to + * enable ENUM_TOPOLOGY if REDUCE_VE was not successful. + */ + enable_cpu_topology_enumeration(); +} + static void tdx_setup(u64 *cc_mask) { struct tdx_module_args args = {}; @@ -305,7 +317,8 @@ static void tdx_setup(u64 *cc_mask) tdg_vm_wr(TDCS_NOTIFY_ENABLES, 0, -1ULL); disable_sept_ve(td_attr); - enable_cpu_topology_enumeration(); + + reduce_unnecessary_ve(); } /* @@ -380,13 +393,21 @@ static int handle_halt(struct ve_info *ve) { const bool irq_disabled = irqs_disabled(); + /* + * HLT with IRQs enabled is unsafe, as an IRQ that is intended to be a + * wake event may be consumed before requesting HLT emulation, leaving + * the vCPU blocking indefinitely. + */ + if (WARN_ONCE(!irq_disabled, "HLT emulation with IRQs enabled")) + return -EIO; + if (__halt(irq_disabled)) return -EIO; return ve_instr_len(ve); } -void __cpuidle tdx_safe_halt(void) +void __cpuidle tdx_halt(void) { const bool irq_disabled = false; @@ -397,6 +418,16 @@ void __cpuidle tdx_safe_halt(void) WARN_ONCE(1, "HLT instruction emulation failed\n"); } +static void __cpuidle tdx_safe_halt(void) +{ + tdx_halt(); + /* + * "__cpuidle" section doesn't support instrumentation, so stick + * with raw_* variant that avoids tracing hooks. + */ + raw_local_irq_enable(); +} + static int read_msr(struct pt_regs *regs, struct ve_info *ve) { struct tdx_module_args args = { @@ -1025,6 +1056,20 @@ static void tdx_kexec_finish(void) } } +static __init void tdx_announce(void) +{ + struct tdx_module_args args = {}; + u64 controls; + + pr_info("Guest detected\n"); + + tdcall(TDG_VP_INFO, &args); + tdx_dump_attributes(args.rdx); + + tdg_vm_rd(TDCS_TD_CTLS, &controls); + tdx_dump_td_ctls(controls); +} + void __init tdx_early_init(void) { u64 cc_mask; @@ -1084,6 +1129,19 @@ void __init tdx_early_init(void) x86_platform.guest.enc_kexec_finish = tdx_kexec_finish; /* + * Avoid "sti;hlt" execution in TDX guests as HLT induces a #VE that + * will enable interrupts before HLT TDCALL invocation if executed + * in STI-shadow, possibly resulting in missed wakeup events. + * + * Modify all possible HLT execution paths to use TDX specific routines + * that directly execute TDCALL and toggle the interrupt state as + * needed after TDCALL completion. This also reduces HLT related #VEs + * in addition to having a reliable halt logic execution. + */ + pv_ops.irq.safe_halt = tdx_safe_halt; + pv_ops.irq.halt = tdx_halt; + + /* * TDX intercepts the RDMSR to read the X2APIC ID in the parallel * bringup low level code. That raises #VE which cannot be handled * there. @@ -1094,5 +1152,5 @@ void __init tdx_early_init(void) */ x86_cpuinit.parallel_bringup = false; - pr_info("Guest detected\n"); + tdx_announce(); } |