From 4b05f81504bfcaab51083d3a271fada75e6b8904 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Wed, 6 Apr 2022 02:29:21 +0300 Subject: x86/tdx: Detect TDX at early kernel decompression time The early decompression code does port I/O for its console output. But, handling the decompression-time port I/O demands a different approach from normal runtime because the IDT required to support #VE based port I/O emulation is not yet set up. Paravirtualizing I/O calls during the decompression step is acceptable because the decompression code doesn't have a lot of call sites to IO instruction. To support port I/O in decompression code, TDX must be detected before the decompression code might do port I/O. Detect whether the kernel runs in a TDX guest. Add an early_is_tdx_guest() interface to query the cached TDX guest status in the decompression code. TDX is detected with CPUID. Make cpuid_count() accessible outside boot/cpuflags.c. TDX detection in the main kernel is very similar. Move common bits into . The actual port I/O paravirtualization will come later in the series. Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20220405232939.73860-13-kirill.shutemov@linux.intel.com --- arch/x86/boot/compressed/Makefile | 1 + arch/x86/boot/compressed/misc.c | 8 ++++++++ arch/x86/boot/compressed/misc.h | 2 ++ arch/x86/boot/compressed/tdx.c | 16 ++++++++++++++++ arch/x86/boot/compressed/tdx.h | 13 +++++++++++++ 5 files changed, 40 insertions(+) create mode 100644 arch/x86/boot/compressed/tdx.c create mode 100644 arch/x86/boot/compressed/tdx.h (limited to 'arch/x86/boot/compressed') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 6115274fe10f..732f6b21ecbd 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -101,6 +101,7 @@ ifdef CONFIG_X86_64 endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o +vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o efi-obj-$(CONFIG_EFI_STUB) = $(objtree)/drivers/firmware/efi/libstub/lib.a diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 1cdcaf34ee36..e8142e977ddb 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -371,6 +371,14 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, lines = boot_params->screen_info.orig_video_lines; cols = boot_params->screen_info.orig_video_cols; + /* + * Detect TDX guest environment. + * + * It has to be done before console_init() in order to use + * paravirtualized port I/O operations if needed. + */ + early_tdx_detect(); + console_init(); /* diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 16ed360b6692..0d8e275a9d96 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -28,6 +28,8 @@ #include #include +#include "tdx.h" + #define BOOT_CTYPE_H #include diff --git a/arch/x86/boot/compressed/tdx.c b/arch/x86/boot/compressed/tdx.c new file mode 100644 index 000000000000..5f6d01a2f1f4 --- /dev/null +++ b/arch/x86/boot/compressed/tdx.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "../cpuflags.h" +#include "../string.h" + +#include + +void early_tdx_detect(void) +{ + u32 eax, sig[3]; + + cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]); + + if (memcmp(TDX_IDENT, sig, sizeof(sig))) + return; +} diff --git a/arch/x86/boot/compressed/tdx.h b/arch/x86/boot/compressed/tdx.h new file mode 100644 index 000000000000..9055482cd35c --- /dev/null +++ b/arch/x86/boot/compressed/tdx.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef BOOT_COMPRESSED_TDX_H +#define BOOT_COMPRESSED_TDX_H + +#include + +#ifdef CONFIG_INTEL_TDX_GUEST +void early_tdx_detect(void); +#else +static inline void early_tdx_detect(void) { }; +#endif + +#endif /* BOOT_COMPRESSED_TDX_H */ -- cgit v1.2.3 From 1e8f93e18379d05da9fd130eb7d50988a20f8b9a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:23 +0300 Subject: x86: Consolidate port I/O helpers There are two implementations of port I/O helpers: one in the kernel and one in the boot stub. Move the helpers required for both to and use the one implementation everywhere. Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-15-kirill.shutemov@linux.intel.com --- arch/x86/boot/boot.h | 35 +---------------------------------- arch/x86/boot/compressed/misc.h | 2 +- arch/x86/include/asm/io.h | 22 ++-------------------- arch/x86/include/asm/shared/io.h | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 38 insertions(+), 55 deletions(-) create mode 100644 arch/x86/include/asm/shared/io.h (limited to 'arch/x86/boot/compressed') diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 34c9dbb6a47d..22a474c5b3e8 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -23,6 +23,7 @@ #include #include #include +#include #include "bitops.h" #include "ctype.h" #include "cpuflags.h" @@ -35,40 +36,6 @@ extern struct boot_params boot_params; #define cpu_relax() asm volatile("rep; nop") -/* Basic port I/O */ -static inline void outb(u8 v, u16 port) -{ - asm volatile("outb %0,%1" : : "a" (v), "dN" (port)); -} -static inline u8 inb(u16 port) -{ - u8 v; - asm volatile("inb %1,%0" : "=a" (v) : "dN" (port)); - return v; -} - -static inline void outw(u16 v, u16 port) -{ - asm volatile("outw %0,%1" : : "a" (v), "dN" (port)); -} -static inline u16 inw(u16 port) -{ - u16 v; - asm volatile("inw %1,%0" : "=a" (v) : "dN" (port)); - return v; -} - -static inline void outl(u32 v, u16 port) -{ - asm volatile("outl %0,%1" : : "a" (v), "dN" (port)); -} -static inline u32 inl(u16 port) -{ - u32 v; - asm volatile("inl %1,%0" : "=a" (v) : "dN" (port)); - return v; -} - static inline void io_delay(void) { const u16 DELAY_PORT = 0x80; diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 0d8e275a9d96..8a253e85f990 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -22,11 +22,11 @@ #include #include #include -#include #include #include #include #include +#include #include "tdx.h" diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 638c1a2a82e0..a1eb218a49f8 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -44,6 +44,7 @@ #include #include #include +#include #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -258,20 +259,6 @@ static inline void slow_down_io(void) #endif #define BUILDIO(bwl, bw, type) \ -static inline void out##bwl(type value, u16 port) \ -{ \ - asm volatile("out" #bwl " %" #bw "0, %w1" \ - : : "a"(value), "Nd"(port)); \ -} \ - \ -static inline type in##bwl(u16 port) \ -{ \ - type value; \ - asm volatile("in" #bwl " %w1, %" #bw "0" \ - : "=a"(value) : "Nd"(port)); \ - return value; \ -} \ - \ static inline void out##bwl##_p(type value, u16 port) \ { \ out##bwl(value, port); \ @@ -320,10 +307,8 @@ static inline void ins##bwl(u16 port, void *addr, unsigned long count) \ BUILDIO(b, b, u8) BUILDIO(w, w, u16) BUILDIO(l, , u32) +#undef BUILDIO -#define inb inb -#define inw inw -#define inl inl #define inb_p inb_p #define inw_p inw_p #define inl_p inl_p @@ -331,9 +316,6 @@ BUILDIO(l, , u32) #define insw insw #define insl insl -#define outb outb -#define outw outw -#define outl outl #define outb_p outb_p #define outw_p outw_p #define outl_p outl_p diff --git a/arch/x86/include/asm/shared/io.h b/arch/x86/include/asm/shared/io.h new file mode 100644 index 000000000000..c0ef921c0586 --- /dev/null +++ b/arch/x86/include/asm/shared/io.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SHARED_IO_H +#define _ASM_X86_SHARED_IO_H + +#include + +#define BUILDIO(bwl, bw, type) \ +static inline void __out##bwl(type value, u16 port) \ +{ \ + asm volatile("out" #bwl " %" #bw "0, %w1" \ + : : "a"(value), "Nd"(port)); \ +} \ + \ +static inline type __in##bwl(u16 port) \ +{ \ + type value; \ + asm volatile("in" #bwl " %w1, %" #bw "0" \ + : "=a"(value) : "Nd"(port)); \ + return value; \ +} + +BUILDIO(b, b, u8) +BUILDIO(w, w, u16) +BUILDIO(l, , u32) +#undef BUILDIO + +#define inb __inb +#define inw __inw +#define inl __inl +#define outb __outb +#define outw __outw +#define outl __outl + +#endif -- cgit v1.2.3 From eb4ea1ae8f45e3249e7586f30be8977478202a37 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:24 +0300 Subject: x86/boot: Port I/O: Allow to hook up alternative helpers Port I/O instructions trigger #VE in the TDX environment. In response to the exception, kernel emulates these instructions using hypercalls. But during early boot, on the decompression stage, it is cumbersome to deal with #VE. It is cleaner to go to hypercalls directly, bypassing #VE handling. Add a way to hook up alternative port I/O helpers in the boot stub with a new pio_ops structure. For now, set the ops structure to just call the normal I/O operation functions. out*()/in*() macros redefined to use pio_ops callbacks. It eliminates need in changing call sites. io_delay() changed to use port I/O helper instead of inline assembly. Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20220405232939.73860-16-kirill.shutemov@linux.intel.com --- arch/x86/boot/boot.h | 4 ++-- arch/x86/boot/compressed/misc.c | 4 ++++ arch/x86/boot/compressed/misc.h | 2 +- arch/x86/boot/io.h | 41 +++++++++++++++++++++++++++++++++++++++++ arch/x86/boot/main.c | 4 ++++ arch/x86/realmode/rm/wakemain.c | 4 ++++ 6 files changed, 56 insertions(+), 3 deletions(-) create mode 100644 arch/x86/boot/io.h (limited to 'arch/x86/boot/compressed') diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 22a474c5b3e8..b42b91606ca8 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -23,10 +23,10 @@ #include #include #include -#include #include "bitops.h" #include "ctype.h" #include "cpuflags.h" +#include "io.h" /* Useful macros */ #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) @@ -39,7 +39,7 @@ extern struct boot_params boot_params; static inline void io_delay(void) { const u16 DELAY_PORT = 0x80; - asm volatile("outb %%al,%0" : : "dN" (DELAY_PORT)); + outb(0, DELAY_PORT); } /* These functions are used to reference data in other segments. */ diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index e8142e977ddb..fa8969fad011 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -48,6 +48,8 @@ void *memmove(void *dest, const void *src, size_t n); */ struct boot_params *boot_params; +struct port_io_ops pio_ops; + memptr free_mem_ptr; memptr free_mem_end_ptr; @@ -371,6 +373,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, lines = boot_params->screen_info.orig_video_lines; cols = boot_params->screen_info.orig_video_cols; + init_default_io_ops(); + /* * Detect TDX guest environment. * diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 8a253e85f990..ea71cf3d64e1 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -26,7 +26,6 @@ #include #include #include -#include #include "tdx.h" @@ -35,6 +34,7 @@ #define BOOT_BOOT_H #include "../ctype.h" +#include "../io.h" #ifdef CONFIG_X86_64 #define memptr long diff --git a/arch/x86/boot/io.h b/arch/x86/boot/io.h new file mode 100644 index 000000000000..110880907f87 --- /dev/null +++ b/arch/x86/boot/io.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef BOOT_IO_H +#define BOOT_IO_H + +#include + +#undef inb +#undef inw +#undef inl +#undef outb +#undef outw +#undef outl + +struct port_io_ops { + u8 (*f_inb)(u16 port); + void (*f_outb)(u8 v, u16 port); + void (*f_outw)(u16 v, u16 port); +}; + +extern struct port_io_ops pio_ops; + +/* + * Use the normal I/O instructions by default. + * TDX guests override these to use hypercalls. + */ +static inline void init_default_io_ops(void) +{ + pio_ops.f_inb = __inb; + pio_ops.f_outb = __outb; + pio_ops.f_outw = __outw; +} + +/* + * Redirect port I/O operations via pio_ops callbacks. + * TDX guests override these callbacks with TDX-specific helpers. + */ +#define inb pio_ops.f_inb +#define outb pio_ops.f_outb +#define outw pio_ops.f_outw + +#endif diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index e3add857c2c9..1202d4f8a390 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -17,6 +17,8 @@ struct boot_params boot_params __attribute__((aligned(16))); +struct port_io_ops pio_ops; + char *HEAP = _end; char *heap_end = _end; /* Default end of heap = no heap */ @@ -133,6 +135,8 @@ static void init_heap(void) void main(void) { + init_default_io_ops(); + /* First, copy the boot header into the "zeropage" */ copy_boot_params(); diff --git a/arch/x86/realmode/rm/wakemain.c b/arch/x86/realmode/rm/wakemain.c index 1d6437e6d2ba..a6f4d8388ad8 100644 --- a/arch/x86/realmode/rm/wakemain.c +++ b/arch/x86/realmode/rm/wakemain.c @@ -62,8 +62,12 @@ static void send_morse(const char *pattern) } } +struct port_io_ops pio_ops; + void main(void) { + init_default_io_ops(); + /* Kill machine if structures are wrong */ if (wakeup_header.real_magic != 0x12345678) while (1) -- cgit v1.2.3 From 4c5b9aac6cade51aef64cc6ed67f2ad5acda9aed Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:25 +0300 Subject: x86/boot: Port I/O: Add decompression-time support for TDX Port I/O instructions trigger #VE in the TDX environment. In response to the exception, kernel emulates these instructions using hypercalls. But during early boot, on the decompression stage, it is cumbersome to deal with #VE. It is cleaner to go to hypercalls directly, bypassing #VE handling. Hook up TDX-specific port I/O helpers if booting in TDX environment. Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20220405232939.73860-17-kirill.shutemov@linux.intel.com --- arch/x86/boot/compressed/Makefile | 2 +- arch/x86/boot/compressed/tdcall.S | 3 ++ arch/x86/boot/compressed/tdx.c | 61 +++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/shared/tdx.h | 32 ++++++++++++++++++++ arch/x86/include/asm/tdx.h | 27 ----------------- 5 files changed, 97 insertions(+), 28 deletions(-) create mode 100644 arch/x86/boot/compressed/tdcall.S (limited to 'arch/x86/boot/compressed') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 732f6b21ecbd..8fd0e6ae2e1f 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -101,7 +101,7 @@ ifdef CONFIG_X86_64 endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o -vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o +vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o efi-obj-$(CONFIG_EFI_STUB) = $(objtree)/drivers/firmware/efi/libstub/lib.a diff --git a/arch/x86/boot/compressed/tdcall.S b/arch/x86/boot/compressed/tdcall.S new file mode 100644 index 000000000000..46d0495e0d3a --- /dev/null +++ b/arch/x86/boot/compressed/tdcall.S @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include "../../coco/tdx/tdcall.S" diff --git a/arch/x86/boot/compressed/tdx.c b/arch/x86/boot/compressed/tdx.c index 5f6d01a2f1f4..918a7606f53c 100644 --- a/arch/x86/boot/compressed/tdx.c +++ b/arch/x86/boot/compressed/tdx.c @@ -2,9 +2,65 @@ #include "../cpuflags.h" #include "../string.h" +#include "../io.h" +#include "error.h" + +#include +#include #include +/* Called from __tdx_hypercall() for unrecoverable failure */ +void __tdx_hypercall_failed(void) +{ + error("TDVMCALL failed. TDX module bug?"); +} + +static inline unsigned int tdx_io_in(int size, u16 port) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = EXIT_REASON_IO_INSTRUCTION, + .r12 = size, + .r13 = 0, + .r14 = port, + }; + + if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT)) + return UINT_MAX; + + return args.r11; +} + +static inline void tdx_io_out(int size, u16 port, u32 value) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = EXIT_REASON_IO_INSTRUCTION, + .r12 = size, + .r13 = 1, + .r14 = port, + .r15 = value, + }; + + __tdx_hypercall(&args, 0); +} + +static inline u8 tdx_inb(u16 port) +{ + return tdx_io_in(1, port); +} + +static inline void tdx_outb(u8 value, u16 port) +{ + tdx_io_out(1, port, value); +} + +static inline void tdx_outw(u16 value, u16 port) +{ + tdx_io_out(2, port, value); +} + void early_tdx_detect(void) { u32 eax, sig[3]; @@ -13,4 +69,9 @@ void early_tdx_detect(void) if (memcmp(TDX_IDENT, sig, sizeof(sig))) return; + + /* Use hypercalls instead of I/O instructions */ + pio_ops.f_inb = tdx_inb; + pio_ops.f_outb = tdx_outb; + pio_ops.f_outw = tdx_outw; } diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index 8209ba9ffe1a..e53f26228fbb 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -2,7 +2,39 @@ #ifndef _ASM_X86_SHARED_TDX_H #define _ASM_X86_SHARED_TDX_H +#include +#include + +#define TDX_HYPERCALL_STANDARD 0 + +#define TDX_HCALL_HAS_OUTPUT BIT(0) +#define TDX_HCALL_ISSUE_STI BIT(1) + #define TDX_CPUID_LEAF_ID 0x21 #define TDX_IDENT "IntelTDX " +#ifndef __ASSEMBLY__ + +/* + * Used in __tdx_hypercall() to pass down and get back registers' values of + * the TDCALL instruction when requesting services from the VMM. + * + * This is a software only structure and not part of the TDX module/VMM ABI. + */ +struct tdx_hypercall_args { + u64 r10; + u64 r11; + u64 r12; + u64 r13; + u64 r14; + u64 r15; +}; + +/* Used to request services from the VMM */ +u64 __tdx_hypercall(struct tdx_hypercall_args *args, unsigned long flags); + +/* Called from __tdx_hypercall() for unrecoverable failure */ +void __tdx_hypercall_failed(void); + +#endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_SHARED_TDX_H */ diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 81a1ec14e476..7944fd1ae07d 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -3,17 +3,11 @@ #ifndef _ASM_X86_TDX_H #define _ASM_X86_TDX_H -#include #include #include #include #include -#define TDX_HYPERCALL_STANDARD 0 - -#define TDX_HCALL_HAS_OUTPUT BIT(0) -#define TDX_HCALL_ISSUE_STI BIT(1) - /* * SW-defined error codes. * @@ -41,21 +35,6 @@ struct tdx_module_output { u64 r11; }; -/* - * Used in __tdx_hypercall() to pass down and get back registers' values of - * the TDCALL instruction when requesting services from the VMM. - * - * This is a software only structure and not part of the TDX module/VMM ABI. - */ -struct tdx_hypercall_args { - u64 r10; - u64 r11; - u64 r12; - u64 r13; - u64 r14; - u64 r15; -}; - /* * Used by the #VE exception handler to gather the #VE exception * info from the TDX module. This is a software only structure @@ -80,12 +59,6 @@ void __init tdx_early_init(void); u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, struct tdx_module_output *out); -/* Used to request services from the VMM */ -u64 __tdx_hypercall(struct tdx_hypercall_args *args, unsigned long flags); - -/* Called from __tdx_hypercall() for unrecoverable failure */ -void __tdx_hypercall_failed(void); - void tdx_get_ve_info(struct ve_info *ve); bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve); -- cgit v1.2.3 From 9cf30606405f37b68ee1c0f6846253313c077088 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:31 +0300 Subject: x86/boot: Set CR0.NE early and keep it set during the boot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TDX guest requires CR0.NE to be set. Clearing the bit triggers #GP(0). If CR0.NE is 0, the MS-DOS compatibility mode for handling floating-point exceptions is selected. In this mode, the software exception handler for floating-point exceptions is invoked externally using the processor’s FERR#, INTR, and IGNNE# pins. Using FERR# and IGNNE# to handle floating-point exception is deprecated. CR0.NE=0 also limits newer processors to operate with one logical processor active. Kernel uses CR0_STATE constant to initialize CR0. It has NE bit set. But during early boot kernel has more ad-hoc approach to setting bit in the register. During some of this ad-hoc manipulation, CR0.NE is cleared. This causes a #GP in TDX guests and makes it die in early boot. Make CR0 initialization consistent, deriving the initial value of CR0 from CR0_STATE. Since CR0_STATE always has CR0.NE=1, this ensures that CR0.NE is never 0 and avoids the #GP. Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-23-kirill.shutemov@linux.intel.com --- arch/x86/boot/compressed/head_64.S | 7 ++++--- arch/x86/realmode/rm/trampoline_64.S | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86/boot/compressed') diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index dea95301196b..7b5d36214352 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -289,7 +289,7 @@ SYM_FUNC_START(startup_32) pushl %eax /* Enter paged protected Mode, activating Long Mode */ - movl $(X86_CR0_PG | X86_CR0_PE), %eax /* Enable Paging and Protected mode */ + movl $CR0_STATE, %eax movl %eax, %cr0 /* Jump from 32bit compatibility mode into 64bit mode. */ @@ -661,8 +661,9 @@ SYM_CODE_START(trampoline_32bit_src) pushl $__KERNEL_CS pushl %eax - /* Enable paging again */ - movl $(X86_CR0_PG | X86_CR0_PE), %eax + /* Enable paging again. */ + movl %cr0, %eax + btsl $X86_CR0_PG_BIT, %eax movl %eax, %cr0 lret diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index ae112a91592f..d380f2d1fd23 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -70,7 +70,7 @@ SYM_CODE_START(trampoline_start) movw $__KERNEL_DS, %dx # Data segment descriptor # Enable protected mode - movl $X86_CR0_PE, %eax # protected mode (PE) bit + movl $(CR0_STATE & ~X86_CR0_PG), %eax movl %eax, %cr0 # into protected mode # flush prefetch and jump to startup_32 @@ -148,8 +148,8 @@ SYM_CODE_START(startup_32) movl $MSR_EFER, %ecx wrmsr - # Enable paging and in turn activate Long Mode - movl $(X86_CR0_PG | X86_CR0_WP | X86_CR0_PE), %eax + # Enable paging and in turn activate Long Mode. + movl $CR0_STATE, %eax movl %eax, %cr0 /* @@ -169,7 +169,7 @@ SYM_CODE_START(pa_trampoline_compat) movl $rm_stack_end, %esp movw $__KERNEL_DS, %dx - movl $X86_CR0_PE, %eax + movl $(CR0_STATE & ~X86_CR0_PG), %eax movl %eax, %cr0 ljmpl $__KERNEL32_CS, $pa_startup_32 SYM_CODE_END(pa_trampoline_compat) -- cgit v1.2.3 From 77a512e35db7609a8c909e2006b2ea82f2b1616f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 6 Apr 2022 02:29:32 +0300 Subject: x86/boot: Avoid #VE during boot for TDX platforms There are a few MSRs and control register bits that the kernel normally needs to modify during boot. But, TDX disallows modification of these registers to help provide consistent security guarantees. Fortunately, TDX ensures that these are all in the correct state before the kernel loads, which means the kernel does not need to modify them. The conditions to avoid are: * Any writes to the EFER MSR * Clearing CR4.MCE This theoretically makes the guest boot more fragile. If, for instance, EFER was set up incorrectly and a WRMSR was performed, it will trigger early exception panic or a triple fault, if it's before early exceptions are set up. However, this is likely to trip up the guest BIOS long before control reaches the kernel. In any case, these kinds of problems are unlikely to occur in production environments, and developers have good debug tools to fix them quickly. Change the common boot code to work on TDX and non-TDX systems. This should have no functional effect on non-TDX systems. Signed-off-by: Sean Christopherson Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Dan Williams Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-24-kirill.shutemov@linux.intel.com --- arch/x86/Kconfig | 1 + arch/x86/boot/compressed/head_64.S | 20 ++++++++++++++++++-- arch/x86/boot/compressed/pgtable.h | 2 +- arch/x86/kernel/head_64.S | 28 ++++++++++++++++++++++++++-- arch/x86/realmode/rm/trampoline_64.S | 13 ++++++++++++- 5 files changed, 58 insertions(+), 6 deletions(-) (limited to 'arch/x86/boot/compressed') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index aea4cc404c31..a18185525708 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -884,6 +884,7 @@ config INTEL_TDX_GUEST depends on X86_X2APIC select ARCH_HAS_CC_PLATFORM select DYNAMIC_PHYSICAL_MASK + select X86_MCE help Support running as a guest under Intel TDX. Without this support, the guest kernel can not boot or run under TDX. diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 7b5d36214352..f63ec254e457 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -642,12 +642,28 @@ SYM_CODE_START(trampoline_32bit_src) movl $MSR_EFER, %ecx rdmsr btsl $_EFER_LME, %eax + /* Avoid writing EFER if no change was made (for TDX guest) */ + jc 1f wrmsr - popl %edx +1: popl %edx popl %ecx +#ifdef CONFIG_X86_MCE + /* + * Preserve CR4.MCE if the kernel will enable #MC support. + * Clearing MCE may fault in some environments (that also force #MC + * support). Any machine check that occurs before #MC support is fully + * configured will crash the system regardless of the CR4.MCE value set + * here. + */ + movl %cr4, %eax + andl $X86_CR4_MCE, %eax +#else + movl $0, %eax +#endif + /* Enable PAE and LA57 (if required) paging modes */ - movl $X86_CR4_PAE, %eax + orl $X86_CR4_PAE, %eax testl %edx, %edx jz 1f orl $X86_CR4_LA57, %eax diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h index 6ff7e81b5628..cc9b2529a086 100644 --- a/arch/x86/boot/compressed/pgtable.h +++ b/arch/x86/boot/compressed/pgtable.h @@ -6,7 +6,7 @@ #define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0 #define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE -#define TRAMPOLINE_32BIT_CODE_SIZE 0x70 +#define TRAMPOLINE_32BIT_CODE_SIZE 0x80 #define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b8e3019547a5..39a6ff7c0060 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -142,8 +142,22 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) addq $(init_top_pgt - __START_KERNEL_map), %rax 1: +#ifdef CONFIG_X86_MCE + /* + * Preserve CR4.MCE if the kernel will enable #MC support. + * Clearing MCE may fault in some environments (that also force #MC + * support). Any machine check that occurs before #MC support is fully + * configured will crash the system regardless of the CR4.MCE value set + * here. + */ + movq %cr4, %rcx + andl $X86_CR4_MCE, %ecx +#else + movl $0, %ecx +#endif + /* Enable PAE mode, PGE and LA57 */ - movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx + orl $(X86_CR4_PAE | X86_CR4_PGE), %ecx #ifdef CONFIG_X86_5LEVEL testl $1, __pgtable_l5_enabled(%rip) jz 1f @@ -249,13 +263,23 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) /* Setup EFER (Extended Feature Enable Register) */ movl $MSR_EFER, %ecx rdmsr + /* + * Preserve current value of EFER for comparison and to skip + * EFER writes if no change was made (for TDX guest) + */ + movl %eax, %edx btsl $_EFER_SCE, %eax /* Enable System Call */ btl $20,%edi /* No Execute supported? */ jnc 1f btsl $_EFER_NX, %eax btsq $_PAGE_BIT_NX,early_pmd_flags(%rip) -1: wrmsr /* Make changes effective */ + /* Avoid writing EFER if no change was made (for TDX guest) */ +1: cmpl %edx, %eax + je 1f + xor %edx, %edx + wrmsr /* Make changes effective */ +1: /* Setup cr0 */ movl $CR0_STATE, %eax /* Make changes effective */ diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index d380f2d1fd23..e38d61d6562e 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -143,11 +143,22 @@ SYM_CODE_START(startup_32) movl %eax, %cr3 # Set up EFER + movl $MSR_EFER, %ecx + rdmsr + /* + * Skip writing to EFER if the register already has desired + * value (to avoid #VE for the TDX guest). + */ + cmp pa_tr_efer, %eax + jne .Lwrite_efer + cmp pa_tr_efer + 4, %edx + je .Ldone_efer +.Lwrite_efer: movl pa_tr_efer, %eax movl pa_tr_efer + 4, %edx - movl $MSR_EFER, %ecx wrmsr +.Ldone_efer: # Enable paging and in turn activate Long Mode. movl $CR0_STATE, %eax movl %eax, %cr0 -- cgit v1.2.3