diff options
Diffstat (limited to 'arch/x86/boot/compressed')
-rw-r--r-- | arch/x86/boot/compressed/Makefile | 54 | ||||
-rw-r--r-- | arch/x86/boot/compressed/cpuflags.c | 4 | ||||
-rw-r--r-- | arch/x86/boot/compressed/head_32.S | 99 | ||||
-rw-r--r-- | arch/x86/boot/compressed/head_64.S | 198 | ||||
-rw-r--r-- | arch/x86/boot/compressed/ident_map_64.c | 349 | ||||
-rw-r--r-- | arch/x86/boot/compressed/idt_64.c | 54 | ||||
-rw-r--r-- | arch/x86/boot/compressed/idt_handlers_64.S | 77 | ||||
-rw-r--r-- | arch/x86/boot/compressed/kaslr.c | 266 | ||||
-rw-r--r-- | arch/x86/boot/compressed/kaslr_64.c | 153 | ||||
-rw-r--r-- | arch/x86/boot/compressed/misc.c | 7 | ||||
-rw-r--r-- | arch/x86/boot/compressed/misc.h | 54 | ||||
-rw-r--r-- | arch/x86/boot/compressed/mkpiggy.c | 6 | ||||
-rw-r--r-- | arch/x86/boot/compressed/pgtable_64.c | 9 | ||||
-rw-r--r-- | arch/x86/boot/compressed/sev-es.c | 214 | ||||
-rw-r--r-- | arch/x86/boot/compressed/vmlinux.lds.S | 50 |
15 files changed, 1044 insertions, 550 deletions
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 3962f592633d..ee249088cbfe 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -29,10 +29,10 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 vmlinux.bin.zst KBUILD_CFLAGS := -m$(BITS) -O2 -KBUILD_CFLAGS += -fno-strict-aliasing $(call cc-option, -fPIE, -fPIC) +KBUILD_CFLAGS += -fno-strict-aliasing -fPIE KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING cflags-$(CONFIG_X86_32) := -march=i386 -cflags-$(CONFIG_X86_64) := -mcmodel=small +cflags-$(CONFIG_X86_64) := -mcmodel=small -mno-red-zone KBUILD_CFLAGS += $(cflags-y) KBUILD_CFLAGS += -mno-mmx -mno-sse KBUILD_CFLAGS += -ffreestanding @@ -43,24 +43,26 @@ KBUILD_CFLAGS += -Wno-pointer-sign KBUILD_CFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=) KBUILD_CFLAGS += -fno-asynchronous-unwind-tables KBUILD_CFLAGS += -D__DISABLE_EXPORTS +# Disable relocation relaxation in case the link is not PIE. +KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no) +KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h + +# sev-es.c indirectly inludes inat-table.h which is generated during +# compilation and stored in $(objtree). Add the directory to the includes so +# that the compiler finds it even with out-of-tree builds (make O=/some/path). +CFLAGS_sev-es.o += -I$(objtree)/arch/x86/lib/ KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n UBSAN_SANITIZE :=n KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE) +KBUILD_LDFLAGS += $(call ld-option,--no-ld-generated-unwind-info) # Compressed kernel should be built as PIE since it may be loaded at any # address by the bootloader. -ifeq ($(CONFIG_X86_32),y) -KBUILD_LDFLAGS += $(call ld-option, -pie) $(call ld-option, --no-dynamic-linker) -else -# To build 64-bit compressed kernel as PIE, we disable relocation -# overflow check to avoid relocation overflow error with a new linker -# command-line option, -z noreloc-overflow. -KBUILD_LDFLAGS += $(shell $(LD) --help 2>&1 | grep -q "\-z noreloc-overflow" \ - && echo "-z noreloc-overflow -pie --no-dynamic-linker") -endif -LDFLAGS_vmlinux := -T +LDFLAGS_vmlinux := -pie $(call ld-option, --no-dynamic-linker) +LDFLAGS_vmlinux += $(call ld-option, --orphan-handling=warn) +LDFLAGS_vmlinux += -T hostprogs := mkpiggy HOST_EXTRACFLAGS += -I$(srctree)/tools/include @@ -84,9 +86,11 @@ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/kernel_info.o $(obj)/head_$(BITS).o vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o ifdef CONFIG_X86_64 - vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr_64.o + vmlinux-objs-y += $(obj)/ident_map_64.o + vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o vmlinux-objs-y += $(obj)/mem_encrypt.o vmlinux-objs-y += $(obj)/pgtable_64.o + vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev-es.o endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o @@ -94,30 +98,8 @@ vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o efi-obj-$(CONFIG_EFI_STUB) = $(objtree)/drivers/firmware/efi/libstub/lib.a -# The compressed kernel is built with -fPIC/-fPIE so that a boot loader -# can place it anywhere in memory and it will still run. However, since -# it is executed as-is without any ELF relocation processing performed -# (and has already had all relocation sections stripped from the binary), -# none of the code can use data relocations (e.g. static assignments of -# pointer values), since they will be meaningless at runtime. This check -# will refuse to link the vmlinux if any of these relocations are found. -quiet_cmd_check_data_rel = DATAREL $@ -define cmd_check_data_rel - for obj in $(filter %.o,$^); do \ - $(READELF) -S $$obj | grep -qF .rel.local && { \ - echo "error: $$obj has data relocations!" >&2; \ - exit 1; \ - } || true; \ - done -endef - -# We need to run two commands under "if_changed", so merge them into a -# single invocation. -quiet_cmd_check-and-link-vmlinux = LD $@ - cmd_check-and-link-vmlinux = $(cmd_check_data_rel); $(cmd_ld) - $(obj)/vmlinux: $(vmlinux-objs-y) $(efi-obj-y) FORCE - $(call if_changed,check-and-link-vmlinux) + $(call if_changed,ld) OBJCOPYFLAGS_vmlinux.bin := -R .comment -S $(obj)/vmlinux.bin: vmlinux FORCE diff --git a/arch/x86/boot/compressed/cpuflags.c b/arch/x86/boot/compressed/cpuflags.c index 6448a8196d32..0cc1323896d1 100644 --- a/arch/x86/boot/compressed/cpuflags.c +++ b/arch/x86/boot/compressed/cpuflags.c @@ -1,6 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#ifdef CONFIG_RANDOMIZE_BASE - #include "../cpuflags.c" bool has_cpuflag(int flag) @@ -9,5 +7,3 @@ bool has_cpuflag(int flag) return test_bit(flag, cpu.flags); } - -#endif diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 03557f2174bf..659fad53ca82 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -33,32 +33,13 @@ #include <asm/bootparam.h> /* - * The 32-bit x86 assembler in binutils 2.26 will generate R_386_GOT32X - * relocation to get the symbol address in PIC. When the compressed x86 - * kernel isn't built as PIC, the linker optimizes R_386_GOT32X - * relocations to their fixed symbol addresses. However, when the - * compressed x86 kernel is loaded at a different address, it leads - * to the following load failure: - * - * Failed to allocate space for phdrs - * - * during the decompression stage. - * - * If the compressed x86 kernel is relocatable at run-time, it should be - * compiled with -fPIE, instead of -fPIC, if possible and should be built as - * Position Independent Executable (PIE) so that linker won't optimize - * R_386_GOT32X relocation to its fixed symbol address. Older - * linkers generate R_386_32 relocations against locally defined symbols, - * _bss, _ebss, _got, _egot and _end, in PIE. It isn't wrong, just less - * optimal than R_386_RELATIVE. But the x86 kernel fails to properly handle - * R_386_32 relocations when relocating the kernel. To generate - * R_386_RELATIVE relocations, we mark _bss, _ebss, _got, _egot and _end as - * hidden: + * These symbols needed to be marked as .hidden to prevent the BFD linker from + * generating R_386_32 (rather than R_386_RELATIVE) relocations for them when + * the 32-bit compressed kernel is linked as PIE. This is no longer necessary, + * but it doesn't hurt to keep them .hidden. */ .hidden _bss .hidden _ebss - .hidden _got - .hidden _egot .hidden _end __HEAD @@ -77,10 +58,10 @@ SYM_FUNC_START(startup_32) leal (BP_scratch+4)(%esi), %esp call 1f 1: popl %edx - subl $1b, %edx + addl $_GLOBAL_OFFSET_TABLE_+(.-1b), %edx /* Load new GDT */ - leal gdt(%edx), %eax + leal gdt@GOTOFF(%edx), %eax movl %eax, 2(%eax) lgdt (%eax) @@ -93,14 +74,16 @@ SYM_FUNC_START(startup_32) movl %eax, %ss /* - * %edx contains the address we are loaded at by the boot loader and %ebx - * contains the address where we should move the kernel image temporarily - * for safe in-place decompression. %ebp contains the address that the kernel - * will be decompressed to. + * %edx contains the address we are loaded at by the boot loader (plus the + * offset to the GOT). The below code calculates %ebx to be the address where + * we should move the kernel image temporarily for safe in-place decompression + * (again, plus the offset to the GOT). + * + * %ebp is calculated to be the address that the kernel will be decompressed to. */ #ifdef CONFIG_RELOCATABLE - movl %edx, %ebx + leal startup_32@GOTOFF(%edx), %ebx #ifdef CONFIG_EFI_STUB /* @@ -111,7 +94,7 @@ SYM_FUNC_START(startup_32) * image_offset = startup_32 - image_base * Otherwise image_offset will be zero and has no effect on the calculations. */ - subl image_offset(%edx), %ebx + subl image_offset@GOTOFF(%edx), %ebx #endif movl BP_kernel_alignment(%esi), %eax @@ -128,10 +111,10 @@ SYM_FUNC_START(startup_32) movl %ebx, %ebp // Save the output address for later /* Target address to relocate to for decompression */ addl BP_init_size(%esi), %ebx - subl $_end, %ebx + subl $_end@GOTOFF, %ebx /* Set up the stack */ - leal boot_stack_end(%ebx), %esp + leal boot_stack_end@GOTOFF(%ebx), %esp /* Zero EFLAGS */ pushl $0 @@ -142,8 +125,8 @@ SYM_FUNC_START(startup_32) * where decompression in place becomes safe. */ pushl %esi - leal (_bss-4)(%edx), %esi - leal (_bss-4)(%ebx), %edi + leal (_bss@GOTOFF-4)(%edx), %esi + leal (_bss@GOTOFF-4)(%ebx), %edi movl $(_bss - startup_32), %ecx shrl $2, %ecx std @@ -156,14 +139,14 @@ SYM_FUNC_START(startup_32) * during extract_kernel below. To avoid any issues, repoint the GDTR * to the new copy of the GDT. */ - leal gdt(%ebx), %eax + leal gdt@GOTOFF(%ebx), %eax movl %eax, 2(%eax) lgdt (%eax) /* * Jump to the relocated address. */ - leal .Lrelocated(%ebx), %eax + leal .Lrelocated@GOTOFF(%ebx), %eax jmp *%eax SYM_FUNC_END(startup_32) @@ -173,7 +156,7 @@ SYM_FUNC_START_ALIAS(efi_stub_entry) add $0x4, %esp movl 8(%esp), %esi /* save boot_params pointer */ call efi_main - leal startup_32(%eax), %eax + /* efi_main returns the possibly relocated address of startup_32 */ jmp *%eax SYM_FUNC_END(efi32_stub_entry) SYM_FUNC_END_ALIAS(efi_stub_entry) @@ -186,40 +169,26 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) * Clear BSS (stack is currently empty) */ xorl %eax, %eax - leal _bss(%ebx), %edi - leal _ebss(%ebx), %ecx + leal _bss@GOTOFF(%ebx), %edi + leal _ebss@GOTOFF(%ebx), %ecx subl %edi, %ecx shrl $2, %ecx rep stosl /* - * Adjust our own GOT - */ - leal _got(%ebx), %edx - leal _egot(%ebx), %ecx -1: - cmpl %ecx, %edx - jae 2f - addl %ebx, (%edx) - addl $4, %edx - jmp 1b -2: - -/* * Do the extraction, and jump to the new kernel.. */ - /* push arguments for extract_kernel: */ - pushl $z_output_len /* decompressed length, end of relocs */ - - pushl %ebp /* output address */ - - pushl $z_input_len /* input_len */ - leal input_data(%ebx), %eax - pushl %eax /* input_data */ - leal boot_heap(%ebx), %eax - pushl %eax /* heap area */ - pushl %esi /* real mode pointer */ - call extract_kernel /* returns kernel location in %eax */ + /* push arguments for extract_kernel: */ + + pushl output_len@GOTOFF(%ebx) /* decompressed length, end of relocs */ + pushl %ebp /* output address */ + pushl input_len@GOTOFF(%ebx) /* input_len */ + leal input_data@GOTOFF(%ebx), %eax + pushl %eax /* input_data */ + leal boot_heap@GOTOFF(%ebx), %eax + pushl %eax /* heap area */ + pushl %esi /* real mode pointer */ + call extract_kernel /* returns kernel location in %eax */ addl $24, %esp /* diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 97d37f0a34f5..1c80f1738fd9 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -33,6 +33,7 @@ #include <asm/processor-flags.h> #include <asm/asm-offsets.h> #include <asm/bootparam.h> +#include <asm/desc_defs.h> #include "pgtable.h" /* @@ -40,11 +41,35 @@ */ .hidden _bss .hidden _ebss - .hidden _got - .hidden _egot .hidden _end __HEAD + +/* + * This macro gives the relative virtual address of X, i.e. the offset of X + * from startup_32. This is the same as the link-time virtual address of X, + * since startup_32 is at 0, but defining it this way tells the + * assembler/linker that we do not want the actual run-time address of X. This + * prevents the linker from trying to create unwanted run-time relocation + * entries for the reference when the compressed kernel is linked as PIE. + * + * A reference X(%reg) will result in the link-time VA of X being stored with + * the instruction, and a run-time R_X86_64_RELATIVE relocation entry that + * adds the 64-bit base address where the kernel is loaded. + * + * Replacing it with (X-startup_32)(%reg) results in the offset being stored, + * and no run-time relocation. + * + * The macro should be used as a displacement with a base register containing + * the run-time address of startup_32 [i.e. rva(X)(%reg)], or as an immediate + * [$ rva(X)]. + * + * This macro can only be used from within the .head.text section, since the + * expression requires startup_32 to be in the same section as the code being + * assembled. + */ +#define rva(X) ((X) - startup_32) + .code32 SYM_FUNC_START(startup_32) /* @@ -67,10 +92,10 @@ SYM_FUNC_START(startup_32) leal (BP_scratch+4)(%esi), %esp call 1f 1: popl %ebp - subl $1b, %ebp + subl $ rva(1b), %ebp /* Load new GDT with the 64bit segments using 32bit descriptor */ - leal gdt(%ebp), %eax + leal rva(gdt)(%ebp), %eax movl %eax, 2(%eax) lgdt (%eax) @@ -83,7 +108,7 @@ SYM_FUNC_START(startup_32) movl %eax, %ss /* setup a stack and make sure cpu supports long mode. */ - leal boot_stack_end(%ebp), %esp + leal rva(boot_stack_end)(%ebp), %esp call verify_cpu testl %eax, %eax @@ -110,7 +135,7 @@ SYM_FUNC_START(startup_32) * image_offset = startup_32 - image_base * Otherwise image_offset will be zero and has no effect on the calculations. */ - subl image_offset(%ebp), %ebx + subl rva(image_offset)(%ebp), %ebx #endif movl BP_kernel_alignment(%esi), %eax @@ -126,7 +151,7 @@ SYM_FUNC_START(startup_32) /* Target address to relocate to for decompression */ addl BP_init_size(%esi), %ebx - subl $_end, %ebx + subl $ rva(_end), %ebx /* * Prepare for entering 64 bit mode @@ -154,19 +179,19 @@ SYM_FUNC_START(startup_32) 1: /* Initialize Page tables to 0 */ - leal pgtable(%ebx), %edi + leal rva(pgtable)(%ebx), %edi xorl %eax, %eax movl $(BOOT_INIT_PGT_SIZE/4), %ecx rep stosl /* Build Level 4 */ - leal pgtable + 0(%ebx), %edi + leal rva(pgtable + 0)(%ebx), %edi leal 0x1007 (%edi), %eax movl %eax, 0(%edi) addl %edx, 4(%edi) /* Build Level 3 */ - leal pgtable + 0x1000(%ebx), %edi + leal rva(pgtable + 0x1000)(%ebx), %edi leal 0x1007(%edi), %eax movl $4, %ecx 1: movl %eax, 0x00(%edi) @@ -177,7 +202,7 @@ SYM_FUNC_START(startup_32) jnz 1b /* Build Level 2 */ - leal pgtable + 0x2000(%ebx), %edi + leal rva(pgtable + 0x2000)(%ebx), %edi movl $0x00000183, %eax movl $2048, %ecx 1: movl %eax, 0(%edi) @@ -188,7 +213,7 @@ SYM_FUNC_START(startup_32) jnz 1b /* Enable the boot page tables */ - leal pgtable(%ebx), %eax + leal rva(pgtable)(%ebx), %eax movl %eax, %cr3 /* Enable Long mode in EFER (Extended Feature Enable Register) */ @@ -213,14 +238,14 @@ SYM_FUNC_START(startup_32) * We place all of the values on our mini stack so lret can * used to perform that far jump. */ - leal startup_64(%ebp), %eax + leal rva(startup_64)(%ebp), %eax #ifdef CONFIG_EFI_MIXED - movl efi32_boot_args(%ebp), %edi + movl rva(efi32_boot_args)(%ebp), %edi cmp $0, %edi jz 1f - leal efi64_stub_entry(%ebp), %eax - movl efi32_boot_args+4(%ebp), %esi - movl efi32_boot_args+8(%ebp), %edx // saved bootparams pointer + leal rva(efi64_stub_entry)(%ebp), %eax + movl rva(efi32_boot_args+4)(%ebp), %esi + movl rva(efi32_boot_args+8)(%ebp), %edx // saved bootparams pointer cmpl $0, %edx jnz 1f /* @@ -231,7 +256,7 @@ SYM_FUNC_START(startup_32) * the correct stack alignment for entry. */ subl $40, %esp - leal efi_pe_entry(%ebp), %eax + leal rva(efi_pe_entry)(%ebp), %eax movl %edi, %ecx // MS calling convention movl %esi, %edx 1: @@ -257,18 +282,18 @@ SYM_FUNC_START(efi32_stub_entry) call 1f 1: pop %ebp - subl $1b, %ebp + subl $ rva(1b), %ebp - movl %esi, efi32_boot_args+8(%ebp) + movl %esi, rva(efi32_boot_args+8)(%ebp) SYM_INNER_LABEL(efi32_pe_stub_entry, SYM_L_LOCAL) - movl %ecx, efi32_boot_args(%ebp) - movl %edx, efi32_boot_args+4(%ebp) - movb $0, efi_is64(%ebp) + movl %ecx, rva(efi32_boot_args)(%ebp) + movl %edx, rva(efi32_boot_args+4)(%ebp) + movb $0, rva(efi_is64)(%ebp) /* Save firmware GDTR and code/data selectors */ - sgdtl efi32_boot_gdt(%ebp) - movw %cs, efi32_boot_cs(%ebp) - movw %ds, efi32_boot_ds(%ebp) + sgdtl rva(efi32_boot_gdt)(%ebp) + movw %cs, rva(efi32_boot_cs)(%ebp) + movw %ds, rva(efi32_boot_ds)(%ebp) /* Disable paging */ movl %cr0, %eax @@ -347,30 +372,11 @@ SYM_CODE_START(startup_64) /* Target address to relocate to for decompression */ movl BP_init_size(%rsi), %ebx - subl $_end, %ebx + subl $ rva(_end), %ebx addq %rbp, %rbx /* Set up the stack */ - leaq boot_stack_end(%rbx), %rsp - - /* - * paging_prepare() and cleanup_trampoline() below can have GOT - * references. Adjust the table with address we are running at. - * - * Zero RAX for adjust_got: the GOT was not adjusted before; - * there's no adjustment to undo. - */ - xorq %rax, %rax - - /* - * Calculate the address the binary is loaded at and use it as - * a GOT adjustment. - */ - call 1f -1: popq %rdi - subq $1b, %rdi - - call .Ladjust_got + leaq rva(boot_stack_end)(%rbx), %rsp /* * At this point we are in long mode with 4-level paging enabled, @@ -410,6 +416,10 @@ SYM_CODE_START(startup_64) .Lon_kernel_cs: + pushq %rsi + call load_stage1_idt + popq %rsi + /* * paging_prepare() sets up the trampoline and checks if we need to * enable 5-level paging. @@ -444,7 +454,7 @@ SYM_CODE_START(startup_64) lretq trampoline_return: /* Restore the stack, the 32-bit trampoline uses its own stack */ - leaq boot_stack_end(%rbx), %rsp + leaq rva(boot_stack_end)(%rbx), %rsp /* * cleanup_trampoline() would restore trampoline memory. @@ -456,7 +466,7 @@ trampoline_return: * this function call. */ pushq %rsi - leaq top_pgtable(%rbx), %rdi + leaq rva(top_pgtable)(%rbx), %rdi call cleanup_trampoline popq %rsi @@ -464,30 +474,15 @@ trampoline_return: pushq $0 popfq - /* - * Previously we've adjusted the GOT with address the binary was - * loaded at. Now we need to re-adjust for relocation address. - * - * Calculate the address the binary is loaded at, so that we can - * undo the previous GOT adjustment. - */ - call 1f -1: popq %rax - subq $1b, %rax - - /* The new adjustment is the relocation address */ - movq %rbx, %rdi - call .Ladjust_got - /* * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. */ pushq %rsi leaq (_bss-8)(%rip), %rsi - leaq (_bss-8)(%rbx), %rdi - movq $_bss /* - $startup_32 */, %rcx - shrq $3, %rcx + leaq rva(_bss-8)(%rbx), %rdi + movl $(_bss - startup_32), %ecx + shrl $3, %ecx std rep movsq cld @@ -498,15 +493,15 @@ trampoline_return: * during extract_kernel below. To avoid any issues, repoint the GDTR * to the new copy of the GDT. */ - leaq gdt64(%rbx), %rax - leaq gdt(%rbx), %rdx + leaq rva(gdt64)(%rbx), %rax + leaq rva(gdt)(%rbx), %rdx movq %rdx, 2(%rax) lgdt (%rax) /* * Jump to the relocated address. */ - leaq .Lrelocated(%rbx), %rax + leaq rva(.Lrelocated)(%rbx), %rax jmp *%rax SYM_CODE_END(startup_64) @@ -518,7 +513,7 @@ SYM_FUNC_START_ALIAS(efi_stub_entry) movq %rdx, %rbx /* save boot_params pointer */ call efi_main movq %rbx,%rsi - leaq startup_64(%rax), %rax + leaq rva(startup_64)(%rax), %rax jmp *%rax SYM_FUNC_END(efi64_stub_entry) SYM_FUNC_END_ALIAS(efi_stub_entry) @@ -538,15 +533,30 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) rep stosq /* + * If running as an SEV guest, the encryption mask is required in the + * page-table setup code below. When the guest also has SEV-ES enabled + * set_sev_encryption_mask() will cause #VC exceptions, but the stage2 + * handler can't map its GHCB because the page-table is not set up yet. + * So set up the encryption mask here while still on the stage1 #VC + * handler. Then load stage2 IDT and switch to the kernel's own + * page-table. + */ + pushq %rsi + call set_sev_encryption_mask + call load_stage2_idt + call initialize_identity_maps + popq %rsi + +/* * Do the extraction, and jump to the new kernel.. */ pushq %rsi /* Save the real mode argument */ movq %rsi, %rdi /* real mode address */ leaq boot_heap(%rip), %rsi /* malloc area for uncompression */ leaq input_data(%rip), %rdx /* input_data */ - movl $z_input_len, %ecx /* input_len */ + movl input_len(%rip), %ecx /* input_len */ movq %rbp, %r8 /* output target address */ - movl $z_output_len, %r9d /* decompressed length, end of relocs */ + movl output_len(%rip), %r9d /* decompressed length, end of relocs */ call extract_kernel /* returns kernel location in %rax */ popq %rsi @@ -556,27 +566,6 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) jmp *%rax SYM_FUNC_END(.Lrelocated) -/* - * Adjust the global offset table - * - * RAX is the previous adjustment of the table to undo (use 0 if it's the - * first time we touch GOT). - * RDI is the new adjustment to apply. - */ -.Ladjust_got: - /* Walk through the GOT adding the address to the entries */ - leaq _got(%rip), %rdx - leaq _egot(%rip), %rcx -1: - cmpq %rcx, %rdx - jae 2f - subq %rax, (%rdx) /* Undo previous adjustment */ - addq %rdi, (%rdx) /* Apply the new adjustment */ - addq $8, %rdx - jmp 1b -2: - ret - .code32 /* * This is the 32-bit trampoline that will be copied over to low memory. @@ -690,10 +679,21 @@ SYM_DATA_START_LOCAL(gdt) .quad 0x0000000000000000 /* TS continued */ SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end) +SYM_DATA_START(boot_idt_desc) + .word boot_idt_end - boot_idt - 1 + .quad 0 +SYM_DATA_END(boot_idt_desc) + .balign 8 +SYM_DATA_START(boot_idt) + .rept BOOT_IDT_ENTRIES + .quad 0 + .quad 0 + .endr +SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end) + #ifdef CONFIG_EFI_STUB SYM_DATA(image_offset, .long 0) #endif - #ifdef CONFIG_EFI_MIXED SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0, 0) SYM_DATA(efi_is64, .byte 1) @@ -702,7 +702,7 @@ SYM_DATA(efi_is64, .byte 1) #define BS32_handle_protocol 88 // offsetof(efi_boot_services_32_t, handle_protocol) #define LI32_image_base 32 // offsetof(efi_loaded_image_32_t, image_base) - .text + __HEAD .code32 SYM_FUNC_START(efi32_pe_entry) /* @@ -724,12 +724,12 @@ SYM_FUNC_START(efi32_pe_entry) call 1f 1: pop %ebx - subl $1b, %ebx + subl $ rva(1b), %ebx /* Get the loaded image protocol pointer from the image handle */ leal -4(%ebp), %eax pushl %eax // &loaded_image - leal loaded_image_proto(%ebx), %eax + leal rva(loaded_image_proto)(%ebx), %eax pushl %eax // pass the GUID address pushl 8(%ebp) // pass the image handle @@ -764,7 +764,7 @@ SYM_FUNC_START(efi32_pe_entry) * use it before we get to the 64-bit efi_pe_entry() in C code. */ subl %esi, %ebx - movl %ebx, image_offset(%ebp) // save image_offset + movl %ebx, rva(image_offset)(%ebp) // save image_offset jmp efi32_pe_stub_entry 2: popl %edi // restore callee-save registers diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c new file mode 100644 index 000000000000..063a60edcf99 --- /dev/null +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -0,0 +1,349 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This code is used on x86_64 to create page table identity mappings on + * demand by building up a new set of page tables (or appending to the + * existing ones), and then switching over to them when ready. + * + * Copyright (C) 2015-2016 Yinghai Lu + * Copyright (C) 2016 Kees Cook + */ + +/* + * Since we're dealing with identity mappings, physical and virtual + * addresses are the same, so override these defines which are ultimately + * used by the headers in misc.h. + */ +#define __pa(x) ((unsigned long)(x)) +#define __va(x) ((void *)((unsigned long)(x))) + +/* No PAGE_TABLE_ISOLATION support needed either: */ +#undef CONFIG_PAGE_TABLE_ISOLATION + +#include "error.h" +#include "misc.h" + +/* These actually do the work of building the kernel identity maps. */ +#include <linux/pgtable.h> +#include <asm/cmpxchg.h> +#include <asm/trap_pf.h> +#include <asm/trapnr.h> +#include <asm/init.h> +/* Use the static base for this part of the boot process */ +#undef __PAGE_OFFSET +#define __PAGE_OFFSET __PAGE_OFFSET_BASE +#include "../../mm/ident_map.c" + +#ifdef CONFIG_X86_5LEVEL +unsigned int __pgtable_l5_enabled; +unsigned int pgdir_shift = 39; +unsigned int ptrs_per_p4d = 1; +#endif + +/* Used by PAGE_KERN* macros: */ +pteval_t __default_kernel_pte_mask __read_mostly = ~0; + +/* Used to track our page table allocation area. */ +struct alloc_pgt_data { + unsigned char *pgt_buf; + unsigned long pgt_buf_size; + unsigned long pgt_buf_offset; +}; + +/* + * Allocates space for a page table entry, using struct alloc_pgt_data + * above. Besides the local callers, this is used as the allocation + * callback in mapping_info below. + */ +static void *alloc_pgt_page(void *context) +{ + struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context; + unsigned char *entry; + + /* Validate there is space available for a new page. */ + if (pages->pgt_buf_offset >= pages->pgt_buf_size) { + debug_putstr("out of pgt_buf in " __FILE__ "!?\n"); + debug_putaddr(pages->pgt_buf_offset); + debug_putaddr(pages->pgt_buf_size); + return NULL; + } + + entry = pages->pgt_buf + pages->pgt_buf_offset; + pages->pgt_buf_offset += PAGE_SIZE; + + return entry; +} + +/* Used to track our allocated page tables. */ +static struct alloc_pgt_data pgt_data; + +/* The top level page table entry pointer. */ +static unsigned long top_level_pgt; + +phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; + +/* + * Mapping information structure passed to kernel_ident_mapping_init(). + * Due to relocation, pointers must be assigned at run time not build time. + */ +static struct x86_mapping_info mapping_info; + +/* + * Adds the specified range to the identity mappings. + */ +static void add_identity_map(unsigned long start, unsigned long end) +{ + int ret; + + /* Align boundary to 2M. */ + start = round_down(start, PMD_SIZE); + end = round_up(end, PMD_SIZE); + if (start >= end) + return; + + /* Build the mapping. */ + ret = kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt, start, end); + if (ret) + error("Error: kernel_ident_mapping_init() failed\n"); +} + +/* Locates and clears a region for a new top level page table. */ +void initialize_identity_maps(void) +{ + /* Exclude the encryption mask from __PHYSICAL_MASK */ + physical_mask &= ~sme_me_mask; + + /* Init mapping_info with run-time function/buffer pointers. */ + mapping_info.alloc_pgt_page = alloc_pgt_page; + mapping_info.context = &pgt_data; + mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask; + mapping_info.kernpg_flag = _KERNPG_TABLE; + + /* + * It should be impossible for this not to already be true, + * but since calling this a second time would rewind the other + * counters, let's just make sure this is reset too. + */ + pgt_data.pgt_buf_offset = 0; + + /* + * If we came here via startup_32(), cr3 will be _pgtable already + * and we must append to the existing area instead of entirely + * overwriting it. + * + * With 5-level paging, we use '_pgtable' to allocate the p4d page table, + * the top-level page table is allocated separately. + * + * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level + * cases. On 4-level paging it's equal to 'top_level_pgt'. + */ + top_level_pgt = read_cr3_pa(); + if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) { + pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; + pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE; + memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); + } else { + pgt_data.pgt_buf = _pgtable; + pgt_data.pgt_buf_size = BOOT_PGT_SIZE; + memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); + top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data); + } + + /* + * New page-table is set up - map the kernel image and load it + * into cr3. + */ + add_identity_map((unsigned long)_head, (unsigned long)_end); + write_cr3(top_level_pgt); +} + +/* + * This switches the page tables to the new level4 that has been built + * via calls to add_identity_map() above. If booted via startup_32(), + * this is effectively a no-op. + */ +void finalize_identity_maps(void) +{ + write_cr3(top_level_pgt); +} + +static pte_t *split_large_pmd(struct x86_mapping_info *info, + pmd_t *pmdp, unsigned long __address) +{ + unsigned long page_flags; + unsigned long address; + pte_t *pte; + pmd_t pmd; + int i; + + pte = (pte_t *)info->alloc_pgt_page(info->context); + if (!pte) + return NULL; + + address = __address & PMD_MASK; + /* No large page - clear PSE flag */ + page_flags = info->page_flag & ~_PAGE_PSE; + + /* Populate the PTEs */ + for (i = 0; i < PTRS_PER_PMD; i++) { + set_pte(&pte[i], __pte(address | page_flags)); + address += PAGE_SIZE; + } + + /* + * Ideally we need to clear the large PMD first and do a TLB + * flush before we write the new PMD. But the 2M range of the + * PMD might contain the code we execute and/or the stack + * we are on, so we can't do that. But that should be safe here + * because we are going from large to small mappings and we are + * also the only user of the page-table, so there is no chance + * of a TLB multihit. + */ + pmd = __pmd((unsigned long)pte | info->kernpg_flag); + set_pmd(pmdp, pmd); + /* Flush TLB to establish the new PMD */ + write_cr3(top_level_pgt); + + return pte + pte_index(__address); +} + +static void clflush_page(unsigned long address) +{ + unsigned int flush_size; + char *cl, *start, *end; + + /* + * Hardcode cl-size to 64 - CPUID can't be used here because that might + * cause another #VC exception and the GHCB is not ready to use yet. + */ + flush_size = 64; + start = (char *)(address & PAGE_MASK); + end = start + PAGE_SIZE; + + /* + * First make sure there are no pending writes on the cache-lines to + * flush. + */ + asm volatile("mfence" : : : "memory"); + + for (cl = start; cl != end; cl += flush_size) + clflush(cl); +} + +static int set_clr_page_flags(struct x86_mapping_info *info, + unsigned long address, + pteval_t set, pteval_t clr) +{ + pgd_t *pgdp = (pgd_t *)top_level_pgt; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep, pte; + + /* + * First make sure there is a PMD mapping for 'address'. + * It should already exist, but keep things generic. + * + * To map the page just read from it and fault it in if there is no + * mapping yet. add_identity_map() can't be called here because that + * would unconditionally map the address on PMD level, destroying any + * PTE-level mappings that might already exist. Use assembly here so + * the access won't be optimized away. + */ + asm volatile("mov %[address], %%r9" + :: [address] "g" (*(unsigned long *)address) + : "r9", "memory"); + + /* + * The page is mapped at least with PMD size - so skip checks and walk + * directly to the PMD. + */ + p4dp = p4d_offset(pgdp, address); + pudp = pud_offset(p4dp, address); + pmdp = pmd_offset(pudp, address); + + if (pmd_large(*pmdp)) + ptep = split_large_pmd(info, pmdp, address); + else + ptep = pte_offset_kernel(pmdp, address); + + if (!ptep) + return -ENOMEM; + + /* + * Changing encryption attributes of a page requires to flush it from + * the caches. + */ + if ((set | clr) & _PAGE_ENC) + clflush_page(address); + + /* Update PTE */ + pte = *ptep; + pte = pte_set_flags(pte, set); + pte = pte_clear_flags(pte, clr); + set_pte(ptep, pte); + + /* Flush TLB after changing encryption attribute */ + write_cr3(top_level_pgt); + + return 0; +} + +int set_page_decrypted(unsigned long address) +{ + return set_clr_page_flags(&mapping_info, address, 0, _PAGE_ENC); +} + +int set_page_encrypted(unsigned long address) +{ + return set_clr_page_flags(&mapping_info, address, _PAGE_ENC, 0); +} + +int set_page_non_present(unsigned long address) +{ + return set_clr_page_flags(&mapping_info, address, 0, _PAGE_PRESENT); +} + +static void do_pf_error(const char *msg, unsigned long error_code, + unsigned long address, unsigned long ip) +{ + error_putstr(msg); + + error_putstr("\nError Code: "); + error_puthex(error_code); + error_putstr("\nCR2: 0x"); + error_puthex(address); + error_putstr("\nRIP relative to _head: 0x"); + error_puthex(ip - (unsigned long)_head); + error_putstr("\n"); + + error("Stopping.\n"); +} + +void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + unsigned long address = native_read_cr2(); + unsigned long end; + bool ghcb_fault; + + ghcb_fault = sev_es_check_ghcb_fault(address); + + address &= PMD_MASK; + end = address + PMD_SIZE; + + /* + * Check for unexpected error codes. Unexpected are: + * - Faults on present pages + * - User faults + * - Reserved bits set + */ + if (error_code & (X86_PF_PROT | X86_PF_USER | X86_PF_RSVD)) + do_pf_error("Unexpected page-fault:", error_code, address, regs->ip); + else if (ghcb_fault) + do_pf_error("Page-fault on GHCB page:", error_code, address, regs->ip); + + /* + * Error code is sane - now identity map the 2M region around + * the faulting address. + */ + add_identity_map(address, end); +} diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c new file mode 100644 index 000000000000..804a502ee0d2 --- /dev/null +++ b/arch/x86/boot/compressed/idt_64.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <asm/trap_pf.h> +#include <asm/segment.h> +#include <asm/trapnr.h> +#include "misc.h" + +static void set_idt_entry(int vector, void (*handler)(void)) +{ + unsigned long address = (unsigned long)handler; + gate_desc entry; + + memset(&entry, 0, sizeof(entry)); + + entry.offset_low = (u16)(address & 0xffff); + entry.segment = __KERNEL_CS; + entry.bits.type = GATE_TRAP; + entry.bits.p = 1; + entry.offset_middle = (u16)((address >> 16) & 0xffff); + entry.offset_high = (u32)(address >> 32); + + memcpy(&boot_idt[vector], &entry, sizeof(entry)); +} + +/* Have this here so we don't need to include <asm/desc.h> */ +static void load_boot_idt(const struct desc_ptr *dtr) +{ + asm volatile("lidt %0"::"m" (*dtr)); +} + +/* Setup IDT before kernel jumping to .Lrelocated */ +void load_stage1_idt(void) +{ + boot_idt_desc.address = (unsigned long)boot_idt; + + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) + set_idt_entry(X86_TRAP_VC, boot_stage1_vc); + + load_boot_idt(&boot_idt_desc); +} + +/* Setup IDT after kernel jumping to .Lrelocated */ +void load_stage2_idt(void) +{ + boot_idt_desc.address = (unsigned long)boot_idt; + + set_idt_entry(X86_TRAP_PF, boot_page_fault); + +#ifdef CONFIG_AMD_MEM_ENCRYPT + set_idt_entry(X86_TRAP_VC, boot_stage2_vc); +#endif + + load_boot_idt(&boot_idt_desc); +} diff --git a/arch/x86/boot/compressed/idt_handlers_64.S b/arch/x86/boot/compressed/idt_handlers_64.S new file mode 100644 index 000000000000..22890e199f5b --- /dev/null +++ b/arch/x86/boot/compressed/idt_handlers_64.S @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Early IDT handler entry points + * + * Copyright (C) 2019 SUSE + * + * Author: Joerg Roedel <jroedel@suse.de> + */ + +#include <asm/segment.h> + +/* For ORIG_RAX */ +#include "../../entry/calling.h" + +.macro EXCEPTION_HANDLER name function error_code=0 +SYM_FUNC_START(\name) + + /* Build pt_regs */ + .if \error_code == 0 + pushq $0 + .endif + + pushq %rdi + pushq %rsi + pushq %rdx + pushq %rcx + pushq %rax + pushq %r8 + pushq %r9 + pushq %r10 + pushq %r11 + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + /* Call handler with pt_regs */ + movq %rsp, %rdi + /* Error code is second parameter */ + movq ORIG_RAX(%rsp), %rsi + call \function + + /* Restore regs */ + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + popq %rdi + + /* Remove error code and return */ + addq $8, %rsp + + iretq +SYM_FUNC_END(\name) + .endm + + .text + .code64 + +EXCEPTION_HANDLER boot_page_fault do_boot_page_fault error_code=1 + +#ifdef CONFIG_AMD_MEM_ENCRYPT +EXCEPTION_HANDLER boot_stage1_vc do_vc_no_ghcb error_code=1 +EXCEPTION_HANDLER boot_stage2_vc do_boot_stage2_vc error_code=1 +#endif diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index dde7cb3724df..b59547ce5b19 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -36,17 +36,12 @@ #define STATIC #include <linux/decompress/mm.h> -#ifdef CONFIG_X86_5LEVEL -unsigned int __pgtable_l5_enabled; -unsigned int pgdir_shift __ro_after_init = 39; -unsigned int ptrs_per_p4d __ro_after_init = 1; -#endif +#define _SETUP +#include <asm/setup.h> /* For COMMAND_LINE_SIZE */ +#undef _SETUP extern unsigned long get_cmd_line_ptr(void); -/* Used by PAGE_KERN* macros: */ -pteval_t __default_kernel_pte_mask __read_mostly = ~0; - /* Simplified build-specific string for starting entropy. */ static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; @@ -87,8 +82,11 @@ static unsigned long get_boot_seed(void) static bool memmap_too_large; -/* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */ -static unsigned long long mem_limit = ULLONG_MAX; +/* + * Store memory limit: MAXMEM on 64-bit and KERNEL_IMAGE_SIZE on 32-bit. + * It may be reduced by "mem=nn[KMG]" or "memmap=nn[KMG]" command line options. + */ +static u64 mem_limit; /* Number of immovable memory regions */ static int num_immovable_mem; @@ -131,8 +129,7 @@ enum parse_mode { }; static int -parse_memmap(char *p, unsigned long long *start, unsigned long long *size, - enum parse_mode mode) +parse_memmap(char *p, u64 *start, u64 *size, enum parse_mode mode) { char *oldp; @@ -162,7 +159,7 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size, */ *size = 0; } else { - unsigned long long flags; + u64 flags; /* * efi_fake_mem=nn@ss:attr the attr specifies @@ -201,7 +198,7 @@ static void mem_avoid_memmap(enum parse_mode mode, char *str) while (str && (i < MAX_MEMMAP_REGIONS)) { int rc; - unsigned long long start, size; + u64 start, size; char *k = strchr(str, ','); if (k) @@ -214,7 +211,7 @@ static void mem_avoid_memmap(enum parse_mode mode, char *str) if (start == 0) { /* Store the specified memory limit if size > 0 */ - if (size > 0) + if (size > 0 && size < mem_limit) mem_limit = size; continue; @@ -261,15 +258,15 @@ static void parse_gb_huge_pages(char *param, char *val) static void handle_mem_options(void) { char *args = (char *)get_cmd_line_ptr(); - size_t len = strlen((char *)args); + size_t len; char *tmp_cmdline; char *param, *val; u64 mem_size; - if (!strstr(args, "memmap=") && !strstr(args, "mem=") && - !strstr(args, "hugepages")) + if (!args) return; + len = strnlen(args, COMMAND_LINE_SIZE-1); tmp_cmdline = malloc(len + 1); if (!tmp_cmdline) error("Failed to allocate space for tmp_cmdline"); @@ -284,14 +281,12 @@ static void handle_mem_options(void) while (*args) { args = next_arg(args, ¶m, &val); /* Stop at -- */ - if (!val && strcmp(param, "--") == 0) { - warn("Only '--' specified in cmdline"); - goto out; - } + if (!val && strcmp(param, "--") == 0) + break; if (!strcmp(param, "memmap")) { mem_avoid_memmap(PARSE_MEMMAP, val); - } else if (strstr(param, "hugepages")) { + } else if (IS_ENABLED(CONFIG_X86_64) && strstr(param, "hugepages")) { parse_gb_huge_pages(param, val); } else if (!strcmp(param, "mem")) { char *p = val; @@ -300,21 +295,23 @@ static void handle_mem_options(void) continue; mem_size = memparse(p, &p); if (mem_size == 0) - goto out; + break; - mem_limit = mem_size; + if (mem_size < mem_limit) + mem_limit = mem_size; } else if (!strcmp(param, "efi_fake_mem")) { mem_avoid_memmap(PARSE_EFI, val); } } -out: free(tmp_cmdline); return; } /* - * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T). + * In theory, KASLR can put the kernel anywhere in the range of [16M, MAXMEM) + * on 64-bit, and [16M, KERNEL_IMAGE_SIZE) on 32-bit. + * * The mem_avoid array is used to store the ranges that need to be avoided * when KASLR searches for an appropriate random address. We must avoid any * regions that are unsafe to overlap with during decompression, and other @@ -392,8 +389,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, { unsigned long init_size = boot_params->hdr.init_size; u64 initrd_start, initrd_size; - u64 cmd_line, cmd_line_size; - char *ptr; + unsigned long cmd_line, cmd_line_size; /* * Avoid the region that is unsafe to overlap during @@ -401,8 +397,6 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, */ mem_avoid[MEM_AVOID_ZO_RANGE].start = input; mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input; - add_identity_map(mem_avoid[MEM_AVOID_ZO_RANGE].start, - mem_avoid[MEM_AVOID_ZO_RANGE].size); /* Avoid initrd. */ initrd_start = (u64)boot_params->ext_ramdisk_image << 32; @@ -414,22 +408,17 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, /* No need to set mapping for initrd, it will be handled in VO. */ /* Avoid kernel command line. */ - cmd_line = (u64)boot_params->ext_cmd_line_ptr << 32; - cmd_line |= boot_params->hdr.cmd_line_ptr; + cmd_line = get_cmd_line_ptr(); /* Calculate size of cmd_line. */ - ptr = (char *)(unsigned long)cmd_line; - for (cmd_line_size = 0; ptr[cmd_line_size++];) - ; - mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line; - mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size; - add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start, - mem_avoid[MEM_AVOID_CMDLINE].size); + if (cmd_line) { + cmd_line_size = strnlen((char *)cmd_line, COMMAND_LINE_SIZE-1) + 1; + mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line; + mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size; + } /* Avoid boot parameters. */ mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params; mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params); - add_identity_map(mem_avoid[MEM_AVOID_BOOTPARAMS].start, - mem_avoid[MEM_AVOID_BOOTPARAMS].size); /* We don't need to set a mapping for setup_data. */ @@ -438,11 +427,6 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, /* Enumerate the immovable memory regions */ num_immovable_mem = count_immovable_mem_regions(); - -#ifdef CONFIG_X86_VERBOSE_BOOTUP - /* Make sure video RAM can be used. */ - add_identity_map(0, PMD_SIZE); -#endif } /* @@ -454,7 +438,7 @@ static bool mem_avoid_overlap(struct mem_vector *img, { int i; struct setup_data *ptr; - unsigned long earliest = img->start + img->size; + u64 earliest = img->start + img->size; bool is_overlapping = false; for (i = 0; i < MEM_AVOID_MAX; i++) { @@ -499,18 +483,16 @@ static bool mem_avoid_overlap(struct mem_vector *img, } struct slot_area { - unsigned long addr; - int num; + u64 addr; + unsigned long num; }; #define MAX_SLOT_AREA 100 static struct slot_area slot_areas[MAX_SLOT_AREA]; - +static unsigned int slot_area_index; static unsigned long slot_max; -static unsigned long slot_area_index; - static void store_slot_info(struct mem_vector *region, unsigned long image_size) { struct slot_area slot_area; @@ -519,13 +501,10 @@ static void store_slot_info(struct mem_vector *region, unsigned long image_size) return; slot_area.addr = region->start; - slot_area.num = (region->size - image_size) / - CONFIG_PHYSICAL_ALIGN + 1; + slot_area.num = 1 + (region->size - image_size) / CONFIG_PHYSICAL_ALIGN; - if (slot_area.num > 0) { - slot_areas[slot_area_index++] = slot_area; - slot_max += slot_area.num; - } + slot_areas[slot_area_index++] = slot_area; + slot_max += slot_area.num; } /* @@ -535,57 +514,53 @@ static void store_slot_info(struct mem_vector *region, unsigned long image_size) static void process_gb_huge_pages(struct mem_vector *region, unsigned long image_size) { - unsigned long addr, size = 0; + u64 pud_start, pud_end; + unsigned long gb_huge_pages; struct mem_vector tmp; - int i = 0; - if (!max_gb_huge_pages) { + if (!IS_ENABLED(CONFIG_X86_64) || !max_gb_huge_pages) { store_slot_info(region, image_size); return; } - addr = ALIGN(region->start, PUD_SIZE); - /* Did we raise the address above the passed in memory entry? */ - if (addr < region->start + region->size) - size = region->size - (addr - region->start); - - /* Check how many 1GB huge pages can be filtered out: */ - while (size > PUD_SIZE && max_gb_huge_pages) { - size -= PUD_SIZE; - max_gb_huge_pages--; - i++; - } + /* Are there any 1GB pages in the region? */ + pud_start = ALIGN(region->start, PUD_SIZE); + pud_end = ALIGN_DOWN(region->start + region->size, PUD_SIZE); /* No good 1GB huge pages found: */ - if (!i) { + if (pud_start >= pud_end) { store_slot_info(region, image_size); return; } - /* - * Skip those 'i'*1GB good huge pages, and continue checking and - * processing the remaining head or tail part of the passed region - * if available. - */ - - if (addr >= region->start + image_size) { + /* Check if the head part of the region is usable. */ + if (pud_start >= region->start + image_size) { tmp.start = region->start; - tmp.size = addr - region->start; + tmp.size = pud_start - region->start; store_slot_info(&tmp, image_size); } - size = region->size - (addr - region->start) - i * PUD_SIZE; - if (size >= image_size) { - tmp.start = addr + i * PUD_SIZE; - tmp.size = size; + /* Skip the good 1GB pages. */ + gb_huge_pages = (pud_end - pud_start) >> PUD_SHIFT; + if (gb_huge_pages > max_gb_huge_pages) { + pud_end = pud_start + (max_gb_huge_pages << PUD_SHIFT); + max_gb_huge_pages = 0; + } else { + max_gb_huge_pages -= gb_huge_pages; + } + + /* Check if the tail part of the region is usable. */ + if (region->start + region->size >= pud_end + image_size) { + tmp.start = pud_end; + tmp.size = region->start + region->size - pud_end; store_slot_info(&tmp, image_size); } } -static unsigned long slots_fetch_random(void) +static u64 slots_fetch_random(void) { unsigned long slot; - int i; + unsigned int i; /* Handle case of no slots stored. */ if (slot_max == 0) @@ -598,7 +573,7 @@ static unsigned long slots_fetch_random(void) slot -= slot_areas[i].num; continue; } - return slot_areas[i].addr + slot * CONFIG_PHYSICAL_ALIGN; + return slot_areas[i].addr + ((u64)slot * CONFIG_PHYSICAL_ALIGN); } if (i == slot_area_index) @@ -611,49 +586,23 @@ static void __process_mem_region(struct mem_vector *entry, unsigned long image_size) { struct mem_vector region, overlap; - unsigned long start_orig, end; - struct mem_vector cur_entry; + u64 region_end; - /* On 32-bit, ignore entries entirely above our maximum. */ - if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE) - return; - - /* Ignore entries entirely below our minimum. */ - if (entry->start + entry->size < minimum) - return; - - /* Ignore entries above memory limit */ - end = min(entry->size + entry->start, mem_limit); - if (entry->start >= end) - return; - cur_entry.start = entry->start; - cur_entry.size = end - entry->start; - - region.start = cur_entry.start; - region.size = cur_entry.size; + /* Enforce minimum and memory limit. */ + region.start = max_t(u64, entry->start, minimum); + region_end = min(entry->start + entry->size, mem_limit); /* Give up if slot area array is full. */ while (slot_area_index < MAX_SLOT_AREA) { - start_orig = region.start; - - /* Potentially raise address to minimum location. */ - if (region.start < minimum) - region.start = minimum; - /* Potentially raise address to meet alignment needs. */ region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); /* Did we raise the address above the passed in memory entry? */ - if (region.start > cur_entry.start + cur_entry.size) + if (region.start > region_end) return; /* Reduce size by any delta from the original address. */ - region.size -= region.start - start_orig; - - /* On 32-bit, reduce region size to fit within max size. */ - if (IS_ENABLED(CONFIG_X86_32) && - region.start + region.size > KERNEL_IMAGE_SIZE) - region.size = KERNEL_IMAGE_SIZE - region.start; + region.size = region_end - region.start; /* Return if region can't contain decompressed kernel */ if (region.size < image_size) @@ -666,27 +615,19 @@ static void __process_mem_region(struct mem_vector *entry, } /* Store beginning of region if holds at least image_size. */ - if (overlap.start > region.start + image_size) { - struct mem_vector beginning; - - beginning.start = region.start; - beginning.size = overlap.start - region.start; - process_gb_huge_pages(&beginning, image_size); + if (overlap.start >= region.start + image_size) { + region.size = overlap.start - region.start; + process_gb_huge_pages(®ion, image_size); } - /* Return if overlap extends to or past end of region. */ - if (overlap.start + overlap.size >= region.start + region.size) - return; - /* Clip off the overlapping region and start over. */ - region.size -= overlap.start - region.start + overlap.size; region.start = overlap.start + overlap.size; } } static bool process_mem_region(struct mem_vector *region, - unsigned long long minimum, - unsigned long long image_size) + unsigned long minimum, + unsigned long image_size) { int i; /* @@ -709,7 +650,7 @@ static bool process_mem_region(struct mem_vector *region, * immovable memory and @region. */ for (i = 0; i < num_immovable_mem; i++) { - unsigned long long start, end, entry_end, region_end; + u64 start, end, entry_end, region_end; struct mem_vector entry; if (!mem_overlaps(region, &immovable_mem[i])) @@ -736,8 +677,8 @@ static bool process_mem_region(struct mem_vector *region, #ifdef CONFIG_EFI /* - * Returns true if mirror region found (and must have been processed - * for slots adding) + * Returns true if we processed the EFI memmap, which we prefer over the E820 + * table if it is available. */ static bool process_efi_entries(unsigned long minimum, unsigned long image_size) @@ -839,20 +780,30 @@ static void process_e820_entries(unsigned long minimum, static unsigned long find_random_phys_addr(unsigned long minimum, unsigned long image_size) { + u64 phys_addr; + + /* Bail out early if it's impossible to succeed. */ + if (minimum + image_size > mem_limit) + return 0; + /* Check if we had too many memmaps. */ if (memmap_too_large) { debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n"); return 0; } - /* Make sure minimum is aligned. */ - minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); + if (!process_efi_entries(minimum, image_size)) + process_e820_entries(minimum, image_size); + + phys_addr = slots_fetch_random(); - if (process_efi_entries(minimum, image_size)) - return slots_fetch_random(); + /* Perform a final check to make sure the address is in range. */ + if (phys_addr < minimum || phys_addr + image_size > mem_limit) { + warn("Invalid physical address chosen!\n"); + return 0; + } - process_e820_entries(minimum, image_size); - return slots_fetch_random(); + return (unsigned long)phys_addr; } static unsigned long find_random_virt_addr(unsigned long minimum, @@ -860,18 +811,12 @@ static unsigned long find_random_virt_addr(unsigned long minimum, { unsigned long slots, random_addr; - /* Make sure minimum is aligned. */ - minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); - /* Align image_size for easy slot calculations. */ - image_size = ALIGN(image_size, CONFIG_PHYSICAL_ALIGN); - /* * There are how many CONFIG_PHYSICAL_ALIGN-sized slots * that can hold image_size within the range of minimum to * KERNEL_IMAGE_SIZE? */ - slots = (KERNEL_IMAGE_SIZE - minimum - image_size) / - CONFIG_PHYSICAL_ALIGN + 1; + slots = 1 + (KERNEL_IMAGE_SIZE - minimum - image_size) / CONFIG_PHYSICAL_ALIGN; random_addr = kaslr_get_random_long("Virtual") % slots; @@ -905,8 +850,10 @@ void choose_random_location(unsigned long input, boot_params->hdr.loadflags |= KASLR_FLAG; - /* Prepare to add new identity pagetables on demand. */ - initialize_identity_maps(); + if (IS_ENABLED(CONFIG_X86_32)) + mem_limit = KERNEL_IMAGE_SIZE; + else + mem_limit = MAXMEM; /* Record the various known unsafe memory ranges. */ mem_avoid_init(input, input_size, *output); @@ -917,6 +864,8 @@ void choose_random_location(unsigned long input, * location: */ min_addr = min(*output, 512UL << 20); + /* Make sure minimum is aligned. */ + min_addr = ALIGN(min_addr, CONFIG_PHYSICAL_ALIGN); /* Walk available memory entries to find a random address. */ random_addr = find_random_phys_addr(min_addr, output_size); @@ -924,19 +873,8 @@ void choose_random_location(unsigned long input, warn("Physical KASLR disabled: no suitable memory region!"); } else { /* Update the new physical address location. */ - if (*output != random_addr) { - add_identity_map(random_addr, output_size); + if (*output != random_addr) *output = random_addr; - } - - /* - * This loads the identity mapping page table. - * This should only be done if a new physical address - * is found for the kernel, otherwise we should keep - * the old page table to make it be like the "nokaslr" - * case. - */ - finalize_identity_maps(); } diff --git a/arch/x86/boot/compressed/kaslr_64.c b/arch/x86/boot/compressed/kaslr_64.c deleted file mode 100644 index f9c5c13d979b..000000000000 --- a/arch/x86/boot/compressed/kaslr_64.c +++ /dev/null @@ -1,153 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * This code is used on x86_64 to create page table identity mappings on - * demand by building up a new set of page tables (or appending to the - * existing ones), and then switching over to them when ready. - * - * Copyright (C) 2015-2016 Yinghai Lu - * Copyright (C) 2016 Kees Cook - */ - -/* - * Since we're dealing with identity mappings, physical and virtual - * addresses are the same, so override these defines which are ultimately - * used by the headers in misc.h. - */ -#define __pa(x) ((unsigned long)(x)) -#define __va(x) ((void *)((unsigned long)(x))) - -/* No PAGE_TABLE_ISOLATION support needed either: */ -#undef CONFIG_PAGE_TABLE_ISOLATION - -#include "misc.h" - -/* These actually do the work of building the kernel identity maps. */ -#include <linux/pgtable.h> -#include <asm/init.h> -/* Use the static base for this part of the boot process */ -#undef __PAGE_OFFSET -#define __PAGE_OFFSET __PAGE_OFFSET_BASE -#include "../../mm/ident_map.c" - -/* Used to track our page table allocation area. */ -struct alloc_pgt_data { - unsigned char *pgt_buf; - unsigned long pgt_buf_size; - unsigned long pgt_buf_offset; -}; - -/* - * Allocates space for a page table entry, using struct alloc_pgt_data - * above. Besides the local callers, this is used as the allocation - * callback in mapping_info below. - */ -static void *alloc_pgt_page(void *context) -{ - struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context; - unsigned char *entry; - - /* Validate there is space available for a new page. */ - if (pages->pgt_buf_offset >= pages->pgt_buf_size) { - debug_putstr("out of pgt_buf in " __FILE__ "!?\n"); - debug_putaddr(pages->pgt_buf_offset); - debug_putaddr(pages->pgt_buf_size); - return NULL; - } - - entry = pages->pgt_buf + pages->pgt_buf_offset; - pages->pgt_buf_offset += PAGE_SIZE; - - return entry; -} - -/* Used to track our allocated page tables. */ -static struct alloc_pgt_data pgt_data; - -/* The top level page table entry pointer. */ -static unsigned long top_level_pgt; - -phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; - -/* - * Mapping information structure passed to kernel_ident_mapping_init(). - * Due to relocation, pointers must be assigned at run time not build time. - */ -static struct x86_mapping_info mapping_info; - -/* Locates and clears a region for a new top level page table. */ -void initialize_identity_maps(void) -{ - /* If running as an SEV guest, the encryption mask is required. */ - set_sev_encryption_mask(); - - /* Exclude the encryption mask from __PHYSICAL_MASK */ - physical_mask &= ~sme_me_mask; - - /* Init mapping_info with run-time function/buffer pointers. */ - mapping_info.alloc_pgt_page = alloc_pgt_page; - mapping_info.context = &pgt_data; - mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask; - mapping_info.kernpg_flag = _KERNPG_TABLE; - - /* - * It should be impossible for this not to already be true, - * but since calling this a second time would rewind the other - * counters, let's just make sure this is reset too. - */ - pgt_data.pgt_buf_offset = 0; - - /* - * If we came here via startup_32(), cr3 will be _pgtable already - * and we must append to the existing area instead of entirely - * overwriting it. - * - * With 5-level paging, we use '_pgtable' to allocate the p4d page table, - * the top-level page table is allocated separately. - * - * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level - * cases. On 4-level paging it's equal to 'top_level_pgt'. - */ - top_level_pgt = read_cr3_pa(); - if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) { - debug_putstr("booted via startup_32()\n"); - pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; - pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE; - memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); - } else { - debug_putstr("booted via startup_64()\n"); - pgt_data.pgt_buf = _pgtable; - pgt_data.pgt_buf_size = BOOT_PGT_SIZE; - memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); - top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data); - } -} - -/* - * Adds the specified range to what will become the new identity mappings. - * Once all ranges have been added, the new mapping is activated by calling - * finalize_identity_maps() below. - */ -void add_identity_map(unsigned long start, unsigned long size) -{ - unsigned long end = start + size; - - /* Align boundary to 2M. */ - start = round_down(start, PMD_SIZE); - end = round_up(end, PMD_SIZE); - if (start >= end) - return; - - /* Build the mapping. */ - kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt, - start, end); -} - -/* - * This switches the page tables to the new level4 that has been built - * via calls to add_identity_map() above. If booted via startup_32(), - * this is effectively a no-op. - */ -void finalize_identity_maps(void) -{ - write_cr3(top_level_pgt); -} diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index e478e40fbe5a..267e7f93050e 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -442,6 +442,13 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, parse_elf(output); handle_relocations(output, output_len, virt_addr); debug_putstr("done.\nBooting the kernel.\n"); + + /* + * Flush GHCB from cache and map it encrypted again when running as + * SEV-ES guest. + */ + sev_es_shutdown_ghcb(); + return output; } diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 726e264410ff..6d31f1b4c4d1 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -23,6 +23,7 @@ #include <asm/page.h> #include <asm/boot.h> #include <asm/bootparam.h> +#include <asm/desc_defs.h> #define BOOT_CTYPE_H #include <linux/acpi.h> @@ -36,6 +37,9 @@ #define memptr unsigned #endif +/* boot/compressed/vmlinux start and end markers */ +extern char _head[], _end[]; + /* misc.c */ extern memptr free_mem_ptr; extern memptr free_mem_end_ptr; @@ -70,8 +74,8 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize); int cmdline_find_option_bool(const char *option); struct mem_vector { - unsigned long long start; - unsigned long long size; + u64 start; + u64 size; }; #if CONFIG_RANDOMIZE_BASE @@ -81,8 +85,6 @@ void choose_random_location(unsigned long input, unsigned long *output, unsigned long output_size, unsigned long *virt_addr); -/* cpuflags.c */ -bool has_cpuflag(int flag); #else static inline void choose_random_location(unsigned long input, unsigned long input_size, @@ -93,18 +95,14 @@ static inline void choose_random_location(unsigned long input, } #endif +/* cpuflags.c */ +bool has_cpuflag(int flag); + #ifdef CONFIG_X86_64 -void initialize_identity_maps(void); -void add_identity_map(unsigned long start, unsigned long size); -void finalize_identity_maps(void); +extern int set_page_decrypted(unsigned long address); +extern int set_page_encrypted(unsigned long address); +extern int set_page_non_present(unsigned long address); extern unsigned char _pgtable[]; -#else -static inline void initialize_identity_maps(void) -{ } -static inline void add_identity_map(unsigned long start, unsigned long size) -{ } -static inline void finalize_identity_maps(void) -{ } #endif #ifdef CONFIG_EARLY_PRINTK @@ -119,6 +117,17 @@ static inline void console_init(void) void set_sev_encryption_mask(void); +#ifdef CONFIG_AMD_MEM_ENCRYPT +void sev_es_shutdown_ghcb(void); +extern bool sev_es_check_ghcb_fault(unsigned long address); +#else +static inline void sev_es_shutdown_ghcb(void) { } +static inline bool sev_es_check_ghcb_fault(unsigned long address) +{ + return false; +} +#endif + /* acpi.c */ #ifdef CONFIG_ACPI acpi_physical_address get_rsdp_addr(void); @@ -133,4 +142,21 @@ int count_immovable_mem_regions(void); static inline int count_immovable_mem_regions(void) { return 0; } #endif +/* ident_map_64.c */ +#ifdef CONFIG_X86_5LEVEL +extern unsigned int __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d; +#endif + +/* Used by PAGE_KERN* macros: */ +extern pteval_t __default_kernel_pte_mask; + +/* idt_64.c */ +extern gate_desc boot_idt[BOOT_IDT_ENTRIES]; +extern struct desc_ptr boot_idt_desc; + +/* IDT Entry Points */ +void boot_page_fault(void); +void boot_stage1_vc(void); +void boot_stage2_vc(void); + #endif /* BOOT_COMPRESSED_MISC_H */ diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c index 7e01248765b2..52aa56cdbacc 100644 --- a/arch/x86/boot/compressed/mkpiggy.c +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -60,6 +60,12 @@ int main(int argc, char *argv[]) printf(".incbin \"%s\"\n", argv[1]); printf("input_data_end:\n"); + printf(".section \".rodata\",\"a\",@progbits\n"); + printf(".globl input_len\n"); + printf("input_len:\n\t.long %lu\n", ilen); + printf(".globl output_len\n"); + printf("output_len:\n\t.long %lu\n", (unsigned long)olen); + retval = 0; bail: if (f) diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index c8862696a47b..7d0394f4ebf9 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -5,15 +5,6 @@ #include "pgtable.h" #include "../string.h" -/* - * __force_order is used by special_insns.h asm code to force instruction - * serialization. - * - * It is not referenced from the code, but GCC < 5 with -fPIE would fail - * due to an undefined symbol. Define it to make these ancient GCCs work. - */ -unsigned long __force_order; - #define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */ #define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */ diff --git a/arch/x86/boot/compressed/sev-es.c b/arch/x86/boot/compressed/sev-es.c new file mode 100644 index 000000000000..954cb2702e23 --- /dev/null +++ b/arch/x86/boot/compressed/sev-es.c @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Encrypted Register State Support + * + * Author: Joerg Roedel <jroedel@suse.de> + */ + +/* + * misc.h needs to be first because it knows how to include the other kernel + * headers in the pre-decompression code in a way that does not break + * compilation. + */ +#include "misc.h" + +#include <asm/pgtable_types.h> +#include <asm/sev-es.h> +#include <asm/trapnr.h> +#include <asm/trap_pf.h> +#include <asm/msr-index.h> +#include <asm/fpu/xcr.h> +#include <asm/ptrace.h> +#include <asm/svm.h> + +#include "error.h" + +struct ghcb boot_ghcb_page __aligned(PAGE_SIZE); +struct ghcb *boot_ghcb; + +/* + * Copy a version of this function here - insn-eval.c can't be used in + * pre-decompression code. + */ +static bool insn_has_rep_prefix(struct insn *insn) +{ + int i; + + insn_get_prefixes(insn); + + for (i = 0; i < insn->prefixes.nbytes; i++) { + insn_byte_t p = insn->prefixes.bytes[i]; + + if (p == 0xf2 || p == 0xf3) + return true; + } + + return false; +} + +/* + * Only a dummy for insn_get_seg_base() - Early boot-code is 64bit only and + * doesn't use segments. + */ +static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx) +{ + return 0UL; +} + +static inline u64 sev_es_rd_ghcb_msr(void) +{ + unsigned long low, high; + + asm volatile("rdmsr" : "=a" (low), "=d" (high) : + "c" (MSR_AMD64_SEV_ES_GHCB)); + + return ((high << 32) | low); +} + +static inline void sev_es_wr_ghcb_msr(u64 val) +{ + u32 low, high; + + low = val & 0xffffffffUL; + high = val >> 32; + + asm volatile("wrmsr" : : "c" (MSR_AMD64_SEV_ES_GHCB), + "a"(low), "d" (high) : "memory"); +} + +static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + enum es_result ret; + + memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); + + insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE, 1); + insn_get_length(&ctxt->insn); + + ret = ctxt->insn.immediate.got ? ES_OK : ES_DECODE_FAILED; + + return ret; +} + +static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, + void *dst, char *buf, size_t size) +{ + memcpy(dst, buf, size); + + return ES_OK; +} + +static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, + void *src, char *buf, size_t size) +{ + memcpy(buf, src, size); + + return ES_OK; +} + +#undef __init +#undef __pa +#define __init +#define __pa(x) ((unsigned long)(x)) + +#define __BOOT_COMPRESSED + +/* Basic instruction decoding support needed */ +#include "../../lib/inat.c" +#include "../../lib/insn.c" + +/* Include code for early handlers */ +#include "../../kernel/sev-es-shared.c" + +static bool early_setup_sev_es(void) +{ + if (!sev_es_negotiate_protocol()) + sev_es_terminate(GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED); + + if (set_page_decrypted((unsigned long)&boot_ghcb_page)) + return false; + + /* Page is now mapped decrypted, clear it */ + memset(&boot_ghcb_page, 0, sizeof(boot_ghcb_page)); + + boot_ghcb = &boot_ghcb_page; + + /* Initialize lookup tables for the instruction decoder */ + inat_init_tables(); + + return true; +} + +void sev_es_shutdown_ghcb(void) +{ + if (!boot_ghcb) + return; + + if (!sev_es_check_cpu_features()) + error("SEV-ES CPU Features missing."); + + /* + * GHCB Page must be flushed from the cache and mapped encrypted again. + * Otherwise the running kernel will see strange cache effects when + * trying to use that page. + */ + if (set_page_encrypted((unsigned long)&boot_ghcb_page)) + error("Can't map GHCB page encrypted"); + + /* + * GHCB page is mapped encrypted again and flushed from the cache. + * Mark it non-present now to catch bugs when #VC exceptions trigger + * after this point. + */ + if (set_page_non_present((unsigned long)&boot_ghcb_page)) + error("Can't unmap GHCB page"); +} + +bool sev_es_check_ghcb_fault(unsigned long address) +{ + /* Check whether the fault was on the GHCB page */ + return ((address & PAGE_MASK) == (unsigned long)&boot_ghcb_page); +} + +void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) +{ + struct es_em_ctxt ctxt; + enum es_result result; + + if (!boot_ghcb && !early_setup_sev_es()) + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + + vc_ghcb_invalidate(boot_ghcb); + result = vc_init_em_ctxt(&ctxt, regs, exit_code); + if (result != ES_OK) + goto finish; + + switch (exit_code) { + case SVM_EXIT_RDTSC: + case SVM_EXIT_RDTSCP: + result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code); + break; + case SVM_EXIT_IOIO: + result = vc_handle_ioio(boot_ghcb, &ctxt); + break; + case SVM_EXIT_CPUID: + result = vc_handle_cpuid(boot_ghcb, &ctxt); + break; + default: + result = ES_UNSUPPORTED; + break; + } + +finish: + if (result == ES_OK) { + vc_finish_insn(&ctxt); + } else if (result != ES_RETRY) { + /* + * For now, just halt the machine. That makes debugging easier, + * later we just call sev_es_terminate() here. + */ + while (true) + asm volatile("hlt\n"); + } +} diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index 8f1025d1f681..112b2375d021 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -42,12 +42,6 @@ SECTIONS *(.rodata.*) _erodata = . ; } - .got : { - _got = .; - KEEP(*(.got.plt)) - KEEP(*(.got)) - _egot = .; - } .data : { _data = . ; *(.data) @@ -75,5 +69,49 @@ SECTIONS . = ALIGN(PAGE_SIZE); /* keep ZO size page aligned */ _end = .; + STABS_DEBUG + DWARF_DEBUG + ELF_DETAILS + DISCARDS + /DISCARD/ : { + *(.dynamic) *(.dynsym) *(.dynstr) *(.dynbss) + *(.hash) *(.gnu.hash) + *(.note.*) + } + + .got.plt (INFO) : { + *(.got.plt) + } + ASSERT(SIZEOF(.got.plt) == 0 || +#ifdef CONFIG_X86_64 + SIZEOF(.got.plt) == 0x18, +#else + SIZEOF(.got.plt) == 0xc, +#endif + "Unexpected GOT/PLT entries detected!") + + /* + * Sections that should stay zero sized, which is safer to + * explicitly check instead of blindly discarding. + */ + .got : { + *(.got) + } + ASSERT(SIZEOF(.got) == 0, "Unexpected GOT entries detected!") + + .plt : { + *(.plt) *(.plt.*) + } + ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!") + + .rel.dyn : { + *(.rel.*) *(.rel_*) + } + ASSERT(SIZEOF(.rel.dyn) == 0, "Unexpected run-time relocations (.rel) detected!") + + .rela.dyn : { + *(.rela.*) *(.rela_*) + } + ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!") } |