diff options
author | Alexander Gordeev <agordeev@linux.ibm.com> | 2022-12-13 13:35:11 +0300 |
---|---|---|
committer | Heiko Carstens <hca@linux.ibm.com> | 2023-01-13 16:15:05 +0300 |
commit | bb1520d581a3a46e2d6e12bb74604ace33404de5 (patch) | |
tree | e30707db7e1375a9b4bb8bd40102e57aee8c968a /arch/s390/boot | |
parent | bd50b7436217b4123911c2bca1efd74718654f06 (diff) | |
download | linux-bb1520d581a3a46e2d6e12bb74604ace33404de5.tar.xz |
s390/mm: start kernel with DAT enabled
The setup of the kernel virtual address space is spread
throughout the sources, boot stages and config options
like this:
1. The available physical memory regions are queried
and stored as mem_detect information for later use
in the decompressor.
2. Based on the physical memory availability the virtual
memory layout is established in the decompressor;
3. If CONFIG_KASAN is disabled the kernel paging setup
code populates kernel pgtables and turns DAT mode on.
It uses the information stored at step [1].
4. If CONFIG_KASAN is enabled the kernel early boot
kasan setup populates kernel pgtables and turns DAT
mode on. It uses the information stored at step [1].
The kasan setup creates early_pg_dir directory and
directly overwrites swapper_pg_dir entries to make
shadow memory pages available.
Move the kernel virtual memory setup to the decompressor
and start the kernel with DAT turned on right from the
very first istruction. That completely eliminates the
boot phase when the kernel runs in DAT-off mode, simplies
the overall design and consolidates pgtables setup.
The identity mapping is created in the decompressor, while
kasan shadow mappings are still created by the early boot
kernel code.
Share with decompressor the existing kasan memory allocator.
It decreases the size of a newly requested memory block from
pgalloc_pos and ensures that kernel image is not overwritten.
pgalloc_low and pgalloc_pos pointers are made preserved boot
variables for that.
Use the bootdata infrastructure to setup swapper_pg_dir
and invalid_pg_dir directories used by the kernel later.
The interim early_pg_dir directory established by the
kasan initialization code gets eliminated as result.
As the kernel runs in DAT-on mode only the PSW_KERNEL_BITS
define gets PSW_MASK_DAT bit by default. Additionally, the
setup_lowcore_dat_off() and setup_lowcore_dat_on() routines
get merged, since there is no DAT-off mode stage anymore.
The memory mappings are created with RW+X protection that
allows the early boot code setting up all necessary data
and services for the kernel being booted. Just before the
paging is enabled the memory protection is changed to
RO+X for text, RO+NX for read-only data and RW+NX for
kernel data and the identity mapping.
Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Diffstat (limited to 'arch/s390/boot')
-rw-r--r-- | arch/s390/boot/Makefile | 2 | ||||
-rw-r--r-- | arch/s390/boot/boot.h | 6 | ||||
-rw-r--r-- | arch/s390/boot/startup.c | 45 | ||||
-rw-r--r-- | arch/s390/boot/vmem.c | 254 |
4 files changed, 298 insertions, 9 deletions
diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index d52c3e2e16bc..47a397da0498 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -35,7 +35,7 @@ endif CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char -obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o +obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o vmem.o obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o obj-y += version.o pgm_check_info.o ctype.o ipl_data.o machine_kexec_reloc.o obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h index 286441cf3bf0..547614496e7e 100644 --- a/arch/s390/boot/boot.h +++ b/arch/s390/boot/boot.h @@ -16,7 +16,7 @@ struct machine_info { struct vmlinux_info { unsigned long default_lma; - void (*entry)(void); + unsigned long entry; unsigned long image_size; /* does not include .bss */ unsigned long bss_size; /* uncompressed image .bss size */ unsigned long bootdata_off; @@ -27,6 +27,9 @@ struct vmlinux_info { unsigned long rela_dyn_start; unsigned long rela_dyn_end; unsigned long amode31_size; + unsigned long init_mm_off; + unsigned long swapper_pg_dir_off; + unsigned long invalid_pg_dir_off; }; void startup_kernel(void); @@ -41,6 +44,7 @@ void print_missing_facilities(void); void sclp_early_setup_buffer(void); void print_pgm_check_info(void); unsigned long get_random_base(unsigned long safe_addr); +void setup_vmem(unsigned long online_end, unsigned long asce_limit); void __printf(1, 2) decompressor_printk(const char *fmt, ...); void error(char *m); diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index da6ee587fe9a..c5d59df9fa62 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -11,6 +11,7 @@ #include <asm/diag.h> #include <asm/uv.h> #include <asm/abs_lowcore.h> +#include <asm/mem_detect.h> #include "decompressor.h" #include "boot.h" #include "uv.h" @@ -166,9 +167,10 @@ static void setup_ident_map_size(unsigned long max_physmem_end) #endif } -static void setup_kernel_memory_layout(void) +static unsigned long setup_kernel_memory_layout(void) { unsigned long vmemmap_start; + unsigned long asce_limit; unsigned long rte_size; unsigned long pages; unsigned long vmax; @@ -183,10 +185,10 @@ static void setup_kernel_memory_layout(void) vmalloc_size > _REGION2_SIZE || vmemmap_start + vmemmap_size + vmalloc_size + MODULES_LEN > _REGION2_SIZE) { - vmax = _REGION1_SIZE; + asce_limit = _REGION1_SIZE; rte_size = _REGION2_SIZE; } else { - vmax = _REGION2_SIZE; + asce_limit = _REGION2_SIZE; rte_size = _REGION3_SIZE; } /* @@ -194,7 +196,7 @@ static void setup_kernel_memory_layout(void) * secure storage limit, so that any vmalloc allocation * we do could be used to back secure guest storage. */ - vmax = adjust_to_uv_max(vmax); + vmax = adjust_to_uv_max(asce_limit); #ifdef CONFIG_KASAN /* force vmalloc and modules below kasan shadow */ vmax = min(vmax, KASAN_SHADOW_START); @@ -223,6 +225,8 @@ static void setup_kernel_memory_layout(void) /* make sure vmemmap doesn't overlay with vmalloc area */ VMALLOC_START = max(vmemmap_start + vmemmap_size, VMALLOC_START); vmemmap = (struct page *)vmemmap_start; + + return asce_limit; } /* @@ -256,6 +260,9 @@ static void offset_vmlinux_info(unsigned long offset) vmlinux.rela_dyn_start += offset; vmlinux.rela_dyn_end += offset; vmlinux.dynsym_start += offset; + vmlinux.init_mm_off += offset; + vmlinux.swapper_pg_dir_off += offset; + vmlinux.invalid_pg_dir_off += offset; } static unsigned long reserve_amode31(unsigned long safe_addr) @@ -268,7 +275,10 @@ void startup_kernel(void) { unsigned long random_lma; unsigned long safe_addr; + unsigned long asce_limit; + unsigned long online_end; void *img; + psw_t psw; detect_facilities(); @@ -290,7 +300,8 @@ void startup_kernel(void) sanitize_prot_virt_host(); setup_ident_map_size(detect_memory()); setup_vmalloc_size(); - setup_kernel_memory_layout(); + asce_limit = setup_kernel_memory_layout(); + online_end = min(get_mem_detect_end(), ident_map_size); if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_enabled) { random_lma = get_random_base(safe_addr); @@ -307,9 +318,23 @@ void startup_kernel(void) } else if (__kaslr_offset) memcpy((void *)vmlinux.default_lma, img, vmlinux.image_size); + /* + * The order of the following operations is important: + * + * - handle_relocs() must follow clear_bss_section() to establish static + * memory references to data in .bss to be used by setup_vmem() + * (i.e init_mm.pgd) + * + * - setup_vmem() must follow handle_relocs() to be able using + * static memory references to data in .bss (i.e init_mm.pgd) + * + * - copy_bootdata() must follow setup_vmem() to propagate changes to + * bootdata made by setup_vmem() + */ clear_bss_section(); - copy_bootdata(); handle_relocs(__kaslr_offset); + setup_vmem(online_end, asce_limit); + copy_bootdata(); if (__kaslr_offset) { /* @@ -321,5 +346,11 @@ void startup_kernel(void) if (IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED)) memset(img, 0, vmlinux.image_size); } - vmlinux.entry(); + + /* + * Jump to the decompressed kernel entry point and switch DAT mode on. + */ + psw.addr = vmlinux.entry; + psw.mask = PSW_KERNEL_BITS; + __load_psw(psw); } diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c new file mode 100644 index 000000000000..db1469c17289 --- /dev/null +++ b/arch/s390/boot/vmem.c @@ -0,0 +1,254 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/sched/task.h> +#include <linux/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/facility.h> +#include <asm/sections.h> +#include <asm/mem_detect.h> +#include "decompressor.h" +#include "boot.h" + +#define init_mm (*(struct mm_struct *)vmlinux.init_mm_off) +#define swapper_pg_dir vmlinux.swapper_pg_dir_off +#define invalid_pg_dir vmlinux.invalid_pg_dir_off + +unsigned long __bootdata_preserved(s390_invalid_asce); +unsigned long __bootdata(pgalloc_pos); +unsigned long __bootdata(pgalloc_end); +unsigned long __bootdata(pgalloc_low); + +static void boot_check_oom(void) +{ + if (pgalloc_pos < pgalloc_low) + error("out of memory on boot\n"); +} + +static void pgtable_populate_begin(unsigned long online_end) +{ + unsigned long initrd_end; + unsigned long kernel_end; + + kernel_end = vmlinux.default_lma + vmlinux.image_size + vmlinux.bss_size; + pgalloc_low = round_up(kernel_end, PAGE_SIZE); + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD)) { + initrd_end = round_up(initrd_data.start + initrd_data.size, _SEGMENT_SIZE); + pgalloc_low = max(pgalloc_low, initrd_end); + } + + pgalloc_end = round_down(online_end, PAGE_SIZE); + pgalloc_pos = pgalloc_end; + + boot_check_oom(); +} + +static void *boot_alloc_pages(unsigned int order) +{ + unsigned long size = PAGE_SIZE << order; + + pgalloc_pos -= size; + pgalloc_pos = round_down(pgalloc_pos, size); + + boot_check_oom(); + + return (void *)pgalloc_pos; +} + +static void *boot_crst_alloc(unsigned long val) +{ + unsigned long *table; + + table = boot_alloc_pages(CRST_ALLOC_ORDER); + if (table) + crst_table_init(table, val); + return table; +} + +static pte_t *boot_pte_alloc(void) +{ + static void *pte_leftover; + pte_t *pte; + + BUILD_BUG_ON(_PAGE_TABLE_SIZE * 2 != PAGE_SIZE); + + if (!pte_leftover) { + pte_leftover = boot_alloc_pages(0); + pte = pte_leftover + _PAGE_TABLE_SIZE; + } else { + pte = pte_leftover; + pte_leftover = NULL; + } + memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); + return pte; +} + +static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end) +{ + return machine.has_edat2 && + IS_ALIGNED(addr, PUD_SIZE) && (end - addr) >= PUD_SIZE; +} + +static bool can_large_pmd(pmd_t *pm_dir, unsigned long addr, unsigned long end) +{ + return machine.has_edat1 && + IS_ALIGNED(addr, PMD_SIZE) && (end - addr) >= PMD_SIZE; +} + +static void pgtable_pte_populate(pmd_t *pmd, unsigned long addr, unsigned long end) +{ + unsigned long next; + pte_t *pte, entry; + + pte = pte_offset_kernel(pmd, addr); + for (; addr < end; addr += PAGE_SIZE, pte++) { + if (pte_none(*pte)) { + entry = __pte(__pa(addr)); + entry = set_pte_bit(entry, PAGE_KERNEL_EXEC); + set_pte(pte, entry); + } + } +} + +static void pgtable_pmd_populate(pud_t *pud, unsigned long addr, unsigned long end) +{ + unsigned long next; + pmd_t *pmd, entry; + pte_t *pte; + + pmd = pmd_offset(pud, addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + if (pmd_none(*pmd)) { + if (can_large_pmd(pmd, addr, next)) { + entry = __pmd(__pa(addr)); + entry = set_pmd_bit(entry, SEGMENT_KERNEL_EXEC); + set_pmd(pmd, entry); + continue; + } + pte = boot_pte_alloc(); + pmd_populate(&init_mm, pmd, pte); + } else if (pmd_large(*pmd)) { + continue; + } + pgtable_pte_populate(pmd, addr, next); + } +} + +static void pgtable_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long end) +{ + unsigned long next; + pud_t *pud, entry; + pmd_t *pmd; + + pud = pud_offset(p4d, addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + if (pud_none(*pud)) { + if (can_large_pud(pud, addr, next)) { + entry = __pud(__pa(addr)); + entry = set_pud_bit(entry, REGION3_KERNEL_EXEC); + set_pud(pud, entry); + continue; + } + pmd = boot_crst_alloc(_SEGMENT_ENTRY_EMPTY); + pud_populate(&init_mm, pud, pmd); + } else if (pud_large(*pud)) { + continue; + } + pgtable_pmd_populate(pud, addr, next); + } +} + +static void pgtable_p4d_populate(pgd_t *pgd, unsigned long addr, unsigned long end) +{ + unsigned long next; + p4d_t *p4d; + pud_t *pud; + + p4d = p4d_offset(pgd, addr); + for (; addr < end; addr = next, p4d++) { + next = p4d_addr_end(addr, end); + if (p4d_none(*p4d)) { + pud = boot_crst_alloc(_REGION3_ENTRY_EMPTY); + p4d_populate(&init_mm, p4d, pud); + } + pgtable_pud_populate(p4d, addr, next); + } +} + +static void pgtable_populate(unsigned long addr, unsigned long end) +{ + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + + pgd = pgd_offset(&init_mm, addr); + for (; addr < end; addr = next, pgd++) { + next = pgd_addr_end(addr, end); + if (pgd_none(*pgd)) { + p4d = boot_crst_alloc(_REGION2_ENTRY_EMPTY); + pgd_populate(&init_mm, pgd, p4d); + } + pgtable_p4d_populate(pgd, addr, next); + } +} + +/* + * The pgtables are located in the range [pgalloc_pos, pgalloc_end). + * That range must stay intact and is later reserved in the memblock. + * Therefore pgtable_populate(pgalloc_pos, pgalloc_end) is needed to + * finalize pgalloc_pos pointer. However that call can decrease the + * value of pgalloc_pos pointer itself. Therefore, pgtable_populate() + * needs to be called repeatedly until pgtables are complete and + * pgalloc_pos does not grow left anymore. + */ +static void pgtable_populate_end(void) +{ + unsigned long pgalloc_end_curr = pgalloc_end; + unsigned long pgalloc_pos_prev; + + do { + pgalloc_pos_prev = pgalloc_pos; + pgtable_populate(pgalloc_pos, pgalloc_end_curr); + pgalloc_end_curr = pgalloc_pos_prev; + } while (pgalloc_pos < pgalloc_pos_prev); +} + +void setup_vmem(unsigned long online_end, unsigned long asce_limit) +{ + unsigned long asce_type; + unsigned long asce_bits; + + if (asce_limit == _REGION1_SIZE) { + asce_type = _REGION2_ENTRY_EMPTY; + asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; + } else { + asce_type = _REGION3_ENTRY_EMPTY; + asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + } + s390_invalid_asce = invalid_pg_dir | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + + crst_table_init((unsigned long *)swapper_pg_dir, asce_type); + crst_table_init((unsigned long *)invalid_pg_dir, _REGION3_ENTRY_EMPTY); + + /* + * To allow prefixing the lowcore must be mapped with 4KB pages. + * To prevent creation of a large page at address 0 first map + * the lowcore and create the identity mapping only afterwards. + * + * No further pgtable_populate() calls are allowed after the value + * of pgalloc_pos finalized with a call to pgtable_populate_end(). + */ + pgtable_populate_begin(online_end); + pgtable_populate(0, sizeof(struct lowcore)); + pgtable_populate(0, online_end); + pgtable_populate_end(); + + S390_lowcore.kernel_asce = swapper_pg_dir | asce_bits; + S390_lowcore.user_asce = s390_invalid_asce; + + __ctl_load(S390_lowcore.kernel_asce, 1, 1); + __ctl_load(S390_lowcore.user_asce, 7, 7); + __ctl_load(S390_lowcore.kernel_asce, 13, 13); + + init_mm.context.asce = S390_lowcore.kernel_asce; +} |