diff options
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/entry/entry_64.S | 17 | ||||
-rw-r--r-- | arch/x86/entry/entry_64_compat.S | 7 | ||||
-rw-r--r-- | arch/x86/include/asm/hw_irq.h | 2 | ||||
-rw-r--r-- | arch/x86/include/asm/kaiser.h | 113 | ||||
-rw-r--r-- | arch/x86/include/asm/pgtable.h | 4 | ||||
-rw-r--r-- | arch/x86/include/asm/pgtable_64.h | 21 | ||||
-rw-r--r-- | arch/x86/include/asm/pgtable_types.h | 12 | ||||
-rw-r--r-- | arch/x86/include/asm/processor.h | 7 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/common.c | 4 | ||||
-rw-r--r-- | arch/x86/kernel/espfix_64.c | 6 | ||||
-rw-r--r-- | arch/x86/kernel/head_64.S | 16 | ||||
-rw-r--r-- | arch/x86/kernel/irqinit.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/process.c | 2 | ||||
-rw-r--r-- | arch/x86/mm/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/mm/kaiser.c | 160 | ||||
-rw-r--r-- | arch/x86/mm/pageattr.c | 2 | ||||
-rw-r--r-- | arch/x86/mm/pgtable.c | 26 |
17 files changed, 388 insertions, 15 deletions
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index e7b0e7ff4c58..9467a2c4bc60 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -36,6 +36,7 @@ #include <asm/smap.h> #include <asm/pgtable_types.h> #include <asm/export.h> +#include <asm/kaiser.h> #include <linux/err.h> /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ @@ -146,6 +147,7 @@ ENTRY(entry_SYSCALL_64) * it is too small to ever cause noticeable irq latency. */ SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK /* * A hypervisor implementation might want to use a label * after the swapgs, so that it can do the swapgs @@ -228,6 +230,7 @@ entry_SYSCALL_64_fastpath: movq RIP(%rsp), %rcx movq EFLAGS(%rsp), %r11 RESTORE_C_REGS_EXCEPT_RCX_R11 + SWITCH_USER_CR3 movq RSP(%rsp), %rsp USERGS_SYSRET64 @@ -323,10 +326,12 @@ return_from_SYSCALL_64: syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ RESTORE_C_REGS_EXCEPT_RCX_R11 + SWITCH_USER_CR3 movq RSP(%rsp), %rsp USERGS_SYSRET64 opportunistic_sysret_failed: + SWITCH_USER_CR3 SWAPGS jmp restore_c_regs_and_iret END(entry_SYSCALL_64) @@ -424,6 +429,7 @@ ENTRY(ret_from_fork) movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ + SWITCH_USER_CR3 SWAPGS jmp restore_regs_and_iret @@ -478,6 +484,7 @@ END(irq_entries_start) * tracking that we're in kernel mode. */ SWAPGS + SWITCH_KERNEL_CR3 /* * We need to tell lockdep that IRQs are off. We can't do this until @@ -535,6 +542,7 @@ GLOBAL(retint_user) mov %rsp,%rdi call prepare_exit_to_usermode TRACE_IRQS_IRETQ + SWITCH_USER_CR3 SWAPGS jmp restore_regs_and_iret @@ -612,6 +620,7 @@ native_irq_return_ldt: pushq %rdi /* Stash user RDI */ SWAPGS + SWITCH_KERNEL_CR3 movq PER_CPU_VAR(espfix_waddr), %rdi movq %rax, (0*8)(%rdi) /* user RAX */ movq (1*8)(%rsp), %rax /* user RIP */ @@ -638,6 +647,7 @@ native_irq_return_ldt: * still points to an RO alias of the ESPFIX stack. */ orq PER_CPU_VAR(espfix_stack), %rax + SWITCH_USER_CR3 SWAPGS movq %rax, %rsp @@ -1034,6 +1044,7 @@ ENTRY(paranoid_entry) testl %edx, %edx js 1f /* negative -> in kernel */ SWAPGS + SWITCH_KERNEL_CR3 xorl %ebx, %ebx 1: ret END(paranoid_entry) @@ -1056,6 +1067,7 @@ ENTRY(paranoid_exit) testl %ebx, %ebx /* swapgs needed? */ jnz paranoid_exit_no_swapgs TRACE_IRQS_IRETQ + SWITCH_USER_CR3_NO_STACK SWAPGS_UNSAFE_STACK jmp paranoid_exit_restore paranoid_exit_no_swapgs: @@ -1084,6 +1096,7 @@ ENTRY(error_entry) * from user mode due to an IRET fault. */ SWAPGS + SWITCH_KERNEL_CR3 .Lerror_entry_from_usermode_after_swapgs: /* @@ -1135,6 +1148,7 @@ ENTRY(error_entry) * Switch to kernel gsbase: */ SWAPGS + SWITCH_KERNEL_CR3 /* * Pretend that the exception came from user mode: set up pt_regs @@ -1235,6 +1249,7 @@ ENTRY(nmi) */ SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK cld movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp @@ -1275,6 +1290,7 @@ ENTRY(nmi) * work, because we don't want to enable interrupts. Fortunately, * do_nmi doesn't modify pt_regs. */ + SWITCH_USER_CR3 SWAPGS jmp restore_c_regs_and_iret @@ -1486,6 +1502,7 @@ end_repeat_nmi: testl %ebx, %ebx /* swapgs needed? */ jnz nmi_restore nmi_swapgs: + SWITCH_USER_CR3_NO_STACK SWAPGS_UNSAFE_STACK nmi_restore: RESTORE_EXTRA_REGS diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index e1721dafbcb1..f0e384ee8fc6 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -13,6 +13,7 @@ #include <asm/irqflags.h> #include <asm/asm.h> #include <asm/smap.h> +#include <asm/kaiser.h> #include <linux/linkage.h> #include <linux/err.h> @@ -48,6 +49,7 @@ ENTRY(entry_SYSENTER_compat) /* Interrupts are off on entry. */ SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* @@ -184,6 +186,7 @@ ENDPROC(entry_SYSENTER_compat) ENTRY(entry_SYSCALL_compat) /* Interrupts are off on entry. */ SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK /* Stash user ESP and switch to the kernel stack. */ movl %esp, %r8d @@ -259,6 +262,7 @@ sysret32_from_system_call: xorq %r8, %r8 xorq %r9, %r9 xorq %r10, %r10 + SWITCH_USER_CR3 movq RSP-ORIG_RAX(%rsp), %rsp swapgs sysretl @@ -297,7 +301,7 @@ ENTRY(entry_INT80_compat) PARAVIRT_ADJUST_EXCEPTION_FRAME ASM_CLAC /* Do this early to minimize exposure */ SWAPGS - + SWITCH_KERNEL_CR3_NO_STACK /* * User tracing code (ptrace or signal handlers) might assume that * the saved RAX contains a 32-bit number when we're invoking a 32-bit @@ -338,6 +342,7 @@ ENTRY(entry_INT80_compat) /* Go back to user mode. */ TRACE_IRQS_ON + SWITCH_USER_CR3_NO_STACK SWAPGS jmp restore_regs_and_iret END(entry_INT80_compat) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index b90e1053049b..0817d63bce41 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -178,7 +178,7 @@ extern char irq_entries_start[]; #define VECTOR_RETRIGGERED ((void *)~0UL) typedef struct irq_desc* vector_irq_t[NR_VECTORS]; -DECLARE_PER_CPU(vector_irq_t, vector_irq); +DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq); #endif /* !ASSEMBLY_ */ diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h new file mode 100644 index 000000000000..63ee8309b35b --- /dev/null +++ b/arch/x86/include/asm/kaiser.h @@ -0,0 +1,113 @@ +#ifndef _ASM_X86_KAISER_H +#define _ASM_X86_KAISER_H + +/* This file includes the definitions for the KAISER feature. + * KAISER is a counter measure against x86_64 side channel attacks on the kernel virtual memory. + * It has a shodow-pgd for every process. the shadow-pgd has a minimalistic kernel-set mapped, + * but includes the whole user memory. Within a kernel context switch, or when an interrupt is handled, + * the pgd is switched to the normal one. When the system switches to user mode, the shadow pgd is enabled. + * By this, the virtual memory chaches are freed, and the user may not attack the whole kernel memory. + * + * A minimalistic kernel mapping holds the parts needed to be mapped in user mode, as the entry/exit functions + * of the user space, or the stacks. + */ +#ifdef __ASSEMBLY__ +#ifdef CONFIG_KAISER + +.macro _SWITCH_TO_KERNEL_CR3 reg +movq %cr3, \reg +andq $(~0x1000), \reg +movq \reg, %cr3 +.endm + +.macro _SWITCH_TO_USER_CR3 reg +movq %cr3, \reg +orq $(0x1000), \reg +movq \reg, %cr3 +.endm + +.macro SWITCH_KERNEL_CR3 +pushq %rax +_SWITCH_TO_KERNEL_CR3 %rax +popq %rax +.endm + +.macro SWITCH_USER_CR3 +pushq %rax +_SWITCH_TO_USER_CR3 %rax +popq %rax +.endm + +.macro SWITCH_KERNEL_CR3_NO_STACK +movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) +_SWITCH_TO_KERNEL_CR3 %rax +movq PER_CPU_VAR(unsafe_stack_register_backup), %rax +.endm + + +.macro SWITCH_USER_CR3_NO_STACK + +movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) +_SWITCH_TO_USER_CR3 %rax +movq PER_CPU_VAR(unsafe_stack_register_backup), %rax + +.endm + +#else /* CONFIG_KAISER */ + +.macro SWITCH_KERNEL_CR3 reg +.endm +.macro SWITCH_USER_CR3 reg +.endm +.macro SWITCH_USER_CR3_NO_STACK +.endm +.macro SWITCH_KERNEL_CR3_NO_STACK +.endm + +#endif /* CONFIG_KAISER */ +#else /* __ASSEMBLY__ */ + + +#ifdef CONFIG_KAISER +// Upon kernel/user mode switch, it may happen that +// the address space has to be switched before the registers have been stored. +// To change the address space, another register is needed. +// A register therefore has to be stored/restored. +// +DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + +#endif /* CONFIG_KAISER */ + +/** + * shadowmem_add_mapping - map a virtual memory part to the shadow mapping + * @addr: the start address of the range + * @size: the size of the range + * @flags: The mapping flags of the pages + * + * the mapping is done on a global scope, so no bigger synchronization has to be done. + * the pages have to be manually unmapped again when they are not needed any longer. + */ +extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); + + +/** + * shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping + * @addr: the start address of the range + * @size: the size of the range + */ +extern void kaiser_remove_mapping(unsigned long start, unsigned long size); + +/** + * shadowmem_initialize_mapping - Initalize the shadow mapping + * + * most parts of the shadow mapping can be mapped upon boot time. + * only the thread stacks have to be mapped on runtime. + * the mapped regions are not unmapped at all. + */ +extern void kaiser_init(void); + +#endif + + + +#endif /* _ASM_X86_KAISER_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 437feb436efa..4b479c9b064f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -904,6 +904,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { memcpy(dst, src, count * sizeof(pgd_t)); +#ifdef CONFIG_KAISER + // clone the shadow pgd part as well + memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t)); +#endif } #define PTE_SHIFT ilog2(PTRS_PER_PTE) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 1cc82ece9ac1..e6ea39fa6d0b 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -106,9 +106,30 @@ static inline void native_pud_clear(pud_t *pud) native_set_pud(pud, native_make_pud(0)); } +#ifdef CONFIG_KAISER +static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) { + return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE); +} + +static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) { + return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE); +} +#endif /* CONFIG_KAISER */ + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { +#ifdef CONFIG_KAISER + // We know that a pgd is page aligned. + // Therefore the lower indices have to be mapped to user space. + // These pages are mapped to the shadow mapping. + if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) { + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; + } + + pgdp->pgd = pgd.pgd & ~_PAGE_USER; +#else /* CONFIG_KAISER */ *pgdp = pgd; +#endif } static inline void native_pgd_clear(pgd_t *pgd) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 8b4de22d6429..00fecbb153ac 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -45,7 +45,11 @@ #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) -#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +#ifdef CONFIG_KAISER +#define _PAGE_GLOBAL (_AT(pteval_t, 0)) +#else +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +#endif #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) @@ -119,7 +123,11 @@ #define _PAGE_DEVMAP (_AT(pteval_t, 0)) #endif -#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) +#ifdef CONFIG_KAISER +#define _PAGE_PROTNONE (_AT(pteval_t, 0)) +#else +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) +#endif #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ _PAGE_ACCESSED | _PAGE_DIRTY) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 83db0eae9979..3d4784e2f862 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -308,7 +308,7 @@ struct tss_struct { } ____cacheline_aligned; -DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); +DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss); #ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); @@ -335,6 +335,11 @@ union irq_stack_union { char gs_base[40]; unsigned long stack_canary; }; + + struct { + char irq_stack_pointer[64]; + char unused[IRQ_STACK_SIZE - 64]; + }; }; DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 91588be529b9..3efde13eaed0 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -93,7 +93,7 @@ static const struct cpu_dev default_cpu = { static const struct cpu_dev *this_cpu = &default_cpu; -DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = { #ifdef CONFIG_X86_64 /* * We need valid kernel segments for data and code in long mode too @@ -1365,7 +1365,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { [DEBUG_STACK - 1] = DEBUG_STKSZ }; -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); /* May not be marked __init: used by software suspend */ diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 04f89caef9c4..9ff875a1aa24 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -41,6 +41,7 @@ #include <asm/pgalloc.h> #include <asm/setup.h> #include <asm/espfix.h> +#include <asm/kaiser.h> /* * Note: we only need 6*8 = 48 bytes for the espfix stack, but round @@ -126,6 +127,11 @@ void __init init_espfix_bsp(void) /* Install the espfix pud into the kernel page directory */ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); +#ifdef CONFIG_KAISER + // add the esp stack pud to the shadow mapping here. + // This can be done directly, because the fixup stack has its own pud + set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page))); +#endif /* Randomize the locations */ init_espfix_random(); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b4421cc191b0..9e849b520780 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -405,6 +405,14 @@ GLOBAL(early_recursion_flag) .balign PAGE_SIZE; \ GLOBAL(name) +#ifdef CONFIG_KAISER +#define NEXT_PGD_PAGE(name) \ + .balign 2 * PAGE_SIZE; \ +GLOBAL(name) +#else +#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) +#endif + /* Automate the creation of 1 to 1 mapping pmd entries */ #define PMDS(START, PERM, COUNT) \ i = 0 ; \ @@ -414,7 +422,7 @@ GLOBAL(name) .endr __INITDATA -NEXT_PAGE(early_level4_pgt) +NEXT_PGD_PAGE(early_level4_pgt) .fill 511,8,0 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE @@ -424,10 +432,10 @@ NEXT_PAGE(early_dynamic_pgts) .data #ifndef CONFIG_XEN -NEXT_PAGE(init_level4_pgt) - .fill 512,8,0 +NEXT_PGD_PAGE(init_level4_pgt) + .fill 2*512,8,0 #else -NEXT_PAGE(init_level4_pgt) +NEXT_PGD_PAGE(init_level4_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 1423ab1b0312..f480b38a03c3 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -51,7 +51,7 @@ static struct irqaction irq2 = { .flags = IRQF_NO_THREAD, }; -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { +DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = { [0 ... NR_VECTORS - 1] = VECTOR_UNUSED, }; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 8e10e72bf6ee..a55b32007785 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -41,7 +41,7 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { +__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = { .x86_tss = { .sp0 = TOP_OF_INIT_STACK, #ifdef CONFIG_X86_32 diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 96d2b847e09e..682c162333ba 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -38,4 +38,4 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_X86_INTEL_MPX) += mpx.o obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o - +obj-$(CONFIG_KAISER) += kaiser.o diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c new file mode 100644 index 000000000000..cf1bb922d467 --- /dev/null +++ b/arch/x86/mm/kaiser.c @@ -0,0 +1,160 @@ + + +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/bug.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/mm.h> + +#include <linux/uaccess.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/desc.h> +#ifdef CONFIG_KAISER + +__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + +/** + * Get the real ppn from a address in kernel mapping. + * @param address The virtual adrress + * @return the physical address + */ +static inline unsigned long get_pa_from_mapping (unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(address); + BUG_ON(pgd_none(*pgd) || pgd_large(*pgd)); + + pud = pud_offset(pgd, address); + BUG_ON(pud_none(*pud)); + + if (pud_large(*pud)) { + return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK); + } + + pmd = pmd_offset(pud, address); + BUG_ON(pmd_none(*pmd)); + + if (pmd_large(*pmd)) { + return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK); + } + + pte = pte_offset_kernel(pmd, address); + BUG_ON(pte_none(*pte)); + + return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK); +} + +void _kaiser_copy (unsigned long start_addr, unsigned long size, + unsigned long flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long address; + unsigned long end_addr = start_addr + size; + unsigned long target_address; + + for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1)); + address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) { + target_address = get_pa_from_mapping(address); + + pgd = native_get_shadow_pgd(pgd_offset_k(address)); + + BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n"); + BUG_ON(pgd_large(*pgd)); + + pud = pud_offset(pgd, address); + if (pud_none(*pud)) { + set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address)))); + } + BUG_ON(pud_large(*pud)); + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) { + set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address)))); + } + BUG_ON(pmd_large(*pmd)); + + pte = pte_offset_kernel(pmd, address); + if (pte_none(*pte)) { + set_pte(pte, __pte(flags | target_address)); + } else { + BUG_ON(__pa(pte_page(*pte)) != target_address); + } + } +} + +// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping +static inline void __init _kaiser_init(void) +{ + pgd_t *pgd; + int i = 0; + + pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); + for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { + set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0)))); + } +} + +extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; +spinlock_t shadow_table_lock; +void __init kaiser_init(void) +{ + int cpu; + spin_lock_init(&shadow_table_lock); + + spin_lock(&shadow_table_lock); + + _kaiser_init(); + + for_each_possible_cpu(cpu) { + // map the per cpu user variables + _kaiser_copy( + (unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)), + (unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start, + __PAGE_KERNEL); + } + + // map the entry/exit text section, which is responsible to switch between user- and kernel mode + _kaiser_copy( + (unsigned long) __entry_text_start, + (unsigned long) __entry_text_end - (unsigned long) __entry_text_start, + __PAGE_KERNEL_RX); + + // the fixed map address of the idt_table + _kaiser_copy( + (unsigned long) idt_descr.address, + sizeof(gate_desc) * NR_VECTORS, + __PAGE_KERNEL_RO); + + spin_unlock(&shadow_table_lock); +} + +// add a mapping to the shadow-mapping, and synchronize the mappings +void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) +{ + spin_lock(&shadow_table_lock); + _kaiser_copy(addr, size, flags); + spin_unlock(&shadow_table_lock); +} + +extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end); +void kaiser_remove_mapping(unsigned long start, unsigned long size) +{ + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start)); + spin_lock(&shadow_table_lock); + do { + unmap_pud_range(pgd, start, start + size); + } while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size))); + spin_unlock(&shadow_table_lock); +} +#endif /* CONFIG_KAISER */ diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e3353c97d086..c17412f92d77 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -823,7 +823,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) pud_clear(pud); } -static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) +void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) { pud_t *pud = pud_offset(pgd, start); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3feec5af4e67..27d218b38538 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -346,12 +346,38 @@ static inline void _pgd_free(pgd_t *pgd) #else static inline pgd_t *_pgd_alloc(void) { +#ifdef CONFIG_KAISER + // Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory + // block. Therefore, we have to allocate at least 3 pages. However, the + // __get_free_pages returns us 4 pages. Hence, we store the base pointer at + // the beginning of the page of our 8kb-aligned memory block in order to + // correctly free it afterwars. + + unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE)); + + if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages) + { + *((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages; + return (pgd_t *) pages; + } + else + { + *((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages; + return (pgd_t *) (pages + PAGE_SIZE); + } +#else return (pgd_t *)__get_free_page(PGALLOC_GFP); +#endif } static inline void _pgd_free(pgd_t *pgd) { +#ifdef CONFIG_KAISER + unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE)); + free_pages(pages, get_order(4*PAGE_SIZE)); +#else free_page((unsigned long)pgd); +#endif } #endif /* CONFIG_X86_PAE */ |