Diffstat (limited to 'arch/x86/include/asm')
72 files changed, 1639 insertions, 1131 deletions
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 2efc768e4362..72d867f6b518 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -150,8 +150,6 @@ static inline void disable_acpi(void) { } extern int x86_acpi_numa_init(void); #endif /* CONFIG_ACPI_NUMA */ -#define acpi_unlazy_tlb(x) leave_mm(x) - #ifdef CONFIG_ACPI_APEI static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) { @@ -162,12 +160,13 @@ static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) * you call efi_mem_attributes() during boot and at runtime, * you could theoretically see different attributes. * - * Since we are yet to see any x86 platforms that require - * anything other than PAGE_KERNEL (some arm64 platforms - * require the equivalent of PAGE_KERNEL_NOCACHE), return that - * until we know differently. + * We are yet to see any x86 platforms that require anything + * other than PAGE_KERNEL (some ARM64 platforms require the + * equivalent of PAGE_KERNEL_NOCACHE). Additionally, if SME + * is active, the ACPI information will not be encrypted, + * so return PAGE_KERNEL_NOENC until we know differently. */ - return PAGE_KERNEL; + return PAGE_KERNEL_NOENC; } #endif diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 7a9df3beb89b..676ee5807d86 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -74,6 +74,9 @@ # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) +# define _ASM_EXTABLE_REFCOUNT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) + # define _ASM_NOKPROBE(entry) \ .pushsection "_kprobe_blacklist","aw" ; \ _ASM_ALIGN ; \ @@ -123,6 +126,9 @@ # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) +# define _ASM_EXTABLE_REFCOUNT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) + /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 33380b871463..0874ebda3069 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -197,35 +197,56 @@ static inline int atomic_xchg(atomic_t *v, int new) return xchg(&v->counter, new); } -#define ATOMIC_OP(op) \ -static inline void atomic_##op(int i, atomic_t *v) \ -{ \ - asm volatile(LOCK_PREFIX #op"l %1,%0" \ - : "+m" (v->counter) \ - : "ir" (i) \ - : "memory"); \ +static inline void atomic_and(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "andl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); +} + +static inline int atomic_fetch_and(int i, atomic_t *v) +{ + int val = atomic_read(v); + + do { } while (!atomic_try_cmpxchg(v, &val, val & i)); + + return val; } -#define ATOMIC_FETCH_OP(op, c_op) \ -static inline int atomic_fetch_##op(int i, atomic_t *v) \ -{ \ - int val = atomic_read(v); \ - do { \ - } while (!atomic_try_cmpxchg(v, &val, val c_op i)); \ - return val; \ +static inline void atomic_or(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "orl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); } -#define ATOMIC_OPS(op, c_op) \ - ATOMIC_OP(op) \ - ATOMIC_FETCH_OP(op, c_op) +static inline int atomic_fetch_or(int i, atomic_t *v) +{ + int val = atomic_read(v); -ATOMIC_OPS(and, &) -ATOMIC_OPS(or , |) -ATOMIC_OPS(xor, ^) + do { } while (!atomic_try_cmpxchg(v, &val, val | i)); -#undef ATOMIC_OPS -#undef ATOMIC_FETCH_OP -#undef ATOMIC_OP + return val; +} + +static inline void atomic_xor(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "xorl %1,%0" + 
: "+m" (v->counter) + : "ir" (i) + : "memory"); +} + +static inline int atomic_fetch_xor(int i, atomic_t *v) +{ + int val = atomic_read(v); + + do { } while (!atomic_try_cmpxchg(v, &val, val ^ i)); + + return val; +} /** * __atomic_add_unless - add unless the number is already a given value @@ -239,10 +260,12 @@ ATOMIC_OPS(xor, ^) static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) { int c = atomic_read(v); + do { if (unlikely(c == u)) break; } while (!atomic_try_cmpxchg(v, &c, c + a)); + return c; } diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 71d7705fb303..9e206f31ce2a 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -312,37 +312,70 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v) #undef alternative_atomic64 #undef __alternative_atomic64 -#define ATOMIC64_OP(op, c_op) \ -static inline void atomic64_##op(long long i, atomic64_t *v) \ -{ \ - long long old, c = 0; \ - while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ - c = old; \ +static inline void atomic64_and(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c & i)) != c) + c = old; } -#define ATOMIC64_FETCH_OP(op, c_op) \ -static inline long long atomic64_fetch_##op(long long i, atomic64_t *v) \ -{ \ - long long old, c = 0; \ - while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ - c = old; \ - return old; \ +static inline long long atomic64_fetch_and(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c & i)) != c) + c = old; + + return old; } -ATOMIC64_FETCH_OP(add, +) +static inline void atomic64_or(long long i, atomic64_t *v) +{ + long long old, c = 0; -#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v)) + while ((old = atomic64_cmpxchg(v, c, c | i)) != c) + c = old; +} + +static inline long long atomic64_fetch_or(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c | i)) != c) + c = old; + + return old; +} -#define ATOMIC64_OPS(op, c_op) \ - ATOMIC64_OP(op, c_op) \ - ATOMIC64_FETCH_OP(op, c_op) +static inline void atomic64_xor(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c) + c = old; +} -ATOMIC64_OPS(and, &) -ATOMIC64_OPS(or, |) -ATOMIC64_OPS(xor, ^) +static inline long long atomic64_fetch_xor(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c) + c = old; + + return old; +} -#undef ATOMIC64_OPS -#undef ATOMIC64_FETCH_OP -#undef ATOMIC64_OP +static inline long long atomic64_fetch_add(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c + i)) != c) + c = old; + + return old; +} + +#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v)) #endif /* _ASM_X86_ATOMIC64_32_H */ diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 6189a433c9a9..5d9de36a2f04 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -177,7 +177,7 @@ static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) } #define atomic64_try_cmpxchg atomic64_try_cmpxchg -static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long new) +static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, long new) { return try_cmpxchg(&v->counter, old, new); } @@ -198,7 +198,7 @@ static inline long 
atomic64_xchg(atomic64_t *v, long new) */ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) { - long c = atomic64_read(v); + s64 c = atomic64_read(v); do { if (unlikely(c == u)) return false; @@ -217,7 +217,7 @@ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) */ static inline long atomic64_dec_if_positive(atomic64_t *v) { - long dec, c = atomic64_read(v); + s64 dec, c = atomic64_read(v); do { dec = c - 1; if (unlikely(dec < 0)) @@ -226,34 +226,55 @@ static inline long atomic64_dec_if_positive(atomic64_t *v) return dec; } -#define ATOMIC64_OP(op) \ -static inline void atomic64_##op(long i, atomic64_t *v) \ -{ \ - asm volatile(LOCK_PREFIX #op"q %1,%0" \ - : "+m" (v->counter) \ - : "er" (i) \ - : "memory"); \ +static inline void atomic64_and(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "andq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); } -#define ATOMIC64_FETCH_OP(op, c_op) \ -static inline long atomic64_fetch_##op(long i, atomic64_t *v) \ -{ \ - long val = atomic64_read(v); \ - do { \ - } while (!atomic64_try_cmpxchg(v, &val, val c_op i)); \ - return val; \ +static inline long atomic64_fetch_and(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); + + do { + } while (!atomic64_try_cmpxchg(v, &val, val & i)); + return val; } -#define ATOMIC64_OPS(op, c_op) \ - ATOMIC64_OP(op) \ - ATOMIC64_FETCH_OP(op, c_op) +static inline void atomic64_or(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "orq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); +} -ATOMIC64_OPS(and, &) -ATOMIC64_OPS(or, |) -ATOMIC64_OPS(xor, ^) +static inline long atomic64_fetch_or(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); -#undef ATOMIC64_OPS -#undef ATOMIC64_FETCH_OP -#undef ATOMIC64_OP + do { + } while (!atomic64_try_cmpxchg(v, &val, val | i)); + return val; +} + +static inline void atomic64_xor(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "xorq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); +} + +static inline long atomic64_fetch_xor(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); + + do { + } while (!atomic64_try_cmpxchg(v, &val, val ^ i)); + return val; +} #endif /* _ASM_X86_ATOMIC64_64_H */ diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h index e01f7f7ccb0c..84ae170bc3d0 100644 --- a/arch/x86/include/asm/cmdline.h +++ b/arch/x86/include/asm/cmdline.h @@ -2,5 +2,7 @@ #define _ASM_X86_CMDLINE_H int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); +int cmdline_find_option(const char *cmdline_ptr, const char *option, + char *buffer, int bufsize); #endif /* _ASM_X86_CMDLINE_H */ diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index d90296d061e8..b5069e802d5c 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -157,7 +157,7 @@ extern void __add_wrong_size(void) #define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock) \ ({ \ bool success; \ - __typeof__(_ptr) _old = (_pold); \ + __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ switch (size) { \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index ca3c48c0872f..42bbbf0f173d 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -177,7 +177,7 @@ #define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ #define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ #define 
X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ -#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ +#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */ #define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ /* @@ -196,6 +196,7 @@ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ +#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ @@ -286,7 +287,7 @@ #define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ #define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ -#define X86_FEATURE_VIRTUAL_VMLOAD_VMSAVE (15*32+15) /* Virtual VMLOAD VMSAVE */ +#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index d0a21b12dd58..1a2ba368da39 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -5,6 +5,7 @@ #include <asm/ldt.h> #include <asm/mmu.h> #include <asm/fixmap.h> +#include <asm/irq_vectors.h> #include <linux/smp.h> #include <linux/percpu.h> @@ -22,7 +23,7 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in desc->s = 1; desc->dpl = 0x3; desc->p = info->seg_not_present ^ 1; - desc->limit = (info->limit & 0xf0000) >> 16; + desc->limit1 = (info->limit & 0xf0000) >> 16; desc->avl = info->useable; desc->d = info->seg_32bit; desc->g = info->limit_in_pages; @@ -83,33 +84,25 @@ static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu) return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu)); } -#ifdef CONFIG_X86_64 - static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, unsigned dpl, unsigned ist, unsigned seg) { - gate->offset_low = PTR_LOW(func); + gate->offset_low = (u16) func; + gate->bits.p = 1; + gate->bits.dpl = dpl; + gate->bits.zero = 0; + gate->bits.type = type; + gate->offset_middle = (u16) (func >> 16); +#ifdef CONFIG_X86_64 gate->segment = __KERNEL_CS; - gate->ist = ist; - gate->p = 1; - gate->dpl = dpl; - gate->zero0 = 0; - gate->zero1 = 0; - gate->type = type; - gate->offset_middle = PTR_MIDDLE(func); - gate->offset_high = PTR_HIGH(func); -} - + gate->bits.ist = ist; + gate->reserved = 0; + gate->offset_high = (u32) (func >> 32); #else -static inline void pack_gate(gate_desc *gate, unsigned char type, - unsigned long base, unsigned dpl, unsigned flags, - unsigned short seg) -{ - gate->a = (seg << 16) | (base & 0xffff); - gate->b = (base & 0xffff0000) | (((0x80 | type | (dpl << 5)) & 0xff) << 8); -} - + gate->segment = seg; + gate->bits.ist = 0; #endif +} static inline int desc_empty(const void *ptr) { @@ -173,35 +166,22 @@ native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int memcpy(&gdt[entry], desc, size); } -static inline void pack_descriptor(struct desc_struct *desc, unsigned long base, - unsigned long limit, unsigned char type, - unsigned char flags) -{ - desc->a = ((base & 0xffff) << 16) | (limit & 0xffff); - desc->b = (base & 0xff000000) | ((base & 0xff0000) 
>> 16) | - (limit & 0x000f0000) | ((type & 0xff) << 8) | - ((flags & 0xf) << 20); - desc->p = 1; -} - - -static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size) +static inline void set_tssldt_descriptor(void *d, unsigned long addr, + unsigned type, unsigned size) { -#ifdef CONFIG_X86_64 - struct ldttss_desc64 *desc = d; + struct ldttss_desc *desc = d; memset(desc, 0, sizeof(*desc)); - desc->limit0 = size & 0xFFFF; - desc->base0 = PTR_LOW(addr); - desc->base1 = PTR_MIDDLE(addr) & 0xFF; + desc->limit0 = (u16) size; + desc->base0 = (u16) addr; + desc->base1 = (addr >> 16) & 0xFF; desc->type = type; desc->p = 1; desc->limit1 = (size >> 16) & 0xF; - desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF; - desc->base3 = PTR_HIGH(addr); -#else - pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0); + desc->base2 = (addr >> 24) & 0xFF; +#ifdef CONFIG_X86_64 + desc->base3 = (u32) (addr >> 32); #endif } @@ -401,147 +381,20 @@ static inline void set_desc_base(struct desc_struct *desc, unsigned long base) static inline unsigned long get_desc_limit(const struct desc_struct *desc) { - return desc->limit0 | (desc->limit << 16); + return desc->limit0 | (desc->limit1 << 16); } static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) { desc->limit0 = limit & 0xffff; - desc->limit = (limit >> 16) & 0xf; -} - -#ifdef CONFIG_X86_64 -static inline void set_nmi_gate(int gate, void *addr) -{ - gate_desc s; - - pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS); - write_idt_entry(debug_idt_table, gate, &s); + desc->limit1 = (limit >> 16) & 0xf; } -#endif -#ifdef CONFIG_TRACING -extern struct desc_ptr trace_idt_descr; -extern gate_desc trace_idt_table[]; -static inline void write_trace_idt_entry(int entry, const gate_desc *gate) -{ - write_idt_entry(trace_idt_table, entry, gate); -} +void update_intr_gate(unsigned int n, const void *addr); +void alloc_intr_gate(unsigned int n, const void *addr); -static inline void _trace_set_gate(int gate, unsigned type, void *addr, - unsigned dpl, unsigned ist, unsigned seg) -{ - gate_desc s; - - pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg); - /* - * does not need to be atomic because it is only done once at - * setup time - */ - write_trace_idt_entry(gate, &s); -} -#else -static inline void write_trace_idt_entry(int entry, const gate_desc *gate) -{ -} - -#define _trace_set_gate(gate, type, addr, dpl, ist, seg) -#endif - -static inline void _set_gate(int gate, unsigned type, void *addr, - unsigned dpl, unsigned ist, unsigned seg) -{ - gate_desc s; - - pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg); - /* - * does not need to be atomic because it is only done once at - * setup time - */ - write_idt_entry(idt_table, gate, &s); - write_trace_idt_entry(gate, &s); -} - -/* - * This needs to use 'idt_table' rather than 'idt', and - * thus use the _nonmapped_ version of the IDT, as the - * Pentium F0 0F bugfix can have resulted in the mapped - * IDT being write-protected. 
- */ -#define set_intr_gate_notrace(n, addr) \ - do { \ - BUG_ON((unsigned)n > 0xFF); \ - _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \ - __KERNEL_CS); \ - } while (0) - -#define set_intr_gate(n, addr) \ - do { \ - set_intr_gate_notrace(n, addr); \ - _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\ - 0, 0, __KERNEL_CS); \ - } while (0) - -extern int first_system_vector; -/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */ extern unsigned long used_vectors[]; -static inline void alloc_system_vector(int vector) -{ - if (!test_bit(vector, used_vectors)) { - set_bit(vector, used_vectors); - if (first_system_vector > vector) - first_system_vector = vector; - } else { - BUG(); - } -} - -#define alloc_intr_gate(n, addr) \ - do { \ - alloc_system_vector(n); \ - set_intr_gate(n, addr); \ - } while (0) - -/* - * This routine sets up an interrupt gate at directory privilege level 3. - */ -static inline void set_system_intr_gate(unsigned int n, void *addr) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS); -} - -static inline void set_system_trap_gate(unsigned int n, void *addr) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS); -} - -static inline void set_trap_gate(unsigned int n, void *addr) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS); -} - -static inline void set_task_gate(unsigned int n, unsigned int gdt_entry) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3)); -} - -static inline void set_intr_gate_ist(int n, void *addr, unsigned ist) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS); -} - -static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); -} - #ifdef CONFIG_X86_64 DECLARE_PER_CPU(u32, debug_idt_ctr); static inline bool is_debug_idt_enabled(void) @@ -567,31 +420,6 @@ static inline void load_debug_idt(void) } #endif -#ifdef CONFIG_TRACING -extern atomic_t trace_idt_ctr; -static inline bool is_trace_idt_enabled(void) -{ - if (atomic_read(&trace_idt_ctr)) - return true; - - return false; -} - -static inline void load_trace_idt(void) -{ - load_idt((const struct desc_ptr *)&trace_idt_descr); -} -#else -static inline bool is_trace_idt_enabled(void) -{ - return false; -} - -static inline void load_trace_idt(void) -{ -} -#endif - /* * The load_current_idt() must be called with interrupts disabled * to avoid races. 
That way the IDT will always be set back to the expected @@ -603,9 +431,25 @@ static inline void load_current_idt(void) { if (is_debug_idt_enabled()) load_debug_idt(); - else if (is_trace_idt_enabled()) - load_trace_idt(); else load_idt((const struct desc_ptr *)&idt_descr); } + +extern void idt_setup_early_handler(void); +extern void idt_setup_early_traps(void); +extern void idt_setup_traps(void); +extern void idt_setup_apic_and_irq_gates(void); + +#ifdef CONFIG_X86_64 +extern void idt_setup_early_pf(void); +extern void idt_setup_ist_traps(void); +extern void idt_setup_debugidt_traps(void); +#else +static inline void idt_setup_early_pf(void) { } +static inline void idt_setup_ist_traps(void) { } +static inline void idt_setup_debugidt_traps(void) { } +#endif + +extern void idt_invalidate(void *addr); + #endif /* _ASM_X86_DESC_H */ diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index 49265345d4d2..346d252029b7 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -11,34 +11,30 @@ #include <linux/types.h> -/* - * FIXME: Accessing the desc_struct through its fields is more elegant, - * and should be the one valid thing to do. However, a lot of open code - * still touches the a and b accessors, and doing this allow us to do it - * incrementally. We keep the signature as a struct, rather than a union, - * so we can get rid of it transparently in the future -- glommer - */ /* 8 byte segment descriptor */ struct desc_struct { - union { - struct { - unsigned int a; - unsigned int b; - }; - struct { - u16 limit0; - u16 base0; - unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1; - unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8; - }; - }; + u16 limit0; + u16 base0; + u16 base1: 8, type: 4, s: 1, dpl: 2, p: 1; + u16 limit1: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8; } __attribute__((packed)); -#define GDT_ENTRY_INIT(flags, base, limit) { { { \ - .a = ((limit) & 0xffff) | (((base) & 0xffff) << 16), \ - .b = (((base) & 0xff0000) >> 16) | (((flags) & 0xf0ff) << 8) | \ - ((limit) & 0xf0000) | ((base) & 0xff000000), \ - } } } +#define GDT_ENTRY_INIT(flags, base, limit) \ + { \ + .limit0 = (u16) (limit), \ + .limit1 = ((limit) >> 16) & 0x0F, \ + .base0 = (u16) (base), \ + .base1 = ((base) >> 16) & 0xFF, \ + .base2 = ((base) >> 24) & 0xFF, \ + .type = (flags & 0x0f), \ + .s = (flags >> 4) & 0x01, \ + .dpl = (flags >> 5) & 0x03, \ + .p = (flags >> 7) & 0x01, \ + .avl = (flags >> 12) & 0x01, \ + .l = (flags >> 13) & 0x01, \ + .d = (flags >> 14) & 0x01, \ + .g = (flags >> 15) & 0x01, \ + } enum { GATE_INTERRUPT = 0xE, @@ -47,49 +43,63 @@ enum { GATE_TASK = 0x5, }; -/* 16byte gate */ -struct gate_struct64 { - u16 offset_low; - u16 segment; - unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1; - u16 offset_middle; - u32 offset_high; - u32 zero1; -} __attribute__((packed)); - -#define PTR_LOW(x) ((unsigned long long)(x) & 0xFFFF) -#define PTR_MIDDLE(x) (((unsigned long long)(x) >> 16) & 0xFFFF) -#define PTR_HIGH(x) ((unsigned long long)(x) >> 32) - enum { DESC_TSS = 0x9, DESC_LDT = 0x2, DESCTYPE_S = 0x10, /* !system */ }; -/* LDT or TSS descriptor in the GDT. 16 bytes. */ -struct ldttss_desc64 { - u16 limit0; - u16 base0; - unsigned base1 : 8, type : 5, dpl : 2, p : 1; - unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; - u32 base3; - u32 zero1; +/* LDT or TSS descriptor in the GDT. 
*/ +struct ldttss_desc { + u16 limit0; + u16 base0; + + u16 base1 : 8, type : 5, dpl : 2, p : 1; + u16 limit1 : 4, zero0 : 3, g : 1, base2 : 8; +#ifdef CONFIG_X86_64 + u32 base3; + u32 zero1; +#endif } __attribute__((packed)); +typedef struct ldttss_desc ldt_desc; +typedef struct ldttss_desc tss_desc; + +struct idt_bits { + u16 ist : 3, + zero : 5, + type : 5, + dpl : 2, + p : 1; +} __attribute__((packed)); + +struct gate_struct { + u16 offset_low; + u16 segment; + struct idt_bits bits; + u16 offset_middle; +#ifdef CONFIG_X86_64 + u32 offset_high; + u32 reserved; +#endif +} __attribute__((packed)); + +typedef struct gate_struct gate_desc; + +static inline unsigned long gate_offset(const gate_desc *g) +{ #ifdef CONFIG_X86_64 -typedef struct gate_struct64 gate_desc; -typedef struct ldttss_desc64 ldt_desc; -typedef struct ldttss_desc64 tss_desc; -#define gate_offset(g) ((g).offset_low | ((unsigned long)(g).offset_middle << 16) | ((unsigned long)(g).offset_high << 32)) -#define gate_segment(g) ((g).segment) + return g->offset_low | ((unsigned long)g->offset_middle << 16) | + ((unsigned long) g->offset_high << 32); #else -typedef struct desc_struct gate_desc; -typedef struct desc_struct ldt_desc; -typedef struct desc_struct tss_desc; -#define gate_offset(g) (((g).b & 0xffff0000) | ((g).a & 0x0000ffff)) -#define gate_segment(g) ((g).a >> 16) + return g->offset_low | ((unsigned long)g->offset_middle << 16); #endif +} + +static inline unsigned long gate_segment(const gate_desc *g) +{ + return g->segment; +} struct desc_ptr { unsigned short size; diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 5dff775af7cd..c10c9128f54e 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -21,11 +21,13 @@ # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) # define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) # define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) +# define DISABLE_PCID 0 #else # define DISABLE_VME 0 # define DISABLE_K6_MTRR 0 # define DISABLE_CYRIX_ARR 0 # define DISABLE_CENTAUR_MCR 0 +# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31)) #endif /* CONFIG_X86_64 */ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS @@ -49,7 +51,7 @@ #define DISABLED_MASK1 0 #define DISABLED_MASK2 0 #define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) -#define DISABLED_MASK4 0 +#define DISABLED_MASK4 (DISABLE_PCID) #define DISABLED_MASK5 0 #define DISABLED_MASK6 0 #define DISABLED_MASK7 0 diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 398c79889f5c..1387dafdba2d 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -12,6 +12,7 @@ #include <asm/io.h> #include <asm/swiotlb.h> #include <linux/dma-contiguous.h> +#include <linux/mem_encrypt.h> #ifdef CONFIG_ISA # define ISA_DMA_BIT_MASK DMA_BIT_MASK(24) @@ -57,12 +58,12 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { - return paddr; + return __sme_set(paddr); } static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) { - return daddr; + return __sme_clr(daddr); } #endif /* CONFIG_X86_DMA_REMAP */ diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h index 3c69fed215c5..a8e15b04565b 100644 --- a/arch/x86/include/asm/dmi.h +++ b/arch/x86/include/asm/dmi.h @@ -13,9 +13,9 @@ static __always_inline __init 
void *dmi_alloc(unsigned len) } /* Use early IO mappings for DMI because it's initialized early */ -#define dmi_early_remap early_ioremap -#define dmi_early_unmap early_iounmap -#define dmi_remap ioremap_cache -#define dmi_unmap iounmap +#define dmi_early_remap early_memremap +#define dmi_early_unmap early_memunmap +#define dmi_remap(_x, _l) memremap(_x, _l, MEMREMAP_WB) +#define dmi_unmap(_x) memunmap(_x) #endif /* _ASM_X86_DMI_H */ diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h index a504adc661a4..cd266d830e49 100644 --- a/arch/x86/include/asm/e820/api.h +++ b/arch/x86/include/asm/e820/api.h @@ -39,6 +39,8 @@ extern void e820__setup_pci_gap(void); extern void e820__reallocate_tables(void); extern void e820__register_nosave_regions(unsigned long limit_pfn); +extern int e820__get_entry_type(u64 start, u64 end); + /* * Returns true iff the specified range [start,end) is completely contained inside * the ISA region. diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 1c18d83d3f09..04330c8d9af9 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -126,15 +126,15 @@ do { \ pr_reg[4] = regs->di; \ pr_reg[5] = regs->bp; \ pr_reg[6] = regs->ax; \ - pr_reg[7] = regs->ds & 0xffff; \ - pr_reg[8] = regs->es & 0xffff; \ - pr_reg[9] = regs->fs & 0xffff; \ + pr_reg[7] = regs->ds; \ + pr_reg[8] = regs->es; \ + pr_reg[9] = regs->fs; \ pr_reg[11] = regs->orig_ax; \ pr_reg[12] = regs->ip; \ - pr_reg[13] = regs->cs & 0xffff; \ + pr_reg[13] = regs->cs; \ pr_reg[14] = regs->flags; \ pr_reg[15] = regs->sp; \ - pr_reg[16] = regs->ss & 0xffff; \ + pr_reg[16] = regs->ss; \ } while (0); #define ELF_CORE_COPY_REGS(pr_reg, regs) \ @@ -204,6 +204,7 @@ void set_personality_ia32(bool); #define ELF_CORE_COPY_REGS(pr_reg, regs) \ do { \ + unsigned long base; \ unsigned v; \ (pr_reg)[0] = (regs)->r15; \ (pr_reg)[1] = (regs)->r14; \ @@ -226,8 +227,8 @@ do { \ (pr_reg)[18] = (regs)->flags; \ (pr_reg)[19] = (regs)->sp; \ (pr_reg)[20] = (regs)->ss; \ - (pr_reg)[21] = current->thread.fsbase; \ - (pr_reg)[22] = current->thread.gsbase; \ + rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \ + rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \ asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \ asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ @@ -247,11 +248,11 @@ extern int force_personality32; /* * This is the base location for PIE (ET_DYN with INTERP) loads. On - * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * 64-bit, this is above 4GB to leave the entire 32-bit address * space open for things that want to use the area for 32-bit pointers. */ #define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \ - 0x100000000UL) + (TASK_SIZE / 3 * 2)) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. 
This could be done in user space, @@ -304,8 +305,8 @@ static inline int mmap_is_ia32(void) test_thread_flag(TIF_ADDR32)); } -extern unsigned long tasksize_32bit(void); -extern unsigned long tasksize_64bit(void); +extern unsigned long task_size_32bit(void); +extern unsigned long task_size_64bit(int full_addr_space); extern unsigned long get_mmap_base(int is_legacy); #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 07b06955a05d..aa15d1f7e530 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -13,20 +13,14 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) -BUILD_INTERRUPT3(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR, - smp_irq_move_cleanup_interrupt) -BUILD_INTERRUPT3(reboot_interrupt, REBOOT_VECTOR, smp_reboot_interrupt) +BUILD_INTERRUPT(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR) +BUILD_INTERRUPT(reboot_interrupt, REBOOT_VECTOR) #endif -BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) - #ifdef CONFIG_HAVE_KVM -BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR, - smp_kvm_posted_intr_ipi) -BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR, - smp_kvm_posted_intr_wakeup_ipi) -BUILD_INTERRUPT3(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR, - smp_kvm_posted_intr_nested_ipi) +BUILD_INTERRUPT(kvm_posted_intr_ipi, POSTED_INTR_VECTOR) +BUILD_INTERRUPT(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR) +BUILD_INTERRUPT(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR) #endif /* @@ -41,6 +35,7 @@ BUILD_INTERRUPT3(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR, BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) +BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) #ifdef CONFIG_IRQ_WORK BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR) diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index b65155cc3760..dcd9fb55e679 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -157,6 +157,26 @@ static inline void __set_fixmap(enum fixed_addresses idx, } #endif +/* + * FIXMAP_PAGE_NOCACHE is used for MMIO. Memory encryption is not + * supported for MMIO addresses, so make sure that the memory encryption + * mask is not part of the page attributes. + */ +#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE + +/* + * Early memremap routines used for in-place encryption. The mappings created + * by these routines are intended to be used as temporary mappings. 
+ */ +void __init *early_memremap_encrypted(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_decrypted(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, + unsigned long size); + #include <asm-generic/fixmap.h> #define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 255645f60ca2..554cdb205d17 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -450,10 +450,10 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu) return 0; } -static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate) +static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask) { if (use_xsave()) { - copy_kernel_to_xregs(&fpstate->xsave, -1); + copy_kernel_to_xregs(&fpstate->xsave, mask); } else { if (use_fxsr()) copy_kernel_to_fxregs(&fpstate->fxsave); @@ -477,7 +477,7 @@ static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate) : : [addr] "m" (fpstate)); } - __copy_kernel_to_fpregs(fpstate); + __copy_kernel_to_fpregs(fpstate, -1); } extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index b4c1f5453436..f4dc9b63bdda 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -41,20 +41,11 @@ "+m" (*uaddr), "=&r" (tem) \ : "r" (oparg), "i" (-EFAULT), "1" (0)) -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tem; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -80,30 +71,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index d6dbafbd4207..6dfe366a8804 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -46,26 +46,6 @@ extern asmlinkage void deferred_error_interrupt(void); extern asmlinkage void call_function_interrupt(void); extern asmlinkage void call_function_single_interrupt(void); -#ifdef CONFIG_TRACING -/* Interrupt handlers registered during init_IRQ */ -extern void trace_apic_timer_interrupt(void); -extern void trace_x86_platform_ipi(void); -extern void trace_error_interrupt(void); -extern void trace_irq_work_interrupt(void); -extern void trace_spurious_interrupt(void); -extern void trace_thermal_interrupt(void); -extern void 
trace_reschedule_interrupt(void); -extern void trace_threshold_interrupt(void); -extern void trace_deferred_error_interrupt(void); -extern void trace_call_function_interrupt(void); -extern void trace_call_function_single_interrupt(void); -#define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt -#define trace_reboot_interrupt reboot_interrupt -#define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi -#define trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi -#define trace_kvm_posted_intr_nested_ipi kvm_posted_intr_nested_ipi -#endif /* CONFIG_TRACING */ - #ifdef CONFIG_X86_LOCAL_APIC struct irq_data; struct pci_dev; diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 21126155a739..0ead9dbb9130 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -43,6 +43,9 @@ struct hypervisor_x86 { /* pin current vcpu to specified physical cpu (run rarely) */ void (*pin_vcpu)(int); + + /* called during init_mem_mapping() to setup early mappings. */ + void (*init_mem_mapping)(void); }; extern const struct hypervisor_x86 *x86_hyper; @@ -57,8 +60,15 @@ extern const struct hypervisor_x86 x86_hyper_kvm; extern void init_hypervisor_platform(void); extern bool hypervisor_x2apic_available(void); extern void hypervisor_pin_vcpu(int cpu); + +static inline void hypervisor_init_mem_mapping(void) +{ + if (x86_hyper && x86_hyper->init_mem_mapping) + x86_hyper->init_mem_mapping(); +} #else static inline void init_hypervisor_platform(void) { } static inline bool hypervisor_x2apic_available(void) { return false; } +static inline void hypervisor_init_mem_mapping(void) { } #endif /* CONFIG_HYPERVISOR_GUEST */ #endif /* _ASM_X86_HYPERVISOR_H */ diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 474eb8c66fee..05c4aa00cc86 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -7,6 +7,7 @@ struct x86_mapping_info { unsigned long page_flag; /* page flag for PMD or PUD entry */ unsigned long offset; /* ident mapping offset */ bool direct_gbpages; /* PUD level 1GB page support */ + unsigned long kernpg_flag; /* kernel pagetable flag override */ }; int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h deleted file mode 100644 index 597dc4995678..000000000000 --- a/arch/x86/include/asm/intel_rdt.h +++ /dev/null @@ -1,286 +0,0 @@ -#ifndef _ASM_X86_INTEL_RDT_H -#define _ASM_X86_INTEL_RDT_H - -#ifdef CONFIG_INTEL_RDT_A - -#include <linux/sched.h> -#include <linux/kernfs.h> -#include <linux/jump_label.h> - -#include <asm/intel_rdt_common.h> - -#define IA32_L3_QOS_CFG 0xc81 -#define IA32_L3_CBM_BASE 0xc90 -#define IA32_L2_CBM_BASE 0xd10 -#define IA32_MBA_THRTL_BASE 0xd50 - -#define L3_QOS_CDP_ENABLE 0x01ULL - -/** - * struct rdtgroup - store rdtgroup's data in resctrl file system. 
- * @kn: kernfs node - * @rdtgroup_list: linked list for all rdtgroups - * @closid: closid for this rdtgroup - * @cpu_mask: CPUs assigned to this rdtgroup - * @flags: status bits - * @waitcount: how many cpus expect to find this - * group when they acquire rdtgroup_mutex - */ -struct rdtgroup { - struct kernfs_node *kn; - struct list_head rdtgroup_list; - int closid; - struct cpumask cpu_mask; - int flags; - atomic_t waitcount; -}; - -/* rdtgroup.flags */ -#define RDT_DELETED 1 - -/* rftype.flags */ -#define RFTYPE_FLAGS_CPUS_LIST 1 - -/* List of all resource groups */ -extern struct list_head rdt_all_groups; - -extern int max_name_width, max_data_width; - -int __init rdtgroup_init(void); - -/** - * struct rftype - describe each file in the resctrl file system - * @name: File name - * @mode: Access mode - * @kf_ops: File operations - * @flags: File specific RFTYPE_FLAGS_* flags - * @seq_show: Show content of the file - * @write: Write to the file - */ -struct rftype { - char *name; - umode_t mode; - struct kernfs_ops *kf_ops; - unsigned long flags; - - int (*seq_show)(struct kernfs_open_file *of, - struct seq_file *sf, void *v); - /* - * write() is the generic write callback which maps directly to - * kernfs write operation and overrides all other operations. - * Maximum write size is determined by ->max_write_len. - */ - ssize_t (*write)(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -}; - -/** - * struct rdt_domain - group of cpus sharing an RDT resource - * @list: all instances of this resource - * @id: unique id for this instance - * @cpu_mask: which cpus share this resource - * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) - * @new_ctrl: new ctrl value to be loaded - * @have_new_ctrl: did user provide new_ctrl for this domain - */ -struct rdt_domain { - struct list_head list; - int id; - struct cpumask cpu_mask; - u32 *ctrl_val; - u32 new_ctrl; - bool have_new_ctrl; -}; - -/** - * struct msr_param - set a range of MSRs from a domain - * @res: The resource to use - * @low: Beginning index from base MSR - * @high: End index - */ -struct msr_param { - struct rdt_resource *res; - int low; - int high; -}; - -/** - * struct rdt_cache - Cache allocation related data - * @cbm_len: Length of the cache bit mask - * @min_cbm_bits: Minimum number of consecutive bits to be set - * @cbm_idx_mult: Multiplier of CBM index - * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: - * closid * cbm_idx_multi + cbm_idx_offset - * in a cache bit mask - */ -struct rdt_cache { - unsigned int cbm_len; - unsigned int min_cbm_bits; - unsigned int cbm_idx_mult; - unsigned int cbm_idx_offset; -}; - -/** - * struct rdt_membw - Memory bandwidth allocation related data - * @max_delay: Max throttle delay. Delay is the hardware - * representation for memory bandwidth. 
- * @min_bw: Minimum memory bandwidth percentage user can request - * @bw_gran: Granularity at which the memory bandwidth is allocated - * @delay_linear: True if memory B/W delay is in linear scale - * @mb_map: Mapping of memory B/W percentage to memory B/W delay - */ -struct rdt_membw { - u32 max_delay; - u32 min_bw; - u32 bw_gran; - u32 delay_linear; - u32 *mb_map; -}; - -/** - * struct rdt_resource - attributes of an RDT resource - * @enabled: Is this feature enabled on this machine - * @capable: Is this feature available on this machine - * @name: Name to use in "schemata" file - * @num_closid: Number of CLOSIDs available - * @cache_level: Which cache level defines scope of this resource - * @default_ctrl: Specifies default cache cbm or memory B/W percent. - * @msr_base: Base MSR address for CBMs - * @msr_update: Function pointer to update QOS MSRs - * @data_width: Character width of data when displaying - * @domains: All domains for this resource - * @cache: Cache allocation related data - * @info_files: resctrl info files for the resource - * @nr_info_files: Number of info files - * @format_str: Per resource format string to show domain value - * @parse_ctrlval: Per resource function pointer to parse control values - */ -struct rdt_resource { - bool enabled; - bool capable; - char *name; - int num_closid; - int cache_level; - u32 default_ctrl; - unsigned int msr_base; - void (*msr_update) (struct rdt_domain *d, struct msr_param *m, - struct rdt_resource *r); - int data_width; - struct list_head domains; - struct rdt_cache cache; - struct rdt_membw membw; - struct rftype *info_files; - int nr_info_files; - const char *format_str; - int (*parse_ctrlval) (char *buf, struct rdt_resource *r, - struct rdt_domain *d); -}; - -void rdt_get_cache_infofile(struct rdt_resource *r); -void rdt_get_mba_infofile(struct rdt_resource *r); -int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); -int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); - -extern struct mutex rdtgroup_mutex; - -extern struct rdt_resource rdt_resources_all[]; -extern struct rdtgroup rdtgroup_default; -DECLARE_STATIC_KEY_FALSE(rdt_enable_key); - -int __init rdtgroup_init(void); - -enum { - RDT_RESOURCE_L3, - RDT_RESOURCE_L3DATA, - RDT_RESOURCE_L3CODE, - RDT_RESOURCE_L2, - RDT_RESOURCE_MBA, - - /* Must be the last */ - RDT_NUM_RESOURCES, -}; - -#define for_each_capable_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ - if (r->capable) - -#define for_each_enabled_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ - if (r->enabled) - -/* CPUID.(EAX=10H, ECX=ResID=1).EAX */ -union cpuid_0x10_1_eax { - struct { - unsigned int cbm_len:5; - } split; - unsigned int full; -}; - -/* CPUID.(EAX=10H, ECX=ResID=3).EAX */ -union cpuid_0x10_3_eax { - struct { - unsigned int max_delay:12; - } split; - unsigned int full; -}; - -/* CPUID.(EAX=10H, ECX=ResID).EDX */ -union cpuid_0x10_x_edx { - struct { - unsigned int cos_max:16; - } split; - unsigned int full; -}; - -DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid); - -void rdt_ctrl_update(void *arg); -struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); -void rdtgroup_kn_unlock(struct kernfs_node *kn); -ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -int rdtgroup_schemata_show(struct kernfs_open_file *of, - struct seq_file *s, void *v); - -/* - * intel_rdt_sched_in() - Writes the task's CLOSid to 
IA32_PQR_MSR - * - * Following considerations are made so that this has minimal impact - * on scheduler hot path: - * - This will stay as no-op unless we are running on an Intel SKU - * which supports resource control and we enable by mounting the - * resctrl file system. - * - Caches the per cpu CLOSid values and does the MSR write only - * when a task with a different CLOSid is scheduled in. - * - * Must be called with preemption disabled. - */ -static inline void intel_rdt_sched_in(void) -{ - if (static_branch_likely(&rdt_enable_key)) { - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - int closid; - - /* - * If this task has a closid assigned, use it. - * Else use the closid assigned to this cpu. - */ - closid = current->closid; - if (closid == 0) - closid = this_cpu_read(cpu_closid); - - if (closid != state->closid) { - state->closid = closid; - wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid); - } - } -} - -#else - -static inline void intel_rdt_sched_in(void) {} - -#endif /* CONFIG_INTEL_RDT_A */ -#endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h deleted file mode 100644 index b31081b89407..000000000000 --- a/arch/x86/include/asm/intel_rdt_common.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef _ASM_X86_INTEL_RDT_COMMON_H -#define _ASM_X86_INTEL_RDT_COMMON_H - -#define MSR_IA32_PQR_ASSOC 0x0c8f - -/** - * struct intel_pqr_state - State cache for the PQR MSR - * @rmid: The cached Resource Monitoring ID - * @closid: The cached Class Of Service ID - * @rmid_usecnt: The usage counter for rmid - * - * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the - * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always - * contains both parts, so we need to cache them. - * - * The cache also helps to avoid pointless updates if the value does - * not change. - */ -struct intel_pqr_state { - u32 rmid; - u32 closid; - int rmid_usecnt; -}; - -DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); - -#endif /* _ASM_X86_INTEL_RDT_COMMON_H */ diff --git a/arch/x86/include/asm/intel_rdt_sched.h b/arch/x86/include/asm/intel_rdt_sched.h new file mode 100644 index 000000000000..b4bbf8b21512 --- /dev/null +++ b/arch/x86/include/asm/intel_rdt_sched.h @@ -0,0 +1,92 @@ +#ifndef _ASM_X86_INTEL_RDT_SCHED_H +#define _ASM_X86_INTEL_RDT_SCHED_H + +#ifdef CONFIG_INTEL_RDT + +#include <linux/sched.h> +#include <linux/jump_label.h> + +#define IA32_PQR_ASSOC 0x0c8f + +/** + * struct intel_pqr_state - State cache for the PQR MSR + * @cur_rmid: The cached Resource Monitoring ID + * @cur_closid: The cached Class Of Service ID + * @default_rmid: The user assigned Resource Monitoring ID + * @default_closid: The user assigned cached Class Of Service ID + * + * The upper 32 bits of IA32_PQR_ASSOC contain closid and the + * lower 10 bits rmid. The update to IA32_PQR_ASSOC always + * contains both parts, so we need to cache them. This also + * stores the user configured per cpu CLOSID and RMID. + * + * The cache also helps to avoid pointless updates if the value does + * not change. 
+ */ +struct intel_pqr_state { + u32 cur_rmid; + u32 cur_closid; + u32 default_rmid; + u32 default_closid; +}; + +DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); + +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); +DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); +DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); + +/* + * __intel_rdt_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR + * + * Following considerations are made so that this has minimal impact + * on scheduler hot path: + * - This will stay as no-op unless we are running on an Intel SKU + * which supports resource control or monitoring and we enable by + * mounting the resctrl file system. + * - Caches the per cpu CLOSid/RMID values and does the MSR write only + * when a task with a different CLOSid/RMID is scheduled in. + * - We allocate RMIDs/CLOSids globally in order to keep this as + * simple as possible. + * Must be called with preemption disabled. + */ +static void __intel_rdt_sched_in(void) +{ + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + u32 closid = state->default_closid; + u32 rmid = state->default_rmid; + + /* + * If this task has a closid/rmid assigned, use it. + * Else use the closid/rmid assigned to this cpu. + */ + if (static_branch_likely(&rdt_alloc_enable_key)) { + if (current->closid) + closid = current->closid; + } + + if (static_branch_likely(&rdt_mon_enable_key)) { + if (current->rmid) + rmid = current->rmid; + } + + if (closid != state->cur_closid || rmid != state->cur_rmid) { + state->cur_closid = closid; + state->cur_rmid = rmid; + wrmsr(IA32_PQR_ASSOC, rmid, closid); + } +} + +static inline void intel_rdt_sched_in(void) +{ + if (static_branch_likely(&rdt_enable_key)) + __intel_rdt_sched_in(); +} + +#else + +static inline void intel_rdt_sched_in(void) {} + +#endif /* CONFIG_INTEL_RDT */ + +#endif /* _ASM_X86_INTEL_RDT_SCHED_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 48febf07e828..c40a95c33bb8 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -69,6 +69,9 @@ build_mmio_write(__writeb, "b", unsigned char, "q", ) build_mmio_write(__writew, "w", unsigned short, "r", ) build_mmio_write(__writel, "l", unsigned int, "r", ) +#define readb readb +#define readw readw +#define readl readl #define readb_relaxed(a) __readb(a) #define readw_relaxed(a) __readw(a) #define readl_relaxed(a) __readl(a) @@ -76,6 +79,9 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #define __raw_readw __readw #define __raw_readl __readl +#define writeb writeb +#define writew writew +#define writel writel #define writeb_relaxed(v, a) __writeb(v, a) #define writew_relaxed(v, a) __writew(v, a) #define writel_relaxed(v, a) __writel(v, a) @@ -88,13 +94,15 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #ifdef CONFIG_X86_64 build_mmio_read(readq, "q", unsigned long, "=r", :"memory") +build_mmio_read(__readq, "q", unsigned long, "=r", ) build_mmio_write(writeq, "q", unsigned long, "r", :"memory") +build_mmio_write(__writeq, "q", unsigned long, "r", ) -#define readq_relaxed(a) readq(a) -#define writeq_relaxed(v, a) writeq(v, a) +#define readq_relaxed(a) __readq(a) +#define writeq_relaxed(v, a) __writeq(v, a) -#define __raw_readq(a) readq(a) -#define __raw_writeq(val, addr) writeq(val, addr) +#define __raw_readq __readq +#define __raw_writeq __writeq /* Let people know that we have them */ #define readq readq @@ -119,6 +127,7 @@ static inline phys_addr_t virt_to_phys(volatile void *address) { return __pa(address); } +#define virt_to_phys virt_to_phys /** * 
phys_to_virt - map physical address to virtual @@ -137,6 +146,7 @@ static inline void *phys_to_virt(phys_addr_t address) { return __va(address); } +#define phys_to_virt phys_to_virt /* * Change "struct page" to physical address. @@ -169,11 +179,14 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) * else, you probably want one of the following. */ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +#define ioremap_nocache ioremap_nocache extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); #define ioremap_uc ioremap_uc extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); +#define ioremap_cache ioremap_cache extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); +#define ioremap_prot ioremap_prot /** * ioremap - map bus memory into CPU space @@ -193,8 +206,10 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) { return ioremap_nocache(offset, size); } +#define ioremap ioremap extern void iounmap(volatile void __iomem *addr); +#define iounmap iounmap extern void set_iounmap_nonlazy(void); @@ -203,53 +218,6 @@ extern void set_iounmap_nonlazy(void); #include <asm-generic/iomap.h> /* - * Convert a virtual cached pointer to an uncached pointer - */ -#define xlate_dev_kmem_ptr(p) p - -/** - * memset_io Set a range of I/O memory to a constant value - * @addr: The beginning of the I/O-memory range to set - * @val: The value to set the memory to - * @count: The number of bytes to set - * - * Set a range of I/O memory to a given value. - */ -static inline void -memset_io(volatile void __iomem *addr, unsigned char val, size_t count) -{ - memset((void __force *)addr, val, count); -} - -/** - * memcpy_fromio Copy a block of data from I/O memory - * @dst: The (RAM) destination for the copy - * @src: The (I/O memory) source for the data - * @count: The number of bytes to copy - * - * Copy a block of data from I/O memory. - */ -static inline void -memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count) -{ - memcpy(dst, (const void __force *)src, count); -} - -/** - * memcpy_toio Copy a block of data into I/O memory - * @dst: The (I/O memory) destination for the copy - * @src: The (RAM) source for the data - * @count: The number of bytes to copy - * - * Copy a block of data to I/O memory. - */ -static inline void -memcpy_toio(volatile void __iomem *dst, const void *src, size_t count) -{ - memcpy((void __force *)dst, src, count); -} - -/* * ISA space is 'always mapped' on a typical x86 system, no need to * explicitly ioremap() it. 
The fact that the ISA IO space is mapped * to PAGE_OFFSET is pure coincidence - it does not mean ISA values @@ -341,13 +309,38 @@ BUILDIO(b, b, char) BUILDIO(w, w, short) BUILDIO(l, , int) +#define inb inb +#define inw inw +#define inl inl +#define inb_p inb_p +#define inw_p inw_p +#define inl_p inl_p +#define insb insb +#define insw insw +#define insl insl + +#define outb outb +#define outw outw +#define outl outl +#define outb_p outb_p +#define outw_p outw_p +#define outl_p outl_p +#define outsb outsb +#define outsw outsw +#define outsl outsl + extern void *xlate_dev_mem_ptr(phys_addr_t phys); extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); +#define xlate_dev_mem_ptr xlate_dev_mem_ptr +#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr + extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, enum page_cache_mode pcm); extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); +#define ioremap_wc ioremap_wc extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size); +#define ioremap_wt ioremap_wt extern bool is_early_ioremap_ptep(pte_t *ptep); @@ -365,6 +358,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, #define IO_SPACE_LIMIT 0xffff +#include <asm-generic/io.h> +#undef PCI_IOBASE + #ifdef CONFIG_MTRR extern int __must_check arch_phys_wc_index(int handle); #define arch_phys_wc_index arch_phys_wc_index @@ -381,4 +377,12 @@ extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) #define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc #endif +extern bool arch_memremap_can_ram_remap(resource_size_t offset, + unsigned long size, + unsigned long flags); +#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap + +extern bool phys_mem_access_encrypted(unsigned long phys_addr, + unsigned long size); + #endif /* _ASM_X86_IO_H */ diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 668cca540025..9958ceea2fa3 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -42,10 +42,6 @@ extern bool handle_irq(struct irq_desc *desc, struct pt_regs *regs); extern __visible unsigned int do_IRQ(struct pt_regs *regs); -/* Interrupt vector management */ -extern DECLARE_BITMAP(used_vectors, NR_VECTORS); -extern int vector_used_by_percpu_irq(unsigned int vector); - extern void init_ISA_irqs(void); #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h index f70604125286..ddbb8ea0f5a9 100644 --- a/arch/x86/include/asm/irq_work.h +++ b/arch/x86/include/asm/irq_work.h @@ -3,9 +3,17 @@ #include <asm/cpufeature.h> +#ifdef CONFIG_X86_LOCAL_APIC static inline bool arch_irq_work_has_interrupt(void) { return boot_cpu_has(X86_FEATURE_APIC); } +extern void arch_irq_work_raise(void); +#else +static inline bool arch_irq_work_has_interrupt(void) +{ + return false; +} +#endif #endif /* _ASM_IRQ_WORK_H */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 70ef205489f0..942c1f444da8 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -147,7 +147,8 @@ unsigned long relocate_kernel(unsigned long indirection_page, unsigned long page_list, unsigned long start_address, - unsigned int preserve_context); + unsigned int preserve_context, + unsigned int sme_active); #endif #define ARCH_HAS_KIMAGE_ARCH @@ -207,6 +208,14 @@ struct kexec_entry64_regs { uint64_t r15; uint64_t rip; }; + +extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, + gfp_t gfp); 
+#define arch_kexec_post_alloc_pages arch_kexec_post_alloc_pages + +extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages); +#define arch_kexec_pre_free_pages arch_kexec_pre_free_pages + #endif typedef void crash_vmclear_fn(void); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 87ac4fba6d8e..369e41c23f07 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -492,6 +492,7 @@ struct kvm_vcpu_arch { unsigned long cr4; unsigned long cr4_guest_owned_bits; unsigned long cr8; + u32 pkru; u32 hflags; u64 efer; u64 apic_base; @@ -1078,7 +1079,7 @@ void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, - u64 acc_track_mask); + u64 acc_track_mask, u64 me_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, @@ -1374,8 +1375,6 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); -void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, - unsigned long address); void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h deleted file mode 100644 index 73d0c9b92087..000000000000 --- a/arch/x86/include/asm/lguest.h +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef _ASM_X86_LGUEST_H -#define _ASM_X86_LGUEST_H - -#define GDT_ENTRY_LGUEST_CS 10 -#define GDT_ENTRY_LGUEST_DS 11 -#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8) -#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8) - -#ifndef __ASSEMBLY__ -#include <asm/desc.h> - -#define GUEST_PL 1 - -/* Page for Switcher text itself, then two pages per cpu */ -#define SWITCHER_TEXT_PAGES (1) -#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids) -#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES) - -/* Where we map the Switcher, in both Host and Guest. */ -extern unsigned long switcher_addr; - -/* Found in switcher.S */ -extern unsigned long default_idt_entries[]; - -/* Declarations for definitions in arch/x86/lguest/head_32.S */ -extern char lguest_noirq_iret[]; -extern const char lgstart_cli[], lgend_cli[]; -extern const char lgstart_pushf[], lgend_pushf[]; - -extern void lguest_iret(void); -extern void lguest_init(void); - -struct lguest_regs { - /* Manually saved part. */ - unsigned long eax, ebx, ecx, edx; - unsigned long esi, edi, ebp; - unsigned long gs; - unsigned long fs, ds, es; - unsigned long trapnum, errcode; - /* Trap pushed part */ - unsigned long eip; - unsigned long cs; - unsigned long eflags; - unsigned long esp; - unsigned long ss; -}; - -/* This is a guest-specific page (mapped ro) into the guest. */ -struct lguest_ro_state { - /* Host information we need to restore when we switch back. */ - u32 host_cr3; - struct desc_ptr host_idt_desc; - struct desc_ptr host_gdt_desc; - u32 host_sp; - - /* Fields which are used when guest is running. */ - struct desc_ptr guest_idt_desc; - struct desc_ptr guest_gdt_desc; - struct x86_hw_tss guest_tss; - struct desc_struct guest_idt[IDT_ENTRIES]; - struct desc_struct guest_gdt[GDT_ENTRIES]; -}; - -struct lg_cpu_arch { - /* The GDT entries copied into lguest_ro_state when running. 
*/ - struct desc_struct gdt[GDT_ENTRIES]; - - /* The IDT entries: some copied into lguest_ro_state when running. */ - struct desc_struct idt[IDT_ENTRIES]; - - /* The address of the last guest-visible pagefault (ie. cr2). */ - unsigned long last_pagefault; -}; - -static inline void lguest_set_ts(void) -{ - u32 cr0; - - cr0 = read_cr0(); - if (!(cr0 & 8)) - write_cr0(cr0 | 8); -} - -/* Full 4G segment descriptors, suitable for CS and DS. */ -#define FULL_EXEC_SEGMENT \ - ((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff)) -#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff)) - -#endif /* __ASSEMBLY__ */ - -#endif /* _ASM_X86_LGUEST_H */ diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h deleted file mode 100644 index 6c119cfae218..000000000000 --- a/arch/x86/include/asm/lguest_hcall.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Architecture specific portion of the lguest hypercalls */ -#ifndef _ASM_X86_LGUEST_HCALL_H -#define _ASM_X86_LGUEST_HCALL_H - -#define LHCALL_FLUSH_ASYNC 0 -#define LHCALL_LGUEST_INIT 1 -#define LHCALL_SHUTDOWN 2 -#define LHCALL_NEW_PGTABLE 4 -#define LHCALL_FLUSH_TLB 5 -#define LHCALL_LOAD_IDT_ENTRY 6 -#define LHCALL_SET_STACK 7 -#define LHCALL_SET_CLOCKEVENT 9 -#define LHCALL_HALT 10 -#define LHCALL_SET_PMD 13 -#define LHCALL_SET_PTE 14 -#define LHCALL_SET_PGD 15 -#define LHCALL_LOAD_TLS 16 -#define LHCALL_LOAD_GDT_ENTRY 18 -#define LHCALL_SEND_INTERRUPTS 19 - -#define LGUEST_TRAP_ENTRY 0x1F - -/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */ -#define LGUEST_SHUTDOWN_POWEROFF 1 -#define LGUEST_SHUTDOWN_RESTART 2 - -#ifndef __ASSEMBLY__ -#include <asm/hw_irq.h> - -/*G:030 - * But first, how does our Guest contact the Host to ask for privileged - * operations? There are two ways: the direct way is to make a "hypercall", - * to make requests of the Host Itself. - * - * Our hypercall mechanism uses the highest unused trap code (traps 32 and - * above are used by real hardware interrupts). Seventeen hypercalls are - * available: the hypercall number is put in the %eax register, and the - * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. - * If a return value makes sense, it's returned in %eax. - * - * Grossly invalid calls result in Sudden Death at the hands of the vengeful - * Host, rather than returning failure. This reflects Winston Churchill's - * definition of a gentleman: "someone who is only rude intentionally". - */ -static inline unsigned long -hcall(unsigned long call, - unsigned long arg1, unsigned long arg2, unsigned long arg3, - unsigned long arg4) -{ - /* "int" is the Intel instruction to trigger a trap. */ - asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) - /* The call in %eax (aka "a") might be overwritten */ - : "=a"(call) - /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */ - : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4) - /* "memory" means this might write somewhere in memory. - * This isn't true for all calls, but it's safe to tell - * gcc that it might happen so it doesn't get clever. */ - : "memory"); - return call; -} -/*:*/ - -/* Can't use our min() macro here: needs to be a constant */ -#define LGUEST_IRQS (NR_IRQS < 32 ? 
NR_IRQS: 32) - -#define LHCALL_RING_SIZE 64 -struct hcall_args { - /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */ - unsigned long arg0, arg1, arg2, arg3, arg4; -}; - -#endif /* !__ASSEMBLY__ */ -#endif /* _ASM_X86_LGUEST_HCALL_H */ diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h new file mode 100644 index 000000000000..8e618fcf1f7c --- /dev/null +++ b/arch/x86/include/asm/mem_encrypt.h @@ -0,0 +1,80 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __X86_MEM_ENCRYPT_H__ +#define __X86_MEM_ENCRYPT_H__ + +#ifndef __ASSEMBLY__ + +#include <linux/init.h> + +#include <asm/bootparam.h> + +#ifdef CONFIG_AMD_MEM_ENCRYPT + +extern unsigned long sme_me_mask; + +void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr, + unsigned long decrypted_kernel_vaddr, + unsigned long kernel_len, + unsigned long encryption_wa, + unsigned long encryption_pgd); + +void __init sme_early_encrypt(resource_size_t paddr, + unsigned long size); +void __init sme_early_decrypt(resource_size_t paddr, + unsigned long size); + +void __init sme_map_bootdata(char *real_mode_data); +void __init sme_unmap_bootdata(char *real_mode_data); + +void __init sme_early_init(void); + +void __init sme_encrypt_kernel(void); +void __init sme_enable(struct boot_params *bp); + +/* Architecture __weak replacement functions */ +void __init mem_encrypt_init(void); + +void swiotlb_set_mem_attributes(void *vaddr, unsigned long size); + +#else /* !CONFIG_AMD_MEM_ENCRYPT */ + +#define sme_me_mask 0UL + +static inline void __init sme_early_encrypt(resource_size_t paddr, + unsigned long size) { } +static inline void __init sme_early_decrypt(resource_size_t paddr, + unsigned long size) { } + +static inline void __init sme_map_bootdata(char *real_mode_data) { } +static inline void __init sme_unmap_bootdata(char *real_mode_data) { } + +static inline void __init sme_early_init(void) { } + +static inline void __init sme_encrypt_kernel(void) { } +static inline void __init sme_enable(struct boot_params *bp) { } + +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + +/* + * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when + * writing to or comparing values from the cr3 register. Having the + * encryption mask set in cr3 enables the PGD entry to be encrypted and + * avoid special case handling of PGD allocations. + */ +#define __sme_pa(x) (__pa(x) | sme_me_mask) +#define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask) + +#endif /* __ASSEMBLY__ */ + +#endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 79b647a7ebd0..bb8c597c2248 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -3,12 +3,28 @@ #include <linux/spinlock.h> #include <linux/mutex.h> +#include <linux/atomic.h> /* - * The x86 doesn't have a mmu context, but - * we put the segment information here. + * x86 has arch-specific MMU state beyond what lives in mm_struct. */ typedef struct { + /* + * ctx_id uniquely identifies this mm_struct. A ctx_id will never + * be reused, and zero is not a valid ctx_id. 
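[Editor's aside] The ctx_id/tlb_gen pair introduced here is what the flush code compares against the per-CPU, per-ASID state this series adds to struct tlb_state (see <asm/tlbflush.h> further down in this diff). A hedged sketch of that up-to-date check, not the in-tree code:

/* Sketch: is the currently loaded ASID's TLB already caught up with this mm? */
static bool mm_tlb_is_current(struct mm_struct *mm)
{
	u16 asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);

	return this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) == mm->context.ctx_id &&
	       this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) ==
			atomic64_read(&mm->context.tlb_gen);
}

If the generations differ, the CPU flushes and then records the mm's tlb_gen for that ASID, which is why flushers bump tlb_gen before notifying other CPUs.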
+ */ + u64 ctx_id; + + /* + * Any code that needs to do any sort of TLB flushing for this + * mm will first make its changes to the page tables, then + * increment tlb_gen, then flush. This lets the low-level + * flushing code keep track of what needs flushing. + * + * This is not used on Xen PV. + */ + atomic64_t tlb_gen; + #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; #endif @@ -37,6 +53,11 @@ typedef struct { #endif } mm_context_t; +#define INIT_MM_CONTEXT(mm) \ + .context = { \ + .ctx_id = 1, \ + } + void leave_mm(int cpu); #endif /* _ASM_X86_MMU_H */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 265c907d7d4c..7ae318c340d9 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -12,6 +12,9 @@ #include <asm/tlbflush.h> #include <asm/paravirt.h> #include <asm/mpx.h> + +extern atomic64_t last_mm_ctx_id; + #ifndef CONFIG_PARAVIRT static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) @@ -125,13 +128,18 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) - this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); + int cpu = smp_processor_id(); + + if (cpumask_test_cpu(cpu, mm_cpumask(mm))) + cpumask_clear_cpu(cpu, mm_cpumask(mm)); } static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { + mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); + atomic64_set(&mm->context.tlb_gen, 0); + #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { /* pkey 0 is the default and always allocated */ @@ -140,9 +148,7 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.execute_only_pkey = -1; } #endif - init_new_context_ldt(tsk, mm); - - return 0; + return init_new_context_ldt(tsk, mm); } static inline void destroy_context(struct mm_struct *mm) { @@ -292,6 +298,9 @@ static inline unsigned long __get_current_cr3_fast(void) { unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd); + if (static_cpu_has(X86_FEATURE_PCID)) + cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid); + /* For now, be very restrictive about when this can be called. 
*/ VM_WARN_ON(in_nmi() || preemptible()); diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index e3b7819caeef..9eb7c718aaf8 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -2,6 +2,15 @@ #define _ASM_X86_MODULE_H #include <asm-generic/module.h> +#include <asm/orc_types.h> + +struct mod_arch_specific { +#ifdef CONFIG_ORC_UNWINDER + unsigned int num_orcs; + int *orc_unwind_ip; + struct orc_entry *orc_unwind; +#endif +}; #ifdef CONFIG_X86_64 /* X86_64 does not define MODULE_PROC_FAMILY */ diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h index a0d662be4c5b..7d7404756bb4 100644 --- a/arch/x86/include/asm/mpx.h +++ b/arch/x86/include/asm/mpx.h @@ -73,6 +73,9 @@ static inline void mpx_mm_init(struct mm_struct *mm) } void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start, unsigned long end); + +unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len, + unsigned long flags); #else static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) { @@ -94,6 +97,12 @@ static inline void mpx_notify_unmap(struct mm_struct *mm, unsigned long start, unsigned long end) { } + +static inline unsigned long mpx_unmapped_area_check(unsigned long addr, + unsigned long len, unsigned long flags) +{ + return addr; +} #endif /* CONFIG_X86_INTEL_MPX */ #endif /* _ASM_X86_MPX_H */ diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 2b58c8c1eeaa..63cc96f064dc 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -3,6 +3,8 @@ #include <linux/types.h> #include <linux/atomic.h> +#include <linux/nmi.h> +#include <asm/io.h> #include <asm/hyperv.h> /* @@ -28,6 +30,8 @@ struct ms_hyperv_info { u32 features; u32 misc_features; u32 hints; + u32 max_vp_index; + u32 max_lp_index; }; extern struct ms_hyperv_info ms_hyperv; @@ -168,12 +172,155 @@ void hv_remove_crash_handler(void); #if IS_ENABLED(CONFIG_HYPERV) extern struct clocksource *hyperv_cs; +extern void *hv_hypercall_pg; + +static inline u64 hv_do_hypercall(u64 control, void *input, void *output) +{ + u64 input_address = input ? virt_to_phys(input) : 0; + u64 output_address = output ? 
virt_to_phys(output) : 0; + u64 hv_status; + register void *__sp asm(_ASM_SP); + +#ifdef CONFIG_X86_64 + if (!hv_hypercall_pg) + return U64_MAX; + + __asm__ __volatile__("mov %4, %%r8\n" + "call *%5" + : "=a" (hv_status), "+r" (__sp), + "+c" (control), "+d" (input_address) + : "r" (output_address), "m" (hv_hypercall_pg) + : "cc", "memory", "r8", "r9", "r10", "r11"); +#else + u32 input_address_hi = upper_32_bits(input_address); + u32 input_address_lo = lower_32_bits(input_address); + u32 output_address_hi = upper_32_bits(output_address); + u32 output_address_lo = lower_32_bits(output_address); + + if (!hv_hypercall_pg) + return U64_MAX; + + __asm__ __volatile__("call *%7" + : "=A" (hv_status), + "+c" (input_address_lo), "+r" (__sp) + : "A" (control), + "b" (input_address_hi), + "D"(output_address_hi), "S"(output_address_lo), + "m" (hv_hypercall_pg) + : "cc", "memory"); +#endif /* !x86_64 */ + return hv_status; +} + +#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0) +#define HV_HYPERCALL_FAST_BIT BIT(16) +#define HV_HYPERCALL_VARHEAD_OFFSET 17 +#define HV_HYPERCALL_REP_COMP_OFFSET 32 +#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32) +#define HV_HYPERCALL_REP_START_OFFSET 48 +#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48) + +/* Fast hypercall with 8 bytes of input and no output */ +static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1) +{ + u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT; + register void *__sp asm(_ASM_SP); + +#ifdef CONFIG_X86_64 + { + __asm__ __volatile__("call *%4" + : "=a" (hv_status), "+r" (__sp), + "+c" (control), "+d" (input1) + : "m" (hv_hypercall_pg) + : "cc", "r8", "r9", "r10", "r11"); + } +#else + { + u32 input1_hi = upper_32_bits(input1); + u32 input1_lo = lower_32_bits(input1); + + __asm__ __volatile__ ("call *%5" + : "=A"(hv_status), + "+c"(input1_lo), + "+r"(__sp) + : "A" (control), + "b" (input1_hi), + "m" (hv_hypercall_pg) + : "cc", "edi", "esi"); + } +#endif + return hv_status; +} + +/* + * Rep hypercalls. Callers of this functions are supposed to ensure that + * rep_count and varhead_size comply with Hyper-V hypercall definition. + */ +static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, + void *input, void *output) +{ + u64 control = code; + u64 status; + u16 rep_comp; + + control |= (u64)varhead_size << HV_HYPERCALL_VARHEAD_OFFSET; + control |= (u64)rep_count << HV_HYPERCALL_REP_COMP_OFFSET; + + do { + status = hv_do_hypercall(control, input, output); + if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) + return status; + + /* Bits 32-43 of status have 'Reps completed' data. */ + rep_comp = (status & HV_HYPERCALL_REP_COMP_MASK) >> + HV_HYPERCALL_REP_COMP_OFFSET; + + control &= ~HV_HYPERCALL_REP_START_MASK; + control |= (u64)rep_comp << HV_HYPERCALL_REP_START_OFFSET; + + touch_nmi_watchdog(); + } while (rep_comp < rep_count); + + return status; +} + +/* + * Hypervisor's notion of virtual processor ID is different from + * Linux' notion of CPU ID. This information can only be retrieved + * in the context of the calling CPU. Setup a map for easy access + * to this information. + */ +extern u32 *hv_vp_index; + +/** + * hv_cpu_number_to_vp_number() - Map CPU to VP. + * @cpu_number: CPU number in Linux terms + * + * This function returns the mapping between the Linux processor + * number and the hypervisor's virtual processor number, useful + * in making hypercalls and such that talk about specific + * processors. 
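[Editor's aside] The wrappers above all return the raw 64-bit hypercall status word; per the masks defined above, only the low 16 bits carry the result code. A hedged usage sketch (the control value and error mapping are placeholders, not a real Hyper-V call site):

/* Sketch: issue a hypercall and reduce the status word to success/failure. */
static int hv_do_simple_call(u64 control, void *input)
{
	u64 status = hv_do_hypercall(control, input, NULL);

	return (status & HV_HYPERCALL_RESULT_MASK) == HV_STATUS_SUCCESS ? 0 : -EIO;
}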
+ * + * Return: Virtual processor number in Hyper-V terms + */ +static inline int hv_cpu_number_to_vp_number(int cpu_number) +{ + return hv_vp_index[cpu_number]; +} void hyperv_init(void); +void hyperv_setup_mmu_ops(void); +void hyper_alloc_mmu(void); void hyperv_report_panic(struct pt_regs *regs); bool hv_is_hypercall_page_setup(void); void hyperv_cleanup(void); -#endif +#else /* CONFIG_HYPERV */ +static inline void hyperv_init(void) {} +static inline bool hv_is_hypercall_page_setup(void) { return false; } +static inline void hyperv_cleanup(void) {} +static inline void hyperv_setup_mmu_ops(void) {} +#endif /* CONFIG_HYPERV */ + #ifdef CONFIG_HYPERV_TSCPAGE struct ms_hyperv_tsc_page *hv_get_tsc_page(void); static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 5573c75f8e4c..17f5c12e1afd 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -356,6 +356,8 @@ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d #define MSR_K8_SYSCFG 0xc0010010 +#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23 +#define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT) #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 diff --git a/arch/x86/include/asm/orc_lookup.h b/arch/x86/include/asm/orc_lookup.h new file mode 100644 index 000000000000..91c8d868424d --- /dev/null +++ b/arch/x86/include/asm/orc_lookup.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ +#ifndef _ORC_LOOKUP_H +#define _ORC_LOOKUP_H + +/* + * This is a lookup table for speeding up access to the .orc_unwind table. + * Given an input address offset, the corresponding lookup table entry + * specifies a subset of the .orc_unwind table to search. + * + * Each block represents the end of the previous range and the start of the + * next range. An extra block is added to give the last range an end. + * + * The block size should be a power of 2 to avoid a costly 'div' instruction. + * + * A block size of 256 was chosen because it roughly doubles unwinder + * performance while only adding ~5% to the ORC data footprint. 
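[Editor's aside] The lookup table described above trades a little memory for a bounded search: with 256-byte blocks, the block index for a text address selects a small slice of .orc_unwind to binary-search. A hedged sketch of the indexing step, using the LOOKUP_* macros and orc_lookup[] array defined just below (the real lookup lives in the unwinder, and bounds checks are omitted here):

/* Sketch: map an instruction pointer to a narrow .orc_unwind search window. */
static void orc_lookup_window(unsigned long ip,
			      unsigned int *start, unsigned int *stop)
{
	unsigned long idx = (ip - LOOKUP_START_IP) >> LOOKUP_BLOCK_ORDER;

	*start = orc_lookup[idx];	/* first candidate entry for this block */
	*stop  = orc_lookup[idx + 1];	/* the extra trailing block terminates the range */
}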
+ */ +#define LOOKUP_BLOCK_ORDER 8 +#define LOOKUP_BLOCK_SIZE (1 << LOOKUP_BLOCK_ORDER) + +#ifndef LINKER_SCRIPT + +extern unsigned int orc_lookup[]; +extern unsigned int orc_lookup_end[]; + +#define LOOKUP_START_IP (unsigned long)_stext +#define LOOKUP_STOP_IP (unsigned long)_etext + +#endif /* LINKER_SCRIPT */ + +#endif /* _ORC_LOOKUP_H */ diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h new file mode 100644 index 000000000000..9c9dc579bd7d --- /dev/null +++ b/arch/x86/include/asm/orc_types.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _ORC_TYPES_H +#define _ORC_TYPES_H + +#include <linux/types.h> +#include <linux/compiler.h> + +/* + * The ORC_REG_* registers are base registers which are used to find other + * registers on the stack. + * + * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the + * address of the previous frame: the caller's SP before it called the current + * function. + * + * ORC_REG_UNDEFINED means the corresponding register's value didn't change in + * the current frame. + * + * The most commonly used base registers are SP and BP -- which the previous SP + * is usually based on -- and PREV_SP and UNDEFINED -- which the previous BP is + * usually based on. + * + * The rest of the base registers are needed for special cases like entry code + * and GCC realigned stacks. + */ +#define ORC_REG_UNDEFINED 0 +#define ORC_REG_PREV_SP 1 +#define ORC_REG_DX 2 +#define ORC_REG_DI 3 +#define ORC_REG_BP 4 +#define ORC_REG_SP 5 +#define ORC_REG_R10 6 +#define ORC_REG_R13 7 +#define ORC_REG_BP_INDIRECT 8 +#define ORC_REG_SP_INDIRECT 9 +#define ORC_REG_MAX 15 + +/* + * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the + * caller's SP right before it made the call). Used for all callable + * functions, i.e. all C code and all callable asm functions. + * + * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points + * to a fully populated pt_regs from a syscall, interrupt, or exception. + * + * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset + * points to the iret return frame. + * + * The UNWIND_HINT macros are used only for the unwind_hint struct. They + * aren't used in struct orc_entry due to size and complexity constraints. + * Objtool converts them to real types when it converts the hints to orc + * entries. + */ +#define ORC_TYPE_CALL 0 +#define ORC_TYPE_REGS 1 +#define ORC_TYPE_REGS_IRET 2 +#define UNWIND_HINT_TYPE_SAVE 3 +#define UNWIND_HINT_TYPE_RESTORE 4 + +#ifndef __ASSEMBLY__ +/* + * This struct is more or less a vastly simplified version of the DWARF Call + * Frame Information standard. It contains only the necessary parts of DWARF + * CFI, simplified for ease of access by the in-kernel unwinder. 
It tells the + * unwinder how to find the previous SP and BP (and sometimes entry regs) on + * the stack for a given code address. Each instance of the struct corresponds + * to one or more code locations. + */ +struct orc_entry { + s16 sp_offset; + s16 bp_offset; + unsigned sp_reg:4; + unsigned bp_reg:4; + unsigned type:2; +} __packed; + +/* + * This struct is used by asm and inline asm code to manually annotate the + * location of registers on the stack for the ORC unwinder. + * + * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*. + */ +struct unwind_hint { + u32 ip; + s16 sp_offset; + u8 sp_reg; + u8 type; +}; +#endif /* __ASSEMBLY__ */ + +#endif /* _ORC_TYPES_H */ diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index b4a0d43248cf..b50df06ad251 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -51,6 +51,10 @@ static inline void clear_page(void *page) void copy_page(void *to, void *from); +#ifdef CONFIG_X86_MCE +#define arch_unmap_kpfn arch_unmap_kpfn +#endif + #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_VSYSCALL_EMULATION diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 7bd0099384ca..b98ed9d14630 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -3,6 +3,7 @@ #include <linux/const.h> #include <linux/types.h> +#include <linux/mem_encrypt.h> /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 @@ -15,7 +16,7 @@ #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) -#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) +#define __PHYSICAL_MASK ((phys_addr_t)(__sme_clr((1ULL << __PHYSICAL_MASK_SHIFT) - 1))) #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) /* Cast *PAGE_MASK to a signed type so that it is sign-extended if diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 9ccac1926587..c25dd22f7c70 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -960,11 +960,6 @@ extern void default_banner(void); #define GET_CR2_INTO_RAX \ call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2) -#define PARAVIRT_ADJUST_EXCEPTION_FRAME \ - PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \ - CLBR_NONE, \ - call PARA_INDIRECT(pv_irq_ops+PV_IRQ_adjust_exception_frame)) - #define USERGS_SYSRET64 \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ CLBR_NONE, \ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 9ffc36bfe4cd..6b64fc6367f2 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -196,9 +196,6 @@ struct pv_irq_ops { void (*safe_halt)(void); void (*halt)(void); -#ifdef CONFIG_X86_64 - void (*adjust_exception_frame)(void); -#endif } __no_randomize_layout; struct pv_mmu_ops { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 77037b6f1caa..bbeae4a2bd01 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_PGTABLE_H #define _ASM_X86_PGTABLE_H +#include <linux/mem_encrypt.h> #include <asm/page.h> #include <asm/pgtable_types.h> @@ -13,9 +14,18 @@ cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \ : (prot)) +/* + * Macros to add or remove encryption attribute + */ +#define pgprot_encrypted(prot) __pgprot(__sme_set(pgprot_val(prot))) +#define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot))) + #ifndef 
__ASSEMBLY__ #include <asm/x86_init.h> +extern pgd_t early_top_pgt[PTRS_PER_PGD]; +int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); + void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); void ptdump_walk_pgd_level_checkwx(void); @@ -38,6 +48,8 @@ extern struct list_head pgd_list; extern struct mm_struct *pgd_page_get_mm(struct page *page); +extern pmdval_t early_pmd_flags; + #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else /* !CONFIG_PARAVIRT */ @@ -195,6 +207,11 @@ static inline unsigned long p4d_pfn(p4d_t p4d) return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT; } +static inline unsigned long pgd_pfn(pgd_t pgd) +{ + return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT; +} + static inline int p4d_large(p4d_t p4d) { /* No 512 GiB pages yet */ @@ -704,8 +721,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pmd_page(pmd) \ - pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT) +#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) /* * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] @@ -773,8 +789,7 @@ static inline unsigned long pud_page_vaddr(pud_t pud) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pud_page(pud) \ - pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT) +#define pud_page(pud) pfn_to_page(pud_pfn(pud)) /* Find an entry in the second-level page table.. */ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) @@ -824,8 +839,7 @@ static inline unsigned long p4d_page_vaddr(p4d_t p4d) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define p4d_page(p4d) \ - pfn_to_page((p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT) +#define p4d_page(p4d) pfn_to_page(p4d_pfn(p4d)) /* Find an entry in the third-level page table.. */ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) @@ -859,7 +873,7 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) +#define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd)) /* to find an entry in a page-table-directory. */ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index bf9638e1ee42..399261ce904c 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -2,6 +2,8 @@ #define _ASM_X86_PGTABLE_DEFS_H #include <linux/const.h> +#include <linux/mem_encrypt.h> + #include <asm/page_types.h> #define FIRST_USER_ADDRESS 0UL @@ -121,10 +123,10 @@ #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ - _PAGE_ACCESSED | _PAGE_DIRTY) -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ - _PAGE_DIRTY) +#define _PAGE_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\ + _PAGE_ACCESSED | _PAGE_DIRTY) +#define _KERNPG_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | \ + _PAGE_ACCESSED | _PAGE_DIRTY) /* * Set of bits not changed in pte_modify. 
The pte's @@ -159,6 +161,7 @@ enum page_cache_mode { #define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) +#define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP)) #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ @@ -187,22 +190,42 @@ enum page_cache_mode { #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) +#define __PAGE_KERNEL_WP (__PAGE_KERNEL | _PAGE_CACHE_WP) #define __PAGE_KERNEL_IO (__PAGE_KERNEL) #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) -#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) -#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) -#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) -#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) -#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) -#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) -#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) +#ifndef __ASSEMBLY__ + +#define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) + +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC) +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ + _PAGE_DIRTY | _PAGE_ENC) + +#define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC) +#define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _PAGE_ENC) + +#define __PAGE_KERNEL_NOENC (__PAGE_KERNEL) +#define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP) + +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL | _PAGE_ENC) +#define PAGE_KERNEL_NOENC __pgprot(__PAGE_KERNEL) +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC) +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_EXEC_NOENC __pgprot(__PAGE_KERNEL_EXEC) +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX | _PAGE_ENC) +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC) +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC) +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL | _PAGE_ENC) +#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC) + +#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) +#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) -#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) -#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) +#endif /* __ASSEMBLY__ */ /* xwr */ #define __P000 PAGE_NONE @@ -287,6 +310,11 @@ static inline p4dval_t native_p4d_val(p4d_t p4d) #else #include <asm-generic/pgtable-nop4d.h> +static inline p4d_t native_make_p4d(pudval_t val) +{ + return (p4d_t) { .pgd = native_make_pgd((pgdval_t)val) }; +} + static inline p4dval_t native_p4d_val(p4d_t p4d) { return native_pgd_val(p4d.pgd); diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 79aa2f98398d..dc723b64acf0 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -2,6 +2,7 @@ #define _ASM_X86_PROCESSOR_FLAGS_H #include <uapi/asm/processor-flags.h> +#include <linux/mem_encrypt.h> #ifdef CONFIG_VM86 #define X86_VM_MASK X86_EFLAGS_VM @@ -32,16 +33,18 @@ * CR3_ADDR_MASK is the mask used by 
read_cr3_pa(). */ #ifdef CONFIG_X86_64 -/* Mask off the address space ID bits. */ -#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull -#define CR3_PCID_MASK 0xFFFull +/* Mask off the address space ID and SME encryption bits. */ +#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull) +#define CR3_PCID_MASK 0xFFFull +#define CR3_NOFLUSH BIT_ULL(63) #else /* * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save * a tiny bit of code size by setting all the bits. */ -#define CR3_ADDR_MASK 0xFFFFFFFFull -#define CR3_PCID_MASK 0ull +#define CR3_ADDR_MASK 0xFFFFFFFFull +#define CR3_PCID_MASK 0ull +#define CR3_NOFLUSH 0 #endif #endif /* _ASM_X86_PROCESSOR_FLAGS_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 028245e1c42b..3fa26a61eabc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -22,6 +22,7 @@ struct vm86; #include <asm/nops.h> #include <asm/special_insns.h> #include <asm/fpu/types.h> +#include <asm/unwind_hints.h> #include <linux/personality.h> #include <linux/cache.h> @@ -29,6 +30,7 @@ struct vm86; #include <linux/math64.h> #include <linux/err.h> #include <linux/irqflags.h> +#include <linux/mem_encrypt.h> /* * We handle most unaligned accesses in hardware. On the other hand @@ -239,9 +241,14 @@ static inline unsigned long read_cr3_pa(void) return __read_cr3() & CR3_ADDR_MASK; } +static inline unsigned long native_read_cr3_pa(void) +{ + return __native_read_cr3() & CR3_ADDR_MASK; +} + static inline void load_cr3(pgd_t *pgdir) { - write_cr3(__pa(pgdir)); + write_cr3(__sme_pa(pgdir)); } #ifdef CONFIG_X86_32 @@ -661,7 +668,7 @@ static inline void sync_core(void) * In case NMI unmasking or performance ever becomes a problem, * the next best option appears to be MOV-to-CR2 and an * unconditional jump. That sequence also works on all CPUs, - * but it will fault at CPL3 (i.e. Xen PV and lguest). + * but it will fault at CPL3 (i.e. Xen PV). * * CPUID is the conventional way, but it's nasty: it doesn't * exist on some 486-like CPUs, and it usually exits to a @@ -684,6 +691,7 @@ static inline void sync_core(void) unsigned int tmp; asm volatile ( + UNWIND_HINT_SAVE "mov %%ss, %0\n\t" "pushq %q0\n\t" "pushq %%rsp\n\t" @@ -693,6 +701,7 @@ static inline void sync_core(void) "pushq %q0\n\t" "pushq $1f\n\t" "iretq\n\t" + UNWIND_HINT_RESTORE "1:" : "=&r" (tmp), "+r" (__sp) : : "cc", "memory"); #endif @@ -802,7 +811,9 @@ static inline void spin_lock_prefetch(const void *x) */ #define IA32_PAGE_OFFSET PAGE_OFFSET #define TASK_SIZE PAGE_OFFSET +#define TASK_SIZE_LOW TASK_SIZE #define TASK_SIZE_MAX TASK_SIZE +#define DEFAULT_MAP_WINDOW TASK_SIZE #define STACK_TOP TASK_SIZE #define STACK_TOP_MAX STACK_TOP @@ -842,7 +853,9 @@ static inline void spin_lock_prefetch(const void *x) * particular problem by preventing anything from being mapped * at the maximum canonical address. */ -#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) +#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) + +#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) /* This decides where the kernel will search for a free chunk of vm * space during mmap's. @@ -850,12 +863,14 @@ static inline void spin_lock_prefetch(const void *x) #define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ 0xc0000000 : 0xFFFFe000) +#define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \ + IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW) #define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? 
\ IA32_PAGE_OFFSET : TASK_SIZE_MAX) #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \ IA32_PAGE_OFFSET : TASK_SIZE_MAX) -#define STACK_TOP TASK_SIZE +#define STACK_TOP TASK_SIZE_LOW #define STACK_TOP_MAX TASK_SIZE_MAX #define INIT_THREAD { \ @@ -876,7 +891,7 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, * space during mmap's. */ #define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3)) -#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE) +#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW) #define KSTK_EIP(task) (task_pt_regs(task)->ip) diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 8d3964fc5f91..b408b1886195 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -24,6 +24,9 @@ void entry_SYSENTER_compat(void); void __end_entry_SYSENTER_compat(void); void entry_SYSCALL_compat(void); void entry_INT80_compat(void); +#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) +void xen_entry_INT80_compat(void); +#endif #endif void x86_configure_nx(void); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 2b5d686ea9f3..91c04c8e67fa 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -9,6 +9,20 @@ #ifdef __i386__ struct pt_regs { + /* + * NB: 32-bit x86 CPUs are inconsistent as what happens in the + * following cases (where %seg represents a segment register): + * + * - pushl %seg: some do a 16-bit write and leave the high + * bits alone + * - movl %seg, [mem]: some do a 16-bit write despite the movl + * - IDT entry: some (e.g. 486) will leave the high bits of CS + * and (if applicable) SS undefined. + * + * Fortunately, x86-32 doesn't read the high bits on POP or IRET, + * so we can just treat all of the segment registers as 16-bit + * values. + */ unsigned long bx; unsigned long cx; unsigned long dx; @@ -16,16 +30,22 @@ struct pt_regs { unsigned long di; unsigned long bp; unsigned long ax; - unsigned long ds; - unsigned long es; - unsigned long fs; - unsigned long gs; + unsigned short ds; + unsigned short __dsh; + unsigned short es; + unsigned short __esh; + unsigned short fs; + unsigned short __fsh; + unsigned short gs; + unsigned short __gsh; unsigned long orig_ax; unsigned long ip; - unsigned long cs; + unsigned short cs; + unsigned short __csh; unsigned long flags; unsigned long sp; - unsigned long ss; + unsigned short ss; + unsigned short __ssh; }; #else /* __i386__ */ @@ -176,6 +196,17 @@ static inline unsigned long regs_get_register(struct pt_regs *regs, if (offset == offsetof(struct pt_regs, sp) && regs->cs == __KERNEL_CS) return kernel_stack_pointer(regs); + + /* The selector fields are 16-bit. */ + if (offset == offsetof(struct pt_regs, cs) || + offset == offsetof(struct pt_regs, ss) || + offset == offsetof(struct pt_regs, ds) || + offset == offsetof(struct pt_regs, es) || + offset == offsetof(struct pt_regs, fs) || + offset == offsetof(struct pt_regs, gs)) { + return *(u16 *)((unsigned long)regs + offset); + + } #endif return *(unsigned long *)((unsigned long)regs + offset); } diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 230e1903acf0..90d91520c13a 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -1,6 +1,15 @@ #ifndef _ARCH_X86_REALMODE_H #define _ARCH_X86_REALMODE_H +/* + * Flag bit definitions for use with the flags field of the trampoline header + * in the CONFIG_X86_64 variant. 
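[Editor's aside] The new flags word lets the 64-bit boot path tell the real-mode trampoline whether SME is active. A hedged sketch of how the flag defined just below would be set when the trampoline header is populated; the real code lives in the realmode setup, and sme_me_mask comes from the <asm/mem_encrypt.h> added earlier in this series:

/* Sketch: advertise SME to the trampoline when the encryption mask is non-zero. */
static void setup_trampoline_flags(struct trampoline_header *th)
{
	th->flags = 0;
	if (sme_me_mask)
		th->flags |= TH_FLAGS_SME_ACTIVE;
}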
+ */ +#define TH_FLAGS_SME_ACTIVE_BIT 0 +#define TH_FLAGS_SME_ACTIVE BIT(TH_FLAGS_SME_ACTIVE_BIT) + +#ifndef __ASSEMBLY__ + #include <linux/types.h> #include <asm/io.h> @@ -38,6 +47,7 @@ struct trampoline_header { u64 start; u64 efer; u32 cr4; + u32 flags; #endif }; @@ -69,4 +79,6 @@ static inline size_t real_mode_size_needed(void) void set_real_mode_mem(phys_addr_t mem, size_t size); void reserve_real_mode(void); +#endif /* __ASSEMBLY__ */ + #endif /* _ARCH_X86_REALMODE_H */ diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h new file mode 100644 index 000000000000..ff871210b9f2 --- /dev/null +++ b/arch/x86/include/asm/refcount.h @@ -0,0 +1,109 @@ +#ifndef __ASM_X86_REFCOUNT_H +#define __ASM_X86_REFCOUNT_H +/* + * x86-specific implementation of refcount_t. Based on PAX_REFCOUNT from + * PaX/grsecurity. + */ +#include <linux/refcount.h> + +/* + * This is the first portion of the refcount error handling, which lives in + * .text.unlikely, and is jumped to from the CPU flag check (in the + * following macros). This saves the refcount value location into CX for + * the exception handler to use (in mm/extable.c), and then triggers the + * central refcount exception. The fixup address for the exception points + * back to the regular execution flow in .text. + */ +#define _REFCOUNT_EXCEPTION \ + ".pushsection .text.unlikely\n" \ + "111:\tlea %[counter], %%" _ASM_CX "\n" \ + "112:\t" ASM_UD0 "\n" \ + ASM_UNREACHABLE \ + ".popsection\n" \ + "113:\n" \ + _ASM_EXTABLE_REFCOUNT(112b, 113b) + +/* Trigger refcount exception if refcount result is negative. */ +#define REFCOUNT_CHECK_LT_ZERO \ + "js 111f\n\t" \ + _REFCOUNT_EXCEPTION + +/* Trigger refcount exception if refcount result is zero or negative. */ +#define REFCOUNT_CHECK_LE_ZERO \ + "jz 111f\n\t" \ + REFCOUNT_CHECK_LT_ZERO + +/* Trigger refcount exception unconditionally. */ +#define REFCOUNT_ERROR \ + "jmp 111f\n\t" \ + _REFCOUNT_EXCEPTION + +static __always_inline void refcount_add(unsigned int i, refcount_t *r) +{ + asm volatile(LOCK_PREFIX "addl %1,%0\n\t" + REFCOUNT_CHECK_LT_ZERO + : [counter] "+m" (r->refs.counter) + : "ir" (i) + : "cc", "cx"); +} + +static __always_inline void refcount_inc(refcount_t *r) +{ + asm volatile(LOCK_PREFIX "incl %0\n\t" + REFCOUNT_CHECK_LT_ZERO + : [counter] "+m" (r->refs.counter) + : : "cc", "cx"); +} + +static __always_inline void refcount_dec(refcount_t *r) +{ + asm volatile(LOCK_PREFIX "decl %0\n\t" + REFCOUNT_CHECK_LE_ZERO + : [counter] "+m" (r->refs.counter) + : : "cc", "cx"); +} + +static __always_inline __must_check +bool refcount_sub_and_test(unsigned int i, refcount_t *r) +{ + GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", REFCOUNT_CHECK_LT_ZERO, + r->refs.counter, "er", i, "%0", e); +} + +static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r) +{ + GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", REFCOUNT_CHECK_LT_ZERO, + r->refs.counter, "%0", e); +} + +static __always_inline __must_check +bool refcount_add_not_zero(unsigned int i, refcount_t *r) +{ + int c, result; + + c = atomic_read(&(r->refs)); + do { + if (unlikely(c == 0)) + return false; + + result = c + i; + + /* Did we try to increment from/to an undesirable state? 
*/ + if (unlikely(c < 0 || c == INT_MAX || result < c)) { + asm volatile(REFCOUNT_ERROR + : : [counter] "m" (r->refs.counter) + : "cc", "cx"); + break; + } + + } while (!atomic_try_cmpxchg(&(r->refs), &c, result)); + + return c != 0; +} + +static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r) +{ + return refcount_add_not_zero(1, r); +} + +#endif diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 661dd305694a..045f99211a99 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -1,45 +1,56 @@ #ifndef _ASM_X86_RMWcc #define _ASM_X86_RMWcc +#define __CLOBBERS_MEM "memory" +#define __CLOBBERS_MEM_CC_CX "memory", "cc", "cx" + #if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO) /* Use asm goto */ -#define __GEN_RMWcc(fullop, var, cc, ...) \ +#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \ do { \ asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \ - : : "m" (var), ## __VA_ARGS__ \ - : "memory" : cc_label); \ + : : [counter] "m" (var), ## __VA_ARGS__ \ + : clobbers : cc_label); \ return 0; \ cc_label: \ return 1; \ } while (0) -#define GEN_UNARY_RMWcc(op, var, arg0, cc) \ - __GEN_RMWcc(op " " arg0, var, cc) +#define __BINARY_RMWcc_ARG " %1, " -#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ - __GEN_RMWcc(op " %1, " arg0, var, cc, vcon (val)) #else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ /* Use flags output or a set instruction */ -#define __GEN_RMWcc(fullop, var, cc, ...) \ +#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \ do { \ bool c; \ asm volatile (fullop ";" CC_SET(cc) \ - : "+m" (var), CC_OUT(cc) (c) \ - : __VA_ARGS__ : "memory"); \ + : [counter] "+m" (var), CC_OUT(cc) (c) \ + : __VA_ARGS__ : clobbers); \ return c; \ } while (0) +#define __BINARY_RMWcc_ARG " %2, " + +#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ + #define GEN_UNARY_RMWcc(op, var, arg0, cc) \ - __GEN_RMWcc(op " " arg0, var, cc) + __GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM) + +#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, arg0, cc) \ + __GEN_RMWcc(op " " arg0 "\n\t" suffix, var, cc, \ + __CLOBBERS_MEM_CC_CX) #define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ - __GEN_RMWcc(op " %2, " arg0, var, cc, vcon (val)) + __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0, var, cc, \ + __CLOBBERS_MEM, vcon (val)) -#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ +#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, vcon, val, arg0, cc) \ + __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0 "\n\t" suffix, var, cc, \ + __CLOBBERS_MEM_CC_CX, vcon (val)) #endif /* _ASM_X86_RMWcc */ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 1549caa098f0..066aaf813141 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -238,9 +238,7 @@ #ifndef __ASSEMBLY__ extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; -#ifdef CONFIG_TRACING -# define trace_early_idt_handler_array early_idt_handler_array -#endif +extern void early_ignore_irq(void); /* * Load a segment. 
Fall back on loading the zero segment if something goes diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index eaec6c364e42..cd71273ec49d 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -11,6 +11,7 @@ * Executability : eXeutable, NoteXecutable * Read/Write : ReadOnly, ReadWrite * Presence : NotPresent + * Encryption : Encrypted, Decrypted * * Within a category, the attributes are mutually exclusive. * @@ -42,6 +43,8 @@ int set_memory_wt(unsigned long addr, int numpages); int set_memory_wb(unsigned long addr, int numpages); int set_memory_np(unsigned long addr, int numpages); int set_memory_4k(unsigned long addr, int numpages); +int set_memory_encrypted(unsigned long addr, int numpages); +int set_memory_decrypted(unsigned long addr, int numpages); int set_memory_array_uc(unsigned long *addr, int addrinarray); int set_memory_array_wc(unsigned long *addr, int addrinarray); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index e4585a393965..a65cf544686a 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -39,6 +39,7 @@ static inline void vsmp_init(void) { } #endif void setup_bios_corruption_check(void); +void early_platform_quirks(void); extern unsigned long saved_video_mode; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e00e1bd6e7b3..5161da1a0fa0 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -98,6 +98,7 @@ struct thread_info { #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ #define TIF_X32 30 /* 32-bit native x86-64 binary */ +#define TIF_FSCHECK 31 /* Check FS is USER_DS on return */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -122,6 +123,7 @@ struct thread_info { #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_ADDR32 (1 << TIF_ADDR32) #define _TIF_X32 (1 << TIF_X32) +#define _TIF_FSCHECK (1 << TIF_FSCHECK) /* * work to do in syscall_trace_enter(). Also includes TIF_NOHZ for @@ -137,7 +139,8 @@ struct thread_info { (_TIF_SYSCALL_TRACE | _TIF_NOTIFY_RESUME | _TIF_SIGPENDING | \ _TIF_NEED_RESCHED | _TIF_SINGLESTEP | _TIF_SYSCALL_EMU | \ _TIF_SYSCALL_AUDIT | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE | \ - _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT) + _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT | \ + _TIF_FSCHECK) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index c7797307fc2b..79a4ca6a9606 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -15,4 +15,18 @@ #include <asm-generic/tlb.h> +/* + * While x86 architecture in general requires an IPI to perform TLB + * shootdown, enablement code for several hypervisors overrides + * .flush_tlb_others hook in pv_mmu_ops and implements it by issuing + * a hypercall. To keep software pagetable walkers safe in this case we + * switch to RCU based table free (HAVE_RCU_TABLE_FREE). See the comment + * below 'ifdef CONFIG_HAVE_RCU_TABLE_FREE' in include/asm-generic/tlb.h + * for more details. 
+ */ +static inline void __tlb_remove_table(void *table) +{ + free_page_and_swap_cache(table); +} + #endif /* _ASM_X86_TLB_H */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 50ea3482e1d1..4893abf7f74f 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -57,6 +57,23 @@ static inline void invpcid_flush_all_nonglobals(void) __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); } +static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) +{ + u64 new_tlb_gen; + + /* + * Bump the generation count. This also serves as a full barrier + * that synchronizes with switch_mm(): callers are required to order + * their read of mm_cpumask after their writes to the paging + * structures. + */ + smp_mb__before_atomic(); + new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen); + smp_mb__after_atomic(); + + return new_tlb_gen; +} + #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else @@ -65,6 +82,17 @@ static inline void invpcid_flush_all_nonglobals(void) #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) #endif +/* + * 6 because 6 should be plenty and struct tlb_state will fit in + * two cache lines. + */ +#define TLB_NR_DYN_ASIDS 6 + +struct tlb_context { + u64 ctx_id; + u64 tlb_gen; +}; + struct tlb_state { /* * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts @@ -73,13 +101,35 @@ struct tlb_state { * mode even if we've already switched back to swapper_pg_dir. */ struct mm_struct *loaded_mm; - int state; + u16 loaded_mm_asid; + u16 next_asid; /* * Access to this CR4 shadow and to H/W CR4 is protected by * disabling interrupts when modifying either one. */ unsigned long cr4; + + /* + * This is a list of all contexts that might exist in the TLB. + * There is one per ASID that we use, and the ASID (what the + * CPU calls PCID) is the index into ctxts. + * + * For each context, ctx_id indicates which mm the TLB's user + * entries came from. As an invariant, the TLB will never + * contain entries that are out-of-date as when that mm reached + * the tlb_gen in the list. + * + * To be clear, this means that it's legal for the TLB code to + * flush the TLB without updating tlb_gen. This can happen + * (for now, at least) due to paravirt remote flushes. + * + * NB: context 0 is a bit special, since it's also used by + * various bits of init code. This is fine -- code that + * isn't aware of PCID will end up harmlessly flushing + * context 0. + */ + struct tlb_context ctxs[TLB_NR_DYN_ASIDS]; }; DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); @@ -148,6 +198,8 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) cr4_set_bits(mask); } +extern void initialize_tlbstate_and_flush(void); + static inline void __native_flush_tlb(void) { /* @@ -207,6 +259,14 @@ static inline void __flush_tlb_all(void) __flush_tlb_global(); else __flush_tlb(); + + /* + * Note: if we somehow had PCID but not PGE, then this wouldn't work -- + * we'd end up flushing kernel translations for the current ASID but + * we might fail to flush kernel translations for other cached ASIDs. + * + * To avoid this issue, we force PCID off if PGE is off. + */ } static inline void __flush_tlb_one(unsigned long addr) @@ -231,9 +291,26 @@ static inline void __flush_tlb_one(unsigned long addr) * and page-granular flushes are available only on i486 and up. */ struct flush_tlb_info { - struct mm_struct *mm; - unsigned long start; - unsigned long end; + /* + * We support several kinds of flushes. + * + * - Fully flush a single mm. 
.mm will be set, .end will be + * TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to + * which the IPI sender is trying to catch us up. + * + * - Partially flush a single mm. .mm will be set, .start and + * .end will indicate the range, and .new_tlb_gen will be set + * such that the changes between generation .new_tlb_gen-1 and + * .new_tlb_gen are entirely contained in the indicated range. + * + * - Fully flush all mms whose tlb_gens have been updated. .mm + * will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen + * will be zero. + */ + struct mm_struct *mm; + unsigned long start; + unsigned long end; + u64 new_tlb_gen; }; #define local_flush_tlb() __flush_tlb() @@ -256,12 +333,10 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) void native_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info); -#define TLBSTATE_OK 1 -#define TLBSTATE_LAZY 2 - static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, struct mm_struct *mm) { + inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); } diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 6358a85e2270..c1d2a9892352 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -75,12 +75,6 @@ static inline const struct cpumask *cpumask_of_node(int node) extern void setup_node_to_cpumask_map(void); -/* - * Returns the number of the node containing Node 'node'. This - * architecture is flat, so it is a pretty simple function! - */ -#define parent_node(node) (node) - #define pcibus_to_node(bus) __pcibus_to_node(bus) extern int __node_distance(int, int); diff --git a/arch/x86/include/asm/trace/common.h b/arch/x86/include/asm/trace/common.h new file mode 100644 index 000000000000..57c8da027d99 --- /dev/null +++ b/arch/x86/include/asm/trace/common.h @@ -0,0 +1,16 @@ +#ifndef _ASM_TRACE_COMMON_H +#define _ASM_TRACE_COMMON_H + +#ifdef CONFIG_TRACING +DECLARE_STATIC_KEY_FALSE(trace_pagefault_key); +#define trace_pagefault_enabled() \ + static_branch_unlikely(&trace_pagefault_key) +DECLARE_STATIC_KEY_FALSE(trace_resched_ipi_key); +#define trace_resched_ipi_enabled() \ + static_branch_unlikely(&trace_resched_ipi_key) +#else +static inline bool trace_pagefault_enabled(void) { return false; } +static inline bool trace_resched_ipi_enabled(void) { return false; } +#endif + +#endif diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h index 2422b14c50a7..5665bf205b8d 100644 --- a/arch/x86/include/asm/trace/exceptions.h +++ b/arch/x86/include/asm/trace/exceptions.h @@ -5,9 +5,10 @@ #define _TRACE_PAGE_FAULT_H #include <linux/tracepoint.h> +#include <asm/trace/common.h> -extern int trace_irq_vector_regfunc(void); -extern void trace_irq_vector_unregfunc(void); +extern int trace_pagefault_reg(void); +extern void trace_pagefault_unreg(void); DECLARE_EVENT_CLASS(x86_exceptions, @@ -37,8 +38,7 @@ DEFINE_EVENT_FN(x86_exceptions, name, \ TP_PROTO(unsigned long address, struct pt_regs *regs, \ unsigned long error_code), \ TP_ARGS(address, regs, error_code), \ - trace_irq_vector_regfunc, \ - trace_irq_vector_unregfunc); + trace_pagefault_reg, trace_pagefault_unreg); DEFINE_PAGE_FAULT_EVENT(page_fault_user); DEFINE_PAGE_FAULT_EVENT(page_fault_kernel); diff --git a/arch/x86/include/asm/trace/hyperv.h b/arch/x86/include/asm/trace/hyperv.h new file mode 100644 index 000000000000..4253bca99989 --- /dev/null +++ b/arch/x86/include/asm/trace/hyperv.h @@ -0,0 
+1,40 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hyperv + +#if !defined(_TRACE_HYPERV_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HYPERV_H + +#include <linux/tracepoint.h> + +#if IS_ENABLED(CONFIG_HYPERV) + +TRACE_EVENT(hyperv_mmu_flush_tlb_others, + TP_PROTO(const struct cpumask *cpus, + const struct flush_tlb_info *info), + TP_ARGS(cpus, info), + TP_STRUCT__entry( + __field(unsigned int, ncpus) + __field(struct mm_struct *, mm) + __field(unsigned long, addr) + __field(unsigned long, end) + ), + TP_fast_assign(__entry->ncpus = cpumask_weight(cpus); + __entry->mm = info->mm; + __entry->addr = info->start; + __entry->end = info->end; + ), + TP_printk("ncpus %d mm %p addr %lx, end %lx", + __entry->ncpus, __entry->mm, + __entry->addr, __entry->end) + ); + +#endif /* CONFIG_HYPERV */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH asm/trace/ +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE hyperv +#endif /* _TRACE_HYPERV_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 32dd6a9e343c..1599d394c8c1 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -5,9 +5,12 @@ #define _TRACE_IRQ_VECTORS_H #include <linux/tracepoint.h> +#include <asm/trace/common.h> -extern int trace_irq_vector_regfunc(void); -extern void trace_irq_vector_unregfunc(void); +#ifdef CONFIG_X86_LOCAL_APIC + +extern int trace_resched_ipi_reg(void); +extern void trace_resched_ipi_unreg(void); DECLARE_EVENT_CLASS(x86_irq_vector, @@ -28,15 +31,22 @@ DECLARE_EVENT_CLASS(x86_irq_vector, #define DEFINE_IRQ_VECTOR_EVENT(name) \ DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \ TP_PROTO(int vector), \ + TP_ARGS(vector), NULL, NULL); \ +DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ + TP_PROTO(int vector), \ + TP_ARGS(vector), NULL, NULL); + +#define DEFINE_RESCHED_IPI_EVENT(name) \ +DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \ + TP_PROTO(int vector), \ TP_ARGS(vector), \ - trace_irq_vector_regfunc, \ - trace_irq_vector_unregfunc); \ + trace_resched_ipi_reg, \ + trace_resched_ipi_unreg); \ DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ TP_PROTO(int vector), \ TP_ARGS(vector), \ - trace_irq_vector_regfunc, \ - trace_irq_vector_unregfunc); - + trace_resched_ipi_reg, \ + trace_resched_ipi_unreg); /* * local_timer - called when entering/exiting a local timer interrupt @@ -45,11 +55,6 @@ DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ DEFINE_IRQ_VECTOR_EVENT(local_timer); /* - * reschedule - called when entering/exiting a reschedule vector handler - */ -DEFINE_IRQ_VECTOR_EVENT(reschedule); - -/* * spurious_apic - called when entering/exiting a spurious apic vector handler */ DEFINE_IRQ_VECTOR_EVENT(spurious_apic); @@ -65,6 +70,7 @@ DEFINE_IRQ_VECTOR_EVENT(error_apic); */ DEFINE_IRQ_VECTOR_EVENT(x86_platform_ipi); +#ifdef CONFIG_IRQ_WORK /* * irq_work - called when entering/exiting a irq work interrupt * vector handler @@ -81,6 +87,18 @@ DEFINE_IRQ_VECTOR_EVENT(irq_work); * 4) goto 1 */ TRACE_EVENT_PERF_PERM(irq_work_exit, is_sampling_event(p_event) ? -EPERM : 0); +#endif + +/* + * The ifdef is required because that tracepoint macro hell emits tracepoint + * code in files which include this header even if the tracepoint is not + * enabled. Brilliant stuff that. 
+ */ +#ifdef CONFIG_SMP +/* + * reschedule - called when entering/exiting a reschedule vector handler + */ +DEFINE_RESCHED_IPI_EVENT(reschedule); /* * call_function - called when entering/exiting a call function interrupt @@ -93,24 +111,33 @@ DEFINE_IRQ_VECTOR_EVENT(call_function); * single interrupt vector handler */ DEFINE_IRQ_VECTOR_EVENT(call_function_single); +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD /* * threshold_apic - called when entering/exiting a threshold apic interrupt * vector handler */ DEFINE_IRQ_VECTOR_EVENT(threshold_apic); +#endif +#ifdef CONFIG_X86_MCE_AMD /* * deferred_error_apic - called when entering/exiting a deferred apic interrupt * vector handler */ DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic); +#endif +#ifdef CONFIG_X86_THERMAL_VECTOR /* * thermal_apic - called when entering/exiting a thermal apic interrupt * vector handler */ DEFINE_IRQ_VECTOR_EVENT(thermal_apic); +#endif + +#endif /* CONFIG_X86_LOCAL_APIC */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 01fd0a7f48cd..5545f6459bf5 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -13,9 +13,6 @@ asmlinkage void divide_error(void); asmlinkage void debug(void); asmlinkage void nmi(void); asmlinkage void int3(void); -asmlinkage void xen_debug(void); -asmlinkage void xen_int3(void); -asmlinkage void xen_stack_segment(void); asmlinkage void overflow(void); asmlinkage void bounds(void); asmlinkage void invalid_op(void); @@ -38,22 +35,29 @@ asmlinkage void machine_check(void); #endif /* CONFIG_X86_MCE */ asmlinkage void simd_coprocessor_error(void); -#ifdef CONFIG_TRACING -asmlinkage void trace_page_fault(void); -#define trace_stack_segment stack_segment -#define trace_divide_error divide_error -#define trace_bounds bounds -#define trace_invalid_op invalid_op -#define trace_device_not_available device_not_available -#define trace_coprocessor_segment_overrun coprocessor_segment_overrun -#define trace_invalid_TSS invalid_TSS -#define trace_segment_not_present segment_not_present -#define trace_general_protection general_protection -#define trace_spurious_interrupt_bug spurious_interrupt_bug -#define trace_coprocessor_error coprocessor_error -#define trace_alignment_check alignment_check -#define trace_simd_coprocessor_error simd_coprocessor_error -#define trace_async_page_fault async_page_fault +#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) +asmlinkage void xen_divide_error(void); +asmlinkage void xen_xendebug(void); +asmlinkage void xen_xenint3(void); +asmlinkage void xen_nmi(void); +asmlinkage void xen_overflow(void); +asmlinkage void xen_bounds(void); +asmlinkage void xen_invalid_op(void); +asmlinkage void xen_device_not_available(void); +asmlinkage void xen_double_fault(void); +asmlinkage void xen_coprocessor_segment_overrun(void); +asmlinkage void xen_invalid_TSS(void); +asmlinkage void xen_segment_not_present(void); +asmlinkage void xen_stack_segment(void); +asmlinkage void xen_general_protection(void); +asmlinkage void xen_page_fault(void); +asmlinkage void xen_spurious_interrupt_bug(void); +asmlinkage void xen_coprocessor_error(void); +asmlinkage void xen_alignment_check(void); +#ifdef CONFIG_X86_MCE +asmlinkage void xen_machine_check(void); +#endif /* CONFIG_X86_MCE */ +asmlinkage void xen_simd_coprocessor_error(void); #endif dotraplinkage void do_divide_error(struct pt_regs *, long); @@ -74,14 +78,6 @@ asmlinkage struct pt_regs *sync_regs(struct pt_regs *); #endif dotraplinkage void 
do_general_protection(struct pt_regs *, long); dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); -#ifdef CONFIG_TRACING -dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long); -#else -static inline void trace_do_page_fault(struct pt_regs *regs, unsigned long error) -{ - do_page_fault(regs, error); -} -#endif dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long); dotraplinkage void do_coprocessor_error(struct pt_regs *, long); dotraplinkage void do_alignment_check(struct pt_regs *, long); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 30269dafec47..184eb9894dae 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -26,7 +26,12 @@ #define get_ds() (KERNEL_DS) #define get_fs() (current->thread.addr_limit) -#define set_fs(x) (current->thread.addr_limit = (x)) +static inline void set_fs(mm_segment_t fs) +{ + current->thread.addr_limit = fs; + /* On user-mode return, check fs is correct */ + set_thread_flag(TIF_FSCHECK); +} #define segment_eq(a, b) ((a).seg == (b).seg) diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index e6676495b125..e9f793e2df7a 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -12,11 +12,14 @@ struct unwind_state { struct task_struct *task; int graph_idx; bool error; -#ifdef CONFIG_FRAME_POINTER +#if defined(CONFIG_ORC_UNWINDER) + bool signal, full_regs; + unsigned long sp, bp, ip; + struct pt_regs *regs; +#elif defined(CONFIG_FRAME_POINTER_UNWINDER) bool got_irq; - unsigned long *bp, *orig_sp; + unsigned long *bp, *orig_sp, ip; struct pt_regs *regs; - unsigned long ip; #else unsigned long *sp; #endif @@ -24,41 +27,30 @@ struct unwind_state { void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame); - bool unwind_next_frame(struct unwind_state *state); - unsigned long unwind_get_return_address(struct unwind_state *state); +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state); static inline bool unwind_done(struct unwind_state *state) { return state->stack_info.type == STACK_TYPE_UNKNOWN; } -static inline -void unwind_start(struct unwind_state *state, struct task_struct *task, - struct pt_regs *regs, unsigned long *first_frame) -{ - first_frame = first_frame ? : get_stack_pointer(task, regs); - - __unwind_start(state, task, regs, first_frame); -} - static inline bool unwind_error(struct unwind_state *state) { return state->error; } -#ifdef CONFIG_FRAME_POINTER - static inline -unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +void unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) { - if (unwind_done(state)) - return NULL; + first_frame = first_frame ? : get_stack_pointer(task, regs); - return state->regs ? 
&state->regs->ip : state->bp + 1; + __unwind_start(state, task, regs, first_frame); } +#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER) static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { if (unwind_done(state)) @@ -66,20 +58,46 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) return state->regs; } - -#else /* !CONFIG_FRAME_POINTER */ - -static inline -unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +#else +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { return NULL; } +#endif -static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +#ifdef CONFIG_ORC_UNWINDER +void unwind_init(void); +void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size); +#else +static inline void unwind_init(void) {} +static inline +void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size) {} +#endif + +/* + * This disables KASAN checking when reading a value from another task's stack, + * since the other task could be running on another CPU and could have poisoned + * the stack in the meantime. + */ +#define READ_ONCE_TASK_STACK(task, x) \ +({ \ + unsigned long val; \ + if (task == current) \ + val = READ_ONCE(x); \ + else \ + val = READ_ONCE_NOCHECK(x); \ + val; \ +}) + +static inline bool task_on_another_cpu(struct task_struct *task) { - return NULL; +#ifdef CONFIG_SMP + return task != current && task->on_cpu; +#else + return false; +#endif } -#endif /* CONFIG_FRAME_POINTER */ - #endif /* _ASM_X86_UNWIND_H */ diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h new file mode 100644 index 000000000000..bae46fc6b9de --- /dev/null +++ b/arch/x86/include/asm/unwind_hints.h @@ -0,0 +1,105 @@ +#ifndef _ASM_X86_UNWIND_HINTS_H +#define _ASM_X86_UNWIND_HINTS_H + +#include "orc_types.h" + +#ifdef __ASSEMBLY__ + +/* + * In asm, there are two kinds of code: normal C-type callable functions and + * the rest. The normal callable functions can be called by other code, and + * don't do anything unusual with the stack. Such normal callable functions + * are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this + * category. In this case, no special debugging annotations are needed because + * objtool can automatically generate the ORC data for the ORC unwinder to read + * at runtime. + * + * Anything which doesn't fall into the above category, such as syscall and + * interrupt handlers, tends to not be called directly by other functions, and + * often does unusual non-C-function-type things with the stack pointer. Such + * code needs to be annotated such that objtool can understand it. The + * following CFI hint macros are for this type of code. + * + * These macros provide hints to objtool about the state of the stack at each + * instruction. Objtool starts from the hints and follows the code flow, + * making automatic CFI adjustments when it sees pushes and pops, filling out + * the debuginfo as necessary. It will also warn if it sees any + * inconsistencies. + */ +.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL +#ifdef CONFIG_STACK_VALIDATION +.Lunwind_hint_ip_\@: + .pushsection .discard.unwind_hints + /* struct unwind_hint */ + .long .Lunwind_hint_ip_\@ - . 
+ .short \sp_offset + .byte \sp_reg + .byte \type + .popsection +#endif +.endm + +.macro UNWIND_HINT_EMPTY + UNWIND_HINT sp_reg=ORC_REG_UNDEFINED +.endm + +.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0 + .if \base == %rsp + .if \indirect + .set sp_reg, ORC_REG_SP_INDIRECT + .else + .set sp_reg, ORC_REG_SP + .endif + .elseif \base == %rbp + .set sp_reg, ORC_REG_BP + .elseif \base == %rdi + .set sp_reg, ORC_REG_DI + .elseif \base == %rdx + .set sp_reg, ORC_REG_DX + .elseif \base == %r10 + .set sp_reg, ORC_REG_R10 + .else + .error "UNWIND_HINT_REGS: bad base register" + .endif + + .set sp_offset, \offset + + .if \iret + .set type, ORC_TYPE_REGS_IRET + .elseif \extra == 0 + .set type, ORC_TYPE_REGS_IRET + .set sp_offset, \offset + (16*8) + .else + .set type, ORC_TYPE_REGS + .endif + + UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type +.endm + +.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0 + UNWIND_HINT_REGS base=\base offset=\offset iret=1 +.endm + +.macro UNWIND_HINT_FUNC sp_offset=8 + UNWIND_HINT sp_offset=\sp_offset +.endm + +#else /* !__ASSEMBLY__ */ + +#define UNWIND_HINT(sp_reg, sp_offset, type) \ + "987: \n\t" \ + ".pushsection .discard.unwind_hints\n\t" \ + /* struct unwind_hint */ \ + ".long 987b - .\n\t" \ + ".short " __stringify(sp_offset) "\n\t" \ + ".byte " __stringify(sp_reg) "\n\t" \ + ".byte " __stringify(type) "\n\t" \ + ".popsection\n\t" + +#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE) + +#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE) + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_UNWIND_HINTS_H */ diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h index c4b9dc2f67c5..9f42beefc67a 100644 --- a/arch/x86/include/asm/vga.h +++ b/arch/x86/include/asm/vga.h @@ -7,12 +7,24 @@ #ifndef _ASM_X86_VGA_H #define _ASM_X86_VGA_H +#include <asm/set_memory.h> + /* * On the PC, we can just recalculate addresses and then * access the videoram directly without any black magic. + * To support memory encryption however, we need to access + * the videoram as decrypted memory. */ -#define VGA_MAP_MEM(x, s) (unsigned long)phys_to_virt(x) +#define VGA_MAP_MEM(x, s) \ +({ \ + unsigned long start = (unsigned long)phys_to_virt(x); \ + \ + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) \ + set_memory_decrypted(start, (s) >> PAGE_SHIFT); \ + \ + start; \ +}) #define vga_readb(x) (*(x)) #define vga_writeb(x, y) (*(y) = (x)) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 11071fcd630e..9606688caa4b 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -552,6 +552,8 @@ static inline void MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, struct desc_struct desc) { + u32 *p = (u32 *) &desc; + mcl->op = __HYPERVISOR_update_descriptor; if (sizeof(maddr) == sizeof(long)) { mcl->args[0] = maddr; @@ -559,8 +561,8 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, } else { mcl->args[0] = maddr; mcl->args[1] = maddr >> 32; - mcl->args[2] = desc.a; - mcl->args[3] = desc.b; + mcl->args[2] = *p++; + mcl->args[3] = *p; } trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 
2 : 4); diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 8417ef7c3885..07b6531813c4 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -158,9 +158,6 @@ static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn) unsigned long pfn; int ret; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return mfn; - if (unlikely(mfn >= machine_to_phys_nr)) return ~0; @@ -317,8 +314,6 @@ static inline pte_t __pte_ma(pteval_t x) #define p4d_val_ma(x) ((x).p4d) #endif -void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid); - xmaddr_t arbitrary_virt_to_machine(void *address); unsigned long arbitrary_virt_to_mfn(void *vaddr); void make_lowmem_page_readonly(void *vaddr); |