Diffstat (limited to 'arch/x86/include')
53 files changed, 1055 insertions, 633 deletions
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 65064d9f7fa6..8eb74cf386db 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -14,6 +14,7 @@ #include <asm/mmu.h> #include <asm/mpspec.h> #include <asm/x86_init.h> +#include <asm/cpufeature.h> #ifdef CONFIG_ACPI_APEI # include <asm/pgtable_types.h> @@ -63,6 +64,13 @@ extern int (*acpi_suspend_lowlevel)(void); /* Physical address to resume after wakeup */ unsigned long acpi_get_wakeup_address(void); +static inline bool acpi_skip_set_wakeup_address(void) +{ + return cpu_feature_enabled(X86_FEATURE_XENPV); +} + +#define acpi_skip_set_wakeup_address acpi_skip_set_wakeup_address + /* * Check if the CPU can handle C2 and deeper */ diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 9542c582d546..7659217f4d49 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -78,8 +78,43 @@ extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); extern void apply_retpolines(s32 *start, s32 *end); extern void apply_returns(s32 *start, s32 *end); extern void apply_ibt_endbr(s32 *start, s32 *end); +extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine, + s32 *start_cfi, s32 *end_cfi); struct module; +struct paravirt_patch_site; + +struct callthunk_sites { + s32 *call_start, *call_end; + struct paravirt_patch_site *pv_start, *pv_end; +}; + +#ifdef CONFIG_CALL_THUNKS +extern void callthunks_patch_builtin_calls(void); +extern void callthunks_patch_module_calls(struct callthunk_sites *sites, + struct module *mod); +extern void *callthunks_translate_call_dest(void *dest); +extern bool is_callthunk(void *addr); +extern int x86_call_depth_emit_accounting(u8 **pprog, void *func); +#else +static __always_inline void callthunks_patch_builtin_calls(void) {} +static __always_inline void +callthunks_patch_module_calls(struct callthunk_sites *sites, + struct module *mod) {} +static __always_inline void *callthunks_translate_call_dest(void *dest) +{ + return dest; +} +static __always_inline bool is_callthunk(void *addr) +{ + return false; +} +static __always_inline int x86_call_depth_emit_accounting(u8 **pprog, + void *func) +{ + return 0; +} +#endif #ifdef CONFIG_SMP extern void alternatives_smp_module_add(struct module *mod, char *name, @@ -347,6 +382,7 @@ static inline int alternatives_text_reserved(void *start, void *end) #define old_len 141b-140b #define new_len1 144f-143f #define new_len2 145f-144f +#define new_len3 146f-145f /* * gas compatible max based on the idea from: @@ -354,7 +390,8 @@ static inline int alternatives_text_reserved(void *start, void *end) * * The additional "-" is needed because gas uses a "true" value of -1. 
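As an aside on the branchless max idiom used by alt_max_short()/alt_max_2() in the hunk just below, here is a minimal standalone C sketch (illustrative only, not part of the patch; demo_alt_max_2() is a hypothetical name). In C the comparison yields 0 or 1, so a single negation builds the select mask; gas treats "true" as -1, which is why the assembler version needs the extra leading "-".

/* Select the larger of a and b without a branch: the mask is all-ones when
 * a < b, so the XOR picks b; otherwise the mask is zero and a is kept.
 */
static inline unsigned long demo_alt_max_2(unsigned long a, unsigned long b)
{
        unsigned long mask = -(unsigned long)(a < b);

        return a ^ ((a ^ b) & mask);
}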
*/ -#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) +#define alt_max_2(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) +#define alt_max_3(a, b, c) (alt_max_2(alt_max_2(a, b), c)) /* @@ -366,13 +403,36 @@ static inline int alternatives_text_reserved(void *start, void *end) 140: \oldinstr 141: - .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \ - (alt_max_short(new_len1, new_len2) - (old_len)),0x90 + .skip -((alt_max_2(new_len1, new_len2) - (old_len)) > 0) * \ + (alt_max_2(new_len1, new_len2) - (old_len)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f + altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr1 +144: + \newinstr2 +145: + .popsection +.endm + +.macro ALTERNATIVE_3 oldinstr, newinstr1, feature1, newinstr2, feature2, newinstr3, feature3 +140: + \oldinstr +141: + .skip -((alt_max_3(new_len1, new_len2, new_len3) - (old_len)) > 0) * \ + (alt_max_3(new_len1, new_len2, new_len3) - (old_len)),0x90 142: .pushsection .altinstructions,"a" altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f + altinstruction_entry 140b,145f,\feature3,142b-140b,146f-145f .popsection .pushsection .altinstr_replacement,"ax" @@ -381,6 +441,8 @@ static inline int alternatives_text_reserved(void *start, void *end) 144: \newinstr2 145: + \newinstr3 +146: .popsection .endm diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h index 86b2e0dcc4bf..ce9685fc78d8 100644 --- a/arch/x86/include/asm/cacheinfo.h +++ b/arch/x86/include/asm/cacheinfo.h @@ -2,7 +2,20 @@ #ifndef _ASM_X86_CACHEINFO_H #define _ASM_X86_CACHEINFO_H +/* Kernel controls MTRR and/or PAT MSRs. */ +extern unsigned int memory_caching_control; +#define CACHE_MTRR 0x01 +#define CACHE_PAT 0x02 + void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu); void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu); +void cache_disable(void); +void cache_enable(void); +void set_cache_aps_delayed_init(bool val); +bool get_cache_aps_delayed_init(void); +void cache_bp_init(void); +void cache_bp_restore(void); +void cache_aps_init(void); + #endif /* _ASM_X86_CACHEINFO_H */ diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 215f5a65790f..6ba80ce9438d 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -7,34 +7,6 @@ * you need to test for the feature in boot_cpu_data. */ -/* - * CMPXCHG8B only writes to the target if we had the previous - * value in registers, otherwise it acts as a read and gives us the - * "new previous" value. That is why there is a loop. Preloading - * EDX:EAX is a performance optimization: in the common case it means - * we need only one locked operation. - * - * A SIMD/3DNOW!/MMX/FPU 64-bit store here would require at the very - * least an FPU save and/or %cr0.ts manipulation. - * - * cmpxchg8b must be used with the lock prefix here to allow the - * instruction to be executed atomically. We need to have the reader - * side to see the coherent 64bit value. 
- */ -static inline void set_64bit(volatile u64 *ptr, u64 value) -{ - u32 low = value; - u32 high = value >> 32; - u64 prev = *ptr; - - asm volatile("\n1:\t" - LOCK_PREFIX "cmpxchg8b %0\n\t" - "jnz 1b" - : "=m" (*ptr), "+A" (prev) - : "b" (low), "c" (high) - : "memory"); -} - #ifdef CONFIG_X86_CMPXCHG64 #define arch_cmpxchg64(ptr, o, n) \ ((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \ diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 250187ac8248..0d3beb27b7fe 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -2,11 +2,6 @@ #ifndef _ASM_X86_CMPXCHG_64_H #define _ASM_X86_CMPXCHG_64_H -static inline void set_64bit(volatile u64 *ptr, u64 val) -{ - *ptr = val; -} - #define arch_cmpxchg64(ptr, o, n) \ ({ \ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index b472ef76826a..78796b98a544 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -95,5 +95,7 @@ static inline bool intel_cpu_signatures_match(unsigned int s1, unsigned int p1, } extern u64 x86_read_arch_cap_msr(void); +int intel_find_matching_signature(void *mc, unsigned int csig, int cpf); +int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type); #endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 75efc4c6f076..462fc34f1317 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -130,10 +130,6 @@ struct cpu_entry_area { }; #define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) -#define CPU_ENTRY_AREA_ARRAY_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) - -/* Total size includes the readonly IDT mapping page as well: */ -#define CPU_ENTRY_AREA_TOTAL_SIZE (CPU_ENTRY_AREA_ARRAY_SIZE + PAGE_SIZE) DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks); diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index c9f4730bb113..61012476d66e 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -305,13 +305,15 @@ #define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */ #define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */ #define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* "" SGX EDECCSSA user leaf function */ - - +#define X86_FEATURE_CALL_DEPTH (11*32+19) /* "" Call depth tracking for RSB stuffing */ #define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ +#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* "" CMPccXADD instructions */ +#define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ +#define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h index 70b2db18165e..9bee3e7bf973 100644 --- a/arch/x86/include/asm/cpuid.h +++ b/arch/x86/include/asm/cpuid.h @@ -1,13 +1,132 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * CPUID-related helpers/definitions - * - * 
Derived from arch/x86/kvm/cpuid.c */ #ifndef _ASM_X86_CPUID_H #define _ASM_X86_CPUID_H +#include <asm/string.h> + +struct cpuid_regs { + u32 eax, ebx, ecx, edx; +}; + +enum cpuid_regs_idx { + CPUID_EAX = 0, + CPUID_EBX, + CPUID_ECX, + CPUID_EDX, +}; + +#ifdef CONFIG_X86_32 +extern int have_cpuid_p(void); +#else +static inline int have_cpuid_p(void) +{ + return 1; +} +#endif +static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. */ + asm volatile("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx) + : "memory"); +} + +#define native_cpuid_reg(reg) \ +static inline unsigned int native_cpuid_##reg(unsigned int op) \ +{ \ + unsigned int eax = op, ebx, ecx = 0, edx; \ + \ + native_cpuid(&eax, &ebx, &ecx, &edx); \ + \ + return reg; \ +} + +/* + * Native CPUID functions returning a single datum. + */ +native_cpuid_reg(eax) +native_cpuid_reg(ebx) +native_cpuid_reg(ecx) +native_cpuid_reg(edx) + +#ifdef CONFIG_PARAVIRT_XXL +#include <asm/paravirt.h> +#else +#define __cpuid native_cpuid +#endif + +/* + * Generic CPUID function + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx + * resulting in stale register contents being returned. + */ +static inline void cpuid(unsigned int op, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = 0; + __cpuid(eax, ebx, ecx, edx); +} + +/* Some CPUID calls want 'count' to be placed in ecx */ +static inline void cpuid_count(unsigned int op, int count, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = count; + __cpuid(eax, ebx, ecx, edx); +} + +/* + * CPUID functions returning a single datum + */ +static inline unsigned int cpuid_eax(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return eax; +} + +static inline unsigned int cpuid_ebx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return ebx; +} + +static inline unsigned int cpuid_ecx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return ecx; +} + +static inline unsigned int cpuid_edx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return edx; +} + static __always_inline bool cpuid_function_is_indexed(u32 function) { switch (function) { @@ -31,4 +150,22 @@ static __always_inline bool cpuid_function_is_indexed(u32 function) return false; } +#define for_each_possible_hypervisor_cpuid_base(function) \ + for (function = 0x40000000; function < 0x40010000; function += 0x100) + +static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) +{ + uint32_t base, eax, signature[3]; + + for_each_possible_hypervisor_cpuid_base(base) { + cpuid(base, &eax, &signature[0], &signature[1], &signature[2]); + + if (!memcmp(sig, signature, 12) && + (leaves == 0 || ((eax - base) >= leaves))) + return base; + } + + return 0; +} + #endif /* _ASM_X86_CPUID_H */ diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index 3e204e6140b5..a1168e7b69e5 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -3,16 +3,42 @@ #define _ASM_X86_CURRENT_H #include <linux/compiler.h> -#include <asm/percpu.h> #ifndef __ASSEMBLY__ + +#include <linux/cache.h> +#include <asm/percpu.h> + struct task_struct; 
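Stepping back to the <asm/cpuid.h> helpers consolidated above, a short usage sketch (editorial; the demo_* functions are hypothetical, the leaf numbers are only examples, and the signature string is the one KVM guests advertise):

/* Read the maximum standard leaf, then query leaf 7 sub-leaf 0 if present. */
static void demo_cpuid_usage(void)
{
        unsigned int eax, ebx, ecx, edx;
        unsigned int max_leaf = cpuid_eax(0);

        if (max_leaf >= 7)
                cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
}

/* Detect a KVM guest via the new hypervisor_cpuid_base() helper. */
static bool demo_running_on_kvm(void)
{
        return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0) != 0;
}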
-DECLARE_PER_CPU(struct task_struct *, current_task); +struct pcpu_hot { + union { + struct { + struct task_struct *current_task; + int preempt_count; + int cpu_number; +#ifdef CONFIG_CALL_DEPTH_TRACKING + u64 call_depth; +#endif + unsigned long top_of_stack; + void *hardirq_stack_ptr; + u16 softirq_pending; +#ifdef CONFIG_X86_64 + bool hardirq_stack_inuse; +#else + void *softirq_stack_ptr; +#endif + }; + u8 pad[64]; + }; +}; +static_assert(sizeof(struct pcpu_hot) == 64); + +DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot); static __always_inline struct task_struct *get_current(void) { - return this_cpu_read_stable(current_task); + return this_cpu_read_stable(pcpu_hot.current_task); } #define current get_current() diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index cfdf307ddc01..ca97442e8d49 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -2,8 +2,8 @@ #ifndef _ASM_X86_DEBUGREG_H #define _ASM_X86_DEBUGREG_H - #include <linux/bug.h> +#include <linux/percpu.h> #include <uapi/asm/debugreg.h> DECLARE_PER_CPU(unsigned long, cpu_dr7); @@ -39,7 +39,20 @@ static __always_inline unsigned long native_get_debugreg(int regno) asm("mov %%db6, %0" :"=r" (val)); break; case 7: - asm("mov %%db7, %0" :"=r" (val)); + /* + * Apply __FORCE_ORDER to DR7 reads to forbid re-ordering them + * with other code. + * + * This is needed because a DR7 access can cause a #VC exception + * when running under SEV-ES. Taking a #VC exception is not a + * safe thing to do just anywhere in the entry code and + * re-ordering might place the access into an unsafe location. + * + * This happened in the NMI handler, where the DR7 read was + * re-ordered to happen before the call to sev_es_ist_enter(), + * causing stack recursion. + */ + asm volatile("mov %%db7, %0" : "=r" (val) : __FORCE_ORDER); break; default: BUG(); @@ -66,7 +79,16 @@ static __always_inline void native_set_debugreg(int regno, unsigned long value) asm("mov %0, %%db6" ::"r" (value)); break; case 7: - asm("mov %0, %%db7" ::"r" (value)); + /* + * Apply __FORCE_ORDER to DR7 writes to forbid re-ordering them + * with other code. + * + * While is didn't happen with a DR7 write (see the DR7 read + * comment above which explains where it happened), add the + * __FORCE_ORDER here too to avoid similar problems in the + * future. 
+ */ + asm volatile("mov %0, %%db7" ::"r" (value), __FORCE_ORDER); break; default: BUG(); diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 33d2cd04d254..c44b56f7ffba 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -69,6 +69,12 @@ # define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31)) #endif +#ifdef CONFIG_CALL_DEPTH_TRACKING +# define DISABLE_CALL_DEPTH_TRACKING 0 +#else +# define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31)) +#endif + #ifdef CONFIG_INTEL_IOMMU_SVM # define DISABLE_ENQCMD 0 #else @@ -81,6 +87,12 @@ # define DISABLE_SGX (1 << (X86_FEATURE_SGX & 31)) #endif +#ifdef CONFIG_XEN_PV +# define DISABLE_XENPV 0 +#else +# define DISABLE_XENPV (1 << (X86_FEATURE_XENPV & 31)) +#endif + #ifdef CONFIG_INTEL_TDX_GUEST # define DISABLE_TDX_GUEST 0 #else @@ -98,10 +110,11 @@ #define DISABLED_MASK5 0 #define DISABLED_MASK6 0 #define DISABLED_MASK7 (DISABLE_PTI) -#define DISABLED_MASK8 (DISABLE_TDX_GUEST) +#define DISABLED_MASK8 (DISABLE_XENPV|DISABLE_TDX_GUEST) #define DISABLED_MASK9 (DISABLE_SGX) #define DISABLED_MASK10 0 -#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET) +#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ + DISABLE_CALL_DEPTH_TRACKING) #define DISABLED_MASK12 0 #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 233ae6986d6f..a63154e049d7 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -178,7 +178,7 @@ struct efi_setup_data { extern u64 efi_setup; #ifdef CONFIG_EFI -extern efi_status_t __efi64_thunk(u32, ...); +extern u64 __efi64_thunk(u32, ...); #define efi64_thunk(...) ({ \ u64 __pad[3]; /* must have space for 3 args on the stack */ \ @@ -228,16 +228,15 @@ static inline bool efi_is_native(void) return efi_is_64bit(); } -#define efi_mixed_mode_cast(attr) \ - __builtin_choose_expr( \ - __builtin_types_compatible_p(u32, __typeof__(attr)), \ - (unsigned long)(attr), (attr)) - #define efi_table_attr(inst, attr) \ - (efi_is_native() \ - ? inst->attr \ - : (__typeof__(inst->attr)) \ - efi_mixed_mode_cast(inst->mixed_mode.attr)) + (efi_is_native() ? (inst)->attr \ + : efi_mixed_table_attr((inst), attr)) + +#define efi_mixed_table_attr(inst, attr) \ + (__typeof__(inst->attr)) \ + _Generic(inst->mixed_mode.attr, \ + u32: (unsigned long)(inst->mixed_mode.attr), \ + default: (inst->mixed_mode.attr)) /* * The following macros allow translating arguments if necessary from native to @@ -325,6 +324,17 @@ static inline u32 efi64_convert_status(efi_status_t status) #define __efi64_argmap_set_memory_space_attributes(phys, size, flags) \ (__efi64_split(phys), __efi64_split(size), __efi64_split(flags)) +/* file protocol */ +#define __efi64_argmap_open(prot, newh, fname, mode, attr) \ + ((prot), efi64_zero_upper(newh), (fname), __efi64_split(mode), \ + __efi64_split(attr)) + +#define __efi64_argmap_set_position(pos) (__efi64_split(pos)) + +/* file system protocol */ +#define __efi64_argmap_open_volume(prot, file) \ + ((prot), efi64_zero_upper(file)) + /* * The macros below handle the plumbing for the argument mapping. To add a * mapping for a specific EFI method, simply define a macro @@ -344,31 +354,27 @@ static inline u32 efi64_convert_status(efi_status_t status) #define __efi_eat(...) #define __efi_eval(...) 
__VA_ARGS__ -/* The three macros below handle dispatching via the thunk if needed */ - -#define efi_call_proto(inst, func, ...) \ - (efi_is_native() \ - ? inst->func(inst, ##__VA_ARGS__) \ - : __efi64_thunk_map(inst, func, inst, ##__VA_ARGS__)) +static inline efi_status_t __efi64_widen_efi_status(u64 status) +{ + /* use rotate to move the value of bit #31 into position #63 */ + return ror64(rol32(status, 1), 1); +} -#define efi_bs_call(func, ...) \ - (efi_is_native() \ - ? efi_system_table->boottime->func(__VA_ARGS__) \ - : __efi64_thunk_map(efi_table_attr(efi_system_table, \ - boottime), \ - func, __VA_ARGS__)) +/* The macro below handles dispatching via the thunk if needed */ -#define efi_rt_call(func, ...) \ - (efi_is_native() \ - ? efi_system_table->runtime->func(__VA_ARGS__) \ - : __efi64_thunk_map(efi_table_attr(efi_system_table, \ - runtime), \ - func, __VA_ARGS__)) +#define efi_fn_call(inst, func, ...) \ + (efi_is_native() ? (inst)->func(__VA_ARGS__) \ + : efi_mixed_call((inst), func, ##__VA_ARGS__)) -#define efi_dxe_call(func, ...) \ - (efi_is_native() \ - ? efi_dxe_table->func(__VA_ARGS__) \ - : __efi64_thunk_map(efi_dxe_table, func, __VA_ARGS__)) +#define efi_mixed_call(inst, func, ...) \ + _Generic(inst->func(__VA_ARGS__), \ + efi_status_t: \ + __efi64_widen_efi_status( \ + __efi64_thunk_map(inst, func, ##__VA_ARGS__)), \ + u64: ({ BUILD_BUG(); ULONG_MAX; }), \ + default: \ + (__typeof__(inst->func(__VA_ARGS__))) \ + __efi64_thunk_map(inst, func, ##__VA_ARGS__)) #else /* CONFIG_EFI_MIXED */ @@ -400,13 +406,52 @@ static inline void efi_reserve_boot_services(void) #ifdef CONFIG_EFI_FAKE_MEMMAP extern void __init efi_fake_memmap_early(void); +extern void __init efi_fake_memmap(void); #else static inline void efi_fake_memmap_early(void) { } + +static inline void efi_fake_memmap(void) +{ +} #endif +extern int __init efi_memmap_alloc(unsigned int num_entries, + struct efi_memory_map_data *data); +extern void __efi_memmap_free(u64 phys, unsigned long size, + unsigned long flags); +#define __efi_memmap_free __efi_memmap_free + +extern int __init efi_memmap_install(struct efi_memory_map_data *data); +extern int __init efi_memmap_split_count(efi_memory_desc_t *md, + struct range *range); +extern void __init efi_memmap_insert(struct efi_memory_map *old_memmap, + void *buf, struct efi_mem_range *mem); + #define arch_ima_efi_boot_mode \ ({ extern struct boot_params boot_params; boot_params.secure_boot; }) +#ifdef CONFIG_EFI_RUNTIME_MAP +int efi_get_runtime_map_size(void); +int efi_get_runtime_map_desc_size(void); +int efi_runtime_map_copy(void *buf, size_t bufsz); +#else +static inline int efi_get_runtime_map_size(void) +{ + return 0; +} + +static inline int efi_get_runtime_map_desc_size(void) +{ + return 0; +} + +static inline int efi_runtime_map_copy(void *buf, size_t bufsz) +{ + return 0; +} + +#endif + #endif /* _ASM_X86_EFI_H */ diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 674ed46d3ced..117903881fe4 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -24,8 +24,8 @@ static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) /* * For !SMAP hardware we patch out CLAC on entry. 
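Returning to __efi64_widen_efi_status() earlier in this hunk, a standalone worked example of the rotate pair that moves the 32-bit error flag (bit 31) up to bit 63 (editorial only; the open-coded rotates stand in for the kernel's rol32()/ror64() helpers and demo_widen_status() is a hypothetical name):

#include <stdint.h>

static inline uint64_t demo_widen_status(uint32_t status32)
{
        uint32_t rolled = (status32 << 1) | (status32 >> 31);      /* rol32(x, 1) */

        return ((uint64_t)rolled >> 1) | ((uint64_t)rolled << 63); /* ror64(x, 1) */
}

/*
 * demo_widen_status(0x00000000) == 0x0000000000000000 (EFI_SUCCESS stays 0)
 * demo_widen_status(0x8000000e) == 0x800000000000000e (32-bit EFI_NOT_FOUND
 *                                                      becomes the 64-bit form)
 */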
*/ - if (boot_cpu_has(X86_FEATURE_SMAP) || - (IS_ENABLED(CONFIG_64BIT) && boot_cpu_has(X86_FEATURE_XENPV))) + if (cpu_feature_enabled(X86_FEATURE_SMAP) || + cpu_feature_enabled(X86_FEATURE_XENPV)) mask |= X86_EFLAGS_AC; WARN_ON_ONCE(flags & mask); diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 275e7fd20310..66837b8c67f1 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -3,9 +3,9 @@ #define _ASM_X86_HARDIRQ_H #include <linux/threads.h> +#include <asm/current.h> typedef struct { - u16 __softirq_pending; #if IS_ENABLED(CONFIG_KVM_INTEL) u8 kvm_cpu_l1tf_flush_l1d; #endif @@ -60,6 +60,7 @@ extern u64 arch_irq_stat_cpu(unsigned int cpu); extern u64 arch_irq_stat(void); #define arch_irq_stat arch_irq_stat +#define local_softirq_pending_ref pcpu_hot.softirq_pending #if IS_ENABLED(CONFIG_KVM_INTEL) static inline void kvm_set_cpu_l1tf_flush_l1d(void) diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 6d9368ea3701..08e822bd7aa6 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -61,6 +61,8 @@ #define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE BIT(10) /* Support for debug MSRs available */ #define HV_FEATURE_DEBUG_MSRS_AVAILABLE BIT(11) +/* Support for extended gva ranges for flush hypercalls available */ +#define HV_FEATURE_EXT_GVA_RANGES_FLUSH BIT(14) /* * Support for returning hypercall output block via XMM * registers is available @@ -607,6 +609,41 @@ struct hv_enlightened_vmcs { #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF +/* + * Note, Hyper-V isn't actually stealing bit 28 from Intel, just abusing it by + * pairing it with architecturally impossible exit reasons. Bit 28 is set only + * on SMI exits to a SMI transfer monitor (STM) and if and only if a MTF VM-Exit + * is pending. I.e. it will never be set by hardware for non-SMI exits (there + * are only three), nor will it ever be set unless the VMM is an STM. + */ +#define HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH 0x10000031 + +/* + * Hyper-V uses the software reserved 32 bytes in VMCB control area to expose + * SVM enlightenments to guests. + */ +struct hv_vmcb_enlightenments { + struct __packed hv_enlightenments_control { + u32 nested_flush_hypercall:1; + u32 msr_bitmap:1; + u32 enlightened_npt_tlb: 1; + u32 reserved:29; + } __packed hv_enlightenments_control; + u32 hv_vp_id; + u64 hv_vm_id; + u64 partition_assist_page; + u64 reserved; +} __packed; + +/* + * Hyper-V uses the software reserved clean bit in VMCB. 
+ */ +#define HV_VMCB_NESTED_ENLIGHTENMENTS 31 + +/* Synthetic VM-Exit */ +#define HV_SVM_EXITCODE_ENL 0xf0000000 +#define HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH (1) + struct hv_partition_assist_pg { u32 tlb_lock_count; }; diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h index f07faa61c7f3..54368a43abf6 100644 --- a/arch/x86/include/asm/insn-eval.h +++ b/arch/x86/include/asm/insn-eval.h @@ -32,16 +32,16 @@ int insn_fetch_from_user_inatomic(struct pt_regs *regs, bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE], int buf_size); -enum mmio_type { - MMIO_DECODE_FAILED, - MMIO_WRITE, - MMIO_WRITE_IMM, - MMIO_READ, - MMIO_READ_ZERO_EXTEND, - MMIO_READ_SIGN_EXTEND, - MMIO_MOVS, +enum insn_mmio_type { + INSN_MMIO_DECODE_FAILED, + INSN_MMIO_WRITE, + INSN_MMIO_WRITE_IMM, + INSN_MMIO_READ, + INSN_MMIO_READ_ZERO_EXTEND, + INSN_MMIO_READ_SIGN_EXTEND, + INSN_MMIO_MOVS, }; -enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes); +enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes); #endif /* _ASM_X86_INSN_EVAL_H */ diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index 147cb8fdda92..798183867d78 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -116,7 +116,7 @@ ASM_CALL_ARG2 #define call_on_irqstack(func, asm_call, argconstr...) \ - call_on_stack(__this_cpu_read(hardirq_stack_ptr), \ + call_on_stack(__this_cpu_read(pcpu_hot.hardirq_stack_ptr), \ func, asm_call, argconstr) /* Macros to assert type correctness for run_*_on_irqstack macros */ @@ -135,7 +135,7 @@ * User mode entry and interrupt on the irq stack do not \ * switch stacks. If from user mode the task stack is empty. \ */ \ - if (user_mode(regs) || __this_cpu_read(hardirq_stack_inuse)) { \ + if (user_mode(regs) || __this_cpu_read(pcpu_hot.hardirq_stack_inuse)) { \ irq_enter_rcu(); \ func(c_args); \ irq_exit_rcu(); \ @@ -146,9 +146,9 @@ * places. Invoke the stack switch macro with the call \ * sequence which matches the above direct invocation. 
\ */ \ - __this_cpu_write(hardirq_stack_inuse, true); \ + __this_cpu_write(pcpu_hot.hardirq_stack_inuse, true); \ call_on_irqstack(func, asm_call, constr); \ - __this_cpu_write(hardirq_stack_inuse, false); \ + __this_cpu_write(pcpu_hot.hardirq_stack_inuse, false); \ } \ } @@ -212,9 +212,9 @@ */ #define do_softirq_own_stack() \ { \ - __this_cpu_write(hardirq_stack_inuse, true); \ + __this_cpu_write(pcpu_hot.hardirq_stack_inuse, true); \ call_on_irqstack(__do_softirq, ASM_CALL_ARG0); \ - __this_cpu_write(hardirq_stack_inuse, false); \ + __this_cpu_write(pcpu_hot.hardirq_stack_inuse, false); \ } #endif diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h index 13e70da38bed..de75306b932e 100644 --- a/arch/x86/include/asm/kasan.h +++ b/arch/x86/include/asm/kasan.h @@ -28,9 +28,12 @@ #ifdef CONFIG_KASAN void __init kasan_early_init(void); void __init kasan_init(void); +void __init kasan_populate_shadow_for_vaddr(void *va, size_t size, int nid); #else static inline void kasan_early_init(void) { } static inline void kasan_init(void) { } +static inline void kasan_populate_shadow_for_vaddr(void *va, size_t size, + int nid) { } #endif #endif diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 82ba4a564e58..abccd51dcfca 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -110,10 +110,12 @@ KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt) KVM_X86_OP_OPTIONAL(set_hv_timer) KVM_X86_OP_OPTIONAL(cancel_hv_timer) KVM_X86_OP(setup_mce) +#ifdef CONFIG_KVM_SMM KVM_X86_OP(smi_allowed) KVM_X86_OP(enter_smm) KVM_X86_OP(leave_smm) KVM_X86_OP(enable_smi_window) +#endif KVM_X86_OP_OPTIONAL(mem_enc_ioctl) KVM_X86_OP_OPTIONAL(mem_enc_register_region) KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) @@ -123,7 +125,7 @@ KVM_X86_OP_OPTIONAL(guest_memory_reclaimed) KVM_X86_OP(get_msr_feature) KVM_X86_OP(can_emulate_instruction) KVM_X86_OP(apic_init_signal_blocked) -KVM_X86_OP_OPTIONAL(enable_direct_tlbflush) +KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush) KVM_X86_OP_OPTIONAL(migrate_timers) KVM_X86_OP(msr_filter_changed) KVM_X86_OP(complete_emulated_msr) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f05ebaa26f0f..6aaae18f1854 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -25,6 +25,7 @@ #include <linux/clocksource.h> #include <linux/irqbypass.h> #include <linux/hyperv.h> +#include <linux/kfifo.h> #include <asm/apic.h> #include <asm/pvclock-abi.h> @@ -81,7 +82,9 @@ #define KVM_REQ_NMI KVM_ARCH_REQ(9) #define KVM_REQ_PMU KVM_ARCH_REQ(10) #define KVM_REQ_PMI KVM_ARCH_REQ(11) +#ifdef CONFIG_KVM_SMM #define KVM_REQ_SMI KVM_ARCH_REQ(12) +#endif #define KVM_REQ_MASTERCLOCK_UPDATE KVM_ARCH_REQ(13) #define KVM_REQ_MCLOCK_INPROGRESS \ KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) @@ -108,6 +111,8 @@ KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_MMU_FREE_OBSOLETE_ROOTS \ KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_HV_TLB_FLUSH \ + KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -204,6 +209,7 @@ typedef enum exit_fastpath_completion fastpath_t; struct x86_emulate_ctxt; struct x86_exception; +union kvm_smram; enum x86_intercept; enum x86_intercept_stage; @@ -253,16 +259,16 @@ enum x86_intercept_stage; #define PFERR_GUEST_PAGE_BIT 33 #define 
PFERR_IMPLICIT_ACCESS_BIT 48 -#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) -#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) -#define PFERR_USER_MASK (1U << PFERR_USER_BIT) -#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) -#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) -#define PFERR_PK_MASK (1U << PFERR_PK_BIT) -#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT) -#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT) -#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT) -#define PFERR_IMPLICIT_ACCESS (1ULL << PFERR_IMPLICIT_ACCESS_BIT) +#define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT) +#define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT) +#define PFERR_USER_MASK BIT(PFERR_USER_BIT) +#define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT) +#define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT) +#define PFERR_PK_MASK BIT(PFERR_PK_BIT) +#define PFERR_SGX_MASK BIT(PFERR_SGX_BIT) +#define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT) +#define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT) +#define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT) #define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ PFERR_WRITE_MASK | \ @@ -488,17 +494,19 @@ enum pmc_type { struct kvm_pmc { enum pmc_type type; u8 idx; + bool is_paused; + bool intr; u64 counter; + u64 prev_counter; u64 eventsel; struct perf_event *perf_event; struct kvm_vcpu *vcpu; /* + * only for creating or reusing perf_event, * eventsel value for general purpose counters, * ctrl value for fixed counters. */ u64 current_config; - bool is_paused; - bool intr; }; /* More counters may conflict with other existing Architectural MSRs */ @@ -524,7 +532,16 @@ struct kvm_pmu { struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED]; struct irq_work irq_work; - DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX); + + /* + * Overlay the bitmap with a 64-bit atomic so that all bits can be + * set in a single access, e.g. to reprogram all counters when the PMU + * filter changes. + */ + union { + DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX); + atomic64_t __reprogram_pmi; + }; DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX); DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX); @@ -602,6 +619,29 @@ struct kvm_vcpu_hv_synic { bool dont_zero_synic_pages; }; +/* The maximum number of entries on the TLB flush fifo. */ +#define KVM_HV_TLB_FLUSH_FIFO_SIZE (16) +/* + * Note: the following 'magic' entry is made up by KVM to avoid putting + * anything besides GVA on the TLB flush fifo. It is theoretically possible + * to observe a request to flush 4095 PFNs starting from 0xfffffffffffff000 + * which will look identical. KVM's action to 'flush everything' instead of + * flushing these particular addresses is, however, fully legitimate as + * flushing more than requested is always OK. 
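A sketch of how the flush fifo and the flush-all sentinel defined just below fit together (editorial, not KVM's actual enqueue path; it assumes the standard kfifo helpers and the demo_ name is hypothetical):

/* Queue guest VAs for a deferred flush; when they do not all fit, degrade to
 * the single flush-all entry, which is always safe because over-flushing is
 * permitted.
 */
static void demo_hv_tlb_flush_enqueue(struct kvm_vcpu_hv_tlb_flush_fifo *fifo,
                                      u64 *gvas, int count)
{
        u64 flush_all = KVM_HV_TLB_FLUSHALL_ENTRY;

        spin_lock(&fifo->write_lock);

        if (count <= kfifo_avail(&fifo->entries)) {
                kfifo_in(&fifo->entries, gvas, count);
        } else {
                kfifo_reset_out(&fifo->entries);         /* drop queued GVAs ... */
                kfifo_in(&fifo->entries, &flush_all, 1); /* ... and flush it all */
        }

        spin_unlock(&fifo->write_lock);
}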
+ */ +#define KVM_HV_TLB_FLUSHALL_ENTRY ((u64)-1) + +enum hv_tlb_flush_fifos { + HV_L1_TLB_FLUSH_FIFO, + HV_L2_TLB_FLUSH_FIFO, + HV_NR_TLB_FLUSH_FIFOS, +}; + +struct kvm_vcpu_hv_tlb_flush_fifo { + spinlock_t write_lock; + DECLARE_KFIFO(entries, u64, KVM_HV_TLB_FLUSH_FIFO_SIZE); +}; + /* Hyper-V per vcpu emulation context */ struct kvm_vcpu_hv { struct kvm_vcpu *vcpu; @@ -623,6 +663,19 @@ struct kvm_vcpu_hv { u32 nested_eax; /* HYPERV_CPUID_NESTED_FEATURES.EAX */ u32 nested_ebx; /* HYPERV_CPUID_NESTED_FEATURES.EBX */ } cpuid_cache; + + struct kvm_vcpu_hv_tlb_flush_fifo tlb_flush_fifo[HV_NR_TLB_FLUSH_FIFOS]; + + /* Preallocated buffer for handling hypercalls passing sparse vCPU set */ + u64 sparse_banks[HV_MAX_SPARSE_VCPU_BANKS]; + + struct hv_vp_assist_page vp_assist_page; + + struct { + u64 pa_page_gpa; + u64 vm_id; + u32 vp_id; + } nested; }; /* Xen HVM per vcpu emulation context */ @@ -633,6 +686,7 @@ struct kvm_vcpu_xen { struct gfn_to_pfn_cache vcpu_info_cache; struct gfn_to_pfn_cache vcpu_time_info_cache; struct gfn_to_pfn_cache runstate_cache; + struct gfn_to_pfn_cache runstate2_cache; u64 last_steal; u64 runstate_entry_time; u64 runstate_times[4]; @@ -1057,8 +1111,10 @@ struct msr_bitmap_range { /* Xen emulation context */ struct kvm_xen { + struct mutex xen_lock; u32 xen_version; bool long_mode; + bool runstate_update_flag; u8 upcall_vector; struct gfn_to_pfn_cache shinfo_cache; struct idr evtchn_ports; @@ -1156,7 +1212,18 @@ struct kvm_arch { struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; struct list_head active_mmu_pages; struct list_head zapped_obsolete_pages; - struct list_head lpage_disallowed_mmu_pages; + /* + * A list of kvm_mmu_page structs that, if zapped, could possibly be + * replaced by an NX huge page. A shadow page is on this list if its + * existence disallows an NX huge page (nx_huge_page_disallowed is set) + * and there are no other conditions that prevent a huge page, e.g. + * the backing host page is huge, dirtly logging is not enabled for its + * memslot, etc... Note, zapping shadow pages on this list doesn't + * guarantee an NX huge page will be created in its stead, e.g. if the + * guest attempts to execute from the region then KVM obviously can't + * create an NX huge page (without hanging the guest). + */ + struct list_head possible_nx_huge_pages; struct kvm_page_track_notifier_node mmu_sp_tracker; struct kvm_page_track_notifier_head track_notifier_head; /* @@ -1272,7 +1339,7 @@ struct kvm_arch { bool sgx_provisioning_allowed; struct kvm_pmu_event_filter __rcu *pmu_event_filter; - struct task_struct *nx_lpage_recovery_thread; + struct task_struct *nx_huge_page_recovery_thread; #ifdef CONFIG_X86_64 /* @@ -1284,6 +1351,9 @@ struct kvm_arch { */ bool tdp_mmu_enabled; + /* The number of TDP MMU pages across all roots. */ + atomic64_t tdp_mmu_pages; + /* * List of kvm_mmu_page structs being used as roots. * All kvm_mmu_page structs in the list should have @@ -1305,20 +1375,12 @@ struct kvm_arch { struct list_head tdp_mmu_roots; /* - * List of kvm_mmu_page structs not being used as roots. - * All kvm_mmu_page structs in the list should have - * tdp_mmu_page set and a tdp_mmu_root_count of 0. 
- */ - struct list_head tdp_mmu_pages; - - /* * Protects accesses to the following fields when the MMU lock * is held in read mode: * - tdp_mmu_roots (above) - * - tdp_mmu_pages (above) * - the link field of kvm_mmu_page structs used by the TDP MMU - * - lpage_disallowed_mmu_pages - * - the lpage_disallowed_link field of kvm_mmu_page structs used + * - possible_nx_huge_pages; + * - the possible_nx_huge_page_link field of kvm_mmu_page structs used * by the TDP MMU * It is acceptable, but not necessary, to acquire this lock when * the thread holds the MMU lock in write mode. @@ -1612,10 +1674,12 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); +#ifdef CONFIG_KVM_SMM int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); - int (*enter_smm)(struct kvm_vcpu *vcpu, char *smstate); - int (*leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); + int (*enter_smm)(struct kvm_vcpu *vcpu, union kvm_smram *smram); + int (*leave_smm)(struct kvm_vcpu *vcpu, const union kvm_smram *smram); void (*enable_smi_window)(struct kvm_vcpu *vcpu); +#endif int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp); int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp); @@ -1630,7 +1694,7 @@ struct kvm_x86_ops { void *insn, int insn_len); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); - int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu); + int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu); void (*migrate_timers)(struct kvm_vcpu *vcpu); void (*msr_filter_changed)(struct kvm_vcpu *vcpu); @@ -1663,6 +1727,7 @@ struct kvm_x86_nested_ops { int (*enable_evmcs)(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu); + void (*hv_inject_synthetic_vmexit_post_tlb_flush)(struct kvm_vcpu *vcpu); }; struct kvm_x86_init_ops { @@ -1844,6 +1909,7 @@ int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu); int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); @@ -1909,8 +1975,6 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu); gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); -gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, @@ -1994,14 +2058,18 @@ enum { #define HF_NMI_MASK (1 << 3) #define HF_IRET_MASK (1 << 4) #define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ + +#ifdef CONFIG_KVM_SMM #define HF_SMM_MASK (1 << 6) #define HF_SMM_INSIDE_NMI_MASK (1 << 7) -#define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE -#define KVM_ADDRESS_SPACE_NUM 2 - -#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) -#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) +# define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE +# define KVM_ADDRESS_SPACE_NUM 2 +# define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 
1 : 0) +# define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) +#else +# define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0) +#endif #define KVM_ARCH_WANT_MMU_NOTIFIER @@ -2089,14 +2157,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #endif } -#define put_smstate(type, buf, offset, val) \ - *(type *)((buf) + (offset) - 0x7e00) = val - -#define GET_SMSTATE(type, buf, offset) \ - (*(type *)((buf) + (offset) - 0x7e00)) - -int kvm_cpu_dirty_log_size(void); - int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); #define KVM_CLOCK_VALID_FLAGS \ diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index f484d656d34e..dd9b8118f784 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -12,13 +12,26 @@ #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) #endif /* CONFIG_X86_32 */ -#ifdef __ASSEMBLY__ - -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_ALIGNMENT_16) -#define __ALIGN .p2align 4, 0x90 +#define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT, 0x90; #define __ALIGN_STR __stringify(__ALIGN) + +#if defined(CONFIG_CALL_PADDING) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#define FUNCTION_PADDING .skip CONFIG_FUNCTION_ALIGNMENT, 0x90; +#else +#define FUNCTION_PADDING +#endif + +#if (CONFIG_FUNCTION_ALIGNMENT > 8) && !defined(__DISABLE_EXPORTS) && !defined(BULID_VDSO) +# define __FUNC_ALIGN __ALIGN; FUNCTION_PADDING +#else +# define __FUNC_ALIGN __ALIGN #endif +#define ASM_FUNC_ALIGN __stringify(__FUNC_ALIGN) +#define SYM_F_ALIGN __FUNC_ALIGN + +#ifdef __ASSEMBLY__ + #if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define RET jmp __x86_return_thunk #else /* CONFIG_RETPOLINE */ @@ -43,11 +56,45 @@ #endif /* __ASSEMBLY__ */ +/* + * Depending on -fpatchable-function-entry=N,N usage (CONFIG_CALL_PADDING) the + * CFI symbol layout changes. + * + * Without CALL_THUNKS: + * + * .align FUNCTION_ALIGNMENT + * __cfi_##name: + * .skip FUNCTION_PADDING, 0x90 + * .byte 0xb8 + * .long __kcfi_typeid_##name + * name: + * + * With CALL_THUNKS: + * + * .align FUNCTION_ALIGNMENT + * __cfi_##name: + * .byte 0xb8 + * .long __kcfi_typeid_##name + * .skip FUNCTION_PADDING, 0x90 + * name: + * + * In both cases the whole thing is FUNCTION_ALIGNMENT aligned and sized. 
+ */ + +#ifdef CONFIG_CALL_PADDING +#define CFI_PRE_PADDING +#define CFI_POST_PADDING .skip CONFIG_FUNCTION_PADDING_BYTES, 0x90; +#else +#define CFI_PRE_PADDING .skip CONFIG_FUNCTION_PADDING_BYTES, 0x90; +#define CFI_POST_PADDING +#endif + #define __CFI_TYPE(name) \ SYM_START(__cfi_##name, SYM_L_LOCAL, SYM_A_NONE) \ - .fill 11, 1, 0x90 ASM_NL \ + CFI_PRE_PADDING \ .byte 0xb8 ASM_NL \ .long __kcfi_typeid_##name ASM_NL \ + CFI_POST_PADDING \ SYM_FUNC_END(__cfi_##name) /* SYM_TYPED_FUNC_START -- use for indirectly called globals, w/ CFI type */ @@ -57,7 +104,7 @@ /* SYM_FUNC_START -- use for global functions */ #define SYM_FUNC_START(name) \ - SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \ + SYM_START(name, SYM_L_GLOBAL, SYM_F_ALIGN) \ ENDBR /* SYM_FUNC_START_NOALIGN -- use for global functions, w/o alignment */ @@ -67,7 +114,7 @@ /* SYM_FUNC_START_LOCAL -- use for local functions */ #define SYM_FUNC_START_LOCAL(name) \ - SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN) \ + SYM_START(name, SYM_L_LOCAL, SYM_F_ALIGN) \ ENDBR /* SYM_FUNC_START_LOCAL_NOALIGN -- use for local functions, w/o alignment */ @@ -77,7 +124,7 @@ /* SYM_FUNC_START_WEAK -- use for weak functions */ #define SYM_FUNC_START_WEAK(name) \ - SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN) \ + SYM_START(name, SYM_L_WEAK, SYM_F_ALIGN) \ ENDBR /* SYM_FUNC_START_WEAK_NOALIGN -- use for weak functions, w/o alignment */ diff --git a/arch/x86/include/asm/memtype.h b/arch/x86/include/asm/memtype.h index 9ca760e430b9..113b2fa51849 100644 --- a/arch/x86/include/asm/memtype.h +++ b/arch/x86/include/asm/memtype.h @@ -6,9 +6,8 @@ #include <asm/pgtable_types.h> extern bool pat_enabled(void); -extern void pat_disable(const char *reason); -extern void pat_init(void); -extern void init_cache_modes(void); +extern void pat_bp_init(void); +extern void pat_cpu_init(void); extern int memtype_reserve(u64 start, u64 end, enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 74ecc2bd6cd0..d5a58bde091c 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -33,8 +33,7 @@ enum ucode_state { }; struct microcode_ops { - enum ucode_state (*request_microcode_fw) (int cpu, struct device *, - bool refresh_fw); + enum ucode_state (*request_microcode_fw) (int cpu, struct device *); void (*microcode_fini_cpu) (int cpu); @@ -50,7 +49,6 @@ struct microcode_ops { struct ucode_cpu_info { struct cpu_signature cpu_sig; - int valid; void *mc; }; extern struct ucode_cpu_info ucode_cpu_info[]; diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 4c92cea7e4b5..f1fa979e05bf 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -14,7 +14,8 @@ struct microcode_header_intel { unsigned int pf; unsigned int datasize; unsigned int totalsize; - unsigned int reserved[3]; + unsigned int metasize; + unsigned int reserved[2]; }; struct microcode_intel { @@ -41,6 +42,8 @@ struct extended_sigtable { #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) #define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) #define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) +#define MC_HEADER_TYPE_MICROCODE 1 +#define MC_HEADER_TYPE_IFS 2 #define get_totalsize(mc) \ (((struct microcode_intel *)mc)->hdr.datasize ? 
\ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6bcd5d2dd718..d3fe82c5d6b6 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -566,6 +566,26 @@ #define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT) #define MSR_AMD64_SEV_SNP_ENABLED BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT) +/* SNP feature bits enabled by the hypervisor */ +#define MSR_AMD64_SNP_VTOM BIT_ULL(3) +#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(4) +#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(5) +#define MSR_AMD64_SNP_ALT_INJ BIT_ULL(6) +#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(7) +#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(8) +#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(9) +#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(10) +#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(11) +#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(12) +#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(14) +#define MSR_AMD64_SNP_VMSA_REG_PROTECTION BIT_ULL(16) +#define MSR_AMD64_SNP_SMT_PROTECTION BIT_ULL(17) + +/* SNP feature bits reserved for future use. */ +#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13) +#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15) +#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, 18) + #define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f /* AMD Collaborative Processor Performance Control MSRs */ @@ -793,6 +813,7 @@ #define ENERGY_PERF_BIAS_PERFORMANCE 0 #define ENERGY_PERF_BIAS_BALANCE_PERFORMANCE 4 #define ENERGY_PERF_BIAS_NORMAL 6 +#define ENERGY_PERF_BIAS_NORMAL_POWERSAVE 7 #define ENERGY_PERF_BIAS_BALANCE_POWERSAVE 8 #define ENERGY_PERF_BIAS_POWERSAVE 15 diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index 76d726074c16..f0eeaf6e5f5f 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -25,13 +25,12 @@ #include <uapi/asm/mtrr.h> -void mtrr_bp_init(void); - /* * The following functions are for use by other drivers that cannot use * arch_phys_wc_add and arch_phys_wc_del. 
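For context, a driver-style usage sketch of the interface this comment refers to (editorial; fb_base/fb_size and the demo_ names are placeholders, and new code should normally prefer arch_phys_wc_add()):

/* Mark a framebuffer aperture write-combining via the legacy MTRR API, then
 * tear the mapping down again on driver unload.
 */
static int demo_mtrr_cookie = -1;

static void demo_enable_wc(unsigned long fb_base, unsigned long fb_size)
{
        demo_mtrr_cookie = mtrr_add(fb_base, fb_size, MTRR_TYPE_WRCOMB, true);
}

static void demo_disable_wc(unsigned long fb_base, unsigned long fb_size)
{
        if (demo_mtrr_cookie >= 0)
                mtrr_del(demo_mtrr_cookie, fb_base, fb_size);
}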
*/ # ifdef CONFIG_MTRR +void mtrr_bp_init(void); extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform); extern void mtrr_save_fixed_ranges(void *); extern void mtrr_save_state(void); @@ -42,12 +41,12 @@ extern int mtrr_add_page(unsigned long base, unsigned long size, extern int mtrr_del(int reg, unsigned long base, unsigned long size); extern int mtrr_del_page(int reg, unsigned long base, unsigned long size); extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); -extern void mtrr_ap_init(void); -extern void set_mtrr_aps_delayed_init(void); -extern void mtrr_aps_init(void); extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); +void mtrr_disable(void); +void mtrr_enable(void); +void mtrr_generic_set_state(void); # else static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform) { @@ -83,10 +82,11 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn) static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { } -#define mtrr_ap_init() do {} while (0) -#define set_mtrr_aps_delayed_init() do {} while (0) -#define mtrr_aps_init() do {} while (0) +#define mtrr_bp_init() do {} while (0) #define mtrr_bp_restore() do {} while (0) +#define mtrr_disable() do {} while (0) +#define mtrr_enable() do {} while (0) +#define mtrr_generic_set_state() do {} while (0) # endif #ifdef CONFIG_COMPAT diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index dfdb103ae4f6..771b0a2b7a34 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -12,8 +12,104 @@ #include <asm/msr-index.h> #include <asm/unwind_hints.h> #include <asm/percpu.h> +#include <asm/current.h> -#define RETPOLINE_THUNK_SIZE 32 +/* + * Call depth tracking for Intel SKL CPUs to address the RSB underflow + * issue in software. + * + * The tracking does not use a counter. It uses uses arithmetic shift + * right on call entry and logical shift left on return. + * + * The depth tracking variable is initialized to 0x8000.... when the call + * depth is zero. The arithmetic shift right sign extends the MSB and + * saturates after the 12th call. The shift count is 5 for both directions + * so the tracking covers 12 nested calls. + * + * Call + * 0: 0x8000000000000000 0x0000000000000000 + * 1: 0xfc00000000000000 0xf000000000000000 + * ... + * 11: 0xfffffffffffffff8 0xfffffffffffffc00 + * 12: 0xffffffffffffffff 0xffffffffffffffe0 + * + * After a return buffer fill the depth is credited 12 calls before the + * next stuffing has to take place. + * + * There is a inaccuracy for situations like this: + * + * 10 calls + * 5 returns + * 3 calls + * 4 returns + * 3 calls + * .... + * + * The shift count might cause this to be off by one in either direction, + * but there is still a cushion vs. the RSB depth. The algorithm does not + * claim to be perfect and it can be speculated around by the CPU, but it + * is considered that it obfuscates the problem enough to make exploitation + * extremly difficult. 
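To make the shift arithmetic described above concrete, a small C model (editorial only; the kernel does this in asm on pcpu_hot.call_depth, and the constants follow just below this comment):

#include <stdint.h>

static int64_t demo_depth = (int64_t)0x8000000000000000ULL;     /* call depth 0 */

static inline void demo_track_call(void)
{
        demo_depth >>= 5;       /* arithmetic shift (as gcc/clang implement >> on
                                 * signed values): one call turns 0x8000000000000000
                                 * into 0xfc00000000000000, and the value saturates
                                 * at 0xffffffffffffffff after enough nested calls.
                                 */
}

static inline void demo_track_return(void)
{
        demo_depth = (int64_t)((uint64_t)demo_depth << 5);      /* logical shift left */
}

/* An RSB stuffing sequence re-credits the tracking by storing RET_DEPTH_CREDIT
 * (all ones), as the CREDIT_CALL_DEPTH macro below does with movq $-1.
 */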
+ */ +#define RET_DEPTH_SHIFT 5 +#define RSB_RET_STUFF_LOOPS 16 +#define RET_DEPTH_INIT 0x8000000000000000ULL +#define RET_DEPTH_INIT_FROM_CALL 0xfc00000000000000ULL +#define RET_DEPTH_CREDIT 0xffffffffffffffffULL + +#ifdef CONFIG_CALL_THUNKS_DEBUG +# define CALL_THUNKS_DEBUG_INC_CALLS \ + incq %gs:__x86_call_count; +# define CALL_THUNKS_DEBUG_INC_RETS \ + incq %gs:__x86_ret_count; +# define CALL_THUNKS_DEBUG_INC_STUFFS \ + incq %gs:__x86_stuffs_count; +# define CALL_THUNKS_DEBUG_INC_CTXSW \ + incq %gs:__x86_ctxsw_count; +#else +# define CALL_THUNKS_DEBUG_INC_CALLS +# define CALL_THUNKS_DEBUG_INC_RETS +# define CALL_THUNKS_DEBUG_INC_STUFFS +# define CALL_THUNKS_DEBUG_INC_CTXSW +#endif + +#if defined(CONFIG_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS) + +#include <asm/asm-offsets.h> + +#define CREDIT_CALL_DEPTH \ + movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth); + +#define ASM_CREDIT_CALL_DEPTH \ + movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth); + +#define RESET_CALL_DEPTH \ + mov $0x80, %rax; \ + shl $56, %rax; \ + movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); + +#define RESET_CALL_DEPTH_FROM_CALL \ + mov $0xfc, %rax; \ + shl $56, %rax; \ + movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); \ + CALL_THUNKS_DEBUG_INC_CALLS + +#define INCREMENT_CALL_DEPTH \ + sarq $5, %gs:pcpu_hot + X86_call_depth; \ + CALL_THUNKS_DEBUG_INC_CALLS + +#define ASM_INCREMENT_CALL_DEPTH \ + sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); \ + CALL_THUNKS_DEBUG_INC_CALLS + +#else +#define CREDIT_CALL_DEPTH +#define ASM_CREDIT_CALL_DEPTH +#define RESET_CALL_DEPTH +#define INCREMENT_CALL_DEPTH +#define ASM_INCREMENT_CALL_DEPTH +#define RESET_CALL_DEPTH_FROM_CALL +#endif /* * Fill the CPU return stack buffer. @@ -32,6 +128,7 @@ * from C via asm(".include <asm/nospec-branch.h>") but let's not go there. */ +#define RETPOLINE_THUNK_SIZE 32 #define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ /* @@ -60,7 +157,9 @@ dec reg; \ jnz 771b; \ /* barrier for jnz misprediction */ \ - lfence; + lfence; \ + ASM_CREDIT_CALL_DEPTH \ + CALL_THUNKS_DEBUG_INC_CTXSW #else /* * i386 doesn't unconditionally have LFENCE, as such it can't @@ -185,11 +284,32 @@ * where we have a stack but before any RET instruction. 
*/ .macro UNTRAIN_RET -#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) +#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ + defined(CONFIG_CALL_DEPTH_TRACKING) ANNOTATE_UNRET_END - ALTERNATIVE_2 "", \ - CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ - "call entry_ibpb", X86_FEATURE_ENTRY_IBPB + ALTERNATIVE_3 "", \ + CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ + "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ + __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH +#endif +.endm + +.macro UNTRAIN_RET_FROM_CALL +#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ + defined(CONFIG_CALL_DEPTH_TRACKING) + ANNOTATE_UNRET_END + ALTERNATIVE_3 "", \ + CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ + "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ + __stringify(RESET_CALL_DEPTH_FROM_CALL), X86_FEATURE_CALL_DEPTH +#endif +.endm + + +.macro CALL_DEPTH_ACCOUNT +#ifdef CONFIG_CALL_DEPTH_TRACKING + ALTERNATIVE "", \ + __stringify(ASM_INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH #endif .endm @@ -203,11 +323,45 @@ typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; extern retpoline_thunk_t __x86_indirect_thunk_array[]; +extern retpoline_thunk_t __x86_indirect_call_thunk_array[]; +extern retpoline_thunk_t __x86_indirect_jump_thunk_array[]; extern void __x86_return_thunk(void); extern void zen_untrain_ret(void); extern void entry_ibpb(void); +#ifdef CONFIG_CALL_THUNKS +extern void (*x86_return_thunk)(void); +#else +#define x86_return_thunk (&__x86_return_thunk) +#endif + +#ifdef CONFIG_CALL_DEPTH_TRACKING +extern void __x86_return_skl(void); + +static inline void x86_set_skl_return_thunk(void) +{ + x86_return_thunk = &__x86_return_skl; +} + +#define CALL_DEPTH_ACCOUNT \ + ALTERNATIVE("", \ + __stringify(INCREMENT_CALL_DEPTH), \ + X86_FEATURE_CALL_DEPTH) + +#ifdef CONFIG_CALL_THUNKS_DEBUG +DECLARE_PER_CPU(u64, __x86_call_count); +DECLARE_PER_CPU(u64, __x86_ret_count); +DECLARE_PER_CPU(u64, __x86_stuffs_count); +DECLARE_PER_CPU(u64, __x86_ctxsw_count); +#endif +#else +static inline void x86_set_skl_return_thunk(void) {} + +#define CALL_DEPTH_ACCOUNT "" + +#endif + #ifdef CONFIG_RETPOLINE #define GEN(reg) \ @@ -215,6 +369,16 @@ extern void entry_ibpb(void); #include <asm/GEN-for-each-reg.h> #undef GEN +#define GEN(reg) \ + extern retpoline_thunk_t __x86_indirect_call_thunk_ ## reg; +#include <asm/GEN-for-each-reg.h> +#undef GEN + +#define GEN(reg) \ + extern retpoline_thunk_t __x86_indirect_jump_thunk_ ## reg; +#include <asm/GEN-for-each-reg.h> +#undef GEN + #ifdef CONFIG_X86_64 /* diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index a506a411474d..86bd4311daf8 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -11,20 +11,14 @@ #define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) -#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) -#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) - -#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) -#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) - #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) -/* Cast *PAGE_MASK to a signed type so that it is sign-extended if +/* Cast P*D_MASK to a signed type so that it is sign-extended if virtual addresses are 32-bits but physical addresses are larger (ie, 32-bit PAE). 
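A standalone illustration of why the signed cast above matters (editorial; it assumes 32-bit PAE with 32-bit longs and, for the sake of the example, 36 physical address bits):

#include <stdint.h>

#define DEMO_PAGE_MASK          0xfffff000u             /* ~(PAGE_SIZE - 1) on 32-bit */
#define DEMO_PHYSICAL_MASK      ((1ULL << 36) - 1)      /* example __PHYSICAL_MASK    */

/* Sign-extending first keeps the physical address bits above bit 31 set: */
static const uint64_t demo_good = (uint64_t)((int64_t)(int32_t)DEMO_PAGE_MASK) & DEMO_PHYSICAL_MASK;
/*   == 0x0000000ffffff000 */

/* Zero-extending instead would wrongly clear them: */
static const uint64_t demo_bad = (uint64_t)DEMO_PAGE_MASK & DEMO_PHYSICAL_MASK;
/*   == 0x00000000fffff000 */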
*/ #define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK) -#define PHYSICAL_PMD_PAGE_MASK (((signed long)PMD_PAGE_MASK) & __PHYSICAL_MASK) -#define PHYSICAL_PUD_PAGE_MASK (((signed long)PUD_PAGE_MASK) & __PHYSICAL_MASK) +#define PHYSICAL_PMD_PAGE_MASK (((signed long)PMD_MASK) & __PHYSICAL_MASK) +#define PHYSICAL_PUD_PAGE_MASK (((signed long)PUD_MASK) & __PHYSICAL_MASK) #define HPAGE_SHIFT PMD_SHIFT #define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 2a0b8dd4ec33..73e9522db7c1 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -4,13 +4,13 @@ /* Various instructions on x86 need to be replaced for * para-virtualization: those hooks are defined here. */ +#include <asm/paravirt_types.h> + #ifdef CONFIG_PARAVIRT #include <asm/pgtable_types.h> #include <asm/asm.h> #include <asm/nospec-branch.h> -#include <asm/paravirt_types.h> - #ifndef __ASSEMBLY__ #include <linux/bug.h> #include <linux/types.h> @@ -665,6 +665,7 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); asm(".pushsection " section ", \"ax\";" \ ".globl " PV_THUNK_NAME(func) ";" \ ".type " PV_THUNK_NAME(func) ", @function;" \ + ASM_FUNC_ALIGN \ PV_THUNK_NAME(func) ":" \ ASM_ENDBR \ FRAME_BEGIN \ @@ -730,6 +731,18 @@ static __always_inline unsigned long arch_local_irq_save(void) #undef PVOP_VCALL4 #undef PVOP_CALL4 +#define DEFINE_PARAVIRT_ASM(func, instr, sec) \ + asm (".pushsection " #sec ", \"ax\"\n" \ + ".global " #func "\n\t" \ + ".type " #func ", @function\n\t" \ + ASM_FUNC_ALIGN "\n" \ + #func ":\n\t" \ + ASM_ENDBR \ + instr "\n\t" \ + ASM_RET \ + ".size " #func ", . - " #func "\n\t" \ + ".popsection") + extern void default_banner(void); #else /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index f3d601574730..8c1da419260f 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -2,36 +2,23 @@ #ifndef _ASM_X86_PARAVIRT_TYPES_H #define _ASM_X86_PARAVIRT_TYPES_H -/* Bitmask of what can be clobbered: usually at least eax. */ -#define CLBR_EAX (1 << 0) -#define CLBR_ECX (1 << 1) -#define CLBR_EDX (1 << 2) -#define CLBR_EDI (1 << 3) - -#ifdef CONFIG_X86_32 -/* CLBR_ANY should match all regs platform has. For i386, that's just it */ -#define CLBR_ANY ((1 << 4) - 1) - -#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) -#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) -#else -#define CLBR_RAX CLBR_EAX -#define CLBR_RCX CLBR_ECX -#define CLBR_RDX CLBR_EDX -#define CLBR_RDI CLBR_EDI -#define CLBR_RSI (1 << 4) -#define CLBR_R8 (1 << 5) -#define CLBR_R9 (1 << 6) -#define CLBR_R10 (1 << 7) -#define CLBR_R11 (1 << 8) - -#define CLBR_ANY ((1 << 9) - 1) +#ifndef __ASSEMBLY__ +/* These all sit in the .parainstructions section to tell us what to patch. 
*/ +struct paravirt_patch_site { + u8 *instr; /* original instructions */ + u8 type; /* type of this instruction */ + u8 len; /* length of original instruction */ +}; -#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ - CLBR_RCX | CLBR_R8 | CLBR_R9) -#define CLBR_RET_REG (CLBR_RAX) +/* Lazy mode for batching updates / context switch */ +enum paravirt_lazy_mode { + PARAVIRT_LAZY_NONE, + PARAVIRT_LAZY_MMU, + PARAVIRT_LAZY_CPU, +}; +#endif -#endif /* X86_64 */ +#ifdef CONFIG_PARAVIRT #ifndef __ASSEMBLY__ @@ -279,27 +266,23 @@ extern struct paravirt_patch_template pv_ops; #define paravirt_type(op) \ [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ [paravirt_opptr] "m" (pv_ops.op) -#define paravirt_clobber(clobber) \ - [paravirt_clobber] "i" (clobber) - /* * Generate some code, and mark it as patchable by the * apply_paravirt() alternate instruction patcher. */ -#define _paravirt_alt(insn_string, type, clobber) \ +#define _paravirt_alt(insn_string, type) \ "771:\n\t" insn_string "\n" "772:\n" \ ".pushsection .parainstructions,\"a\"\n" \ _ASM_ALIGN "\n" \ _ASM_PTR " 771b\n" \ " .byte " type "\n" \ " .byte 772b-771b\n" \ - " .short " clobber "\n" \ _ASM_ALIGN "\n" \ ".popsection\n" /* Generate patchable code, with the default asm parameters. */ #define paravirt_alt(insn_string) \ - _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") + _paravirt_alt(insn_string, "%c[paravirt_typenum]") /* Simple instruction patching code. */ #define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t" @@ -451,20 +434,19 @@ int paravirt_disable_iospace(void); }) -#define ____PVOP_CALL(ret, op, clbr, call_clbr, extra_clbr, ...) \ +#define ____PVOP_CALL(ret, op, call_clbr, extra_clbr, ...) \ ({ \ PVOP_CALL_ARGS; \ PVOP_TEST_NULL(op); \ asm volatile(paravirt_alt(PARAVIRT_CALL) \ : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ - paravirt_clobber(clbr), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ ret; \ }) -#define ____PVOP_ALT_CALL(ret, op, alt, cond, clbr, call_clbr, \ +#define ____PVOP_ALT_CALL(ret, op, alt, cond, call_clbr, \ extra_clbr, ...) \ ({ \ PVOP_CALL_ARGS; \ @@ -473,45 +455,44 @@ int paravirt_disable_iospace(void); alt, cond) \ : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ - paravirt_clobber(clbr), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ ret; \ }) #define __PVOP_CALL(rettype, op, ...) \ - ____PVOP_CALL(PVOP_RETVAL(rettype), op, CLBR_ANY, \ + ____PVOP_CALL(PVOP_RETVAL(rettype), op, \ PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, ##__VA_ARGS__) #define __PVOP_ALT_CALL(rettype, op, alt, cond, ...) \ - ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op, alt, cond, CLBR_ANY,\ + ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op, alt, cond, \ PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, \ ##__VA_ARGS__) #define __PVOP_CALLEESAVE(rettype, op, ...) \ - ____PVOP_CALL(PVOP_RETVAL(rettype), op.func, CLBR_RET_REG, \ + ____PVOP_CALL(PVOP_RETVAL(rettype), op.func, \ PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__) #define __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond, ...) \ ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op.func, alt, cond, \ - CLBR_RET_REG, PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__) + PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__) #define __PVOP_VCALL(op, ...) \ - (void)____PVOP_CALL(, op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ + (void)____PVOP_CALL(, op, PVOP_VCALL_CLOBBERS, \ VEXTRA_CLOBBERS, ##__VA_ARGS__) #define __PVOP_ALT_VCALL(op, alt, cond, ...) 
\ - (void)____PVOP_ALT_CALL(, op, alt, cond, CLBR_ANY, \ + (void)____PVOP_ALT_CALL(, op, alt, cond, \ PVOP_VCALL_CLOBBERS, VEXTRA_CLOBBERS, \ ##__VA_ARGS__) #define __PVOP_VCALLEESAVE(op, ...) \ - (void)____PVOP_CALL(, op.func, CLBR_RET_REG, \ + (void)____PVOP_CALL(, op.func, \ PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__) #define __PVOP_ALT_VCALLEESAVE(op, alt, cond, ...) \ - (void)____PVOP_ALT_CALL(, op.func, alt, cond, CLBR_RET_REG, \ + (void)____PVOP_ALT_CALL(, op.func, alt, cond, \ PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__) @@ -571,13 +552,6 @@ int paravirt_disable_iospace(void); __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) -/* Lazy mode for batching updates / context switch */ -enum paravirt_lazy_mode { - PARAVIRT_LAZY_NONE, - PARAVIRT_LAZY_MMU, - PARAVIRT_LAZY_CPU, -}; - enum paravirt_lazy_mode paravirt_get_lazy_mode(void); void paravirt_start_context_switch(struct task_struct *prev); void paravirt_end_context_switch(struct task_struct *next); @@ -593,16 +567,9 @@ unsigned long paravirt_ret0(void); #define paravirt_nop ((void *)_paravirt_nop) -/* These all sit in the .parainstructions section to tell us what to patch. */ -struct paravirt_patch_site { - u8 *instr; /* original instructions */ - u8 type; /* type of this instruction */ - u8 len; /* length of original instruction */ -}; - extern struct paravirt_patch_site __parainstructions[], __parainstructions_end[]; #endif /* __ASSEMBLY__ */ - +#endif /* CONFIG_PARAVIRT */ #endif /* _ASM_X86_PARAVIRT_TYPES_H */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 9ac46dbe57d4..5d0f6891ae61 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -543,12 +543,12 @@ static inline void perf_check_microcode(void) { } #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data); -extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr); +extern void x86_perf_get_lbr(struct x86_pmu_lbr *lbr); #else struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data); -static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) +static inline void x86_perf_get_lbr(struct x86_pmu_lbr *lbr) { - return -1; + memset(lbr, 0, sizeof(*lbr)); } #endif diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 28421a887209..967b135fa2c0 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -2,8 +2,6 @@ #ifndef _ASM_X86_PGTABLE_3LEVEL_H #define _ASM_X86_PGTABLE_3LEVEL_H -#include <asm/atomic64_32.h> - /* * Intel Physical Address Extension (PAE) Mode - three-level page * tables on PPro+ CPUs. @@ -21,7 +19,15 @@ pr_err("%s:%d: bad pgd %p(%016Lx)\n", \ __FILE__, __LINE__, &(e), pgd_val(e)) -/* Rules for using set_pte: the pte being assigned *must* be +#define pxx_xchg64(_pxx, _ptr, _val) ({ \ + _pxx##val_t *_p = (_pxx##val_t *)_ptr; \ + _pxx##val_t _o = *_p; \ + do { } while (!try_cmpxchg64(_p, &_o, (_val))); \ + native_make_##_pxx(_o); \ +}) + +/* + * Rules for using set_pte: the pte being assigned *must* be * either not present or in a state where the hardware will * not attempt to update the pte. 
In places where this is * not possible, use pte_get_and_clear to obtain the old pte @@ -29,75 +35,19 @@ */ static inline void native_set_pte(pte_t *ptep, pte_t pte) { - ptep->pte_high = pte.pte_high; + WRITE_ONCE(ptep->pte_high, pte.pte_high); smp_wmb(); - ptep->pte_low = pte.pte_low; -} - -#define pmd_read_atomic pmd_read_atomic -/* - * pte_offset_map_lock() on 32-bit PAE kernels was reading the pmd_t with - * a "*pmdp" dereference done by GCC. Problem is, in certain places - * where pte_offset_map_lock() is called, concurrent page faults are - * allowed, if the mmap_lock is hold for reading. An example is mincore - * vs page faults vs MADV_DONTNEED. On the page fault side - * pmd_populate() rightfully does a set_64bit(), but if we're reading the - * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen - * because GCC will not read the 64-bit value of the pmd atomically. - * - * To fix this all places running pte_offset_map_lock() while holding the - * mmap_lock in read mode, shall read the pmdp pointer using this - * function to know if the pmd is null or not, and in turn to know if - * they can run pte_offset_map_lock() or pmd_trans_huge() or other pmd - * operations. - * - * Without THP if the mmap_lock is held for reading, the pmd can only - * transition from null to not null while pmd_read_atomic() runs. So - * we can always return atomic pmd values with this function. - * - * With THP if the mmap_lock is held for reading, the pmd can become - * trans_huge or none or point to a pte (and in turn become "stable") - * at any time under pmd_read_atomic(). We could read it truly - * atomically here with an atomic64_read() for the THP enabled case (and - * it would be a whole lot simpler), but to avoid using cmpxchg8b we - * only return an atomic pmdval if the low part of the pmdval is later - * found to be stable (i.e. pointing to a pte). We are also returning a - * 'none' (zero) pmdval if the low part of the pmd is zero. - * - * In some cases the high and low part of the pmdval returned may not be - * consistent if THP is enabled (the low part may point to previously - * mapped hugepage, while the high part may point to a more recently - * mapped hugepage), but pmd_none_or_trans_huge_or_clear_bad() only - * needs the low part of the pmd to be read atomically to decide if the - * pmd is unstable or not, with the only exception when the low part - * of the pmd is zero, in which case we return a 'none' pmd. - */ -static inline pmd_t pmd_read_atomic(pmd_t *pmdp) -{ - pmdval_t ret; - u32 *tmp = (u32 *)pmdp; - - ret = (pmdval_t) (*tmp); - if (ret) { - /* - * If the low part is null, we must not read the high part - * or we can end up with a partial pmd. 
- */ - smp_rmb(); - ret |= ((pmdval_t)*(tmp + 1)) << 32; - } - - return (pmd_t) { ret }; + WRITE_ONCE(ptep->pte_low, pte.pte_low); } static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) { - set_64bit((unsigned long long *)(ptep), native_pte_val(pte)); + pxx_xchg64(pte, ptep, native_pte_val(pte)); } static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) { - set_64bit((unsigned long long *)(pmdp), native_pmd_val(pmd)); + pxx_xchg64(pmd, pmdp, native_pmd_val(pmd)); } static inline void native_set_pud(pud_t *pudp, pud_t pud) @@ -105,7 +55,7 @@ static inline void native_set_pud(pud_t *pudp, pud_t pud) #ifdef CONFIG_PAGE_TABLE_ISOLATION pud.p4d.pgd = pti_set_user_pgtbl(&pudp->p4d.pgd, pud.p4d.pgd); #endif - set_64bit((unsigned long long *)(pudp), native_pud_val(pud)); + pxx_xchg64(pud, pudp, native_pud_val(pud)); } /* @@ -116,17 +66,16 @@ static inline void native_set_pud(pud_t *pudp, pud_t pud) static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - ptep->pte_low = 0; + WRITE_ONCE(ptep->pte_low, 0); smp_wmb(); - ptep->pte_high = 0; + WRITE_ONCE(ptep->pte_high, 0); } -static inline void native_pmd_clear(pmd_t *pmd) +static inline void native_pmd_clear(pmd_t *pmdp) { - u32 *tmp = (u32 *)pmd; - *tmp = 0; + WRITE_ONCE(pmdp->pmd_low, 0); smp_wmb(); - *(tmp + 1) = 0; + WRITE_ONCE(pmdp->pmd_high, 0); } static inline void native_pud_clear(pud_t *pudp) @@ -149,41 +98,26 @@ static inline void pud_clear(pud_t *pudp) */ } + #ifdef CONFIG_SMP static inline pte_t native_ptep_get_and_clear(pte_t *ptep) { - pte_t res; - - res.pte = (pteval_t)arch_atomic64_xchg((atomic64_t *)ptep, 0); - - return res; + return pxx_xchg64(pte, ptep, 0ULL); } -#else -#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) -#endif -union split_pmd { - struct { - u32 pmd_low; - u32 pmd_high; - }; - pmd_t pmd; -}; - -#ifdef CONFIG_SMP static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) { - union split_pmd res, *orig = (union split_pmd *)pmdp; - - /* xchg acts as a barrier before setting of the high bits */ - res.pmd_low = xchg(&orig->pmd_low, 0); - res.pmd_high = orig->pmd_high; - orig->pmd_high = 0; + return pxx_xchg64(pmd, pmdp, 0ULL); +} - return res.pmd; +static inline pud_t native_pudp_get_and_clear(pud_t *pudp) +{ + return pxx_xchg64(pud, pudp, 0ULL); } #else +#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) +#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp) #endif #ifndef pmdp_establish @@ -199,53 +133,16 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, * anybody. 
*/ if (!(pmd_val(pmd) & _PAGE_PRESENT)) { - union split_pmd old, new, *ptr; - - ptr = (union split_pmd *)pmdp; - - new.pmd = pmd; - /* xchg acts as a barrier before setting of the high bits */ - old.pmd_low = xchg(&ptr->pmd_low, new.pmd_low); - old.pmd_high = ptr->pmd_high; - ptr->pmd_high = new.pmd_high; - return old.pmd; - } - - do { - old = *pmdp; - } while (cmpxchg64(&pmdp->pmd, old.pmd, pmd.pmd) != old.pmd); - - return old; -} -#endif - -#ifdef CONFIG_SMP -union split_pud { - struct { - u32 pud_low; - u32 pud_high; - }; - pud_t pud; -}; - -static inline pud_t native_pudp_get_and_clear(pud_t *pudp) -{ - union split_pud res, *orig = (union split_pud *)pudp; + old.pmd_low = xchg(&pmdp->pmd_low, pmd.pmd_low); + old.pmd_high = READ_ONCE(pmdp->pmd_high); + WRITE_ONCE(pmdp->pmd_high, pmd.pmd_high); -#ifdef CONFIG_PAGE_TABLE_ISOLATION - pti_set_user_pgtbl(&pudp->p4d.pgd, __pgd(0)); -#endif - - /* xchg acts as a barrier before setting of the high bits */ - res.pud_low = xchg(&orig->pud_low, 0); - res.pud_high = orig->pud_high; - orig->pud_high = 0; + return old; + } - return res.pud; + return pxx_xchg64(pmd, pmdp, pmd.pmd); } -#else -#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp) #endif /* Encode and de-code a swap entry */ diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 56baf43befb4..80911349519e 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -18,6 +18,13 @@ typedef union { }; pteval_t pte; } pte_t; + +typedef union { + struct { + unsigned long pmd_low, pmd_high; + }; + pmdval_t pmd; +} pmd_t; #endif /* !__ASSEMBLY__ */ #define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI)) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 286a71810f9e..0564edd24ffb 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -292,7 +292,23 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline int pte_uffd_wp(pte_t pte) { - return pte_flags(pte) & _PAGE_UFFD_WP; + bool wp = pte_flags(pte) & _PAGE_UFFD_WP; + +#ifdef CONFIG_DEBUG_VM + /* + * Having write bit for wr-protect-marked present ptes is fatal, + * because it means the uffd-wp bit will be ignored and write will + * just go through. + * + * Use any chance of pgtable walking to verify this (e.g., when + * page swapped out or being migrated for all purposes). It means + * something is already wrong. Tell the admin even before the + * process crashes. We also nail it with wrong pgtable setup. + */ + WARN_ON_ONCE(wp && pte_write(pte)); +#endif + + return wp; } static inline pte_t pte_mkuffd_wp(pte_t pte) diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 7c9c968a42ef..7d4ad8907297 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -48,15 +48,6 @@ do { \ #endif /* !__ASSEMBLY__ */ /* - * kern_addr_valid() is (1) for FLATMEM and (0) for SPARSEMEM - */ -#ifdef CONFIG_FLATMEM -#define kern_addr_valid(addr) (1) -#else -#define kern_addr_valid(kaddr) (0) -#endif - -/* * This is used to calculate the .brk reservation for initial pagetables. 
* Enough space is reserved to allocate pagetables sufficient to cover all * of LOWMEM_PAGES, which is an upper bound on the size of the direct map of diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index e479491da8d5..7929327abe00 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -240,7 +240,6 @@ static inline void native_pgd_clear(pgd_t *pgd) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) #define __swp_entry_to_pmd(x) ((pmd_t) { .pmd = (x).val }) -extern int kern_addr_valid(unsigned long addr); extern void cleanup_highmap(void); #define HAVE_ARCH_UNMAPPED_AREA diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 04f36063ad54..38bf837e3554 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -19,6 +19,7 @@ typedef unsigned long pgdval_t; typedef unsigned long pgprotval_t; typedef struct { pteval_t pte; } pte_t; +typedef struct { pmdval_t pmd; } pmd_t; #ifdef CONFIG_X86_5LEVEL extern unsigned int __pgtable_l5_enabled; diff --git a/arch/x86/include/asm/pgtable_areas.h b/arch/x86/include/asm/pgtable_areas.h index d34cce1b995c..4f056fb88174 100644 --- a/arch/x86/include/asm/pgtable_areas.h +++ b/arch/x86/include/asm/pgtable_areas.h @@ -11,6 +11,12 @@ #define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT) -#define CPU_ENTRY_AREA_MAP_SIZE (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_ARRAY_SIZE - CPU_ENTRY_AREA_BASE) +#ifdef CONFIG_X86_32 +#define CPU_ENTRY_AREA_MAP_SIZE (CPU_ENTRY_AREA_PER_CPU + \ + (CPU_ENTRY_AREA_SIZE * NR_CPUS) - \ + CPU_ENTRY_AREA_BASE) +#else +#define CPU_ENTRY_AREA_MAP_SIZE P4D_SIZE +#endif #endif /* _ASM_X86_PGTABLE_AREAS_H */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index aa174fed3a71..447d4bee25c4 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -361,11 +361,9 @@ static inline pudval_t native_pud_val(pud_t pud) #endif #if CONFIG_PGTABLE_LEVELS > 2 -typedef struct { pmdval_t pmd; } pmd_t; - static inline pmd_t native_make_pmd(pmdval_t val) { - return (pmd_t) { val }; + return (pmd_t) { .pmd = val }; } static inline pmdval_t native_pmd_val(pmd_t pmd) diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 5f6daea1ee24..2d13f25b1bd8 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -4,11 +4,11 @@ #include <asm/rmwcc.h> #include <asm/percpu.h> +#include <asm/current.h> + #include <linux/thread_info.h> #include <linux/static_call_types.h> -DECLARE_PER_CPU(int, __preempt_count); - /* We use the MSB mostly because its available */ #define PREEMPT_NEED_RESCHED 0x80000000 @@ -24,7 +24,7 @@ DECLARE_PER_CPU(int, __preempt_count); */ static __always_inline int preempt_count(void) { - return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED; + return raw_cpu_read_4(pcpu_hot.preempt_count) & ~PREEMPT_NEED_RESCHED; } static __always_inline void preempt_count_set(int pc) @@ -32,10 +32,10 @@ static __always_inline void preempt_count_set(int pc) int old, new; do { - old = raw_cpu_read_4(__preempt_count); + old = raw_cpu_read_4(pcpu_hot.preempt_count); new = (old & PREEMPT_NEED_RESCHED) | (pc & ~PREEMPT_NEED_RESCHED); - } while (raw_cpu_cmpxchg_4(__preempt_count, old, new) != old); + } while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old); } /* @@ -44,7 +44,7 @@ static __always_inline void preempt_count_set(int pc) #define 
init_task_preempt_count(p) do { } while (0) #define init_idle_preempt_count(p, cpu) do { \ - per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \ + per_cpu(pcpu_hot.preempt_count, (cpu)) = PREEMPT_DISABLED; \ } while (0) /* @@ -58,17 +58,17 @@ static __always_inline void preempt_count_set(int pc) static __always_inline void set_preempt_need_resched(void) { - raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED); + raw_cpu_and_4(pcpu_hot.preempt_count, ~PREEMPT_NEED_RESCHED); } static __always_inline void clear_preempt_need_resched(void) { - raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED); + raw_cpu_or_4(pcpu_hot.preempt_count, PREEMPT_NEED_RESCHED); } static __always_inline bool test_preempt_need_resched(void) { - return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED); + return !(raw_cpu_read_4(pcpu_hot.preempt_count) & PREEMPT_NEED_RESCHED); } /* @@ -77,12 +77,12 @@ static __always_inline bool test_preempt_need_resched(void) static __always_inline void __preempt_count_add(int val) { - raw_cpu_add_4(__preempt_count, val); + raw_cpu_add_4(pcpu_hot.preempt_count, val); } static __always_inline void __preempt_count_sub(int val) { - raw_cpu_add_4(__preempt_count, -val); + raw_cpu_add_4(pcpu_hot.preempt_count, -val); } /* @@ -92,7 +92,8 @@ static __always_inline void __preempt_count_sub(int val) */ static __always_inline bool __preempt_count_dec_and_test(void) { - return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var])); + return GEN_UNARY_RMWcc("decl", pcpu_hot.preempt_count, e, + __percpu_arg([var])); } /* @@ -100,7 +101,7 @@ static __always_inline bool __preempt_count_dec_and_test(void) */ static __always_inline bool should_resched(int preempt_offset) { - return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); + return unlikely(raw_cpu_read_4(pcpu_hot.preempt_count) == preempt_offset); } #ifdef CONFIG_PREEMPTION diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 02c2cbda4a74..a7f3d9100adb 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -35,7 +35,7 @@ */ #ifdef CONFIG_X86_64 /* Mask off the address space ID and SME encryption bits. 
*/ -#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull) +#define CR3_ADDR_MASK __sme_clr(PHYSICAL_PAGE_MASK) #define CR3_PCID_MASK 0xFFFull #define CR3_NOFLUSH BIT_ULL(63) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 67c9d73b31fa..4e35c66edeb7 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -16,6 +16,7 @@ struct vm86; #include <uapi/asm/sigcontext.h> #include <asm/current.h> #include <asm/cpufeatures.h> +#include <asm/cpuid.h> #include <asm/page.h> #include <asm/pgtable_types.h> #include <asm/percpu.h> @@ -146,17 +147,6 @@ struct cpuinfo_x86 { unsigned initialized : 1; } __randomize_layout; -struct cpuid_regs { - u32 eax, ebx, ecx, edx; -}; - -enum cpuid_regs_idx { - CPUID_EAX = 0, - CPUID_EBX, - CPUID_ECX, - CPUID_EDX, -}; - #define X86_VENDOR_INTEL 0 #define X86_VENDOR_CYRIX 1 #define X86_VENDOR_AMD 2 @@ -205,45 +195,6 @@ extern void identify_secondary_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); void print_cpu_msr(struct cpuinfo_x86 *); -#ifdef CONFIG_X86_32 -extern int have_cpuid_p(void); -#else -static inline int have_cpuid_p(void) -{ - return 1; -} -#endif -static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - /* ecx is often an input as well as an output. */ - asm volatile("cpuid" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx) - : "memory"); -} - -#define native_cpuid_reg(reg) \ -static inline unsigned int native_cpuid_##reg(unsigned int op) \ -{ \ - unsigned int eax = op, ebx, ecx = 0, edx; \ - \ - native_cpuid(&eax, &ebx, &ecx, &edx); \ - \ - return reg; \ -} - -/* - * Native CPUID functions returning a single datum. - */ -native_cpuid_reg(eax) -native_cpuid_reg(ebx) -native_cpuid_reg(ecx) -native_cpuid_reg(edx) - /* * Friendlier CR3 helpers. */ @@ -426,8 +377,6 @@ struct irq_stack { char stack[IRQ_STACK_SIZE]; } __aligned(IRQ_STACK_SIZE); -DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); - #ifdef CONFIG_X86_64 struct fixed_percpu_data { /* @@ -450,8 +399,6 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu) return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); } -DECLARE_PER_CPU(void *, hardirq_stack_ptr); -DECLARE_PER_CPU(bool, hardirq_stack_inuse); extern asmlinkage void ignore_sysret(void); /* Save actual FS/GS selectors and bases to current->thread */ @@ -460,8 +407,6 @@ void current_save_fsgs(void); #ifdef CONFIG_STACKPROTECTOR DECLARE_PER_CPU(unsigned long, __stack_chk_guard); #endif -DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); -DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr); #endif /* !X86_64 */ struct perf_event; @@ -566,7 +511,7 @@ static __always_inline unsigned long current_top_of_stack(void) * and around vm86 mode and sp0 on x86_64 is special because of the * entry trampoline. 
*/ - return this_cpu_read_stable(cpu_current_top_of_stack); + return this_cpu_read_stable(pcpu_hot.top_of_stack); } static __always_inline bool on_thread_stack(void) @@ -578,7 +523,6 @@ static __always_inline bool on_thread_stack(void) #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else -#define __cpuid native_cpuid static inline void load_sp0(unsigned long sp0) { @@ -589,69 +533,6 @@ static inline void load_sp0(unsigned long sp0) unsigned long __get_wchan(struct task_struct *p); -/* - * Generic CPUID function - * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx - * resulting in stale register contents being returned. - */ -static inline void cpuid(unsigned int op, - unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - *eax = op; - *ecx = 0; - __cpuid(eax, ebx, ecx, edx); -} - -/* Some CPUID calls want 'count' to be placed in ecx */ -static inline void cpuid_count(unsigned int op, int count, - unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - *eax = op; - *ecx = count; - __cpuid(eax, ebx, ecx, edx); -} - -/* - * CPUID functions returning a single datum - */ -static inline unsigned int cpuid_eax(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - - return eax; -} - -static inline unsigned int cpuid_ebx(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - - return ebx; -} - -static inline unsigned int cpuid_ecx(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - - return ecx; -} - -static inline unsigned int cpuid_edx(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - - return edx; -} - extern void select_idle_routine(const struct cpuinfo_x86 *c); extern void amd_e400_c1e_apic_setup(void); @@ -667,10 +548,9 @@ extern int sysenter_setup(void); /* Defined in head.S */ extern struct desc_ptr early_gdt_descr; -extern void switch_to_new_gdt(int); +extern void switch_gdt_and_percpu_base(int); extern void load_direct_gdt(int); extern void load_fixmap_gdt(int); -extern void load_percpu_segment(int); extern void cpu_init(void); extern void cpu_init_secondary(void); extern void cpu_init_exception_handling(void); @@ -805,24 +685,6 @@ static inline u32 amd_get_nodes_per_socket(void) { return 0; } static inline u32 amd_get_highest_perf(void) { return 0; } #endif -#define for_each_possible_hypervisor_cpuid_base(function) \ - for (function = 0x40000000; function < 0x40010000; function += 0x100) - -static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) -{ - uint32_t base, eax, signature[3]; - - for_each_possible_hypervisor_cpuid_base(base) { - cpuid(base, &eax, &signature[0], &signature[1], &signature[2]); - - if (!memcmp(sig, signature, 12) && - (leaves == 0 || ((eax - base) >= leaves))) - return base; - } - - return 0; -} - extern unsigned long arch_align_stack(unsigned long sp); void free_init_pages(const char *what, unsigned long begin, unsigned long end); extern void free_kernel_image_pages(const char *what, void *begin, void *end); diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index dbb38a6b4dfb..42b17cf10b10 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -14,8 +14,6 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); #define __pv_queued_spin_unlock __pv_queued_spin_unlock 
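For context on the qspinlock change that follows: the open-coded assembly for __raw_callee_save___pv_queued_spin_unlock is rewritten as a PV_UNLOCK_ASM string emitted through the DEFINE_PARAVIRT_ASM() helper added in the paravirt.h hunk above, dropping the PV_UNLOCK/PV_UNLOCK_SLOWPATH name defines. The fast path that assembly encodes is a single locked cmpxchg on the lock byte. A rough C11 sketch of the same logic, with invented names (MY_LOCKED_VAL and pv_unlock_slowpath stand in for the kernel's _Q_LOCKED_VAL and __pv_queued_spin_unlock_slowpath), not the kernel implementation:

#include <stdatomic.h>
#include <stdint.h>

#define MY_LOCKED_VAL 1u	/* assumed: lock byte holds 1 while held, no slow-path waiter flagged */

extern void pv_unlock_slowpath(_Atomic uint8_t *lock, uint8_t lockval);

static void pv_queued_spin_unlock_sketch(_Atomic uint8_t *lock)
{
	uint8_t expected = MY_LOCKED_VAL;

	/* Fast path: the byte still holds MY_LOCKED_VAL, so just clear it. */
	if (atomic_compare_exchange_strong(lock, &expected, 0))
		return;

	/* Slow path: a waiter left a marker; kick it, passing the value we observed. */
	pv_unlock_slowpath(lock, expected);
}

The register comments kept in the hunk (rdi = lock pointer, rsi = observed lock value) map directly onto the two arguments of the slowpath call in this sketch.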
-#define PV_UNLOCK "__raw_callee_save___pv_queued_spin_unlock" -#define PV_UNLOCK_SLOWPATH "__raw_callee_save___pv_queued_spin_unlock_slowpath" /* * Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock @@ -37,32 +35,27 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); * rsi = lockval (second argument) * rdx = internal variable (set to 0) */ -asm (".pushsection .spinlock.text, \"ax\";" - ".globl " PV_UNLOCK ";" - ".type " PV_UNLOCK ", @function;" - ".align 4,0x90;" - PV_UNLOCK ": " - ASM_ENDBR - FRAME_BEGIN - "push %rdx;" - "mov $0x1,%eax;" - "xor %edx,%edx;" - LOCK_PREFIX "cmpxchg %dl,(%rdi);" - "cmp $0x1,%al;" - "jne .slowpath;" - "pop %rdx;" +#define PV_UNLOCK_ASM \ + FRAME_BEGIN \ + "push %rdx\n\t" \ + "mov $0x1,%eax\n\t" \ + "xor %edx,%edx\n\t" \ + LOCK_PREFIX "cmpxchg %dl,(%rdi)\n\t" \ + "cmp $0x1,%al\n\t" \ + "jne .slowpath\n\t" \ + "pop %rdx\n\t" \ + FRAME_END \ + ASM_RET \ + ".slowpath:\n\t" \ + "push %rsi\n\t" \ + "movzbl %al,%esi\n\t" \ + "call __raw_callee_save___pv_queued_spin_unlock_slowpath\n\t" \ + "pop %rsi\n\t" \ + "pop %rdx\n\t" \ FRAME_END - ASM_RET - ".slowpath: " - "push %rsi;" - "movzbl %al,%esi;" - "call " PV_UNLOCK_SLOWPATH ";" - "pop %rsi;" - "pop %rdx;" - FRAME_END - ASM_RET - ".size " PV_UNLOCK ", .-" PV_UNLOCK ";" - ".popsection"); + +DEFINE_PARAVIRT_ASM(__raw_callee_save___pv_queued_spin_unlock, + PV_UNLOCK_ASM, .spinlock.text); #else /* CONFIG_64BIT */ diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index fd6f6e5b755a..a336feef0af1 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -91,6 +91,7 @@ static inline void set_real_mode_mem(phys_addr_t mem) void reserve_real_mode(void); void load_trampoline_pgtable(void); +void init_real_mode(void); #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index b45c4d27fd46..a5e89641bd2d 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -6,6 +6,9 @@ #include <asm/page.h> #include <asm-generic/set_memory.h> +#define set_memory_rox set_memory_rox +int set_memory_rox(unsigned long addr, int numpages); + /* * The set_memory_* API can be used to change various attributes of a virtual * address range. The attributes include: diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index a73bced40e24..b4dbb20dab1a 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -3,10 +3,10 @@ #define _ASM_X86_SMP_H #ifndef __ASSEMBLY__ #include <linux/cpumask.h> -#include <asm/percpu.h> -#include <asm/thread_info.h> #include <asm/cpumask.h> +#include <asm/current.h> +#include <asm/thread_info.h> extern int smp_num_siblings; extern unsigned int num_processors; @@ -19,7 +19,6 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id); DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id); -DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid); @@ -150,11 +149,10 @@ __visible void smp_call_function_single_interrupt(struct pt_regs *r); /* * This function is needed by all SMP systems. It must _always_ be valid - * from the initial startup. We map APIC_BASE very early in page_setup(), - * so this is correct in the x86 case. + * from the initial startup. 
*/ -#define raw_smp_processor_id() this_cpu_read(cpu_number) -#define __smp_processor_id() __this_cpu_read(cpu_number) +#define raw_smp_processor_id() this_cpu_read(pcpu_hot.cpu_number) +#define __smp_processor_id() __this_cpu_read(pcpu_hot.cpu_number) #ifdef CONFIG_X86_32 extern int safe_smp_processor_id(void); diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 0361626841bc..cb1ee53ad3b1 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -5,6 +5,8 @@ #include <uapi/asm/svm.h> #include <uapi/asm/kvm.h> +#include <asm/hyperv-tlfs.h> + /* * 32-bit intercept words in the VMCB Control Area, starting * at Byte offset 000h. @@ -161,7 +163,10 @@ struct __attribute__ ((__packed__)) vmcb_control_area { * Offset 0x3e0, 32 bytes reserved * for use by hypervisor/software. */ - u8 reserved_sw[32]; + union { + struct hv_vmcb_enlightenments hv_enlightenments; + u8 reserved_sw[32]; + }; }; @@ -293,12 +298,13 @@ struct vmcb_save_area { struct vmcb_seg ldtr; struct vmcb_seg idtr; struct vmcb_seg tr; - u8 reserved_1[42]; + /* Reserved fields are named following their struct offset */ + u8 reserved_0xa0[42]; u8 vmpl; u8 cpl; - u8 reserved_2[4]; + u8 reserved_0xcc[4]; u64 efer; - u8 reserved_3[112]; + u8 reserved_0xd8[112]; u64 cr4; u64 cr3; u64 cr0; @@ -306,7 +312,7 @@ struct vmcb_save_area { u64 dr6; u64 rflags; u64 rip; - u8 reserved_4[88]; + u8 reserved_0x180[88]; u64 rsp; u64 s_cet; u64 ssp; @@ -321,14 +327,14 @@ struct vmcb_save_area { u64 sysenter_esp; u64 sysenter_eip; u64 cr2; - u8 reserved_5[32]; + u8 reserved_0x248[32]; u64 g_pat; u64 dbgctl; u64 br_from; u64 br_to; u64 last_excp_from; u64 last_excp_to; - u8 reserved_6[72]; + u8 reserved_0x298[72]; u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */ } __packed; @@ -349,12 +355,12 @@ struct sev_es_save_area { u64 vmpl2_ssp; u64 vmpl3_ssp; u64 u_cet; - u8 reserved_1[2]; + u8 reserved_0xc8[2]; u8 vmpl; u8 cpl; - u8 reserved_2[4]; + u8 reserved_0xcc[4]; u64 efer; - u8 reserved_3[104]; + u8 reserved_0xd8[104]; u64 xss; u64 cr4; u64 cr3; @@ -371,7 +377,7 @@ struct sev_es_save_area { u64 dr1_addr_mask; u64 dr2_addr_mask; u64 dr3_addr_mask; - u8 reserved_4[24]; + u8 reserved_0x1c0[24]; u64 rsp; u64 s_cet; u64 ssp; @@ -386,21 +392,21 @@ struct sev_es_save_area { u64 sysenter_esp; u64 sysenter_eip; u64 cr2; - u8 reserved_5[32]; + u8 reserved_0x248[32]; u64 g_pat; u64 dbgctl; u64 br_from; u64 br_to; u64 last_excp_from; u64 last_excp_to; - u8 reserved_7[80]; + u8 reserved_0x298[80]; u32 pkru; - u8 reserved_8[20]; - u64 reserved_9; /* rax already available at 0x01f8 */ + u32 tsc_aux; + u8 reserved_0x2f0[24]; u64 rcx; u64 rdx; u64 rbx; - u64 reserved_10; /* rsp already available at 0x01d8 */ + u64 reserved_0x320; /* rsp already available at 0x01d8 */ u64 rbp; u64 rsi; u64 rdi; @@ -412,7 +418,7 @@ struct sev_es_save_area { u64 r13; u64 r14; u64 r15; - u8 reserved_11[16]; + u8 reserved_0x380[16]; u64 guest_exit_info_1; u64 guest_exit_info_2; u64 guest_exit_int_info; @@ -425,7 +431,7 @@ struct sev_es_save_area { u64 pcpu_id; u64 event_inj; u64 xcr0; - u8 reserved_12[16]; + u8 reserved_0x3f0[16]; /* Floating point area */ u64 x87_dp; @@ -443,23 +449,23 @@ struct sev_es_save_area { } __packed; struct ghcb_save_area { - u8 reserved_1[203]; + u8 reserved_0x0[203]; u8 cpl; - u8 reserved_2[116]; + u8 reserved_0xcc[116]; u64 xss; - u8 reserved_3[24]; + u8 reserved_0x148[24]; u64 dr7; - u8 reserved_4[16]; + u8 reserved_0x168[16]; u64 rip; - u8 reserved_5[88]; + u8 reserved_0x180[88]; u64 rsp; - u8 reserved_6[24]; + u8 
reserved_0x1e0[24]; u64 rax; - u8 reserved_7[264]; + u8 reserved_0x200[264]; u64 rcx; u64 rdx; u64 rbx; - u8 reserved_8[8]; + u8 reserved_0x320[8]; u64 rbp; u64 rsi; u64 rdi; @@ -471,12 +477,12 @@ struct ghcb_save_area { u64 r13; u64 r14; u64 r15; - u8 reserved_9[16]; + u8 reserved_0x380[16]; u64 sw_exit_code; u64 sw_exit_info_1; u64 sw_exit_info_2; u64 sw_scratch; - u8 reserved_10[56]; + u8 reserved_0x3b0[56]; u64 xcr0; u8 valid_bitmap[16]; u64 x87_state_gpa; @@ -490,7 +496,7 @@ struct ghcb { u8 shared_buffer[GHCB_SHARED_BUF_SIZE]; - u8 reserved_1[10]; + u8 reserved_0xff0[10]; u16 protocol_version; /* negotiated SEV-ES/GHCB protocol version */ u32 ghcb_usage; } __packed; @@ -502,6 +508,9 @@ struct ghcb { #define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024 #define EXPECTED_GHCB_SIZE PAGE_SIZE +#define BUILD_BUG_RESERVED_OFFSET(x, y) \ + ASSERT_STRUCT_OFFSET(struct x, reserved ## _ ## y, y) + static inline void __unused_size_checks(void) { BUILD_BUG_ON(sizeof(struct vmcb_save_area) != EXPECTED_VMCB_SAVE_AREA_SIZE); @@ -509,6 +518,39 @@ static inline void __unused_size_checks(void) BUILD_BUG_ON(sizeof(struct sev_es_save_area) != EXPECTED_SEV_ES_SAVE_AREA_SIZE); BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE); BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE); + + /* Check offsets of reserved fields */ + + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xa0); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xcc); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xd8); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x180); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x248); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x298); + + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xc8); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xcc); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xd8); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x1c0); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x248); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x298); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x2f0); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x320); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x380); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x3f0); + + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x0); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0xcc); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x148); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x168); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x180); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x1e0); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x200); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x320); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x380); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x3b0); + + BUILD_BUG_RESERVED_OFFSET(ghcb, 0xff0); } struct vmcb { diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index c08eb0fdd11f..5c91305d09d2 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -66,13 +66,10 @@ static inline void update_task_stack(struct task_struct *task) { /* sp0 always points to the entry trampoline stack, which is constant: */ #ifdef CONFIG_X86_32 - if (static_cpu_has(X86_FEATURE_XENPV)) - load_sp0(task->thread.sp0); - else - this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0); + this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0); #else /* Xen PV enters the kernel on the thread stack. 
*/ - if (static_cpu_has(X86_FEATURE_XENPV)) + if (cpu_feature_enabled(X86_FEATURE_XENPV)) load_sp0(task_top_of_stack(task)); #endif } diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 1cc15528ce29..f4b87f08f5c5 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -45,6 +45,7 @@ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void text_poke_sync(void); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern void *text_poke_copy(void *addr, const void *opcode, size_t len); +extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok); extern void *text_poke_set(void *addr, int c, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index e9170457697e..c1c8c581759d 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -285,6 +285,8 @@ struct x86_hyper_runtime { * possible in x86_early_init_platform_quirks() by * only using the current x86_hardware_subarch * semantics. + * @realmode_reserve: reserve memory for realmode trampoline + * @realmode_init: initialize realmode trampoline * @hyper: x86 hypervisor specific runtime callbacks */ struct x86_platform_ops { @@ -301,6 +303,8 @@ struct x86_platform_ops { void (*apic_post_init)(void); struct x86_legacy_features legacy; void (*set_legacy_features)(void); + void (*realmode_reserve)(void); + void (*realmode_init)(void); struct x86_hyper_runtime hyper; struct x86_guest guest; }; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 46de10a809ec..e48deab8901d 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -53,14 +53,6 @@ /* Architectural interrupt line count. 
*/ #define KVM_NR_INTERRUPTS 256 -struct kvm_memory_alias { - __u32 slot; /* this has a different namespace than memory slots */ - __u32 flags; - __u64 guest_phys_addr; - __u64 memory_size; - __u64 target_phys_addr; -}; - /* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */ struct kvm_pic_state { __u8 last_irr; /* edge detection */ @@ -214,6 +206,8 @@ struct kvm_msr_list { struct kvm_msr_filter_range { #define KVM_MSR_FILTER_READ (1 << 0) #define KVM_MSR_FILTER_WRITE (1 << 1) +#define KVM_MSR_FILTER_RANGE_VALID_MASK (KVM_MSR_FILTER_READ | \ + KVM_MSR_FILTER_WRITE) __u32 flags; __u32 nmsrs; /* number of msrs in bitmap */ __u32 base; /* MSR index the bitmap starts at */ @@ -222,8 +216,11 @@ struct kvm_msr_filter_range { #define KVM_MSR_FILTER_MAX_RANGES 16 struct kvm_msr_filter { +#ifndef __KERNEL__ #define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0) +#endif #define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0) +#define KVM_MSR_FILTER_VALID_MASK (KVM_MSR_FILTER_DEFAULT_DENY) __u32 flags; struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES]; }; diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index f69c168391aa..80e1df482337 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -116,6 +116,12 @@ #define SVM_VMGEXIT_AP_CREATE 1 #define SVM_VMGEXIT_AP_DESTROY 2 #define SVM_VMGEXIT_HV_FEATURES 0x8000fffd +#define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe +#define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \ + /* SW_EXITINFO1[3:0] */ \ + (((((u64)reason_set) & 0xf)) | \ + /* SW_EXITINFO1[11:4] */ \ + ((((u64)reason_code) & 0xff) << 4)) #define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff /* Exit code reserved for hypervisor/software use */ |
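As a quick sanity check on the SVM_VMGEXIT_TERM_REASON() encoding added just above: reason_set lands in SW_EXITINFO1[3:0] and reason_code in SW_EXITINFO1[11:4], so, for example, reason set 1 with reason code 2 packs to 0x21. A self-contained compile-time check of that arithmetic, with the macro restated locally (uint64_t standing in for the kernel's u64) so the snippet builds on its own:

#include <stdint.h>

/* Restated from the hunk above; uint64_t stands in for u64. */
#define TERM_REASON(reason_set, reason_code)			\
	((((uint64_t)(reason_set)) & 0xf) |			\
	 ((((uint64_t)(reason_code)) & 0xff) << 4))

_Static_assert(TERM_REASON(1, 2) == 0x21, "set -> bits 3:0, code -> bits 11:4");
_Static_assert(TERM_REASON(0, 0xff) == 0xff0, "code field fills bits 11:4");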
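The KVM_MSR_FILTER_RANGE_VALID_MASK and KVM_MSR_FILTER_VALID_MASK additions in the uapi kvm.h hunk follow the usual uapi pattern of rejecting unknown flag bits rather than silently ignoring them. A hedged sketch of how such a mask is typically consumed; the helper below is illustrative only and is not KVM's actual validation code:

#include <errno.h>
#include <stdint.h>

#define KVM_MSR_FILTER_READ		(1 << 0)
#define KVM_MSR_FILTER_WRITE		(1 << 1)
#define KVM_MSR_FILTER_RANGE_VALID_MASK	(KVM_MSR_FILTER_READ | \
					 KVM_MSR_FILTER_WRITE)

/* Illustrative only: fail on any flag bit outside the advertised mask. */
static int check_range_flags(uint32_t flags)
{
	if (flags & ~KVM_MSR_FILTER_RANGE_VALID_MASK)
		return -EINVAL;
	return 0;
}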
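The svm.h hunk earlier in this section renames every reserved field after its byte offset (reserved_0xa0, reserved_0x248, and so on) precisely so that BUILD_BUG_RESERVED_OFFSET() can paste the name back together and feed it to ASSERT_STRUCT_OFFSET() as a compile-time layout check. The same trick in plain C11, applied to a toy structure whose names and layout are invented for illustration:

#include <stddef.h>
#include <stdint.h>

/* Toy stand-in for a hardware-defined save area; the layout is made up. */
struct demo_save_area {
	uint64_t rip;			/* offset 0x00 */
	uint8_t  reserved_0x8[8];	/* offset 0x08 */
	uint64_t rsp;			/* offset 0x10 */
};

/*
 * Same idea as BUILD_BUG_RESERVED_OFFSET(): the field name encodes the
 * expected offset, so token-pasting it back and comparing with offsetof()
 * catches accidental layout drift at compile time.
 */
#define CHECK_RESERVED(type, off)					\
	_Static_assert(offsetof(struct type, reserved_ ## off) == (off), \
		       "reserved field is not at its named offset")

CHECK_RESERVED(demo_save_area, 0x8);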
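Finally, the pgtable-3level.h hunk earlier in the diff replaces the set_64bit()/xchg() based PTE updates with one pxx_xchg64() helper built on try_cmpxchg64(): a 64-bit atomic exchange emulated by a compare-and-swap loop, which is what cmpxchg8b can provide on 32-bit PAE. A generic C11 sketch of that loop shape (this illustrates the idea, it is not the kernel helper):

#include <stdatomic.h>
#include <stdint.h>

static uint64_t xchg64_sketch(_Atomic uint64_t *p, uint64_t newval)
{
	uint64_t old = atomic_load(p);

	/*
	 * Like try_cmpxchg64() in the hunk, a failed compare-exchange
	 * refreshes 'old' with the value currently in memory, so the
	 * loop retries against what it just observed.
	 */
	while (!atomic_compare_exchange_weak(p, &old, newval))
		;

	return old;	/* previous contents, as a real xchg would return */
}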