Diffstat (limited to 'arch/x86')
117 files changed, 1575 insertions, 1906 deletions
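One recurring change in the hunks below is the replacement of the x2APIC-only x2apic_wrmsr_fence() with a generic weak_wrmsr_fence() in <asm/barrier.h>, which callers invoke before writing the non-serializing IA32_TSC_DEADLINE and x2APIC MSRs. The following C sketch mirrors the call pattern from the lapic_next_deadline() hunk; the function name program_tsc_deadline() and the bare delta argument are illustrative only and are not part of the series.

#include <linux/types.h>
#include <asm/barrier.h>	/* weak_wrmsr_fence() */
#include <asm/msr.h>		/* rdtsc(), wrmsrl(), MSR_IA32_TSC_DEADLINE */

/*
 * Illustrative only: program the TSC-deadline timer the way
 * lapic_next_deadline() does after this series.  A WRMSR to
 * IA32_TSC_DEADLINE is not serializing and is not ordered by MFENCE
 * alone, so prior stores are made globally visible with the
 * MFENCE;LFENCE pair in weak_wrmsr_fence() before the write.
 */
static int program_tsc_deadline(u64 delta_tsc)
{
	u64 tsc;

	weak_wrmsr_fence();	/* this MSR needs the special fence */

	tsc = rdtsc();
	wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + delta_tsc);
	return 0;
}
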
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7b6dd10b162a..3d498caca1ea 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -19,6 +19,7 @@ config X86_32 select KMAP_LOCAL select MODULES_USE_ELF_REL select OLD_SIGACTION + select ARCH_SPLIT_ARG64 config X86_64 def_bool y @@ -889,7 +890,7 @@ config HPET_TIMER config HPET_EMULATE_RTC def_bool y - depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) + depends on HPET_TIMER && (RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) config APB_TIMER def_bool y if X86_INTEL_MID @@ -1157,10 +1158,6 @@ config X86_MCE_INJECT If you don't know what a machine check is and you don't do kernel QA it is safe to say n. -config X86_THERMAL_VECTOR - def_bool y - depends on X86_MCE_INTEL - source "arch/x86/events/Kconfig" config X86_LEGACY_VM86 diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 32dcdddc1089..b9f58b8993b3 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -50,6 +50,9 @@ export BITS KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow KBUILD_CFLAGS += $(call cc-option,-mno-avx,) +# Intel CET isn't enabled in the kernel +KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none) + ifeq ($(CONFIG_X86_32),y) BITS := 32 UTS_MACHINE := i386 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 18d8f17f755c..0904f5676e4d 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -73,10 +73,8 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, unsigned int nr) { if (likely(nr < IA32_NR_syscalls)) { - instrumentation_begin(); nr = array_index_nospec(nr, IA32_NR_syscalls); regs->ax = ia32_sys_call_table[nr](regs); - instrumentation_end(); } } @@ -91,8 +89,11 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) * or may not be necessary, but it matches the old asm behavior. */ nr = (unsigned int)syscall_enter_from_user_mode(regs, nr); + instrumentation_begin(); do_syscall_32_irqs_on(regs, nr); + + instrumentation_end(); syscall_exit_to_user_mode(regs); } @@ -121,11 +122,12 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) res = get_user(*(u32 *)®s->bp, (u32 __user __force *)(unsigned long)(u32)regs->sp); } - instrumentation_end(); if (res) { /* User code screwed up. */ regs->ax = -EFAULT; + + instrumentation_end(); syscall_exit_to_user_mode(regs); return false; } @@ -135,6 +137,8 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) /* Now this is just like a normal syscall. */ do_syscall_32_irqs_on(regs, nr); + + instrumentation_end(); syscall_exit_to_user_mode(regs); return true; } diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index cad08703c4ad..ce0464d630a2 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -46,14 +46,6 @@ .code64 .section .entry.text, "ax" -#ifdef CONFIG_PARAVIRT_XXL -SYM_CODE_START(native_usergs_sysret64) - UNWIND_HINT_EMPTY - swapgs - sysretq -SYM_CODE_END(native_usergs_sysret64) -#endif /* CONFIG_PARAVIRT_XXL */ - /* * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. * @@ -123,7 +115,12 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) * Try to use SYSRET instead of IRET if we're returning to * a completely clean 64-bit userspace context. If we're not, * go to the slow exit path. + * In the Xen PV case we must use iret anyway. 
*/ + + ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \ + X86_FEATURE_XENPV + movq RCX(%rsp), %rcx movq RIP(%rsp), %r11 @@ -215,7 +212,8 @@ syscall_return_via_sysret: popq %rdi popq %rsp - USERGS_SYSRET64 + swapgs + sysretq SYM_CODE_END(entry_SYSCALL_64) /* @@ -669,7 +667,7 @@ native_irq_return_ldt: */ pushq %rdi /* Stash user RDI */ - SWAPGS /* to kernel GS */ + swapgs /* to kernel GS */ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */ movq PER_CPU_VAR(espfix_waddr), %rdi @@ -699,7 +697,7 @@ native_irq_return_ldt: orq PER_CPU_VAR(espfix_stack), %rax SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi - SWAPGS /* to user GS */ + swapgs /* to user GS */ popq %rdi /* Restore user RDI */ movq %rax, %rsp @@ -943,7 +941,7 @@ SYM_CODE_START_LOCAL(paranoid_entry) ret .Lparanoid_entry_swapgs: - SWAPGS + swapgs /* * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an @@ -1001,7 +999,7 @@ SYM_CODE_START_LOCAL(paranoid_exit) jnz restore_regs_and_return_to_kernel /* We are returning to a context with user GSBASE */ - SWAPGS_UNSAFE_STACK + swapgs jmp restore_regs_and_return_to_kernel SYM_CODE_END(paranoid_exit) @@ -1426,7 +1424,7 @@ nmi_no_fsgsbase: jnz nmi_restore nmi_swapgs: - SWAPGS_UNSAFE_STACK + swapgs nmi_restore: POP_REGS diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index ccd32877a3c4..496b11ec469d 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -10,7 +10,7 @@ #include <asm/export.h> /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ - .macro THUNK name, func, put_ret_addr_in_rdi=0 + .macro THUNK name, func SYM_FUNC_START_NOALIGN(\name) pushq %rbp movq %rsp, %rbp @@ -25,13 +25,8 @@ SYM_FUNC_START_NOALIGN(\name) pushq %r10 pushq %r11 - .if \put_ret_addr_in_rdi - /* 8(%rbp) is return addr on stack */ - movq 8(%rbp), %rdi - .endif - call \func - jmp .L_restore + jmp __thunk_restore SYM_FUNC_END(\name) _ASM_NOKPROBE(\name) .endm @@ -44,7 +39,7 @@ SYM_FUNC_END(\name) #endif #ifdef CONFIG_PREEMPTION -SYM_CODE_START_LOCAL_NOALIGN(.L_restore) +SYM_CODE_START_LOCAL_NOALIGN(__thunk_restore) popq %r11 popq %r10 popq %r9 @@ -56,6 +51,6 @@ SYM_CODE_START_LOCAL_NOALIGN(.L_restore) popq %rdi popq %rbp ret - _ASM_NOKPROBE(.L_restore) -SYM_CODE_END(.L_restore) + _ASM_NOKPROBE(__thunk_restore) +SYM_CODE_END(__thunk_restore) #endif diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index e04d90af4c27..6375967a8244 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -16,6 +16,7 @@ #include <asm/hyperv-tlfs.h> #include <asm/mshyperv.h> #include <asm/idtentry.h> +#include <linux/kexec.h> #include <linux/version.h> #include <linux/vmalloc.h> #include <linux/mm.h> @@ -26,6 +27,8 @@ #include <linux/syscore_ops.h> #include <clocksource/hyperv_timer.h> +int hyperv_init_cpuhp; + void *hv_hypercall_pg; EXPORT_SYMBOL_GPL(hv_hypercall_pg); @@ -312,6 +315,25 @@ static struct syscore_ops hv_syscore_ops = { .resume = hv_resume, }; +static void (* __initdata old_setup_percpu_clockev)(void); + +static void __init hv_stimer_setup_percpu_clockev(void) +{ + /* + * Ignore any errors in setting up stimer clockevents + * as we can run with the LAPIC timer as a fallback. + */ + (void)hv_stimer_alloc(); + + /* + * Still register the LAPIC timer, because the direct-mode STIMER is + * not supported by old versions of Hyper-V. This also allows users + * to switch to LAPIC timer via /sys, if they want to. 
+ */ + if (old_setup_percpu_clockev) + old_setup_percpu_clockev(); +} + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. @@ -390,10 +412,14 @@ void __init hyperv_init(void) wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); /* - * Ignore any errors in setting up stimer clockevents - * as we can run with the LAPIC timer as a fallback. + * hyperv_init() is called before LAPIC is initialized: see + * apic_intr_mode_init() -> x86_platform.apic_post_init() and + * apic_bsp_setup() -> setup_local_APIC(). The direct-mode STIMER + * depends on LAPIC, so hv_stimer_alloc() should be called from + * x86_init.timers.setup_percpu_clockev. */ - (void)hv_stimer_alloc(); + old_setup_percpu_clockev = x86_init.timers.setup_percpu_clockev; + x86_init.timers.setup_percpu_clockev = hv_stimer_setup_percpu_clockev; hv_apic_init(); @@ -401,6 +427,7 @@ void __init hyperv_init(void) register_syscore_ops(&hv_syscore_ops); + hyperv_init_cpuhp = cpuhp; return; remove_cpuhp_state: diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index 5208ba49c89a..2c87350c1fb0 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -66,11 +66,17 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus, if (!hv_hypercall_pg) goto do_native; - if (cpumask_empty(cpus)) - return; - local_irq_save(flags); + /* + * Only check the mask _after_ interrupt has been disabled to avoid the + * mask changing under our feet. + */ + if (cpumask_empty(cpus)) { + local_irq_restore(flags); + return; + } + flush_pcpu = (struct hv_tlb_flush **) this_cpu_ptr(hyperv_pcpu_input_arg); diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 34cb3c159481..412b51e059c8 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -197,16 +197,6 @@ static inline bool apic_needs_pit(void) { return true; } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_X2APIC -/* - * Make previous memory operations globally visible before - * sending the IPI through x2apic wrmsr. We need a serializing instruction or - * mfence for this. - */ -static inline void x2apic_wrmsr_fence(void) -{ - asm volatile("mfence" : : : "memory"); -} - static inline void native_apic_msr_write(u32 reg, u32 v) { if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR || diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 7f828fe49797..4819d5e5a335 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -84,4 +84,22 @@ do { \ #include <asm-generic/barrier.h> +/* + * Make previous memory operations globally visible before + * a WRMSR. + * + * MFENCE makes writes visible, but only affects load/store + * instructions. WRMSR is unfortunately not a load/store + * instruction and is unaffected by MFENCE. The LFENCE ensures + * that the WRMSR is not reordered. + * + * Most WRMSRs are full serializing instructions themselves and + * do not require this barrier. This is only required for the + * IA32_TSC_DEADLINE and X2APIC MSRs. 
+ */ +static inline void weak_wrmsr_fence(void) +{ + asm volatile("mfence; lfence" : : : "memory"); +} + #endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 59bf91c57aa8..1728d4ce5730 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -30,6 +30,7 @@ enum cpuid_leafs CPUID_7_ECX, CPUID_8000_0007_EBX, CPUID_7_EDX, + CPUID_8000_001F_EAX, }; #ifdef CONFIG_X86_FEATURE_NAMES @@ -88,8 +89,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \ REQUIRED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 19)) + BUILD_BUG_ON_ZERO(NCAPINTS != 20)) #define DISABLED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ @@ -111,8 +113,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \ DISABLED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 19)) + BUILD_BUG_ON_ZERO(NCAPINTS != 20)) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 84b887825f12..1feb6c089ba2 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -13,7 +13,7 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 19 /* N 32-bit words worth of info */ +#define NCAPINTS 20 /* N 32-bit words worth of info */ #define NBUGINTS 1 /* N 32-bit bug flags */ /* @@ -96,7 +96,7 @@ #define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ -#define X86_FEATURE_SME_COHERENT ( 3*32+17) /* "" AMD hardware-enforced cache coherency */ +/* FREE! ( 3*32+17) */ #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ @@ -201,7 +201,7 @@ #define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ +/* FREE! ( 7*32+10) */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */ @@ -211,7 +211,7 @@ #define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ -#define X86_FEATURE_SEV ( 7*32+20) /* AMD Secure Encrypted Virtualization */ +/* FREE! 
( 7*32+20) */ #define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ #define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */ @@ -236,8 +236,6 @@ #define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ #define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ #define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ -#define X86_FEATURE_SEV_ES ( 8*32+20) /* AMD Secure Encrypted Virtualization - Encrypted State */ -#define X86_FEATURE_VM_PAGE_FLUSH ( 8*32+21) /* "" VM Page Flush MSR is supported */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ @@ -385,6 +383,13 @@ #define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* "" IA32_CORE_CAPABILITIES MSR */ #define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ +/* AMD-defined memory encryption features, CPUID level 0x8000001f (EAX), word 19 */ +#define X86_FEATURE_SME (19*32+ 0) /* AMD Secure Memory Encryption */ +#define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */ +#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */ +#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */ +#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ + /* * BUG word(s) */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 7947cb1782da..b7dd944dc867 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -91,6 +91,7 @@ DISABLE_ENQCMD) #define DISABLED_MASK17 0 #define DISABLED_MASK18 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) +#define DISABLED_MASK19 0 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index c98f78330b09..4d0b126835b8 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -12,6 +12,7 @@ #include <linux/pgtable.h> extern unsigned long efi_fw_vendor, efi_config_table; +extern unsigned long efi_mixed_mode_stack_pa; /* * We map the EFI regions needed for runtime services non-contiguously, @@ -68,17 +69,33 @@ extern unsigned long efi_fw_vendor, efi_config_table; #f " called with too many arguments (" #p ">" #n ")"); \ }) +static inline void efi_fpu_begin(void) +{ + /* + * The UEFI calling convention (UEFI spec 2.3.2 and 2.3.4) requires + * that FCW and MXCSR (64-bit) must be initialized prior to calling + * UEFI code. (Oddly the spec does not require that the FPU stack + * be empty.) + */ + kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR); +} + +static inline void efi_fpu_end(void) +{ + kernel_fpu_end(); +} + #ifdef CONFIG_X86_32 #define arch_efi_call_virt_setup() \ ({ \ - kernel_fpu_begin(); \ + efi_fpu_begin(); \ firmware_restrict_branch_speculation_start(); \ }) #define arch_efi_call_virt_teardown() \ ({ \ firmware_restrict_branch_speculation_end(); \ - kernel_fpu_end(); \ + efi_fpu_end(); \ }) #define arch_efi_call_virt(p, f, args...) 
p->f(args) @@ -94,22 +111,12 @@ extern asmlinkage u64 __efi_call(void *fp, ...); __efi_call(__VA_ARGS__); \ }) -/* - * struct efi_scratch - Scratch space used while switching to/from efi_mm - * @phys_stack: stack used during EFI Mixed Mode - * @prev_mm: store/restore stolen mm_struct while switching to/from efi_mm - */ -struct efi_scratch { - u64 phys_stack; - struct mm_struct *prev_mm; -} __packed; - #define arch_efi_call_virt_setup() \ ({ \ efi_sync_low_kernel_mappings(); \ - kernel_fpu_begin(); \ + efi_fpu_begin(); \ firmware_restrict_branch_speculation_start(); \ - efi_switch_mm(&efi_mm); \ + efi_enter_mm(); \ }) #define arch_efi_call_virt(p, f, args...) \ @@ -117,9 +124,9 @@ struct efi_scratch { #define arch_efi_call_virt_teardown() \ ({ \ - efi_switch_mm(efi_scratch.prev_mm); \ + efi_leave_mm(); \ firmware_restrict_branch_speculation_end(); \ - kernel_fpu_end(); \ + efi_fpu_end(); \ }) #ifdef CONFIG_KASAN @@ -136,7 +143,6 @@ struct efi_scratch { #endif /* CONFIG_X86_32 */ -extern struct efi_scratch efi_scratch; extern int __init efi_memblock_x86_reserve_range(void); extern void __init efi_print_memmap(void); extern void __init efi_map_region(efi_memory_desc_t *md); @@ -149,10 +155,12 @@ extern void __init efi_dump_pagetable(void); extern void __init efi_apply_memmap_quirks(void); extern int __init efi_reuse_config(u64 tables, int nr_tables); extern void efi_delete_dummy_variable(void); -extern void efi_switch_mm(struct mm_struct *mm); -extern void efi_recover_from_page_fault(unsigned long phys_addr); +extern void efi_crash_gracefully_on_page_fault(unsigned long phys_addr); extern void efi_free_boot_services(void); +void efi_enter_mm(void); +void efi_leave_mm(void); + /* kexec external ABI */ struct efi_setup_data { u64 fw_vendor; diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 6fe54b2813c1..2b87b191b3b8 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -43,8 +43,6 @@ static __always_inline void arch_check_user_regs(struct pt_regs *regs) } #define arch_check_user_regs arch_check_user_regs -#define ARCH_SYSCALL_EXIT_WORK (_TIF_SINGLESTEP) - static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, unsigned long ti_work) { diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index a5aba4ab0224..ed33a14188f6 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -16,14 +16,37 @@ * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It * disables preemption so be careful if you intend to use it for long periods * of time. - * If you intend to use the FPU in softirq you need to check first with + * If you intend to use the FPU in irq/softirq you need to check first with * irq_fpu_usable() if it is possible. */ -extern void kernel_fpu_begin(void); + +/* Kernel FPU states to initialize in kernel_fpu_begin_mask() */ +#define KFPU_387 _BITUL(0) /* 387 state will be initialized */ +#define KFPU_MXCSR _BITUL(1) /* MXCSR will be initialized */ + +extern void kernel_fpu_begin_mask(unsigned int kfpu_mask); extern void kernel_fpu_end(void); extern bool irq_fpu_usable(void); extern void fpregs_mark_activate(void); +/* Code that is unaware of kernel_fpu_begin_mask() can use this */ +static inline void kernel_fpu_begin(void) +{ +#ifdef CONFIG_X86_64 + /* + * Any 64-bit code that uses 387 instructions must explicitly request + * KFPU_387. 
+ */ + kernel_fpu_begin_mask(KFPU_MXCSR); +#else + /* + * 32-bit kernel code may use 387 operations as well as SSE2, etc, + * as long as it checks that the CPU has the required capability. + */ + kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR); +#endif +} + /* * Use fpregs_lock() while editing CPU's FPU registers or fpu->state. * A context switch will (and softirq might) save CPU's FPU registers to diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 247a60a47331..41e2e2e1b439 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -585,6 +585,9 @@ DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check); #else DECLARE_IDTENTRY_RAW(X86_TRAP_MC, exc_machine_check); #endif +#ifdef CONFIG_XEN_PV +DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check); +#endif #endif /* NMI */ @@ -605,6 +608,9 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_DB, xenpv_exc_debug); /* #DF */ DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault); +#ifdef CONFIG_XEN_PV +DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_DF, xenpv_exc_double_fault); +#endif /* #VC */ #ifdef CONFIG_AMD_MEM_ENCRYPT @@ -613,6 +619,7 @@ DECLARE_IDTENTRY_VC(X86_TRAP_VC, exc_vmm_communication); #ifdef CONFIG_XEN_PV DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER, exc_xen_hypervisor_callback); +DECLARE_IDTENTRY_RAW(X86_TRAP_OTHER, exc_xen_unknown_trap); #endif /* Device interrupts common/spurious */ diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 5e658ba2654a..9abe842dbd84 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -97,6 +97,7 @@ #define INTEL_FAM6_LAKEFIELD 0x8A #define INTEL_FAM6_ALDERLAKE 0x97 +#define INTEL_FAM6_ALDERLAKE_L 0x9A /* "Small Core" Processors (Atom) */ diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 528c8a71fe7f..76d389691b5b 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -40,8 +40,6 @@ extern void native_init_IRQ(void); extern void __handle_irq(struct irq_desc *desc, struct pt_regs *regs); -extern __visible void do_IRQ(struct pt_regs *regs, unsigned long vector); - extern void init_ISA_irqs(void); extern void __init init_IRQ(void); diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 2dfc8d380dab..144d70ea4393 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -35,15 +35,6 @@ extern __always_inline unsigned long native_save_fl(void) return flags; } -extern inline void native_restore_fl(unsigned long flags); -extern inline void native_restore_fl(unsigned long flags) -{ - asm volatile("push %0 ; popf" - : /* no output */ - :"g" (flags) - :"memory", "cc"); -} - static __always_inline void native_irq_disable(void) { asm volatile("cli": : :"memory"); @@ -79,11 +70,6 @@ static __always_inline unsigned long arch_local_save_flags(void) return native_save_fl(); } -static __always_inline void arch_local_irq_restore(unsigned long flags) -{ - native_restore_fl(flags); -} - static __always_inline void arch_local_irq_disable(void) { native_irq_disable(); @@ -131,25 +117,7 @@ static __always_inline unsigned long arch_local_irq_save(void) #define SAVE_FLAGS(x) pushfq; popq %rax #endif -#define SWAPGS swapgs -/* - * Currently paravirt can't handle swapgs nicely when we - * don't have a stack we can rely on (such as a user space - * stack). So we either find a way around these or just fault - * and emulate if a guest tries to call swapgs directly. 
- * - * Either way, this is a good way to document that we don't - * have a reliable stack. x86_64 only. - */ -#define SWAPGS_UNSAFE_STACK swapgs - #define INTERRUPT_RETURN jmp native_iret -#define USERGS_SYSRET64 \ - swapgs; \ - sysretq; -#define USERGS_SYSRET32 \ - swapgs; \ - sysretl #else #define INTERRUPT_RETURN iret @@ -170,6 +138,20 @@ static __always_inline int arch_irqs_disabled(void) return arch_irqs_disabled_flags(flags); } + +static __always_inline void arch_local_irq_restore(unsigned long flags) +{ + if (!arch_irqs_disabled_flags(flags)) + arch_local_irq_enable(); +} +#else +#ifdef CONFIG_X86_64 +#ifdef CONFIG_XEN_PV +#define SWAPGS ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV +#else +#define SWAPGS swapgs +#endif +#endif #endif /* !__ASSEMBLY__ */ #endif diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3ab7b46087b7..3d6616f6f6ef 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1010,9 +1010,21 @@ struct kvm_arch { */ bool tdp_mmu_enabled; - /* List of struct tdp_mmu_pages being used as roots */ + /* + * List of struct kvmp_mmu_pages being used as roots. + * All struct kvm_mmu_pages in the list should have + * tdp_mmu_page set. + * All struct kvm_mmu_pages in the list should have a positive + * root_count except when a thread holds the MMU lock and is removing + * an entry from the list. + */ struct list_head tdp_mmu_roots; - /* List of struct tdp_mmu_pages not being used as roots */ + + /* + * List of struct kvmp_mmu_pages not being used as roots. + * All struct kvm_mmu_pages in the list should have + * tdp_mmu_page set and a root_count of 0. + */ struct list_head tdp_mmu_pages; }; @@ -1287,6 +1299,8 @@ struct kvm_x86_ops { void (*migrate_timers)(struct kvm_vcpu *vcpu); void (*msr_filter_changed)(struct kvm_vcpu *vcpu); int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err); + + void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector); }; struct kvm_x86_nested_ops { @@ -1468,6 +1482,7 @@ int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in); int kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); int kvm_vcpu_halt(struct kvm_vcpu *vcpu); +int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu); int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); diff --git a/arch/x86/include/asm/local64.h b/arch/x86/include/asm/local64.h deleted file mode 100644 index 36c93b5cc239..000000000000 --- a/arch/x86/include/asm/local64.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/local64.h> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 56cdeaac76a0..ddfb3cad8dff 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -289,28 +289,6 @@ extern void (*mce_threshold_vector)(void); extern void (*deferred_error_int_vector)(void); /* - * Thermal handler - */ - -void intel_init_thermal(struct cpuinfo_x86 *c); - -/* Interrupt Handler for core thermal thresholds */ -extern int (*platform_thermal_notify)(__u64 msr_val); - -/* Interrupt Handler for package thermal thresholds */ -extern int (*platform_thermal_package_notify)(__u64 msr_val); - -/* Callback support of rate control, return true, if - * callback has rate control */ -extern bool (*platform_thermal_package_rate_control)(void); - -#ifdef CONFIG_X86_THERMAL_VECTOR -extern void mcheck_intel_therm_init(void); -#else -static inline void mcheck_intel_therm_init(void) { } -#endif - -/* * 
Used by APEI to report memory error via /dev/mcelog */ diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 2b7cc5397f80..ab45a220fac4 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -127,14 +127,12 @@ static inline unsigned int x86_cpuid_family(void) } #ifdef CONFIG_MICROCODE -int __init microcode_init(void); extern void __init load_ucode_bsp(void); extern void load_ucode_ap(void); void reload_early_microcode(void); extern bool get_builtin_firmware(struct cpio_data *cd, const char *name); extern bool initrd_gone; #else -static inline int __init microcode_init(void) { return 0; }; static inline void __init load_ucode_bsp(void) { } static inline void load_ucode_ap(void) { } static inline void reload_early_microcode(void) { } diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index ffc289992d1b..30f76b966857 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -74,6 +74,8 @@ static inline void hv_disable_stimer0_percpu_irq(int irq) {} #if IS_ENABLED(CONFIG_HYPERV) +extern int hyperv_init_cpuhp; + extern void *hv_hypercall_pg; extern void __percpu **hyperv_pcpu_input_arg; diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 0b4920a7238e..e16cccdd0420 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -86,7 +86,7 @@ static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {} * think of extending them - you will be slapped with a stinking trout or a frozen * shark will reach you, wherever you are! You've been warned. */ -static inline unsigned long long notrace __rdmsr(unsigned int msr) +static __always_inline unsigned long long __rdmsr(unsigned int msr) { DECLARE_ARGS(val, low, high); @@ -98,7 +98,7 @@ static inline unsigned long long notrace __rdmsr(unsigned int msr) return EAX_EDX_VAL(val, low, high); } -static inline void notrace __wrmsr(unsigned int msr, u32 low, u32 high) +static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high) { asm volatile("1: wrmsr\n" "2:\n" diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 645bd1d0ee07..64297eabad63 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -66,7 +66,7 @@ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical * address, then that syscall will enter the kernel with a * non-canonical return address, and SYSRET will explode dangerously. - * We avoid this particular problem by preventing anything executable + * We avoid this particular problem by preventing anything * from being mapped at the maximum canonical address. * * On AMD CPUs in the Ryzen family, there's a nasty bug in which the diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index f8dce11d2bc1..4abf110e2243 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -648,11 +648,6 @@ static inline notrace unsigned long arch_local_save_flags(void) return PVOP_CALLEE0(unsigned long, irq.save_fl); } -static inline notrace void arch_local_irq_restore(unsigned long f) -{ - PVOP_VCALLEE1(irq.restore_fl, f); -} - static inline notrace void arch_local_irq_disable(void) { PVOP_VCALLEE0(irq.irq_disable); @@ -776,31 +771,6 @@ extern void default_banner(void); #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT_XXL -/* - * If swapgs is used while the userspace stack is still current, - * there's no way to call a pvop. 
The PV replacement *must* be - * inlined, or the swapgs instruction must be trapped and emulated. - */ -#define SWAPGS_UNSAFE_STACK \ - PARA_SITE(PARA_PATCH(PV_CPU_swapgs), swapgs) - -/* - * Note: swapgs is very special, and in practise is either going to be - * implemented with a single "swapgs" instruction or something very - * special. Either way, we don't need to save any registers for - * it. - */ -#define SWAPGS \ - PARA_SITE(PARA_PATCH(PV_CPU_swapgs), \ - ANNOTATE_RETPOLINE_SAFE; \ - call PARA_INDIRECT(pv_ops+PV_CPU_swapgs); \ - ) - -#define USERGS_SYSRET64 \ - PARA_SITE(PARA_PATCH(PV_CPU_usergs_sysret64), \ - ANNOTATE_RETPOLINE_SAFE; \ - jmp PARA_INDIRECT(pv_ops+PV_CPU_usergs_sysret64);) - #ifdef CONFIG_DEBUG_ENTRY #define SAVE_FLAGS(clobbers) \ PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), \ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index b6b02b7c19cc..de87087d3bde 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -156,20 +156,10 @@ struct pv_cpu_ops { u64 (*read_pmc)(int counter); - /* - * Switch to usermode gs and return to 64-bit usermode using - * sysret. Only used in 64-bit kernels to return to 64-bit - * processes. Usermode register state, including %rsp, must - * already be restored. - */ - void (*usergs_sysret64)(void); - /* Normal iret. Jump to this with the standard iret stack frame set up. */ void (*iret)(void); - void (*swapgs)(void); - void (*start_context_switch)(struct task_struct *prev); void (*end_context_switch)(struct task_struct *next); #endif @@ -178,16 +168,13 @@ struct pv_cpu_ops { struct pv_irq_ops { #ifdef CONFIG_PARAVIRT_XXL /* - * Get/set interrupt state. save_fl and restore_fl are only - * expected to use X86_EFLAGS_IF; all other bits - * returned from save_fl are undefined, and may be ignored by - * restore_fl. + * Get/set interrupt state. save_fl is expected to use X86_EFLAGS_IF; + * all other bits returned from save_fl are undefined. * * NOTE: These functions callers expect the callee to preserve * more registers than the standard C calling convention. 
*/ struct paravirt_callee_save save_fl; - struct paravirt_callee_save restore_fl; struct paravirt_callee_save irq_disable; struct paravirt_callee_save irq_enable; diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 394757ee030a..f24d7ef8fffa 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -177,8 +177,6 @@ enum page_cache_mode { #define __pgprot(x) ((pgprot_t) { (x) } ) #define __pg(x) __pgprot(x) -#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) - #define PAGE_NONE __pg( 0| 0| 0|___A| 0| 0| 0|___G) #define PAGE_SHARED __pg(__PP|__RW|_USR|___A|__NX| 0| 0| 0) #define PAGE_SHARED_EXEC __pg(__PP|__RW|_USR|___A| 0| 0| 0| 0) diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index 3ff0d48469f2..b2d504f11937 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -101,6 +101,7 @@ #define REQUIRED_MASK16 0 #define REQUIRED_MASK17 0 #define REQUIRED_MASK18 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) +#define REQUIRED_MASK19 0 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20) #endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 07603064df8f..d60ed0668a59 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -56,19 +56,22 @@ static void __resctrl_sched_in(void) struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state); u32 closid = state->default_closid; u32 rmid = state->default_rmid; + u32 tmp; /* * If this task has a closid/rmid assigned, use it. * Else use the closid/rmid assigned to this cpu. */ if (static_branch_likely(&rdt_alloc_enable_key)) { - if (current->closid) - closid = current->closid; + tmp = READ_ONCE(current->closid); + if (tmp) + closid = tmp; } if (static_branch_likely(&rdt_mon_enable_key)) { - if (current->rmid) - rmid = current->rmid; + tmp = READ_ONCE(current->rmid); + if (tmp) + rmid = tmp; } if (closid != state->cur_closid || rmid != state->cur_rmid) { diff --git a/arch/x86/include/asm/thermal.h b/arch/x86/include/asm/thermal.h new file mode 100644 index 000000000000..ddbdefd5b94f --- /dev/null +++ b/arch/x86/include/asm/thermal.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_THERMAL_H +#define _ASM_X86_THERMAL_H + +#ifdef CONFIG_X86_THERMAL_VECTOR +void intel_init_thermal(struct cpuinfo_x86 *c); +bool x86_thermal_enabled(void); +void intel_thermal_interrupt(void); +#else +static inline void intel_init_thermal(struct cpuinfo_x86 *c) { } +#endif + +#endif /* _ASM_X86_THERMAL_H */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 488a8e848754..9239399e5491 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -110,6 +110,8 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu); #define topology_die_id(cpu) (cpu_data(cpu).cpu_die_id) #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) +extern unsigned int __max_die_per_package; + #ifdef CONFIG_SMP #define topology_die_cpumask(cpu) (per_cpu(cpu_die_map, cpu)) #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) @@ -118,8 +120,6 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu); extern unsigned int __max_logical_packages; #define topology_max_packages() (__max_logical_packages) -extern unsigned int __max_die_per_package; - static inline int 
topology_max_die_per_package(void) { return __max_die_per_package; diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h index 26efbec94448..9e8ac5073ecb 100644 --- a/arch/x86/include/asm/vm86.h +++ b/arch/x86/include/asm/vm86.h @@ -36,7 +36,6 @@ struct vm86 { unsigned long saved_sp0; unsigned long flags; - unsigned long screen_bitmap; unsigned long cpu_type; struct revectored_struct int_revectored; struct revectored_struct int21_revectored; diff --git a/arch/x86/include/uapi/asm/vm86.h b/arch/x86/include/uapi/asm/vm86.h index d2ee4e307ef8..18909b8050bc 100644 --- a/arch/x86/include/uapi/asm/vm86.h +++ b/arch/x86/include/uapi/asm/vm86.h @@ -97,7 +97,7 @@ struct revectored_struct { struct vm86_struct { struct vm86_regs regs; unsigned long flags; - unsigned long screen_bitmap; + unsigned long screen_bitmap; /* unused, preserved by vm86() */ unsigned long cpu_type; struct revectored_struct int_revectored; struct revectored_struct int21_revectored; @@ -106,7 +106,7 @@ struct vm86_struct { /* * flags masks */ -#define VM86_SCREEN_BITMAP 0x0001 +#define VM86_SCREEN_BITMAP 0x0001 /* no longer supported */ struct vm86plus_info_struct { unsigned long force_return_for_pic:1; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 6bd20c0de8bc..7f4c081f59f0 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -41,6 +41,7 @@ #include <asm/perf_event.h> #include <asm/x86_init.h> #include <linux/atomic.h> +#include <asm/barrier.h> #include <asm/mpspec.h> #include <asm/i8259.h> #include <asm/proto.h> @@ -477,6 +478,9 @@ static int lapic_next_deadline(unsigned long delta, { u64 tsc; + /* This MSR is special and need a special fence: */ + weak_wrmsr_fence(); + tsc = rdtsc(); wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); return 0; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index df6adc5674c9..f4da9bb69a88 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -29,7 +29,8 @@ static void x2apic_send_IPI(int cpu, int vector) { u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu); - x2apic_wrmsr_fence(); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL); } @@ -41,7 +42,8 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) unsigned long flags; u32 dest; - x2apic_wrmsr_fence(); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); local_irq_save(flags); tmpmsk = this_cpu_cpumask_var_ptr(ipi_mask); diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 0e4e81971567..6bde05a86b4e 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -43,7 +43,8 @@ static void x2apic_send_IPI(int cpu, int vector) { u32 dest = per_cpu(x86_cpu_to_apicid, cpu); - x2apic_wrmsr_fence(); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); __x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL); } @@ -54,7 +55,8 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) unsigned long this_cpu; unsigned long flags; - x2apic_wrmsr_fence(); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); local_irq_save(flags); @@ -125,7 +127,8 @@ void __x2apic_send_IPI_shorthand(int vector, u32 which) { unsigned long cfg = __prepare_ICR(which, vector, 0); - x2apic_wrmsr_fence(); + /* x2apic MSRs 
are special and need a special fence: */ + weak_wrmsr_fence(); native_x2apic_icr_write(cfg, 0); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 828be792231e..b14533af7676 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -13,9 +13,6 @@ int main(void) { #ifdef CONFIG_PARAVIRT #ifdef CONFIG_PARAVIRT_XXL - OFFSET(PV_CPU_usergs_sysret64, paravirt_patch_template, - cpu.usergs_sysret64); - OFFSET(PV_CPU_swapgs, paravirt_patch_template, cpu.swapgs); #ifdef CONFIG_DEBUG_ENTRY OFFSET(PV_IRQ_save_fl, paravirt_patch_template, irq.save_fl); #endif diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f8ca66f3d861..347a956f71ca 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -542,12 +542,12 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) u32 ecx; ecx = cpuid_ecx(0x8000001e); - nodes_per_socket = ((ecx >> 8) & 7) + 1; + __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1; } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - nodes_per_socket = ((value >> 3) & 7) + 1; + __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1; } if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 35ad8480c464..9215b91bc044 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -960,6 +960,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x8000000a) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); + if (c->extended_cpuid_level >= 0x8000001f) + c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f); + init_scattered_cpuid_features(c); init_speculation_control(c); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 59a1e3ce3f14..0e422a544835 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -24,6 +24,7 @@ #include <asm/traps.h> #include <asm/resctrl.h> #include <asm/numa.h> +#include <asm/thermal.h> #ifdef CONFIG_X86_64 #include <linux/topology.h> @@ -719,6 +720,8 @@ static void init_intel(struct cpuinfo_x86 *c) tsx_disable(); split_lock_init(); + + intel_init_thermal(c); } #ifdef CONFIG_X86_32 @@ -1159,6 +1162,7 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, 1), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 1), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, 1), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 1), {} }; diff --git a/arch/x86/kernel/cpu/mce/Makefile b/arch/x86/kernel/cpu/mce/Makefile index 9f020c994154..015856abdbb1 100644 --- a/arch/x86/kernel/cpu/mce/Makefile +++ b/arch/x86/kernel/cpu/mce/Makefile @@ -9,8 +9,6 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o mce-inject-y := inject.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o -obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o - obj-$(CONFIG_ACPI_APEI) += apei.o obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 13d3f1cbda17..7962355436da 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -878,6 +878,12 @@ static atomic_t mce_executing; static atomic_t mce_callin; /* + * Track which CPUs entered the MCA broadcast synchronization and which not in + * order to print holdouts. + */ +static cpumask_t mce_missing_cpus = CPU_MASK_ALL; + +/* * Check if a timeout waiting for other CPUs happened. 
*/ static int mce_timed_out(u64 *t, const char *msg) @@ -894,8 +900,12 @@ static int mce_timed_out(u64 *t, const char *msg) if (!mca_cfg.monarch_timeout) goto out; if ((s64)*t < SPINUNIT) { - if (mca_cfg.tolerant <= 1) + if (mca_cfg.tolerant <= 1) { + if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus)) + pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n", + cpumask_pr_args(&mce_missing_cpus)); mce_panic(msg, NULL, NULL); + } cpu_missing = 1; return 1; } @@ -1006,6 +1016,7 @@ static int mce_start(int *no_way_out) * is updated before mce_callin. */ order = atomic_inc_return(&mce_callin); + cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus); /* * Wait for everyone. @@ -1114,6 +1125,7 @@ static int mce_end(int order) reset: atomic_set(&global_nwo, 0); atomic_set(&mce_callin, 0); + cpumask_setall(&mce_missing_cpus); barrier(); /* @@ -1992,10 +2004,9 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) * that out because it's an indirect call. Annotate it. */ instrumentation_begin(); - trace_hardirqs_off_finish(); + machine_check_vector(regs); - if (regs->flags & X86_EFLAGS_IF) - trace_hardirqs_on_prepare(); + instrumentation_end(); irqentry_nmi_exit(regs, irq_state); } @@ -2004,7 +2015,9 @@ static __always_inline void exc_machine_check_user(struct pt_regs *regs) { irqentry_enter_from_user_mode(regs); instrumentation_begin(); + machine_check_vector(regs); + instrumentation_end(); irqentry_exit_to_user_mode(regs); } @@ -2177,7 +2190,6 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { - mcheck_intel_therm_init(); mce_register_decode_chain(&early_nb); mce_register_decode_chain(&mce_uc_nb); mce_register_decode_chain(&mce_default_nb); @@ -2712,6 +2724,7 @@ static void mce_reset(void) atomic_set(&mce_executing, 0); atomic_set(&mce_callin, 0); atomic_set(&global_nwo, 0); + cpumask_setall(&mce_missing_cpus); } static int fake_panic_get(void *data, u64 *val) diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index c2476fe0682e..e309476743b7 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -531,7 +531,6 @@ static void intel_imc_init(struct cpuinfo_x86 *c) void mce_intel_feature_init(struct cpuinfo_x86 *c) { - intel_init_thermal(c); intel_init_cmci(); intel_init_lmce(); intel_ppin_init(c); diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c deleted file mode 100644 index a7cd2d203ced..000000000000 --- a/arch/x86/kernel/cpu/mce/therm_throt.c +++ /dev/null @@ -1,739 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Thermal throttle event support code (such as syslog messaging and rate - * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). - * - * This allows consistent reporting of CPU thermal throttle events. - * - * Maintains a counter in /sys that keeps track of the number of thermal - * events, such that the user knows how bad the thermal problem might be - * (since the logging to syslog is rate limited). - * - * Author: Dmitriy Zavin (dmitriyz@google.com) - * - * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. - * Inspired by Ross Biro's and Al Borchers' counter code. 
- */ -#include <linux/interrupt.h> -#include <linux/notifier.h> -#include <linux/jiffies.h> -#include <linux/kernel.h> -#include <linux/percpu.h> -#include <linux/export.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/cpu.h> - -#include <asm/processor.h> -#include <asm/traps.h> -#include <asm/apic.h> -#include <asm/mce.h> -#include <asm/msr.h> -#include <asm/trace/irq_vectors.h> - -#include "internal.h" - -/* How long to wait between reporting thermal events */ -#define CHECK_INTERVAL (300 * HZ) - -#define THERMAL_THROTTLING_EVENT 0 -#define POWER_LIMIT_EVENT 1 - -/** - * struct _thermal_state - Represent the current thermal event state - * @next_check: Stores the next timestamp, when it is allowed - * to log the next warning message. - * @last_interrupt_time: Stores the timestamp for the last threshold - * high event. - * @therm_work: Delayed workqueue structure - * @count: Stores the current running count for thermal - * or power threshold interrupts. - * @last_count: Stores the previous running count for thermal - * or power threshold interrupts. - * @max_time_ms: This shows the maximum amount of time CPU was - * in throttled state for a single thermal - * threshold high to low state. - * @total_time_ms: This is a cumulative time during which CPU was - * in the throttled state. - * @rate_control_active: Set when a throttling message is logged. - * This is used for the purpose of rate-control. - * @new_event: Stores the last high/low status of the - * THERM_STATUS_PROCHOT or - * THERM_STATUS_POWER_LIMIT. - * @level: Stores whether this _thermal_state instance is - * for a CORE level or for PACKAGE level. - * @sample_index: Index for storing the next sample in the buffer - * temp_samples[]. - * @sample_count: Total number of samples collected in the buffer - * temp_samples[]. - * @average: The last moving average of temperature samples - * @baseline_temp: Temperature at which thermal threshold high - * interrupt was generated. - * @temp_samples: Storage for temperature samples to calculate - * moving average. - * - * This structure is used to represent data related to thermal state for a CPU. - * There is a separate storage for core and package level for each CPU. 
- */ -struct _thermal_state { - u64 next_check; - u64 last_interrupt_time; - struct delayed_work therm_work; - unsigned long count; - unsigned long last_count; - unsigned long max_time_ms; - unsigned long total_time_ms; - bool rate_control_active; - bool new_event; - u8 level; - u8 sample_index; - u8 sample_count; - u8 average; - u8 baseline_temp; - u8 temp_samples[3]; -}; - -struct thermal_state { - struct _thermal_state core_throttle; - struct _thermal_state core_power_limit; - struct _thermal_state package_throttle; - struct _thermal_state package_power_limit; - struct _thermal_state core_thresh0; - struct _thermal_state core_thresh1; - struct _thermal_state pkg_thresh0; - struct _thermal_state pkg_thresh1; -}; - -/* Callback to handle core threshold interrupts */ -int (*platform_thermal_notify)(__u64 msr_val); -EXPORT_SYMBOL(platform_thermal_notify); - -/* Callback to handle core package threshold_interrupts */ -int (*platform_thermal_package_notify)(__u64 msr_val); -EXPORT_SYMBOL_GPL(platform_thermal_package_notify); - -/* Callback support of rate control, return true, if - * callback has rate control */ -bool (*platform_thermal_package_rate_control)(void); -EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control); - - -static DEFINE_PER_CPU(struct thermal_state, thermal_state); - -static atomic_t therm_throt_en = ATOMIC_INIT(0); - -static u32 lvtthmr_init __read_mostly; - -#ifdef CONFIG_SYSFS -#define define_therm_throt_device_one_ro(_name) \ - static DEVICE_ATTR(_name, 0444, \ - therm_throt_device_show_##_name, \ - NULL) \ - -#define define_therm_throt_device_show_func(event, name) \ - \ -static ssize_t therm_throt_device_show_##event##_##name( \ - struct device *dev, \ - struct device_attribute *attr, \ - char *buf) \ -{ \ - unsigned int cpu = dev->id; \ - ssize_t ret; \ - \ - preempt_disable(); /* CPU hotplug */ \ - if (cpu_online(cpu)) { \ - ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_state, cpu).event.name); \ - } else \ - ret = 0; \ - preempt_enable(); \ - \ - return ret; \ -} - -define_therm_throt_device_show_func(core_throttle, count); -define_therm_throt_device_one_ro(core_throttle_count); - -define_therm_throt_device_show_func(core_power_limit, count); -define_therm_throt_device_one_ro(core_power_limit_count); - -define_therm_throt_device_show_func(package_throttle, count); -define_therm_throt_device_one_ro(package_throttle_count); - -define_therm_throt_device_show_func(package_power_limit, count); -define_therm_throt_device_one_ro(package_power_limit_count); - -define_therm_throt_device_show_func(core_throttle, max_time_ms); -define_therm_throt_device_one_ro(core_throttle_max_time_ms); - -define_therm_throt_device_show_func(package_throttle, max_time_ms); -define_therm_throt_device_one_ro(package_throttle_max_time_ms); - -define_therm_throt_device_show_func(core_throttle, total_time_ms); -define_therm_throt_device_one_ro(core_throttle_total_time_ms); - -define_therm_throt_device_show_func(package_throttle, total_time_ms); -define_therm_throt_device_one_ro(package_throttle_total_time_ms); - -static struct attribute *thermal_throttle_attrs[] = { - &dev_attr_core_throttle_count.attr, - &dev_attr_core_throttle_max_time_ms.attr, - &dev_attr_core_throttle_total_time_ms.attr, - NULL -}; - -static const struct attribute_group thermal_attr_group = { - .attrs = thermal_throttle_attrs, - .name = "thermal_throttle" -}; -#endif /* CONFIG_SYSFS */ - -#define CORE_LEVEL 0 -#define PACKAGE_LEVEL 1 - -#define THERM_THROT_POLL_INTERVAL HZ -#define THERM_STATUS_PROCHOT_LOG BIT(1) - 
-#define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15)) -#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11)) - -static void clear_therm_status_log(int level) -{ - int msr; - u64 mask, msr_val; - - if (level == CORE_LEVEL) { - msr = MSR_IA32_THERM_STATUS; - mask = THERM_STATUS_CLEAR_CORE_MASK; - } else { - msr = MSR_IA32_PACKAGE_THERM_STATUS; - mask = THERM_STATUS_CLEAR_PKG_MASK; - } - - rdmsrl(msr, msr_val); - msr_val &= mask; - wrmsrl(msr, msr_val & ~THERM_STATUS_PROCHOT_LOG); -} - -static void get_therm_status(int level, bool *proc_hot, u8 *temp) -{ - int msr; - u64 msr_val; - - if (level == CORE_LEVEL) - msr = MSR_IA32_THERM_STATUS; - else - msr = MSR_IA32_PACKAGE_THERM_STATUS; - - rdmsrl(msr, msr_val); - if (msr_val & THERM_STATUS_PROCHOT_LOG) - *proc_hot = true; - else - *proc_hot = false; - - *temp = (msr_val >> 16) & 0x7F; -} - -static void __maybe_unused throttle_active_work(struct work_struct *work) -{ - struct _thermal_state *state = container_of(to_delayed_work(work), - struct _thermal_state, therm_work); - unsigned int i, avg, this_cpu = smp_processor_id(); - u64 now = get_jiffies_64(); - bool hot; - u8 temp; - - get_therm_status(state->level, &hot, &temp); - /* temperature value is offset from the max so lesser means hotter */ - if (!hot && temp > state->baseline_temp) { - if (state->rate_control_active) - pr_info("CPU%d: %s temperature/speed normal (total events = %lu)\n", - this_cpu, - state->level == CORE_LEVEL ? "Core" : "Package", - state->count); - - state->rate_control_active = false; - return; - } - - if (time_before64(now, state->next_check) && - state->rate_control_active) - goto re_arm; - - state->next_check = now + CHECK_INTERVAL; - - if (state->count != state->last_count) { - /* There was one new thermal interrupt */ - state->last_count = state->count; - state->average = 0; - state->sample_count = 0; - state->sample_index = 0; - } - - state->temp_samples[state->sample_index] = temp; - state->sample_count++; - state->sample_index = (state->sample_index + 1) % ARRAY_SIZE(state->temp_samples); - if (state->sample_count < ARRAY_SIZE(state->temp_samples)) - goto re_arm; - - avg = 0; - for (i = 0; i < ARRAY_SIZE(state->temp_samples); ++i) - avg += state->temp_samples[i]; - - avg /= ARRAY_SIZE(state->temp_samples); - - if (state->average > avg) { - pr_warn("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n", - this_cpu, - state->level == CORE_LEVEL ? "Core" : "Package", - state->count); - state->rate_control_active = true; - } - - state->average = avg; - -re_arm: - clear_therm_status_log(state->level); - schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL); -} - -/*** - * therm_throt_process - Process thermal throttling event from interrupt - * @curr: Whether the condition is current or not (boolean), since the - * thermal interrupt normally gets called both when the thermal - * event begins and once the event has ended. - * - * This function is called by the thermal interrupt after the - * IRQ has been acknowledged. - * - * It will take care of rate limiting and printing messages to the syslog. 
- */ -static void therm_throt_process(bool new_event, int event, int level) -{ - struct _thermal_state *state; - unsigned int this_cpu = smp_processor_id(); - bool old_event; - u64 now; - struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); - - now = get_jiffies_64(); - if (level == CORE_LEVEL) { - if (event == THERMAL_THROTTLING_EVENT) - state = &pstate->core_throttle; - else if (event == POWER_LIMIT_EVENT) - state = &pstate->core_power_limit; - else - return; - } else if (level == PACKAGE_LEVEL) { - if (event == THERMAL_THROTTLING_EVENT) - state = &pstate->package_throttle; - else if (event == POWER_LIMIT_EVENT) - state = &pstate->package_power_limit; - else - return; - } else - return; - - old_event = state->new_event; - state->new_event = new_event; - - if (new_event) - state->count++; - - if (event != THERMAL_THROTTLING_EVENT) - return; - - if (new_event && !state->last_interrupt_time) { - bool hot; - u8 temp; - - get_therm_status(state->level, &hot, &temp); - /* - * Ignore short temperature spike as the system is not close - * to PROCHOT. 10C offset is large enough to ignore. It is - * already dropped from the high threshold temperature. - */ - if (temp > 10) - return; - - state->baseline_temp = temp; - state->last_interrupt_time = now; - schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL); - } else if (old_event && state->last_interrupt_time) { - unsigned long throttle_time; - - throttle_time = jiffies_delta_to_msecs(now - state->last_interrupt_time); - if (throttle_time > state->max_time_ms) - state->max_time_ms = throttle_time; - state->total_time_ms += throttle_time; - state->last_interrupt_time = 0; - } -} - -static int thresh_event_valid(int level, int event) -{ - struct _thermal_state *state; - unsigned int this_cpu = smp_processor_id(); - struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); - u64 now = get_jiffies_64(); - - if (level == PACKAGE_LEVEL) - state = (event == 0) ? &pstate->pkg_thresh0 : - &pstate->pkg_thresh1; - else - state = (event == 0) ? 
&pstate->core_thresh0 : - &pstate->core_thresh1; - - if (time_before64(now, state->next_check)) - return 0; - - state->next_check = now + CHECK_INTERVAL; - - return 1; -} - -static bool int_pln_enable; -static int __init int_pln_enable_setup(char *s) -{ - int_pln_enable = true; - - return 1; -} -__setup("int_pln_enable", int_pln_enable_setup); - -#ifdef CONFIG_SYSFS -/* Add/Remove thermal_throttle interface for CPU device: */ -static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu) -{ - int err; - struct cpuinfo_x86 *c = &cpu_data(cpu); - - err = sysfs_create_group(&dev->kobj, &thermal_attr_group); - if (err) - return err; - - if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) { - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_core_power_limit_count.attr, - thermal_attr_group.name); - if (err) - goto del_group; - } - - if (cpu_has(c, X86_FEATURE_PTS)) { - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_package_throttle_count.attr, - thermal_attr_group.name); - if (err) - goto del_group; - - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_package_throttle_max_time_ms.attr, - thermal_attr_group.name); - if (err) - goto del_group; - - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_package_throttle_total_time_ms.attr, - thermal_attr_group.name); - if (err) - goto del_group; - - if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) { - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_package_power_limit_count.attr, - thermal_attr_group.name); - if (err) - goto del_group; - } - } - - return 0; - -del_group: - sysfs_remove_group(&dev->kobj, &thermal_attr_group); - - return err; -} - -static void thermal_throttle_remove_dev(struct device *dev) -{ - sysfs_remove_group(&dev->kobj, &thermal_attr_group); -} - -/* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int thermal_throttle_online(unsigned int cpu) -{ - struct thermal_state *state = &per_cpu(thermal_state, cpu); - struct device *dev = get_cpu_device(cpu); - u32 l; - - state->package_throttle.level = PACKAGE_LEVEL; - state->core_throttle.level = CORE_LEVEL; - - INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work); - INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work); - - /* Unmask the thermal vector after the above workqueues are initialized. */ - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - - return thermal_throttle_add_dev(dev, cpu); -} - -static int thermal_throttle_offline(unsigned int cpu) -{ - struct thermal_state *state = &per_cpu(thermal_state, cpu); - struct device *dev = get_cpu_device(cpu); - u32 l; - - /* Mask the thermal vector before draining evtl. pending work */ - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l | APIC_LVT_MASKED); - - cancel_delayed_work_sync(&state->package_throttle.therm_work); - cancel_delayed_work_sync(&state->core_throttle.therm_work); - - state->package_throttle.rate_control_active = false; - state->core_throttle.rate_control_active = false; - - thermal_throttle_remove_dev(dev); - return 0; -} - -static __init int thermal_throttle_init_device(void) -{ - int ret; - - if (!atomic_read(&therm_throt_en)) - return 0; - - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online", - thermal_throttle_online, - thermal_throttle_offline); - return ret < 0 ? 
ret : 0; -} -device_initcall(thermal_throttle_init_device); - -#endif /* CONFIG_SYSFS */ - -static void notify_package_thresholds(__u64 msr_val) -{ - bool notify_thres_0 = false; - bool notify_thres_1 = false; - - if (!platform_thermal_package_notify) - return; - - /* lower threshold check */ - if (msr_val & THERM_LOG_THRESHOLD0) - notify_thres_0 = true; - /* higher threshold check */ - if (msr_val & THERM_LOG_THRESHOLD1) - notify_thres_1 = true; - - if (!notify_thres_0 && !notify_thres_1) - return; - - if (platform_thermal_package_rate_control && - platform_thermal_package_rate_control()) { - /* Rate control is implemented in callback */ - platform_thermal_package_notify(msr_val); - return; - } - - /* lower threshold reached */ - if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0)) - platform_thermal_package_notify(msr_val); - /* higher threshold reached */ - if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1)) - platform_thermal_package_notify(msr_val); -} - -static void notify_thresholds(__u64 msr_val) -{ - /* check whether the interrupt handler is defined; - * otherwise simply return - */ - if (!platform_thermal_notify) - return; - - /* lower threshold reached */ - if ((msr_val & THERM_LOG_THRESHOLD0) && - thresh_event_valid(CORE_LEVEL, 0)) - platform_thermal_notify(msr_val); - /* higher threshold reached */ - if ((msr_val & THERM_LOG_THRESHOLD1) && - thresh_event_valid(CORE_LEVEL, 1)) - platform_thermal_notify(msr_val); -} - -/* Thermal transition interrupt handler */ -static void intel_thermal_interrupt(void) -{ - __u64 msr_val; - - if (static_cpu_has(X86_FEATURE_HWP)) - wrmsrl_safe(MSR_HWP_STATUS, 0); - - rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - - /* Check for violation of core thermal thresholds*/ - notify_thresholds(msr_val); - - therm_throt_process(msr_val & THERM_STATUS_PROCHOT, - THERMAL_THROTTLING_EVENT, - CORE_LEVEL); - - if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) - therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, - POWER_LIMIT_EVENT, - CORE_LEVEL); - - if (this_cpu_has(X86_FEATURE_PTS)) { - rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); - /* check violations of package thermal thresholds */ - notify_package_thresholds(msr_val); - therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, - THERMAL_THROTTLING_EVENT, - PACKAGE_LEVEL); - if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) - therm_throt_process(msr_val & - PACKAGE_THERM_STATUS_POWER_LIMIT, - POWER_LIMIT_EVENT, - PACKAGE_LEVEL); - } -} - -static void unexpected_thermal_interrupt(void) -{ - pr_err("CPU%d: Unexpected LVT thermal interrupt!\n", - smp_processor_id()); -} - -static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; - -DEFINE_IDTENTRY_SYSVEC(sysvec_thermal) -{ - trace_thermal_apic_entry(THERMAL_APIC_VECTOR); - inc_irq_stat(irq_thermal_count); - smp_thermal_vector(); - trace_thermal_apic_exit(THERMAL_APIC_VECTOR); - ack_APIC_irq(); -} - -/* Thermal monitoring depends on APIC, ACPI and clock modulation */ -static int intel_thermal_supported(struct cpuinfo_x86 *c) -{ - if (!boot_cpu_has(X86_FEATURE_APIC)) - return 0; - if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) - return 0; - return 1; -} - -void __init mcheck_intel_therm_init(void) -{ - /* - * This function is only called on boot CPU. 
Save the init thermal - * LVT value on BSP and use that value to restore APs' thermal LVT - * entry BIOS programmed later - */ - if (intel_thermal_supported(&boot_cpu_data)) - lvtthmr_init = apic_read(APIC_LVTTHMR); -} - -void intel_init_thermal(struct cpuinfo_x86 *c) -{ - unsigned int cpu = smp_processor_id(); - int tm2 = 0; - u32 l, h; - - if (!intel_thermal_supported(c)) - return; - - /* - * First check if its enabled already, in which case there might - * be some SMM goo which handles it, so we can't even put a handler - * since it might be delivered via SMI already: - */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - - h = lvtthmr_init; - /* - * The initial value of thermal LVT entries on all APs always reads - * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI - * sequence to them and LVT registers are reset to 0s except for - * the mask bits which are set to 1s when APs receive INIT IPI. - * If BIOS takes over the thermal interrupt and sets its interrupt - * delivery mode to SMI (not fixed), it restores the value that the - * BIOS has programmed on AP based on BSP's info we saved since BIOS - * is always setting the same value for all threads/cores. - */ - if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED) - apic_write(APIC_LVTTHMR, lvtthmr_init); - - - if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { - if (system_state == SYSTEM_BOOTING) - pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu); - return; - } - - /* early Pentium M models use different method for enabling TM2 */ - if (cpu_has(c, X86_FEATURE_TM2)) { - if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) { - rdmsr(MSR_THERM2_CTL, l, h); - if (l & MSR_THERM2_CTL_TM_SELECT) - tm2 = 1; - } else if (l & MSR_IA32_MISC_ENABLE_TM2) - tm2 = 1; - } - - /* We'll mask the thermal vector in the lapic till we're ready: */ - h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; - apic_write(APIC_LVTTHMR, h); - - rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) - wrmsr(MSR_IA32_THERM_INTERRUPT, - (l | (THERM_INT_LOW_ENABLE - | THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h); - else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) - wrmsr(MSR_IA32_THERM_INTERRUPT, - l | (THERM_INT_LOW_ENABLE - | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h); - else - wrmsr(MSR_IA32_THERM_INTERRUPT, - l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); - - if (cpu_has(c, X86_FEATURE_PTS)) { - rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); - if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) - wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, - (l | (PACKAGE_THERM_INT_LOW_ENABLE - | PACKAGE_THERM_INT_HIGH_ENABLE)) - & ~PACKAGE_THERM_INT_PLN_ENABLE, h); - else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) - wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, - l | (PACKAGE_THERM_INT_LOW_ENABLE - | PACKAGE_THERM_INT_HIGH_ENABLE - | PACKAGE_THERM_INT_PLN_ENABLE), h); - else - wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, - l | (PACKAGE_THERM_INT_LOW_ENABLE - | PACKAGE_THERM_INT_HIGH_ENABLE), h); - } - - smp_thermal_vector = intel_thermal_interrupt; - - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); - - pr_info_once("CPU0: Thermal monitoring enabled (%s)\n", - tm2 ? 
"TM2" : "TM1"); - - /* enable thermal throttle processing */ - atomic_set(&therm_throt_en, 1); -} diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index ec6f0415bc6d..b935e1b5f115 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -830,7 +830,7 @@ static const struct attribute_group cpu_root_microcode_group = { .attrs = cpu_root_microcode_attrs, }; -int __init microcode_init(void) +static int __init microcode_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; int error; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index f628e3dc150f..43b54bef5448 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -135,14 +135,32 @@ static void hv_machine_shutdown(void) { if (kexec_in_progress && hv_kexec_handler) hv_kexec_handler(); + + /* + * Call hv_cpu_die() on all the CPUs, otherwise later the hypervisor + * corrupts the old VP Assist Pages and can crash the kexec kernel. + */ + if (kexec_in_progress && hyperv_init_cpuhp > 0) + cpuhp_remove_state(hyperv_init_cpuhp); + + /* The function calls stop_other_cpus(). */ native_machine_shutdown(); + + /* Disable the hypercall page when there is only 1 active CPU. */ + if (kexec_in_progress) + hyperv_cleanup(); } static void hv_machine_crash_shutdown(struct pt_regs *regs) { if (hv_crash_handler) hv_crash_handler(regs); + + /* The function calls crash_smp_send_stop(). */ native_machine_crash_shutdown(regs); + + /* Disable the hypercall page when there is only 1 active CPU. */ + hyperv_cleanup(); } #endif /* CONFIG_KEXEC_CORE */ #endif /* CONFIG_HYPERV */ diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 5bd011737272..9231640782fa 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -537,9 +537,9 @@ static void __init print_out_mtrr_range_state(void) if (!size_base) continue; - size_base = to_size_factor(size_base, &size_factor), + size_base = to_size_factor(size_base, &size_factor); start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); - start_base = to_size_factor(start_base, &start_factor), + start_base = to_size_factor(start_base, &start_factor); type = range_state[i].type; pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n", diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 23ad8e953dfb..b90f3f437765 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -3,7 +3,6 @@ * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong * because MTRRs can span up to 40 bits (36bits on most modern x86) */ -#define DEBUG #include <linux/export.h> #include <linux/init.h> @@ -167,9 +166,6 @@ static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, *repeat = 0; *uniform = 1; - /* Make end inclusive instead of exclusive */ - end--; - prev_match = MTRR_TYPE_INVALID; for (i = 0; i < num_var_ranges; ++i) { unsigned short start_state, end_state, inclusive; @@ -261,6 +257,9 @@ u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform) int repeat; u64 partial_end; + /* Make end inclusive instead of exclusive */ + end--; + if (!mtrr_state_set) return MTRR_TYPE_INVALID; diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 61eb26edc6d2..28c8a23aa42e 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -31,8 +31,6 @@ System Programming Guide; Section 9.11. (1997 edition - PPro). 
*/ -#define DEBUG - #include <linux/types.h> /* FIXME: kvm_para.h needs this */ #include <linux/stop_machine.h> diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index ee71c47844cb..c4d320d02fd5 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -572,6 +572,7 @@ union cpuid_0x10_x_edx { void rdt_last_cmd_clear(void); void rdt_last_cmd_puts(const char *s); +__printf(1, 2) void rdt_last_cmd_printf(const char *fmt, ...); void rdt_ctrl_update(void *arg); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 29ffb95b25ff..f9190adc52cb 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -525,89 +525,70 @@ static void rdtgroup_remove(struct rdtgroup *rdtgrp) kfree(rdtgrp); } -struct task_move_callback { - struct callback_head work; - struct rdtgroup *rdtgrp; -}; - -static void move_myself(struct callback_head *head) +static void _update_task_closid_rmid(void *task) { - struct task_move_callback *callback; - struct rdtgroup *rdtgrp; - - callback = container_of(head, struct task_move_callback, work); - rdtgrp = callback->rdtgrp; - /* - * If resource group was deleted before this task work callback - * was invoked, then assign the task to root group and free the - * resource group. + * If the task is still current on this CPU, update PQR_ASSOC MSR. + * Otherwise, the MSR is updated when the task is scheduled in. */ - if (atomic_dec_and_test(&rdtgrp->waitcount) && - (rdtgrp->flags & RDT_DELETED)) { - current->closid = 0; - current->rmid = 0; - rdtgroup_remove(rdtgrp); - } - - if (unlikely(current->flags & PF_EXITING)) - goto out; - - preempt_disable(); - /* update PQR_ASSOC MSR to make resource group go into effect */ - resctrl_sched_in(); - preempt_enable(); + if (task == current) + resctrl_sched_in(); +} -out: - kfree(callback); +static void update_task_closid_rmid(struct task_struct *t) +{ + if (IS_ENABLED(CONFIG_SMP) && task_curr(t)) + smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1); + else + _update_task_closid_rmid(t); } static int __rdtgroup_move_task(struct task_struct *tsk, struct rdtgroup *rdtgrp) { - struct task_move_callback *callback; - int ret; - - callback = kzalloc(sizeof(*callback), GFP_KERNEL); - if (!callback) - return -ENOMEM; - callback->work.func = move_myself; - callback->rdtgrp = rdtgrp; + /* If the task is already in rdtgrp, no need to move the task. */ + if ((rdtgrp->type == RDTCTRL_GROUP && tsk->closid == rdtgrp->closid && + tsk->rmid == rdtgrp->mon.rmid) || + (rdtgrp->type == RDTMON_GROUP && tsk->rmid == rdtgrp->mon.rmid && + tsk->closid == rdtgrp->mon.parent->closid)) + return 0; /* - * Take a refcount, so rdtgrp cannot be freed before the - * callback has been invoked. + * Set the task's closid/rmid before the PQR_ASSOC MSR can be + * updated by them. + * + * For ctrl_mon groups, move both closid and rmid. + * For monitor groups, can move the tasks only from + * their parent CTRL group. */ - atomic_inc(&rdtgrp->waitcount); - ret = task_work_add(tsk, &callback->work, TWA_RESUME); - if (ret) { - /* - * Task is exiting. Drop the refcount and free the callback. - * No need to check the refcount as the group cannot be - * deleted before the write function unlocks rdtgroup_mutex. - */ - atomic_dec(&rdtgrp->waitcount); - kfree(callback); - rdt_last_cmd_puts("Task exited\n"); - } else { - /* - * For ctrl_mon groups move both closid and rmid. 
- * For monitor groups, can move the tasks only from - * their parent CTRL group. - */ - if (rdtgrp->type == RDTCTRL_GROUP) { - tsk->closid = rdtgrp->closid; - tsk->rmid = rdtgrp->mon.rmid; - } else if (rdtgrp->type == RDTMON_GROUP) { - if (rdtgrp->mon.parent->closid == tsk->closid) { - tsk->rmid = rdtgrp->mon.rmid; - } else { - rdt_last_cmd_puts("Can't move task to different control group\n"); - ret = -EINVAL; - } + + if (rdtgrp->type == RDTCTRL_GROUP) { + WRITE_ONCE(tsk->closid, rdtgrp->closid); + WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); + } else if (rdtgrp->type == RDTMON_GROUP) { + if (rdtgrp->mon.parent->closid == tsk->closid) { + WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); + } else { + rdt_last_cmd_puts("Can't move task to different control group\n"); + return -EINVAL; } } - return ret; + + /* + * Ensure the task's closid and rmid are written before determining if + * the task is current that will decide if it will be interrupted. + */ + barrier(); + + /* + * By now, the task's closid and rmid are set. If the task is current + * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource + * group go into effect. If the task is not current, the MSR will be + * updated when the task is scheduled in. + */ + update_task_closid_rmid(tsk); + + return 0; } static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) @@ -2329,22 +2310,18 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, for_each_process_thread(p, t) { if (!from || is_closid_match(t, from) || is_rmid_match(t, from)) { - t->closid = to->closid; - t->rmid = to->mon.rmid; + WRITE_ONCE(t->closid, to->closid); + WRITE_ONCE(t->rmid, to->mon.rmid); -#ifdef CONFIG_SMP /* - * This is safe on x86 w/o barriers as the ordering - * of writing to task_cpu() and t->on_cpu is - * reverse to the reading here. The detection is - * inaccurate as tasks might move or schedule - * before the smp function call takes place. In - * such a case the function call is pointless, but + * If the task is on a CPU, set the CPU in the mask. + * The detection is inaccurate as tasks might move or + * schedule before the smp function call takes place. + * In such a case the function call is pointless, but * there is no other side effect. 
*/ - if (mask && t->on_cpu) + if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) cpumask_set_cpu(task_cpu(t), mask); -#endif } } read_unlock(&tasklist_lock); diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 236924930bf0..972ec3bfa9c0 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -40,11 +40,6 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, - { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, - { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 }, - { X86_FEATURE_SEV_ES, CPUID_EAX, 3, 0x8000001f, 0 }, - { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 }, - { X86_FEATURE_VM_PAGE_FLUSH, CPUID_EAX, 2, 0x8000001f, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index f2eac41bb4ff..8ce6d8371cfb 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -72,6 +72,9 @@ static int sgx_release(struct inode *inode, struct file *file) synchronize_srcu(&encl->srcu); mmu_notifier_unregister(&encl_mm->mmu_notifier, encl_mm->mm); kfree(encl_mm); + + /* 'encl_mm' is gone, put encl_mm->encl reference: */ + kref_put(&encl->refcount, sgx_encl_release); } kref_put(&encl->refcount, sgx_encl_release); diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index ee50a5010277..7449ef33f081 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -141,7 +141,6 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) struct sgx_encl_page *entry; unsigned long phys_addr; struct sgx_encl *encl; - unsigned long pfn; vm_fault_t ret; encl = vma->vm_private_data; @@ -168,13 +167,6 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) phys_addr = sgx_get_epc_phys_addr(entry->epc_page); - /* Check if another thread got here first to insert the PTE. 
*/ - if (!follow_pfn(vma, addr, &pfn)) { - mutex_unlock(&encl->lock); - - return VM_FAULT_NOPAGE; - } - ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr)); if (ret != VM_FAULT_NOPAGE) { mutex_unlock(&encl->lock); @@ -481,6 +473,9 @@ static void sgx_mmu_notifier_free(struct mmu_notifier *mn) { struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier); + /* 'encl_mm' is going away, put encl_mm->encl reference: */ + kref_put(&encl_mm->encl->refcount, sgx_encl_release); + kfree(encl_mm); } @@ -534,6 +529,8 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) if (!encl_mm) return -ENOMEM; + /* Grab a refcount for the encl_mm->encl reference: */ + kref_get(&encl->refcount); encl_mm->encl = encl; encl_mm->mm = mm; encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops; diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index c519fc5f6948..8df81a3ed945 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -700,25 +700,27 @@ static bool __init sgx_page_cache_init(void) return true; } -static void __init sgx_init(void) +static int __init sgx_init(void) { int ret; int i; if (!cpu_feature_enabled(X86_FEATURE_SGX)) - return; + return -ENODEV; if (!sgx_page_cache_init()) - return; + return -ENOMEM; - if (!sgx_page_reclaimer_init()) + if (!sgx_page_reclaimer_init()) { + ret = -ENOMEM; goto err_page_cache; + } ret = sgx_drv_init(); if (ret) goto err_kthread; - return; + return 0; err_kthread: kthread_stop(ksgxd_tsk); @@ -728,6 +730,8 @@ err_page_cache: vfree(sgx_epc_sections[i].pages); memunmap(sgx_epc_sections[i].virt_addr); } + + return ret; } device_initcall(sgx_init); diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 1068002c8532..8678864ce712 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -25,10 +25,10 @@ #define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) #define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) -#ifdef CONFIG_SMP unsigned int __max_die_per_package __read_mostly = 1; EXPORT_SYMBOL(__max_die_per_package); +#ifdef CONFIG_SMP /* * Check if given CPUID extended toplogy "leaf" is implemented */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index eb86a2b831b1..571220ac8bea 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -121,7 +121,7 @@ int copy_fpregs_to_fpstate(struct fpu *fpu) } EXPORT_SYMBOL(copy_fpregs_to_fpstate); -void kernel_fpu_begin(void) +void kernel_fpu_begin_mask(unsigned int kfpu_mask) { preempt_disable(); @@ -141,13 +141,14 @@ void kernel_fpu_begin(void) } __cpu_invalidate_fpregs_state(); - if (boot_cpu_has(X86_FEATURE_XMM)) + /* Put sane initial values into the control registers. 
*/ + if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM)) ldmxcsr(MXCSR_DEFAULT); - if (boot_cpu_has(X86_FEATURE_FPU)) + if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU)) asm volatile ("fninit"); } -EXPORT_SYMBOL_GPL(kernel_fpu_begin); +EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask); void kernel_fpu_end(void) { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 5d8047441a0a..683749b80ae2 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -167,14 +167,14 @@ void fpstate_sanitize_xstate(struct fpu *fpu) fx->fop = 0; fx->rip = 0; fx->rdp = 0; - memset(&fx->st_space[0], 0, 128); + memset(fx->st_space, 0, sizeof(fx->st_space)); } /* * SSE is in init state */ if (!(xfeatures & XFEATURE_MASK_SSE)) - memset(&fx->xmm_space[0], 0, 256); + memset(fx->xmm_space, 0, sizeof(fx->xmm_space)); /* * First two features are FPU and SSE, which above we handled diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 03aa33b58165..668a4a6533d9 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -269,6 +269,20 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end) CPU_ENTRY_AREA_TOTAL_SIZE)) return true; + /* + * When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU + * GSBASE value via __per_cpu_offset or pcpu_unit_offsets. + */ +#ifdef CONFIG_SMP + if (within_area(addr, end, (unsigned long)__per_cpu_offset, + sizeof(unsigned long) * nr_cpu_ids)) + return true; +#else + if (within_area(addr, end, (unsigned long)&pcpu_unit_offsets, + sizeof(pcpu_unit_offsets))) + return true; +#endif + for_each_possible_cpu(cpu) { /* The original rw GDT is being used after load_direct_gdt() */ if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu), @@ -293,6 +307,14 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end) (unsigned long)&per_cpu(cpu_tlbstate, cpu), sizeof(struct tlb_state))) return true; + + /* + * When in guest (X86_FEATURE_HYPERVISOR), local_db_save() + * will read per-cpu cpu_dr7 before clear dr7 register. + */ + if (within_area(addr, end, (unsigned long)&per_cpu(cpu_dr7, cpu), + sizeof(cpu_dr7))) + return true; } return false; @@ -491,15 +513,12 @@ static int hw_breakpoint_handler(struct die_args *args) struct perf_event *bp; unsigned long *dr6_p; unsigned long dr6; + bool bpx; /* The DR6 value is pointed by args->err */ dr6_p = (unsigned long *)ERR_PTR(args->err); dr6 = *dr6_p; - /* If it's a single step, TRAP bits are random */ - if (dr6 & DR_STEP) - return NOTIFY_DONE; - /* Do an early return if no trap bits are set in DR6 */ if ((dr6 & DR_TRAP_BITS) == 0) return NOTIFY_DONE; @@ -509,28 +528,29 @@ static int hw_breakpoint_handler(struct die_args *args) if (likely(!(dr6 & (DR_TRAP0 << i)))) continue; + bp = this_cpu_read(bp_per_reg[i]); + if (!bp) + continue; + + bpx = bp->hw.info.type == X86_BREAKPOINT_EXECUTE; + /* - * The counter may be concurrently released but that can only - * occur from a call_rcu() path. We can then safely fetch - * the breakpoint, use its callback, touch its counter - * while we are in an rcu_read_lock() path. + * TF and data breakpoints are traps and can be merged, however + * instruction breakpoints are faults and will be raised + * separately. + * + * However DR6 can indicate both TF and instruction + * breakpoints. In that case take TF as that has precedence and + * delay the instruction breakpoint for the next exception. 
*/ - rcu_read_lock(); + if (bpx && (dr6 & DR_STEP)) + continue; - bp = this_cpu_read(bp_per_reg[i]); /* * Reset the 'i'th TRAP bit in dr6 to denote completion of * exception handling */ (*dr6_p) &= ~(DR_TRAP0 << i); - /* - * bp can be NULL due to lazy debug register switching - * or due to concurrent perf counter removing. - */ - if (!bp) { - rcu_read_unlock(); - break; - } perf_bp_event(bp, args->regs); @@ -538,11 +558,10 @@ static int hw_breakpoint_handler(struct die_args *args) * Set up resume flag to avoid breakpoint recursion when * returning back to origin. */ - if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE) + if (bpx) args->regs->flags |= X86_EFLAGS_RF; - - rcu_read_unlock(); } + /* * Further processing in do_debug() is needed for a) user-space * breakpoints (to generate signals) and b) when the system has diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index c5dd50369e2f..d4ad344e80bf 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -21,6 +21,7 @@ #include <asm/hw_irq.h> #include <asm/desc.h> #include <asm/traps.h> +#include <asm/thermal.h> #define CREATE_TRACE_POINTS #include <asm/trace/irq_vectors.h> @@ -374,3 +375,23 @@ void fixup_irqs(void) } } #endif + +#ifdef CONFIG_X86_THERMAL_VECTOR +static void smp_thermal_vector(void) +{ + if (x86_thermal_enabled()) + intel_thermal_interrupt(); + else + pr_err("CPU%d: Unexpected LVT thermal interrupt!\n", + smp_processor_id()); +} + +DEFINE_IDTENTRY_SYSVEC(sysvec_thermal) +{ + trace_thermal_apic_entry(THERMAL_APIC_VECTOR); + inc_irq_stat(irq_thermal_count); + smp_thermal_vector(); + trace_thermal_apic_exit(THERMAL_APIC_VECTOR); + ack_APIC_irq(); +} +#endif diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S index 0db0375235b4..8ef35063964b 100644 --- a/arch/x86/kernel/irqflags.S +++ b/arch/x86/kernel/irqflags.S @@ -13,14 +13,3 @@ SYM_FUNC_START(native_save_fl) ret SYM_FUNC_END(native_save_fl) EXPORT_SYMBOL(native_save_fl) - -/* - * void native_restore_fl(unsigned long flags) - * %eax/%rdi: flags - */ -SYM_FUNC_START(native_restore_fl) - push %_ASM_ARG1 - popf - ret -SYM_FUNC_END(native_restore_fl) -EXPORT_SYMBOL(native_restore_fl) diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 8a67d1fa8dc5..ed8ac6bcbafb 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -182,6 +182,13 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) err = security_locked_down(LOCKDOWN_MSR); if (err) break; + + err = filter_write(regs[1]); + if (err) + return err; + + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + err = wrmsr_safe_regs_on_cpu(cpu, regs); if (err) break; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 6c3407ba6ee9..c60222ab8ab9 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -135,8 +135,7 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff, else if (opfunc == _paravirt_ident_64) ret = paravirt_patch_ident_64(insn_buff, len); - else if (type == PARAVIRT_PATCH(cpu.iret) || - type == PARAVIRT_PATCH(cpu.usergs_sysret64)) + else if (type == PARAVIRT_PATCH(cpu.iret)) /* If operation requires a jmp, then jmp */ ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len); #endif @@ -170,7 +169,6 @@ static u64 native_steal_clock(int cpu) /* These are in entry.S */ extern void native_iret(void); -extern void native_usergs_sysret64(void); static struct resource reserve_ioports = { .start = 0, @@ -310,9 +308,7 @@ struct paravirt_patch_template pv_ops = { .cpu.load_sp0 = native_load_sp0, - 
.cpu.usergs_sysret64 = native_usergs_sysret64, .cpu.iret = native_iret, - .cpu.swapgs = native_swapgs, #ifdef CONFIG_X86_IOPL_IOPERM .cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap, @@ -324,7 +320,6 @@ struct paravirt_patch_template pv_ops = { /* Irq ops. */ .irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), - .irq.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), .irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), .irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), .irq.safe_halt = native_safe_halt, diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c index ace6e334cb39..abd27ec67397 100644 --- a/arch/x86/kernel/paravirt_patch.c +++ b/arch/x86/kernel/paravirt_patch.c @@ -25,10 +25,7 @@ struct patch_xxl { const unsigned char mmu_read_cr2[3]; const unsigned char mmu_read_cr3[3]; const unsigned char mmu_write_cr3[3]; - const unsigned char irq_restore_fl[2]; const unsigned char cpu_wbinvd[2]; - const unsigned char cpu_usergs_sysret64[6]; - const unsigned char cpu_swapgs[3]; const unsigned char mov64[3]; }; @@ -39,11 +36,7 @@ static const struct patch_xxl patch_data_xxl = { .mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax .mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, %[re]ax .mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3 - .irq_restore_fl = { 0x57, 0x9d }, // push %rdi; popfq .cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd - .cpu_usergs_sysret64 = { 0x0f, 0x01, 0xf8, - 0x48, 0x0f, 0x07 }, // swapgs; sysretq - .cpu_swapgs = { 0x0f, 0x01, 0xf8 }, // swapgs .mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax }; @@ -76,7 +69,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr, switch (type) { #ifdef CONFIG_PARAVIRT_XXL - PATCH_CASE(irq, restore_fl, xxl, insn_buff, len); PATCH_CASE(irq, save_fl, xxl, insn_buff, len); PATCH_CASE(irq, irq_enable, xxl, insn_buff, len); PATCH_CASE(irq, irq_disable, xxl, insn_buff, len); @@ -85,8 +77,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr, PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len); PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len); - PATCH_CASE(cpu, usergs_sysret64, xxl, insn_buff, len); - PATCH_CASE(cpu, swapgs, xxl, insn_buff, len); PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len); #endif diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c index 2e9006c1e240..42e92ec62973 100644 --- a/arch/x86/kernel/pci-iommu_table.c +++ b/arch/x86/kernel/pci-iommu_table.c @@ -4,9 +4,6 @@ #include <linux/string.h> #include <linux/kallsyms.h> - -#define DEBUG 1 - static struct iommu_table_entry * __init find_dependents_of(struct iommu_table_entry *start, struct iommu_table_entry *finish, diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index bedca011459c..87a4143aa7d7 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -704,6 +704,9 @@ void ptrace_disable(struct task_struct *child) #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION static const struct user_regset_view user_x86_32_view; /* Initialized below. */ #endif +#ifdef CONFIG_X86_64 +static const struct user_regset_view user_x86_64_view; /* Initialized below. 
*/ +#endif long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) @@ -711,6 +714,14 @@ long arch_ptrace(struct task_struct *child, long request, int ret; unsigned long __user *datap = (unsigned long __user *)data; +#ifdef CONFIG_X86_64 + /* This is native 64-bit ptrace() */ + const struct user_regset_view *regset_view = &user_x86_64_view; +#else + /* This is native 32-bit ptrace() */ + const struct user_regset_view *regset_view = &user_x86_32_view; +#endif + switch (request) { /* read the word at location addr in the USER area. */ case PTRACE_PEEKUSR: { @@ -749,28 +760,28 @@ long arch_ptrace(struct task_struct *child, long request, case PTRACE_GETREGS: /* Get all gp regs from the child. */ return copy_regset_to_user(child, - task_user_regset_view(current), + regset_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, - task_user_regset_view(current), + regset_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, - task_user_regset_view(current), + regset_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. */ return copy_regset_from_user(child, - task_user_regset_view(current), + regset_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); @@ -1152,28 +1163,28 @@ static long x32_arch_ptrace(struct task_struct *child, case PTRACE_GETREGS: /* Get all gp regs from the child. */ return copy_regset_to_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. */ return copy_regset_from_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); @@ -1309,6 +1320,25 @@ void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask) xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; } +/* + * This is used by the core dump code to decide which regset to dump. The + * core dump code writes out the resulting .e_machine and the corresponding + * regsets. This is suboptimal if the task is messing around with its CS.L + * field, but at worst the core dump will end up missing some information. + * + * Unfortunately, it is also used by the broken PTRACE_GETREGSET and + * PTRACE_SETREGSET APIs. These APIs look at the .regsets field but have + * no way to make sure that the e_machine they use matches the caller's + * expectations. The result is that the data format returned by + * PTRACE_GETREGSET depends on the returned CS field (and even the offset + * of the returned CS field depends on its value!) and the data format + * accepted by PTRACE_SETREGSET is determined by the old CS value. The + * upshot is that it is basically impossible to use these APIs correctly. 
+ * + * The best way to fix it in the long run would probably be to add new + * improved ptrace() APIs to read and write registers reliably, possibly by + * allowing userspace to select the ELF e_machine variant that they expect. + */ const struct user_regset_view *task_user_regset_view(struct task_struct *task) { #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index db115943e8bd..9991c5920aac 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -477,6 +477,15 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = { }, }, + { /* PCIe Wifi card isn't detected after reboot otherwise */ + .callback = set_pci_reboot, + .ident = "Zotac ZBOX CI327 nano", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "NA"), + DMI_MATCH(DMI_PRODUCT_NAME, "ZBOX-CI327NANO-GS-01"), + }, + }, + /* Sony */ { /* Handle problems with rebooting on Sony VGN-Z540N */ .callback = set_bios_reboot, diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c index 7d04b356d44d..cdc04d091242 100644 --- a/arch/x86/kernel/sev-es-shared.c +++ b/arch/x86/kernel/sev-es-shared.c @@ -305,14 +305,14 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) case 0xe4: case 0xe5: *exitinfo |= IOIO_TYPE_IN; - *exitinfo |= (u64)insn->immediate.value << 16; + *exitinfo |= (u8)insn->immediate.value << 16; break; /* OUT immediate opcodes */ case 0xe6: case 0xe7: *exitinfo |= IOIO_TYPE_OUT; - *exitinfo |= (u64)insn->immediate.value << 16; + *exitinfo |= (u8)insn->immediate.value << 16; break; /* IN register opcodes */ diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c index 0bd1a0fc587e..84c1821819af 100644 --- a/arch/x86/kernel/sev-es.c +++ b/arch/x86/kernel/sev-es.c @@ -225,7 +225,7 @@ static inline u64 sev_es_rd_ghcb_msr(void) return __rdmsr(MSR_AMD64_SEV_ES_GHCB); } -static inline void sev_es_wr_ghcb_msr(u64 val) +static __always_inline void sev_es_wr_ghcb_msr(u64 val) { u32 low, high; @@ -286,6 +286,12 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, u16 d2; u8 d1; + /* If instruction ran in kernel mode and the I/O buffer is in kernel space */ + if (!user_mode(ctxt->regs) && !access_ok(target, size)) { + memcpy(dst, buf, size); + return ES_OK; + } + switch (size) { case 1: memcpy(&d1, buf, 1); @@ -335,6 +341,12 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, u16 d2; u8 d1; + /* If instruction ran in kernel mode and the I/O buffer is in kernel space */ + if (!user_mode(ctxt->regs) && !access_ok(s, size)) { + memcpy(buf, src, size); + return ES_OK; + } + switch (size) { case 1: if (get_user(d1, s)) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8ca66af96a54..02813a7f3a7c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -56,6 +56,7 @@ #include <linux/numa.h> #include <linux/pgtable.h> #include <linux/overflow.h> +#include <linux/syscore_ops.h> #include <asm/acpi.h> #include <asm/desc.h> @@ -1832,6 +1833,7 @@ void arch_set_max_freq_ratio(bool turbo_disabled) arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE : arch_turbo_freq_ratio; } +EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); static bool turbo_disabled(void) { @@ -2083,6 +2085,23 @@ static void init_counter_refs(void) this_cpu_write(arch_prev_mperf, mperf); } +#ifdef CONFIG_PM_SLEEP +static struct syscore_ops freq_invariance_syscore_ops = { + .resume = init_counter_refs, +}; + +static void register_freq_invariance_syscore_ops(void) +{ + /* Bail out if registered already. 
*/ + if (freq_invariance_syscore_ops.node.prev) + return; + + register_syscore_ops(&freq_invariance_syscore_ops); +} +#else +static inline void register_freq_invariance_syscore_ops(void) {} +#endif + static void init_freq_invariance(bool secondary, bool cppc_ready) { bool ret = false; @@ -2109,6 +2128,7 @@ static void init_freq_invariance(bool secondary, bool cppc_ready) if (ret) { init_counter_refs(); static_branch_enable(&arch_scale_freq_key); + register_freq_invariance_syscore_ops(); pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); } else { pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n"); diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 60d2c3798ba2..0f3c307b37b3 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -127,12 +127,17 @@ static int enable_single_step(struct task_struct *child) regs->flags |= X86_EFLAGS_TF; /* - * Always set TIF_SINGLESTEP - this guarantees that - * we single-step system calls etc.. This will also + * Always set TIF_SINGLESTEP. This will also * cause us to set TF when returning to user mode. */ set_tsk_thread_flag(child, TIF_SINGLESTEP); + /* + * Ensure that a trap is triggered once stepping out of a system + * call prior to executing any user instruction. + */ + set_task_syscall_work(child, SYSCALL_EXIT_TRAP); + oflags = regs->flags; /* Set TF on the kernel stack.. */ @@ -230,6 +235,7 @@ void user_disable_single_step(struct task_struct *child) /* Always clear TIF_SINGLESTEP... */ clear_tsk_thread_flag(child, TIF_SINGLESTEP); + clear_task_syscall_work(child, SYSCALL_EXIT_TRAP); /* But touch TF only if it was set by us.. */ if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF)) diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 504fa5425bce..660b78827638 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -90,14 +90,10 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, off) { - long error; - error = -EINVAL; if (off & ~PAGE_MASK) - goto out; + return -EINVAL; - error = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); -out: - return error; + return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); } static void find_start_end(unsigned long addr, unsigned long flags, diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 764573de3996..e5a7a10a0164 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -134,7 +134,11 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) unsafe_put_user(regs->ds, &user->regs.ds, Efault_end); unsafe_put_user(regs->fs, &user->regs.fs, Efault_end); unsafe_put_user(regs->gs, &user->regs.gs, Efault_end); - unsafe_put_user(vm86->screen_bitmap, &user->screen_bitmap, Efault_end); + + /* + * Don't write screen_bitmap in case some user had a value there + * and expected it to remain unchanged. 
+ */ user_access_end(); @@ -160,49 +164,6 @@ Efault: do_exit(SIGSEGV); } -static void mark_screen_rdonly(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - spinlock_t *ptl; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - int i; - - mmap_write_lock(mm); - pgd = pgd_offset(mm, 0xA0000); - if (pgd_none_or_clear_bad(pgd)) - goto out; - p4d = p4d_offset(pgd, 0xA0000); - if (p4d_none_or_clear_bad(p4d)) - goto out; - pud = pud_offset(p4d, 0xA0000); - if (pud_none_or_clear_bad(pud)) - goto out; - pmd = pmd_offset(pud, 0xA0000); - - if (pmd_trans_huge(*pmd)) { - vma = find_vma(mm, 0xA0000); - split_huge_pmd(vma, pmd, 0xA0000); - } - if (pmd_none_or_clear_bad(pmd)) - goto out; - pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); - for (i = 0; i < 32; i++) { - if (pte_present(*pte)) - set_pte(pte, pte_wrprotect(*pte)); - pte++; - } - pte_unmap_unlock(pte, ptl); -out: - mmap_write_unlock(mm); - flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false); -} - - - static int do_vm86_irq_handling(int subfunction, int irqnumber); static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus); @@ -282,6 +243,15 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) offsetof(struct vm86_struct, int_revectored))) return -EFAULT; + + /* VM86_SCREEN_BITMAP had numerous bugs and appears to have no users. */ + if (v.flags & VM86_SCREEN_BITMAP) { + char comm[TASK_COMM_LEN]; + + pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n", get_task_comm(comm, current)); + return -EINVAL; + } + memset(&vm86regs, 0, sizeof(vm86regs)); vm86regs.pt.bx = v.regs.ebx; @@ -302,7 +272,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) vm86regs.gs = v.regs.gs; vm86->flags = v.flags; - vm86->screen_bitmap = v.screen_bitmap; vm86->cpu_type = v.cpu_type; if (copy_from_user(&vm86->int_revectored, @@ -370,9 +339,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) update_task_stack(tsk); preempt_enable(); - if (vm86->flags & VM86_SCREEN_BITMAP) - mark_screen_rdonly(tsk->mm); - memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs)); return regs->ax; } diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 13036cf0b912..38172ca627d3 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -321,7 +321,7 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, if (cpuid->nent < vcpu->arch.cpuid_nent) goto out; r = -EFAULT; - if (copy_to_user(entries, &vcpu->arch.cpuid_entries, + if (copy_to_user(entries, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) goto out; return 0; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 56cae1ff9e3f..66a08322988f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2879,6 +2879,8 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); *reg_write(ctxt, VCPU_REGS_RSP) = (efer & EFER_LMA) ? 
msr_data : (u32)msr_data; + if (efer & EFER_LMA) + ctxt->mode = X86EMUL_MODE_PROT64; return X86EMUL_CONTINUE; } diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index f15bc16de07c..a889563ad02d 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -9,31 +9,6 @@ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE) -static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu, - enum kvm_reg reg) -{ - return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); -} - -static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu, - enum kvm_reg reg) -{ - return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); -} - -static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu, - enum kvm_reg reg) -{ - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); -} - -static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu, - enum kvm_reg reg) -{ - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); -} - #define BUILD_KVM_GPR_ACCESSORS(lname, uname) \ static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\ { \ @@ -43,7 +18,6 @@ static __always_inline void kvm_##lname##_write(struct kvm_vcpu *vcpu, \ unsigned long val) \ { \ vcpu->arch.regs[VCPU_REGS_##uname] = val; \ - kvm_register_mark_dirty(vcpu, VCPU_REGS_##uname); \ } BUILD_KVM_GPR_ACCESSORS(rax, RAX) BUILD_KVM_GPR_ACCESSORS(rbx, RBX) @@ -63,6 +37,31 @@ BUILD_KVM_GPR_ACCESSORS(r14, R14) BUILD_KVM_GPR_ACCESSORS(r15, R15) #endif +static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); +} + +static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); +} + +static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); +} + +static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); +} + static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg) { if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS)) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3136e05831cf..43cceadd073e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -674,7 +674,7 @@ static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) (unsigned long long)vcpu->arch.pv_eoi.msr_val); return false; } - return val & 0x1; + return val & KVM_PV_EOI_ENABLED; } static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) @@ -2898,7 +2898,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) /* evaluate pending_events before reading the vector */ smp_rmb(); sipi_vector = apic->sipi_vector; - kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector); + kvm_x86_ops.vcpu_deliver_sipi_vector(vcpu, sipi_vector); vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 9c4a9c8e43d9..261be1d2032b 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -44,12 +44,19 @@ #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 -static inline u64 rsvd_bits(int s, int e) +static __always_inline u64 rsvd_bits(int s, int e) { + BUILD_BUG_ON(__builtin_constant_p(e) 
&& __builtin_constant_p(s) && e < s); + + if (__builtin_constant_p(e)) + BUILD_BUG_ON(e > 63); + else + e &= 63; + if (e < s) return 0; - return ((1ULL << (e - s + 1)) - 1) << s; + return ((2ULL << (e - s)) - 1) << s; } void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c478904af518..6d16481aa29d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3493,26 +3493,25 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. */ -static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes) +static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) { struct kvm_shadow_walk_iterator iterator; - int leaf = vcpu->arch.mmu->root_level; + int leaf = -1; u64 spte; - walk_shadow_page_lockless_begin(vcpu); - for (shadow_walk_init(&iterator, vcpu, addr); + for (shadow_walk_init(&iterator, vcpu, addr), + *root_level = iterator.level; shadow_walk_okay(&iterator); __shadow_walk_next(&iterator, spte)) { leaf = iterator.level; spte = mmu_spte_get_lockless(iterator.sptep); - sptes[leaf - 1] = spte; + sptes[leaf] = spte; if (!is_shadow_present_pte(spte)) break; - } walk_shadow_page_lockless_end(vcpu); @@ -3520,14 +3519,12 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes) return leaf; } -/* return true if reserved bit is detected on spte. */ +/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) { - u64 sptes[PT64_ROOT_MAX_LEVEL]; + u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; struct rsvd_bits_validate *rsvd_check; - int root = vcpu->arch.mmu->shadow_root_level; - int leaf; - int level; + int root, leaf, level; bool reserved = false; if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) { @@ -3536,35 +3533,45 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) } if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) - leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes); + leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); else - leaf = get_walk(vcpu, addr, sptes); + leaf = get_walk(vcpu, addr, sptes, &root); + + if (unlikely(leaf < 0)) { + *sptep = 0ull; + return reserved; + } + + *sptep = sptes[leaf]; + + /* + * Skip reserved bits checks on the terminal leaf if it's not a valid + * SPTE. Note, this also (intentionally) skips MMIO SPTEs, which, by + * design, always have reserved bits set. The purpose of the checks is + * to detect reserved bits on non-MMIO SPTEs. i.e. buggy SPTEs. + */ + if (!is_shadow_present_pte(sptes[leaf])) + leaf++; rsvd_check = &vcpu->arch.mmu->shadow_zero_check; - for (level = root; level >= leaf; level--) { - if (!is_shadow_present_pte(sptes[level - 1])) - break; + for (level = root; level >= leaf; level--) /* * Use a bitwise-OR instead of a logical-OR to aggregate the * reserved bit and EPT's invalid memtype/XWR checks to avoid * adding a Jcc in the loop. 
*/ - reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level - 1]) | - __is_rsvd_bits_set(rsvd_check, sptes[level - 1], - level); - } + reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level]) | + __is_rsvd_bits_set(rsvd_check, sptes[level], level); if (reserved) { pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n", __func__, addr); for (level = root; level >= leaf; level--) pr_err("------ spte 0x%llx level %d.\n", - sptes[level - 1], level); + sptes[level], level); } - *sptep = sptes[leaf - 1]; - return reserved; } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 4bd2f1dc0172..b56d604809b8 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -44,7 +44,48 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); } -#define for_each_tdp_mmu_root(_kvm, _root) \ +static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) +{ + if (kvm_mmu_put_root(kvm, root)) + kvm_tdp_mmu_free_root(kvm, root); +} + +static inline bool tdp_mmu_next_root_valid(struct kvm *kvm, + struct kvm_mmu_page *root) +{ + lockdep_assert_held(&kvm->mmu_lock); + + if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link)) + return false; + + kvm_mmu_get_root(kvm, root); + return true; + +} + +static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, + struct kvm_mmu_page *root) +{ + struct kvm_mmu_page *next_root; + + next_root = list_next_entry(root, link); + tdp_mmu_put_root(kvm, root); + return next_root; +} + +/* + * Note: this iterator gets and puts references to the roots it iterates over. + * This makes it safe to release the MMU lock and yield within the loop, but + * if exiting the loop early, the caller must drop the reference to the most + * recent root. (Unless keeping a live reference is desirable.) + */ +#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \ + for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots, \ + typeof(*_root), link); \ + tdp_mmu_next_root_valid(_kvm, _root); \ + _root = tdp_mmu_next_root(_kvm, _root)) + +#define for_each_tdp_mmu_root(_kvm, _root) \ list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) @@ -447,18 +488,9 @@ bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end) struct kvm_mmu_page *root; bool flush = false; - for_each_tdp_mmu_root(kvm, root) { - /* - * Take a reference on the root so that it cannot be freed if - * this thread releases the MMU lock and yields in this loop. - */ - kvm_mmu_get_root(kvm, root); - + for_each_tdp_mmu_root_yield_safe(kvm, root) flush |= zap_gfn_range(kvm, root, start, end, true); - kvm_mmu_put_root(kvm, root); - } - return flush; } @@ -619,13 +651,7 @@ static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start, int ret = 0; int as_id; - for_each_tdp_mmu_root(kvm, root) { - /* - * Take a reference on the root so that it cannot be freed if - * this thread releases the MMU lock and yields in this loop. 
- */ - kvm_mmu_get_root(kvm, root); - + for_each_tdp_mmu_root_yield_safe(kvm, root) { as_id = kvm_mmu_page_as_id(root); slots = __kvm_memslots(kvm, as_id); kvm_for_each_memslot(memslot, slots) { @@ -647,8 +673,6 @@ static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start, ret |= handler(kvm, memslot, root, gfn_start, gfn_end, data); } - - kvm_mmu_put_root(kvm, root); } return ret; @@ -838,21 +862,13 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, int root_as_id; bool spte_set = false; - for_each_tdp_mmu_root(kvm, root) { + for_each_tdp_mmu_root_yield_safe(kvm, root) { root_as_id = kvm_mmu_page_as_id(root); if (root_as_id != slot->as_id) continue; - /* - * Take a reference on the root so that it cannot be freed if - * this thread releases the MMU lock and yields in this loop. - */ - kvm_mmu_get_root(kvm, root); - spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages, min_level); - - kvm_mmu_put_root(kvm, root); } return spte_set; @@ -906,21 +922,13 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot) int root_as_id; bool spte_set = false; - for_each_tdp_mmu_root(kvm, root) { + for_each_tdp_mmu_root_yield_safe(kvm, root) { root_as_id = kvm_mmu_page_as_id(root); if (root_as_id != slot->as_id) continue; - /* - * Take a reference on the root so that it cannot be freed if - * this thread releases the MMU lock and yields in this loop. - */ - kvm_mmu_get_root(kvm, root); - spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages); - - kvm_mmu_put_root(kvm, root); } return spte_set; @@ -1029,28 +1037,20 @@ bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) int root_as_id; bool spte_set = false; - for_each_tdp_mmu_root(kvm, root) { + for_each_tdp_mmu_root_yield_safe(kvm, root) { root_as_id = kvm_mmu_page_as_id(root); if (root_as_id != slot->as_id) continue; - /* - * Take a reference on the root so that it cannot be freed if - * this thread releases the MMU lock and yields in this loop. - */ - kvm_mmu_get_root(kvm, root); - spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages); - - kvm_mmu_put_root(kvm, root); } return spte_set; } /* - * Clear non-leaf entries (and free associated page tables) which could - * be replaced by large mappings, for GFNs within the slot. + * Clear leaf entries which could be replaced by large mappings, for + * GFNs within the slot. */ static void zap_collapsible_spte_range(struct kvm *kvm, struct kvm_mmu_page *root, @@ -1062,7 +1062,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm, tdp_root_for_each_pte(iter, root, start, end) { if (!is_shadow_present_pte(iter.old_spte) || - is_last_spte(iter.old_spte, iter.level)) + !is_last_spte(iter.old_spte, iter.level)) continue; pfn = spte_to_pfn(iter.old_spte); @@ -1089,21 +1089,13 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, struct kvm_mmu_page *root; int root_as_id; - for_each_tdp_mmu_root(kvm, root) { + for_each_tdp_mmu_root_yield_safe(kvm, root) { root_as_id = kvm_mmu_page_as_id(root); if (root_as_id != slot->as_id) continue; - /* - * Take a reference on the root so that it cannot be freed if - * this thread releases the MMU lock and yields in this loop. 
- */ - kvm_mmu_get_root(kvm, root); - zap_collapsible_spte_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages); - - kvm_mmu_put_root(kvm, root); } } @@ -1160,16 +1152,19 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. */ -int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes) +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + int *root_level) { struct tdp_iter iter; struct kvm_mmu *mmu = vcpu->arch.mmu; - int leaf = vcpu->arch.mmu->shadow_root_level; gfn_t gfn = addr >> PAGE_SHIFT; + int leaf = -1; + + *root_level = vcpu->arch.mmu->shadow_root_level; tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { leaf = iter.level; - sptes[leaf - 1] = iter.old_spte; + sptes[leaf] = iter.old_spte; } return leaf; diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 556e065503f6..cbbdbadd1526 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -44,5 +44,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn); -int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes); +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + int *root_level); + #endif /* __KVM_X86_MMU_TDP_MMU_H */ diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index b0b667456b2e..db30670dd8c4 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -199,6 +199,10 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + + if (WARN_ON(!is_guest_mode(vcpu))) + return true; + if (!nested_svm_vmrun_msrpm(svm)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = @@ -227,6 +231,7 @@ static bool nested_vmcb_check_controls(struct vmcb_control_area *control) static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12) { + struct kvm_vcpu *vcpu = &svm->vcpu; bool vmcb12_lma; if ((vmcb12->save.efer & EFER_SVME) == 0) @@ -240,18 +245,10 @@ static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12) vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG); - if (!vmcb12_lma) { - if (vmcb12->save.cr4 & X86_CR4_PAE) { - if (vmcb12->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK) - return false; - } else { - if (vmcb12->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK) - return false; - } - } else { + if (vmcb12_lma) { if (!(vmcb12->save.cr4 & X86_CR4_PAE) || !(vmcb12->save.cr0 & X86_CR0_PE) || - (vmcb12->save.cr3 & MSR_CR3_LONG_MBZ_MASK)) + (vmcb12->save.cr3 & vcpu->arch.cr3_lm_rsvd_bits)) return false; } if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4)) @@ -595,6 +592,8 @@ int nested_svm_vmexit(struct vcpu_svm *svm) svm->nested.vmcb12_gpa = 0; WARN_ON_ONCE(svm->nested.nested_run_pending); + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu); + /* in case we halted in L2 */ svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -754,6 +753,7 @@ void svm_leave_nested(struct vcpu_svm *svm) leave_guest_mode(&svm->vcpu); copy_vmcb_control_area(&vmcb->control, &hsave->control); nested_svm_uninit_mmu_context(&svm->vcpu); + vmcb_mark_all_dirty(svm->vmcb); } kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu); @@ -1194,6 +1194,10 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, * in the registers, the save area of 
the nested state instead * contains saved L1 state. */ + + svm->nested.nested_run_pending = + !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + copy_vmcb_control_area(&hsave->control, &svm->vmcb->control); hsave->save = *save; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 9858d5ae9ddd..48017fef1cd9 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -342,6 +342,8 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, unsigned long first, last; int ret; + lockdep_assert_held(&kvm->lock); + if (ulen == 0 || uaddr + ulen < uaddr) return ERR_PTR(-EINVAL); @@ -1119,12 +1121,20 @@ int svm_register_enc_region(struct kvm *kvm, if (!region) return -ENOMEM; + mutex_lock(&kvm->lock); region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, 1); if (IS_ERR(region->pages)) { ret = PTR_ERR(region->pages); + mutex_unlock(&kvm->lock); goto e_free; } + region->uaddr = range->addr; + region->size = range->size; + + list_add_tail(®ion->list, &sev->regions_list); + mutex_unlock(&kvm->lock); + /* * The guest may change the memory encryption attribute from C=0 -> C=1 * or vice versa for this memory range. Lets make sure caches are @@ -1133,13 +1143,6 @@ int svm_register_enc_region(struct kvm *kvm, */ sev_clflush_pages(region->pages, region->npages); - region->uaddr = range->addr; - region->size = range->size; - - mutex_lock(&kvm->lock); - list_add_tail(®ion->list, &sev->regions_list); - mutex_unlock(&kvm->lock); - return ret; e_free: @@ -1415,16 +1418,13 @@ static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) * to be returned: * GPRs RAX, RBX, RCX, RDX * - * Copy their values to the GHCB if they are dirty. + * Copy their values, even if they may not have been written during the + * VM-Exit. It's the guest's responsibility to not consume random data. */ - if (kvm_register_is_dirty(vcpu, VCPU_REGS_RAX)) - ghcb_set_rax(ghcb, vcpu->arch.regs[VCPU_REGS_RAX]); - if (kvm_register_is_dirty(vcpu, VCPU_REGS_RBX)) - ghcb_set_rbx(ghcb, vcpu->arch.regs[VCPU_REGS_RBX]); - if (kvm_register_is_dirty(vcpu, VCPU_REGS_RCX)) - ghcb_set_rcx(ghcb, vcpu->arch.regs[VCPU_REGS_RCX]); - if (kvm_register_is_dirty(vcpu, VCPU_REGS_RDX)) - ghcb_set_rdx(ghcb, vcpu->arch.regs[VCPU_REGS_RDX]); + ghcb_set_rax(ghcb, vcpu->arch.regs[VCPU_REGS_RAX]); + ghcb_set_rbx(ghcb, vcpu->arch.regs[VCPU_REGS_RBX]); + ghcb_set_rcx(ghcb, vcpu->arch.regs[VCPU_REGS_RCX]); + ghcb_set_rdx(ghcb, vcpu->arch.regs[VCPU_REGS_RDX]); } static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) @@ -1563,6 +1563,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) goto vmgexit_err; break; case SVM_VMGEXIT_NMI_COMPLETE: + case SVM_VMGEXIT_AP_HLT_LOOP: case SVM_VMGEXIT_AP_JUMP_TABLE: case SVM_VMGEXIT_UNSUPPORTED_EVENT: break; @@ -1888,6 +1889,9 @@ int sev_handle_vmgexit(struct vcpu_svm *svm) case SVM_VMGEXIT_NMI_COMPLETE: ret = svm_invoke_exit_handler(svm, SVM_EXIT_IRET); break; + case SVM_VMGEXIT_AP_HLT_LOOP: + ret = kvm_emulate_ap_reset_hold(&svm->vcpu); + break; case SVM_VMGEXIT_AP_JUMP_TABLE: { struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info; @@ -2001,7 +2005,7 @@ void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu) * of which one step is to perform a VMLOAD. Since hardware does not * perform a VMSAVE on VMRUN, the host savearea must be updated. 
*/ - asm volatile(__ex("vmsave") : : "a" (__sme_page_pa(sd->save_area)) : "memory"); + asm volatile(__ex("vmsave %0") : : "a" (__sme_page_pa(sd->save_area)) : "memory"); /* * Certain MSRs are restored on VMEXIT, only save ones that aren't @@ -2040,3 +2044,21 @@ void sev_es_vcpu_put(struct vcpu_svm *svm) wrmsrl(host_save_user_msrs[i].index, svm->host_user_msrs[i]); } } + +void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* First SIPI: Use the values as initially set by the VMM */ + if (!svm->received_first_sipi) { + svm->received_first_sipi = true; + return; + } + + /* + * Subsequent SIPI: Return from an AP Reset Hold VMGEXIT, where + * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a + * non-zero value. + */ + ghcb_set_sw_exit_info_2(svm->ghcb, 1); +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index cce0143a6f80..3442d44ca53b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -454,6 +454,11 @@ static int has_svm(void) return 0; } + if (sev_active()) { + pr_info("KVM is unsupported when running as an SEV guest\n"); + return 0; + } + return 1; } @@ -3677,8 +3682,6 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) return EXIT_FASTPATH_NONE; } -void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs); - static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, struct vcpu_svm *svm) { @@ -3741,6 +3744,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + trace_kvm_entry(vcpu); + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; @@ -4384,6 +4389,14 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT)); } +static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) +{ + if (!sev_es_guest(vcpu->kvm)) + return kvm_vcpu_deliver_sipi_vector(vcpu, vector); + + sev_vcpu_deliver_sipi_vector(vcpu, vector); +} + static void svm_vm_destroy(struct kvm *kvm) { avic_vm_destroy(kvm); @@ -4526,6 +4539,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .msr_filter_changed = svm_msr_filter_changed, .complete_emulated_msr = svm_complete_emulated_msr, + + .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, }; static struct kvm_x86_init_ops svm_init_ops __initdata = { diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 5431e6335e2e..6e7d070f8b86 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -185,6 +185,7 @@ struct vcpu_svm { struct vmcb_save_area *vmsa; struct ghcb *ghcb; struct kvm_host_map ghcb_map; + bool received_first_sipi; /* SEV-ES scratch area support */ void *ghcb_sa; @@ -402,9 +403,6 @@ static inline bool gif_set(struct vcpu_svm *svm) } /* svm.c */ -#define MSR_CR3_LEGACY_RESERVED_MASK 0xfe7U -#define MSR_CR3_LEGACY_PAE_RESERVED_MASK 0x7U -#define MSR_CR3_LONG_MBZ_MASK 0xfff0000000000000U #define MSR_INVALID 0xffffffffU extern int sev; @@ -591,6 +589,7 @@ void sev_es_init_vmcb(struct vcpu_svm *svm); void sev_es_create_vcpu(struct vcpu_svm *svm); void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu); void sev_es_vcpu_put(struct vcpu_svm *svm); +void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); /* vmenter.S */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e2f26564a12d..f2b9bfb58206 100644 --- a/arch/x86/kvm/vmx/nested.c +++ 
b/arch/x86/kvm/vmx/nested.c @@ -3124,13 +3124,9 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) return 0; } -static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) +static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_host_map *map; - struct page *page; - u64 hpa; /* * hv_evmcs may end up being not mapped after migration (when @@ -3153,6 +3149,17 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) } } + return true; +} + +static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_host_map *map; + struct page *page; + u64 hpa; + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { /* * Translate L1 physical address to host physical @@ -3221,6 +3228,18 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); else exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); + + return true; +} + +static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) +{ + if (!nested_get_evmcs_page(vcpu)) + return false; + + if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) + return false; + return true; } @@ -4442,6 +4461,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); + /* Service the TLB flush request for L2 before switching to L1. */ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) kvm_vcpu_flush_tlb_current(vcpu); @@ -6075,11 +6096,14 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, if (is_guest_mode(vcpu)) { sync_vmcs02_to_vmcs12(vcpu, vmcs12); sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); - } else if (!vmx->nested.need_vmcs12_to_shadow_sync) { - if (vmx->nested.hv_evmcs) - copy_enlightened_to_vmcs12(vmx); - else if (enable_shadow_vmcs) - copy_shadow_to_vmcs12(vmx); + } else { + copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); + if (!vmx->nested.need_vmcs12_to_shadow_sync) { + if (vmx->nested.hv_evmcs) + copy_enlightened_to_vmcs12(vmx); + else if (enable_shadow_vmcs) + copy_shadow_to_vmcs12(vmx); + } } BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); @@ -6600,7 +6624,7 @@ struct kvm_x86_nested_ops vmx_nested_ops = { .hv_timer_pending = nested_vmx_preemption_timer_pending, .get_state = vmx_get_nested_state, .set_state = vmx_set_nested_state, - .get_nested_state_pages = nested_get_vmcs12_pages, + .get_nested_state_pages = vmx_get_nested_state_pages, .write_log_dirty = nested_vmx_write_pml_buffer, .enable_evmcs = nested_enable_evmcs, .get_evmcs_version = nested_get_evmcs_version, diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index a886a47daebd..cdf5f34518f4 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -29,7 +29,7 @@ static struct kvm_event_hw_type_mapping intel_arch_events[] = { [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, - [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES }, + [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES }, }; /* mapping between fixed pmc index and intel_arch_events array */ @@ -345,7 +345,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->nr_arch_gp_counters = min_t(int, 
eax.split.num_counters, x86_pmu.num_counters_gp); + eax.split.bit_width = min_t(int, eax.split.bit_width, x86_pmu.bit_width_gp); pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; + eax.split.mask_length = min_t(int, eax.split.mask_length, x86_pmu.events_mask_len); pmu->available_event_types = ~entry->ebx & ((1ull << eax.split.mask_length) - 1); @@ -355,6 +357,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->nr_arch_fixed_counters = min_t(int, edx.split.num_counters_fixed, x86_pmu.num_counters_fixed); + edx.split.bit_width_fixed = min_t(int, + edx.split.bit_width_fixed, x86_pmu.bit_width_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << edx.split.bit_width_fixed) - 1; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 75c9c6a0a3a4..eb69fef57485 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6653,6 +6653,8 @@ reenter_guest: if (vmx->emulation_required) return EXIT_FASTPATH_NONE; + trace_kvm_entry(vcpu); + if (vmx->ple_window_dirty) { vmx->ple_window_dirty = false; vmcs_write32(PLE_WINDOW, vmx->ple_window); @@ -6858,11 +6860,20 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) switch (index) { case MSR_IA32_TSX_CTRL: /* - * No need to pass TSX_CTRL_CPUID_CLEAR through, so - * let's avoid changing CPUID bits under the host - * kernel's feet. + * TSX_CTRL_CPUID_CLEAR is handled in the CPUID + * interception. Keep the host value unchanged to avoid + * changing CPUID bits under the host kernel's feet. + * + * hle=0, rtm=0, tsx_ctrl=1 can be found with some + * combinations of new kernel and old userspace. If + * those guests run on a tsx=off host, do allow guests + * to use TSX_CTRL, but do not change the value on the + * host so that TSX remains always disabled. */ - vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; + if (boot_cpu_has(X86_FEATURE_RTM)) + vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; + else + vmx->guest_uret_msrs[j].mask = 0; break; default: vmx->guest_uret_msrs[j].mask = -1ull; @@ -7707,6 +7718,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .msr_filter_changed = vmx_msr_filter_changed, .complete_emulated_msr = kvm_complete_insn_gp, .cpu_dirty_log_size = vmx_cpu_dirty_log_size, + + .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, }; static __init int hardware_setup(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3f7c1fc7a3ce..1b404e4d7dd8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -105,6 +105,7 @@ static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); +static void process_smi(struct kvm_vcpu *vcpu); static void enter_smm(struct kvm_vcpu *vcpu); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); static void store_regs(struct kvm_vcpu *vcpu); @@ -1393,16 +1394,24 @@ static u64 kvm_get_arch_capabilities(void) if (!boot_cpu_has_bug(X86_BUG_MDS)) data |= ARCH_CAP_MDS_NO; - /* - * On TAA affected systems: - * - nothing to do if TSX is disabled on the host. - * - we emulate TSX_CTRL if present on the host. - * This lets the guest use VERW to clear CPU buffers. - */ - if (!boot_cpu_has(X86_FEATURE_RTM)) - data &= ~(ARCH_CAP_TAA_NO | ARCH_CAP_TSX_CTRL_MSR); - else if (!boot_cpu_has_bug(X86_BUG_TAA)) + if (!boot_cpu_has(X86_FEATURE_RTM)) { + /* + * If RTM=0 because the kernel has disabled TSX, the host might + * have TAA_NO or TSX_CTRL. 
Clear TAA_NO (the guest sees RTM=0 + * and therefore knows that there cannot be TAA) but keep + * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts, + * and we want to allow migrating those guests to tsx=off hosts. + */ + data &= ~ARCH_CAP_TAA_NO; + } else if (!boot_cpu_has_bug(X86_BUG_TAA)) { data |= ARCH_CAP_TAA_NO; + } else { + /* + * Nothing to do here; we emulate TSX_CTRL if present on the + * host so the guest can choose between disabling TSX or + * using VERW to clear CPU buffers. + */ + } return data; } @@ -4230,6 +4239,9 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, { process_nmi(vcpu); + if (kvm_check_request(KVM_REQ_SMI, vcpu)) + process_smi(vcpu); + /* * In guest mode, payload delivery should be deferred, * so that the L1 hypervisor can intercept #PF before @@ -7976,17 +7988,22 @@ void kvm_arch_exit(void) kmem_cache_destroy(x86_fpu_cache); } -int kvm_vcpu_halt(struct kvm_vcpu *vcpu) +static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason) { ++vcpu->stat.halt_exits; if (lapic_in_kernel(vcpu)) { - vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + vcpu->arch.mp_state = state; return 1; } else { - vcpu->run->exit_reason = KVM_EXIT_HLT; + vcpu->run->exit_reason = reason; return 0; } } + +int kvm_vcpu_halt(struct kvm_vcpu *vcpu) +{ + return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); +} EXPORT_SYMBOL_GPL(kvm_vcpu_halt); int kvm_emulate_halt(struct kvm_vcpu *vcpu) @@ -8000,6 +8017,14 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_halt); +int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) +{ + int ret = kvm_skip_emulated_instruction(vcpu); + + return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, KVM_EXIT_AP_RESET_HOLD) && ret; +} +EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold); + #ifdef CONFIG_X86_64 static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, unsigned long clock_type) @@ -8973,8 +8998,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops.request_immediate_exit(vcpu); } - trace_kvm_entry(vcpu); - fpregs_assert_state_consistent(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) switch_fpu_return(); @@ -9094,6 +9117,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) kvm_apic_accept_events(vcpu); switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: + case KVM_MP_STATE_AP_RESET_HOLD: vcpu->arch.pv.pv_unhalted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -9520,8 +9544,9 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, kvm_load_guest_fpu(vcpu); kvm_apic_accept_events(vcpu); - if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED && - vcpu->arch.pv.pv_unhalted) + if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED || + vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) && + vcpu->arch.pv.pv_unhalted) mp_state->mp_state = KVM_MP_STATE_RUNNABLE; else mp_state->mp_state = vcpu->arch.mp_state; @@ -9599,6 +9624,8 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) */ if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA)) return false; + if (sregs->cr3 & vcpu->arch.cr3_lm_rsvd_bits) + return false; } else { /* * Not in 64-bit mode: EFER.LMA is clear and the code @@ -9976,6 +10003,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) fx_init(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); + vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63); vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; @@ -10152,6 +10180,7 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 
vector) kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); kvm_rip_write(vcpu, 0); } +EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector); int kvm_arch_hardware_enable(void) { @@ -10476,7 +10505,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, return 0; old_npages = slot->npages; - hva = 0; + hva = slot->userspace_addr; } for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { @@ -11538,6 +11567,7 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size, } EXPORT_SYMBOL_GPL(kvm_sev_es_string_io); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index c5ee0f5ce0f1..0f727b50bd3d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -425,6 +425,8 @@ bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type); __reserved_bits |= X86_CR4_UMIP; \ if (!__cpu_has(__c, X86_FEATURE_VMX)) \ __reserved_bits |= X86_CR4_VMXE; \ + if (!__cpu_has(__c, X86_FEATURE_PCID)) \ + __reserved_bits |= X86_CR4_PCIDE; \ __reserved_bits; \ }) diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c index 4321fa02e18d..419365c48b2a 100644 --- a/arch/x86/lib/mmx_32.c +++ b/arch/x86/lib/mmx_32.c @@ -26,6 +26,16 @@ #include <asm/fpu/api.h> #include <asm/asm.h> +/* + * Use KFPU_387. MMX instructions are not affected by MXCSR, + * but both AMD and Intel documentation states that even integer MMX + * operations will result in #MF if an exception is pending in FCW. + * + * EMMS is not needed afterwards because, after calling kernel_fpu_end(), + * any subsequent user of the 387 stack will reinitialize it using + * KFPU_387. + */ + void *_mmx_memcpy(void *to, const void *from, size_t len) { void *p; @@ -37,7 +47,7 @@ void *_mmx_memcpy(void *to, const void *from, size_t len) p = to; i = len >> 6; /* len/64 */ - kernel_fpu_begin(); + kernel_fpu_begin_mask(KFPU_387); __asm__ __volatile__ ( "1: prefetch (%0)\n" /* This set is 28 bytes */ @@ -127,7 +137,7 @@ static void fast_clear_page(void *page) { int i; - kernel_fpu_begin(); + kernel_fpu_begin_mask(KFPU_387); __asm__ __volatile__ ( " pxor %%mm0, %%mm0\n" : : @@ -160,7 +170,7 @@ static void fast_copy_page(void *to, void *from) { int i; - kernel_fpu_begin(); + kernel_fpu_begin_mask(KFPU_387); /* * maybe the prefetch stuff can go before the expensive fnsave... @@ -247,7 +257,7 @@ static void fast_clear_page(void *page) { int i; - kernel_fpu_begin(); + kernel_fpu_begin_mask(KFPU_387); __asm__ __volatile__ ( " pxor %%mm0, %%mm0\n" : : @@ -282,7 +292,7 @@ static void fast_copy_page(void *to, void *from) { int i; - kernel_fpu_begin(); + kernel_fpu_begin_mask(KFPU_387); __asm__ __volatile__ ( "1: prefetch (%0)\n" diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f1f1b5a0956a..525197381baa 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -16,7 +16,7 @@ #include <linux/prefetch.h> /* prefetchw */ #include <linux/context_tracking.h> /* exception_enter(), ... */ #include <linux/uaccess.h> /* faulthandler_disabled() */ -#include <linux/efi.h> /* efi_recover_from_page_fault()*/ +#include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/ #include <linux/mm_types.h> #include <asm/cpufeature.h> /* boot_cpu_has, ... 
*/ @@ -25,7 +25,7 @@ #include <asm/vsyscall.h> /* emulate_vsyscall */ #include <asm/vm86.h> /* struct vm86 */ #include <asm/mmu_context.h> /* vma_pkey() */ -#include <asm/efi.h> /* efi_recover_from_page_fault()*/ +#include <asm/efi.h> /* efi_crash_gracefully_on_page_fault()*/ #include <asm/desc.h> /* store_idt(), ... */ #include <asm/cpu_entry_area.h> /* exception stack */ #include <asm/pgtable_areas.h> /* VMALLOC_START, ... */ @@ -54,7 +54,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr) * 32-bit mode: * * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. - * Check that here and ignore it. + * Check that here and ignore it. This is AMD erratum #91. * * 64-bit mode: * @@ -83,11 +83,7 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, #ifdef CONFIG_X86_64 case 0x40: /* - * In AMD64 long mode 0x40..0x4F are valid REX prefixes - * Need to figure out under what instruction mode the - * instruction was issued. Could check the LDT for lm, - * but for now it's good enough to assume that long - * mode only uses well known segments or kernel. + * In 64-bit mode 0x40..0x4F are valid REX prefixes */ return (!user_mode(regs) || user_64bit_mode(regs)); #endif @@ -110,6 +106,15 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, } } +static bool is_amd_k8_pre_npt(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) && + c->x86_vendor == X86_VENDOR_AMD && + c->x86 == 0xf && c->x86_model < 0x40); +} + static int is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) { @@ -117,6 +122,10 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) unsigned char *instr; int prefetch = 0; + /* Erratum #91 affects AMD K8, pre-NPT CPUs */ + if (!is_amd_k8_pre_npt()) + return 0; + /* * If it was a exec (instruction fetch) fault on NX page, then * do not ignore the fault: @@ -127,20 +136,31 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) instr = (void *)convert_ip_to_linear(current, regs); max_instr = instr + 15; - if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX) - return 0; + /* + * This code has historically always bailed out if IP points to a + * not-present page (e.g. due to a race). No one has ever + * complained about this. + */ + pagefault_disable(); while (instr < max_instr) { unsigned char opcode; - if (get_kernel_nofault(opcode, instr)) - break; + if (user_mode(regs)) { + if (get_user(opcode, instr)) + break; + } else { + if (get_kernel_nofault(opcode, instr)) + break; + } instr++; if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) break; } + + pagefault_enable(); return prefetch; } @@ -262,25 +282,6 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end) } } -/* - * Did it hit the DOS screen memory VA from vm86 mode? 
- */ -static inline void -check_v8086_mode(struct pt_regs *regs, unsigned long address, - struct task_struct *tsk) -{ -#ifdef CONFIG_VM86 - unsigned long bit; - - if (!v8086_mode(regs) || !tsk->thread.vm86) - return; - - bit = (address - 0xA0000) >> PAGE_SHIFT; - if (bit < 32) - tsk->thread.vm86->screen_bitmap |= 1 << bit; -#endif -} - static bool low_pfn(unsigned long pfn) { return pfn < max_low_pfn; @@ -335,15 +336,6 @@ KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; #endif -/* - * No vm86 mode in 64-bit mode: - */ -static inline void -check_v8086_mode(struct pt_regs *regs, unsigned long address, - struct task_struct *tsk) -{ -} - static int bad_address(void *p) { unsigned long dummy; @@ -427,6 +419,9 @@ static int is_errata93(struct pt_regs *regs, unsigned long address) || boot_cpu_data.x86 != 0xf) return 0; + if (user_mode(regs)) + return 0; + if (address != regs->ip) return 0; @@ -462,10 +457,12 @@ static int is_errata100(struct pt_regs *regs, unsigned long address) } /* Pentium F0 0F C7 C8 bug workaround: */ -static int is_f00f_bug(struct pt_regs *regs, unsigned long address) +static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { #ifdef CONFIG_X86_F00F_BUG - if (boot_cpu_has_bug(X86_BUG_F00F) && idt_is_f00f_address(address)) { + if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) && + idt_is_f00f_address(address)) { handle_invalid_op(regs); return 1; } @@ -630,53 +627,20 @@ static void set_signal_archinfo(unsigned long address, } static noinline void -no_context(struct pt_regs *regs, unsigned long error_code, - unsigned long address, int signal, int si_code) +page_fault_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { - struct task_struct *tsk = current; unsigned long flags; int sig; if (user_mode(regs)) { /* - * This is an implicit supervisor-mode access from user - * mode. Bypass all the kernel-mode recovery code and just - * OOPS. + * Implicit kernel access from user mode? Skip the stack + * overflow and EFI special cases. */ goto oops; } - /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) { - /* - * Any interrupt that takes a fault gets the fixup. This makes - * the below recursive fault logic only apply to a faults from - * task context. - */ - if (in_interrupt()) - return; - - /* - * Per the above we're !in_interrupt(), aka. task context. - * - * In this case we need to make sure we're not recursively - * faulting through the emulate_vsyscall() logic. - */ - if (current->thread.sig_on_uaccess_err && signal) { - sanitize_error_code(address, &error_code); - - set_signal_archinfo(address, error_code); - - /* XXX: hwpoison faults will set the wrong code. */ - force_sig_fault(signal, si_code, (void __user *)address); - } - - /* - * Barring that, we can do the fixup and be happy. - */ - return; - } - #ifdef CONFIG_VMAP_STACK /* * Stack overflow? During boot, we can fault near the initial @@ -684,8 +648,8 @@ no_context(struct pt_regs *regs, unsigned long error_code, * that we're in vmalloc space to avoid this. 
*/ if (is_vmalloc_addr((void *)address) && - (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || - address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { + (((unsigned long)current->stack - 1 - address < PAGE_SIZE) || + address - ((unsigned long)current->stack + THREAD_SIZE) < PAGE_SIZE)) { unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *); /* * We're likely to be running with very little stack space @@ -709,28 +673,12 @@ no_context(struct pt_regs *regs, unsigned long error_code, #endif /* - * 32-bit: - * - * Valid to do another page fault here, because if this fault - * had been triggered by is_prefetch fixup_exception would have - * handled it. - * - * 64-bit: - * - * Hall of shame of CPU/BIOS bugs. - */ - if (is_prefetch(regs, error_code, address)) - return; - - if (is_errata93(regs, address)) - return; - - /* - * Buggy firmware could access regions which might page fault, try to - * recover from such faults. + * Buggy firmware could access regions which might page fault. If + * this happens, EFI has a special OOPS path that will try to + * avoid hanging the system. */ if (IS_ENABLED(CONFIG_EFI)) - efi_recover_from_page_fault(address); + efi_crash_gracefully_on_page_fault(address); oops: /* @@ -741,7 +689,7 @@ oops: show_fault_oops(regs, error_code, address); - if (task_stack_end_corrupted(tsk)) + if (task_stack_end_corrupted(current)) printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); sig = SIGKILL; @@ -754,6 +702,53 @@ oops: oops_end(flags, regs, sig); } +static noinline void +kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int signal, int si_code) +{ + WARN_ON_ONCE(user_mode(regs)); + + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) { + /* + * Any interrupt that takes a fault gets the fixup. This makes + * the below recursive fault logic only apply to a faults from + * task context. + */ + if (in_interrupt()) + return; + + /* + * Per the above we're !in_interrupt(), aka. task context. + * + * In this case we need to make sure we're not recursively + * faulting through the emulate_vsyscall() logic. + */ + if (current->thread.sig_on_uaccess_err && signal) { + sanitize_error_code(address, &error_code); + + set_signal_archinfo(address, error_code); + + /* XXX: hwpoison faults will set the wrong code. */ + force_sig_fault(signal, si_code, (void __user *)address); + } + + /* + * Barring that, we can do the fixup and be happy. + */ + return; + } + + /* + * AMD erratum #91 manifests as a spurious page fault on a PREFETCH + * instruction. 
+ */ + if (is_prefetch(regs, error_code, address)) + return; + + page_fault_oops(regs, error_code, address); +} + /* * Print out info about fatal segfaults, if the show_unhandled_signals * sysctl is set: @@ -796,47 +791,49 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, { struct task_struct *tsk = current; - /* User mode accesses just cause a SIGSEGV */ - if (user_mode(regs) && (error_code & X86_PF_USER)) { - /* - * It's possible to have interrupts off here: - */ - local_irq_enable(); - - /* - * Valid to do another page fault here because this one came - * from user space: - */ - if (is_prefetch(regs, error_code, address)) - return; + if (!user_mode(regs)) { + kernelmode_fixup_or_oops(regs, error_code, address, pkey, si_code); + return; + } - if (is_errata100(regs, address)) - return; + if (!(error_code & X86_PF_USER)) { + /* Implicit user access to kernel memory -- just oops */ + page_fault_oops(regs, error_code, address); + return; + } - sanitize_error_code(address, &error_code); + /* + * User mode accesses just cause a SIGSEGV. + * It's possible to have interrupts off here: + */ + local_irq_enable(); - if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) - return; + /* + * Valid to do another page fault here because this one came + * from user space: + */ + if (is_prefetch(regs, error_code, address)) + return; - if (likely(show_unhandled_signals)) - show_signal_msg(regs, error_code, address, tsk); + if (is_errata100(regs, address)) + return; - set_signal_archinfo(address, error_code); + sanitize_error_code(address, &error_code); - if (si_code == SEGV_PKUERR) - force_sig_pkuerr((void __user *)address, pkey); + if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) + return; - force_sig_fault(SIGSEGV, si_code, (void __user *)address); + if (likely(show_unhandled_signals)) + show_signal_msg(regs, error_code, address, tsk); - local_irq_disable(); + set_signal_archinfo(address, error_code); - return; - } + if (si_code == SEGV_PKUERR) + force_sig_pkuerr((void __user *)address, pkey); - if (is_f00f_bug(regs, address)) - return; + force_sig_fault(SIGSEGV, si_code, (void __user *)address); - no_context(regs, error_code, address, SIGSEGV, si_code); + local_irq_disable(); } static noinline void @@ -926,8 +923,8 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, vm_fault_t fault) { /* Kernel mode? Handle exceptions or die: */ - if (!(error_code & X86_PF_USER)) { - no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); + if (!user_mode(regs)) { + kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR); return; } @@ -961,40 +958,6 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); } -static noinline void -mm_fault_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address, vm_fault_t fault) -{ - if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) { - no_context(regs, error_code, address, 0, 0); - return; - } - - if (fault & VM_FAULT_OOM) { - /* Kernel mode? 
Handle exceptions or die: */ - if (!(error_code & X86_PF_USER)) { - no_context(regs, error_code, address, - SIGSEGV, SEGV_MAPERR); - return; - } - - /* - * We ran out of memory, call the OOM killer, and return the - * userspace (which will retry the fault, or kill us if we got - * oom-killed): - */ - pagefault_out_of_memory(); - } else { - if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| - VM_FAULT_HWPOISON_LARGE)) - do_sigbus(regs, error_code, address, fault); - else if (fault & VM_FAULT_SIGSEGV) - bad_area_nosemaphore(regs, error_code, address); - else - BUG(); - } -} - static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) @@ -1209,6 +1172,9 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, } #endif + if (is_f00f_bug(regs, hw_error_code, address)) + return; + /* Was the fault spurious, caused by lazy TLB invalidation? */ if (spurious_kernel_fault(hw_error_code, address)) return; @@ -1229,10 +1195,17 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, } NOKPROBE_SYMBOL(do_kern_addr_fault); -/* Handle faults in the user portion of the address space */ +/* + * Handle faults in the user portion of the address space. Nothing in here + * should check X86_PF_USER without a specific justification: for almost + * all purposes, we should treat a normal kernel access to user memory + * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction. + * The one exception is AC flag handling, which is, per the x86 + * architecture, special for WRUSS. + */ static inline void do_user_addr_fault(struct pt_regs *regs, - unsigned long hw_error_code, + unsigned long error_code, unsigned long address) { struct vm_area_struct *vma; @@ -1244,6 +1217,21 @@ void do_user_addr_fault(struct pt_regs *regs, tsk = current; mm = tsk->mm; + if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) { + /* + * Whoops, this is kernel mode code trying to execute from + * user memory. Unless this is AMD erratum #93, which + * corrupts RIP such that it looks like a user address, + * this is unrecoverable. Don't even try to look up the + * VMA or look for extable entries. + */ + if (is_errata93(regs, address)) + return; + + page_fault_oops(regs, error_code, address); + return; + } + /* kprobes don't want to hook the spurious faults: */ if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF))) return; @@ -1252,8 +1240,8 @@ void do_user_addr_fault(struct pt_regs *regs, * Reserved bits are never expected to be set on * entries in the user portion of the page tables. */ - if (unlikely(hw_error_code & X86_PF_RSVD)) - pgtable_bad(regs, hw_error_code, address); + if (unlikely(error_code & X86_PF_RSVD)) + pgtable_bad(regs, error_code, address); /* * If SMAP is on, check for invalid kernel (supervisor) access to user @@ -1263,10 +1251,13 @@ void do_user_addr_fault(struct pt_regs *regs, * enforcement appears to be consistent with the USER bit. */ if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) && - !(hw_error_code & X86_PF_USER) && - !(regs->flags & X86_EFLAGS_AC))) - { - bad_area_nosemaphore(regs, hw_error_code, address); + !(error_code & X86_PF_USER) && + !(regs->flags & X86_EFLAGS_AC))) { + /* + * No extable entry here. This was a kernel access to an + * invalid pointer. get_kernel_nofault() will not get here. 
+ */ + page_fault_oops(regs, error_code, address); return; } @@ -1275,7 +1266,7 @@ void do_user_addr_fault(struct pt_regs *regs, * in a region with pagefaults disabled then we must not take the fault */ if (unlikely(faulthandler_disabled() || !mm)) { - bad_area_nosemaphore(regs, hw_error_code, address); + bad_area_nosemaphore(regs, error_code, address); return; } @@ -1296,9 +1287,9 @@ void do_user_addr_fault(struct pt_regs *regs, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - if (hw_error_code & X86_PF_WRITE) + if (error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; - if (hw_error_code & X86_PF_INSTR) + if (error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; #ifdef CONFIG_X86_64 @@ -1314,7 +1305,7 @@ void do_user_addr_fault(struct pt_regs *regs, * to consider the PF_PK bit. */ if (is_vsyscall_vaddr(address)) { - if (emulate_vsyscall(hw_error_code, regs, address)) + if (emulate_vsyscall(error_code, regs, address)) return; } #endif @@ -1337,7 +1328,7 @@ void do_user_addr_fault(struct pt_regs *regs, * Fault from code in kernel from * which we do not expect faults. */ - bad_area_nosemaphore(regs, hw_error_code, address); + bad_area_nosemaphore(regs, error_code, address); return; } retry: @@ -1353,17 +1344,17 @@ retry: vma = find_vma(mm, address); if (unlikely(!vma)) { - bad_area(regs, hw_error_code, address); + bad_area(regs, error_code, address); return; } if (likely(vma->vm_start <= address)) goto good_area; if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { - bad_area(regs, hw_error_code, address); + bad_area(regs, error_code, address); return; } if (unlikely(expand_stack(vma, address))) { - bad_area(regs, hw_error_code, address); + bad_area(regs, error_code, address); return; } @@ -1372,8 +1363,8 @@ retry: * we can handle it.. */ good_area: - if (unlikely(access_error(hw_error_code, vma))) { - bad_area_access_error(regs, hw_error_code, address, vma); + if (unlikely(access_error(error_code, vma))) { + bad_area_access_error(regs, error_code, address, vma); return; } @@ -1392,11 +1383,14 @@ good_area: */ fault = handle_mm_fault(vma, address, flags, regs); - /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { + /* + * Quick path to respond to signals. The core mm code + * has unlocked the mm for us if we get here. + */ if (!user_mode(regs)) - no_context(regs, hw_error_code, address, SIGBUS, - BUS_ADRERR); + kernelmode_fixup_or_oops(regs, error_code, address, + SIGBUS, BUS_ADRERR); return; } @@ -1412,12 +1406,37 @@ good_area: } mmap_read_unlock(mm); - if (unlikely(fault & VM_FAULT_ERROR)) { - mm_fault_error(regs, hw_error_code, address, fault); + if (likely(!(fault & VM_FAULT_ERROR))) + return; + + if (fatal_signal_pending(current) && !user_mode(regs)) { + kernelmode_fixup_or_oops(regs, error_code, address, 0, 0); return; } - check_v8086_mode(regs, address, tsk); + if (fault & VM_FAULT_OOM) { + /* Kernel mode? 
Handle exceptions or die: */ + if (!user_mode(regs)) { + kernelmode_fixup_or_oops(regs, error_code, address, + SIGSEGV, SEGV_MAPERR); + return; + } + + /* + * We ran out of memory, call the OOM killer, and return the + * userspace (which will retry the fault, or kill us if we got + * oom-killed): + */ + pagefault_out_of_memory(); + } else { + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| + VM_FAULT_HWPOISON_LARGE)) + do_sigbus(regs, error_code, address, fault); + else if (fault & VM_FAULT_SIGSEGV) + bad_area_nosemaphore(regs, error_code, address); + else + BUG(); + } } NOKPROBE_SYMBOL(do_user_addr_fault); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index e26f5c5c6565..dd694fb93916 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -157,16 +157,25 @@ __ref void *alloc_low_pages(unsigned int num) } /* - * By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS. - * With KASLR memory randomization, depending on the machine e820 memory - * and the PUD alignment. We may need twice more pages when KASLR memory + * By default need to be able to allocate page tables below PGD firstly for + * the 0-ISA_END_ADDRESS range and secondly for the initial PMD_SIZE mapping. + * With KASLR memory randomization, depending on the machine e820 memory and the + * PUD alignment, twice that many pages may be needed when KASLR memory * randomization is enabled. */ + +#ifndef CONFIG_X86_5LEVEL +#define INIT_PGD_PAGE_TABLES 3 +#else +#define INIT_PGD_PAGE_TABLES 4 +#endif + #ifndef CONFIG_RANDOMIZE_MEMORY -#define INIT_PGD_PAGE_COUNT 6 +#define INIT_PGD_PAGE_COUNT (2 * INIT_PGD_PAGE_TABLES) #else -#define INIT_PGD_PAGE_COUNT 12 +#define INIT_PGD_PAGE_COUNT (4 * INIT_PGD_PAGE_TABLES) #endif + #define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE) RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); void __init early_alloc_pgt_buf(void) diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index c79e5736ab2b..4b01f7dbaf30 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -382,6 +382,7 @@ bool sev_active(void) { return sev_status & MSR_AMD64_SEV_ENABLED; } +EXPORT_SYMBOL_GPL(sev_active); /* Needs to be called from non-instrumentable code */ bool noinstr sev_es_active(void) @@ -474,9 +475,10 @@ void __init mem_encrypt_init(void) swiotlb_update_mem_attributes(); /* - * With SEV, we need to unroll the rep string I/O instructions. + * With SEV, we need to unroll the rep string I/O instructions, + * but SEV-ES supports them through the #VC handler. 
*/ - if (sev_active()) + if (sev_active() && !sev_es_active()) static_branch_enable(&sev_enable_key); print_mem_encrypt_feature_info(); diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index bd7aff5c51f7..cd768dafca9e 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -10,8 +10,6 @@ #define pr_fmt(fmt) "mmiotrace: " fmt -#define DEBUG 1 - #include <linux/moduleparam.h> #include <linux/debugfs.h> #include <linux/slab.h> diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index dfd82f51ba66..f6a9e2e36642 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -829,6 +829,8 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr) } free_page((unsigned long)pmd_sv); + + pgtable_pmd_page_dtor(virt_to_page(pmd)); free_page((unsigned long)pmd); return 1; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 796506dcfc42..79e7a0ec1da5 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -205,6 +205,18 @@ static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg) return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3); } +/* Some 1-byte opcodes for binary ALU operations */ +static u8 simple_alu_opcodes[] = { + [BPF_ADD] = 0x01, + [BPF_SUB] = 0x29, + [BPF_AND] = 0x21, + [BPF_OR] = 0x09, + [BPF_XOR] = 0x31, + [BPF_LSH] = 0xE0, + [BPF_RSH] = 0xE8, + [BPF_ARSH] = 0xF8, +}; + static void jit_fill_hole(void *area, unsigned int size) { /* Fill whole space with INT3 instructions */ @@ -681,6 +693,42 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg) *pprog = prog; } +/* Emit the suffix (ModR/M etc) for addressing *(ptr_reg + off) and val_reg */ +static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (is_imm8(off)) { + /* 1-byte signed displacement. 
+ * + * If off == 0 we could skip this and save one extra byte, but + * special case of x86 R13 which always needs an offset is not + * worth the hassle + */ + EMIT2(add_2reg(0x40, ptr_reg, val_reg), off); + } else { + /* 4-byte signed displacement */ + EMIT1_off32(add_2reg(0x80, ptr_reg, val_reg), off); + } + *pprog = prog; +} + +/* + * Emit a REX byte if it will be necessary to address these registers + */ +static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (is64) + EMIT1(add_2mod(0x48, dst_reg, src_reg)); + else if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT1(add_2mod(0x40, dst_reg, src_reg)); + *pprog = prog; +} + /* LDX: dst_reg = *(u8*)(src_reg + off) */ static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { @@ -708,15 +756,7 @@ static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B); break; } - /* - * If insn->off == 0 we can save one extra byte, but - * special case of x86 R13 which always needs an offset - * is not worth the hassle - */ - if (is_imm8(off)) - EMIT2(add_2reg(0x40, src_reg, dst_reg), off); - else - EMIT1_off32(add_2reg(0x80, src_reg, dst_reg), off); + emit_insn_suffix(&prog, src_reg, dst_reg, off); *pprog = prog; } @@ -751,11 +791,51 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89); break; } - if (is_imm8(off)) - EMIT2(add_2reg(0x40, dst_reg, src_reg), off); - else - EMIT1_off32(add_2reg(0x80, dst_reg, src_reg), off); + emit_insn_suffix(&prog, dst_reg, src_reg, off); + *pprog = prog; +} + +static int emit_atomic(u8 **pprog, u8 atomic_op, + u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size) +{ + u8 *prog = *pprog; + int cnt = 0; + + EMIT1(0xF0); /* lock prefix */ + + maybe_emit_mod(&prog, dst_reg, src_reg, bpf_size == BPF_DW); + + /* emit opcode */ + switch (atomic_op) { + case BPF_ADD: + case BPF_SUB: + case BPF_AND: + case BPF_OR: + case BPF_XOR: + /* lock *(u32/u64*)(dst_reg + off) <op>= src_reg */ + EMIT1(simple_alu_opcodes[atomic_op]); + break; + case BPF_ADD | BPF_FETCH: + /* src_reg = atomic_fetch_add(dst_reg + off, src_reg); */ + EMIT2(0x0F, 0xC1); + break; + case BPF_XCHG: + /* src_reg = atomic_xchg(dst_reg + off, src_reg); */ + EMIT1(0x87); + break; + case BPF_CMPXCHG: + /* r0 = atomic_cmpxchg(dst_reg + off, r0, src_reg); */ + EMIT2(0x0F, 0xB1); + break; + default: + pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op); + return -EFAULT; + } + + emit_insn_suffix(&prog, dst_reg, src_reg, off); + *pprog = prog; + return 0; } static bool ex_handler_bpf(const struct exception_table_entry *x, @@ -789,8 +869,31 @@ static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt, } } +static int emit_nops(u8 **pprog, int len) +{ + u8 *prog = *pprog; + int i, noplen, cnt = 0; + + while (len > 0) { + noplen = len; + + if (noplen > ASM_NOP_MAX) + noplen = ASM_NOP_MAX; + + for (i = 0; i < noplen; i++) + EMIT1(ideal_nops[noplen][i]); + len -= noplen; + } + + *pprog = prog; + + return cnt; +} + +#define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp))) + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, - int oldproglen, struct jit_context *ctx) + int oldproglen, struct jit_context *ctx, bool jmp_padding) { bool tail_call_reachable = bpf_prog->aux->tail_call_reachable; struct bpf_insn *insn = bpf_prog->insnsi; @@ -800,8 +903,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, bool 
seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; int i, cnt = 0, excnt = 0; - int proglen = 0; + int ilen, proglen = 0; u8 *prog = temp; + int err; detect_reg_usage(insn, insn_cnt, callee_regs_used, &tail_call_seen); @@ -813,17 +917,24 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, bpf_prog_was_classic(bpf_prog), tail_call_reachable, bpf_prog->aux->func_idx != 0); push_callee_regs(&prog, callee_regs_used); - addrs[0] = prog - temp; + + ilen = prog - temp; + if (image) + memcpy(image + proglen, temp, ilen); + proglen += ilen; + addrs[0] = proglen; + prog = temp; for (i = 1; i <= insn_cnt; i++, insn++) { const s32 imm32 = insn->imm; u32 dst_reg = insn->dst_reg; u32 src_reg = insn->src_reg; u8 b2 = 0, b3 = 0; + u8 *start_of_ldx; s64 jmp_offset; u8 jmp_cond; - int ilen; u8 *func; + int nops; switch (insn->code) { /* ALU */ @@ -837,17 +948,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_ALU64 | BPF_AND | BPF_X: case BPF_ALU64 | BPF_OR | BPF_X: case BPF_ALU64 | BPF_XOR | BPF_X: - switch (BPF_OP(insn->code)) { - case BPF_ADD: b2 = 0x01; break; - case BPF_SUB: b2 = 0x29; break; - case BPF_AND: b2 = 0x21; break; - case BPF_OR: b2 = 0x09; break; - case BPF_XOR: b2 = 0x31; break; - } - if (BPF_CLASS(insn->code) == BPF_ALU64) - EMIT1(add_2mod(0x48, dst_reg, src_reg)); - else if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT1(add_2mod(0x40, dst_reg, src_reg)); + maybe_emit_mod(&prog, dst_reg, src_reg, + BPF_CLASS(insn->code) == BPF_ALU64); + b2 = simple_alu_opcodes[BPF_OP(insn->code)]; EMIT2(b2, add_2reg(0xC0, dst_reg, src_reg)); break; @@ -1027,12 +1130,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, else if (is_ereg(dst_reg)) EMIT1(add_1mod(0x40, dst_reg)); - switch (BPF_OP(insn->code)) { - case BPF_LSH: b3 = 0xE0; break; - case BPF_RSH: b3 = 0xE8; break; - case BPF_ARSH: b3 = 0xF8; break; - } - + b3 = simple_alu_opcodes[BPF_OP(insn->code)]; if (imm32 == 1) EMIT2(0xD1, add_1reg(b3, dst_reg)); else @@ -1066,11 +1164,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, else if (is_ereg(dst_reg)) EMIT1(add_1mod(0x40, dst_reg)); - switch (BPF_OP(insn->code)) { - case BPF_LSH: b3 = 0xE0; break; - case BPF_RSH: b3 = 0xE8; break; - case BPF_ARSH: b3 = 0xF8; break; - } + b3 = simple_alu_opcodes[BPF_OP(insn->code)]; EMIT2(0xD3, add_1reg(b3, dst_reg)); if (src_reg != BPF_REG_4) @@ -1185,12 +1279,30 @@ st: if (is_imm8(insn->off)) case BPF_LDX | BPF_PROBE_MEM | BPF_W: case BPF_LDX | BPF_MEM | BPF_DW: case BPF_LDX | BPF_PROBE_MEM | BPF_DW: + if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { + /* test src_reg, src_reg */ + maybe_emit_mod(&prog, src_reg, src_reg, true); /* always 1 byte */ + EMIT2(0x85, add_2reg(0xC0, src_reg, src_reg)); + /* jne start_of_ldx */ + EMIT2(X86_JNE, 0); + /* xor dst_reg, dst_reg */ + emit_mov_imm32(&prog, false, dst_reg, 0); + /* jmp byte_after_ldx */ + EMIT2(0xEB, 0); + + /* populate jmp_offset for JNE above */ + temp[4] = prog - temp - 5 /* sizeof(test + jne) */; + start_of_ldx = prog; + } emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { struct exception_table_entry *ex; u8 *_insn = image + proglen; s64 delta; + /* populate jmp_offset for JMP above */ + start_of_ldx[-1] = prog - start_of_ldx; + if (!bpf_prog->aux->extable) break; @@ -1230,21 +1342,56 @@ st: if (is_imm8(insn->off)) } break; - /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */ - case BPF_STX | BPF_XADD | BPF_W: - /* Emit 'lock add dword ptr 
[rax + off], eax' */ - if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT3(0xF0, add_2mod(0x40, dst_reg, src_reg), 0x01); - else - EMIT2(0xF0, 0x01); - goto xadd; - case BPF_STX | BPF_XADD | BPF_DW: - EMIT3(0xF0, add_2mod(0x48, dst_reg, src_reg), 0x01); -xadd: if (is_imm8(insn->off)) - EMIT2(add_2reg(0x40, dst_reg, src_reg), insn->off); - else - EMIT1_off32(add_2reg(0x80, dst_reg, src_reg), - insn->off); + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (insn->imm == (BPF_AND | BPF_FETCH) || + insn->imm == (BPF_OR | BPF_FETCH) || + insn->imm == (BPF_XOR | BPF_FETCH)) { + u8 *branch_target; + bool is64 = BPF_SIZE(insn->code) == BPF_DW; + + /* + * Can't be implemented with a single x86 insn. + * Need to do a CMPXCHG loop. + */ + + /* Will need RAX as a CMPXCHG operand so save R0 */ + emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0); + branch_target = prog; + /* Load old value */ + emit_ldx(&prog, BPF_SIZE(insn->code), + BPF_REG_0, dst_reg, insn->off); + /* + * Perform the (commutative) operation locally, + * put the result in the AUX_REG. + */ + emit_mov_reg(&prog, is64, AUX_REG, BPF_REG_0); + maybe_emit_mod(&prog, AUX_REG, src_reg, is64); + EMIT2(simple_alu_opcodes[BPF_OP(insn->imm)], + add_2reg(0xC0, AUX_REG, src_reg)); + /* Attempt to swap in new value */ + err = emit_atomic(&prog, BPF_CMPXCHG, + dst_reg, AUX_REG, insn->off, + BPF_SIZE(insn->code)); + if (WARN_ON(err)) + return err; + /* + * ZF tells us whether we won the race. If it's + * cleared we need to try again. + */ + EMIT2(X86_JNE, -(prog - branch_target) - 2); + /* Return the pre-modification value */ + emit_mov_reg(&prog, is64, src_reg, BPF_REG_0); + /* Restore R0 after clobbering RAX */ + emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX); + break; + + } + + err = emit_atomic(&prog, insn->imm, dst_reg, src_reg, + insn->off, BPF_SIZE(insn->code)); + if (err) + return err; break; /* call */ @@ -1295,20 +1442,16 @@ xadd: if (is_imm8(insn->off)) case BPF_JMP32 | BPF_JSGE | BPF_X: case BPF_JMP32 | BPF_JSLE | BPF_X: /* cmp dst_reg, src_reg */ - if (BPF_CLASS(insn->code) == BPF_JMP) - EMIT1(add_2mod(0x48, dst_reg, src_reg)); - else if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT1(add_2mod(0x40, dst_reg, src_reg)); + maybe_emit_mod(&prog, dst_reg, src_reg, + BPF_CLASS(insn->code) == BPF_JMP); EMIT2(0x39, add_2reg(0xC0, dst_reg, src_reg)); goto emit_cond_jmp; case BPF_JMP | BPF_JSET | BPF_X: case BPF_JMP32 | BPF_JSET | BPF_X: /* test dst_reg, src_reg */ - if (BPF_CLASS(insn->code) == BPF_JMP) - EMIT1(add_2mod(0x48, dst_reg, src_reg)); - else if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT1(add_2mod(0x40, dst_reg, src_reg)); + maybe_emit_mod(&prog, dst_reg, src_reg, + BPF_CLASS(insn->code) == BPF_JMP); EMIT2(0x85, add_2reg(0xC0, dst_reg, src_reg)); goto emit_cond_jmp; @@ -1344,10 +1487,8 @@ xadd: if (is_imm8(insn->off)) case BPF_JMP32 | BPF_JSLE | BPF_K: /* test dst_reg, dst_reg to save one extra byte */ if (imm32 == 0) { - if (BPF_CLASS(insn->code) == BPF_JMP) - EMIT1(add_2mod(0x48, dst_reg, dst_reg)); - else if (is_ereg(dst_reg)) - EMIT1(add_2mod(0x40, dst_reg, dst_reg)); + maybe_emit_mod(&prog, dst_reg, dst_reg, + BPF_CLASS(insn->code) == BPF_JMP); EMIT2(0x85, add_2reg(0xC0, dst_reg, dst_reg)); goto emit_cond_jmp; } @@ -1409,6 +1550,30 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ } jmp_offset = addrs[i + insn->off] - addrs[i]; if (is_imm8(jmp_offset)) { + if (jmp_padding) { + /* To keep the jmp_offset valid, the extra bytes are + * padded before the jump insn, so we substract the + * 2 bytes of 
jmp_cond insn from INSN_SZ_DIFF. + * + * If the previous pass already emits an imm8 + * jmp_cond, then this BPF insn won't shrink, so + * "nops" is 0. + * + * On the other hand, if the previous pass emits an + * imm32 jmp_cond, the extra 4 bytes(*) is padded to + * keep the image from shrinking further. + * + * (*) imm32 jmp_cond is 6 bytes, and imm8 jmp_cond + * is 2 bytes, so the size difference is 4 bytes. + */ + nops = INSN_SZ_DIFF - 2; + if (nops != 0 && nops != 4) { + pr_err("unexpected jmp_cond padding: %d bytes\n", + nops); + return -EFAULT; + } + cnt += emit_nops(&prog, nops); + } EMIT2(jmp_cond, jmp_offset); } else if (is_simm32(jmp_offset)) { EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset); @@ -1431,11 +1596,55 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ else jmp_offset = addrs[i + insn->off] - addrs[i]; - if (!jmp_offset) - /* Optimize out nop jumps */ + if (!jmp_offset) { + /* + * If jmp_padding is enabled, the extra nops will + * be inserted. Otherwise, optimize out nop jumps. + */ + if (jmp_padding) { + /* There are 3 possible conditions. + * (1) This BPF_JA is already optimized out in + * the previous run, so there is no need + * to pad any extra byte (0 byte). + * (2) The previous pass emits an imm8 jmp, + * so we pad 2 bytes to match the previous + * insn size. + * (3) Similarly, the previous pass emits an + * imm32 jmp, and 5 bytes is padded. + */ + nops = INSN_SZ_DIFF; + if (nops != 0 && nops != 2 && nops != 5) { + pr_err("unexpected nop jump padding: %d bytes\n", + nops); + return -EFAULT; + } + cnt += emit_nops(&prog, nops); + } break; + } emit_jmp: if (is_imm8(jmp_offset)) { + if (jmp_padding) { + /* To avoid breaking jmp_offset, the extra bytes + * are padded before the actual jmp insn, so + * 2 bytes is substracted from INSN_SZ_DIFF. + * + * If the previous pass already emits an imm8 + * jmp, there is nothing to pad (0 byte). + * + * If it emits an imm32 jmp (5 bytes) previously + * and now an imm8 jmp (2 bytes), then we pad + * (5 - 2 = 3) bytes to stop the image from + * shrinking further. + */ + nops = INSN_SZ_DIFF - 2; + if (nops != 0 && nops != 3) { + pr_err("unexpected jump padding: %d bytes\n", + nops); + return -EFAULT; + } + cnt += emit_nops(&prog, INSN_SZ_DIFF - 2); + } EMIT2(0xEB, jmp_offset); } else if (is_simm32(jmp_offset)) { EMIT1_off32(0xE9, jmp_offset); @@ -1531,17 +1740,25 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, struct bpf_prog *p, int stack_size, bool mod_ret) { u8 *prog = *pprog; + u8 *jmp_insn; int cnt = 0; - if (p->aux->sleepable) { - if (emit_call(&prog, __bpf_prog_enter_sleepable, prog)) - return -EINVAL; - } else { - if (emit_call(&prog, __bpf_prog_enter, prog)) + /* arg1: mov rdi, progs[i] */ + emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); + if (emit_call(&prog, + p->aux->sleepable ? 
__bpf_prog_enter_sleepable : + __bpf_prog_enter, prog)) return -EINVAL; - /* remember prog start time returned by __bpf_prog_enter */ - emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); - } + /* remember prog start time returned by __bpf_prog_enter */ + emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); + + /* if (__bpf_prog_enter*(prog) == 0) + * goto skip_exec_of_prog; + */ + EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ + /* emit 2 nops that will be replaced with JE insn */ + jmp_insn = prog; + emit_nops(&prog, 2); /* arg1: lea rdi, [rbp - stack_size] */ EMIT4(0x48, 0x8D, 0x7D, -stack_size); @@ -1561,43 +1778,23 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (mod_ret) emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); - if (p->aux->sleepable) { - if (emit_call(&prog, __bpf_prog_exit_sleepable, prog)) + /* replace 2 nops with JE insn, since jmp target is known */ + jmp_insn[0] = X86_JE; + jmp_insn[1] = prog - jmp_insn - 2; + + /* arg1: mov rdi, progs[i] */ + emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); + /* arg2: mov rsi, rbx <- start time in nsec */ + emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); + if (emit_call(&prog, + p->aux->sleepable ? __bpf_prog_exit_sleepable : + __bpf_prog_exit, prog)) return -EINVAL; - } else { - /* arg1: mov rdi, progs[i] */ - emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, - (u32) (long) p); - /* arg2: mov rsi, rbx <- start time in nsec */ - emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); - if (emit_call(&prog, __bpf_prog_exit, prog)) - return -EINVAL; - } *pprog = prog; return 0; } -static void emit_nops(u8 **pprog, unsigned int len) -{ - unsigned int i, noplen; - u8 *prog = *pprog; - int cnt = 0; - - while (len > 0) { - noplen = len; - - if (noplen > ASM_NOP_MAX) - noplen = ASM_NOP_MAX; - - for (i = 0; i < noplen; i++) - EMIT1(ideal_nops[noplen][i]); - len -= noplen; - } - - *pprog = prog; -} - static void emit_align(u8 **pprog, u32 align) { u8 *target, *prog = *pprog; @@ -1972,6 +2169,9 @@ struct x64_jit_data { struct jit_context ctx; }; +#define MAX_PASSES 20 +#define PADDING_PASSES (MAX_PASSES - 5) + struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_binary_header *header = NULL; @@ -1981,6 +2181,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) struct jit_context ctx = {}; bool tmp_blinded = false; bool extra_pass = false; + bool padding = false; u8 *image = NULL; int *addrs; int pass; @@ -2017,6 +2218,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) image = jit_data->image; header = jit_data->header; extra_pass = true; + padding = true; goto skip_init_addrs; } addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); @@ -2042,8 +2244,10 @@ skip_init_addrs: * may converge on the last pass. In such case do one more * pass to emit the final image. 
*/ - for (pass = 0; pass < 20 || image; pass++) { - proglen = do_jit(prog, addrs, image, oldproglen, &ctx); + for (pass = 0; pass < MAX_PASSES || image; pass++) { + if (!padding && pass >= PADDING_PASSES) + padding = true; + proglen = do_jit(prog, addrs, image, oldproglen, &ctx, padding); if (proglen <= 0) { out_image: image = NULL; diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 96fde03aa987..d17b67c69f89 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -2243,10 +2243,8 @@ emit_jmp: return -EFAULT; } break; - /* STX XADD: lock *(u32 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_W: - /* STX XADD: lock *(u64 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: goto notyet; case BPF_JMP | BPF_EXIT: if (seen_exit) { diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c index 00bfa1ebad6c..0bb3b8b44e4e 100644 --- a/arch/x86/pci/init.c +++ b/arch/x86/pci/init.c @@ -9,16 +9,23 @@ in the right sequence from here. */ static __init int pci_arch_init(void) { - int type; - - x86_create_pci_msi_domain(); + int type, pcbios = 1; type = pci_direct_probe(); if (!(pci_probe & PCI_PROBE_NOEARLY)) pci_mmcfg_early_init(); - if (x86_init.pci.arch_init && !x86_init.pci.arch_init()) + if (x86_init.pci.arch_init) + pcbios = x86_init.pci.arch_init(); + + /* + * Must happen after x86_init.pci.arch_init(). Xen sets up the + * x86_init.irqs.create_pci_msi_domain there. + */ + x86_create_pci_msi_domain(); + + if (!pcbios) return 0; pci_pcbios_init(); diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index d0e835470d01..b2f90a1a89f1 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -4,7 +4,6 @@ obj-y += atom/ obj-y += ce4100/ obj-y += efi/ obj-y += geode/ -obj-y += goldfish/ obj-y += iris/ obj-y += intel/ obj-y += intel-mid/ diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index e1e8d4e3a213..1b82d77019b1 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -54,10 +54,7 @@ * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G. */ static u64 efi_va = EFI_VA_START; - -struct efi_scratch efi_scratch; - -EXPORT_SYMBOL_GPL(efi_mm); +static struct mm_struct *efi_prev_mm; /* * We need our own copy of the higher levels of the page tables @@ -115,31 +112,12 @@ void efi_sync_low_kernel_mappings(void) pud_t *pud_k, *pud_efi; pgd_t *efi_pgd = efi_mm.pgd; - /* - * We can share all PGD entries apart from the one entry that - * covers the EFI runtime mapping space. - * - * Make sure the EFI runtime region mappings are guaranteed to - * only span a single PGD entry and that the entry also maps - * other important kernel regions. - */ - MAYBE_BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END)); - MAYBE_BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) != - (EFI_VA_END & PGDIR_MASK)); - pgd_efi = efi_pgd + pgd_index(PAGE_OFFSET); pgd_k = pgd_offset_k(PAGE_OFFSET); num_entries = pgd_index(EFI_VA_END) - pgd_index(PAGE_OFFSET); memcpy(pgd_efi, pgd_k, sizeof(pgd_t) * num_entries); - /* - * As with PGDs, we share all P4D entries apart from the one entry - * that covers the EFI runtime mapping space. 
- */ - BUILD_BUG_ON(p4d_index(EFI_VA_END) != p4d_index(MODULES_END)); - BUILD_BUG_ON((EFI_VA_START & P4D_MASK) != (EFI_VA_END & P4D_MASK)); - pgd_efi = efi_pgd + pgd_index(EFI_VA_END); pgd_k = pgd_offset_k(EFI_VA_END); p4d_efi = p4d_offset(pgd_efi, 0); @@ -256,7 +234,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) return 1; } - efi_scratch.phys_stack = page_to_phys(page + 1); /* stack grows down */ + efi_mixed_mode_stack_pa = page_to_phys(page + 1); /* stack grows down */ npages = (_etext - _text) >> PAGE_SHIFT; text = __pa(_text); @@ -481,11 +459,17 @@ void __init efi_dump_pagetable(void) * can not change under us. * It should be ensured that there are no concurent calls to this function. */ -void efi_switch_mm(struct mm_struct *mm) +void efi_enter_mm(void) +{ + efi_prev_mm = current->active_mm; + current->active_mm = &efi_mm; + switch_mm(efi_prev_mm, &efi_mm, NULL); +} + +void efi_leave_mm(void) { - efi_scratch.prev_mm = current->active_mm; - current->active_mm = mm; - switch_mm(efi_scratch.prev_mm, mm, NULL); + current->active_mm = efi_prev_mm; + switch_mm(&efi_mm, efi_prev_mm, NULL); } static DEFINE_SPINLOCK(efi_runtime_lock); @@ -549,12 +533,12 @@ efi_thunk_set_virtual_address_map(unsigned long memory_map_size, efi_sync_low_kernel_mappings(); local_irq_save(flags); - efi_switch_mm(&efi_mm); + efi_enter_mm(); status = __efi_thunk(set_virtual_address_map, memory_map_size, descriptor_size, descriptor_version, virtual_map); - efi_switch_mm(efi_scratch.prev_mm); + efi_leave_mm(); local_irq_restore(flags); return status; @@ -848,9 +832,9 @@ efi_set_virtual_address_map(unsigned long memory_map_size, descriptor_size, descriptor_version, virtual_map); - efi_switch_mm(&efi_mm); + efi_enter_mm(); - kernel_fpu_begin(); + efi_fpu_begin(); /* Disable interrupts around EFI calls: */ local_irq_save(flags); @@ -859,12 +843,12 @@ efi_set_virtual_address_map(unsigned long memory_map_size, descriptor_version, virtual_map); local_irq_restore(flags); - kernel_fpu_end(); + efi_fpu_end(); /* grab the virtually remapped EFI runtime services table pointer */ efi.runtime = READ_ONCE(systab->runtime); - efi_switch_mm(efi_scratch.prev_mm); + efi_leave_mm(); return status; } diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S index 26f0da238c1c..fd3dd1708eba 100644 --- a/arch/x86/platform/efi/efi_thunk_64.S +++ b/arch/x86/platform/efi/efi_thunk_64.S @@ -33,7 +33,7 @@ SYM_CODE_START(__efi64_thunk) * Switch to 1:1 mapped 32-bit stack pointer. */ movq %rsp, %rax - movq efi_scratch(%rip), %rsp + movq efi_mixed_mode_stack_pa(%rip), %rsp push %rax /* @@ -70,3 +70,7 @@ SYM_CODE_START(__efi64_thunk) pushl %ebp lret SYM_CODE_END(__efi64_thunk) + + .bss + .balign 8 +SYM_DATA(efi_mixed_mode_stack_pa, .quad 0) diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 5a40fe411ebd..67d93a243c35 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -687,15 +687,25 @@ int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff, * @return: Returns, if the page fault is not handled. This function * will never return if the page fault is handled successfully. */ -void efi_recover_from_page_fault(unsigned long phys_addr) +void efi_crash_gracefully_on_page_fault(unsigned long phys_addr) { if (!IS_ENABLED(CONFIG_X86_64)) return; /* + * If we get an interrupt/NMI while processing an EFI runtime service + * then this is a regular OOPS, not an EFI failure. 
+ */ + if (in_interrupt()) + return; + + /* * Make sure that an efi runtime service caused the page fault. + * READ_ONCE() because we might be OOPSing in a different thread, + * and we don't want to trip KTSAN while trying to OOPS. */ - if (efi_rts_work.efi_rts_id == EFI_NONE) + if (READ_ONCE(efi_rts_work.efi_rts_id) == EFI_NONE || + current_work() != &efi_rts_work.work) return; /* @@ -747,6 +757,4 @@ void efi_recover_from_page_fault(unsigned long phys_addr) set_current_state(TASK_IDLE); schedule(); } - - return; } diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c index c33f744b5388..b39bf3b5e108 100644 --- a/arch/x86/platform/geode/alix.c +++ b/arch/x86/platform/geode/alix.c @@ -22,6 +22,7 @@ #include <linux/platform_device.h> #include <linux/input.h> #include <linux/gpio_keys.h> +#include <linux/gpio/machine.h> #include <linux/dmi.h> #include <asm/geode.h> @@ -69,21 +70,15 @@ static struct platform_device alix_buttons_dev = { static struct gpio_led alix_leds[] = { { .name = "alix:1", - .gpio = 6, .default_trigger = "default-on", - .active_low = 1, }, { .name = "alix:2", - .gpio = 25, .default_trigger = "default-off", - .active_low = 1, }, { .name = "alix:3", - .gpio = 27, .default_trigger = "default-off", - .active_low = 1, }, }; @@ -92,6 +87,17 @@ static struct gpio_led_platform_data alix_leds_data = { .leds = alix_leds, }; +static struct gpiod_lookup_table alix_leds_gpio_table = { + .dev_id = "leds-gpio", + .table = { + /* The Geode GPIOs should be on the CS5535 companion chip */ + GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_LOW), + GPIO_LOOKUP_IDX("cs5535-gpio", 25, NULL, 1, GPIO_ACTIVE_LOW), + GPIO_LOOKUP_IDX("cs5535-gpio", 27, NULL, 2, GPIO_ACTIVE_LOW), + { } + }, +}; + static struct platform_device alix_leds_dev = { .name = "leds-gpio", .id = -1, @@ -106,6 +112,7 @@ static struct platform_device *alix_devs[] __initdata = { static void __init register_alix(void) { /* Setup LED control through leds-gpio driver */ + gpiod_add_lookup_table(&alix_leds_gpio_table); platform_add_devices(alix_devs, ARRAY_SIZE(alix_devs)); } diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c index 73a3f49b4eb6..d263528c90bb 100644 --- a/arch/x86/platform/geode/geos.c +++ b/arch/x86/platform/geode/geos.c @@ -20,6 +20,7 @@ #include <linux/platform_device.h> #include <linux/input.h> #include <linux/gpio_keys.h> +#include <linux/gpio/machine.h> #include <linux/dmi.h> #include <asm/geode.h> @@ -53,21 +54,15 @@ static struct platform_device geos_buttons_dev = { static struct gpio_led geos_leds[] = { { .name = "geos:1", - .gpio = 6, .default_trigger = "default-on", - .active_low = 1, }, { .name = "geos:2", - .gpio = 25, .default_trigger = "default-off", - .active_low = 1, }, { .name = "geos:3", - .gpio = 27, .default_trigger = "default-off", - .active_low = 1, }, }; @@ -76,6 +71,17 @@ static struct gpio_led_platform_data geos_leds_data = { .leds = geos_leds, }; +static struct gpiod_lookup_table geos_leds_gpio_table = { + .dev_id = "leds-gpio", + .table = { + /* The Geode GPIOs should be on the CS5535 companion chip */ + GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_LOW), + GPIO_LOOKUP_IDX("cs5535-gpio", 25, NULL, 1, GPIO_ACTIVE_LOW), + GPIO_LOOKUP_IDX("cs5535-gpio", 27, NULL, 2, GPIO_ACTIVE_LOW), + { } + }, +}; + static struct platform_device geos_leds_dev = { .name = "leds-gpio", .id = -1, @@ -90,6 +96,7 @@ static struct platform_device *geos_devs[] __initdata = { static void __init register_geos(void) { /* Setup LED control through 
leds-gpio driver */ + gpiod_add_lookup_table(&geos_leds_gpio_table); platform_add_devices(geos_devs, ARRAY_SIZE(geos_devs)); } diff --git a/arch/x86/platform/geode/net5501.c b/arch/x86/platform/geode/net5501.c index 163e1b545517..558384acd777 100644 --- a/arch/x86/platform/geode/net5501.c +++ b/arch/x86/platform/geode/net5501.c @@ -20,6 +20,7 @@ #include <linux/platform_device.h> #include <linux/input.h> #include <linux/gpio_keys.h> +#include <linux/gpio/machine.h> #include <asm/geode.h> @@ -55,9 +56,7 @@ static struct platform_device net5501_buttons_dev = { static struct gpio_led net5501_leds[] = { { .name = "net5501:1", - .gpio = 6, .default_trigger = "default-on", - .active_low = 0, }, }; @@ -66,6 +65,15 @@ static struct gpio_led_platform_data net5501_leds_data = { .leds = net5501_leds, }; +static struct gpiod_lookup_table net5501_leds_gpio_table = { + .dev_id = "leds-gpio", + .table = { + /* The Geode GPIOs should be on the CS5535 companion chip */ + GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_HIGH), + { } + }, +}; + static struct platform_device net5501_leds_dev = { .name = "leds-gpio", .id = -1, @@ -80,6 +88,7 @@ static struct platform_device *net5501_devs[] __initdata = { static void __init register_net5501(void) { /* Setup LED control through leds-gpio driver */ + gpiod_add_lookup_table(&net5501_leds_gpio_table); platform_add_devices(net5501_devs, ARRAY_SIZE(net5501_devs)); } diff --git a/arch/x86/platform/goldfish/Makefile b/arch/x86/platform/goldfish/Makefile deleted file mode 100644 index 072c395379ac..000000000000 --- a/arch/x86/platform/goldfish/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_GOLDFISH) += goldfish.o diff --git a/arch/x86/platform/goldfish/goldfish.c b/arch/x86/platform/goldfish/goldfish.c deleted file mode 100644 index 6b6f8b4360dd..000000000000 --- a/arch/x86/platform/goldfish/goldfish.c +++ /dev/null @@ -1,54 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2007 Google, Inc. - * Copyright (C) 2011 Intel, Inc. - * Copyright (C) 2013 Intel, Inc. 
- */ - -#include <linux/kernel.h> -#include <linux/irq.h> -#include <linux/platform_device.h> - -/* - * Where in virtual device memory the IO devices (timers, system controllers - * and so on) - */ - -#define GOLDFISH_PDEV_BUS_BASE (0xff001000) -#define GOLDFISH_PDEV_BUS_END (0xff7fffff) -#define GOLDFISH_PDEV_BUS_IRQ (4) - -#define GOLDFISH_TTY_BASE (0x2000) - -static struct resource goldfish_pdev_bus_resources[] = { - { - .start = GOLDFISH_PDEV_BUS_BASE, - .end = GOLDFISH_PDEV_BUS_END, - .flags = IORESOURCE_MEM, - }, - { - .start = GOLDFISH_PDEV_BUS_IRQ, - .end = GOLDFISH_PDEV_BUS_IRQ, - .flags = IORESOURCE_IRQ, - } -}; - -static bool goldfish_enable __initdata; - -static int __init goldfish_setup(char *str) -{ - goldfish_enable = true; - return 0; -} -__setup("goldfish", goldfish_setup); - -static int __init goldfish_init(void) -{ - if (!goldfish_enable) - return -ENODEV; - - platform_device_register_simple("goldfish_pdev_bus", -1, - goldfish_pdev_bus_resources, 2); - return 0; -} -device_initcall(goldfish_init); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c index 31dda18bb370..2930b6e9473e 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c @@ -88,8 +88,8 @@ static int __init bt_sfi_init(void) memset(&info, 0, sizeof(info)); info.fwnode = ddata->dev->fwnode; info.parent = ddata->dev; - info.name = ddata->name, - info.id = PLATFORM_DEVID_NONE, + info.name = ddata->name; + info.id = PLATFORM_DEVID_NONE; pdev = platform_device_register_full(&info); if (IS_ERR(pdev)) diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 9e87ab010c82..e68ea5f4ad1c 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -164,10 +164,10 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu) else per_cpu(xen_vcpu_id, cpu) = cpu; rc = xen_vcpu_setup(cpu); - if (rc) + if (rc || !xen_have_vector_callback) return rc; - if (xen_have_vector_callback && xen_feature(XENFEAT_hvm_safe_pvclock)) + if (xen_feature(XENFEAT_hvm_safe_pvclock)) xen_setup_timer(cpu); rc = xen_smp_intr_init(cpu); @@ -188,6 +188,8 @@ static int xen_cpu_dead_hvm(unsigned int cpu) return 0; } +static bool no_vector_callback __initdata; + static void __init xen_hvm_guest_init(void) { if (xen_pv_domain()) @@ -207,7 +209,7 @@ static void __init xen_hvm_guest_init(void) xen_panic_handler_init(); - if (xen_feature(XENFEAT_hvm_callback_vector)) + if (!no_vector_callback && xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; xen_hvm_smp_init(); @@ -233,6 +235,13 @@ static __init int xen_parse_nopv(char *arg) } early_param("xen_nopv", xen_parse_nopv); +static __init int xen_parse_no_vector_callback(char *arg) +{ + no_vector_callback = true; + return 0; +} +early_param("xen_no_vector_callback", xen_parse_no_vector_callback); + bool __init xen_hvm_need_lapic(void) { if (xen_pv_domain()) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 4409306364dc..dc0a337f985b 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -567,10 +567,16 @@ void noist_exc_debug(struct pt_regs *regs); DEFINE_IDTENTRY_RAW(xenpv_exc_nmi) { - /* On Xen PV, NMI doesn't use IST. The C part is the sane as native. */ + /* On Xen PV, NMI doesn't use IST. The C part is the same as native. */ exc_nmi(regs); } +DEFINE_IDTENTRY_RAW_ERRORCODE(xenpv_exc_double_fault) +{ + /* On Xen PV, DF doesn't use IST. 
The C part is the same as native. */ + exc_double_fault(regs, error_code); +} + DEFINE_IDTENTRY_RAW(xenpv_exc_debug) { /* @@ -583,6 +589,27 @@ DEFINE_IDTENTRY_RAW(xenpv_exc_debug) exc_debug(regs); } +DEFINE_IDTENTRY_RAW(exc_xen_unknown_trap) +{ + /* This should never happen and there is no way to handle it. */ + pr_err("Unknown trap in Xen PV mode."); + BUG(); +} + +#ifdef CONFIG_X86_MCE +DEFINE_IDTENTRY_RAW(xenpv_exc_machine_check) +{ + /* + * There's no IST on Xen PV, but we still need to dispatch + * to the correct handler. + */ + if (user_mode(regs)) + noist_exc_machine_check(regs); + else + exc_machine_check(regs); +} +#endif + struct trap_array_entry { void (*orig)(void); void (*xen)(void); @@ -601,9 +628,9 @@ struct trap_array_entry { static struct trap_array_entry trap_array[] = { TRAP_ENTRY_REDIR(exc_debug, true ), - TRAP_ENTRY(exc_double_fault, true ), + TRAP_ENTRY_REDIR(exc_double_fault, true ), #ifdef CONFIG_X86_MCE - TRAP_ENTRY(exc_machine_check, true ), + TRAP_ENTRY_REDIR(exc_machine_check, true ), #endif TRAP_ENTRY_REDIR(exc_nmi, true ), TRAP_ENTRY(exc_int3, false ), @@ -631,6 +658,7 @@ static bool __ref get_trap_addr(void **addr, unsigned int ist) { unsigned int nr; bool ist_okay = false; + bool found = false; /* * Replace trap handler addresses by Xen specific ones. @@ -645,6 +673,7 @@ static bool __ref get_trap_addr(void **addr, unsigned int ist) if (*addr == entry->orig) { *addr = entry->xen; ist_okay = entry->ist_okay; + found = true; break; } } @@ -655,9 +684,13 @@ static bool __ref get_trap_addr(void **addr, unsigned int ist) nr = (*addr - (void *)early_idt_handler_array[0]) / EARLY_IDT_HANDLER_SIZE; *addr = (void *)xen_early_idt_handler_array[nr]; + found = true; } - if (WARN_ON(ist != 0 && !ist_okay)) + if (!found) + *addr = (void *)xen_asm_exc_xen_unknown_trap; + + if (WARN_ON(found && ist != 0 && !ist_okay)) return false; return true; @@ -1002,8 +1035,6 @@ void __init xen_setup_vcpu_info_placement(void) */ if (xen_have_vcpu_info_placement) { pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); - pv_ops.irq.restore_fl = - __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); pv_ops.irq.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); pv_ops.irq.irq_enable = @@ -1040,7 +1071,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .read_pmc = xen_read_pmc, .iret = xen_iret, - .usergs_sysret64 = xen_sysret64, .load_tr_desc = paravirt_nop, .set_ldt = xen_set_ldt, @@ -1065,9 +1095,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { #endif .io_delay = xen_io_delay, - /* Xen takes care of %gs when switching to usermode for us */ - .swapgs = paravirt_nop, - .start_context_switch = paravirt_start_context_switch, .end_context_switch = xen_end_context_switch, }; diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 850c93f346c7..dfa091d79c2e 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -42,28 +42,6 @@ asmlinkage __visible unsigned long xen_save_fl(void) } PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl); -__visible void xen_restore_fl(unsigned long flags) -{ - struct vcpu_info *vcpu; - - /* convert from IF type flag */ - flags = !(flags & X86_EFLAGS_IF); - - /* See xen_irq_enable() for why preemption must be disabled. 
*/ - preempt_disable(); - vcpu = this_cpu_read(xen_vcpu); - vcpu->evtchn_upcall_mask = flags; - - if (flags == 0) { - barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) - xen_force_evtchn_callback(); - preempt_enable(); - } else - preempt_enable_no_resched(); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl); - asmlinkage __visible void xen_irq_disable(void) { /* There's a one instruction preempt window here. We need to @@ -118,7 +96,6 @@ static void xen_halt(void) static const struct pv_irq_ops xen_irq_ops __initconst = { .save_fl = PV_CALLEE_SAVE(xen_save_fl), - .restore_fl = PV_CALLEE_SAVE(xen_restore_fl), .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), .irq_enable = PV_CALLEE_SAVE(xen_irq_enable), diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c index f5e7db4f82ab..6ff3c887e0b9 100644 --- a/arch/x86/xen/smp_hvm.c +++ b/arch/x86/xen/smp_hvm.c @@ -33,9 +33,11 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) int cpu; native_smp_prepare_cpus(max_cpus); - WARN_ON(xen_smp_intr_init(0)); - xen_init_lock_cpu(0); + if (xen_have_vector_callback) { + WARN_ON(xen_smp_intr_init(0)); + xen_init_lock_cpu(0); + } for_each_possible_cpu(cpu) { if (cpu == 0) @@ -50,9 +52,11 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) static void xen_hvm_cpu_die(unsigned int cpu) { if (common_cpu_die(cpu) == 0) { - xen_smp_intr_free(cpu); - xen_uninit_lock_cpu(cpu); - xen_teardown_timer(cpu); + if (xen_have_vector_callback) { + xen_smp_intr_free(cpu); + xen_uninit_lock_cpu(cpu); + xen_teardown_timer(cpu); + } } } #else @@ -64,14 +68,19 @@ static void xen_hvm_cpu_die(unsigned int cpu) void __init xen_hvm_smp_init(void) { - if (!xen_have_vector_callback) + smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu; + smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; + smp_ops.smp_cpus_done = xen_smp_cpus_done; + smp_ops.cpu_die = xen_hvm_cpu_die; + + if (!xen_have_vector_callback) { +#ifdef CONFIG_PARAVIRT_SPINLOCKS + nopvspin = true; +#endif return; + } - smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; smp_ops.smp_send_reschedule = xen_smp_send_reschedule; - smp_ops.cpu_die = xen_hvm_cpu_die; smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; - smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu; - smp_ops.smp_cpus_done = xen_smp_cpus_done; } diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 1cb0e84b9161..02f31341e435 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -72,34 +72,6 @@ SYM_FUNC_START(xen_save_fl_direct) ret SYM_FUNC_END(xen_save_fl_direct) - -/* - * In principle the caller should be passing us a value return from - * xen_save_fl_direct, but for robustness sake we test only the - * X86_EFLAGS_IF flag rather than the whole byte. After setting the - * interrupt mask state, it checks for unmasked pending events and - * enters the hypervisor to get them delivered if so. - */ -SYM_FUNC_START(xen_restore_fl_direct) - FRAME_BEGIN - testw $X86_EFLAGS_IF, %di - setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask - /* - * Preempt here doesn't matter because that will deal with any - * pending interrupts. The pending check may end up being run - * on the wrong CPU, but that doesn't hurt. 
- */ - - /* check for unmasked and pending */ - cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending - jnz 1f - call check_events -1: - FRAME_END - ret -SYM_FUNC_END(xen_restore_fl_direct) - - /* * Force an event check by making a hypercall, but preserve regs * before making the call. @@ -161,7 +133,7 @@ xen_pv_trap asm_exc_overflow xen_pv_trap asm_exc_bounds xen_pv_trap asm_exc_invalid_op xen_pv_trap asm_exc_device_not_available -xen_pv_trap asm_exc_double_fault +xen_pv_trap asm_xenpv_exc_double_fault xen_pv_trap asm_exc_coproc_segment_overrun xen_pv_trap asm_exc_invalid_tss xen_pv_trap asm_exc_segment_not_present @@ -172,12 +144,13 @@ xen_pv_trap asm_exc_spurious_interrupt_bug xen_pv_trap asm_exc_coprocessor_error xen_pv_trap asm_exc_alignment_check #ifdef CONFIG_X86_MCE -xen_pv_trap asm_exc_machine_check +xen_pv_trap asm_xenpv_exc_machine_check #endif /* CONFIG_X86_MCE */ xen_pv_trap asm_exc_simd_coprocessor_error #ifdef CONFIG_IA32_EMULATION xen_pv_trap entry_INT80_compat #endif +xen_pv_trap asm_exc_xen_unknown_trap xen_pv_trap asm_exc_xen_hypervisor_callback __INIT @@ -214,26 +187,6 @@ SYM_CODE_START(xen_iret) jmp hypercall_iret SYM_CODE_END(xen_iret) -SYM_CODE_START(xen_sysret64) - /* - * We're already on the usermode stack at this point, but - * still with the kernel gs, so we can easily switch back. - * - * tss.sp2 is scratch space. - */ - movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - - pushq $__USER_DS - pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) - pushq %r11 - pushq $__USER_CS - pushq %rcx - - pushq $VGCF_in_syscall - jmp hypercall_iret -SYM_CODE_END(xen_sysret64) - /* * Xen handles syscall callbacks much like ordinary exceptions, which * means we have: diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9546c3384c75..8d7ec49a35fb 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -131,15 +131,12 @@ static inline void __init xen_efi_init(struct boot_params *boot_params) __visible void xen_irq_enable_direct(void); __visible void xen_irq_disable_direct(void); __visible unsigned long xen_save_fl_direct(void); -__visible void xen_restore_fl_direct(unsigned long); __visible unsigned long xen_read_cr2(void); __visible unsigned long xen_read_cr2_direct(void); /* These are not functions, and cannot be called normally */ __visible void xen_iret(void); -__visible void xen_sysret32(void); -__visible void xen_sysret64(void); extern int xen_panic_handler_init(void);
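
A note on the BPF_STX | BPF_ATOMIC hunk in arch/x86/net/bpf_jit_comp.c above: for BPF_{AND,OR,XOR} combined with BPF_FETCH there is no single x86 instruction that both applies the operation and hands back the old value, so the JIT falls back to a CMPXCHG retry loop. The stand-alone C sketch below is illustrative only, not kernel code; the function name fetch_and64 and the use of C11 atomics are assumptions of this sketch, chosen to mirror the semantics of the emitted loop for the AND case:

	#include <stdatomic.h>
	#include <stdint.h>

	/*
	 * What the emitted code computes for BPF_AND | BPF_FETCH on a
	 * 64-bit operand: apply the op on a local copy, try to CMPXCHG
	 * the result in, retry if another CPU raced us, and return the
	 * pre-modification value.
	 */
	static uint64_t fetch_and64(_Atomic uint64_t *addr, uint64_t src)
	{
		uint64_t old = atomic_load(addr);
		uint64_t new;

		do {
			new = old & src;	/* the ALU op, done locally */
			/* on failure, 'old' is reloaded with the current value */
		} while (!atomic_compare_exchange_weak(addr, &old, new));

		return old;			/* as BPF_FETCH requires */
	}

The JIT performs the same dance on registers: RAX holds the loaded old value (which is why R0 is saved to BPF_REG_AX around the loop), AUX_REG holds the locally computed result, and the JNE on the CMPXCHG's ZF forms the retry back-edge.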