diff options
author | Ingo Molnar <mingo@kernel.org> | 2023-10-05 11:05:51 +0300 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2023-10-05 11:05:51 +0300 |
commit | 3fc18b06b8f3408951b5e43548f22984412b0831 (patch) | |
tree | 5d509acc9a9010aae791cdba9e9f6a02e94b962c /arch/x86 | |
parent | a11e097504ac1889b35b6858f495565838325f88 (diff) | |
parent | 8a749fd1a8720d4619c91c8b6e7528c0a355c0aa (diff) | |
download | linux-3fc18b06b8f3408951b5e43548f22984412b0831.tar.xz |
Merge tag 'v6.6-rc4' into x86/entry, to pick up fixes
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86')
47 files changed, 463 insertions, 385 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c130bf3176fe..723fd7afef28 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1945,6 +1945,7 @@ config EFI select UCS2_STRING select EFI_RUNTIME_WRAPPERS select ARCH_USE_MEMREMAP_PROT + select EFI_RUNTIME_MAP if KEXEC_CORE help This enables the kernel to use EFI runtime services that are available (such as the EFI variable services). @@ -2020,7 +2021,6 @@ config EFI_MAX_FAKE_MEM config EFI_RUNTIME_MAP bool "Export EFI runtime maps to sysfs" if EXPERT depends on EFI - default KEXEC_CORE help Export EFI runtime memory regions to /sys/firmware/efi/runtime-map. That memory map is required by the 2nd kernel to set up EFI virtual diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index bcc956c17872..08f93b0401bb 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -59,6 +59,14 @@ static void *alloc_pgt_page(void *context) return NULL; } + /* Consumed more tables than expected? */ + if (pages->pgt_buf_offset == BOOT_PGT_SIZE_WARN) { + debug_putstr("pgt_buf running low in " __FILE__ "\n"); + debug_putstr("Need to raise BOOT_PGT_SIZE?\n"); + debug_putaddr(pages->pgt_buf_offset); + debug_putaddr(pages->pgt_buf_size); + } + entry = pages->pgt_buf + pages->pgt_buf_offset; pages->pgt_buf_offset += PAGE_SIZE; diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index a34e1a1adcf8..328990dd8632 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -305,7 +305,7 @@ static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs) inc_irq_stat(irq_hv_callback_count); - xen_hvm_evtchn_do_upcall(); + xen_evtchn_do_upcall(); set_irq_regs(old_regs); } diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index abadd5f23425..e24976593a29 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -534,8 +534,12 @@ static void amd_pmu_cpu_reset(int cpu) /* Clear enable bits i.e. PerfCntrGlobalCtl.PerfCntrEn */ wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0); - /* Clear overflow bits i.e. PerfCntrGLobalStatus.PerfCntrOvfl */ - wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, amd_pmu_global_cntr_mask); + /* + * Clear freeze and overflow bits i.e. PerfCntrGLobalStatus.LbrFreeze + * and PerfCntrGLobalStatus.PerfCntrOvfl + */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, + GLOBAL_STATUS_LBRS_FROZEN | amd_pmu_global_cntr_mask); } static int amd_pmu_cpu_prepare(int cpu) @@ -570,6 +574,7 @@ static void amd_pmu_cpu_starting(int cpu) int i, nb_id; cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; + amd_pmu_cpu_reset(cpu); if (!x86_pmu.amd_nb_constraints) return; @@ -591,8 +596,6 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; - - amd_pmu_cpu_reset(cpu); } static void amd_pmu_cpu_dead(int cpu) @@ -601,6 +604,7 @@ static void amd_pmu_cpu_dead(int cpu) kfree(cpuhw->lbr_sel); cpuhw->lbr_sel = NULL; + amd_pmu_cpu_reset(cpu); if (!x86_pmu.amd_nb_constraints) return; @@ -613,8 +617,6 @@ static void amd_pmu_cpu_dead(int cpu) cpuhw->amd_nb = NULL; } - - amd_pmu_cpu_reset(cpu); } static inline void amd_pmu_set_global_ctl(u64 ctl) @@ -884,7 +886,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) struct hw_perf_event *hwc; struct perf_event *event; int handled = 0, idx; - u64 status, mask; + u64 reserved, status, mask; bool pmu_enabled; /* @@ -909,6 +911,14 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) status &= ~GLOBAL_STATUS_LBRS_FROZEN; } + reserved = status & ~amd_pmu_global_cntr_mask; + if (reserved) + pr_warn_once("Reserved PerfCntrGlobalStatus bits are set (0x%llx), please consider updating microcode\n", + reserved); + + /* Clear any reserved bits set by buggy microcode */ + status &= amd_pmu_global_cntr_mask; + for (idx = 0; idx < x86_pmu.num_counters; idx++) { if (!test_bit(idx, cpuc->active_mask)) continue; diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 4ae14339cb8c..b3a7cfb0d99e 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -40,23 +40,40 @@ #ifdef CONFIG_X86_64 # define BOOT_STACK_SIZE 0x4000 +/* + * Used by decompressor's startup_32() to allocate page tables for identity + * mapping of the 4G of RAM in 4-level paging mode: + * - 1 level4 table; + * - 1 level3 table; + * - 4 level2 table that maps everything with 2M pages; + * + * The additional level5 table needed for 5-level paging is allocated from + * trampoline_32bit memory. + */ # define BOOT_INIT_PGT_SIZE (6*4096) -# ifdef CONFIG_RANDOMIZE_BASE + /* - * Assuming all cross the 512GB boundary: - * 1 page for level4 - * (2+2)*4 pages for kernel, param, cmd_line, and randomized kernel - * 2 pages for first 2M (video RAM: CONFIG_X86_VERBOSE_BOOTUP). - * Total is 19 pages. + * Total number of page tables kernel_add_identity_map() can allocate, + * including page tables consumed by startup_32(). + * + * Worst-case scenario: + * - 5-level paging needs 1 level5 table; + * - KASLR needs to map kernel, boot_params, cmdline and randomized kernel, + * assuming all of them cross 256T boundary: + * + 4*2 level4 table; + * + 4*2 level3 table; + * + 4*2 level2 table; + * - X86_VERBOSE_BOOTUP needs to map the first 2M (video RAM): + * + 1 level4 table; + * + 1 level3 table; + * + 1 level2 table; + * Total: 28 tables + * + * Add 4 spare table in case decompressor touches anything beyond what is + * accounted above. Warn if it happens. */ -# ifdef CONFIG_X86_VERBOSE_BOOTUP -# define BOOT_PGT_SIZE (19*4096) -# else /* !CONFIG_X86_VERBOSE_BOOTUP */ -# define BOOT_PGT_SIZE (17*4096) -# endif -# else /* !CONFIG_RANDOMIZE_BASE */ -# define BOOT_PGT_SIZE BOOT_INIT_PGT_SIZE -# endif +# define BOOT_PGT_SIZE_WARN (28*4096) +# define BOOT_PGT_SIZE (32*4096) #else /* !CONFIG_X86_64 */ # define BOOT_STACK_SIZE 0x1000 diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index b0994ae3bc23..c4555b269a1b 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -91,19 +91,6 @@ static inline void efi_fpu_end(void) #ifdef CONFIG_X86_32 #define EFI_X86_KERNEL_ALLOC_LIMIT (SZ_512M - 1) - -#define arch_efi_call_virt_setup() \ -({ \ - efi_fpu_begin(); \ - firmware_restrict_branch_speculation_start(); \ -}) - -#define arch_efi_call_virt_teardown() \ -({ \ - firmware_restrict_branch_speculation_end(); \ - efi_fpu_end(); \ -}) - #else /* !CONFIG_X86_32 */ #define EFI_X86_KERNEL_ALLOC_LIMIT EFI_ALLOC_LIMIT @@ -116,14 +103,6 @@ extern bool efi_disable_ibt_for_runtime; __efi_call(__VA_ARGS__); \ }) -#define arch_efi_call_virt_setup() \ -({ \ - efi_sync_low_kernel_mappings(); \ - efi_fpu_begin(); \ - firmware_restrict_branch_speculation_start(); \ - efi_enter_mm(); \ -}) - #undef arch_efi_call_virt #define arch_efi_call_virt(p, f, args...) ({ \ u64 ret, ibt = ibt_save(efi_disable_ibt_for_runtime); \ @@ -132,13 +111,6 @@ extern bool efi_disable_ibt_for_runtime; ret; \ }) -#define arch_efi_call_virt_teardown() \ -({ \ - efi_leave_mm(); \ - firmware_restrict_branch_speculation_end(); \ - efi_fpu_end(); \ -}) - #ifdef CONFIG_KASAN /* * CONFIG_KASAN may redefine memset to __memset. __memset function is present @@ -168,8 +140,8 @@ extern void efi_delete_dummy_variable(void); extern void efi_crash_gracefully_on_page_fault(unsigned long phys_addr); extern void efi_free_boot_services(void); -void efi_enter_mm(void); -void efi_leave_mm(void); +void arch_efi_call_virt_setup(void); +void arch_efi_call_virt_teardown(void); /* kexec external ABI */ struct efi_setup_data { diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1a4def36d5bb..17715cb8731d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1419,7 +1419,6 @@ struct kvm_arch { * the thread holds the MMU lock in write mode. */ spinlock_t tdp_mmu_pages_lock; - struct workqueue_struct *tdp_mmu_zap_wq; #endif /* CONFIG_X86_64 */ /* @@ -1835,7 +1834,7 @@ void kvm_mmu_vendor_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); -int kvm_mmu_init_vm(struct kvm *kvm); +void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 97a3de7892d3..571fe4d2d232 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -8,6 +8,14 @@ #undef notrace #define notrace __attribute__((no_instrument_function)) +#ifdef CONFIG_64BIT +/* + * The generic version tends to create spurious ENDBR instructions under + * certain conditions. + */ +#define _THIS_IP_ ({ unsigned long __here; asm ("lea 0(%%rip), %0" : "=r" (__here)); __here; }) +#endif + #ifdef CONFIG_X86_32 #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) #endif /* CONFIG_X86_32 */ @@ -97,6 +105,13 @@ CFI_POST_PADDING \ SYM_FUNC_END(__cfi_##name) +/* UML needs to be able to override memcpy() and friends for KASAN. */ +#ifdef CONFIG_UML +# define SYM_FUNC_ALIAS_MEMFUNC SYM_FUNC_ALIAS_WEAK +#else +# define SYM_FUNC_ALIAS_MEMFUNC SYM_FUNC_ALIAS +#endif + /* SYM_TYPED_FUNC_START -- use for indirectly called globals, w/ CFI type */ #define SYM_TYPED_FUNC_START(name) \ SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_F_ALIGN) \ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 416901d406f8..8dac45a2c7fc 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -186,8 +186,7 @@ do { \ #else #define deactivate_mm(tsk, mm) \ do { \ - if (!tsk->vfork_done) \ - shstk_free(tsk); \ + shstk_free(tsk); \ load_gs_index(0); \ loadsegment(fs, 0); \ } while (0) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 4acbcddddc29..772d03487520 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -9,13 +9,6 @@ struct paravirt_patch_site { u8 type; /* type of this instruction */ u8 len; /* length of original instruction */ }; - -/* Lazy mode for batching updates / context switch */ -enum paravirt_lazy_mode { - PARAVIRT_LAZY_NONE, - PARAVIRT_LAZY_MMU, - PARAVIRT_LAZY_CPU, -}; #endif #ifdef CONFIG_PARAVIRT @@ -549,14 +542,6 @@ int paravirt_disable_iospace(void); __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) -enum paravirt_lazy_mode paravirt_get_lazy_mode(void); -void paravirt_start_context_switch(struct task_struct *prev); -void paravirt_end_context_switch(struct task_struct *next); - -void paravirt_enter_lazy_mmu(void); -void paravirt_leave_lazy_mmu(void); -void paravirt_flush_lazy_mmu(void); - void _paravirt_nop(void); void paravirt_BUG(void); unsigned long paravirt_ret0(void); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index d6ad98ca1288..e02b179ec659 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -955,6 +955,14 @@ static inline int pte_same(pte_t a, pte_t b) return a.pte == b.pte; } +static inline pte_t pte_next_pfn(pte_t pte) +{ + if (__pte_needs_invert(pte_val(pte))) + return __pte(pte_val(pte) - (1UL << PFN_PTE_SHIFT)); + return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); +} +#define pte_next_pfn pte_next_pfn + static inline int pte_present(pte_t a) { return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 93cd28d6f317..6e30b27b1ebe 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -683,13 +683,11 @@ extern u16 get_llc_id(unsigned int cpu); #ifdef CONFIG_CPU_SUP_AMD extern u32 amd_get_nodes_per_socket(void); extern u32 amd_get_highest_perf(void); -extern bool cpu_has_ibpb_brtype_microcode(void); extern void amd_clear_divider(void); extern void amd_check_microcode(void); #else static inline u32 amd_get_nodes_per_socket(void) { return 0; } static inline u32 amd_get_highest_perf(void) { return 0; } -static inline bool cpu_has_ibpb_brtype_microcode(void) { return false; } static inline void amd_clear_divider(void) { } static inline void amd_check_microcode(void) { } #endif diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 5fc35f889cd1..7048dfacc04b 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -36,6 +36,7 @@ extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; +#include <asm/bug.h> #include <asm/processor.h> #define XEN_SIGNATURE "XenVMMXenVMM" @@ -63,4 +64,40 @@ void __init xen_pvh_init(struct boot_params *boot_params); void __init mem_map_via_hcall(struct boot_params *boot_params_p); #endif +/* Lazy mode for batching updates / context switch */ +enum xen_lazy_mode { + XEN_LAZY_NONE, + XEN_LAZY_MMU, + XEN_LAZY_CPU, +}; + +DECLARE_PER_CPU(enum xen_lazy_mode, xen_lazy_mode); +DECLARE_PER_CPU(unsigned int, xen_lazy_nesting); + +static inline void enter_lazy(enum xen_lazy_mode mode) +{ + enum xen_lazy_mode old_mode = this_cpu_read(xen_lazy_mode); + + if (mode == old_mode) { + this_cpu_inc(xen_lazy_nesting); + return; + } + + BUG_ON(old_mode != XEN_LAZY_NONE); + + this_cpu_write(xen_lazy_mode, mode); +} + +static inline void leave_lazy(enum xen_lazy_mode mode) +{ + BUG_ON(this_cpu_read(xen_lazy_mode) != mode); + + if (this_cpu_read(xen_lazy_nesting) == 0) + this_cpu_write(xen_lazy_mode, XEN_LAZY_NONE); + else + this_cpu_dec(xen_lazy_nesting); +} + +enum xen_lazy_mode xen_get_lazy_mode(void); + #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a5ead6a6d233..517ee01503be 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -720,13 +720,8 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { s32 *s; - /* - * Do not patch out the default return thunks if those needed are the - * ones generated by the compiler. - */ - if (cpu_feature_enabled(X86_FEATURE_RETHUNK) && - (x86_return_thunk == __x86_return_thunk)) - return; + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + static_call_force_reinit(); for (s = start; s < end; s++) { void *dest = NULL, *addr = (void *)s + *s; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index d9f5d7492f83..205cee567629 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -1533,7 +1533,7 @@ static void __init build_socket_tables(void) { struct uv_gam_range_entry *gre = uv_gre_table; int nums, numn, nump; - int cpu, i, lnid; + int i, lnid, apicid; int minsock = _min_socket; int maxsock = _max_socket; int minpnode = _min_pnode; @@ -1584,15 +1584,14 @@ static void __init build_socket_tables(void) /* Set socket -> node values: */ lnid = NUMA_NO_NODE; - for_each_possible_cpu(cpu) { - int nid = cpu_to_node(cpu); - int apicid, sockid; + for (apicid = 0; apicid < ARRAY_SIZE(__apicid_to_node); apicid++) { + int nid = __apicid_to_node[apicid]; + int sockid; - if (lnid == nid) + if ((nid == NUMA_NO_NODE) || (lnid == nid)) continue; lnid = nid; - apicid = per_cpu(x86_cpu_to_apicid, cpu); sockid = apicid >> uv_cpuid.socketid_shift; if (_socket_to_node[sockid - minsock] == SOCK_EMPTY) diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index c06bfc086565..faa9f2299848 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -272,7 +272,6 @@ void __init callthunks_patch_builtin_calls(void) pr_info("Setting up call depth tracking\n"); mutex_lock(&text_mutex); callthunks_setup(&cs, &builtin_coretext); - static_call_force_reinit(); thunks_initialized = true; mutex_unlock(&text_mutex); } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index dd8379d84445..03ef962a6992 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -766,6 +766,15 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_TOPOEXT)) smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1; + + if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) { + if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB)) + setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); + else if (c->x86 >= 0x19 && !wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) { + setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); + setup_force_cpu_cap(X86_FEATURE_SBPB); + } + } } static void init_amd_k8(struct cpuinfo_x86 *c) @@ -1301,25 +1310,6 @@ void amd_check_microcode(void) on_each_cpu(zenbleed_check_cpu, NULL, 1); } -bool cpu_has_ibpb_brtype_microcode(void) -{ - switch (boot_cpu_data.x86) { - /* Zen1/2 IBPB flushes branch type predictions too. */ - case 0x17: - return boot_cpu_has(X86_FEATURE_AMD_IBPB); - case 0x19: - /* Poke the MSR bit on Zen3/4 to check its presence. */ - if (!wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) { - setup_force_cpu_cap(X86_FEATURE_SBPB); - return true; - } else { - return false; - } - default: - return false; - } -} - /* * Issue a DIV 0/1 insn to clear any division data from previous DIV * operations. diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index f081d26616ac..10499bcd4e39 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -2404,27 +2404,16 @@ early_param("spec_rstack_overflow", srso_parse_cmdline); static void __init srso_select_mitigation(void) { - bool has_microcode; + bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE); if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off()) goto pred_cmd; - /* - * The first check is for the kernel running as a guest in order - * for guests to verify whether IBPB is a viable mitigation. - */ - has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) || cpu_has_ibpb_brtype_microcode(); if (!has_microcode) { pr_warn("IBPB-extending microcode not applied!\n"); pr_warn(SRSO_NOTICE); } else { /* - * Enable the synthetic (even if in a real CPUID leaf) - * flags for guests. - */ - setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); - - /* * Zen1/2 with SMT off aren't vulnerable after the right * IBPB microcode has been applied. */ @@ -2444,7 +2433,7 @@ static void __init srso_select_mitigation(void) switch (srso_cmd) { case SRSO_CMD_OFF: - return; + goto pred_cmd; case SRSO_CMD_MICROCODE: if (has_microcode) { @@ -2717,7 +2706,7 @@ static ssize_t srso_show_state(char *buf) return sysfs_emit(buf, "%s%s\n", srso_strings[srso_mitigation], - (cpu_has_ibpb_brtype_microcode() ? "" : ", no microcode")); + boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) ? "" : ", no microcode"); } static ssize_t gds_show_state(char *buf) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index afa755e06ae9..0a3ea787ac51 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1304,7 +1304,7 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_AMD(0x15, RETBLEED), VULNBL_AMD(0x16, RETBLEED), VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO), - VULNBL_HYGON(0x18, RETBLEED | SMT_RSB), + VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO), VULNBL_AMD(0x19, SRSO), {} }; diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 91fa70e51004..279148e72459 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -235,6 +235,21 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, return epc_page; } +/* + * Ensure the SECS page is not swapped out. Must be called with encl->lock + * to protect the enclave states including SECS and ensure the SECS page is + * not swapped out again while being used. + */ +static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl) +{ + struct sgx_epc_page *epc_page = encl->secs.epc_page; + + if (!epc_page) + epc_page = sgx_encl_eldu(&encl->secs, NULL); + + return epc_page; +} + static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, struct sgx_encl_page *entry) { @@ -248,11 +263,9 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, return entry; } - if (!(encl->secs.epc_page)) { - epc_page = sgx_encl_eldu(&encl->secs, NULL); - if (IS_ERR(epc_page)) - return ERR_CAST(epc_page); - } + epc_page = sgx_encl_load_secs(encl); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page); epc_page = sgx_encl_eldu(entry, encl->secs.epc_page); if (IS_ERR(epc_page)) @@ -339,6 +352,13 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, mutex_lock(&encl->lock); + epc_page = sgx_encl_load_secs(encl); + if (IS_ERR(epc_page)) { + if (PTR_ERR(epc_page) == -EBUSY) + vmret = VM_FAULT_NOPAGE; + goto err_out_unlock; + } + epc_page = sgx_alloc_epc_page(encl_page, false); if (IS_ERR(epc_page)) { if (PTR_ERR(epc_page) == -EBUSY) diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 3a43a2dee658..9c9faa1634fb 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -695,7 +695,6 @@ void kgdb_arch_exit(void) } /** - * * kgdb_skipexception - Bail out of KGDB when we've been triggered. * @exception: Exception vector number * @regs: Current &struct pt_regs. diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 975f98d5eee5..97f1436c1a20 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -143,66 +143,7 @@ int paravirt_disable_iospace(void) return request_resource(&ioport_resource, &reserve_ioports); } -static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; - -static inline void enter_lazy(enum paravirt_lazy_mode mode) -{ - BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); - - this_cpu_write(paravirt_lazy_mode, mode); -} - -static void leave_lazy(enum paravirt_lazy_mode mode) -{ - BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode); - - this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); -} - -void paravirt_enter_lazy_mmu(void) -{ - enter_lazy(PARAVIRT_LAZY_MMU); -} - -void paravirt_leave_lazy_mmu(void) -{ - leave_lazy(PARAVIRT_LAZY_MMU); -} - -void paravirt_flush_lazy_mmu(void) -{ - preempt_disable(); - - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { - arch_leave_lazy_mmu_mode(); - arch_enter_lazy_mmu_mode(); - } - - preempt_enable(); -} - #ifdef CONFIG_PARAVIRT_XXL -void paravirt_start_context_switch(struct task_struct *prev) -{ - BUG_ON(preemptible()); - - if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { - arch_leave_lazy_mmu_mode(); - set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); - } - enter_lazy(PARAVIRT_LAZY_CPU); -} - -void paravirt_end_context_switch(struct task_struct *next) -{ - BUG_ON(preemptible()); - - leave_lazy(PARAVIRT_LAZY_CPU); - - if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) - arch_enter_lazy_mmu_mode(); -} - static noinstr void pv_native_write_cr2(unsigned long val) { native_write_cr2(val); @@ -229,14 +170,6 @@ static noinstr void pv_native_safe_halt(void) } #endif -enum paravirt_lazy_mode paravirt_get_lazy_mode(void) -{ - if (in_interrupt()) - return PARAVIRT_LAZY_NONE; - - return this_cpu_read(paravirt_lazy_mode); -} - struct pv_info pv_info = { .name = "bare hardware", #ifdef CONFIG_PARAVIRT_XXL diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9f0909142a0a..b6f4e8399fca 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -257,13 +257,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP))) io_bitmap_share(p); - /* - * If copy_thread() if failing, don't leak the shadow stack possibly - * allocated in shstk_alloc_thread_stack() above. - */ - if (ret) - shstk_free(p); - return ret; } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b9145a63da77..b098b1fa2470 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -358,15 +358,11 @@ static void __init add_early_ima_buffer(u64 phys_addr) #if defined(CONFIG_HAVE_IMA_KEXEC) && !defined(CONFIG_OF_FLATTREE) int __init ima_free_kexec_buffer(void) { - int rc; - if (!ima_kexec_buffer_size) return -ENOENT; - rc = memblock_phys_free(ima_kexec_buffer_phys, - ima_kexec_buffer_size); - if (rc) - return rc; + memblock_free_late(ima_kexec_buffer_phys, + ima_kexec_buffer_size); ima_kexec_buffer_phys = 0; ima_kexec_buffer_size = 0; diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index fd689921a1db..59e15dd8d0f8 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -205,10 +205,21 @@ unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long cl return 0; /* - * For CLONE_VM, except vfork, the child needs a separate shadow + * For CLONE_VFORK the child will share the parents shadow stack. + * Make sure to clear the internal tracking of the thread shadow + * stack so the freeing logic run for child knows to leave it alone. + */ + if (clone_flags & CLONE_VFORK) { + shstk->base = 0; + shstk->size = 0; + return 0; + } + + /* + * For !CLONE_VM the child will use a copy of the parents shadow * stack. */ - if ((clone_flags & (CLONE_VFORK | CLONE_VM)) != CLONE_VM) + if (!(clone_flags & CLONE_VM)) return 0; size = adjust_shstk_size(stack_size); @@ -408,7 +419,25 @@ void shstk_free(struct task_struct *tsk) if (!tsk->mm || tsk->mm != current->mm) return; + /* + * If shstk->base is NULL, then this task is not managing its + * own shadow stack (CLONE_VFORK). So skip freeing it. + */ + if (!shstk->base) + return; + + /* + * shstk->base is NULL for CLONE_VFORK child tasks, and so is + * normal. But size = 0 on a shstk->base is not normal and + * indicated an attempt to free the thread shadow stack twice. + * Warn about it. + */ + if (WARN_ON(!shstk->size)) + return; + unmap_shadow_stack(shstk->base, shstk->size); + + shstk->size = 0; } static int wrss_control(bool enable) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 4e45ff44aa07..48e040618731 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -579,7 +579,6 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) } -#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_CLUSTER) || defined(CONFIG_SCHED_MC) static inline int x86_sched_itmt_flags(void) { return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0; @@ -603,7 +602,14 @@ static int x86_cluster_flags(void) return cpu_cluster_flags() | x86_sched_itmt_flags(); } #endif -#endif + +static int x86_die_flags(void) +{ + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) + return x86_sched_itmt_flags(); + + return 0; +} /* * Set if a package/die has multiple NUMA nodes inside. @@ -640,7 +646,7 @@ static void __init build_sched_topology(void) */ if (!x86_has_numa_in_package) { x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_cpu_mask, SD_INIT_NAME(DIE) + cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(DIE) }; } diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e1d011c67cc6..f7901cb4d2fa 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6167,20 +6167,15 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); } -int kvm_mmu_init_vm(struct kvm *kvm) +void kvm_mmu_init_vm(struct kvm *kvm) { - int r; - INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); - if (tdp_mmu_enabled) { - r = kvm_mmu_init_tdp_mmu(kvm); - if (r < 0) - return r; - } + if (tdp_mmu_enabled) + kvm_mmu_init_tdp_mmu(kvm); kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache; kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO; @@ -6189,8 +6184,6 @@ int kvm_mmu_init_vm(struct kvm *kvm) kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache; kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO; - - return 0; } static void mmu_free_vm_memory_caches(struct kvm *kvm) @@ -6246,7 +6239,6 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { bool flush; - int i; if (WARN_ON_ONCE(gfn_end <= gfn_start)) return; @@ -6257,11 +6249,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); - if (tdp_mmu_enabled) { - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start, - gfn_end, true, flush); - } + if (tdp_mmu_enabled) + flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start, gfn_end, flush); if (flush) kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index b102014e2c60..decc1f153669 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -58,7 +58,12 @@ struct kvm_mmu_page { bool tdp_mmu_page; bool unsync; - u8 mmu_valid_gen; + union { + u8 mmu_valid_gen; + + /* Only accessed under slots_lock. */ + bool tdp_mmu_scheduled_root_to_zap; + }; /* * The shadow page can't be replaced by an equivalent huge page @@ -100,13 +105,7 @@ struct kvm_mmu_page { struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ tdp_ptep_t ptep; }; - union { - DECLARE_BITMAP(unsync_child_bitmap, 512); - struct { - struct work_struct tdp_mmu_async_work; - void *tdp_mmu_async_data; - }; - }; + DECLARE_BITMAP(unsync_child_bitmap, 512); /* * Tracks shadow pages that, if zapped, would allow KVM to create an NX diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 6c63f2d1675f..6cd4dd631a2f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -12,18 +12,10 @@ #include <trace/events/kvm.h> /* Initializes the TDP MMU for the VM, if enabled. */ -int kvm_mmu_init_tdp_mmu(struct kvm *kvm) +void kvm_mmu_init_tdp_mmu(struct kvm *kvm) { - struct workqueue_struct *wq; - - wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0); - if (!wq) - return -ENOMEM; - INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); - kvm->arch.tdp_mmu_zap_wq = wq; - return 1; } /* Arbitrarily returns true so that this may be used in if statements. */ @@ -46,20 +38,15 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) * ultimately frees all roots. */ kvm_tdp_mmu_invalidate_all_roots(kvm); - - /* - * Destroying a workqueue also first flushes the workqueue, i.e. no - * need to invoke kvm_tdp_mmu_zap_invalidated_roots(). - */ - destroy_workqueue(kvm->arch.tdp_mmu_zap_wq); + kvm_tdp_mmu_zap_invalidated_roots(kvm); WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); /* * Ensure that all the outstanding RCU callbacks to free shadow pages - * can run before the VM is torn down. Work items on tdp_mmu_zap_wq - * can call kvm_tdp_mmu_put_root and create new callbacks. + * can run before the VM is torn down. Putting the last reference to + * zapped roots will create new callbacks. */ rcu_barrier(); } @@ -86,46 +73,6 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) tdp_mmu_free_sp(sp); } -static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, - bool shared); - -static void tdp_mmu_zap_root_work(struct work_struct *work) -{ - struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page, - tdp_mmu_async_work); - struct kvm *kvm = root->tdp_mmu_async_data; - - read_lock(&kvm->mmu_lock); - - /* - * A TLB flush is not necessary as KVM performs a local TLB flush when - * allocating a new root (see kvm_mmu_load()), and when migrating vCPU - * to a different pCPU. Note, the local TLB flush on reuse also - * invalidates any paging-structure-cache entries, i.e. TLB entries for - * intermediate paging structures, that may be zapped, as such entries - * are associated with the ASID on both VMX and SVM. - */ - tdp_mmu_zap_root(kvm, root, true); - - /* - * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for - * avoiding an infinite loop. By design, the root is reachable while - * it's being asynchronously zapped, thus a different task can put its - * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an - * asynchronously zapped root is unavoidable. - */ - kvm_tdp_mmu_put_root(kvm, root, true); - - read_unlock(&kvm->mmu_lock); -} - -static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root) -{ - root->tdp_mmu_async_data = kvm; - INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work); - queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work); -} - void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, bool shared) { @@ -211,8 +158,12 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true) -#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ - __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false) +#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _shared) \ + for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, false); \ + _root; \ + _root = tdp_mmu_next_root(_kvm, _root, _shared, false)) \ + if (!kvm_lockdep_assert_mmu_lock_held(_kvm, _shared)) { \ + } else /* * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write, @@ -292,7 +243,7 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) * by a memslot update or by the destruction of the VM. Initialize the * refcount to two; one reference for the vCPU, and one reference for * the TDP MMU itself, which is held until the root is invalidated and - * is ultimately put by tdp_mmu_zap_root_work(). + * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots(). */ refcount_set(&root->tdp_mmu_root_count, 2); @@ -877,13 +828,12 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or * more SPTEs were zapped since the MMU lock was last acquired. */ -bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, - bool can_yield, bool flush) +bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush) { struct kvm_mmu_page *root; - for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) - flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush); + for_each_tdp_mmu_root_yield_safe(kvm, root, false) + flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush); return flush; } @@ -891,7 +841,6 @@ bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, void kvm_tdp_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *root; - int i; /* * Zap all roots, including invalid roots, as all SPTEs must be dropped @@ -905,10 +854,8 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm) * is being destroyed or the userspace VMM has exited. In both cases, * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request. */ - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - for_each_tdp_mmu_root_yield_safe(kvm, root, i) - tdp_mmu_zap_root(kvm, root, false); - } + for_each_tdp_mmu_root_yield_safe(kvm, root, false) + tdp_mmu_zap_root(kvm, root, false); } /* @@ -917,18 +864,47 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm) */ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) { - flush_workqueue(kvm->arch.tdp_mmu_zap_wq); + struct kvm_mmu_page *root; + + read_lock(&kvm->mmu_lock); + + for_each_tdp_mmu_root_yield_safe(kvm, root, true) { + if (!root->tdp_mmu_scheduled_root_to_zap) + continue; + + root->tdp_mmu_scheduled_root_to_zap = false; + KVM_BUG_ON(!root->role.invalid, kvm); + + /* + * A TLB flush is not necessary as KVM performs a local TLB + * flush when allocating a new root (see kvm_mmu_load()), and + * when migrating a vCPU to a different pCPU. Note, the local + * TLB flush on reuse also invalidates paging-structure-cache + * entries, i.e. TLB entries for intermediate paging structures, + * that may be zapped, as such entries are associated with the + * ASID on both VMX and SVM. + */ + tdp_mmu_zap_root(kvm, root, true); + + /* + * The referenced needs to be put *after* zapping the root, as + * the root must be reachable by mmu_notifiers while it's being + * zapped + */ + kvm_tdp_mmu_put_root(kvm, root, true); + } + + read_unlock(&kvm->mmu_lock); } /* * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that * is about to be zapped, e.g. in response to a memslots update. The actual - * zapping is performed asynchronously. Using a separate workqueue makes it - * easy to ensure that the destruction is performed before the "fast zap" - * completes, without keeping a separate list of invalidated roots; the list is - * effectively the list of work items in the workqueue. + * zapping is done separately so that it happens with mmu_lock with read, + * whereas invalidating roots must be done with mmu_lock held for write (unless + * the VM is being destroyed). * - * Note, the asynchronous worker is gifted the TDP MMU's reference. + * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference. * See kvm_tdp_mmu_get_vcpu_root_hpa(). */ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) @@ -953,19 +929,20 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) /* * As above, mmu_lock isn't held when destroying the VM! There can't * be other references to @kvm, i.e. nothing else can invalidate roots - * or be consuming roots, but walking the list of roots does need to be - * guarded against roots being deleted by the asynchronous zap worker. + * or get/put references to roots. */ - rcu_read_lock(); - - list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) { + list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { + /* + * Note, invalid roots can outlive a memslot update! Invalid + * roots must be *zapped* before the memslot update completes, + * but a different task can acquire a reference and keep the + * root alive after its been zapped. + */ if (!root->role.invalid) { + root->tdp_mmu_scheduled_root_to_zap = true; root->role.invalid = true; - tdp_mmu_schedule_zap_root(kvm, root); } } - - rcu_read_unlock(); } /* @@ -1146,8 +1123,13 @@ retry: bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool flush) { - return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start, - range->end, range->may_block, flush); + struct kvm_mmu_page *root; + + __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false, false) + flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end, + range->may_block, flush); + + return flush; } typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 0a63b1afabd3..733a3aef3a96 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -7,7 +7,7 @@ #include "spte.h" -int kvm_mmu_init_tdp_mmu(struct kvm *kvm); +void kvm_mmu_init_tdp_mmu(struct kvm *kvm); void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); @@ -20,8 +20,7 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, bool shared); -bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, - gfn_t end, bool can_yield, bool flush); +bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush); bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp); void kvm_tdp_mmu_zap_all(struct kvm *kvm); void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index b9a0a939d59f..4900c078045a 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2962,6 +2962,32 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) count, in); } +static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + + if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { + bool v_tsc_aux = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) || + guest_cpuid_has(vcpu, X86_FEATURE_RDPID); + + set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux); + } +} + +void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_cpuid_entry2 *best; + + /* For sev guests, the memory encryption bit is not reserved in CR3. */ + best = kvm_find_cpuid_entry(vcpu, 0x8000001F); + if (best) + vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); + + if (sev_es_guest(svm->vcpu.kvm)) + sev_es_vcpu_after_set_cpuid(svm); +} + static void sev_es_init_vmcb(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb01.ptr; @@ -3024,14 +3050,6 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); - - if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && - (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP) || - guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDPID))) { - set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, 1, 1); - if (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP)) - svm_clr_intercept(svm, INTERCEPT_RDTSCP); - } } void sev_init_vmcb(struct vcpu_svm *svm) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index f283eb47f6ac..9507df93f410 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -683,6 +683,21 @@ static int svm_hardware_enable(void) amd_pmu_enable_virt(); + /* + * If TSC_AUX virtualization is supported, TSC_AUX becomes a swap type + * "B" field (see sev_es_prepare_switch_to_guest()) for SEV-ES guests. + * Since Linux does not change the value of TSC_AUX once set, prime the + * TSC_AUX field now to avoid a RDMSR on every vCPU run. + */ + if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { + struct sev_es_save_area *hostsa; + u32 msr_hi; + + hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400); + + rdmsr(MSR_TSC_AUX, hostsa->tsc_aux, msr_hi); + } + return 0; } @@ -1532,7 +1547,14 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) if (tsc_scaling) __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); - if (likely(tsc_aux_uret_slot >= 0)) + /* + * TSC_AUX is always virtualized for SEV-ES guests when the feature is + * available. The user return MSR support is not required in this case + * because TSC_AUX is restored on #VMEXIT from the host save area + * (which has been initialized in svm_hardware_enable()). + */ + if (likely(tsc_aux_uret_slot >= 0) && + (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull); svm->guest_state_loaded = true; @@ -3087,6 +3109,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_TSC_AUX: /* + * TSC_AUX is always virtualized for SEV-ES guests when the + * feature is available. The user return MSR support is not + * required in this case because TSC_AUX is restored on #VMEXIT + * from the host save area (which has been initialized in + * svm_hardware_enable()). + */ + if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm)) + break; + + /* * TSC_AUX is usually changed only during boot and never read * directly. Intercept TSC_AUX instead of exposing it to the * guest via direct_access_msrs, and switch it via user return. @@ -4284,7 +4316,6 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct kvm_cpuid_entry2 *best; /* * SVM doesn't provide a way to disable just XSAVES in the guest, KVM @@ -4328,12 +4359,8 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0, !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)); - /* For sev guests, the memory encryption bit is not reserved in CR3. */ - if (sev_guest(vcpu->kvm)) { - best = kvm_find_cpuid_entry(vcpu, 0x8000001F); - if (best) - vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); - } + if (sev_guest(vcpu->kvm)) + sev_vcpu_after_set_cpuid(svm); init_vmcb_after_set_cpuid(vcpu); } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index f41253958357..be67ab7fdd10 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -684,6 +684,7 @@ void __init sev_hardware_setup(void); void sev_hardware_unsetup(void); int sev_cpu_init(struct svm_cpu_data *sd); void sev_init_vmcb(struct vcpu_svm *svm); +void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm); void sev_free_vcpu(struct kvm_vcpu *vcpu); int sev_handle_vmgexit(struct kvm_vcpu *vcpu); int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6c9c81e82e65..9f18b06bbda6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12308,9 +12308,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (ret) goto out; - ret = kvm_mmu_init_vm(kvm); - if (ret) - goto out_page_track; + kvm_mmu_init_vm(kvm); ret = static_call(kvm_x86_vm_init)(kvm); if (ret) @@ -12355,7 +12353,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) out_uninit_mmu: kvm_mmu_uninit_vm(kvm); -out_page_track: kvm_page_track_cleanup(kvm); out: return ret; diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 8f95fb267caa..76697df8dfd5 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -40,7 +40,7 @@ SYM_TYPED_FUNC_START(__memcpy) SYM_FUNC_END(__memcpy) EXPORT_SYMBOL(__memcpy) -SYM_FUNC_ALIAS(memcpy, __memcpy) +SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy) EXPORT_SYMBOL(memcpy) SYM_FUNC_START_LOCAL(memcpy_orig) diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 0559b206fb11..ccdf3a597045 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -212,5 +212,5 @@ SYM_FUNC_START(__memmove) SYM_FUNC_END(__memmove) EXPORT_SYMBOL(__memmove) -SYM_FUNC_ALIAS(memmove, __memmove) +SYM_FUNC_ALIAS_MEMFUNC(memmove, __memmove) EXPORT_SYMBOL(memmove) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 7c59a704c458..3d818b849ec6 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -40,7 +40,7 @@ SYM_FUNC_START(__memset) SYM_FUNC_END(__memset) EXPORT_SYMBOL(__memset) -SYM_FUNC_ALIAS(memset, __memset) +SYM_FUNC_ALIAS_MEMFUNC(memset, __memset) EXPORT_SYMBOL(memset) SYM_FUNC_START_LOCAL(memset_orig) diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index 1451e0c4ae22..235bbda6fc82 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -56,7 +56,6 @@ SYM_FUNC_END(__put_user_1) EXPORT_SYMBOL(__put_user_1) SYM_FUNC_START(__put_user_nocheck_1) - ENDBR ASM_STAC 2: movb %al,(%_ASM_CX) xor %ecx,%ecx @@ -76,7 +75,6 @@ SYM_FUNC_END(__put_user_2) EXPORT_SYMBOL(__put_user_2) SYM_FUNC_START(__put_user_nocheck_2) - ENDBR ASM_STAC 4: movw %ax,(%_ASM_CX) xor %ecx,%ecx @@ -96,7 +94,6 @@ SYM_FUNC_END(__put_user_4) EXPORT_SYMBOL(__put_user_4) SYM_FUNC_START(__put_user_nocheck_4) - ENDBR ASM_STAC 6: movl %eax,(%_ASM_CX) xor %ecx,%ecx @@ -119,7 +116,6 @@ SYM_FUNC_END(__put_user_8) EXPORT_SYMBOL(__put_user_8) SYM_FUNC_START(__put_user_nocheck_8) - ENDBR ASM_STAC 9: mov %_ASM_AX,(%_ASM_CX) #ifdef CONFIG_X86_32 diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index e06a199423c0..b2cc7b4552a1 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -140,3 +140,15 @@ void __init efi_runtime_update_mappings(void) } } } + +void arch_efi_call_virt_setup(void) +{ + efi_fpu_begin(); + firmware_restrict_branch_speculation_start(); +} + +void arch_efi_call_virt_teardown(void) +{ + firmware_restrict_branch_speculation_end(); + efi_fpu_end(); +} diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 77f7ac3668cb..91d31ac422d6 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -474,19 +474,34 @@ void __init efi_dump_pagetable(void) * can not change under us. * It should be ensured that there are no concurrent calls to this function. */ -void efi_enter_mm(void) +static void efi_enter_mm(void) { efi_prev_mm = current->active_mm; current->active_mm = &efi_mm; switch_mm(efi_prev_mm, &efi_mm, NULL); } -void efi_leave_mm(void) +static void efi_leave_mm(void) { current->active_mm = efi_prev_mm; switch_mm(&efi_mm, efi_prev_mm, NULL); } +void arch_efi_call_virt_setup(void) +{ + efi_sync_low_kernel_mappings(); + efi_fpu_begin(); + firmware_restrict_branch_speculation_start(); + efi_enter_mm(); +} + +void arch_efi_call_virt_teardown(void) +{ + efi_leave_mm(); + firmware_restrict_branch_speculation_end(); + efi_fpu_end(); +} + static DEFINE_SPINLOCK(efi_runtime_lock); /* diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index c2a29be35c01..08aa0f25f12a 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -19,6 +19,10 @@ CFLAGS_sha256.o := -D__DISABLE_EXPORTS -D__NO_FORTIFY # optimization flags. KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%,$(KBUILD_CFLAGS)) +# When LTO is enabled, llvm emits many text sections, which is not supported +# by kexec. Remove -flto=* flags. +KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS)) + # When linking purgatory.ro with -r unresolved symbols are not checked, # also link a purgatory.chk binary without -r to check for unresolved symbols. PURGATORY_LDFLAGS := -e purgatory_start -z nodefaultlib diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index 863d0d6b3edc..7250d0e0e1a9 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -138,7 +138,7 @@ void __init xen_efi_init(struct boot_params *boot_params) if (efi_systab_xen == NULL) return; - strncpy((char *)&boot_params->efi_info.efi_loader_signature, "Xen", + strscpy((char *)&boot_params->efi_info.efi_loader_signature, "Xen", sizeof(boot_params->efi_info.efi_loader_signature)); boot_params->efi_info.efi_systab = (__u32)__pa(efi_systab_xen); boot_params->efi_info.efi_systab_hi = (__u32)(__pa(efi_systab_xen) >> 32); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b8db2148c07d..0337392a3121 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -32,7 +32,7 @@ EXPORT_SYMBOL_GPL(hypercall_page); * &HYPERVISOR_shared_info->vcpu_info[cpu]. See xen_hvm_init_shared_info * and xen_vcpu_setup for details. By default it points to share_info->vcpu_info * but during boot it is switched to point to xen_vcpu_info. - * The pointer is used in __xen_evtchn_do_upcall to acknowledge pending events. + * The pointer is used in xen_evtchn_do_upcall to acknowledge pending events. */ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 9a192f51f1b0..3f8c34707c50 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -136,7 +136,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_xen_hvm_callback) inc_irq_stat(irq_hv_callback_count); - xen_hvm_evtchn_do_upcall(); + xen_evtchn_do_upcall(); set_irq_regs(old_regs); } diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 49352fad7d1d..bbbfdd495ebd 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -101,6 +101,17 @@ struct tls_descs { struct desc_struct desc[3]; }; +DEFINE_PER_CPU(enum xen_lazy_mode, xen_lazy_mode) = XEN_LAZY_NONE; +DEFINE_PER_CPU(unsigned int, xen_lazy_nesting); + +enum xen_lazy_mode xen_get_lazy_mode(void) +{ + if (in_interrupt()) + return XEN_LAZY_NONE; + + return this_cpu_read(xen_lazy_mode); +} + /* * Updating the 3 TLS descriptors in the GDT on every task switch is * surprisingly expensive so we avoid updating them if they haven't @@ -362,10 +373,25 @@ static noinstr unsigned long xen_get_debugreg(int reg) return HYPERVISOR_get_debugreg(reg); } +static void xen_start_context_switch(struct task_struct *prev) +{ + BUG_ON(preemptible()); + + if (this_cpu_read(xen_lazy_mode) == XEN_LAZY_MMU) { + arch_leave_lazy_mmu_mode(); + set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); + } + enter_lazy(XEN_LAZY_CPU); +} + static void xen_end_context_switch(struct task_struct *next) { + BUG_ON(preemptible()); + xen_mc_flush(); - paravirt_end_context_switch(next); + leave_lazy(XEN_LAZY_CPU); + if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) + arch_enter_lazy_mmu_mode(); } static unsigned long xen_store_tr(void) @@ -472,7 +498,7 @@ static void xen_set_ldt(const void *addr, unsigned entries) MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - xen_mc_issue(PARAVIRT_LAZY_CPU); + xen_mc_issue(XEN_LAZY_CPU); } static void xen_load_gdt(const struct desc_ptr *dtr) @@ -568,7 +594,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) * exception between the new %fs descriptor being loaded and * %fs being effectively cleared at __switch_to(). */ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) + if (xen_get_lazy_mode() == XEN_LAZY_CPU) loadsegment(fs, 0); xen_mc_batch(); @@ -577,7 +603,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) load_TLS_descriptor(t, cpu, 1); load_TLS_descriptor(t, cpu, 2); - xen_mc_issue(PARAVIRT_LAZY_CPU); + xen_mc_issue(XEN_LAZY_CPU); } static void xen_load_gs_index(unsigned int idx) @@ -909,7 +935,7 @@ static void xen_load_sp0(unsigned long sp0) mcs = xen_mc_entry(0); MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0); - xen_mc_issue(PARAVIRT_LAZY_CPU); + xen_mc_issue(XEN_LAZY_CPU); this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); } @@ -973,7 +999,7 @@ static void xen_write_cr0(unsigned long cr0) MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0); - xen_mc_issue(PARAVIRT_LAZY_CPU); + xen_mc_issue(XEN_LAZY_CPU); } static void xen_write_cr4(unsigned long cr4) @@ -1156,7 +1182,7 @@ static const typeof(pv_ops) xen_cpu_ops __initconst = { #endif .io_delay = xen_io_delay, - .start_context_switch = paravirt_start_context_switch, + .start_context_switch = xen_start_context_switch, .end_context_switch = xen_end_context_switch, }, }; diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 1652c39e3dfb..b6830554ff69 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -236,7 +236,7 @@ static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) u.val = pmd_val_ma(val); xen_extend_mmu_update(&u); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); preempt_enable(); } @@ -270,7 +270,7 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) { struct mmu_update u; - if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) + if (xen_get_lazy_mode() != XEN_LAZY_MMU) return false; xen_mc_batch(); @@ -279,7 +279,7 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) u.val = pte_val_ma(pteval); xen_extend_mmu_update(&u); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); return true; } @@ -325,7 +325,7 @@ void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, u.val = pte_val_ma(pte); xen_extend_mmu_update(&u); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); } /* Assume pteval_t is equivalent to all the other *val_t types. */ @@ -419,7 +419,7 @@ static void xen_set_pud_hyper(pud_t *ptr, pud_t val) u.val = pud_val_ma(val); xen_extend_mmu_update(&u); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); preempt_enable(); } @@ -499,7 +499,7 @@ static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val) __xen_set_p4d_hyper(ptr, val); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); preempt_enable(); } @@ -531,7 +531,7 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val) if (user_ptr) __xen_set_p4d_hyper((p4d_t *)user_ptr, val); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); } #if CONFIG_PGTABLE_LEVELS >= 5 @@ -1245,7 +1245,7 @@ static noinline void xen_flush_tlb(void) op->cmd = MMUEXT_TLB_FLUSH_LOCAL; MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); preempt_enable(); } @@ -1265,7 +1265,7 @@ static void xen_flush_tlb_one_user(unsigned long addr) op->arg1.linear_addr = addr & PAGE_MASK; MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); preempt_enable(); } @@ -1302,7 +1302,7 @@ static void xen_flush_tlb_multi(const struct cpumask *cpus, MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); } static unsigned long xen_read_cr3(void) @@ -1361,7 +1361,7 @@ static void xen_write_cr3(unsigned long cr3) else __xen_write_cr3(false, 0); - xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ + xen_mc_issue(XEN_LAZY_CPU); /* interrupts restored */ } /* @@ -1396,7 +1396,7 @@ static void __init xen_write_cr3_init(unsigned long cr3) __xen_write_cr3(true, cr3); - xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ + xen_mc_issue(XEN_LAZY_CPU); /* interrupts restored */ } static int xen_pgd_alloc(struct mm_struct *mm) @@ -1557,7 +1557,7 @@ static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS && !pinned) __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); } } @@ -1587,7 +1587,7 @@ static inline void xen_release_ptpage(unsigned long pfn, unsigned level) __set_pfn_prot(pfn, PAGE_KERNEL); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); ClearPagePinned(page); } @@ -1804,7 +1804,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) */ xen_mc_batch(); __xen_write_cr3(true, __pa(init_top_pgt)); - xen_mc_issue(PARAVIRT_LAZY_CPU); + xen_mc_issue(XEN_LAZY_CPU); /* We can't that easily rip out L3 and L2, as the Xen pagetables are * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for @@ -2083,6 +2083,23 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #endif } +static void xen_enter_lazy_mmu(void) +{ + enter_lazy(XEN_LAZY_MMU); +} + +static void xen_flush_lazy_mmu(void) +{ + preempt_disable(); + + if (xen_get_lazy_mode() == XEN_LAZY_MMU) { + arch_leave_lazy_mmu_mode(); + arch_enter_lazy_mmu_mode(); + } + + preempt_enable(); +} + static void __init xen_post_allocator_init(void) { pv_ops.mmu.set_pte = xen_set_pte; @@ -2107,7 +2124,7 @@ static void xen_leave_lazy_mmu(void) { preempt_disable(); xen_mc_flush(); - paravirt_leave_lazy_mmu(); + leave_lazy(XEN_LAZY_MMU); preempt_enable(); } @@ -2166,9 +2183,9 @@ static const typeof(pv_ops) xen_mmu_ops __initconst = { .exit_mmap = xen_exit_mmap, .lazy_mode = { - .enter = paravirt_enter_lazy_mmu, + .enter = xen_enter_lazy_mmu, .leave = xen_leave_lazy_mmu, - .flush = paravirt_flush_lazy_mmu, + .flush = xen_flush_lazy_mmu, }, .set_fixmap = xen_set_fixmap, @@ -2385,7 +2402,7 @@ static noinline void xen_flush_tlb_all(void) op->cmd = MMUEXT_TLB_FLUSH_ALL; MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); preempt_enable(); } diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h index 1c51b2c87f30..c3867b585e0d 100644 --- a/arch/x86/xen/multicalls.h +++ b/arch/x86/xen/multicalls.h @@ -26,7 +26,7 @@ static inline void xen_mc_batch(void) /* need to disable interrupts until this entry is complete */ local_irq_save(flags); - trace_xen_mc_batch(paravirt_get_lazy_mode()); + trace_xen_mc_batch(xen_get_lazy_mode()); __this_cpu_write(xen_mc_irq_flags, flags); } @@ -44,7 +44,7 @@ static inline void xen_mc_issue(unsigned mode) { trace_xen_mc_issue(mode); - if ((paravirt_get_lazy_mode() & mode) == 0) + if ((xen_get_lazy_mode() & mode) == 0) xen_mc_flush(); /* restore flags saved in xen_mc_batch */ |