diff options
Diffstat (limited to 'arch/x86/kernel')
51 files changed, 2634 insertions, 865 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 8f4e8fa6ed75..2ff3e600f426 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -21,6 +21,7 @@ CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg CFLAGS_REMOVE_head64.o = -pg CFLAGS_REMOVE_sev.o = -pg +CFLAGS_REMOVE_cc_platform.o = -pg endif KASAN_SANITIZE_head$(BITS).o := n @@ -29,6 +30,7 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n KASAN_SANITIZE_stacktrace.o := n KASAN_SANITIZE_paravirt.o := n KASAN_SANITIZE_sev.o := n +KASAN_SANITIZE_cc_platform.o := n # With some compiler versions the generated code results in boot hangs, caused # by several compilation units. To be safe, disable all instrumentation. @@ -47,6 +49,7 @@ endif KCOV_INSTRUMENT := n CFLAGS_head$(BITS).o += -fno-stack-protector +CFLAGS_cc_platform.o += -fno-stack-protector CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace @@ -147,6 +150,9 @@ obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o + +obj-$(CONFIG_ARCH_HAS_CC_PLATFORM) += cc_platform.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index e9da3dc71254..23fb4d51a5da 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -29,6 +29,7 @@ #include <asm/io.h> #include <asm/fixmap.h> #include <asm/paravirt.h> +#include <asm/asm-prototypes.h> int __read_mostly alternatives_patched; @@ -113,6 +114,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) } } +extern s32 __retpoline_sites[], __retpoline_sites_end[]; extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; void text_poke_early(void *addr, const void *opcode, size_t len); @@ -221,7 +223,7 @@ static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off) * "noinline" to cause control flow change and thus invalidate I$ and * cause refetch after modification. */ -static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) +static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) { struct insn insn; int i = 0; @@ -239,11 +241,11 @@ static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *ins * optimized. */ if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) - i += optimize_nops_range(instr, a->instrlen, i); + i += optimize_nops_range(instr, len, i); else i += insn.length; - if (i >= a->instrlen) + if (i >= len) return; } } @@ -331,10 +333,185 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, text_poke_early(instr, insn_buff, insn_buff_sz); next: - optimize_nops(a, instr); + optimize_nops(instr, a->instrlen); } } +#if defined(CONFIG_RETPOLINE) && defined(CONFIG_STACK_VALIDATION) + +/* + * CALL/JMP *%\reg + */ +static int emit_indirect(int op, int reg, u8 *bytes) +{ + int i = 0; + u8 modrm; + + switch (op) { + case CALL_INSN_OPCODE: + modrm = 0x10; /* Reg = 2; CALL r/m */ + break; + + case JMP32_INSN_OPCODE: + modrm = 0x20; /* Reg = 4; JMP r/m */ + break; + + default: + WARN_ON_ONCE(1); + return -1; + } + + if (reg >= 8) { + bytes[i++] = 0x41; /* REX.B prefix */ + reg -= 8; + } + + modrm |= 0xc0; /* Mod = 3 */ + modrm += reg; + + bytes[i++] = 0xff; /* opcode */ + bytes[i++] = modrm; + + return i; +} + +/* + * Rewrite the compiler generated retpoline thunk calls. + * + * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate + * indirect instructions, avoiding the extra indirection. + * + * For example, convert: + * + * CALL __x86_indirect_thunk_\reg + * + * into: + * + * CALL *%\reg + * + * It also tries to inline spectre_v2=retpoline,amd when size permits. + */ +static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) +{ + retpoline_thunk_t *target; + int reg, ret, i = 0; + u8 op, cc; + + target = addr + insn->length + insn->immediate.value; + reg = target - __x86_indirect_thunk_array; + + if (WARN_ON_ONCE(reg & ~0xf)) + return -1; + + /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */ + BUG_ON(reg == 4); + + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && + !cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) + return -1; + + op = insn->opcode.bytes[0]; + + /* + * Convert: + * + * Jcc.d32 __x86_indirect_thunk_\reg + * + * into: + * + * Jncc.d8 1f + * [ LFENCE ] + * JMP *%\reg + * [ NOP ] + * 1: + */ + /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ + if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) { + cc = insn->opcode.bytes[1] & 0xf; + cc ^= 1; /* invert condition */ + + bytes[i++] = 0x70 + cc; /* Jcc.d8 */ + bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */ + + /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */ + op = JMP32_INSN_OPCODE; + } + + /* + * For RETPOLINE_AMD: prepend the indirect CALL/JMP with an LFENCE. + */ + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) { + bytes[i++] = 0x0f; + bytes[i++] = 0xae; + bytes[i++] = 0xe8; /* LFENCE */ + } + + ret = emit_indirect(op, reg, bytes + i); + if (ret < 0) + return ret; + i += ret; + + for (; i < insn->length;) + bytes[i++] = BYTES_NOP1; + + return i; +} + +/* + * Generated by 'objtool --retpoline'. + */ +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + struct insn insn; + int len, ret; + u8 bytes[16]; + u8 op1, op2; + + ret = insn_decode_kernel(&insn, addr); + if (WARN_ON_ONCE(ret < 0)) + continue; + + op1 = insn.opcode.bytes[0]; + op2 = insn.opcode.bytes[1]; + + switch (op1) { + case CALL_INSN_OPCODE: + case JMP32_INSN_OPCODE: + break; + + case 0x0f: /* escape */ + if (op2 >= 0x80 && op2 <= 0x8f) + break; + fallthrough; + default: + WARN_ON_ONCE(1); + continue; + } + + DPRINTK("retpoline at: %pS (%px) len: %d to: %pS", + addr, addr, insn.length, + addr + insn.length + insn.immediate.value); + + len = patch_retpoline(addr, &insn, bytes); + if (len == insn.length) { + optimize_nops(bytes, len); + DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); + text_poke_early(addr, bytes, len); + } + } +} + +#else /* !RETPOLINES || !CONFIG_STACK_VALIDATION */ + +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } + +#endif /* CONFIG_RETPOLINE && CONFIG_STACK_VALIDATION */ + #ifdef CONFIG_SMP static void alternatives_smp_lock(const s32 *start, const s32 *end, u8 *text, u8 *text_end) @@ -643,6 +820,12 @@ void __init alternative_instructions(void) apply_paravirt(__parainstructions, __parainstructions_end); /* + * Rewrite the retpolines, must be done before alternatives since + * those can rewrite the retpoline thunks. + */ + apply_retpolines(__retpoline_sites, __retpoline_sites_end); + + /* * Then patch alternatives, such that those paravirt calls that are in * alternatives can be overwritten by their immediate fragments. */ diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index f4da9bb69a88..e696e22d0531 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -15,9 +15,15 @@ struct cluster_mask { struct cpumask mask; }; -static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); +/* + * __x2apic_send_IPI_mask() possibly needs to read + * x86_cpu_to_logical_apicid for all online cpus in a sequential way. + * Using per cpu variable would cost one cache line per cpu. + */ +static u32 *x86_cpu_to_logical_apicid __read_mostly; + static DEFINE_PER_CPU(cpumask_var_t, ipi_mask); -static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks); +static DEFINE_PER_CPU_READ_MOSTLY(struct cluster_mask *, cluster_masks); static struct cluster_mask *cluster_hotplug_mask; static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) @@ -27,7 +33,7 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) static void x2apic_send_IPI(int cpu, int vector) { - u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu); + u32 dest = x86_cpu_to_logical_apicid[cpu]; /* x2apic MSRs are special and need a special fence: */ weak_wrmsr_fence(); @@ -58,7 +64,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) dest = 0; for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask) - dest |= per_cpu(x86_cpu_to_logical_apicid, clustercpu); + dest |= x86_cpu_to_logical_apicid[clustercpu]; if (!dest) continue; @@ -94,7 +100,7 @@ static void x2apic_send_IPI_all(int vector) static u32 x2apic_calc_apicid(unsigned int cpu) { - return per_cpu(x86_cpu_to_logical_apicid, cpu); + return x86_cpu_to_logical_apicid[cpu]; } static void init_x2apic_ldr(void) @@ -103,7 +109,7 @@ static void init_x2apic_ldr(void) u32 cluster, apicid = apic_read(APIC_LDR); unsigned int cpu; - this_cpu_write(x86_cpu_to_logical_apicid, apicid); + x86_cpu_to_logical_apicid[smp_processor_id()] = apicid; if (cmsk) goto update; @@ -166,12 +172,21 @@ static int x2apic_dead_cpu(unsigned int dead_cpu) static int x2apic_cluster_probe(void) { + u32 slots; + if (!x2apic_mode) return 0; + slots = max_t(u32, L1_CACHE_BYTES/sizeof(u32), nr_cpu_ids); + x86_cpu_to_logical_apicid = kcalloc(slots, sizeof(u32), GFP_KERNEL); + if (!x86_cpu_to_logical_apicid) + return 0; + if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare", x2apic_prepare_cpu, x2apic_dead_cpu) < 0) { pr_err("Failed to register X2APIC_PREPARE\n"); + kfree(x86_cpu_to_logical_apicid); + x86_cpu_to_logical_apicid = NULL; return 0; } init_x2apic_ldr(); diff --git a/arch/x86/kernel/cc_platform.c b/arch/x86/kernel/cc_platform.c new file mode 100644 index 000000000000..03bb2f343ddb --- /dev/null +++ b/arch/x86/kernel/cc_platform.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Confidential Computing Platform Capability checks + * + * Copyright (C) 2021 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + */ + +#include <linux/export.h> +#include <linux/cc_platform.h> +#include <linux/mem_encrypt.h> + +#include <asm/processor.h> + +static bool __maybe_unused intel_cc_platform_has(enum cc_attr attr) +{ +#ifdef CONFIG_INTEL_TDX_GUEST + return false; +#else + return false; +#endif +} + +/* + * SME and SEV are very similar but they are not the same, so there are + * times that the kernel will need to distinguish between SME and SEV. The + * cc_platform_has() function is used for this. When a distinction isn't + * needed, the CC_ATTR_MEM_ENCRYPT attribute can be used. + * + * The trampoline code is a good example for this requirement. Before + * paging is activated, SME will access all memory as decrypted, but SEV + * will access all memory as encrypted. So, when APs are being brought + * up under SME the trampoline area cannot be encrypted, whereas under SEV + * the trampoline area must be encrypted. + */ +static bool amd_cc_platform_has(enum cc_attr attr) +{ +#ifdef CONFIG_AMD_MEM_ENCRYPT + switch (attr) { + case CC_ATTR_MEM_ENCRYPT: + return sme_me_mask; + + case CC_ATTR_HOST_MEM_ENCRYPT: + return sme_me_mask && !(sev_status & MSR_AMD64_SEV_ENABLED); + + case CC_ATTR_GUEST_MEM_ENCRYPT: + return sev_status & MSR_AMD64_SEV_ENABLED; + + case CC_ATTR_GUEST_STATE_ENCRYPT: + return sev_status & MSR_AMD64_SEV_ES_ENABLED; + + default: + return false; + } +#else + return false; +#endif +} + + +bool cc_platform_has(enum cc_attr attr) +{ + if (sme_me_mask) + return amd_cc_platform_has(attr); + + return false; +} +EXPORT_SYMBOL_GPL(cc_platform_has); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ecfca3bbcd96..31e2412b3f17 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -22,7 +22,7 @@ #include <asm/bugs.h> #include <asm/processor.h> #include <asm/processor-flags.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/msr.h> #include <asm/vmx.h> #include <asm/paravirt.h> @@ -882,13 +882,6 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return SPECTRE_V2_CMD_AUTO; } - if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD && - boot_cpu_data.x86_vendor != X86_VENDOR_HYGON && - boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { - pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); - return SPECTRE_V2_CMD_AUTO; - } - spec_v2_print_cond(mitigation_options[i].option, mitigation_options[i].secure); return cmd; diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index b5e36bd0425b..fe98a1465be6 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -846,6 +846,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) l2 = new_l2; #ifdef CONFIG_SMP per_cpu(cpu_llc_id, cpu) = l2_id; + per_cpu(cpu_l2c_id, cpu) = l2_id; #endif } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0f8885949e8c..8ed18d00c211 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -42,7 +42,7 @@ #include <asm/setup.h> #include <asm/apic.h> #include <asm/desc.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/mtrr.h> #include <asm/hwcap2.h> #include <linux/numa.h> @@ -85,6 +85,9 @@ u16 get_llc_id(unsigned int cpu) } EXPORT_SYMBOL_GPL(get_llc_id); +/* L2 cache ID of each logical CPU */ +DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id) = BAD_APICID; + /* correctly size the local cpu masks */ void __init setup_cpu_local_masks(void) { @@ -326,6 +329,7 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_SMAP cr4_set_bits(X86_CR4_SMAP); #else + clear_cpu_cap(c, X86_FEATURE_SMAP); cr4_clear_bits(X86_CR4_SMAP); #endif } diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index defda61f372d..cb2fdd130aae 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -75,6 +75,8 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_SGX_LC, X86_FEATURE_SGX }, { X86_FEATURE_SGX1, X86_FEATURE_SGX }, { X86_FEATURE_SGX2, X86_FEATURE_SGX1 }, + { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, + { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, {} }; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 08831acc1d03..27cacf504663 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -526,7 +526,7 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, /* Fall back to method we used for older processors: */ switch (block) { case 0: - addr = msr_ops.misc(bank); + addr = mca_msr_reg(bank, MCA_MISC); break; case 1: offset = ((low & MASK_BLKPTR_LO) >> 21); @@ -978,8 +978,8 @@ static void log_error_deferred(unsigned int bank) { bool defrd; - defrd = _log_error_bank(bank, msr_ops.status(bank), - msr_ops.addr(bank), 0); + defrd = _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), + mca_msr_reg(bank, MCA_ADDR), 0); if (!mce_flags.smca) return; @@ -1009,7 +1009,7 @@ static void amd_deferred_error_interrupt(void) static void log_error_thresholding(unsigned int bank, u64 misc) { - _log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc); + _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), mca_msr_reg(bank, MCA_ADDR), misc); } static void log_and_reset_block(struct threshold_block *block) @@ -1397,7 +1397,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, } } - err = allocate_threshold_blocks(cpu, b, bank, 0, msr_ops.misc(bank)); + err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC)); if (err) goto out_kobj; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 8cb7816d03b4..6ed365337a3b 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -121,8 +121,6 @@ mce_banks_t mce_banks_ce_disabled; static struct work_struct mce_work; static struct irq_work mce_irq_work; -static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); - /* * CPU/chipset specific EDAC code can register a notifier call here to print * MCE errors in a human-readable form. @@ -176,53 +174,27 @@ void mce_unregister_decode_chain(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); -static inline u32 ctl_reg(int bank) -{ - return MSR_IA32_MCx_CTL(bank); -} - -static inline u32 status_reg(int bank) -{ - return MSR_IA32_MCx_STATUS(bank); -} - -static inline u32 addr_reg(int bank) -{ - return MSR_IA32_MCx_ADDR(bank); -} - -static inline u32 misc_reg(int bank) +u32 mca_msr_reg(int bank, enum mca_msr reg) { - return MSR_IA32_MCx_MISC(bank); -} - -static inline u32 smca_ctl_reg(int bank) -{ - return MSR_AMD64_SMCA_MCx_CTL(bank); -} - -static inline u32 smca_status_reg(int bank) -{ - return MSR_AMD64_SMCA_MCx_STATUS(bank); -} + if (mce_flags.smca) { + switch (reg) { + case MCA_CTL: return MSR_AMD64_SMCA_MCx_CTL(bank); + case MCA_ADDR: return MSR_AMD64_SMCA_MCx_ADDR(bank); + case MCA_MISC: return MSR_AMD64_SMCA_MCx_MISC(bank); + case MCA_STATUS: return MSR_AMD64_SMCA_MCx_STATUS(bank); + } + } -static inline u32 smca_addr_reg(int bank) -{ - return MSR_AMD64_SMCA_MCx_ADDR(bank); -} + switch (reg) { + case MCA_CTL: return MSR_IA32_MCx_CTL(bank); + case MCA_ADDR: return MSR_IA32_MCx_ADDR(bank); + case MCA_MISC: return MSR_IA32_MCx_MISC(bank); + case MCA_STATUS: return MSR_IA32_MCx_STATUS(bank); + } -static inline u32 smca_misc_reg(int bank) -{ - return MSR_AMD64_SMCA_MCx_MISC(bank); + return 0; } -struct mca_msr_regs msr_ops = { - .ctl = ctl_reg, - .status = status_reg, - .addr = addr_reg, - .misc = misc_reg -}; - static void __print_mce(struct mce *m) { pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n", @@ -362,24 +334,27 @@ static int msr_to_offset(u32 msr) if (msr == mca_cfg.rip_msr) return offsetof(struct mce, ip); - if (msr == msr_ops.status(bank)) + if (msr == mca_msr_reg(bank, MCA_STATUS)) return offsetof(struct mce, status); - if (msr == msr_ops.addr(bank)) + if (msr == mca_msr_reg(bank, MCA_ADDR)) return offsetof(struct mce, addr); - if (msr == msr_ops.misc(bank)) + if (msr == mca_msr_reg(bank, MCA_MISC)) return offsetof(struct mce, misc); if (msr == MSR_IA32_MCG_STATUS) return offsetof(struct mce, mcgstatus); return -1; } -__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) { - pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", - (unsigned int)regs->cx, regs->ip, (void *)regs->ip); + if (wrmsr) { + pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, + regs->ip, (void *)regs->ip); + } else { + pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, regs->ip, (void *)regs->ip); + } show_stack_regs(regs); @@ -387,8 +362,6 @@ __visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, while (true) cpu_relax(); - - return true; } /* MSR access wrappers used for error injection */ @@ -420,32 +393,13 @@ static noinstr u64 mce_rdmsrl(u32 msr) */ asm volatile("1: rdmsr\n" "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_fault) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR_IN_MCE) : EAX_EDX_RET(val, low, high) : "c" (msr)); return EAX_EDX_VAL(val, low, high); } -__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) -{ - pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", - (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, - regs->ip, (void *)regs->ip); - - show_stack_regs(regs); - - panic("MCA architectural violation!\n"); - - while (true) - cpu_relax(); - - return true; -} - static noinstr void mce_wrmsrl(u32 msr, u64 v) { u32 low, high; @@ -470,7 +424,7 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) /* See comment in mce_rdmsrl() */ asm volatile("1: wrmsr\n" "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_fault) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE) : : "c" (msr), "a"(low), "d" (high) : "memory"); } @@ -685,10 +639,10 @@ static struct notifier_block mce_default_nb = { static void mce_read_aux(struct mce *m, int i) { if (m->status & MCI_STATUS_MISCV) - m->misc = mce_rdmsrl(msr_ops.misc(i)); + m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC)); if (m->status & MCI_STATUS_ADDRV) { - m->addr = mce_rdmsrl(msr_ops.addr(i)); + m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR)); /* * Mask the reported address by the reported granularity. @@ -758,7 +712,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.bank = i; barrier(); - m.status = mce_rdmsrl(msr_ops.status(i)); + m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); /* If this entry is not valid, ignore it */ if (!(m.status & MCI_STATUS_VAL)) @@ -826,7 +780,7 @@ clear_it: /* * Clear state for this bank. */ - mce_wrmsrl(msr_ops.status(i), 0); + mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); } /* @@ -841,6 +795,34 @@ clear_it: EXPORT_SYMBOL_GPL(machine_check_poll); /* + * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and + * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM + * Vol 3B Table 15-20). But this confuses both the code that determines + * whether the machine check occurred in kernel or user mode, and also + * the severity assessment code. Pretend that EIPV was set, and take the + * ip/cs values from the pt_regs that mce_gather_info() ignored earlier. + */ +static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) +{ + if (bank != 0) + return; + if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0) + return; + if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC| + MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV| + MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR| + MCACOD)) != + (MCI_STATUS_UC|MCI_STATUS_EN| + MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S| + MCI_STATUS_AR|MCACOD_INSTR)) + return; + + m->mcgstatus |= MCG_STATUS_EIPV; + m->ip = regs->ip; + m->cs = regs->cs; +} + +/* * Do a quick check if any of the events requires a panic. * This decides if we keep the events around or clear them. */ @@ -851,13 +833,13 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, int i; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { - m->status = mce_rdmsrl(msr_ops.status(i)); + m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); if (!(m->status & MCI_STATUS_VAL)) continue; __set_bit(i, validp); - if (quirk_no_way_out) - quirk_no_way_out(i, m, regs); + if (mce_flags.snb_ifu_quirk) + quirk_sandybridge_ifu(i, m, regs); m->bank = i; if (mce_severity(m, regs, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { @@ -1144,7 +1126,7 @@ static void mce_clear_state(unsigned long *toclear) for (i = 0; i < this_cpu_read(mce_num_banks); i++) { if (test_bit(i, toclear)) - mce_wrmsrl(msr_ops.status(i), 0); + mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); } } @@ -1203,7 +1185,7 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin m->addr = 0; m->bank = i; - m->status = mce_rdmsrl(msr_ops.status(i)); + m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); if (!(m->status & MCI_STATUS_VAL)) continue; @@ -1253,6 +1235,9 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin static void kill_me_now(struct callback_head *ch) { + struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me); + + p->mce_count = 0; force_sig(SIGBUS); } @@ -1262,13 +1247,14 @@ static void kill_me_maybe(struct callback_head *cb) int flags = MF_ACTION_REQUIRED; int ret; + p->mce_count = 0; pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); if (!p->mce_ripv) flags |= MF_MUST_KILL; ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); - if (!ret && !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { + if (!ret) { set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); sync_core(); return; @@ -1282,29 +1268,57 @@ static void kill_me_maybe(struct callback_head *cb) if (ret == -EHWPOISON) return; - if (p->mce_vaddr != (void __user *)-1l) { - force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT); - } else { - pr_err("Memory error not recovered"); - kill_me_now(cb); - } + pr_err("Memory error not recovered"); + kill_me_now(cb); } -static void queue_task_work(struct mce *m, int kill_current_task) +static void kill_me_never(struct callback_head *cb) { - current->mce_addr = m->addr; - current->mce_kflags = m->kflags; - current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); - current->mce_whole_page = whole_page(m); + struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); - if (kill_current_task) - current->mce_kill_me.func = kill_me_now; - else - current->mce_kill_me.func = kill_me_maybe; + p->mce_count = 0; + pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr); + if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0)) + set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); +} + +static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) +{ + int count = ++current->mce_count; + + /* First call, save all the details */ + if (count == 1) { + current->mce_addr = m->addr; + current->mce_kflags = m->kflags; + current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); + current->mce_whole_page = whole_page(m); + current->mce_kill_me.func = func; + } + + /* Ten is likely overkill. Don't expect more than two faults before task_work() */ + if (count > 10) + mce_panic("Too many consecutive machine checks while accessing user data", m, msg); + + /* Second or later call, make sure page address matches the one from first call */ + if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT)) + mce_panic("Consecutive machine checks to different user pages", m, msg); + + /* Do not call task_work_add() more than once */ + if (count > 1) + return; task_work_add(current, ¤t->mce_kill_me, TWA_RESUME); } +/* Handle unconfigured int18 (should never happen) */ +static noinstr void unexpected_machine_check(struct pt_regs *regs) +{ + instrumentation_begin(); + pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", + smp_processor_id()); + instrumentation_end(); +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -1325,36 +1339,43 @@ static void queue_task_work(struct mce *m, int kill_current_task) */ noinstr void do_machine_check(struct pt_regs *regs) { + int worst = 0, order, no_way_out, kill_current_task, lmce; DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); DECLARE_BITMAP(toclear, MAX_NR_BANKS); struct mca_config *cfg = &mca_cfg; struct mce m, *final; char *msg = NULL; - int worst = 0; + + if (unlikely(mce_flags.p5)) + return pentium_machine_check(regs); + else if (unlikely(mce_flags.winchip)) + return winchip_machine_check(regs); + else if (unlikely(!mca_cfg.initialized)) + return unexpected_machine_check(regs); /* * Establish sequential order between the CPUs entering the machine * check handler. */ - int order = -1; + order = -1; /* * If no_way_out gets set, there is no safe way to recover from this * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. */ - int no_way_out = 0; + no_way_out = 0; /* * If kill_current_task is not set, there might be a way to recover from this * error. */ - int kill_current_task = 0; + kill_current_task = 0; /* * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES * on Intel. */ - int lmce = 1; + lmce = 1; this_cpu_inc(mce_exception_count); @@ -1438,7 +1459,10 @@ noinstr void do_machine_check(struct pt_regs *regs) /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - queue_task_work(&m, kill_current_task); + if (kill_current_task) + queue_task_work(&m, msg, kill_me_now); + else + queue_task_work(&m, msg, kill_me_maybe); } else { /* @@ -1456,7 +1480,7 @@ noinstr void do_machine_check(struct pt_regs *regs) } if (m.kflags & MCE_IN_KERNEL_COPYIN) - queue_task_work(&m, kill_current_task); + queue_task_work(&m, msg, kill_me_never); } out: mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); @@ -1666,8 +1690,8 @@ static void __mcheck_cpu_init_clear_banks(void) if (!b->init) continue; - wrmsrl(msr_ops.ctl(i), b->ctl); - wrmsrl(msr_ops.status(i), 0); + wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); } } @@ -1693,39 +1717,11 @@ static void __mcheck_cpu_check_banks(void) if (!b->init) continue; - rdmsrl(msr_ops.ctl(i), msrval); + rdmsrl(mca_msr_reg(i, MCA_CTL), msrval); b->init = !!msrval; } } -/* - * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and - * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM - * Vol 3B Table 15-20). But this confuses both the code that determines - * whether the machine check occurred in kernel or user mode, and also - * the severity assessment code. Pretend that EIPV was set, and take the - * ip/cs values from the pt_regs that mce_gather_info() ignored earlier. - */ -static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) -{ - if (bank != 0) - return; - if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0) - return; - if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC| - MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV| - MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR| - MCACOD)) != - (MCI_STATUS_UC|MCI_STATUS_EN| - MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S| - MCI_STATUS_AR|MCACOD_INSTR)) - return; - - m->mcgstatus |= MCG_STATUS_EIPV; - m->ip = regs->ip; - m->cs = regs->cs; -} - /* Add per CPU specific workarounds here */ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { @@ -1799,7 +1795,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) cfg->bootlog = 0; if (c->x86 == 6 && c->x86_model == 45) - quirk_no_way_out = quirk_sandybridge_ifu; + mce_flags.snb_ifu_quirk = 1; } if (c->x86_vendor == X86_VENDOR_ZHAOXIN) { @@ -1829,9 +1825,11 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_INTEL: intel_p5_mcheck_init(c); + mce_flags.p5 = 1; return 1; case X86_VENDOR_CENTAUR: winchip_mcheck_init(c); + mce_flags.winchip = 1; return 1; default: return 0; @@ -1850,13 +1848,6 @@ static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); mce_flags.amd_threshold = 1; - - if (mce_flags.smca) { - msr_ops.ctl = smca_ctl_reg; - msr_ops.status = smca_status_reg; - msr_ops.addr = smca_addr_reg; - msr_ops.misc = smca_misc_reg; - } } } @@ -1986,18 +1977,6 @@ bool filter_mce(struct mce *m) return false; } -/* Handle unconfigured int18 (should never happen) */ -static noinstr void unexpected_machine_check(struct pt_regs *regs) -{ - instrumentation_begin(); - pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", - smp_processor_id()); - instrumentation_end(); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check; - static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) { irqentry_state_t irq_state; @@ -2008,31 +1987,22 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) * Only required when from kernel mode. See * mce_check_crashing_cpu() for details. */ - if (machine_check_vector == do_machine_check && - mce_check_crashing_cpu()) + if (mca_cfg.initialized && mce_check_crashing_cpu()) return; irq_state = irqentry_nmi_enter(regs); - /* - * The call targets are marked noinstr, but objtool can't figure - * that out because it's an indirect call. Annotate it. - */ - instrumentation_begin(); - machine_check_vector(regs); + do_machine_check(regs); - instrumentation_end(); irqentry_nmi_exit(regs, irq_state); } static __always_inline void exc_machine_check_user(struct pt_regs *regs) { irqentry_enter_from_user_mode(regs); - instrumentation_begin(); - machine_check_vector(regs); + do_machine_check(regs); - instrumentation_end(); irqentry_exit_to_user_mode(regs); } @@ -2099,7 +2069,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) return; } - machine_check_vector = do_machine_check; + mca_cfg.initialized = 1; __mcheck_cpu_init_early(c); __mcheck_cpu_init_generic(); @@ -2207,7 +2177,6 @@ int __init mcheck_init(void) mce_register_decode_chain(&early_nb); mce_register_decode_chain(&mce_uc_nb); mce_register_decode_chain(&mce_default_nb); - mcheck_vendor_init_severity(); INIT_WORK(&mce_work, mce_gen_pool_process); init_irq_work(&mce_irq_work, mce_irq_work_cb); @@ -2232,7 +2201,7 @@ static void mce_disable_error_reporting(void) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(msr_ops.ctl(i), 0); + wrmsrl(mca_msr_reg(i, MCA_CTL), 0); } return; } @@ -2584,7 +2553,7 @@ static void mce_reenable_cpu(void) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(msr_ops.ctl(i), b->ctl); + wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); } } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 88dcc79cfb07..acd61c41846c 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -8,9 +8,6 @@ #include <linux/device.h> #include <asm/mce.h> -/* Pointer to the installed machine check handler for this CPU setup. */ -extern void (*machine_check_vector)(struct pt_regs *); - enum severity_level { MCE_NO_SEVERITY, MCE_DEFERRED_SEVERITY, @@ -38,8 +35,7 @@ int mce_gen_pool_add(struct mce *mce); int mce_gen_pool_init(void); struct llist_node *mce_gen_pool_prepare_records(void); -extern int (*mce_severity)(struct mce *a, struct pt_regs *regs, - int tolerant, char **msg, bool is_excp); +int mce_severity(struct mce *a, struct pt_regs *regs, int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); extern mce_banks_t mce_banks_ce_disabled; @@ -61,7 +57,7 @@ static inline void cmci_disable_bank(int bank) { } static inline void intel_init_cmci(void) { } static inline void intel_init_lmce(void) { } static inline void intel_clear_lmce(void) { } -static inline bool intel_filter_mce(struct mce *m) { return false; }; +static inline bool intel_filter_mce(struct mce *m) { return false; } #endif void mce_timer_kick(unsigned long interval); @@ -117,23 +113,25 @@ static inline void mce_unregister_injector_chain(struct notifier_block *nb) { } #endif struct mca_config { - bool dont_log_ce; - bool cmci_disabled; - bool ignore_ce; - bool print_all; - __u64 lmce_disabled : 1, disabled : 1, ser : 1, recovery : 1, bios_cmci_threshold : 1, - __reserved : 59; + /* Proper #MC exception handler is set */ + initialized : 1, + __reserved : 58; + + bool dont_log_ce; + bool cmci_disabled; + bool ignore_ce; + bool print_all; - s8 bootlog; int tolerant; int monarch_timeout; int panic_timeout; u32 rip_msr; + s8 bootlog; }; extern struct mca_config mca_cfg; @@ -163,19 +161,28 @@ struct mce_vendor_flags { /* AMD-style error thresholding banks present. */ amd_threshold : 1, - __reserved_0 : 60; + /* Pentium, family 5-style MCA */ + p5 : 1, + + /* Centaur Winchip C6-style MCA */ + winchip : 1, + + /* SandyBridge IFU quirk */ + snb_ifu_quirk : 1, + + __reserved_0 : 57; }; extern struct mce_vendor_flags mce_flags; -struct mca_msr_regs { - u32 (*ctl) (int bank); - u32 (*status) (int bank); - u32 (*addr) (int bank); - u32 (*misc) (int bank); +enum mca_msr { + MCA_CTL, + MCA_STATUS, + MCA_ADDR, + MCA_MISC, }; -extern struct mca_msr_regs msr_ops; +u32 mca_msr_reg(int bank, enum mca_msr reg); /* Decide whether to add MCE record to MCE event pool or filter it out. */ extern bool filter_mce(struct mce *m); @@ -183,17 +190,21 @@ extern bool filter_mce(struct mce *m); #ifdef CONFIG_X86_MCE_AMD extern bool amd_filter_mce(struct mce *m); #else -static inline bool amd_filter_mce(struct mce *m) { return false; }; +static inline bool amd_filter_mce(struct mce *m) { return false; } #endif -__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr); - -__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr); +#ifdef CONFIG_X86_ANCIENT_MCE +void intel_p5_mcheck_init(struct cpuinfo_x86 *c); +void winchip_mcheck_init(struct cpuinfo_x86 *c); +noinstr void pentium_machine_check(struct pt_regs *regs); +noinstr void winchip_machine_check(struct pt_regs *regs); +static inline void enable_p5_mce(void) { mce_p5_enabled = 1; } +#else +static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} +static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} +static inline void enable_p5_mce(void) {} +static inline void pentium_machine_check(struct pt_regs *regs) {} +static inline void winchip_machine_check(struct pt_regs *regs) {} +#endif #endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c index 19e90cae8e97..2272ad53fc33 100644 --- a/arch/x86/kernel/cpu/mce/p5.c +++ b/arch/x86/kernel/cpu/mce/p5.c @@ -21,7 +21,7 @@ int mce_p5_enabled __read_mostly; /* Machine check handler for Pentium class Intel CPUs: */ -static noinstr void pentium_machine_check(struct pt_regs *regs) +noinstr void pentium_machine_check(struct pt_regs *regs) { u32 loaddr, hi, lotype; @@ -54,10 +54,6 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_MCE)) return; - machine_check_vector = pentium_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - /* Read registers before enabling: */ rdmsr(MSR_IA32_P5_MC_ADDR, l, h); rdmsr(MSR_IA32_P5_MC_TYPE, l, h); diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 17e631443116..bb019a594a2c 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -265,25 +265,25 @@ static bool is_copy_from_user(struct pt_regs *regs) */ static int error_context(struct mce *m, struct pt_regs *regs) { - enum handler_type t; - if ((m->cs & 3) == 3) return IN_USER; if (!mc_recoverable(m->mcgstatus)) return IN_KERNEL; - t = ex_get_fault_handler_type(m->ip); - if (t == EX_HANDLER_FAULT) { - m->kflags |= MCE_IN_KERNEL_RECOV; - return IN_KERNEL_RECOV; - } - if (t == EX_HANDLER_UACCESS && regs && is_copy_from_user(regs)) { - m->kflags |= MCE_IN_KERNEL_RECOV; + switch (ex_get_fixup_type(m->ip)) { + case EX_TYPE_UACCESS: + case EX_TYPE_COPY: + if (!regs || !is_copy_from_user(regs)) + return IN_KERNEL; m->kflags |= MCE_IN_KERNEL_COPYIN; + fallthrough; + case EX_TYPE_FAULT_MCE_SAFE: + case EX_TYPE_DEFAULT_MCE_SAFE: + m->kflags |= MCE_IN_KERNEL_RECOV; return IN_KERNEL_RECOV; + default: + return IN_KERNEL; } - - return IN_KERNEL; } static int mce_severity_amd_smca(struct mce *m, enum context err_ctx) @@ -407,15 +407,14 @@ static int mce_severity_intel(struct mce *m, struct pt_regs *regs, } } -/* Default to mce_severity_intel */ -int (*mce_severity)(struct mce *m, struct pt_regs *regs, int tolerant, char **msg, bool is_excp) = - mce_severity_intel; - -void __init mcheck_vendor_init_severity(void) +int mce_severity(struct mce *m, struct pt_regs *regs, int tolerant, char **msg, + bool is_excp) { if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) - mce_severity = mce_severity_amd; + return mce_severity_amd(m, regs, tolerant, msg, is_excp); + else + return mce_severity_intel(m, regs, tolerant, msg, is_excp); } #ifdef CONFIG_DEBUG_FS diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c index 9c9f0abd2d7f..6c99f2941909 100644 --- a/arch/x86/kernel/cpu/mce/winchip.c +++ b/arch/x86/kernel/cpu/mce/winchip.c @@ -17,7 +17,7 @@ #include "internal.h" /* Machine check handler for WinChip C6: */ -static noinstr void winchip_machine_check(struct pt_regs *regs) +noinstr void winchip_machine_check(struct pt_regs *regs) { instrumentation_begin(); pr_emerg("CPU0: Machine Check Exception.\n"); @@ -30,10 +30,6 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c) { u32 lo, hi; - machine_check_vector = winchip_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - rdmsr(MSR_IDT_FCR1, lo, hi); lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */ lo &= ~(1<<4); /* Enable MCE */ diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 4b8813bafffd..bb1c3f5f60c8 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -527,12 +527,14 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) rdt_domain_reconfigure_cdp(r); if (r->alloc_capable && domain_setup_ctrlval(r, d)) { - kfree(d); + kfree(hw_dom); return; } if (r->mon_capable && domain_setup_mon_state(r, d)) { - kfree(d); + kfree(hw_dom->ctrl_val); + kfree(hw_dom->mbps_val); + kfree(hw_dom); return; } diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 045e82e8945b..a7f617a3981d 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -10,6 +10,7 @@ #include <linux/crash_dump.h> #include <linux/uaccess.h> #include <linux/io.h> +#include <linux/cc_platform.h> static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, unsigned long offset, int userbuf, @@ -73,5 +74,6 @@ ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0, sev_active()); + return read_from_oldmem(buf, count, ppos, 0, + cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)); } diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 38837dad46e6..391a4e2b8604 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -714,12 +714,6 @@ static struct chipset early_qrk[] __initdata = { */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, - { PCI_VENDOR_ID_INTEL, 0x3e20, - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, - { PCI_VENDOR_ID_INTEL, 0x3ec4, - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, - { PCI_VENDOR_ID_INTEL, 0x8a12, - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, { PCI_VENDOR_ID_BROADCOM, 0x4331, PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, {} diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c index 2954fab15e51..794e70151203 100644 --- a/arch/x86/kernel/fpu/bugs.c +++ b/arch/x86/kernel/fpu/bugs.c @@ -2,7 +2,7 @@ /* * x86 FPU bug checks: */ -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> /* * Boot time CPU/FPU FDIV bug detection code: diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h new file mode 100644 index 000000000000..958accf2ccf0 --- /dev/null +++ b/arch/x86/kernel/fpu/context.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_CONTEXT_H +#define __X86_KERNEL_FPU_CONTEXT_H + +#include <asm/fpu/xstate.h> +#include <asm/trace/fpu.h> + +/* Functions related to FPU context tracking */ + +/* + * The in-register FPU state for an FPU context on a CPU is assumed to be + * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx + * matches the FPU. + * + * If the FPU register state is valid, the kernel can skip restoring the + * FPU state from memory. + * + * Any code that clobbers the FPU registers or updates the in-memory + * FPU state for a task MUST let the rest of the kernel know that the + * FPU registers are no longer valid for this task. + * + * Either one of these invalidation functions is enough. Invalidate + * a resource you control: CPU if using the CPU for something else + * (with preemption disabled), FPU for the current task, or a task that + * is prevented from running by the current task. + */ +static inline void __cpu_invalidate_fpregs_state(void) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); +} + +static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) +{ + fpu->last_cpu = -1; +} + +static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) +{ + return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; +} + +static inline void fpregs_deactivate(struct fpu *fpu) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); + trace_x86_fpu_regs_deactivated(fpu); +} + +static inline void fpregs_activate(struct fpu *fpu) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, fpu); + trace_x86_fpu_regs_activated(fpu); +} + +/* Internal helper for switch_fpu_return() and signal frame setup */ +static inline void fpregs_restore_userregs(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + int cpu = smp_processor_id(); + + if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) + return; + + if (!fpregs_state_valid(fpu, cpu)) { + /* + * This restores _all_ xstate which has not been + * established yet. + * + * If PKRU is enabled, then the PKRU value is already + * correct because it was either set in switch_to() or in + * flush_thread(). So it is excluded because it might be + * not up to date in current->thread.fpu.xsave state. + * + * XFD state is handled in restore_fpregs_from_fpstate(). + */ + restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE); + + fpregs_activate(fpu); + fpu->last_cpu = cpu; + } + clear_thread_flag(TIF_NEED_FPU_LOAD); +} + +#endif diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 7ada7bd03a32..8ea306b1bf8e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -6,8 +6,9 @@ * General FPU state handling cleanups * Gareth Hughes <gareth@valinux.com>, May 2000 */ -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/fpu/regset.h> +#include <asm/fpu/sched.h> #include <asm/fpu/signal.h> #include <asm/fpu/types.h> #include <asm/traps.h> @@ -15,15 +16,30 @@ #include <linux/hardirq.h> #include <linux/pkeys.h> +#include <linux/vmalloc.h> + +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include "xstate.h" #define CREATE_TRACE_POINTS #include <asm/trace/fpu.h> +#ifdef CONFIG_X86_64 +DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); +DEFINE_PER_CPU(u64, xfd_state); +#endif + +/* The FPU state configuration data for kernel and user space */ +struct fpu_state_config fpu_kernel_cfg __ro_after_init; +struct fpu_state_config fpu_user_cfg __ro_after_init; + /* * Represents the initial FPU state. It's mostly (but not completely) zeroes, * depending on the FPU hardware format: */ -union fpregs_state init_fpstate __ro_after_init; +struct fpstate init_fpstate __ro_after_init; /* * Track whether the kernel is using the FPU state @@ -83,7 +99,7 @@ bool irq_fpu_usable(void) EXPORT_SYMBOL(irq_fpu_usable); /* - * Save the FPU register state in fpu->state. The register state is + * Save the FPU register state in fpu->fpstate->regs. The register state is * preserved. * * Must be called with fpregs_lock() held. @@ -99,19 +115,19 @@ EXPORT_SYMBOL(irq_fpu_usable); void save_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { - os_xsave(&fpu->state.xsave); + os_xsave(fpu->fpstate); /* * AVX512 state is tracked here because its use is * known to slow the max clock speed of the core. */ - if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512) + if (fpu->fpstate->regs.xsave.header.xfeatures & XFEATURE_MASK_AVX512) fpu->avx512_timestamp = jiffies; return; } if (likely(use_fxsr())) { - fxsave(&fpu->state.fxsave); + fxsave(&fpu->fpstate->regs.fxsave); return; } @@ -119,12 +135,11 @@ void save_fpregs_to_fpstate(struct fpu *fpu) * Legacy FPU register saving, FNSAVE always clears FPU registers, * so we have to reload them from the memory state. */ - asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); - frstor(&fpu->state.fsave); + asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave)); + frstor(&fpu->fpstate->regs.fsave); } -EXPORT_SYMBOL(save_fpregs_to_fpstate); -void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) +void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) { /* * AMD K7/K8 and later CPUs up to Zen don't save/restore @@ -141,15 +156,181 @@ void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) } if (use_xsave()) { - os_xrstor(&fpstate->xsave, mask); + /* + * Dynamically enabled features are enabled in XCR0, but + * usage requires also that the corresponding bits in XFD + * are cleared. If the bits are set then using a related + * instruction will raise #NM. This allows to do the + * allocation of the larger FPU buffer lazy from #NM or if + * the task has no permission to kill it which would happen + * via #UD if the feature is disabled in XCR0. + * + * XFD state is following the same life time rules as + * XSTATE and to restore state correctly XFD has to be + * updated before XRSTORS otherwise the component would + * stay in or go into init state even if the bits are set + * in fpstate::regs::xsave::xfeatures. + */ + xfd_update_state(fpstate); + + /* + * Restoring state always needs to modify all features + * which are in @mask even if the current task cannot use + * extended features. + * + * So fpstate->xfeatures cannot be used here, because then + * a feature for which the task has no permission but was + * used by the previous task would not go into init state. + */ + mask = fpu_kernel_cfg.max_features & mask; + + os_xrstor(fpstate, mask); } else { if (use_fxsr()) - fxrstor(&fpstate->fxsave); + fxrstor(&fpstate->regs.fxsave); else - frstor(&fpstate->fsave); + frstor(&fpstate->regs.fsave); } } -EXPORT_SYMBOL_GPL(__restore_fpregs_from_fpstate); + +void fpu_reset_from_exception_fixup(void) +{ + restore_fpregs_from_fpstate(&init_fpstate, XFEATURE_MASK_FPSTATE); +} + +#if IS_ENABLED(CONFIG_KVM) +static void __fpstate_reset(struct fpstate *fpstate); + +bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) +{ + struct fpstate *fpstate; + unsigned int size; + + size = fpu_user_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64); + fpstate = vzalloc(size); + if (!fpstate) + return false; + + __fpstate_reset(fpstate); + fpstate_init_user(fpstate); + fpstate->is_valloc = true; + fpstate->is_guest = true; + + gfpu->fpstate = fpstate; + return true; +} +EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate); + +void fpu_free_guest_fpstate(struct fpu_guest *gfpu) +{ + struct fpstate *fps = gfpu->fpstate; + + if (!fps) + return; + + if (WARN_ON_ONCE(!fps->is_valloc || !fps->is_guest || fps->in_use)) + return; + + gfpu->fpstate = NULL; + vfree(fps); +} +EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate); + +int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) +{ + struct fpstate *guest_fps = guest_fpu->fpstate; + struct fpu *fpu = ¤t->thread.fpu; + struct fpstate *cur_fps = fpu->fpstate; + + fpregs_lock(); + if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD)) + save_fpregs_to_fpstate(fpu); + + /* Swap fpstate */ + if (enter_guest) { + fpu->__task_fpstate = cur_fps; + fpu->fpstate = guest_fps; + guest_fps->in_use = true; + } else { + guest_fps->in_use = false; + fpu->fpstate = fpu->__task_fpstate; + fpu->__task_fpstate = NULL; + } + + cur_fps = fpu->fpstate; + + if (!cur_fps->is_confidential) { + /* Includes XFD update */ + restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE); + } else { + /* + * XSTATE is restored by firmware from encrypted + * memory. Make sure XFD state is correct while + * running with guest fpstate + */ + xfd_update_state(cur_fps); + } + + fpregs_mark_activate(); + fpregs_unlock(); + return 0; +} +EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); + +void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, + unsigned int size, u32 pkru) +{ + struct fpstate *kstate = gfpu->fpstate; + union fpregs_state *ustate = buf; + struct membuf mb = { .p = buf, .left = size }; + + if (cpu_feature_enabled(X86_FEATURE_XSAVE)) { + __copy_xstate_to_uabi_buf(mb, kstate, pkru, XSTATE_COPY_XSAVE); + } else { + memcpy(&ustate->fxsave, &kstate->regs.fxsave, + sizeof(ustate->fxsave)); + /* Make it restorable on a XSAVE enabled host */ + ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE; + } +} +EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi); + +int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, + u64 xcr0, u32 *vpkru) +{ + struct fpstate *kstate = gfpu->fpstate; + const union fpregs_state *ustate = buf; + struct pkru_state *xpkru; + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) { + if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE) + return -EINVAL; + if (ustate->fxsave.mxcsr & ~mxcsr_feature_mask) + return -EINVAL; + memcpy(&kstate->regs.fxsave, &ustate->fxsave, sizeof(ustate->fxsave)); + return 0; + } + + if (ustate->xsave.header.xfeatures & ~xcr0) + return -EINVAL; + + ret = copy_uabi_from_kernel_to_xstate(kstate, ustate); + if (ret) + return ret; + + /* Retrieve PKRU if not in init state */ + if (kstate->regs.xsave.header.xfeatures & XFEATURE_MASK_PKRU) { + xpkru = get_xsave_addr(&kstate->regs.xsave, XFEATURE_PKRU); + *vpkru = xpkru->pkru; + } + + /* Ensure that XCOMP_BV is set up for XSAVES */ + xstate_init_xcomp_bv(&kstate->regs.xsave, kstate->xfeatures); + return 0; +} +EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); +#endif /* CONFIG_KVM */ void kernel_fpu_begin_mask(unsigned int kfpu_mask) { @@ -203,52 +384,88 @@ void fpu_sync_fpstate(struct fpu *fpu) fpregs_unlock(); } -static inline void fpstate_init_xstate(struct xregs_state *xsave) +static inline unsigned int init_fpstate_copy_size(void) { - /* - * XRSTORS requires these bits set in xcomp_bv, or it will - * trigger #GP: - */ - xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all; + if (!use_xsave()) + return fpu_kernel_cfg.default_size; + + /* XSAVE(S) just needs the legacy and the xstate header part */ + return sizeof(init_fpstate.regs.xsave); } -static inline void fpstate_init_fxstate(struct fxregs_state *fx) +static inline void fpstate_init_fxstate(struct fpstate *fpstate) { - fx->cwd = 0x37f; - fx->mxcsr = MXCSR_DEFAULT; + fpstate->regs.fxsave.cwd = 0x37f; + fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT; } /* * Legacy x87 fpstate state init: */ -static inline void fpstate_init_fstate(struct fregs_state *fp) +static inline void fpstate_init_fstate(struct fpstate *fpstate) { - fp->cwd = 0xffff037fu; - fp->swd = 0xffff0000u; - fp->twd = 0xffffffffu; - fp->fos = 0xffff0000u; + fpstate->regs.fsave.cwd = 0xffff037fu; + fpstate->regs.fsave.swd = 0xffff0000u; + fpstate->regs.fsave.twd = 0xffffffffu; + fpstate->regs.fsave.fos = 0xffff0000u; } -void fpstate_init(union fpregs_state *state) +/* + * Used in two places: + * 1) Early boot to setup init_fpstate for non XSAVE systems + * 2) fpu_init_fpstate_user() which is invoked from KVM + */ +void fpstate_init_user(struct fpstate *fpstate) { - if (!static_cpu_has(X86_FEATURE_FPU)) { - fpstate_init_soft(&state->soft); + if (!cpu_feature_enabled(X86_FEATURE_FPU)) { + fpstate_init_soft(&fpstate->regs.soft); return; } - memset(state, 0, fpu_kernel_xstate_size); + xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures); - if (static_cpu_has(X86_FEATURE_XSAVES)) - fpstate_init_xstate(&state->xsave); - if (static_cpu_has(X86_FEATURE_FXSR)) - fpstate_init_fxstate(&state->fxsave); + if (cpu_feature_enabled(X86_FEATURE_FXSR)) + fpstate_init_fxstate(fpstate); else - fpstate_init_fstate(&state->fsave); + fpstate_init_fstate(fpstate); +} + +static void __fpstate_reset(struct fpstate *fpstate) +{ + /* Initialize sizes and feature masks */ + fpstate->size = fpu_kernel_cfg.default_size; + fpstate->user_size = fpu_user_cfg.default_size; + fpstate->xfeatures = fpu_kernel_cfg.default_features; + fpstate->user_xfeatures = fpu_user_cfg.default_features; + fpstate->xfd = init_fpstate.xfd; +} + +void fpstate_reset(struct fpu *fpu) +{ + /* Set the fpstate pointer to the default fpstate */ + fpu->fpstate = &fpu->__fpstate; + __fpstate_reset(fpu->fpstate); + + /* Initialize the permission related info in fpu */ + fpu->perm.__state_perm = fpu_kernel_cfg.default_features; + fpu->perm.__state_size = fpu_kernel_cfg.default_size; + fpu->perm.__user_state_size = fpu_user_cfg.default_size; +} + +static inline void fpu_inherit_perms(struct fpu *dst_fpu) +{ + if (fpu_state_size_dynamic()) { + struct fpu *src_fpu = ¤t->group_leader->thread.fpu; + + spin_lock_irq(¤t->sighand->siglock); + /* Fork also inherits the permissions of the parent */ + dst_fpu->perm = src_fpu->perm; + spin_unlock_irq(¤t->sighand->siglock); + } } -EXPORT_SYMBOL_GPL(fpstate_init); /* Clone current's FPU state on fork */ -int fpu_clone(struct task_struct *dst) +int fpu_clone(struct task_struct *dst, unsigned long clone_flags) { struct fpu *src_fpu = ¤t->thread.fpu; struct fpu *dst_fpu = &dst->thread.fpu; @@ -256,30 +473,51 @@ int fpu_clone(struct task_struct *dst) /* The new task's FPU state cannot be valid in the hardware. */ dst_fpu->last_cpu = -1; + fpstate_reset(dst_fpu); + if (!cpu_feature_enabled(X86_FEATURE_FPU)) return 0; /* - * Don't let 'init optimized' areas of the XSAVE area - * leak into the child task: + * Enforce reload for user space tasks and prevent kernel threads + * from trying to save the FPU registers on context switch. + */ + set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); + + /* + * No FPU state inheritance for kernel threads and IO + * worker threads. + */ + if (dst->flags & (PF_KTHREAD | PF_IO_WORKER)) { + /* Clear out the minimal state */ + memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs, + init_fpstate_copy_size()); + return 0; + } + + /* + * If a new feature is added, ensure all dynamic features are + * caller-saved from here! */ - memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); + BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA); /* - * If the FPU registers are not owned by current just memcpy() the - * state. Otherwise save the FPU registers directly into the - * child's FPU context, without any memory-to-memory copying. + * Save the default portion of the current FPU state into the + * clone. Assume all dynamic features to be defined as caller- + * saved, which enables skipping both the expansion of fpstate + * and the copying of any dynamic state. + * + * Do not use memcpy() when TIF_NEED_FPU_LOAD is set because + * copying is not valid when current uses non-default states. */ fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) - memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size); - - else - save_fpregs_to_fpstate(dst_fpu); + fpregs_restore_userregs(); + save_fpregs_to_fpstate(dst_fpu); + if (!(clone_flags & CLONE_THREAD)) + fpu_inherit_perms(dst_fpu); fpregs_unlock(); - set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); - trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); @@ -287,6 +525,16 @@ int fpu_clone(struct task_struct *dst) } /* + * Whitelist the FPU register state embedded into task_struct for hardened + * usercopy. + */ +void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) +{ + *offset = offsetof(struct thread_struct, fpu.__fpstate.regs); + *size = fpu_kernel_cfg.default_size; +} + +/* * Drops current FPU state: deactivates the fpregs and * the fpstate. NOTE: it still leaves previous contents * in the fpregs in the eager-FPU case. @@ -319,28 +567,19 @@ void fpu__drop(struct fpu *fpu) static inline void restore_fpregs_from_init_fpstate(u64 features_mask) { if (use_xsave()) - os_xrstor(&init_fpstate.xsave, features_mask); + os_xrstor(&init_fpstate, features_mask); else if (use_fxsr()) - fxrstor(&init_fpstate.fxsave); + fxrstor(&init_fpstate.regs.fxsave); else - frstor(&init_fpstate.fsave); + frstor(&init_fpstate.regs.fsave); pkru_write_default(); } -static inline unsigned int init_fpstate_copy_size(void) -{ - if (!use_xsave()) - return fpu_kernel_xstate_size; - - /* XSAVE(S) just needs the legacy and the xstate header part */ - return sizeof(init_fpstate.xsave); -} - /* * Reset current->fpu memory state to the init values. */ -static void fpu_reset_fpstate(void) +static void fpu_reset_fpregs(void) { struct fpu *fpu = ¤t->thread.fpu; @@ -359,7 +598,7 @@ static void fpu_reset_fpstate(void) * user space as PKRU is eagerly written in switch_to() and * flush_thread(). */ - memcpy(&fpu->state, &init_fpstate, init_fpstate_copy_size()); + memcpy(&fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); set_thread_flag(TIF_NEED_FPU_LOAD); fpregs_unlock(); } @@ -375,7 +614,7 @@ void fpu__clear_user_states(struct fpu *fpu) fpregs_lock(); if (!cpu_feature_enabled(X86_FEATURE_FPU)) { - fpu_reset_fpstate(); + fpu_reset_fpregs(); fpregs_unlock(); return; } @@ -385,12 +624,11 @@ void fpu__clear_user_states(struct fpu *fpu) * corresponding registers. */ if (xfeatures_mask_supervisor() && - !fpregs_state_valid(fpu, smp_processor_id())) { - os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor()); - } + !fpregs_state_valid(fpu, smp_processor_id())) + os_xrstor_supervisor(fpu->fpstate); /* Reset user states in registers. */ - restore_fpregs_from_init_fpstate(xfeatures_mask_restore_user()); + restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE); /* * Now all FPU registers have their desired values. Inform the FPU @@ -405,7 +643,8 @@ void fpu__clear_user_states(struct fpu *fpu) void fpu_flush_thread(void) { - fpu_reset_fpstate(); + fpstate_reset(¤t->thread.fpu); + fpu_reset_fpregs(); } /* * Load FPU context before returning to userspace. @@ -445,7 +684,6 @@ void fpregs_mark_activate(void) fpu->last_cpu = smp_processor_id(); clear_thread_flag(TIF_NEED_FPU_LOAD); } -EXPORT_SYMBOL_GPL(fpregs_mark_activate); /* * x87 math exception handling: @@ -468,11 +706,11 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) * fully reproduce the context of the exception. */ if (boot_cpu_has(X86_FEATURE_FXSR)) { - cwd = fpu->state.fxsave.cwd; - swd = fpu->state.fxsave.swd; + cwd = fpu->fpstate->regs.fxsave.cwd; + swd = fpu->fpstate->regs.fxsave.swd; } else { - cwd = (unsigned short)fpu->state.fsave.cwd; - swd = (unsigned short)fpu->state.fsave.swd; + cwd = (unsigned short)fpu->fpstate->regs.fsave.cwd; + swd = (unsigned short)fpu->fpstate->regs.fsave.swd; } err = swd & ~cwd; @@ -486,7 +724,7 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) unsigned short mxcsr = MXCSR_DEFAULT; if (boot_cpu_has(X86_FEATURE_XMM)) - mxcsr = fpu->state.fxsave.mxcsr; + mxcsr = fpu->fpstate->regs.fxsave.mxcsr; err = ~(mxcsr >> 7) & mxcsr; } diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 64e29927cc32..621f4b6cac4a 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -2,7 +2,7 @@ /* * x86 FPU boot time init code: */ -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/tlbflush.h> #include <asm/setup.h> @@ -10,6 +10,10 @@ #include <linux/sched/task.h> #include <linux/init.h> +#include "internal.h" +#include "legacy.h" +#include "xstate.h" + /* * Initialize the registers found in all CPUs, CR0 and CR4: */ @@ -34,7 +38,7 @@ static void fpu__init_cpu_generic(void) /* Flush out any pending x87 state: */ #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU)) - fpstate_init_soft(¤t->thread.fpu.state.soft); + fpstate_init_soft(¤t->thread.fpu.fpstate->regs.soft); else #endif asm volatile ("fninit"); @@ -121,23 +125,14 @@ static void __init fpu__init_system_mxcsr(void) static void __init fpu__init_system_generic(void) { /* - * Set up the legacy init FPU context. (xstate init might overwrite this - * with a more modern format, if the CPU supports it.) + * Set up the legacy init FPU context. Will be updated when the + * CPU supports XSAVE[S]. */ - fpstate_init(&init_fpstate); + fpstate_init_user(&init_fpstate); fpu__init_system_mxcsr(); } -/* - * Size of the FPU context state. All tasks in the system use the - * same context size, regardless of what portion they use. - * This is inherent to the XSAVE architecture which puts all state - * components into a single, continuous memory block: - */ -unsigned int fpu_kernel_xstate_size __ro_after_init; -EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size); - /* Get alignment of the TYPE. */ #define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test) @@ -162,13 +157,13 @@ static void __init fpu__init_task_struct_size(void) * Subtract off the static size of the register state. * It potentially has a bunch of padding. */ - task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state); + task_size -= sizeof(current->thread.fpu.__fpstate.regs); /* * Add back the dynamically-calculated register state * size. */ - task_size += fpu_kernel_xstate_size; + task_size += fpu_kernel_cfg.default_size; /* * We dynamically size 'struct fpu', so we require that @@ -177,7 +172,7 @@ static void __init fpu__init_task_struct_size(void) * you hit a compile error here, check the structure to * see if something got added to the end. */ - CHECK_MEMBER_AT_END_OF(struct fpu, state); + CHECK_MEMBER_AT_END_OF(struct fpu, __fpstate); CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu); CHECK_MEMBER_AT_END_OF(struct task_struct, thread); @@ -192,37 +187,34 @@ static void __init fpu__init_task_struct_size(void) */ static void __init fpu__init_system_xstate_size_legacy(void) { - static int on_boot_cpu __initdata = 1; - - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; + unsigned int size; /* - * Note that xstate sizes might be overwritten later during - * fpu__init_system_xstate(). + * Note that the size configuration might be overwritten later + * during fpu__init_system_xstate(). */ - - if (!boot_cpu_has(X86_FEATURE_FPU)) { - fpu_kernel_xstate_size = sizeof(struct swregs_state); + if (!cpu_feature_enabled(X86_FEATURE_FPU)) { + size = sizeof(struct swregs_state); + } else if (cpu_feature_enabled(X86_FEATURE_FXSR)) { + size = sizeof(struct fxregs_state); + fpu_user_cfg.legacy_features = XFEATURE_MASK_FPSSE; } else { - if (boot_cpu_has(X86_FEATURE_FXSR)) - fpu_kernel_xstate_size = - sizeof(struct fxregs_state); - else - fpu_kernel_xstate_size = - sizeof(struct fregs_state); + size = sizeof(struct fregs_state); + fpu_user_cfg.legacy_features = XFEATURE_MASK_FP; } - fpu_user_xstate_size = fpu_kernel_xstate_size; + fpu_kernel_cfg.max_size = size; + fpu_kernel_cfg.default_size = size; + fpu_user_cfg.max_size = size; + fpu_user_cfg.default_size = size; + fpstate_reset(¤t->thread.fpu); } -/* Legacy code to initialize eager fpu mode. */ -static void __init fpu__init_system_ctx_switch(void) +static void __init fpu__init_init_fpstate(void) { - static bool on_boot_cpu __initdata = 1; - - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; + /* Bring init_fpstate size and features up to date */ + init_fpstate.size = fpu_kernel_cfg.max_size; + init_fpstate.xfeatures = fpu_kernel_cfg.max_features; } /* @@ -231,6 +223,7 @@ static void __init fpu__init_system_ctx_switch(void) */ void __init fpu__init_system(struct cpuinfo_x86 *c) { + fpstate_reset(¤t->thread.fpu); fpu__init_system_early_generic(c); /* @@ -241,8 +234,7 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) fpu__init_system_generic(); fpu__init_system_xstate_size_legacy(); - fpu__init_system_xstate(); + fpu__init_system_xstate(fpu_kernel_cfg.max_size); fpu__init_task_struct_size(); - - fpu__init_system_ctx_switch(); + fpu__init_init_fpstate(); } diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h new file mode 100644 index 000000000000..dbdb31f55fc7 --- /dev/null +++ b/arch/x86/kernel/fpu/internal.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_INTERNAL_H +#define __X86_KERNEL_FPU_INTERNAL_H + +extern struct fpstate init_fpstate; + +/* CPU feature check wrappers */ +static __always_inline __pure bool use_xsave(void) +{ + return cpu_feature_enabled(X86_FEATURE_XSAVE); +} + +static __always_inline __pure bool use_fxsr(void) +{ + return cpu_feature_enabled(X86_FEATURE_FXSR); +} + +#ifdef CONFIG_X86_DEBUG_FPU +# define WARN_ON_FPU(x) WARN_ON_ONCE(x) +#else +# define WARN_ON_FPU(x) ({ (void)(x); 0; }) +#endif + +/* Used in init.c */ +extern void fpstate_init_user(struct fpstate *fpstate); +extern void fpstate_reset(struct fpu *fpu); + +#endif diff --git a/arch/x86/kernel/fpu/legacy.h b/arch/x86/kernel/fpu/legacy.h new file mode 100644 index 000000000000..17c26b164c63 --- /dev/null +++ b/arch/x86/kernel/fpu/legacy.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_LEGACY_H +#define __X86_KERNEL_FPU_LEGACY_H + +#include <asm/fpu/types.h> + +extern unsigned int mxcsr_feature_mask; + +static inline void ldmxcsr(u32 mxcsr) +{ + asm volatile("ldmxcsr %0" :: "m" (mxcsr)); +} + +/* + * Returns 0 on success or the trap number when the operation raises an + * exception. + */ +#define user_insn(insn, output, input...) \ +({ \ + int err; \ + \ + might_fault(); \ + \ + asm volatile(ASM_STAC "\n" \ + "1: " #insn "\n" \ + "2: " ASM_CLAC "\n" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ + : [err] "=a" (err), output \ + : "0"(0), input); \ + err; \ +}) + +#define kernel_insn_err(insn, output, input...) \ +({ \ + int err; \ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err), output \ + : "0"(0), input); \ + err; \ +}) + +#define kernel_insn(insn, output, input...) \ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FPU_RESTORE) \ + : output : input) + +static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx) +{ + return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); +} + +static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); + else + return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); + +} + +static inline void fxrstor(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int fxrstor_safe(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline void frstor(struct fregs_state *fx) +{ + kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int frstor_safe(struct fregs_state *fx) +{ + return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int frstor_from_user_sigframe(struct fregs_state __user *fx) +{ + return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline void fxsave(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx)); + else + asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); +} + +#endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 66ed317ebc0d..437d7c930c0b 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -5,10 +5,14 @@ #include <linux/sched/task_stack.h> #include <linux/vmalloc.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> -#include <asm/fpu/xstate.h> + +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include "xstate.h" /* * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, @@ -74,8 +78,8 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, sync_fpstate(fpu); if (!use_xsave()) { - return membuf_write(&to, &fpu->state.fxsave, - sizeof(fpu->state.fxsave)); + return membuf_write(&to, &fpu->fpstate->regs.fxsave, + sizeof(fpu->fpstate->regs.fxsave)); } copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_FX); @@ -110,15 +114,15 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, fpu_force_restore(fpu); /* Copy the state */ - memcpy(&fpu->state.fxsave, &newstate, sizeof(newstate)); + memcpy(&fpu->fpstate->regs.fxsave, &newstate, sizeof(newstate)); /* Clear xmm8..15 */ - BUILD_BUG_ON(sizeof(fpu->state.fxsave.xmm_space) != 16 * 16); - memset(&fpu->state.fxsave.xmm_space[8], 0, 8 * 16); + BUILD_BUG_ON(sizeof(fpu->__fpstate.regs.fxsave.xmm_space) != 16 * 16); + memset(&fpu->fpstate->regs.fxsave.xmm_space[8], 0, 8 * 16); /* Mark FP and SSE as in use when XSAVE is enabled */ if (use_xsave()) - fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; + fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; return 0; } @@ -149,7 +153,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, /* * A whole standard-format XSAVE buffer is needed: */ - if (pos != 0 || count != fpu_user_xstate_size) + if (pos != 0 || count != fpu_user_cfg.max_size) return -EFAULT; if (!kbuf) { @@ -164,7 +168,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, } fpu_force_restore(fpu); - ret = copy_uabi_from_kernel_to_xstate(&fpu->state.xsave, kbuf ?: tmpbuf); + ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf); out: vfree(tmpbuf); @@ -283,7 +287,7 @@ static void __convert_from_fxsr(struct user_i387_ia32_struct *env, void convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) { - __convert_from_fxsr(env, tsk, &tsk->thread.fpu.state.fxsave); + __convert_from_fxsr(env, tsk, &tsk->thread.fpu.fpstate->regs.fxsave); } void convert_to_fxsr(struct fxregs_state *fxsave, @@ -326,7 +330,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, return fpregs_soft_get(target, regset, to); if (!cpu_feature_enabled(X86_FEATURE_FXSR)) { - return membuf_write(&to, &fpu->state.fsave, + return membuf_write(&to, &fpu->fpstate->regs.fsave, sizeof(struct fregs_state)); } @@ -337,7 +341,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, copy_xstate_to_uabi_buf(mb, target, XSTATE_COPY_FP); fx = &fxsave; } else { - fx = &fpu->state.fxsave; + fx = &fpu->fpstate->regs.fxsave; } __convert_from_fxsr(&env, target, fx); @@ -366,16 +370,16 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, fpu_force_restore(fpu); if (cpu_feature_enabled(X86_FEATURE_FXSR)) - convert_to_fxsr(&fpu->state.fxsave, &env); + convert_to_fxsr(&fpu->fpstate->regs.fxsave, &env); else - memcpy(&fpu->state.fsave, &env, sizeof(env)); + memcpy(&fpu->fpstate->regs.fsave, &env, sizeof(env)); /* * Update the header bit in the xsave header, indicating the * presence of FP. */ if (cpu_feature_enabled(X86_FEATURE_XSAVE)) - fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP; + fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FP; return 0; } diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 445c57c9c539..cc977da6e128 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -7,23 +7,25 @@ #include <linux/cpu.h> #include <linux/pagemap.h> -#include <asm/fpu/internal.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> #include <asm/fpu/xstate.h> #include <asm/sigframe.h> +#include <asm/trapnr.h> #include <asm/trace/fpu.h> -static struct _fpx_sw_bytes fx_sw_reserved __ro_after_init; -static struct _fpx_sw_bytes fx_sw_reserved_ia32 __ro_after_init; +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include "xstate.h" /* * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. */ -static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, - struct _fpx_sw_bytes *fx_sw) +static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, + struct _fpx_sw_bytes *fx_sw) { int min_xstate_size = sizeof(struct fxregs_state) + sizeof(struct xstate_header); @@ -31,12 +33,12 @@ static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, unsigned int magic2; if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw))) - return -EFAULT; + return false; /* Check for the first magic field and other error scenarios. */ if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || fx_sw->xstate_size < min_xstate_size || - fx_sw->xstate_size > fpu_user_xstate_size || + fx_sw->xstate_size > current->thread.fpu.fpstate->user_size || fx_sw->xstate_size > fx_sw->extended_size) goto setfx; @@ -47,10 +49,10 @@ static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, * in the memory layout. */ if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))) - return -EFAULT; + return false; if (likely(magic2 == FP_XSTATE_MAGIC2)) - return 0; + return true; setfx: trace_x86_fpu_xstate_check_failed(¤t->thread.fpu); @@ -58,22 +60,22 @@ setfx: fx_sw->magic1 = 0; fx_sw->xstate_size = sizeof(struct fxregs_state); fx_sw->xfeatures = XFEATURE_MASK_FPSSE; - return 0; + return true; } /* * Signal frame handlers. */ -static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) +static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf) { if (use_fxsr()) { - struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; + struct xregs_state *xsave = &tsk->thread.fpu.fpstate->regs.xsave; struct user_i387_ia32_struct env; struct _fpstate_32 __user *fp = buf; fpregs_lock(); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) - fxsave(&tsk->thread.fpu.state.fxsave); + fxsave(&tsk->thread.fpu.fpstate->regs.fxsave); fpregs_unlock(); convert_from_fxsr(&env, tsk); @@ -81,33 +83,54 @@ static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) if (__copy_to_user(buf, &env, sizeof(env)) || __put_user(xsave->i387.swd, &fp->status) || __put_user(X86_FXSR_MAGIC, &fp->magic)) - return -1; + return false; } else { struct fregs_state __user *fp = buf; u32 swd; + if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) - return -1; + return false; } - return 0; + return true; +} + +/* + * Prepare the SW reserved portion of the fxsave memory layout, indicating + * the presence of the extended state information in the memory layout + * pointed to by the fpstate pointer in the sigcontext. + * This is saved when ever the FP and extended state context is + * saved on the user stack during the signal handler delivery to the user. + */ +static inline void save_sw_bytes(struct _fpx_sw_bytes *sw_bytes, bool ia32_frame, + struct fpstate *fpstate) +{ + sw_bytes->magic1 = FP_XSTATE_MAGIC1; + sw_bytes->extended_size = fpstate->user_size + FP_XSTATE_MAGIC2_SIZE; + sw_bytes->xfeatures = fpstate->user_xfeatures; + sw_bytes->xstate_size = fpstate->user_size; + + if (ia32_frame) + sw_bytes->extended_size += sizeof(struct fregs_state); } -static inline int save_xstate_epilog(void __user *buf, int ia32_frame) +static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, + struct fpstate *fpstate) { struct xregs_state __user *x = buf; - struct _fpx_sw_bytes *sw_bytes; + struct _fpx_sw_bytes sw_bytes; u32 xfeatures; int err; /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ - sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved; - err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); + save_sw_bytes(&sw_bytes, ia32_frame, fpstate); + err = __copy_to_user(&x->i387.sw_reserved, &sw_bytes, sizeof(sw_bytes)); if (!use_xsave()) - return err; + return !err; err |= __put_user(FP_XSTATE_MAGIC2, - (__u32 __user *)(buf + fpu_user_xstate_size)); + (__u32 __user *)(buf + fpstate->user_size)); /* * Read the xfeatures which we copied (directly from the cpu or @@ -130,23 +153,17 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures); - return err; + return !err; } static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) { - int err; - if (use_xsave()) - err = xsave_to_user_sigframe(buf); - else if (use_fxsr()) - err = fxsave_to_user_sigframe((struct fxregs_state __user *) buf); + return xsave_to_user_sigframe(buf); + if (use_fxsr()) + return fxsave_to_user_sigframe((struct fxregs_state __user *) buf); else - err = fnsave_to_user_sigframe((struct fregs_state __user *) buf); - - if (unlikely(err) && __clear_user(buf, fpu_user_xstate_size)) - err = -EFAULT; - return err; + return fnsave_to_user_sigframe((struct fregs_state __user *) buf); } /* @@ -159,10 +176,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * buf == buf_fx for 64-bit frames and 32-bit fsave frame. * buf != buf_fx for 32-bit frames with fxstate. * - * Try to save it directly to the user frame with disabled page fault handler. - * If this fails then do the slow path where the FPU state is first saved to - * task's fpu->state and then copy it to the user frame pointed to by the - * aligned pointer 'buf_fx'. + * Save it directly to the user frame with disabled page fault handler. If + * that faults, try to clear the frame which handles the page fault. * * If this is a 32-bit frame with fxstate, put a fsave header before * the aligned state at 'buf_fx'. @@ -170,10 +185,11 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * For [f]xsave state, update the SW reserved fields in the [f]xsave frame * indicating the absence/presence of the extended state to the user. */ -int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) +bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { struct task_struct *tsk = current; - int ia32_fxstate = (buf != buf_fx); + struct fpstate *fpstate = tsk->thread.fpu.fpstate; + bool ia32_fxstate = (buf != buf_fx); int ret; ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || @@ -181,13 +197,25 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) if (!static_cpu_has(X86_FEATURE_FPU)) { struct user_i387_ia32_struct fp; + fpregs_soft_get(current, NULL, (struct membuf){.p = &fp, .left = sizeof(fp)}); - return copy_to_user(buf, &fp, sizeof(fp)) ? -EFAULT : 0; + return !copy_to_user(buf, &fp, sizeof(fp)); } if (!access_ok(buf, size)) - return -EACCES; + return false; + + if (use_xsave()) { + struct xregs_state __user *xbuf = buf_fx; + + /* + * Clear the xsave header first, so that reserved fields are + * initialized to zero. + */ + if (__clear_user(&xbuf->header, sizeof(xbuf->header))) + return false; + } retry: /* * Load the FPU registers if they are not valid for the current task. @@ -205,26 +233,26 @@ retry: fpregs_unlock(); if (ret) { - if (!fault_in_pages_writeable(buf_fx, fpu_user_xstate_size)) + if (!__clear_user(buf_fx, fpstate->user_size)) goto retry; - return -EFAULT; + return false; } /* Save the fsave header for the 32-bit frames. */ - if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) - return -1; + if ((ia32_fxstate || !use_fxsr()) && !save_fsave_header(tsk, buf)) + return false; - if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) - return -1; + if (use_fxsr() && !save_xstate_epilog(buf_fx, ia32_fxstate, fpstate)) + return false; - return 0; + return true; } -static int __restore_fpregs_from_user(void __user *buf, u64 xrestore, - bool fx_only) +static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures, + u64 xrestore, bool fx_only) { if (use_xsave()) { - u64 init_bv = xfeatures_mask_uabi() & ~xrestore; + u64 init_bv = ufeatures & ~xrestore; int ret; if (likely(!fx_only)) @@ -233,7 +261,7 @@ static int __restore_fpregs_from_user(void __user *buf, u64 xrestore, ret = fxrstor_from_user_sigframe(buf); if (!ret && unlikely(init_bv)) - os_xrstor(&init_fpstate.xsave, init_bv); + os_xrstor(&init_fpstate, init_bv); return ret; } else if (use_fxsr()) { return fxrstor_from_user_sigframe(buf); @@ -246,16 +274,19 @@ static int __restore_fpregs_from_user(void __user *buf, u64 xrestore, * Attempt to restore the FPU registers directly from user memory. * Pagefaults are handled and any errors returned are fatal. */ -static int restore_fpregs_from_user(void __user *buf, u64 xrestore, - bool fx_only, unsigned int size) +static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, + bool fx_only, unsigned int size) { struct fpu *fpu = ¤t->thread.fpu; int ret; retry: fpregs_lock(); + /* Ensure that XFD is up to date */ + xfd_update_state(fpu->fpstate); pagefault_disable(); - ret = __restore_fpregs_from_user(buf, xrestore, fx_only); + ret = __restore_fpregs_from_user(buf, fpu->fpstate->user_xfeatures, + xrestore, fx_only); pagefault_enable(); if (unlikely(ret)) { @@ -275,13 +306,12 @@ retry: fpregs_unlock(); /* Try to handle #PF, but anything else is fatal. */ - if (ret != -EFAULT) - return -EINVAL; + if (ret != X86_TRAP_PF) + return false; - ret = fault_in_pages_readable(buf, size); - if (!ret) + if (!fault_in_pages_readable(buf, size)) goto retry; - return ret; + return false; } /* @@ -294,45 +324,40 @@ retry: * been restored from a user buffer directly. */ if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor()) - os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor()); + os_xrstor_supervisor(fpu->fpstate); fpregs_mark_activate(); fpregs_unlock(); - return 0; + return true; } -static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, - bool ia32_fxstate) +static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, + bool ia32_fxstate) { - int state_size = fpu_kernel_xstate_size; struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; + bool success, fx_only = false; + union fpregs_state *fpregs; + unsigned int state_size; u64 user_xfeatures = 0; - bool fx_only = false; - int ret; if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; - ret = check_xstate_in_sigframe(buf_fx, &fx_sw_user); - if (unlikely(ret)) - return ret; + if (!check_xstate_in_sigframe(buf_fx, &fx_sw_user)) + return false; fx_only = !fx_sw_user.magic1; state_size = fx_sw_user.xstate_size; user_xfeatures = fx_sw_user.xfeatures; } else { user_xfeatures = XFEATURE_MASK_FPSSE; + state_size = fpu->fpstate->user_size; } if (likely(!ia32_fxstate)) { - /* - * Attempt to restore the FPU registers directly from user - * memory. For that to succeed, the user access cannot cause page - * faults. If it does, fall back to the slow path below, going - * through the kernel buffer with the enabled pagefault handler. - */ + /* Restore the FPU registers directly from user memory. */ return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only, state_size); } @@ -342,9 +367,8 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, * to be ignored for histerical raisins. The legacy state is folded * in once the larger state has been copied. */ - ret = __copy_from_user(&env, buf, sizeof(env)); - if (ret) - return ret; + if (__copy_from_user(&env, buf, sizeof(env))) + return false; /* * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is @@ -363,33 +387,38 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, * the right place in memory. It's ia32 mode. Shrug. */ if (xfeatures_mask_supervisor()) - os_xsave(&fpu->state.xsave); + os_xsave(fpu->fpstate); set_thread_flag(TIF_NEED_FPU_LOAD); } __fpu_invalidate_fpregs_state(fpu); __cpu_invalidate_fpregs_state(); fpregs_unlock(); + fpregs = &fpu->fpstate->regs; if (use_xsave() && !fx_only) { - ret = copy_sigframe_from_user_to_xstate(&fpu->state.xsave, buf_fx); - if (ret) - return ret; + if (copy_sigframe_from_user_to_xstate(fpu->fpstate, buf_fx)) + return false; } else { - if (__copy_from_user(&fpu->state.fxsave, buf_fx, - sizeof(fpu->state.fxsave))) - return -EFAULT; - - /* Reject invalid MXCSR values. */ - if (fpu->state.fxsave.mxcsr & ~mxcsr_feature_mask) - return -EINVAL; + if (__copy_from_user(&fpregs->fxsave, buf_fx, + sizeof(fpregs->fxsave))) + return false; + + if (IS_ENABLED(CONFIG_X86_64)) { + /* Reject invalid MXCSR values. */ + if (fpregs->fxsave.mxcsr & ~mxcsr_feature_mask) + return false; + } else { + /* Mask invalid bits out for historical reasons (broken hardware). */ + fpregs->fxsave.mxcsr &= mxcsr_feature_mask; + } /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */ if (use_xsave()) - fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; + fpregs->xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; } /* Fold the legacy FP storage */ - convert_to_fxsr(&fpu->state.fxsave, &env); + convert_to_fxsr(&fpregs->fxsave, &env); fpregs_lock(); if (use_xsave()) { @@ -404,40 +433,45 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, */ u64 mask = user_xfeatures | xfeatures_mask_supervisor(); - fpu->state.xsave.header.xfeatures &= mask; - ret = os_xrstor_safe(&fpu->state.xsave, xfeatures_mask_all); + fpregs->xsave.header.xfeatures &= mask; + success = !os_xrstor_safe(fpu->fpstate, + fpu_kernel_cfg.max_features); } else { - ret = fxrstor_safe(&fpu->state.fxsave); + success = !fxrstor_safe(&fpregs->fxsave); } - if (likely(!ret)) + if (likely(success)) fpregs_mark_activate(); fpregs_unlock(); - return ret; + return success; } -static inline int xstate_sigframe_size(void) + +static inline unsigned int xstate_sigframe_size(struct fpstate *fpstate) { - return use_xsave() ? fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE : - fpu_user_xstate_size; + unsigned int size = fpstate->user_size; + + return use_xsave() ? size + FP_XSTATE_MAGIC2_SIZE : size; } /* * Restore FPU state from a sigframe: */ -int fpu__restore_sig(void __user *buf, int ia32_frame) +bool fpu__restore_sig(void __user *buf, int ia32_frame) { - unsigned int size = xstate_sigframe_size(); struct fpu *fpu = ¤t->thread.fpu; void __user *buf_fx = buf; bool ia32_fxstate = false; - int ret; + bool success = false; + unsigned int size; if (unlikely(!buf)) { fpu__clear_user_states(fpu); - return 0; + return true; } + size = xstate_sigframe_size(fpu->fpstate); + ia32_frame &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); @@ -451,30 +485,28 @@ int fpu__restore_sig(void __user *buf, int ia32_frame) ia32_fxstate = true; } - if (!access_ok(buf, size)) { - ret = -EACCES; + if (!access_ok(buf, size)) goto out; - } if (!IS_ENABLED(CONFIG_X86_64) && !cpu_feature_enabled(X86_FEATURE_FPU)) { - ret = fpregs_soft_set(current, NULL, 0, - sizeof(struct user_i387_ia32_struct), - NULL, buf); + success = !fpregs_soft_set(current, NULL, 0, + sizeof(struct user_i387_ia32_struct), + NULL, buf); } else { - ret = __fpu_restore_sig(buf, buf_fx, ia32_fxstate); + success = __fpu_restore_sig(buf, buf_fx, ia32_fxstate); } out: - if (unlikely(ret)) + if (unlikely(!success)) fpu__clear_user_states(fpu); - return ret; + return success; } unsigned long fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, unsigned long *size) { - unsigned long frame_size = xstate_sigframe_size(); + unsigned long frame_size = xstate_sigframe_size(current->thread.fpu.fpstate); *buf_fx = sp = round_down(sp - frame_size, 64); if (ia32_frame && use_fxsr()) { @@ -487,9 +519,12 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, return sp; } -unsigned long fpu__get_fpstate_size(void) +unsigned long __init fpu__get_fpstate_size(void) { - unsigned long ret = xstate_sigframe_size(); + unsigned long ret = fpu_user_cfg.max_size; + + if (use_xsave()) + ret += FP_XSTATE_MAGIC2_SIZE; /* * This space is needed on (most) 32-bit kernels, or when a 32-bit @@ -505,28 +540,3 @@ unsigned long fpu__get_fpstate_size(void) return ret; } -/* - * Prepare the SW reserved portion of the fxsave memory layout, indicating - * the presence of the extended state information in the memory layout - * pointed by the fpstate pointer in the sigcontext. - * This will be saved when ever the FP and extended state context is - * saved on the user stack during the signal handler delivery to the user. - */ -void fpu__init_prepare_fx_sw_frame(void) -{ - int size = fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE; - - fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; - fx_sw_reserved.extended_size = size; - fx_sw_reserved.xfeatures = xfeatures_mask_uabi(); - fx_sw_reserved.xstate_size = fpu_user_xstate_size; - - if (IS_ENABLED(CONFIG_IA32_EMULATION) || - IS_ENABLED(CONFIG_X86_32)) { - int fsave_header_size = sizeof(struct fregs_state); - - fx_sw_reserved_ia32 = fx_sw_reserved; - fx_sw_reserved_ia32.extended_size = size + fsave_header_size; - } -} - diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c8def1b7f8fb..d28829403ed0 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -4,21 +4,33 @@ * * Author: Suresh Siddha <suresh.b.siddha@intel.com> */ +#include <linux/bitops.h> #include <linux/compat.h> #include <linux/cpu.h> #include <linux/mman.h> +#include <linux/nospec.h> #include <linux/pkeys.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> +#include <linux/vmalloc.h> #include <asm/fpu/api.h> -#include <asm/fpu/internal.h> -#include <asm/fpu/signal.h> #include <asm/fpu/regset.h> -#include <asm/fpu/xstate.h> +#include <asm/fpu/signal.h> +#include <asm/fpu/xcr.h> #include <asm/tlbflush.h> -#include <asm/cpufeature.h> +#include <asm/prctl.h> +#include <asm/elf.h> + +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include "xstate.h" + +#define for_each_extended_xfeature(bit, mask) \ + (bit) = FIRST_EXTENDED_XFEATURE; \ + for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask)) /* * Although we spell it out in here, the Processor Trace @@ -39,29 +51,32 @@ static const char *xfeature_names[] = "Protection Keys User registers", "PASID state", "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "AMX Tile config" , + "AMX Tile data" , + "unknown xstate feature" , }; -static short xsave_cpuid_features[] __initdata = { - X86_FEATURE_FPU, - X86_FEATURE_XMM, - X86_FEATURE_AVX, - X86_FEATURE_MPX, - X86_FEATURE_MPX, - X86_FEATURE_AVX512F, - X86_FEATURE_AVX512F, - X86_FEATURE_AVX512F, - X86_FEATURE_INTEL_PT, - X86_FEATURE_PKU, - X86_FEATURE_ENQCMD, +static unsigned short xsave_cpuid_features[] __initdata = { + [XFEATURE_FP] = X86_FEATURE_FPU, + [XFEATURE_SSE] = X86_FEATURE_XMM, + [XFEATURE_YMM] = X86_FEATURE_AVX, + [XFEATURE_BNDREGS] = X86_FEATURE_MPX, + [XFEATURE_BNDCSR] = X86_FEATURE_MPX, + [XFEATURE_OPMASK] = X86_FEATURE_AVX512F, + [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F, + [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F, + [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, + [XFEATURE_PKRU] = X86_FEATURE_PKU, + [XFEATURE_PASID] = X86_FEATURE_ENQCMD, + [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, + [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, }; -/* - * This represents the full set of bits that should ever be set in a kernel - * XSAVE buffer, both supervisor and user xstates. - */ -u64 xfeatures_mask_all __ro_after_init; -EXPORT_SYMBOL_GPL(xfeatures_mask_all); - static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = @@ -72,20 +87,13 @@ static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] __ro_after_init { [ 0 ... XFEATURE_MAX - 1] = -1}; /* - * The XSAVE area of kernel can be in standard or compacted format; - * it is always in standard format for user mode. This is the user - * mode standard format size used for signal and ptrace frames. - */ -unsigned int fpu_user_xstate_size __ro_after_init; - -/* * Return whether the system supports a given xfeature. * * Also return the name of the (most advanced) feature that the caller requested: */ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) { - u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_all; + u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features; if (unlikely(feature_name)) { long xfeature_idx, max_idx; @@ -135,17 +143,26 @@ static bool xfeature_is_supervisor(int xfeature_nr) */ void fpu__init_cpu_xstate(void) { - if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_all) + if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features) return; cr4_set_bits(X86_CR4_OSXSAVE); /* + * Must happen after CR4 setup and before xsetbv() to allow KVM + * lazy passthrough. Write independent of the dynamic state static + * key as that does not work on the boot CPU. This also ensures + * that any stale state is wiped out from XFD. + */ + if (cpu_feature_enabled(X86_FEATURE_XFD)) + wrmsrl(MSR_IA32_XFD, init_fpstate.xfd); + + /* * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user * states can be set here. */ - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_uabi()); + xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); /* * MSR_IA32_XSS sets supervisor states managed by XSAVES. @@ -158,7 +175,7 @@ void fpu__init_cpu_xstate(void) static bool xfeature_enabled(enum xfeature xfeature) { - return xfeatures_mask_all & BIT_ULL(xfeature); + return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); } /* @@ -184,10 +201,7 @@ static void __init setup_xstate_features(void) xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, xmm_space); - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; - + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); xstate_sizes[i] = eax; @@ -236,6 +250,8 @@ static void __init print_xstate_features(void) print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); print_xstate_feature(XFEATURE_MASK_PKRU); print_xstate_feature(XFEATURE_MASK_PASID); + print_xstate_feature(XFEATURE_MASK_XTILE_CFG); + print_xstate_feature(XFEATURE_MASK_XTILE_DATA); } /* @@ -291,20 +307,15 @@ static void __init setup_xstate_comp_offsets(void) xstate_comp_offsets[XFEATURE_SSE] = offsetof(struct fxregs_state, xmm_space); - if (!boot_cpu_has(X86_FEATURE_XSAVES)) { - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (xfeature_enabled(i)) - xstate_comp_offsets[i] = xstate_offsets[i]; - } + if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) { + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) + xstate_comp_offsets[i] = xstate_offsets[i]; return; } next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; - + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { if (xfeature_is_aligned(i)) next_offset = ALIGN(next_offset, 64); @@ -328,8 +339,8 @@ static void __init setup_supervisor_only_offsets(void) next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i) || !xfeature_is_supervisor(i)) + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { + if (!xfeature_is_supervisor(i)) continue; if (xfeature_is_aligned(i)) @@ -347,15 +358,36 @@ static void __init print_xstate_offset_size(void) { int i; - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, xstate_comp_offsets[i], i, xstate_sizes[i]); } } /* + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. + */ +static __init void os_xrstor_booting(struct xregs_state *xstate) +{ + u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + else + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + + /* + * We should never fault when copying from a kernel buffer, and the FPU + * state we set at boot time should be valid. + */ + WARN_ON_FPU(err); +} + +/* * All supported features have either init state all zeros or are * handled in setup_init_fpu() individually. This is an explicit * feature list and does not use XFEATURE_MASK*SUPPORTED to catch @@ -372,36 +404,30 @@ static void __init print_xstate_offset_size(void) XFEATURE_MASK_PKRU | \ XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR | \ - XFEATURE_MASK_PASID) + XFEATURE_MASK_PASID | \ + XFEATURE_MASK_XTILE) /* * setup the xstate image representing the init state */ static void __init setup_init_fpu_buf(void) { - static int on_boot_cpu __initdata = 1; - BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED) != XFEATURES_INIT_FPSTATE_HANDLED); - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; - if (!boot_cpu_has(X86_FEATURE_XSAVE)) return; setup_xstate_features(); print_xstate_features(); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - init_fpstate.xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | - xfeatures_mask_all; + xstate_init_xcomp_bv(&init_fpstate.regs.xsave, fpu_kernel_cfg.max_features); /* * Init all the features state with header.xfeatures being 0x0 */ - os_xrstor_booting(&init_fpstate.xsave); + os_xrstor_booting(&init_fpstate.regs.xsave); /* * All components are now in init state. Read the state back so @@ -419,7 +445,7 @@ static void __init setup_init_fpu_buf(void) * state is all zeroes or if not to add the necessary handling * here. */ - fxsave(&init_fpstate.fxsave); + fxsave(&init_fpstate.regs.fxsave); } static int xfeature_uncompacted_offset(int xfeature_nr) @@ -451,10 +477,11 @@ int xfeature_size(int xfeature_nr) } /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ -static int validate_user_xstate_header(const struct xstate_header *hdr) +static int validate_user_xstate_header(const struct xstate_header *hdr, + struct fpstate *fpstate) { /* No unknown or supervisor features may be set */ - if (hdr->xfeatures & ~xfeatures_mask_uabi()) + if (hdr->xfeatures & ~fpstate->user_xfeatures) return -EINVAL; /* Userspace must use the uncompacted format */ @@ -474,7 +501,7 @@ static int validate_user_xstate_header(const struct xstate_header *hdr) return 0; } -static void __xstate_dump_leaves(void) +static void __init __xstate_dump_leaves(void) { int i; u32 eax, ebx, ecx, edx; @@ -509,12 +536,73 @@ static void __xstate_dump_leaves(void) } \ } while (0) +/** + * check_xtile_data_against_struct - Check tile data state size. + * + * Calculate the state size by multiplying the single tile size which is + * recorded in a C struct, and the number of tiles that the CPU informs. + * Compare the provided size with the calculation. + * + * @size: The tile data state size + * + * Returns: 0 on success, -EINVAL on mismatch. + */ +static int __init check_xtile_data_against_struct(int size) +{ + u32 max_palid, palid, state_size; + u32 eax, ebx, ecx, edx; + u16 max_tile; + + /* + * Check the maximum palette id: + * eax: the highest numbered palette subleaf. + */ + cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx); + + /* + * Cross-check each tile size and find the maximum number of + * supported tiles. + */ + for (palid = 1, max_tile = 0; palid <= max_palid; palid++) { + u16 tile_size, max; + + /* + * Check the tile size info: + * eax[31:16]: bytes per title + * ebx[31:16]: the max names (or max number of tiles) + */ + cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx); + tile_size = eax >> 16; + max = ebx >> 16; + + if (tile_size != sizeof(struct xtile_data)) { + pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n", + __stringify(XFEATURE_XTILE_DATA), + sizeof(struct xtile_data), tile_size); + __xstate_dump_leaves(); + return -EINVAL; + } + + if (max > max_tile) + max_tile = max; + } + + state_size = sizeof(struct xtile_data) * max_tile; + if (size != state_size) { + pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n", + __stringify(XFEATURE_XTILE_DATA), state_size, size); + __xstate_dump_leaves(); + return -EINVAL; + } + return 0; +} + /* * We have a C struct for each 'xstate'. We need to ensure * that our software representation matches what the CPU * tells us about the state's size. */ -static void check_xstate_against_struct(int nr) +static bool __init check_xstate_against_struct(int nr) { /* * Ask the CPU for the size of the state. @@ -532,6 +620,11 @@ static void check_xstate_against_struct(int nr) XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state); XCHECK_SZ(sz, nr, XFEATURE_PASID, struct ia32_pasid_state); + XCHECK_SZ(sz, nr, XFEATURE_XTILE_CFG, struct xtile_cfg); + + /* The tile data size varies between implementations. */ + if (nr == XFEATURE_XTILE_DATA) + check_xtile_data_against_struct(sz); /* * Make *SURE* to add any feature numbers in below if @@ -541,10 +634,39 @@ static void check_xstate_against_struct(int nr) if ((nr < XFEATURE_YMM) || (nr >= XFEATURE_MAX) || (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || - ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_LBR))) { + ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) { WARN_ONCE(1, "no structure for xstate: %d\n", nr); XSTATE_WARN_ON(1); + return false; + } + return true; +} + +static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) +{ + unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; + int i; + + for_each_extended_xfeature(i, xfeatures) { + /* Align from the end of the previous feature */ + if (xfeature_is_aligned(i)) + size = ALIGN(size, 64); + /* + * In compacted format the enabled features are packed, + * i.e. disabled features do not occupy space. + * + * In non-compacted format the offsets are fixed and + * disabled states still occupy space in the memory buffer. + */ + if (!compacted) + size = xfeature_uncompacted_offset(i); + /* + * Add the feature size even for non-compacted format + * to make the end result correct + */ + size += xfeature_size(i); } + return size; } /* @@ -556,44 +678,29 @@ static void check_xstate_against_struct(int nr) * covered by these checks. Only the size of the buffer for task->fpu * is checked here. */ -static void do_extra_xstate_size_checks(void) +static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) { - int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; + bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; - - check_xstate_against_struct(i); + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { + if (!check_xstate_against_struct(i)) + return false; /* * Supervisor state components can be managed only by * XSAVES. */ - if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) - XSTATE_WARN_ON(xfeature_is_supervisor(i)); - - /* Align from the end of the previous feature */ - if (xfeature_is_aligned(i)) - paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64); - /* - * The offset of a given state in the non-compacted - * format is given to us in a CPUID leaf. We check - * them for being ordered (increasing offsets) in - * setup_xstate_features(). XSAVES uses compacted format. - */ - if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) - paranoid_xstate_size = xfeature_uncompacted_offset(i); - /* - * The compacted-format offset always depends on where - * the previous state ended. - */ - paranoid_xstate_size += xfeature_size(i); + if (!compacted && xfeature_is_supervisor(i)) { + XSTATE_WARN_ON(1); + return false; + } } - XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size); + size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted); + XSTATE_WARN_ON(size != kernel_size); + return size == kernel_size; } - /* * Get total size of enabled xstates in XCR0 | IA32_XSS. * @@ -644,7 +751,7 @@ static unsigned int __init get_xsaves_size_no_independent(void) return size; } -static unsigned int __init get_xsave_size(void) +static unsigned int __init get_xsave_size_user(void) { unsigned int eax, ebx, ecx, edx; /* @@ -662,44 +769,54 @@ static unsigned int __init get_xsave_size(void) * Will the runtime-enumerated 'xstate_size' fit in the init * task's statically-allocated buffer? */ -static bool is_supported_xstate_size(unsigned int test_xstate_size) +static bool __init is_supported_xstate_size(unsigned int test_xstate_size) { - if (test_xstate_size <= sizeof(union fpregs_state)) + if (test_xstate_size <= sizeof(init_fpstate.regs)) return true; pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n", - sizeof(union fpregs_state), test_xstate_size); + sizeof(init_fpstate.regs), test_xstate_size); return false; } static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ - unsigned int possible_xstate_size; - unsigned int xsave_size; + unsigned int user_size, kernel_size, kernel_default_size; + bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); - xsave_size = get_xsave_size(); + /* Uncompacted user space size */ + user_size = get_xsave_size_user(); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - possible_xstate_size = get_xsaves_size_no_independent(); + /* + * XSAVES kernel size includes supervisor states and + * uses compacted format when available. + * + * XSAVE does not support supervisor states so + * kernel and user size is identical. + */ + if (compacted) + kernel_size = get_xsaves_size_no_independent(); else - possible_xstate_size = xsave_size; + kernel_size = user_size; + + kernel_default_size = + xstate_calculate_size(fpu_kernel_cfg.default_features, compacted); - /* Ensure we have the space to store all enabled: */ - if (!is_supported_xstate_size(possible_xstate_size)) + /* Ensure we have the space to store all default enabled features. */ + if (!is_supported_xstate_size(kernel_default_size)) return -EINVAL; - /* - * The size is OK, we are definitely going to use xsave, - * make it known to the world that we need more space. - */ - fpu_kernel_xstate_size = possible_xstate_size; - do_extra_xstate_size_checks(); + if (!paranoid_xstate_size_valid(kernel_size)) + return -EINVAL; + + fpu_kernel_cfg.max_size = kernel_size; + fpu_user_cfg.max_size = user_size; + + fpu_kernel_cfg.default_size = kernel_default_size; + fpu_user_cfg.default_size = + xstate_calculate_size(fpu_user_cfg.default_features, false); - /* - * User space is always in standard format. - */ - fpu_user_xstate_size = xsave_size; return 0; } @@ -707,28 +824,38 @@ static int __init init_xstate_size(void) * We enabled the XSAVE hardware, but something went wrong and * we can not use it. Disable it. */ -static void fpu__init_disable_system_xstate(void) +static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) { - xfeatures_mask_all = 0; + fpu_kernel_cfg.max_features = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); + + /* Restore the legacy size.*/ + fpu_kernel_cfg.max_size = legacy_size; + fpu_kernel_cfg.default_size = legacy_size; + fpu_user_cfg.max_size = legacy_size; + fpu_user_cfg.default_size = legacy_size; + + /* + * Prevent enabling the static branch which enables writes to the + * XFD MSR. + */ + init_fpstate.xfd = 0; + + fpstate_reset(¤t->thread.fpu); } /* * Enable and initialize the xsave feature. * Called once per system bootup. */ -void __init fpu__init_system_xstate(void) +void __init fpu__init_system_xstate(unsigned int legacy_size) { unsigned int eax, ebx, ecx, edx; - static int on_boot_cpu __initdata = 1; u64 xfeatures; int err; int i; - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; - if (!boot_cpu_has(X86_FEATURE_FPU)) { pr_info("x86/fpu: No FPU detected\n"); return; @@ -749,22 +876,22 @@ void __init fpu__init_system_xstate(void) * Find user xstates supported by the processor. */ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - xfeatures_mask_all = eax + ((u64)edx << 32); + fpu_kernel_cfg.max_features = eax + ((u64)edx << 32); /* * Find supervisor xstates supported by the processor. */ cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); - xfeatures_mask_all |= ecx + ((u64)edx << 32); + fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32); - if ((xfeatures_mask_uabi() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { + if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* * This indicates that something really unexpected happened * with the enumeration. Disable XSAVE and try to continue * booting without it. This is too early to BUG(). */ pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", - xfeatures_mask_all); + fpu_kernel_cfg.max_features); goto out_disable; } @@ -772,15 +899,39 @@ void __init fpu__init_system_xstate(void) * Clear XSAVE features that are disabled in the normal CPUID. */ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { - if (!boot_cpu_has(xsave_cpuid_features[i])) - xfeatures_mask_all &= ~BIT_ULL(i); + unsigned short cid = xsave_cpuid_features[i]; + + /* Careful: X86_FEATURE_FPU is 0! */ + if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid)) + fpu_kernel_cfg.max_features &= ~BIT_ULL(i); } - xfeatures_mask_all &= XFEATURE_MASK_USER_SUPPORTED | + if (!cpu_feature_enabled(X86_FEATURE_XFD)) + fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC; + + fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED; + fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; + fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; + + /* Clean out dynamic features from default */ + fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features; + fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; + + fpu_user_cfg.default_features = fpu_user_cfg.max_features; + fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; + /* Store it for paranoia check at the end */ - xfeatures = xfeatures_mask_all; + xfeatures = fpu_kernel_cfg.max_features; + + /* + * Initialize the default XFD state in initfp_state and enable the + * dynamic sizing mechanism if dynamic states are available. The + * static key cannot be enabled here because this runs before + * jump_label_init(). This is delayed to an initcall. + */ + init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC; /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); @@ -788,13 +939,16 @@ void __init fpu__init_system_xstate(void) if (err) goto out_disable; + /* Reset the state for the current task */ + fpstate_reset(¤t->thread.fpu); + /* * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: */ - update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_uabi()); + update_regset_xstate_info(fpu_user_cfg.max_size, + fpu_user_cfg.max_features); - fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); setup_xstate_comp_offsets(); setup_supervisor_only_offsets(); @@ -803,22 +957,22 @@ void __init fpu__init_system_xstate(void) * Paranoia check whether something in the setup modified the * xfeatures mask. */ - if (xfeatures != xfeatures_mask_all) { + if (xfeatures != fpu_kernel_cfg.max_features) { pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n", - xfeatures, xfeatures_mask_all); + xfeatures, fpu_kernel_cfg.max_features); goto out_disable; } print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", - xfeatures_mask_all, - fpu_kernel_xstate_size, + fpu_kernel_cfg.max_features, + fpu_kernel_cfg.max_size, boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); return; out_disable: /* something went wrong, try to boot without any XSAVE support */ - fpu__init_disable_system_xstate(); + fpu__init_disable_system_xstate(legacy_size); } /* @@ -830,7 +984,7 @@ void fpu__resume_cpu(void) * Restore XCR0 on xsave capable CPUs: */ if (cpu_feature_enabled(X86_FEATURE_XSAVE)) - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_uabi()); + xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); /* * Restore IA32_XSS. The same CPUID bit enumerates support @@ -840,6 +994,9 @@ void fpu__resume_cpu(void) wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } + + if (fpu_state_size_dynamic()) + wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd); } /* @@ -886,7 +1043,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) * We should not ever be requesting features that we * have not enabled. */ - WARN_ONCE(!(xfeatures_mask_all & BIT_ULL(xfeature_nr)), + WARN_ONCE(!(fpu_kernel_cfg.max_features & BIT_ULL(xfeature_nr)), "get of unsupported state"); /* * This assumes the last 'xsave*' instruction to @@ -904,7 +1061,6 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) return __raw_xsave_addr(xsave, xfeature_nr); } -EXPORT_SYMBOL_GPL(get_xsave_addr); #ifdef CONFIG_ARCH_HAS_PKEYS @@ -961,9 +1117,10 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, } /** - * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer + * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor - * @tsk: The task from which to copy the saved xstate + * @fpstate: The fpstate buffer from which to copy + * @pkru_val: The PKRU value to store in the PKRU component * @copy_mode: The requested copy mode * * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming @@ -972,14 +1129,15 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, * * It supports partial copy but @to.pos always starts from zero. */ -void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, - enum xstate_copy_mode copy_mode) +void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, + u32 pkru_val, enum xstate_copy_mode copy_mode) { const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); - struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; - struct xregs_state *xinit = &init_fpstate.xsave; + struct xregs_state *xinit = &init_fpstate.regs.xsave; + struct xregs_state *xsave = &fpstate->regs.xsave; struct xstate_header header; unsigned int zerofrom; + u64 mask; int i; memset(&header, 0, sizeof(header)); @@ -996,7 +1154,7 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, break; case XSTATE_COPY_XSAVE: - header.xfeatures &= xfeatures_mask_uabi(); + header.xfeatures &= fpstate->user_xfeatures; break; } @@ -1033,17 +1191,15 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, zerofrom = offsetof(struct xregs_state, extended_state_area); - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - /* - * The ptrace buffer is in non-compacted XSAVE format. - * In non-compacted format disabled features still occupy - * state space, but there is no state to copy from in the - * compacted init_fpstate. The gap tracking will zero this - * later. - */ - if (!(xfeatures_mask_uabi() & BIT_ULL(i))) - continue; + /* + * The ptrace buffer is in non-compacted XSAVE format. In + * non-compacted format disabled features still occupy state space, + * but there is no state to copy from in the compacted + * init_fpstate. The gap tracking will zero these states. + */ + mask = fpstate->user_xfeatures; + for_each_extended_xfeature(i, mask) { /* * If there was a feature or alignment gap, zero the space * in the destination buffer. @@ -1055,10 +1211,9 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, struct pkru_state pkru = {0}; /* * PKRU is not necessarily up to date in the - * thread's XSAVE buffer. Fill this part from the - * per-thread storage. + * XSAVE buffer. Use the provided value. */ - pkru.pkru = tsk->thread.pkru; + pkru.pkru = pkru_val; membuf_write(&to, &pkru, sizeof(pkru)); } else { copy_feature(header.xfeatures & BIT_ULL(i), &to, @@ -1078,6 +1233,25 @@ out: membuf_zero(&to, to.left); } +/** + * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer + * @to: membuf descriptor + * @tsk: The task from which to copy the saved xstate + * @copy_mode: The requested copy mode + * + * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming + * format, i.e. from the kernel internal hardware dependent storage format + * to the requested @mode. UABI XSTATE is always uncompacted! + * + * It supports partial copy but @to.pos always starts from zero. + */ +void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, + enum xstate_copy_mode copy_mode) +{ + __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate, + tsk->thread.pkru, copy_mode); +} + static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, const void *kbuf, const void __user *ubuf) { @@ -1091,9 +1265,10 @@ static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, } -static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf, +static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, const void __user *ubuf) { + struct xregs_state *xsave = &fpstate->regs.xsave; unsigned int offset, size; struct xstate_header hdr; u64 mask; @@ -1103,7 +1278,7 @@ static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf, if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf)) return -EFAULT; - if (validate_user_xstate_header(&hdr)) + if (validate_user_xstate_header(&hdr, fpstate)) return -EINVAL; /* Validate MXCSR when any of the related features is in use */ @@ -1156,12 +1331,11 @@ static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf, /* * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] - * format and copy to the target thread. This is called from - * xstateregs_set(). + * format and copy to the target thread. Used by ptrace and KVM. */ -int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) +int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf) { - return copy_uabi_to_xstate(xsave, kbuf, NULL); + return copy_uabi_to_xstate(fpstate, kbuf, NULL); } /* @@ -1169,26 +1343,20 @@ int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) * XSAVE[S] format and copy to the target thread. This is called from the * sigreturn() and rt_sigreturn() system calls. */ -int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, +int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void __user *ubuf) { - return copy_uabi_to_xstate(xsave, NULL, ubuf); + return copy_uabi_to_xstate(fpstate, NULL, ubuf); } -static bool validate_xsaves_xrstors(u64 mask) +static bool validate_independent_components(u64 mask) { u64 xchk; if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES))) return false; - /* - * Validate that this is either a task->fpstate related component - * subset or an independent one. - */ - if (mask & xfeatures_mask_independent()) - xchk = ~xfeatures_mask_independent(); - else - xchk = ~xfeatures_mask_all; + + xchk = ~xfeatures_mask_independent(); if (WARN_ON_ONCE(!mask || mask & xchk)) return false; @@ -1206,14 +1374,13 @@ static bool validate_xsaves_xrstors(u64 mask) * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer * can #GP. * - * The feature mask must either be a subset of the independent features or - * a subset of the task->fpstate related features. + * The feature mask must be a subset of the independent features. */ void xsaves(struct xregs_state *xstate, u64 mask) { int err; - if (!validate_xsaves_xrstors(mask)) + if (!validate_independent_components(mask)) return; XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err); @@ -1231,20 +1398,379 @@ void xsaves(struct xregs_state *xstate, u64 mask) * Proper usage is to restore the state which was saved with * xsaves() into @xstate. * - * The feature mask must either be a subset of the independent features or - * a subset of the task->fpstate related features. + * The feature mask must be a subset of the independent features. */ void xrstors(struct xregs_state *xstate, u64 mask) { int err; - if (!validate_xsaves_xrstors(mask)) + if (!validate_independent_components(mask)) return; XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err); WARN_ON_ONCE(err); } +#if IS_ENABLED(CONFIG_KVM) +void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature) +{ + void *addr = get_xsave_addr(&fps->regs.xsave, xfeature); + + if (addr) + memset(addr, 0, xstate_sizes[xfeature]); +} +EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); +#endif + +#ifdef CONFIG_X86_64 + +#ifdef CONFIG_X86_DEBUG_FPU +/* + * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask + * can safely operate on the @fpstate buffer. + */ +static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor) +{ + u64 xfd = __this_cpu_read(xfd_state); + + if (fpstate->xfd == xfd) + return true; + + /* + * The XFD MSR does not match fpstate->xfd. That's invalid when + * the passed in fpstate is current's fpstate. + */ + if (fpstate->xfd == current->thread.fpu.fpstate->xfd) + return false; + + /* + * XRSTOR(S) from init_fpstate are always correct as it will just + * bring all components into init state and not read from the + * buffer. XSAVE(S) raises #PF after init. + */ + if (fpstate == &init_fpstate) + return rstor; + + /* + * XSAVE(S): clone(), fpu_swap_kvm_fpu() + * XRSTORS(S): fpu_swap_kvm_fpu() + */ + + /* + * No XSAVE/XRSTOR instructions (except XSAVE itself) touch + * the buffer area for XFD-disabled state components. + */ + mask &= ~xfd; + + /* + * Remove features which are valid in fpstate. They + * have space allocated in fpstate. + */ + mask &= ~fpstate->xfeatures; + + /* + * Any remaining state components in 'mask' might be written + * by XSAVE/XRSTOR. Fail validation it found. + */ + return !mask; +} + +void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) +{ + WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor)); +} +#endif /* CONFIG_X86_DEBUG_FPU */ + +static int __init xfd_update_static_branch(void) +{ + /* + * If init_fpstate.xfd has bits set then dynamic features are + * available and the dynamic sizing must be enabled. + */ + if (init_fpstate.xfd) + static_branch_enable(&__fpu_state_size_dynamic); + return 0; +} +arch_initcall(xfd_update_static_branch) + +void fpstate_free(struct fpu *fpu) +{ + if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate) + vfree(fpu->fpstate); +} + +/** + * fpu_install_fpstate - Update the active fpstate in the FPU + * + * @fpu: A struct fpu * pointer + * @newfps: A struct fpstate * pointer + * + * Returns: A null pointer if the last active fpstate is the embedded + * one or the new fpstate is already installed; + * otherwise, a pointer to the old fpstate which has to + * be freed by the caller. + */ +static struct fpstate *fpu_install_fpstate(struct fpu *fpu, + struct fpstate *newfps) +{ + struct fpstate *oldfps = fpu->fpstate; + + if (fpu->fpstate == newfps) + return NULL; + + fpu->fpstate = newfps; + return oldfps != &fpu->__fpstate ? oldfps : NULL; +} + +/** + * fpstate_realloc - Reallocate struct fpstate for the requested new features + * + * @xfeatures: A bitmap of xstate features which extend the enabled features + * of that task + * @ksize: The required size for the kernel buffer + * @usize: The required size for user space buffers + * + * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer + * terminates quickly, vfree()-induced IPIs may be a concern, but tasks + * with large states are likely to live longer. + * + * Returns: 0 on success, -ENOMEM on allocation error. + */ +static int fpstate_realloc(u64 xfeatures, unsigned int ksize, + unsigned int usize) +{ + struct fpu *fpu = ¤t->thread.fpu; + struct fpstate *curfps, *newfps = NULL; + unsigned int fpsize; + + curfps = fpu->fpstate; + fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64); + + newfps = vzalloc(fpsize); + if (!newfps) + return -ENOMEM; + newfps->size = ksize; + newfps->user_size = usize; + newfps->is_valloc = true; + + fpregs_lock(); + /* + * Ensure that the current state is in the registers before + * swapping fpstate as that might invalidate it due to layout + * changes. + */ + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + fpregs_restore_userregs(); + + newfps->xfeatures = curfps->xfeatures | xfeatures; + newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; + newfps->xfd = curfps->xfd & ~xfeatures; + + curfps = fpu_install_fpstate(fpu, newfps); + + /* Do the final updates within the locked region */ + xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures); + xfd_update_state(newfps); + + fpregs_unlock(); + + vfree(curfps); + return 0; +} + +static int validate_sigaltstack(unsigned int usize) +{ + struct task_struct *thread, *leader = current->group_leader; + unsigned long framesize = get_sigframe_size(); + + lockdep_assert_held(¤t->sighand->siglock); + + /* get_sigframe_size() is based on fpu_user_cfg.max_size */ + framesize -= fpu_user_cfg.max_size; + framesize += usize; + for_each_thread(leader, thread) { + if (thread->sas_ss_size && thread->sas_ss_size < framesize) + return -ENOSPC; + } + return 0; +} + +static int __xstate_request_perm(u64 permitted, u64 requested) +{ + /* + * This deliberately does not exclude !XSAVES as we still might + * decide to optionally context switch XCR0 or talk the silicon + * vendors into extending XFD for the pre AMX states, especially + * AVX512. + */ + bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + struct fpu *fpu = ¤t->group_leader->thread.fpu; + unsigned int ksize, usize; + u64 mask; + int ret; + + /* Check whether fully enabled */ + if ((permitted & requested) == requested) + return 0; + + /* Calculate the resulting kernel state size */ + mask = permitted | requested; + ksize = xstate_calculate_size(mask, compacted); + + /* Calculate the resulting user state size */ + mask &= XFEATURE_MASK_USER_SUPPORTED; + usize = xstate_calculate_size(mask, false); + + ret = validate_sigaltstack(usize); + if (ret) + return ret; + + /* Pairs with the READ_ONCE() in xstate_get_group_perm() */ + WRITE_ONCE(fpu->perm.__state_perm, requested); + /* Protected by sighand lock */ + fpu->perm.__state_size = ksize; + fpu->perm.__user_state_size = usize; + return ret; +} + +/* + * Permissions array to map facilities with more than one component + */ +static const u64 xstate_prctl_req[XFEATURE_MAX] = { + [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA, +}; + +static int xstate_request_perm(unsigned long idx) +{ + u64 permitted, requested; + int ret; + + if (idx >= XFEATURE_MAX) + return -EINVAL; + + /* + * Look up the facility mask which can require more than + * one xstate component. + */ + idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req)); + requested = xstate_prctl_req[idx]; + if (!requested) + return -EOPNOTSUPP; + + if ((fpu_user_cfg.max_features & requested) != requested) + return -EOPNOTSUPP; + + /* Lockless quick check */ + permitted = xstate_get_host_group_perm(); + if ((permitted & requested) == requested) + return 0; + + /* Protect against concurrent modifications */ + spin_lock_irq(¤t->sighand->siglock); + permitted = xstate_get_host_group_perm(); + ret = __xstate_request_perm(permitted, requested); + spin_unlock_irq(¤t->sighand->siglock); + return ret; +} + +int xfd_enable_feature(u64 xfd_err) +{ + u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; + unsigned int ksize, usize; + struct fpu *fpu; + + if (!xfd_event) { + pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); + return 0; + } + + /* Protect against concurrent modifications */ + spin_lock_irq(¤t->sighand->siglock); + + /* If not permitted let it die */ + if ((xstate_get_host_group_perm() & xfd_event) != xfd_event) { + spin_unlock_irq(¤t->sighand->siglock); + return -EPERM; + } + + fpu = ¤t->group_leader->thread.fpu; + ksize = fpu->perm.__state_size; + usize = fpu->perm.__user_state_size; + /* + * The feature is permitted. State size is sufficient. Dropping + * the lock is safe here even if more features are added from + * another task, the retrieved buffer sizes are valid for the + * currently requested feature(s). + */ + spin_unlock_irq(¤t->sighand->siglock); + + /* + * Try to allocate a new fpstate. If that fails there is no way + * out. + */ + if (fpstate_realloc(xfd_event, ksize, usize)) + return -EFAULT; + return 0; +} +#else /* CONFIG_X86_64 */ +static inline int xstate_request_perm(unsigned long idx) +{ + return -EPERM; +} +#endif /* !CONFIG_X86_64 */ + +/** + * fpu_xstate_prctl - xstate permission operations + * @tsk: Redundant pointer to current + * @option: A subfunction of arch_prctl() + * @arg2: option argument + * Return: 0 if successful; otherwise, an error code + * + * Option arguments: + * + * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info + * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info + * ARCH_REQ_XCOMP_PERM: Facility number requested + * + * For facilities which require more than one XSTATE component, the request + * must be the highest state component number related to that facility, + * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and + * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18). + */ +long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2) +{ + u64 __user *uptr = (u64 __user *)arg2; + u64 permitted, supported; + unsigned long idx = arg2; + + if (tsk != current) + return -EPERM; + + switch (option) { + case ARCH_GET_XCOMP_SUPP: + supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features; + return put_user(supported, uptr); + + case ARCH_GET_XCOMP_PERM: + /* + * Lockless snapshot as it can also change right after the + * dropping the lock. + */ + permitted = xstate_get_host_group_perm(); + permitted &= XFEATURE_MASK_USER_SUPPORTED; + return put_user(permitted, uptr); + + case ARCH_REQ_XCOMP_PERM: + if (!IS_ENABLED(CONFIG_X86_64)) + return -EOPNOTSUPP; + + return xstate_request_perm(idx); + + default: + return -EINVAL; + } +} + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h new file mode 100644 index 000000000000..e18210dff88c --- /dev/null +++ b/arch/x86/kernel/fpu/xstate.h @@ -0,0 +1,278 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_XSTATE_H +#define __X86_KERNEL_FPU_XSTATE_H + +#include <asm/cpufeature.h> +#include <asm/fpu/xstate.h> + +#ifdef CONFIG_X86_64 +DECLARE_PER_CPU(u64, xfd_state); +#endif + +static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) +{ + /* + * XRSTORS requires these bits set in xcomp_bv, or it will + * trigger #GP: + */ + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; +} + +static inline u64 xstate_get_host_group_perm(void) +{ + /* Pairs with WRITE_ONCE() in xstate_request_perm() */ + return READ_ONCE(current->group_leader->thread.fpu.perm.__state_perm); +} + +enum xstate_copy_mode { + XSTATE_COPY_FP, + XSTATE_COPY_FX, + XSTATE_COPY_XSAVE, +}; + +struct membuf; +extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, + u32 pkru_val, enum xstate_copy_mode copy_mode); +extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, + enum xstate_copy_mode mode); +extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf); +extern int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void __user *ubuf); + + +extern void fpu__init_cpu_xstate(void); +extern void fpu__init_system_xstate(unsigned int legacy_size); + +extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); + +static inline u64 xfeatures_mask_supervisor(void) +{ + return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED; +} + +static inline u64 xfeatures_mask_independent(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) + return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR; + + return XFEATURE_MASK_INDEPENDENT; +} + +/* XSAVE/XRSTOR wrapper functions */ + +#ifdef CONFIG_X86_64 +#define REX_PREFIX "0x48, " +#else +#define REX_PREFIX +#endif + +/* These macros all use (%edi)/(%rdi) as the single memory argument. */ +#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" +#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" +#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" +#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" +#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" + +/* + * After this @err contains 0 on success or the trap number when the + * operation raises an exception. + */ +#define XSTATE_OP(op, st, lmask, hmask, err) \ + asm volatile("1:" op "\n\t" \ + "xor %[err], %[err]\n" \ + "2:\n\t" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ + : [err] "=a" (err) \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +/* + * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact + * format and supervisor states in addition to modified optimization in + * XSAVEOPT. + * + * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT + * supports modified optimization which is not supported by XSAVE. + * + * We use XSAVE as a fallback. + * + * The 661 label is defined in the ALTERNATIVE* macros as the address of the + * original instruction which gets replaced. We need to use it here as the + * address of the instruction where we might get an exception at. + */ +#define XSTATE_XSAVE(st, lmask, hmask, err) \ + asm volatile(ALTERNATIVE_2(XSAVE, \ + XSAVEOPT, X86_FEATURE_XSAVEOPT, \ + XSAVES, X86_FEATURE_XSAVES) \ + "\n" \ + "xor %[err], %[err]\n" \ + "3:\n" \ + ".pushsection .fixup,\"ax\"\n" \ + "4: movl $-2, %[err]\n" \ + "jmp 3b\n" \ + ".popsection\n" \ + _ASM_EXTABLE(661b, 4b) \ + : [err] "=r" (err) \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +/* + * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact + * XSAVE area format. + */ +#define XSTATE_XRESTORE(st, lmask, hmask) \ + asm volatile(ALTERNATIVE(XRSTOR, \ + XRSTORS, X86_FEATURE_XSAVES) \ + "\n" \ + "3:\n" \ + _ASM_EXTABLE_TYPE(661b, 3b, EX_TYPE_FPU_RESTORE) \ + : \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU) +extern void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor); +#else +static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { } +#endif + +#ifdef CONFIG_X86_64 +static inline void xfd_update_state(struct fpstate *fpstate) +{ + if (fpu_state_size_dynamic()) { + u64 xfd = fpstate->xfd; + + if (__this_cpu_read(xfd_state) != xfd) { + wrmsrl(MSR_IA32_XFD, xfd); + __this_cpu_write(xfd_state, xfd); + } + } +} +#else +static inline void xfd_update_state(struct fpstate *fpstate) { } +#endif + +/* + * Save processor xstate to xsave area. + * + * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features + * and command line options. The choice is permanent until the next reboot. + */ +static inline void os_xsave(struct fpstate *fpstate) +{ + u64 mask = fpstate->xfeatures; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + WARN_ON_FPU(!alternatives_patched); + xfd_validate_state(fpstate, mask, false); + + XSTATE_XSAVE(&fpstate->regs.xsave, lmask, hmask, err); + + /* We should never fault when copying to a kernel buffer: */ + WARN_ON_FPU(err); +} + +/* + * Restore processor xstate from xsave area. + * + * Uses XRSTORS when XSAVES is used, XRSTOR otherwise. + */ +static inline void os_xrstor(struct fpstate *fpstate, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + + xfd_validate_state(fpstate, mask, true); + XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); +} + +/* Restore of supervisor state. Does not require XFD */ +static inline void os_xrstor_supervisor(struct fpstate *fpstate) +{ + u64 mask = xfeatures_mask_supervisor(); + u32 lmask = mask; + u32 hmask = mask >> 32; + + XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); +} + +/* + * Save xstate to user space xsave area. + * + * We don't use modified optimization because xrstor/xrstors might track + * a different application. + * + * We don't use compacted format xsave area for backward compatibility for + * old applications which don't understand the compacted format of the + * xsave area. + * + * The caller has to zero buf::header before calling this because XSAVE* + * does not touch the reserved fields in the header. + */ +static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) +{ + /* + * Include the features which are not xsaved/rstored by the kernel + * internally, e.g. PKRU. That's user space ABI and also required + * to allow the signal handler to modify PKRU. + */ + struct fpstate *fpstate = current->thread.fpu.fpstate; + u64 mask = fpstate->user_xfeatures; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + xfd_validate_state(fpstate, mask, false); + + stac(); + XSTATE_OP(XSAVE, buf, lmask, hmask, err); + clac(); + + return err; +} + +/* + * Restore xstate from user space xsave area. + */ +static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask) +{ + struct xregs_state *xstate = ((__force struct xregs_state *)buf); + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + xfd_validate_state(current->thread.fpu.fpstate, mask, true); + + stac(); + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + clac(); + + return err; +} + +/* + * Restore xstate from kernel space xsave area, return an error code instead of + * an exception. + */ +static inline int os_xrstor_safe(struct fpstate *fpstate, u64 mask) +{ + struct xregs_state *xstate = &fpstate->regs.xsave; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + /* Ensure that XFD is up to date */ + xfd_update_state(fpstate); + + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + else + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + + return err; +} + + +#endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index de01903c3735..fc5371a7e9d1 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -19,7 +19,7 @@ #include <linux/start_kernel.h> #include <linux/io.h> #include <linux/memblock.h> -#include <linux/mem_encrypt.h> +#include <linux/cc_platform.h> #include <linux/pgtable.h> #include <asm/processor.h> @@ -284,8 +284,13 @@ unsigned long __head __startup_64(unsigned long physaddr, * The bss section will be memset to zero later in the initialization so * there is no need to zero it after changing the memory encryption * attribute. + * + * This is early code, use an open coded check for SME instead of + * using cc_platform_has(). This eliminates worries about removing + * instrumentation or checking boot_cpu_data in the cc_platform_has() + * function. */ - if (mem_encrypt_active()) { + if (sme_get_me_mask()) { vaddr = (unsigned long)__start_bss_decrypted; vaddr_end = (unsigned long)__end_bss_decrypted; for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 42fc41dd0e1f..882213df3713 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -10,6 +10,7 @@ #include <asm/irq_remapping.h> #include <asm/hpet.h> #include <asm/time.h> +#include <asm/mwait.h> #undef pr_fmt #define pr_fmt(fmt) "hpet: " fmt @@ -916,6 +917,83 @@ static bool __init hpet_counting(void) return false; } +static bool __init mwait_pc10_supported(void) +{ + unsigned int eax, ebx, ecx, mwait_substates; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return false; + + if (!cpu_feature_enabled(X86_FEATURE_MWAIT)) + return false; + + if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return false; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); + + return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) && + (ecx & CPUID5_ECX_INTERRUPT_BREAK) && + (mwait_substates & (0xF << 28)); +} + +/* + * Check whether the system supports PC10. If so force disable HPET as that + * stops counting in PC10. This check is overbroad as it does not take any + * of the following into account: + * + * - ACPI tables + * - Enablement of intel_idle + * - Command line arguments which limit intel_idle C-state support + * + * That's perfectly fine. HPET is a piece of hardware designed by committee + * and the only reasons why it is still in use on modern systems is the + * fact that it is impossible to reliably query TSC and CPU frequency via + * CPUID or firmware. + * + * If HPET is functional it is useful for calibrating TSC, but this can be + * done via PMTIMER as well which seems to be the last remaining timer on + * X86/INTEL platforms that has not been completely wreckaged by feature + * creep. + * + * In theory HPET support should be removed altogether, but there are older + * systems out there which depend on it because TSC and APIC timer are + * dysfunctional in deeper C-states. + * + * It's only 20 years now that hardware people have been asked to provide + * reliable and discoverable facilities which can be used for timekeeping + * and per CPU timer interrupts. + * + * The probability that this problem is going to be solved in the + * forseeable future is close to zero, so the kernel has to be cluttered + * with heuristics to keep up with the ever growing amount of hardware and + * firmware trainwrecks. Hopefully some day hardware people will understand + * that the approach of "This can be fixed in software" is not sustainable. + * Hope dies last... + */ +static bool __init hpet_is_pc10_damaged(void) +{ + unsigned long long pcfg; + + /* Check whether PC10 substates are supported */ + if (!mwait_pc10_supported()) + return false; + + /* Check whether PC10 is enabled in PKG C-state limit */ + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, pcfg); + if ((pcfg & 0xF) < 8) + return false; + + if (hpet_force_user) { + pr_warn("HPET force enabled via command line, but dysfunctional in PC10.\n"); + return false; + } + + pr_info("HPET dysfunctional in PC10. Force disabled.\n"); + boot_hpet_disable = true; + return true; +} + /** * hpet_enable - Try to setup the HPET timer. Returns 1 on success. */ @@ -929,6 +1007,9 @@ int __init hpet_enable(void) if (!is_hpet_capable()) return 0; + if (hpet_is_pc10_damaged()) + return 0; + hpet_set_mapping(); if (!hpet_virt_address) return 0; diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 044902d5a3c4..e5dd6da78713 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -132,6 +132,7 @@ int irq_init_percpu_irqstack(unsigned int cpu) return 0; } +#ifndef CONFIG_PREEMPT_RT void do_softirq_own_stack(void) { struct irq_stack *irqstk; @@ -148,6 +149,7 @@ void do_softirq_own_stack(void) call_on_stack(__do_softirq, isp); } +#endif void __handle_irq(struct irq_desc *desc, struct pt_regs *regs) { diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S index 8ef35063964b..760e1f293093 100644 --- a/arch/x86/kernel/irqflags.S +++ b/arch/x86/kernel/irqflags.S @@ -7,9 +7,11 @@ /* * unsigned long native_save_fl(void) */ +.pushsection .noinstr.text, "ax" SYM_FUNC_START(native_save_fl) pushf pop %_ASM_AX ret SYM_FUNC_END(native_save_fl) +.popsection EXPORT_SYMBOL(native_save_fl) diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 1afbdd1dd777..9ff480e94511 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -198,7 +198,7 @@ void sched_set_itmt_core_prio(int prio, int core_cpu) * of the priority chain and only used when * all other high priority cpus are out of capacity. */ - smt_prio = prio * smp_num_siblings / i; + smt_prio = prio * smp_num_siblings / (i * i); per_cpu(sched_core_priority, cpu) = smt_prio; i++; } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index b656456c3a94..8863d1941f1b 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -27,6 +27,7 @@ #include <linux/nmi.h> #include <linux/swait.h> #include <linux/syscore_ops.h> +#include <linux/cc_platform.h> #include <asm/timer.h> #include <asm/cpu.h> #include <asm/traps.h> @@ -418,7 +419,7 @@ static void __init sev_map_percpu_data(void) { int cpu; - if (!sev_active()) + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) return; for_each_possible_cpu(cpu) { diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index ad273e5861c1..462dd8e9b03d 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -16,9 +16,9 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/set_memory.h> +#include <linux/cc_platform.h> #include <asm/hypervisor.h> -#include <asm/mem_encrypt.h> #include <asm/x86_init.h> #include <asm/kvmclock.h> @@ -49,18 +49,9 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); static struct pvclock_vsyscall_time_info hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE); static struct pvclock_wall_clock wall_clock __bss_decrypted; -static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); static struct pvclock_vsyscall_time_info *hvclock_mem; - -static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) -{ - return &this_cpu_read(hv_clock_per_cpu)->pvti; -} - -static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void) -{ - return this_cpu_read(hv_clock_per_cpu); -} +DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); +EXPORT_PER_CPU_SYMBOL_GPL(hv_clock_per_cpu); /* * The wallclock is the time of day when we booted. Since then, some time may @@ -232,7 +223,7 @@ static void __init kvmclock_init_mem(void) * hvclock is shared between the guest and the hypervisor, must * be mapped decrypted. */ - if (sev_active()) { + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { r = set_memory_decrypted((unsigned long) hvclock_mem, 1UL << order); if (r) { diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 131f30fdcfbd..f5da4a18070a 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -17,6 +17,7 @@ #include <linux/suspend.h> #include <linux/vmalloc.h> #include <linux/efi.h> +#include <linux/cc_platform.h> #include <asm/init.h> #include <asm/tlbflush.h> @@ -166,7 +167,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) } pte = pte_offset_kernel(pmd, vaddr); - if (sev_active()) + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) prot = PAGE_KERNEL_EXEC; set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); @@ -206,7 +207,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) level4p = (pgd_t *)__va(start_pgtable); clear_page(level4p); - if (sev_active()) { + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { info.page_flag |= _PAGE_ENC; info.kernpg_flag |= _PAGE_ENC; } @@ -358,7 +359,7 @@ void machine_kexec(struct kimage *image) (unsigned long)page_list, image->start, image->preserve_context, - sme_active()); + cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) @@ -569,12 +570,12 @@ void arch_kexec_unprotect_crashkres(void) */ int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) { - if (sev_active()) + if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) return 0; /* - * If SME is active we need to be sure that kexec pages are - * not encrypted because when we boot to the new kernel the + * If host memory encryption is active we need to be sure that kexec + * pages are not encrypted because when we boot to the new kernel the * pages won't be accessed encrypted (initially). */ return set_memory_decrypted((unsigned long)vaddr, pages); @@ -582,12 +583,12 @@ int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { - if (sev_active()) + if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) return; /* - * If SME is active we need to reset the pages back to being - * an encrypted mapping before freeing them. + * If host memory encryption is active we need to reset the pages back + * to being an encrypted mapping before freeing them. */ set_memory_encrypted((unsigned long)vaddr, pages); } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 5e9a34b5bd74..169fb6f4cd2e 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -251,7 +251,8 @@ int module_finalize(const Elf_Ehdr *hdr, struct module *me) { const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, - *para = NULL, *orc = NULL, *orc_ip = NULL; + *para = NULL, *orc = NULL, *orc_ip = NULL, + *retpolines = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -267,8 +268,14 @@ int module_finalize(const Elf_Ehdr *hdr, orc = s; if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) orc_ip = s; + if (!strcmp(".retpoline_sites", secstrings + s->sh_name)) + retpolines = s; } + if (retpolines) { + void *rseg = (void *)retpolines->sh_addr; + apply_retpolines(rseg, rseg + retpolines->sh_size); + } if (alt) { /* patch .altinstructions */ void *aseg = (void *)alt->sh_addr; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 04cafc057bed..ebc45360ffd4 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -218,6 +218,36 @@ void paravirt_end_context_switch(struct task_struct *next) if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) arch_enter_lazy_mmu_mode(); } + +static noinstr unsigned long pv_native_read_cr2(void) +{ + return native_read_cr2(); +} + +static noinstr void pv_native_write_cr2(unsigned long val) +{ + native_write_cr2(val); +} + +static noinstr unsigned long pv_native_get_debugreg(int regno) +{ + return native_get_debugreg(regno); +} + +static noinstr void pv_native_set_debugreg(int regno, unsigned long val) +{ + native_set_debugreg(regno, val); +} + +static noinstr void pv_native_irq_enable(void) +{ + native_irq_enable(); +} + +static noinstr void pv_native_irq_disable(void) +{ + native_irq_disable(); +} #endif enum paravirt_lazy_mode paravirt_get_lazy_mode(void) @@ -244,8 +274,8 @@ struct paravirt_patch_template pv_ops = { #ifdef CONFIG_PARAVIRT_XXL .cpu.cpuid = native_cpuid, - .cpu.get_debugreg = native_get_debugreg, - .cpu.set_debugreg = native_set_debugreg, + .cpu.get_debugreg = pv_native_get_debugreg, + .cpu.set_debugreg = pv_native_set_debugreg, .cpu.read_cr0 = native_read_cr0, .cpu.write_cr0 = native_write_cr0, .cpu.write_cr4 = native_write_cr4, @@ -281,8 +311,8 @@ struct paravirt_patch_template pv_ops = { /* Irq ops. */ .irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), - .irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), - .irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), + .irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable), + .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable), .irq.safe_halt = native_safe_halt, .irq.halt = native_halt, #endif /* CONFIG_PARAVIRT_XXL */ @@ -298,8 +328,8 @@ struct paravirt_patch_template pv_ops = { .mmu.exit_mmap = paravirt_nop, #ifdef CONFIG_PARAVIRT_XXL - .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(native_read_cr2), - .mmu.write_cr2 = native_write_cr2, + .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2), + .mmu.write_cr2 = pv_native_write_cr2, .mmu.read_cr3 = __native_read_cr3, .mmu.write_cr3 = native_write_cr3, @@ -371,9 +401,6 @@ struct paravirt_patch_template pv_ops = { }; #ifdef CONFIG_PARAVIRT_XXL -/* At this point, native_get/set_debugreg has real function entries */ -NOKPROBE_SYMBOL(native_get_debugreg); -NOKPROBE_SYMBOL(native_set_debugreg); NOKPROBE_SYMBOL(native_load_idt); void (*paravirt_iret)(void) = native_iret; diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index c2cfa5e7c152..814ab46a0dad 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -6,7 +6,7 @@ #include <linux/swiotlb.h> #include <linux/memblock.h> #include <linux/dma-direct.h> -#include <linux/mem_encrypt.h> +#include <linux/cc_platform.h> #include <asm/iommu.h> #include <asm/swiotlb.h> @@ -45,11 +45,10 @@ int __init pci_swiotlb_detect_4gb(void) swiotlb = 1; /* - * If SME is active then swiotlb will be set to 1 so that bounce - * buffers are allocated and used for devices that do not support - * the addressing range required for the encryption mask. + * Set swiotlb to 1 so that bounce buffers are allocated and used for + * devices that can't support DMA to encrypted memory. */ - if (sme_active()) + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) swiotlb = 1; return swiotlb; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 1d9463e3096b..eb470be0e5ae 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -30,7 +30,9 @@ #include <asm/apic.h> #include <linux/uaccess.h> #include <asm/mwait.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> +#include <asm/fpu/sched.h> +#include <asm/fpu/xstate.h> #include <asm/debugreg.h> #include <asm/nmi.h> #include <asm/tlbflush.h> @@ -43,6 +45,7 @@ #include <asm/io_bitmap.h> #include <asm/proto.h> #include <asm/frame.h> +#include <asm/unwind.h> #include "process.h" @@ -87,9 +90,20 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) #ifdef CONFIG_VM86 dst->thread.vm86 = NULL; #endif - return fpu_clone(dst); + /* Drop the copied pointer to current's fpstate */ + dst->thread.fpu.fpstate = NULL; + + return 0; } +#ifdef CONFIG_X86_64 +void arch_release_task_struct(struct task_struct *tsk) +{ + if (fpu_state_size_dynamic()) + fpstate_free(&tsk->thread.fpu); +} +#endif + /* * Free thread data structures etc.. */ @@ -154,6 +168,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, frame->flags = X86_EFLAGS_FIXED; #endif + fpu_clone(p, clone_flags); + /* Kernel thread ? */ if (unlikely(p->flags & PF_KTHREAD)) { p->thread.pkru = pkru_get_init_value(); @@ -942,70 +958,36 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) * because the task might wake up and we might look at a stack * changing under us. */ -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { - unsigned long start, bottom, top, sp, fp, ip, ret = 0; - int count = 0; - - if (p == current || task_is_running(p)) - return 0; - - if (!try_get_task_stack(p)) - return 0; - - start = (unsigned long)task_stack_page(p); - if (!start) - goto out; + struct unwind_state state; + unsigned long addr = 0; - /* - * Layout of the stack page: - * - * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long) - * PADDING - * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING - * stack - * ----------- bottom = start - * - * The tasks stack pointer points at the location where the - * framepointer is stored. The data on the stack is: - * ... IP FP ... IP FP - * - * We need to read FP and IP, so we need to adjust the upper - * bound by another unsigned long. - */ - top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; - top -= 2 * sizeof(unsigned long); - bottom = start; - - sp = READ_ONCE(p->thread.sp); - if (sp < bottom || sp > top) - goto out; - - fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp); - do { - if (fp < bottom || fp > top) - goto out; - ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long))); - if (!in_sched_functions(ip)) { - ret = ip; - goto out; - } - fp = READ_ONCE_NOCHECK(*(unsigned long *)fp); - } while (count++ < 16 && !task_is_running(p)); + for (unwind_start(&state, p, NULL, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr) + break; + if (in_sched_functions(addr)) + continue; + break; + } -out: - put_task_stack(p); - return ret; + return addr; } long do_arch_prctl_common(struct task_struct *task, int option, - unsigned long cpuid_enabled) + unsigned long arg2) { switch (option) { case ARCH_GET_CPUID: return get_cpuid_mode(); case ARCH_SET_CPUID: - return set_cpuid_mode(task, cpuid_enabled); + return set_cpuid_mode(task, arg2); + case ARCH_GET_XCOMP_SUPP: + case ARCH_GET_XCOMP_PERM: + case ARCH_REQ_XCOMP_PERM: + return fpu_xstate_prctl(task, option, arg2); } return -EINVAL; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4f2f54e1281c..26edb1cd07a4 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -41,7 +41,7 @@ #include <asm/ldt.h> #include <asm/processor.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/sched.h> #include <asm/desc.h> #include <linux/err.h> @@ -160,7 +160,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; struct fpu *prev_fpu = &prev->fpu; - struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ @@ -213,7 +212,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); - switch_fpu_finish(next_fpu); + switch_fpu_finish(); /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ec0d836a13b1..3402edec236c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -42,7 +42,7 @@ #include <asm/processor.h> #include <asm/pkru.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/sched.h> #include <asm/mmu_context.h> #include <asm/prctl.h> #include <asm/desc.h> @@ -559,7 +559,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct thread_struct *prev = &prev_p->thread; struct thread_struct *next = &next_p->thread; struct fpu *prev_fpu = &prev->fpu; - struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && @@ -620,7 +619,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); - switch_fpu_finish(next_fpu); + switch_fpu_finish(); /* Reload sp0. */ update_task_stack(next_p); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 4c208ea3bd9f..6d2244c94799 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -29,9 +29,9 @@ #include <linux/uaccess.h> #include <asm/processor.h> -#include <asm/fpu/internal.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> +#include <asm/fpu/xstate.h> #include <asm/debugreg.h> #include <asm/ldt.h> #include <asm/desc.h> diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index c53271aebb64..c8fe74a28143 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -47,7 +47,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel) * %rsi page_list * %rdx start address * %rcx preserve_context - * %r8 sme_active + * %r8 host_mem_enc_active */ /* Save the CPU context, used for jumping back */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 79f164141116..40ed44ead063 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -830,6 +830,20 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); + /* + * Do some memory reservations *before* memory is added to memblock, so + * memblock allocations won't overwrite it. + * + * After this point, everything still needed from the boot loader or + * firmware or kernel text should be early reserved or marked not RAM in + * e820. All other memory is free game. + * + * This call needs to happen before e820__memory_setup() which calls the + * xen_memory_setup() on Xen dom0 which relies on the fact that those + * early reservations have happened already. + */ + early_reserve_memory(); + iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; e820__memory_setup(); parse_setup_data(); @@ -876,18 +890,6 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); - /* - * Do some memory reservations *before* memory is added to - * memblock, so memblock allocations won't overwrite it. - * Do it after early param, so we could get (unlikely) panic from - * serial. - * - * After this point everything still needed from the boot loader or - * firmware or kernel text should be early reserved or marked not - * RAM in e820. All other memory is free game. - */ - early_reserve_memory(); - #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory used by the kernel cannot be hot-removed because Linux diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 78a32b956e81..5afd98559193 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -135,7 +135,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) static void __init pcpu_fc_free(void *ptr, size_t size) { - memblock_free(__pa(ptr), size); + memblock_free_ptr(ptr, size); } static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 9f90f460a28c..ff1e82ff52d9 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -64,7 +64,7 @@ static bool sev_es_negotiate_protocol(void) static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) { ghcb->save.sw_exit_code = 0; - memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); + __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } static bool vc_decoding_needed(unsigned long exit_code) @@ -130,6 +130,8 @@ static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, } else { ret = ES_VMM_ERROR; } + } else if (ghcb->save.sw_exit_info_1 & 0xffffffff) { + ret = ES_VMM_ERROR; } else { ret = ES_OK; } diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index a6895e440bc3..88c46136bbda 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -11,7 +11,7 @@ #include <linux/sched/debug.h> /* For show_regs() */ #include <linux/percpu-defs.h> -#include <linux/mem_encrypt.h> +#include <linux/cc_platform.h> #include <linux/printk.h> #include <linux/mm_types.h> #include <linux/set_memory.h> @@ -23,7 +23,7 @@ #include <asm/stacktrace.h> #include <asm/sev.h> #include <asm/insn-eval.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/xcr.h> #include <asm/processor.h> #include <asm/realmode.h> #include <asm/traps.h> @@ -615,7 +615,7 @@ int __init sev_es_efi_map_ghcbs(pgd_t *pgd) int cpu; u64 pfn; - if (!sev_es_active()) + if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) return 0; pflags = _PAGE_NX | _PAGE_RW; @@ -774,7 +774,7 @@ void __init sev_es_init_vc_handling(void) BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE); - if (!sev_es_active()) + if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) return; if (!sev_es_check_cpu_features()) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index f4d21e470083..ec71e06ae364 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -15,6 +15,7 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/kernel.h> +#include <linux/kstrtox.h> #include <linux/errno.h> #include <linux/wait.h> #include <linux/tracehook.h> @@ -30,8 +31,8 @@ #include <asm/processor.h> #include <asm/ucontext.h> -#include <asm/fpu/internal.h> #include <asm/fpu/signal.h> +#include <asm/fpu/xstate.h> #include <asm/vdso.h> #include <asm/mce.h> #include <asm/sighandling.h> @@ -41,6 +42,7 @@ #include <linux/compat.h> #include <asm/proto.h> #include <asm/ia32_unistd.h> +#include <asm/fpu/xstate.h> #endif /* CONFIG_X86_64 */ #include <asm/syscall.h> @@ -79,9 +81,9 @@ static void force_valid_ss(struct pt_regs *regs) # define CONTEXT_COPY_SIZE sizeof(struct sigcontext) #endif -static int restore_sigcontext(struct pt_regs *regs, - struct sigcontext __user *usc, - unsigned long uc_flags) +static bool restore_sigcontext(struct pt_regs *regs, + struct sigcontext __user *usc, + unsigned long uc_flags) { struct sigcontext sc; @@ -89,7 +91,7 @@ static int restore_sigcontext(struct pt_regs *regs, current->restart_block.fn = do_no_restart_syscall; if (copy_from_user(&sc, usc, CONTEXT_COPY_SIZE)) - return -EFAULT; + return false; #ifdef CONFIG_X86_32 set_user_gs(regs, sc.gs); @@ -244,7 +246,6 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, unsigned long math_size = 0; unsigned long sp = regs->sp; unsigned long buf_fx = 0; - int ret; /* redzone */ if (IS_ENABLED(CONFIG_X86_64)) @@ -292,8 +293,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, } /* save i387 and extended state */ - ret = copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size); - if (ret < 0) + if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size)) return (void __user *)-1L; return (void __user *)sp; @@ -643,7 +643,7 @@ SYSCALL_DEFINE0(sigreturn) * x86_32 has no uc_flags bits relevant to restore_sigcontext. * Save a few cycles by skipping the __get_user. */ - if (restore_sigcontext(regs, &frame->sc, 0)) + if (!restore_sigcontext(regs, &frame->sc, 0)) goto badframe; return regs->ax; @@ -671,7 +671,7 @@ SYSCALL_DEFINE0(rt_sigreturn) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) @@ -721,12 +721,15 @@ badframe: /* max_frame_size tells userspace the worst case signal stack size. */ static unsigned long __ro_after_init max_frame_size; +static unsigned int __ro_after_init fpu_default_state_size; void __init init_sigframe_size(void) { + fpu_default_state_size = fpu__get_fpstate_size(); + max_frame_size = MAX_FRAME_SIGINFO_UCTXT_SIZE + MAX_FRAME_PADDING; - max_frame_size += fpu__get_fpstate_size() + MAX_XSAVE_PADDING; + max_frame_size += fpu_default_state_size + MAX_XSAVE_PADDING; /* Userspace expects an aligned size. */ max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT); @@ -910,6 +913,62 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) force_sig(SIGSEGV); } +#ifdef CONFIG_DYNAMIC_SIGFRAME +#ifdef CONFIG_STRICT_SIGALTSTACK_SIZE +static bool strict_sigaltstack_size __ro_after_init = true; +#else +static bool strict_sigaltstack_size __ro_after_init = false; +#endif + +static int __init strict_sas_size(char *arg) +{ + return kstrtobool(arg, &strict_sigaltstack_size); +} +__setup("strict_sas_size", strict_sas_size); + +/* + * MINSIGSTKSZ is 2048 and can't be changed despite the fact that AVX512 + * exceeds that size already. As such programs might never use the + * sigaltstack they just continued to work. While always checking against + * the real size would be correct, this might be considered a regression. + * + * Therefore avoid the sanity check, unless enforced by kernel + * configuration or command line option. + * + * When dynamic FPU features are supported, the check is also enforced when + * the task has permissions to use dynamic features. Tasks which have no + * permission are checked against the size of the non-dynamic feature set + * if strict checking is enabled. This avoids forcing all tasks on the + * system to allocate large sigaltstacks even if they are never going + * to use a dynamic feature. As this is serialized via sighand::siglock + * any permission request for a dynamic feature either happened already + * or will see the newly install sigaltstack size in the permission checks. + */ +bool sigaltstack_size_valid(size_t ss_size) +{ + unsigned long fsize = max_frame_size - fpu_default_state_size; + u64 mask; + + lockdep_assert_held(¤t->sighand->siglock); + + if (!fpu_state_size_dynamic() && !strict_sigaltstack_size) + return true; + + fsize += current->group_leader->thread.fpu.perm.__user_state_size; + if (likely(ss_size > fsize)) + return true; + + if (strict_sigaltstack_size) + return ss_size > fsize; + + mask = current->group_leader->thread.fpu.perm.__state_perm; + if (mask & XFEATURE_MASK_USER_DYNAMIC) + return ss_size > fsize; + + return true; +} +#endif /* CONFIG_DYNAMIC_SIGFRAME */ + #ifdef CONFIG_X86_X32_ABI COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) { @@ -929,7 +988,7 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c453b825a57f..8241927addff 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -70,7 +70,7 @@ #include <asm/mwait.h> #include <asm/apic.h> #include <asm/io_apic.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/setup.h> #include <asm/uv/uv.h> #include <linux/mc146818rtc.h> @@ -101,6 +101,8 @@ EXPORT_PER_CPU_SYMBOL(cpu_die_map); DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); + /* Per CPU bogomips and other parameters */ DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); @@ -464,6 +466,21 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } +static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + /* If the arch didn't set up l2c_id, fall back to SMT */ + if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID) + return match_smt(c, o); + + /* Do not match if L2 cache id does not match: */ + if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2)) + return false; + + return topology_sane(c, o, "l2c"); +} + /* * Unlike the other levels, we do not enforce keeping a * multicore group inside a NUMA node. If this happens, we will @@ -523,7 +540,7 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) } -#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) +#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_CLUSTER) || defined(CONFIG_SCHED_MC) static inline int x86_sched_itmt_flags(void) { return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0; @@ -541,12 +558,21 @@ static int x86_smt_flags(void) return cpu_smt_flags() | x86_sched_itmt_flags(); } #endif +#ifdef CONFIG_SCHED_CLUSTER +static int x86_cluster_flags(void) +{ + return cpu_cluster_flags() | x86_sched_itmt_flags(); +} +#endif #endif static struct sched_domain_topology_level x86_numa_in_package_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, #endif +#ifdef CONFIG_SCHED_CLUSTER + { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) }, +#endif #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, #endif @@ -557,6 +583,9 @@ static struct sched_domain_topology_level x86_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, #endif +#ifdef CONFIG_SCHED_CLUSTER + { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) }, +#endif #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, #endif @@ -584,6 +613,7 @@ void set_cpu_sibling_map(int cpu) if (!has_mp) { cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu)); cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); + cpumask_set_cpu(cpu, cpu_l2c_shared_mask(cpu)); cpumask_set_cpu(cpu, topology_core_cpumask(cpu)); cpumask_set_cpu(cpu, topology_die_cpumask(cpu)); c->booted_cores = 1; @@ -602,6 +632,9 @@ void set_cpu_sibling_map(int cpu) if ((i == cpu) || (has_mp && match_llc(c, o))) link_mask(cpu_llc_shared_mask, cpu, i); + if ((i == cpu) || (has_mp && match_l2c(c, o))) + link_mask(cpu_l2c_shared_mask, cpu, i); + if ((i == cpu) || (has_mp && match_die(c, o))) link_mask(topology_die_cpumask, cpu, i); } @@ -652,6 +685,11 @@ const struct cpumask *cpu_coregroup_mask(int cpu) return cpu_llc_shared_mask(cpu); } +const struct cpumask *cpu_clustergroup_mask(int cpu) +{ + return cpu_l2c_shared_mask(cpu); +} + static void impress_friends(void) { int cpu; @@ -1335,6 +1373,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL); } /* @@ -1564,7 +1603,10 @@ static void remove_siblinginfo(int cpu) for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling)); + for_each_cpu(sibling, cpu_l2c_shared_mask(cpu)) + cpumask_clear_cpu(cpu, cpu_l2c_shared_mask(sibling)); cpumask_clear(cpu_llc_shared_mask(cpu)); + cpumask_clear(cpu_l2c_shared_mask(cpu)); cpumask_clear(topology_sibling_cpumask(cpu)); cpumask_clear(topology_core_cpumask(cpu)); cpumask_clear(topology_die_cpumask(cpu)); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a58800973aed..6ca1454a65d4 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -48,7 +48,7 @@ #include <asm/ftrace.h> #include <asm/traps.h> #include <asm/desc.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/cpu.h> #include <asm/cpu_entry_area.h> #include <asm/mce.h> @@ -1108,10 +1108,48 @@ DEFINE_IDTENTRY(exc_spurious_interrupt_bug) */ } +static bool handle_xfd_event(struct pt_regs *regs) +{ + u64 xfd_err; + int err; + + if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD)) + return false; + + rdmsrl(MSR_IA32_XFD_ERR, xfd_err); + if (!xfd_err) + return false; + + wrmsrl(MSR_IA32_XFD_ERR, 0); + + /* Die if that happens in kernel space */ + if (WARN_ON(!user_mode(regs))) + return false; + + local_irq_enable(); + + err = xfd_enable_feature(xfd_err); + + switch (err) { + case -EPERM: + force_sig_fault(SIGILL, ILL_ILLOPC, error_get_trap_addr(regs)); + break; + case -EFAULT: + force_sig(SIGSEGV); + break; + } + + local_irq_disable(); + return true; +} + DEFINE_IDTENTRY(exc_device_not_available) { unsigned long cr0 = read_cr0(); + if (handle_xfd_event(regs)) + return; + #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) { struct math_emu_info info = { }; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index efd9e9ea17f2..3d6dc12d198f 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -272,6 +272,20 @@ SECTIONS __parainstructions_end = .; } +#ifdef CONFIG_RETPOLINE + /* + * List of instructions that call/jmp/jcc to retpoline thunks + * __x86_indirect_thunk_*(). These instructions can be patched along + * with alternatives, after which the section can be freed. + */ + . = ALIGN(8); + .retpoline_sites : AT(ADDR(.retpoline_sites) - LOAD_OFFSET) { + __retpoline_sites = .; + *(.retpoline_sites) + __retpoline_sites_end = .; + } +#endif + /* * struct alt_inst entries. From the header (alternative.h): * "Alternative instructions for different CPU types or capabilities" |