Diffstat (limited to 'arch/arm64/kernel')
59 files changed, 2664 insertions, 1350 deletions
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 2b112f3b7510..2920b0a51403 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -33,7 +33,8 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ return_address.o cpuinfo.o cpu_errata.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ - syscall.o proton-pack.o idle.o patching.o pi/ + syscall.o proton-pack.o idle.o patching.o pi/ \ + rsi.o obj-$(CONFIG_COMPAT) += sys32.o signal32.o \ sys_compat.o @@ -77,7 +78,7 @@ $(obj)/vdso32-wrap.o: $(obj)/vdso32/vdso.so obj-y += probes/ obj-y += head.o -extra-y += vmlinux.lds +always-$(KBUILD_BUILTIN) += vmlinux.lds ifeq ($(CONFIG_DEBUG_EFI),y) AFLAGS_head.o += -DVMLINUX_PATH="\"$(realpath $(objtree)/vmlinux)\"" diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index e6f66491fbe9..b9a66fc146c9 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -379,7 +379,7 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) prot = __acpi_get_writethrough_mem_attribute(); } } - return ioremap_prot(phys, size, pgprot_val(prot)); + return ioremap_prot(phys, size, prot); } /* diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index b21dd24b8efc..30d4bbe68661 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -12,15 +12,12 @@ #include <linux/ftrace.h> #include <linux/kexec.h> #include <linux/mm.h> -#include <linux/dma-mapping.h> #include <linux/kvm_host.h> -#include <linux/preempt.h> #include <linux/suspend.h> #include <asm/cpufeature.h> #include <asm/fixmap.h> #include <asm/thread_info.h> #include <asm/memory.h> -#include <asm/signal32.h> #include <asm/smp_plat.h> #include <asm/suspend.h> #include <linux/kbuild.h> @@ -28,8 +25,6 @@ int main(void) { - DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm)); - BLANK(); DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu)); DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); @@ -79,45 +74,27 @@ int main(void) DEFINE(S_PSTATE, offsetof(struct pt_regs, pstate)); DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno)); DEFINE(S_SDEI_TTBR1, offsetof(struct pt_regs, sdei_ttbr1)); - DEFINE(S_PMR_SAVE, offsetof(struct pt_regs, pmr_save)); + DEFINE(S_PMR, offsetof(struct pt_regs, pmr)); DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); + DEFINE(S_STACKFRAME_TYPE, offsetof(struct pt_regs, stackframe.type)); DEFINE(PT_REGS_SIZE, sizeof(struct pt_regs)); BLANK(); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS - DEFINE(FREGS_X0, offsetof(struct ftrace_regs, regs[0])); - DEFINE(FREGS_X2, offsetof(struct ftrace_regs, regs[2])); - DEFINE(FREGS_X4, offsetof(struct ftrace_regs, regs[4])); - DEFINE(FREGS_X6, offsetof(struct ftrace_regs, regs[6])); - DEFINE(FREGS_X8, offsetof(struct ftrace_regs, regs[8])); - DEFINE(FREGS_FP, offsetof(struct ftrace_regs, fp)); - DEFINE(FREGS_LR, offsetof(struct ftrace_regs, lr)); - DEFINE(FREGS_SP, offsetof(struct ftrace_regs, sp)); - DEFINE(FREGS_PC, offsetof(struct ftrace_regs, pc)); + DEFINE(FREGS_X0, offsetof(struct __arch_ftrace_regs, regs[0])); + DEFINE(FREGS_X2, offsetof(struct __arch_ftrace_regs, regs[2])); + DEFINE(FREGS_X4, offsetof(struct __arch_ftrace_regs, regs[4])); + DEFINE(FREGS_X6, offsetof(struct __arch_ftrace_regs, regs[6])); + DEFINE(FREGS_X8, offsetof(struct __arch_ftrace_regs, regs[8])); + DEFINE(FREGS_FP, 
offsetof(struct __arch_ftrace_regs, fp)); + DEFINE(FREGS_LR, offsetof(struct __arch_ftrace_regs, lr)); + DEFINE(FREGS_SP, offsetof(struct __arch_ftrace_regs, sp)); + DEFINE(FREGS_PC, offsetof(struct __arch_ftrace_regs, pc)); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS - DEFINE(FREGS_DIRECT_TRAMP, offsetof(struct ftrace_regs, direct_tramp)); -#endif - DEFINE(FREGS_SIZE, sizeof(struct ftrace_regs)); - BLANK(); + DEFINE(FREGS_DIRECT_TRAMP, offsetof(struct __arch_ftrace_regs, direct_tramp)); #endif -#ifdef CONFIG_COMPAT - DEFINE(COMPAT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_sigframe, uc.uc_mcontext.arm_r0)); - DEFINE(COMPAT_RT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_rt_sigframe, sig.uc.uc_mcontext.arm_r0)); + DEFINE(FREGS_SIZE, sizeof(struct __arch_ftrace_regs)); BLANK(); #endif - DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); - BLANK(); - DEFINE(VMA_VM_MM, offsetof(struct vm_area_struct, vm_mm)); - DEFINE(VMA_VM_FLAGS, offsetof(struct vm_area_struct, vm_flags)); - BLANK(); - DEFINE(VM_EXEC, VM_EXEC); - BLANK(); - DEFINE(PAGE_SZ, PAGE_SIZE); - BLANK(); - DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE); - DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE); - BLANK(); - DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET); - BLANK(); DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task)); BLANK(); DEFINE(FTR_OVR_VAL_OFFSET, offsetof(struct arm64_ftr_override, val)); @@ -202,20 +179,10 @@ int main(void) DEFINE(FTRACE_OPS_FUNC, offsetof(struct ftrace_ops, func)); #endif BLANK(); -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - DEFINE(FGRET_REGS_X0, offsetof(struct fgraph_ret_regs, regs[0])); - DEFINE(FGRET_REGS_X1, offsetof(struct fgraph_ret_regs, regs[1])); - DEFINE(FGRET_REGS_X2, offsetof(struct fgraph_ret_regs, regs[2])); - DEFINE(FGRET_REGS_X3, offsetof(struct fgraph_ret_regs, regs[3])); - DEFINE(FGRET_REGS_X4, offsetof(struct fgraph_ret_regs, regs[4])); - DEFINE(FGRET_REGS_X5, offsetof(struct fgraph_ret_regs, regs[5])); - DEFINE(FGRET_REGS_X6, offsetof(struct fgraph_ret_regs, regs[6])); - DEFINE(FGRET_REGS_X7, offsetof(struct fgraph_ret_regs, regs[7])); - DEFINE(FGRET_REGS_FP, offsetof(struct fgraph_ret_regs, fp)); - DEFINE(FGRET_REGS_SIZE, sizeof(struct fgraph_ret_regs)); -#endif #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS DEFINE(FTRACE_OPS_DIRECT_CALL, offsetof(struct ftrace_ops, direct_call)); #endif + DEFINE(PIE_E0_ASM, PIE_E0); + DEFINE(PIE_E1_ASM, PIE_E1); return 0; } diff --git a/arch/arm64/kernel/cacheinfo.c b/arch/arm64/kernel/cacheinfo.c index d9c9218fa1fd..309942b06c5b 100644 --- a/arch/arm64/kernel/cacheinfo.c +++ b/arch/arm64/kernel/cacheinfo.c @@ -101,16 +101,18 @@ int populate_cache_leaves(unsigned int cpu) unsigned int level, idx; enum cache_type type; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - struct cacheinfo *this_leaf = this_cpu_ci->info_list; + struct cacheinfo *infos = this_cpu_ci->info_list; for (idx = 0, level = 1; level <= this_cpu_ci->num_levels && - idx < this_cpu_ci->num_leaves; idx++, level++) { + idx < this_cpu_ci->num_leaves; level++) { type = get_cache_type(level); if (type == CACHE_TYPE_SEPARATE) { - ci_leaf_init(this_leaf++, CACHE_TYPE_DATA, level); - ci_leaf_init(this_leaf++, CACHE_TYPE_INST, level); + if (idx + 1 >= this_cpu_ci->num_leaves) + break; + ci_leaf_init(&infos[idx++], CACHE_TYPE_DATA, level); + ci_leaf_init(&infos[idx++], CACHE_TYPE_INST, level); } else { - ci_leaf_init(this_leaf++, type, level); + ci_leaf_init(&infos[idx++], type, level); } } return 0; diff --git a/arch/arm64/kernel/compat_alignment.c 
b/arch/arm64/kernel/compat_alignment.c index deff21bfa680..b68e1d328d4c 100644 --- a/arch/arm64/kernel/compat_alignment.c +++ b/arch/arm64/kernel/compat_alignment.c @@ -368,6 +368,8 @@ int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs) return 1; } + if (!handler) + return 1; type = handler(addr, instr, regs); if (type == TYPE_ERROR || type == TYPE_FAULT) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index a78f247029ae..59d723c9ab8f 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -14,31 +14,85 @@ #include <asm/kvm_asm.h> #include <asm/smp_plat.h> +static u64 target_impl_cpu_num; +static struct target_impl_cpu *target_impl_cpus; + +bool cpu_errata_set_target_impl(u64 num, void *impl_cpus) +{ + if (target_impl_cpu_num || !num || !impl_cpus) + return false; + + target_impl_cpu_num = num; + target_impl_cpus = impl_cpus; + return true; +} + +static inline bool is_midr_in_range(struct midr_range const *range) +{ + int i; + + if (!target_impl_cpu_num) + return midr_is_cpu_model_range(read_cpuid_id(), range->model, + range->rv_min, range->rv_max); + + for (i = 0; i < target_impl_cpu_num; i++) { + if (midr_is_cpu_model_range(target_impl_cpus[i].midr, + range->model, + range->rv_min, range->rv_max)) + return true; + } + return false; +} + +bool is_midr_in_range_list(struct midr_range const *ranges) +{ + while (ranges->model) + if (is_midr_in_range(ranges++)) + return true; + return false; +} +EXPORT_SYMBOL_GPL(is_midr_in_range_list); + static bool __maybe_unused -is_affected_midr_range(const struct arm64_cpu_capabilities *entry, int scope) +__is_affected_midr_range(const struct arm64_cpu_capabilities *entry, + u32 midr, u32 revidr) { const struct arm64_midr_revidr *fix; - u32 midr = read_cpuid_id(), revidr; - - WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - if (!is_midr_in_range(midr, &entry->midr_range)) + if (!is_midr_in_range(&entry->midr_range)) return false; midr &= MIDR_REVISION_MASK | MIDR_VARIANT_MASK; - revidr = read_cpuid(REVIDR_EL1); for (fix = entry->fixed_revs; fix && fix->revidr_mask; fix++) if (midr == fix->midr_rv && (revidr & fix->revidr_mask)) return false; - return true; } static bool __maybe_unused +is_affected_midr_range(const struct arm64_cpu_capabilities *entry, int scope) +{ + int i; + + if (!target_impl_cpu_num) { + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + return __is_affected_midr_range(entry, read_cpuid_id(), + read_cpuid(REVIDR_EL1)); + } + + for (i = 0; i < target_impl_cpu_num; i++) { + if (__is_affected_midr_range(entry, target_impl_cpus[i].midr, + target_impl_cpus[i].midr)) + return true; + } + return false; +} + +static bool __maybe_unused is_affected_midr_range_list(const struct arm64_cpu_capabilities *entry, int scope) { WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - return is_midr_in_range_list(read_cpuid_id(), entry->midr_range_list); + return is_midr_in_range_list(entry->midr_range_list); } static bool __maybe_unused @@ -186,12 +240,48 @@ static bool __maybe_unused has_neoverse_n1_erratum_1542419(const struct arm64_cpu_capabilities *entry, int scope) { - u32 midr = read_cpuid_id(); bool has_dic = read_cpuid_cachetype() & BIT(CTR_EL0_DIC_SHIFT); const struct midr_range range = MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1); WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - return is_midr_in_range(midr, &range) && has_dic; + return is_midr_in_range(&range) && has_dic; +} + +static const struct midr_range impdef_pmuv3_cpus[] = { + 
MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX), + {}, +}; + +static bool has_impdef_pmuv3(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); + unsigned int pmuver; + + if (!is_kernel_in_hyp_mode()) + return false; + + pmuver = cpuid_feature_extract_unsigned_field(dfr0, + ID_AA64DFR0_EL1_PMUVer_SHIFT); + if (pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF) + return false; + + return is_midr_in_range_list(impdef_pmuv3_cpus); +} + +static void cpu_enable_impdef_pmuv3_traps(const struct arm64_cpu_capabilities *__unused) +{ + sysreg_clear_set_s(SYS_HACR_EL2, 0, BIT(56)); } #ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI @@ -245,7 +335,7 @@ static const struct midr_range cavium_erratum_23154_cpus[] = { #endif #ifdef CONFIG_CAVIUM_ERRATUM_27456 -const struct midr_range cavium_erratum_27456_cpus[] = { +static const struct midr_range cavium_erratum_27456_cpus[] = { /* Cavium ThunderX, T88 pass 1.x - 2.1 */ MIDR_RANGE(MIDR_THUNDERX, 0, 0, 1, 1), /* Cavium ThunderX, T81 pass 1.0 */ @@ -467,6 +557,13 @@ static const struct midr_range erratum_ac03_cpu_38_list[] = { }; #endif +#ifdef CONFIG_AMPERE_ERRATUM_AC04_CPU_23 +static const struct midr_range erratum_ac04_cpu_23_list[] = { + MIDR_ALL_VERSIONS(MIDR_AMPERE1A), + {}, +}; +#endif + const struct arm64_cpu_capabilities arm64_errata[] = { #ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE { @@ -786,6 +883,28 @@ const struct arm64_cpu_capabilities arm64_errata[] = { ERRATA_MIDR_RANGE_LIST(erratum_ac03_cpu_38_list), }, #endif +#ifdef CONFIG_AMPERE_ERRATUM_AC04_CPU_23 + { + .desc = "AmpereOne erratum AC04_CPU_23", + .capability = ARM64_WORKAROUND_AMPERE_AC04_CPU_23, + ERRATA_MIDR_RANGE_LIST(erratum_ac04_cpu_23_list), + }, +#endif + { + .desc = "Broken CNTVOFF_EL2", + .capability = ARM64_WORKAROUND_QCOM_ORYON_CNTVOFF, + ERRATA_MIDR_RANGE_LIST(((const struct midr_range[]) { + MIDR_ALL_VERSIONS(MIDR_QCOM_ORYON_X1), + {} + })), + }, + { + .desc = "Apple IMPDEF PMUv3 Traps", + .capability = ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = has_impdef_pmuv3, + .cpu_enable = cpu_enable_impdef_pmuv3_traps, + }, { } }; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 718728a85430..b34044e20128 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -75,6 +75,7 @@ #include <linux/cpu.h> #include <linux/kasan.h> #include <linux/percpu.h> +#include <linux/sched/isolation.h> #include <asm/cpu.h> #include <asm/cpufeature.h> @@ -85,6 +86,7 @@ #include <asm/kvm_host.h> #include <asm/mmu_context.h> #include <asm/mte.h> +#include <asm/hypervisor.h> #include <asm/processor.h> #include <asm/smp.h> #include <asm/sysreg.h> @@ -103,6 +105,7 @@ static DECLARE_BITMAP(elf_hwcap, MAX_CPU_FEATURES) __read_mostly; COMPAT_HWCAP_LPAE) unsigned int compat_elf_hwcap __read_mostly = COMPAT_ELF_HWCAP_DEFAULT; unsigned int compat_elf_hwcap2 __read_mostly; +unsigned int compat_elf_hwcap3 __read_mostly; #endif 
DECLARE_BITMAP(system_cpucaps, ARM64_NCAPS); @@ -111,7 +114,14 @@ static struct arm64_cpu_capabilities const __ro_after_init *cpucap_ptrs[ARM64_NC DECLARE_BITMAP(boot_cpucaps, ARM64_NCAPS); -bool arm64_use_ng_mappings = false; +/* + * arm64_use_ng_mappings must be placed in the .data section, otherwise it + * ends up in the .bss section where it is initialized in early_map_kernel() + * after the MMU (with the idmap) was enabled. create_init_idmap() - which + * runs before early_map_kernel() and reads the variable via PTE_MAYBE_NG - + * may end up generating an incorrect idmap page table attributes. + */ +bool arm64_use_ng_mappings __read_mostly = false; EXPORT_SYMBOL(arm64_use_ng_mappings); DEFINE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector) = vectors; @@ -228,6 +238,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_XS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_I8MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DGH_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_BF16_SHIFT, 4, 0), @@ -266,6 +277,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { }; static const struct arm64_ftr_bits ftr_id_aa64isar3[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FPRCVT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FAMINMAX_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -291,6 +303,9 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_GCS), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_GCS_SHIFT, 4, 0), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MTE_frac_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SME_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MPAM_frac_SHIFT, 4, 0), @@ -314,6 +329,8 @@ static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = { ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F32MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F16MM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_I8MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SM4_SHIFT, 4, 0), @@ -326,6 +343,8 @@ static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = { ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_BitPerm_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_EltPerm_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_AES_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SVEver_SHIFT, 4, 0), @@ -369,6 +388,16 @@ static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = { FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SF8DP4_SHIFT, 1, 0), 
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SF8DP2_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SBitPerm_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_AES_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SFEXPA_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_STMOP_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SMOP4_SHIFT, 1, 0), ARM64_FTR_END, }; @@ -377,6 +406,8 @@ static const struct arm64_ftr_bits ftr_id_aa64fpfr0[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8FMA_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8DP4_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8DP2_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8MM8_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8MM4_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8E4M3_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8E5M2_SHIFT, 1, 0), ARM64_FTR_END, @@ -475,6 +506,7 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr3[] = { static const struct arm64_ftr_bits ftr_id_aa64mmfr4[] = { S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_E2H0_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_NV_frac_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -684,6 +716,14 @@ static const struct arm64_ftr_bits ftr_id_dfr1[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_mpamidr[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_PMG_MAX_SHIFT, MPAMIDR_EL1_PMG_MAX_WIDTH, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_VPMR_MAX_SHIFT, MPAMIDR_EL1_VPMR_MAX_WIDTH, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_HAS_HCR_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_PARTID_MAX_SHIFT, MPAMIDR_EL1_PARTID_MAX_WIDTH, 0), + ARM64_FTR_END, +}; + /* * Common ftr bits for a 32bit register with all hidden, strict * attributes, with 4bit feature fields and a default safe value of @@ -726,17 +766,17 @@ static const struct arm64_ftr_bits ftr_raz[] = { #define ARM64_FTR_REG(id, table) \ __ARM64_FTR_REG_OVERRIDE(#id, id, table, &no_override) -struct arm64_ftr_override id_aa64mmfr0_override; -struct arm64_ftr_override id_aa64mmfr1_override; -struct arm64_ftr_override id_aa64mmfr2_override; -struct arm64_ftr_override id_aa64pfr0_override; -struct arm64_ftr_override id_aa64pfr1_override; -struct arm64_ftr_override id_aa64zfr0_override; -struct arm64_ftr_override id_aa64smfr0_override; -struct arm64_ftr_override id_aa64isar1_override; -struct arm64_ftr_override id_aa64isar2_override; +struct arm64_ftr_override __read_mostly id_aa64mmfr0_override; +struct arm64_ftr_override __read_mostly id_aa64mmfr1_override; +struct arm64_ftr_override __read_mostly id_aa64mmfr2_override; +struct arm64_ftr_override __read_mostly id_aa64pfr0_override; +struct arm64_ftr_override __read_mostly id_aa64pfr1_override; +struct arm64_ftr_override __read_mostly 
id_aa64zfr0_override; +struct arm64_ftr_override __read_mostly id_aa64smfr0_override; +struct arm64_ftr_override __read_mostly id_aa64isar1_override; +struct arm64_ftr_override __read_mostly id_aa64isar2_override; -struct arm64_ftr_override arm64_sw_feature_override; +struct arm64_ftr_override __read_mostly arm64_sw_feature_override; static const struct __ftr_reg_entry { u32 sys_id; @@ -804,6 +844,9 @@ static const struct __ftr_reg_entry { ARM64_FTR_REG(SYS_ID_AA64MMFR3_EL1, ftr_id_aa64mmfr3), ARM64_FTR_REG(SYS_ID_AA64MMFR4_EL1, ftr_id_aa64mmfr4), + /* Op1 = 0, CRn = 10, CRm = 4 */ + ARM64_FTR_REG(SYS_MPAMIDR_EL1, ftr_mpamidr), + /* Op1 = 1, CRn = 0, CRm = 0 */ ARM64_FTR_REG(SYS_GMID_EL1, ftr_gmid), @@ -989,17 +1032,16 @@ static void init_cpu_ftr_reg(u32 sys_reg, u64 new) /* Override was valid */ ftr_new = tmp; str = "forced"; - } else if (ftr_ovr == tmp) { + } else { /* Override was the safe value */ str = "already set"; } - if (str) - pr_warn("%s[%d:%d]: %s to %llx\n", - reg->name, - ftrp->shift + ftrp->width - 1, - ftrp->shift, str, - tmp & (BIT(ftrp->width) - 1)); + pr_warn("%s[%d:%d]: %s to %llx\n", + reg->name, + ftrp->shift + ftrp->width - 1, + ftrp->shift, str, + tmp & (BIT(ftrp->width) - 1)); } else if ((ftr_mask & reg->override->val) == ftr_mask) { reg->override->val &= ~ftr_mask; pr_warn("%s[%d:%d]: impossible override, ignored\n", @@ -1152,17 +1194,16 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) { unsigned long cpacr = cpacr_save_enable_kernel_sme(); - /* - * We mask out SMPS since even if the hardware - * supports priorities the kernel does not at present - * and we block access to them. - */ - info->reg_smidr = read_cpuid(SMIDR_EL1) & ~SMIDR_EL1_SMPS; vec_init_vq_map(ARM64_VEC_SME); cpacr_restore(cpacr); } + if (id_aa64pfr0_mpam(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { + info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); + init_cpu_ftr_reg(SYS_MPAMIDR_EL1, info->reg_mpamidr); + } + if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) init_cpu_ftr_reg(SYS_GMID_EL1, info->reg_gmid); } @@ -1372,6 +1413,8 @@ void update_cpu_features(int cpu, info->reg_id_aa64mmfr2, boot->reg_id_aa64mmfr2); taint |= check_update_ftr_reg(SYS_ID_AA64MMFR3_EL1, cpu, info->reg_id_aa64mmfr3, boot->reg_id_aa64mmfr3); + taint |= check_update_ftr_reg(SYS_ID_AA64MMFR4_EL1, cpu, + info->reg_id_aa64mmfr4, boot->reg_id_aa64mmfr4); taint |= check_update_ftr_reg(SYS_ID_AA64PFR0_EL1, cpu, info->reg_id_aa64pfr0, boot->reg_id_aa64pfr0); @@ -1405,13 +1448,6 @@ void update_cpu_features(int cpu, id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) { unsigned long cpacr = cpacr_save_enable_kernel_sme(); - /* - * We mask out SMPS since even if the hardware - * supports priorities the kernel does not at present - * and we block access to them. - */ - info->reg_smidr = read_cpuid(SMIDR_EL1) & ~SMIDR_EL1_SMPS; - /* Probe vector lengths */ if (!system_capabilities_finalized()) vec_update_vq_map(ARM64_VEC_SME); @@ -1419,6 +1455,12 @@ void update_cpu_features(int cpu, cpacr_restore(cpacr); } + if (id_aa64pfr0_mpam(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { + info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); + taint |= check_update_ftr_reg(SYS_MPAMIDR_EL1, cpu, + info->reg_mpamidr, boot->reg_mpamidr); + } + /* * The kernel uses the LDGM/STGM instructions and the number of tags * they read/write depends on the GMID_EL1.BS field. 
Check that the @@ -1618,6 +1660,11 @@ const struct cpumask *system_32bit_el0_cpumask(void) return cpu_possible_mask; } +const struct cpumask *task_cpu_fallback_mask(struct task_struct *p) +{ + return __task_cpu_possible_mask(p, housekeeping_cpumask(HK_TYPE_TICK)); +} + static int __init parse_32bit_el0_param(char *str) { allow_mismatched_32bit_el0 = true; @@ -1760,7 +1807,7 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, char const *str = "kpti command line option"; bool meltdown_safe; - meltdown_safe = is_midr_in_range_list(read_cpuid_id(), kpti_safe_list); + meltdown_safe = is_midr_in_range_list(kpti_safe_list); /* Defer to CPU feature registers */ if (has_cpuid_feature(entry, scope)) @@ -1830,7 +1877,7 @@ static bool has_nv1(const struct arm64_cpu_capabilities *entry, int scope) return (__system_matches_cap(ARM64_HAS_NESTED_VIRT) && !(has_cpuid_feature(entry, scope) || - is_midr_in_range_list(read_cpuid_id(), nv1_ni_list))); + is_midr_in_range_list(nv1_ni_list))); } #if defined(ID_AA64MMFR0_EL1_TGRAN_LPA2) && defined(ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2) @@ -1866,6 +1913,28 @@ static bool has_lpa2(const struct arm64_cpu_capabilities *entry, int scope) } #endif +#ifdef CONFIG_HW_PERF_EVENTS +static bool has_pmuv3(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); + unsigned int pmuver; + + /* + * PMUVer follows the standard ID scheme for an unsigned field with the + * exception of 0xF (IMP_DEF) which is treated specially and implies + * FEAT_PMUv3 is not implemented. + * + * See DDI0487L.a D24.1.3.2 for more details. + */ + pmuver = cpuid_feature_extract_unsigned_field(dfr0, + ID_AA64DFR0_EL1_PMUVer_SHIFT); + if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) + return false; + + return pmuver >= ID_AA64DFR0_EL1_PMUVer_IMP; +} +#endif + #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 #define KPTI_NG_TEMP_VA (-(1UL << PMD_SHIFT)) @@ -2013,7 +2082,7 @@ static bool cpu_has_broken_dbm(void) {}, }; - return is_midr_in_range_list(read_cpuid_id(), cpus); + return is_midr_in_range_list(cpus); } static bool cpu_can_use_dbm(const struct arm64_cpu_capabilities *cap) @@ -2130,7 +2199,7 @@ static bool has_nested_virt_support(const struct arm64_cpu_capabilities *cap, if (kvm_get_mode() != KVM_MODE_NV) return false; - if (!has_cpuid_feature(cap, scope)) { + if (!cpucap_multi_entry_cap_matches(cap, scope)) { pr_warn("unavailable: %s\n", cap->desc); return false; } @@ -2353,8 +2422,16 @@ static void cpu_enable_mops(const struct arm64_cpu_capabilities *__unused) #ifdef CONFIG_ARM64_POE static void cpu_enable_poe(const struct arm64_cpu_capabilities *__unused) { - sysreg_clear_set(REG_TCR2_EL1, 0, TCR2_EL1x_E0POE); - sysreg_clear_set(CPACR_EL1, 0, CPACR_ELx_E0POE); + sysreg_clear_set(REG_TCR2_EL1, 0, TCR2_EL1_E0POE); + sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_E0POE); +} +#endif + +#ifdef CONFIG_ARM64_GCS +static void cpu_enable_gcs(const struct arm64_cpu_capabilities *__unused) +{ + /* GCSPR_EL0 is always readable */ + write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1); } #endif @@ -2377,6 +2454,36 @@ cpucap_panic_on_conflict(const struct arm64_cpu_capabilities *cap) return !!(cap->type & ARM64_CPUCAP_PANIC_ON_CONFLICT); } +static bool +test_has_mpam(const struct arm64_cpu_capabilities *entry, int scope) +{ + if (!has_cpuid_feature(entry, scope)) + return false; + + /* Check firmware actually enabled MPAM on this cpu. 
*/ + return (read_sysreg_s(SYS_MPAM1_EL1) & MPAM1_EL1_MPAMEN); +} + +static void +cpu_enable_mpam(const struct arm64_cpu_capabilities *entry) +{ + /* + * Access by the kernel (at EL1) should use the reserved PARTID + * which is configured unrestricted. This avoids priority-inversion + * where latency sensitive tasks have to wait for a task that has + * been throttled to release the lock. + */ + write_sysreg_s(0, SYS_MPAM1_EL1); +} + +static bool +test_has_mpam_hcr(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 idr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); + + return idr & MPAMIDR_EL1_HAS_HCR; +} + static const struct arm64_cpu_capabilities arm64_features[] = { { .capability = ARM64_ALWAYS_BOOT, @@ -2449,7 +2556,17 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_NESTED_VIRT, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_nested_virt_support, - ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, NV, NV2) + .match_list = (const struct arm64_cpu_capabilities []){ + { + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, NV, NV2) + }, + { + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY) + }, + { /* Sentinel */ } + }, }, { .capability = ARM64_HAS_32BIT_EL0_DO_NOT_USE, @@ -2591,6 +2708,21 @@ static const struct arm64_cpu_capabilities arm64_features[] = { ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, DBM) }, #endif +#ifdef CONFIG_ARM64_HAFT + { + .desc = "Hardware managed Access Flag for Table Descriptors", + /* + * Contrary to the page/block access flag, the table access flag + * cannot be emulated in software (no access fault will occur). + * Therefore this should be used only if it's supported system + * wide. + */ + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAFT, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, HAFT) + }, +#endif { .desc = "CRC32 instructions", .capability = ARM64_HAS_CRC32, @@ -2757,6 +2889,13 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, FGT, IMP) }, + { + .desc = "Fine Grained Traps 2", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAS_FGT2, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, FGT, FGT2) + }, #ifdef CONFIG_ARM64_SME { .desc = "Scalable Matrix Extension", @@ -2874,6 +3013,20 @@ static const struct arm64_cpu_capabilities arm64_features[] = { }, #endif { + .desc = "Memory Partitioning And Monitoring", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_MPAM, + .matches = test_has_mpam, + .cpu_enable = cpu_enable_mpam, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, MPAM, 1) + }, + { + .desc = "Memory Partitioning And Monitoring Virtualisation", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_MPAM_HCR, + .matches = test_has_mpam_hcr, + }, + { .desc = "NV1", .capability = ARM64_HAS_HCR_NV1, .type = ARM64_CPUCAP_SYSTEM_FEATURE, @@ -2890,6 +3043,24 @@ static const struct arm64_cpu_capabilities arm64_features[] = { ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1POE, IMP) }, #endif +#ifdef CONFIG_ARM64_GCS + { + .desc = "Guarded Control Stack (GCS)", + .capability = ARM64_HAS_GCS, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .cpu_enable = cpu_enable_gcs, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, GCS, IMP) + }, +#endif +#ifdef CONFIG_HW_PERF_EVENTS + { + .desc = "PMUv3", + .capability = ARM64_HAS_PMUV3, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_pmuv3, + 
}, +#endif {}, }; @@ -2922,6 +3093,13 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = match, \ } +#define HWCAP_CAP_MATCH_ID(match, reg, field, min_value, cap_type, cap) \ + { \ + __HWCAP_CAP(#cap, cap_type, cap) \ + HWCAP_CPUID_MATCH(reg, field, min_value) \ + .matches = match, \ + } + #ifdef CONFIG_ARM64_PTR_AUTH static const struct arm64_cpu_capabilities ptr_auth_hwcap_addr_matches[] = { { @@ -2950,6 +3128,13 @@ static const struct arm64_cpu_capabilities ptr_auth_hwcap_gen_matches[] = { }; #endif +#ifdef CONFIG_ARM64_SVE +static bool has_sve_feature(const struct arm64_cpu_capabilities *cap, int scope) +{ + return system_supports_sve() && has_user_cpuid_feature(cap, scope); +} +#endif + static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ISAR0_EL1, AES, PMULL, CAP_HWCAP, KERNEL_HWCAP_PMULL), HWCAP_CAP(ID_AA64ISAR0_EL1, AES, AES, CAP_HWCAP, KERNEL_HWCAP_AES), @@ -2968,6 +3153,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM, CAP_HWCAP, KERNEL_HWCAP_FLAGM), HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM2, CAP_HWCAP, KERNEL_HWCAP_FLAGM2), HWCAP_CAP(ID_AA64ISAR0_EL1, RNDR, IMP, CAP_HWCAP, KERNEL_HWCAP_RNG), + HWCAP_CAP(ID_AA64ISAR3_EL1, FPRCVT, IMP, CAP_HWCAP, KERNEL_HWCAP_FPRCVT), HWCAP_CAP(ID_AA64PFR0_EL1, FP, IMP, CAP_HWCAP, KERNEL_HWCAP_FP), HWCAP_CAP(ID_AA64PFR0_EL1, FP, FP16, CAP_HWCAP, KERNEL_HWCAP_FPHP), HWCAP_CAP(ID_AA64PFR0_EL1, AdvSIMD, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMD), @@ -2992,19 +3178,27 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64MMFR2_EL1, AT, IMP, CAP_HWCAP, KERNEL_HWCAP_USCAT), #ifdef CONFIG_ARM64_SVE HWCAP_CAP(ID_AA64PFR0_EL1, SVE, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE), - HWCAP_CAP(ID_AA64ZFR0_EL1, SVEver, SVE2p1, CAP_HWCAP, KERNEL_HWCAP_SVE2P1), - HWCAP_CAP(ID_AA64ZFR0_EL1, SVEver, SVE2, CAP_HWCAP, KERNEL_HWCAP_SVE2), - HWCAP_CAP(ID_AA64ZFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEAES), - HWCAP_CAP(ID_AA64ZFR0_EL1, AES, PMULL128, CAP_HWCAP, KERNEL_HWCAP_SVEPMULL), - HWCAP_CAP(ID_AA64ZFR0_EL1, BitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBITPERM), - HWCAP_CAP(ID_AA64ZFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE_B16B16), - HWCAP_CAP(ID_AA64ZFR0_EL1, BF16, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBF16), - HWCAP_CAP(ID_AA64ZFR0_EL1, BF16, EBF16, CAP_HWCAP, KERNEL_HWCAP_SVE_EBF16), - HWCAP_CAP(ID_AA64ZFR0_EL1, SHA3, IMP, CAP_HWCAP, KERNEL_HWCAP_SVESHA3), - HWCAP_CAP(ID_AA64ZFR0_EL1, SM4, IMP, CAP_HWCAP, KERNEL_HWCAP_SVESM4), - HWCAP_CAP(ID_AA64ZFR0_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEI8MM), - HWCAP_CAP(ID_AA64ZFR0_EL1, F32MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF32MM), - HWCAP_CAP(ID_AA64ZFR0_EL1, F64MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SVEver, SVE2p2, CAP_HWCAP, KERNEL_HWCAP_SVE2P2), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SVEver, SVE2p1, CAP_HWCAP, KERNEL_HWCAP_SVE2P1), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SVEver, SVE2, CAP_HWCAP, KERNEL_HWCAP_SVE2), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEAES), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, AES, PMULL128, CAP_HWCAP, KERNEL_HWCAP_SVEPMULL), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, AES, AES2, CAP_HWCAP, KERNEL_HWCAP_SVE_AES2), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, BitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBITPERM), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, B16B16, IMP, CAP_HWCAP, 
KERNEL_HWCAP_SVE_B16B16), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, B16B16, BFSCALE, CAP_HWCAP, KERNEL_HWCAP_SVE_BFSCALE), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, BF16, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBF16), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, BF16, EBF16, CAP_HWCAP, KERNEL_HWCAP_SVE_EBF16), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SHA3, IMP, CAP_HWCAP, KERNEL_HWCAP_SVESHA3), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SM4, IMP, CAP_HWCAP, KERNEL_HWCAP_SVESM4), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEI8MM), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, F32MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF32MM), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, F64MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, F16MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE_F16MM), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, EltPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE_ELTPERM), +#endif +#ifdef CONFIG_ARM64_GCS + HWCAP_CAP(ID_AA64PFR1_EL1, GCS, IMP, CAP_HWCAP, KERNEL_HWCAP_GCS), #endif HWCAP_CAP(ID_AA64PFR1_EL1, SSBS, SSBS2, CAP_HWCAP, KERNEL_HWCAP_SSBS), #ifdef CONFIG_ARM64_BTI @@ -3021,6 +3215,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64MMFR0_EL1, ECV, IMP, CAP_HWCAP, KERNEL_HWCAP_ECV), HWCAP_CAP(ID_AA64MMFR1_EL1, AFP, IMP, CAP_HWCAP, KERNEL_HWCAP_AFP), HWCAP_CAP(ID_AA64ISAR2_EL1, CSSC, IMP, CAP_HWCAP, KERNEL_HWCAP_CSSC), + HWCAP_CAP(ID_AA64ISAR2_EL1, CSSC, CMPBR, CAP_HWCAP, KERNEL_HWCAP_CMPBR), HWCAP_CAP(ID_AA64ISAR2_EL1, RPRFM, IMP, CAP_HWCAP, KERNEL_HWCAP_RPRFM), HWCAP_CAP(ID_AA64ISAR2_EL1, RPRES, IMP, CAP_HWCAP, KERNEL_HWCAP_RPRES), HWCAP_CAP(ID_AA64ISAR2_EL1, WFxT, IMP, CAP_HWCAP, KERNEL_HWCAP_WFXT), @@ -3030,6 +3225,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64PFR1_EL1, SME, IMP, CAP_HWCAP, KERNEL_HWCAP_SME), HWCAP_CAP(ID_AA64SMFR0_EL1, FA64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_FA64), HWCAP_CAP(ID_AA64SMFR0_EL1, LUTv2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_LUTV2), + HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2p2, CAP_HWCAP, KERNEL_HWCAP_SME2P2), HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2p1, CAP_HWCAP, KERNEL_HWCAP_SME2P1), HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2, CAP_HWCAP, KERNEL_HWCAP_SME2), HWCAP_CAP(ID_AA64SMFR0_EL1, I16I64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64), @@ -3047,11 +3243,18 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64SMFR0_EL1, SF8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8FMA), HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP4), HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP2), + HWCAP_CAP(ID_AA64SMFR0_EL1, SBitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SBITPERM), + HWCAP_CAP(ID_AA64SMFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_AES), + HWCAP_CAP(ID_AA64SMFR0_EL1, SFEXPA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SFEXPA), + HWCAP_CAP(ID_AA64SMFR0_EL1, STMOP, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_STMOP), + HWCAP_CAP(ID_AA64SMFR0_EL1, SMOP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SMOP4), #endif /* CONFIG_ARM64_SME */ HWCAP_CAP(ID_AA64FPFR0_EL1, F8CVT, IMP, CAP_HWCAP, KERNEL_HWCAP_F8CVT), HWCAP_CAP(ID_AA64FPFR0_EL1, F8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_F8FMA), HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP4), HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP2), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8MM8, IMP, CAP_HWCAP, 
KERNEL_HWCAP_F8MM8), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8MM4, IMP, CAP_HWCAP, KERNEL_HWCAP_F8MM4), HWCAP_CAP(ID_AA64FPFR0_EL1, F8E4M3, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E4M3), HWCAP_CAP(ID_AA64FPFR0_EL1, F8E5M2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E5M2), #ifdef CONFIG_ARM64_POE @@ -3375,7 +3578,7 @@ static void verify_hyp_capabilities(void) return; safe_mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); - mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); /* Verify VMID bits */ @@ -3396,6 +3599,36 @@ static void verify_hyp_capabilities(void) } } +static void verify_mpam_capabilities(void) +{ + u64 cpu_idr = read_cpuid(ID_AA64PFR0_EL1); + u64 sys_idr = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + u16 cpu_partid_max, cpu_pmg_max, sys_partid_max, sys_pmg_max; + + if (FIELD_GET(ID_AA64PFR0_EL1_MPAM_MASK, cpu_idr) != + FIELD_GET(ID_AA64PFR0_EL1_MPAM_MASK, sys_idr)) { + pr_crit("CPU%d: MPAM version mismatch\n", smp_processor_id()); + cpu_die_early(); + } + + cpu_idr = read_cpuid(MPAMIDR_EL1); + sys_idr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); + if (FIELD_GET(MPAMIDR_EL1_HAS_HCR, cpu_idr) != + FIELD_GET(MPAMIDR_EL1_HAS_HCR, sys_idr)) { + pr_crit("CPU%d: Missing MPAM HCR\n", smp_processor_id()); + cpu_die_early(); + } + + cpu_partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, cpu_idr); + cpu_pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, cpu_idr); + sys_partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, sys_idr); + sys_pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, sys_idr); + if (cpu_partid_max < sys_partid_max || cpu_pmg_max < sys_pmg_max) { + pr_crit("CPU%d: MPAM PARTID/PMG max values are mismatched\n", smp_processor_id()); + cpu_die_early(); + } +} + /* * Run through the enabled system capabilities and enable() it on this CPU. * The capabilities were decided based on the available CPUs at the boot time. @@ -3422,6 +3655,9 @@ static void verify_local_cpu_capabilities(void) if (is_hyp_mode_available()) verify_hyp_capabilities(); + + if (system_supports_mpam()) + verify_mpam_capabilities(); } void check_local_cpu_capabilities(void) @@ -3499,8 +3735,14 @@ unsigned long cpu_get_elf_hwcap2(void) return elf_hwcap[1]; } +unsigned long cpu_get_elf_hwcap3(void) +{ + return elf_hwcap[2]; +} + static void __init setup_boot_cpu_capabilities(void) { + kvm_arm_target_impl_cpu_init(); /* * The boot CPU's feature register values have been recorded. 
Detect * boot cpucaps and local cpucaps for the boot CPU, then enable and @@ -3601,7 +3843,14 @@ static int enable_mismatched_32bit_el0(unsigned int cpu) static int lucky_winner = -1; struct cpuinfo_arm64 *info = &per_cpu(cpu_data, cpu); - bool cpu_32bit = id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0); + bool cpu_32bit = false; + + if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { + if (!housekeeping_cpu(cpu, HK_TYPE_TICK)) + pr_info("Treating adaptive-ticks CPU %u as 64-bit only\n", cpu); + else + cpu_32bit = true; + } if (cpu_32bit) { cpumask_set_cpu(cpu, cpu_32bit_el0_mask); diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 44718d0482b3..c1f2b6b04b41 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -80,6 +80,7 @@ static const char *const hwcap_str[] = { [KERNEL_HWCAP_SB] = "sb", [KERNEL_HWCAP_PACA] = "paca", [KERNEL_HWCAP_PACG] = "pacg", + [KERNEL_HWCAP_GCS] = "gcs", [KERNEL_HWCAP_DCPODP] = "dcpodp", [KERNEL_HWCAP_SVE2] = "sve2", [KERNEL_HWCAP_SVEAES] = "sveaes", @@ -144,6 +145,21 @@ static const char *const hwcap_str[] = { [KERNEL_HWCAP_SME_SF8DP4] = "smesf8dp4", [KERNEL_HWCAP_SME_SF8DP2] = "smesf8dp2", [KERNEL_HWCAP_POE] = "poe", + [KERNEL_HWCAP_CMPBR] = "cmpbr", + [KERNEL_HWCAP_FPRCVT] = "fprcvt", + [KERNEL_HWCAP_F8MM8] = "f8mm8", + [KERNEL_HWCAP_F8MM4] = "f8mm4", + [KERNEL_HWCAP_SVE_F16MM] = "svef16mm", + [KERNEL_HWCAP_SVE_ELTPERM] = "sveeltperm", + [KERNEL_HWCAP_SVE_AES2] = "sveaes2", + [KERNEL_HWCAP_SVE_BFSCALE] = "svebfscale", + [KERNEL_HWCAP_SVE2P2] = "sve2p2", + [KERNEL_HWCAP_SME2P2] = "sme2p2", + [KERNEL_HWCAP_SME_SBITPERM] = "smesbitperm", + [KERNEL_HWCAP_SME_AES] = "smeaes", + [KERNEL_HWCAP_SME_SFEXPA] = "smesfexpa", + [KERNEL_HWCAP_SME_STMOP] = "smestmop", + [KERNEL_HWCAP_SME_SMOP4] = "smesmop4", }; #ifdef CONFIG_COMPAT @@ -193,80 +209,79 @@ static const char *const compat_hwcap2_str[] = { static int c_show(struct seq_file *m, void *v) { - int i, j; + int j; + int cpu = m->index; bool compat = personality(current->personality) == PER_LINUX32; + struct cpuinfo_arm64 *cpuinfo = v; + u32 midr = cpuinfo->reg_midr; - for_each_online_cpu(i) { - struct cpuinfo_arm64 *cpuinfo = &per_cpu(cpu_data, i); - u32 midr = cpuinfo->reg_midr; - - /* - * glibc reads /proc/cpuinfo to determine the number of - * online processors, looking for lines beginning with - * "processor". Give glibc what it expects. - */ - seq_printf(m, "processor\t: %d\n", i); - if (compat) - seq_printf(m, "model name\t: ARMv8 Processor rev %d (%s)\n", - MIDR_REVISION(midr), COMPAT_ELF_PLATFORM); + /* + * glibc reads /proc/cpuinfo to determine the number of + * online processors, looking for lines beginning with + * "processor". Give glibc what it expects. + */ + seq_printf(m, "processor\t: %d\n", cpu); + if (compat) + seq_printf(m, "model name\t: ARMv8 Processor rev %d (%s)\n", + MIDR_REVISION(midr), COMPAT_ELF_PLATFORM); - seq_printf(m, "BogoMIPS\t: %lu.%02lu\n", - loops_per_jiffy / (500000UL/HZ), - loops_per_jiffy / (5000UL/HZ) % 100); + seq_printf(m, "BogoMIPS\t: %lu.%02lu\n", + loops_per_jiffy / (500000UL/HZ), + loops_per_jiffy / (5000UL/HZ) % 100); - /* - * Dump out the common processor features in a single line. - * Userspace should read the hwcaps with getauxval(AT_HWCAP) - * rather than attempting to parse this, but there's a body of - * software which does already (at least for 32-bit). - */ - seq_puts(m, "Features\t:"); - if (compat) { + /* + * Dump out the common processor features in a single line. 
+ * Userspace should read the hwcaps with getauxval(AT_HWCAP) + * rather than attempting to parse this, but there's a body of + * software which does already (at least for 32-bit). + */ + seq_puts(m, "Features\t:"); + if (compat) { #ifdef CONFIG_COMPAT - for (j = 0; j < ARRAY_SIZE(compat_hwcap_str); j++) { - if (compat_elf_hwcap & (1 << j)) { - /* - * Warn once if any feature should not - * have been present on arm64 platform. - */ - if (WARN_ON_ONCE(!compat_hwcap_str[j])) - continue; - - seq_printf(m, " %s", compat_hwcap_str[j]); - } + for (j = 0; j < ARRAY_SIZE(compat_hwcap_str); j++) { + if (compat_elf_hwcap & (1 << j)) { + /* + * Warn once if any feature should not + * have been present on arm64 platform. + */ + if (WARN_ON_ONCE(!compat_hwcap_str[j])) + continue; + + seq_printf(m, " %s", compat_hwcap_str[j]); } + } - for (j = 0; j < ARRAY_SIZE(compat_hwcap2_str); j++) - if (compat_elf_hwcap2 & (1 << j)) - seq_printf(m, " %s", compat_hwcap2_str[j]); + for (j = 0; j < ARRAY_SIZE(compat_hwcap2_str); j++) + if (compat_elf_hwcap2 & (1 << j)) + seq_printf(m, " %s", compat_hwcap2_str[j]); #endif /* CONFIG_COMPAT */ - } else { - for (j = 0; j < ARRAY_SIZE(hwcap_str); j++) - if (cpu_have_feature(j)) - seq_printf(m, " %s", hwcap_str[j]); - } - seq_puts(m, "\n"); - - seq_printf(m, "CPU implementer\t: 0x%02x\n", - MIDR_IMPLEMENTOR(midr)); - seq_printf(m, "CPU architecture: 8\n"); - seq_printf(m, "CPU variant\t: 0x%x\n", MIDR_VARIANT(midr)); - seq_printf(m, "CPU part\t: 0x%03x\n", MIDR_PARTNUM(midr)); - seq_printf(m, "CPU revision\t: %d\n\n", MIDR_REVISION(midr)); + } else { + for (j = 0; j < ARRAY_SIZE(hwcap_str); j++) + if (cpu_have_feature(j)) + seq_printf(m, " %s", hwcap_str[j]); } + seq_puts(m, "\n"); + + seq_printf(m, "CPU implementer\t: 0x%02x\n", + MIDR_IMPLEMENTOR(midr)); + seq_puts(m, "CPU architecture: 8\n"); + seq_printf(m, "CPU variant\t: 0x%x\n", MIDR_VARIANT(midr)); + seq_printf(m, "CPU part\t: 0x%03x\n", MIDR_PARTNUM(midr)); + seq_printf(m, "CPU revision\t: %d\n\n", MIDR_REVISION(midr)); return 0; } static void *c_start(struct seq_file *m, loff_t *pos) { - return *pos < 1 ? (void *)1 : NULL; + *pos = cpumask_next(*pos - 1, cpu_online_mask); + return *pos < nr_cpu_ids ? &per_cpu(cpu_data, *pos) : NULL; } static void *c_next(struct seq_file *m, void *v, loff_t *pos) { ++*pos; - return NULL; + return c_start(m, pos); } static void c_stop(struct seq_file *m, void *v) @@ -312,11 +327,13 @@ static const struct kobj_type cpuregs_kobj_type = { CPUREGS_ATTR_RO(midr_el1, midr); CPUREGS_ATTR_RO(revidr_el1, revidr); +CPUREGS_ATTR_RO(aidr_el1, aidr); CPUREGS_ATTR_RO(smidr_el1, smidr); static struct attribute *cpuregs_id_attrs[] = { &cpuregs_attr_midr_el1.attr, &cpuregs_attr_revidr_el1.attr, + &cpuregs_attr_aidr_el1.attr, NULL }; @@ -453,6 +470,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_dczid = read_cpuid(DCZID_EL0); info->reg_midr = read_cpuid_id(); info->reg_revidr = read_cpuid(REVIDR_EL1); + info->reg_aidr = read_cpuid(AIDR_EL1); info->reg_id_aa64dfr0 = read_cpuid(ID_AA64DFR0_EL1); info->reg_id_aa64dfr1 = read_cpuid(ID_AA64DFR1_EL1); @@ -478,6 +496,22 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) __cpuinfo_store_cpu_32bit(&info->aarch32); + /* + * info->reg_mpamidr deferred to {init,update}_cpu_features because we + * don't want to read it (and trigger a trap on buggy firmware) if + * using an aa64pfr0_el1 override to unconditionally disable MPAM. 
+ */ + + if (IS_ENABLED(CONFIG_ARM64_SME) && + id_aa64pfr1_sme(info->reg_id_aa64pfr1)) { + /* + * We mask out SMPS since even if the hardware + * supports priorities the kernel does not at present + * and we block access to them. + */ + info->reg_smidr = read_cpuid(SMIDR_EL1) & ~SMIDR_EL1_SMPS; + } + cpuinfo_detect_icache_policy(info); } diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 024a7b245056..58f047de3e1c 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -303,7 +303,6 @@ static int call_break_hook(struct pt_regs *regs, unsigned long esr) { struct break_hook *hook; struct list_head *list; - int (*fn)(struct pt_regs *regs, unsigned long esr) = NULL; list = user_mode(regs) ? &user_break_hook : &kernel_break_hook; @@ -313,10 +312,10 @@ static int call_break_hook(struct pt_regs *regs, unsigned long esr) */ list_for_each_entry_rcu(hook, list, node) { if ((esr_brk_comment(esr) & ~hook->mask) == hook->imm) - fn = hook->fn; + return hook->fn(regs, esr); } - return fn ? fn(regs, esr) : DBG_HOOK_ERROR; + return DBG_HOOK_ERROR; } NOKPROBE_SYMBOL(call_break_hook); @@ -441,6 +440,11 @@ void kernel_rewind_single_step(struct pt_regs *regs) set_regs_spsr_ss(regs); } +void kernel_fastforward_single_step(struct pt_regs *regs) +{ + clear_regs_spsr_ss(regs); +} + /* ptrace API */ void user_enable_single_step(struct task_struct *task) { diff --git a/arch/arm64/kernel/efi-header.S b/arch/arm64/kernel/efi-header.S index 11d7f7de202d..329e8df9215f 100644 --- a/arch/arm64/kernel/efi-header.S +++ b/arch/arm64/kernel/efi-header.S @@ -28,7 +28,7 @@ .macro __EFI_PE_HEADER #ifdef CONFIG_EFI .set .Lpe_header_offset, . - .L_head - .long PE_MAGIC + .long IMAGE_NT_SIGNATURE .short IMAGE_FILE_MACHINE_ARM64 // Machine .short .Lsection_count // NumberOfSections .long 0 // TimeDateStamp @@ -40,7 +40,7 @@ IMAGE_FILE_LINE_NUMS_STRIPPED // Characteristics .Loptional_header: - .short PE_OPT_MAGIC_PE32PLUS // PE32+ format + .short IMAGE_NT_OPTIONAL_HDR64_MAGIC // PE32+ format .byte 0x02 // MajorLinkerVersion .byte 0x14 // MinorLinkerVersion .long __initdata_begin - .Lefi_header_end // SizeOfCode @@ -66,7 +66,7 @@ .long .Lefi_header_end - .L_head // SizeOfHeaders .long 0 // CheckSum .short IMAGE_SUBSYSTEM_EFI_APPLICATION // Subsystem - .short IMAGE_DLL_CHARACTERISTICS_NX_COMPAT // DllCharacteristics + .short IMAGE_DLLCHARACTERISTICS_NX_COMPAT // DllCharacteristics .quad 0 // SizeOfStackReserve .quad 0 // SizeOfStackCommit .quad 0 // SizeOfHeapReserve diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 712718aed5dd..3857fd7ee8d4 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -29,13 +29,21 @@ static bool region_is_misaligned(const efi_memory_desc_t *md) * executable, everything else can be mapped with the XN bits * set. Also take the new (optional) RO/XP bits into account. 
*/ -static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) +static __init ptdesc_t create_mapping_protection(efi_memory_desc_t *md) { u64 attr = md->attribute; u32 type = md->type; - if (type == EFI_MEMORY_MAPPED_IO) - return PROT_DEVICE_nGnRE; + if (type == EFI_MEMORY_MAPPED_IO) { + pgprot_t prot = __pgprot(PROT_DEVICE_nGnRE); + + if (arm64_is_protected_mmio(md->phys_addr, + md->num_pages << EFI_PAGE_SHIFT)) + prot = pgprot_encrypted(prot); + else + prot = pgprot_decrypted(prot); + return pgprot_val(prot); + } if (region_is_misaligned(md)) { static bool __initdata code_is_misaligned; @@ -75,7 +83,7 @@ static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) { - pteval_t prot_val = create_mapping_protection(md); + ptdesc_t prot_val = create_mapping_protection(md); bool page_mappings_only = (md->type == EFI_RUNTIME_SERVICES_CODE || md->type == EFI_RUNTIME_SERVICES_DATA); @@ -161,14 +169,14 @@ static DEFINE_RAW_SPINLOCK(efi_rt_lock); void arch_efi_call_virt_setup(void) { efi_virtmap_load(); - __efi_fpsimd_begin(); raw_spin_lock(&efi_rt_lock); + __efi_fpsimd_begin(); } void arch_efi_call_virt_teardown(void) { - raw_spin_unlock(&efi_rt_lock); __efi_fpsimd_end(); + raw_spin_unlock(&efi_rt_lock); efi_virtmap_unload(); } diff --git a/arch/arm64/kernel/elfcore.c b/arch/arm64/kernel/elfcore.c index 2e94d20c4ac7..b735f4c2fe5e 100644 --- a/arch/arm64/kernel/elfcore.c +++ b/arch/arm64/kernel/elfcore.c @@ -27,9 +27,10 @@ static int mte_dump_tag_range(struct coredump_params *cprm, int ret = 1; unsigned long addr; void *tags = NULL; + int locked = 0; for (addr = start; addr < start + len; addr += PAGE_SIZE) { - struct page *page = get_dump_page(addr); + struct page *page = get_dump_page(addr, &locked); /* * get_dump_page() returns NULL when encountering an empty diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 3fcd9d080bf2..7c1970b341b8 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -132,7 +132,7 @@ static void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) do { local_irq_enable(); - if (thread_flags & _TIF_NEED_RESCHED) + if (thread_flags & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) schedule(); if (thread_flags & _TIF_UPROBE) @@ -393,20 +393,16 @@ static bool cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs) * As per the ABI exit SME streaming mode and clear the SVE state not * shared with FPSIMD on syscall entry. */ -static inline void fp_user_discard(void) +static inline void fpsimd_syscall_enter(void) { - /* - * If SME is active then exit streaming mode. If ZA is active - * then flush the SVE registers but leave userspace access to - * both SVE and SME enabled, otherwise disable SME for the - * task and fall through to disabling SVE too. This means - * that after a syscall we never have any streaming mode - * register state to track, if this changes the KVM code will - * need updating. - */ + /* Ensure PSTATE.SM is clear, but leave PSTATE.ZA as-is. */ if (system_supports_sme()) sme_smstop_sm(); + /* + * The CPU is not in streaming mode. If non-streaming SVE is not + * supported, there is no SVE state that needs to be discarded. 
+ */ if (!system_supports_sve()) return; @@ -416,6 +412,33 @@ static inline void fp_user_discard(void) sve_vq_minus_one = sve_vq_from_vl(task_get_sve_vl(current)) - 1; sve_flush_live(true, sve_vq_minus_one); } + + /* + * Any live non-FPSIMD SVE state has been zeroed. Allow + * fpsimd_save_user_state() to lazily discard SVE state until either + * the live state is unbound or fpsimd_syscall_exit() is called. + */ + __this_cpu_write(fpsimd_last_state.to_save, FP_STATE_FPSIMD); +} + +static __always_inline void fpsimd_syscall_exit(void) +{ + if (!system_supports_sve()) + return; + + /* + * The current task's user FPSIMD/SVE/SME state is now bound to this + * CPU. The fpsimd_last_state.to_save value is either: + * + * - FP_STATE_FPSIMD, if the state has not been reloaded on this CPU + * since fpsimd_syscall_enter(). + * + * - FP_STATE_CURRENT, if the state has been reloaded on this CPU at + * any point. + * + * Reset this to FP_STATE_CURRENT to stop lazy discarding. + */ + __this_cpu_write(fpsimd_last_state.to_save, FP_STATE_CURRENT); } UNHANDLED(el1t, 64, sync) @@ -463,6 +486,24 @@ static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr) exit_to_kernel_mode(regs); } +static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr) +{ + enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_el1_gcs(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs); +} + +static void noinstr el1_mops(struct pt_regs *regs, unsigned long esr) +{ + enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_el1_mops(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs); +} + static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); @@ -505,6 +546,12 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_BTI: el1_bti(regs, esr); break; + case ESR_ELx_EC_GCS: + el1_gcs(regs, esr); + break; + case ESR_ELx_EC_MOPS: + el1_mops(regs, esr); + break; case ESR_ELx_EC_BREAKPT_CUR: case ESR_ELx_EC_SOFTSTP_CUR: case ESR_ELx_EC_WATCHPT_CUR: @@ -684,6 +731,14 @@ static void noinstr el0_mops(struct pt_regs *regs, unsigned long esr) exit_to_user_mode(regs); } +static void noinstr el0_gcs(struct pt_regs *regs, unsigned long esr) +{ + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_el0_gcs(regs, esr); + exit_to_user_mode(regs); +} + static void noinstr el0_inv(struct pt_regs *regs, unsigned long esr) { enter_from_user_mode(regs); @@ -707,10 +762,11 @@ static void noinstr el0_svc(struct pt_regs *regs) { enter_from_user_mode(regs); cortex_a76_erratum_1463225_svc_handler(); - fp_user_discard(); + fpsimd_syscall_enter(); local_daif_restore(DAIF_PROCCTX); do_el0_svc(regs); exit_to_user_mode(regs); + fpsimd_syscall_exit(); } static void noinstr el0_fpac(struct pt_regs *regs, unsigned long esr) @@ -766,6 +822,9 @@ asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_MOPS: el0_mops(regs, esr); break; + case ESR_ELx_EC_GCS: + el0_gcs(regs, esr); + break; case ESR_ELx_EC_BREAKPT_LOW: case ESR_ELx_EC_SOFTSTP_LOW: case ESR_ELx_EC_WATCHPT_LOW: diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S index f0c16640ef21..169ccf600066 100644 --- a/arch/arm64/kernel/entry-ftrace.S +++ b/arch/arm64/kernel/entry-ftrace.S @@ -329,24 +329,28 @@ SYM_FUNC_END(ftrace_stub_graph) * @fp is checked against the value passed by ftrace_graph_caller(). 
*/ SYM_CODE_START(return_to_handler) - /* save return value regs */ - sub sp, sp, #FGRET_REGS_SIZE - stp x0, x1, [sp, #FGRET_REGS_X0] - stp x2, x3, [sp, #FGRET_REGS_X2] - stp x4, x5, [sp, #FGRET_REGS_X4] - stp x6, x7, [sp, #FGRET_REGS_X6] - str x29, [sp, #FGRET_REGS_FP] // parent's fp + /* Make room for ftrace_regs */ + sub sp, sp, #FREGS_SIZE + + /* Save return value regs */ + stp x0, x1, [sp, #FREGS_X0] + stp x2, x3, [sp, #FREGS_X2] + stp x4, x5, [sp, #FREGS_X4] + stp x6, x7, [sp, #FREGS_X6] + + /* Save the callsite's FP */ + str x29, [sp, #FREGS_FP] mov x0, sp - bl ftrace_return_to_handler // addr = ftrace_return_to_hander(regs); + bl ftrace_return_to_handler // addr = ftrace_return_to_hander(fregs); mov x30, x0 // restore the original return address - /* restore return value regs */ - ldp x0, x1, [sp, #FGRET_REGS_X0] - ldp x2, x3, [sp, #FGRET_REGS_X2] - ldp x4, x5, [sp, #FGRET_REGS_X4] - ldp x6, x7, [sp, #FGRET_REGS_X6] - add sp, sp, #FGRET_REGS_SIZE + /* Restore return value regs */ + ldp x0, x1, [sp, #FREGS_X0] + ldp x2, x3, [sp, #FREGS_X2] + ldp x4, x5, [sp, #FREGS_X4] + ldp x6, x7, [sp, #FREGS_X6] + add sp, sp, #FREGS_SIZE ret SYM_CODE_END(return_to_handler) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 7ef0e127b149..5ae2a34b50bd 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -25,6 +25,7 @@ #include <asm/processor.h> #include <asm/ptrace.h> #include <asm/scs.h> +#include <asm/stacktrace/frame.h> #include <asm/thread_info.h> #include <asm/asm-uaccess.h> #include <asm/unistd.h> @@ -284,15 +285,16 @@ alternative_else_nop_endif stp lr, x21, [sp, #S_LR] /* - * For exceptions from EL0, create a final frame record. - * For exceptions from EL1, create a synthetic frame record so the - * interrupted code shows up in the backtrace. + * Create a metadata frame record. The unwinder will use this to + * identify and unwind exception boundaries. */ - .if \el == 0 stp xzr, xzr, [sp, #S_STACKFRAME] + .if \el == 0 + mov x0, #FRAME_META_TYPE_FINAL .else - stp x29, x22, [sp, #S_STACKFRAME] + mov x0, #FRAME_META_TYPE_PT_REGS .endif + str x0, [sp, #S_STACKFRAME_TYPE] add x29, sp, #S_STACKFRAME #ifdef CONFIG_ARM64_SW_TTBR0_PAN @@ -315,7 +317,7 @@ alternative_if_not ARM64_HAS_GIC_PRIO_MASKING alternative_else_nop_endif mrs_s x20, SYS_ICC_PMR_EL1 - str x20, [sp, #S_PMR_SAVE] + str w20, [sp, #S_PMR] mov x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET msr_s SYS_ICC_PMR_EL1, x20 @@ -342,7 +344,7 @@ alternative_if_not ARM64_HAS_GIC_PRIO_MASKING b .Lskip_pmr_restore\@ alternative_else_nop_endif - ldr x20, [sp, #S_PMR_SAVE] + ldr w20, [sp, #S_PMR] msr_s SYS_ICC_PMR_EL1, x20 /* Ensure priority change is seen by redistributor */ diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 77006df20a75..c37f02d7194e 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -119,7 +119,7 @@ * whatever is in the FPSIMD registers is not saved to memory, but discarded. */ -static DEFINE_PER_CPU(struct cpu_fp_state, fpsimd_last_state); +DEFINE_PER_CPU(struct cpu_fp_state, fpsimd_last_state); __ro_after_init struct vl_info vl_info[ARM64_VEC_MAX] = { #ifdef CONFIG_ARM64_SVE @@ -180,12 +180,12 @@ static inline void set_sve_default_vl(int val) set_default_vl(ARM64_VEC_SVE, val); } -static void __percpu *efi_sve_state; +static u8 *efi_sve_state; #else /* ! CONFIG_ARM64_SVE */ /* Dummy declaration for code that will be optimised out: */ -extern void __percpu *efi_sve_state; +extern u8 *efi_sve_state; #endif /* ! 
CONFIG_ARM64_SVE */ @@ -359,20 +359,15 @@ static void task_fpsimd_load(void) WARN_ON(preemptible()); WARN_ON(test_thread_flag(TIF_KERNEL_FPSTATE)); - if (system_supports_fpmr()) - write_sysreg_s(current->thread.uw.fpmr, SYS_FPMR); - if (system_supports_sve() || system_supports_sme()) { switch (current->thread.fp_type) { case FP_STATE_FPSIMD: /* Stop tracking SVE for this task until next use. */ - if (test_and_clear_thread_flag(TIF_SVE)) - sve_user_disable(); + clear_thread_flag(TIF_SVE); break; case FP_STATE_SVE: - if (!thread_sm_enabled(¤t->thread) && - !WARN_ON_ONCE(!test_and_set_thread_flag(TIF_SVE))) - sve_user_enable(); + if (!thread_sm_enabled(¤t->thread)) + WARN_ON_ONCE(!test_and_set_thread_flag(TIF_SVE)); if (test_thread_flag(TIF_SVE)) sve_set_vq(sve_vq_from_vl(task_get_sve_vl(current)) - 1); @@ -386,7 +381,7 @@ static void task_fpsimd_load(void) * fpsimd_save_user_state() or memory corruption, we * should always record an explicit format * when we save. We always at least have the - * memory allocated for FPSMID registers so + * memory allocated for FPSIMD registers so * try that and hope for the best. */ WARN_ON_ONCE(1); @@ -413,6 +408,9 @@ static void task_fpsimd_load(void) restore_ffr = system_supports_fa64(); } + if (system_supports_fpmr()) + write_sysreg_s(current->thread.uw.fpmr, SYS_FPMR); + if (restore_sve_regs) { WARN_ON_ONCE(current->thread.fp_type != FP_STATE_SVE); sve_load_state(sve_pffr(¤t->thread), @@ -453,12 +451,15 @@ static void fpsimd_save_user_state(void) *(last->fpmr) = read_sysreg_s(SYS_FPMR); /* - * If a task is in a syscall the ABI allows us to only - * preserve the state shared with FPSIMD so don't bother - * saving the full SVE state in that case. + * Save SVE state if it is live. + * + * The syscall ABI discards live SVE state at syscall entry. When + * entering a syscall, fpsimd_syscall_enter() sets to_save to + * FP_STATE_FPSIMD to allow the SVE state to be lazily discarded until + * either new SVE state is loaded+bound or fpsimd_syscall_exit() is + * called prior to a return to userspace. */ - if ((last->to_save == FP_STATE_CURRENT && test_thread_flag(TIF_SVE) && - !in_syscall(current_pt_regs())) || + if ((last->to_save == FP_STATE_CURRENT && test_thread_flag(TIF_SVE)) || last->to_save == FP_STATE_SVE) { save_sve_regs = true; save_ffr = true; @@ -562,7 +563,7 @@ static int vec_proc_do_default_vl(const struct ctl_table *table, int write, return 0; } -static struct ctl_table sve_default_vl_table[] = { +static const struct ctl_table sve_default_vl_table[] = { { .procname = "sve_default_vector_length", .mode = 0644, @@ -585,7 +586,7 @@ static int __init sve_sysctl_init(void) { return 0; } #endif /* ! (CONFIG_ARM64_SVE && CONFIG_SYSCTL) */ #if defined(CONFIG_ARM64_SME) && defined(CONFIG_SYSCTL) -static struct ctl_table sme_default_vl_table[] = { +static const struct ctl_table sme_default_vl_table[] = { { .procname = "sme_default_vector_length", .mode = 0644, @@ -651,7 +652,7 @@ static void __fpsimd_to_sve(void *sst, struct user_fpsimd_state const *fst, * task->thread.uw.fpsimd_state must be up to date before calling this * function. */ -static void fpsimd_to_sve(struct task_struct *task) +static inline void fpsimd_to_sve(struct task_struct *task) { unsigned int vq; void *sst = task->thread.sve_state; @@ -675,7 +676,7 @@ static void fpsimd_to_sve(struct task_struct *task) * bytes of allocated kernel memory. * task->thread.sve_state must be up to date before calling this function. 
*/ -static void sve_to_fpsimd(struct task_struct *task) +static inline void sve_to_fpsimd(struct task_struct *task) { unsigned int vq, vl; void const *sst = task->thread.sve_state; @@ -694,44 +695,39 @@ static void sve_to_fpsimd(struct task_struct *task) } } -void cpu_enable_fpmr(const struct arm64_cpu_capabilities *__always_unused p) +static inline void __fpsimd_zero_vregs(struct user_fpsimd_state *fpsimd) { - write_sysreg_s(read_sysreg_s(SYS_SCTLR_EL1) | SCTLR_EL1_EnFPM_MASK, - SYS_SCTLR_EL1); + memset(&fpsimd->vregs, 0, sizeof(fpsimd->vregs)); } -#ifdef CONFIG_ARM64_SVE /* - * Call __sve_free() directly only if you know task can't be scheduled - * or preempted. + * Simulate the effects of an SMSTOP SM instruction. */ -static void __sve_free(struct task_struct *task) +void task_smstop_sm(struct task_struct *task) { - kfree(task->thread.sve_state); - task->thread.sve_state = NULL; -} + if (!thread_sm_enabled(&task->thread)) + return; -static void sve_free(struct task_struct *task) -{ - WARN_ON(test_tsk_thread_flag(task, TIF_SVE)); + __fpsimd_zero_vregs(&task->thread.uw.fpsimd_state); + task->thread.uw.fpsimd_state.fpsr = 0x0800009f; + if (system_supports_fpmr()) + task->thread.uw.fpmr = 0; - __sve_free(task); + task->thread.svcr &= ~SVCR_SM_MASK; + task->thread.fp_type = FP_STATE_FPSIMD; } -/* - * Return how many bytes of memory are required to store the full SVE - * state for task, given task's currently configured vector length. - */ -size_t sve_state_size(struct task_struct const *task) +void cpu_enable_fpmr(const struct arm64_cpu_capabilities *__always_unused p) { - unsigned int vl = 0; - - if (system_supports_sve()) - vl = task_get_sve_vl(task); - if (system_supports_sme()) - vl = max(vl, task_get_sme_vl(task)); + write_sysreg_s(read_sysreg_s(SYS_SCTLR_EL1) | SCTLR_EL1_EnFPM_MASK, + SYS_SCTLR_EL1); +} - return SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)); +#ifdef CONFIG_ARM64_SVE +static void sve_free(struct task_struct *task) +{ + kfree(task->thread.sve_state); + task->thread.sve_state = NULL; } /* @@ -758,69 +754,34 @@ void sve_alloc(struct task_struct *task, bool flush) kzalloc(sve_state_size(task), GFP_KERNEL); } - /* - * Force the FPSIMD state shared with SVE to be updated in the SVE state - * even if the SVE state is the current active state. + * Ensure that task->thread.uw.fpsimd_state is up to date with respect to the + * task's currently effective FPSIMD/SVE state. * - * This should only be called by ptrace. task must be non-runnable. - * task->thread.sve_state must point to at least sve_state_size(task) - * bytes of allocated kernel memory. + * The task's FPSIMD/SVE/SME state must not be subject to concurrent + * manipulation. */ -void fpsimd_force_sync_to_sve(struct task_struct *task) -{ - fpsimd_to_sve(task); -} - -/* - * Ensure that task->thread.sve_state is up to date with respect to - * the user task, irrespective of when SVE is in use or not. - * - * This should only be called by ptrace. task must be non-runnable. - * task->thread.sve_state must point to at least sve_state_size(task) - * bytes of allocated kernel memory. - */ -void fpsimd_sync_to_sve(struct task_struct *task) -{ - if (!test_tsk_thread_flag(task, TIF_SVE) && - !thread_sm_enabled(&task->thread)) - fpsimd_to_sve(task); -} - -/* - * Ensure that task->thread.uw.fpsimd_state is up to date with respect to - * the user task, irrespective of whether SVE is in use or not. - * - * This should only be called by ptrace. task must be non-runnable. 
- * task->thread.sve_state must point to at least sve_state_size(task) - * bytes of allocated kernel memory. - */ -void sve_sync_to_fpsimd(struct task_struct *task) +void fpsimd_sync_from_effective_state(struct task_struct *task) { if (task->thread.fp_type == FP_STATE_SVE) sve_to_fpsimd(task); } /* - * Ensure that task->thread.sve_state is up to date with respect to - * the task->thread.uw.fpsimd_state. + * Ensure that the task's currently effective FPSIMD/SVE state is up to date + * with respect to task->thread.uw.fpsimd_state, zeroing any effective + * non-FPSIMD (S)SVE state. * - * This should only be called by ptrace to merge new FPSIMD register - * values into a task for which SVE is currently active. - * task must be non-runnable. - * task->thread.sve_state must point to at least sve_state_size(task) - * bytes of allocated kernel memory. - * task->thread.uw.fpsimd_state must already have been initialised with - * the new FPSIMD register values to be merged in. + * The task's FPSIMD/SVE/SME state must not be subject to concurrent + * manipulation. */ -void sve_sync_from_fpsimd_zeropad(struct task_struct *task) +void fpsimd_sync_to_effective_state_zeropad(struct task_struct *task) { unsigned int vq; void *sst = task->thread.sve_state; struct user_fpsimd_state const *fst = &task->thread.uw.fpsimd_state; - if (!test_tsk_thread_flag(task, TIF_SVE) && - !thread_sm_enabled(&task->thread)) + if (task->thread.fp_type != FP_STATE_SVE) return; vq = sve_vq_from_vl(thread_get_cur_vl(&task->thread)); @@ -829,10 +790,73 @@ void sve_sync_from_fpsimd_zeropad(struct task_struct *task) __fpsimd_to_sve(sst, fst, vq); } +static int change_live_vector_length(struct task_struct *task, + enum vec_type type, + unsigned long vl) +{ + unsigned int sve_vl = task_get_sve_vl(task); + unsigned int sme_vl = task_get_sme_vl(task); + void *sve_state = NULL, *sme_state = NULL; + + if (type == ARM64_VEC_SME) + sme_vl = vl; + else + sve_vl = vl; + + /* + * Allocate the new sve_state and sme_state before freeing the old + * copies so that allocation failure can be handled without needing to + * mutate the task's state in any way. + * + * Changes to the SVE vector length must not discard live ZA state or + * clear PSTATE.ZA, as userspace code which is unaware of the AAPCS64 + * ZA lazy saving scheme may attempt to change the SVE vector length + * while unsaved/dormant ZA state exists. + */ + sve_state = kzalloc(__sve_state_size(sve_vl, sme_vl), GFP_KERNEL); + if (!sve_state) + goto out_mem; + + if (type == ARM64_VEC_SME) { + sme_state = kzalloc(__sme_state_size(sme_vl), GFP_KERNEL); + if (!sme_state) + goto out_mem; + } + + if (task == current) + fpsimd_save_and_flush_current_state(); + else + fpsimd_flush_task_state(task); + + /* + * Always preserve PSTATE.SM and the effective FPSIMD state, zeroing + * other SVE state. 
+ */ + fpsimd_sync_from_effective_state(task); + task_set_vl(task, type, vl); + kfree(task->thread.sve_state); + task->thread.sve_state = sve_state; + fpsimd_sync_to_effective_state_zeropad(task); + + if (type == ARM64_VEC_SME) { + task->thread.svcr &= ~SVCR_ZA_MASK; + kfree(task->thread.sme_state); + task->thread.sme_state = sme_state; + } + + return 0; + +out_mem: + kfree(sve_state); + kfree(sme_state); + return -ENOMEM; +} + int vec_set_vector_length(struct task_struct *task, enum vec_type type, unsigned long vl, unsigned long flags) { - bool free_sme = false; + bool onexec = flags & PR_SVE_SET_VL_ONEXEC; + bool inherit = flags & PR_SVE_VL_INHERIT; if (flags & ~(unsigned long)(PR_SVE_VL_INHERIT | PR_SVE_SET_VL_ONEXEC)) @@ -852,71 +876,17 @@ int vec_set_vector_length(struct task_struct *task, enum vec_type type, vl = find_supported_vector_length(type, vl); - if (flags & (PR_SVE_VL_INHERIT | - PR_SVE_SET_VL_ONEXEC)) + if (!onexec && vl != task_get_vl(task, type)) { + if (change_live_vector_length(task, type, vl)) + return -ENOMEM; + } + + if (onexec || inherit) task_set_vl_onexec(task, type, vl); else /* Reset VL to system default on next exec: */ task_set_vl_onexec(task, type, 0); - /* Only actually set the VL if not deferred: */ - if (flags & PR_SVE_SET_VL_ONEXEC) - goto out; - - if (vl == task_get_vl(task, type)) - goto out; - - /* - * To ensure the FPSIMD bits of the SVE vector registers are preserved, - * write any live register state back to task_struct, and convert to a - * regular FPSIMD thread. - */ - if (task == current) { - get_cpu_fpsimd_context(); - - fpsimd_save_user_state(); - } - - fpsimd_flush_task_state(task); - if (test_and_clear_tsk_thread_flag(task, TIF_SVE) || - thread_sm_enabled(&task->thread)) { - sve_to_fpsimd(task); - task->thread.fp_type = FP_STATE_FPSIMD; - } - - if (system_supports_sme()) { - if (type == ARM64_VEC_SME || - !(task->thread.svcr & (SVCR_SM_MASK | SVCR_ZA_MASK))) { - /* - * We are changing the SME VL or weren't using - * SME anyway, discard the state and force a - * reallocation. - */ - task->thread.svcr &= ~(SVCR_SM_MASK | - SVCR_ZA_MASK); - clear_tsk_thread_flag(task, TIF_SME); - free_sme = true; - } - } - - if (task == current) - put_cpu_fpsimd_context(); - - task_set_vl(task, type, vl); - - /* - * Free the changed states if they are not in use, SME will be - * reallocated to the correct size on next use and we just - * allocate SVE now in case it is needed for use in streaming - * mode. 
- */ - sve_free(task); - sve_alloc(task, true); - - if (free_sme) - sme_free(task); - -out: update_tsk_thread_flag(task, vec_vl_inherit_flag(type), flags & PR_SVE_VL_INHERIT); @@ -1131,15 +1101,15 @@ static void __init sve_efi_setup(void) if (!sve_vl_valid(max_vl)) goto fail; - efi_sve_state = __alloc_percpu( - SVE_SIG_REGS_SIZE(sve_vq_from_vl(max_vl)), SVE_VQ_BYTES); + efi_sve_state = kmalloc(SVE_SIG_REGS_SIZE(sve_vq_from_vl(max_vl)), + GFP_KERNEL); if (!efi_sve_state) goto fail; return; fail: - panic("Cannot allocate percpu memory for EFI SVE save/restore"); + panic("Cannot allocate memory for EFI SVE save/restore"); } void cpu_enable_sve(const struct arm64_cpu_capabilities *__always_unused p) @@ -1212,7 +1182,7 @@ void __init sve_setup(void) */ void fpsimd_release_task(struct task_struct *dead_task) { - __sve_free(dead_task); + sve_free(dead_task); sme_free(dead_task); } @@ -1367,6 +1337,7 @@ static void sve_init_regs(void) } else { fpsimd_to_sve(current); current->thread.fp_type = FP_STATE_SVE; + fpsimd_flush_task_state(current); } } @@ -1435,7 +1406,7 @@ void do_sme_acc(unsigned long esr, struct pt_regs *regs) * If this not a trap due to SME being disabled then something * is being used in the wrong mode, report as SIGILL. */ - if (ESR_ELx_ISS(esr) != ESR_ELx_SME_ISS_SME_DISABLED) { + if (ESR_ELx_SME_ISS_SMTC(esr) != ESR_ELx_SME_ISS_SMTC_SME_DISABLED) { force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); return; } @@ -1459,6 +1430,8 @@ void do_sme_acc(unsigned long esr, struct pt_regs *regs) sme_set_vq(vq_minus_one); fpsimd_bind_task_to_cpu(); + } else { + fpsimd_flush_task_state(current); } put_cpu_fpsimd_context(); @@ -1572,8 +1545,8 @@ void fpsimd_thread_switch(struct task_struct *next) fpsimd_save_user_state(); if (test_tsk_thread_flag(next, TIF_KERNEL_FPSTATE)) { - fpsimd_load_kernel_state(next); fpsimd_flush_cpu_state(); + fpsimd_load_kernel_state(next); } else { /* * Fix up TIF_FOREIGN_FPSTATE to correctly describe next's @@ -1660,6 +1633,9 @@ void fpsimd_flush_thread(void) current->thread.svcr = 0; } + if (system_supports_fpmr()) + current->thread.uw.fpmr = 0; + current->thread.fp_type = FP_STATE_FPSIMD; put_cpu_fpsimd_context(); @@ -1682,43 +1658,6 @@ void fpsimd_preserve_current_state(void) } /* - * Like fpsimd_preserve_current_state(), but ensure that - * current->thread.uw.fpsimd_state is updated so that it can be copied to - * the signal frame. - */ -void fpsimd_signal_preserve_current_state(void) -{ - fpsimd_preserve_current_state(); - if (current->thread.fp_type == FP_STATE_SVE) - sve_to_fpsimd(current); -} - -/* - * Called by KVM when entering the guest. - */ -void fpsimd_kvm_prepare(void) -{ - if (!system_supports_sve()) - return; - - /* - * KVM does not save host SVE state since we can only enter - * the guest from a syscall so the ABI means that only the - * non-saved SVE state needs to be saved. If we have left - * SVE enabled for performance reasons then update the task - * state to be FPSIMD only. - */ - get_cpu_fpsimd_context(); - - if (test_and_clear_thread_flag(TIF_SVE)) { - sve_to_fpsimd(current); - current->thread.fp_type = FP_STATE_FPSIMD; - } - - put_cpu_fpsimd_context(); -} - -/* * Associate current's FPSIMD context with this cpu * The caller must have ownership of the cpu FPSIMD context before calling * this function. 
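/*
 * Editor's illustrative sketch, not part of the patch: a standalone
 * model of the allocate-before-swap pattern used by
 * change_live_vector_length() above. The replacement buffers are
 * allocated before any task state is touched, so an allocation
 * failure can be reported while leaving the old state fully intact.
 * struct vec_state, change_vl() and the "vl * 16" buffer size are
 * invented for this example.
 */
#include <stdio.h>
#include <stdlib.h>

struct vec_state {
	size_t vl;		/* current vector length */
	void *regs;		/* register storage sized for vl */
};

static int change_vl(struct vec_state *st, size_t new_vl)
{
	/* Allocate the replacement buffer before mutating *st. */
	void *new_regs = calloc(new_vl, 16);

	if (!new_regs)
		return -1;	/* *st is still valid and unchanged */

	/* Only now discard the old buffer and publish the new state. */
	free(st->regs);
	st->regs = new_regs;
	st->vl = new_vl;
	return 0;
}

int main(void)
{
	struct vec_state st = { .vl = 16, .regs = calloc(16, 16) };

	if (change_vl(&st, 32))
		perror("change_vl");
	printf("vl is now %zu\n", st.vl);
	free(st.regs);
	return 0;
}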
@@ -1810,30 +1749,14 @@ void fpsimd_restore_current_state(void) put_cpu_fpsimd_context(); } -/* - * Load an updated userland FPSIMD state for 'current' from memory and set the - * flag that indicates that the FPSIMD register contents are the most recent - * FPSIMD state of 'current'. This is used by the signal code to restore the - * register state when returning from a signal handler in FPSIMD only cases, - * any SVE context will be discarded. - */ void fpsimd_update_current_state(struct user_fpsimd_state const *state) { if (WARN_ON(!system_supports_fpsimd())) return; - get_cpu_fpsimd_context(); - current->thread.uw.fpsimd_state = *state; - if (test_thread_flag(TIF_SVE)) + if (current->thread.fp_type == FP_STATE_SVE) fpsimd_to_sve(current); - - task_fpsimd_load(); - fpsimd_bind_task_to_cpu(); - - clear_thread_flag(TIF_FOREIGN_FPSTATE); - - put_cpu_fpsimd_context(); } /* @@ -1863,6 +1786,17 @@ void fpsimd_flush_task_state(struct task_struct *t) barrier(); } +void fpsimd_save_and_flush_current_state(void) +{ + if (!system_supports_fpsimd()) + return; + + get_cpu_fpsimd_context(); + fpsimd_save_user_state(); + fpsimd_flush_task_state(current); + put_cpu_fpsimd_context(); +} + /* * Save the FPSIMD state to memory and invalidate cpu view. * This function must be called with preemption disabled. @@ -1972,10 +1906,10 @@ EXPORT_SYMBOL_GPL(kernel_neon_end); #ifdef CONFIG_EFI -static DEFINE_PER_CPU(struct user_fpsimd_state, efi_fpsimd_state); -static DEFINE_PER_CPU(bool, efi_fpsimd_state_used); -static DEFINE_PER_CPU(bool, efi_sve_state_used); -static DEFINE_PER_CPU(bool, efi_sm_state); +static struct user_fpsimd_state efi_fpsimd_state; +static bool efi_fpsimd_state_used; +static bool efi_sve_state_used; +static bool efi_sm_state; /* * EFI runtime services support functions @@ -2008,18 +1942,16 @@ void __efi_fpsimd_begin(void) * If !efi_sve_state, SVE can't be in use yet and doesn't need * preserving: */ - if (system_supports_sve() && likely(efi_sve_state)) { - char *sve_state = this_cpu_ptr(efi_sve_state); + if (system_supports_sve() && efi_sve_state != NULL) { bool ffr = true; u64 svcr; - __this_cpu_write(efi_sve_state_used, true); + efi_sve_state_used = true; if (system_supports_sme()) { svcr = read_sysreg_s(SYS_SVCR); - __this_cpu_write(efi_sm_state, - svcr & SVCR_SM_MASK); + efi_sm_state = svcr & SVCR_SM_MASK; /* * Unless we have FA64 FFR does not @@ -2029,19 +1961,18 @@ void __efi_fpsimd_begin(void) ffr = !(svcr & SVCR_SM_MASK); } - sve_save_state(sve_state + sve_ffr_offset(sve_max_vl()), - &this_cpu_ptr(&efi_fpsimd_state)->fpsr, - ffr); + sve_save_state(efi_sve_state + sve_ffr_offset(sve_max_vl()), + &efi_fpsimd_state.fpsr, ffr); if (system_supports_sme()) sysreg_clear_set_s(SYS_SVCR, SVCR_SM_MASK, 0); } else { - fpsimd_save_state(this_cpu_ptr(&efi_fpsimd_state)); + fpsimd_save_state(&efi_fpsimd_state); } - __this_cpu_write(efi_fpsimd_state_used, true); + efi_fpsimd_state_used = true; } } @@ -2053,12 +1984,10 @@ void __efi_fpsimd_end(void) if (!system_supports_fpsimd()) return; - if (!__this_cpu_xchg(efi_fpsimd_state_used, false)) { + if (!efi_fpsimd_state_used) { kernel_neon_end(); } else { - if (system_supports_sve() && - likely(__this_cpu_read(efi_sve_state_used))) { - char const *sve_state = this_cpu_ptr(efi_sve_state); + if (system_supports_sve() && efi_sve_state_used) { bool ffr = true; /* @@ -2067,7 +1996,7 @@ void __efi_fpsimd_end(void) * streaming mode. 
*/ if (system_supports_sme()) { - if (__this_cpu_read(efi_sm_state)) { + if (efi_sm_state) { sysreg_clear_set_s(SYS_SVCR, 0, SVCR_SM_MASK); @@ -2081,14 +2010,15 @@ void __efi_fpsimd_end(void) } } - sve_load_state(sve_state + sve_ffr_offset(sve_max_vl()), - &this_cpu_ptr(&efi_fpsimd_state)->fpsr, - ffr); + sve_load_state(efi_sve_state + sve_ffr_offset(sve_max_vl()), + &efi_fpsimd_state.fpsr, ffr); - __this_cpu_write(efi_sve_state_used, false); + efi_sve_state_used = false; } else { - fpsimd_load_state(this_cpu_ptr(&efi_fpsimd_state)); + fpsimd_load_state(&efi_fpsimd_state); } + + efi_fpsimd_state_used = false; } } diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index a650f5e11fc5..5a890714ee2e 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -15,7 +15,7 @@ #include <asm/debug-monitors.h> #include <asm/ftrace.h> #include <asm/insn.h> -#include <asm/patching.h> +#include <asm/text-patching.h> #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS struct fregs_offset { @@ -23,10 +23,10 @@ struct fregs_offset { int offset; }; -#define FREGS_OFFSET(n, field) \ -{ \ - .name = n, \ - .offset = offsetof(struct ftrace_regs, field), \ +#define FREGS_OFFSET(n, field) \ +{ \ + .name = n, \ + .offset = offsetof(struct __arch_ftrace_regs, field), \ } static const struct fregs_offset fregs_offsets[] = { @@ -143,6 +143,69 @@ unsigned long ftrace_call_adjust(unsigned long addr) return addr; } +/* Convert fentry_ip to the symbol address without kallsyms */ +unsigned long arch_ftrace_get_symaddr(unsigned long fentry_ip) +{ + u32 insn; + + /* + * When using patchable-function-entry without pre-function NOPS, ftrace + * entry is the address of the first NOP after the function entry point. + * + * The compiler has either generated: + * + * func+00: func: NOP // To be patched to MOV X9, LR + * func+04: NOP // To be patched to BL <caller> + * + * Or: + * + * func-04: BTI C + * func+00: func: NOP // To be patched to MOV X9, LR + * func+04: NOP // To be patched to BL <caller> + * + * The fentry_ip is the address of `BL <caller>` which is at `func + 4` + * bytes in either case. + */ + if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS)) + return fentry_ip - AARCH64_INSN_SIZE; + + /* + * When using patchable-function-entry with pre-function NOPs, BTI is + * a bit different. + * + * func+00: func: NOP // To be patched to MOV X9, LR + * func+04: NOP // To be patched to BL <caller> + * + * Or: + * + * func+00: func: BTI C + * func+04: NOP // To be patched to MOV X9, LR + * func+08: NOP // To be patched to BL <caller> + * + * The fentry_ip is the address of `BL <caller>` which is at either + * `func + 4` or `func + 8` depends on whether there is a BTI. + */ + + /* If there is no BTI, the func address should be one instruction before. */ + if (!IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) + return fentry_ip - AARCH64_INSN_SIZE; + + /* We want to be extra safe in case entry ip is on the page edge, + * but otherwise we need to avoid get_kernel_nofault()'s overhead. + */ + if ((fentry_ip & ~PAGE_MASK) < AARCH64_INSN_SIZE * 2) { + if (get_kernel_nofault(insn, (u32 *)(fentry_ip - AARCH64_INSN_SIZE * 2))) + return 0; + } else { + insn = *(u32 *)(fentry_ip - AARCH64_INSN_SIZE * 2); + } + + if (aarch64_insn_is_bti(le32_to_cpu((__le32)insn))) + return fentry_ip - AARCH64_INSN_SIZE * 2; + + return fentry_ip - AARCH64_INSN_SIZE; +} + /* * Replace a single instruction, which may be a branch or NOP. * If @validate == true, a replaced instruction is checked against 'old'. 
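/*
 * Editor's illustrative sketch, not part of the patch: a standalone
 * model of the offset arithmetic in arch_ftrace_get_symaddr() above,
 * for the CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS=y and
 * CONFIG_ARM64_BTI_KERNEL=y case. is_bti() is a simplified stand-in
 * for aarch64_insn_is_bti() (it matches the BTI/BTI C/BTI J/BTI JC
 * HINT encodings), and the fentry_ip value below is made up.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define INSN_SIZE 4u

static bool is_bti(uint32_t insn)
{
	return (insn & 0xffffff3fu) == 0xd503241fu;
}

/* @insn_before_nops is the instruction read from fentry_ip - 8. */
static uint64_t symaddr(uint64_t fentry_ip, uint32_t insn_before_nops)
{
	if (is_bti(insn_before_nops))
		return fentry_ip - 2 * INSN_SIZE;	/* BTI C, NOP, NOP */
	return fentry_ip - INSN_SIZE;			/* NOP, NOP only */
}

int main(void)
{
	uint64_t fentry_ip = 0xffff800080001008ull;

	printf("no BTI: func = %#llx\n",
	       (unsigned long long)symaddr(fentry_ip, 0xd503201fu));	/* NOP */
	printf("BTI C : func = %#llx\n",
	       (unsigned long long)symaddr(fentry_ip, 0xd503245fu));	/* BTI C */
	return 0;
}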
@@ -257,14 +320,13 @@ static bool ftrace_find_callable_addr(struct dyn_ftrace *rec, * dealing with an out-of-range condition, we can assume it * is due to a module being loaded far away from the kernel. * - * NOTE: __module_text_address() must be called with preemption - * disabled, but we can rely on ftrace_lock to ensure that 'mod' + * NOTE: __module_text_address() must be called within a RCU read + * section, but we can rely on ftrace_lock to ensure that 'mod' * retains its validity throughout the remainder of this code. */ if (!mod) { - preempt_disable(); + guard(rcu)(); mod = __module_text_address(pc); - preempt_enable(); } if (WARN_ON(!mod)) @@ -481,7 +543,20 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { - prepare_ftrace_return(ip, &fregs->lr, fregs->fp); + unsigned long return_hooker = (unsigned long)&return_to_handler; + unsigned long frame_pointer = arch_ftrace_regs(fregs)->fp; + unsigned long *parent = &arch_ftrace_regs(fregs)->lr; + unsigned long old; + + if (unlikely(atomic_read(¤t->tracing_graph_pause))) + return; + + old = *parent; + + if (!function_graph_enter_regs(old, ip, frame_pointer, + (void *)frame_pointer, fregs)) { + *parent = return_hooker; + } } #else /* diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index cb68adcabe07..ca04b338cb0d 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -32,6 +32,7 @@ #include <asm/scs.h> #include <asm/smp.h> #include <asm/sysreg.h> +#include <asm/stacktrace/frame.h> #include <asm/thread_info.h> #include <asm/virt.h> @@ -88,7 +89,7 @@ SYM_CODE_START(primary_entry) adrp x1, early_init_stack mov sp, x1 mov x29, xzr - adrp x0, init_idmap_pg_dir + adrp x0, __pi_init_idmap_pg_dir mov x1, xzr bl __pi_create_init_idmap @@ -100,7 +101,7 @@ SYM_CODE_START(primary_entry) cbnz x19, 0f dmb sy mov x1, x0 // end of used region - adrp x0, init_idmap_pg_dir + adrp x0, __pi_init_idmap_pg_dir adr_l x2, dcache_inval_poc blr x2 b 1f @@ -199,6 +200,8 @@ SYM_CODE_END(preserve_boot_args) sub sp, sp, #PT_REGS_SIZE stp xzr, xzr, [sp, #S_STACKFRAME] + mov \tmp1, #FRAME_META_TYPE_FINAL + str \tmp1, [sp, #S_STACKFRAME_TYPE] add x29, sp, #S_STACKFRAME scs_load_current @@ -295,25 +298,8 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) msr sctlr_el2, x0 isb 0: - mov_q x0, HCR_HOST_NVHE_FLAGS - - /* - * Compliant CPUs advertise their VHE-onlyness with - * ID_AA64MMFR4_EL1.E2H0 < 0. HCR_EL2.E2H can be - * RES1 in that case. Publish the E2H bit early so that - * it can be picked up by the init_el2_state macro. - * - * Fruity CPUs seem to have HCR_EL2.E2H set to RAO/WI, but - * don't advertise it (they predate this relaxation). 
- */ - mrs_s x1, SYS_ID_AA64MMFR4_EL1 - tbz x1, #(ID_AA64MMFR4_EL1_E2H0_SHIFT + ID_AA64MMFR4_EL1_E2H0_WIDTH - 1), 1f - - orr x0, x0, #HCR_E2H -1: - msr hcr_el2, x0 - isb + init_el2_hcr HCR_HOST_NVHE_FLAGS init_el2_state /* Hypervisor stub */ @@ -336,7 +322,8 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) msr sctlr_el1, x1 mov x2, xzr 3: - __init_el2_nvhe_prepare_eret + mov x0, #INIT_PSTATE_EL1 + msr spsr_el2, x0 mov w0, #BOOT_CPU_MODE_EL2 orr x0, x0, x2 @@ -520,7 +507,7 @@ SYM_FUNC_END(__no_granule_support) SYM_FUNC_START_LOCAL(__primary_switch) adrp x1, reserved_pg_dir - adrp x2, init_idmap_pg_dir + adrp x2, __pi_init_idmap_pg_dir bl __enable_mmu adrp x1, early_init_stack diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 7b11d84f533c..18749e9a6c2d 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -266,9 +266,15 @@ static int swsusp_mte_save_tags(void) max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { struct page *page = pfn_to_online_page(pfn); + struct folio *folio; if (!page) continue; + folio = page_folio(page); + + if (folio_test_hugetlb(folio) && + !folio_test_hugetlb_mte_tagged(folio)) + continue; if (!page_mte_tagged(page)) continue; diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 65f76064c86b..36e2d26b54f5 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -97,7 +97,7 @@ SYM_CODE_START_LOCAL(__finalise_el2) 2: // Engage the VHE magic! mov_q x0, HCR_HOST_VHE_FLAGS - msr hcr_el2, x0 + msr_hcr_el2 x0 isb // Use the EL1 allocated stack, per-cpu offset @@ -114,8 +114,8 @@ SYM_CODE_START_LOCAL(__finalise_el2) // Use EL2 translations for SPE & TRBE and disable access from EL1 mrs x0, mdcr_el2 - bic x0, x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) - bic x0, x0, #(MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT) + bic x0, x0, #MDCR_EL2_E2PB_MASK + bic x0, x0, #MDCR_EL2_E2TB_MASK msr mdcr_el2, x0 // Transfer the MM state from EL1 to EL2 diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 8f5422ed1b75..714b0b5ec5ac 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -10,6 +10,16 @@ #error This file should only be included in vmlinux.lds.S #endif +#if defined(CONFIG_LD_IS_LLD) && CONFIG_LLD_VERSION < 210000 +#define ASSERT(...) 
+#endif + +#define PI_EXPORT_SYM(sym) \ + __PI_EXPORT_SYM(sym, __pi_ ## sym, Cannot export BSS symbol sym to startup code) +#define __PI_EXPORT_SYM(sym, pisym, msg)\ + PROVIDE(pisym = sym); \ + ASSERT((sym - KIMAGE_VADDR) < (__bss_start - KIMAGE_VADDR), #msg) + PROVIDE(__efistub_primary_entry = primary_entry); /* @@ -36,40 +46,30 @@ PROVIDE(__pi___memcpy = __pi_memcpy); PROVIDE(__pi___memmove = __pi_memmove); PROVIDE(__pi___memset = __pi_memset); -PROVIDE(__pi_id_aa64isar1_override = id_aa64isar1_override); -PROVIDE(__pi_id_aa64isar2_override = id_aa64isar2_override); -PROVIDE(__pi_id_aa64mmfr0_override = id_aa64mmfr0_override); -PROVIDE(__pi_id_aa64mmfr1_override = id_aa64mmfr1_override); -PROVIDE(__pi_id_aa64mmfr2_override = id_aa64mmfr2_override); -PROVIDE(__pi_id_aa64pfr0_override = id_aa64pfr0_override); -PROVIDE(__pi_id_aa64pfr1_override = id_aa64pfr1_override); -PROVIDE(__pi_id_aa64smfr0_override = id_aa64smfr0_override); -PROVIDE(__pi_id_aa64zfr0_override = id_aa64zfr0_override); -PROVIDE(__pi_arm64_sw_feature_override = arm64_sw_feature_override); -PROVIDE(__pi_arm64_use_ng_mappings = arm64_use_ng_mappings); -#ifdef CONFIG_CAVIUM_ERRATUM_27456 -PROVIDE(__pi_cavium_erratum_27456_cpus = cavium_erratum_27456_cpus); -#endif -PROVIDE(__pi__ctype = _ctype); -PROVIDE(__pi_memstart_offset_seed = memstart_offset_seed); - -PROVIDE(__pi_init_idmap_pg_dir = init_idmap_pg_dir); -PROVIDE(__pi_init_idmap_pg_end = init_idmap_pg_end); -PROVIDE(__pi_init_pg_dir = init_pg_dir); -PROVIDE(__pi_init_pg_end = init_pg_end); -PROVIDE(__pi_swapper_pg_dir = swapper_pg_dir); - -PROVIDE(__pi__text = _text); -PROVIDE(__pi__stext = _stext); -PROVIDE(__pi__etext = _etext); -PROVIDE(__pi___start_rodata = __start_rodata); -PROVIDE(__pi___inittext_begin = __inittext_begin); -PROVIDE(__pi___inittext_end = __inittext_end); -PROVIDE(__pi___initdata_begin = __initdata_begin); -PROVIDE(__pi___initdata_end = __initdata_end); -PROVIDE(__pi__data = _data); -PROVIDE(__pi___bss_start = __bss_start); -PROVIDE(__pi__end = _end); +PI_EXPORT_SYM(id_aa64isar1_override); +PI_EXPORT_SYM(id_aa64isar2_override); +PI_EXPORT_SYM(id_aa64mmfr0_override); +PI_EXPORT_SYM(id_aa64mmfr1_override); +PI_EXPORT_SYM(id_aa64mmfr2_override); +PI_EXPORT_SYM(id_aa64pfr0_override); +PI_EXPORT_SYM(id_aa64pfr1_override); +PI_EXPORT_SYM(id_aa64smfr0_override); +PI_EXPORT_SYM(id_aa64zfr0_override); +PI_EXPORT_SYM(arm64_sw_feature_override); +PI_EXPORT_SYM(arm64_use_ng_mappings); +PI_EXPORT_SYM(_ctype); + +PI_EXPORT_SYM(swapper_pg_dir); + +PI_EXPORT_SYM(_text); +PI_EXPORT_SYM(_stext); +PI_EXPORT_SYM(_etext); +PI_EXPORT_SYM(__start_rodata); +PI_EXPORT_SYM(__inittext_begin); +PI_EXPORT_SYM(__inittext_end); +PI_EXPORT_SYM(__initdata_begin); +PI_EXPORT_SYM(__initdata_end); +PI_EXPORT_SYM(_data); #ifdef CONFIG_KVM @@ -105,15 +105,13 @@ KVM_NVHE_ALIAS(__hyp_stub_vectors); KVM_NVHE_ALIAS(vgic_v2_cpuif_trap); KVM_NVHE_ALIAS(vgic_v3_cpuif_trap); +/* Static key which is set if CNTVOFF_EL2 is unusable */ +KVM_NVHE_ALIAS(broken_cntvoff_key); + /* EL2 exception handling */ KVM_NVHE_ALIAS(__start___kvm_ex_table); KVM_NVHE_ALIAS(__stop___kvm_ex_table); -/* PMU available static key */ -#ifdef CONFIG_HW_PERF_EVENTS -KVM_NVHE_ALIAS(kvm_arm_pmu_available); -#endif - /* Position-independent library routines */ KVM_NVHE_ALIAS_HYP(clear_page, __pi_clear_page); KVM_NVHE_ALIAS_HYP(copy_page, __pi_copy_page); @@ -132,6 +130,8 @@ KVM_NVHE_ALIAS(__hyp_text_start); KVM_NVHE_ALIAS(__hyp_text_end); KVM_NVHE_ALIAS(__hyp_bss_start); KVM_NVHE_ALIAS(__hyp_bss_end); 
+KVM_NVHE_ALIAS(__hyp_data_start); +KVM_NVHE_ALIAS(__hyp_data_end); KVM_NVHE_ALIAS(__hyp_rodata_start); KVM_NVHE_ALIAS(__hyp_rodata_end); @@ -144,4 +144,17 @@ KVM_NVHE_ALIAS(kvm_protected_mode_initialized); _kernel_codesize = ABSOLUTE(__inittext_end - _text); #endif +/* + * LLD will occasionally error out with a '__init_end does not converge' error + * if INIT_IDMAP_DIR_SIZE is defined in terms of _end, as this results in a + * circular dependency. Counter this by dimensioning the initial IDMAP page + * tables based on kimage_limit, which is defined such that its value should + * not change as a result of the initdata segment being pushed over a 64k + * segment boundary due to changes in INIT_IDMAP_DIR_SIZE, provided that its + * value doesn't change by more than 2M between linker passes. + */ +kimage_limit = ALIGN(ABSOLUTE(_end + SZ_64K), SZ_2M); + +#undef ASSERT + #endif /* __ARM64_KERNEL_IMAGE_VARS_H */ diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c index ef48089fbfe1..fe86ada23c7d 100644 --- a/arch/arm64/kernel/io.c +++ b/arch/arm64/kernel/io.c @@ -10,34 +10,6 @@ #include <linux/io.h> /* - * Copy data from IO memory space to "real" memory space. - */ -void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count) -{ - while (count && !IS_ALIGNED((unsigned long)from, 8)) { - *(u8 *)to = __raw_readb(from); - from++; - to++; - count--; - } - - while (count >= 8) { - *(u64 *)to = __raw_readq(from); - from += 8; - to += 8; - count -= 8; - } - - while (count) { - *(u8 *)to = __raw_readb(from); - from++; - to++; - count--; - } -} -EXPORT_SYMBOL(__memcpy_fromio); - -/* * This generates a memcpy that works on a from/to address which is aligned to * bits. Count is in terms of the number of bits sized quantities to copy. It * optimizes to use the STR groupings when possible so that it is WC friendly. @@ -78,62 +50,3 @@ void __iowrite32_copy_full(void __iomem *to, const void *from, size_t count) dgh(); } EXPORT_SYMBOL(__iowrite32_copy_full); - -/* - * Copy data from "real" memory space to IO memory space. - */ -void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count) -{ - while (count && !IS_ALIGNED((unsigned long)to, 8)) { - __raw_writeb(*(u8 *)from, to); - from++; - to++; - count--; - } - - while (count >= 8) { - __raw_writeq(*(u64 *)from, to); - from += 8; - to += 8; - count -= 8; - } - - while (count) { - __raw_writeb(*(u8 *)from, to); - from++; - to++; - count--; - } -} -EXPORT_SYMBOL(__memcpy_toio); - -/* - * "memset" on IO memory space. 
- */ -void __memset_io(volatile void __iomem *dst, int c, size_t count) -{ - u64 qc = (u8)c; - - qc |= qc << 8; - qc |= qc << 16; - qc |= qc << 32; - - while (count && !IS_ALIGNED((unsigned long)dst, 8)) { - __raw_writeb(c, dst); - dst++; - count--; - } - - while (count >= 8) { - __raw_writeq(qc, dst); - dst += 8; - count -= 8; - } - - while (count) { - __raw_writeb(c, dst); - dst++; - count--; - } -} -EXPORT_SYMBOL(__memset_io); diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c index f63ea915d6ad..b345425193d2 100644 --- a/arch/arm64/kernel/jump_label.c +++ b/arch/arm64/kernel/jump_label.c @@ -9,7 +9,7 @@ #include <linux/jump_label.h> #include <linux/smp.h> #include <asm/insn.h> -#include <asm/patching.h> +#include <asm/text-patching.h> bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type) diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 1da3e25f9d9e..c9503ed45a6c 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -10,8 +10,6 @@ #include <asm/cpufeature.h> #include <asm/memory.h> -u16 __initdata memstart_offset_seed; - bool __ro_after_init __kaslr_is_enabled = false; void __init kaslr_init(void) diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c index 4e1f983df3d1..f3c4d3a8a20f 100644 --- a/arch/arm64/kernel/kgdb.c +++ b/arch/arm64/kernel/kgdb.c @@ -17,7 +17,7 @@ #include <asm/debug-monitors.h> #include <asm/insn.h> -#include <asm/patching.h> +#include <asm/text-patching.h> #include <asm/traps.h> struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index 82e2203d86a3..6f121a0164a4 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -207,37 +207,6 @@ void machine_kexec(struct kimage *kimage) BUG(); /* Should never get here. */ } -static void machine_kexec_mask_interrupts(void) -{ - unsigned int i; - struct irq_desc *desc; - - for_each_irq_desc(i, desc) { - struct irq_chip *chip; - int ret; - - chip = irq_desc_get_chip(desc); - if (!chip) - continue; - - /* - * First try to remove the active state. If this - * fails, try to EOI the interrupt. 
- */ - ret = irq_set_irqchip_state(i, IRQCHIP_STATE_ACTIVE, false); - - if (ret && irqd_irq_inprogress(&desc->irq_data) && - chip->irq_eoi) - chip->irq_eoi(&desc->irq_data); - - if (chip->irq_mask) - chip->irq_mask(&desc->irq_data); - - if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data)) - chip->irq_disable(&desc->irq_data); - } -} - /** * machine_crash_shutdown - shutdown non-crashing cpus and save registers */ diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 36b25af56324..06bb680bfe97 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -462,14 +462,20 @@ int module_finalize(const Elf_Ehdr *hdr, struct module *me) { const Elf_Shdr *s; + int ret; + s = find_section(hdr, sechdrs, ".altinstructions"); if (s) apply_alternatives_module((void *)s->sh_addr, s->sh_size); if (scs_is_dynamic()) { s = find_section(hdr, sechdrs, ".init.eh_frame"); - if (s) - __pi_scs_patch((void *)s->sh_addr, s->sh_size); + if (s) { + ret = __pi_scs_patch((void *)s->sh_addr, s->sh_size); + if (ret) + pr_err("module %s: error occurred during dynamic SCS patching (%d)\n", + me->name, ret); + } } return module_init_ftrace_plt(hdr, sechdrs, me); diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 6174671be7c1..2fbfd27ff5f2 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -38,7 +38,24 @@ EXPORT_SYMBOL_GPL(mte_async_or_asymm_mode); void mte_sync_tags(pte_t pte, unsigned int nr_pages) { struct page *page = pte_page(pte); - unsigned int i; + struct folio *folio = page_folio(page); + unsigned long i; + + if (folio_test_hugetlb(folio)) { + unsigned long nr = folio_nr_pages(folio); + + /* Hugetlb MTE flags are set for head page only */ + if (folio_try_hugetlb_mte_tagging(folio)) { + for (i = 0; i < nr; i++, page++) + mte_clear_page_tags(page_address(page)); + folio_set_hugetlb_mte_tagged(folio); + } + + /* ensure the tags are visible before the PTE is set */ + smp_wmb(); + + return; + } /* if PG_mte_tagged is set, tags have already been initialised */ for (i = 0; i < nr_pages; i++, page++) { @@ -410,6 +427,7 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, void *maddr; struct page *page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); + struct folio *folio; if (IS_ERR(page)) { err = PTR_ERR(page); @@ -428,7 +446,12 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, put_page(page); break; } - WARN_ON_ONCE(!page_mte_tagged(page)); + + folio = page_folio(page); + if (folio_test_hugetlb(folio)) + WARN_ON_ONCE(!folio_test_hugetlb_mte_tagged(folio)); + else + WARN_ON_ONCE(!page_mte_tagged(page)); /* limit access to the end of the page */ offset = offset_in_page(addr); diff --git a/arch/arm64/kernel/patching.c b/arch/arm64/kernel/patching.c index 945df74005c7..1041bc67a3ee 100644 --- a/arch/arm64/kernel/patching.c +++ b/arch/arm64/kernel/patching.c @@ -10,7 +10,7 @@ #include <asm/fixmap.h> #include <asm/insn.h> #include <asm/kprobes.h> -#include <asm/patching.h> +#include <asm/text-patching.h> #include <asm/sections.h> static DEFINE_RAW_SPINLOCK(patch_lock); @@ -30,20 +30,17 @@ static bool is_image_text(unsigned long addr) static void __kprobes *patch_map(void *addr, int fixmap) { - unsigned long uintaddr = (uintptr_t) addr; - bool image = is_image_text(uintaddr); - struct page *page; + phys_addr_t phys; - if (image) - page = phys_to_page(__pa_symbol(addr)); - else if (IS_ENABLED(CONFIG_EXECMEM)) - page = vmalloc_to_page(addr); - else - return addr; + if (is_image_text((unsigned 
long)addr)) { + phys = __pa_symbol(addr); + } else { + struct page *page = vmalloc_to_page(addr); + BUG_ON(!page); + phys = page_to_phys(page) + offset_in_page(addr); + } - BUG_ON(!page); - return (void *)set_fixmap_offset(fixmap, page_to_phys(page) + - (uintaddr & ~PAGE_MASK)); + return (void *)set_fixmap_offset(fixmap, phys); } static void __kprobes patch_unmap(int fixmap) diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index e8ed5673f481..9b7f26b128b5 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -38,31 +38,3 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, arch_stack_walk(callchain_trace, entry, current, regs); } - -unsigned long perf_instruction_pointer(struct pt_regs *regs) -{ - if (perf_guest_state()) - return perf_guest_get_ip(); - - return instruction_pointer(regs); -} - -unsigned long perf_misc_flags(struct pt_regs *regs) -{ - unsigned int guest_state = perf_guest_state(); - int misc = 0; - - if (guest_state) { - if (guest_state & PERF_GUEST_USER) - misc |= PERF_RECORD_MISC_GUEST_USER; - else - misc |= PERF_RECORD_MISC_GUEST_KERNEL; - } else { - if (user_mode(regs)) - misc |= PERF_RECORD_MISC_USER; - else - misc |= PERF_RECORD_MISC_KERNEL; - } - - return misc; -} diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c index 29d4b6244a6f..bc57b290e5e7 100644 --- a/arch/arm64/kernel/pi/idreg-override.c +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -38,6 +38,15 @@ struct ftr_set_desc { #define FIELD(n, s, f) { .name = n, .shift = s, .width = 4, .filter = f } +static const struct ftr_set_desc mmfr0 __prel64_initconst = { + .name = "id_aa64mmfr0", + .override = &id_aa64mmfr0_override, + .fields = { + FIELD("ecv", ID_AA64MMFR0_EL1_ECV_SHIFT, NULL), + {} + }, +}; + static bool __init mmfr1_vh_filter(u64 val) { /* @@ -74,6 +83,15 @@ static bool __init mmfr2_varange_filter(u64 val) id_aa64mmfr0_override.val |= (ID_AA64MMFR0_EL1_TGRAN_LPA2 - 1) << ID_AA64MMFR0_EL1_TGRAN_SHIFT; id_aa64mmfr0_override.mask |= 0xfU << ID_AA64MMFR0_EL1_TGRAN_SHIFT; + + /* + * Override PARange to 48 bits - the override will just be + * ignored if the actual PARange is smaller, but this is + * unlikely to be the case for LPA2 capable silicon. 
+ */ + id_aa64mmfr0_override.val |= + ID_AA64MMFR0_EL1_PARANGE_48 << ID_AA64MMFR0_EL1_PARANGE_SHIFT; + id_aa64mmfr0_override.mask |= 0xfU << ID_AA64MMFR0_EL1_PARANGE_SHIFT; } #endif return true; @@ -109,6 +127,7 @@ static const struct ftr_set_desc pfr0 __prel64_initconst = { .fields = { FIELD("sve", ID_AA64PFR0_EL1_SVE_SHIFT, pfr0_sve_filter), FIELD("el0", ID_AA64PFR0_EL1_EL0_SHIFT, NULL), + FIELD("mpam", ID_AA64PFR0_EL1_MPAM_SHIFT, NULL), {} }, }; @@ -133,8 +152,10 @@ static const struct ftr_set_desc pfr1 __prel64_initconst = { .override = &id_aa64pfr1_override, .fields = { FIELD("bt", ID_AA64PFR1_EL1_BT_SHIFT, NULL ), + FIELD("gcs", ID_AA64PFR1_EL1_GCS_SHIFT, NULL), FIELD("mte", ID_AA64PFR1_EL1_MTE_SHIFT, NULL), FIELD("sme", ID_AA64PFR1_EL1_SME_SHIFT, pfr1_sme_filter), + FIELD("mpam_frac", ID_AA64PFR1_EL1_MPAM_frac_SHIFT, NULL), {} }, }; @@ -196,6 +217,7 @@ static const struct ftr_set_desc sw_features __prel64_initconst = { static const PREL64(const struct ftr_set_desc, reg) regs[] __prel64_initconst = { + { &mmfr0 }, { &mmfr1 }, { &mmfr2 }, { &pfr0 }, @@ -215,6 +237,7 @@ static const struct { { "arm64.nosve", "id_aa64pfr0.sve=0" }, { "arm64.nosme", "id_aa64pfr1.sme=0" }, { "arm64.nobti", "id_aa64pfr1.bt=0" }, + { "arm64.nogcs", "id_aa64pfr1.gcs=0" }, { "arm64.nopauth", "id_aa64isar1.gpi=0 id_aa64isar1.gpa=0 " "id_aa64isar1.api=0 id_aa64isar1.apa=0 " @@ -225,6 +248,7 @@ static const struct { { "rodata=off", "arm64_sw.rodataoff=1" }, { "arm64.nolva", "id_aa64mmfr2.varange=0" }, { "arm64.no32bit_el0", "id_aa64pfr0.el0=1" }, + { "arm64.nompam", "id_aa64pfr0.mpam=0 id_aa64pfr1.mpam_frac=0" }, }; static int __init parse_hexdigit(const char *p, u64 *v) diff --git a/arch/arm64/kernel/pi/kaslr_early.c b/arch/arm64/kernel/pi/kaslr_early.c index 0257b43819db..e0e018046a46 100644 --- a/arch/arm64/kernel/pi/kaslr_early.c +++ b/arch/arm64/kernel/pi/kaslr_early.c @@ -18,8 +18,6 @@ #include "pi.h" -extern u16 memstart_offset_seed; - static u64 __init get_kaslr_seed(void *fdt, int node) { static char const seed_str[] __initconst = "kaslr-seed"; @@ -53,8 +51,6 @@ u64 __init kaslr_early_init(void *fdt, int chosen) return 0; } - memstart_offset_seed = seed & U16_MAX; - /* * OK, so we are proceeding with KASLR enabled. Calculate a suitable * kernel image offset from the seed. Let's place the kernel in the diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index f374a3e5a5fe..0f4bd7771859 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -136,6 +136,12 @@ static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(u64 ttbr) { u64 sctlr = read_sysreg(sctlr_el1); u64 tcr = read_sysreg(tcr_el1) | TCR_DS; + u64 mmfr0 = read_sysreg(id_aa64mmfr0_el1); + u64 parange = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_EL1_PARANGE_SHIFT); + + tcr &= ~TCR_IPS_MASK; + tcr |= parange << TCR_IPS_SHIFT; asm(" msr sctlr_el1, %0 ;" " isb ;" @@ -153,7 +159,7 @@ static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(u64 ttbr) static void __init remap_idmap_for_lpa2(void) { /* clear the bits that change meaning once LPA2 is turned on */ - pteval_t mask = PTE_SHARED; + ptdesc_t mask = PTE_SHARED; /* * We have to clear bits [9:8] in all block or page descriptors in the @@ -201,6 +207,29 @@ static void __init map_fdt(u64 fdt) dsb(ishst); } +/* + * PI version of the Cavium Eratum 27456 detection, which makes it + * impossible to use non-global mappings. 
+ */ +static bool __init ng_mappings_allowed(void) +{ + static const struct midr_range cavium_erratum_27456_cpus[] __initconst = { + /* Cavium ThunderX, T88 pass 1.x - 2.1 */ + MIDR_RANGE(MIDR_THUNDERX, 0, 0, 1, 1), + /* Cavium ThunderX, T81 pass 1.0 */ + MIDR_REV(MIDR_THUNDERX_81XX, 0, 0), + {}, + }; + + for (const struct midr_range *r = cavium_erratum_27456_cpus; r->model; r++) { + if (midr_is_cpu_model_range(read_cpuid_id(), r->model, + r->rv_min, r->rv_max)) + return false; + } + + return true; +} + asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) { static char const chosen_str[] __initconst = "/chosen"; @@ -240,7 +269,7 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) u64 kaslr_seed = kaslr_early_init(fdt, chosen); if (kaslr_seed && kaslr_requires_kpti()) - arm64_use_ng_mappings = true; + arm64_use_ng_mappings = ng_mappings_allowed(); kaslr_offset |= kaslr_seed & ~(MIN_KIMG_ALIGN - 1); } diff --git a/arch/arm64/kernel/pi/map_range.c b/arch/arm64/kernel/pi/map_range.c index 5410b2cac590..7982788e7b9a 100644 --- a/arch/arm64/kernel/pi/map_range.c +++ b/arch/arm64/kernel/pi/map_range.c @@ -30,8 +30,8 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, u64 va_offset) { u64 cmask = (level == 3) ? CONT_PTE_SIZE - 1 : U64_MAX; - u64 protval = pgprot_val(prot) & ~PTE_TYPE_MASK; - int lshift = (3 - level) * (PAGE_SHIFT - 3); + ptdesc_t protval = pgprot_val(prot) & ~PTE_TYPE_MASK; + int lshift = (3 - level) * PTDESC_TABLE_SHIFT; u64 lmask = (PAGE_SIZE << lshift) - 1; start &= PAGE_MASK; @@ -45,12 +45,12 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, * clearing the mapping */ if (protval) - protval |= (level < 3) ? PMD_TYPE_SECT : PTE_TYPE_PAGE; + protval |= (level == 2) ? PMD_TYPE_SECT : PTE_TYPE_PAGE; while (start < end) { u64 next = min((start | lmask) + 1, PAGE_ALIGN(end)); - if (level < 3 && (start | next | pa) & lmask) { + if (level < 2 || (level == 2 && (start | next | pa) & lmask)) { /* * This chunk needs a finer grained mapping. Create a * table mapping if necessary and recurse. 
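/*
 * Editor's illustrative sketch, not part of the patch: a standalone
 * model of the level/size arithmetic used by map_range() above,
 * assuming 4K pages (PAGE_SHIFT = 12, so each table level resolves
 * 9 bits). level_mask() and can_use_block() are invented for this
 * example; the latter mirrors the "needs a finer grained mapping"
 * test: levels 0 and 1 always descend into a table, level 2 descends
 * when start/end/pa are not all block aligned, and level 3 maps pages.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT		12
#define PTDESC_TABLE_SHIFT	(PAGE_SHIFT - 3)	/* 9 bits per level */

static uint64_t level_mask(int level)
{
	int lshift = (3 - level) * PTDESC_TABLE_SHIFT;

	return ((uint64_t)1 << (PAGE_SHIFT + lshift)) - 1;
}

static bool can_use_block(int level, uint64_t start, uint64_t next, uint64_t pa)
{
	if (level < 2)
		return false;
	if (level == 2 && ((start | next | pa) & level_mask(level)))
		return false;
	return true;
}

int main(void)
{
	for (int level = 0; level <= 3; level++)
		printf("level %d entry covers %#llx bytes\n", level,
		       (unsigned long long)(level_mask(level) + 1));

	printf("2M-aligned level-2 block: %d\n",
	       can_use_block(2, 0x40200000u, 0x40400000u, 0x80200000u));
	printf("unaligned  level-2 block: %d\n",
	       can_use_block(2, 0x40201000u, 0x40400000u, 0x80200000u));
	return 0;
}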
@@ -87,7 +87,7 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, } } -asmlinkage u64 __init create_init_idmap(pgd_t *pg_dir, pteval_t clrmask) +asmlinkage u64 __init create_init_idmap(pgd_t *pg_dir, ptdesc_t clrmask) { u64 ptep = (u64)pg_dir + PAGE_SIZE; pgprot_t text_prot = PAGE_KERNEL_ROX; diff --git a/arch/arm64/kernel/pi/patch-scs.c b/arch/arm64/kernel/pi/patch-scs.c index 49d8b40e61bc..55d0cd64ef71 100644 --- a/arch/arm64/kernel/pi/patch-scs.c +++ b/arch/arm64/kernel/pi/patch-scs.c @@ -50,6 +50,10 @@ bool dynamic_scs_is_enabled; #define DW_CFA_GNU_negative_offset_extended 0x2f #define DW_CFA_hi_user 0x3f +#define DW_EH_PE_sdata4 0x0b +#define DW_EH_PE_sdata8 0x0c +#define DW_EH_PE_pcrel 0x10 + enum { PACIASP = 0xd503233f, AUTIASP = 0xd50323bf, @@ -120,7 +124,12 @@ struct eh_frame { union { struct { // CIE u8 version; - u8 augmentation_string[]; + u8 augmentation_string[3]; + u8 code_alignment_factor; + u8 data_alignment_factor; + u8 return_address_register; + u8 augmentation_data_size; + u8 fde_pointer_format; }; struct { // FDE @@ -128,29 +137,38 @@ struct eh_frame { s32 range; u8 opcodes[]; }; + + struct { // FDE + s64 initial_loc64; + s64 range64; + u8 opcodes64[]; + }; }; }; static int scs_handle_fde_frame(const struct eh_frame *frame, - bool fde_has_augmentation_data, int code_alignment_factor, + bool use_sdata8, bool dry_run) { int size = frame->size - offsetof(struct eh_frame, opcodes) + 4; u64 loc = (u64)offset_to_ptr(&frame->initial_loc); const u8 *opcode = frame->opcodes; + int l; - if (fde_has_augmentation_data) { - int l; + if (use_sdata8) { + loc = (u64)&frame->initial_loc64 + frame->initial_loc64; + opcode = frame->opcodes64; + size -= 8; + } - // assume single byte uleb128_t - if (WARN_ON(*opcode & BIT(7))) - return -ENOEXEC; + // assume single byte uleb128_t for augmentation data size + if (*opcode & BIT(7)) + return EDYNSCS_INVALID_FDE_AUGM_DATA_SIZE; - l = *opcode++; - opcode += l; - size -= l + 1; - } + l = *opcode++; + opcode += l; + size -= l + 1; /* * Starting from 'loc', apply the CFA opcodes that advance the location @@ -201,7 +219,7 @@ static int scs_handle_fde_frame(const struct eh_frame *frame, break; default: - return -ENOEXEC; + return EDYNSCS_INVALID_CFA_OPCODE; } } return 0; @@ -209,12 +227,12 @@ static int scs_handle_fde_frame(const struct eh_frame *frame, int scs_patch(const u8 eh_frame[], int size) { + int code_alignment_factor = 1; + bool fde_use_sdata8 = false; const u8 *p = eh_frame; while (size > 4) { const struct eh_frame *frame = (const void *)p; - bool fde_has_augmentation_data = true; - int code_alignment_factor = 1; int ret; if (frame->size == 0 || @@ -223,28 +241,47 @@ int scs_patch(const u8 eh_frame[], int size) break; if (frame->cie_id_or_pointer == 0) { - const u8 *p = frame->augmentation_string; - - /* a 'z' in the augmentation string must come first */ - fde_has_augmentation_data = *p == 'z'; + /* + * Require presence of augmentation data (z) with a + * specifier for the size of the FDE initial_loc and + * range fields (R), and nothing else. + */ + if (strcmp(frame->augmentation_string, "zR")) + return EDYNSCS_INVALID_CIE_HEADER; /* * The code alignment factor is a uleb128 encoded field * but given that the only sensible values are 1 or 4, - * there is no point in decoding the whole thing. + * there is no point in decoding the whole thing. Also + * sanity check the size of the data alignment factor + * field, and the values of the return address register + * and augmentation data size fields. 
*/ - p += strlen(p) + 1; - if (!WARN_ON(*p & BIT(7))) - code_alignment_factor = *p; + if ((frame->code_alignment_factor & BIT(7)) || + (frame->data_alignment_factor & BIT(7)) || + frame->return_address_register != 30 || + frame->augmentation_data_size != 1) + return EDYNSCS_INVALID_CIE_HEADER; + + code_alignment_factor = frame->code_alignment_factor; + + switch (frame->fde_pointer_format) { + case DW_EH_PE_pcrel | DW_EH_PE_sdata4: + fde_use_sdata8 = false; + break; + case DW_EH_PE_pcrel | DW_EH_PE_sdata8: + fde_use_sdata8 = true; + break; + default: + return EDYNSCS_INVALID_CIE_SDATA_SIZE; + } } else { - ret = scs_handle_fde_frame(frame, - fde_has_augmentation_data, - code_alignment_factor, - true); + ret = scs_handle_fde_frame(frame, code_alignment_factor, + fde_use_sdata8, true); if (ret) return ret; - scs_handle_fde_frame(frame, fde_has_augmentation_data, - code_alignment_factor, false); + scs_handle_fde_frame(frame, code_alignment_factor, + fde_use_sdata8, false); } p += sizeof(frame->size) + frame->size; diff --git a/arch/arm64/kernel/pi/pi.h b/arch/arm64/kernel/pi/pi.h index c91e5e965cd3..46cafee7829f 100644 --- a/arch/arm64/kernel/pi/pi.h +++ b/arch/arm64/kernel/pi/pi.h @@ -22,6 +22,7 @@ static inline void *prel64_to_pointer(const prel64_t *offset) extern bool dynamic_scs_is_enabled; extern pgd_t init_idmap_pg_dir[], init_idmap_pg_end[]; +extern pgd_t init_pg_dir[], init_pg_end[]; void init_feature_override(u64 boot_status, const void *fdt, int chosen); u64 kaslr_early_init(void *fdt, int chosen); @@ -33,4 +34,4 @@ void map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot, asmlinkage void early_map_kernel(u64 boot_status, void *fdt); -asmlinkage u64 create_init_idmap(pgd_t *pgd, pteval_t clrmask); +asmlinkage u64 create_init_idmap(pgd_t *pgd, ptdesc_t clrmask); diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index 3496d6169e59..6438bf62e753 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -58,10 +58,13 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) * Instructions which load PC relative literals are not going to work * when executed from an XOL slot. Instructions doing an exclusive * load/store are not going to complete successfully when single-step - * exception handling happens in the middle of the sequence. + * exception handling happens in the middle of the sequence. Memory + * copy/set instructions require that all three instructions be placed + * consecutively in memory. */ if (aarch64_insn_uses_literal(insn) || - aarch64_insn_is_exclusive(insn)) + aarch64_insn_is_exclusive(insn) || + aarch64_insn_is_mops(insn)) return false; return true; @@ -73,9 +76,18 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) * INSN_GOOD_NO_SLOT If instruction is supported but doesn't use its slot. */ enum probe_insn __kprobes -arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *api) +arm_probe_decode_insn(u32 insn, struct arch_probe_insn *api) { /* + * While 'nop' instruction can execute in the out-of-line slot, + * simulating them in breakpoint handling offers better performance. + */ + if (aarch64_insn_is_nop(insn)) { + api->handler = simulate_nop; + return INSN_GOOD_NO_SLOT; + } + + /* * Instructions reading or modifying the PC won't work from the XOL * slot. 
*/ @@ -133,8 +145,8 @@ enum probe_insn __kprobes arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) { enum probe_insn decoded; - probe_opcode_t insn = le32_to_cpu(*addr); - probe_opcode_t *scan_end = NULL; + u32 insn = le32_to_cpu(*addr); + kprobe_opcode_t *scan_end = NULL; unsigned long size = 0, offset = 0; struct arch_probe_insn *api = &asi->api; diff --git a/arch/arm64/kernel/probes/decode-insn.h b/arch/arm64/kernel/probes/decode-insn.h index 8b758c5a2062..0e4195de8206 100644 --- a/arch/arm64/kernel/probes/decode-insn.h +++ b/arch/arm64/kernel/probes/decode-insn.h @@ -28,6 +28,6 @@ enum probe_insn __kprobes arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi); #endif enum probe_insn __kprobes -arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *asi); +arm_probe_decode_insn(u32 insn, struct arch_probe_insn *asi); #endif /* _ARM_KERNEL_KPROBES_ARM64_H */ diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 4268678d0e86..d9e462eafb95 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -27,7 +27,7 @@ #include <asm/debug-monitors.h> #include <asm/insn.h> #include <asm/irq.h> -#include <asm/patching.h> +#include <asm/text-patching.h> #include <asm/ptrace.h> #include <asm/sections.h> #include <asm/system_misc.h> @@ -43,7 +43,7 @@ post_kprobe_handler(struct kprobe *, struct kprobe_ctlblk *, struct pt_regs *); static void __kprobes arch_prepare_ss_slot(struct kprobe *p) { - kprobe_opcode_t *addr = p->ainsn.api.insn; + kprobe_opcode_t *addr = p->ainsn.xol_insn; /* * Prepare insn slot, Mark Rutland points out it depends on a coupe of @@ -64,20 +64,20 @@ static void __kprobes arch_prepare_ss_slot(struct kprobe *p) * the BRK exception handler, so it is unnecessary to generate * Contex-Synchronization-Event via ISB again. */ - aarch64_insn_patch_text_nosync(addr, p->opcode); + aarch64_insn_patch_text_nosync(addr, le32_to_cpu(p->opcode)); aarch64_insn_patch_text_nosync(addr + 1, BRK64_OPCODE_KPROBES_SS); /* * Needs restoring of return address after stepping xol. */ - p->ainsn.api.restore = (unsigned long) p->addr + + p->ainsn.xol_restore = (unsigned long) p->addr + sizeof(kprobe_opcode_t); } static void __kprobes arch_prepare_simulate(struct kprobe *p) { /* This instructions is not executed xol. 
No need to adjust the PC */ - p->ainsn.api.restore = 0; + p->ainsn.xol_restore = 0; } static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) @@ -85,7 +85,7 @@ static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); if (p->ainsn.api.handler) - p->ainsn.api.handler((u32)p->opcode, (long)p->addr, regs); + p->ainsn.api.handler(le32_to_cpu(p->opcode), (long)p->addr, regs); /* single step simulated, now go for post processing */ post_kprobe_handler(p, kcb, regs); @@ -99,7 +99,7 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return -EINVAL; /* copy instruction */ - p->opcode = le32_to_cpu(*p->addr); + p->opcode = *p->addr; if (search_exception_tables(probe_addr)) return -EINVAL; @@ -110,18 +110,18 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return -EINVAL; case INSN_GOOD_NO_SLOT: /* insn need simulation */ - p->ainsn.api.insn = NULL; + p->ainsn.xol_insn = NULL; break; case INSN_GOOD: /* instruction uses slot */ - p->ainsn.api.insn = get_insn_slot(); - if (!p->ainsn.api.insn) + p->ainsn.xol_insn = get_insn_slot(); + if (!p->ainsn.xol_insn) return -ENOMEM; break; } /* prepare the instruction */ - if (p->ainsn.api.insn) + if (p->ainsn.xol_insn) arch_prepare_ss_slot(p); else arch_prepare_simulate(p); @@ -142,15 +142,16 @@ void __kprobes arch_arm_kprobe(struct kprobe *p) void __kprobes arch_disarm_kprobe(struct kprobe *p) { void *addr = p->addr; + u32 insn = le32_to_cpu(p->opcode); - aarch64_insn_patch_text(&addr, &p->opcode, 1); + aarch64_insn_patch_text(&addr, &insn, 1); } void __kprobes arch_remove_kprobe(struct kprobe *p) { - if (p->ainsn.api.insn) { - free_insn_slot(p->ainsn.api.insn, 0); - p->ainsn.api.insn = NULL; + if (p->ainsn.xol_insn) { + free_insn_slot(p->ainsn.xol_insn, 0); + p->ainsn.xol_insn = NULL; } } @@ -205,9 +206,9 @@ static void __kprobes setup_singlestep(struct kprobe *p, } - if (p->ainsn.api.insn) { + if (p->ainsn.xol_insn) { /* prepare for single stepping */ - slot = (unsigned long)p->ainsn.api.insn; + slot = (unsigned long)p->ainsn.xol_insn; kprobes_save_local_irqflag(kcb, regs); instruction_pointer_set(regs, slot); @@ -245,8 +246,8 @@ static void __kprobes post_kprobe_handler(struct kprobe *cur, struct kprobe_ctlblk *kcb, struct pt_regs *regs) { /* return addr restore if non-branching insn */ - if (cur->ainsn.api.restore != 0) - instruction_pointer_set(regs, cur->ainsn.api.restore); + if (cur->ainsn.xol_restore != 0) + instruction_pointer_set(regs, cur->ainsn.xol_restore); /* restore back original saved kprobe variables and continue */ if (kcb->kprobe_status == KPROBE_REENTER) { @@ -348,7 +349,7 @@ kprobe_breakpoint_ss_handler(struct pt_regs *regs, unsigned long esr) struct kprobe *cur = kprobe_running(); if (cur && (kcb->kprobe_status & (KPROBE_HIT_SS | KPROBE_REENTER)) && - ((unsigned long)&cur->ainsn.api.insn[1] == addr)) { + ((unsigned long)&cur->ainsn.xol_insn[1] == addr)) { kprobes_restore_local_irqflag(kcb, regs); post_kprobe_handler(cur, kcb, regs); diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c index b65334ab79d2..4c6d2d712fbd 100644 --- a/arch/arm64/kernel/probes/simulate-insn.c +++ b/arch/arm64/kernel/probes/simulate-insn.c @@ -196,3 +196,9 @@ simulate_ldrsw_literal(u32 opcode, long addr, struct pt_regs *regs) instruction_pointer_set(regs, instruction_pointer(regs) + 4); } + +void __kprobes +simulate_nop(u32 opcode, long addr, struct pt_regs *regs) +{ + arm64_skip_faulting_instruction(regs, 
AARCH64_INSN_SIZE); +} diff --git a/arch/arm64/kernel/probes/simulate-insn.h b/arch/arm64/kernel/probes/simulate-insn.h index e065dc92218e..efb2803ec943 100644 --- a/arch/arm64/kernel/probes/simulate-insn.h +++ b/arch/arm64/kernel/probes/simulate-insn.h @@ -16,5 +16,6 @@ void simulate_cbz_cbnz(u32 opcode, long addr, struct pt_regs *regs); void simulate_tbz_tbnz(u32 opcode, long addr, struct pt_regs *regs); void simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs); void simulate_ldrsw_literal(u32 opcode, long addr, struct pt_regs *regs); +void simulate_nop(u32 opcode, long addr, struct pt_regs *regs); #endif /* _ARM_KERNEL_KPROBES_SIMULATE_INSN_H */ diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index a2f137a595fc..cb3d05af36e3 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -17,12 +17,20 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *xol_page_kaddr = kmap_atomic(page); void *dst = xol_page_kaddr + (vaddr & ~PAGE_MASK); + /* + * Initial cache maintenance of the xol page done via set_pte_at(). + * Subsequent CMOs only needed if the xol slot changes. + */ + if (!memcmp(dst, src, len)) + goto done; + /* Initialize the slot */ memcpy(dst, src, len); /* flush caches (dcache/icache) */ sync_icache_aliases((unsigned long)dst, (unsigned long)dst + len); +done: kunmap_atomic(xol_page_kaddr); } @@ -34,7 +42,7 @@ unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { - probe_opcode_t insn; + u32 insn; /* TODO: Currently we do not support AARCH32 instruction probing */ if (mm->context.flags & MMCF_AARCH32) @@ -102,7 +110,7 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t) bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { - probe_opcode_t insn; + u32 insn; unsigned long addr; if (!auprobe->simulate) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 3e7c8c8195c3..5954cec19660 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -49,6 +49,7 @@ #include <asm/cacheflush.h> #include <asm/exec.h> #include <asm/fpsimd.h> +#include <asm/gcs.h> #include <asm/mmu_context.h> #include <asm/mte.h> #include <asm/processor.h> @@ -227,7 +228,7 @@ void __show_regs(struct pt_regs *regs) printk("sp : %016llx\n", sp); if (system_uses_irq_prio_masking()) - printk("pmr_save: %08llx\n", regs->pmr_save); + printk("pmr: %08x\n", regs->pmr); i = top_reg; @@ -280,6 +281,53 @@ static void flush_poe(void) write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); } +#ifdef CONFIG_ARM64_GCS + +static void flush_gcs(void) +{ + if (!system_supports_gcs()) + return; + + current->thread.gcspr_el0 = 0; + current->thread.gcs_base = 0; + current->thread.gcs_size = 0; + current->thread.gcs_el0_mode = 0; + write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1); + write_sysreg_s(0, SYS_GCSPR_EL0); +} + +static int copy_thread_gcs(struct task_struct *p, + const struct kernel_clone_args *args) +{ + unsigned long gcs; + + if (!system_supports_gcs()) + return 0; + + p->thread.gcs_base = 0; + p->thread.gcs_size = 0; + + gcs = gcs_alloc_thread_stack(p, args); + if (IS_ERR_VALUE(gcs)) + return PTR_ERR((void *)gcs); + + p->thread.gcs_el0_mode = current->thread.gcs_el0_mode; + p->thread.gcs_el0_locked = current->thread.gcs_el0_locked; + + return 0; +} + +#else + +static void flush_gcs(void) { } +static int copy_thread_gcs(struct task_struct *p, + const struct 
kernel_clone_args *args) +{ + return 0; +} + +#endif + void flush_thread(void) { fpsimd_flush_thread(); @@ -287,59 +335,45 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(current); flush_tagged_addr_state(); flush_poe(); + flush_gcs(); } void arch_release_task_struct(struct task_struct *tsk) { fpsimd_release_task(tsk); + gcs_free(tsk); } int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - if (current->mm) - fpsimd_preserve_current_state(); + /* + * The current/src task's FPSIMD state may or may not be live, and may + * have been altered by ptrace after entry to the kernel. Save the + * effective FPSIMD state so that this will be copied into dst. + */ + fpsimd_save_and_flush_current_state(); + fpsimd_sync_from_effective_state(src); + *dst = *src; /* - * Detach src's sve_state (if any) from dst so that it does not - * get erroneously used or freed prematurely. dst's copies - * will be allocated on demand later on if dst uses SVE. - * For consistency, also clear TIF_SVE here: this could be done - * later in copy_process(), but to avoid tripping up future - * maintainers it is best not to leave TIF flags and buffers in - * an inconsistent state, even temporarily. + * Drop stale reference to src's sve_state and convert dst to + * non-streaming FPSIMD mode. */ + dst->thread.fp_type = FP_STATE_FPSIMD; dst->thread.sve_state = NULL; clear_tsk_thread_flag(dst, TIF_SVE); + task_smstop_sm(dst); /* - * In the unlikely event that we create a new thread with ZA - * enabled we should retain the ZA and ZT state so duplicate - * it here. This may be shortly freed if we exec() or if - * CLONE_SETTLS but it's simpler to do it here. To avoid - * confusing the rest of the code ensure that we have a - * sve_state allocated whenever sme_state is allocated. + * Drop stale reference to src's sme_state and ensure dst has ZA + * disabled. + * + * When necessary, ZA will be inherited later in copy_thread_za(). 
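copy_thread_gcs() above leans on the usual kernel convention that gcs_alloc_thread_stack() returns either a real address or a small negative errno encoded in the top 4095 values of an unsigned long (the IS_ERR_VALUE()/PTR_ERR() pattern). A minimal stand-alone sketch of that convention, with hypothetical helper names:

#define SKETCH_MAX_ERRNO 4095ul    /* mirrors the kernel's MAX_ERRNO */

/* True if 'val' sits in the top 4095 values, i.e. is an encoded -errno. */
static int sketch_is_err_value(unsigned long val)
{
    return val >= (unsigned long)-SKETCH_MAX_ERRNO;
}

/* Decode the errno back out (only meaningful when the check above is true). */
static long sketch_ptr_err(unsigned long val)
{
    return (long)val;
}

/*
 * Usage mirroring copy_thread_gcs(): an allocator returns either a real
 * address or an encoded error such as (unsigned long)-12 for -ENOMEM.
 */
static int handle_alloc_result(unsigned long gcs)
{
    if (sketch_is_err_value(gcs))
        return (int)sketch_ptr_err(gcs);
    return 0;
}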
*/ - if (thread_za_enabled(&src->thread)) { - dst->thread.sve_state = kzalloc(sve_state_size(src), - GFP_KERNEL); - if (!dst->thread.sve_state) - return -ENOMEM; - - dst->thread.sme_state = kmemdup(src->thread.sme_state, - sme_state_size(src), - GFP_KERNEL); - if (!dst->thread.sme_state) { - kfree(dst->thread.sve_state); - dst->thread.sve_state = NULL; - return -ENOMEM; - } - } else { - dst->thread.sme_state = NULL; - clear_tsk_thread_flag(dst, TIF_SME); - } - - dst->thread.fp_type = FP_STATE_FPSIMD; + dst->thread.sme_state = NULL; + clear_tsk_thread_flag(dst, TIF_SME); + dst->thread.svcr &= ~SVCR_ZA_MASK; /* clear any pending asynchronous tag fault raised by the parent */ clear_tsk_thread_flag(dst, TIF_MTE_ASYNC_FAULT); @@ -347,6 +381,31 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) return 0; } +static int copy_thread_za(struct task_struct *dst, struct task_struct *src) +{ + if (!thread_za_enabled(&src->thread)) + return 0; + + dst->thread.sve_state = kzalloc(sve_state_size(src), + GFP_KERNEL); + if (!dst->thread.sve_state) + return -ENOMEM; + + dst->thread.sme_state = kmemdup(src->thread.sme_state, + sme_state_size(src), + GFP_KERNEL); + if (!dst->thread.sme_state) { + kfree(dst->thread.sve_state); + dst->thread.sve_state = NULL; + return -ENOMEM; + } + + set_tsk_thread_flag(dst, TIF_SME); + dst->thread.svcr |= SVCR_ZA_MASK; + + return 0; +} + asmlinkage void ret_from_fork(void) asm("ret_from_fork"); int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) @@ -355,6 +414,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) unsigned long stack_start = args->stack; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); + int ret; memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context)); @@ -378,8 +438,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) * out-of-sync with the saved value. */ *task_user_tls(p) = read_sysreg(tpidr_el0); - if (system_supports_tpidr2()) - p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); if (system_supports_poe()) p->thread.por_el0 = read_sysreg_s(SYS_POR_EL0); @@ -392,13 +450,43 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) } /* + * Due to the AAPCS64 "ZA lazy saving scheme", PSTATE.ZA and + * TPIDR2 need to be manipulated as a pair, and either both + * need to be inherited or both need to be reset. + * + * Within a process, child threads must not inherit their + * parent's TPIDR2 value or they may clobber their parent's + * stack at some later point. + * + * When a process is fork()'d, the child must inherit ZA and + * TPIDR2 from its parent in case there was dormant ZA state. + * + * Use CLONE_VM to determine when the child will share the + * address space with the parent, and cannot safely inherit the + * state. + */ + if (system_supports_sme()) { + if (!(clone_flags & CLONE_VM)) { + p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); + ret = copy_thread_za(p, current); + if (ret) + return ret; + } else { + p->thread.tpidr2_el0 = 0; + WARN_ON_ONCE(p->thread.svcr & SVCR_ZA_MASK); + } + } + + /* * If a TLS pointer was passed to clone, use it for the new - * thread. We also reset TPIDR2 if it's in use. + * thread. 
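The comment block above reduces to one predicate on the clone flags: with CLONE_VM set the child is a thread and must start with TPIDR2 and ZA cleared, while without it the child is a fork()-style copy and inherits both so dormant ZA state survives. A tiny sketch of that decision, using an invented state struct purely for illustration:

#include <stdbool.h>
#include <stdint.h>

#define SKETCH_CLONE_VM 0x00000100ul    /* same bit value as the kernel's CLONE_VM */

struct sketch_sme_state {
    uint64_t tpidr2_el0;
    bool za_live;
};

/*
 * Decide what the child starts with: inherit the parent's lazy-save state
 * on fork(), reset it for threads that share the parent's address space.
 */
static void sketch_copy_sme_state(struct sketch_sme_state *child,
                                  const struct sketch_sme_state *parent,
                                  unsigned long clone_flags)
{
    if (!(clone_flags & SKETCH_CLONE_VM)) {
        *child = *parent;            /* fork(): keep TPIDR2 and ZA */
    } else {
        child->tpidr2_el0 = 0;       /* thread: must not reuse parent's save area */
        child->za_live = false;
    }
}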
*/ - if (clone_flags & CLONE_SETTLS) { + if (clone_flags & CLONE_SETTLS) p->thread.uw.tp_value = tls; - p->thread.tpidr2_el0 = 0; - } + + ret = copy_thread_gcs(p, args); + if (ret != 0) + return ret; } else { /* * A kthread has no context to ERET to, so ensure any buggy @@ -409,6 +497,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) */ memset(childregs, 0, sizeof(struct pt_regs)); childregs->pstate = PSR_MODE_EL1h | PSR_IL_BIT; + childregs->stackframe.type = FRAME_META_TYPE_FINAL; p->thread.cpu_context.x19 = (unsigned long)args->fn; p->thread.cpu_context.x20 = (unsigned long)args->fn_arg; @@ -422,7 +511,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) * For the benefit of the unwinder, set up childregs->stackframe * as the final frame for the new task. */ - p->thread.cpu_context.fp = (unsigned long)childregs->stackframe; + p->thread.cpu_context.fp = (unsigned long)&childregs->stackframe; ptrace_hw_copy_thread(p); @@ -442,7 +531,7 @@ static void tls_thread_switch(struct task_struct *next) if (is_compat_thread(task_thread_info(next))) write_sysreg(next->thread.uw.tp_value, tpidrro_el0); - else if (!arm64_kernel_unmapped_at_el0()) + else write_sysreg(0, tpidrro_el0); write_sysreg(*task_user_tls(next), tpidr_el0); @@ -487,6 +576,46 @@ static void entry_task_switch(struct task_struct *next) __this_cpu_write(__entry_task, next); } +#ifdef CONFIG_ARM64_GCS + +void gcs_preserve_current_state(void) +{ + current->thread.gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); +} + +static void gcs_thread_switch(struct task_struct *next) +{ + if (!system_supports_gcs()) + return; + + /* GCSPR_EL0 is always readable */ + gcs_preserve_current_state(); + write_sysreg_s(next->thread.gcspr_el0, SYS_GCSPR_EL0); + + if (current->thread.gcs_el0_mode != next->thread.gcs_el0_mode) + gcs_set_el0_mode(next); + + /* + * Ensure that GCS memory effects of the 'prev' thread are + * ordered before other memory accesses with release semantics + * (or preceded by a DMB) on the current PE. In addition, any + * memory accesses with acquire semantics (or succeeded by a + * DMB) are ordered before GCS memory effects of the 'next' + * thread. This will ensure that the GCS memory effects are + * visible to other PEs in case of migration. + */ + if (task_gcs_el0_enabled(current) || task_gcs_el0_enabled(next)) + gcsb_dsync(); +} + +#else + +static void gcs_thread_switch(struct task_struct *next) +{ +} + +#endif + /* * Handle sysreg updates for ARM erratum 1418040 which affects the 32bit view of * CNTVCT, various other errata which require trapping all CNTVCT{,_EL0} @@ -583,12 +712,14 @@ struct task_struct *__switch_to(struct task_struct *prev, cntkctl_thread_switch(prev, next); ptrauth_thread_switch_user(next); permission_overlay_switch(next); + gcs_thread_switch(next); /* - * Complete any pending TLB or cache maintenance on this CPU in case - * the thread migrates to a different CPU. - * This full barrier is also required by the membarrier system - * call. + * Complete any pending TLB or cache maintenance on this CPU in case the + * thread migrates to a different CPU. This full barrier is also + * required by the membarrier system call. Additionally it makes any + * in-progress pgtable writes visible to the table walker; See + * emit_pte_barriers(). */ dsb(ish); @@ -764,7 +895,7 @@ long get_tagged_addr_ctrl(struct task_struct *task) * disable it for tasks that already opted in to the relaxed ABI. 
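The FRAME_META_TYPE_FINAL marker written into the new task's stackframe above gives a frame-pointer unwinder a definite stopping point rather than relying on a NULL frame pointer. A toy unwinder built on that idea; the record layout and type values here are assumptions of the sketch, not the kernel's definitions:

#include <stdint.h>
#include <stdio.h>

enum sketch_frame_type {
    SKETCH_FRAME_NORMAL = 0,
    SKETCH_FRAME_FINAL  = 1,    /* stand-in for FRAME_META_TYPE_FINAL */
};

/* A frame record plus the metadata word the unwinder checks. */
struct sketch_frame {
    struct sketch_frame *fp;    /* saved frame pointer (previous record) */
    uint64_t lr;                /* saved return address */
    uint64_t type;              /* metadata: normal or final */
};

/* Walk the FP chain and stop cleanly at the final frame. */
static void sketch_unwind(const struct sketch_frame *frame)
{
    while (frame && frame->type != SKETCH_FRAME_FINAL) {
        printf("return address: %#llx\n", (unsigned long long)frame->lr);
        frame = frame->fp;
    }
}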
*/ -static struct ctl_table tagged_addr_sysctl_table[] = { +static const struct ctl_table tagged_addr_sysctl_table[] = { { .procname = "tagged_addr_disabled", .mode = 0644, diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index da53722f95d4..edf1783ffc81 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -172,7 +172,7 @@ static enum mitigation_state spectre_v2_get_cpu_hw_mitigation_state(void) return SPECTRE_UNAFFECTED; /* Alternatively, we have a list of unaffected CPUs */ - if (is_midr_in_range_list(read_cpuid_id(), spectre_v2_safe_list)) + if (is_midr_in_range_list(spectre_v2_safe_list)) return SPECTRE_UNAFFECTED; return SPECTRE_VULNERABLE; @@ -331,7 +331,7 @@ bool has_spectre_v3a(const struct arm64_cpu_capabilities *entry, int scope) }; WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - return is_midr_in_range_list(read_cpuid_id(), spectre_v3a_unsafe_list); + return is_midr_in_range_list(spectre_v3a_unsafe_list); } void spectre_v3a_enable_mitigation(const struct arm64_cpu_capabilities *__unused) @@ -475,7 +475,7 @@ static enum mitigation_state spectre_v4_get_cpu_hw_mitigation_state(void) { /* sentinel */ }, }; - if (is_midr_in_range_list(read_cpuid_id(), spectre_v4_safe_list)) + if (is_midr_in_range_list(spectre_v4_safe_list)) return SPECTRE_UNAFFECTED; /* CPU features are detected first */ @@ -845,52 +845,90 @@ static unsigned long system_bhb_mitigations; * This must be called with SCOPE_LOCAL_CPU for each type of CPU, before any * SCOPE_SYSTEM call will give the right answer. */ -u8 spectre_bhb_loop_affected(int scope) +static bool is_spectre_bhb_safe(int scope) +{ + static const struct midr_range spectre_bhb_safe_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A510), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A520), + MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER), + {}, + }; + static bool all_safe = true; + + if (scope != SCOPE_LOCAL_CPU) + return all_safe; + + if (is_midr_in_range_list(spectre_bhb_safe_list)) + return true; + + all_safe = false; + + return false; +} + +static u8 spectre_bhb_loop_affected(void) { u8 k = 0; - static u8 max_bhb_k; - - if (scope == SCOPE_LOCAL_CPU) { - static const struct midr_range spectre_bhb_k32_list[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), - MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), - MIDR_ALL_VERSIONS(MIDR_CORTEX_X2), - MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), - MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), - {}, - }; - static const struct midr_range spectre_bhb_k24_list[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A77), - MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), - {}, - }; - static const struct midr_range spectre_bhb_k11_list[] = { - MIDR_ALL_VERSIONS(MIDR_AMPERE1), - {}, - }; - static const struct midr_range spectre_bhb_k8_list[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), - {}, - }; - - if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list)) - k = 32; - else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k24_list)) - k = 24; - else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k11_list)) - k = 11; - else if (is_midr_in_range_list(read_cpuid_id(), 
spectre_bhb_k8_list)) - k = 8; - - max_bhb_k = max(max_bhb_k, k); - } else { - k = max_bhb_k; - } + + static const struct midr_range spectre_bhb_k132_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_X3), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2), + {}, + }; + static const struct midr_range spectre_bhb_k38_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A715), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A720), + {}, + }; + static const struct midr_range spectre_bhb_k32_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), + {}, + }; + static const struct midr_range spectre_bhb_k24_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A76AE), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A77), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_GOLD), + MIDR_ALL_VERSIONS(MIDR_HISI_HIP09), + {}, + }; + static const struct midr_range spectre_bhb_k11_list[] = { + MIDR_ALL_VERSIONS(MIDR_AMPERE1), + {}, + }; + static const struct midr_range spectre_bhb_k8_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + {}, + }; + + if (is_midr_in_range_list(spectre_bhb_k132_list)) + k = 132; + else if (is_midr_in_range_list(spectre_bhb_k38_list)) + k = 38; + else if (is_midr_in_range_list(spectre_bhb_k32_list)) + k = 32; + else if (is_midr_in_range_list(spectre_bhb_k24_list)) + k = 24; + else if (is_midr_in_range_list(spectre_bhb_k11_list)) + k = 11; + else if (is_midr_in_range_list(spectre_bhb_k8_list)) + k = 8; return k; } @@ -916,29 +954,13 @@ static enum mitigation_state spectre_bhb_get_cpu_fw_mitigation_state(void) } } -static bool is_spectre_bhb_fw_affected(int scope) +static bool has_spectre_bhb_fw_mitigation(void) { - static bool system_affected; enum mitigation_state fw_state; bool has_smccc = arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_NONE; - static const struct midr_range spectre_bhb_firmware_mitigated_list[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A75), - {}, - }; - bool cpu_in_list = is_midr_in_range_list(read_cpuid_id(), - spectre_bhb_firmware_mitigated_list); - - if (scope != SCOPE_LOCAL_CPU) - return system_affected; fw_state = spectre_bhb_get_cpu_fw_mitigation_state(); - if (cpu_in_list || (has_smccc && fw_state == SPECTRE_MITIGATED)) { - system_affected = true; - return true; - } - - return false; + return has_smccc && fw_state == SPECTRE_MITIGATED; } static bool supports_ecbhb(int scope) @@ -954,6 +976,8 @@ static bool supports_ecbhb(int scope) ID_AA64MMFR1_EL1_ECBHB_SHIFT); } +static u8 max_bhb_k; + bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, int scope) { @@ -962,16 +986,23 @@ bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, if (supports_csv2p3(scope)) return false; - if (supports_clearbhb(scope)) - return true; + if (is_spectre_bhb_safe(scope)) + return false; - if (spectre_bhb_loop_affected(scope)) - return true; + /* + * At this point the core isn't known to be "safe" so we're going to + * assume it's vulnerable. We still need to update `max_bhb_k` though, + * but only if we aren't mitigating with clearbhb though. 
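Because each core may need a different loop count, the system-wide value has to be the maximum reported by any vulnerable CPU, which is what the max_bhb_k accumulation achieves as CPUs are brought up. A compact model of that aggregation with made-up per-CPU values:

#include <stdio.h>

/* Per-CPU loop counts as they might be reported during bring-up. */
static const unsigned char cpu_k[] = { 8, 8, 24, 24, 32, 32 };

int main(void)
{
    unsigned char max_k = 0;

    /* Mirrors max_bhb_k = max(max_bhb_k, k) done once per CPU. */
    for (unsigned int i = 0; i < sizeof(cpu_k); i++)
        if (cpu_k[i] > max_k)
            max_k = cpu_k[i];

    printf("system-wide loop count: %u\n", (unsigned)max_k);
    return 0;
}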
+ */ + if (scope == SCOPE_LOCAL_CPU && !supports_clearbhb(SCOPE_LOCAL_CPU)) + max_bhb_k = max(max_bhb_k, spectre_bhb_loop_affected()); - if (is_spectre_bhb_fw_affected(scope)) - return true; + return true; +} - return false; +u8 get_spectre_bhb_loop_value(void) +{ + return max_bhb_k; } static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot) @@ -991,7 +1022,7 @@ static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot) isb(); } -static bool __read_mostly __nospectre_bhb; +bool __read_mostly __nospectre_bhb; static int __init parse_spectre_bhb_param(char *str) { __nospectre_bhb = true; @@ -1002,7 +1033,7 @@ early_param("nospectre_bhb", parse_spectre_bhb_param); void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) { bp_hardening_cb_t cpu_cb; - enum mitigation_state fw_state, state = SPECTRE_VULNERABLE; + enum mitigation_state state = SPECTRE_VULNERABLE; struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); if (!is_spectre_bhb_affected(entry, SCOPE_LOCAL_CPU)) @@ -1028,7 +1059,7 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) this_cpu_set_vectors(EL1_VECTOR_BHB_CLEAR_INSN); state = SPECTRE_MITIGATED; set_bit(BHB_INSN, &system_bhb_mitigations); - } else if (spectre_bhb_loop_affected(SCOPE_LOCAL_CPU)) { + } else if (spectre_bhb_loop_affected()) { /* * Ensure KVM uses the indirect vector which will have the * branchy-loop added. A57/A72-r0 will already have selected @@ -1041,37 +1072,39 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) this_cpu_set_vectors(EL1_VECTOR_BHB_LOOP); state = SPECTRE_MITIGATED; set_bit(BHB_LOOP, &system_bhb_mitigations); - } else if (is_spectre_bhb_fw_affected(SCOPE_LOCAL_CPU)) { - fw_state = spectre_bhb_get_cpu_fw_mitigation_state(); - if (fw_state == SPECTRE_MITIGATED) { - /* - * Ensure KVM uses one of the spectre bp_hardening - * vectors. The indirect vector doesn't include the EL3 - * call, so needs upgrading to - * HYP_VECTOR_SPECTRE_INDIRECT. - */ - if (!data->slot || data->slot == HYP_VECTOR_INDIRECT) - data->slot += 1; - - this_cpu_set_vectors(EL1_VECTOR_BHB_FW); - - /* - * The WA3 call in the vectors supersedes the WA1 call - * made during context-switch. Uninstall any firmware - * bp_hardening callback. - */ - cpu_cb = spectre_v2_get_sw_mitigation_cb(); - if (__this_cpu_read(bp_hardening_data.fn) != cpu_cb) - __this_cpu_write(bp_hardening_data.fn, NULL); - - state = SPECTRE_MITIGATED; - set_bit(BHB_FW, &system_bhb_mitigations); - } + } else if (has_spectre_bhb_fw_mitigation()) { + /* + * Ensure KVM uses one of the spectre bp_hardening + * vectors. The indirect vector doesn't include the EL3 + * call, so needs upgrading to + * HYP_VECTOR_SPECTRE_INDIRECT. + */ + if (!data->slot || data->slot == HYP_VECTOR_INDIRECT) + data->slot += 1; + + this_cpu_set_vectors(EL1_VECTOR_BHB_FW); + + /* + * The WA3 call in the vectors supersedes the WA1 call + * made during context-switch. Uninstall any firmware + * bp_hardening callback. 
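The enable path below picks mitigations in a fixed preference order: the CLRBHB instruction if the CPU has it, otherwise the branchy loop with the CPU's k value, otherwise the firmware call. A small sketch of that priority as a pure function, with invented enum and helper names:

enum sketch_bhb_mitigation {
    SKETCH_BHB_NONE,
    SKETCH_BHB_INSN,    /* CLRBHB instruction */
    SKETCH_BHB_LOOP,    /* branchy loop, k iterations */
    SKETCH_BHB_FW,      /* SMCCC firmware call */
};

/* Pick the strongest available mitigation, mirroring the if/else chain. */
static enum sketch_bhb_mitigation
sketch_pick_bhb_mitigation(int has_clrbhb, unsigned int loop_k, int has_fw)
{
    if (has_clrbhb)
        return SKETCH_BHB_INSN;
    if (loop_k)
        return SKETCH_BHB_LOOP;
    if (has_fw)
        return SKETCH_BHB_FW;
    return SKETCH_BHB_NONE;
}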
+ */ + cpu_cb = spectre_v2_get_sw_mitigation_cb(); + if (__this_cpu_read(bp_hardening_data.fn) != cpu_cb) + __this_cpu_write(bp_hardening_data.fn, NULL); + + state = SPECTRE_MITIGATED; + set_bit(BHB_FW, &system_bhb_mitigations); } update_mitigation_state(&spectre_bhb_state, state); } +bool is_spectre_bhb_fw_mitigated(void) +{ + return test_bit(BHB_FW, &system_bhb_mitigations); +} + /* Patched to NOP when enabled */ void noinstr spectre_bhb_patch_loop_mitigation_enable(struct alt_instr *alt, __le32 *origptr, @@ -1100,7 +1133,6 @@ void noinstr spectre_bhb_patch_loop_iter(struct alt_instr *alt, { u8 rd; u32 insn; - u16 loop_count = spectre_bhb_loop_affected(SCOPE_SYSTEM); BUG_ON(nr_inst != 1); /* MOV -> MOV */ @@ -1109,7 +1141,7 @@ void noinstr spectre_bhb_patch_loop_iter(struct alt_instr *alt, insn = le32_to_cpu(*origptr); rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn); - insn = aarch64_insn_gen_movewide(rd, loop_count, 0, + insn = aarch64_insn_gen_movewide(rd, max_bhb_k, 0, AARCH64_INSN_VARIANT_64BIT, AARCH64_INSN_MOVEWIDE_ZERO); *updptr++ = cpu_to_le32(insn); diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index b756578aeaee..ee94b72bf8fb 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -34,6 +34,7 @@ #include <asm/cpufeature.h> #include <asm/debug-monitors.h> #include <asm/fpsimd.h> +#include <asm/gcs.h> #include <asm/mte.h> #include <asm/pointer_auth.h> #include <asm/stacktrace.h> @@ -140,7 +141,7 @@ unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) addr += n; if (regs_within_kernel_stack(regs, (unsigned long)addr)) - return *addr; + return READ_ONCE_NOCHECK(*addr); else return 0; } @@ -593,7 +594,7 @@ static int __fpr_get(struct task_struct *target, { struct user_fpsimd_state *uregs; - sve_sync_to_fpsimd(target); + fpsimd_sync_from_effective_state(target); uregs = &target->thread.uw.fpsimd_state; @@ -625,7 +626,7 @@ static int __fpr_set(struct task_struct *target, * Ensure target->thread.uw.fpsimd_state is up to date, so that a * short copyin can't resurrect stale data. 
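spectre_bhb_patch_loop_iter() above rewrites a 'MOV Xd, #imm16' so the mitigation loop runs max_bhb_k times. As a rough illustration of what that patching produces, here is a user-space helper encoding a 64-bit MOVZ with zero shift; it is illustrative only, not the kernel's instruction generator:

#include <stdint.h>
#include <stdio.h>

/* MOVZ Xd, #imm16 (64-bit, LSL #0): 0xd2800000 | imm16 << 5 | Rd. */
static uint32_t sketch_gen_movz_x(unsigned int rd, uint16_t imm16)
{
    return 0xd2800000u | ((uint32_t)imm16 << 5) | (rd & 0x1f);
}

int main(void)
{
    /* e.g. patch the loop counter register x2 to a system-wide k of 32 */
    uint32_t insn = sketch_gen_movz_x(2, 32);

    printf("patched insn: %#010x\n", (unsigned)insn);
    return 0;
}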
*/ - sve_sync_to_fpsimd(target); + fpsimd_sync_from_effective_state(target); newstate = target->thread.uw.fpsimd_state; @@ -652,7 +653,7 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - sve_sync_from_fpsimd_zeropad(target); + fpsimd_sync_to_effective_state_zeropad(target); fpsimd_flush_task_state(target); return ret; @@ -719,6 +720,8 @@ static int fpmr_set(struct task_struct *target, const struct user_regset *regset if (!system_supports_fpmr()) return -EINVAL; + fpmr = target->thread.uw.fpmr; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fpmr, 0, count); if (ret) return ret; @@ -772,6 +775,11 @@ static void sve_init_header_from_task(struct user_sve_header *header, task_type = ARM64_VEC_SVE; active = (task_type == type); + if (active && target->thread.fp_type == FP_STATE_SVE) + header->flags = SVE_PT_REGS_SVE; + else + header->flags = SVE_PT_REGS_FPSIMD; + switch (type) { case ARM64_VEC_SVE: if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT)) @@ -786,19 +794,14 @@ static void sve_init_header_from_task(struct user_sve_header *header, return; } - if (active) { - if (target->thread.fp_type == FP_STATE_FPSIMD) { - header->flags |= SVE_PT_REGS_FPSIMD; - } else { - header->flags |= SVE_PT_REGS_SVE; - } - } - header->vl = task_get_vl(target, type); vq = sve_vq_from_vl(header->vl); header->max_vl = vec_max_vl(type); - header->size = SVE_PT_SIZE(vq, header->flags); + if (active) + header->size = SVE_PT_SIZE(vq, header->flags); + else + header->size = sizeof(header); header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl), SVE_PT_REGS_SVE); } @@ -817,18 +820,25 @@ static int sve_get_common(struct task_struct *target, unsigned int vq; unsigned long start, end; + if (target == current) + fpsimd_preserve_current_state(); + /* Header */ sve_init_header_from_task(&header, target, type); vq = sve_vq_from_vl(header.vl); membuf_write(&to, &header, sizeof(header)); - if (target == current) - fpsimd_preserve_current_state(); - BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header)); BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); + /* + * When the requested vector type is not active, do not present data + * from the other mode to userspace. + */ + if (header.size == sizeof(header)) + return 0; + switch ((header.flags & SVE_PT_REGS_MASK)) { case SVE_PT_REGS_FPSIMD: return __fpr_get(target, regset, to); @@ -856,7 +866,7 @@ static int sve_get_common(struct task_struct *target, return membuf_zero(&to, end - start); default: - return 0; + BUILD_BUG(); } } @@ -880,6 +890,9 @@ static int sve_set_common(struct task_struct *target, struct user_sve_header header; unsigned int vq; unsigned long start, end; + bool fpsimd; + + fpsimd_flush_task_state(target); /* Header */ if (count < sizeof(header)) @@ -887,7 +900,16 @@ static int sve_set_common(struct task_struct *target, ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &header, 0, sizeof(header)); if (ret) - goto out; + return ret; + + /* + * Streaming SVE data is always stored and presented in SVE format. + * Require the user to provide SVE formatted data for consistency, and + * to avoid the risk that we configure the task into an invalid state. 
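The fpmr_set() change above (and the similar poe_set() change later) follows a general regset rule: seed the kernel-side buffer from the target's current value before user_regset_copyin(), so a short write only replaces the bytes the tracer actually supplied. A minimal model of that read-modify-write pattern, with a simplified stand-in for the copy-in helper:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Simplified stand-in for user_regset_copyin(): copy 'count' bytes in. */
static int sketch_copyin(void *dst, const void *src, size_t count)
{
    memcpy(dst, src, count);
    return 0;
}

struct sketch_task {
    uint64_t fpmr;
};

/* Seed from current state so a short write leaves the remaining bits intact. */
static int sketch_fpmr_set(struct sketch_task *t, const void *ubuf, size_t count)
{
    uint64_t fpmr = t->fpmr;    /* read the current value first */
    int ret;

    if (count > sizeof(fpmr))
        return -1;

    ret = sketch_copyin(&fpmr, ubuf, count);
    if (ret)
        return ret;

    t->fpmr = fpmr;             /* write back the merged value */
    return 0;
}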
+ */ + fpsimd = (header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD; + if (fpsimd && type == ARM64_VEC_SME) + return -EINVAL; /* * Apart from SVE_PT_REGS_MASK, all SVE_PT_* flags are consumed by @@ -896,88 +918,72 @@ static int sve_set_common(struct task_struct *target, ret = vec_set_vector_length(target, type, header.vl, ((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16); if (ret) - goto out; + return ret; + + /* Allocate SME storage if necessary, preserving any existing ZA/ZT state */ + if (type == ARM64_VEC_SME) { + sme_alloc(target, false); + if (!target->thread.sme_state) + return -ENOMEM; + } + + /* Allocate SVE storage if necessary, zeroing any existing SVE state */ + if (!fpsimd) { + sve_alloc(target, true); + if (!target->thread.sve_state) + return -ENOMEM; + } - /* Actual VL set may be less than the user asked for: */ + /* + * Actual VL set may be different from what the user asked + * for, or we may have configured the _ONEXEC VL not the + * current VL: + */ vq = sve_vq_from_vl(task_get_vl(target, type)); /* Enter/exit streaming mode */ if (system_supports_sme()) { - u64 old_svcr = target->thread.svcr; - switch (type) { case ARM64_VEC_SVE: target->thread.svcr &= ~SVCR_SM_MASK; + set_tsk_thread_flag(target, TIF_SVE); break; case ARM64_VEC_SME: target->thread.svcr |= SVCR_SM_MASK; - - /* - * Disable traps and ensure there is SME storage but - * preserve any currently set values in ZA/ZT. - */ - sme_alloc(target, false); set_tsk_thread_flag(target, TIF_SME); break; default: WARN_ON_ONCE(1); - ret = -EINVAL; - goto out; + return -EINVAL; } - - /* - * If we switched then invalidate any existing SVE - * state and ensure there's storage. - */ - if (target->thread.svcr != old_svcr) - sve_alloc(target, true); } + /* Always zero V regs, FPSR, and FPCR */ + memset(¤t->thread.uw.fpsimd_state, 0, + sizeof(current->thread.uw.fpsimd_state)); + /* Registers: FPSIMD-only case */ BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header)); - if ((header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD) { - ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, - SVE_PT_FPSIMD_OFFSET); + if (fpsimd) { clear_tsk_thread_flag(target, TIF_SVE); target->thread.fp_type = FP_STATE_FPSIMD; - goto out; + ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, + SVE_PT_FPSIMD_OFFSET); + return ret; } - /* - * Otherwise: no registers or full SVE case. For backwards - * compatibility reasons we treat empty flags as SVE registers. - */ + /* Otherwise: no registers or full SVE case. */ + + target->thread.fp_type = FP_STATE_SVE; /* * If setting a different VL from the requested VL and there is * register data, the data layout will be wrong: don't even * try to set the registers in this case. */ - if (count && vq != sve_vq_from_vl(header.vl)) { - ret = -EIO; - goto out; - } - - sve_alloc(target, true); - if (!target->thread.sve_state) { - ret = -ENOMEM; - clear_tsk_thread_flag(target, TIF_SVE); - target->thread.fp_type = FP_STATE_FPSIMD; - goto out; - } - - /* - * Ensure target->thread.sve_state is up to date with target's - * FPSIMD regs, so that a short copyin leaves trailing - * registers unmodified. Only enable SVE if we are - * configuring normal SVE, a system with streaming SVE may not - * have normal SVE. 
- */ - fpsimd_sync_to_sve(target); - if (type == ARM64_VEC_SVE) - set_tsk_thread_flag(target, TIF_SVE); - target->thread.fp_type = FP_STATE_SVE; + if (count && vq != sve_vq_from_vl(header.vl)) + return -EIO; BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); start = SVE_PT_SVE_OFFSET; @@ -986,7 +992,7 @@ static int sve_set_common(struct task_struct *target, target->thread.sve_state, start, end); if (ret) - goto out; + return ret; start = end; end = SVE_PT_SVE_FPSR_OFFSET(vq); @@ -1002,8 +1008,6 @@ static int sve_set_common(struct task_struct *target, &target->thread.uw.fpsimd_state.fpsr, start, end); -out: - fpsimd_flush_task_state(target); return ret; } @@ -1125,7 +1129,11 @@ static int za_set(struct task_struct *target, if (ret) goto out; - /* Actual VL set may be less than the user asked for: */ + /* + * Actual VL set may be different from what the user asked + * for, or we may have configured the _ONEXEC rather than + * current VL: + */ vq = sve_vq_from_vl(task_get_sme_vl(target)); /* Ensure there is some SVE storage for streaming mode */ @@ -1418,7 +1426,7 @@ static int tagged_addr_ctrl_get(struct task_struct *target, { long ctrl = get_tagged_addr_ctrl(target); - if (IS_ERR_VALUE(ctrl)) + if (WARN_ON_ONCE(IS_ERR_VALUE(ctrl))) return ctrl; return membuf_write(&to, &ctrl, sizeof(ctrl)); @@ -1432,6 +1440,10 @@ static int tagged_addr_ctrl_set(struct task_struct *target, const struct int ret; long ctrl; + ctrl = get_tagged_addr_ctrl(target); + if (WARN_ON_ONCE(IS_ERR_VALUE(ctrl))) + return ctrl; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1); if (ret) return ret; @@ -1463,6 +1475,8 @@ static int poe_set(struct task_struct *target, const struct if (!system_supports_poe()) return -EINVAL; + ctrl = target->thread.por_el0; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1); if (ret) return ret; @@ -1473,6 +1487,66 @@ static int poe_set(struct task_struct *target, const struct } #endif +#ifdef CONFIG_ARM64_GCS +static void task_gcs_to_user(struct user_gcs *user_gcs, + const struct task_struct *target) +{ + user_gcs->features_enabled = target->thread.gcs_el0_mode; + user_gcs->features_locked = target->thread.gcs_el0_locked; + user_gcs->gcspr_el0 = target->thread.gcspr_el0; +} + +static void task_gcs_from_user(struct task_struct *target, + const struct user_gcs *user_gcs) +{ + target->thread.gcs_el0_mode = user_gcs->features_enabled; + target->thread.gcs_el0_locked = user_gcs->features_locked; + target->thread.gcspr_el0 = user_gcs->gcspr_el0; +} + +static int gcs_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct user_gcs user_gcs; + + if (!system_supports_gcs()) + return -EINVAL; + + if (target == current) + gcs_preserve_current_state(); + + task_gcs_to_user(&user_gcs, target); + + return membuf_write(&to, &user_gcs, sizeof(user_gcs)); +} + +static int gcs_set(struct task_struct *target, const struct + user_regset *regset, unsigned int pos, + unsigned int count, const void *kbuf, const + void __user *ubuf) +{ + int ret; + struct user_gcs user_gcs; + + if (!system_supports_gcs()) + return -EINVAL; + + task_gcs_to_user(&user_gcs, target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_gcs, 0, -1); + if (ret) + return ret; + + if (user_gcs.features_enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK) + return -EINVAL; + + task_gcs_from_user(target, &user_gcs); + + return 0; +} +#endif + enum aarch64_regset { REGSET_GPR, REGSET_FPR, @@ -1503,7 +1577,10 @@ enum aarch64_regset { 
REGSET_TAGGED_ADDR_CTRL, #endif #ifdef CONFIG_ARM64_POE - REGSET_POE + REGSET_POE, +#endif +#ifdef CONFIG_ARM64_GCS + REGSET_GCS, #endif }; @@ -1674,6 +1751,16 @@ static const struct user_regset aarch64_regsets[] = { .set = poe_set, }, #endif +#ifdef CONFIG_ARM64_GCS + [REGSET_GCS] = { + .core_note_type = NT_ARM_GCS, + .n = sizeof(struct user_gcs) / sizeof(u64), + .size = sizeof(u64), + .align = sizeof(u64), + .regset_get = gcs_get, + .set = gcs_set, + }, +#endif }; static const struct user_regset_view user_aarch64_view = { diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c new file mode 100644 index 000000000000..ce4778141ec7 --- /dev/null +++ b/arch/arm64/kernel/rsi.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#include <linux/jump_label.h> +#include <linux/memblock.h> +#include <linux/psci.h> +#include <linux/swiotlb.h> +#include <linux/cc_platform.h> +#include <linux/platform_device.h> + +#include <asm/io.h> +#include <asm/mem_encrypt.h> +#include <asm/rsi.h> + +static struct realm_config config; + +unsigned long prot_ns_shared; +EXPORT_SYMBOL(prot_ns_shared); + +DEFINE_STATIC_KEY_FALSE_RO(rsi_present); +EXPORT_SYMBOL(rsi_present); + +bool cc_platform_has(enum cc_attr attr) +{ + switch (attr) { + case CC_ATTR_MEM_ENCRYPT: + return is_realm_world(); + default: + return false; + } +} +EXPORT_SYMBOL_GPL(cc_platform_has); + +static bool rsi_version_matches(void) +{ + unsigned long ver_lower, ver_higher; + unsigned long ret = rsi_request_version(RSI_ABI_VERSION, + &ver_lower, + &ver_higher); + + if (ret == SMCCC_RET_NOT_SUPPORTED) + return false; + + if (ret != RSI_SUCCESS) { + pr_err("RME: RMM doesn't support RSI version %lu.%lu. Supported range: %lu.%lu-%lu.%lu\n", + RSI_ABI_VERSION_MAJOR, RSI_ABI_VERSION_MINOR, + RSI_ABI_VERSION_GET_MAJOR(ver_lower), + RSI_ABI_VERSION_GET_MINOR(ver_lower), + RSI_ABI_VERSION_GET_MAJOR(ver_higher), + RSI_ABI_VERSION_GET_MINOR(ver_higher)); + return false; + } + + pr_info("RME: Using RSI version %lu.%lu\n", + RSI_ABI_VERSION_GET_MAJOR(ver_lower), + RSI_ABI_VERSION_GET_MINOR(ver_lower)); + + return true; +} + +static void __init arm64_rsi_setup_memory(void) +{ + u64 i; + phys_addr_t start, end; + + /* + * Iterate over the available memory ranges and convert the state to + * protected memory. We should take extra care to ensure that we DO NOT + * permit any "DESTROYED" pages to be converted to "RAM". + * + * panic() is used because if the attempt to switch the memory to + * protected has failed here, then future accesses to the memory are + * simply going to be reflected as a SEA (Synchronous External Abort) + * which we can't handle. Bailing out early prevents the guest limping + * on and dying later. + */ + for_each_mem_range(i, &start, &end) { + if (rsi_set_memory_range_protected_safe(start, end)) { + panic("Failed to set memory range to protected: %pa-%pa", + &start, &end); + } + } +} + +bool __arm64_is_protected_mmio(phys_addr_t base, size_t size) +{ + enum ripas ripas; + phys_addr_t end, top; + + /* Overflow ? 
*/ + if (WARN_ON(base + size <= base)) + return false; + + end = ALIGN(base + size, RSI_GRANULE_SIZE); + base = ALIGN_DOWN(base, RSI_GRANULE_SIZE); + + while (base < end) { + if (WARN_ON(rsi_ipa_state_get(base, end, &ripas, &top))) + break; + if (WARN_ON(top <= base)) + break; + if (ripas != RSI_RIPAS_DEV) + break; + base = top; + } + + return base >= end; +} +EXPORT_SYMBOL(__arm64_is_protected_mmio); + +static int realm_ioremap_hook(phys_addr_t phys, size_t size, pgprot_t *prot) +{ + if (__arm64_is_protected_mmio(phys, size)) + *prot = pgprot_encrypted(*prot); + else + *prot = pgprot_decrypted(*prot); + + return 0; +} + +void __init arm64_rsi_init(void) +{ + if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_SMC) + return; + if (!rsi_version_matches()) + return; + if (WARN_ON(rsi_get_realm_config(&config))) + return; + prot_ns_shared = BIT(config.ipa_bits - 1); + + if (arm64_ioremap_prot_hook_register(realm_ioremap_hook)) + return; + + if (realm_register_memory_enc_ops()) + return; + + arm64_rsi_setup_memory(); + + static_branch_enable(&rsi_present); +} + +static struct platform_device rsi_dev = { + .name = RSI_PDEV_NAME, + .id = PLATFORM_DEVID_NONE +}; + +static int __init arm64_create_dummy_rsi_dev(void) +{ + if (is_realm_world() && + platform_device_register(&rsi_dev)) + pr_err("failed to register rsi platform device\n"); + return 0; +} + +arch_initcall(arm64_create_dummy_rsi_dev) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index b22d28ec8028..77c7926a4df6 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -43,6 +43,7 @@ #include <asm/cpu_ops.h> #include <asm/kasan.h> #include <asm/numa.h> +#include <asm/rsi.h> #include <asm/scs.h> #include <asm/sections.h> #include <asm/setup.h> @@ -168,19 +169,23 @@ static void __init smp_build_mpidr_hash(void) static void __init setup_machine_fdt(phys_addr_t dt_phys) { - int size; + int size = 0; void *dt_virt = fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL); const char *name; if (dt_virt) memblock_reserve(dt_phys, size); - if (!dt_virt || !early_init_dt_scan(dt_virt)) { + /* + * dt_virt is a fixmap address, hence __pa(dt_virt) can't be used. + * Pass dt_phys directly. 
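__arm64_is_protected_mmio() above widens the queried range to granule boundaries and then walks it in whatever chunks the state query reports via 'top'. A user-space sketch of that walk with the RIPAS query replaced by a callback; the 4KiB granule and all helper names are assumptions of the sketch, not ABI statements:

#include <stdbool.h>
#include <stdint.h>

#define SKETCH_GRANULE 0x1000ull    /* assume 4KiB granules for the sketch */

#define SKETCH_ALIGN_DOWN(x, a)    ((x) & ~((a) - 1))
#define SKETCH_ALIGN_UP(x, a)      (((x) + (a) - 1) & ~((a) - 1))

/* Dummy query: reports whether [base, top) is device/protected and how far it extends. */
typedef bool (*sketch_state_get_t)(uint64_t base, uint64_t end,
                                   bool *is_dev, uint64_t *top);

/* Walk [base, base + size) and require every granule to be device/protected. */
static bool sketch_range_is_protected(uint64_t base, uint64_t size,
                                      sketch_state_get_t get)
{
    uint64_t end, top;
    bool is_dev;

    if (base + size <= base)    /* overflow */
        return false;

    end  = SKETCH_ALIGN_UP(base + size, SKETCH_GRANULE);
    base = SKETCH_ALIGN_DOWN(base, SKETCH_GRANULE);

    while (base < end) {
        if (!get(base, end, &is_dev, &top) || top <= base || !is_dev)
            break;
        base = top;             /* advance by whatever the query covered */
    }

    return base >= end;
}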
+ */ + if (!early_init_dt_scan(dt_virt, dt_phys)) { pr_crit("\n" - "Error: invalid device tree blob at physical address %pa (virtual address 0x%px)\n" - "The dtb must be 8-byte aligned and must not exceed 2 MB in size\n" - "\nPlease check your bootloader.", - &dt_phys, dt_virt); + "Error: invalid device tree blob: PA=%pa, VA=%px, size=%d bytes\n" + "The dtb must be 8-byte aligned and must not exceed 2 MB in size.\n" + "\nPlease check your bootloader.\n", + &dt_phys, dt_virt, size); /* * Note that in this _really_ early stage we cannot even BUG() @@ -218,9 +223,7 @@ static void __init request_standard_resources(void) num_standard_resources = memblock.memory.cnt; res_size = num_standard_resources * sizeof(*standard_resources); - standard_resources = memblock_alloc(res_size, SMP_CACHE_BYTES); - if (!standard_resources) - panic("%s: Failed to allocate %zu bytes\n", __func__, res_size); + standard_resources = memblock_alloc_or_panic(res_size, SMP_CACHE_BYTES); for_each_mem_region(region) { res = &standard_resources[i++]; @@ -351,6 +354,8 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) else psci_acpi_init(); + arm64_rsi_init(); + init_bootcpu_ops(); smp_init_cpus(); smp_build_mpidr_hash(); diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 561986947530..417140cd399b 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -19,12 +19,14 @@ #include <linux/ratelimit.h> #include <linux/rseq.h> #include <linux/syscalls.h> +#include <linux/pkeys.h> #include <asm/daifflags.h> #include <asm/debug-monitors.h> #include <asm/elf.h> #include <asm/exception.h> #include <asm/cacheflush.h> +#include <asm/gcs.h> #include <asm/ucontext.h> #include <asm/unistd.h> #include <asm/fpsimd.h> @@ -34,6 +36,8 @@ #include <asm/traps.h> #include <asm/vdso.h> +#define GCS_SIGNAL_CAP(addr) (((unsigned long)addr) & GCS_CAP_ADDR_MASK) + /* * Do a signal return; undo the signal stack. These are aligned to 128-bit. */ @@ -42,11 +46,6 @@ struct rt_sigframe { struct ucontext uc; }; -struct frame_record { - u64 fp; - u64 lr; -}; - struct rt_sigframe_user_layout { struct rt_sigframe __user *sigframe; struct frame_record __user *next_frame; @@ -56,6 +55,7 @@ struct rt_sigframe_user_layout { unsigned long fpsimd_offset; unsigned long esr_offset; + unsigned long gcs_offset; unsigned long sve_offset; unsigned long tpidr2_offset; unsigned long za_offset; @@ -66,10 +66,62 @@ struct rt_sigframe_user_layout { unsigned long end_offset; }; -#define BASE_SIGFRAME_SIZE round_up(sizeof(struct rt_sigframe), 16) +/* + * Holds any EL0-controlled state that influences unprivileged memory accesses. + * This includes both accesses done in userspace and uaccess done in the kernel. + * + * This state needs to be carefully managed to ensure that it doesn't cause + * uaccess to fail when setting up the signal frame, and the signal handler + * itself also expects a well-defined state when entered. + */ +struct user_access_state { + u64 por_el0; +}; + #define TERMINATOR_SIZE round_up(sizeof(struct _aarch64_ctx), 16) #define EXTRA_CONTEXT_SIZE round_up(sizeof(struct extra_context), 16) +/* + * Save the user access state into ua_state and reset it to disable any + * restrictions. 
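The request_standard_resources() hunk above replaces the open-coded allocate-then-panic pair with memblock_alloc_or_panic(). In effect the helper has the shape of the code that was removed, roughly as below; this is a sketch of that shape, not the helper's actual definition:

/*
 * Allocate from memblock and treat failure as fatal, which is the only
 * sensible outcome this early in boot when there is nothing to fall
 * back to.
 */
static void *alloc_or_panic_sketch(size_t size, size_t align)
{
    void *p = memblock_alloc(size, align);

    if (!p)
        panic("%s: Failed to allocate %zu bytes\n", __func__, size);
    return p;
}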
+ */ +static void save_reset_user_access_state(struct user_access_state *ua_state) +{ + if (system_supports_poe()) { + u64 por_enable_all = 0; + + for (int pkey = 0; pkey < arch_max_pkey(); pkey++) + por_enable_all |= POR_ELx_PERM_PREP(pkey, POE_RWX); + + ua_state->por_el0 = read_sysreg_s(SYS_POR_EL0); + write_sysreg_s(por_enable_all, SYS_POR_EL0); + /* Ensure that any subsequent uaccess observes the updated value */ + isb(); + } +} + +/* + * Set the user access state for invoking the signal handler. + * + * No uaccess should be done after that function is called. + */ +static void set_handler_user_access_state(void) +{ + if (system_supports_poe()) + write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); +} + +/* + * Restore the user access state to the values saved in ua_state. + * + * No uaccess should be done after that function is called. + */ +static void restore_user_access_state(const struct user_access_state *ua_state) +{ + if (system_supports_poe()) + write_sysreg_s(ua_state->por_el0, SYS_POR_EL0); +} + static void init_user_layout(struct rt_sigframe_user_layout *user) { const size_t reserved_size = @@ -188,6 +240,8 @@ struct user_ctxs { u32 fpmr_size; struct poe_context __user *poe; u32 poe_size; + struct gcs_context __user *gcs; + u32 gcs_size; }; static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) @@ -196,6 +250,8 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) ¤t->thread.uw.fpsimd_state; int err; + fpsimd_sync_from_effective_state(current); + /* copy the FP and status/control registers */ err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); __put_user_error(fpsimd->fpsr, &ctx->fpsr, err); @@ -208,37 +264,46 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) return err ? -EFAULT : 0; } -static int restore_fpsimd_context(struct user_ctxs *user) +static int read_fpsimd_context(struct user_fpsimd_state *fpsimd, + struct user_ctxs *user) { - struct user_fpsimd_state fpsimd; - int err = 0; + int err; /* check the size information */ if (user->fpsimd_size != sizeof(struct fpsimd_context)) return -EINVAL; /* copy the FP and status/control registers */ - err = __copy_from_user(fpsimd.vregs, &(user->fpsimd->vregs), - sizeof(fpsimd.vregs)); - __get_user_error(fpsimd.fpsr, &(user->fpsimd->fpsr), err); - __get_user_error(fpsimd.fpcr, &(user->fpsimd->fpcr), err); + err = __copy_from_user(fpsimd->vregs, &(user->fpsimd->vregs), + sizeof(fpsimd->vregs)); + __get_user_error(fpsimd->fpsr, &(user->fpsimd->fpsr), err); + __get_user_error(fpsimd->fpcr, &(user->fpsimd->fpcr), err); + + return err ? -EFAULT : 0; +} + +static int restore_fpsimd_context(struct user_ctxs *user) +{ + struct user_fpsimd_state fpsimd; + int err; + + err = read_fpsimd_context(&fpsimd, user); + if (err) + return err; clear_thread_flag(TIF_SVE); + current->thread.svcr &= ~SVCR_SM_MASK; current->thread.fp_type = FP_STATE_FPSIMD; /* load the hardware registers from the fpsimd_state structure */ - if (!err) - fpsimd_update_current_state(&fpsimd); - - return err ? 
-EFAULT : 0; + fpsimd_update_current_state(&fpsimd); + return 0; } static int preserve_fpmr_context(struct fpmr_context __user *ctx) { int err = 0; - current->thread.uw.fpmr = read_sysreg_s(SYS_FPMR); - __put_user_error(FPMR_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(*ctx), &ctx->head.size, err); __put_user_error(current->thread.uw.fpmr, &ctx->fpmr, err); @@ -256,23 +321,25 @@ static int restore_fpmr_context(struct user_ctxs *user) __get_user_error(fpmr, &user->fpmr->fpmr, err); if (!err) - write_sysreg_s(fpmr, SYS_FPMR); + current->thread.uw.fpmr = fpmr; return err; } -static int preserve_poe_context(struct poe_context __user *ctx) +static int preserve_poe_context(struct poe_context __user *ctx, + const struct user_access_state *ua_state) { int err = 0; __put_user_error(POE_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(*ctx), &ctx->head.size, err); - __put_user_error(read_sysreg_s(SYS_POR_EL0), &ctx->por_el0, err); + __put_user_error(ua_state->por_el0, &ctx->por_el0, err); return err; } -static int restore_poe_context(struct user_ctxs *user) +static int restore_poe_context(struct user_ctxs *user, + struct user_access_state *ua_state) { u64 por_el0; int err = 0; @@ -282,7 +349,7 @@ static int restore_poe_context(struct user_ctxs *user) __get_user_error(por_el0, &(user->poe->por_el0), err); if (!err) - write_sysreg_s(por_el0, SYS_POR_EL0); + ua_state->por_el0 = por_el0; return err; } @@ -316,11 +383,6 @@ static int preserve_sve_context(struct sve_context __user *ctx) err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); if (vq) { - /* - * This assumes that the SVE state has already been saved to - * the task struct by calling the function - * fpsimd_signal_preserve_current_state(). - */ err |= __copy_to_user((char __user *)ctx + SVE_SIG_REGS_OFFSET, current->thread.sve_state, SVE_SIG_REGS_SIZE(vq)); @@ -335,6 +397,7 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) unsigned int vl, vq; struct user_fpsimd_state fpsimd; u16 user_vl, flags; + bool sm; if (user->sve_size < sizeof(*user->sve)) return -EINVAL; @@ -344,7 +407,8 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) if (err) return err; - if (flags & SVE_SIG_FLAG_SM) { + sm = flags & SVE_SIG_FLAG_SM; + if (sm) { if (!system_supports_sme()) return -EINVAL; @@ -364,28 +428,23 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) if (user_vl != vl) return -EINVAL; - if (user->sve_size == sizeof(*user->sve)) { - clear_thread_flag(TIF_SVE); - current->thread.svcr &= ~SVCR_SM_MASK; - current->thread.fp_type = FP_STATE_FPSIMD; - goto fpsimd_only; - } + /* + * Non-streaming SVE state may be preserved without an SVE payload, in + * which case the SVE context only has a header with VL==0, and all + * state can be restored from the FPSIMD context. + * + * Streaming SVE state is always preserved with an SVE payload. For + * consistency and robustness, reject restoring streaming SVE state + * without an SVE payload. + */ + if (!sm && user->sve_size == sizeof(*user->sve)) + return restore_fpsimd_context(user); vq = sve_vq_from_vl(vl); if (user->sve_size < SVE_SIG_CONTEXT_SIZE(vq)) return -EINVAL; - /* - * Careful: we are about __copy_from_user() directly into - * thread.sve_state with preemption enabled, so protection is - * needed to prevent a racing context switch from writing stale - * registers back over the new data. 
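Each optional record written by the preserve_*() helpers above starts with the same two-word header of magic and size, which is what lets the sigframe parser walk records generically and stop at the zero terminator. A stand-alone sketch of walking such a chain; the struct here is simplified, the real layout lives in the uapi sigcontext header:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same shape as struct _aarch64_ctx: every record begins with this header. */
struct sketch_ctx_head {
    uint32_t magic;
    uint32_t size;    /* total record size in bytes, header included */
};

/* Walk a buffer of records until the zero terminator or the end of the buffer. */
static void sketch_walk_records(const uint8_t *buf, size_t len)
{
    size_t off = 0;

    while (off + sizeof(struct sketch_ctx_head) <= len) {
        struct sketch_ctx_head head;

        memcpy(&head, buf + off, sizeof(head));
        if (head.magic == 0 && head.size == 0)
            break;    /* terminator record */
        if (head.size < sizeof(head) || off + head.size > len)
            break;    /* malformed: bail out */

        printf("record magic %#x, %u bytes\n", head.magic, head.size);
        off += head.size;
    }
}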
- */ - - fpsimd_flush_task_state(current); - /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ - sve_alloc(current, true); if (!current->thread.sve_state) { clear_thread_flag(TIF_SVE); @@ -405,19 +464,14 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) set_thread_flag(TIF_SVE); current->thread.fp_type = FP_STATE_SVE; -fpsimd_only: - /* copy the FP and status/control registers */ - /* restore_sigframe() already checked that user->fpsimd != NULL. */ - err = __copy_from_user(fpsimd.vregs, user->fpsimd->vregs, - sizeof(fpsimd.vregs)); - __get_user_error(fpsimd.fpsr, &user->fpsimd->fpsr, err); - __get_user_error(fpsimd.fpcr, &user->fpsimd->fpcr, err); + err = read_fpsimd_context(&fpsimd, user); + if (err) + return err; - /* load the hardware registers from the fpsimd_state structure */ - if (!err) - fpsimd_update_current_state(&fpsimd); + /* Merge the FPSIMD registers into the SVE state */ + fpsimd_update_current_state(&fpsimd); - return err ? -EFAULT : 0; + return 0; } #else /* ! CONFIG_ARM64_SVE */ @@ -437,13 +491,12 @@ extern int preserve_sve_context(void __user *ctx); static int preserve_tpidr2_context(struct tpidr2_context __user *ctx) { + u64 tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); int err = 0; - current->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); - __put_user_error(TPIDR2_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(*ctx), &ctx->head.size, err); - __put_user_error(current->thread.tpidr2_el0, &ctx->tpidr2, err); + __put_user_error(tpidr2_el0, &ctx->tpidr2, err); return err; } @@ -485,11 +538,6 @@ static int preserve_za_context(struct za_context __user *ctx) err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); if (vq) { - /* - * This assumes that the ZA state has already been saved to - * the task struct by calling the function - * fpsimd_signal_preserve_current_state(). - */ err |= __copy_to_user((char __user *)ctx + ZA_SIG_REGS_OFFSET, current->thread.sme_state, ZA_SIG_REGS_SIZE(vq)); @@ -524,16 +572,6 @@ static int restore_za_context(struct user_ctxs *user) if (user->za_size < ZA_SIG_CONTEXT_SIZE(vq)) return -EINVAL; - /* - * Careful: we are about __copy_from_user() directly into - * thread.sme_state with preemption enabled, so protection is - * needed to prevent a racing context switch from writing stale - * registers back over the new data. - */ - - fpsimd_flush_task_state(current); - /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ - sme_alloc(current, true); if (!current->thread.sme_state) { current->thread.svcr &= ~SVCR_ZA_MASK; @@ -571,11 +609,6 @@ static int preserve_zt_context(struct zt_context __user *ctx) BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); - /* - * This assumes that the ZT state has already been saved to - * the task struct by calling the function - * fpsimd_signal_preserve_current_state(). - */ err |= __copy_to_user((char __user *)ctx + ZT_SIG_REGS_OFFSET, thread_zt_state(¤t->thread), ZT_SIG_REGS_SIZE(1)); @@ -601,16 +634,6 @@ static int restore_zt_context(struct user_ctxs *user) if (nregs != 1) return -EINVAL; - /* - * Careful: we are about __copy_from_user() directly into - * thread.zt_state with preemption enabled, so protection is - * needed to prevent a racing context switch from writing stale - * registers back over the new data. 
- */ - - fpsimd_flush_task_state(current); - /* From now, fpsimd_thread_switch() won't touch ZT in thread state */ - err = __copy_from_user(thread_zt_state(&current->thread), (char __user const *)user->zt + ZT_SIG_REGS_OFFSET, @@ -633,6 +656,82 @@ extern int restore_zt_context(struct user_ctxs *user); #endif /* ! CONFIG_ARM64_SME */ +#ifdef CONFIG_ARM64_GCS + +static int preserve_gcs_context(struct gcs_context __user *ctx) +{ + int err = 0; + u64 gcspr = read_sysreg_s(SYS_GCSPR_EL0); + + /* + * If GCS is enabled we will add a cap token to the frame, + * include it in the GCSPR_EL0 we report to support stack + * switching via sigreturn if GCS is enabled. We do not allow + * enabling via sigreturn so the token is only relevant for + * threads with GCS enabled. + */ + if (task_gcs_el0_enabled(current)) + gcspr -= 8; + + __put_user_error(GCS_MAGIC, &ctx->head.magic, err); + __put_user_error(sizeof(*ctx), &ctx->head.size, err); + __put_user_error(gcspr, &ctx->gcspr, err); + __put_user_error(0, &ctx->reserved, err); + __put_user_error(current->thread.gcs_el0_mode, + &ctx->features_enabled, err); + + return err; +} + +static int restore_gcs_context(struct user_ctxs *user) +{ + u64 gcspr, enabled; + int err = 0; + + if (user->gcs_size != sizeof(*user->gcs)) + return -EINVAL; + + __get_user_error(gcspr, &user->gcs->gcspr, err); + __get_user_error(enabled, &user->gcs->features_enabled, err); + if (err) + return err; + + /* Don't allow unknown modes */ + if (enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK) + return -EINVAL; + + err = gcs_check_locked(current, enabled); + if (err != 0) + return err; + + /* Don't allow enabling */ + if (!task_gcs_el0_enabled(current) && + (enabled & PR_SHADOW_STACK_ENABLE)) + return -EINVAL; + + /* If we are disabling disable everything */ + if (!(enabled & PR_SHADOW_STACK_ENABLE)) + enabled = 0; + + current->thread.gcs_el0_mode = enabled; + + /* + * We let userspace set GCSPR_EL0 to anything here, we will + * validate later in gcs_restore_signal(). + */ + write_sysreg_s(gcspr, SYS_GCSPR_EL0); + + return 0; +} + +#else /* ! CONFIG_ARM64_GCS */ + +/* Turn any non-optimised out attempts to use these into a link error: */ +extern int preserve_gcs_context(void __user *ctx); +extern int restore_gcs_context(struct user_ctxs *user); + +#endif /* !
CONFIG_ARM64_GCS */ + static int parse_user_sigframe(struct user_ctxs *user, struct rt_sigframe __user *sf) { @@ -651,6 +750,7 @@ static int parse_user_sigframe(struct user_ctxs *user, user->zt = NULL; user->fpmr = NULL; user->poe = NULL; + user->gcs = NULL; if (!IS_ALIGNED((unsigned long)base, 16)) goto invalid; @@ -767,6 +867,17 @@ static int parse_user_sigframe(struct user_ctxs *user, user->fpmr_size = size; break; + case GCS_MAGIC: + if (!system_supports_gcs()) + goto invalid; + + if (user->gcs) + goto invalid; + + user->gcs = (struct gcs_context __user *)head; + user->gcs_size = size; + break; + case EXTRA_MAGIC: if (have_extra_context) goto invalid; @@ -850,7 +961,8 @@ invalid: } static int restore_sigframe(struct pt_regs *regs, - struct rt_sigframe __user *sf) + struct rt_sigframe __user *sf, + struct user_access_state *ua_state) { sigset_t set; int i, err; @@ -872,6 +984,8 @@ static int restore_sigframe(struct pt_regs *regs, */ forget_syscall(regs); + fpsimd_save_and_flush_current_state(); + err |= !valid_user_regs(&regs->user_regs, current); if (err == 0) err = parse_user_sigframe(&user, sf); @@ -886,6 +1000,9 @@ static int restore_sigframe(struct pt_regs *regs, err = restore_fpsimd_context(&user); } + if (err == 0 && system_supports_gcs() && user.gcs) + err = restore_gcs_context(&user); + if (err == 0 && system_supports_tpidr2() && user.tpidr2) err = restore_tpidr2_context(&user); @@ -899,15 +1016,68 @@ static int restore_sigframe(struct pt_regs *regs, err = restore_zt_context(&user); if (err == 0 && system_supports_poe() && user.poe) - err = restore_poe_context(&user); + err = restore_poe_context(&user, ua_state); return err; } +#ifdef CONFIG_ARM64_GCS +static int gcs_restore_signal(void) +{ + u64 gcspr_el0, cap; + int ret; + + if (!system_supports_gcs()) + return 0; + + if (!(current->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE)) + return 0; + + gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); + + /* + * Ensure that any changes to the GCS done via GCS operations + * are visible to the normal reads we do to validate the + * token. + */ + gcsb_dsync(); + + /* + * GCSPR_EL0 should be pointing at a capped GCS, read the cap. + * We don't enforce that this is in a GCS page, if it is not + * then faults will be generated on GCS operations - the main + * concern is to protect GCS pages. + */ + ret = copy_from_user(&cap, (unsigned long __user *)gcspr_el0, + sizeof(cap)); + if (ret) + return -EFAULT; + + /* + * Check that the cap is the actual GCS before replacing it.
+ */ + if (cap != GCS_SIGNAL_CAP(gcspr_el0)) + return -EINVAL; + + /* Invalidate the token to prevent reuse */ + put_user_gcs(0, (unsigned long __user *)gcspr_el0, &ret); + if (ret != 0) + return -EFAULT; + + write_sysreg_s(gcspr_el0 + 8, SYS_GCSPR_EL0); + + return 0; +} + +#else +static int gcs_restore_signal(void) { return 0; } +#endif + SYSCALL_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; + struct user_access_state ua_state; /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; @@ -924,12 +1094,17 @@ SYSCALL_DEFINE0(rt_sigreturn) if (!access_ok(frame, sizeof (*frame))) goto badframe; - if (restore_sigframe(regs, frame)) + if (restore_sigframe(regs, frame, &ua_state)) + goto badframe; + + if (gcs_restore_signal()) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) goto badframe; + restore_user_access_state(&ua_state); + return regs->regs[0]; badframe: @@ -964,6 +1139,15 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, return err; } +#ifdef CONFIG_ARM64_GCS + if (system_supports_gcs() && (add_all || current->thread.gcspr_el0)) { + err = sigframe_alloc(user, &user->gcs_offset, + sizeof(struct gcs_context)); + if (err) + return err; + } +#endif + if (system_supports_sve() || system_supports_sme()) { unsigned int vq = 0; @@ -1035,7 +1219,8 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, } static int setup_sigframe(struct rt_sigframe_user_layout *user, - struct pt_regs *regs, sigset_t *set) + struct pt_regs *regs, sigset_t *set, + const struct user_access_state *ua_state) { int i, err = 0; struct rt_sigframe __user *sf = user->sigframe; @@ -1071,6 +1256,12 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, __put_user_error(current->thread.fault_code, &esr_ctx->esr, err); } + if (system_supports_gcs() && err == 0 && user->gcs_offset) { + struct gcs_context __user *gcs_ctx = + apply_user_offset(user, user->gcs_offset); + err |= preserve_gcs_context(gcs_ctx); + } + /* Scalable Vector Extension state (including streaming), if present */ if ((system_supports_sve() || system_supports_sme()) && err == 0 && user->sve_offset) { @@ -1093,14 +1284,13 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, err |= preserve_fpmr_context(fpmr_ctx); } - if (system_supports_poe() && err == 0 && user->poe_offset) { + if (system_supports_poe() && err == 0) { struct poe_context __user *poe_ctx = apply_user_offset(user, user->poe_offset); - err |= preserve_poe_context(poe_ctx); + err |= preserve_poe_context(poe_ctx, ua_state); } - /* ZA state if present */ if (system_supports_sme() && err == 0 && user->za_offset) { struct za_context __user *za_ctx = @@ -1189,15 +1379,81 @@ static int get_sigframe(struct rt_sigframe_user_layout *user, return 0; } -static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, +#ifdef CONFIG_ARM64_GCS + +static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) +{ + u64 gcspr_el0; + int ret = 0; + + if (!system_supports_gcs()) + return 0; + + if (!task_gcs_el0_enabled(current)) + return 0; + + /* + * We are entering a signal handler, current register state is + * active. + */ + gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); + + /* + * Push a cap and the GCS entry for the trampoline onto the GCS. 
+ */ + put_user_gcs((unsigned long)sigtramp, + (unsigned long __user *)(gcspr_el0 - 16), &ret); + put_user_gcs(GCS_SIGNAL_CAP(gcspr_el0 - 8), + (unsigned long __user *)(gcspr_el0 - 8), &ret); + if (ret != 0) + return ret; + + gcspr_el0 -= 16; + write_sysreg_s(gcspr_el0, SYS_GCSPR_EL0); + + return 0; +} +#else + +static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) +{ + return 0; +} + +#endif + +static int setup_return(struct pt_regs *regs, struct ksignal *ksig, struct rt_sigframe_user_layout *user, int usig) { __sigrestore_t sigtramp; + int err; + + if (ksig->ka.sa.sa_flags & SA_RESTORER) + sigtramp = ksig->ka.sa.sa_restorer; + else + sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp); + + err = gcs_signal_entry(sigtramp, ksig); + if (err) + return err; + + /* + * We must not fail from this point onwards. We are going to update + * registers, including SP, in order to invoke the signal handler. If + * we failed and attempted to deliver a nested SIGSEGV to a handler + * after that point, the subsequent sigreturn would end up restoring + * the (partial) state for the original signal handler. + */ regs->regs[0] = usig; + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + regs->regs[1] = (unsigned long)&user->sigframe->info; + regs->regs[2] = (unsigned long)&user->sigframe->uc; + } regs->sp = (unsigned long)user->sigframe; regs->regs[29] = (unsigned long)&user->next_frame->fp; - regs->pc = (unsigned long)ka->sa.sa_handler; + regs->regs[30] = (unsigned long)sigtramp; + regs->pc = (unsigned long)ksig->ka.sa.sa_handler; /* * Signal delivery is a (wacky) indirect function call in @@ -1220,32 +1476,12 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, /* Signal handlers are invoked with ZA and streaming mode disabled */ if (system_supports_sme()) { - /* - * If we were in streaming mode the saved register - * state was SVE but we will exit SM and use the - * FPSIMD register state - flush the saved FPSIMD - * register state in case it gets loaded. 
- */ - if (current->thread.svcr & SVCR_SM_MASK) { - memset(&current->thread.uw.fpsimd_state, 0, - sizeof(current->thread.uw.fpsimd_state)); - current->thread.fp_type = FP_STATE_FPSIMD; - } - - current->thread.svcr &= ~(SVCR_ZA_MASK | - SVCR_SM_MASK); - sme_smstop(); + task_smstop_sm(current); + current->thread.svcr &= ~SVCR_ZA_MASK; + write_sysreg_s(0, SYS_TPIDR2_EL0); } - if (system_supports_poe()) - write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); - - if (ka->sa.sa_flags & SA_RESTORER) - sigtramp = ka->sa.sa_restorer; - else - sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp); - - regs->regs[30] = (unsigned long)sigtramp; + return 0; } static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, @@ -1253,28 +1489,37 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, { struct rt_sigframe_user_layout user; struct rt_sigframe __user *frame; + struct user_access_state ua_state; int err = 0; - fpsimd_signal_preserve_current_state(); + fpsimd_save_and_flush_current_state(); if (get_sigframe(&user, ksig, regs)) return 1; + save_reset_user_access_state(&ua_state); frame = user.sigframe; __put_user_error(0, &frame->uc.uc_flags, err); __put_user_error(NULL, &frame->uc.uc_link, err); err |= __save_altstack(&frame->uc.uc_stack, regs->sp); - err |= setup_sigframe(&user, regs, set); - if (err == 0) { - setup_return(regs, &ksig->ka, &user, usig); - if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - err |= copy_siginfo_to_user(&frame->info, &ksig->info); - regs->regs[1] = (unsigned long)&frame->info; - regs->regs[2] = (unsigned long)&frame->uc; - } - } + err |= setup_sigframe(&user, regs, set, &ua_state); + if (ksig->ka.sa.sa_flags & SA_SIGINFO) + err |= copy_siginfo_to_user(&frame->info, &ksig->info); + + if (err == 0) + err = setup_return(regs, ksig, &user, usig); + + /* + * We must not fail if setup_return() succeeded - see comment at the + * beginning of setup_return(). + */ + + if (err == 0) + set_handler_user_access_state(); + else + restore_user_access_state(&ua_state); return err; } diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index 81e798b6dada..bb3b526ff43f 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -103,7 +103,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame) * Note that this also saves V16-31, which aren't visible * in AArch32. */ - fpsimd_signal_preserve_current_state(); + fpsimd_save_and_flush_current_state(); /* Place structure header on the stack */ __put_user_error(magic, &frame->magic, err); @@ -169,14 +169,17 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame) fpsimd.fpsr = fpscr & VFP_FPSCR_STAT_MASK; fpsimd.fpcr = fpscr & VFP_FPSCR_CTRL_MASK; + if (err) + return -EFAULT; + /* * We don't need to touch the exception register, so * reload the hardware state. */ - if (!err) - fpsimd_update_current_state(&fpsimd); + fpsimd_save_and_flush_current_state(); + current->thread.uw.fpsimd_state = fpsimd; - return err ?
-EFAULT : 0; + return 0; } static int compat_restore_sigframe(struct pt_regs *regs, diff --git a/arch/arm64/kernel/smccc-call.S b/arch/arm64/kernel/smccc-call.S index 487381164ff6..2def9d0dd3dd 100644 --- a/arch/arm64/kernel/smccc-call.S +++ b/arch/arm64/kernel/smccc-call.S @@ -7,48 +7,19 @@ #include <asm/asm-offsets.h> #include <asm/assembler.h> -#include <asm/thread_info.h> - -/* - * If we have SMCCC v1.3 and (as is likely) no SVE state in - * the registers then set the SMCCC hint bit to say there's no - * need to preserve it. Do this by directly adjusting the SMCCC - * function value which is already stored in x0 ready to be called. - */ -SYM_FUNC_START(__arm_smccc_sve_check) - - ldr_l x16, smccc_has_sve_hint - cbz x16, 2f - - get_current_task x16 - ldr x16, [x16, #TSK_TI_FLAGS] - tbnz x16, #TIF_FOREIGN_FPSTATE, 1f // Any live FP state? - tbnz x16, #TIF_SVE, 2f // Does that state include SVE? - -1: orr x0, x0, ARM_SMCCC_1_3_SVE_HINT - -2: ret -SYM_FUNC_END(__arm_smccc_sve_check) -EXPORT_SYMBOL(__arm_smccc_sve_check) .macro SMCCC instr - stp x29, x30, [sp, #-16]! - mov x29, sp -alternative_if ARM64_SVE - bl __arm_smccc_sve_check -alternative_else_nop_endif \instr #0 - ldr x4, [sp, #16] + ldr x4, [sp] stp x0, x1, [x4, #ARM_SMCCC_RES_X0_OFFS] stp x2, x3, [x4, #ARM_SMCCC_RES_X2_OFFS] - ldr x4, [sp, #24] + ldr x4, [sp, #8] cbz x4, 1f /* no quirk structure */ ldr x9, [x4, #ARM_SMCCC_QUIRK_ID_OFFS] cmp x9, #ARM_SMCCC_QUIRK_QCOM_A6 b.ne 1f str x6, [x4, ARM_SMCCC_QUIRK_STATE_OFFS] -1: ldp x29, x30, [sp], #16 - ret +1: ret .endm /* diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 2729faaee4b4..1d9d51d7627f 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -20,6 +20,22 @@ #include <asm/stack_pointer.h> #include <asm/stacktrace.h> +enum kunwind_source { + KUNWIND_SOURCE_UNKNOWN, + KUNWIND_SOURCE_FRAME, + KUNWIND_SOURCE_CALLER, + KUNWIND_SOURCE_TASK, + KUNWIND_SOURCE_REGS_PC, +}; + +union unwind_flags { + unsigned long all; + struct { + unsigned long fgraph : 1, + kretprobe : 1; + }; +}; + /* * Kernel unwind state * @@ -37,6 +53,9 @@ struct kunwind_state { #ifdef CONFIG_KRETPROBES struct llist_node *kr_cur; #endif + enum kunwind_source source; + union unwind_flags flags; + struct pt_regs *regs; }; static __always_inline void @@ -45,6 +64,9 @@ kunwind_init(struct kunwind_state *state, { unwind_init_common(&state->common); state->task = task; + state->source = KUNWIND_SOURCE_UNKNOWN; + state->flags.all = 0; + state->regs = NULL; } /* @@ -60,8 +82,10 @@ kunwind_init_from_regs(struct kunwind_state *state, { kunwind_init(state, current); + state->regs = regs; state->common.fp = regs->regs[29]; state->common.pc = regs->pc; + state->source = KUNWIND_SOURCE_REGS_PC; } /* @@ -79,6 +103,7 @@ kunwind_init_from_caller(struct kunwind_state *state) state->common.fp = (unsigned long)__builtin_frame_address(1); state->common.pc = (unsigned long)__builtin_return_address(0); + state->source = KUNWIND_SOURCE_CALLER; } /* @@ -99,6 +124,7 @@ kunwind_init_from_task(struct kunwind_state *state, state->common.fp = thread_saved_fp(task); state->common.pc = thread_saved_pc(task); + state->source = KUNWIND_SOURCE_TASK; } static __always_inline int @@ -111,9 +137,12 @@ kunwind_recover_return_address(struct kunwind_state *state) orig_pc = ftrace_graph_ret_addr(state->task, &state->graph_idx, state->common.pc, (void *)state->common.fp); - if (WARN_ON_ONCE(state->common.pc == orig_pc)) + if (state->common.pc == orig_pc) { + WARN_ON_ONCE(state->task == current); 
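The stacktrace.c hunk starting above introduces enum kunwind_source and union unwind_flags so each unwind step records where its PC/FP came from and whether the return address was rewritten by the function-graph tracer or a kretprobe; dump_backtrace_entry() later prints these as short suffixes such as (P) or (FK). Below is a small self-contained C model of that tagging and formatting, assuming nothing beyond standard C11; it mirrors the data layout and output shape only and is not the kernel code.

#include <stdio.h>

enum unwind_source_model {		/* mirrors enum kunwind_source */
	SRC_UNKNOWN,
	SRC_FRAME,
	SRC_CALLER,
	SRC_TASK,
	SRC_REGS_PC,
};

union unwind_flags_model {		/* mirrors union unwind_flags */
	unsigned long all;
	struct {
		unsigned long fgraph : 1,
			      kretprobe : 1;
	};
};

static const char *source_str(enum unwind_source_model s)
{
	switch (s) {
	case SRC_FRAME:   return NULL;	/* the common case gets no marker */
	case SRC_CALLER:  return "C";
	case SRC_TASK:    return "T";
	case SRC_REGS_PC: return "P";
	default:          return "U";
	}
}

static void print_entry(unsigned long long pc, enum unwind_source_model src,
			union unwind_flags_model flags)
{
	const char *s = source_str(src);
	int has_info = s || flags.all;

	printf(" %#llx%s%s%s%s%s\n", pc,
	       has_info ? " (" : "",
	       s ? s : "",
	       flags.fgraph ? "F" : "",
	       flags.kretprobe ? "K" : "",
	       has_info ? ")" : "");
}

int main(void)
{
	union unwind_flags_model f = { .all = 0 };

	print_entry(0xffff800080010000ULL, SRC_REGS_PC, f);
	f.fgraph = 1;
	print_entry(0xffff800080020000ULL, SRC_FRAME, f);
	return 0;
}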
return -EINVAL; + } state->common.pc = orig_pc; + state->flags.fgraph = 1; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -124,12 +153,95 @@ kunwind_recover_return_address(struct kunwind_state *state) (void *)state->common.fp, &state->kr_cur); state->common.pc = orig_pc; + state->flags.kretprobe = 1; } #endif /* CONFIG_KRETPROBES */ return 0; } +static __always_inline +int kunwind_next_regs_pc(struct kunwind_state *state) +{ + struct stack_info *info; + unsigned long fp = state->common.fp; + struct pt_regs *regs; + + regs = container_of((u64 *)fp, struct pt_regs, stackframe.record.fp); + + info = unwind_find_stack(&state->common, (unsigned long)regs, sizeof(*regs)); + if (!info) + return -EINVAL; + + unwind_consume_stack(&state->common, info, (unsigned long)regs, + sizeof(*regs)); + + state->regs = regs; + state->common.pc = regs->pc; + state->common.fp = regs->regs[29]; + state->regs = NULL; + state->source = KUNWIND_SOURCE_REGS_PC; + return 0; +} + +static __always_inline int +kunwind_next_frame_record_meta(struct kunwind_state *state) +{ + struct task_struct *tsk = state->task; + unsigned long fp = state->common.fp; + struct frame_record_meta *meta; + struct stack_info *info; + + info = unwind_find_stack(&state->common, fp, sizeof(*meta)); + if (!info) + return -EINVAL; + + meta = (struct frame_record_meta *)fp; + switch (READ_ONCE(meta->type)) { + case FRAME_META_TYPE_FINAL: + if (meta == &task_pt_regs(tsk)->stackframe) + return -ENOENT; + WARN_ON_ONCE(tsk == current); + return -EINVAL; + case FRAME_META_TYPE_PT_REGS: + return kunwind_next_regs_pc(state); + default: + WARN_ON_ONCE(tsk == current); + return -EINVAL; + } +} + +static __always_inline int +kunwind_next_frame_record(struct kunwind_state *state) +{ + unsigned long fp = state->common.fp; + struct frame_record *record; + struct stack_info *info; + unsigned long new_fp, new_pc; + + if (fp & 0x7) + return -EINVAL; + + info = unwind_find_stack(&state->common, fp, sizeof(*record)); + if (!info) + return -EINVAL; + + record = (struct frame_record *)fp; + new_fp = READ_ONCE(record->fp); + new_pc = READ_ONCE(record->lr); + + if (!new_fp && !new_pc) + return kunwind_next_frame_record_meta(state); + + unwind_consume_stack(&state->common, info, fp, sizeof(*record)); + + state->common.fp = new_fp; + state->common.pc = new_pc; + state->source = KUNWIND_SOURCE_FRAME; + + return 0; +} + /* * Unwind from one frame record (A) to the next frame record (B). 
* @@ -140,15 +252,21 @@ kunwind_recover_return_address(struct kunwind_state *state) static __always_inline int kunwind_next(struct kunwind_state *state) { - struct task_struct *tsk = state->task; - unsigned long fp = state->common.fp; int err; - /* Final frame; nothing to unwind */ - if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) - return -ENOENT; + state->flags.all = 0; + + switch (state->source) { + case KUNWIND_SOURCE_FRAME: + case KUNWIND_SOURCE_CALLER: + case KUNWIND_SOURCE_TASK: + case KUNWIND_SOURCE_REGS_PC: + err = kunwind_next_frame_record(state); + break; + default: + err = -EINVAL; + } - err = unwind_next_frame_record(&state->common); if (err) return err; @@ -294,10 +412,32 @@ noinline noinstr void arch_bpf_stack_walk(bool (*consume_entry)(void *cookie, u6 kunwind_stack_walk(arch_bpf_unwind_consume_entry, &data, current, NULL); } -static bool dump_backtrace_entry(void *arg, unsigned long where) +static const char *state_source_string(const struct kunwind_state *state) { + switch (state->source) { + case KUNWIND_SOURCE_FRAME: return NULL; + case KUNWIND_SOURCE_CALLER: return "C"; + case KUNWIND_SOURCE_TASK: return "T"; + case KUNWIND_SOURCE_REGS_PC: return "P"; + default: return "U"; + } +} + +static bool dump_backtrace_entry(const struct kunwind_state *state, void *arg) +{ + const char *source = state_source_string(state); + union unwind_flags flags = state->flags; + bool has_info = source || flags.all; char *loglvl = arg; - printk("%s %pSb\n", loglvl, (void *)where); + + printk("%s %pSb%s%s%s%s%s\n", loglvl, + (void *)state->common.pc, + has_info ? " (" : "", + source ? source : "", + flags.fgraph ? "F" : "", + flags.kretprobe ? "K" : "", + has_info ? ")" : ""); + return true; } @@ -316,7 +456,7 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, return; printk("%sCall trace:\n", loglvl); - arch_stack_walk(dump_backtrace_entry, (void *)loglvl, tsk, regs); + kunwind_stack_walk(dump_backtrace_entry, (void *)loglvl, tsk, regs); put_task_stack(tsk); } diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 1a2c72f3e7f8..5d07ee85bdae 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -15,8 +15,11 @@ #include <linux/arch_topology.h> #include <linux/cacheinfo.h> #include <linux/cpufreq.h> +#include <linux/cpu_smt.h> #include <linux/init.h> #include <linux/percpu.h> +#include <linux/sched/isolation.h> +#include <linux/xarray.h> #include <asm/cpu.h> #include <asm/cputype.h> @@ -37,17 +40,28 @@ static bool __init acpi_cpu_is_threaded(int cpu) return !!is_threaded; } +struct cpu_smt_info { + unsigned int thread_num; + int core_id; +}; + /* * Propagate the topology information of the processor_topology_node tree to the * cpu_topology array. */ int __init parse_acpi_topology(void) { + unsigned int max_smt_thread_num = 1; + struct cpu_smt_info *entry; + struct xarray hetero_cpu; + unsigned long hetero_id; int cpu, topology_id; if (acpi_disabled) return 0; + xa_init(&hetero_cpu); + for_each_possible_cpu(cpu) { topology_id = find_acpi_cpu_topology(cpu, 0); if (topology_id < 0) @@ -57,6 +71,34 @@ int __init parse_acpi_topology(void) cpu_topology[cpu].thread_id = topology_id; topology_id = find_acpi_cpu_topology(cpu, 1); cpu_topology[cpu].core_id = topology_id; + + /* + * In the PPTT, CPUs below a node with the 'identical + * implementation' flag have the same number of threads. + * Count the number of threads for only one CPU (i.e. + * one core_id) among those with the same hetero_id. 
+ * See the comment of find_acpi_cpu_topology_hetero_id() + * for more details. + * + * One entry is created for each node having: + * - the 'identical implementation' flag + * - its parent not having the flag + */ + hetero_id = find_acpi_cpu_topology_hetero_id(cpu); + entry = xa_load(&hetero_cpu, hetero_id); + if (!entry) { + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + WARN_ON_ONCE(!entry); + + if (entry) { + entry->core_id = topology_id; + entry->thread_num = 1; + xa_store(&hetero_cpu, hetero_id, + entry, GFP_KERNEL); + } + } else if (entry->core_id == topology_id) { + entry->thread_num++; + } } else { cpu_topology[cpu].thread_id = -1; cpu_topology[cpu].core_id = topology_id; @@ -67,6 +109,19 @@ int __init parse_acpi_topology(void) cpu_topology[cpu].package_id = topology_id; } + /* + * This is a short loop since the number of XArray elements is the + * number of heterogeneous CPU clusters. On a homogeneous system + * there's only one entry in the XArray. + */ + xa_for_each(&hetero_cpu, hetero_id, entry) { + max_smt_thread_num = max(max_smt_thread_num, entry->thread_num); + xa_erase(&hetero_cpu, hetero_id); + kfree(entry); + } + + cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); + xa_destroy(&hetero_cpu); return 0; } #endif @@ -88,18 +143,28 @@ int __init parse_acpi_topology(void) * initialized. */ static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale) = 1UL << (2 * SCHED_CAPACITY_SHIFT); -static DEFINE_PER_CPU(u64, arch_const_cycles_prev); -static DEFINE_PER_CPU(u64, arch_core_cycles_prev); static cpumask_var_t amu_fie_cpus; +struct amu_cntr_sample { + u64 arch_const_cycles_prev; + u64 arch_core_cycles_prev; + unsigned long last_scale_update; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct amu_cntr_sample, cpu_amu_samples); + void update_freq_counters_refs(void) { - this_cpu_write(arch_core_cycles_prev, read_corecnt()); - this_cpu_write(arch_const_cycles_prev, read_constcnt()); + struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples); + + amu_sample->arch_core_cycles_prev = read_corecnt(); + amu_sample->arch_const_cycles_prev = read_constcnt(); } static inline bool freq_counters_valid(int cpu) { + struct amu_cntr_sample *amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu); + if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask)) return false; @@ -108,8 +173,8 @@ static inline bool freq_counters_valid(int cpu) return false; } - if (unlikely(!per_cpu(arch_const_cycles_prev, cpu) || - !per_cpu(arch_core_cycles_prev, cpu))) { + if (unlikely(!amu_sample->arch_const_cycles_prev || + !amu_sample->arch_core_cycles_prev)) { pr_debug("CPU%d: cycle counters are not enabled.\n", cpu); return false; } @@ -152,17 +217,22 @@ void freq_inv_set_max_ratio(int cpu, u64 max_rate) static void amu_scale_freq_tick(void) { + struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples); u64 prev_core_cnt, prev_const_cnt; u64 core_cnt, const_cnt, scale; - prev_const_cnt = this_cpu_read(arch_const_cycles_prev); - prev_core_cnt = this_cpu_read(arch_core_cycles_prev); + prev_const_cnt = amu_sample->arch_const_cycles_prev; + prev_core_cnt = amu_sample->arch_core_cycles_prev; update_freq_counters_refs(); - const_cnt = this_cpu_read(arch_const_cycles_prev); - core_cnt = this_cpu_read(arch_core_cycles_prev); + const_cnt = amu_sample->arch_const_cycles_prev; + core_cnt = amu_sample->arch_core_cycles_prev; + /* + * This should not happen unless the AMUs have been reset and the + * counter values have not been restored - unlikely + */ if (unlikely(core_cnt 
<= prev_core_cnt || const_cnt <= prev_const_cnt)) return; @@ -182,6 +252,8 @@ static void amu_scale_freq_tick(void) scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE); this_cpu_write(arch_freq_scale, (unsigned long)scale); + + amu_sample->last_scale_update = jiffies; } static struct scale_freq_data amu_sfd = { @@ -189,17 +261,114 @@ static struct scale_freq_data amu_sfd = { .set_freq_scale = amu_scale_freq_tick, }; +static __always_inline bool amu_fie_cpu_supported(unsigned int cpu) +{ + return cpumask_available(amu_fie_cpus) && + cpumask_test_cpu(cpu, amu_fie_cpus); +} + +void arch_cpu_idle_enter(void) +{ + unsigned int cpu = smp_processor_id(); + + if (!amu_fie_cpu_supported(cpu)) + return; + + /* Kick in AMU update but only if one has not happened already */ + if (housekeeping_cpu(cpu, HK_TYPE_TICK) && + time_is_before_jiffies(per_cpu(cpu_amu_samples.last_scale_update, cpu))) + amu_scale_freq_tick(); +} + +#define AMU_SAMPLE_EXP_MS 20 + +int arch_freq_get_on_cpu(int cpu) +{ + struct amu_cntr_sample *amu_sample; + unsigned int start_cpu = cpu; + unsigned long last_update; + unsigned int freq = 0; + u64 scale; + + if (!amu_fie_cpu_supported(cpu) || !arch_scale_freq_ref(cpu)) + return -EOPNOTSUPP; + + while (1) { + + amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu); + + last_update = amu_sample->last_scale_update; + + /* + * For those CPUs that are in full dynticks mode, or those that have + * not seen tick for a while, try an alternative source for the counters + * (and thus freq scale), if available, for given policy: this boils + * down to identifying an active cpu within the same freq domain, if any. + */ + if (!housekeeping_cpu(cpu, HK_TYPE_TICK) || + time_is_before_jiffies(last_update + msecs_to_jiffies(AMU_SAMPLE_EXP_MS))) { + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + int ref_cpu; + + if (!policy) + return -EINVAL; + + if (!cpumask_intersects(policy->related_cpus, + housekeeping_cpumask(HK_TYPE_TICK))) { + cpufreq_cpu_put(policy); + return -EOPNOTSUPP; + } + + for_each_cpu_wrap(ref_cpu, policy->cpus, cpu + 1) { + if (ref_cpu == start_cpu) { + /* Prevent verifying same CPU twice */ + ref_cpu = nr_cpu_ids; + break; + } + if (!idle_cpu(ref_cpu)) + break; + } + + cpufreq_cpu_put(policy); + + if (ref_cpu >= nr_cpu_ids) + /* No alternative to pull info from */ + return -EAGAIN; + + cpu = ref_cpu; + } else { + break; + } + } + /* + * Reversed computation to the one used to determine + * the arch_freq_scale value + * (see amu_scale_freq_tick for details) + */ + scale = arch_scale_freq_capacity(cpu); + freq = scale * arch_scale_freq_ref(cpu); + freq >>= SCHED_CAPACITY_SHIFT; + return freq; +} + static void amu_fie_setup(const struct cpumask *cpus) { int cpu; /* We are already set since the last insmod of cpufreq driver */ - if (unlikely(cpumask_subset(cpus, amu_fie_cpus))) + if (cpumask_available(amu_fie_cpus) && + unlikely(cpumask_subset(cpus, amu_fie_cpus))) return; - for_each_cpu(cpu, cpus) { + for_each_cpu(cpu, cpus) if (!freq_counters_valid(cpu)) return; + + if (!cpumask_available(amu_fie_cpus) && + !zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) { + WARN_ONCE(1, "Failed to allocate FIE cpumask for CPUs[%*pbl]\n", + cpumask_pr_args(cpus)); + return; } cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus); @@ -237,17 +406,8 @@ static struct notifier_block init_amu_fie_notifier = { static int __init init_amu_fie(void) { - int ret; - - if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) - return -ENOMEM; - - ret = cpufreq_register_notifier(&init_amu_fie_notifier, + return 
cpufreq_register_notifier(&init_amu_fie_notifier, CPUFREQ_POLICY_NOTIFIER); - if (ret) - free_cpumask_var(amu_fie_cpus); - - return ret; } core_initcall(init_amu_fie); diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 563cbce11126..9bfa5c944379 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -41,7 +41,7 @@ #include <asm/extable.h> #include <asm/insn.h> #include <asm/kprobes.h> -#include <asm/patching.h> +#include <asm/text-patching.h> #include <asm/traps.h> #include <asm/smp.h> #include <asm/stack_pointer.h> @@ -172,14 +172,6 @@ static void dump_kernel_instr(const char *lvl, struct pt_regs *regs) printk("%sCode: %s\n", lvl, str); } -#ifdef CONFIG_PREEMPT -#define S_PREEMPT " PREEMPT" -#elif defined(CONFIG_PREEMPT_RT) -#define S_PREEMPT " PREEMPT_RT" -#else -#define S_PREEMPT "" -#endif - #define S_SMP " SMP" static int __die(const char *str, long err, struct pt_regs *regs) @@ -187,7 +179,7 @@ static int __die(const char *str, long err, struct pt_regs *regs) static int die_counter; int ret; - pr_emerg("Internal error: %s: %016lx [#%d]" S_PREEMPT S_SMP "\n", + pr_emerg("Internal error: %s: %016lx [#%d] " S_SMP "\n", str, err, ++die_counter); /* trap and error numbers are mostly meaningless on ARM */ @@ -506,6 +498,16 @@ void do_el1_bti(struct pt_regs *regs, unsigned long esr) die("Oops - BTI", regs, esr); } +void do_el0_gcs(struct pt_regs *regs, unsigned long esr) +{ + force_signal_inject(SIGSEGV, SEGV_CPERR, regs->pc, 0); +} + +void do_el1_gcs(struct pt_regs *regs, unsigned long esr) +{ + die("Oops - GCS", regs, esr); +} + void do_el0_fpac(struct pt_regs *regs, unsigned long esr) { force_signal_inject(SIGILL, ILL_ILLOPN, regs->pc, esr); @@ -531,6 +533,13 @@ void do_el0_mops(struct pt_regs *regs, unsigned long esr) user_fastforward_single_step(current); } +void do_el1_mops(struct pt_regs *regs, unsigned long esr) +{ + arm64_mops_reset_regs(&regs->user_regs, esr); + + kernel_fastforward_single_step(regs); +} + #define __user_cache_maint(insn, address, res) \ if (address >= TASK_SIZE_MAX) { \ res = -EFAULT; \ @@ -852,6 +861,7 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_MOPS] = "MOPS", [ESR_ELx_EC_FP_EXC32] = "FP (AArch32)", [ESR_ELx_EC_FP_EXC64] = "FP (AArch64)", + [ESR_ELx_EC_GCS] = "Guarded Control Stack", [ESR_ELx_EC_SERROR] = "SError", [ESR_ELx_EC_BREAKPT_LOW] = "Breakpoint (lower EL)", [ESR_ELx_EC_BREAKPT_CUR] = "Breakpoint (current EL)", @@ -1108,7 +1118,7 @@ static struct break_hook kasan_break_hook = { #ifdef CONFIG_UBSAN_TRAP static int ubsan_handler(struct pt_regs *regs, unsigned long esr) { - die(report_ubsan_failure(regs, esr & UBSAN_BRK_MASK), regs, esr); + die(report_ubsan_failure(esr & UBSAN_BRK_MASK), regs, esr); return DBG_HOOK_HANDLED; } @@ -1135,7 +1145,7 @@ int __init early_brk64(unsigned long addr, unsigned long esr, return kasan_handler(regs, esr) != DBG_HOOK_HANDLED; #endif #ifdef CONFIG_UBSAN_TRAP - if ((esr_brk_comment(esr) & ~UBSAN_BRK_MASK) == UBSAN_BRK_IMM) + if (esr_is_ubsan_brk(esr)) return ubsan_handler(regs, esr) != DBG_HOOK_HANDLED; #endif return bug_handler(regs, esr) != DBG_HOOK_HANDLED; } diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 706c9c3a7a50..78ddf6bdecad 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -18,8 +18,7 @@ #include <linux/sched.h> #include <linux/signal.h> #include <linux/slab.h> -#include <linux/time_namespace.h> -#include <linux/timekeeper_internal.h> +#include <linux/vdso_datastore.h> #include <linux/vmalloc.h> #include <vdso/datapage.h>
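The topology.c changes above let arch_freq_get_on_cpu() report an average frequency by reversing the per-tick scale computation: the ratio of the core-cycle delta to the constant-rate-cycle delta gives a capacity-style scale, and multiplying that scale by the reference frequency recovers a frequency estimate. The sketch below shows the shape of that arithmetic with made-up constants; it deliberately omits the per-CPU maximum-frequency ratio correction applied in amu_scale_freq_tick() and is not the kernel implementation.

#include <stdint.h>
#include <stdio.h>

#define CAPACITY_SHIFT 10U		/* plays the role of SCHED_CAPACITY_SHIFT */
#define CAPACITY_SCALE (1U << CAPACITY_SHIFT)

/* scale ~= delta_core / delta_const, expressed in 1/1024 units and capped */
static uint64_t freq_scale(uint64_t delta_core, uint64_t delta_const)
{
	uint64_t scale;

	if (!delta_const)
		return CAPACITY_SCALE;

	scale = (delta_core << CAPACITY_SHIFT) / delta_const;
	return scale > CAPACITY_SCALE ? CAPACITY_SCALE : scale;
}

/* reverse step: estimated freq = scale * reference freq / 1024 */
static uint64_t freq_estimate_khz(uint64_t scale, uint64_t ref_khz)
{
	return (scale * ref_khz) >> CAPACITY_SHIFT;
}

int main(void)
{
	/* core counter advanced 1.5M cycles while the constant-rate counter
	 * advanced 2.0M, i.e. the CPU ran at ~75% of its reference rate */
	uint64_t scale = freq_scale(1500000, 2000000);

	printf("scale = %llu/1024, est. freq = %llu kHz\n",
	       (unsigned long long)scale,
	       (unsigned long long)freq_estimate_khz(scale, 2800000));
	return 0;
}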
#include <vdso/helpers.h> @@ -39,8 +38,6 @@ struct vdso_abi_info { const char *vdso_code_start; const char *vdso_code_end; unsigned long vdso_pages; - /* Data Mapping */ - struct vm_special_mapping *dm; /* Code Mapping */ struct vm_special_mapping *cm; }; @@ -60,12 +57,6 @@ static struct vdso_abi_info vdso_info[] __ro_after_init = { #endif /* CONFIG_COMPAT_VDSO */ }; -/* - * The vDSO data page. - */ -static union vdso_data_store vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = vdso_data_store.data; - static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { @@ -107,75 +98,6 @@ static int __init __vdso_init(enum vdso_abi abi) return 0; } -#ifdef CONFIG_TIME_NS -struct vdso_data *arch_get_vdso_data(void *vvar_page) -{ - return (struct vdso_data *)(vvar_page); -} - -/* - * The vvar mapping contains data for a specific time namespace, so when a task - * changes namespace we must unmap its vvar data for the old namespace. - * Subsequent faults will map in data for the new namespace. - * - * For more details see timens_setup_vdso_data(). - */ -int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) -{ - struct mm_struct *mm = task->mm; - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, mm, 0); - - mmap_read_lock(mm); - - for_each_vma(vmi, vma) { - if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA64].dm)) - zap_vma_pages(vma); -#ifdef CONFIG_COMPAT_VDSO - if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA32].dm)) - zap_vma_pages(vma); -#endif - } - - mmap_read_unlock(mm); - return 0; -} -#endif - -static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *timens_page = find_timens_vvar_page(vma); - unsigned long pfn; - - switch (vmf->pgoff) { - case VVAR_DATA_PAGE_OFFSET: - if (timens_page) - pfn = page_to_pfn(timens_page); - else - pfn = sym_to_pfn(vdso_data); - break; -#ifdef CONFIG_TIME_NS - case VVAR_TIMENS_PAGE_OFFSET: - /* - * If a task belongs to a time namespace then a namespace - * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and - * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET - * offset. - * See also the comment near timens_setup_vdso_data(). 
- */ - if (!timens_page) - return VM_FAULT_SIGBUS; - pfn = sym_to_pfn(vdso_data); - break; -#endif /* CONFIG_TIME_NS */ - default: - return VM_FAULT_SIGBUS; - } - - return vmf_insert_pfn(vma, vmf->address, pfn); -} - static int __setup_additional_pages(enum vdso_abi abi, struct mm_struct *mm, struct linux_binprm *bprm, @@ -185,11 +107,11 @@ static int __setup_additional_pages(enum vdso_abi abi, unsigned long gp_flags = 0; void *ret; - BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); + BUILD_BUG_ON(VDSO_NR_PAGES != __VDSO_PAGES); vdso_text_len = vdso_info[abi].vdso_pages << PAGE_SHIFT; /* Be sure to map the data page */ - vdso_mapping_len = vdso_text_len + VVAR_NR_PAGES * PAGE_SIZE; + vdso_mapping_len = vdso_text_len + VDSO_NR_PAGES * PAGE_SIZE; vdso_base = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); if (IS_ERR_VALUE(vdso_base)) { @@ -197,20 +119,19 @@ static int __setup_additional_pages(enum vdso_abi abi, goto up_fail; } - ret = _install_special_mapping(mm, vdso_base, VVAR_NR_PAGES * PAGE_SIZE, - VM_READ|VM_MAYREAD|VM_PFNMAP, - vdso_info[abi].dm); + ret = vdso_install_vvar_mapping(mm, vdso_base); if (IS_ERR(ret)) goto up_fail; if (system_supports_bti_kernel()) gp_flags = VM_ARM64_BTI; - vdso_base += VVAR_NR_PAGES * PAGE_SIZE; + vdso_base += VDSO_NR_PAGES * PAGE_SIZE; mm->context.vdso = (void *)vdso_base; ret = _install_special_mapping(mm, vdso_base, vdso_text_len, VM_READ|VM_EXEC|gp_flags| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| + VM_SEALED_SYSMAP, vdso_info[abi].cm); if (IS_ERR(ret)) goto up_fail; @@ -229,7 +150,6 @@ up_fail: enum aarch32_map { AA32_MAP_VECTORS, /* kuser helpers */ AA32_MAP_SIGPAGE, - AA32_MAP_VVAR, AA32_MAP_VDSO, }; @@ -254,10 +174,6 @@ static struct vm_special_mapping aarch32_vdso_maps[] = { .pages = &aarch32_sig_page, .mremap = aarch32_sigpage_mremap, }, - [AA32_MAP_VVAR] = { - .name = "[vvar]", - .fault = vvar_fault, - }, [AA32_MAP_VDSO] = { .name = "[vdso]", .mremap = vdso_mremap, @@ -307,7 +223,6 @@ static int __init __aarch32_alloc_vdso_pages(void) if (!IS_ENABLED(CONFIG_COMPAT_VDSO)) return 0; - vdso_info[VDSO_ABI_AA32].dm = &aarch32_vdso_maps[AA32_MAP_VVAR]; vdso_info[VDSO_ABI_AA32].cm = &aarch32_vdso_maps[AA32_MAP_VDSO]; return __vdso_init(VDSO_ABI_AA32); @@ -342,7 +257,8 @@ static int aarch32_kuser_helpers_setup(struct mm_struct *mm) */ ret = _install_special_mapping(mm, AARCH32_VECTORS_BASE, PAGE_SIZE, VM_READ | VM_EXEC | - VM_MAYREAD | VM_MAYEXEC, + VM_MAYREAD | VM_MAYEXEC | + VM_SEALED_SYSMAP, &aarch32_vdso_maps[AA32_MAP_VECTORS]); return PTR_ERR_OR_ZERO(ret); @@ -365,7 +281,8 @@ static int aarch32_sigreturn_setup(struct mm_struct *mm) */ ret = _install_special_mapping(mm, addr, PAGE_SIZE, VM_READ | VM_EXEC | VM_MAYREAD | - VM_MAYWRITE | VM_MAYEXEC, + VM_MAYWRITE | VM_MAYEXEC | + VM_SEALED_SYSMAP, &aarch32_vdso_maps[AA32_MAP_SIGPAGE]); if (IS_ERR(ret)) goto out; @@ -402,26 +319,14 @@ out: } #endif /* CONFIG_COMPAT */ -enum aarch64_map { - AA64_MAP_VVAR, - AA64_MAP_VDSO, -}; - -static struct vm_special_mapping aarch64_vdso_maps[] __ro_after_init = { - [AA64_MAP_VVAR] = { - .name = "[vvar]", - .fault = vvar_fault, - }, - [AA64_MAP_VDSO] = { - .name = "[vdso]", - .mremap = vdso_mremap, - }, +static struct vm_special_mapping aarch64_vdso_map __ro_after_init = { + .name = "[vdso]", + .mremap = vdso_mremap, }; static int __init vdso_init(void) { - vdso_info[VDSO_ABI_AA64].dm = &aarch64_vdso_maps[AA64_MAP_VVAR]; - vdso_info[VDSO_ABI_AA64].cm = &aarch64_vdso_maps[AA64_MAP_VDSO]; + vdso_info[VDSO_ABI_AA64].cm = &aarch64_vdso_map; 
return __vdso_init(VDSO_ABI_AA64); } diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index 35685c036044..5e27e46aa496 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -7,7 +7,7 @@ # # Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include obj-vdso := vgettimeofday.o note.o sigreturn.o vgetrandom.o vgetrandom-chacha.o diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S index f204a9ddc833..52314be29191 100644 --- a/arch/arm64/kernel/vdso/vdso.lds.S +++ b/arch/arm64/kernel/vdso/vdso.lds.S @@ -20,12 +20,9 @@ OUTPUT_ARCH(aarch64) SECTIONS { - PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); - PROVIDE(_vdso_rng_data = _vdso_data + __VDSO_RND_DATA_OFFSET); -#ifdef CONFIG_TIME_NS - PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); -#endif - . = VDSO_LBASE + SIZEOF_HEADERS; + VDSO_VVAR_SYMS + + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } @@ -41,6 +38,7 @@ SECTIONS */ /DISCARD/ : { *(.note.GNU-stack .note.gnu.property) + *(.ARM.attributes) } .note : { *(.note.*) } :text :note diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index 25a2cb6317f3..f2dfdc7dc818 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -3,7 +3,7 @@ # Makefile for vdso32 # -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include # Same as cc-*option, but using CC_COMPAT instead of CC ifeq ($(CONFIG_CC_IS_CLANG), y) diff --git a/arch/arm64/kernel/vdso32/vdso.lds.S b/arch/arm64/kernel/vdso32/vdso.lds.S index 8d95d7d35057..e02b27487ce8 100644 --- a/arch/arm64/kernel/vdso32/vdso.lds.S +++ b/arch/arm64/kernel/vdso32/vdso.lds.S @@ -12,17 +12,16 @@ #include <asm/page.h> #include <asm/vdso.h> #include <asm-generic/vmlinux.lds.h> +#include <vdso/datapage.h> OUTPUT_FORMAT("elf32-littlearm", "elf32-bigarm", "elf32-littlearm") OUTPUT_ARCH(arm) SECTIONS { - PROVIDE_HIDDEN(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); -#ifdef CONFIG_TIME_NS - PROVIDE_HIDDEN(_timens_data = _vdso_data + PAGE_SIZE); -#endif - . = VDSO_LBASE + SIZEOF_HEADERS; + VDSO_VVAR_SYMS + + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 58d89d997d05..ad6133b89e7a 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -13,7 +13,7 @@ *(__kvm_ex_table) \ __stop___kvm_ex_table = .; -#define HYPERVISOR_DATA_SECTIONS \ +#define HYPERVISOR_RODATA_SECTIONS \ HYP_SECTION_NAME(.rodata) : { \ . = ALIGN(PAGE_SIZE); \ __hyp_rodata_start = .; \ @@ -23,6 +23,15 @@ __hyp_rodata_end = .; \ } +#define HYPERVISOR_DATA_SECTION \ + HYP_SECTION_NAME(.data) : { \ + . = ALIGN(PAGE_SIZE); \ + __hyp_data_start = .; \ + *(HYP_SECTION_NAME(.data)) \ + . = ALIGN(PAGE_SIZE); \ + __hyp_data_end = .; \ + } + #define HYPERVISOR_PERCPU_SECTION \ . = ALIGN(PAGE_SIZE); \ HYP_SECTION_NAME(.data..percpu) : { \ @@ -51,7 +60,8 @@ #define SBSS_ALIGN PAGE_SIZE #else /* CONFIG_KVM */ #define HYPERVISOR_EXTABLE -#define HYPERVISOR_DATA_SECTIONS +#define HYPERVISOR_RODATA_SECTIONS +#define HYPERVISOR_DATA_SECTION #define HYPERVISOR_PERCPU_SECTION #define HYPERVISOR_RELOC_SECTION #define SBSS_ALIGN 0 @@ -162,6 +172,7 @@ SECTIONS /DISCARD/ : { *(.interp .dynamic) *(.dynsym .dynstr .hash .gnu.hash) + *(.ARM.attributes) } . 
= KIMAGE_VADDR; @@ -189,7 +200,7 @@ SECTIONS /* everything from this point to __init_begin will be marked RO NX */ RO_DATA(PAGE_SIZE) - HYPERVISOR_DATA_SECTIONS + HYPERVISOR_RODATA_SECTIONS .got : { *(.got) } /* @@ -248,9 +259,9 @@ SECTIONS __inittext_end = .; __initdata_begin = .; - init_idmap_pg_dir = .; + __pi_init_idmap_pg_dir = .; . += INIT_IDMAP_DIR_SIZE; - init_idmap_pg_end = .; + __pi_init_idmap_pg_end = .; .init.data : { INIT_DATA @@ -287,10 +298,15 @@ SECTIONS __initdata_end = .; __init_end = .; + .data.rel.ro : { *(.data.rel.ro) } + ASSERT(SIZEOF(.data.rel.ro) == 0, "Unexpected RELRO detected!") + _data = .; _sdata = .; RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) + HYPERVISOR_DATA_SECTION + /* * Data written with the MMU off but read with the MMU on requires * cache lines to be invalidated, discarding up to a Cache Writeback @@ -315,11 +331,12 @@ SECTIONS /* start of zero-init region */ BSS_SECTION(SBSS_ALIGN, 0, 0) + __pi___bss_start = __bss_start; . = ALIGN(PAGE_SIZE); - init_pg_dir = .; + __pi_init_pg_dir = .; . += INIT_DIR_SIZE; - init_pg_end = .; + __pi_init_pg_end = .; /* end of zero-init region */ . += SZ_4K; /* stack for the early C runtime */ @@ -328,6 +345,7 @@ SECTIONS . = ALIGN(SEGMENT_ALIGN); __pecoff_data_size = ABSOLUTE(. - __initdata_begin); _end = .; + __pi__end = .; STABS_DEBUG DWARF_DEBUG @@ -343,9 +361,6 @@ SECTIONS *(.plt) *(.plt.*) *(.iplt) *(.igot .igot.plt) } ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!") - - .data.rel.ro : { *(.data.rel.ro) } - ASSERT(SIZEOF(.data.rel.ro) == 0, "Unexpected RELRO detected!") } #include "image-vars.h" |
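For reference, the GCS handling added to signal delivery earlier in this diff (gcs_signal_entry(), gcs_restore_signal() and preserve_gcs_context() in signal.c) follows a push/validate/consume pattern: a cap token and the trampoline address are pushed below GCSPR_EL0 on delivery, and sigreturn only proceeds if the word at the reported GCS pointer still matches the expected cap for that slot, after which the token is zeroed so it cannot be replayed. The toy model below simulates that flow with an in-memory array standing in for the guarded control stack; FAKE_CAP() is a made-up encoding, not the kernel's GCS_SIGNAL_CAP(), and the real code uses copy_from_user()/put_user_gcs() rather than plain loads and stores.

#include <stdint.h>
#include <stdio.h>

/* Made-up cap encoding: page-aligned slot address plus a token bit. */
#define FAKE_CAP(addr) (((uint64_t)(addr) & ~0xfffULL) | 0x1ULL)

static uint64_t gcs_mem[64];		/* stand-in for the guarded control stack */
static uint64_t *gcspr = &gcs_mem[32];	/* stand-in for GCSPR_EL0; grows down */

static void signal_entry(uint64_t sigtramp)
{
	/* as in gcs_signal_entry(): push the trampoline entry and a cap token */
	gcspr[-2] = sigtramp;
	gcspr[-1] = FAKE_CAP((uintptr_t)(gcspr - 1));
	gcspr -= 2;			/* now pointing at the trampoline slot */
}

static void trampoline_return(void)
{
	gcspr += 1;			/* returning through the trampoline pops it */
}

static int signal_return(void)
{
	/* as in gcs_restore_signal(): the slot at GCSPR must hold a matching cap */
	if (*gcspr != FAKE_CAP((uintptr_t)gcspr))
		return -1;

	*gcspr = 0;			/* invalidate the token to prevent reuse */
	gcspr += 1;			/* step back onto the interrupted stack */
	return 0;
}

int main(void)
{
	signal_entry(0x400800);
	trampoline_return();
	printf("sigreturn: %s\n", signal_return() == 0 ? "ok" : "rejected");
	printf("replayed sigreturn: %s\n", signal_return() == 0 ? "ok" : "rejected");
	return 0;
}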