diff options
author | Thomas Gleixner <tglx@linutronix.de> | 2020-06-11 16:17:57 +0300 |
---|---|---|
committer | Thomas Gleixner <tglx@linutronix.de> | 2020-06-11 16:17:57 +0300 |
commit | f77d26a9fc525286bcef3d4f98b52e17482cf49c (patch) | |
tree | 6b179c9aa84787773cb601a14a64255e2912154b /arch/x86/kernel/cpu | |
parent | b6bea24d41519e8c31e4798f1c1a3f67e540c5d0 (diff) | |
parent | f0178fc01fe46bab6a95415f5647d1a74efcad1b (diff) | |
download | linux-f77d26a9fc525286bcef3d4f98b52e17482cf49c.tar.xz |
Merge branch 'x86/entry' into ras/core
to fixup conflicts in arch/x86/kernel/cpu/mce/core.c so MCE specific follow
up patches can be applied without creating a horrible merge conflict
afterwards.
Diffstat (limited to 'arch/x86/kernel/cpu')
25 files changed, 501 insertions, 221 deletions
diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c index 676022e71791..1da9b1c9a2db 100644 --- a/arch/x86/kernel/cpu/acrn.c +++ b/arch/x86/kernel/cpu/acrn.c @@ -10,10 +10,10 @@ */ #include <linux/interrupt.h> -#include <asm/acrn.h> #include <asm/apic.h> #include <asm/desc.h> #include <asm/hypervisor.h> +#include <asm/idtentry.h> #include <asm/irq_regs.h> static uint32_t __init acrn_detect(void) @@ -24,7 +24,7 @@ static uint32_t __init acrn_detect(void) static void __init acrn_init_platform(void) { /* Setup the IDT for ACRN hypervisor callback */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, acrn_hv_callback_vector); + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback); } static bool acrn_x2apic_available(void) @@ -39,7 +39,7 @@ static bool acrn_x2apic_available(void) static void (*acrn_intr_handler)(void); -__visible void __irq_entry acrn_hv_vector_handler(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_acrn_hv_callback) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -50,13 +50,12 @@ __visible void __irq_entry acrn_hv_vector_handler(struct pt_regs *regs) * will block the interrupt whose vector is lower than * HYPERVISOR_CALLBACK_VECTOR. */ - entering_ack_irq(); + ack_APIC_irq(); inc_irq_stat(irq_hv_callback_count); if (acrn_intr_handler) acrn_intr_handler(); - exiting_irq(); set_irq_regs(old_regs); } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 547ad7bbf0e0..d4806eac9325 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -18,6 +18,7 @@ #include <asm/pci-direct.h> #include <asm/delay.h> #include <asm/debugreg.h> +#include <asm/resctrl.h> #ifdef CONFIG_X86_64 # include <asm/mmconfig.h> @@ -597,6 +598,8 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) x86_amd_ls_cfg_ssbd_mask = 1ULL << bit; } } + + resctrl_cpu_detect(c); } static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) @@ -1142,8 +1145,7 @@ static const int amd_erratum_383[] = /* #1054: Instructions Retired Performance Counter May Be Inaccurate */ static const int amd_erratum_1054[] = - AMD_OSVW_ERRATUM(0, AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); - + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) { diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ed54b3b21c39..b6f887be440c 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -15,6 +15,7 @@ #include <linux/nospec.h> #include <linux/prctl.h> #include <linux/sched/smt.h> +#include <linux/pgtable.h> #include <asm/spec-ctrl.h> #include <asm/cmdline.h> @@ -26,7 +27,6 @@ #include <asm/vmx.h> #include <asm/paravirt.h> #include <asm/alternative.h> -#include <asm/pgtable.h> #include <asm/set_memory.h> #include <asm/intel-family.h> #include <asm/e820/api.h> @@ -41,6 +41,7 @@ static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); static void __init mds_print_mitigation(void); static void __init taa_select_mitigation(void); +static void __init srbds_select_mitigation(void); /* The base value of the SPEC_CTRL MSR that always has to be preserved. */ u64 x86_spec_ctrl_base; @@ -108,6 +109,7 @@ void __init check_bugs(void) l1tf_select_mitigation(); mds_select_mitigation(); taa_select_mitigation(); + srbds_select_mitigation(); /* * As MDS and TAA mitigations are inter-related, print MDS @@ -398,6 +400,97 @@ static int __init tsx_async_abort_parse_cmdline(char *str) early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); #undef pr_fmt +#define pr_fmt(fmt) "SRBDS: " fmt + +enum srbds_mitigations { + SRBDS_MITIGATION_OFF, + SRBDS_MITIGATION_UCODE_NEEDED, + SRBDS_MITIGATION_FULL, + SRBDS_MITIGATION_TSX_OFF, + SRBDS_MITIGATION_HYPERVISOR, +}; + +static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL; + +static const char * const srbds_strings[] = { + [SRBDS_MITIGATION_OFF] = "Vulnerable", + [SRBDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", + [SRBDS_MITIGATION_FULL] = "Mitigation: Microcode", + [SRBDS_MITIGATION_TSX_OFF] = "Mitigation: TSX disabled", + [SRBDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status", +}; + +static bool srbds_off; + +void update_srbds_msr(void) +{ + u64 mcu_ctrl; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; + + if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED) + return; + + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + + switch (srbds_mitigation) { + case SRBDS_MITIGATION_OFF: + case SRBDS_MITIGATION_TSX_OFF: + mcu_ctrl |= RNGDS_MITG_DIS; + break; + case SRBDS_MITIGATION_FULL: + mcu_ctrl &= ~RNGDS_MITG_DIS; + break; + default: + break; + } + + wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); +} + +static void __init srbds_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return; + + /* + * Check to see if this is one of the MDS_NO systems supporting + * TSX that are only exposed to SRBDS when TSX is enabled. + */ + ia32_cap = x86_read_arch_cap_msr(); + if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM)) + srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; + else if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR; + else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) + srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED; + else if (cpu_mitigations_off() || srbds_off) + srbds_mitigation = SRBDS_MITIGATION_OFF; + + update_srbds_msr(); + pr_info("%s\n", srbds_strings[srbds_mitigation]); +} + +static int __init srbds_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return 0; + + srbds_off = !strcmp(str, "off"); + return 0; +} +early_param("srbds", srbds_parse_cmdline); + +#undef pr_fmt #define pr_fmt(fmt) "Spectre V1 : " fmt enum spectre_v1_mitigation { @@ -1528,6 +1621,11 @@ static char *ibpb_state(void) return ""; } +static ssize_t srbds_show_state(char *buf) +{ + return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -1572,6 +1670,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_ITLB_MULTIHIT: return itlb_multihit_show_state(buf); + case X86_BUG_SRBDS: + return srbds_show_state(buf); + default: break; } @@ -1618,4 +1719,9 @@ ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr { return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); } + +ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index bed0cb83fe24..043d93cdcaad 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -21,6 +21,7 @@ #include <linux/smp.h> #include <linux/io.h> #include <linux/syscore_ops.h> +#include <linux/pgtable.h> #include <asm/stackprotector.h> #include <asm/perf_event.h> @@ -35,7 +36,6 @@ #include <asm/vsyscall.h> #include <linux/topology.h> #include <linux/cpumask.h> -#include <asm/pgtable.h> #include <linux/atomic.h> #include <asm/proto.h> #include <asm/setup.h> @@ -387,7 +387,30 @@ set_register: bits_missing); } } -EXPORT_SYMBOL(native_write_cr4); +#if IS_MODULE(CONFIG_LKDTM) +EXPORT_SYMBOL_GPL(native_write_cr4); +#endif + +void cr4_update_irqsoff(unsigned long set, unsigned long clear) +{ + unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4); + + lockdep_assert_irqs_disabled(); + + newval = (cr4 & ~clear) | set; + if (newval != cr4) { + this_cpu_write(cpu_tlbstate.cr4, newval); + __write_cr4(newval); + } +} +EXPORT_SYMBOL(cr4_update_irqsoff); + +/* Read the CR4 shadow. */ +unsigned long cr4_read_shadow(void) +{ + return this_cpu_read(cpu_tlbstate.cr4); +} +EXPORT_SYMBOL_GPL(cr4_read_shadow); void cr4_init(void) { @@ -854,30 +877,6 @@ static void init_speculation_control(struct cpuinfo_x86 *c) } } -static void init_cqm(struct cpuinfo_x86 *c) -{ - if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { - c->x86_cache_max_rmid = -1; - c->x86_cache_occ_scale = -1; - return; - } - - /* will be overridden if occupancy monitoring exists */ - c->x86_cache_max_rmid = cpuid_ebx(0xf); - - if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || - cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || - cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { - u32 eax, ebx, ecx, edx; - - /* QoS sub-leaf, EAX=0Fh, ECX=1 */ - cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx); - - c->x86_cache_max_rmid = ecx; - c->x86_cache_occ_scale = ebx; - } -} - void get_cpu_cap(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; @@ -945,7 +944,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c) init_scattered_cpuid_features(c); init_speculation_control(c); - init_cqm(c); /* * Clear/Set all flags overridden by options, after probe. @@ -1075,9 +1073,30 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { {} }; -static bool __init cpu_matches(unsigned long which) +#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \ + X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \ + INTEL_FAM6_##model, steppings, \ + X86_FEATURE_ANY, issues) + +#define SRBDS BIT(0) + +static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { + VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0xC), SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0xD), SRBDS), + {} +}; + +static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long which) { - const struct x86_cpu_id *m = x86_match_cpu(cpu_vuln_whitelist); + const struct x86_cpu_id *m = x86_match_cpu(table); return m && !!(m->driver_data & which); } @@ -1097,31 +1116,34 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) u64 ia32_cap = x86_read_arch_cap_msr(); /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ - if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) && + !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); - if (cpu_matches(NO_SPECULATION)) + if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION)) return; setup_force_cpu_bug(X86_BUG_SPECTRE_V1); - if (!cpu_matches(NO_SPECTRE_V2)) + if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2)) setup_force_cpu_bug(X86_BUG_SPECTRE_V2); - if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && + if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) && + !(ia32_cap & ARCH_CAP_SSB_NO) && !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); if (ia32_cap & ARCH_CAP_IBRS_ALL) setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); - if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) { + if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) && + !(ia32_cap & ARCH_CAP_MDS_NO)) { setup_force_cpu_bug(X86_BUG_MDS); - if (cpu_matches(MSBDS_ONLY)) + if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY)) setup_force_cpu_bug(X86_BUG_MSBDS_ONLY); } - if (!cpu_matches(NO_SWAPGS)) + if (!cpu_matches(cpu_vuln_whitelist, NO_SWAPGS)) setup_force_cpu_bug(X86_BUG_SWAPGS); /* @@ -1139,7 +1161,16 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) setup_force_cpu_bug(X86_BUG_TAA); - if (cpu_matches(NO_MELTDOWN)) + /* + * SRBDS affects CPUs which support RDRAND or RDSEED and are listed + * in the vulnerability blacklist. + */ + if ((cpu_has(c, X86_FEATURE_RDRAND) || + cpu_has(c, X86_FEATURE_RDSEED)) && + cpu_matches(cpu_vuln_blacklist, SRBDS)) + setup_force_cpu_bug(X86_BUG_SRBDS); + + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; /* Rogue Data Cache Load? No! */ @@ -1148,7 +1179,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); - if (cpu_matches(NO_L1TF)) + if (cpu_matches(cpu_vuln_whitelist, NO_L1TF)) return; setup_force_cpu_bug(X86_BUG_L1TF); @@ -1377,20 +1408,6 @@ static void generic_identify(struct cpuinfo_x86 *c) #endif } -static void x86_init_cache_qos(struct cpuinfo_x86 *c) -{ - /* - * The heavy lifting of max_rmid and cache_occ_scale are handled - * in get_cpu_cap(). Here we just set the max_rmid for the boot_cpu - * in case CQM bits really aren't there in this CPU. - */ - if (c != &boot_cpu_data) { - boot_cpu_data.x86_cache_max_rmid = - min(boot_cpu_data.x86_cache_max_rmid, - c->x86_cache_max_rmid); - } -} - /* * Validate that ACPI/mptables have the same information about the * effective APIC id and update the package map. @@ -1503,7 +1520,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) #endif x86_init_rdrand(c); - x86_init_cache_qos(c); setup_pku(c); /* @@ -1591,6 +1607,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) mtrr_ap_init(); validate_apic_and_package_id(c); x86_spec_ctrl_setup_ap(); + update_srbds_msr(); } static __init int setup_noclflush(char *arg) @@ -1689,25 +1706,6 @@ void syscall_init(void) X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); } -DEFINE_PER_CPU(int, debug_stack_usage); -DEFINE_PER_CPU(u32, debug_idt_ctr); - -void debug_stack_set_zero(void) -{ - this_cpu_inc(debug_idt_ctr); - load_current_idt(); -} -NOKPROBE_SYMBOL(debug_stack_set_zero); - -void debug_stack_reset(void) -{ - if (WARN_ON(!this_cpu_read(debug_idt_ctr))) - return; - if (this_cpu_dec_return(debug_idt_ctr) == 0) - load_current_idt(); -} -NOKPROBE_SYMBOL(debug_stack_reset); - #else /* CONFIG_X86_64 */ DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 37fdefd14f28..fb538fccd24c 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -77,6 +77,7 @@ extern void detect_ht(struct cpuinfo_x86 *c); unsigned int aperfmperf_get_khz(int cpu); extern void x86_spec_ctrl_setup_ap(void); +extern void update_srbds_msr(void); extern u64 x86_read_arch_cap_msr(void); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index bf08d4508ecb..63926c94eb5f 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> +#include <linux/pgtable.h> #include <linux/string.h> #include <linux/bitops.h> @@ -11,7 +12,6 @@ #include <linux/uaccess.h> #include <asm/cpufeature.h> -#include <asm/pgtable.h> #include <asm/msr.h> #include <asm/bugs.h> #include <asm/cpu.h> @@ -22,6 +22,7 @@ #include <asm/cpu_device_id.h> #include <asm/cmdline.h> #include <asm/traps.h> +#include <asm/resctrl.h> #ifdef CONFIG_X86_64 #include <linux/topology.h> @@ -322,6 +323,11 @@ static void early_init_intel(struct cpuinfo_x86 *c) detect_ht_early(c); } +static void bsp_init_intel(struct cpuinfo_x86 *c) +{ + resctrl_cpu_detect(c); +} + #ifdef CONFIG_X86_32 /* * Early probe support logic for ppro memory erratum #50 @@ -961,6 +967,7 @@ static const struct cpu_dev intel_cpu_dev = { #endif .c_detect_tlb = intel_detect_tlb, .c_early_init = early_init_intel, + .c_bsp_init = bsp_init_intel, .c_init = init_intel, .c_x86_vendor = X86_VENDOR_INTEL, }; @@ -1119,35 +1126,53 @@ void switch_to_sld(unsigned long tifn) sld_update_msr(!(tifn & _TIF_SLD)); } -#define SPLIT_LOCK_CPU(model) {X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY} - /* - * The following processors have the split lock detection feature. But - * since they don't have the IA32_CORE_CAPABILITIES MSR, the feature cannot - * be enumerated. Enable it by family and model matching on these - * processors. + * Bits in the IA32_CORE_CAPABILITIES are not architectural, so they should + * only be trusted if it is confirmed that a CPU model implements a + * specific feature at a particular bit position. + * + * The possible driver data field values: + * + * - 0: CPU models that are known to have the per-core split-lock detection + * feature even though they do not enumerate IA32_CORE_CAPABILITIES. + * + * - 1: CPU models which may enumerate IA32_CORE_CAPABILITIES and if so use + * bit 5 to enumerate the per-core split-lock detection feature. */ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { - SPLIT_LOCK_CPU(INTEL_FAM6_ICELAKE_X), - SPLIT_LOCK_CPU(INTEL_FAM6_ICELAKE_L), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, 1), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, 1), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, 1), {} }; void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) { - u64 ia32_core_caps = 0; + const struct x86_cpu_id *m; + u64 ia32_core_caps; - if (c->x86_vendor != X86_VENDOR_INTEL) + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return; - if (cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) { - /* Enumerate features reported in IA32_CORE_CAPABILITIES MSR. */ + + m = x86_match_cpu(split_lock_cpu_ids); + if (!m) + return; + + switch (m->driver_data) { + case 0: + break; + case 1: + if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) + return; rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); - } else if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) { - /* Enumerate split lock detection by family and model. */ - if (x86_match_cpu(split_lock_cpu_ids)) - ia32_core_caps |= MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT; + if (!(ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT)) + return; + break; + default: + return; } - if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT) - split_lock_setup(); + split_lock_setup(); } diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index d3482eb43ff3..ad6776081e60 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -39,13 +39,18 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) const struct x86_cpu_id *m; struct cpuinfo_x86 *c = &boot_cpu_data; - for (m = match; m->vendor | m->family | m->model | m->feature; m++) { + for (m = match; + m->vendor | m->family | m->model | m->steppings | m->feature; + m++) { if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor) continue; if (m->family != X86_FAMILY_ANY && c->x86 != m->family) continue; if (m->model != X86_MODEL_ANY && c->x86_model != m->model) continue; + if (m->steppings != X86_STEPPING_ANY && + !(BIT(c->x86_stepping) & m->steppings)) + continue; if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) continue; return m; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index ea3cf714b7ad..99be063fcb1b 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -921,14 +921,13 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) mce_log(&m); } -asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error) { - entering_irq(); trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); inc_irq_stat(irq_deferred_error_count); deferred_error_int_vector(); trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); - exiting_ack_irq(); + ack_APIC_irq(); } /* diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 02e1f165f148..30413325de22 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -42,6 +42,8 @@ #include <linux/export.h> #include <linux/jump_label.h> #include <linux/set_memory.h> +#include <linux/task_work.h> +#include <linux/hardirq.h> #include <asm/intel-family.h> #include <asm/processor.h> @@ -128,7 +130,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); /* Do initial initialization of a struct mce */ -void mce_setup(struct mce *m) +noinstr void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); @@ -138,12 +140,12 @@ void mce_setup(struct mce *m) m->cpuid = cpuid_eax(1); m->socketid = cpu_data(m->extcpu).phys_proc_id; m->apicid = cpu_data(m->extcpu).initial_apicid; - rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); + m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); if (this_cpu_has(X86_FEATURE_INTEL_PPIN)) - rdmsrl(MSR_PPIN, m->ppin); + m->ppin = __rdmsr(MSR_PPIN); else if (this_cpu_has(X86_FEATURE_AMD_PPIN)) - rdmsrl(MSR_AMD_PPIN, m->ppin); + m->ppin = __rdmsr(MSR_AMD_PPIN); m->microcode = boot_cpu_data.microcode; } @@ -1057,23 +1059,6 @@ static void mce_clear_state(unsigned long *toclear) } } -static int do_memory_failure(struct mce *m) -{ - int flags = MF_ACTION_REQUIRED; - int ret; - - pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr); - if (!(m->mcgstatus & MCG_STATUS_RIPV)) - flags |= MF_MUST_KILL; - ret = memory_failure(m->addr >> PAGE_SHIFT, flags); - if (ret) - pr_err("Memory error not recovered"); - else - set_mce_nospec(m->addr >> PAGE_SHIFT); - return ret; -} - - /* * Cases where we avoid rendezvous handler timeout: * 1) If this CPU is offline. @@ -1086,13 +1071,15 @@ static int do_memory_failure(struct mce *m) * kdump kernel establishing a new #MC handler where a broadcasted MCE * might not get handled properly. */ -static bool __mc_check_crashing_cpu(int cpu) +static noinstr bool mce_check_crashing_cpu(void) { + unsigned int cpu = smp_processor_id(); + if (cpu_is_offline(cpu) || (crashing_cpu != -1 && crashing_cpu != cpu)) { u64 mcgstatus; - mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS); if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) { if (mcgstatus & MCG_STATUS_LMCES) @@ -1100,7 +1087,7 @@ static bool __mc_check_crashing_cpu(int cpu) } if (mcgstatus & MCG_STATUS_RIPV) { - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + __wrmsr(MSR_IA32_MCG_STATUS, 0, 0); return true; } } @@ -1175,6 +1162,29 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, *m = *final; } +static void kill_me_now(struct callback_head *ch) +{ + force_sig(SIGBUS); +} + +static void kill_me_maybe(struct callback_head *cb) +{ + struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); + int flags = MF_ACTION_REQUIRED; + + pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); + if (!(p->mce_status & MCG_STATUS_RIPV)) + flags |= MF_MUST_KILL; + + if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) { + set_mce_nospec(p->mce_addr >> PAGE_SHIFT); + return; + } + + pr_err("Memory error not recovered"); + kill_me_now(cb); +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -1193,12 +1203,11 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, * backing the user stack, tracing that reads the user stack will cause * potentially infinite recursion. */ -void notrace do_machine_check(struct pt_regs *regs, long error_code) +void noinstr do_machine_check(struct pt_regs *regs) { DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); DECLARE_BITMAP(toclear, MAX_NR_BANKS); struct mca_config *cfg = &mca_cfg; - int cpu = smp_processor_id(); struct mce m, *final; char *msg = NULL; int worst = 0; @@ -1227,11 +1236,6 @@ void notrace do_machine_check(struct pt_regs *regs, long error_code) */ int lmce = 1; - if (__mc_check_crashing_cpu(cpu)) - return; - - ist_enter(regs); - this_cpu_inc(mce_exception_count); mce_gather_info(&m, regs); @@ -1319,17 +1323,19 @@ void notrace do_machine_check(struct pt_regs *regs, long error_code) sync_core(); if (worst != MCE_AR_SEVERITY && !kill_it) - goto out_ist; + return; /* Fault was in user mode and we need to take some action */ if ((m.cs & 3) == 3) { - ist_begin_non_atomic(regs); - local_irq_enable(); - - if (kill_it || do_memory_failure(&m)) - force_sig(SIGBUS); - local_irq_disable(); - ist_end_non_atomic(); + /* If this triggers there is no way to recover. Die hard. */ + BUG_ON(!on_thread_stack() || !user_mode(regs)); + + current->mce_addr = m.addr; + current->mce_status = m.mcgstatus; + current->mce_kill_me.func = kill_me_maybe; + if (kill_it) + current->mce_kill_me.func = kill_me_now; + task_work_add(current, ¤t->mce_kill_me, true); } else { /* * Handle an MCE which has happened in kernel space but from @@ -1341,16 +1347,12 @@ void notrace do_machine_check(struct pt_regs *regs, long error_code) * proper one. */ if (m.kflags & MCE_IN_KERNEL_RECOV) { - if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) + if (!fixup_exception(regs, X86_TRAP_MC, 0, 0)) mce_panic("Failed kernel mode recovery", &m, msg); } } - -out_ist: - ist_exit(regs); } EXPORT_SYMBOL_GPL(do_machine_check); -NOKPROBE_SYMBOL(do_machine_check); #ifndef CONFIG_MEMORY_FAILURE int memory_failure(unsigned long pfn, int flags) @@ -1876,21 +1878,84 @@ bool filter_mce(struct mce *m) } /* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) +static noinstr void unexpected_machine_check(struct pt_regs *regs) { + instrumentation_begin(); pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", smp_processor_id()); + instrumentation_end(); } /* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = - unexpected_machine_check; +void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check; + +static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) +{ + /* + * Only required when from kernel mode. See + * mce_check_crashing_cpu() for details. + */ + if (machine_check_vector == do_machine_check && + mce_check_crashing_cpu()) + return; + + nmi_enter(); + /* + * The call targets are marked noinstr, but objtool can't figure + * that out because it's an indirect call. Annotate it. + */ + instrumentation_begin(); + trace_hardirqs_off_finish(); + machine_check_vector(regs); + if (regs->flags & X86_EFLAGS_IF) + trace_hardirqs_on_prepare(); + instrumentation_end(); + nmi_exit(); +} + +static __always_inline void exc_machine_check_user(struct pt_regs *regs) +{ + idtentry_enter_user(regs); + instrumentation_begin(); + machine_check_vector(regs); + instrumentation_end(); + idtentry_exit_user(regs); +} -dotraplinkage notrace void do_mce(struct pt_regs *regs, long error_code) +#ifdef CONFIG_X86_64 +/* MCE hit kernel mode */ +DEFINE_IDTENTRY_MCE(exc_machine_check) { - machine_check_vector(regs, error_code); + unsigned long dr7; + + dr7 = local_db_save(); + exc_machine_check_kernel(regs); + local_db_restore(dr7); +} + +/* The user mode variant. */ +DEFINE_IDTENTRY_MCE_USER(exc_machine_check) +{ + unsigned long dr7; + + dr7 = local_db_save(); + exc_machine_check_user(regs); + local_db_restore(dr7); } -NOKPROBE_SYMBOL(do_mce); +#else +/* 32bit unified entry point */ +DEFINE_IDTENTRY_MCE(exc_machine_check) +{ + unsigned long dr7; + + dr7 = local_db_save(); + if (user_mode(regs)) + exc_machine_check_user(regs); + else + exc_machine_check_kernel(regs); + local_db_restore(dr7); +} +#endif /* * Called for each booted CPU to set up machine checks. diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 3413b41b8d55..0593b192eb8f 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -146,9 +146,9 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs) regs.cs = m->cs; pregs = ®s; } - /* in mcheck exeception handler, irq will be disabled */ + /* do_machine_check() expects interrupts disabled -- at least */ local_irq_save(flags); - do_machine_check(pregs, 0); + do_machine_check(pregs); local_irq_restore(flags); m->finished = 0; } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 55f5c7b755f2..6473070b5da4 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -9,7 +9,7 @@ #include <asm/mce.h> /* Pointer to the installed machine check handler for this CPU setup. */ -extern void (*machine_check_vector)(struct pt_regs *, long error_code); +extern void (*machine_check_vector)(struct pt_regs *); enum severity_level { MCE_NO_SEVERITY, diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c index 4ae6df556526..19e90cae8e97 100644 --- a/arch/x86/kernel/cpu/mce/p5.c +++ b/arch/x86/kernel/cpu/mce/p5.c @@ -7,6 +7,7 @@ #include <linux/kernel.h> #include <linux/types.h> #include <linux/smp.h> +#include <linux/hardirq.h> #include <asm/processor.h> #include <asm/traps.h> @@ -20,12 +21,11 @@ int mce_p5_enabled __read_mostly; /* Machine check handler for Pentium class Intel CPUs: */ -static void pentium_machine_check(struct pt_regs *regs, long error_code) +static noinstr void pentium_machine_check(struct pt_regs *regs) { u32 loaddr, hi, lotype; - ist_enter(regs); - + instrumentation_begin(); rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); @@ -38,8 +38,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) } add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - - ist_exit(regs); + instrumentation_end(); } /* Set up machine check reporting for processors with Intel style MCE: */ diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c index f36dc0742085..a7cd2d203ced 100644 --- a/arch/x86/kernel/cpu/mce/therm_throt.c +++ b/arch/x86/kernel/cpu/mce/therm_throt.c @@ -614,14 +614,13 @@ static void unexpected_thermal_interrupt(void) static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; -asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_thermal) { - entering_irq(); trace_thermal_apic_entry(THERMAL_APIC_VECTOR); inc_irq_stat(irq_thermal_count); smp_thermal_vector(); trace_thermal_apic_exit(THERMAL_APIC_VECTOR); - exiting_ack_irq(); + ack_APIC_irq(); } /* Thermal monitoring depends on APIC, ACPI and clock modulation */ diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c index 28812cc15300..6a059a035021 100644 --- a/arch/x86/kernel/cpu/mce/threshold.c +++ b/arch/x86/kernel/cpu/mce/threshold.c @@ -21,12 +21,11 @@ static void default_threshold_interrupt(void) void (*mce_threshold_vector)(void) = default_threshold_interrupt; -asmlinkage __visible void __irq_entry smp_threshold_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_threshold) { - entering_irq(); trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); inc_irq_stat(irq_threshold_count); mce_threshold_vector(); trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); - exiting_ack_irq(); + ack_APIC_irq(); } diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c index a30ea13cccc2..9c9f0abd2d7f 100644 --- a/arch/x86/kernel/cpu/mce/winchip.c +++ b/arch/x86/kernel/cpu/mce/winchip.c @@ -6,6 +6,7 @@ #include <linux/interrupt.h> #include <linux/kernel.h> #include <linux/types.h> +#include <linux/hardirq.h> #include <asm/processor.h> #include <asm/traps.h> @@ -16,14 +17,12 @@ #include "internal.h" /* Machine check handler for WinChip C6: */ -static void winchip_machine_check(struct pt_regs *regs, long error_code) +static noinstr void winchip_machine_check(struct pt_regs *regs) { - ist_enter(regs); - + instrumentation_begin(); pr_emerg("CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - - ist_exit(regs); + instrumentation_end(); } /* Set up machine check reporting on the Winchip C6 series */ diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 7019d4b2df0c..baec68b7e010 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -545,8 +545,7 @@ static int __wait_for_cpus(atomic_t *t, long long timeout) /* * Returns: * < 0 - on error - * 0 - no update done - * 1 - microcode was updated + * 0 - success (no update done or microcode was updated) */ static int __reload_late(void *info) { @@ -573,11 +572,11 @@ static int __reload_late(void *info) else goto wait_for_siblings; - if (err > UCODE_NFOUND) { - pr_warn("Error reloading microcode on CPU %d\n", cpu); + if (err >= UCODE_NFOUND) { + if (err == UCODE_ERROR) + pr_warn("Error reloading microcode on CPU %d\n", cpu); + ret = -1; - } else if (err == UCODE_UPDATED || err == UCODE_OK) { - ret = 1; } wait_for_siblings: @@ -608,7 +607,7 @@ static int microcode_reload_late(void) atomic_set(&late_cpus_out, 0); ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask); - if (ret > 0) + if (ret == 0) microcode_check(); pr_info("Reload completed, microcode revision: 0x%x\n", boot_cpu_data.microcode); @@ -649,7 +648,7 @@ static ssize_t reload_store(struct device *dev, put: put_online_cpus(); - if (ret >= 0) + if (ret == 0) ret = size; return ret; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index caa032ce3fe3..af94f05a5c66 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -23,6 +23,7 @@ #include <asm/hyperv-tlfs.h> #include <asm/mshyperv.h> #include <asm/desc.h> +#include <asm/idtentry.h> #include <asm/irq_regs.h> #include <asm/i8259.h> #include <asm/apic.h> @@ -40,11 +41,10 @@ static void (*hv_stimer0_handler)(void); static void (*hv_kexec_handler)(void); static void (*hv_crash_handler)(struct pt_regs *regs); -__visible void __irq_entry hyperv_vector_handler(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback) { struct pt_regs *old_regs = set_irq_regs(regs); - entering_irq(); inc_irq_stat(irq_hv_callback_count); if (vmbus_handler) vmbus_handler(); @@ -52,7 +52,6 @@ __visible void __irq_entry hyperv_vector_handler(struct pt_regs *regs) if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED) ack_APIC_irq(); - exiting_irq(); set_irq_regs(old_regs); } @@ -73,19 +72,16 @@ EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); * Routines to do per-architecture handling of stimer0 * interrupts when in Direct Mode */ - -__visible void __irq_entry hv_stimer0_vector_handler(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0) { struct pt_regs *old_regs = set_irq_regs(regs); - entering_irq(); inc_irq_stat(hyperv_stimer0_count); if (hv_stimer0_handler) hv_stimer0_handler(); add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0); ack_APIC_irq(); - exiting_irq(); set_irq_regs(old_regs); } @@ -227,8 +223,8 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); - pr_info("Hyper-V: features 0x%x, hints 0x%x\n", - ms_hyperv.features, ms_hyperv.hints); + pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n", + ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features); ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS); ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS); @@ -263,6 +259,16 @@ static void __init ms_hyperv_init_platform(void) cpuid_eax(HYPERV_CPUID_NESTED_FEATURES); } + /* + * Hyper-V expects to get crash register data or kmsg when + * crash enlightment is available and system crashes. Set + * crash_kexec_post_notifiers to be true to make sure that + * calling crash enlightment interface before running kdump + * kernel. + */ + if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) + crash_kexec_post_notifiers = true; + #ifdef CONFIG_X86_LOCAL_APIC if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { @@ -321,17 +327,19 @@ static void __init ms_hyperv_init_platform(void) x86_platform.apic_post_init = hyperv_init; hyperv_setup_mmu_ops(); /* Setup the IDT for hypervisor callback */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector); + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback); /* Setup the IDT for reenlightenment notifications */ - if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT) + if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT) { alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR, - hyperv_reenlightenment_vector); + asm_sysvec_hyperv_reenlightenment); + } /* Setup the IDT for stimer0 */ - if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) + if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) { alloc_intr_gate(HYPERV_STIMER0_VECTOR, - hv_stimer0_callback_vector); + asm_sysvec_hyperv_stimer0); + } # ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu; diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 51b9190c628b..23ad8e953dfb 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -761,7 +761,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - __flush_tlb(); + flush_tlb_local(); /* Save MTRR state */ rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); @@ -778,7 +778,7 @@ static void post_set(void) __releases(set_atomicity_lock) { /* Flush TLBs (no need to flush caches - they are disabled) */ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - __flush_tlb(); + flush_tlb_local(); /* Intel (P6) standard MTRRs */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 9556930cd8c1..a5ee607a3b89 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -63,6 +63,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) case 15: return msr - MSR_P4_BPU_PERFCTR0; } + fallthrough; + case X86_VENDOR_ZHAOXIN: + case X86_VENDOR_CENTAUR: + return msr - MSR_ARCH_PERFMON_PERFCTR0; } return 0; } @@ -92,6 +96,10 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) case 15: return msr - MSR_P4_BSU_ESCR0; } + fallthrough; + case X86_VENDOR_ZHAOXIN: + case X86_VENDOR_CENTAUR: + return msr - MSR_ARCH_PERFMON_EVENTSEL0; } return 0; diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 89049b343c7a..12f967c6b603 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -22,7 +22,7 @@ #include <linux/cpuhotplug.h> #include <asm/intel-family.h> -#include <asm/resctrl_sched.h> +#include <asm/resctrl.h> #include "internal.h" /* Mutex to protect rdtgroup access. */ @@ -578,6 +578,8 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) d->id = id; cpumask_set_cpu(cpu, &d->cpu_mask); + rdt_domain_reconfigure_cdp(r); + if (r->alloc_capable && domain_setup_ctrlval(r, d)) { kfree(d); return; @@ -956,6 +958,36 @@ static __init void rdt_init_res_defs(void) static enum cpuhp_state rdt_online; +/* Runs once on the BSP during boot. */ +void resctrl_cpu_detect(struct cpuinfo_x86 *c) +{ + if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + c->x86_cache_max_rmid = -1; + c->x86_cache_occ_scale = -1; + c->x86_cache_mbm_width_offset = -1; + return; + } + + /* will be overridden if occupancy monitoring exists */ + c->x86_cache_max_rmid = cpuid_ebx(0xf); + + if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || + cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + u32 eax, ebx, ecx, edx; + + /* QoS sub-leaf, EAX=0Fh, ECX=1 */ + cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx); + + c->x86_cache_max_rmid = ecx; + c->x86_cache_occ_scale = ebx; + if (c->x86_vendor == X86_VENDOR_INTEL) + c->x86_cache_mbm_width_offset = eax & 0xff; + else + c->x86_cache_mbm_width_offset = -1; + } +} + static int __init resctrl_late_init(void) { struct rdt_resource *r; diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 055c8613b531..934c8fb8a64a 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -495,14 +495,16 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, return ret; } -void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, - struct rdtgroup *rdtgrp, int evtid, int first) +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_domain *d, struct rdtgroup *rdtgrp, + int evtid, int first) { /* * setup the parameters to send to the IPI to read the data. */ rr->rgrp = rdtgrp; rr->evtid = evtid; + rr->r = r; rr->d = d; rr->val = 0; rr->first = first; @@ -539,7 +541,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) goto out; } - mon_event_read(&rr, d, rdtgrp, evtid, false); + mon_event_read(&rr, r, d, rdtgrp, evtid, false); if (rr.val & RMID_VAL_ERROR) seq_puts(m, "Error\n"); diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 181c992f448c..f20a47d120b1 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -31,7 +31,7 @@ #define CQM_LIMBOCHECK_INTERVAL 1000 -#define MBM_CNTR_WIDTH 24 +#define MBM_CNTR_WIDTH_BASE 24 #define MBM_OVERFLOW_INTERVAL 1000 #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 @@ -40,6 +40,12 @@ #define RMID_VAL_ERROR BIT_ULL(63) #define RMID_VAL_UNAVAIL BIT_ULL(62) +/* + * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for + * data to be returned. The counter width is discovered from the hardware + * as an offset from MBM_CNTR_WIDTH_BASE. + */ +#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE) struct rdt_fs_context { @@ -87,6 +93,7 @@ union mon_data_bits { struct rmid_read { struct rdtgroup *rgrp; + struct rdt_resource *r; struct rdt_domain *d; int evtid; bool first; @@ -460,6 +467,7 @@ struct rdt_resource { struct list_head evt_list; int num_rmid; unsigned int mon_scale; + unsigned int mbm_width; unsigned long fflags; }; @@ -587,8 +595,9 @@ void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id); void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, struct rdt_domain *d); -void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, - struct rdtgroup *rdtgrp, int evtid, int first); +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_domain *d, struct rdtgroup *rdtgrp, + int evtid, int first); void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms); void mbm_handle_overflow(struct work_struct *work); @@ -601,5 +610,6 @@ bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); void __check_limbo(struct rdt_domain *d, bool force_free); bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r); bool cbm_validate_amd(char *buf, u32 *data, struct rdt_resource *r); +void rdt_domain_reconfigure_cdp(struct rdt_resource *r); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 773124b0e18a..837d7d012b7b 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -214,9 +214,9 @@ void free_rmid(u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } -static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr) +static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) { - u64 shift = 64 - MBM_CNTR_WIDTH, chunks; + u64 shift = 64 - width, chunks; chunks = (cur_msr << shift) - (prev_msr << shift); return chunks >>= shift; @@ -256,7 +256,7 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) return 0; } - chunks = mbm_overflow_count(m->prev_msr, tval); + chunks = mbm_overflow_count(m->prev_msr, tval, rr->r->mbm_width); m->chunks += chunks; m->prev_msr = tval; @@ -278,7 +278,7 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) return; - chunks = mbm_overflow_count(m->prev_bw_msr, tval); + chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width); m->chunks_bw += chunks; m->chunks = m->chunks_bw; cur_bw = (chunks * r->mon_scale) >> 20; @@ -433,11 +433,12 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) } } -static void mbm_update(struct rdt_domain *d, int rmid) +static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) { struct rmid_read rr; rr.first = false; + rr.r = r; rr.d = d; /* @@ -510,6 +511,7 @@ void mbm_handle_overflow(struct work_struct *work) struct rdtgroup *prgrp, *crgrp; int cpu = smp_processor_id(); struct list_head *head; + struct rdt_resource *r; struct rdt_domain *d; mutex_lock(&rdtgroup_mutex); @@ -517,16 +519,18 @@ void mbm_handle_overflow(struct work_struct *work) if (!static_branch_likely(&rdt_mon_enable_key)) goto out_unlock; - d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]); + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + d = get_domain_from_cpu(cpu, r); if (!d) goto out_unlock; list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(d, prgrp->mon.rmid); + mbm_update(r, d, prgrp->mon.rmid); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(d, crgrp->mon.rmid); + mbm_update(r, d, crgrp->mon.rmid); if (is_mba_sc(NULL)) update_mba_bw(prgrp, d); @@ -614,11 +618,18 @@ static void l3_mon_evt_init(struct rdt_resource *r) int rdt_get_mon_l3_config(struct rdt_resource *r) { + unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; unsigned int cl_size = boot_cpu_data.x86_cache_size; int ret; r->mon_scale = boot_cpu_data.x86_cache_occ_scale; r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; + r->mbm_width = MBM_CNTR_WIDTH_BASE; + + if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) + r->mbm_width += mbm_offset; + else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX) + pr_warn("Ignoring impossible MBM counter offset\n"); /* * A reasonable upper limit on the max threshold is the number diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index d7623e1b927d..0daf2f1cf7a8 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -24,7 +24,7 @@ #include <asm/cacheflush.h> #include <asm/intel-family.h> -#include <asm/resctrl_sched.h> +#include <asm/resctrl.h> #include <asm/perf_event.h> #include "../../events/perf_event.h" /* For X86_CONFIG() */ @@ -1326,9 +1326,9 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) * pseudo-locked region will still be here on return. * * The mutex has to be released temporarily to avoid a potential - * deadlock with the mm->mmap_sem semaphore which is obtained in - * the device_create() and debugfs_create_dir() callpath below - * as well as before the mmap() callback is called. + * deadlock with the mm->mmap_lock which is obtained in the + * device_create() and debugfs_create_dir() callpath below as well as + * before the mmap() callback is called. */ mutex_unlock(&rdtgroup_mutex); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 064e9ef44cd6..23b4b61319d3 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -29,7 +29,7 @@ #include <uapi/linux/magic.h> -#include <asm/resctrl_sched.h> +#include <asm/resctrl.h> #include "internal.h" DEFINE_STATIC_KEY_FALSE(rdt_enable_key); @@ -1859,6 +1859,19 @@ static int set_cache_qos_cfg(int level, bool enable) return 0; } +/* Restore the qos cfg state when a domain comes online */ +void rdt_domain_reconfigure_cdp(struct rdt_resource *r) +{ + if (!r->alloc_capable) + return; + + if (r == &rdt_resources_all[RDT_RESOURCE_L2DATA]) + l2_qos_cfg_update(&r->alloc_enabled); + + if (r == &rdt_resources_all[RDT_RESOURCE_L3DATA]) + l3_qos_cfg_update(&r->alloc_enabled); +} + /* * Enable or disable the MBA software controller * which helps user specify bandwidth in MBps. @@ -2459,7 +2472,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, goto out_destroy; if (is_mbm_event(mevt->evtid)) - mon_event_read(&rr, d, prgrp, mevt->evtid, true); + mon_event_read(&rr, r, d, prgrp, mevt->evtid, true); } kernfs_activate(kn); return 0; @@ -3072,7 +3085,8 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) * If the rdtgroup is a mon group and parent directory * is a valid "mon_groups" directory, remove the mon group. */ - if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) { + if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn && + rdtgrp != &rdtgroup_default) { if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { ret = rdtgroup_ctrl_remove(kn, rdtgrp); @@ -3185,10 +3199,10 @@ int __init rdtgroup_init(void) * during the debugfs directory creation also &sb->s_type->i_mutex_key * (the lockdep class of inode->i_rwsem). Other filesystem * interactions (eg. SyS_getdents) have the lock ordering: - * &sb->s_type->i_mutex_key --> &mm->mmap_sem - * During mmap(), called with &mm->mmap_sem, the rdtgroup_mutex + * &sb->s_type->i_mutex_key --> &mm->mmap_lock + * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex * is taken, thus creating dependency: - * &mm->mmap_sem --> rdtgroup_mutex for the latter that can cause + * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause * issues considering the other two lock dependencies. * By creating the debugfs directory here we avoid a dependency * that may cause deadlock (even though file operations cannot |