path: root/arch/x86/kernel
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/acpi/cstate.c | 3
-rw-r--r--  arch/x86/kernel/alternative.c | 70
-rw-r--r--  arch/x86/kernel/amd_nb.c | 3
-rw-r--r--  arch/x86/kernel/apic/apic.c | 1
-rw-r--r--  arch/x86/kernel/apic/vector.c | 20
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 4
-rw-r--r--  arch/x86/kernel/cpu/common.c | 40
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 2
-rw-r--r--  arch/x86/kernel/cpu/hygon.c | 4
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 46
-rw-r--r--  arch/x86/kernel/cpu/mce/amd.c | 55
-rw-r--r--  arch/x86/kernel/cpu/mce/apei.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mshyperv.c | 12
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 4
-rw-r--r--  arch/x86/kernel/cpu/resctrl/internal.h | 21
-rw-r--r--  arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 10
-rw-r--r--  arch/x86/kernel/cpu/sgx/encl.c | 2
-rw-r--r--  arch/x86/kernel/cpu/sgx/virt.c | 1
-rw-r--r--  arch/x86/kernel/cpu/tsx.c | 37
-rw-r--r--  arch/x86/kernel/crash.c | 13
-rw-r--r--  arch/x86/kernel/fpu/signal.c | 80
-rw-r--r--  arch/x86/kernel/fpu/xstate.c | 98
-rw-r--r--  arch/x86/kernel/head_64.S | 6
-rw-r--r--  arch/x86/kernel/idt.c | 40
-rw-r--r--  arch/x86/kernel/jump_label.c | 81
-rw-r--r--  arch/x86/kernel/kprobes/core.c | 20
-rw-r--r--  arch/x86/kernel/process.c | 4
-rw-r--r--  arch/x86/kernel/ptrace.c | 2
-rw-r--r--  arch/x86/kernel/setup.c | 69
-rw-r--r--  arch/x86/kernel/sev-shared.c | 1
-rw-r--r--  arch/x86/kernel/sev.c | 335
-rw-r--r--  arch/x86/kernel/signal.c | 4
-rw-r--r--  arch/x86/kernel/signal_compat.c | 9
-rw-r--r--  arch/x86/kernel/smpboot.c | 4
-rw-r--r--  arch/x86/kernel/traps.c | 9
-rw-r--r--  arch/x86/kernel/tsc.c | 4
-rw-r--r--  arch/x86/kernel/umip.c | 10
37 files changed, 659 insertions, 468 deletions
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 49ae4e1ac9cd..7de599eba7f0 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -197,7 +197,8 @@ static int __init ffh_cstate_init(void)
struct cpuinfo_x86 *c = &boot_cpu_data;
if (c->x86_vendor != X86_VENDOR_INTEL &&
- c->x86_vendor != X86_VENDOR_AMD)
+ c->x86_vendor != X86_VENDOR_AMD &&
+ c->x86_vendor != X86_VENDOR_HYGON)
return -1;
cpu_cstate_entry = alloc_percpu(struct cstate_entry);
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 6974b5174495..e9da3dc71254 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -75,7 +75,7 @@ do { \
} \
} while (0)
-const unsigned char x86nops[] =
+static const unsigned char x86nops[] =
{
BYTES_NOP1,
BYTES_NOP2,
@@ -183,41 +183,69 @@ done:
}
/*
+ * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
+ *
+ * @instr: instruction byte stream
+ * @instrlen: length of the above
+ * @off: offset within @instr where the first NOP has been detected
+ *
+ * Return: number of NOPs found (and replaced).
+ */
+static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
+{
+ unsigned long flags;
+ int i = off, nnops;
+
+ while (i < instrlen) {
+ if (instr[i] != 0x90)
+ break;
+
+ i++;
+ }
+
+ nnops = i - off;
+
+ if (nnops <= 1)
+ return nnops;
+
+ local_irq_save(flags);
+ add_nops(instr + off, nnops);
+ local_irq_restore(flags);
+
+ DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);
+
+ return nnops;
+}
+
+/*
* "noinline" to cause control flow change and thus invalidate I$ and
* cause refetch after modification.
*/
static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
{
- unsigned long flags;
struct insn insn;
- int nop, i = 0;
+ int i = 0;
/*
- * Jump over the non-NOP insns, the remaining bytes must be single-byte
- * NOPs, optimize them.
+ * Jump over the non-NOP insns and optimize single-byte NOPs into bigger
+ * ones.
*/
for (;;) {
if (insn_decode_kernel(&insn, &instr[i]))
return;
+ /*
+ * See if this and any potentially following NOPs can be
+ * optimized.
+ */
if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
- break;
-
- if ((i += insn.length) >= a->instrlen)
- return;
- }
+ i += optimize_nops_range(instr, a->instrlen, i);
+ else
+ i += insn.length;
- for (nop = i; i < a->instrlen; i++) {
- if (WARN_ONCE(instr[i] != 0x90, "Not a NOP at 0x%px\n", &instr[i]))
+ if (i >= a->instrlen)
return;
}
-
- local_irq_save(flags);
- add_nops(instr + nop, i - nop);
- local_irq_restore(flags);
-
- DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
- instr, nop, a->instrlen);
}
/*
@@ -273,8 +301,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
instr, instr, a->instrlen,
replacement, a->replacementlen);
- DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
- DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
+ DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
+ DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
memcpy(insn_buff, replacement, a->replacementlen);
insn_buff_sz = a->replacementlen;
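
For illustration only (not part of the patch): the new optimize_nops_range() above is, at its core, a run-length scan over single-byte NOPs (opcode 0x90), after which the run is rewritten with longer NOP encodings by add_nops() using the x86nops[] table. A minimal sketch of just the scan, with a hypothetical helper name:

static int count_nop_run(const u8 *instr, u8 instrlen, int off)
{
        int i = off;

        /* Walk forward while the bytes are single-byte NOPs (0x90). */
        while (i < instrlen && instr[i] == 0x90)
                i++;

        /* Runs of length 0 or 1 are not worth rewriting. */
        return i - off;
}
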
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 09083094eb57..23dda362dc0f 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -25,6 +25,7 @@
#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c
#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444
#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654
+#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e
/* Protect the PCI config register pairs used for SMN and DF indirect access. */
static DEFINE_MUTEX(smn_mutex);
@@ -57,6 +58,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) },
{}
};
@@ -72,6 +74,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
{}
};
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 4a39fb429f15..d262811ce14b 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2604,6 +2604,7 @@ static void __init apic_bsp_setup(bool upmode)
end_local_APIC_setup();
irq_remap_enable_fault_handling();
setup_IO_APIC();
+ lapic_update_legacy_vectors();
}
#ifdef CONFIG_UP_LATE_INIT
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 6dbdc7c22bb7..fb67ed5e7e6a 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -738,6 +738,26 @@ void lapic_assign_legacy_vector(unsigned int irq, bool replace)
irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace);
}
+void __init lapic_update_legacy_vectors(void)
+{
+ unsigned int i;
+
+ if (IS_ENABLED(CONFIG_X86_IO_APIC) && nr_ioapics > 0)
+ return;
+
+ /*
+ * If the IO/APIC is disabled via config, kernel command line or
+ * lack of enumeration then all legacy interrupts are routed
+ * through the PIC. Make sure that they are marked as legacy
+ * vectors. PIC_CASCADE_IRQ has already been marked in
+ * lapic_assign_system_vectors().
+ */
+ for (i = 0; i < nr_legacy_irqs(); i++) {
+ if (i != PIC_CASCADE_IR)
+ lapic_assign_legacy_vector(i, true);
+ }
+}
+
void __init lapic_assign_system_vectors(void)
{
unsigned int i, vector = 0;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index c06ac56eae4d..b7c003013d41 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -646,6 +646,10 @@ static void early_init_amd(struct cpuinfo_x86 *c)
if (c->x86_power & BIT(12))
set_cpu_cap(c, X86_FEATURE_ACC_POWER);
+ /* Bit 14 indicates the Runtime Average Power Limit interface. */
+ if (c->x86_power & BIT(14))
+ set_cpu_cap(c, X86_FEATURE_RAPL);
+
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSCALL32);
#else
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a1b756c49a93..a99d00393206 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1773,10 +1773,16 @@ void syscall_init(void)
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
#endif
- /* Flags to clear on syscall */
+ /*
+ * Flags to clear on syscall; clear as much as possible
+ * to minimize user space-kernel interference.
+ */
wrmsrl(MSR_SYSCALL_MASK,
- X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
- X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
+ X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF|
+ X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF|
+ X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF|
+ X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF|
+ X86_EFLAGS_AC|X86_EFLAGS_ID);
}
#else /* CONFIG_X86_64 */
@@ -1938,13 +1944,12 @@ void cpu_init_exception_handling(void)
/*
* cpu_init() initializes state that is per-CPU. Some data is already
- * initialized (naturally) in the bootstrap process, such as the GDT
- * and IDT. We reload them nevertheless, this function acts as a
- * 'CPU state barrier', nothing should get across.
+ * initialized (naturally) in the bootstrap process, such as the GDT. We
+ * reload it nevertheless, this function acts as a 'CPU state barrier',
+ * nothing should get across.
*/
void cpu_init(void)
{
- struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
struct task_struct *cur = current;
int cpu = raw_smp_processor_id();
@@ -1957,8 +1962,6 @@ void cpu_init(void)
early_cpu_to_node(cpu) != NUMA_NO_NODE)
set_numa_node(early_cpu_to_node(cpu));
#endif
- setup_getcpu(cpu);
-
pr_debug("Initializing CPU#%d\n", cpu);
if (IS_ENABLED(CONFIG_X86_64) || cpu_feature_enabled(X86_FEATURE_VME) ||
@@ -1970,7 +1973,6 @@ void cpu_init(void)
* and set up the GDT descriptor:
*/
switch_to_new_gdt(cpu);
- load_current_idt();
if (IS_ENABLED(CONFIG_X86_64)) {
loadsegment(fs, 0);
@@ -1990,12 +1992,6 @@ void cpu_init(void)
initialize_tlbstate_and_flush();
enter_lazy_tlb(&init_mm, cur);
- /* Initialize the TSS. */
- tss_setup_ist(tss);
- tss_setup_io_bitmap(tss);
- set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
-
- load_TR_desc();
/*
* sp0 points to the entry trampoline stack regardless of what task
* is running.
@@ -2017,6 +2013,18 @@ void cpu_init(void)
load_fixmap_gdt(cpu);
}
+#ifdef CONFIG_SMP
+void cpu_init_secondary(void)
+{
+ /*
+ * Relies on the BP having set-up the IDT tables, which are loaded
+ * on this CPU in cpu_init_exception_handling().
+ */
+ cpu_init_exception_handling();
+ cpu_init();
+}
+#endif
+
/*
* The microcode loader calls this upon late microcode load to recheck features,
* only when microcode has been updated. Caller holds microcode_mutex and CPU
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 67944128876d..95521302630d 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -48,6 +48,7 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
enum tsx_ctrl_states {
TSX_CTRL_ENABLE,
TSX_CTRL_DISABLE,
+ TSX_CTRL_RTM_ALWAYS_ABORT,
TSX_CTRL_NOT_SUPPORTED,
};
@@ -56,6 +57,7 @@ extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state;
extern void __init tsx_init(void);
extern void tsx_enable(void);
extern void tsx_disable(void);
+extern void tsx_clear_cpuid(void);
#else
static inline void tsx_init(void) { }
#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 0bd6c74e3ba1..6d50136f7ab9 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -260,6 +260,10 @@ static void early_init_hygon(struct cpuinfo_x86 *c)
if (c->x86_power & BIT(12))
set_cpu_cap(c, X86_FEATURE_ACC_POWER);
+ /* Bit 14 indicates the Runtime Average Power Limit interface. */
+ if (c->x86_power & BIT(14))
+ set_cpu_cap(c, X86_FEATURE_RAPL);
+
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSCALL32);
#endif
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 8adffc17fa8b..8321c43554a1 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -10,6 +10,7 @@
#include <linux/thread_info.h>
#include <linux/init.h>
#include <linux/uaccess.h>
+#include <linux/delay.h>
#include <asm/cpufeature.h>
#include <asm/msr.h>
@@ -41,6 +42,7 @@ enum split_lock_detect_state {
sld_off = 0,
sld_warn,
sld_fatal,
+ sld_ratelimit,
};
/*
@@ -717,8 +719,10 @@ static void init_intel(struct cpuinfo_x86 *c)
if (tsx_ctrl_state == TSX_CTRL_ENABLE)
tsx_enable();
- if (tsx_ctrl_state == TSX_CTRL_DISABLE)
+ else if (tsx_ctrl_state == TSX_CTRL_DISABLE)
tsx_disable();
+ else if (tsx_ctrl_state == TSX_CTRL_RTM_ALWAYS_ABORT)
+ tsx_clear_cpuid();
split_lock_init();
bus_lock_init();
@@ -997,13 +1001,30 @@ static const struct {
{ "off", sld_off },
{ "warn", sld_warn },
{ "fatal", sld_fatal },
+ { "ratelimit:", sld_ratelimit },
};
+static struct ratelimit_state bld_ratelimit;
+
static inline bool match_option(const char *arg, int arglen, const char *opt)
{
- int len = strlen(opt);
+ int len = strlen(opt), ratelimit;
+
+ if (strncmp(arg, opt, len))
+ return false;
+
+ /*
+ * Min ratelimit is 1 bus lock/sec.
+ * Max ratelimit is 1000 bus locks/sec.
+ */
+ if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 &&
+ ratelimit > 0 && ratelimit <= 1000) {
+ ratelimit_state_init(&bld_ratelimit, HZ, ratelimit);
+ ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE);
+ return true;
+ }
- return len == arglen && !strncmp(arg, opt, len);
+ return len == arglen;
}
static bool split_lock_verify_msr(bool on)
@@ -1082,6 +1103,15 @@ static void sld_update_msr(bool on)
static void split_lock_init(void)
{
+ /*
+ * #DB for bus lock handles ratelimit and #AC for split lock is
+ * disabled.
+ */
+ if (sld_state == sld_ratelimit) {
+ split_lock_verify_msr(false);
+ return;
+ }
+
if (cpu_model_supports_sld)
split_lock_verify_msr(sld_state != sld_off);
}
@@ -1154,6 +1184,12 @@ void handle_bus_lock(struct pt_regs *regs)
switch (sld_state) {
case sld_off:
break;
+ case sld_ratelimit:
+ /* Enforce no more than bld_ratelimit bus locks/sec. */
+ while (!__ratelimit(&bld_ratelimit))
+ msleep(20);
+ /* Warn on the bus lock. */
+ fallthrough;
case sld_warn:
pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n",
current->comm, current->pid, regs->ip);
@@ -1259,6 +1295,10 @@ static void sld_state_show(void)
" from non-WB" : "");
}
break;
+ case sld_ratelimit:
+ if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst);
+ break;
}
}
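
As an aside (illustrative sketch, not from the patch): the "ratelimit:N" handling added to match_option() above accepts 1 to 1000 bus locks per second and feeds the value into a struct ratelimit_state. The parsing and bounds check in isolation, assuming kernel context, with a hypothetical helper name:

static int parse_bld_ratelimit(const char *arg)
{
        int ratelimit;

        /* Accept "ratelimit:<N>" with 1 <= N <= 1000 bus locks/sec. */
        if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 &&
            ratelimit > 0 && ratelimit <= 1000)
                return ratelimit;

        return -EINVAL;         /* not a valid ratelimit option */
}
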
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index e486f96b3cb3..08831acc1d03 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -77,27 +77,29 @@ struct smca_bank_name {
};
static struct smca_bank_name smca_names[] = {
- [SMCA_LS] = { "load_store", "Load Store Unit" },
- [SMCA_LS_V2] = { "load_store", "Load Store Unit" },
- [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
- [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
- [SMCA_DE] = { "decode_unit", "Decode Unit" },
- [SMCA_RESERVED] = { "reserved", "Reserved" },
- [SMCA_EX] = { "execution_unit", "Execution Unit" },
- [SMCA_FP] = { "floating_point", "Floating Point Unit" },
- [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" },
- [SMCA_CS] = { "coherent_slave", "Coherent Slave" },
- [SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" },
- [SMCA_PIE] = { "pie", "Power, Interrupts, etc." },
- [SMCA_UMC] = { "umc", "Unified Memory Controller" },
- [SMCA_PB] = { "param_block", "Parameter Block" },
- [SMCA_PSP] = { "psp", "Platform Security Processor" },
- [SMCA_PSP_V2] = { "psp", "Platform Security Processor" },
- [SMCA_SMU] = { "smu", "System Management Unit" },
- [SMCA_SMU_V2] = { "smu", "System Management Unit" },
- [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" },
- [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" },
- [SMCA_PCIE] = { "pcie", "PCI Express Unit" },
+ [SMCA_LS ... SMCA_LS_V2] = { "load_store", "Load Store Unit" },
+ [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
+ [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
+ [SMCA_DE] = { "decode_unit", "Decode Unit" },
+ [SMCA_RESERVED] = { "reserved", "Reserved" },
+ [SMCA_EX] = { "execution_unit", "Execution Unit" },
+ [SMCA_FP] = { "floating_point", "Floating Point Unit" },
+ [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" },
+ [SMCA_CS ... SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" },
+ [SMCA_PIE] = { "pie", "Power, Interrupts, etc." },
+
+ /* UMC v2 is separate because both of them can exist in a single system. */
+ [SMCA_UMC] = { "umc", "Unified Memory Controller" },
+ [SMCA_UMC_V2] = { "umc_v2", "Unified Memory Controller v2" },
+ [SMCA_PB] = { "param_block", "Parameter Block" },
+ [SMCA_PSP ... SMCA_PSP_V2] = { "psp", "Platform Security Processor" },
+ [SMCA_SMU ... SMCA_SMU_V2] = { "smu", "System Management Unit" },
+ [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" },
+ [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" },
+ [SMCA_PCIE ... SMCA_PCIE_V2] = { "pcie", "PCI Express Unit" },
+ [SMCA_XGMI_PCS] = { "xgmi_pcs", "Ext Global Memory Interconnect PCS Unit" },
+ [SMCA_XGMI_PHY] = { "xgmi_phy", "Ext Global Memory Interconnect PHY Unit" },
+ [SMCA_WAFL_PHY] = { "wafl_phy", "WAFL PHY Unit" },
};
static const char *smca_get_name(enum smca_bank_types t)
@@ -155,6 +157,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Unified Memory Controller MCA type */
{ SMCA_UMC, HWID_MCATYPE(0x96, 0x0) },
+ { SMCA_UMC_V2, HWID_MCATYPE(0x96, 0x1) },
/* Parameter Block MCA type */
{ SMCA_PB, HWID_MCATYPE(0x05, 0x0) },
@@ -175,6 +178,16 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* PCI Express Unit MCA type */
{ SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) },
+ { SMCA_PCIE_V2, HWID_MCATYPE(0x46, 0x1) },
+
+ /* xGMI PCS MCA type */
+ { SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0) },
+
+ /* xGMI PHY MCA type */
+ { SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) },
+
+ /* WAFL PHY MCA type */
+ { SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) },
};
struct smca_bank smca_banks[MAX_NR_BANKS];
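
As an aside: the [SMCA_LS ... SMCA_LS_V2] style used in the rewritten smca_names[] table is GCC's ranged designated initializer, where every index in the inclusive range gets the same initializer. A tiny illustration with made-up names (not part of the patch):

static const char *const bank_short_names[] = {
        [0 ... 1] = "load_store",       /* indices 0 and 1 share one entry */
        [2]       = "insn_fetch",
        [3 ... 4] = "psp",              /* v1 and v2 map to the same name */
};
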
diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
index b58b85380ddb..0e3ae64d3b76 100644
--- a/arch/x86/kernel/cpu/mce/apei.c
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -36,7 +36,8 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
mce_setup(&m);
m.bank = -1;
/* Fake a memory read error with unknown channel */
- m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
+ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f;
+ m.misc = (MCI_MISC_ADDR_PHYS << 6) | PAGE_SHIFT;
if (severity >= GHES_SEV_RECOVERABLE)
m.status |= MCI_STATUS_UC;
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 22f13343b5da..01ca94f42e4e 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -236,7 +236,7 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
for_each_present_cpu(i) {
if (i == 0)
continue;
- ret = hv_call_add_logical_proc(numa_cpu_node(i), i, cpu_physical_id(i));
+ ret = hv_call_add_logical_proc(numa_cpu_node(i), i, i);
BUG_ON(ret);
}
@@ -252,6 +252,7 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
static void __init ms_hyperv_init_platform(void)
{
+ int hv_max_functions_eax;
int hv_host_info_eax;
int hv_host_info_ebx;
int hv_host_info_ecx;
@@ -269,6 +270,8 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
+ hv_max_functions_eax = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS);
+
pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n",
ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints,
ms_hyperv.misc_features);
@@ -298,8 +301,7 @@ static void __init ms_hyperv_init_platform(void)
/*
* Extract host information.
*/
- if (cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS) >=
- HYPERV_CPUID_VERSION) {
+ if (hv_max_functions_eax >= HYPERV_CPUID_VERSION) {
hv_host_info_eax = cpuid_eax(HYPERV_CPUID_VERSION);
hv_host_info_ebx = cpuid_ebx(HYPERV_CPUID_VERSION);
hv_host_info_ecx = cpuid_ecx(HYPERV_CPUID_VERSION);
@@ -325,9 +327,11 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
}
- if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) {
+ if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) {
ms_hyperv.nested_features =
cpuid_eax(HYPERV_CPUID_NESTED_FEATURES);
+ pr_info("Hyper-V: Nested features: 0x%x\n",
+ ms_hyperv.nested_features);
}
/*
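
For illustration (hypothetical helper, not from the patch): the mshyperv.c hunks read HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS once and gate later CPUID queries on it, rather than keying the nested-features read off a hint bit. The guard pattern in isolation, assuming the kernel's cpuid_eax() accessor:

static u32 hv_leaf_eax_if_present(u32 leaf, u32 max_leaf)
{
        /* Only query leaves the hypervisor actually advertises. */
        if (leaf > max_leaf)
                return 0;

        return cpuid_eax(leaf);
}
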
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 3ef5868ac588..7aecb2fc3186 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -63,7 +63,7 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
case 15:
return msr - MSR_P4_BPU_PERFCTR0;
}
- fallthrough;
+ break;
case X86_VENDOR_ZHAOXIN:
case X86_VENDOR_CENTAUR:
return msr - MSR_ARCH_PERFMON_PERFCTR0;
@@ -96,7 +96,7 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
case 15:
return msr - MSR_P4_BSU_ESCR0;
}
- fallthrough;
+ break;
case X86_VENDOR_ZHAOXIN:
case X86_VENDOR_CENTAUR:
return msr - MSR_ARCH_PERFMON_EVENTSEL0;
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index c4d320d02fd5..6a5f60a37219 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -70,6 +70,7 @@ DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
* struct mon_evt - Entry in the event list of a resource
* @evtid: event id
* @name: name of the event
+ * @list: entry in &rdt_resource->evt_list
*/
struct mon_evt {
u32 evtid;
@@ -78,10 +79,13 @@ struct mon_evt {
};
/**
- * struct mon_data_bits - Monitoring details for each event file
- * @rid: Resource id associated with the event file.
+ * union mon_data_bits - Monitoring details for each event file
+ * @priv: Used to store monitoring event data in @u
+ * as kernfs private data
+ * @rid: Resource id associated with the event file
* @evtid: Event id associated with the event file
* @domid: The domain to which the event file belongs
+ * @u: Name of the bit fields struct
*/
union mon_data_bits {
void *priv;
@@ -119,6 +123,7 @@ enum rdt_group_type {
* @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking
* @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations
* allowed AND the allocations are Cache Pseudo-Locked
+ * @RDT_NUM_MODES: Total number of modes
*
* The mode of a resource group enables control over the allowed overlap
* between allocations associated with different resource groups (classes
@@ -142,7 +147,7 @@ enum rdtgrp_mode {
/**
* struct mongroup - store mon group's data in resctrl fs.
- * @mon_data_kn kernlfs node for the mon_data directory
+ * @mon_data_kn: kernfs node for the mon_data directory
* @parent: parent rdtgrp
* @crdtgrp_list: child rdtgroup node list
* @rmid: rmid for this rdtgroup
@@ -282,11 +287,11 @@ struct rftype {
/**
* struct mbm_state - status for each MBM counter in each domain
* @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes)
- * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it
+ * @prev_msr: Value of IA32_QM_CTR for this RMID last time we read it
* @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting
- * @prev_bw The most recent bandwidth in MBps
- * @delta_bw Difference between the current and previous bandwidth
- * @delta_comp Indicates whether to compute the delta_bw
+ * @prev_bw: The most recent bandwidth in MBps
+ * @delta_bw: Difference between the current and previous bandwidth
+ * @delta_comp: Indicates whether to compute the delta_bw
*/
struct mbm_state {
u64 chunks;
@@ -456,11 +461,13 @@ struct rdt_parse_data {
* @data_width: Character width of data when displaying
* @domains: All domains for this resource
* @cache: Cache allocation related data
+ * @membw: If the component has bandwidth controls, their properties.
* @format_str: Per resource format string to show domain value
* @parse_ctrlval: Per resource function pointer to parse control values
* @evt_list: List of monitoring events
* @num_rmid: Number of RMIDs available
* @mon_scale: cqm counter * mon_scale = occupancy in bytes
+ * @mbm_width: Monitor width, to detect and correct for overflow.
* @fflags: flags to choose base and info files
*/
struct rdt_resource {
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 05a89e33fde2..2207916cae65 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -49,6 +49,7 @@ static struct class *pseudo_lock_class;
/**
* get_prefetch_disable_bits - prefetch disable bits of supported platforms
+ * @void: It takes no parameters.
*
* Capture the list of platforms that have been validated to support
* pseudo-locking. This includes testing to ensure pseudo-locked regions
@@ -162,7 +163,7 @@ static struct rdtgroup *region_find_by_minor(unsigned int minor)
}
/**
- * pseudo_lock_pm_req - A power management QoS request list entry
+ * struct pseudo_lock_pm_req - A power management QoS request list entry
* @list: Entry within the @pm_reqs list for a pseudo-locked region
* @req: PM QoS request
*/
@@ -184,6 +185,7 @@ static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr)
/**
* pseudo_lock_cstates_constrain - Restrict cores from entering C6
+ * @plr: Pseudo-locked region
*
* To prevent the cache from being affected by power management entering
* C6 has to be avoided. This is accomplished by requesting a latency
@@ -196,6 +198,8 @@ static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr)
* the ACPI latencies need to be considered while keeping in mind that C2
* may be set to map to deeper sleep states. In this case the latency
* requirement needs to prevent entering C2 also.
+ *
+ * Return: 0 on success, <0 on failure
*/
static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr)
{
@@ -520,7 +524,7 @@ static int pseudo_lock_fn(void *_rdtgrp)
/**
* rdtgroup_monitor_in_progress - Test if monitoring in progress
- * @r: resource group being queried
+ * @rdtgrp: resource group being queried
*
* Return: 1 if monitor groups have been created for this resource
* group, 0 otherwise.
@@ -1140,6 +1144,8 @@ out:
/**
* pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region
+ * @rdtgrp: Resource group to which the pseudo-locked region belongs.
+ * @sel: Selector of which measurement to perform on a pseudo-locked region.
*
* The measurement of latency to access a pseudo-locked region should be
* done from a cpu that is associated with that pseudo-locked region.
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 3be203297988..001808e3901c 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -383,7 +383,7 @@ const struct vm_operations_struct sgx_vm_ops = {
/**
* sgx_encl_release - Destroy an enclave instance
- * @kref: address of a kref inside &sgx_encl
+ * @ref: address of a kref inside &sgx_encl
*
* Used together with kref_put(). Frees all the resources associated with the
* enclave and the instance itself.
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index 6ad165a5c0cc..64511c4a5200 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -212,6 +212,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)
list_splice_tail(&secs_pages, &zombie_secs_pages);
mutex_unlock(&zombie_secs_pages_lock);
+ xa_destroy(&vepc->page_array);
kfree(vepc);
return 0;
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
index e2ad30e474f8..9c7a5f049292 100644
--- a/arch/x86/kernel/cpu/tsx.c
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -2,7 +2,7 @@
/*
* Intel Transactional Synchronization Extensions (TSX) control.
*
- * Copyright (C) 2019 Intel Corporation
+ * Copyright (C) 2019-2021 Intel Corporation
*
* Author:
* Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
@@ -84,13 +84,46 @@ static enum tsx_ctrl_states x86_get_tsx_auto_mode(void)
return TSX_CTRL_ENABLE;
}
+void tsx_clear_cpuid(void)
+{
+ u64 msr;
+
+ /*
+ * MSR_TFA_TSX_CPUID_CLEAR bit is only present when both CPUID
+ * bits RTM_ALWAYS_ABORT and TSX_FORCE_ABORT are present.
+ */
+ if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) &&
+ boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) {
+ rdmsrl(MSR_TSX_FORCE_ABORT, msr);
+ msr |= MSR_TFA_TSX_CPUID_CLEAR;
+ wrmsrl(MSR_TSX_FORCE_ABORT, msr);
+ }
+}
+
void __init tsx_init(void)
{
char arg[5] = {};
int ret;
- if (!tsx_ctrl_is_supported())
+ /*
+ * Hardware will always abort a TSX transaction if both CPUID bits
+ * RTM_ALWAYS_ABORT and TSX_FORCE_ABORT are set. In this case, it is
+ * better not to enumerate CPUID.RTM and CPUID.HLE bits. Clear them
+ * here.
+ */
+ if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) &&
+ boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) {
+ tsx_ctrl_state = TSX_CTRL_RTM_ALWAYS_ABORT;
+ tsx_clear_cpuid();
+ setup_clear_cpu_cap(X86_FEATURE_RTM);
+ setup_clear_cpu_cap(X86_FEATURE_HLE);
return;
+ }
+
+ if (!tsx_ctrl_is_supported()) {
+ tsx_ctrl_state = TSX_CTRL_NOT_SUPPORTED;
+ return;
+ }
ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg));
if (ret >= 0) {
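
For illustration (sketch with a hypothetical helper, not from the patch): tsx_clear_cpuid() above uses the usual read-modify-write MSR sequence to set a single control bit, using the kernel's 64-bit MSR accessors rdmsrl()/wrmsrl(). The generic form of that sequence:

static void msr_set_bit_rmw(u32 msr_nr, u64 bit)
{
        u64 val;

        rdmsrl(msr_nr, val);    /* read current MSR contents */
        val |= bit;             /* set the requested control bit */
        wrmsrl(msr_nr, val);    /* write the updated value back */
}
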
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 54ce999ed321..e8326a8d1c5d 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -70,19 +70,6 @@ static inline void cpu_crash_vmclear_loaded_vmcss(void)
rcu_read_unlock();
}
-/*
- * When the crashkernel option is specified, only use the low
- * 1M for the real mode trampoline.
- */
-void __init crash_reserve_low_1M(void)
-{
- if (cmdline_find_option(boot_command_line, "crashkernel", NULL, 0) < 0)
- return;
-
- memblock_reserve(0, 1<<20);
- pr_info("Reserving the low 1M of memory for crashkernel\n");
-}
-
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index a4ec65317a7f..b7b92cdf3add 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -221,28 +221,18 @@ sanitize_restored_user_xstate(union fpregs_state *state,
if (use_xsave()) {
/*
- * Note: we don't need to zero the reserved bits in the
- * xstate_header here because we either didn't copy them at all,
- * or we checked earlier that they aren't set.
+ * Clear all feature bits which are not set in
+ * user_xfeatures and clear all extended features
+ * for fx_only mode.
*/
+ u64 mask = fx_only ? XFEATURE_MASK_FPSSE : user_xfeatures;
/*
- * 'user_xfeatures' might have bits clear which are
- * set in header->xfeatures. This represents features that
- * were in init state prior to a signal delivery, and need
- * to be reset back to the init state. Clear any user
- * feature bits which are set in the kernel buffer to get
- * them back to the init state.
- *
- * Supervisor state is unchanged by input from userspace.
- * Ensure supervisor state bits stay set and supervisor
- * state is not modified.
+ * Supervisor state has to be preserved. The sigframe
+ * restore can only modify user features, i.e. @mask
+ * cannot contain them.
*/
- if (fx_only)
- header->xfeatures = XFEATURE_MASK_FPSSE;
- else
- header->xfeatures &= user_xfeatures |
- xfeatures_mask_supervisor();
+ header->xfeatures &= mask | xfeatures_mask_supervisor();
}
if (use_fxsr()) {
@@ -307,13 +297,17 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
return 0;
}
- if (!access_ok(buf, size))
- return -EACCES;
+ if (!access_ok(buf, size)) {
+ ret = -EACCES;
+ goto out;
+ }
- if (!static_cpu_has(X86_FEATURE_FPU))
- return fpregs_soft_set(current, NULL,
- 0, sizeof(struct user_i387_ia32_struct),
- NULL, buf) != 0;
+ if (!static_cpu_has(X86_FEATURE_FPU)) {
+ ret = fpregs_soft_set(current, NULL, 0,
+ sizeof(struct user_i387_ia32_struct),
+ NULL, buf);
+ goto out;
+ }
if (use_xsave()) {
struct _fpx_sw_bytes fx_sw_user;
@@ -369,6 +363,25 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
fpregs_unlock();
return 0;
}
+
+ /*
+ * The above did an FPU restore operation, restricted to
+ * the user portion of the registers, and failed, but the
+ * microcode might have modified the FPU registers
+ * nevertheless.
+ *
+ * If the FPU registers do not belong to current, then
+ * invalidate the FPU register state otherwise the task might
+ * preempt current and return to user space with corrupted
+ * FPU registers.
+ *
+ * In case current owns the FPU registers then no further
+ * action is required. The fixup below will handle it
+ * correctly.
+ */
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ __cpu_invalidate_fpregs_state();
+
fpregs_unlock();
} else {
/*
@@ -377,7 +390,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
*/
ret = __copy_from_user(&env, buf, sizeof(env));
if (ret)
- goto err_out;
+ goto out;
envp = &env;
}
@@ -405,16 +418,9 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
if (use_xsave() && !fx_only) {
u64 init_bv = xfeatures_mask_user() & ~user_xfeatures;
- if (using_compacted_format()) {
- ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx);
- } else {
- ret = __copy_from_user(&fpu->state.xsave, buf_fx, state_size);
-
- if (!ret && state_size > offsetof(struct xregs_state, header))
- ret = validate_user_xstate_header(&fpu->state.xsave.header);
- }
+ ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx);
if (ret)
- goto err_out;
+ goto out;
sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures,
fx_only);
@@ -434,7 +440,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size);
if (ret) {
ret = -EFAULT;
- goto err_out;
+ goto out;
}
sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures,
@@ -452,7 +458,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
} else {
ret = __copy_from_user(&fpu->state.fsave, buf_fx, state_size);
if (ret)
- goto err_out;
+ goto out;
fpregs_lock();
ret = copy_kernel_to_fregs_err(&fpu->state.fsave);
@@ -463,7 +469,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
fpregs_deactivate(fpu);
fpregs_unlock();
-err_out:
+out:
if (ret)
fpu__clear_user_states(fpu);
return ret;
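
For illustration only: the sanitize_restored_user_xstate() change above reduces to one mask computation; user features are limited to user_xfeatures (or to FP|SSE in fx_only mode) while supervisor features always survive. Expressed as a standalone sketch with a hypothetical helper name (not part of the patch):

static u64 sanitize_user_xfeatures(u64 xfeatures, u64 user_xfeatures, bool fx_only)
{
        u64 mask = fx_only ? XFEATURE_MASK_FPSSE : user_xfeatures;

        /* Supervisor state is never modified by a sigframe restore. */
        return xfeatures & (mask | xfeatures_mask_supervisor());
}
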
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index a85c64000218..1cadb2faf740 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -441,12 +441,35 @@ static void __init print_xstate_offset_size(void)
}
/*
+ * All supported features have either init state all zeros or are
+ * handled in setup_init_fpu() individually. This is an explicit
+ * feature list and does not use XFEATURE_MASK*SUPPORTED to catch
+ * newly added supported features at build time and make people
+ * actually look at the init state for the new feature.
+ */
+#define XFEATURES_INIT_FPSTATE_HANDLED \
+ (XFEATURE_MASK_FP | \
+ XFEATURE_MASK_SSE | \
+ XFEATURE_MASK_YMM | \
+ XFEATURE_MASK_OPMASK | \
+ XFEATURE_MASK_ZMM_Hi256 | \
+ XFEATURE_MASK_Hi16_ZMM | \
+ XFEATURE_MASK_PKRU | \
+ XFEATURE_MASK_BNDREGS | \
+ XFEATURE_MASK_BNDCSR | \
+ XFEATURE_MASK_PASID)
+
+/*
* setup the xstate image representing the init state
*/
static void __init setup_init_fpu_buf(void)
{
static int on_boot_cpu __initdata = 1;
+ BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
+ XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
+ XFEATURES_INIT_FPSTATE_HANDLED);
+
WARN_ON_FPU(!on_boot_cpu);
on_boot_cpu = 0;
@@ -466,10 +489,22 @@ static void __init setup_init_fpu_buf(void)
copy_kernel_to_xregs_booting(&init_fpstate.xsave);
/*
- * Dump the init state again. This is to identify the init state
- * of any feature which is not represented by all zero's.
+ * All components are now in init state. Read the state back so
+ * that init_fpstate contains all non-zero init state. This only
+ * works with XSAVE, but not with XSAVEOPT and XSAVES because
+ * those use the init optimization which skips writing data for
+ * components in init state.
+ *
+ * XSAVE could be used, but that would require to reshuffle the
+ * data when XSAVES is available because XSAVES uses xstate
+ * compaction. But doing so is a pointless exercise because most
+ * components have an all zeros init state except for the legacy
+ * ones (FP and SSE). Those can be saved with FXSAVE into the
+ * legacy area. Adding new features requires to ensure that init
+ * state is all zeroes or if not to add the necessary handling
+ * here.
*/
- copy_xregs_to_kernel_booting(&init_fpstate.xsave);
+ fxsave(&init_fpstate.fxsave);
}
static int xfeature_uncompacted_offset(int xfeature_nr)
@@ -1402,60 +1437,3 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
return 0;
}
#endif /* CONFIG_PROC_PID_ARCH_STATUS */
-
-#ifdef CONFIG_IOMMU_SUPPORT
-void update_pasid(void)
-{
- u64 pasid_state;
- u32 pasid;
-
- if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
- return;
-
- if (!current->mm)
- return;
-
- pasid = READ_ONCE(current->mm->pasid);
- /* Set the valid bit in the PASID MSR/state only for valid pasid. */
- pasid_state = pasid == PASID_DISABLED ?
- pasid : pasid | MSR_IA32_PASID_VALID;
-
- /*
- * No need to hold fregs_lock() since the task's fpstate won't
- * be changed by others (e.g. ptrace) while the task is being
- * switched to or is in IPI.
- */
- if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
- /* The MSR is active and can be directly updated. */
- wrmsrl(MSR_IA32_PASID, pasid_state);
- } else {
- struct fpu *fpu = &current->thread.fpu;
- struct ia32_pasid_state *ppasid_state;
- struct xregs_state *xsave;
-
- /*
- * The CPU's xstate registers are not currently active. Just
- * update the PASID state in the memory buffer here. The
- * PASID MSR will be loaded when returning to user mode.
- */
- xsave = &fpu->state.xsave;
- xsave->header.xfeatures |= XFEATURE_MASK_PASID;
- ppasid_state = get_xsave_addr(xsave, XFEATURE_PASID);
- /*
- * Since XFEATURE_MASK_PASID is set in xfeatures, ppasid_state
- * won't be NULL and no need to check its value.
- *
- * Only update the task's PASID state when it's different
- * from the mm's pasid.
- */
- if (ppasid_state->pasid != pasid_state) {
- /*
- * Invalid fpregs so that state restoring will pick up
- * the PASID state.
- */
- __fpu_invalidate_fpregs_state(fpu);
- ppasid_state->pasid = pasid_state;
- }
- }
-}
-#endif /* CONFIG_IOMMU_SUPPORT */
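
As an aside: the BUILD_BUG_ON() added to setup_init_fpu_buf() is a compile-time cross-check that the explicit XFEATURES_INIT_FPSTATE_HANDLED list matches the supported-feature masks, so a newly supported xfeature cannot be merged without its init state being audited. The general shape of that pattern, with made-up masks (not part of the patch):

#define MY_FEATURES_HANDLED     (BIT(0) | BIT(1))       /* hypothetical */
#define MY_FEATURES_SUPPORTED   (BIT(0) | BIT(1))       /* hypothetical */

static void __init check_feature_masks(void)
{
        /* Build fails if a feature is added to one mask but not the other. */
        BUILD_BUG_ON(MY_FEATURES_HANDLED != MY_FEATURES_SUPPORTED);
}
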
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 04bddaaba8e2..d8b3ebd2bb85 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -62,7 +62,7 @@ SYM_CODE_START_NOALIGN(startup_64)
*/
/* Set up the stack for verify_cpu(), similar to initial_stack below */
- leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp
+ leaq (__end_init_task - FRAME_SIZE)(%rip), %rsp
leaq _text(%rip), %rdi
pushq %rsi
@@ -343,10 +343,10 @@ SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb)
#endif
/*
- * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder
+ * The FRAME_SIZE gap is a convention which helps the in-kernel unwinder
* reliably detect the end of the stack.
*/
-SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS)
+SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE)
__FINITDATA
__INIT
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 2779f5226dc2..df0fa695bb09 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -35,12 +35,16 @@
#define SYSG(_vector, _addr) \
G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS)
+#ifdef CONFIG_X86_64
/*
* Interrupt gate with interrupt stack. The _ist index is the index in
* the tss.ist[] array, but for the descriptor it needs to start at 1.
*/
#define ISTG(_vector, _addr, _ist) \
G(_vector, _addr, _ist + 1, GATE_INTERRUPT, DPL0, __KERNEL_CS)
+#else
+#define ISTG(_vector, _addr, _ist) INTG(_vector, _addr)
+#endif
/* Task gate */
#define TSKG(_vector, _gdt) \
@@ -74,7 +78,7 @@ static const __initconst struct idt_data early_idts[] = {
*/
static const __initconst struct idt_data def_idts[] = {
INTG(X86_TRAP_DE, asm_exc_divide_error),
- INTG(X86_TRAP_NMI, asm_exc_nmi),
+ ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI),
INTG(X86_TRAP_BR, asm_exc_bounds),
INTG(X86_TRAP_UD, asm_exc_invalid_op),
INTG(X86_TRAP_NM, asm_exc_device_not_available),
@@ -91,12 +95,16 @@ static const __initconst struct idt_data def_idts[] = {
#ifdef CONFIG_X86_32
TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS),
#else
- INTG(X86_TRAP_DF, asm_exc_double_fault),
+ ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF),
#endif
- INTG(X86_TRAP_DB, asm_exc_debug),
+ ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB),
#ifdef CONFIG_X86_MCE
- INTG(X86_TRAP_MC, asm_exc_machine_check),
+ ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE),
+#endif
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ ISTG(X86_TRAP_VC, asm_exc_vmm_communication, IST_INDEX_VC),
#endif
SYSG(X86_TRAP_OF, asm_exc_overflow),
@@ -221,22 +229,6 @@ static const __initconst struct idt_data early_pf_idts[] = {
INTG(X86_TRAP_PF, asm_exc_page_fault),
};
-/*
- * The exceptions which use Interrupt stacks. They are setup after
- * cpu_init() when the TSS has been initialized.
- */
-static const __initconst struct idt_data ist_idts[] = {
- ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB),
- ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI),
- ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF),
-#ifdef CONFIG_X86_MCE
- ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE),
-#endif
-#ifdef CONFIG_AMD_MEM_ENCRYPT
- ISTG(X86_TRAP_VC, asm_exc_vmm_communication, IST_INDEX_VC),
-#endif
-};
-
/**
* idt_setup_early_pf - Initialize the idt table with early pagefault handler
*
@@ -254,14 +246,6 @@ void __init idt_setup_early_pf(void)
idt_setup_from_table(idt_table, early_pf_idts,
ARRAY_SIZE(early_pf_idts), true);
}
-
-/**
- * idt_setup_ist_traps - Initialize the idt table with traps using IST
- */
-void __init idt_setup_ist_traps(void)
-{
- idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true);
-}
#endif
static void __init idt_map_in_cea(void)
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 6a2eb62c85e6..674906fad43b 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -15,50 +15,75 @@
#include <asm/kprobes.h>
#include <asm/alternative.h>
#include <asm/text-patching.h>
+#include <asm/insn.h>
-static void bug_at(const void *ip, int line)
+int arch_jump_entry_size(struct jump_entry *entry)
{
- /*
- * The location is not an op that we were expecting.
- * Something went wrong. Crash the box, as something could be
- * corrupting the kernel.
- */
- pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph) %d\n", ip, ip, ip, line);
- BUG();
+ struct insn insn = {};
+
+ insn_decode_kernel(&insn, (void *)jump_entry_code(entry));
+ BUG_ON(insn.length != 2 && insn.length != 5);
+
+ return insn.length;
}
-static const void *
-__jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type)
+struct jump_label_patch {
+ const void *code;
+ int size;
+};
+
+static struct jump_label_patch
+__jump_label_patch(struct jump_entry *entry, enum jump_label_type type)
{
- const void *expect, *code;
+ const void *expect, *code, *nop;
const void *addr, *dest;
- int line;
+ int size;
addr = (void *)jump_entry_code(entry);
dest = (void *)jump_entry_target(entry);
- code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest);
+ size = arch_jump_entry_size(entry);
+ switch (size) {
+ case JMP8_INSN_SIZE:
+ code = text_gen_insn(JMP8_INSN_OPCODE, addr, dest);
+ nop = x86_nops[size];
+ break;
- if (type == JUMP_LABEL_JMP) {
- expect = x86_nops[5]; line = __LINE__;
- } else {
- expect = code; line = __LINE__;
+ case JMP32_INSN_SIZE:
+ code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest);
+ nop = x86_nops[size];
+ break;
+
+ default: BUG();
}
- if (memcmp(addr, expect, JUMP_LABEL_NOP_SIZE))
- bug_at(addr, line);
+ if (type == JUMP_LABEL_JMP)
+ expect = nop;
+ else
+ expect = code;
+
+ if (memcmp(addr, expect, size)) {
+ /*
+ * The location is not an op that we were expecting.
+ * Something went wrong. Crash the box, as something could be
+ * corrupting the kernel.
+ */
+ pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph != %5ph)) size:%d type:%d\n",
+ addr, addr, addr, expect, size, type);
+ BUG();
+ }
if (type == JUMP_LABEL_NOP)
- code = x86_nops[5];
+ code = nop;
- return code;
+ return (struct jump_label_patch){.code = code, .size = size};
}
static inline void __jump_label_transform(struct jump_entry *entry,
enum jump_label_type type,
int init)
{
- const void *opcode = __jump_label_set_jump_code(entry, type);
+ const struct jump_label_patch jlp = __jump_label_patch(entry, type);
/*
* As long as only a single processor is running and the code is still
@@ -72,12 +97,11 @@ static inline void __jump_label_transform(struct jump_entry *entry,
* always nop being the 'currently valid' instruction
*/
if (init || system_state == SYSTEM_BOOTING) {
- text_poke_early((void *)jump_entry_code(entry), opcode,
- JUMP_LABEL_NOP_SIZE);
+ text_poke_early((void *)jump_entry_code(entry), jlp.code, jlp.size);
return;
}
- text_poke_bp((void *)jump_entry_code(entry), opcode, JUMP_LABEL_NOP_SIZE, NULL);
+ text_poke_bp((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
}
static void __ref jump_label_transform(struct jump_entry *entry,
@@ -98,7 +122,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
bool arch_jump_label_transform_queue(struct jump_entry *entry,
enum jump_label_type type)
{
- const void *opcode;
+ struct jump_label_patch jlp;
if (system_state == SYSTEM_BOOTING) {
/*
@@ -109,9 +133,8 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry,
}
mutex_lock(&text_mutex);
- opcode = __jump_label_set_jump_code(entry, type);
- text_poke_queue((void *)jump_entry_code(entry),
- opcode, JUMP_LABEL_NOP_SIZE, NULL);
+ jlp = __jump_label_patch(entry, type);
+ text_poke_queue((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
mutex_unlock(&text_mutex);
return true;
}
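
For illustration (hypothetical helper, not from the patch): with variable-size jump entries, __jump_label_patch() above first sizes the site (2 or 5 bytes via instruction decode) and then generates a matching JMP8 or JMP32, with the same-size NOP as the other expected pattern. A compressed sketch of the size-based selection, assuming the kernel's text_gen_insn():

static const void *jump_code_for_size(void *addr, void *dest, int size)
{
        switch (size) {
        case JMP8_INSN_SIZE:    /* 2-byte site: short relative jump */
                return text_gen_insn(JMP8_INSN_OPCODE, addr, dest);
        case JMP32_INSN_SIZE:   /* 5-byte site: near relative jump */
                return text_gen_insn(JMP32_INSN_OPCODE, addr, dest);
        default:
                BUG();          /* corrupted or unexpected jump site */
        }
}
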
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index d3d65545cb8b..c492ad3001ca 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -674,7 +674,7 @@ static int prepare_emulation(struct kprobe *p, struct insn *insn)
break;
if (insn->addr_bytes != sizeof(unsigned long))
- return -EOPNOTSUPP; /* Don't support differnt size */
+ return -EOPNOTSUPP; /* Don't support different size */
if (X86_MODRM_MOD(opcode) != 3)
return -EOPNOTSUPP; /* TODO: support memory addressing */
@@ -1102,24 +1102,6 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
restore_previous_kprobe(kcb);
else
reset_current_kprobe();
- } else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE ||
- kcb->kprobe_status == KPROBE_HIT_SSDONE) {
- /*
- * We increment the nmissed count for accounting,
- * we can also use npre/npostfault count for accounting
- * these specific fault cases.
- */
- kprobes_inc_nmissed_count(cur);
-
- /*
- * We come here because instructions in the pre/post
- * handler caused the page_fault, this could happen
- * if handler tries to access user space by
- * copy_from_user(), get_user() etc. Let the
- * user-specified handler try to fix it first.
- */
- if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
- return 1;
}
return 0;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5e1f38179f49..e52b208b4641 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -931,7 +931,7 @@ unsigned long get_wchan(struct task_struct *p)
unsigned long start, bottom, top, sp, fp, ip, ret = 0;
int count = 0;
- if (p == current || p->state == TASK_RUNNING)
+ if (p == current || task_is_running(p))
return 0;
if (!try_get_task_stack(p))
@@ -975,7 +975,7 @@ unsigned long get_wchan(struct task_struct *p)
goto out;
}
fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
- } while (count++ < 16 && p->state != TASK_RUNNING);
+ } while (count++ < 16 && !task_is_running(p));
out:
put_task_stack(p);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 87a4143aa7d7..4c208ea3bd9f 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -911,7 +911,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
* syscall with TS_COMPAT still set.
*/
regs->orig_ax = value;
- if (syscall_get_nr(child, regs) >= 0)
+ if (syscall_get_nr(child, regs) != -1)
child->thread_info.status |= TS_I386_REGS_POKED;
break;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 72920af0b3c0..85acd22f8022 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -44,6 +44,7 @@
#include <asm/pci-direct.h>
#include <asm/prom.h>
#include <asm/proto.h>
+#include <asm/thermal.h>
#include <asm/unwind.h>
#include <asm/vsyscall.h>
#include <linux/vmalloc.h>
@@ -637,11 +638,11 @@ static void __init trim_snb_memory(void)
* them from accessing certain memory ranges, namely anything below
* 1M and in the pages listed in bad_pages[] above.
*
- * To avoid these pages being ever accessed by SNB gfx devices
- * reserve all memory below the 1 MB mark and bad_pages that have
- * not already been reserved at boot time.
+ * To avoid these pages being ever accessed by SNB gfx devices reserve
+ * bad_pages that have not already been reserved at boot time.
+ * All memory below the 1 MB mark is anyway reserved later during
+ * setup_arch(), so there is no need to reserve it here.
*/
- memblock_reserve(0, 1<<20);
for (i = 0; i < ARRAY_SIZE(bad_pages); i++) {
if (memblock_reserve(bad_pages[i], PAGE_SIZE))
@@ -694,30 +695,6 @@ static void __init e820_add_kernel_range(void)
e820__range_add(start, size, E820_TYPE_RAM);
}
-static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
-
-static int __init parse_reservelow(char *p)
-{
- unsigned long long size;
-
- if (!p)
- return -EINVAL;
-
- size = memparse(p, &p);
-
- if (size < 4096)
- size = 4096;
-
- if (size > 640*1024)
- size = 640*1024;
-
- reserve_low = size;
-
- return 0;
-}
-
-early_param("reservelow", parse_reservelow);
-
static void __init early_reserve_memory(void)
{
/*
@@ -733,14 +710,14 @@ static void __init early_reserve_memory(void)
* The first 4Kb of memory is a BIOS owned area, but generally it is
* not listed as such in the E820 table.
*
- * Reserve the first memory page and typically some additional
- * memory (64KiB by default) since some BIOSes are known to corrupt
- * low memory. See the Kconfig help text for X86_RESERVE_LOW.
+ * Reserve the first 64K of memory since some BIOSes are known to
+ * corrupt low memory. After the real mode trampoline is allocated the
+ * rest of the memory below 640k is reserved.
*
* In addition, make sure page 0 is always reserved because on
* systems with L1TF its contents can be leaked to user processes.
*/
- memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
+ memblock_reserve(0, SZ_64K);
early_reserve_initrd();
@@ -751,6 +728,7 @@ static void __init early_reserve_memory(void)
reserve_ibft_region();
reserve_bios_regions();
+ trim_snb_memory();
}
/*
@@ -1081,14 +1059,21 @@ void __init setup_arch(char **cmdline_p)
(max_pfn_mapped<<PAGE_SHIFT) - 1);
#endif
- reserve_real_mode();
-
/*
- * Reserving memory causing GPU hangs on Sandy Bridge integrated
- * graphics devices should be done after we allocated memory under
- * 1M for the real mode trampoline.
+ * Find free memory for the real mode trampoline and place it there. If
+ * there is not enough free memory under 1M, on EFI-enabled systems
+ * there will be additional attempt to reclaim the memory for the real
+ * mode trampoline at efi_free_boot_services().
+ *
+ * Unconditionally reserve the entire first 1M of RAM because BIOSes
+ * are known to corrupt low memory and several hundred kilobytes are not
+ * worth complex detection what memory gets clobbered. Windows does the
+ * same thing for very similar reasons.
+ *
+ * Moreover, on machines with SandyBridge graphics or in setups that use
+ * crashkernel the entire 1M is reserved anyway.
*/
- trim_snb_memory();
+ reserve_real_mode();
init_mem_mapping();
@@ -1226,6 +1211,14 @@ void __init setup_arch(char **cmdline_p)
x86_init.timers.wallclock_init();
+ /*
+ * This needs to run before setup_local_APIC() which soft-disables the
+ * local APIC temporarily and that masks the thermal LVT interrupt,
+ * leading to softlockups on machines which have configured SMI
+ * interrupt delivery.
+ */
+ therm_lvt_init();
+
mcheck_init();
register_refined_jiffies(CLOCK_TICK_RATE);
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index 6ec8b3bfd76e..9f90f460a28c 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -63,6 +63,7 @@ static bool sev_es_negotiate_protocol(void)
static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb)
{
+ ghcb->save.sw_exit_code = 0;
memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
}
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 9578c82832aa..a6895e440bc3 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -7,12 +7,11 @@
* Author: Joerg Roedel <jroedel@suse.de>
*/
-#define pr_fmt(fmt) "SEV-ES: " fmt
+#define pr_fmt(fmt) "SEV: " fmt
#include <linux/sched/debug.h> /* For show_regs() */
#include <linux/percpu-defs.h>
#include <linux/mem_encrypt.h>
-#include <linux/lockdep.h>
#include <linux/printk.h>
#include <linux/mm_types.h>
#include <linux/set_memory.h>
@@ -192,19 +191,39 @@ void noinstr __sev_es_ist_exit(void)
this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist);
}
-static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state)
+/*
+ * Nothing shall interrupt this code path while holding the per-CPU
+ * GHCB. The backup GHCB is only for NMIs interrupting this path.
+ *
+ * Callers must disable local interrupts around it.
+ */
+static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
{
struct sev_es_runtime_data *data;
struct ghcb *ghcb;
+ WARN_ON(!irqs_disabled());
+
data = this_cpu_read(runtime_data);
ghcb = &data->ghcb_page;
if (unlikely(data->ghcb_active)) {
/* GHCB is already in use - save its contents */
- if (unlikely(data->backup_ghcb_active))
- return NULL;
+ if (unlikely(data->backup_ghcb_active)) {
+ /*
+ * Backup-GHCB is also already in use. There is no way
+ * to continue here so just kill the machine. To make
+ * panic() work, mark GHCBs inactive so that messages
+ * can be printed out.
+ */
+ data->ghcb_active = false;
+ data->backup_ghcb_active = false;
+
+ instrumentation_begin();
+ panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
+ instrumentation_end();
+ }
/* Mark backup_ghcb active before writing to it */
data->backup_ghcb_active = true;
@@ -221,24 +240,6 @@ static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state)
return ghcb;
}
-static __always_inline void sev_es_put_ghcb(struct ghcb_state *state)
-{
- struct sev_es_runtime_data *data;
- struct ghcb *ghcb;
-
- data = this_cpu_read(runtime_data);
- ghcb = &data->ghcb_page;
-
- if (state->ghcb) {
- /* Restore GHCB from Backup */
- *ghcb = *state->ghcb;
- data->backup_ghcb_active = false;
- state->ghcb = NULL;
- } else {
- data->ghcb_active = false;
- }
-}
-
/* Needed in vc_early_forward_exception */
void do_early_exception(struct pt_regs *regs, int trapnr);
@@ -266,17 +267,24 @@ static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt)
{
char buffer[MAX_INSN_SIZE];
- int res;
+ int insn_bytes;
- res = insn_fetch_from_user_inatomic(ctxt->regs, buffer);
- if (!res) {
+ insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer);
+ if (insn_bytes == 0) {
+ /* Nothing could be copied */
ctxt->fi.vector = X86_TRAP_PF;
ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER;
ctxt->fi.cr2 = ctxt->regs->ip;
return ES_EXCEPTION;
+ } else if (insn_bytes == -EINVAL) {
+ /* Effective RIP could not be calculated */
+ ctxt->fi.vector = X86_TRAP_GP;
+ ctxt->fi.error_code = 0;
+ ctxt->fi.cr2 = 0;
+ return ES_EXCEPTION;
}
- if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, res))
+ if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes))
return ES_DECODE_FAILED;
if (ctxt->insn.immediate.got)
@@ -323,31 +331,44 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
u16 d2;
u8 d1;
- /* If instruction ran in kernel mode and the I/O buffer is in kernel space */
- if (!user_mode(ctxt->regs) && !access_ok(target, size)) {
- memcpy(dst, buf, size);
- return ES_OK;
- }
-
+ /*
+ * This function uses __put_user() independent of whether kernel or user
+ * memory is accessed. This works fine because __put_user() does no
+ * sanity checks of the pointer being accessed. All it does is report
+ * when the access failed.
+ *
+ * Also, this function runs in atomic context, so __put_user() is not
+ * allowed to sleep. The page-fault handler detects that it is running
+ * in atomic context and will not try to take mmap_sem and handle the
+ * fault, so additional pagefault_enable()/disable() calls are not
+ * needed.
+ *
+ * The access can't be done via copy_to_user() here because
+ * vc_write_mem() must not use string instructions to access unsafe
+ * memory. The reason is that MOVS is emulated by the #VC handler by
+ * splitting the move up into a read and a write and taking a nested #VC
+ * exception on whichever of them is the MMIO access. Using string
+ * instructions here would cause infinite nesting.
+ */
switch (size) {
case 1:
memcpy(&d1, buf, 1);
- if (put_user(d1, target))
+ if (__put_user(d1, target))
goto fault;
break;
case 2:
memcpy(&d2, buf, 2);
- if (put_user(d2, target))
+ if (__put_user(d2, target))
goto fault;
break;
case 4:
memcpy(&d4, buf, 4);
- if (put_user(d4, target))
+ if (__put_user(d4, target))
goto fault;
break;
case 8:
memcpy(&d8, buf, 8);
- if (put_user(d8, target))
+ if (__put_user(d8, target))
goto fault;
break;
default:
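The comment above carries the central constraint of the emulation path: exactly one fixed-size (1, 2, 4 or 8 byte) access to the target, never a string operation, because MOVS is emulated by splitting it into a read and a write and taking a nested #VC on the MMIO half. Here is a hedged user-space illustration of that shape; write_sized() is invented for the example, and in the kernel the final store goes through __put_user() so a faulting address is reported instead of crashing.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* One fixed-size store to the target, never a string copy over it. */
static int write_sized(void *dst, const void *buf, size_t size)
{
        uint8_t d1; uint16_t d2; uint32_t d4; uint64_t d8;

        switch (size) {
        case 1: memcpy(&d1, buf, 1); *(volatile uint8_t  *)dst = d1; break;
        case 2: memcpy(&d2, buf, 2); *(volatile uint16_t *)dst = d2; break;
        case 4: memcpy(&d4, buf, 4); *(volatile uint32_t *)dst = d4; break;
        case 8: memcpy(&d8, buf, 8); *(volatile uint64_t *)dst = d8; break;
        default: return -1;      /* unsupported access size */
        }
        return 0;
}

int main(void)
{
        uint32_t target = 0;
        uint32_t val = 0xdeadbeef;

        if (!write_sized(&target, &val, sizeof(val)))
                printf("wrote 0x%" PRIx32 "\n", target);
        return 0;
}

The vc_read_mem() hunk below mirrors this with fixed-size loads through __get_user().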
@@ -378,30 +399,43 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
u16 d2;
u8 d1;
- /* If instruction ran in kernel mode and the I/O buffer is in kernel space */
- if (!user_mode(ctxt->regs) && !access_ok(s, size)) {
- memcpy(buf, src, size);
- return ES_OK;
- }
-
+ /*
+ * This function uses __get_user() independent of whether kernel or user
+ * memory is accessed. This works fine because __get_user() does no
+ * sanity checks of the pointer being accessed. All it does is report
+ * when the access failed.
+ *
+ * Also, this function runs in atomic context, so __get_user() is not
+ * allowed to sleep. The page-fault handler detects that it is running
+ * in atomic context and will not try to take mmap_sem and handle the
+ * fault, so additional pagefault_enable()/disable() calls are not
+ * needed.
+ *
+ * The access can't be done via copy_from_user() here because
+ * vc_read_mem() must not use string instructions to access unsafe
+ * memory. The reason is that MOVS is emulated by the #VC handler by
+ * splitting the move up into a read and a write and taking a nested #VC
+ * exception on whichever of them is the MMIO access. Using string
+ * instructions here would cause infinite nesting.
+ */
switch (size) {
case 1:
- if (get_user(d1, s))
+ if (__get_user(d1, s))
goto fault;
memcpy(buf, &d1, 1);
break;
case 2:
- if (get_user(d2, s))
+ if (__get_user(d2, s))
goto fault;
memcpy(buf, &d2, 2);
break;
case 4:
- if (get_user(d4, s))
+ if (__get_user(d4, s))
goto fault;
memcpy(buf, &d4, 4);
break;
case 8:
- if (get_user(d8, s))
+ if (__get_user(d8, s))
goto fault;
memcpy(buf, &d8, 8);
break;
@@ -461,12 +495,37 @@ static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt
/* Include code shared with pre-decompression boot stage */
#include "sev-shared.c"
+static noinstr void __sev_put_ghcb(struct ghcb_state *state)
+{
+ struct sev_es_runtime_data *data;
+ struct ghcb *ghcb;
+
+ WARN_ON(!irqs_disabled());
+
+ data = this_cpu_read(runtime_data);
+ ghcb = &data->ghcb_page;
+
+ if (state->ghcb) {
+ /* Restore GHCB from Backup */
+ *ghcb = *state->ghcb;
+ data->backup_ghcb_active = false;
+ state->ghcb = NULL;
+ } else {
+ /*
+ * Invalidate the GHCB so a VMGEXIT instruction issued
+ * from userspace won't appear to be valid.
+ */
+ vc_ghcb_invalidate(ghcb);
+ data->ghcb_active = false;
+ }
+}
+
void noinstr __sev_es_nmi_complete(void)
{
struct ghcb_state state;
struct ghcb *ghcb;
- ghcb = sev_es_get_ghcb(&state);
+ ghcb = __sev_get_ghcb(&state);
vc_ghcb_invalidate(ghcb);
ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE);
@@ -476,7 +535,7 @@ void noinstr __sev_es_nmi_complete(void)
sev_es_wr_ghcb_msr(__pa_nodebug(ghcb));
VMGEXIT();
- sev_es_put_ghcb(&state);
+ __sev_put_ghcb(&state);
}
static u64 get_jump_table_addr(void)
@@ -488,7 +547,7 @@ static u64 get_jump_table_addr(void)
local_irq_save(flags);
- ghcb = sev_es_get_ghcb(&state);
+ ghcb = __sev_get_ghcb(&state);
vc_ghcb_invalidate(ghcb);
ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE);
@@ -502,7 +561,7 @@ static u64 get_jump_table_addr(void)
ghcb_sw_exit_info_2_is_valid(ghcb))
ret = ghcb->save.sw_exit_info_2;
- sev_es_put_ghcb(&state);
+ __sev_put_ghcb(&state);
local_irq_restore(flags);
@@ -627,7 +686,7 @@ static void sev_es_ap_hlt_loop(void)
struct ghcb_state state;
struct ghcb *ghcb;
- ghcb = sev_es_get_ghcb(&state);
+ ghcb = __sev_get_ghcb(&state);
while (true) {
vc_ghcb_invalidate(ghcb);
@@ -644,7 +703,7 @@ static void sev_es_ap_hlt_loop(void)
break;
}
- sev_es_put_ghcb(&state);
+ __sev_put_ghcb(&state);
}
/*
@@ -734,7 +793,7 @@ void __init sev_es_init_vc_handling(void)
sev_es_setup_play_dead();
/* Secondary CPUs use the runtime #VC handler */
- initial_vc_handler = (unsigned long)safe_stack_exc_vmm_communication;
+ initial_vc_handler = (unsigned long)kernel_exc_vmm_communication;
}
static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
@@ -1172,14 +1231,6 @@ static enum es_result vc_handle_trap_ac(struct ghcb *ghcb,
return ES_EXCEPTION;
}
-static __always_inline void vc_handle_trap_db(struct pt_regs *regs)
-{
- if (user_mode(regs))
- noist_exc_debug(regs);
- else
- exc_debug(regs);
-}
-
static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
struct ghcb *ghcb,
unsigned long exit_code)
@@ -1255,6 +1306,10 @@ static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt)
case X86_TRAP_UD:
exc_invalid_op(ctxt->regs);
break;
+ case X86_TRAP_PF:
+ write_cr2(ctxt->fi.cr2);
+ exc_page_fault(ctxt->regs, error_code);
+ break;
case X86_TRAP_AC:
exc_alignment_check(ctxt->regs, error_code);
break;
@@ -1271,55 +1326,15 @@ static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs)
return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
}
-/*
- * Main #VC exception handler. It is called when the entry code was able to
- * switch off the IST to a safe kernel stack.
- *
- * With the current implementation it is always possible to switch to a safe
- * stack because #VC exceptions only happen at known places, like intercepted
- * instructions or accesses to MMIO areas/IO ports. They can also happen with
- * code instrumentation when the hypervisor intercepts #DB, but the critical
- * paths are forbidden to be instrumented, so #DB exceptions currently also
- * only happen in safe places.
- */
-DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
+static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code)
{
- struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
- irqentry_state_t irq_state;
struct ghcb_state state;
struct es_em_ctxt ctxt;
enum es_result result;
struct ghcb *ghcb;
+ bool ret = true;
- /*
- * Handle #DB before calling into !noinstr code to avoid recursive #DB.
- */
- if (error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB) {
- vc_handle_trap_db(regs);
- return;
- }
-
- irq_state = irqentry_nmi_enter(regs);
- lockdep_assert_irqs_disabled();
- instrumentation_begin();
-
- /*
- * This is invoked through an interrupt gate, so IRQs are disabled. The
- * code below might walk page-tables for user or kernel addresses, so
- * keep the IRQs disabled to protect us against concurrent TLB flushes.
- */
-
- ghcb = sev_es_get_ghcb(&state);
- if (!ghcb) {
- /*
- * Mark GHCBs inactive so that panic() is able to print the
- * message.
- */
- data->ghcb_active = false;
- data->backup_ghcb_active = false;
-
- panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
- }
+ ghcb = __sev_get_ghcb(&state);
vc_ghcb_invalidate(ghcb);
result = vc_init_em_ctxt(&ctxt, regs, error_code);
@@ -1327,7 +1342,7 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
if (result == ES_OK)
result = vc_handle_exitcode(&ctxt, ghcb, error_code);
- sev_es_put_ghcb(&state);
+ __sev_put_ghcb(&state);
/* Done - now check the result */
switch (result) {
@@ -1335,17 +1350,20 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
vc_finish_insn(&ctxt);
break;
case ES_UNSUPPORTED:
- pr_err_ratelimited("Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
+ pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n",
error_code, regs->ip);
- goto fail;
+ ret = false;
+ break;
case ES_VMM_ERROR:
pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
error_code, regs->ip);
- goto fail;
+ ret = false;
+ break;
case ES_DECODE_FAILED:
pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
error_code, regs->ip);
- goto fail;
+ ret = false;
+ break;
case ES_EXCEPTION:
vc_forward_exception(&ctxt);
break;
@@ -1361,24 +1379,52 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
BUG();
}
-out:
- instrumentation_end();
- irqentry_nmi_exit(regs, irq_state);
+ return ret;
+}
- return;
+static __always_inline bool vc_is_db(unsigned long error_code)
+{
+ return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB;
+}
-fail:
- if (user_mode(regs)) {
- /*
- * Do not kill the machine if user-space triggered the
- * exception. Send SIGBUS instead and let user-space deal with
- * it.
- */
- force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
- } else {
- pr_emerg("PANIC: Unhandled #VC exception in kernel space (result=%d)\n",
- result);
+/*
+ * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode
+ * and will panic when an error happens.
+ */
+DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication)
+{
+ irqentry_state_t irq_state;
+
+ /*
+ * With the current implementation it is always possible to switch to a
+ * safe stack because #VC exceptions only happen at known places, like
+ * intercepted instructions or accesses to MMIO areas/IO ports. They can
+ * also happen with code instrumentation when the hypervisor intercepts
+ * #DB, but the critical paths are forbidden to be instrumented, so #DB
+ * exceptions currently also only happen in safe places.
+ *
+ * But keep this here in case the noinstr annotations are violated due
+ * to a bug elsewhere.
+ */
+ if (unlikely(on_vc_fallback_stack(regs))) {
+ instrumentation_begin();
+ panic("Can't handle #VC exception from unsupported context\n");
+ instrumentation_end();
+ }
+
+ /*
+ * Handle #DB before calling into !noinstr code to avoid recursive #DB.
+ */
+ if (vc_is_db(error_code)) {
+ exc_debug(regs);
+ return;
+ }
+ irq_state = irqentry_nmi_enter(regs);
+
+ instrumentation_begin();
+
+ if (!vc_raw_handle_exception(regs, error_code)) {
/* Show some debug info */
show_regs(regs);
@@ -1389,23 +1435,38 @@ fail:
panic("Returned from Terminate-Request to Hypervisor\n");
}
- goto out;
+ instrumentation_end();
+ irqentry_nmi_exit(regs, irq_state);
}
-/* This handler runs on the #VC fall-back stack. It can cause further #VC exceptions */
-DEFINE_IDTENTRY_VC_IST(exc_vmm_communication)
+/*
+ * Runtime #VC exception handler when raised from user mode. Runs in IRQ mode
+ * and will kill the current task with SIGBUS when an error happens.
+ */
+DEFINE_IDTENTRY_VC_USER(exc_vmm_communication)
{
+ /*
+ * Handle #DB before calling into !noinstr code to avoid recursive #DB.
+ */
+ if (vc_is_db(error_code)) {
+ noist_exc_debug(regs);
+ return;
+ }
+
+ irqentry_enter_from_user_mode(regs);
instrumentation_begin();
- panic("Can't handle #VC exception from unsupported context\n");
- instrumentation_end();
-}
-DEFINE_IDTENTRY_VC(exc_vmm_communication)
-{
- if (likely(!on_vc_fallback_stack(regs)))
- safe_stack_exc_vmm_communication(regs, error_code);
- else
- ist_exc_vmm_communication(regs, error_code);
+ if (!vc_raw_handle_exception(regs, error_code)) {
+ /*
+ * Do not kill the machine if user-space triggered the
+ * exception. Send SIGBUS instead and let user-space deal with
+ * it.
+ */
+ force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
+ }
+
+ instrumentation_end();
+ irqentry_exit_to_user_mode(regs);
}
bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
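Taken together, the rework above replaces the single stack-switching handler with two entry points and two error policies: an unhandled #VC raised from kernel mode ends in a terminate request to the hypervisor and panic(), while an unhandled #VC raised from user mode only kills the offending task with SIGBUS. The short C summary below is purely illustrative; every name in it is invented for the sketch.

#include <stdio.h>

enum vc_ctx    { VC_FROM_KERNEL, VC_FROM_USER };
enum vc_action { VC_CONTINUE, VC_SIGBUS_TASK, VC_PANIC };

/* handled != 0 means vc_raw_handle_exception() succeeded. */
static enum vc_action vc_error_policy(enum vc_ctx ctx, int handled)
{
        if (handled)
                return VC_CONTINUE;
        return ctx == VC_FROM_USER ? VC_SIGBUS_TASK : VC_PANIC;
}

int main(void)
{
        printf("kernel+fail -> %d, user+fail -> %d, ok -> %d\n",
               vc_error_policy(VC_FROM_KERNEL, 0),
               vc_error_policy(VC_FROM_USER, 0),
               vc_error_policy(VC_FROM_USER, 1));
        return 0;
}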
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index a06cb107c0e8..e12779a2714d 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -713,7 +713,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL);
/* Are we from a system call? */
- if (syscall_get_nr(current, regs) >= 0) {
+ if (syscall_get_nr(current, regs) != -1) {
/* If so, check system call restarting.. */
switch (syscall_get_error(current, regs)) {
case -ERESTART_RESTARTBLOCK:
@@ -793,7 +793,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal)
}
/* Did we come from a system call? */
- if (syscall_get_nr(current, regs) >= 0) {
+ if (syscall_get_nr(current, regs) != -1) {
/* Restart the system call - no handlers present */
switch (syscall_get_error(current, regs)) {
case -ERESTARTNOHAND:
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index 0e5d0a7e203b..06743ec054d2 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -127,6 +127,9 @@ static inline void signal_compat_build_tests(void)
BUILD_BUG_ON(offsetof(siginfo_t, si_addr) != 0x10);
BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr) != 0x0C);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_trapno) != 0x18);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_trapno) != 0x10);
+
BUILD_BUG_ON(offsetof(siginfo_t, si_addr_lsb) != 0x18);
BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr_lsb) != 0x10);
@@ -138,8 +141,10 @@ static inline void signal_compat_build_tests(void)
BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20);
BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14);
- BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x18);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf) != 0x10);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_perf_data) != 0x18);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_perf_type) != 0x20);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_data) != 0x10);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_type) != 0x14);
CHECK_CSI_OFFSET(_sigpoll);
CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int));
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7770245cc7fa..9320285a5e29 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -232,11 +232,9 @@ static void notrace start_secondary(void *unused)
load_cr3(swapper_pg_dir);
__flush_tlb_all();
#endif
- cpu_init_exception_handling();
- cpu_init();
+ cpu_init_secondary();
rcu_cpu_starting(raw_smp_processor_id());
x86_cpuinit.early_percpu_clock_init();
- preempt_disable();
smp_callin();
enable_start_cpu0 = 0;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 853ea7a80806..ed540e09a399 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -1160,12 +1160,9 @@ void __init trap_init(void)
/* Init GHCB memory pages when running as an SEV-ES guest */
sev_es_init_vc_handling();
+ /* Initialize TSS before setting up traps so ISTs work */
+ cpu_init_exception_handling();
+ /* Set up traps as cpu_init() might #GP */
idt_setup_traps();
-
- /*
- * Should be a barrier for any external CPU state:
- */
cpu_init();
-
- idt_setup_ist_traps();
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 57ec01192180..2e076a459a0c 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1128,6 +1128,7 @@ static int tsc_cs_enable(struct clocksource *cs)
static struct clocksource clocksource_tsc_early = {
.name = "tsc-early",
.rating = 299,
+ .uncertainty_margin = 32 * NSEC_PER_MSEC,
.read = read_tsc,
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
@@ -1152,7 +1153,8 @@ static struct clocksource clocksource_tsc = {
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_VALID_FOR_HRES |
- CLOCK_SOURCE_MUST_VERIFY,
+ CLOCK_SOURCE_MUST_VERIFY |
+ CLOCK_SOURCE_VERIFY_PERCPU,
.vdso_clock_mode = VDSO_CLOCKMODE_TSC,
.enable = tsc_cs_enable,
.resume = tsc_resume,
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
index 8daa70b0d2da..576b47e7523d 100644
--- a/arch/x86/kernel/umip.c
+++ b/arch/x86/kernel/umip.c
@@ -346,14 +346,12 @@ bool fixup_umip_exception(struct pt_regs *regs)
if (!regs)
return false;
- nr_copied = insn_fetch_from_user(regs, buf);
-
/*
- * The insn_fetch_from_user above could have failed if user code
- * is protected by a memory protection key. Give up on emulation
- * in such a case. Should we issue a page fault?
+ * Give up on emulation if fetching the instruction failed. Should a
+ * page fault or a #GP be issued?
*/
- if (!nr_copied)
+ nr_copied = insn_fetch_from_user(regs, buf);
+ if (nr_copied <= 0)
return false;
if (!insn_decode_from_regs(&insn, regs, buf, nr_copied))