Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 3
-rw-r--r--  arch/x86/kernel/acpi/wakeup_32.S | 9
-rw-r--r--  arch/x86/kernel/acpi/wakeup_64.S | 10
-rw-r--r--  arch/x86/kernel/alternative.c | 132
-rw-r--r--  arch/x86/kernel/amd_gart_64.c | 12
-rw-r--r--  arch/x86/kernel/apic/apic.c | 43
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 25
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 184
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 2
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 185
-rw-r--r--  arch/x86/kernel/cpu/common.c | 292
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 18
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 13
-rw-r--r--  arch/x86/kernel/cpu/mce/amd.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mce/core.c | 93
-rw-r--r--  arch/x86/kernel/cpu/mce/intel.c | 11
-rw-r--r--  arch/x86/kernel/cpu/mce/internal.h | 6
-rw-r--r--  arch/x86/kernel/cpu/mce/therm_throt.c | 251
-rw-r--r--  arch/x86/kernel/cpu/microcode/amd.c | 4
-rw-r--r--  arch/x86/kernel/cpu/microcode/core.c | 36
-rw-r--r--  arch/x86/kernel/cpu/microcode/intel.c | 5
-rw-r--r--  arch/x86/kernel/cpu/mshyperv.c | 7
-rw-r--r--  arch/x86/kernel/cpu/rdrand.c | 22
-rw-r--r--  arch/x86/kernel/cpu/resctrl/rdtgroup.c | 4
-rw-r--r--  arch/x86/kernel/cpu/tsx.c | 140
-rw-r--r--  arch/x86/kernel/crash.c | 128
-rw-r--r--  arch/x86/kernel/doublefault.c | 5
-rw-r--r--  arch/x86/kernel/e820.c | 23
-rw-r--r--  arch/x86/kernel/early-quirks.c | 2
-rw-r--r--  arch/x86/kernel/fpu/xstate.c | 22
-rw-r--r--  arch/x86/kernel/ftrace.c | 14
-rw-r--r--  arch/x86/kernel/ftrace_32.S | 23
-rw-r--r--  arch/x86/kernel/ftrace_64.S | 89
-rw-r--r--  arch/x86/kernel/head_32.S | 72
-rw-r--r--  arch/x86/kernel/head_64.S | 113
-rw-r--r--  arch/x86/kernel/ioport.c | 209
-rw-r--r--  arch/x86/kernel/irqflags.S | 8
-rw-r--r--  arch/x86/kernel/jailhouse.c | 136
-rw-r--r--  arch/x86/kernel/jump_label.c | 9
-rw-r--r--  arch/x86/kernel/kdebugfs.c | 21
-rw-r--r--  arch/x86/kernel/kprobes/core.c | 4
-rw-r--r--  arch/x86/kernel/kprobes/opt.c | 11
-rw-r--r--  arch/x86/kernel/ksysfs.c | 31
-rw-r--r--  arch/x86/kernel/kvm.c | 1
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 47
-rw-r--r--  arch/x86/kernel/paravirt.c | 2
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c | 1586
-rw-r--r--  arch/x86/kernel/pci-dma.c | 6
-rw-r--r--  arch/x86/kernel/process.c | 205
-rw-r--r--  arch/x86/kernel/process_32.c | 77
-rw-r--r--  arch/x86/kernel/process_64.c | 86
-rw-r--r--  arch/x86/kernel/ptrace.c | 12
-rw-r--r--  arch/x86/kernel/relocate_kernel_32.S | 13
-rw-r--r--  arch/x86/kernel/relocate_kernel_64.S | 13
-rw-r--r--  arch/x86/kernel/setup.c | 42
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 4
-rw-r--r--  arch/x86/kernel/tboot.c | 15
-rw-r--r--  arch/x86/kernel/tce_64.c | 177
-rw-r--r--  arch/x86/kernel/traps.c | 5
-rw-r--r--  arch/x86/kernel/tsc_sync.c | 8
-rw-r--r--  arch/x86/kernel/umip.c | 18
-rw-r--r--  arch/x86/kernel/uprobes.c | 2
-rw-r--r--  arch/x86/kernel/verify_cpu.S | 4
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 16
-rw-r--r--  arch/x86/kernel/x86_init.c | 24
65 files changed, 2019 insertions, 2773 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 3578ad248bc9..32acb970f416 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -134,7 +134,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o
-obj-$(CONFIG_X86_INTEL_UMIP) += umip.o
+obj-$(CONFIG_X86_UMIP) += umip.o
obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
@@ -146,7 +146,6 @@ ifeq ($(CONFIG_X86_64),y)
obj-$(CONFIG_AUDIT) += audit_64.o
obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o
- obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
obj-$(CONFIG_MMCONF_FAM10H) += mmconf-fam10h_64.o
obj-y += vsmp_64.o
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index e95e95960156..daf88f8143c5 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -9,8 +9,7 @@
.code32
ALIGN
-ENTRY(wakeup_pmode_return)
-wakeup_pmode_return:
+SYM_CODE_START(wakeup_pmode_return)
movw $__KERNEL_DS, %ax
movw %ax, %ss
movw %ax, %fs
@@ -39,6 +38,7 @@ wakeup_pmode_return:
# jump to place where we left off
movl saved_eip, %eax
jmp *%eax
+SYM_CODE_END(wakeup_pmode_return)
bogus_magic:
jmp bogus_magic
@@ -72,7 +72,7 @@ restore_registers:
popfl
ret
-ENTRY(do_suspend_lowlevel)
+SYM_CODE_START(do_suspend_lowlevel)
call save_processor_state
call save_registers
pushl $3
@@ -87,10 +87,11 @@ ret_point:
call restore_registers
call restore_processor_state
ret
+SYM_CODE_END(do_suspend_lowlevel)
.data
ALIGN
-ENTRY(saved_magic) .long 0
+SYM_DATA(saved_magic, .long 0)
saved_eip: .long 0
# saved registers
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 7f9ade13bbcf..c8daa92f38dc 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -14,7 +14,7 @@
/*
* Hooray, we are in Long 64-bit mode (but still running in low memory)
*/
-ENTRY(wakeup_long64)
+SYM_FUNC_START(wakeup_long64)
movq saved_magic, %rax
movq $0x123456789abcdef0, %rdx
cmpq %rdx, %rax
@@ -40,9 +40,9 @@ ENTRY(wakeup_long64)
movq saved_rip, %rax
jmp *%rax
-ENDPROC(wakeup_long64)
+SYM_FUNC_END(wakeup_long64)
-ENTRY(do_suspend_lowlevel)
+SYM_FUNC_START(do_suspend_lowlevel)
FRAME_BEGIN
subq $8, %rsp
xorl %eax, %eax
@@ -125,7 +125,7 @@ ENTRY(do_suspend_lowlevel)
addq $8, %rsp
FRAME_END
jmp restore_processor_state
-ENDPROC(do_suspend_lowlevel)
+SYM_FUNC_END(do_suspend_lowlevel)
.data
saved_rbp: .quad 0
@@ -136,4 +136,4 @@ saved_rbx: .quad 0
saved_rip: .quad 0
saved_rsp: .quad 0
-ENTRY(saved_magic) .quad 0
+SYM_DATA(saved_magic, .quad 0)
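Both wakeup files above are part of the tree-wide move away from the old ENTRY()/ENDPROC() assembler annotations. As a rough sketch of the new macros' semantics (simplified from include/linux/linkage.h; the exact expansions differ in detail):

	/*
	 * SYM_FUNC_START(name)    -- a C-callable function: .globl name,
	 *                            alignment padding, then the name: label.
	 * SYM_FUNC_END(name)      -- closes it with .type name, @function and
	 *                            .size name, .-name, so objtool and
	 *                            debuggers see a proper function.
	 * SYM_CODE_START(name)    -- code with a non-standard calling
	 *                            convention (entry stubs, trampolines):
	 *                            same label setup, but the symbol is not
	 *                            typed as @function.
	 * SYM_CODE_END(name)      -- closes a SYM_CODE_START block.
	 * SYM_DATA(name, .long 0) -- replaces the "ENTRY(name) .long 0"
	 *                            idiom for data, giving the symbol an
	 *                            @object type and a correct size.
	 */

Hence wakeup_pmode_return and the 32-bit do_suspend_lowlevel, which do not behave as ordinary C functions, become SYM_CODE_*() pairs, while wakeup_long64 and the 64-bit do_suspend_lowlevel become SYM_FUNC_*() pairs.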
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 9d3a971ea364..9ec463fe96f2 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -956,16 +956,15 @@ NOKPROBE_SYMBOL(patch_cmp);
int poke_int3_handler(struct pt_regs *regs)
{
struct text_poke_loc *tp;
- unsigned char int3 = 0xcc;
void *ip;
/*
* Having observed our INT3 instruction, we now must observe
* bp_patching.nr_entries.
*
- * nr_entries != 0 INT3
- * WMB RMB
- * write INT3 if (nr_entries)
+ * nr_entries != 0 INT3
+ * WMB RMB
+ * write INT3 if (nr_entries)
*
* Idem for other elements in bp_patching.
*/
@@ -978,9 +977,9 @@ int poke_int3_handler(struct pt_regs *regs)
return 0;
/*
- * Discount the sizeof(int3). See text_poke_bp_batch().
+ * Discount the INT3. See text_poke_bp_batch().
*/
- ip = (void *) regs->ip - sizeof(int3);
+ ip = (void *) regs->ip - INT3_INSN_SIZE;
/*
* Skip the binary search if there is a single member in the vector.
@@ -997,8 +996,28 @@ int poke_int3_handler(struct pt_regs *regs)
return 0;
}
- /* set up the specified breakpoint detour */
- regs->ip = (unsigned long) tp->detour;
+ ip += tp->len;
+
+ switch (tp->opcode) {
+ case INT3_INSN_OPCODE:
+ /*
+ * Someone poked an explicit INT3, they'll want to handle it,
+ * do not consume.
+ */
+ return 0;
+
+ case CALL_INSN_OPCODE:
+ int3_emulate_call(regs, (long)ip + tp->rel32);
+ break;
+
+ case JMP32_INSN_OPCODE:
+ case JMP8_INSN_OPCODE:
+ int3_emulate_jmp(regs, (long)ip + tp->rel32);
+ break;
+
+ default:
+ BUG();
+ }
return 1;
}
@@ -1014,7 +1033,7 @@ NOKPROBE_SYMBOL(poke_int3_handler);
* synchronization using int3 breakpoint.
*
* The way it is done:
- * - For each entry in the vector:
+ * - For each entry in the vector:
* - add a int3 trap to the address that will be patched
* - sync cores
* - For each entry in the vector:
@@ -1027,9 +1046,9 @@ NOKPROBE_SYMBOL(poke_int3_handler);
*/
void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
{
- int patched_all_but_first = 0;
- unsigned char int3 = 0xcc;
+ unsigned char int3 = INT3_INSN_OPCODE;
unsigned int i;
+ int do_sync;
lockdep_assert_held(&text_mutex);
@@ -1053,16 +1072,16 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
/*
* Second step: update all but the first byte of the patched range.
*/
- for (i = 0; i < nr_entries; i++) {
+ for (do_sync = 0, i = 0; i < nr_entries; i++) {
if (tp[i].len - sizeof(int3) > 0) {
text_poke((char *)tp[i].addr + sizeof(int3),
- (const char *)tp[i].opcode + sizeof(int3),
+ (const char *)tp[i].text + sizeof(int3),
tp[i].len - sizeof(int3));
- patched_all_but_first++;
+ do_sync++;
}
}
- if (patched_all_but_first) {
+ if (do_sync) {
/*
* According to Intel, this core syncing is very likely
* not necessary and we'd be safe even without it. But
@@ -1075,10 +1094,17 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
* Third step: replace the first byte (int3) by the first byte of
* replacing opcode.
*/
- for (i = 0; i < nr_entries; i++)
- text_poke(tp[i].addr, tp[i].opcode, sizeof(int3));
+ for (do_sync = 0, i = 0; i < nr_entries; i++) {
+ if (tp[i].text[0] == INT3_INSN_OPCODE)
+ continue;
+
+ text_poke(tp[i].addr, tp[i].text, sizeof(int3));
+ do_sync++;
+ }
+
+ if (do_sync)
+ on_each_cpu(do_sync_core, NULL, 1);
- on_each_cpu(do_sync_core, NULL, 1);
/*
* sync_core() implies an smp_mb() and orders this store against
* the writing of the new instruction.
@@ -1087,6 +1113,60 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
bp_patching.nr_entries = 0;
}
+void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
+ const void *opcode, size_t len, const void *emulate)
+{
+ struct insn insn;
+
+ if (!opcode)
+ opcode = (void *)tp->text;
+ else
+ memcpy((void *)tp->text, opcode, len);
+
+ if (!emulate)
+ emulate = opcode;
+
+ kernel_insn_init(&insn, emulate, MAX_INSN_SIZE);
+ insn_get_length(&insn);
+
+ BUG_ON(!insn_complete(&insn));
+ BUG_ON(len != insn.length);
+
+ tp->addr = addr;
+ tp->len = len;
+ tp->opcode = insn.opcode.bytes[0];
+
+ switch (tp->opcode) {
+ case INT3_INSN_OPCODE:
+ break;
+
+ case CALL_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ case JMP8_INSN_OPCODE:
+ tp->rel32 = insn.immediate.value;
+ break;
+
+ default: /* assume NOP */
+ switch (len) {
+ case 2: /* NOP2 -- emulate as JMP8+0 */
+ BUG_ON(memcmp(emulate, ideal_nops[len], len));
+ tp->opcode = JMP8_INSN_OPCODE;
+ tp->rel32 = 0;
+ break;
+
+ case 5: /* NOP5 -- emulate as JMP32+0 */
+ BUG_ON(memcmp(emulate, ideal_nops[NOP_ATOMIC5], len));
+ tp->opcode = JMP32_INSN_OPCODE;
+ tp->rel32 = 0;
+ break;
+
+ default: /* unknown instruction */
+ BUG();
+ }
+ break;
+ }
+}
+
/**
* text_poke_bp() -- update instructions on live kernel on SMP
* @addr: address to patch
@@ -1098,20 +1178,10 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
* dynamically allocated memory. This function should be used when it is
* not possible to allocate memory.
*/
-void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
+void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
{
- struct text_poke_loc tp = {
- .detour = handler,
- .addr = addr,
- .len = len,
- };
-
- if (len > POKE_MAX_OPCODE_SIZE) {
- WARN_ONCE(1, "len is larger than %d\n", POKE_MAX_OPCODE_SIZE);
- return;
- }
-
- memcpy((void *)tp.opcode, opcode, len);
+ struct text_poke_loc tp;
+ text_poke_loc_init(&tp, addr, opcode, len, emulate);
text_poke_bp_batch(&tp, 1);
}
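The central change to alternative.c above: poke_int3_handler() no longer redirects execution to a pre-built detour (the removed tp->detour) but emulates the patched instruction itself, from the opcode byte and relative displacement that text_poke_loc_init() parsed out with the in-kernel instruction decoder. The int3_emulate_*() helpers it calls come from arch/x86/include/asm/text-patching.h; a sketch close to the real definitions:

	static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip)
	{
		regs->ip = ip;			/* resume at the branch target */
	}

	static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val)
	{
		regs->sp -= sizeof(unsigned long);
		*(unsigned long *)regs->sp = val;	/* the CALL's pushed word */
	}

	static inline void int3_emulate_call(struct pt_regs *regs, unsigned long func)
	{
		/* return address = address of the patched CALL plus its length */
		int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE);
		int3_emulate_jmp(regs, func);
	}

Note how the handler first advances ip to the end of the patched instruction (ip += tp->len), so that ip + tp->rel32 is exactly the absolute target encoded by the relative displacement.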
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index a6ac3712db8b..4bbccb9d16dc 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -510,10 +510,9 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
iommu_size -= round_up(a, PMD_PAGE_SIZE) - a;
if (iommu_size < 64*1024*1024) {
- pr_warning(
- "PCI-DMA: Warning: Small IOMMU %luMB."
+ pr_warn("PCI-DMA: Warning: Small IOMMU %luMB."
" Consider increasing the AGP aperture in BIOS\n",
- iommu_size >> 20);
+ iommu_size >> 20);
}
return iommu_size;
@@ -665,8 +664,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info)
nommu:
/* Should not happen anymore */
- pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n"
- "falling back to iommu=soft.\n");
+ pr_warn("PCI-DMA: More than 4GB of RAM and no IOMMU - falling back to iommu=soft.\n");
return -1;
}
@@ -733,8 +731,8 @@ int __init gart_iommu_init(void)
!gart_iommu_aperture ||
(no_agp && init_amd_gatt(&info) < 0)) {
if (max_pfn > MAX_DMA32_PFN) {
- pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
- pr_warning("falling back to iommu=soft.\n");
+ pr_warn("More than 4GB of memory but GART IOMMU not available.\n");
+ pr_warn("falling back to iommu=soft.\n");
}
return 0;
}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 2b0faf86da1b..28446fa6bf18 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -780,8 +780,8 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
res = (((u64)deltapm) * mult) >> 22;
do_div(res, 1000000);
- pr_warning("APIC calibration not consistent "
- "with PM-Timer: %ldms instead of 100ms\n",(long)res);
+ pr_warn("APIC calibration not consistent "
+ "with PM-Timer: %ldms instead of 100ms\n", (long)res);
/* Correct the lapic counter value */
res = (((u64)(*delta)) * pm_100ms);
@@ -977,7 +977,7 @@ static int __init calibrate_APIC_clock(void)
*/
if (lapic_timer_period < (1000000 / HZ)) {
local_irq_enable();
- pr_warning("APIC frequency too slow, disabling apic timer\n");
+ pr_warn("APIC frequency too slow, disabling apic timer\n");
return -1;
}
@@ -1021,7 +1021,7 @@ static int __init calibrate_APIC_clock(void)
local_irq_enable();
if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
- pr_warning("APIC timer disabled due to verification failure\n");
+ pr_warn("APIC timer disabled due to verification failure\n");
return -1;
}
@@ -1095,8 +1095,8 @@ static void local_apic_timer_interrupt(void)
* spurious.
*/
if (!evt->event_handler) {
- pr_warning("Spurious LAPIC timer interrupt on cpu %d\n",
- smp_processor_id());
+ pr_warn("Spurious LAPIC timer interrupt on cpu %d\n",
+ smp_processor_id());
/* Switch it off */
lapic_timer_shutdown(evt);
return;
@@ -1811,11 +1811,11 @@ static int __init setup_nox2apic(char *str)
int apicid = native_apic_msr_read(APIC_ID);
if (apicid >= 255) {
- pr_warning("Apicid: %08x, cannot enforce nox2apic\n",
- apicid);
+ pr_warn("Apicid: %08x, cannot enforce nox2apic\n",
+ apicid);
return 0;
}
- pr_warning("x2apic already enabled.\n");
+ pr_warn("x2apic already enabled.\n");
__x2apic_disable();
}
setup_clear_cpu_cap(X86_FEATURE_X2APIC);
@@ -1983,7 +1983,7 @@ static int __init apic_verify(void)
*/
features = cpuid_edx(1);
if (!(features & (1 << X86_FEATURE_APIC))) {
- pr_warning("Could not enable APIC!\n");
+ pr_warn("Could not enable APIC!\n");
return -1;
}
set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
@@ -2337,7 +2337,7 @@ static int cpuid_to_apicid[] = {
#ifdef CONFIG_SMP
/**
* apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread
- * @id: APIC ID to check
+ * @apicid: APIC ID to check
*/
bool apic_id_is_primary_thread(unsigned int apicid)
{
@@ -2410,9 +2410,8 @@ int generic_processor_info(int apicid, int version)
disabled_cpu_apicid == apicid) {
int thiscpu = num_processors + disabled_cpus;
- pr_warning("APIC: Disabling requested cpu."
- " Processor %d/0x%x ignored.\n",
- thiscpu, apicid);
+ pr_warn("APIC: Disabling requested cpu."
+ " Processor %d/0x%x ignored.\n", thiscpu, apicid);
disabled_cpus++;
return -ENODEV;
@@ -2426,8 +2425,7 @@ int generic_processor_info(int apicid, int version)
apicid != boot_cpu_physical_apicid) {
int thiscpu = max + disabled_cpus - 1;
- pr_warning(
- "APIC: NR_CPUS/possible_cpus limit of %i almost"
+ pr_warn("APIC: NR_CPUS/possible_cpus limit of %i almost"
" reached. Keeping one slot for boot cpu."
" Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
@@ -2438,9 +2436,8 @@ int generic_processor_info(int apicid, int version)
if (num_processors >= nr_cpu_ids) {
int thiscpu = max + disabled_cpus;
- pr_warning("APIC: NR_CPUS/possible_cpus limit of %i "
- "reached. Processor %d/0x%x ignored.\n",
- max, thiscpu, apicid);
+ pr_warn("APIC: NR_CPUS/possible_cpus limit of %i reached. "
+ "Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
disabled_cpus++;
return -EINVAL;
@@ -2470,13 +2467,13 @@ int generic_processor_info(int apicid, int version)
* Validate version
*/
if (version == 0x0) {
- pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n",
- cpu, apicid);
+ pr_warn("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n",
+ cpu, apicid);
version = 0x10;
}
if (version != boot_cpu_apic_version) {
- pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n",
+ pr_warn("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n",
boot_cpu_apic_version, cpu, version);
}
@@ -2845,7 +2842,7 @@ static int __init apic_set_verbosity(char *arg)
apic_verbosity = APIC_VERBOSE;
#ifdef CONFIG_X86_64
else {
- pr_warning("APIC Verbosity level %s not recognised"
+ pr_warn("APIC Verbosity level %s not recognised"
" use apic=verbose or apic=debug\n", arg);
return -EINVAL;
}
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index d6af97fd170a..913c88617848 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1725,19 +1725,20 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data)
return false;
}
-static inline bool ioapic_irqd_mask(struct irq_data *data)
+static inline bool ioapic_prepare_move(struct irq_data *data)
{
- /* If we are moving the irq we need to mask it */
+ /* If we are moving the IRQ we need to mask it */
if (unlikely(irqd_is_setaffinity_pending(data))) {
- mask_ioapic_irq(data);
+ if (!irqd_irq_masked(data))
+ mask_ioapic_irq(data);
return true;
}
return false;
}
-static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
+static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
{
- if (unlikely(masked)) {
+ if (unlikely(moveit)) {
/* Only migrate the irq if the ack has been received.
*
* On rare occasions the broadcast level triggered ack gets
@@ -1766,15 +1767,17 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
*/
if (!io_apic_level_ack_pending(data->chip_data))
irq_move_masked_irq(data);
- unmask_ioapic_irq(data);
+ /* If the IRQ is masked in the core, leave it: */
+ if (!irqd_irq_masked(data))
+ unmask_ioapic_irq(data);
}
}
#else
-static inline bool ioapic_irqd_mask(struct irq_data *data)
+static inline bool ioapic_prepare_move(struct irq_data *data)
{
return false;
}
-static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
+static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
{
}
#endif
@@ -1783,11 +1786,11 @@ static void ioapic_ack_level(struct irq_data *irq_data)
{
struct irq_cfg *cfg = irqd_cfg(irq_data);
unsigned long v;
- bool masked;
+ bool moveit;
int i;
irq_complete_move(cfg);
- masked = ioapic_irqd_mask(irq_data);
+ moveit = ioapic_prepare_move(irq_data);
/*
* It appears there is an erratum which affects at least version 0x11
@@ -1842,7 +1845,7 @@ static void ioapic_ack_level(struct irq_data *irq_data)
eoi_ioapic_pin(cfg->vector, irq_data->chip_data);
}
- ioapic_irqd_unmask(irq_data, masked);
+ ioapic_finish_move(irq_data, moveit);
}
static void ioapic_ir_ack_level(struct irq_data *irq_data)
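The rename from ioapic_irqd_mask()/ioapic_irqd_unmask() to ioapic_prepare_move()/ioapic_finish_move() above is not cosmetic: the boolean now means "an affinity move is pending", not "we masked the line". A sketch of the hazard the two new irqd_irq_masked() checks close (illustrative pseudo-C, not kernel code):

	/*
	 * With the IRQ already masked by the core (e.g. for a oneshot
	 * threaded handler) and an affinity change pending:
	 *
	 *	masked = ioapic_irqd_mask(data);	// masks again, returns true
	 *	... ack and migrate the vector ...
	 *	ioapic_irqd_unmask(data, masked);	// unconditional unmask
	 *
	 * The old code thus unmasked a line the core still considered
	 * masked. The new helpers check irqd_irq_masked() on both sides,
	 * leaving a core-held mask untouched across the migration.
	 */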
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index e6230af19864..d5b51a740524 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -14,6 +14,8 @@
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/pci.h>
+#include <linux/acpi.h>
+#include <linux/efi.h>
#include <asm/e820/api.h>
#include <asm/uv/uv_mmrs.h>
@@ -25,12 +27,17 @@
static DEFINE_PER_CPU(int, x2apic_extra_bits);
static enum uv_system_type uv_system_type;
-static bool uv_hubless_system;
+static int uv_hubbed_system;
+static int uv_hubless_system;
static u64 gru_start_paddr, gru_end_paddr;
static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr;
static u64 gru_dist_lmask, gru_dist_umask;
static union uvh_apicid uvh_apicid;
+/* Unpack OEM/TABLE IDs into NUL-terminated strings */
+static u8 oem_id[ACPI_OEM_ID_SIZE + 1];
+static u8 oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1];
+
/* Information derived from CPUID: */
static struct {
unsigned int apicid_shift;
@@ -248,17 +255,35 @@ static void __init uv_set_apicid_hibit(void)
}
}
-static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static void __init uv_stringify(int len, char *to, char *from)
+{
+ /* Relies on 'to' being zero-filled so the result will be NUL terminated */
+ strncpy(to, from, len-1);
+}
+
+static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id)
{
int pnodeid;
int uv_apic;
+ uv_stringify(sizeof(oem_id), oem_id, _oem_id);
+ uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id);
+
if (strncmp(oem_id, "SGI", 3) != 0) {
- if (strncmp(oem_id, "NSGI", 4) == 0) {
- uv_hubless_system = true;
- pr_info("UV: OEM IDs %s/%s, HUBLESS\n",
- oem_id, oem_table_id);
- }
+ if (strncmp(oem_id, "NSGI", 4) != 0)
+ return 0;
+
+ /* UV4 Hubless, CH, (0x11:UV4+Any) */
+ if (strncmp(oem_id, "NSGI4", 5) == 0)
+ uv_hubless_system = 0x11;
+
+ /* UV3 Hubless, UV300/MC990X w/o hub (0x9:UV3+Any) */
+ else
+ uv_hubless_system = 0x9;
+
+ pr_info("UV: OEM IDs %s/%s, HUBLESS(0x%x)\n",
+ oem_id, oem_table_id, uv_hubless_system);
+
return 0;
}
@@ -286,6 +311,24 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
if (uv_hub_info->hub_revision == 0)
goto badbios;
+ switch (uv_hub_info->hub_revision) {
+ case UV4_HUB_REVISION_BASE:
+ uv_hubbed_system = 0x11;
+ break;
+
+ case UV3_HUB_REVISION_BASE:
+ uv_hubbed_system = 0x9;
+ break;
+
+ case UV2_HUB_REVISION_BASE:
+ uv_hubbed_system = 0x5;
+ break;
+
+ case UV1_HUB_REVISION_BASE:
+ uv_hubbed_system = 0x3;
+ break;
+ }
+
pnodeid = early_get_pnodeid();
early_get_apic_socketid_shift();
@@ -336,9 +379,15 @@ int is_uv_system(void)
}
EXPORT_SYMBOL_GPL(is_uv_system);
-int is_uv_hubless(void)
+int is_uv_hubbed(int uvtype)
+{
+ return (uv_hubbed_system & uvtype);
+}
+EXPORT_SYMBOL_GPL(is_uv_hubbed);
+
+int is_uv_hubless(int uvtype)
{
- return uv_hubless_system;
+ return (uv_hubless_system & uvtype);
}
EXPORT_SYMBOL_GPL(is_uv_hubless);
@@ -1255,7 +1304,8 @@ static int __init decode_uv_systab(void)
struct uv_systab *st;
int i;
- if (uv_hub_info->hub_revision < UV4_HUB_REVISION_BASE)
+ /* If system is uv3 or lower, there is no extended UVsystab */
+ if (is_uv_hubbed(0xfffffe) < uv(4) && is_uv_hubless(0xfffffe) < uv(4))
return 0; /* No extended UVsystab required */
st = uv_systab;
@@ -1434,6 +1484,103 @@ static void __init build_socket_tables(void)
}
}
+/* Check which reboot to use */
+static void check_efi_reboot(void)
+{
+ /* If EFI reboot not available, use ACPI reboot */
+ if (!efi_enabled(EFI_BOOT))
+ reboot_type = BOOT_ACPI;
+}
+
+/* Setup user proc fs files */
+static int proc_hubbed_show(struct seq_file *file, void *data)
+{
+ seq_printf(file, "0x%x\n", uv_hubbed_system);
+ return 0;
+}
+
+static int proc_hubless_show(struct seq_file *file, void *data)
+{
+ seq_printf(file, "0x%x\n", uv_hubless_system);
+ return 0;
+}
+
+static int proc_oemid_show(struct seq_file *file, void *data)
+{
+ seq_printf(file, "%s/%s\n", oem_id, oem_table_id);
+ return 0;
+}
+
+static int proc_hubbed_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, proc_hubbed_show, (void *)NULL);
+}
+
+static int proc_hubless_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, proc_hubless_show, (void *)NULL);
+}
+
+static int proc_oemid_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, proc_oemid_show, (void *)NULL);
+}
+
+/* (struct is "non-const" as open function is set at runtime) */
+static struct file_operations proc_version_fops = {
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static const struct file_operations proc_oemid_fops = {
+ .open = proc_oemid_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static __init void uv_setup_proc_files(int hubless)
+{
+ struct proc_dir_entry *pde;
+ char *name = hubless ? "hubless" : "hubbed";
+
+ pde = proc_mkdir(UV_PROC_NODE, NULL);
+ proc_create("oemid", 0, pde, &proc_oemid_fops);
+ proc_create(name, 0, pde, &proc_version_fops);
+ if (hubless)
+ proc_version_fops.open = proc_hubless_open;
+ else
+ proc_version_fops.open = proc_hubbed_open;
+}
+
+/* Initialize UV hubless systems */
+static __init int uv_system_init_hubless(void)
+{
+ int rc;
+
+ /* Setup PCH NMI handler */
+ uv_nmi_setup_hubless();
+
+ /* Init kernel/BIOS interface */
+ rc = uv_bios_init();
+ if (rc < 0)
+ return rc;
+
+ /* Process UVsystab */
+ rc = decode_uv_systab();
+ if (rc < 0)
+ return rc;
+
+ /* Create user access node */
+ if (rc >= 0)
+ uv_setup_proc_files(1);
+
+ check_efi_reboot();
+
+ return rc;
+}
+
static void __init uv_system_init_hub(void)
{
struct uv_hub_info_s hub_info = {0};
@@ -1559,32 +1706,27 @@ static void __init uv_system_init_hub(void)
uv_nmi_setup();
uv_cpu_init();
uv_scir_register_cpu_notifier();
- proc_mkdir("sgi_uv", NULL);
+ uv_setup_proc_files(0);
/* Register Legacy VGA I/O redirection handler: */
pci_register_set_vga_state(uv_set_vga_state);
- /*
- * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as
- * EFI is not enabled in the kdump kernel:
- */
- if (is_kdump_kernel())
- reboot_type = BOOT_ACPI;
+ check_efi_reboot();
}
/*
- * There is a small amount of UV specific code needed to initialize a
- * UV system that does not have a "UV HUB" (referred to as "hubless").
+ * There is a different code path needed to initialize a UV system that does
+ * not have a "UV HUB" (referred to as "hubless").
*/
void __init uv_system_init(void)
{
- if (likely(!is_uv_system() && !is_uv_hubless()))
+ if (likely(!is_uv_system() && !is_uv_hubless(1)))
return;
if (is_uv_system())
uv_system_init_hub();
else
- uv_nmi_setup_hubless();
+ uv_system_init_hubless();
}
apic_driver(apic_x2apic_uv_x);
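Net effect of the UV rework: hubless platforms gain a real init path (uv_system_init_hubless() instead of the bare uv_nmi_setup_hubless() call), and both flavors now publish a small procfs interface under /proc/sgi_uv (UV_PROC_NODE, as the replaced proc_mkdir() call indicates). Illustrative reads on a hubbed UV4 machine, with the hex value taken from the hub_revision switch above and purely hypothetical OEM strings:

	$ cat /proc/sgi_uv/hubbed
	0x11
	$ cat /proc/sgi_uv/oemid
	SGI4/UVX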
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index d7a1e5a9331c..890f60083eca 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -30,7 +30,7 @@ obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
ifdef CONFIG_CPU_SUP_INTEL
-obj-y += intel.o intel_pconfig.o
+obj-y += intel.o intel_pconfig.o tsx.o
obj-$(CONFIG_PM) += intel_epb.o
endif
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 91c2561b905f..8bf64899f56a 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -39,6 +39,8 @@ static void __init spectre_v2_select_mitigation(void);
static void __init ssb_select_mitigation(void);
static void __init l1tf_select_mitigation(void);
static void __init mds_select_mitigation(void);
+static void __init mds_print_mitigation(void);
+static void __init taa_select_mitigation(void);
/* The base value of the SPEC_CTRL MSR that always has to be preserved. */
u64 x86_spec_ctrl_base;
@@ -105,6 +107,13 @@ void __init check_bugs(void)
ssb_select_mitigation();
l1tf_select_mitigation();
mds_select_mitigation();
+ taa_select_mitigation();
+
+ /*
+ * As the MDS and TAA mitigations are inter-related, defer printing
+ * the MDS mitigation until after the TAA mitigation selection is done.
+ */
+ mds_print_mitigation();
arch_smt_update();
@@ -243,6 +252,12 @@ static void __init mds_select_mitigation(void)
(mds_nosmt || cpu_mitigations_auto_nosmt()))
cpu_smt_disable(false);
}
+}
+
+static void __init mds_print_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off())
+ return;
pr_info("%s\n", mds_strings[mds_mitigation]);
}
@@ -269,6 +284,113 @@ static int __init mds_cmdline(char *str)
early_param("mds", mds_cmdline);
#undef pr_fmt
+#define pr_fmt(fmt) "TAA: " fmt
+
+/* Default mitigation for TAA-affected CPUs */
+static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW;
+static bool taa_nosmt __ro_after_init;
+
+static const char * const taa_strings[] = {
+ [TAA_MITIGATION_OFF] = "Vulnerable",
+ [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode",
+ [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers",
+ [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled",
+};
+
+static void __init taa_select_mitigation(void)
+{
+ u64 ia32_cap;
+
+ if (!boot_cpu_has_bug(X86_BUG_TAA)) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ return;
+ }
+
+ /* TSX previously disabled by tsx=off */
+ if (!boot_cpu_has(X86_FEATURE_RTM)) {
+ taa_mitigation = TAA_MITIGATION_TSX_DISABLED;
+ goto out;
+ }
+
+ if (cpu_mitigations_off()) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ return;
+ }
+
+ /*
+ * TAA mitigation via VERW is turned off if both
+ * tsx_async_abort=off and mds=off are specified.
+ */
+ if (taa_mitigation == TAA_MITIGATION_OFF &&
+ mds_mitigation == MDS_MITIGATION_OFF)
+ goto out;
+
+ if (boot_cpu_has(X86_FEATURE_MD_CLEAR))
+ taa_mitigation = TAA_MITIGATION_VERW;
+ else
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+
+ /*
+ * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1.
+ * A microcode update fixes this behavior to clear CPU buffers. It also
+ * adds support for MSR_IA32_TSX_CTRL which is enumerated by the
+ * ARCH_CAP_TSX_CTRL_MSR bit.
+ *
+ * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode
+ * update is required.
+ */
+ ia32_cap = x86_read_arch_cap_msr();
+ if ( (ia32_cap & ARCH_CAP_MDS_NO) &&
+ !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR))
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+
+ /*
+ * TSX is enabled, select alternate mitigation for TAA which is
+ * the same as MDS. Enable MDS static branch to clear CPU buffers.
+ *
+ * For guests that can't determine whether the correct microcode is
+ * present on host, enable the mitigation for UCODE_NEEDED as well.
+ */
+ static_branch_enable(&mds_user_clear);
+
+ if (taa_nosmt || cpu_mitigations_auto_nosmt())
+ cpu_smt_disable(false);
+
+ /*
+ * Update MDS mitigation, if necessary, as the mds_user_clear is
+ * now enabled for TAA mitigation.
+ */
+ if (mds_mitigation == MDS_MITIGATION_OFF &&
+ boot_cpu_has_bug(X86_BUG_MDS)) {
+ mds_mitigation = MDS_MITIGATION_FULL;
+ mds_select_mitigation();
+ }
+out:
+ pr_info("%s\n", taa_strings[taa_mitigation]);
+}
+
+static int __init tsx_async_abort_parse_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_TAA))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off")) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ } else if (!strcmp(str, "full")) {
+ taa_mitigation = TAA_MITIGATION_VERW;
+ } else if (!strcmp(str, "full,nosmt")) {
+ taa_mitigation = TAA_MITIGATION_VERW;
+ taa_nosmt = true;
+ }
+
+ return 0;
+}
+early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
+
+#undef pr_fmt
#define pr_fmt(fmt) "Spectre V1 : " fmt
enum spectre_v1_mitigation {
@@ -786,13 +908,10 @@ static void update_mds_branch_idle(void)
}
#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
+#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n"
void cpu_bugs_smt_update(void)
{
- /* Enhanced IBRS implies STIBP. No update required. */
- if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
- return;
-
mutex_lock(&spec_ctrl_mutex);
switch (spectre_v2_user) {
@@ -819,6 +938,17 @@ void cpu_bugs_smt_update(void)
break;
}
+ switch (taa_mitigation) {
+ case TAA_MITIGATION_VERW:
+ case TAA_MITIGATION_UCODE_NEEDED:
+ if (sched_smt_active())
+ pr_warn_once(TAA_MSG_SMT);
+ break;
+ case TAA_MITIGATION_TSX_DISABLED:
+ case TAA_MITIGATION_OFF:
+ break;
+ }
+
mutex_unlock(&spec_ctrl_mutex);
}
@@ -1149,6 +1279,9 @@ void x86_spec_ctrl_setup_ap(void)
x86_amd_ssb_disable();
}
+bool itlb_multihit_kvm_mitigation;
+EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
+
#undef pr_fmt
#define pr_fmt(fmt) "L1TF: " fmt
@@ -1304,11 +1437,24 @@ static ssize_t l1tf_show_state(char *buf)
l1tf_vmx_states[l1tf_vmx_mitigation],
sched_smt_active() ? "vulnerable" : "disabled");
}
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+ if (itlb_multihit_kvm_mitigation)
+ return sprintf(buf, "KVM: Mitigation: Split huge pages\n");
+ else
+ return sprintf(buf, "KVM: Vulnerable\n");
+}
#else
static ssize_t l1tf_show_state(char *buf)
{
return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
}
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+ return sprintf(buf, "Processor vulnerable\n");
+}
#endif
static ssize_t mds_show_state(char *buf)
@@ -1328,6 +1474,21 @@ static ssize_t mds_show_state(char *buf)
sched_smt_active() ? "vulnerable" : "disabled");
}
+static ssize_t tsx_async_abort_show_state(char *buf)
+{
+ if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) ||
+ (taa_mitigation == TAA_MITIGATION_OFF))
+ return sprintf(buf, "%s\n", taa_strings[taa_mitigation]);
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ return sprintf(buf, "%s; SMT Host state unknown\n",
+ taa_strings[taa_mitigation]);
+ }
+
+ return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
static char *stibp_state(void)
{
if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
@@ -1398,6 +1559,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_MDS:
return mds_show_state(buf);
+ case X86_BUG_TAA:
+ return tsx_async_abort_show_state(buf);
+
+ case X86_BUG_ITLB_MULTIHIT:
+ return itlb_multihit_show_state(buf);
+
default:
break;
}
@@ -1434,4 +1601,14 @@ ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *bu
{
return cpu_show_common(dev, attr, buf, X86_BUG_MDS);
}
+
+ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_TAA);
+}
+
+ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT);
+}
#endif
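Taken together, the bugs.c hunks wire two new files into /sys/devices/system/cpu/vulnerabilities/ (tsx_async_abort and itlb_multihit) and add a tsx_async_abort= boot parameter accepting off, full, or full,nosmt. Given the format strings above, an affected bare-metal host running the VERW mitigation with SMT enabled would report along these lines (illustrative output):

	$ cat /sys/devices/system/cpu/vulnerabilities/tsx_async_abort
	Mitigation: Clear CPU buffers; SMT vulnerable
	$ cat /sys/devices/system/cpu/vulnerabilities/itlb_multihit
	KVM: Mitigation: Split huge pages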
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 9ae7d1bcd4f4..baa2fed8deb6 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -53,10 +53,7 @@
#include <asm/microcode_intel.h>
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
-
-#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/uv/uv.h>
-#endif
#include "cpu.h"
@@ -565,8 +562,9 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
return NULL; /* Not found */
}
-__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
-__u32 cpu_caps_set[NCAPINTS + NBUGINTS];
+/* Aligned to unsigned long to avoid split lock in atomic bitmap ops */
+__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
+__u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
void load_percpu_segment(int cpu)
{
@@ -1016,13 +1014,14 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
#endif
}
-#define NO_SPECULATION BIT(0)
-#define NO_MELTDOWN BIT(1)
-#define NO_SSB BIT(2)
-#define NO_L1TF BIT(3)
-#define NO_MDS BIT(4)
-#define MSBDS_ONLY BIT(5)
-#define NO_SWAPGS BIT(6)
+#define NO_SPECULATION BIT(0)
+#define NO_MELTDOWN BIT(1)
+#define NO_SSB BIT(2)
+#define NO_L1TF BIT(3)
+#define NO_MDS BIT(4)
+#define MSBDS_ONLY BIT(5)
+#define NO_SWAPGS BIT(6)
+#define NO_ITLB_MULTIHIT BIT(7)
#define VULNWL(_vendor, _family, _model, _whitelist) \
{ X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist }
@@ -1043,27 +1042,27 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION),
/* Intel Family 6 */
- VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION),
- VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION),
- VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION),
- VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION),
- VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION),
-
- VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
- VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
- VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
- VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
- VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
- VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
+ VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+
+ VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
VULNWL_INTEL(CORE_YONAH, NO_SSB),
- VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
- VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS),
+ VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS),
- VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS),
- VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS),
+ VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
/*
* Technically, swapgs isn't serializing on AMD (despite it previously
@@ -1073,15 +1072,17 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
* good enough for our purposes.
*/
+ VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT),
+
/* AMD Family 0xf - 0x12 */
- VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
- VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
- VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
- VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
+ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
/* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
- VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS),
- VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS),
+ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
{}
};
@@ -1092,19 +1093,30 @@ static bool __init cpu_matches(unsigned long which)
return m && !!(m->driver_data & which);
}
-static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
+u64 x86_read_arch_cap_msr(void)
{
u64 ia32_cap = 0;
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
+
+ return ia32_cap;
+}
+
+static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
+{
+ u64 ia32_cap = x86_read_arch_cap_msr();
+
+ /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */
+ if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO))
+ setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT);
+
if (cpu_matches(NO_SPECULATION))
return;
setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
- if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
-
if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) &&
!cpu_has(c, X86_FEATURE_AMD_SSB_NO))
setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
@@ -1121,6 +1133,21 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
if (!cpu_matches(NO_SWAPGS))
setup_force_cpu_bug(X86_BUG_SWAPGS);
+ /*
+ * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when:
+ * - TSX is supported or
+ * - TSX_CTRL is present
+ *
+ * TSX_CTRL check is needed for cases when TSX could be disabled before
+ * the kernel boot e.g. kexec.
+ * TSX_CTRL check alone is not sufficient for cases when the microcode
+ * update is not present, or when running as a guest that doesn't get TSX_CTRL.
+ */
+ if (!(ia32_cap & ARCH_CAP_TAA_NO) &&
+ (cpu_has(c, X86_FEATURE_RTM) ||
+ (ia32_cap & ARCH_CAP_TSX_CTRL_MSR)))
+ setup_force_cpu_bug(X86_BUG_TAA);
+
if (cpu_matches(NO_MELTDOWN))
return;
@@ -1554,6 +1581,8 @@ void __init identify_boot_cpu(void)
#endif
cpu_detect_tlb(&boot_cpu_data);
setup_cr_pinning();
+
+ tsx_init();
}
void identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -1749,7 +1778,7 @@ static void wait_for_master_cpu(int cpu)
}
#ifdef CONFIG_X86_64
-static void setup_getcpu(int cpu)
+static inline void setup_getcpu(int cpu)
{
unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
struct desc_struct d = { };
@@ -1769,7 +1798,59 @@ static void setup_getcpu(int cpu)
write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S);
}
+
+static inline void ucode_cpu_init(int cpu)
+{
+ if (cpu)
+ load_ucode_ap();
+}
+
+static inline void tss_setup_ist(struct tss_struct *tss)
+{
+ /* Set up the per-CPU TSS IST stacks */
+ tss->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF);
+ tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
+ tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
+ tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
+}
+
+static inline void gdt_setup_doublefault_tss(int cpu) { }
+
+#else /* CONFIG_X86_64 */
+
+static inline void setup_getcpu(int cpu) { }
+
+static inline void ucode_cpu_init(int cpu)
+{
+ show_ucode_info_early();
+}
+
+static inline void tss_setup_ist(struct tss_struct *tss) { }
+
+static inline void gdt_setup_doublefault_tss(int cpu)
+{
+#ifdef CONFIG_DOUBLEFAULT
+ /* Set up the doublefault TSS pointer in the GDT */
+ __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
+#endif
+}
+#endif /* !CONFIG_X86_64 */
+
+static inline void tss_setup_io_bitmap(struct tss_struct *tss)
+{
+ tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID;
+
+#ifdef CONFIG_X86_IOPL_IOPERM
+ tss->io_bitmap.prev_max = 0;
+ tss->io_bitmap.prev_sequence = 0;
+ memset(tss->io_bitmap.bitmap, 0xff, sizeof(tss->io_bitmap.bitmap));
+ /*
+ * Invalidate the extra array entry past the end of the all
+ * permission bitmap as required by the hardware.
+ */
+ tss->io_bitmap.mapall[IO_BITMAP_LONGS] = ~0UL;
+#endif
+}
/*
* cpu_init() initializes state that is per-CPU. Some data is already
@@ -1777,21 +1858,15 @@ static void setup_getcpu(int cpu)
* and IDT. We reload them nevertheless, this function acts as a
* 'CPU state barrier', nothing should get across.
*/
-#ifdef CONFIG_X86_64
-
void cpu_init(void)
{
+ struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+ struct task_struct *cur = current;
int cpu = raw_smp_processor_id();
- struct task_struct *me;
- struct tss_struct *t;
- int i;
wait_for_master_cpu(cpu);
- if (cpu)
- load_ucode_ap();
-
- t = &per_cpu(cpu_tss_rw, cpu);
+ ucode_cpu_init(cpu);
#ifdef CONFIG_NUMA
if (this_cpu_read(numa_node) == 0 &&
@@ -1800,63 +1875,47 @@ void cpu_init(void)
#endif
setup_getcpu(cpu);
- me = current;
-
pr_debug("Initializing CPU#%d\n", cpu);
- cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ if (IS_ENABLED(CONFIG_X86_64) || cpu_feature_enabled(X86_FEATURE_VME) ||
+ boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE))
+ cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
/*
* Initialize the per-CPU GDT with the boot GDT,
* and set up the GDT descriptor:
*/
-
switch_to_new_gdt(cpu);
- loadsegment(fs, 0);
-
load_current_idt();
- memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
- syscall_init();
-
- wrmsrl(MSR_FS_BASE, 0);
- wrmsrl(MSR_KERNEL_GS_BASE, 0);
- barrier();
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ loadsegment(fs, 0);
+ memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
+ syscall_init();
- x86_configure_nx();
- x2apic_setup();
+ wrmsrl(MSR_FS_BASE, 0);
+ wrmsrl(MSR_KERNEL_GS_BASE, 0);
+ barrier();
- /*
- * set up and load the per-CPU TSS
- */
- if (!t->x86_tss.ist[0]) {
- t->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF);
- t->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
- t->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
- t->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
+ x2apic_setup();
}
- t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
-
- /*
- * <= is required because the CPU will access up to
- * 8 bits beyond the end of the IO permission bitmap.
- */
- for (i = 0; i <= IO_BITMAP_LONGS; i++)
- t->io_bitmap[i] = ~0UL;
-
mmgrab(&init_mm);
- me->active_mm = &init_mm;
- BUG_ON(me->mm);
+ cur->active_mm = &init_mm;
+ BUG_ON(cur->mm);
initialize_tlbstate_and_flush();
- enter_lazy_tlb(&init_mm, me);
+ enter_lazy_tlb(&init_mm, cur);
- /*
- * Initialize the TSS. sp0 points to the entry trampoline stack
- * regardless of what task is running.
- */
+ /* Initialize the TSS. */
+ tss_setup_ist(tss);
+ tss_setup_io_bitmap(tss);
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
+
load_TR_desc();
+ /*
+ * sp0 points to the entry trampoline stack regardless of what task
+ * is running.
+ */
load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
load_mm_ldt(&init_mm);
@@ -1864,6 +1923,8 @@ void cpu_init(void)
clear_all_debug_regs();
dbg_restore_debug_regs();
+ gdt_setup_doublefault_tss(cpu);
+
fpu__init_cpu();
if (is_uv_system())
@@ -1872,63 +1933,6 @@ void cpu_init(void)
load_fixmap_gdt(cpu);
}
-#else
-
-void cpu_init(void)
-{
- int cpu = smp_processor_id();
- struct task_struct *curr = current;
- struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);
-
- wait_for_master_cpu(cpu);
-
- show_ucode_info_early();
-
- pr_info("Initializing CPU#%d\n", cpu);
-
- if (cpu_feature_enabled(X86_FEATURE_VME) ||
- boot_cpu_has(X86_FEATURE_TSC) ||
- boot_cpu_has(X86_FEATURE_DE))
- cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
-
- load_current_idt();
- switch_to_new_gdt(cpu);
-
- /*
- * Set up and load the per-CPU TSS and LDT
- */
- mmgrab(&init_mm);
- curr->active_mm = &init_mm;
- BUG_ON(curr->mm);
- initialize_tlbstate_and_flush();
- enter_lazy_tlb(&init_mm, curr);
-
- /*
- * Initialize the TSS. sp0 points to the entry trampoline stack
- * regardless of what task is running.
- */
- set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
- load_TR_desc();
- load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
-
- load_mm_ldt(&init_mm);
-
- t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
-
-#ifdef CONFIG_DOUBLEFAULT
- /* Set up doublefault TSS pointer in the GDT */
- __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
-#endif
-
- clear_all_debug_regs();
- dbg_restore_debug_regs();
-
- fpu__init_cpu();
-
- load_fixmap_gdt(cpu);
-}
-#endif
-
/*
* The microcode loader calls this upon late microcode load to recheck features,
* only when microcode has been updated. Caller holds microcode_mutex and CPU
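A word on the __aligned(sizeof(unsigned long)) added to cpu_caps_cleared[] and cpu_caps_set[] near the top of the common.c diff: these __u32 arrays are handed to the atomic bitops as unsigned long pointers, roughly (sketch of the existing caller in cpufeature.h):

	set_bit(feature, (unsigned long *)cpu_caps_set);

On 64-bit, a LOCK-prefixed bit operation targets an 8-byte word; if the array is only 4-byte aligned, that word can straddle a cache line and become a split-lock access, which the explicit alignment rules out.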
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index c0e2407abdd6..38ab6e115eac 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -44,6 +44,22 @@ struct _tlb_table {
extern const struct cpu_dev *const __x86_cpu_dev_start[],
*const __x86_cpu_dev_end[];
+#ifdef CONFIG_CPU_SUP_INTEL
+enum tsx_ctrl_states {
+ TSX_CTRL_ENABLE,
+ TSX_CTRL_DISABLE,
+ TSX_CTRL_NOT_SUPPORTED,
+};
+
+extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state;
+
+extern void __init tsx_init(void);
+extern void tsx_enable(void);
+extern void tsx_disable(void);
+#else
+static inline void tsx_init(void) { }
+#endif /* CONFIG_CPU_SUP_INTEL */
+
extern void get_cpu_cap(struct cpuinfo_x86 *c);
extern void get_cpu_address_sizes(struct cpuinfo_x86 *c);
extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
@@ -62,4 +78,6 @@ unsigned int aperfmperf_get_khz(int cpu);
extern void x86_spec_ctrl_setup_ap(void);
+extern u64 x86_read_arch_cap_msr(void);
+
#endif /* ARCH_X86_CPU_H */
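tsx.c itself (140 new lines per the diffstat) is not quoted on this page, but the interface declared above is small: tsx_init() picks tsx_ctrl_state from the new tsx= boot parameter and CPU capabilities, and init_intel() in the next hunk applies it on each CPU. A minimal sketch of what tsx_disable() does, assuming the MSR and bit names from msr-index.h (MSR_IA32_TSX_CTRL, TSX_CTRL_RTM_DISABLE, TSX_CTRL_CPUID_CLEAR):

	void tsx_disable(void)
	{
		u64 tsx;

		rdmsrl(MSR_IA32_TSX_CTRL, tsx);

		/* Force all RTM transactions to abort immediately */
		tsx |= TSX_CTRL_RTM_DISABLE;

		/* Hide RTM/HLE from CPUID so applications don't use TSX */
		tsx |= TSX_CTRL_CPUID_CLEAR;

		wrmsrl(MSR_IA32_TSX_CTRL, tsx);
	}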
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index c2fdc00df163..4a900804a023 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -762,6 +762,11 @@ static void init_intel(struct cpuinfo_x86 *c)
detect_tme(c);
init_intel_misc_features(c);
+
+ if (tsx_ctrl_state == TSX_CTRL_ENABLE)
+ tsx_enable();
+ if (tsx_ctrl_state == TSX_CTRL_DISABLE)
+ tsx_disable();
}
#ifdef CONFIG_X86_32
@@ -814,7 +819,7 @@ static const struct _tlb_table intel_tlb_table[] = {
{ 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" },
{ 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" },
{ 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" },
- { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" },
+ { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" },
{ 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
{ 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
{ 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
@@ -842,7 +847,7 @@ static const struct _tlb_table intel_tlb_table[] = {
{ 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
{ 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
{ 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" },
- { 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" },
+ { 0xc2, TLB_DATA_2M_4M, 16, " TLB_DATA 2 MByte/4MByte pages, 4-way associative" },
{ 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
{ 0x00, 0, 0 }
};
@@ -854,8 +859,8 @@ static void intel_tlb_lookup(const unsigned char desc)
return;
/* look up this descriptor in the table */
- for (k = 0; intel_tlb_table[k].descriptor != desc && \
- intel_tlb_table[k].descriptor != 0; k++)
+ for (k = 0; intel_tlb_table[k].descriptor != desc &&
+ intel_tlb_table[k].descriptor != 0; k++)
;
if (intel_tlb_table[k].tlb_type == 0)
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 6ea7fdc82f3c..5167bd2bb6b1 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -583,7 +583,7 @@ bool amd_filter_mce(struct mce *m)
* - Prevent possible spurious interrupts from the IF bank on Family 0x17
* Models 0x10-0x2F due to Erratum #1114.
*/
-void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
+static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
{
int i, num_msrs;
u64 hwcr;
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 743370ee4983..5f42f25bac8f 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -488,8 +488,9 @@ int mce_usable_address(struct mce *m)
if (!(m->status & MCI_STATUS_ADDRV))
return 0;
- /* Checks after this one are Intel-specific: */
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ /* Checks after this one are Intel/Zhaoxin-specific: */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
return 1;
if (!(m->status & MCI_STATUS_MISCV))
@@ -507,10 +508,13 @@ EXPORT_SYMBOL_GPL(mce_usable_address);
bool mce_is_memory_error(struct mce *m)
{
- if (m->cpuvendor == X86_VENDOR_AMD ||
- m->cpuvendor == X86_VENDOR_HYGON) {
+ switch (m->cpuvendor) {
+ case X86_VENDOR_AMD:
+ case X86_VENDOR_HYGON:
return amd_mce_is_memory_error(m);
- } else if (m->cpuvendor == X86_VENDOR_INTEL) {
+
+ case X86_VENDOR_INTEL:
+ case X86_VENDOR_ZHAOXIN:
/*
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
*
@@ -527,9 +531,10 @@ bool mce_is_memory_error(struct mce *m)
return (m->status & 0xef80) == BIT(7) ||
(m->status & 0xef00) == BIT(8) ||
(m->status & 0xeffc) == 0xc;
- }
- return false;
+ default:
+ return false;
+ }
}
EXPORT_SYMBOL_GPL(mce_is_memory_error);
@@ -1127,6 +1132,12 @@ static bool __mc_check_crashing_cpu(int cpu)
u64 mcgstatus;
mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
+ if (mcgstatus & MCG_STATUS_LMCES)
+ return false;
+ }
+
if (mcgstatus & MCG_STATUS_RIPV) {
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
return true;
@@ -1277,9 +1288,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)
/*
* Check if this MCE is signaled to only this logical processor,
- * on Intel only.
+ * on Intel, Zhaoxin only.
*/
- if (m.cpuvendor == X86_VENDOR_INTEL)
+ if (m.cpuvendor == X86_VENDOR_INTEL ||
+ m.cpuvendor == X86_VENDOR_ZHAOXIN)
lmce = m.mcgstatus & MCG_STATUS_LMCES;
/*
@@ -1697,6 +1709,18 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (c->x86 == 6 && c->x86_model == 45)
quirk_no_way_out = quirk_sandybridge_ifu;
}
+
+ if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
+ /*
+ * All newer Zhaoxin CPUs support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
+ if (cfg->monarch_timeout < 0)
+ cfg->monarch_timeout = USEC_PER_SEC;
+ }
+ }
+
if (cfg->monarch_timeout < 0)
cfg->monarch_timeout = 0;
if (cfg->bootlog != 0)
@@ -1760,6 +1784,35 @@ static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
}
}
+static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+
+ /*
+ * These CPUs have MCA bank 8 which reports only one error type called
+ * SVAD (System View Address Decoder). The reporting of that error is
+ * controlled by IA32_MC8.CTL.0.
+ *
+ * If enabled, prefetching on these CPUs will cause SVAD MCE when
+ * virtual machines start and result in a system panic. Always disable
+ * bank 8 SVAD error by default.
+ */
+ if ((c->x86 == 7 && c->x86_model == 0x1b) ||
+ (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
+ if (this_cpu_read(mce_num_banks) > 8)
+ mce_banks[8].ctl = 0;
+ }
+
+ intel_init_cmci();
+ intel_init_lmce();
+ mce_adjust_timer = cmci_intel_adjust_timer;
+}
+
+static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
+{
+ intel_clear_lmce();
+}
+
static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
switch (c->x86_vendor) {
@@ -1781,6 +1834,10 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
mce_centaur_feature_init(c);
break;
+ case X86_VENDOR_ZHAOXIN:
+ mce_zhaoxin_feature_init(c);
+ break;
+
default:
break;
}
@@ -1792,6 +1849,11 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
case X86_VENDOR_INTEL:
mce_intel_feature_clear(c);
break;
+
+ case X86_VENDOR_ZHAOXIN:
+ mce_zhaoxin_feature_clear(c);
+ break;
+
default:
break;
}
@@ -2014,15 +2076,16 @@ static void mce_disable_error_reporting(void)
static void vendor_disable_error_reporting(void)
{
/*
- * Don't clear on Intel or AMD or Hygon CPUs. Some of these MSRs
- * are socket-wide.
- * Disabling them for just a single offlined CPU is bad, since it will
- * inhibit reporting for all shared resources on the socket like the
- * last level cache (LLC), the integrated memory controller (iMC), etc.
+ * Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these
+ * MSRs are socket-wide. Disabling them for just a single offlined CPU
+ * is bad, since it will inhibit reporting for all shared resources on
+ * the socket like the last level cache (LLC), the integrated memory
+ * controller (iMC), etc.
*/
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
- boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN)
return;
mce_disable_error_reporting();
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index 88cd9598fa57..e270d0770134 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -85,8 +85,10 @@ static int cmci_supported(int *banks)
* initialization is vendor keyed and this
* makes sure none of the backdoors are entered otherwise.
*/
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
return 0;
+
if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6)
return 0;
rdmsrl(MSR_IA32_MCG_CAP, cap);
@@ -423,7 +425,7 @@ void cmci_disable_bank(int bank)
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
-static void intel_init_cmci(void)
+void intel_init_cmci(void)
{
int banks;
@@ -442,7 +444,7 @@ static void intel_init_cmci(void)
cmci_recheck();
}
-static void intel_init_lmce(void)
+void intel_init_lmce(void)
{
u64 val;
@@ -455,7 +457,7 @@ static void intel_init_lmce(void)
wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
}
-static void intel_clear_lmce(void)
+void intel_clear_lmce(void)
{
u64 val;
@@ -482,6 +484,7 @@ static void intel_ppin_init(struct cpuinfo_x86 *c)
case INTEL_FAM6_BROADWELL_D:
case INTEL_FAM6_BROADWELL_X:
case INTEL_FAM6_SKYLAKE_X:
+ case INTEL_FAM6_ICELAKE_X:
case INTEL_FAM6_XEON_PHI_KNL:
case INTEL_FAM6_XEON_PHI_KNM:
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 43031db429d2..842b273bce31 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -45,11 +45,17 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval);
bool mce_intel_cmci_poll(void);
void mce_intel_hcpu_update(unsigned long cpu);
void cmci_disable_bank(int bank);
+void intel_init_cmci(void);
+void intel_init_lmce(void);
+void intel_clear_lmce(void);
#else
# define cmci_intel_adjust_timer mce_adjust_timer_default
static inline bool mce_intel_cmci_poll(void) { return false; }
static inline void mce_intel_hcpu_update(unsigned long cpu) { }
static inline void cmci_disable_bank(int bank) { }
+static inline void intel_init_cmci(void) { }
+static inline void intel_init_lmce(void) { }
+static inline void intel_clear_lmce(void) { }
#endif
void mce_timer_kick(unsigned long interval);
diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c
index 6e2becf547c5..d01e0da0163a 100644
--- a/arch/x86/kernel/cpu/mce/therm_throt.c
+++ b/arch/x86/kernel/cpu/mce/therm_throt.c
@@ -40,15 +40,58 @@
#define THERMAL_THROTTLING_EVENT 0
#define POWER_LIMIT_EVENT 1
-/*
- * Current thermal event state:
+/**
+ * struct _thermal_state - Represents the current thermal event state
+ * @next_check:			Stores the next timestamp when it is allowed
+ *				to log the next warning message.
+ * @last_interrupt_time: Stores the timestamp for the last threshold
+ * high event.
+ * @therm_work: Delayed workqueue structure
+ * @count: Stores the current running count for thermal
+ * or power threshold interrupts.
+ * @last_count: Stores the previous running count for thermal
+ * or power threshold interrupts.
+ * @max_time_ms:		The maximum amount of time the CPU was in
+ *				the throttled state for a single thermal
+ *				threshold high-to-low transition.
+ * @total_time_ms:		The cumulative time during which the CPU was
+ *				in the throttled state.
+ * @rate_control_active: Set when a throttling message is logged.
+ *				This is used for rate control.
+ * @new_event: Stores the last high/low status of the
+ * THERM_STATUS_PROCHOT or
+ * THERM_STATUS_POWER_LIMIT.
+ * @level: Stores whether this _thermal_state instance is
+ * for a CORE level or for PACKAGE level.
+ * @sample_index: Index for storing the next sample in the buffer
+ * temp_samples[].
+ * @sample_count: Total number of samples collected in the buffer
+ * temp_samples[].
+ * @average: The last moving average of temperature samples
+ * @baseline_temp: Temperature at which thermal threshold high
+ * interrupt was generated.
+ * @temp_samples: Storage for temperature samples to calculate
+ * moving average.
+ *
+ * This structure is used to represent data related to thermal state for a CPU.
+ * There is separate storage for the core and the package level of each CPU.
*/
struct _thermal_state {
- bool new_event;
- int event;
u64 next_check;
+ u64 last_interrupt_time;
+ struct delayed_work therm_work;
unsigned long count;
unsigned long last_count;
+ unsigned long max_time_ms;
+ unsigned long total_time_ms;
+ bool rate_control_active;
+ bool new_event;
+ u8 level;
+ u8 sample_index;
+ u8 sample_count;
+ u8 average;
+ u8 baseline_temp;
+ u8 temp_samples[3];
};
struct thermal_state {
@@ -121,8 +164,22 @@ define_therm_throt_device_one_ro(package_throttle_count);
define_therm_throt_device_show_func(package_power_limit, count);
define_therm_throt_device_one_ro(package_power_limit_count);
+define_therm_throt_device_show_func(core_throttle, max_time_ms);
+define_therm_throt_device_one_ro(core_throttle_max_time_ms);
+
+define_therm_throt_device_show_func(package_throttle, max_time_ms);
+define_therm_throt_device_one_ro(package_throttle_max_time_ms);
+
+define_therm_throt_device_show_func(core_throttle, total_time_ms);
+define_therm_throt_device_one_ro(core_throttle_total_time_ms);
+
+define_therm_throt_device_show_func(package_throttle, total_time_ms);
+define_therm_throt_device_one_ro(package_throttle_total_time_ms);
+
static struct attribute *thermal_throttle_attrs[] = {
&dev_attr_core_throttle_count.attr,
+ &dev_attr_core_throttle_max_time_ms.attr,
+ &dev_attr_core_throttle_total_time_ms.attr,
NULL
};
@@ -135,6 +192,105 @@ static const struct attribute_group thermal_attr_group = {
#define CORE_LEVEL 0
#define PACKAGE_LEVEL 1
+#define THERM_THROT_POLL_INTERVAL HZ
+#define THERM_STATUS_PROCHOT_LOG BIT(1)
+
+static void clear_therm_status_log(int level)
+{
+ int msr;
+ u64 msr_val;
+
+ if (level == CORE_LEVEL)
+ msr = MSR_IA32_THERM_STATUS;
+ else
+ msr = MSR_IA32_PACKAGE_THERM_STATUS;
+
+ rdmsrl(msr, msr_val);
+ wrmsrl(msr, msr_val & ~THERM_STATUS_PROCHOT_LOG);
+}
+
+static void get_therm_status(int level, bool *proc_hot, u8 *temp)
+{
+ int msr;
+ u64 msr_val;
+
+ if (level == CORE_LEVEL)
+ msr = MSR_IA32_THERM_STATUS;
+ else
+ msr = MSR_IA32_PACKAGE_THERM_STATUS;
+
+ rdmsrl(msr, msr_val);
+ if (msr_val & THERM_STATUS_PROCHOT_LOG)
+ *proc_hot = true;
+ else
+ *proc_hot = false;
+
+ *temp = (msr_val >> 16) & 0x7F;
+}
+
+static void throttle_active_work(struct work_struct *work)
+{
+ struct _thermal_state *state = container_of(to_delayed_work(work),
+ struct _thermal_state, therm_work);
+ unsigned int i, avg, this_cpu = smp_processor_id();
+ u64 now = get_jiffies_64();
+ bool hot;
+ u8 temp;
+
+ get_therm_status(state->level, &hot, &temp);
+	/* The temperature readout is an offset from the max, so lower means hotter */
+ if (!hot && temp > state->baseline_temp) {
+ if (state->rate_control_active)
+ pr_info("CPU%d: %s temperature/speed normal (total events = %lu)\n",
+ this_cpu,
+ state->level == CORE_LEVEL ? "Core" : "Package",
+ state->count);
+
+ state->rate_control_active = false;
+ return;
+ }
+
+ if (time_before64(now, state->next_check) &&
+ state->rate_control_active)
+ goto re_arm;
+
+ state->next_check = now + CHECK_INTERVAL;
+
+ if (state->count != state->last_count) {
+ /* There was one new thermal interrupt */
+ state->last_count = state->count;
+ state->average = 0;
+ state->sample_count = 0;
+ state->sample_index = 0;
+ }
+
+ state->temp_samples[state->sample_index] = temp;
+ state->sample_count++;
+ state->sample_index = (state->sample_index + 1) % ARRAY_SIZE(state->temp_samples);
+ if (state->sample_count < ARRAY_SIZE(state->temp_samples))
+ goto re_arm;
+
+ avg = 0;
+ for (i = 0; i < ARRAY_SIZE(state->temp_samples); ++i)
+ avg += state->temp_samples[i];
+
+ avg /= ARRAY_SIZE(state->temp_samples);
+
+ if (state->average > avg) {
+ pr_warn("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n",
+ this_cpu,
+ state->level == CORE_LEVEL ? "Core" : "Package",
+ state->count);
+ state->rate_control_active = true;
+ }
+
+ state->average = avg;
+
+re_arm:
+ clear_therm_status_log(state->level);
+ schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL);
+}
+
/***
* therm_throt_process - Process thermal throttling event from interrupt
* @curr: Whether the condition is current or not (boolean), since the
@@ -178,27 +334,33 @@ static void therm_throt_process(bool new_event, int event, int level)
if (new_event)
state->count++;
- if (time_before64(now, state->next_check) &&
- state->count != state->last_count)
+ if (event != THERMAL_THROTTLING_EVENT)
return;
- state->next_check = now + CHECK_INTERVAL;
- state->last_count = state->count;
+ if (new_event && !state->last_interrupt_time) {
+ bool hot;
+ u8 temp;
+
+ get_therm_status(state->level, &hot, &temp);
+ /*
+		 * Ignore a short temperature spike as the system is not close
+		 * to PROCHOT. A 10C offset is large enough to ignore (the
+		 * readout is already an offset below the high threshold
+		 * temperature).
+ */
+ if (temp > 10)
+ return;
- /* if we just entered the thermal event */
- if (new_event) {
- if (event == THERMAL_THROTTLING_EVENT)
- pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
- this_cpu,
- level == CORE_LEVEL ? "Core" : "Package",
- state->count);
- return;
- }
- if (old_event) {
- if (event == THERMAL_THROTTLING_EVENT)
- pr_info("CPU%d: %s temperature/speed normal\n", this_cpu,
- level == CORE_LEVEL ? "Core" : "Package");
- return;
+ state->baseline_temp = temp;
+ state->last_interrupt_time = now;
+ schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL);
+ } else if (old_event && state->last_interrupt_time) {
+ unsigned long throttle_time;
+
+ throttle_time = jiffies_delta_to_msecs(now - state->last_interrupt_time);
+ if (throttle_time > state->max_time_ms)
+ state->max_time_ms = throttle_time;
+ state->total_time_ms += throttle_time;
+ state->last_interrupt_time = 0;
}
}
@@ -244,20 +406,47 @@ static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu)
if (err)
return err;
- if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
+ if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) {
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_core_power_limit_count.attr,
thermal_attr_group.name);
+ if (err)
+ goto del_group;
+ }
+
if (cpu_has(c, X86_FEATURE_PTS)) {
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_package_throttle_count.attr,
thermal_attr_group.name);
- if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
+ if (err)
+ goto del_group;
+
+ err = sysfs_add_file_to_group(&dev->kobj,
+ &dev_attr_package_throttle_max_time_ms.attr,
+ thermal_attr_group.name);
+ if (err)
+ goto del_group;
+
+ err = sysfs_add_file_to_group(&dev->kobj,
+ &dev_attr_package_throttle_total_time_ms.attr,
+ thermal_attr_group.name);
+ if (err)
+ goto del_group;
+
+ if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) {
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_package_power_limit_count.attr,
thermal_attr_group.name);
+ if (err)
+ goto del_group;
+ }
}
+ return 0;
+
+del_group:
+ sysfs_remove_group(&dev->kobj, &thermal_attr_group);
+
return err;
}
@@ -269,15 +458,29 @@ static void thermal_throttle_remove_dev(struct device *dev)
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int thermal_throttle_online(unsigned int cpu)
{
+ struct thermal_state *state = &per_cpu(thermal_state, cpu);
struct device *dev = get_cpu_device(cpu);
+ state->package_throttle.level = PACKAGE_LEVEL;
+ state->core_throttle.level = CORE_LEVEL;
+
+ INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work);
+ INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work);
+
return thermal_throttle_add_dev(dev, cpu);
}
static int thermal_throttle_offline(unsigned int cpu)
{
+ struct thermal_state *state = &per_cpu(thermal_state, cpu);
struct device *dev = get_cpu_device(cpu);
+ cancel_delayed_work(&state->package_throttle.therm_work);
+ cancel_delayed_work(&state->core_throttle.therm_work);
+
+ state->package_throttle.rate_control_active = false;
+ state->core_throttle.rate_control_active = false;
+
thermal_throttle_remove_dev(dev);
return 0;
}
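
As a quick cross-check of the new throttle-time accounting, the counters land
in the existing thermal_throttle sysfs group. A minimal userspace sketch, not
part of this patch, assuming the usual
/sys/devices/system/cpu/cpuN/thermal_throttle/ paths (the group name comes
from thermal_attr_group above; adjust if your kernel differs):

#include <stdio.h>

int main(void)
{
	static const char *attrs[] = {
		"core_throttle_count",
		"core_throttle_max_time_ms",
		"core_throttle_total_time_ms",
	};
	char path[128];
	unsigned long val;

	for (unsigned int i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++) {
		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/cpu0/thermal_throttle/%s",
			 attrs[i]);
		FILE *f = fopen(path, "r");

		/* The attribute may be absent on older kernels or other CPUs */
		if (!f)
			continue;
		if (fscanf(f, "%lu", &val) == 1)
			printf("%s = %lu\n", attrs[i], val);
		fclose(f);
	}
	return 0;
}
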
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index a0e52bd00ecc..3f6b137ef4e6 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -567,7 +567,7 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
void reload_ucode_amd(void)
{
struct microcode_amd *mc;
- u32 rev, dummy;
+ u32 rev, dummy __always_unused;
mc = (struct microcode_amd *)amd_ucode_patch;
@@ -673,7 +673,7 @@ static enum ucode_state apply_microcode_amd(int cpu)
struct ucode_cpu_info *uci;
struct ucode_patch *p;
enum ucode_state ret;
- u32 rev, dummy;
+ u32 rev, dummy __always_unused;
BUG_ON(raw_smp_processor_id() != cpu);
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index cb0fdcaf1415..7019d4b2df0c 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -63,11 +63,6 @@ LIST_HEAD(microcode_cache);
*/
static DEFINE_MUTEX(microcode_mutex);
-/*
- * Serialize late loading so that CPUs get updated one-by-one.
- */
-static DEFINE_RAW_SPINLOCK(update_lock);
-
struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
struct cpu_info_ctx {
@@ -566,11 +561,18 @@ static int __reload_late(void *info)
if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC))
return -1;
- raw_spin_lock(&update_lock);
- apply_microcode_local(&err);
- raw_spin_unlock(&update_lock);
+ /*
+ * On an SMT system, it suffices to load the microcode on one sibling of
+ * the core because the microcode engine is shared between the threads.
+ * Synchronization still needs to take place so that no concurrent
+ * loading attempts happen on multiple threads of an SMT core. See
+ * below.
+ */
+ if (cpumask_first(topology_sibling_cpumask(cpu)) == cpu)
+ apply_microcode_local(&err);
+ else
+ goto wait_for_siblings;
- /* siblings return UCODE_OK because their engine got updated already */
if (err > UCODE_NFOUND) {
pr_warn("Error reloading microcode on CPU %d\n", cpu);
ret = -1;
@@ -578,14 +580,18 @@ static int __reload_late(void *info)
ret = 1;
}
+wait_for_siblings:
+ if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC))
+ panic("Timeout during microcode update!\n");
+
/*
- * Increase the wait timeout to a safe value here since we're
- * serializing the microcode update and that could take a while on a
- * large number of CPUs. And that is fine as the *actual* timeout will
- * be determined by the last CPU finished updating and thus cut short.
+	 * At least one thread has completed the update on each core.
+	 * For the others, simply call the update to make sure the
+	 * per-CPU cpuinfo is updated with the right microcode
+	 * revision.
*/
- if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC * num_online_cpus()))
- panic("Timeout during microcode update!\n");
+ if (cpumask_first(topology_sibling_cpumask(cpu)) != cpu)
+ apply_microcode_local(&err);
return ret;
}
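
The hunk above keys everything off cpumask_first(topology_sibling_cpumask(cpu))
== cpu: exactly one thread per core performs the write, and the siblings only
refresh their cached revision afterwards. A toy userspace model of that
election, assuming 2-way SMT with adjacent sibling numbering (an illustration
only, not how the kernel computes sibling masks):

#include <stdio.h>

/* Hypothetical sibling rule: assume CPUs N and N^1 share a core. */
static int first_sibling(int cpu)
{
	return cpu & ~1;
}

int main(void)
{
	for (int cpu = 0; cpu < 8; cpu++) {
		if (first_sibling(cpu) == cpu)
			printf("cpu%d: applies the microcode update\n", cpu);
		else
			printf("cpu%d: waits, then refreshes its cpuinfo\n", cpu);
	}
	return 0;
}
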
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index ce799cfe9434..6a99535d7f37 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -791,6 +791,7 @@ static enum ucode_state apply_microcode_intel(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ bool bsp = c->cpu_index == boot_cpu_data.cpu_index;
struct microcode_intel *mc;
enum ucode_state ret;
static int prev_rev;
@@ -836,7 +837,7 @@ static enum ucode_state apply_microcode_intel(int cpu)
return UCODE_ERROR;
}
- if (rev != prev_rev) {
+ if (bsp && rev != prev_rev) {
pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n",
rev,
mc->hdr.date & 0xffff,
@@ -852,7 +853,7 @@ out:
c->microcode = rev;
/* Update boot_cpu_data's revision too, if we're on the BSP: */
- if (c->cpu_index == boot_cpu_data.cpu_index)
+ if (bsp)
boot_cpu_data.microcode = rev;
return ret;
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index c656d92cd708..caa032ce3fe3 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -290,7 +290,12 @@ static void __init ms_hyperv_init_platform(void)
machine_ops.shutdown = hv_machine_shutdown;
machine_ops.crash_shutdown = hv_machine_crash_shutdown;
#endif
- mark_tsc_unstable("running on Hyper-V");
+ if (ms_hyperv.features & HV_X64_ACCESS_TSC_INVARIANT) {
+ wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1);
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+ } else {
+ mark_tsc_unstable("running on Hyper-V");
+ }
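
HV_X64_ACCESS_TSC_INVARIANT is a Hyper-V-specific enlightenment, but whether
the guest ends up with an invariant TSC can be cross-checked from userspace
via the generic CPUID leaf 0x80000007 (EDX bit 8, Invariant TSC). A small
sketch using the GCC/clang cpuid.h helper, for illustration only:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx))
		return 1;	/* extended leaf not available */

	/* EDX bit 8 is the architectural Invariant TSC flag */
	printf("Invariant TSC: %s\n", (edx & (1u << 8)) ? "yes" : "no");
	return 0;
}
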
/*
* Generation 2 instances don't support reading the NMI status from
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index 5c900f9527ff..c4be62058dd9 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -29,7 +29,8 @@ __setup("nordrand", x86_rdrand_setup);
#ifdef CONFIG_ARCH_RANDOM
void x86_init_rdrand(struct cpuinfo_x86 *c)
{
- unsigned long tmp;
+ unsigned int changed = 0;
+ unsigned long tmp, prev;
int i;
if (!cpu_has(c, X86_FEATURE_RDRAND))
@@ -42,5 +43,24 @@ void x86_init_rdrand(struct cpuinfo_x86 *c)
return;
}
}
+
+ /*
+ * Stupid sanity-check whether RDRAND does *actually* generate
+	 * Stupid sanity check of whether RDRAND does *actually* generate
+	 * at least some random-looking data.
+ prev = tmp;
+ for (i = 0; i < SANITY_CHECK_LOOPS; i++) {
+ if (rdrand_long(&tmp)) {
+ if (prev != tmp)
+ changed++;
+
+ prev = tmp;
+ }
+ }
+
+ if (WARN_ON_ONCE(!changed))
+ pr_emerg(
+"RDRAND gives funky smelling output, might consider not using it by booting with \"nordrand\"");
+
}
#endif
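
The same sanity loop is easy to reproduce from userspace with the RDRAND
intrinsic; a rough sketch for x86-64 (build with gcc -mrdrnd; the 8 iterations
mirror the kernel's SANITY_CHECK_LOOPS constant, presumably defined near the
top of rdrand.c and not visible in this hunk):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
	unsigned long long prev = 0, tmp = 0;
	unsigned int changed = 0;

	if (!_rdrand64_step(&prev))
		return 1;	/* RDRAND not ready or unsupported */

	for (int i = 0; i < 8; i++) {
		if (_rdrand64_step(&tmp)) {
			if (tmp != prev)
				changed++;
			prev = tmp;
		}
	}

	puts(changed ? "RDRAND output varies, as expected"
		     : "RDRAND output looks stuck");
	return 0;
}
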
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index a46dee8e78db..2e3b06d6bbc6 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -461,10 +461,8 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
}
rdtgrp = rdtgroup_kn_lock_live(of->kn);
- rdt_last_cmd_clear();
if (!rdtgrp) {
ret = -ENOENT;
- rdt_last_cmd_puts("Directory was removed\n");
goto unlock;
}
@@ -2648,10 +2646,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
int ret;
prdtgrp = rdtgroup_kn_lock_live(prgrp_kn);
- rdt_last_cmd_clear();
if (!prdtgrp) {
ret = -ENODEV;
- rdt_last_cmd_puts("Directory was removed\n");
goto out_unlock;
}
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
new file mode 100644
index 000000000000..3e20d322bc98
--- /dev/null
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Transactional Synchronization Extensions (TSX) control.
+ *
+ * Copyright (C) 2019 Intel Corporation
+ *
+ * Author:
+ * Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
+ */
+
+#include <linux/cpufeature.h>
+
+#include <asm/cmdline.h>
+
+#include "cpu.h"
+
+enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED;
+
+void tsx_disable(void)
+{
+ u64 tsx;
+
+ rdmsrl(MSR_IA32_TSX_CTRL, tsx);
+
+ /* Force all transactions to immediately abort */
+ tsx |= TSX_CTRL_RTM_DISABLE;
+
+ /*
+ * Ensure TSX support is not enumerated in CPUID.
+	 * This is visible to userspace and ensures that it
+	 * does not waste resources trying TSX transactions
+	 * that will always abort.
+ */
+ tsx |= TSX_CTRL_CPUID_CLEAR;
+
+ wrmsrl(MSR_IA32_TSX_CTRL, tsx);
+}
+
+void tsx_enable(void)
+{
+ u64 tsx;
+
+ rdmsrl(MSR_IA32_TSX_CTRL, tsx);
+
+ /* Enable the RTM feature in the cpu */
+ tsx &= ~TSX_CTRL_RTM_DISABLE;
+
+ /*
+ * Ensure TSX support is enumerated in CPUID.
+	 * This is visible to userspace and ensures that it
+	 * can enumerate and use the TSX feature.
+ */
+ tsx &= ~TSX_CTRL_CPUID_CLEAR;
+
+ wrmsrl(MSR_IA32_TSX_CTRL, tsx);
+}
+
+static bool __init tsx_ctrl_is_supported(void)
+{
+ u64 ia32_cap = x86_read_arch_cap_msr();
+
+ /*
+	 * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this
+	 * MSR is enumerated by the ARCH_CAP_TSX_CTRL_MSR bit in
+	 * MSR_IA32_ARCH_CAPABILITIES.
+ *
+ * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a
+ * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES
+ * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get
+ * MSR_IA32_TSX_CTRL support even after a microcode update. Thus,
+ * tsx= cmdline requests will do nothing on CPUs without
+ * MSR_IA32_TSX_CTRL support.
+ */
+ return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR);
+}
+
+static enum tsx_ctrl_states x86_get_tsx_auto_mode(void)
+{
+ if (boot_cpu_has_bug(X86_BUG_TAA))
+ return TSX_CTRL_DISABLE;
+
+ return TSX_CTRL_ENABLE;
+}
+
+void __init tsx_init(void)
+{
+ char arg[5] = {};
+ int ret;
+
+ if (!tsx_ctrl_is_supported())
+ return;
+
+ ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg));
+ if (ret >= 0) {
+ if (!strcmp(arg, "on")) {
+ tsx_ctrl_state = TSX_CTRL_ENABLE;
+ } else if (!strcmp(arg, "off")) {
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ } else if (!strcmp(arg, "auto")) {
+ tsx_ctrl_state = x86_get_tsx_auto_mode();
+ } else {
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ pr_err("tsx: invalid option, defaulting to off\n");
+ }
+ } else {
+ /* tsx= not provided */
+ if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO))
+ tsx_ctrl_state = x86_get_tsx_auto_mode();
+ else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF))
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ else
+ tsx_ctrl_state = TSX_CTRL_ENABLE;
+ }
+
+ if (tsx_ctrl_state == TSX_CTRL_DISABLE) {
+ tsx_disable();
+
+ /*
+ * tsx_disable() will change the state of the
+ * RTM CPUID bit. Clear it here since it is now
+		 * expected to be clear.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RTM);
+ } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) {
+
+ /*
+		 * Hardware enables TSX by default at bootup.
+		 * We may still need the TSX enable support
+		 * during init for special cases like
+		 * kexec after TSX was disabled.
+ */
+ tsx_enable();
+
+ /*
+ * tsx_enable() will change the state of the
+ * RTM CPUID bit. Force it here since it is now
+ * expected to be set.
+ */
+ setup_force_cpu_cap(X86_FEATURE_RTM);
+ }
+}
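
Whether a given tsx= choice took effect can be observed from userspace:
TSX_CTRL_CPUID_CLEAR hides the RTM enumeration, which lives in
CPUID.(EAX=7,ECX=0):EBX bit 11. A small illustrative sketch using the
GCC/clang cpuid.h helper:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
		return 1;	/* leaf 7 not supported */

	printf("RTM is %s\n", (ebx & (1u << 11)) ? "enumerated" : "hidden");
	return 0;
}
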
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index eb651fbde92a..00fc55ac7ffa 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -24,6 +24,7 @@
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
+#include <linux/memblock.h>
#include <asm/processor.h>
#include <asm/hardirq.h>
@@ -39,6 +40,7 @@
#include <asm/virtext.h>
#include <asm/intel_pt.h>
#include <asm/crash.h>
+#include <asm/cmdline.h>
/* Used while preparing memory map entries for second kernel */
struct crash_memmap_data {
@@ -68,6 +70,19 @@ static inline void cpu_crash_vmclear_loaded_vmcss(void)
rcu_read_unlock();
}
+/*
+ * When the crashkernel option is specified, only use the low
+ * 1M for the real mode trampoline.
+ */
+void __init crash_reserve_low_1M(void)
+{
+ if (cmdline_find_option(boot_command_line, "crashkernel", NULL, 0) < 0)
+ return;
+
+ memblock_reserve(0, 1<<20);
+ pr_info("Reserving the low 1M of memory for crashkernel\n");
+}
+
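
crash_reserve_low_1M() only blocks the first megabyte out of memblock; the
elf_header_exclude_ranges() change below then carves the same window out of
the dump ranges. A toy model of that carve-out in plain C (this is not the
kernel's crash_exclude_mem_range(), just an illustration of clipping an
inclusive range):

#include <stdio.h>

struct range { unsigned long long start, end; };

int main(void)
{
	struct range ram = { 0x0, 0x7fffffffULL };	/* pretend 2G of RAM */
	const unsigned long long low_end = (1ULL << 20) - 1;

	/* Clip the reserved low 1M off the head of the range. */
	if (ram.start <= low_end)
		ram.start = low_end + 1;

	printf("dumpable: 0x%llx - 0x%llx\n", ram.start, ram.end);
	return 0;
}
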
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
@@ -173,8 +188,6 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
#ifdef CONFIG_KEXEC_FILE
-static unsigned long crash_zero_bytes;
-
static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
{
unsigned int *nr_ranges = arg;
@@ -189,8 +202,7 @@ static struct crash_mem *fill_up_crash_elf_data(void)
unsigned int nr_ranges = 0;
struct crash_mem *cmem;
- walk_system_ram_res(0, -1, &nr_ranges,
- get_nr_ram_ranges_callback);
+ walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback);
if (!nr_ranges)
return NULL;
@@ -217,15 +229,19 @@ static int elf_header_exclude_ranges(struct crash_mem *cmem)
{
int ret = 0;
+ /* Exclude the low 1M because it is always reserved */
+	ret = crash_exclude_mem_range(cmem, 0, (1<<20)-1);
+ if (ret)
+ return ret;
+
/* Exclude crashkernel region */
ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
if (ret)
return ret;
- if (crashk_low_res.end) {
+ if (crashk_low_res.end)
ret = crash_exclude_mem_range(cmem, crashk_low_res.start,
- crashk_low_res.end);
- }
+ crashk_low_res.end);
return ret;
}
@@ -246,16 +262,13 @@ static int prepare_elf_headers(struct kimage *image, void **addr,
unsigned long *sz)
{
struct crash_mem *cmem;
- Elf64_Ehdr *ehdr;
- Elf64_Phdr *phdr;
- int ret, i;
+ int ret;
cmem = fill_up_crash_elf_data();
if (!cmem)
return -ENOMEM;
- ret = walk_system_ram_res(0, -1, cmem,
- prepare_elf64_ram_headers_callback);
+ ret = walk_system_ram_res(0, -1, cmem, prepare_elf64_ram_headers_callback);
if (ret)
goto out;
@@ -265,24 +278,8 @@ static int prepare_elf_headers(struct kimage *image, void **addr,
goto out;
/* By default prepare 64bit headers */
- ret = crash_prepare_elf64_headers(cmem,
- IS_ENABLED(CONFIG_X86_64), addr, sz);
- if (ret)
- goto out;
+ ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz);
- /*
- * If a range matches backup region, adjust offset to backup
- * segment.
- */
- ehdr = (Elf64_Ehdr *)*addr;
- phdr = (Elf64_Phdr *)(ehdr + 1);
- for (i = 0; i < ehdr->e_phnum; phdr++, i++)
- if (phdr->p_type == PT_LOAD &&
- phdr->p_paddr == image->arch.backup_src_start &&
- phdr->p_memsz == image->arch.backup_src_sz) {
- phdr->p_offset = image->arch.backup_load_addr;
- break;
- }
out:
vfree(cmem);
return ret;
@@ -296,8 +293,7 @@ static int add_e820_entry(struct boot_params *params, struct e820_entry *entry)
if (nr_e820_entries >= E820_MAX_ENTRIES_ZEROPAGE)
return 1;
- memcpy(&params->e820_table[nr_e820_entries], entry,
- sizeof(struct e820_entry));
+ memcpy(&params->e820_table[nr_e820_entries], entry, sizeof(struct e820_entry));
params->e820_entries++;
return 0;
}
@@ -321,19 +317,11 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
unsigned long long mend)
{
unsigned long start, end;
- int ret = 0;
cmem->ranges[0].start = mstart;
cmem->ranges[0].end = mend;
cmem->nr_ranges = 1;
- /* Exclude Backup region */
- start = image->arch.backup_load_addr;
- end = start + image->arch.backup_src_sz - 1;
- ret = crash_exclude_mem_range(cmem, start, end);
- if (ret)
- return ret;
-
/* Exclude elf header region */
start = image->arch.elf_load_addr;
end = start + image->arch.elf_headers_sz - 1;
@@ -356,28 +344,28 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
memset(&cmd, 0, sizeof(struct crash_memmap_data));
cmd.params = params;
- /* Add first 640K segment */
- ei.addr = image->arch.backup_src_start;
- ei.size = image->arch.backup_src_sz;
- ei.type = E820_TYPE_RAM;
- add_e820_entry(params, &ei);
+ /* Add the low 1M */
+ cmd.type = E820_TYPE_RAM;
+ flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+ walk_iomem_res_desc(IORES_DESC_NONE, flags, 0, (1<<20)-1, &cmd,
+ memmap_entry_callback);
/* Add ACPI tables */
cmd.type = E820_TYPE_ACPI;
flags = IORESOURCE_MEM | IORESOURCE_BUSY;
walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd,
- memmap_entry_callback);
+ memmap_entry_callback);
/* Add ACPI Non-volatile Storage */
cmd.type = E820_TYPE_NVS;
walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd,
- memmap_entry_callback);
+ memmap_entry_callback);
/* Add e820 reserved ranges */
cmd.type = E820_TYPE_RESERVED;
flags = IORESOURCE_MEM;
walk_iomem_res_desc(IORES_DESC_RESERVED, flags, 0, -1, &cmd,
- memmap_entry_callback);
+ memmap_entry_callback);
/* Add crashk_low_res region */
if (crashk_low_res.end) {
@@ -388,8 +376,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
}
/* Exclude some ranges from crashk_res and add rest to memmap */
- ret = memmap_exclude_ranges(image, cmem, crashk_res.start,
- crashk_res.end);
+ ret = memmap_exclude_ranges(image, cmem, crashk_res.start, crashk_res.end);
if (ret)
goto out;
@@ -409,55 +396,12 @@ out:
return ret;
}
-static int determine_backup_region(struct resource *res, void *arg)
-{
- struct kimage *image = arg;
-
- image->arch.backup_src_start = res->start;
- image->arch.backup_src_sz = resource_size(res);
-
- /* Expecting only one range for backup region */
- return 1;
-}
-
int crash_load_segments(struct kimage *image)
{
int ret;
struct kexec_buf kbuf = { .image = image, .buf_min = 0,
.buf_max = ULONG_MAX, .top_down = false };
- /*
- * Determine and load a segment for backup area. First 640K RAM
- * region is backup source
- */
-
- ret = walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END,
- image, determine_backup_region);
-
- /* Zero or postive return values are ok */
- if (ret < 0)
- return ret;
-
- /* Add backup segment. */
- if (image->arch.backup_src_sz) {
- kbuf.buffer = &crash_zero_bytes;
- kbuf.bufsz = sizeof(crash_zero_bytes);
- kbuf.memsz = image->arch.backup_src_sz;
- kbuf.buf_align = PAGE_SIZE;
- /*
- * Ideally there is no source for backup segment. This is
- * copied in purgatory after crash. Just add a zero filled
- * segment for now to make sure checksum logic works fine.
- */
- ret = kexec_add_buffer(&kbuf);
- if (ret)
- return ret;
- image->arch.backup_load_addr = kbuf.mem;
- pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n",
- image->arch.backup_load_addr,
- image->arch.backup_src_start, kbuf.memsz);
- }
-
/* Prepare elf headers and add a segment */
ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz);
if (ret)
diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c
index 0b8cedb20d6d..0d6c657593f8 100644
--- a/arch/x86/kernel/doublefault.c
+++ b/arch/x86/kernel/doublefault.c
@@ -54,7 +54,7 @@ struct x86_hw_tss doublefault_tss __cacheline_aligned = {
.sp0 = STACK_START,
.ss0 = __KERNEL_DS,
.ldt = 0,
- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+ .io_bitmap_base = IO_BITMAP_OFFSET_INVALID,
.ip = (unsigned long) doublefault_fn,
/* 0x2 bit is always set */
@@ -65,6 +65,9 @@ struct x86_hw_tss doublefault_tss __cacheline_aligned = {
.ss = __KERNEL_DS,
.ds = __USER_DS,
.fs = __KERNEL_PERCPU,
+#ifndef CONFIG_X86_32_LAZY_GS
+ .gs = __KERNEL_STACK_CANARY,
+#endif
.__cr3 = __pa_nodebug(swapper_pg_dir),
};
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 7da2bcd2b8eb..c5399e80c59c 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -190,6 +190,7 @@ static void __init e820_print_type(enum e820_type type)
case E820_TYPE_RAM: /* Fall through: */
case E820_TYPE_RESERVED_KERN: pr_cont("usable"); break;
case E820_TYPE_RESERVED: pr_cont("reserved"); break;
+ case E820_TYPE_SOFT_RESERVED: pr_cont("soft reserved"); break;
case E820_TYPE_ACPI: pr_cont("ACPI data"); break;
case E820_TYPE_NVS: pr_cont("ACPI NVS"); break;
case E820_TYPE_UNUSABLE: pr_cont("unusable"); break;
@@ -999,6 +1000,17 @@ void __init e820__reserve_setup_data(void)
data = early_memremap(pa_data, sizeof(*data));
e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
e820__range_update_kexec(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
+
+ if (data->type == SETUP_INDIRECT &&
+ ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) {
+ e820__range_update(((struct setup_indirect *)data->data)->addr,
+ ((struct setup_indirect *)data->data)->len,
+ E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
+ e820__range_update_kexec(((struct setup_indirect *)data->data)->addr,
+ ((struct setup_indirect *)data->data)->len,
+ E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
+ }
+
pa_data = data->next;
early_memunmap(data, sizeof(*data));
}
@@ -1037,6 +1049,7 @@ static const char *__init e820_type_to_string(struct e820_entry *entry)
case E820_TYPE_PRAM: return "Persistent Memory (legacy)";
case E820_TYPE_PMEM: return "Persistent Memory";
case E820_TYPE_RESERVED: return "Reserved";
+ case E820_TYPE_SOFT_RESERVED: return "Soft Reserved";
default: return "Unknown E820 type";
}
}
@@ -1052,6 +1065,7 @@ static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry)
case E820_TYPE_PRAM: /* Fall-through: */
case E820_TYPE_PMEM: /* Fall-through: */
case E820_TYPE_RESERVED: /* Fall-through: */
+ case E820_TYPE_SOFT_RESERVED: /* Fall-through: */
default: return IORESOURCE_MEM;
}
}
@@ -1064,6 +1078,7 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
case E820_TYPE_PMEM: return IORES_DESC_PERSISTENT_MEMORY;
case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
case E820_TYPE_RESERVED: return IORES_DESC_RESERVED;
+ case E820_TYPE_SOFT_RESERVED: return IORES_DESC_SOFT_RESERVED;
case E820_TYPE_RESERVED_KERN: /* Fall-through: */
case E820_TYPE_RAM: /* Fall-through: */
case E820_TYPE_UNUSABLE: /* Fall-through: */
@@ -1078,11 +1093,12 @@ static bool __init do_mark_busy(enum e820_type type, struct resource *res)
return true;
/*
- * Treat persistent memory like device memory, i.e. reserve it
- * for exclusive use of a driver
+ * Treat persistent memory and other special memory ranges like
+ * device memory, i.e. reserve it for exclusive use of a driver
*/
switch (type) {
case E820_TYPE_RESERVED:
+ case E820_TYPE_SOFT_RESERVED:
case E820_TYPE_PRAM:
case E820_TYPE_PMEM:
return false;
@@ -1285,6 +1301,9 @@ void __init e820__memblock_setup(void)
if (end != (resource_size_t)end)
continue;
+ if (entry->type == E820_TYPE_SOFT_RESERVED)
+ memblock_reserve(entry->addr, entry->size);
+
if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
continue;
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 6f6b1d04dadf..4cba91ec8049 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -710,6 +710,8 @@ static struct chipset early_qrk[] __initdata = {
*/
{ PCI_VENDOR_ID_INTEL, 0x0f00,
PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
+ { PCI_VENDOR_ID_INTEL, 0x3ec4,
+ PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
{ PCI_VENDOR_ID_BROADCOM, 0x4331,
PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset},
{}
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index e5cb67d67c03..319be936c348 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -60,7 +60,7 @@ u64 xfeatures_mask __read_mostly;
static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
-static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8];
+static unsigned int xstate_comp_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
/*
* The XSAVE area of kernel can be in standard or compacted format;
@@ -254,10 +254,13 @@ static void __init setup_xstate_features(void)
* in the fixed offsets in the xsave area in either compacted form
* or standard form.
*/
- xstate_offsets[0] = 0;
- xstate_sizes[0] = offsetof(struct fxregs_state, xmm_space);
- xstate_offsets[1] = xstate_sizes[0];
- xstate_sizes[1] = FIELD_SIZEOF(struct fxregs_state, xmm_space);
+ xstate_offsets[XFEATURE_FP] = 0;
+ xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state,
+ xmm_space);
+
+ xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP];
+ xstate_sizes[XFEATURE_SSE] = FIELD_SIZEOF(struct fxregs_state,
+ xmm_space);
for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
if (!xfeature_enabled(i))
@@ -342,7 +345,7 @@ static int xfeature_is_aligned(int xfeature_nr)
*/
static void __init setup_xstate_comp(void)
{
- unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8];
+ unsigned int xstate_comp_sizes[XFEATURE_MAX];
int i;
/*
@@ -350,8 +353,9 @@ static void __init setup_xstate_comp(void)
* in the fixed offsets in the xsave area in either compacted form
* or standard form.
*/
- xstate_comp_offsets[0] = 0;
- xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space);
+ xstate_comp_offsets[XFEATURE_FP] = 0;
+ xstate_comp_offsets[XFEATURE_SSE] = offsetof(struct fxregs_state,
+ xmm_space);
if (!boot_cpu_has(X86_FEATURE_XSAVES)) {
for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
@@ -840,7 +844,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
/*
* We should not ever be requesting features that we
- * have not enabled. Remember that pcntxt_mask is
+ * have not enabled. Remember that xfeatures_mask is
* what we write to the XCR0 register.
*/
WARN_ONCE(!(xfeatures_mask & BIT_ULL(xfeature_nr)),
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 024c3053dbba..060a361d9d11 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -1043,6 +1043,20 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
return;
/*
+	 * If the return location points directly to the start of a direct
+	 * trampoline (if we trace the trampoline itself, it will still be
+	 * offset by MCOUNT_INSN_SIZE), then the return address is off by
+	 * one word and we need to adjust for that.
+ */
+ if (ftrace_direct_func_count) {
+ if (ftrace_find_direct_func(self_addr + MCOUNT_INSN_SIZE)) {
+ self_addr = *parent;
+ parent++;
+ }
+ }
+
+ /*
* Protect against fault, even if it shouldn't
* happen. This tool is too much intrusive to
* ignore such a protection.
diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
index 073aab525d80..e8a9f8370112 100644
--- a/arch/x86/kernel/ftrace_32.S
+++ b/arch/x86/kernel/ftrace_32.S
@@ -12,20 +12,18 @@
#include <asm/frame.h>
#include <asm/asm-offsets.h>
-# define function_hook __fentry__
-EXPORT_SYMBOL(__fentry__)
-
#ifdef CONFIG_FRAME_POINTER
# define MCOUNT_FRAME 1 /* using frame = true */
#else
# define MCOUNT_FRAME 0 /* using frame = false */
#endif
-ENTRY(function_hook)
+SYM_FUNC_START(__fentry__)
ret
-END(function_hook)
+SYM_FUNC_END(__fentry__)
+EXPORT_SYMBOL(__fentry__)
-ENTRY(ftrace_caller)
+SYM_CODE_START(ftrace_caller)
#ifdef CONFIG_FRAME_POINTER
/*
@@ -85,11 +83,11 @@ ftrace_graph_call:
#endif
/* This is weak to keep gas from relaxing the jumps */
-WEAK(ftrace_stub)
+SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK)
ret
-END(ftrace_caller)
+SYM_CODE_END(ftrace_caller)
-ENTRY(ftrace_regs_caller)
+SYM_CODE_START(ftrace_regs_caller)
/*
* We're here from an mcount/fentry CALL, and the stack frame looks like:
*
@@ -138,7 +136,7 @@ ENTRY(ftrace_regs_caller)
movl function_trace_op, %ecx # 3rd argument: ftrace_pos
pushl %esp # 4th argument: pt_regs
-GLOBAL(ftrace_regs_call)
+SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
call ftrace_stub
addl $4, %esp # skip 4th argument
@@ -163,9 +161,10 @@ GLOBAL(ftrace_regs_call)
popl %eax
jmp .Lftrace_ret
+SYM_CODE_END(ftrace_regs_caller)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-ENTRY(ftrace_graph_caller)
+SYM_CODE_START(ftrace_graph_caller)
pushl %eax
pushl %ecx
pushl %edx
@@ -179,7 +178,7 @@ ENTRY(ftrace_graph_caller)
popl %ecx
popl %eax
ret
-END(ftrace_graph_caller)
+SYM_CODE_END(ftrace_graph_caller)
.globl return_to_handler
return_to_handler:
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 809d54397dba..369e61faacfe 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -14,9 +14,6 @@
.code64
.section .entry.text, "ax"
-# define function_hook __fentry__
-EXPORT_SYMBOL(__fentry__)
-
#ifdef CONFIG_FRAME_POINTER
/* Save parent and function stack frames (rip and rbp) */
# define MCOUNT_FRAME_SIZE (8+16*2)
@@ -88,6 +85,7 @@ EXPORT_SYMBOL(__fentry__)
movq %rdi, RDI(%rsp)
movq %r8, R8(%rsp)
movq %r9, R9(%rsp)
+ movq $0, ORIG_RAX(%rsp)
/*
* Save the original RBP. Even though the mcount ABI does not
* require this, it helps out callers.
@@ -114,7 +112,11 @@ EXPORT_SYMBOL(__fentry__)
subq $MCOUNT_INSN_SIZE, %rdi
.endm
-.macro restore_mcount_regs
+.macro restore_mcount_regs save=0
+
+ /* ftrace_regs_caller or frame pointers require this */
+ movq RBP(%rsp), %rbp
+
movq R9(%rsp), %r9
movq R8(%rsp), %r8
movq RDI(%rsp), %rdi
@@ -123,31 +125,29 @@ EXPORT_SYMBOL(__fentry__)
movq RCX(%rsp), %rcx
movq RAX(%rsp), %rax
- /* ftrace_regs_caller can modify %rbp */
- movq RBP(%rsp), %rbp
-
- addq $MCOUNT_REG_SIZE, %rsp
+ addq $MCOUNT_REG_SIZE-\save, %rsp
.endm
#ifdef CONFIG_DYNAMIC_FTRACE
-ENTRY(function_hook)
+SYM_FUNC_START(__fentry__)
retq
-ENDPROC(function_hook)
+SYM_FUNC_END(__fentry__)
+EXPORT_SYMBOL(__fentry__)
-ENTRY(ftrace_caller)
+SYM_FUNC_START(ftrace_caller)
/* save_mcount_regs fills in first two parameters */
save_mcount_regs
-GLOBAL(ftrace_caller_op_ptr)
+SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL)
/* Load the ftrace_ops into the 3rd parameter */
movq function_trace_op(%rip), %rdx
/* regs go into 4th parameter (but make it NULL) */
movq $0, %rcx
-GLOBAL(ftrace_call)
+SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
call ftrace_stub
restore_mcount_regs
@@ -157,10 +157,10 @@ GLOBAL(ftrace_call)
* think twice before adding any new code or changing the
* layout here.
*/
-GLOBAL(ftrace_epilogue)
+SYM_INNER_LABEL(ftrace_epilogue, SYM_L_GLOBAL)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-GLOBAL(ftrace_graph_call)
+SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL)
jmp ftrace_stub
#endif
@@ -168,19 +168,21 @@ GLOBAL(ftrace_graph_call)
* This is weak to keep gas from relaxing the jumps.
* It is also used to copy the retq for trampolines.
*/
-WEAK(ftrace_stub)
+SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK)
retq
-ENDPROC(ftrace_caller)
+SYM_FUNC_END(ftrace_caller)
-ENTRY(ftrace_regs_caller)
+SYM_FUNC_START(ftrace_regs_caller)
/* Save the current flags before any operations that can change them */
pushfq
+ UNWIND_HINT_SAVE
+
/* added 8 bytes to save flags */
save_mcount_regs 8
/* save_mcount_regs fills in first two parameters */
-GLOBAL(ftrace_regs_caller_op_ptr)
+SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL)
/* Load the ftrace_ops into the 3rd parameter */
movq function_trace_op(%rip), %rdx
@@ -209,7 +211,7 @@ GLOBAL(ftrace_regs_caller_op_ptr)
/* regs go into 4th parameter */
leaq (%rsp), %rcx
-GLOBAL(ftrace_regs_call)
+SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
call ftrace_stub
/* Copy flags back to SS, to restore them */
@@ -228,7 +230,33 @@ GLOBAL(ftrace_regs_call)
movq R10(%rsp), %r10
movq RBX(%rsp), %rbx
- restore_mcount_regs
+ movq ORIG_RAX(%rsp), %rax
+ movq %rax, MCOUNT_REG_SIZE-8(%rsp)
+
+	/* If ORIG_RAX is anything but zero, make this a call to that address */
+ movq ORIG_RAX(%rsp), %rax
+ cmpq $0, %rax
+ je 1f
+
+ /* Swap the flags with orig_rax */
+ movq MCOUNT_REG_SIZE(%rsp), %rdi
+ movq %rdi, MCOUNT_REG_SIZE-8(%rsp)
+ movq %rax, MCOUNT_REG_SIZE(%rsp)
+
+ restore_mcount_regs 8
+
+ jmp 2f
+
+1: restore_mcount_regs
+
+
+2:
+ /*
+	 * The stack layout is nondeterministic here, depending on which path was
+ * taken. This confuses objtool and ORC, rightfully so. For now,
+ * pretend the stack always looks like the non-direct case.
+ */
+ UNWIND_HINT_RESTORE
/* Restore flags */
popfq
@@ -239,16 +267,16 @@ GLOBAL(ftrace_regs_call)
* The trampoline will add the code to jump
* to the return.
*/
-GLOBAL(ftrace_regs_caller_end)
+SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL)
jmp ftrace_epilogue
-ENDPROC(ftrace_regs_caller)
+SYM_FUNC_END(ftrace_regs_caller)
#else /* ! CONFIG_DYNAMIC_FTRACE */
-ENTRY(function_hook)
+SYM_FUNC_START(__fentry__)
cmpq $ftrace_stub, ftrace_trace_function
jnz trace
@@ -261,7 +289,7 @@ fgraph_trace:
jnz ftrace_graph_caller
#endif
-GLOBAL(ftrace_stub)
+SYM_INNER_LABEL(ftrace_stub, SYM_L_GLOBAL)
retq
trace:
@@ -279,11 +307,12 @@ trace:
restore_mcount_regs
jmp fgraph_trace
-ENDPROC(function_hook)
+SYM_FUNC_END(__fentry__)
+EXPORT_SYMBOL(__fentry__)
#endif /* CONFIG_DYNAMIC_FTRACE */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-ENTRY(ftrace_graph_caller)
+SYM_FUNC_START(ftrace_graph_caller)
/* Saves rbp into %rdx and fills first parameter */
save_mcount_regs
@@ -294,9 +323,9 @@ ENTRY(ftrace_graph_caller)
restore_mcount_regs
retq
-ENDPROC(ftrace_graph_caller)
+SYM_FUNC_END(ftrace_graph_caller)
-ENTRY(return_to_handler)
+SYM_CODE_START(return_to_handler)
UNWIND_HINT_EMPTY
subq $24, %rsp
@@ -312,5 +341,5 @@ ENTRY(return_to_handler)
movq (%rsp), %rax
addq $24, %rsp
JMP_NOSPEC %rdi
-END(return_to_handler)
+SYM_CODE_END(return_to_handler)
#endif
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 30f9cb2c0b55..3923ab4630d7 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -64,7 +64,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE)
* can.
*/
__HEAD
-ENTRY(startup_32)
+SYM_CODE_START(startup_32)
movl pa(initial_stack),%ecx
/* test KEEP_SEGMENTS flag to see if the bootloader is asking
@@ -156,7 +156,7 @@ ENTRY(startup_32)
jmp *%eax
.Lbad_subarch:
-WEAK(xen_entry)
+SYM_INNER_LABEL_ALIGN(xen_entry, SYM_L_WEAK)
/* Unknown implementation; there's really
nothing we can do at this point. */
ud2a
@@ -172,6 +172,7 @@ num_subarch_entries = (. - subarch_entries) / 4
#else
jmp .Ldefault_entry
#endif /* CONFIG_PARAVIRT */
+SYM_CODE_END(startup_32)
#ifdef CONFIG_HOTPLUG_CPU
/*
@@ -179,12 +180,12 @@ num_subarch_entries = (. - subarch_entries) / 4
* up already except stack. We just set up stack here. Then call
* start_secondary().
*/
-ENTRY(start_cpu0)
+SYM_FUNC_START(start_cpu0)
movl initial_stack, %ecx
movl %ecx, %esp
call *(initial_code)
1: jmp 1b
-ENDPROC(start_cpu0)
+SYM_FUNC_END(start_cpu0)
#endif
/*
@@ -195,7 +196,7 @@ ENDPROC(start_cpu0)
* If cpu hotplug is not supported then this code can go in init section
* which will be freed later
*/
-ENTRY(startup_32_smp)
+SYM_FUNC_START(startup_32_smp)
cld
movl $(__BOOT_DS),%eax
movl %eax,%ds
@@ -362,7 +363,7 @@ ENTRY(startup_32_smp)
call *(initial_code)
1: jmp 1b
-ENDPROC(startup_32_smp)
+SYM_FUNC_END(startup_32_smp)
#include "verify_cpu.S"
@@ -392,7 +393,7 @@ setup_once:
andl $0,setup_once_ref /* Once is enough, thanks */
ret
-ENTRY(early_idt_handler_array)
+SYM_FUNC_START(early_idt_handler_array)
# 36(%esp) %eflags
# 32(%esp) %cs
# 28(%esp) %eip
@@ -407,9 +408,9 @@ ENTRY(early_idt_handler_array)
i = i + 1
.fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
.endr
-ENDPROC(early_idt_handler_array)
+SYM_FUNC_END(early_idt_handler_array)
-early_idt_handler_common:
+SYM_CODE_START_LOCAL(early_idt_handler_common)
/*
* The stack is the hardware frame, an error code or zero, and the
* vector number.
@@ -460,10 +461,10 @@ early_idt_handler_common:
decl %ss:early_recursion_flag
addl $4, %esp /* pop pt_regs->orig_ax */
iret
-ENDPROC(early_idt_handler_common)
+SYM_CODE_END(early_idt_handler_common)
/* This is the default interrupt "handler" :-) */
-ENTRY(early_ignore_irq)
+SYM_FUNC_START(early_ignore_irq)
cld
#ifdef CONFIG_PRINTK
pushl %eax
@@ -498,19 +499,16 @@ ENTRY(early_ignore_irq)
hlt_loop:
hlt
jmp hlt_loop
-ENDPROC(early_ignore_irq)
+SYM_FUNC_END(early_ignore_irq)
__INITDATA
.align 4
-GLOBAL(early_recursion_flag)
- .long 0
+SYM_DATA(early_recursion_flag, .long 0)
__REFDATA
.align 4
-ENTRY(initial_code)
- .long i386_start_kernel
-ENTRY(setup_once_ref)
- .long setup_once
+SYM_DATA(initial_code, .long i386_start_kernel)
+SYM_DATA(setup_once_ref, .long setup_once)
#ifdef CONFIG_PAGE_TABLE_ISOLATION
#define PGD_ALIGN (2 * PAGE_SIZE)
@@ -553,7 +551,7 @@ EXPORT_SYMBOL(empty_zero_page)
__PAGE_ALIGNED_DATA
/* Page-aligned for the benefit of paravirt? */
.align PGD_ALIGN
-ENTRY(initial_page_table)
+SYM_DATA_START(initial_page_table)
.long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
# if KPMDS == 3
.long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
@@ -571,17 +569,28 @@ ENTRY(initial_page_table)
# error "Kernel PMDs should be 1, 2 or 3"
# endif
.align PAGE_SIZE /* needs to be page-sized too */
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ /*
+ * PTI needs another page so sync_initial_pagetable() works correctly
+ * and does not scribble over the data which is placed behind the
+ * actual initial_page_table. See clone_pgd_range().
+ */
+ .fill 1024, 4, 0
+#endif
+
+SYM_DATA_END(initial_page_table)
#endif
.data
.balign 4
-ENTRY(initial_stack)
- /*
- * The SIZEOF_PTREGS gap is a convention which helps the in-kernel
- * unwinder reliably detect the end of the stack.
- */
- .long init_thread_union + THREAD_SIZE - SIZEOF_PTREGS - \
- TOP_OF_KERNEL_STACK_PADDING;
+/*
+ * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder
+ * reliably detect the end of the stack.
+ */
+SYM_DATA(initial_stack,
+ .long init_thread_union + THREAD_SIZE -
+ SIZEOF_PTREGS - TOP_OF_KERNEL_STACK_PADDING)
__INITRODATA
int_msg:
@@ -597,27 +606,28 @@ int_msg:
*/
.data
-.globl boot_gdt_descr
-
ALIGN
# early boot GDT descriptor (must use 1:1 address mapping)
.word 0 # 32 bit align gdt_desc.address
-boot_gdt_descr:
+SYM_DATA_START_LOCAL(boot_gdt_descr)
.word __BOOT_DS+7
.long boot_gdt - __PAGE_OFFSET
+SYM_DATA_END(boot_gdt_descr)
# boot GDT descriptor (later on used by CPU#0):
.word 0 # 32 bit align gdt_desc.address
-ENTRY(early_gdt_descr)
+SYM_DATA_START(early_gdt_descr)
.word GDT_ENTRIES*8-1
.long gdt_page /* Overwritten for secondary CPUs */
+SYM_DATA_END(early_gdt_descr)
/*
* The boot_gdt must mirror the equivalent in setup.S and is
* used only for booting.
*/
.align L1_CACHE_BYTES
-ENTRY(boot_gdt)
+SYM_DATA_START(boot_gdt)
.fill GDT_ENTRY_BOOT_CS,8,0
.quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
.quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
+SYM_DATA_END(boot_gdt)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index f3d3e9646a99..4bbc770af632 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -49,8 +49,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
.text
__HEAD
.code64
- .globl startup_64
-startup_64:
+SYM_CODE_START_NOALIGN(startup_64)
UNWIND_HINT_EMPTY
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
@@ -90,7 +89,9 @@ startup_64:
/* Form the CR3 value being sure to include the CR3 modifier */
addq $(early_top_pgt - __START_KERNEL_map), %rax
jmp 1f
-ENTRY(secondary_startup_64)
+SYM_CODE_END(startup_64)
+
+SYM_CODE_START(secondary_startup_64)
UNWIND_HINT_EMPTY
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
@@ -240,7 +241,7 @@ ENTRY(secondary_startup_64)
pushq %rax # target address in negative space
lretq
.Lafter_lret:
-END(secondary_startup_64)
+SYM_CODE_END(secondary_startup_64)
#include "verify_cpu.S"
@@ -250,30 +251,28 @@ END(secondary_startup_64)
* up already except stack. We just set up stack here. Then call
* start_secondary() via .Ljump_to_C_code.
*/
-ENTRY(start_cpu0)
+SYM_CODE_START(start_cpu0)
UNWIND_HINT_EMPTY
movq initial_stack(%rip), %rsp
jmp .Ljump_to_C_code
-END(start_cpu0)
+SYM_CODE_END(start_cpu0)
#endif
/* Both SMP bootup and ACPI suspend change these variables */
__REFDATA
.balign 8
- GLOBAL(initial_code)
- .quad x86_64_start_kernel
- GLOBAL(initial_gs)
- .quad INIT_PER_CPU_VAR(fixed_percpu_data)
- GLOBAL(initial_stack)
- /*
- * The SIZEOF_PTREGS gap is a convention which helps the in-kernel
- * unwinder reliably detect the end of the stack.
- */
- .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
+SYM_DATA(initial_code, .quad x86_64_start_kernel)
+SYM_DATA(initial_gs, .quad INIT_PER_CPU_VAR(fixed_percpu_data))
+
+/*
+ * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder
+ * reliably detect the end of the stack.
+ */
+SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS)
__FINITDATA
__INIT
-ENTRY(early_idt_handler_array)
+SYM_CODE_START(early_idt_handler_array)
i = 0
.rept NUM_EXCEPTION_VECTORS
.if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
@@ -289,9 +288,9 @@ ENTRY(early_idt_handler_array)
.fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
.endr
UNWIND_HINT_IRET_REGS offset=16
-END(early_idt_handler_array)
+SYM_CODE_END(early_idt_handler_array)
-early_idt_handler_common:
+SYM_CODE_START_LOCAL(early_idt_handler_common)
/*
* The stack is the hardware frame, an error code or zero, and the
* vector number.
@@ -333,17 +332,11 @@ early_idt_handler_common:
20:
decl early_recursion_flag(%rip)
jmp restore_regs_and_return_to_kernel
-END(early_idt_handler_common)
+SYM_CODE_END(early_idt_handler_common)
- __INITDATA
- .balign 4
-GLOBAL(early_recursion_flag)
- .long 0
-
-#define NEXT_PAGE(name) \
- .balign PAGE_SIZE; \
-GLOBAL(name)
+#define SYM_DATA_START_PAGE_ALIGNED(name) \
+ SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE)
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
@@ -358,11 +351,11 @@ GLOBAL(name)
*/
#define PTI_USER_PGD_FILL 512
/* This ensures they are 8k-aligned: */
-#define NEXT_PGD_PAGE(name) \
- .balign 2 * PAGE_SIZE; \
-GLOBAL(name)
+#define SYM_DATA_START_PTI_ALIGNED(name) \
+ SYM_START(name, SYM_L_GLOBAL, .balign 2 * PAGE_SIZE)
#else
-#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define SYM_DATA_START_PTI_ALIGNED(name) \
+ SYM_DATA_START_PAGE_ALIGNED(name)
#define PTI_USER_PGD_FILL 0
#endif
@@ -375,17 +368,23 @@ GLOBAL(name)
.endr
__INITDATA
-NEXT_PGD_PAGE(early_top_pgt)
+ .balign 4
+
+SYM_DATA_START_PTI_ALIGNED(early_top_pgt)
.fill 512,8,0
.fill PTI_USER_PGD_FILL,8,0
+SYM_DATA_END(early_top_pgt)
-NEXT_PAGE(early_dynamic_pgts)
+SYM_DATA_START_PAGE_ALIGNED(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
+SYM_DATA_END(early_dynamic_pgts)
+
+SYM_DATA(early_recursion_flag, .long 0)
.data
#if defined(CONFIG_XEN_PV) || defined(CONFIG_PVH)
-NEXT_PGD_PAGE(init_top_pgt)
+SYM_DATA_START_PTI_ALIGNED(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + L4_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
@@ -393,11 +392,13 @@ NEXT_PGD_PAGE(init_top_pgt)
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
.fill PTI_USER_PGD_FILL,8,0
+SYM_DATA_END(init_top_pgt)
-NEXT_PAGE(level3_ident_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level3_ident_pgt)
.quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.fill 511, 8, 0
-NEXT_PAGE(level2_ident_pgt)
+SYM_DATA_END(level3_ident_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level2_ident_pgt)
/*
* Since I easily can, map the first 1G.
* Don't set NX because code runs from these pages.
@@ -407,25 +408,29 @@ NEXT_PAGE(level2_ident_pgt)
* the CPU should ignore the bit.
*/
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+SYM_DATA_END(level2_ident_pgt)
#else
-NEXT_PGD_PAGE(init_top_pgt)
+SYM_DATA_START_PTI_ALIGNED(init_top_pgt)
.fill 512,8,0
.fill PTI_USER_PGD_FILL,8,0
+SYM_DATA_END(init_top_pgt)
#endif
#ifdef CONFIG_X86_5LEVEL
-NEXT_PAGE(level4_kernel_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level4_kernel_pgt)
.fill 511,8,0
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+SYM_DATA_END(level4_kernel_pgt)
#endif
-NEXT_PAGE(level3_kernel_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt)
.fill L3_START_KERNEL,8,0
/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
.quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+SYM_DATA_END(level3_kernel_pgt)
-NEXT_PAGE(level2_kernel_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt)
/*
* 512 MB kernel mapping. We spend a full page on this pagetable
* anyway.
@@ -442,8 +447,9 @@ NEXT_PAGE(level2_kernel_pgt)
*/
PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
KERNEL_IMAGE_SIZE/PMD_SIZE)
+SYM_DATA_END(level2_kernel_pgt)
-NEXT_PAGE(level2_fixmap_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt)
.fill (512 - 4 - FIXMAP_PMD_NUM),8,0
pgtno = 0
.rept (FIXMAP_PMD_NUM)
@@ -453,31 +459,32 @@ NEXT_PAGE(level2_fixmap_pgt)
.endr
/* 6 MB reserved space + a 2MB hole */
.fill 4,8,0
+SYM_DATA_END(level2_fixmap_pgt)
-NEXT_PAGE(level1_fixmap_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level1_fixmap_pgt)
.rept (FIXMAP_PMD_NUM)
.fill 512,8,0
.endr
+SYM_DATA_END(level1_fixmap_pgt)
#undef PMDS
.data
.align 16
- .globl early_gdt_descr
-early_gdt_descr:
- .word GDT_ENTRIES*8-1
-early_gdt_descr_base:
- .quad INIT_PER_CPU_VAR(gdt_page)
-
-ENTRY(phys_base)
- /* This must match the first entry in level2_kernel_pgt */
- .quad 0x0000000000000000
+
+SYM_DATA(early_gdt_descr, .word GDT_ENTRIES*8-1)
+SYM_DATA_LOCAL(early_gdt_descr_base, .quad INIT_PER_CPU_VAR(gdt_page))
+
+ .align 16
+/* This must match the first entry in level2_kernel_pgt */
+SYM_DATA(phys_base, .quad 0x0)
EXPORT_SYMBOL(phys_base)
#include "../../x86/xen/xen-head.S"
__PAGE_ALIGNED_BSS
-NEXT_PAGE(empty_zero_page)
+SYM_DATA_START_PAGE_ALIGNED(empty_zero_page)
.skip PAGE_SIZE
+SYM_DATA_END(empty_zero_page)
EXPORT_SYMBOL(empty_zero_page)
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 61a89d3c0382..8abeee0dd7bf 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -3,32 +3,69 @@
* This contains the io-permission bitmap code - written by obz, with changes
* by Linus. 32/64 bits code unification by Miguel Botón.
*/
-
-#include <linux/sched.h>
-#include <linux/sched/task_stack.h>
-#include <linux/kernel.h>
#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/ioport.h>
#include <linux/security.h>
-#include <linux/smp.h>
-#include <linux/stddef.h>
-#include <linux/slab.h>
-#include <linux/thread_info.h>
#include <linux/syscalls.h>
#include <linux/bitmap.h>
-#include <asm/syscalls.h>
+#include <linux/ioport.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include <asm/io_bitmap.h>
#include <asm/desc.h>
+#ifdef CONFIG_X86_IOPL_IOPERM
+
+static atomic64_t io_bitmap_sequence;
+
+void io_bitmap_share(struct task_struct *tsk)
+{
+ /* Can be NULL when current->thread.iopl_emul == 3 */
+ if (current->thread.io_bitmap) {
+ /*
+ * Take a refcount on current's bitmap. It can be used by
+ * both tasks as long as none of them changes the bitmap.
+ */
+ refcount_inc(&current->thread.io_bitmap->refcnt);
+ tsk->thread.io_bitmap = current->thread.io_bitmap;
+ }
+ set_tsk_thread_flag(tsk, TIF_IO_BITMAP);
+}
+
+static void task_update_io_bitmap(void)
+{
+ struct thread_struct *t = &current->thread;
+
+ if (t->iopl_emul == 3 || t->io_bitmap) {
+ /* TSS update is handled on exit to user space */
+ set_thread_flag(TIF_IO_BITMAP);
+ } else {
+ clear_thread_flag(TIF_IO_BITMAP);
+ /* Invalidate TSS */
+ preempt_disable();
+ tss_update_io_bitmap();
+ preempt_enable();
+ }
+}
+
+void io_bitmap_exit(void)
+{
+ struct io_bitmap *iobm = current->thread.io_bitmap;
+
+ current->thread.io_bitmap = NULL;
+ task_update_io_bitmap();
+ if (iobm && refcount_dec_and_test(&iobm->refcnt))
+ kfree(iobm);
+}
+
/*
- * this changes the io permissions bitmap in the current task.
+ * This changes the io permissions bitmap in the current task.
*/
long ksys_ioperm(unsigned long from, unsigned long num, int turn_on)
{
struct thread_struct *t = &current->thread;
- struct tss_struct *tss;
- unsigned int i, max_long, bytes, bytes_updated;
+ unsigned int i, max_long;
+ struct io_bitmap *iobm;
if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
return -EINVAL;
@@ -41,59 +78,72 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on)
* IO bitmap up. ioperm() is much less timing critical than clone(),
* this is why we delay this operation until now:
*/
- if (!t->io_bitmap_ptr) {
- unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
-
- if (!bitmap)
+ iobm = t->io_bitmap;
+ if (!iobm) {
+		/* No point in allocating a bitmap just to clear permissions */
+ if (!turn_on)
+ return 0;
+ iobm = kmalloc(sizeof(*iobm), GFP_KERNEL);
+ if (!iobm)
return -ENOMEM;
- memset(bitmap, 0xff, IO_BITMAP_BYTES);
- t->io_bitmap_ptr = bitmap;
- set_thread_flag(TIF_IO_BITMAP);
+ memset(iobm->bitmap, 0xff, sizeof(iobm->bitmap));
+ refcount_set(&iobm->refcnt, 1);
+ }
- /*
- * Now that we have an IO bitmap, we need our TSS limit to be
- * correct. It's fine if we are preempted after doing this:
- * with TIF_IO_BITMAP set, context switches will keep our TSS
- * limit correct.
- */
- preempt_disable();
- refresh_tss_limit();
- preempt_enable();
+ /*
+ * If the bitmap is not shared, then nothing can take a refcount as
+	 * current can obviously not fork at the same time. If it's shared,
+	 * duplicate it and drop the refcount on the original one.
+ */
+ if (refcount_read(&iobm->refcnt) > 1) {
+ iobm = kmemdup(iobm, sizeof(*iobm), GFP_KERNEL);
+ if (!iobm)
+ return -ENOMEM;
+ refcount_set(&iobm->refcnt, 1);
+ io_bitmap_exit();
}
/*
- * do it in the per-thread copy and in the TSS ...
- *
- * Disable preemption via get_cpu() - we must not switch away
- * because the ->io_bitmap_max value must match the bitmap
- * contents:
+ * Store the bitmap pointer (might be the same if the task already
+	 * had one). Must be done here so freeing the bitmap when all
+ * permissions are dropped has the pointer set up.
*/
- tss = &per_cpu(cpu_tss_rw, get_cpu());
+ t->io_bitmap = iobm;
+ /* Mark it active for context switching and exit to user mode */
+ set_thread_flag(TIF_IO_BITMAP);
+ /*
+	 * Update the task's bitmap. The update of the TSS bitmap happens on
+ * exit to user mode. So this needs no protection.
+ */
if (turn_on)
- bitmap_clear(t->io_bitmap_ptr, from, num);
+ bitmap_clear(iobm->bitmap, from, num);
else
- bitmap_set(t->io_bitmap_ptr, from, num);
+ bitmap_set(iobm->bitmap, from, num);
/*
* Search for a (possibly new) maximum. This is simple and stupid,
* to keep it obviously correct:
*/
- max_long = 0;
- for (i = 0; i < IO_BITMAP_LONGS; i++)
- if (t->io_bitmap_ptr[i] != ~0UL)
+ max_long = UINT_MAX;
+ for (i = 0; i < IO_BITMAP_LONGS; i++) {
+ if (iobm->bitmap[i] != ~0UL)
max_long = i;
+ }
+ /* All permissions dropped? */
+ if (max_long == UINT_MAX) {
+ io_bitmap_exit();
+ return 0;
+ }
- bytes = (max_long + 1) * sizeof(unsigned long);
- bytes_updated = max(bytes, t->io_bitmap_max);
-
- t->io_bitmap_max = bytes;
-
- /* Update the TSS: */
- memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
+ iobm->max = (max_long + 1) * sizeof(unsigned long);
- put_cpu();
+ /*
+ * Update the sequence number to force a TSS update on return to
+ * user mode.
+ */
+ iobm->sequence = atomic64_add_return(1, &io_bitmap_sequence);
return 0;
}
@@ -104,38 +154,61 @@ SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on)
}
/*
- * sys_iopl has to be used when you want to access the IO ports
- * beyond the 0x3ff range: to get the full 65536 ports bitmapped
- * you'd need 8kB of bitmaps/process, which is a bit excessive.
+ * The sys_iopl functionality depends on the level argument, which,
+ * if granted for the task, enables access to all 65536 I/O ports.
+ *
+ * This does not use the IOPL mechanism provided by the CPU as that would
+ * also allow the user space task to use the CLI/STI instructions.
*
- * Here we just change the flags value on the stack: we allow
- * only the super-user to do it. This depends on the stack-layout
- * on system-call entry - see also fork() and the signal handling
- * code.
+ * Disabling interrupts in a user space task is dangerous as it might lock
+ * up the machine, and the semantics vs. syscalls and exceptions are
+ * undefined.
+ *
+ * Setting IOPL to level 0-2 disables I/O permissions. Level 3
+ * enables them.
+ *
+ * IOPL is strictly per thread and inherited on fork.
*/
SYSCALL_DEFINE1(iopl, unsigned int, level)
{
- struct pt_regs *regs = current_pt_regs();
struct thread_struct *t = &current->thread;
-
- /*
- * Careful: the IOPL bits in regs->flags are undefined under Xen PV
- * and changing them has no effect.
- */
- unsigned int old = t->iopl >> X86_EFLAGS_IOPL_BIT;
+ unsigned int old;
if (level > 3)
return -EINVAL;
+
+ old = t->iopl_emul;
+
+ /* No point in going further if nothing changes */
+ if (level == old)
+ return 0;
+
/* Trying to gain more privileges? */
if (level > old) {
if (!capable(CAP_SYS_RAWIO) ||
security_locked_down(LOCKDOWN_IOPORT))
return -EPERM;
}
- regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) |
- (level << X86_EFLAGS_IOPL_BIT);
- t->iopl = level << X86_EFLAGS_IOPL_BIT;
- set_iopl_mask(t->iopl);
+
+ t->iopl_emul = level;
+ task_update_io_bitmap();
return 0;
}
+
+#else /* CONFIG_X86_IOPL_IOPERM */
+
+long ksys_ioperm(unsigned long from, unsigned long num, int turn_on)
+{
+ return -ENOSYS;
+}
+SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on)
+{
+ return -ENOSYS;
+}
+
+SYSCALL_DEFINE1(iopl, unsigned int, level)
+{
+ return -ENOSYS;
+}
+#endif
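The reworked ksys_ioperm() above treats the I/O bitmap as a refcounted object that is shared across fork() and duplicated on the first modification. A minimal userspace sketch of that copy-on-write pattern (illustrative names, not kernel code):

#include <stdlib.h>
#include <string.h>

struct shared_bitmap {
	unsigned long refcnt;		/* stands in for refcount_t */
	unsigned long bits[1024];
};

/* Duplicate before writing whenever another owner still holds a ref */
static struct shared_bitmap *cow_before_write(struct shared_bitmap *bm)
{
	struct shared_bitmap *copy;

	if (bm->refcnt == 1)
		return bm;		/* sole owner, write in place */

	copy = malloc(sizeof(*copy));	/* the kernel uses kmemdup() */
	if (!copy)
		return NULL;
	memcpy(copy, bm, sizeof(*copy));
	copy->refcnt = 1;
	bm->refcnt--;			/* drop our reference on the original */
	return copy;
}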
diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S
index ddeeaac8adda..0db0375235b4 100644
--- a/arch/x86/kernel/irqflags.S
+++ b/arch/x86/kernel/irqflags.S
@@ -7,20 +7,20 @@
/*
* unsigned long native_save_fl(void)
*/
-ENTRY(native_save_fl)
+SYM_FUNC_START(native_save_fl)
pushf
pop %_ASM_AX
ret
-ENDPROC(native_save_fl)
+SYM_FUNC_END(native_save_fl)
EXPORT_SYMBOL(native_save_fl)
/*
* void native_restore_fl(unsigned long flags)
* %eax/%rdi: flags
*/
-ENTRY(native_restore_fl)
+SYM_FUNC_START(native_restore_fl)
push %_ASM_ARG1
popf
ret
-ENDPROC(native_restore_fl)
+SYM_FUNC_END(native_restore_fl)
EXPORT_SYMBOL(native_restore_fl)
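For orientation, a userspace C equivalent of native_save_fl() (x86-64, GCC/Clang inline asm; a sketch, not kernel code). The kernel keeps this out of line in assembly, since an inlined pushf/pop below the stack pointer could clash with the ABI's red zone:

static unsigned long save_flags(void)
{
	unsigned long flags;

	/* push EFLAGS, then pop it into a general-purpose register */
	__asm__ __volatile__("pushfq\n\tpopq %0" : "=r"(flags));
	return flags;
}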
diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
index 3ad34f01de2a..6eb8b50ea07e 100644
--- a/arch/x86/kernel/jailhouse.c
+++ b/arch/x86/kernel/jailhouse.c
@@ -11,6 +11,7 @@
#include <linux/acpi_pmtmr.h>
#include <linux/kernel.h>
#include <linux/reboot.h>
+#include <linux/serial_8250.h>
#include <asm/apic.h>
#include <asm/cpu.h>
#include <asm/hypervisor.h>
@@ -21,9 +22,24 @@
#include <asm/setup.h>
#include <asm/jailhouse_para.h>
-static __initdata struct jailhouse_setup_data setup_data;
+static struct jailhouse_setup_data setup_data;
+#define SETUP_DATA_V1_LEN (sizeof(setup_data.hdr) + sizeof(setup_data.v1))
+#define SETUP_DATA_V2_LEN (SETUP_DATA_V1_LEN + sizeof(setup_data.v2))
+
static unsigned int precalibrated_tsc_khz;
+static void jailhouse_setup_irq(unsigned int irq)
+{
+ struct mpc_intsrc mp_irq = {
+ .type = MP_INTSRC,
+ .irqtype = mp_INT,
+ .irqflag = MP_IRQPOL_ACTIVE_HIGH | MP_IRQTRIG_EDGE,
+ .srcbusirq = irq,
+ .dstirq = irq,
+ };
+ mp_save_irq(&mp_irq);
+}
+
static uint32_t jailhouse_cpuid_base(void)
{
if (boot_cpu_data.cpuid_level < 0 ||
@@ -45,7 +61,7 @@ static void jailhouse_get_wallclock(struct timespec64 *now)
static void __init jailhouse_timer_init(void)
{
- lapic_timer_period = setup_data.apic_khz * (1000 / HZ);
+ lapic_timer_period = setup_data.v1.apic_khz * (1000 / HZ);
}
static unsigned long jailhouse_get_tsc(void)
@@ -77,33 +93,28 @@ static void __init jailhouse_get_smp_config(unsigned int early)
.type = IOAPIC_DOMAIN_STRICT,
.ops = &mp_ioapic_irqdomain_ops,
};
- struct mpc_intsrc mp_irq = {
- .type = MP_INTSRC,
- .irqtype = mp_INT,
- .irqflag = MP_IRQPOL_ACTIVE_HIGH | MP_IRQTRIG_EDGE,
- };
unsigned int cpu;
jailhouse_x2apic_init();
register_lapic_address(0xfee00000);
- for (cpu = 0; cpu < setup_data.num_cpus; cpu++) {
- generic_processor_info(setup_data.cpu_ids[cpu],
+ for (cpu = 0; cpu < setup_data.v1.num_cpus; cpu++) {
+ generic_processor_info(setup_data.v1.cpu_ids[cpu],
boot_cpu_apic_version);
}
smp_found_config = 1;
- if (setup_data.standard_ioapic) {
+ if (setup_data.v1.standard_ioapic) {
mp_register_ioapic(0, 0xfec00000, gsi_top, &ioapic_cfg);
- /* Register 1:1 mapping for legacy UART IRQs 3 and 4 */
- mp_irq.srcbusirq = mp_irq.dstirq = 3;
- mp_save_irq(&mp_irq);
-
- mp_irq.srcbusirq = mp_irq.dstirq = 4;
- mp_save_irq(&mp_irq);
+ if (IS_ENABLED(CONFIG_SERIAL_8250) &&
+ setup_data.hdr.version < 2) {
+ /* Register 1:1 mapping for legacy UART IRQs 3 and 4 */
+ jailhouse_setup_irq(3);
+ jailhouse_setup_irq(4);
+ }
}
}
@@ -126,9 +137,9 @@ static int __init jailhouse_pci_arch_init(void)
pcibios_last_bus = 0xff;
#ifdef CONFIG_PCI_MMCONFIG
- if (setup_data.pci_mmconfig_base) {
+ if (setup_data.v1.pci_mmconfig_base) {
pci_mmconfig_add(0, 0, pcibios_last_bus,
- setup_data.pci_mmconfig_base);
+ setup_data.v1.pci_mmconfig_base);
pci_mmcfg_arch_init();
}
#endif
@@ -136,9 +147,57 @@ static int __init jailhouse_pci_arch_init(void)
return 0;
}
+#ifdef CONFIG_SERIAL_8250
+static inline bool jailhouse_uart_enabled(unsigned int uart_nr)
+{
+ return setup_data.v2.flags & BIT(uart_nr);
+}
+
+static void jailhouse_serial_fixup(int port, struct uart_port *up,
+ u32 *capabilities)
+{
+ static const u16 pcuart_base[] = {0x3f8, 0x2f8, 0x3e8, 0x2e8};
+ unsigned int n;
+
+ for (n = 0; n < ARRAY_SIZE(pcuart_base); n++) {
+ if (pcuart_base[n] != up->iobase)
+ continue;
+
+ if (jailhouse_uart_enabled(n)) {
+ pr_info("Enabling UART%u (port 0x%lx)\n", n,
+ up->iobase);
+ jailhouse_setup_irq(up->irq);
+ } else {
+ /* Deactivate UART if access isn't allowed */
+ up->iobase = 0;
+ }
+ break;
+ }
+}
+
+static void __init jailhouse_serial_workaround(void)
+{
+ /*
+ * There are flags inside setup_data that indicate availability of
+ * platform UARTs since setup data version 2.
+ *
+	 * In case of version 1, we don't know which UARTs belong to Linux. In
+ * this case, unconditionally register 1:1 mapping for legacy UART IRQs
+ * 3 and 4.
+ */
+ if (setup_data.hdr.version > 1)
+ serial8250_set_isa_configurator(jailhouse_serial_fixup);
+}
+#else /* !CONFIG_SERIAL_8250 */
+static inline void jailhouse_serial_workaround(void)
+{
+}
+#endif /* CONFIG_SERIAL_8250 */
+
static void __init jailhouse_init_platform(void)
{
u64 pa_data = boot_params.hdr.setup_data;
+ unsigned long setup_data_len;
struct setup_data header;
void *mapping;
@@ -163,16 +222,8 @@ static void __init jailhouse_init_platform(void)
memcpy(&header, mapping, sizeof(header));
early_memunmap(mapping, sizeof(header));
- if (header.type == SETUP_JAILHOUSE &&
- header.len >= sizeof(setup_data)) {
- pa_data += offsetof(struct setup_data, data);
-
- mapping = early_memremap(pa_data, sizeof(setup_data));
- memcpy(&setup_data, mapping, sizeof(setup_data));
- early_memunmap(mapping, sizeof(setup_data));
-
+ if (header.type == SETUP_JAILHOUSE)
break;
- }
pa_data = header.next;
}
@@ -180,13 +231,28 @@ static void __init jailhouse_init_platform(void)
if (!pa_data)
panic("Jailhouse: No valid setup data found");
- if (setup_data.compatible_version > JAILHOUSE_SETUP_REQUIRED_VERSION)
- panic("Jailhouse: Unsupported setup data structure");
-
- pmtmr_ioport = setup_data.pm_timer_address;
+ /* setup data must at least contain the header */
+ if (header.len < sizeof(setup_data.hdr))
+ goto unsupported;
+
+ pa_data += offsetof(struct setup_data, data);
+ setup_data_len = min_t(unsigned long, sizeof(setup_data),
+ (unsigned long)header.len);
+ mapping = early_memremap(pa_data, setup_data_len);
+ memcpy(&setup_data, mapping, setup_data_len);
+ early_memunmap(mapping, setup_data_len);
+
+ if (setup_data.hdr.version == 0 ||
+ setup_data.hdr.compatible_version !=
+ JAILHOUSE_SETUP_REQUIRED_VERSION ||
+ (setup_data.hdr.version == 1 && header.len < SETUP_DATA_V1_LEN) ||
+ (setup_data.hdr.version >= 2 && header.len < SETUP_DATA_V2_LEN))
+ goto unsupported;
+
+ pmtmr_ioport = setup_data.v1.pm_timer_address;
pr_debug("Jailhouse: PM-Timer IO Port: %#x\n", pmtmr_ioport);
- precalibrated_tsc_khz = setup_data.tsc_khz;
+ precalibrated_tsc_khz = setup_data.v1.tsc_khz;
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
pci_probe = 0;
@@ -196,6 +262,12 @@ static void __init jailhouse_init_platform(void)
* are none in a non-root cell.
*/
disable_acpi();
+
+ jailhouse_serial_workaround();
+ return;
+
+unsupported:
+ panic("Jailhouse: Unsupported setup data structure");
}
bool jailhouse_paravirt(void)
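The new validation in jailhouse_init_platform() accepts a setup_data blob only if its length covers every structure revision up to the advertised version. A minimal sketch of that check, with assumed, simplified struct layouts (not the real jailhouse_setup_data):

#include <stdbool.h>
#include <stddef.h>

struct hdr { unsigned int version, compatible_version; };
struct v1  { unsigned int pm_timer_address; /* ... */ };
struct v2  { unsigned int flags; /* ... */ };

#define V1_LEN (sizeof(struct hdr) + sizeof(struct v1))
#define V2_LEN (V1_LEN + sizeof(struct v2))

static bool setup_data_valid(const struct hdr *h, size_t len,
			     unsigned int required)
{
	if (h->version == 0 || h->compatible_version != required)
		return false;
	if (h->version == 1 && len < V1_LEN)
		return false;			/* v1 payload truncated */
	if (h->version >= 2 && len < V2_LEN)
		return false;			/* v2 payload truncated */
	return true;
}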
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 044053235302..c1a8b9e71408 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -89,8 +89,7 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
return;
}
- text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE,
- (void *)jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
+ text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE, NULL);
}
void arch_jump_label_transform(struct jump_entry *entry,
@@ -147,11 +146,9 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry,
}
__jump_label_set_jump_code(entry, type,
- (union jump_code_union *) &tp->opcode, 0);
+ (union jump_code_union *)&tp->text, 0);
- tp->addr = entry_code;
- tp->detour = entry_code + JUMP_LABEL_NOP_SIZE;
- tp->len = JUMP_LABEL_NOP_SIZE;
+ text_poke_loc_init(tp, entry_code, NULL, JUMP_LABEL_NOP_SIZE, NULL);
tp_vec_nr++;
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index edaa30b20841..64b6da95af98 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -44,7 +44,12 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
if (count > node->len - pos)
count = node->len - pos;
- pa = node->paddr + sizeof(struct setup_data) + pos;
+ pa = node->paddr + pos;
+
+	/* Is it direct data or an invalid indirect entry? */
+ if (!(node->type & SETUP_INDIRECT) || node->type == SETUP_INDIRECT)
+ pa += sizeof(struct setup_data);
+
p = memremap(pa, count, MEMREMAP_WB);
if (!p)
return -ENOMEM;
@@ -108,9 +113,17 @@ static int __init create_setup_data_nodes(struct dentry *parent)
goto err_dir;
}
- node->paddr = pa_data;
- node->type = data->type;
- node->len = data->len;
+ if (data->type == SETUP_INDIRECT &&
+ ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) {
+ node->paddr = ((struct setup_indirect *)data->data)->addr;
+ node->type = ((struct setup_indirect *)data->data)->type;
+ node->len = ((struct setup_indirect *)data->data)->len;
+ } else {
+ node->paddr = pa_data;
+ node->type = data->type;
+ node->len = data->len;
+ }
+
create_setup_data_node(d, no, node);
pa_data = data->next;
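The kdebugfs (and, below, ksysfs) changes resolve SETUP_INDIRECT entries: a valid indirect entry redirects address/type/length to an external buffer, while everything else reads the payload that directly follows the setup_data header. A sketch of that lookup rule with illustrative field and constant names (not the kernel's structs):

#include <stdint.h>

#define IND_TYPE 0x80000000u	/* stand-in for SETUP_INDIRECT */

struct sd          { uint32_t type, len; uint8_t payload[]; };
struct sd_indirect { uint32_t type; uint64_t len, addr; };

static void resolve(const struct sd *d, uint64_t pa_header,
		    uint64_t *paddr, uint64_t *len)
{
	const struct sd_indirect *ind = (const void *)d->payload;

	if (d->type == IND_TYPE && ind->type != IND_TYPE) {
		*paddr = ind->addr;			/* payload elsewhere */
		*len   = ind->len;
	} else {
		*paddr = pa_header + sizeof(*d);	/* payload inline */
		*len   = d->len;
	}
}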
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 43fc13c831af..4f13af7cbcdb 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -351,6 +351,10 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
kernel_insn_init(insn, dest, MAX_INSN_SIZE);
insn_get_length(insn);
+	/* We cannot probe an instruction that has an emulate prefix */
+ if (insn_has_emulate_prefix(insn))
+ return 0;
+
/* Another subsystem puts a breakpoint, failed to recover */
if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
return 0;
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index b348dd506d58..8900329c28a7 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -437,8 +437,7 @@ void arch_optimize_kprobes(struct list_head *oplist)
insn_buff[0] = RELATIVEJUMP_OPCODE;
*(s32 *)(&insn_buff[1]) = rel;
- text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE,
- op->optinsn.insn);
+ text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, NULL);
list_del_init(&op->list);
}
@@ -448,12 +447,18 @@ void arch_optimize_kprobes(struct list_head *oplist)
void arch_unoptimize_kprobe(struct optimized_kprobe *op)
{
u8 insn_buff[RELATIVEJUMP_SIZE];
+ u8 emulate_buff[RELATIVEJUMP_SIZE];
/* Set int3 to first byte for kprobes */
insn_buff[0] = BREAKPOINT_INSTRUCTION;
memcpy(insn_buff + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+
+ emulate_buff[0] = RELATIVEJUMP_OPCODE;
+ *(s32 *)(&emulate_buff[1]) = (s32)((long)op->optinsn.insn -
+ ((long)op->kp.addr + RELATIVEJUMP_SIZE));
+
text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE,
- op->optinsn.insn);
+ emulate_buff);
}
/*
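The emulate_buff construction in arch_unoptimize_kprobe() encodes a near jump back to the out-of-line instruction copy. The displacement math, as a standalone sketch (not kernel code): a 5-byte x86 jmp (opcode 0xe9 plus rel32) is relative to the address of the next instruction:

#include <stdint.h>
#include <string.h>

#define RELATIVEJUMP_SIZE 5

static void encode_jmp(uint8_t buf[RELATIVEJUMP_SIZE],
		       unsigned long insn_addr, unsigned long target)
{
	/* rel32 is measured from the byte after the 5-byte instruction */
	int32_t rel = (int32_t)(target - (insn_addr + RELATIVEJUMP_SIZE));

	buf[0] = 0xe9;			/* RELATIVEJUMP_OPCODE */
	memcpy(&buf[1], &rel, sizeof(rel));
}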
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
index 7969da939213..d0a19121c6a4 100644
--- a/arch/x86/kernel/ksysfs.c
+++ b/arch/x86/kernel/ksysfs.c
@@ -100,7 +100,12 @@ static int __init get_setup_data_size(int nr, size_t *size)
if (!data)
return -ENOMEM;
if (nr == i) {
- *size = data->len;
+ if (data->type == SETUP_INDIRECT &&
+ ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT)
+ *size = ((struct setup_indirect *)data->data)->len;
+ else
+ *size = data->len;
+
memunmap(data);
return 0;
}
@@ -130,7 +135,10 @@ static ssize_t type_show(struct kobject *kobj,
if (!data)
return -ENOMEM;
- ret = sprintf(buf, "0x%x\n", data->type);
+ if (data->type == SETUP_INDIRECT)
+ ret = sprintf(buf, "0x%x\n", ((struct setup_indirect *)data->data)->type);
+ else
+ ret = sprintf(buf, "0x%x\n", data->type);
memunmap(data);
return ret;
}
@@ -142,7 +150,7 @@ static ssize_t setup_data_data_read(struct file *fp,
loff_t off, size_t count)
{
int nr, ret = 0;
- u64 paddr;
+ u64 paddr, len;
struct setup_data *data;
void *p;
@@ -157,19 +165,28 @@ static ssize_t setup_data_data_read(struct file *fp,
if (!data)
return -ENOMEM;
- if (off > data->len) {
+ if (data->type == SETUP_INDIRECT &&
+ ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) {
+ paddr = ((struct setup_indirect *)data->data)->addr;
+ len = ((struct setup_indirect *)data->data)->len;
+ } else {
+ paddr += sizeof(*data);
+ len = data->len;
+ }
+
+ if (off > len) {
ret = -EINVAL;
goto out;
}
- if (count > data->len - off)
- count = data->len - off;
+ if (count > len - off)
+ count = len - off;
if (!count)
goto out;
ret = count;
- p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_WB);
+ p = memremap(paddr, len, MEMREMAP_WB);
if (!p) {
ret = -ENOMEM;
goto out;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index e820568ed4d5..32ef1ee733b7 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -33,6 +33,7 @@
#include <asm/apicdef.h>
#include <asm/hypervisor.h>
#include <asm/tlb.h>
+#include <asm/cpuidle_haltpoll.h>
static int kvmapf = 1;
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 5dcd438ad8f2..16e125a50b33 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -298,48 +298,6 @@ static void load_segments(void)
);
}
-#ifdef CONFIG_KEXEC_FILE
-/* Update purgatory as needed after various image segments have been prepared */
-static int arch_update_purgatory(struct kimage *image)
-{
- int ret = 0;
-
- if (!image->file_mode)
- return 0;
-
- /* Setup copying of backup region */
- if (image->type == KEXEC_TYPE_CRASH) {
- ret = kexec_purgatory_get_set_symbol(image,
- "purgatory_backup_dest",
- &image->arch.backup_load_addr,
- sizeof(image->arch.backup_load_addr), 0);
- if (ret)
- return ret;
-
- ret = kexec_purgatory_get_set_symbol(image,
- "purgatory_backup_src",
- &image->arch.backup_src_start,
- sizeof(image->arch.backup_src_start), 0);
- if (ret)
- return ret;
-
- ret = kexec_purgatory_get_set_symbol(image,
- "purgatory_backup_sz",
- &image->arch.backup_src_sz,
- sizeof(image->arch.backup_src_sz), 0);
- if (ret)
- return ret;
- }
-
- return ret;
-}
-#else /* !CONFIG_KEXEC_FILE */
-static inline int arch_update_purgatory(struct kimage *image)
-{
- return 0;
-}
-#endif /* CONFIG_KEXEC_FILE */
-
int machine_kexec_prepare(struct kimage *image)
{
unsigned long start_pgtable;
@@ -353,11 +311,6 @@ int machine_kexec_prepare(struct kimage *image)
if (result)
return result;
- /* update purgatory as needed */
- result = arch_update_purgatory(image);
- if (result)
- return result;
-
return 0;
}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 59d3d2763a9e..789f5e4f89de 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -341,8 +341,6 @@ struct paravirt_patch_template pv_ops = {
.cpu.iret = native_iret,
.cpu.swapgs = native_swapgs,
- .cpu.set_iopl_mask = native_set_iopl_mask,
-
.cpu.start_context_switch = paravirt_nop,
.cpu.end_context_switch = paravirt_nop,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
deleted file mode 100644
index 23fdec030c37..000000000000
--- a/arch/x86/kernel/pci-calgary_64.c
+++ /dev/null
@@ -1,1586 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Derived from arch/powerpc/kernel/iommu.c
- *
- * Copyright IBM Corporation, 2006-2007
- * Copyright (C) 2006 Jon Mason <jdmason@kudzu.us>
- *
- * Author: Jon Mason <jdmason@kudzu.us>
- * Author: Muli Ben-Yehuda <muli@il.ibm.com>
-
- */
-
-#define pr_fmt(fmt) "Calgary: " fmt
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/string.h>
-#include <linux/crash_dump.h>
-#include <linux/dma-mapping.h>
-#include <linux/dma-direct.h>
-#include <linux/bitmap.h>
-#include <linux/pci_ids.h>
-#include <linux/pci.h>
-#include <linux/delay.h>
-#include <linux/scatterlist.h>
-#include <linux/iommu-helper.h>
-
-#include <asm/iommu.h>
-#include <asm/calgary.h>
-#include <asm/tce.h>
-#include <asm/pci-direct.h>
-#include <asm/dma.h>
-#include <asm/rio.h>
-#include <asm/bios_ebda.h>
-#include <asm/x86_init.h>
-#include <asm/iommu_table.h>
-
-#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
-int use_calgary __read_mostly = 1;
-#else
-int use_calgary __read_mostly = 0;
-#endif /* CONFIG_CALGARY_DEFAULT_ENABLED */
-
-#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1
-#define PCI_DEVICE_ID_IBM_CALIOC2 0x0308
-
-/* register offsets inside the host bridge space */
-#define CALGARY_CONFIG_REG 0x0108
-#define PHB_CSR_OFFSET 0x0110 /* Channel Status */
-#define PHB_PLSSR_OFFSET 0x0120
-#define PHB_CONFIG_RW_OFFSET 0x0160
-#define PHB_IOBASE_BAR_LOW 0x0170
-#define PHB_IOBASE_BAR_HIGH 0x0180
-#define PHB_MEM_1_LOW 0x0190
-#define PHB_MEM_1_HIGH 0x01A0
-#define PHB_IO_ADDR_SIZE 0x01B0
-#define PHB_MEM_1_SIZE 0x01C0
-#define PHB_MEM_ST_OFFSET 0x01D0
-#define PHB_AER_OFFSET 0x0200
-#define PHB_CONFIG_0_HIGH 0x0220
-#define PHB_CONFIG_0_LOW 0x0230
-#define PHB_CONFIG_0_END 0x0240
-#define PHB_MEM_2_LOW 0x02B0
-#define PHB_MEM_2_HIGH 0x02C0
-#define PHB_MEM_2_SIZE_HIGH 0x02D0
-#define PHB_MEM_2_SIZE_LOW 0x02E0
-#define PHB_DOSHOLE_OFFSET 0x08E0
-
-/* CalIOC2 specific */
-#define PHB_SAVIOR_L2 0x0DB0
-#define PHB_PAGE_MIG_CTRL 0x0DA8
-#define PHB_PAGE_MIG_DEBUG 0x0DA0
-#define PHB_ROOT_COMPLEX_STATUS 0x0CB0
-
-/* PHB_CONFIG_RW */
-#define PHB_TCE_ENABLE 0x20000000
-#define PHB_SLOT_DISABLE 0x1C000000
-#define PHB_DAC_DISABLE 0x01000000
-#define PHB_MEM2_ENABLE 0x00400000
-#define PHB_MCSR_ENABLE 0x00100000
-/* TAR (Table Address Register) */
-#define TAR_SW_BITS 0x0000ffffffff800fUL
-#define TAR_VALID 0x0000000000000008UL
-/* CSR (Channel/DMA Status Register) */
-#define CSR_AGENT_MASK 0xffe0ffff
-/* CCR (Calgary Configuration Register) */
-#define CCR_2SEC_TIMEOUT 0x000000000000000EUL
-/* PMCR/PMDR (Page Migration Control/Debug Registers */
-#define PMR_SOFTSTOP 0x80000000
-#define PMR_SOFTSTOPFAULT 0x40000000
-#define PMR_HARDSTOP 0x20000000
-
-/*
- * The maximum PHB bus number.
- * x3950M2 (rare): 8 chassis, 48 PHBs per chassis = 384
- * x3950M2: 4 chassis, 48 PHBs per chassis = 192
- * x3950 (PCIE): 8 chassis, 32 PHBs per chassis = 256
- * x3950 (PCIX): 8 chassis, 16 PHBs per chassis = 128
- */
-#define MAX_PHB_BUS_NUM 256
-
-#define PHBS_PER_CALGARY 4
-
-/* register offsets in Calgary's internal register space */
-static const unsigned long tar_offsets[] = {
- 0x0580 /* TAR0 */,
- 0x0588 /* TAR1 */,
- 0x0590 /* TAR2 */,
- 0x0598 /* TAR3 */
-};
-
-static const unsigned long split_queue_offsets[] = {
- 0x4870 /* SPLIT QUEUE 0 */,
- 0x5870 /* SPLIT QUEUE 1 */,
- 0x6870 /* SPLIT QUEUE 2 */,
- 0x7870 /* SPLIT QUEUE 3 */
-};
-
-static const unsigned long phb_offsets[] = {
- 0x8000 /* PHB0 */,
- 0x9000 /* PHB1 */,
- 0xA000 /* PHB2 */,
- 0xB000 /* PHB3 */
-};
-
-/* PHB debug registers */
-
-static const unsigned long phb_debug_offsets[] = {
- 0x4000 /* PHB 0 DEBUG */,
- 0x5000 /* PHB 1 DEBUG */,
- 0x6000 /* PHB 2 DEBUG */,
- 0x7000 /* PHB 3 DEBUG */
-};
-
-/*
- * STUFF register for each debug PHB,
- * byte 1 = start bus number, byte 2 = end bus number
- */
-
-#define PHB_DEBUG_STUFF_OFFSET 0x0020
-
-unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
-static int translate_empty_slots __read_mostly = 0;
-static int calgary_detected __read_mostly = 0;
-
-static struct rio_table_hdr *rio_table_hdr __initdata;
-static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
-static struct rio_detail *rio_devs[MAX_NUMNODES * 4] __initdata;
-
-struct calgary_bus_info {
- void *tce_space;
- unsigned char translation_disabled;
- signed char phbid;
- void __iomem *bbar;
-};
-
-static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
-static void calgary_tce_cache_blast(struct iommu_table *tbl);
-static void calgary_dump_error_regs(struct iommu_table *tbl);
-static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
-static void calioc2_tce_cache_blast(struct iommu_table *tbl);
-static void calioc2_dump_error_regs(struct iommu_table *tbl);
-static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl);
-static void get_tce_space_from_tar(void);
-
-static const struct cal_chipset_ops calgary_chip_ops = {
- .handle_quirks = calgary_handle_quirks,
- .tce_cache_blast = calgary_tce_cache_blast,
- .dump_error_regs = calgary_dump_error_regs
-};
-
-static const struct cal_chipset_ops calioc2_chip_ops = {
- .handle_quirks = calioc2_handle_quirks,
- .tce_cache_blast = calioc2_tce_cache_blast,
- .dump_error_regs = calioc2_dump_error_regs
-};
-
-static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
-
-static inline int translation_enabled(struct iommu_table *tbl)
-{
- /* only PHBs with translation enabled have an IOMMU table */
- return (tbl != NULL);
-}
-
-static void iommu_range_reserve(struct iommu_table *tbl,
- unsigned long start_addr, unsigned int npages)
-{
- unsigned long index;
- unsigned long end;
- unsigned long flags;
-
- index = start_addr >> PAGE_SHIFT;
-
- /* bail out if we're asked to reserve a region we don't cover */
- if (index >= tbl->it_size)
- return;
-
- end = index + npages;
- if (end > tbl->it_size) /* don't go off the table */
- end = tbl->it_size;
-
- spin_lock_irqsave(&tbl->it_lock, flags);
-
- bitmap_set(tbl->it_map, index, npages);
-
- spin_unlock_irqrestore(&tbl->it_lock, flags);
-}
-
-static unsigned long iommu_range_alloc(struct device *dev,
- struct iommu_table *tbl,
- unsigned int npages)
-{
- unsigned long flags;
- unsigned long offset;
- unsigned long boundary_size;
-
- boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
- PAGE_SIZE) >> PAGE_SHIFT;
-
- BUG_ON(npages == 0);
-
- spin_lock_irqsave(&tbl->it_lock, flags);
-
- offset = iommu_area_alloc(tbl->it_map, tbl->it_size, tbl->it_hint,
- npages, 0, boundary_size, 0);
- if (offset == ~0UL) {
- tbl->chip_ops->tce_cache_blast(tbl);
-
- offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0,
- npages, 0, boundary_size, 0);
- if (offset == ~0UL) {
- pr_warn("IOMMU full\n");
- spin_unlock_irqrestore(&tbl->it_lock, flags);
- if (panic_on_overflow)
- panic("Calgary: fix the allocator.\n");
- else
- return DMA_MAPPING_ERROR;
- }
- }
-
- tbl->it_hint = offset + npages;
- BUG_ON(tbl->it_hint > tbl->it_size);
-
- spin_unlock_irqrestore(&tbl->it_lock, flags);
-
- return offset;
-}
-
-static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
- void *vaddr, unsigned int npages, int direction)
-{
- unsigned long entry;
- dma_addr_t ret;
-
- entry = iommu_range_alloc(dev, tbl, npages);
- if (unlikely(entry == DMA_MAPPING_ERROR)) {
- pr_warn("failed to allocate %u pages in iommu %p\n",
- npages, tbl);
- return DMA_MAPPING_ERROR;
- }
-
- /* set the return dma address */
- ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK);
-
- /* put the TCEs in the HW table */
- tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK,
- direction);
- return ret;
-}
-
-static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
- unsigned int npages)
-{
- unsigned long entry;
- unsigned long flags;
-
- /* were we called with bad_dma_address? */
- if (unlikely(dma_addr == DMA_MAPPING_ERROR)) {
- WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
- "address 0x%Lx\n", dma_addr);
- return;
- }
-
- entry = dma_addr >> PAGE_SHIFT;
-
- BUG_ON(entry + npages > tbl->it_size);
-
- tce_free(tbl, entry, npages);
-
- spin_lock_irqsave(&tbl->it_lock, flags);
-
- bitmap_clear(tbl->it_map, entry, npages);
-
- spin_unlock_irqrestore(&tbl->it_lock, flags);
-}
-
-static inline struct iommu_table *find_iommu_table(struct device *dev)
-{
- struct pci_dev *pdev;
- struct pci_bus *pbus;
- struct iommu_table *tbl;
-
- pdev = to_pci_dev(dev);
-
- /* search up the device tree for an iommu */
- pbus = pdev->bus;
- do {
- tbl = pci_iommu(pbus);
- if (tbl && tbl->it_busno == pbus->number)
- break;
- tbl = NULL;
- pbus = pbus->parent;
- } while (pbus);
-
- BUG_ON(tbl && (tbl->it_busno != pbus->number));
-
- return tbl;
-}
-
-static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist,
- int nelems,enum dma_data_direction dir,
- unsigned long attrs)
-{
- struct iommu_table *tbl = find_iommu_table(dev);
- struct scatterlist *s;
- int i;
-
- if (!translation_enabled(tbl))
- return;
-
- for_each_sg(sglist, s, nelems, i) {
- unsigned int npages;
- dma_addr_t dma = s->dma_address;
- unsigned int dmalen = s->dma_length;
-
- if (dmalen == 0)
- break;
-
- npages = iommu_num_pages(dma, dmalen, PAGE_SIZE);
- iommu_free(tbl, dma, npages);
- }
-}
-
-static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
- int nelems, enum dma_data_direction dir,
- unsigned long attrs)
-{
- struct iommu_table *tbl = find_iommu_table(dev);
- struct scatterlist *s;
- unsigned long vaddr;
- unsigned int npages;
- unsigned long entry;
- int i;
-
- for_each_sg(sg, s, nelems, i) {
- BUG_ON(!sg_page(s));
-
- vaddr = (unsigned long) sg_virt(s);
- npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
-
- entry = iommu_range_alloc(dev, tbl, npages);
- if (entry == DMA_MAPPING_ERROR) {
- /* makes sure unmap knows to stop */
- s->dma_length = 0;
- goto error;
- }
-
- s->dma_address = (entry << PAGE_SHIFT) | s->offset;
-
- /* insert into HW table */
- tce_build(tbl, entry, npages, vaddr & PAGE_MASK, dir);
-
- s->dma_length = s->length;
- }
-
- return nelems;
-error:
- calgary_unmap_sg(dev, sg, nelems, dir, 0);
- for_each_sg(sg, s, nelems, i) {
- sg->dma_address = DMA_MAPPING_ERROR;
- sg->dma_length = 0;
- }
- return 0;
-}
-
-static dma_addr_t calgary_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction dir,
- unsigned long attrs)
-{
- void *vaddr = page_address(page) + offset;
- unsigned long uaddr;
- unsigned int npages;
- struct iommu_table *tbl = find_iommu_table(dev);
-
- uaddr = (unsigned long)vaddr;
- npages = iommu_num_pages(uaddr, size, PAGE_SIZE);
-
- return iommu_alloc(dev, tbl, vaddr, npages, dir);
-}
-
-static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr,
- size_t size, enum dma_data_direction dir,
- unsigned long attrs)
-{
- struct iommu_table *tbl = find_iommu_table(dev);
- unsigned int npages;
-
- npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
- iommu_free(tbl, dma_addr, npages);
-}
-
-static void* calgary_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs)
-{
- void *ret = NULL;
- dma_addr_t mapping;
- unsigned int npages, order;
- struct iommu_table *tbl = find_iommu_table(dev);
-
- size = PAGE_ALIGN(size); /* size rounded up to full pages */
- npages = size >> PAGE_SHIFT;
- order = get_order(size);
-
- /* alloc enough pages (and possibly more) */
- ret = (void *)__get_free_pages(flag, order);
- if (!ret)
- goto error;
- memset(ret, 0, size);
-
- /* set up tces to cover the allocated range */
- mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
- if (mapping == DMA_MAPPING_ERROR)
- goto free;
- *dma_handle = mapping;
- return ret;
-free:
- free_pages((unsigned long)ret, get_order(size));
- ret = NULL;
-error:
- return ret;
-}
-
-static void calgary_free_coherent(struct device *dev, size_t size,
- void *vaddr, dma_addr_t dma_handle,
- unsigned long attrs)
-{
- unsigned int npages;
- struct iommu_table *tbl = find_iommu_table(dev);
-
- size = PAGE_ALIGN(size);
- npages = size >> PAGE_SHIFT;
-
- iommu_free(tbl, dma_handle, npages);
- free_pages((unsigned long)vaddr, get_order(size));
-}
-
-static const struct dma_map_ops calgary_dma_ops = {
- .alloc = calgary_alloc_coherent,
- .free = calgary_free_coherent,
- .map_sg = calgary_map_sg,
- .unmap_sg = calgary_unmap_sg,
- .map_page = calgary_map_page,
- .unmap_page = calgary_unmap_page,
- .dma_supported = dma_direct_supported,
- .mmap = dma_common_mmap,
- .get_sgtable = dma_common_get_sgtable,
-};
-
-static inline void __iomem * busno_to_bbar(unsigned char num)
-{
- return bus_info[num].bbar;
-}
-
-static inline int busno_to_phbid(unsigned char num)
-{
- return bus_info[num].phbid;
-}
-
-static inline unsigned long split_queue_offset(unsigned char num)
-{
- size_t idx = busno_to_phbid(num);
-
- return split_queue_offsets[idx];
-}
-
-static inline unsigned long tar_offset(unsigned char num)
-{
- size_t idx = busno_to_phbid(num);
-
- return tar_offsets[idx];
-}
-
-static inline unsigned long phb_offset(unsigned char num)
-{
- size_t idx = busno_to_phbid(num);
-
- return phb_offsets[idx];
-}
-
-static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset)
-{
- unsigned long target = ((unsigned long)bar) | offset;
- return (void __iomem*)target;
-}
-
-static inline int is_calioc2(unsigned short device)
-{
- return (device == PCI_DEVICE_ID_IBM_CALIOC2);
-}
-
-static inline int is_calgary(unsigned short device)
-{
- return (device == PCI_DEVICE_ID_IBM_CALGARY);
-}
-
-static inline int is_cal_pci_dev(unsigned short device)
-{
- return (is_calgary(device) || is_calioc2(device));
-}
-
-static void calgary_tce_cache_blast(struct iommu_table *tbl)
-{
- u64 val;
- u32 aer;
- int i = 0;
- void __iomem *bbar = tbl->bbar;
- void __iomem *target;
-
- /* disable arbitration on the bus */
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
- aer = readl(target);
- writel(0, target);
-
- /* read plssr to ensure it got there */
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET);
- val = readl(target);
-
- /* poll split queues until all DMA activity is done */
- target = calgary_reg(bbar, split_queue_offset(tbl->it_busno));
- do {
- val = readq(target);
- i++;
- } while ((val & 0xff) != 0xff && i < 100);
- if (i == 100)
- pr_warn("PCI bus not quiesced, continuing anyway\n");
-
- /* invalidate TCE cache */
- target = calgary_reg(bbar, tar_offset(tbl->it_busno));
- writeq(tbl->tar_val, target);
-
- /* enable arbitration */
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
- writel(aer, target);
- (void)readl(target); /* flush */
-}
-
-static void calioc2_tce_cache_blast(struct iommu_table *tbl)
-{
- void __iomem *bbar = tbl->bbar;
- void __iomem *target;
- u64 val64;
- u32 val;
- int i = 0;
- int count = 1;
- unsigned char bus = tbl->it_busno;
-
-begin:
- printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast "
- "sequence - count %d\n", bus, count);
-
- /* 1. using the Page Migration Control reg set SoftStop */
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target);
- val |= PMR_SOFTSTOP;
- printk(KERN_DEBUG "1b. writing 0x%x [LE] to %p\n", val, target);
- writel(cpu_to_be32(val), target);
-
- /* 2. poll split queues until all DMA activity is done */
- printk(KERN_DEBUG "2a. starting to poll split queues\n");
- target = calgary_reg(bbar, split_queue_offset(bus));
- do {
- val64 = readq(target);
- i++;
- } while ((val64 & 0xff) != 0xff && i < 100);
- if (i == 100)
- pr_warn("CalIOC2: PCI bus not quiesced, continuing anyway\n");
-
- /* 3. poll Page Migration DEBUG for SoftStopFault */
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target);
-
- /* 4. if SoftStopFault - goto (1) */
- if (val & PMR_SOFTSTOPFAULT) {
- if (++count < 100)
- goto begin;
- else {
- pr_warn("CalIOC2: too many SoftStopFaults, aborting TCE cache flush sequence!\n");
- return; /* pray for the best */
- }
- }
-
- /* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
- printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target);
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target);
-
- /* 6. invalidate TCE cache */
- printk(KERN_DEBUG "6. invalidating TCE cache\n");
- target = calgary_reg(bbar, tar_offset(bus));
- writeq(tbl->tar_val, target);
-
- /* 7. Re-read PMCR */
- printk(KERN_DEBUG "7a. Re-reading PMCR\n");
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target);
-
- /* 8. Remove HardStop */
- printk(KERN_DEBUG "8a. removing HardStop from PMCR\n");
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
- val = 0;
- printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target);
- writel(cpu_to_be32(val), target);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "8c. read 0x%x [LE] from %p\n", val, target);
-}
-
-static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start,
- u64 limit)
-{
- unsigned int numpages;
-
- limit = limit | 0xfffff;
- limit++;
-
- numpages = ((limit - start) >> PAGE_SHIFT);
- iommu_range_reserve(pci_iommu(dev->bus), start, numpages);
-}
-
-static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev)
-{
- void __iomem *target;
- u64 low, high, sizelow;
- u64 start, limit;
- struct iommu_table *tbl = pci_iommu(dev->bus);
- unsigned char busnum = dev->bus->number;
- void __iomem *bbar = tbl->bbar;
-
- /* peripheral MEM_1 region */
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_LOW);
- low = be32_to_cpu(readl(target));
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH);
- high = be32_to_cpu(readl(target));
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE);
- sizelow = be32_to_cpu(readl(target));
-
- start = (high << 32) | low;
- limit = sizelow;
-
- calgary_reserve_mem_region(dev, start, limit);
-}
-
-static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev)
-{
- void __iomem *target;
- u32 val32;
- u64 low, high, sizelow, sizehigh;
- u64 start, limit;
- struct iommu_table *tbl = pci_iommu(dev->bus);
- unsigned char busnum = dev->bus->number;
- void __iomem *bbar = tbl->bbar;
-
- /* is it enabled? */
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
- val32 = be32_to_cpu(readl(target));
- if (!(val32 & PHB_MEM2_ENABLE))
- return;
-
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW);
- low = be32_to_cpu(readl(target));
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH);
- high = be32_to_cpu(readl(target));
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW);
- sizelow = be32_to_cpu(readl(target));
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH);
- sizehigh = be32_to_cpu(readl(target));
-
- start = (high << 32) | low;
- limit = (sizehigh << 32) | sizelow;
-
- calgary_reserve_mem_region(dev, start, limit);
-}
-
-/*
- * some regions of the IO address space do not get translated, so we
- * must not give devices IO addresses in those regions. The regions
- * are the 640KB-1MB region and the two PCI peripheral memory holes.
- * Reserve all of them in the IOMMU bitmap to avoid giving them out
- * later.
- */
-static void __init calgary_reserve_regions(struct pci_dev *dev)
-{
- unsigned int npages;
- u64 start;
- struct iommu_table *tbl = pci_iommu(dev->bus);
-
- /* avoid the BIOS/VGA first 640KB-1MB region */
- /* for CalIOC2 - avoid the entire first MB */
- if (is_calgary(dev->device)) {
- start = (640 * 1024);
- npages = ((1024 - 640) * 1024) >> PAGE_SHIFT;
- } else { /* calioc2 */
- start = 0;
- npages = (1 * 1024 * 1024) >> PAGE_SHIFT;
- }
- iommu_range_reserve(tbl, start, npages);
-
- /* reserve the two PCI peripheral memory regions in IO space */
- calgary_reserve_peripheral_mem_1(dev);
- calgary_reserve_peripheral_mem_2(dev);
-}
-
-static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
-{
- u64 val64;
- u64 table_phys;
- void __iomem *target;
- int ret;
- struct iommu_table *tbl;
-
- /* build TCE tables for each PHB */
- ret = build_tce_table(dev, bbar);
- if (ret)
- return ret;
-
- tbl = pci_iommu(dev->bus);
- tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
-
- if (is_kdump_kernel())
- calgary_init_bitmap_from_tce_table(tbl);
- else
- tce_free(tbl, 0, tbl->it_size);
-
- if (is_calgary(dev->device))
- tbl->chip_ops = &calgary_chip_ops;
- else if (is_calioc2(dev->device))
- tbl->chip_ops = &calioc2_chip_ops;
- else
- BUG();
-
- calgary_reserve_regions(dev);
-
- /* set TARs for each PHB */
- target = calgary_reg(bbar, tar_offset(dev->bus->number));
- val64 = be64_to_cpu(readq(target));
-
- /* zero out all TAR bits under sw control */
- val64 &= ~TAR_SW_BITS;
- table_phys = (u64)__pa(tbl->it_base);
-
- val64 |= table_phys;
-
- BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M);
- val64 |= (u64) specified_table_size;
-
- tbl->tar_val = cpu_to_be64(val64);
-
- writeq(tbl->tar_val, target);
- readq(target); /* flush */
-
- return 0;
-}
-
-static void __init calgary_free_bus(struct pci_dev *dev)
-{
- u64 val64;
- struct iommu_table *tbl = pci_iommu(dev->bus);
- void __iomem *target;
- unsigned int bitmapsz;
-
- target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number));
- val64 = be64_to_cpu(readq(target));
- val64 &= ~TAR_SW_BITS;
- writeq(cpu_to_be64(val64), target);
- readq(target); /* flush */
-
- bitmapsz = tbl->it_size / BITS_PER_BYTE;
- free_pages((unsigned long)tbl->it_map, get_order(bitmapsz));
- tbl->it_map = NULL;
-
- kfree(tbl);
-
- set_pci_iommu(dev->bus, NULL);
-
- /* Can't free bootmem allocated memory after system is up :-( */
- bus_info[dev->bus->number].tce_space = NULL;
-}
-
-static void calgary_dump_error_regs(struct iommu_table *tbl)
-{
- void __iomem *bbar = tbl->bbar;
- void __iomem *target;
- u32 csr, plssr;
-
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET);
- csr = be32_to_cpu(readl(target));
-
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET);
- plssr = be32_to_cpu(readl(target));
-
- /* If no error, the agent ID in the CSR is not valid */
- pr_emerg("DMA error on Calgary PHB 0x%x, 0x%08x@CSR 0x%08x@PLSSR\n",
- tbl->it_busno, csr, plssr);
-}
-
-static void calioc2_dump_error_regs(struct iommu_table *tbl)
-{
- void __iomem *bbar = tbl->bbar;
- u32 csr, csmr, plssr, mck, rcstat;
- void __iomem *target;
- unsigned long phboff = phb_offset(tbl->it_busno);
- unsigned long erroff;
- u32 errregs[7];
- int i;
-
- /* dump CSR */
- target = calgary_reg(bbar, phboff | PHB_CSR_OFFSET);
- csr = be32_to_cpu(readl(target));
- /* dump PLSSR */
- target = calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET);
- plssr = be32_to_cpu(readl(target));
- /* dump CSMR */
- target = calgary_reg(bbar, phboff | 0x290);
- csmr = be32_to_cpu(readl(target));
- /* dump mck */
- target = calgary_reg(bbar, phboff | 0x800);
- mck = be32_to_cpu(readl(target));
-
- pr_emerg("DMA error on CalIOC2 PHB 0x%x\n", tbl->it_busno);
-
- pr_emerg("0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n",
- csr, plssr, csmr, mck);
-
- /* dump rest of error regs */
- pr_emerg("");
- for (i = 0; i < ARRAY_SIZE(errregs); i++) {
- /* err regs are at 0x810 - 0x870 */
- erroff = (0x810 + (i * 0x10));
- target = calgary_reg(bbar, phboff | erroff);
- errregs[i] = be32_to_cpu(readl(target));
- pr_cont("0x%08x@0x%lx ", errregs[i], erroff);
- }
- pr_cont("\n");
-
- /* root complex status */
- target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS);
- rcstat = be32_to_cpu(readl(target));
- printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat,
- PHB_ROOT_COMPLEX_STATUS);
-}
-
-static void calgary_watchdog(struct timer_list *t)
-{
- struct iommu_table *tbl = from_timer(tbl, t, watchdog_timer);
- void __iomem *bbar = tbl->bbar;
- u32 val32;
- void __iomem *target;
-
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET);
- val32 = be32_to_cpu(readl(target));
-
- /* If no error, the agent ID in the CSR is not valid */
- if (val32 & CSR_AGENT_MASK) {
- tbl->chip_ops->dump_error_regs(tbl);
-
- /* reset error */
- writel(0, target);
-
- /* Disable bus that caused the error */
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) |
- PHB_CONFIG_RW_OFFSET);
- val32 = be32_to_cpu(readl(target));
- val32 |= PHB_SLOT_DISABLE;
- writel(cpu_to_be32(val32), target);
- readl(target); /* flush */
- } else {
- /* Reset the timer */
- mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ);
- }
-}
-
-static void __init calgary_set_split_completion_timeout(void __iomem *bbar,
- unsigned char busnum, unsigned long timeout)
-{
- u64 val64;
- void __iomem *target;
- unsigned int phb_shift = ~0; /* silence gcc */
- u64 mask;
-
- switch (busno_to_phbid(busnum)) {
- case 0: phb_shift = (63 - 19);
- break;
- case 1: phb_shift = (63 - 23);
- break;
- case 2: phb_shift = (63 - 27);
- break;
- case 3: phb_shift = (63 - 35);
- break;
- default:
- BUG_ON(busno_to_phbid(busnum));
- }
-
- target = calgary_reg(bbar, CALGARY_CONFIG_REG);
- val64 = be64_to_cpu(readq(target));
-
- /* zero out this PHB's timer bits */
- mask = ~(0xFUL << phb_shift);
- val64 &= mask;
- val64 |= (timeout << phb_shift);
- writeq(cpu_to_be64(val64), target);
- readq(target); /* flush */
-}
-
-static void __init calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
-{
- unsigned char busnum = dev->bus->number;
- void __iomem *bbar = tbl->bbar;
- void __iomem *target;
- u32 val;
-
- /*
- * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1
- */
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2);
- val = cpu_to_be32(readl(target));
- val |= 0x00800000;
- writel(cpu_to_be32(val), target);
-}
-
-static void __init calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
-{
- unsigned char busnum = dev->bus->number;
-
- /*
- * Give split completion a longer timeout on bus 1 for aic94xx
- * http://bugzilla.kernel.org/show_bug.cgi?id=7180
- */
- if (is_calgary(dev->device) && (busnum == 1))
- calgary_set_split_completion_timeout(tbl->bbar, busnum,
- CCR_2SEC_TIMEOUT);
-}
-
-static void __init calgary_enable_translation(struct pci_dev *dev)
-{
- u32 val32;
- unsigned char busnum;
- void __iomem *target;
- void __iomem *bbar;
- struct iommu_table *tbl;
-
- busnum = dev->bus->number;
- tbl = pci_iommu(dev->bus);
- bbar = tbl->bbar;
-
- /* enable TCE in PHB Config Register */
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
- val32 = be32_to_cpu(readl(target));
- val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE;
-
- printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n",
- (dev->device == PCI_DEVICE_ID_IBM_CALGARY) ?
- "Calgary" : "CalIOC2", busnum);
- printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this "
- "bus.\n");
-
- writel(cpu_to_be32(val32), target);
- readl(target); /* flush */
-
- timer_setup(&tbl->watchdog_timer, calgary_watchdog, 0);
- mod_timer(&tbl->watchdog_timer, jiffies);
-}
-
-static void __init calgary_disable_translation(struct pci_dev *dev)
-{
- u32 val32;
- unsigned char busnum;
- void __iomem *target;
- void __iomem *bbar;
- struct iommu_table *tbl;
-
- busnum = dev->bus->number;
- tbl = pci_iommu(dev->bus);
- bbar = tbl->bbar;
-
- /* disable TCE in PHB Config Register */
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
- val32 = be32_to_cpu(readl(target));
- val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE);
-
- printk(KERN_INFO "Calgary: disabling translation on PHB %#x!\n", busnum);
- writel(cpu_to_be32(val32), target);
- readl(target); /* flush */
-
- del_timer_sync(&tbl->watchdog_timer);
-}
-
-static void __init calgary_init_one_nontraslated(struct pci_dev *dev)
-{
- pci_dev_get(dev);
- set_pci_iommu(dev->bus, NULL);
-
- /* is the device behind a bridge? */
- if (dev->bus->parent)
- dev->bus->parent->self = dev;
- else
- dev->bus->self = dev;
-}
-
-static int __init calgary_init_one(struct pci_dev *dev)
-{
- void __iomem *bbar;
- struct iommu_table *tbl;
- int ret;
-
- bbar = busno_to_bbar(dev->bus->number);
- ret = calgary_setup_tar(dev, bbar);
- if (ret)
- goto done;
-
- pci_dev_get(dev);
-
- if (dev->bus->parent) {
- if (dev->bus->parent->self)
- printk(KERN_WARNING "Calgary: IEEEE, dev %p has "
- "bus->parent->self!\n", dev);
- dev->bus->parent->self = dev;
- } else
- dev->bus->self = dev;
-
- tbl = pci_iommu(dev->bus);
- tbl->chip_ops->handle_quirks(tbl, dev);
-
- calgary_enable_translation(dev);
-
- return 0;
-
-done:
- return ret;
-}
-
-static int __init calgary_locate_bbars(void)
-{
- int ret;
- int rioidx, phb, bus;
- void __iomem *bbar;
- void __iomem *target;
- unsigned long offset;
- u8 start_bus, end_bus;
- u32 val;
-
- ret = -ENODATA;
- for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) {
- struct rio_detail *rio = rio_devs[rioidx];
-
- if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY))
- continue;
-
- /* map entire 1MB of Calgary config space */
- bbar = ioremap_nocache(rio->BBAR, 1024 * 1024);
- if (!bbar)
- goto error;
-
- for (phb = 0; phb < PHBS_PER_CALGARY; phb++) {
- offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET;
- target = calgary_reg(bbar, offset);
-
- val = be32_to_cpu(readl(target));
-
- start_bus = (u8)((val & 0x00FF0000) >> 16);
- end_bus = (u8)((val & 0x0000FF00) >> 8);
-
- if (end_bus) {
- for (bus = start_bus; bus <= end_bus; bus++) {
- bus_info[bus].bbar = bbar;
- bus_info[bus].phbid = phb;
- }
- } else {
- bus_info[start_bus].bbar = bbar;
- bus_info[start_bus].phbid = phb;
- }
- }
- }
-
- return 0;
-
-error:
- /* scan bus_info and iounmap any bbars we previously ioremap'd */
- for (bus = 0; bus < ARRAY_SIZE(bus_info); bus++)
- if (bus_info[bus].bbar)
- iounmap(bus_info[bus].bbar);
-
- return ret;
-}
-
-static int __init calgary_init(void)
-{
- int ret;
- struct pci_dev *dev = NULL;
- struct calgary_bus_info *info;
-
- ret = calgary_locate_bbars();
- if (ret)
- return ret;
-
- /* Purely for kdump kernel case */
- if (is_kdump_kernel())
- get_tce_space_from_tar();
-
- do {
- dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
- if (!dev)
- break;
- if (!is_cal_pci_dev(dev->device))
- continue;
-
- info = &bus_info[dev->bus->number];
- if (info->translation_disabled) {
- calgary_init_one_nontraslated(dev);
- continue;
- }
-
- if (!info->tce_space && !translate_empty_slots)
- continue;
-
- ret = calgary_init_one(dev);
- if (ret)
- goto error;
- } while (1);
-
- dev = NULL;
- for_each_pci_dev(dev) {
- struct iommu_table *tbl;
-
- tbl = find_iommu_table(&dev->dev);
-
- if (translation_enabled(tbl))
- dev->dev.dma_ops = &calgary_dma_ops;
- }
-
- return ret;
-
-error:
- do {
- dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
- if (!dev)
- break;
- if (!is_cal_pci_dev(dev->device))
- continue;
-
- info = &bus_info[dev->bus->number];
- if (info->translation_disabled) {
- pci_dev_put(dev);
- continue;
- }
- if (!info->tce_space && !translate_empty_slots)
- continue;
-
- calgary_disable_translation(dev);
- calgary_free_bus(dev);
- pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
- dev->dev.dma_ops = NULL;
- } while (1);
-
- return ret;
-}
-
-static inline int __init determine_tce_table_size(void)
-{
- int ret;
-
- if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED)
- return specified_table_size;
-
- if (is_kdump_kernel() && saved_max_pfn) {
- /*
- * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to
- * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each
- * larger table size has twice as many entries, so shift the
- * max ram address by 13 to divide by 8K and then look at the
- * order of the result to choose between 0-7.
- */
- ret = get_order((saved_max_pfn * PAGE_SIZE) >> 13);
- if (ret > TCE_TABLE_SIZE_8M)
- ret = TCE_TABLE_SIZE_8M;
- } else {
- /*
- * Use 8M by default (suggested by Muli) if it's not
- * kdump kernel and saved_max_pfn isn't set.
- */
- ret = TCE_TABLE_SIZE_8M;
- }
-
- return ret;
-}
-
-static int __init build_detail_arrays(void)
-{
- unsigned long ptr;
- unsigned numnodes, i;
- int scal_detail_size, rio_detail_size;
-
- numnodes = rio_table_hdr->num_scal_dev;
- if (numnodes > MAX_NUMNODES){
- printk(KERN_WARNING
- "Calgary: MAX_NUMNODES too low! Defined as %d, "
- "but system has %d nodes.\n",
- MAX_NUMNODES, numnodes);
- return -ENODEV;
- }
-
- switch (rio_table_hdr->version){
- case 2:
- scal_detail_size = 11;
- rio_detail_size = 13;
- break;
- case 3:
- scal_detail_size = 12;
- rio_detail_size = 15;
- break;
- default:
- printk(KERN_WARNING
- "Calgary: Invalid Rio Grande Table Version: %d\n",
- rio_table_hdr->version);
- return -EPROTO;
- }
-
- ptr = ((unsigned long)rio_table_hdr) + 3;
- for (i = 0; i < numnodes; i++, ptr += scal_detail_size)
- scal_devs[i] = (struct scal_detail *)ptr;
-
- for (i = 0; i < rio_table_hdr->num_rio_dev;
- i++, ptr += rio_detail_size)
- rio_devs[i] = (struct rio_detail *)ptr;
-
- return 0;
-}
-
-static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
-{
- int dev;
- u32 val;
-
- if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) {
- /*
- * FIXME: properly scan for devices across the
- * PCI-to-PCI bridge on every CalIOC2 port.
- */
- return 1;
- }
-
- for (dev = 1; dev < 8; dev++) {
- val = read_pci_config(bus, dev, 0, 0);
- if (val != 0xffffffff)
- break;
- }
- return (val != 0xffffffff);
-}
-
-/*
- * calgary_init_bitmap_from_tce_table():
- * Function for kdump case. In the second/kdump kernel initialize
- * the bitmap based on the tce table entries obtained from first kernel
- */
-static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
-{
- u64 *tp;
- unsigned int index;
- tp = ((u64 *)tbl->it_base);
- for (index = 0 ; index < tbl->it_size; index++) {
- if (*tp != 0x0)
- set_bit(index, tbl->it_map);
- tp++;
- }
-}
-
-/*
- * get_tce_space_from_tar():
- * Function for kdump case. Get the tce tables from first kernel
- * by reading the contents of the base address register of calgary iommu
- */
-static void __init get_tce_space_from_tar(void)
-{
- int bus;
- void __iomem *target;
- unsigned long tce_space;
-
- for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
- struct calgary_bus_info *info = &bus_info[bus];
- unsigned short pci_device;
- u32 val;
-
- val = read_pci_config(bus, 0, 0, 0);
- pci_device = (val & 0xFFFF0000) >> 16;
-
- if (!is_cal_pci_dev(pci_device))
- continue;
- if (info->translation_disabled)
- continue;
-
- if (calgary_bus_has_devices(bus, pci_device) ||
- translate_empty_slots) {
- target = calgary_reg(bus_info[bus].bbar,
- tar_offset(bus));
- tce_space = be64_to_cpu(readq(target));
- tce_space = tce_space & TAR_SW_BITS;
-
- tce_space = tce_space & (~specified_table_size);
- info->tce_space = (u64 *)__va(tce_space);
- }
- }
- return;
-}
-
-static int __init calgary_iommu_init(void)
-{
- int ret;
-
- /* ok, we're trying to use Calgary - let's roll */
- printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
-
- ret = calgary_init();
- if (ret) {
- printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
- "falling back to no_iommu\n", ret);
- return ret;
- }
-
- return 0;
-}
-
-int __init detect_calgary(void)
-{
- int bus;
- void *tbl;
- int calgary_found = 0;
- unsigned long ptr;
- unsigned int offset, prev_offset;
- int ret;
-
- /*
- * if the user specified iommu=off or iommu=soft or we found
- * another HW IOMMU already, bail out.
- */
- if (no_iommu || iommu_detected)
- return -ENODEV;
-
- if (!use_calgary)
- return -ENODEV;
-
- if (!early_pci_allowed())
- return -ENODEV;
-
- printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n");
-
- ptr = (unsigned long)phys_to_virt(get_bios_ebda());
-
- rio_table_hdr = NULL;
- prev_offset = 0;
- offset = 0x180;
- /*
- * The next offset is stored in the 1st word.
- * Only parse up until the offset increases:
- */
- while (offset > prev_offset) {
- /* The block id is stored in the 2nd word */
- if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
- /* set the pointer past the offset & block id */
- rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
- break;
- }
- prev_offset = offset;
- offset = *((unsigned short *)(ptr + offset));
- }
- if (!rio_table_hdr) {
- printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table "
- "in EBDA - bailing!\n");
- return -ENODEV;
- }
-
- ret = build_detail_arrays();
- if (ret) {
- printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret);
- return -ENOMEM;
- }
-
- specified_table_size = determine_tce_table_size();
-
- for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
- struct calgary_bus_info *info = &bus_info[bus];
- unsigned short pci_device;
- u32 val;
-
- val = read_pci_config(bus, 0, 0, 0);
- pci_device = (val & 0xFFFF0000) >> 16;
-
- if (!is_cal_pci_dev(pci_device))
- continue;
-
- if (info->translation_disabled)
- continue;
-
- if (calgary_bus_has_devices(bus, pci_device) ||
- translate_empty_slots) {
- /*
- * If it is kdump kernel, find and use tce tables
- * from first kernel, else allocate tce tables here
- */
- if (!is_kdump_kernel()) {
- tbl = alloc_tce_table();
- if (!tbl)
- goto cleanup;
- info->tce_space = tbl;
- }
- calgary_found = 1;
- }
- }
-
- printk(KERN_DEBUG "Calgary: finished detection, Calgary %s\n",
- calgary_found ? "found" : "not found");
-
- if (calgary_found) {
- iommu_detected = 1;
- calgary_detected = 1;
- printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
- printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
- specified_table_size);
-
- x86_init.iommu.iommu_init = calgary_iommu_init;
- }
- return calgary_found;
-
-cleanup:
- for (--bus; bus >= 0; --bus) {
- struct calgary_bus_info *info = &bus_info[bus];
-
- if (info->tce_space)
- free_tce_table(info->tce_space);
- }
- return -ENOMEM;
-}
-
-static int __init calgary_parse_options(char *p)
-{
- unsigned int bridge;
- unsigned long val;
- size_t len;
- ssize_t ret;
-
- while (*p) {
- if (!strncmp(p, "64k", 3))
- specified_table_size = TCE_TABLE_SIZE_64K;
- else if (!strncmp(p, "128k", 4))
- specified_table_size = TCE_TABLE_SIZE_128K;
- else if (!strncmp(p, "256k", 4))
- specified_table_size = TCE_TABLE_SIZE_256K;
- else if (!strncmp(p, "512k", 4))
- specified_table_size = TCE_TABLE_SIZE_512K;
- else if (!strncmp(p, "1M", 2))
- specified_table_size = TCE_TABLE_SIZE_1M;
- else if (!strncmp(p, "2M", 2))
- specified_table_size = TCE_TABLE_SIZE_2M;
- else if (!strncmp(p, "4M", 2))
- specified_table_size = TCE_TABLE_SIZE_4M;
- else if (!strncmp(p, "8M", 2))
- specified_table_size = TCE_TABLE_SIZE_8M;
-
- len = strlen("translate_empty_slots");
- if (!strncmp(p, "translate_empty_slots", len))
- translate_empty_slots = 1;
-
- len = strlen("disable");
- if (!strncmp(p, "disable", len)) {
- p += len;
- if (*p == '=')
- ++p;
- if (*p == '\0')
- break;
- ret = kstrtoul(p, 0, &val);
- if (ret)
- break;
-
- bridge = val;
- if (bridge < MAX_PHB_BUS_NUM) {
- printk(KERN_INFO "Calgary: disabling "
- "translation for PHB %#x\n", bridge);
- bus_info[bridge].translation_disabled = 1;
- }
- }
-
- p = strpbrk(p, ",");
- if (!p)
- break;
-
- p++; /* skip ',' */
- }
- return 1;
-}
-__setup("calgary=", calgary_parse_options);
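/*
 * A hypothetical boot parameter exercising the parser above (values are
 * illustrative only): select a 256k TCE table, translate empty slots,
 * and disable translation on PHB 2:
 *
 *   calgary=256k,translate_empty_slots,disable=2
 */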
-
-static void __init calgary_fixup_one_tce_space(struct pci_dev *dev)
-{
- struct iommu_table *tbl;
- unsigned int npages;
- int i;
-
- tbl = pci_iommu(dev->bus);
-
- for (i = 0; i < 4; i++) {
- struct resource *r = &dev->resource[PCI_BRIDGE_RESOURCES + i];
-
- /* Don't give out TCEs that map MEM resources */
- if (!(r->flags & IORESOURCE_MEM))
- continue;
-
- /* 0-based? we reserve the whole 1st MB anyway */
- if (!r->start)
- continue;
-
- /* cover the whole region */
- npages = resource_size(r) >> PAGE_SHIFT;
- npages++;
-
- iommu_range_reserve(tbl, r->start, npages);
- }
-}
-
-static int __init calgary_fixup_tce_spaces(void)
-{
- struct pci_dev *dev = NULL;
- struct calgary_bus_info *info;
-
- if (no_iommu || swiotlb || !calgary_detected)
- return -ENODEV;
-
- printk(KERN_DEBUG "Calgary: fixing up tce spaces\n");
-
- do {
- dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
- if (!dev)
- break;
- if (!is_cal_pci_dev(dev->device))
- continue;
-
- info = &bus_info[dev->bus->number];
- if (info->translation_disabled)
- continue;
-
- if (!info->tce_space)
- continue;
-
- calgary_fixup_one_tce_space(dev);
-
- } while (1);
-
- return 0;
-}
-
-/*
- * We need to be call after pcibios_assign_resources (fs_initcall level)
- * and before device_initcall.
- */
-rootfs_initcall(calgary_fixup_tce_spaces);
-
-IOMMU_INIT_POST(detect_calgary);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index fa4352dce491..57de2ebff7e2 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -12,7 +12,6 @@
#include <asm/dma.h>
#include <asm/iommu.h>
#include <asm/gart.h>
-#include <asm/calgary.h>
#include <asm/x86_init.h>
#include <asm/iommu_table.h>
@@ -112,11 +111,6 @@ static __init int iommu_setup(char *p)
gart_parse_options(p);
-#ifdef CONFIG_CALGARY_IOMMU
- if (!strncmp(p, "calgary", 7))
- use_calgary = 1;
-#endif /* CONFIG_CALGARY_IOMMU */
-
p += strcspn(p, ",");
if (*p == ',')
++p;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5e94c4354d4e..bd2a11ca5dd6 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -41,6 +41,7 @@
#include <asm/desc.h>
#include <asm/prctl.h>
#include <asm/spec-ctrl.h>
+#include <asm/io_bitmap.h>
#include <asm/proto.h>
#include "process.h"
@@ -72,18 +73,9 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
#ifdef CONFIG_X86_32
.ss0 = __KERNEL_DS,
.ss1 = __KERNEL_CS,
- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
#endif
+ .io_bitmap_base = IO_BITMAP_OFFSET_INVALID,
},
-#ifdef CONFIG_X86_32
- /*
- * Note that the .io_bitmap member must be extra-big. This is because
- * the CPU will access an additional byte beyond the end of the IO
- * permission bitmap. The extra byte must be all 1 bits, and must
- * be within the limit.
- */
- .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
-#endif
};
EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
@@ -110,28 +102,89 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
void exit_thread(struct task_struct *tsk)
{
struct thread_struct *t = &tsk->thread;
- unsigned long *bp = t->io_bitmap_ptr;
struct fpu *fpu = &t->fpu;
- if (bp) {
- struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());
-
- t->io_bitmap_ptr = NULL;
- clear_thread_flag(TIF_IO_BITMAP);
- /*
- * Careful, clear this in the TSS too:
- */
- memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
- t->io_bitmap_max = 0;
- put_cpu();
- kfree(bp);
- }
+ if (test_thread_flag(TIF_IO_BITMAP))
+ io_bitmap_exit();
free_vm86(t);
fpu__drop(fpu);
}
+static int set_new_tls(struct task_struct *p, unsigned long tls)
+{
+ struct user_desc __user *utls = (struct user_desc __user *)tls;
+
+ if (in_ia32_syscall())
+ return do_set_thread_area(p, -1, utls, 0);
+ else
+ return do_set_thread_area_64(p, ARCH_SET_FS, tls);
+}
+
+int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+ unsigned long arg, struct task_struct *p, unsigned long tls)
+{
+ struct inactive_task_frame *frame;
+ struct fork_frame *fork_frame;
+ struct pt_regs *childregs;
+ int ret = 0;
+
+ childregs = task_pt_regs(p);
+ fork_frame = container_of(childregs, struct fork_frame, regs);
+ frame = &fork_frame->frame;
+
+ frame->bp = 0;
+ frame->ret_addr = (unsigned long) ret_from_fork;
+ p->thread.sp = (unsigned long) fork_frame;
+ p->thread.io_bitmap = NULL;
+ memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
+#ifdef CONFIG_X86_64
+ savesegment(gs, p->thread.gsindex);
+ p->thread.gsbase = p->thread.gsindex ? 0 : current->thread.gsbase;
+ savesegment(fs, p->thread.fsindex);
+ p->thread.fsbase = p->thread.fsindex ? 0 : current->thread.fsbase;
+ savesegment(es, p->thread.es);
+ savesegment(ds, p->thread.ds);
+#else
+ p->thread.sp0 = (unsigned long) (childregs + 1);
+ /*
+	 * Clear all status flags including IF and set the fixed bit. 64-bit
+	 * does not need this initialization as the frame does not contain
+	 * flags. Flags consistency (especially vs. AC) is ensured there
+	 * via objtool, which lacks 32-bit support.
+ */
+ frame->flags = X86_EFLAGS_FIXED;
+#endif
+
+ /* Kernel thread ? */
+ if (unlikely(p->flags & PF_KTHREAD)) {
+ memset(childregs, 0, sizeof(struct pt_regs));
+ kthread_frame_init(frame, sp, arg);
+ return 0;
+ }
+
+ frame->bx = 0;
+ *childregs = *current_pt_regs();
+ childregs->ax = 0;
+ if (sp)
+ childregs->sp = sp;
+
+#ifdef CONFIG_X86_32
+ task_user_gs(p) = get_user_gs(current_pt_regs());
+#endif
+
+ /* Set a new TLS for the child thread? */
+ if (clone_flags & CLONE_SETTLS)
+ ret = set_new_tls(p, tls);
+
+ if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP)))
+ io_bitmap_share(p);
+
+ return ret;
+}
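/*
 * For reference, the container_of() above depends on the fork frame
 * layout (a sketch matching <asm/switch_to.h>): the switch-out register
 * frame sits directly below the saved user registers, so task_pt_regs()
 * can be converted back to the enclosing fork_frame:
 *
 *	struct fork_frame {
 *		struct inactive_task_frame frame;
 *		struct pt_regs regs;
 *	};
 */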
+
void flush_thread(void)
{
struct task_struct *tsk = current;
@@ -269,31 +322,96 @@ void arch_setup_new_exec(void)
}
}
-static inline void switch_to_bitmap(struct thread_struct *prev,
- struct thread_struct *next,
- unsigned long tifp, unsigned long tifn)
+#ifdef CONFIG_X86_IOPL_IOPERM
+static inline void tss_invalidate_io_bitmap(struct tss_struct *tss)
{
- struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+ /*
+ * Invalidate the I/O bitmap by moving io_bitmap_base outside the
+ * TSS limit so any subsequent I/O access from user space will
+ * trigger a #GP.
+ *
+ * This is correct even when VMEXIT rewrites the TSS limit
+ * to 0x67 as the only requirement is that the base points
+ * outside the limit.
+ */
+ tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID;
+}
- if (tifn & _TIF_IO_BITMAP) {
- /*
- * Copy the relevant range of the IO bitmap.
- * Normally this is 128 bytes or less:
- */
- memcpy(tss->io_bitmap, next->io_bitmap_ptr,
- max(prev->io_bitmap_max, next->io_bitmap_max));
+static inline void switch_to_bitmap(unsigned long tifp)
+{
+ /*
+ * Invalidate I/O bitmap if the previous task used it. This prevents
+ * any possible leakage of an active I/O bitmap.
+ *
+ * If the next task has an I/O bitmap it will handle it on exit to
+ * user mode.
+ */
+ if (tifp & _TIF_IO_BITMAP)
+ tss_invalidate_io_bitmap(this_cpu_ptr(&cpu_tss_rw));
+}
+
+static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm)
+{
+ /*
+	 * Copy at least the byte range of the incoming task's bitmap which
+ * covers the permitted I/O ports.
+ *
+ * If the previous task which used an I/O bitmap had more bits
+ * permitted, then the copy needs to cover those as well so they
+ * get turned off.
+ */
+ memcpy(tss->io_bitmap.bitmap, iobm->bitmap,
+ max(tss->io_bitmap.prev_max, iobm->max));
+
+ /*
+ * Store the new max and the sequence number of this bitmap
+ * and a pointer to the bitmap itself.
+ */
+ tss->io_bitmap.prev_max = iobm->max;
+ tss->io_bitmap.prev_sequence = iobm->sequence;
+}
+
+/**
+ * tss_update_io_bitmap - Update I/O bitmap before exiting to usermode
+ */
+void tss_update_io_bitmap(void)
+{
+ struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+ u16 *base = &tss->x86_tss.io_bitmap_base;
+
+ if (test_thread_flag(TIF_IO_BITMAP)) {
+ struct thread_struct *t = &current->thread;
+
+ if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) {
+ *base = IO_BITMAP_OFFSET_VALID_ALL;
+ } else {
+ struct io_bitmap *iobm = t->io_bitmap;
+ /*
+ * Only copy bitmap data when the sequence number
+ * differs. The update time is accounted to the
+ * incoming task.
+ */
+ if (tss->io_bitmap.prev_sequence != iobm->sequence)
+ tss_copy_io_bitmap(tss, iobm);
+
+ /* Enable the bitmap */
+ *base = IO_BITMAP_OFFSET_VALID_MAP;
+ }
/*
- * Make sure that the TSS limit is correct for the CPU
- * to notice the IO bitmap.
+ * Make sure that the TSS limit is covering the io bitmap.
+ * It might have been cut down by a VMEXIT to 0x67 which
+ * would cause a subsequent I/O access from user space to
+	 * would cause a subsequent I/O access from user space to
+	 * trigger a #GP because the bitmap is outside the TSS
+ * limit.
*/
refresh_tss_limit();
- } else if (tifp & _TIF_IO_BITMAP) {
- /*
- * Clear any possible leftover bits:
- */
- memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
+ } else {
+ tss_invalidate_io_bitmap(tss);
}
}
+#else /* CONFIG_X86_IOPL_IOPERM */
+static inline void switch_to_bitmap(unsigned long tifp) { }
+#endif
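/*
 * A sketch of the three io_bitmap_base states the code above switches
 * between (constant names are from this patch; their numeric values are
 * not shown here):
 *
 *   IO_BITMAP_OFFSET_INVALID    - base points outside the TSS limit, so
 *                                 any I/O access from user space raises #GP
 *   IO_BITMAP_OFFSET_VALID_MAP  - base points at the per-CPU copy of the
 *                                 task's ioperm() bitmap
 *   IO_BITMAP_OFFSET_VALID_ALL  - taken when iopl() emulation grants
 *                                 level 3; presumably points at an all-zero
 *                                 area so every port is allowed
 */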
#ifdef CONFIG_SMP
@@ -505,7 +623,8 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
tifn = READ_ONCE(task_thread_info(next_p)->flags);
tifp = READ_ONCE(task_thread_info(prev_p)->flags);
- switch_to_bitmap(prev, next, tifp, tifn);
+
+ switch_to_bitmap(tifp);
propagate_user_return_notify(prev_p, next_p);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index b8ceec4974fe..323499f48858 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -112,74 +112,6 @@ void release_thread(struct task_struct *dead_task)
release_vm86_irqs(dead_task);
}
-int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
- unsigned long arg, struct task_struct *p, unsigned long tls)
-{
- struct pt_regs *childregs = task_pt_regs(p);
- struct fork_frame *fork_frame = container_of(childregs, struct fork_frame, regs);
- struct inactive_task_frame *frame = &fork_frame->frame;
- struct task_struct *tsk;
- int err;
-
- /*
- * For a new task use the RESET flags value since there is no before.
- * All the status flags are zero; DF and all the system flags must also
- * be 0, specifically IF must be 0 because we context switch to the new
- * task with interrupts disabled.
- */
- frame->flags = X86_EFLAGS_FIXED;
- frame->bp = 0;
- frame->ret_addr = (unsigned long) ret_from_fork;
- p->thread.sp = (unsigned long) fork_frame;
- p->thread.sp0 = (unsigned long) (childregs+1);
- memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
-
- if (unlikely(p->flags & PF_KTHREAD)) {
- /* kernel thread */
- memset(childregs, 0, sizeof(struct pt_regs));
- frame->bx = sp; /* function */
- frame->di = arg;
- p->thread.io_bitmap_ptr = NULL;
- return 0;
- }
- frame->bx = 0;
- *childregs = *current_pt_regs();
- childregs->ax = 0;
- if (sp)
- childregs->sp = sp;
-
- task_user_gs(p) = get_user_gs(current_pt_regs());
-
- p->thread.io_bitmap_ptr = NULL;
- tsk = current;
- err = -ENOMEM;
-
- if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
- p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
- IO_BITMAP_BYTES, GFP_KERNEL);
- if (!p->thread.io_bitmap_ptr) {
- p->thread.io_bitmap_max = 0;
- return -ENOMEM;
- }
- set_tsk_thread_flag(p, TIF_IO_BITMAP);
- }
-
- err = 0;
-
- /*
- * Set a new TLS for the child thread?
- */
- if (clone_flags & CLONE_SETTLS)
- err = do_set_thread_area(p, -1,
- (struct user_desc __user *)tls, 0);
-
- if (err && p->thread.io_bitmap_ptr) {
- kfree(p->thread.io_bitmap_ptr);
- p->thread.io_bitmap_max = 0;
- }
- return err;
-}
-
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
@@ -255,15 +187,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
load_TLS(next, cpu);
- /*
- * Restore IOPL if needed. In normal use, the flags restore
- * in the switch assembly will handle this. But if the kernel
- * is running virtualized at a non-zero CPL, the popf will
- * not restore flags, so it must be done in a separate step.
- */
- if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
- set_iopl_mask(next->iopl);
-
switch_to_extra(prev_p, next_p);
/*
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index af64519b2695..506d66830d4d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -371,81 +371,6 @@ void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
task->thread.gsbase = gsbase;
}
-int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
- unsigned long arg, struct task_struct *p, unsigned long tls)
-{
- int err;
- struct pt_regs *childregs;
- struct fork_frame *fork_frame;
- struct inactive_task_frame *frame;
- struct task_struct *me = current;
-
- childregs = task_pt_regs(p);
- fork_frame = container_of(childregs, struct fork_frame, regs);
- frame = &fork_frame->frame;
-
- frame->bp = 0;
- frame->ret_addr = (unsigned long) ret_from_fork;
- p->thread.sp = (unsigned long) fork_frame;
- p->thread.io_bitmap_ptr = NULL;
-
- savesegment(gs, p->thread.gsindex);
- p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase;
- savesegment(fs, p->thread.fsindex);
- p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase;
- savesegment(es, p->thread.es);
- savesegment(ds, p->thread.ds);
- memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
-
- if (unlikely(p->flags & PF_KTHREAD)) {
- /* kernel thread */
- memset(childregs, 0, sizeof(struct pt_regs));
- frame->bx = sp; /* function */
- frame->r12 = arg;
- return 0;
- }
- frame->bx = 0;
- *childregs = *current_pt_regs();
-
- childregs->ax = 0;
- if (sp)
- childregs->sp = sp;
-
- err = -ENOMEM;
- if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
- p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
- IO_BITMAP_BYTES, GFP_KERNEL);
- if (!p->thread.io_bitmap_ptr) {
- p->thread.io_bitmap_max = 0;
- return -ENOMEM;
- }
- set_tsk_thread_flag(p, TIF_IO_BITMAP);
- }
-
- /*
- * Set a new TLS for the child thread?
- */
- if (clone_flags & CLONE_SETTLS) {
-#ifdef CONFIG_IA32_EMULATION
- if (in_ia32_syscall())
- err = do_set_thread_area(p, -1,
- (struct user_desc __user *)tls, 0);
- else
-#endif
- err = do_arch_prctl_64(p, ARCH_SET_FS, tls);
- if (err)
- goto out;
- }
- err = 0;
-out:
- if (err && p->thread.io_bitmap_ptr) {
- kfree(p->thread.io_bitmap_ptr);
- p->thread.io_bitmap_max = 0;
- }
-
- return err;
-}
-
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
unsigned long new_sp,
@@ -572,17 +497,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
switch_to_extra(prev_p, next_p);
-#ifdef CONFIG_XEN_PV
- /*
- * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
- * current_pt_regs()->flags may not match the current task's
- * intended IOPL. We need to switch it manually.
- */
- if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
- prev->iopl != next->iopl))
- xen_set_iopl_mask(next->iopl);
-#endif
-
if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
/*
* AMD CPUs have a misfeature: SYSRET sets the SS selector but
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 3c5bbe8e4120..066e5b01a7e0 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -42,6 +42,7 @@
#include <asm/traps.h>
#include <asm/syscall.h>
#include <asm/fsgsbase.h>
+#include <asm/io_bitmap.h>
#include "tls.h"
@@ -697,7 +698,9 @@ static int ptrace_set_debugreg(struct task_struct *tsk, int n,
static int ioperm_active(struct task_struct *target,
const struct user_regset *regset)
{
- return target->thread.io_bitmap_max / regset->size;
+ struct io_bitmap *iobm = target->thread.io_bitmap;
+
+ return iobm ? DIV_ROUND_UP(iobm->max, regset->size) : 0;
}
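/*
 * Why DIV_ROUND_UP: with, say, iobm->max = 3 permitted bytes and a regset
 * member size of 4, the old "io_bitmap_max / regset->size" truncated to 0
 * and reported the regset as inactive despite permitted ports; rounding
 * up reports one member (values illustrative).
 */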
static int ioperm_get(struct task_struct *target,
@@ -705,12 +708,13 @@ static int ioperm_get(struct task_struct *target,
unsigned int pos, unsigned int count,
void *kbuf, void __user *ubuf)
{
- if (!target->thread.io_bitmap_ptr)
+ struct io_bitmap *iobm = target->thread.io_bitmap;
+
+ if (!iobm)
return -ENXIO;
return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- target->thread.io_bitmap_ptr,
- 0, IO_BITMAP_BYTES);
+ iobm->bitmap, 0, IO_BITMAP_BYTES);
}
/*
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index ee26df08002e..94b33885f8d2 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -35,8 +35,7 @@
#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c)
.text
- .globl relocate_kernel
-relocate_kernel:
+SYM_CODE_START_NOALIGN(relocate_kernel)
/* Save the CPU context, used for jumping back */
pushl %ebx
@@ -93,8 +92,9 @@ relocate_kernel:
addl $(identity_mapped - relocate_kernel), %eax
pushl %eax
ret
+SYM_CODE_END(relocate_kernel)
-identity_mapped:
+SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
/* set return address to 0 if not preserving context */
pushl $0
/* store the start address on the stack */
@@ -191,8 +191,9 @@ identity_mapped:
addl $(virtual_mapped - relocate_kernel), %eax
pushl %eax
ret
+SYM_CODE_END(identity_mapped)
-virtual_mapped:
+SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
movl CR4(%edi), %eax
movl %eax, %cr4
movl CR3(%edi), %eax
@@ -208,9 +209,10 @@ virtual_mapped:
popl %esi
popl %ebx
ret
+SYM_CODE_END(virtual_mapped)
/* Do the copies */
-swap_pages:
+SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
movl 8(%esp), %edx
movl 4(%esp), %ecx
pushl %ebp
@@ -270,6 +272,7 @@ swap_pages:
popl %ebx
popl %ebp
ret
+SYM_CODE_END(swap_pages)
.globl kexec_control_code_size
.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index c51ccff5cd01..ef3ba99068d3 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -38,8 +38,7 @@
.text
.align PAGE_SIZE
.code64
- .globl relocate_kernel
-relocate_kernel:
+SYM_CODE_START_NOALIGN(relocate_kernel)
/*
* %rdi indirection_page
* %rsi page_list
@@ -103,8 +102,9 @@ relocate_kernel:
addq $(identity_mapped - relocate_kernel), %r8
pushq %r8
ret
+SYM_CODE_END(relocate_kernel)
-identity_mapped:
+SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
/* set return address to 0 if not preserving context */
pushq $0
/* store the start address on the stack */
@@ -209,8 +209,9 @@ identity_mapped:
movq $virtual_mapped, %rax
pushq %rax
ret
+SYM_CODE_END(identity_mapped)
-virtual_mapped:
+SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
movq RSP(%r8), %rsp
movq CR4(%r8), %rax
movq %rax, %cr4
@@ -228,9 +229,10 @@ virtual_mapped:
popq %rbp
popq %rbx
ret
+SYM_CODE_END(virtual_mapped)
/* Do the copies */
-swap_pages:
+SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
movq %rdi, %rcx /* Put the page_list in %rcx */
xorl %edi, %edi
xorl %esi, %esi
@@ -283,6 +285,7 @@ swap_pages:
jmp 0b
3:
ret
+SYM_CODE_END(swap_pages)
.globl kexec_control_code_size
.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 77ea96b794bd..cedfe2077a69 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -143,6 +143,13 @@ struct boot_params boot_params;
/*
* Machine setup..
*/
+static struct resource rodata_resource = {
+ .name = "Kernel rodata",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
+};
+
static struct resource data_resource = {
.name = "Kernel data",
.start = 0,
@@ -438,6 +445,12 @@ static void __init memblock_x86_reserve_range_setup_data(void)
while (pa_data) {
data = early_memremap(pa_data, sizeof(*data));
memblock_reserve(pa_data, sizeof(*data) + data->len);
+
+ if (data->type == SETUP_INDIRECT &&
+ ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT)
+ memblock_reserve(((struct setup_indirect *)data->data)->addr,
+ ((struct setup_indirect *)data->data)->len);
+
pa_data = data->next;
early_memunmap(data, sizeof(*data));
}
@@ -459,7 +472,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
* due to mapping restrictions.
*
* On 64bit, kdump kernel need be restricted to be under 64TB, which is
- * the upper limit of system RAM in 4-level paing mode. Since the kdump
+ * the upper limit of system RAM in 4-level paging mode. Since the kdump
* jumping could be from 5-level to 4-level, the jumping will fail if
* kernel is put above 64TB, and there's no way to detect the paging mode
* of the kernel which will be loaded for dumping during the 1st kernel
@@ -743,8 +756,8 @@ static void __init trim_bios_range(void)
e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
/*
- * special case: Some BIOSen report the PC BIOS
- * area (640->1Mb) as ram even though it is not.
+ * special case: Some BIOSes report the PC BIOS
+ * area (640Kb -> 1Mb) as RAM even though it is not.
* take them out.
*/
e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM, 1);
@@ -951,7 +964,9 @@ void __init setup_arch(char **cmdline_p)
code_resource.start = __pa_symbol(_text);
code_resource.end = __pa_symbol(_etext)-1;
- data_resource.start = __pa_symbol(_etext);
+ rodata_resource.start = __pa_symbol(__start_rodata);
+ rodata_resource.end = __pa_symbol(__end_rodata)-1;
+ data_resource.start = __pa_symbol(_sdata);
data_resource.end = __pa_symbol(_edata)-1;
bss_resource.start = __pa_symbol(__bss_start);
bss_resource.end = __pa_symbol(__bss_stop)-1;
@@ -1040,6 +1055,7 @@ void __init setup_arch(char **cmdline_p)
/* after parse_early_param, so could debug it */
insert_resource(&iomem_resource, &code_resource);
+ insert_resource(&iomem_resource, &rodata_resource);
insert_resource(&iomem_resource, &data_resource);
insert_resource(&iomem_resource, &bss_resource);
@@ -1122,17 +1138,15 @@ void __init setup_arch(char **cmdline_p)
reserve_bios_regions();
- if (efi_enabled(EFI_MEMMAP)) {
- efi_fake_memmap();
- efi_find_mirror();
- efi_esrt_init();
+ efi_fake_memmap();
+ efi_find_mirror();
+ efi_esrt_init();
- /*
- * The EFI specification says that boot service code won't be
- * called after ExitBootServices(). This is, in fact, a lie.
- */
- efi_reserve_boot_services();
- }
+ /*
+ * The EFI specification says that boot service code won't be
+ * called after ExitBootServices(). This is, in fact, a lie.
+ */
+ efi_reserve_boot_services();
/* preallocate 4k for mptable mpc */
e820__memblock_alloc_reserved_mpc_new();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 86663874ef04..e6d7894ad127 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -207,8 +207,8 @@ void __init setup_per_cpu_areas(void)
pcpu_cpu_distance,
pcpu_fc_alloc, pcpu_fc_free);
if (rc < 0)
- pr_warning("%s allocator failed (%d), falling back to page size\n",
- pcpu_fc_names[pcpu_chosen_fc], rc);
+ pr_warn("%s allocator failed (%d), falling back to page size\n",
+ pcpu_fc_names[pcpu_chosen_fc], rc);
}
if (rc < 0)
rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index a49fe1dcb47e..4c61f0713832 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -57,7 +57,7 @@ void __init tboot_probe(void)
*/
if (!e820__mapped_any(boot_params.tboot_addr,
boot_params.tboot_addr, E820_TYPE_RESERVED)) {
- pr_warning("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n");
+ pr_warn("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n");
return;
}
@@ -65,13 +65,12 @@ void __init tboot_probe(void)
set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr);
tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE);
if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) {
- pr_warning("tboot at 0x%llx is invalid\n",
- boot_params.tboot_addr);
+ pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr);
tboot = NULL;
return;
}
if (tboot->version < 5) {
- pr_warning("tboot version is invalid: %u\n", tboot->version);
+ pr_warn("tboot version is invalid: %u\n", tboot->version);
tboot = NULL;
return;
}
@@ -289,7 +288,7 @@ static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
if (sleep_state >= ACPI_S_STATE_COUNT ||
acpi_shutdown_map[sleep_state] == -1) {
- pr_warning("unsupported sleep state 0x%x\n", sleep_state);
+ pr_warn("unsupported sleep state 0x%x\n", sleep_state);
return -1;
}
@@ -302,7 +301,7 @@ static int tboot_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b)
if (!tboot_enabled())
return 0;
- pr_warning("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)");
+ pr_warn("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)");
return -ENODEV;
}
@@ -320,7 +319,7 @@ static int tboot_wait_for_aps(int num_aps)
}
if (timeout)
- pr_warning("tboot wait for APs timeout\n");
+ pr_warn("tboot wait for APs timeout\n");
return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps);
}
@@ -516,7 +515,7 @@ int tboot_force_iommu(void)
return 1;
if (no_iommu || swiotlb || dmar_disabled)
- pr_warning("Forcing Intel-IOMMU to enabled\n");
+ pr_warn("Forcing Intel-IOMMU to enabled\n");
dmar_disabled = 0;
#ifdef CONFIG_SWIOTLB
diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c
deleted file mode 100644
index 6384be751eff..000000000000
--- a/arch/x86/kernel/tce_64.c
+++ /dev/null
@@ -1,177 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * This file manages the translation entries for the IBM Calgary IOMMU.
- *
- * Derived from arch/powerpc/platforms/pseries/iommu.c
- *
- * Copyright (C) IBM Corporation, 2006
- *
- * Author: Jon Mason <jdmason@us.ibm.com>
- * Author: Muli Ben-Yehuda <muli@il.ibm.com>
- */
-
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/string.h>
-#include <linux/pci.h>
-#include <linux/dma-mapping.h>
-#include <linux/memblock.h>
-#include <asm/tce.h>
-#include <asm/calgary.h>
-#include <asm/proto.h>
-#include <asm/cacheflush.h>
-
-/* flush a tce at 'tceaddr' to main memory */
-static inline void flush_tce(void* tceaddr)
-{
- /* a single tce can't cross a cache line */
- if (boot_cpu_has(X86_FEATURE_CLFLUSH))
- clflush(tceaddr);
- else
- wbinvd();
-}
-
-void tce_build(struct iommu_table *tbl, unsigned long index,
- unsigned int npages, unsigned long uaddr, int direction)
-{
- u64* tp;
- u64 t;
- u64 rpn;
-
- t = (1 << TCE_READ_SHIFT);
- if (direction != DMA_TO_DEVICE)
- t |= (1 << TCE_WRITE_SHIFT);
-
- tp = ((u64*)tbl->it_base) + index;
-
- while (npages--) {
- rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT;
- t &= ~TCE_RPN_MASK;
- t |= (rpn << TCE_RPN_SHIFT);
-
- *tp = cpu_to_be64(t);
- flush_tce(tp);
-
- uaddr += PAGE_SIZE;
- tp++;
- }
-}
-
-void tce_free(struct iommu_table *tbl, long index, unsigned int npages)
-{
- u64* tp;
-
- tp = ((u64*)tbl->it_base) + index;
-
- while (npages--) {
- *tp = cpu_to_be64(0);
- flush_tce(tp);
- tp++;
- }
-}
-
-static inline unsigned int table_size_to_number_of_entries(unsigned char size)
-{
- /*
- * size is the order of the table, 0-7
- * smallest table is 8K entries, so shift result by 13 to
- * multiply by 8K
- */
- return (1 << size) << 13;
-}
-
-static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
-{
- unsigned int bitmapsz;
- unsigned long bmppages;
- int ret;
-
- tbl->it_busno = dev->bus->number;
-
- /* set the tce table size - measured in entries */
- tbl->it_size = table_size_to_number_of_entries(specified_table_size);
-
- /*
- * number of bytes needed for the bitmap size in number of
- * entries; we need one bit per entry
- */
- bitmapsz = tbl->it_size / BITS_PER_BYTE;
- bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz));
- if (!bmppages) {
- printk(KERN_ERR "Calgary: cannot allocate bitmap\n");
- ret = -ENOMEM;
- goto done;
- }
-
- tbl->it_map = (unsigned long*)bmppages;
-
- memset(tbl->it_map, 0, bitmapsz);
-
- tbl->it_hint = 0;
-
- spin_lock_init(&tbl->it_lock);
-
- return 0;
-
-done:
- return ret;
-}
-
-int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar)
-{
- struct iommu_table *tbl;
- int ret;
-
- if (pci_iommu(dev->bus)) {
- printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n",
- dev, pci_iommu(dev->bus));
- BUG();
- }
-
- tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL);
- if (!tbl) {
- printk(KERN_ERR "Calgary: error allocating iommu_table\n");
- ret = -ENOMEM;
- goto done;
- }
-
- ret = tce_table_setparms(dev, tbl);
- if (ret)
- goto free_tbl;
-
- tbl->bbar = bbar;
-
- set_pci_iommu(dev->bus, tbl);
-
- return 0;
-
-free_tbl:
- kfree(tbl);
-done:
- return ret;
-}
-
-void * __init alloc_tce_table(void)
-{
- unsigned int size;
-
- size = table_size_to_number_of_entries(specified_table_size);
- size *= TCE_ENTRY_SIZE;
-
- return memblock_alloc_low(size, size);
-}
-
-void __init free_tce_table(void *tbl)
-{
- unsigned int size;
-
- if (!tbl)
- return;
-
- size = table_size_to_number_of_entries(specified_table_size);
- size *= TCE_ENTRY_SIZE;
-
- memblock_free(__pa(tbl), size);
-}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4bb0f8447112..c90312146da0 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -37,11 +37,6 @@
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/io.h>
-
-#if defined(CONFIG_EDAC)
-#include <linux/edac.h>
-#endif
-
#include <asm/stacktrace.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index ec534f978867..b8acf639abd1 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -364,12 +364,12 @@ retry:
/* Force it to 0 if random warps brought us here */
atomic_set(&test_runs, 0);
- pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
+ pr_warn("TSC synchronization [CPU#%d -> CPU#%d]:\n",
smp_processor_id(), cpu);
- pr_warning("Measured %Ld cycles TSC warp between CPUs, "
- "turning off TSC clock.\n", max_warp);
+ pr_warn("Measured %Ld cycles TSC warp between CPUs, "
+ "turning off TSC clock.\n", max_warp);
if (random_warps)
- pr_warning("TSC warped randomly between CPUs\n");
+ pr_warn("TSC warped randomly between CPUs\n");
mark_tsc_unstable("check_tsc_sync_source failed");
}
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
index 548fefed71ee..4d732a444711 100644
--- a/arch/x86/kernel/umip.c
+++ b/arch/x86/kernel/umip.c
@@ -1,6 +1,6 @@
/*
- * umip.c Emulation for instruction protected by the Intel User-Mode
- * Instruction Prevention feature
+ * umip.c Emulation for instructions protected by the User-Mode Instruction
+ * Prevention feature
*
* Copyright (c) 2017, Intel Corporation.
* Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
@@ -18,10 +18,10 @@
/** DOC: Emulation for User-Mode Instruction Prevention (UMIP)
*
- * The feature User-Mode Instruction Prevention present in recent Intel
- * processor prevents a group of instructions (SGDT, SIDT, SLDT, SMSW and STR)
- * from being executed with CPL > 0. Otherwise, a general protection fault is
- * issued.
+ * User-Mode Instruction Prevention is a security feature present in recent
+ * x86 processors that, when enabled, prevents a group of instructions (SGDT,
+ * SIDT, SLDT, SMSW and STR) from being run in user mode by issuing a general
+ * protection fault if the instruction is executed with CPL > 0.
*
* Rather than relaying to the user space the general protection fault caused by
* the UMIP-protected instructions (in the form of a SIGSEGV signal), it can be
@@ -91,7 +91,7 @@ const char * const umip_insns[5] = {
#define umip_pr_err(regs, fmt, ...) \
umip_printk(regs, KERN_ERR, fmt, ##__VA_ARGS__)
-#define umip_pr_warning(regs, fmt, ...) \
+#define umip_pr_warn(regs, fmt, ...) \
umip_printk(regs, KERN_WARNING, fmt, ##__VA_ARGS__)
/**
@@ -380,14 +380,14 @@ bool fixup_umip_exception(struct pt_regs *regs)
if (umip_inst < 0)
return false;
- umip_pr_warning(regs, "%s instruction cannot be used by applications.\n",
+ umip_pr_warn(regs, "%s instruction cannot be used by applications.\n",
umip_insns[umip_inst]);
/* Do not emulate (spoof) SLDT or STR. */
if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT)
return false;
- umip_pr_warning(regs, "For now, expensive software emulation returns the result.\n");
+ umip_pr_warn(regs, "For now, expensive software emulation returns the result.\n");
if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size,
user_64bit_mode(regs)))
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 8cd745ef8c7b..15e5aad8ac2c 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -842,8 +842,8 @@ static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
/**
* arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
+ * @auprobe: the probepoint information.
* @mm: the probed address space.
- * @arch_uprobe: the probepoint information.
* @addr: virtual address at which to install the probepoint
* Return 0 on success or a -ve number on error.
*/
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index a024c4f7ba56..641f0fe1e5b4 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -31,7 +31,7 @@
#include <asm/cpufeatures.h>
#include <asm/msr-index.h>
-ENTRY(verify_cpu)
+SYM_FUNC_START_LOCAL(verify_cpu)
pushf # Save caller passed flags
push $0 # Kill any dangerous flags
popf
@@ -137,4 +137,4 @@ ENTRY(verify_cpu)
popf # Restore caller passed flags
xorl %eax, %eax
ret
-ENDPROC(verify_cpu)
+SYM_FUNC_END(verify_cpu)
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index e2feacf921a0..3a1a819da137 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -21,6 +21,9 @@
#define LOAD_OFFSET __START_KERNEL_map
#endif
+#define EMITS_PT_NOTE
+#define RO_EXCEPTION_TABLE_ALIGN 16
+
#include <asm-generic/vmlinux.lds.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
@@ -141,17 +144,12 @@ SECTIONS
*(.text.__x86.indirect_thunk)
__indirect_thunk_end = .;
#endif
+ } :text =0xcccc
- /* End of text section */
- _etext = .;
- } :text = 0x9090
-
- NOTES :text :note
-
- EXCEPTION_TABLE(16) :text = 0x9090
-
- /* .text should occupy whole number of pages */
+ /* End of text section, which should occupy whole number of pages */
+ _etext = .;
. = ALIGN(PAGE_SIZE);
+
X86_ALIGN_RODATA_BEGIN
RO_DATA(PAGE_SIZE)
X86_ALIGN_RODATA_END
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 18a799c8fa28..ce89430a7f80 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -31,6 +31,28 @@ static int __init iommu_init_noop(void) { return 0; }
static void iommu_shutdown_noop(void) { }
bool __init bool_x86_init_noop(void) { return false; }
void x86_op_int_noop(int cpu) { }
+static __init int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; }
+static __init void get_rtc_noop(struct timespec64 *now) { }
+
+static __initconst const struct of_device_id of_cmos_match[] = {
+ { .compatible = "motorola,mc146818" },
+ {}
+};
+
+/*
+ * Allow devicetree configured systems to disable the RTC by setting the
+ * corresponding DT node's status property to disabled. Code is optimized
+ * out for CONFIG_OF=n builds.
+ */
+static __init void x86_wallclock_init(void)
+{
+ struct device_node *node = of_find_matching_node(NULL, of_cmos_match);
+
+ if (node && !of_device_is_available(node)) {
+ x86_platform.get_wallclock = get_rtc_noop;
+ x86_platform.set_wallclock = set_rtc_noop;
+ }
+}
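/*
 * A hypothetical devicetree fragment that would take the branch above and
 * stub out the RTC callbacks (node name and address are illustrative; only
 * the compatible string comes from of_cmos_match):
 *
 *	rtc@70 {
 *		compatible = "motorola,mc146818";
 *		status = "disabled";
 *	};
 */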
/*
* The platform setup functions are preset with the default functions
@@ -73,7 +95,7 @@ struct x86_init_ops x86_init __initdata = {
.timers = {
.setup_percpu_clockev = setup_boot_APIC_clock,
.timer_init = hpet_time_init,
- .wallclock_init = x86_init_noop,
+ .wallclock_init = x86_wallclock_init,
},
.iommu = {