diff options
Diffstat (limited to 'arch/x86/kernel')
65 files changed, 2019 insertions, 2773 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 3578ad248bc9..32acb970f416 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -134,7 +134,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o -obj-$(CONFIG_X86_INTEL_UMIP) += umip.o +obj-$(CONFIG_X86_UMIP) += umip.o obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o @@ -146,7 +146,6 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_AUDIT) += audit_64.o obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o - obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o obj-$(CONFIG_MMCONF_FAM10H) += mmconf-fam10h_64.o obj-y += vsmp_64.o diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S index e95e95960156..daf88f8143c5 100644 --- a/arch/x86/kernel/acpi/wakeup_32.S +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -9,8 +9,7 @@ .code32 ALIGN -ENTRY(wakeup_pmode_return) -wakeup_pmode_return: +SYM_CODE_START(wakeup_pmode_return) movw $__KERNEL_DS, %ax movw %ax, %ss movw %ax, %fs @@ -39,6 +38,7 @@ wakeup_pmode_return: # jump to place where we left off movl saved_eip, %eax jmp *%eax +SYM_CODE_END(wakeup_pmode_return) bogus_magic: jmp bogus_magic @@ -72,7 +72,7 @@ restore_registers: popfl ret -ENTRY(do_suspend_lowlevel) +SYM_CODE_START(do_suspend_lowlevel) call save_processor_state call save_registers pushl $3 @@ -87,10 +87,11 @@ ret_point: call restore_registers call restore_processor_state ret +SYM_CODE_END(do_suspend_lowlevel) .data ALIGN -ENTRY(saved_magic) .long 0 +SYM_DATA(saved_magic, .long 0) saved_eip: .long 0 # saved registers diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 7f9ade13bbcf..c8daa92f38dc 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -14,7 +14,7 @@ /* * Hooray, we are in Long 64-bit mode (but still running in low memory) */ -ENTRY(wakeup_long64) +SYM_FUNC_START(wakeup_long64) movq saved_magic, %rax movq $0x123456789abcdef0, %rdx cmpq %rdx, %rax @@ -40,9 +40,9 @@ ENTRY(wakeup_long64) movq saved_rip, %rax jmp *%rax -ENDPROC(wakeup_long64) +SYM_FUNC_END(wakeup_long64) -ENTRY(do_suspend_lowlevel) +SYM_FUNC_START(do_suspend_lowlevel) FRAME_BEGIN subq $8, %rsp xorl %eax, %eax @@ -125,7 +125,7 @@ ENTRY(do_suspend_lowlevel) addq $8, %rsp FRAME_END jmp restore_processor_state -ENDPROC(do_suspend_lowlevel) +SYM_FUNC_END(do_suspend_lowlevel) .data saved_rbp: .quad 0 @@ -136,4 +136,4 @@ saved_rbx: .quad 0 saved_rip: .quad 0 saved_rsp: .quad 0 -ENTRY(saved_magic) .quad 0 +SYM_DATA(saved_magic, .quad 0) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 9d3a971ea364..9ec463fe96f2 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -956,16 +956,15 @@ NOKPROBE_SYMBOL(patch_cmp); int poke_int3_handler(struct pt_regs *regs) { struct text_poke_loc *tp; - unsigned char int3 = 0xcc; void *ip; /* * Having observed our INT3 instruction, we now must observe * bp_patching.nr_entries. * - * nr_entries != 0 INT3 - * WMB RMB - * write INT3 if (nr_entries) + * nr_entries != 0 INT3 + * WMB RMB + * write INT3 if (nr_entries) * * Idem for other elements in bp_patching. */ @@ -978,9 +977,9 @@ int poke_int3_handler(struct pt_regs *regs) return 0; /* - * Discount the sizeof(int3). See text_poke_bp_batch(). + * Discount the INT3. See text_poke_bp_batch(). */ - ip = (void *) regs->ip - sizeof(int3); + ip = (void *) regs->ip - INT3_INSN_SIZE; /* * Skip the binary search if there is a single member in the vector. @@ -997,8 +996,28 @@ int poke_int3_handler(struct pt_regs *regs) return 0; } - /* set up the specified breakpoint detour */ - regs->ip = (unsigned long) tp->detour; + ip += tp->len; + + switch (tp->opcode) { + case INT3_INSN_OPCODE: + /* + * Someone poked an explicit INT3, they'll want to handle it, + * do not consume. + */ + return 0; + + case CALL_INSN_OPCODE: + int3_emulate_call(regs, (long)ip + tp->rel32); + break; + + case JMP32_INSN_OPCODE: + case JMP8_INSN_OPCODE: + int3_emulate_jmp(regs, (long)ip + tp->rel32); + break; + + default: + BUG(); + } return 1; } @@ -1014,7 +1033,7 @@ NOKPROBE_SYMBOL(poke_int3_handler); * synchronization using int3 breakpoint. * * The way it is done: - * - For each entry in the vector: + * - For each entry in the vector: * - add a int3 trap to the address that will be patched * - sync cores * - For each entry in the vector: @@ -1027,9 +1046,9 @@ NOKPROBE_SYMBOL(poke_int3_handler); */ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) { - int patched_all_but_first = 0; - unsigned char int3 = 0xcc; + unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; + int do_sync; lockdep_assert_held(&text_mutex); @@ -1053,16 +1072,16 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) /* * Second step: update all but the first byte of the patched range. */ - for (i = 0; i < nr_entries; i++) { + for (do_sync = 0, i = 0; i < nr_entries; i++) { if (tp[i].len - sizeof(int3) > 0) { text_poke((char *)tp[i].addr + sizeof(int3), - (const char *)tp[i].opcode + sizeof(int3), + (const char *)tp[i].text + sizeof(int3), tp[i].len - sizeof(int3)); - patched_all_but_first++; + do_sync++; } } - if (patched_all_but_first) { + if (do_sync) { /* * According to Intel, this core syncing is very likely * not necessary and we'd be safe even without it. But @@ -1075,10 +1094,17 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) * Third step: replace the first byte (int3) by the first byte of * replacing opcode. */ - for (i = 0; i < nr_entries; i++) - text_poke(tp[i].addr, tp[i].opcode, sizeof(int3)); + for (do_sync = 0, i = 0; i < nr_entries; i++) { + if (tp[i].text[0] == INT3_INSN_OPCODE) + continue; + + text_poke(tp[i].addr, tp[i].text, sizeof(int3)); + do_sync++; + } + + if (do_sync) + on_each_cpu(do_sync_core, NULL, 1); - on_each_cpu(do_sync_core, NULL, 1); /* * sync_core() implies an smp_mb() and orders this store against * the writing of the new instruction. @@ -1087,6 +1113,60 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) bp_patching.nr_entries = 0; } +void text_poke_loc_init(struct text_poke_loc *tp, void *addr, + const void *opcode, size_t len, const void *emulate) +{ + struct insn insn; + + if (!opcode) + opcode = (void *)tp->text; + else + memcpy((void *)tp->text, opcode, len); + + if (!emulate) + emulate = opcode; + + kernel_insn_init(&insn, emulate, MAX_INSN_SIZE); + insn_get_length(&insn); + + BUG_ON(!insn_complete(&insn)); + BUG_ON(len != insn.length); + + tp->addr = addr; + tp->len = len; + tp->opcode = insn.opcode.bytes[0]; + + switch (tp->opcode) { + case INT3_INSN_OPCODE: + break; + + case CALL_INSN_OPCODE: + case JMP32_INSN_OPCODE: + case JMP8_INSN_OPCODE: + tp->rel32 = insn.immediate.value; + break; + + default: /* assume NOP */ + switch (len) { + case 2: /* NOP2 -- emulate as JMP8+0 */ + BUG_ON(memcmp(emulate, ideal_nops[len], len)); + tp->opcode = JMP8_INSN_OPCODE; + tp->rel32 = 0; + break; + + case 5: /* NOP5 -- emulate as JMP32+0 */ + BUG_ON(memcmp(emulate, ideal_nops[NOP_ATOMIC5], len)); + tp->opcode = JMP32_INSN_OPCODE; + tp->rel32 = 0; + break; + + default: /* unknown instruction */ + BUG(); + } + break; + } +} + /** * text_poke_bp() -- update instructions on live kernel on SMP * @addr: address to patch @@ -1098,20 +1178,10 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) * dynamically allocated memory. This function should be used when it is * not possible to allocate memory. */ -void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) +void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) { - struct text_poke_loc tp = { - .detour = handler, - .addr = addr, - .len = len, - }; - - if (len > POKE_MAX_OPCODE_SIZE) { - WARN_ONCE(1, "len is larger than %d\n", POKE_MAX_OPCODE_SIZE); - return; - } - - memcpy((void *)tp.opcode, opcode, len); + struct text_poke_loc tp; + text_poke_loc_init(&tp, addr, opcode, len, emulate); text_poke_bp_batch(&tp, 1); } diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index a6ac3712db8b..4bbccb9d16dc 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -510,10 +510,9 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; if (iommu_size < 64*1024*1024) { - pr_warning( - "PCI-DMA: Warning: Small IOMMU %luMB." + pr_warn("PCI-DMA: Warning: Small IOMMU %luMB." " Consider increasing the AGP aperture in BIOS\n", - iommu_size >> 20); + iommu_size >> 20); } return iommu_size; @@ -665,8 +664,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info) nommu: /* Should not happen anymore */ - pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n" - "falling back to iommu=soft.\n"); + pr_warn("PCI-DMA: More than 4GB of RAM and no IOMMU - falling back to iommu=soft.\n"); return -1; } @@ -733,8 +731,8 @@ int __init gart_iommu_init(void) !gart_iommu_aperture || (no_agp && init_amd_gatt(&info) < 0)) { if (max_pfn > MAX_DMA32_PFN) { - pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); - pr_warning("falling back to iommu=soft.\n"); + pr_warn("More than 4GB of memory but GART IOMMU not available.\n"); + pr_warn("falling back to iommu=soft.\n"); } return 0; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2b0faf86da1b..28446fa6bf18 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -780,8 +780,8 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) res = (((u64)deltapm) * mult) >> 22; do_div(res, 1000000); - pr_warning("APIC calibration not consistent " - "with PM-Timer: %ldms instead of 100ms\n",(long)res); + pr_warn("APIC calibration not consistent " + "with PM-Timer: %ldms instead of 100ms\n", (long)res); /* Correct the lapic counter value */ res = (((u64)(*delta)) * pm_100ms); @@ -977,7 +977,7 @@ static int __init calibrate_APIC_clock(void) */ if (lapic_timer_period < (1000000 / HZ)) { local_irq_enable(); - pr_warning("APIC frequency too slow, disabling apic timer\n"); + pr_warn("APIC frequency too slow, disabling apic timer\n"); return -1; } @@ -1021,7 +1021,7 @@ static int __init calibrate_APIC_clock(void) local_irq_enable(); if (levt->features & CLOCK_EVT_FEAT_DUMMY) { - pr_warning("APIC timer disabled due to verification failure\n"); + pr_warn("APIC timer disabled due to verification failure\n"); return -1; } @@ -1095,8 +1095,8 @@ static void local_apic_timer_interrupt(void) * spurious. */ if (!evt->event_handler) { - pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", - smp_processor_id()); + pr_warn("Spurious LAPIC timer interrupt on cpu %d\n", + smp_processor_id()); /* Switch it off */ lapic_timer_shutdown(evt); return; @@ -1811,11 +1811,11 @@ static int __init setup_nox2apic(char *str) int apicid = native_apic_msr_read(APIC_ID); if (apicid >= 255) { - pr_warning("Apicid: %08x, cannot enforce nox2apic\n", - apicid); + pr_warn("Apicid: %08x, cannot enforce nox2apic\n", + apicid); return 0; } - pr_warning("x2apic already enabled.\n"); + pr_warn("x2apic already enabled.\n"); __x2apic_disable(); } setup_clear_cpu_cap(X86_FEATURE_X2APIC); @@ -1983,7 +1983,7 @@ static int __init apic_verify(void) */ features = cpuid_edx(1); if (!(features & (1 << X86_FEATURE_APIC))) { - pr_warning("Could not enable APIC!\n"); + pr_warn("Could not enable APIC!\n"); return -1; } set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); @@ -2337,7 +2337,7 @@ static int cpuid_to_apicid[] = { #ifdef CONFIG_SMP /** * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread - * @id: APIC ID to check + * @apicid: APIC ID to check */ bool apic_id_is_primary_thread(unsigned int apicid) { @@ -2410,9 +2410,8 @@ int generic_processor_info(int apicid, int version) disabled_cpu_apicid == apicid) { int thiscpu = num_processors + disabled_cpus; - pr_warning("APIC: Disabling requested cpu." - " Processor %d/0x%x ignored.\n", - thiscpu, apicid); + pr_warn("APIC: Disabling requested cpu." + " Processor %d/0x%x ignored.\n", thiscpu, apicid); disabled_cpus++; return -ENODEV; @@ -2426,8 +2425,7 @@ int generic_processor_info(int apicid, int version) apicid != boot_cpu_physical_apicid) { int thiscpu = max + disabled_cpus - 1; - pr_warning( - "APIC: NR_CPUS/possible_cpus limit of %i almost" + pr_warn("APIC: NR_CPUS/possible_cpus limit of %i almost" " reached. Keeping one slot for boot cpu." " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); @@ -2438,9 +2436,8 @@ int generic_processor_info(int apicid, int version) if (num_processors >= nr_cpu_ids) { int thiscpu = max + disabled_cpus; - pr_warning("APIC: NR_CPUS/possible_cpus limit of %i " - "reached. Processor %d/0x%x ignored.\n", - max, thiscpu, apicid); + pr_warn("APIC: NR_CPUS/possible_cpus limit of %i reached. " + "Processor %d/0x%x ignored.\n", max, thiscpu, apicid); disabled_cpus++; return -EINVAL; @@ -2470,13 +2467,13 @@ int generic_processor_info(int apicid, int version) * Validate version */ if (version == 0x0) { - pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n", - cpu, apicid); + pr_warn("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n", + cpu, apicid); version = 0x10; } if (version != boot_cpu_apic_version) { - pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", + pr_warn("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", boot_cpu_apic_version, cpu, version); } @@ -2845,7 +2842,7 @@ static int __init apic_set_verbosity(char *arg) apic_verbosity = APIC_VERBOSE; #ifdef CONFIG_X86_64 else { - pr_warning("APIC Verbosity level %s not recognised" + pr_warn("APIC Verbosity level %s not recognised" " use apic=verbose or apic=debug\n", arg); return -EINVAL; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d6af97fd170a..913c88617848 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1725,19 +1725,20 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data) return false; } -static inline bool ioapic_irqd_mask(struct irq_data *data) +static inline bool ioapic_prepare_move(struct irq_data *data) { - /* If we are moving the irq we need to mask it */ + /* If we are moving the IRQ we need to mask it */ if (unlikely(irqd_is_setaffinity_pending(data))) { - mask_ioapic_irq(data); + if (!irqd_irq_masked(data)) + mask_ioapic_irq(data); return true; } return false; } -static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) +static inline void ioapic_finish_move(struct irq_data *data, bool moveit) { - if (unlikely(masked)) { + if (unlikely(moveit)) { /* Only migrate the irq if the ack has been received. * * On rare occasions the broadcast level triggered ack gets @@ -1766,15 +1767,17 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) */ if (!io_apic_level_ack_pending(data->chip_data)) irq_move_masked_irq(data); - unmask_ioapic_irq(data); + /* If the IRQ is masked in the core, leave it: */ + if (!irqd_irq_masked(data)) + unmask_ioapic_irq(data); } } #else -static inline bool ioapic_irqd_mask(struct irq_data *data) +static inline bool ioapic_prepare_move(struct irq_data *data) { return false; } -static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) +static inline void ioapic_finish_move(struct irq_data *data, bool moveit) { } #endif @@ -1783,11 +1786,11 @@ static void ioapic_ack_level(struct irq_data *irq_data) { struct irq_cfg *cfg = irqd_cfg(irq_data); unsigned long v; - bool masked; + bool moveit; int i; irq_complete_move(cfg); - masked = ioapic_irqd_mask(irq_data); + moveit = ioapic_prepare_move(irq_data); /* * It appears there is an erratum which affects at least version 0x11 @@ -1842,7 +1845,7 @@ static void ioapic_ack_level(struct irq_data *irq_data) eoi_ioapic_pin(cfg->vector, irq_data->chip_data); } - ioapic_irqd_unmask(irq_data, masked); + ioapic_finish_move(irq_data, moveit); } static void ioapic_ir_ack_level(struct irq_data *irq_data) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index e6230af19864..d5b51a740524 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -14,6 +14,8 @@ #include <linux/memory.h> #include <linux/export.h> #include <linux/pci.h> +#include <linux/acpi.h> +#include <linux/efi.h> #include <asm/e820/api.h> #include <asm/uv/uv_mmrs.h> @@ -25,12 +27,17 @@ static DEFINE_PER_CPU(int, x2apic_extra_bits); static enum uv_system_type uv_system_type; -static bool uv_hubless_system; +static int uv_hubbed_system; +static int uv_hubless_system; static u64 gru_start_paddr, gru_end_paddr; static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr; static u64 gru_dist_lmask, gru_dist_umask; static union uvh_apicid uvh_apicid; +/* Unpack OEM/TABLE ID's to be NULL terminated strings */ +static u8 oem_id[ACPI_OEM_ID_SIZE + 1]; +static u8 oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1]; + /* Information derived from CPUID: */ static struct { unsigned int apicid_shift; @@ -248,17 +255,35 @@ static void __init uv_set_apicid_hibit(void) } } -static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static void __init uv_stringify(int len, char *to, char *from) +{ + /* Relies on 'to' being NULL chars so result will be NULL terminated */ + strncpy(to, from, len-1); +} + +static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) { int pnodeid; int uv_apic; + uv_stringify(sizeof(oem_id), oem_id, _oem_id); + uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); + if (strncmp(oem_id, "SGI", 3) != 0) { - if (strncmp(oem_id, "NSGI", 4) == 0) { - uv_hubless_system = true; - pr_info("UV: OEM IDs %s/%s, HUBLESS\n", - oem_id, oem_table_id); - } + if (strncmp(oem_id, "NSGI", 4) != 0) + return 0; + + /* UV4 Hubless, CH, (0x11:UV4+Any) */ + if (strncmp(oem_id, "NSGI4", 5) == 0) + uv_hubless_system = 0x11; + + /* UV3 Hubless, UV300/MC990X w/o hub (0x9:UV3+Any) */ + else + uv_hubless_system = 0x9; + + pr_info("UV: OEM IDs %s/%s, HUBLESS(0x%x)\n", + oem_id, oem_table_id, uv_hubless_system); + return 0; } @@ -286,6 +311,24 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) if (uv_hub_info->hub_revision == 0) goto badbios; + switch (uv_hub_info->hub_revision) { + case UV4_HUB_REVISION_BASE: + uv_hubbed_system = 0x11; + break; + + case UV3_HUB_REVISION_BASE: + uv_hubbed_system = 0x9; + break; + + case UV2_HUB_REVISION_BASE: + uv_hubbed_system = 0x5; + break; + + case UV1_HUB_REVISION_BASE: + uv_hubbed_system = 0x3; + break; + } + pnodeid = early_get_pnodeid(); early_get_apic_socketid_shift(); @@ -336,9 +379,15 @@ int is_uv_system(void) } EXPORT_SYMBOL_GPL(is_uv_system); -int is_uv_hubless(void) +int is_uv_hubbed(int uvtype) +{ + return (uv_hubbed_system & uvtype); +} +EXPORT_SYMBOL_GPL(is_uv_hubbed); + +int is_uv_hubless(int uvtype) { - return uv_hubless_system; + return (uv_hubless_system & uvtype); } EXPORT_SYMBOL_GPL(is_uv_hubless); @@ -1255,7 +1304,8 @@ static int __init decode_uv_systab(void) struct uv_systab *st; int i; - if (uv_hub_info->hub_revision < UV4_HUB_REVISION_BASE) + /* If system is uv3 or lower, there is no extended UVsystab */ + if (is_uv_hubbed(0xfffffe) < uv(4) && is_uv_hubless(0xfffffe) < uv(4)) return 0; /* No extended UVsystab required */ st = uv_systab; @@ -1434,6 +1484,103 @@ static void __init build_socket_tables(void) } } +/* Check which reboot to use */ +static void check_efi_reboot(void) +{ + /* If EFI reboot not available, use ACPI reboot */ + if (!efi_enabled(EFI_BOOT)) + reboot_type = BOOT_ACPI; +} + +/* Setup user proc fs files */ +static int proc_hubbed_show(struct seq_file *file, void *data) +{ + seq_printf(file, "0x%x\n", uv_hubbed_system); + return 0; +} + +static int proc_hubless_show(struct seq_file *file, void *data) +{ + seq_printf(file, "0x%x\n", uv_hubless_system); + return 0; +} + +static int proc_oemid_show(struct seq_file *file, void *data) +{ + seq_printf(file, "%s/%s\n", oem_id, oem_table_id); + return 0; +} + +static int proc_hubbed_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_hubbed_show, (void *)NULL); +} + +static int proc_hubless_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_hubless_show, (void *)NULL); +} + +static int proc_oemid_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_oemid_show, (void *)NULL); +} + +/* (struct is "non-const" as open function is set at runtime) */ +static struct file_operations proc_version_fops = { + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations proc_oemid_fops = { + .open = proc_oemid_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static __init void uv_setup_proc_files(int hubless) +{ + struct proc_dir_entry *pde; + char *name = hubless ? "hubless" : "hubbed"; + + pde = proc_mkdir(UV_PROC_NODE, NULL); + proc_create("oemid", 0, pde, &proc_oemid_fops); + proc_create(name, 0, pde, &proc_version_fops); + if (hubless) + proc_version_fops.open = proc_hubless_open; + else + proc_version_fops.open = proc_hubbed_open; +} + +/* Initialize UV hubless systems */ +static __init int uv_system_init_hubless(void) +{ + int rc; + + /* Setup PCH NMI handler */ + uv_nmi_setup_hubless(); + + /* Init kernel/BIOS interface */ + rc = uv_bios_init(); + if (rc < 0) + return rc; + + /* Process UVsystab */ + rc = decode_uv_systab(); + if (rc < 0) + return rc; + + /* Create user access node */ + if (rc >= 0) + uv_setup_proc_files(1); + + check_efi_reboot(); + + return rc; +} + static void __init uv_system_init_hub(void) { struct uv_hub_info_s hub_info = {0}; @@ -1559,32 +1706,27 @@ static void __init uv_system_init_hub(void) uv_nmi_setup(); uv_cpu_init(); uv_scir_register_cpu_notifier(); - proc_mkdir("sgi_uv", NULL); + uv_setup_proc_files(0); /* Register Legacy VGA I/O redirection handler: */ pci_register_set_vga_state(uv_set_vga_state); - /* - * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as - * EFI is not enabled in the kdump kernel: - */ - if (is_kdump_kernel()) - reboot_type = BOOT_ACPI; + check_efi_reboot(); } /* - * There is a small amount of UV specific code needed to initialize a - * UV system that does not have a "UV HUB" (referred to as "hubless"). + * There is a different code path needed to initialize a UV system that does + * not have a "UV HUB" (referred to as "hubless"). */ void __init uv_system_init(void) { - if (likely(!is_uv_system() && !is_uv_hubless())) + if (likely(!is_uv_system() && !is_uv_hubless(1))) return; if (is_uv_system()) uv_system_init_hub(); else - uv_nmi_setup_hubless(); + uv_system_init_hubless(); } apic_driver(apic_x2apic_uv_x); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index d7a1e5a9331c..890f60083eca 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -30,7 +30,7 @@ obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o ifdef CONFIG_CPU_SUP_INTEL -obj-y += intel.o intel_pconfig.o +obj-y += intel.o intel_pconfig.o tsx.o obj-$(CONFIG_PM) += intel_epb.o endif obj-$(CONFIG_CPU_SUP_AMD) += amd.o diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 91c2561b905f..8bf64899f56a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -39,6 +39,8 @@ static void __init spectre_v2_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); +static void __init mds_print_mitigation(void); +static void __init taa_select_mitigation(void); /* The base value of the SPEC_CTRL MSR that always has to be preserved. */ u64 x86_spec_ctrl_base; @@ -105,6 +107,13 @@ void __init check_bugs(void) ssb_select_mitigation(); l1tf_select_mitigation(); mds_select_mitigation(); + taa_select_mitigation(); + + /* + * As MDS and TAA mitigations are inter-related, print MDS + * mitigation until after TAA mitigation selection is done. + */ + mds_print_mitigation(); arch_smt_update(); @@ -243,6 +252,12 @@ static void __init mds_select_mitigation(void) (mds_nosmt || cpu_mitigations_auto_nosmt())) cpu_smt_disable(false); } +} + +static void __init mds_print_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) + return; pr_info("%s\n", mds_strings[mds_mitigation]); } @@ -269,6 +284,113 @@ static int __init mds_cmdline(char *str) early_param("mds", mds_cmdline); #undef pr_fmt +#define pr_fmt(fmt) "TAA: " fmt + +/* Default mitigation for TAA-affected CPUs */ +static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW; +static bool taa_nosmt __ro_after_init; + +static const char * const taa_strings[] = { + [TAA_MITIGATION_OFF] = "Vulnerable", + [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", + [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", + [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled", +}; + +static void __init taa_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_TAA)) { + taa_mitigation = TAA_MITIGATION_OFF; + return; + } + + /* TSX previously disabled by tsx=off */ + if (!boot_cpu_has(X86_FEATURE_RTM)) { + taa_mitigation = TAA_MITIGATION_TSX_DISABLED; + goto out; + } + + if (cpu_mitigations_off()) { + taa_mitigation = TAA_MITIGATION_OFF; + return; + } + + /* + * TAA mitigation via VERW is turned off if both + * tsx_async_abort=off and mds=off are specified. + */ + if (taa_mitigation == TAA_MITIGATION_OFF && + mds_mitigation == MDS_MITIGATION_OFF) + goto out; + + if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) + taa_mitigation = TAA_MITIGATION_VERW; + else + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + + /* + * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. + * A microcode update fixes this behavior to clear CPU buffers. It also + * adds support for MSR_IA32_TSX_CTRL which is enumerated by the + * ARCH_CAP_TSX_CTRL_MSR bit. + * + * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode + * update is required. + */ + ia32_cap = x86_read_arch_cap_msr(); + if ( (ia32_cap & ARCH_CAP_MDS_NO) && + !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)) + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + + /* + * TSX is enabled, select alternate mitigation for TAA which is + * the same as MDS. Enable MDS static branch to clear CPU buffers. + * + * For guests that can't determine whether the correct microcode is + * present on host, enable the mitigation for UCODE_NEEDED as well. + */ + static_branch_enable(&mds_user_clear); + + if (taa_nosmt || cpu_mitigations_auto_nosmt()) + cpu_smt_disable(false); + + /* + * Update MDS mitigation, if necessary, as the mds_user_clear is + * now enabled for TAA mitigation. + */ + if (mds_mitigation == MDS_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_MDS)) { + mds_mitigation = MDS_MITIGATION_FULL; + mds_select_mitigation(); + } +out: + pr_info("%s\n", taa_strings[taa_mitigation]); +} + +static int __init tsx_async_abort_parse_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_TAA)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) { + taa_mitigation = TAA_MITIGATION_OFF; + } else if (!strcmp(str, "full")) { + taa_mitigation = TAA_MITIGATION_VERW; + } else if (!strcmp(str, "full,nosmt")) { + taa_mitigation = TAA_MITIGATION_VERW; + taa_nosmt = true; + } + + return 0; +} +early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); + +#undef pr_fmt #define pr_fmt(fmt) "Spectre V1 : " fmt enum spectre_v1_mitigation { @@ -786,13 +908,10 @@ static void update_mds_branch_idle(void) } #define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" +#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" void cpu_bugs_smt_update(void) { - /* Enhanced IBRS implies STIBP. No update required. */ - if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) - return; - mutex_lock(&spec_ctrl_mutex); switch (spectre_v2_user) { @@ -819,6 +938,17 @@ void cpu_bugs_smt_update(void) break; } + switch (taa_mitigation) { + case TAA_MITIGATION_VERW: + case TAA_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(TAA_MSG_SMT); + break; + case TAA_MITIGATION_TSX_DISABLED: + case TAA_MITIGATION_OFF: + break; + } + mutex_unlock(&spec_ctrl_mutex); } @@ -1149,6 +1279,9 @@ void x86_spec_ctrl_setup_ap(void) x86_amd_ssb_disable(); } +bool itlb_multihit_kvm_mitigation; +EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); + #undef pr_fmt #define pr_fmt(fmt) "L1TF: " fmt @@ -1304,11 +1437,24 @@ static ssize_t l1tf_show_state(char *buf) l1tf_vmx_states[l1tf_vmx_mitigation], sched_smt_active() ? "vulnerable" : "disabled"); } + +static ssize_t itlb_multihit_show_state(char *buf) +{ + if (itlb_multihit_kvm_mitigation) + return sprintf(buf, "KVM: Mitigation: Split huge pages\n"); + else + return sprintf(buf, "KVM: Vulnerable\n"); +} #else static ssize_t l1tf_show_state(char *buf) { return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); } + +static ssize_t itlb_multihit_show_state(char *buf) +{ + return sprintf(buf, "Processor vulnerable\n"); +} #endif static ssize_t mds_show_state(char *buf) @@ -1328,6 +1474,21 @@ static ssize_t mds_show_state(char *buf) sched_smt_active() ? "vulnerable" : "disabled"); } +static ssize_t tsx_async_abort_show_state(char *buf) +{ + if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) || + (taa_mitigation == TAA_MITIGATION_OFF)) + return sprintf(buf, "%s\n", taa_strings[taa_mitigation]); + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + return sprintf(buf, "%s; SMT Host state unknown\n", + taa_strings[taa_mitigation]); + } + + return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); +} + static char *stibp_state(void) { if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) @@ -1398,6 +1559,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_MDS: return mds_show_state(buf); + case X86_BUG_TAA: + return tsx_async_abort_show_state(buf); + + case X86_BUG_ITLB_MULTIHIT: + return itlb_multihit_show_state(buf); + default: break; } @@ -1434,4 +1601,14 @@ ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *bu { return cpu_show_common(dev, attr, buf, X86_BUG_MDS); } + +ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_TAA); +} + +ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9ae7d1bcd4f4..baa2fed8deb6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -53,10 +53,7 @@ #include <asm/microcode_intel.h> #include <asm/intel-family.h> #include <asm/cpu_device_id.h> - -#ifdef CONFIG_X86_LOCAL_APIC #include <asm/uv/uv.h> -#endif #include "cpu.h" @@ -565,8 +562,9 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c) return NULL; /* Not found */ } -__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; -__u32 cpu_caps_set[NCAPINTS + NBUGINTS]; +/* Aligned to unsigned long to avoid split lock in atomic bitmap ops */ +__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); +__u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); void load_percpu_segment(int cpu) { @@ -1016,13 +1014,14 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #endif } -#define NO_SPECULATION BIT(0) -#define NO_MELTDOWN BIT(1) -#define NO_SSB BIT(2) -#define NO_L1TF BIT(3) -#define NO_MDS BIT(4) -#define MSBDS_ONLY BIT(5) -#define NO_SWAPGS BIT(6) +#define NO_SPECULATION BIT(0) +#define NO_MELTDOWN BIT(1) +#define NO_SSB BIT(2) +#define NO_L1TF BIT(3) +#define NO_MDS BIT(4) +#define MSBDS_ONLY BIT(5) +#define NO_SWAPGS BIT(6) +#define NO_ITLB_MULTIHIT BIT(7) #define VULNWL(_vendor, _family, _model, _whitelist) \ { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } @@ -1043,27 +1042,27 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), /* Intel Family 6 */ - VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION), - VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION), - VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION), - VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION), - VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION), - - VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), + VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + + VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), VULNWL_INTEL(CORE_YONAH, NO_SSB), - VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS), + VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS), - VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS), + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), /* * Technically, swapgs isn't serializing on AMD (despite it previously @@ -1073,15 +1072,17 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { * good enough for our purposes. */ + VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT), + /* AMD Family 0xf - 0x12 */ - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), {} }; @@ -1092,19 +1093,30 @@ static bool __init cpu_matches(unsigned long which) return m && !!(m->driver_data & which); } -static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) +u64 x86_read_arch_cap_msr(void) { u64 ia32_cap = 0; + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + + return ia32_cap; +} + +static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) +{ + u64 ia32_cap = x86_read_arch_cap_msr(); + + /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ + if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); + if (cpu_matches(NO_SPECULATION)) return; setup_force_cpu_bug(X86_BUG_SPECTRE_V1); setup_force_cpu_bug(X86_BUG_SPECTRE_V2); - if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); - if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); @@ -1121,6 +1133,21 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (!cpu_matches(NO_SWAPGS)) setup_force_cpu_bug(X86_BUG_SWAPGS); + /* + * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when: + * - TSX is supported or + * - TSX_CTRL is present + * + * TSX_CTRL check is needed for cases when TSX could be disabled before + * the kernel boot e.g. kexec. + * TSX_CTRL check alone is not sufficient for cases when the microcode + * update is not present or running as guest that don't get TSX_CTRL. + */ + if (!(ia32_cap & ARCH_CAP_TAA_NO) && + (cpu_has(c, X86_FEATURE_RTM) || + (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) + setup_force_cpu_bug(X86_BUG_TAA); + if (cpu_matches(NO_MELTDOWN)) return; @@ -1554,6 +1581,8 @@ void __init identify_boot_cpu(void) #endif cpu_detect_tlb(&boot_cpu_data); setup_cr_pinning(); + + tsx_init(); } void identify_secondary_cpu(struct cpuinfo_x86 *c) @@ -1749,7 +1778,7 @@ static void wait_for_master_cpu(int cpu) } #ifdef CONFIG_X86_64 -static void setup_getcpu(int cpu) +static inline void setup_getcpu(int cpu) { unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu)); struct desc_struct d = { }; @@ -1769,7 +1798,59 @@ static void setup_getcpu(int cpu) write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S); } + +static inline void ucode_cpu_init(int cpu) +{ + if (cpu) + load_ucode_ap(); +} + +static inline void tss_setup_ist(struct tss_struct *tss) +{ + /* Set up the per-CPU TSS IST stacks */ + tss->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF); + tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI); + tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB); + tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); +} + +static inline void gdt_setup_doublefault_tss(int cpu) { } + +#else /* CONFIG_X86_64 */ + +static inline void setup_getcpu(int cpu) { } + +static inline void ucode_cpu_init(int cpu) +{ + show_ucode_info_early(); +} + +static inline void tss_setup_ist(struct tss_struct *tss) { } + +static inline void gdt_setup_doublefault_tss(int cpu) +{ +#ifdef CONFIG_DOUBLEFAULT + /* Set up the doublefault TSS pointer in the GDT */ + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); +#endif +} +#endif /* !CONFIG_X86_64 */ + +static inline void tss_setup_io_bitmap(struct tss_struct *tss) +{ + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID; + +#ifdef CONFIG_X86_IOPL_IOPERM + tss->io_bitmap.prev_max = 0; + tss->io_bitmap.prev_sequence = 0; + memset(tss->io_bitmap.bitmap, 0xff, sizeof(tss->io_bitmap.bitmap)); + /* + * Invalidate the extra array entry past the end of the all + * permission bitmap as required by the hardware. + */ + tss->io_bitmap.mapall[IO_BITMAP_LONGS] = ~0UL; #endif +} /* * cpu_init() initializes state that is per-CPU. Some data is already @@ -1777,21 +1858,15 @@ static void setup_getcpu(int cpu) * and IDT. We reload them nevertheless, this function acts as a * 'CPU state barrier', nothing should get across. */ -#ifdef CONFIG_X86_64 - void cpu_init(void) { + struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); + struct task_struct *cur = current; int cpu = raw_smp_processor_id(); - struct task_struct *me; - struct tss_struct *t; - int i; wait_for_master_cpu(cpu); - if (cpu) - load_ucode_ap(); - - t = &per_cpu(cpu_tss_rw, cpu); + ucode_cpu_init(cpu); #ifdef CONFIG_NUMA if (this_cpu_read(numa_node) == 0 && @@ -1800,63 +1875,47 @@ void cpu_init(void) #endif setup_getcpu(cpu); - me = current; - pr_debug("Initializing CPU#%d\n", cpu); - cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + if (IS_ENABLED(CONFIG_X86_64) || cpu_feature_enabled(X86_FEATURE_VME) || + boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE)) + cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); /* * Initialize the per-CPU GDT with the boot GDT, * and set up the GDT descriptor: */ - switch_to_new_gdt(cpu); - loadsegment(fs, 0); - load_current_idt(); - memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); - syscall_init(); - - wrmsrl(MSR_FS_BASE, 0); - wrmsrl(MSR_KERNEL_GS_BASE, 0); - barrier(); + if (IS_ENABLED(CONFIG_X86_64)) { + loadsegment(fs, 0); + memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); + syscall_init(); - x86_configure_nx(); - x2apic_setup(); + wrmsrl(MSR_FS_BASE, 0); + wrmsrl(MSR_KERNEL_GS_BASE, 0); + barrier(); - /* - * set up and load the per-CPU TSS - */ - if (!t->x86_tss.ist[0]) { - t->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF); - t->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI); - t->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB); - t->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); + x2apic_setup(); } - t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; - - /* - * <= is required because the CPU will access up to - * 8 bits beyond the end of the IO permission bitmap. - */ - for (i = 0; i <= IO_BITMAP_LONGS; i++) - t->io_bitmap[i] = ~0UL; - mmgrab(&init_mm); - me->active_mm = &init_mm; - BUG_ON(me->mm); + cur->active_mm = &init_mm; + BUG_ON(cur->mm); initialize_tlbstate_and_flush(); - enter_lazy_tlb(&init_mm, me); + enter_lazy_tlb(&init_mm, cur); - /* - * Initialize the TSS. sp0 points to the entry trampoline stack - * regardless of what task is running. - */ + /* Initialize the TSS. */ + tss_setup_ist(tss); + tss_setup_io_bitmap(tss); set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); + load_TR_desc(); + /* + * sp0 points to the entry trampoline stack regardless of what task + * is running. + */ load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1)); load_mm_ldt(&init_mm); @@ -1864,6 +1923,8 @@ void cpu_init(void) clear_all_debug_regs(); dbg_restore_debug_regs(); + gdt_setup_doublefault_tss(cpu); + fpu__init_cpu(); if (is_uv_system()) @@ -1872,63 +1933,6 @@ void cpu_init(void) load_fixmap_gdt(cpu); } -#else - -void cpu_init(void) -{ - int cpu = smp_processor_id(); - struct task_struct *curr = current; - struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu); - - wait_for_master_cpu(cpu); - - show_ucode_info_early(); - - pr_info("Initializing CPU#%d\n", cpu); - - if (cpu_feature_enabled(X86_FEATURE_VME) || - boot_cpu_has(X86_FEATURE_TSC) || - boot_cpu_has(X86_FEATURE_DE)) - cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - - load_current_idt(); - switch_to_new_gdt(cpu); - - /* - * Set up and load the per-CPU TSS and LDT - */ - mmgrab(&init_mm); - curr->active_mm = &init_mm; - BUG_ON(curr->mm); - initialize_tlbstate_and_flush(); - enter_lazy_tlb(&init_mm, curr); - - /* - * Initialize the TSS. sp0 points to the entry trampoline stack - * regardless of what task is running. - */ - set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); - load_TR_desc(); - load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1)); - - load_mm_ldt(&init_mm); - - t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; - -#ifdef CONFIG_DOUBLEFAULT - /* Set up doublefault TSS pointer in the GDT */ - __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); -#endif - - clear_all_debug_regs(); - dbg_restore_debug_regs(); - - fpu__init_cpu(); - - load_fixmap_gdt(cpu); -} -#endif - /* * The microcode loader calls this upon late microcode load to recheck features, * only when microcode has been updated. Caller holds microcode_mutex and CPU diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index c0e2407abdd6..38ab6e115eac 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -44,6 +44,22 @@ struct _tlb_table { extern const struct cpu_dev *const __x86_cpu_dev_start[], *const __x86_cpu_dev_end[]; +#ifdef CONFIG_CPU_SUP_INTEL +enum tsx_ctrl_states { + TSX_CTRL_ENABLE, + TSX_CTRL_DISABLE, + TSX_CTRL_NOT_SUPPORTED, +}; + +extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; + +extern void __init tsx_init(void); +extern void tsx_enable(void); +extern void tsx_disable(void); +#else +static inline void tsx_init(void) { } +#endif /* CONFIG_CPU_SUP_INTEL */ + extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void get_cpu_address_sizes(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); @@ -62,4 +78,6 @@ unsigned int aperfmperf_get_khz(int cpu); extern void x86_spec_ctrl_setup_ap(void); +extern u64 x86_read_arch_cap_msr(void); + #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index c2fdc00df163..4a900804a023 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -762,6 +762,11 @@ static void init_intel(struct cpuinfo_x86 *c) detect_tme(c); init_intel_misc_features(c); + + if (tsx_ctrl_state == TSX_CTRL_ENABLE) + tsx_enable(); + if (tsx_ctrl_state == TSX_CTRL_DISABLE) + tsx_disable(); } #ifdef CONFIG_X86_32 @@ -814,7 +819,7 @@ static const struct _tlb_table intel_tlb_table[] = { { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" }, { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" }, { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" }, - { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" }, + { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" }, { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, @@ -842,7 +847,7 @@ static const struct _tlb_table intel_tlb_table[] = { { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" }, { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" }, - { 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" }, + { 0xc2, TLB_DATA_2M_4M, 16, " TLB_DATA 2 MByte/4MByte pages, 4-way associative" }, { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" }, { 0x00, 0, 0 } }; @@ -854,8 +859,8 @@ static void intel_tlb_lookup(const unsigned char desc) return; /* look up this descriptor in the table */ - for (k = 0; intel_tlb_table[k].descriptor != desc && \ - intel_tlb_table[k].descriptor != 0; k++) + for (k = 0; intel_tlb_table[k].descriptor != desc && + intel_tlb_table[k].descriptor != 0; k++) ; if (intel_tlb_table[k].tlb_type == 0) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 6ea7fdc82f3c..5167bd2bb6b1 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -583,7 +583,7 @@ bool amd_filter_mce(struct mce *m) * - Prevent possible spurious interrupts from the IF bank on Family 0x17 * Models 0x10-0x2F due to Erratum #1114. */ -void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) +static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) { int i, num_msrs; u64 hwcr; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 743370ee4983..5f42f25bac8f 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -488,8 +488,9 @@ int mce_usable_address(struct mce *m) if (!(m->status & MCI_STATUS_ADDRV)) return 0; - /* Checks after this one are Intel-specific: */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + /* Checks after this one are Intel/Zhaoxin-specific: */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && + boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) return 1; if (!(m->status & MCI_STATUS_MISCV)) @@ -507,10 +508,13 @@ EXPORT_SYMBOL_GPL(mce_usable_address); bool mce_is_memory_error(struct mce *m) { - if (m->cpuvendor == X86_VENDOR_AMD || - m->cpuvendor == X86_VENDOR_HYGON) { + switch (m->cpuvendor) { + case X86_VENDOR_AMD: + case X86_VENDOR_HYGON: return amd_mce_is_memory_error(m); - } else if (m->cpuvendor == X86_VENDOR_INTEL) { + + case X86_VENDOR_INTEL: + case X86_VENDOR_ZHAOXIN: /* * Intel SDM Volume 3B - 15.9.2 Compound Error Codes * @@ -527,9 +531,10 @@ bool mce_is_memory_error(struct mce *m) return (m->status & 0xef80) == BIT(7) || (m->status & 0xef00) == BIT(8) || (m->status & 0xeffc) == 0xc; - } - return false; + default: + return false; + } } EXPORT_SYMBOL_GPL(mce_is_memory_error); @@ -1127,6 +1132,12 @@ static bool __mc_check_crashing_cpu(int cpu) u64 mcgstatus; mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + + if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) { + if (mcgstatus & MCG_STATUS_LMCES) + return false; + } + if (mcgstatus & MCG_STATUS_RIPV) { mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); return true; @@ -1277,9 +1288,10 @@ void do_machine_check(struct pt_regs *regs, long error_code) /* * Check if this MCE is signaled to only this logical processor, - * on Intel only. + * on Intel, Zhaoxin only. */ - if (m.cpuvendor == X86_VENDOR_INTEL) + if (m.cpuvendor == X86_VENDOR_INTEL || + m.cpuvendor == X86_VENDOR_ZHAOXIN) lmce = m.mcgstatus & MCG_STATUS_LMCES; /* @@ -1697,6 +1709,18 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model == 45) quirk_no_way_out = quirk_sandybridge_ifu; } + + if (c->x86_vendor == X86_VENDOR_ZHAOXIN) { + /* + * All newer Zhaoxin CPUs support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) { + if (cfg->monarch_timeout < 0) + cfg->monarch_timeout = USEC_PER_SEC; + } + } + if (cfg->monarch_timeout < 0) cfg->monarch_timeout = 0; if (cfg->bootlog != 0) @@ -1760,6 +1784,35 @@ static void mce_centaur_feature_init(struct cpuinfo_x86 *c) } } +static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + + /* + * These CPUs have MCA bank 8 which reports only one error type called + * SVAD (System View Address Decoder). The reporting of that error is + * controlled by IA32_MC8.CTL.0. + * + * If enabled, prefetching on these CPUs will cause SVAD MCE when + * virtual machines start and result in a system panic. Always disable + * bank 8 SVAD error by default. + */ + if ((c->x86 == 7 && c->x86_model == 0x1b) || + (c->x86_model == 0x19 || c->x86_model == 0x1f)) { + if (this_cpu_read(mce_num_banks) > 8) + mce_banks[8].ctl = 0; + } + + intel_init_cmci(); + intel_init_lmce(); + mce_adjust_timer = cmci_intel_adjust_timer; +} + +static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c) +{ + intel_clear_lmce(); +} + static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) { switch (c->x86_vendor) { @@ -1781,6 +1834,10 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) mce_centaur_feature_init(c); break; + case X86_VENDOR_ZHAOXIN: + mce_zhaoxin_feature_init(c); + break; + default: break; } @@ -1792,6 +1849,11 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c) case X86_VENDOR_INTEL: mce_intel_feature_clear(c); break; + + case X86_VENDOR_ZHAOXIN: + mce_zhaoxin_feature_clear(c); + break; + default: break; } @@ -2014,15 +2076,16 @@ static void mce_disable_error_reporting(void) static void vendor_disable_error_reporting(void) { /* - * Don't clear on Intel or AMD or Hygon CPUs. Some of these MSRs - * are socket-wide. - * Disabling them for just a single offlined CPU is bad, since it will - * inhibit reporting for all shared resources on the socket like the - * last level cache (LLC), the integrated memory controller (iMC), etc. + * Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these + * MSRs are socket-wide. Disabling them for just a single offlined CPU + * is bad, since it will inhibit reporting for all shared resources on + * the socket like the last level cache (LLC), the integrated memory + * controller (iMC), etc. */ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON || - boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) return; mce_disable_error_reporting(); diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index 88cd9598fa57..e270d0770134 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -85,8 +85,10 @@ static int cmci_supported(int *banks) * initialization is vendor keyed and this * makes sure none of the backdoors are entered otherwise. */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && + boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) return 0; + if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6) return 0; rdmsrl(MSR_IA32_MCG_CAP, cap); @@ -423,7 +425,7 @@ void cmci_disable_bank(int bank) raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } -static void intel_init_cmci(void) +void intel_init_cmci(void) { int banks; @@ -442,7 +444,7 @@ static void intel_init_cmci(void) cmci_recheck(); } -static void intel_init_lmce(void) +void intel_init_lmce(void) { u64 val; @@ -455,7 +457,7 @@ static void intel_init_lmce(void) wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); } -static void intel_clear_lmce(void) +void intel_clear_lmce(void) { u64 val; @@ -482,6 +484,7 @@ static void intel_ppin_init(struct cpuinfo_x86 *c) case INTEL_FAM6_BROADWELL_D: case INTEL_FAM6_BROADWELL_X: case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_ICELAKE_X: case INTEL_FAM6_XEON_PHI_KNL: case INTEL_FAM6_XEON_PHI_KNM: diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 43031db429d2..842b273bce31 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -45,11 +45,17 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval); bool mce_intel_cmci_poll(void); void mce_intel_hcpu_update(unsigned long cpu); void cmci_disable_bank(int bank); +void intel_init_cmci(void); +void intel_init_lmce(void); +void intel_clear_lmce(void); #else # define cmci_intel_adjust_timer mce_adjust_timer_default static inline bool mce_intel_cmci_poll(void) { return false; } static inline void mce_intel_hcpu_update(unsigned long cpu) { } static inline void cmci_disable_bank(int bank) { } +static inline void intel_init_cmci(void) { } +static inline void intel_init_lmce(void) { } +static inline void intel_clear_lmce(void) { } #endif void mce_timer_kick(unsigned long interval); diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c index 6e2becf547c5..d01e0da0163a 100644 --- a/arch/x86/kernel/cpu/mce/therm_throt.c +++ b/arch/x86/kernel/cpu/mce/therm_throt.c @@ -40,15 +40,58 @@ #define THERMAL_THROTTLING_EVENT 0 #define POWER_LIMIT_EVENT 1 -/* - * Current thermal event state: +/** + * struct _thermal_state - Represent the current thermal event state + * @next_check: Stores the next timestamp, when it is allowed + * to log the next warning message. + * @last_interrupt_time: Stores the timestamp for the last threshold + * high event. + * @therm_work: Delayed workqueue structure + * @count: Stores the current running count for thermal + * or power threshold interrupts. + * @last_count: Stores the previous running count for thermal + * or power threshold interrupts. + * @max_time_ms: This shows the maximum amount of time CPU was + * in throttled state for a single thermal + * threshold high to low state. + * @total_time_ms: This is a cumulative time during which CPU was + * in the throttled state. + * @rate_control_active: Set when a throttling message is logged. + * This is used for the purpose of rate-control. + * @new_event: Stores the last high/low status of the + * THERM_STATUS_PROCHOT or + * THERM_STATUS_POWER_LIMIT. + * @level: Stores whether this _thermal_state instance is + * for a CORE level or for PACKAGE level. + * @sample_index: Index for storing the next sample in the buffer + * temp_samples[]. + * @sample_count: Total number of samples collected in the buffer + * temp_samples[]. + * @average: The last moving average of temperature samples + * @baseline_temp: Temperature at which thermal threshold high + * interrupt was generated. + * @temp_samples: Storage for temperature samples to calculate + * moving average. + * + * This structure is used to represent data related to thermal state for a CPU. + * There is a separate storage for core and package level for each CPU. */ struct _thermal_state { - bool new_event; - int event; u64 next_check; + u64 last_interrupt_time; + struct delayed_work therm_work; unsigned long count; unsigned long last_count; + unsigned long max_time_ms; + unsigned long total_time_ms; + bool rate_control_active; + bool new_event; + u8 level; + u8 sample_index; + u8 sample_count; + u8 average; + u8 baseline_temp; + u8 temp_samples[3]; }; struct thermal_state { @@ -121,8 +164,22 @@ define_therm_throt_device_one_ro(package_throttle_count); define_therm_throt_device_show_func(package_power_limit, count); define_therm_throt_device_one_ro(package_power_limit_count); +define_therm_throt_device_show_func(core_throttle, max_time_ms); +define_therm_throt_device_one_ro(core_throttle_max_time_ms); + +define_therm_throt_device_show_func(package_throttle, max_time_ms); +define_therm_throt_device_one_ro(package_throttle_max_time_ms); + +define_therm_throt_device_show_func(core_throttle, total_time_ms); +define_therm_throt_device_one_ro(core_throttle_total_time_ms); + +define_therm_throt_device_show_func(package_throttle, total_time_ms); +define_therm_throt_device_one_ro(package_throttle_total_time_ms); + static struct attribute *thermal_throttle_attrs[] = { &dev_attr_core_throttle_count.attr, + &dev_attr_core_throttle_max_time_ms.attr, + &dev_attr_core_throttle_total_time_ms.attr, NULL }; @@ -135,6 +192,105 @@ static const struct attribute_group thermal_attr_group = { #define CORE_LEVEL 0 #define PACKAGE_LEVEL 1 +#define THERM_THROT_POLL_INTERVAL HZ +#define THERM_STATUS_PROCHOT_LOG BIT(1) + +static void clear_therm_status_log(int level) +{ + int msr; + u64 msr_val; + + if (level == CORE_LEVEL) + msr = MSR_IA32_THERM_STATUS; + else + msr = MSR_IA32_PACKAGE_THERM_STATUS; + + rdmsrl(msr, msr_val); + wrmsrl(msr, msr_val & ~THERM_STATUS_PROCHOT_LOG); +} + +static void get_therm_status(int level, bool *proc_hot, u8 *temp) +{ + int msr; + u64 msr_val; + + if (level == CORE_LEVEL) + msr = MSR_IA32_THERM_STATUS; + else + msr = MSR_IA32_PACKAGE_THERM_STATUS; + + rdmsrl(msr, msr_val); + if (msr_val & THERM_STATUS_PROCHOT_LOG) + *proc_hot = true; + else + *proc_hot = false; + + *temp = (msr_val >> 16) & 0x7F; +} + +static void throttle_active_work(struct work_struct *work) +{ + struct _thermal_state *state = container_of(to_delayed_work(work), + struct _thermal_state, therm_work); + unsigned int i, avg, this_cpu = smp_processor_id(); + u64 now = get_jiffies_64(); + bool hot; + u8 temp; + + get_therm_status(state->level, &hot, &temp); + /* temperature value is offset from the max so lesser means hotter */ + if (!hot && temp > state->baseline_temp) { + if (state->rate_control_active) + pr_info("CPU%d: %s temperature/speed normal (total events = %lu)\n", + this_cpu, + state->level == CORE_LEVEL ? "Core" : "Package", + state->count); + + state->rate_control_active = false; + return; + } + + if (time_before64(now, state->next_check) && + state->rate_control_active) + goto re_arm; + + state->next_check = now + CHECK_INTERVAL; + + if (state->count != state->last_count) { + /* There was one new thermal interrupt */ + state->last_count = state->count; + state->average = 0; + state->sample_count = 0; + state->sample_index = 0; + } + + state->temp_samples[state->sample_index] = temp; + state->sample_count++; + state->sample_index = (state->sample_index + 1) % ARRAY_SIZE(state->temp_samples); + if (state->sample_count < ARRAY_SIZE(state->temp_samples)) + goto re_arm; + + avg = 0; + for (i = 0; i < ARRAY_SIZE(state->temp_samples); ++i) + avg += state->temp_samples[i]; + + avg /= ARRAY_SIZE(state->temp_samples); + + if (state->average > avg) { + pr_warn("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n", + this_cpu, + state->level == CORE_LEVEL ? "Core" : "Package", + state->count); + state->rate_control_active = true; + } + + state->average = avg; + +re_arm: + clear_therm_status_log(state->level); + schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL); +} + /*** * therm_throt_process - Process thermal throttling event from interrupt * @curr: Whether the condition is current or not (boolean), since the @@ -178,27 +334,33 @@ static void therm_throt_process(bool new_event, int event, int level) if (new_event) state->count++; - if (time_before64(now, state->next_check) && - state->count != state->last_count) + if (event != THERMAL_THROTTLING_EVENT) return; - state->next_check = now + CHECK_INTERVAL; - state->last_count = state->count; + if (new_event && !state->last_interrupt_time) { + bool hot; + u8 temp; + + get_therm_status(state->level, &hot, &temp); + /* + * Ignore short temperature spike as the system is not close + * to PROCHOT. 10C offset is large enough to ignore. It is + * already dropped from the high threshold temperature. + */ + if (temp > 10) + return; - /* if we just entered the thermal event */ - if (new_event) { - if (event == THERMAL_THROTTLING_EVENT) - pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", - this_cpu, - level == CORE_LEVEL ? "Core" : "Package", - state->count); - return; - } - if (old_event) { - if (event == THERMAL_THROTTLING_EVENT) - pr_info("CPU%d: %s temperature/speed normal\n", this_cpu, - level == CORE_LEVEL ? "Core" : "Package"); - return; + state->baseline_temp = temp; + state->last_interrupt_time = now; + schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL); + } else if (old_event && state->last_interrupt_time) { + unsigned long throttle_time; + + throttle_time = jiffies_delta_to_msecs(now - state->last_interrupt_time); + if (throttle_time > state->max_time_ms) + state->max_time_ms = throttle_time; + state->total_time_ms += throttle_time; + state->last_interrupt_time = 0; } } @@ -244,20 +406,47 @@ static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu) if (err) return err; - if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) { err = sysfs_add_file_to_group(&dev->kobj, &dev_attr_core_power_limit_count.attr, thermal_attr_group.name); + if (err) + goto del_group; + } + if (cpu_has(c, X86_FEATURE_PTS)) { err = sysfs_add_file_to_group(&dev->kobj, &dev_attr_package_throttle_count.attr, thermal_attr_group.name); - if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + if (err) + goto del_group; + + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_throttle_max_time_ms.attr, + thermal_attr_group.name); + if (err) + goto del_group; + + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_throttle_total_time_ms.attr, + thermal_attr_group.name); + if (err) + goto del_group; + + if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) { err = sysfs_add_file_to_group(&dev->kobj, &dev_attr_package_power_limit_count.attr, thermal_attr_group.name); + if (err) + goto del_group; + } } + return 0; + +del_group: + sysfs_remove_group(&dev->kobj, &thermal_attr_group); + return err; } @@ -269,15 +458,29 @@ static void thermal_throttle_remove_dev(struct device *dev) /* Get notified when a cpu comes on/off. Be hotplug friendly. */ static int thermal_throttle_online(unsigned int cpu) { + struct thermal_state *state = &per_cpu(thermal_state, cpu); struct device *dev = get_cpu_device(cpu); + state->package_throttle.level = PACKAGE_LEVEL; + state->core_throttle.level = CORE_LEVEL; + + INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work); + INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work); + return thermal_throttle_add_dev(dev, cpu); } static int thermal_throttle_offline(unsigned int cpu) { + struct thermal_state *state = &per_cpu(thermal_state, cpu); struct device *dev = get_cpu_device(cpu); + cancel_delayed_work(&state->package_throttle.therm_work); + cancel_delayed_work(&state->core_throttle.therm_work); + + state->package_throttle.rate_control_active = false; + state->core_throttle.rate_control_active = false; + thermal_throttle_remove_dev(dev); return 0; } diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index a0e52bd00ecc..3f6b137ef4e6 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -567,7 +567,7 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) void reload_ucode_amd(void) { struct microcode_amd *mc; - u32 rev, dummy; + u32 rev, dummy __always_unused; mc = (struct microcode_amd *)amd_ucode_patch; @@ -673,7 +673,7 @@ static enum ucode_state apply_microcode_amd(int cpu) struct ucode_cpu_info *uci; struct ucode_patch *p; enum ucode_state ret; - u32 rev, dummy; + u32 rev, dummy __always_unused; BUG_ON(raw_smp_processor_id() != cpu); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index cb0fdcaf1415..7019d4b2df0c 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -63,11 +63,6 @@ LIST_HEAD(microcode_cache); */ static DEFINE_MUTEX(microcode_mutex); -/* - * Serialize late loading so that CPUs get updated one-by-one. - */ -static DEFINE_RAW_SPINLOCK(update_lock); - struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; struct cpu_info_ctx { @@ -566,11 +561,18 @@ static int __reload_late(void *info) if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC)) return -1; - raw_spin_lock(&update_lock); - apply_microcode_local(&err); - raw_spin_unlock(&update_lock); + /* + * On an SMT system, it suffices to load the microcode on one sibling of + * the core because the microcode engine is shared between the threads. + * Synchronization still needs to take place so that no concurrent + * loading attempts happen on multiple threads of an SMT core. See + * below. + */ + if (cpumask_first(topology_sibling_cpumask(cpu)) == cpu) + apply_microcode_local(&err); + else + goto wait_for_siblings; - /* siblings return UCODE_OK because their engine got updated already */ if (err > UCODE_NFOUND) { pr_warn("Error reloading microcode on CPU %d\n", cpu); ret = -1; @@ -578,14 +580,18 @@ static int __reload_late(void *info) ret = 1; } +wait_for_siblings: + if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC)) + panic("Timeout during microcode update!\n"); + /* - * Increase the wait timeout to a safe value here since we're - * serializing the microcode update and that could take a while on a - * large number of CPUs. And that is fine as the *actual* timeout will - * be determined by the last CPU finished updating and thus cut short. + * At least one thread has completed update on each core. + * For others, simply call the update to make sure the + * per-cpu cpuinfo can be updated with right microcode + * revision. */ - if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC * num_online_cpus())) - panic("Timeout during microcode update!\n"); + if (cpumask_first(topology_sibling_cpumask(cpu)) != cpu) + apply_microcode_local(&err); return ret; } diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index ce799cfe9434..6a99535d7f37 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -791,6 +791,7 @@ static enum ucode_state apply_microcode_intel(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct cpuinfo_x86 *c = &cpu_data(cpu); + bool bsp = c->cpu_index == boot_cpu_data.cpu_index; struct microcode_intel *mc; enum ucode_state ret; static int prev_rev; @@ -836,7 +837,7 @@ static enum ucode_state apply_microcode_intel(int cpu) return UCODE_ERROR; } - if (rev != prev_rev) { + if (bsp && rev != prev_rev) { pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n", rev, mc->hdr.date & 0xffff, @@ -852,7 +853,7 @@ out: c->microcode = rev; /* Update boot_cpu_data's revision too, if we're on the BSP: */ - if (c->cpu_index == boot_cpu_data.cpu_index) + if (bsp) boot_cpu_data.microcode = rev; return ret; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index c656d92cd708..caa032ce3fe3 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -290,7 +290,12 @@ static void __init ms_hyperv_init_platform(void) machine_ops.shutdown = hv_machine_shutdown; machine_ops.crash_shutdown = hv_machine_crash_shutdown; #endif - mark_tsc_unstable("running on Hyper-V"); + if (ms_hyperv.features & HV_X64_ACCESS_TSC_INVARIANT) { + wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1); + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + } else { + mark_tsc_unstable("running on Hyper-V"); + } /* * Generation 2 instances don't support reading the NMI status from diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index 5c900f9527ff..c4be62058dd9 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -29,7 +29,8 @@ __setup("nordrand", x86_rdrand_setup); #ifdef CONFIG_ARCH_RANDOM void x86_init_rdrand(struct cpuinfo_x86 *c) { - unsigned long tmp; + unsigned int changed = 0; + unsigned long tmp, prev; int i; if (!cpu_has(c, X86_FEATURE_RDRAND)) @@ -42,5 +43,24 @@ void x86_init_rdrand(struct cpuinfo_x86 *c) return; } } + + /* + * Stupid sanity-check whether RDRAND does *actually* generate + * some at least random-looking data. + */ + prev = tmp; + for (i = 0; i < SANITY_CHECK_LOOPS; i++) { + if (rdrand_long(&tmp)) { + if (prev != tmp) + changed++; + + prev = tmp; + } + } + + if (WARN_ON_ONCE(!changed)) + pr_emerg( +"RDRAND gives funky smelling output, might consider not using it by booting with \"nordrand\""); + } #endif diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index a46dee8e78db..2e3b06d6bbc6 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -461,10 +461,8 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, } rdtgrp = rdtgroup_kn_lock_live(of->kn); - rdt_last_cmd_clear(); if (!rdtgrp) { ret = -ENOENT; - rdt_last_cmd_puts("Directory was removed\n"); goto unlock; } @@ -2648,10 +2646,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, int ret; prdtgrp = rdtgroup_kn_lock_live(prgrp_kn); - rdt_last_cmd_clear(); if (!prdtgrp) { ret = -ENODEV; - rdt_last_cmd_puts("Directory was removed\n"); goto out_unlock; } diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c new file mode 100644 index 000000000000..3e20d322bc98 --- /dev/null +++ b/arch/x86/kernel/cpu/tsx.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Transactional Synchronization Extensions (TSX) control. + * + * Copyright (C) 2019 Intel Corporation + * + * Author: + * Pawan Gupta <pawan.kumar.gupta@linux.intel.com> + */ + +#include <linux/cpufeature.h> + +#include <asm/cmdline.h> + +#include "cpu.h" + +enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED; + +void tsx_disable(void) +{ + u64 tsx; + + rdmsrl(MSR_IA32_TSX_CTRL, tsx); + + /* Force all transactions to immediately abort */ + tsx |= TSX_CTRL_RTM_DISABLE; + + /* + * Ensure TSX support is not enumerated in CPUID. + * This is visible to userspace and will ensure they + * do not waste resources trying TSX transactions that + * will always abort. + */ + tsx |= TSX_CTRL_CPUID_CLEAR; + + wrmsrl(MSR_IA32_TSX_CTRL, tsx); +} + +void tsx_enable(void) +{ + u64 tsx; + + rdmsrl(MSR_IA32_TSX_CTRL, tsx); + + /* Enable the RTM feature in the cpu */ + tsx &= ~TSX_CTRL_RTM_DISABLE; + + /* + * Ensure TSX support is enumerated in CPUID. + * This is visible to userspace and will ensure they + * can enumerate and use the TSX feature. + */ + tsx &= ~TSX_CTRL_CPUID_CLEAR; + + wrmsrl(MSR_IA32_TSX_CTRL, tsx); +} + +static bool __init tsx_ctrl_is_supported(void) +{ + u64 ia32_cap = x86_read_arch_cap_msr(); + + /* + * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this + * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES. + * + * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a + * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES + * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get + * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, + * tsx= cmdline requests will do nothing on CPUs without + * MSR_IA32_TSX_CTRL support. + */ + return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR); +} + +static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) +{ + if (boot_cpu_has_bug(X86_BUG_TAA)) + return TSX_CTRL_DISABLE; + + return TSX_CTRL_ENABLE; +} + +void __init tsx_init(void) +{ + char arg[5] = {}; + int ret; + + if (!tsx_ctrl_is_supported()) + return; + + ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg)); + if (ret >= 0) { + if (!strcmp(arg, "on")) { + tsx_ctrl_state = TSX_CTRL_ENABLE; + } else if (!strcmp(arg, "off")) { + tsx_ctrl_state = TSX_CTRL_DISABLE; + } else if (!strcmp(arg, "auto")) { + tsx_ctrl_state = x86_get_tsx_auto_mode(); + } else { + tsx_ctrl_state = TSX_CTRL_DISABLE; + pr_err("tsx: invalid option, defaulting to off\n"); + } + } else { + /* tsx= not provided */ + if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO)) + tsx_ctrl_state = x86_get_tsx_auto_mode(); + else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF)) + tsx_ctrl_state = TSX_CTRL_DISABLE; + else + tsx_ctrl_state = TSX_CTRL_ENABLE; + } + + if (tsx_ctrl_state == TSX_CTRL_DISABLE) { + tsx_disable(); + + /* + * tsx_disable() will change the state of the + * RTM CPUID bit. Clear it here since it is now + * expected to be not set. + */ + setup_clear_cpu_cap(X86_FEATURE_RTM); + } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) { + + /* + * HW defaults TSX to be enabled at bootup. + * We may still need the TSX enable support + * during init for special cases like + * kexec after TSX is disabled. + */ + tsx_enable(); + + /* + * tsx_enable() will change the state of the + * RTM CPUID bit. Force it here since it is now + * expected to be set. + */ + setup_force_cpu_cap(X86_FEATURE_RTM); + } +} diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index eb651fbde92a..00fc55ac7ffa 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -24,6 +24,7 @@ #include <linux/export.h> #include <linux/slab.h> #include <linux/vmalloc.h> +#include <linux/memblock.h> #include <asm/processor.h> #include <asm/hardirq.h> @@ -39,6 +40,7 @@ #include <asm/virtext.h> #include <asm/intel_pt.h> #include <asm/crash.h> +#include <asm/cmdline.h> /* Used while preparing memory map entries for second kernel */ struct crash_memmap_data { @@ -68,6 +70,19 @@ static inline void cpu_crash_vmclear_loaded_vmcss(void) rcu_read_unlock(); } +/* + * When the crashkernel option is specified, only use the low + * 1M for the real mode trampoline. + */ +void __init crash_reserve_low_1M(void) +{ + if (cmdline_find_option(boot_command_line, "crashkernel", NULL, 0) < 0) + return; + + memblock_reserve(0, 1<<20); + pr_info("Reserving the low 1M of memory for crashkernel\n"); +} + #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static void kdump_nmi_callback(int cpu, struct pt_regs *regs) @@ -173,8 +188,6 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #ifdef CONFIG_KEXEC_FILE -static unsigned long crash_zero_bytes; - static int get_nr_ram_ranges_callback(struct resource *res, void *arg) { unsigned int *nr_ranges = arg; @@ -189,8 +202,7 @@ static struct crash_mem *fill_up_crash_elf_data(void) unsigned int nr_ranges = 0; struct crash_mem *cmem; - walk_system_ram_res(0, -1, &nr_ranges, - get_nr_ram_ranges_callback); + walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback); if (!nr_ranges) return NULL; @@ -217,15 +229,19 @@ static int elf_header_exclude_ranges(struct crash_mem *cmem) { int ret = 0; + /* Exclude the low 1M because it is always reserved */ + ret = crash_exclude_mem_range(cmem, 0, 1<<20); + if (ret) + return ret; + /* Exclude crashkernel region */ ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end); if (ret) return ret; - if (crashk_low_res.end) { + if (crashk_low_res.end) ret = crash_exclude_mem_range(cmem, crashk_low_res.start, - crashk_low_res.end); - } + crashk_low_res.end); return ret; } @@ -246,16 +262,13 @@ static int prepare_elf_headers(struct kimage *image, void **addr, unsigned long *sz) { struct crash_mem *cmem; - Elf64_Ehdr *ehdr; - Elf64_Phdr *phdr; - int ret, i; + int ret; cmem = fill_up_crash_elf_data(); if (!cmem) return -ENOMEM; - ret = walk_system_ram_res(0, -1, cmem, - prepare_elf64_ram_headers_callback); + ret = walk_system_ram_res(0, -1, cmem, prepare_elf64_ram_headers_callback); if (ret) goto out; @@ -265,24 +278,8 @@ static int prepare_elf_headers(struct kimage *image, void **addr, goto out; /* By default prepare 64bit headers */ - ret = crash_prepare_elf64_headers(cmem, - IS_ENABLED(CONFIG_X86_64), addr, sz); - if (ret) - goto out; + ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz); - /* - * If a range matches backup region, adjust offset to backup - * segment. - */ - ehdr = (Elf64_Ehdr *)*addr; - phdr = (Elf64_Phdr *)(ehdr + 1); - for (i = 0; i < ehdr->e_phnum; phdr++, i++) - if (phdr->p_type == PT_LOAD && - phdr->p_paddr == image->arch.backup_src_start && - phdr->p_memsz == image->arch.backup_src_sz) { - phdr->p_offset = image->arch.backup_load_addr; - break; - } out: vfree(cmem); return ret; @@ -296,8 +293,7 @@ static int add_e820_entry(struct boot_params *params, struct e820_entry *entry) if (nr_e820_entries >= E820_MAX_ENTRIES_ZEROPAGE) return 1; - memcpy(¶ms->e820_table[nr_e820_entries], entry, - sizeof(struct e820_entry)); + memcpy(¶ms->e820_table[nr_e820_entries], entry, sizeof(struct e820_entry)); params->e820_entries++; return 0; } @@ -321,19 +317,11 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, unsigned long long mend) { unsigned long start, end; - int ret = 0; cmem->ranges[0].start = mstart; cmem->ranges[0].end = mend; cmem->nr_ranges = 1; - /* Exclude Backup region */ - start = image->arch.backup_load_addr; - end = start + image->arch.backup_src_sz - 1; - ret = crash_exclude_mem_range(cmem, start, end); - if (ret) - return ret; - /* Exclude elf header region */ start = image->arch.elf_load_addr; end = start + image->arch.elf_headers_sz - 1; @@ -356,28 +344,28 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) memset(&cmd, 0, sizeof(struct crash_memmap_data)); cmd.params = params; - /* Add first 640K segment */ - ei.addr = image->arch.backup_src_start; - ei.size = image->arch.backup_src_sz; - ei.type = E820_TYPE_RAM; - add_e820_entry(params, &ei); + /* Add the low 1M */ + cmd.type = E820_TYPE_RAM; + flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + walk_iomem_res_desc(IORES_DESC_NONE, flags, 0, (1<<20)-1, &cmd, + memmap_entry_callback); /* Add ACPI tables */ cmd.type = E820_TYPE_ACPI; flags = IORESOURCE_MEM | IORESOURCE_BUSY; walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd, - memmap_entry_callback); + memmap_entry_callback); /* Add ACPI Non-volatile Storage */ cmd.type = E820_TYPE_NVS; walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd, - memmap_entry_callback); + memmap_entry_callback); /* Add e820 reserved ranges */ cmd.type = E820_TYPE_RESERVED; flags = IORESOURCE_MEM; walk_iomem_res_desc(IORES_DESC_RESERVED, flags, 0, -1, &cmd, - memmap_entry_callback); + memmap_entry_callback); /* Add crashk_low_res region */ if (crashk_low_res.end) { @@ -388,8 +376,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) } /* Exclude some ranges from crashk_res and add rest to memmap */ - ret = memmap_exclude_ranges(image, cmem, crashk_res.start, - crashk_res.end); + ret = memmap_exclude_ranges(image, cmem, crashk_res.start, crashk_res.end); if (ret) goto out; @@ -409,55 +396,12 @@ out: return ret; } -static int determine_backup_region(struct resource *res, void *arg) -{ - struct kimage *image = arg; - - image->arch.backup_src_start = res->start; - image->arch.backup_src_sz = resource_size(res); - - /* Expecting only one range for backup region */ - return 1; -} - int crash_load_segments(struct kimage *image) { int ret; struct kexec_buf kbuf = { .image = image, .buf_min = 0, .buf_max = ULONG_MAX, .top_down = false }; - /* - * Determine and load a segment for backup area. First 640K RAM - * region is backup source - */ - - ret = walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END, - image, determine_backup_region); - - /* Zero or postive return values are ok */ - if (ret < 0) - return ret; - - /* Add backup segment. */ - if (image->arch.backup_src_sz) { - kbuf.buffer = &crash_zero_bytes; - kbuf.bufsz = sizeof(crash_zero_bytes); - kbuf.memsz = image->arch.backup_src_sz; - kbuf.buf_align = PAGE_SIZE; - /* - * Ideally there is no source for backup segment. This is - * copied in purgatory after crash. Just add a zero filled - * segment for now to make sure checksum logic works fine. - */ - ret = kexec_add_buffer(&kbuf); - if (ret) - return ret; - image->arch.backup_load_addr = kbuf.mem; - pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n", - image->arch.backup_load_addr, - image->arch.backup_src_start, kbuf.memsz); - } - /* Prepare elf headers and add a segment */ ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz); if (ret) diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index 0b8cedb20d6d..0d6c657593f8 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c @@ -54,7 +54,7 @@ struct x86_hw_tss doublefault_tss __cacheline_aligned = { .sp0 = STACK_START, .ss0 = __KERNEL_DS, .ldt = 0, - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, + .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, .ip = (unsigned long) doublefault_fn, /* 0x2 bit is always set */ @@ -65,6 +65,9 @@ struct x86_hw_tss doublefault_tss __cacheline_aligned = { .ss = __KERNEL_DS, .ds = __USER_DS, .fs = __KERNEL_PERCPU, +#ifndef CONFIG_X86_32_LAZY_GS + .gs = __KERNEL_STACK_CANARY, +#endif .__cr3 = __pa_nodebug(swapper_pg_dir), }; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7da2bcd2b8eb..c5399e80c59c 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -190,6 +190,7 @@ static void __init e820_print_type(enum e820_type type) case E820_TYPE_RAM: /* Fall through: */ case E820_TYPE_RESERVED_KERN: pr_cont("usable"); break; case E820_TYPE_RESERVED: pr_cont("reserved"); break; + case E820_TYPE_SOFT_RESERVED: pr_cont("soft reserved"); break; case E820_TYPE_ACPI: pr_cont("ACPI data"); break; case E820_TYPE_NVS: pr_cont("ACPI NVS"); break; case E820_TYPE_UNUSABLE: pr_cont("unusable"); break; @@ -999,6 +1000,17 @@ void __init e820__reserve_setup_data(void) data = early_memremap(pa_data, sizeof(*data)); e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); e820__range_update_kexec(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + + if (data->type == SETUP_INDIRECT && + ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { + e820__range_update(((struct setup_indirect *)data->data)->addr, + ((struct setup_indirect *)data->data)->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + e820__range_update_kexec(((struct setup_indirect *)data->data)->addr, + ((struct setup_indirect *)data->data)->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + } + pa_data = data->next; early_memunmap(data, sizeof(*data)); } @@ -1037,6 +1049,7 @@ static const char *__init e820_type_to_string(struct e820_entry *entry) case E820_TYPE_PRAM: return "Persistent Memory (legacy)"; case E820_TYPE_PMEM: return "Persistent Memory"; case E820_TYPE_RESERVED: return "Reserved"; + case E820_TYPE_SOFT_RESERVED: return "Soft Reserved"; default: return "Unknown E820 type"; } } @@ -1052,6 +1065,7 @@ static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry) case E820_TYPE_PRAM: /* Fall-through: */ case E820_TYPE_PMEM: /* Fall-through: */ case E820_TYPE_RESERVED: /* Fall-through: */ + case E820_TYPE_SOFT_RESERVED: /* Fall-through: */ default: return IORESOURCE_MEM; } } @@ -1064,6 +1078,7 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry) case E820_TYPE_PMEM: return IORES_DESC_PERSISTENT_MEMORY; case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY; case E820_TYPE_RESERVED: return IORES_DESC_RESERVED; + case E820_TYPE_SOFT_RESERVED: return IORES_DESC_SOFT_RESERVED; case E820_TYPE_RESERVED_KERN: /* Fall-through: */ case E820_TYPE_RAM: /* Fall-through: */ case E820_TYPE_UNUSABLE: /* Fall-through: */ @@ -1078,11 +1093,12 @@ static bool __init do_mark_busy(enum e820_type type, struct resource *res) return true; /* - * Treat persistent memory like device memory, i.e. reserve it - * for exclusive use of a driver + * Treat persistent memory and other special memory ranges like + * device memory, i.e. reserve it for exclusive use of a driver */ switch (type) { case E820_TYPE_RESERVED: + case E820_TYPE_SOFT_RESERVED: case E820_TYPE_PRAM: case E820_TYPE_PMEM: return false; @@ -1285,6 +1301,9 @@ void __init e820__memblock_setup(void) if (end != (resource_size_t)end) continue; + if (entry->type == E820_TYPE_SOFT_RESERVED) + memblock_reserve(entry->addr, entry->size); + if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) continue; diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 6f6b1d04dadf..4cba91ec8049 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -710,6 +710,8 @@ static struct chipset early_qrk[] __initdata = { */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, + { PCI_VENDOR_ID_INTEL, 0x3ec4, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, { PCI_VENDOR_ID_BROADCOM, 0x4331, PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, {} diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index e5cb67d67c03..319be936c348 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -60,7 +60,7 @@ u64 xfeatures_mask __read_mostly; static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; -static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; +static unsigned int xstate_comp_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; /* * The XSAVE area of kernel can be in standard or compacted format; @@ -254,10 +254,13 @@ static void __init setup_xstate_features(void) * in the fixed offsets in the xsave area in either compacted form * or standard form. */ - xstate_offsets[0] = 0; - xstate_sizes[0] = offsetof(struct fxregs_state, xmm_space); - xstate_offsets[1] = xstate_sizes[0]; - xstate_sizes[1] = FIELD_SIZEOF(struct fxregs_state, xmm_space); + xstate_offsets[XFEATURE_FP] = 0; + xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state, + xmm_space); + + xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP]; + xstate_sizes[XFEATURE_SSE] = FIELD_SIZEOF(struct fxregs_state, + xmm_space); for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { if (!xfeature_enabled(i)) @@ -342,7 +345,7 @@ static int xfeature_is_aligned(int xfeature_nr) */ static void __init setup_xstate_comp(void) { - unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8]; + unsigned int xstate_comp_sizes[XFEATURE_MAX]; int i; /* @@ -350,8 +353,9 @@ static void __init setup_xstate_comp(void) * in the fixed offsets in the xsave area in either compacted form * or standard form. */ - xstate_comp_offsets[0] = 0; - xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space); + xstate_comp_offsets[XFEATURE_FP] = 0; + xstate_comp_offsets[XFEATURE_SSE] = offsetof(struct fxregs_state, + xmm_space); if (!boot_cpu_has(X86_FEATURE_XSAVES)) { for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { @@ -840,7 +844,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) /* * We should not ever be requesting features that we - * have not enabled. Remember that pcntxt_mask is + * have not enabled. Remember that xfeatures_mask is * what we write to the XCR0 register. */ WARN_ONCE(!(xfeatures_mask & BIT_ULL(xfeature_nr)), diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 024c3053dbba..060a361d9d11 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -1043,6 +1043,20 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, return; /* + * If the return location is actually pointing directly to + * the start of a direct trampoline (if we trace the trampoline + * it will still be offset by MCOUNT_INSN_SIZE), then the + * return address is actually off by one word, and we + * need to adjust for that. + */ + if (ftrace_direct_func_count) { + if (ftrace_find_direct_func(self_addr + MCOUNT_INSN_SIZE)) { + self_addr = *parent; + parent++; + } + } + + /* * Protect against fault, even if it shouldn't * happen. This tool is too much intrusive to * ignore such a protection. diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index 073aab525d80..e8a9f8370112 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -12,20 +12,18 @@ #include <asm/frame.h> #include <asm/asm-offsets.h> -# define function_hook __fentry__ -EXPORT_SYMBOL(__fentry__) - #ifdef CONFIG_FRAME_POINTER # define MCOUNT_FRAME 1 /* using frame = true */ #else # define MCOUNT_FRAME 0 /* using frame = false */ #endif -ENTRY(function_hook) +SYM_FUNC_START(__fentry__) ret -END(function_hook) +SYM_FUNC_END(__fentry__) +EXPORT_SYMBOL(__fentry__) -ENTRY(ftrace_caller) +SYM_CODE_START(ftrace_caller) #ifdef CONFIG_FRAME_POINTER /* @@ -85,11 +83,11 @@ ftrace_graph_call: #endif /* This is weak to keep gas from relaxing the jumps */ -WEAK(ftrace_stub) +SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) ret -END(ftrace_caller) +SYM_CODE_END(ftrace_caller) -ENTRY(ftrace_regs_caller) +SYM_CODE_START(ftrace_regs_caller) /* * We're here from an mcount/fentry CALL, and the stack frame looks like: * @@ -138,7 +136,7 @@ ENTRY(ftrace_regs_caller) movl function_trace_op, %ecx # 3rd argument: ftrace_pos pushl %esp # 4th argument: pt_regs -GLOBAL(ftrace_regs_call) +SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) call ftrace_stub addl $4, %esp # skip 4th argument @@ -163,9 +161,10 @@ GLOBAL(ftrace_regs_call) popl %eax jmp .Lftrace_ret +SYM_CODE_END(ftrace_regs_caller) #ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(ftrace_graph_caller) +SYM_CODE_START(ftrace_graph_caller) pushl %eax pushl %ecx pushl %edx @@ -179,7 +178,7 @@ ENTRY(ftrace_graph_caller) popl %ecx popl %eax ret -END(ftrace_graph_caller) +SYM_CODE_END(ftrace_graph_caller) .globl return_to_handler return_to_handler: diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 809d54397dba..369e61faacfe 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -14,9 +14,6 @@ .code64 .section .entry.text, "ax" -# define function_hook __fentry__ -EXPORT_SYMBOL(__fentry__) - #ifdef CONFIG_FRAME_POINTER /* Save parent and function stack frames (rip and rbp) */ # define MCOUNT_FRAME_SIZE (8+16*2) @@ -88,6 +85,7 @@ EXPORT_SYMBOL(__fentry__) movq %rdi, RDI(%rsp) movq %r8, R8(%rsp) movq %r9, R9(%rsp) + movq $0, ORIG_RAX(%rsp) /* * Save the original RBP. Even though the mcount ABI does not * require this, it helps out callers. @@ -114,7 +112,11 @@ EXPORT_SYMBOL(__fentry__) subq $MCOUNT_INSN_SIZE, %rdi .endm -.macro restore_mcount_regs +.macro restore_mcount_regs save=0 + + /* ftrace_regs_caller or frame pointers require this */ + movq RBP(%rsp), %rbp + movq R9(%rsp), %r9 movq R8(%rsp), %r8 movq RDI(%rsp), %rdi @@ -123,31 +125,29 @@ EXPORT_SYMBOL(__fentry__) movq RCX(%rsp), %rcx movq RAX(%rsp), %rax - /* ftrace_regs_caller can modify %rbp */ - movq RBP(%rsp), %rbp - - addq $MCOUNT_REG_SIZE, %rsp + addq $MCOUNT_REG_SIZE-\save, %rsp .endm #ifdef CONFIG_DYNAMIC_FTRACE -ENTRY(function_hook) +SYM_FUNC_START(__fentry__) retq -ENDPROC(function_hook) +SYM_FUNC_END(__fentry__) +EXPORT_SYMBOL(__fentry__) -ENTRY(ftrace_caller) +SYM_FUNC_START(ftrace_caller) /* save_mcount_regs fills in first two parameters */ save_mcount_regs -GLOBAL(ftrace_caller_op_ptr) +SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL) /* Load the ftrace_ops into the 3rd parameter */ movq function_trace_op(%rip), %rdx /* regs go into 4th parameter (but make it NULL) */ movq $0, %rcx -GLOBAL(ftrace_call) +SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) call ftrace_stub restore_mcount_regs @@ -157,10 +157,10 @@ GLOBAL(ftrace_call) * think twice before adding any new code or changing the * layout here. */ -GLOBAL(ftrace_epilogue) +SYM_INNER_LABEL(ftrace_epilogue, SYM_L_GLOBAL) #ifdef CONFIG_FUNCTION_GRAPH_TRACER -GLOBAL(ftrace_graph_call) +SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) jmp ftrace_stub #endif @@ -168,19 +168,21 @@ GLOBAL(ftrace_graph_call) * This is weak to keep gas from relaxing the jumps. * It is also used to copy the retq for trampolines. */ -WEAK(ftrace_stub) +SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) retq -ENDPROC(ftrace_caller) +SYM_FUNC_END(ftrace_caller) -ENTRY(ftrace_regs_caller) +SYM_FUNC_START(ftrace_regs_caller) /* Save the current flags before any operations that can change them */ pushfq + UNWIND_HINT_SAVE + /* added 8 bytes to save flags */ save_mcount_regs 8 /* save_mcount_regs fills in first two parameters */ -GLOBAL(ftrace_regs_caller_op_ptr) +SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL) /* Load the ftrace_ops into the 3rd parameter */ movq function_trace_op(%rip), %rdx @@ -209,7 +211,7 @@ GLOBAL(ftrace_regs_caller_op_ptr) /* regs go into 4th parameter */ leaq (%rsp), %rcx -GLOBAL(ftrace_regs_call) +SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) call ftrace_stub /* Copy flags back to SS, to restore them */ @@ -228,7 +230,33 @@ GLOBAL(ftrace_regs_call) movq R10(%rsp), %r10 movq RBX(%rsp), %rbx - restore_mcount_regs + movq ORIG_RAX(%rsp), %rax + movq %rax, MCOUNT_REG_SIZE-8(%rsp) + + /* If ORIG_RAX is anything but zero, make this a call to that */ + movq ORIG_RAX(%rsp), %rax + cmpq $0, %rax + je 1f + + /* Swap the flags with orig_rax */ + movq MCOUNT_REG_SIZE(%rsp), %rdi + movq %rdi, MCOUNT_REG_SIZE-8(%rsp) + movq %rax, MCOUNT_REG_SIZE(%rsp) + + restore_mcount_regs 8 + + jmp 2f + +1: restore_mcount_regs + + +2: + /* + * The stack layout is nondetermistic here, depending on which path was + * taken. This confuses objtool and ORC, rightfully so. For now, + * pretend the stack always looks like the non-direct case. + */ + UNWIND_HINT_RESTORE /* Restore flags */ popfq @@ -239,16 +267,16 @@ GLOBAL(ftrace_regs_call) * The trampoline will add the code to jump * to the return. */ -GLOBAL(ftrace_regs_caller_end) +SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) jmp ftrace_epilogue -ENDPROC(ftrace_regs_caller) +SYM_FUNC_END(ftrace_regs_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ -ENTRY(function_hook) +SYM_FUNC_START(__fentry__) cmpq $ftrace_stub, ftrace_trace_function jnz trace @@ -261,7 +289,7 @@ fgraph_trace: jnz ftrace_graph_caller #endif -GLOBAL(ftrace_stub) +SYM_INNER_LABEL(ftrace_stub, SYM_L_GLOBAL) retq trace: @@ -279,11 +307,12 @@ trace: restore_mcount_regs jmp fgraph_trace -ENDPROC(function_hook) +SYM_FUNC_END(__fentry__) +EXPORT_SYMBOL(__fentry__) #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(ftrace_graph_caller) +SYM_FUNC_START(ftrace_graph_caller) /* Saves rbp into %rdx and fills first parameter */ save_mcount_regs @@ -294,9 +323,9 @@ ENTRY(ftrace_graph_caller) restore_mcount_regs retq -ENDPROC(ftrace_graph_caller) +SYM_FUNC_END(ftrace_graph_caller) -ENTRY(return_to_handler) +SYM_CODE_START(return_to_handler) UNWIND_HINT_EMPTY subq $24, %rsp @@ -312,5 +341,5 @@ ENTRY(return_to_handler) movq (%rsp), %rax addq $24, %rsp JMP_NOSPEC %rdi -END(return_to_handler) +SYM_CODE_END(return_to_handler) #endif diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 30f9cb2c0b55..3923ab4630d7 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -64,7 +64,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE) * can. */ __HEAD -ENTRY(startup_32) +SYM_CODE_START(startup_32) movl pa(initial_stack),%ecx /* test KEEP_SEGMENTS flag to see if the bootloader is asking @@ -156,7 +156,7 @@ ENTRY(startup_32) jmp *%eax .Lbad_subarch: -WEAK(xen_entry) +SYM_INNER_LABEL_ALIGN(xen_entry, SYM_L_WEAK) /* Unknown implementation; there's really nothing we can do at this point. */ ud2a @@ -172,6 +172,7 @@ num_subarch_entries = (. - subarch_entries) / 4 #else jmp .Ldefault_entry #endif /* CONFIG_PARAVIRT */ +SYM_CODE_END(startup_32) #ifdef CONFIG_HOTPLUG_CPU /* @@ -179,12 +180,12 @@ num_subarch_entries = (. - subarch_entries) / 4 * up already except stack. We just set up stack here. Then call * start_secondary(). */ -ENTRY(start_cpu0) +SYM_FUNC_START(start_cpu0) movl initial_stack, %ecx movl %ecx, %esp call *(initial_code) 1: jmp 1b -ENDPROC(start_cpu0) +SYM_FUNC_END(start_cpu0) #endif /* @@ -195,7 +196,7 @@ ENDPROC(start_cpu0) * If cpu hotplug is not supported then this code can go in init section * which will be freed later */ -ENTRY(startup_32_smp) +SYM_FUNC_START(startup_32_smp) cld movl $(__BOOT_DS),%eax movl %eax,%ds @@ -362,7 +363,7 @@ ENTRY(startup_32_smp) call *(initial_code) 1: jmp 1b -ENDPROC(startup_32_smp) +SYM_FUNC_END(startup_32_smp) #include "verify_cpu.S" @@ -392,7 +393,7 @@ setup_once: andl $0,setup_once_ref /* Once is enough, thanks */ ret -ENTRY(early_idt_handler_array) +SYM_FUNC_START(early_idt_handler_array) # 36(%esp) %eflags # 32(%esp) %cs # 28(%esp) %eip @@ -407,9 +408,9 @@ ENTRY(early_idt_handler_array) i = i + 1 .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr -ENDPROC(early_idt_handler_array) +SYM_FUNC_END(early_idt_handler_array) -early_idt_handler_common: +SYM_CODE_START_LOCAL(early_idt_handler_common) /* * The stack is the hardware frame, an error code or zero, and the * vector number. @@ -460,10 +461,10 @@ early_idt_handler_common: decl %ss:early_recursion_flag addl $4, %esp /* pop pt_regs->orig_ax */ iret -ENDPROC(early_idt_handler_common) +SYM_CODE_END(early_idt_handler_common) /* This is the default interrupt "handler" :-) */ -ENTRY(early_ignore_irq) +SYM_FUNC_START(early_ignore_irq) cld #ifdef CONFIG_PRINTK pushl %eax @@ -498,19 +499,16 @@ ENTRY(early_ignore_irq) hlt_loop: hlt jmp hlt_loop -ENDPROC(early_ignore_irq) +SYM_FUNC_END(early_ignore_irq) __INITDATA .align 4 -GLOBAL(early_recursion_flag) - .long 0 +SYM_DATA(early_recursion_flag, .long 0) __REFDATA .align 4 -ENTRY(initial_code) - .long i386_start_kernel -ENTRY(setup_once_ref) - .long setup_once +SYM_DATA(initial_code, .long i386_start_kernel) +SYM_DATA(setup_once_ref, .long setup_once) #ifdef CONFIG_PAGE_TABLE_ISOLATION #define PGD_ALIGN (2 * PAGE_SIZE) @@ -553,7 +551,7 @@ EXPORT_SYMBOL(empty_zero_page) __PAGE_ALIGNED_DATA /* Page-aligned for the benefit of paravirt? */ .align PGD_ALIGN -ENTRY(initial_page_table) +SYM_DATA_START(initial_page_table) .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ # if KPMDS == 3 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 @@ -571,17 +569,28 @@ ENTRY(initial_page_table) # error "Kernel PMDs should be 1, 2 or 3" # endif .align PAGE_SIZE /* needs to be page-sized too */ + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* + * PTI needs another page so sync_initial_pagetable() works correctly + * and does not scribble over the data which is placed behind the + * actual initial_page_table. See clone_pgd_range(). + */ + .fill 1024, 4, 0 +#endif + +SYM_DATA_END(initial_page_table) #endif .data .balign 4 -ENTRY(initial_stack) - /* - * The SIZEOF_PTREGS gap is a convention which helps the in-kernel - * unwinder reliably detect the end of the stack. - */ - .long init_thread_union + THREAD_SIZE - SIZEOF_PTREGS - \ - TOP_OF_KERNEL_STACK_PADDING; +/* + * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder + * reliably detect the end of the stack. + */ +SYM_DATA(initial_stack, + .long init_thread_union + THREAD_SIZE - + SIZEOF_PTREGS - TOP_OF_KERNEL_STACK_PADDING) __INITRODATA int_msg: @@ -597,27 +606,28 @@ int_msg: */ .data -.globl boot_gdt_descr - ALIGN # early boot GDT descriptor (must use 1:1 address mapping) .word 0 # 32 bit align gdt_desc.address -boot_gdt_descr: +SYM_DATA_START_LOCAL(boot_gdt_descr) .word __BOOT_DS+7 .long boot_gdt - __PAGE_OFFSET +SYM_DATA_END(boot_gdt_descr) # boot GDT descriptor (later on used by CPU#0): .word 0 # 32 bit align gdt_desc.address -ENTRY(early_gdt_descr) +SYM_DATA_START(early_gdt_descr) .word GDT_ENTRIES*8-1 .long gdt_page /* Overwritten for secondary CPUs */ +SYM_DATA_END(early_gdt_descr) /* * The boot_gdt must mirror the equivalent in setup.S and is * used only for booting. */ .align L1_CACHE_BYTES -ENTRY(boot_gdt) +SYM_DATA_START(boot_gdt) .fill GDT_ENTRY_BOOT_CS,8,0 .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ +SYM_DATA_END(boot_gdt) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index f3d3e9646a99..4bbc770af632 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -49,8 +49,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map) .text __HEAD .code64 - .globl startup_64 -startup_64: +SYM_CODE_START_NOALIGN(startup_64) UNWIND_HINT_EMPTY /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, @@ -90,7 +89,9 @@ startup_64: /* Form the CR3 value being sure to include the CR3 modifier */ addq $(early_top_pgt - __START_KERNEL_map), %rax jmp 1f -ENTRY(secondary_startup_64) +SYM_CODE_END(startup_64) + +SYM_CODE_START(secondary_startup_64) UNWIND_HINT_EMPTY /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, @@ -240,7 +241,7 @@ ENTRY(secondary_startup_64) pushq %rax # target address in negative space lretq .Lafter_lret: -END(secondary_startup_64) +SYM_CODE_END(secondary_startup_64) #include "verify_cpu.S" @@ -250,30 +251,28 @@ END(secondary_startup_64) * up already except stack. We just set up stack here. Then call * start_secondary() via .Ljump_to_C_code. */ -ENTRY(start_cpu0) +SYM_CODE_START(start_cpu0) UNWIND_HINT_EMPTY movq initial_stack(%rip), %rsp jmp .Ljump_to_C_code -END(start_cpu0) +SYM_CODE_END(start_cpu0) #endif /* Both SMP bootup and ACPI suspend change these variables */ __REFDATA .balign 8 - GLOBAL(initial_code) - .quad x86_64_start_kernel - GLOBAL(initial_gs) - .quad INIT_PER_CPU_VAR(fixed_percpu_data) - GLOBAL(initial_stack) - /* - * The SIZEOF_PTREGS gap is a convention which helps the in-kernel - * unwinder reliably detect the end of the stack. - */ - .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS +SYM_DATA(initial_code, .quad x86_64_start_kernel) +SYM_DATA(initial_gs, .quad INIT_PER_CPU_VAR(fixed_percpu_data)) + +/* + * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder + * reliably detect the end of the stack. + */ +SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS) __FINITDATA __INIT -ENTRY(early_idt_handler_array) +SYM_CODE_START(early_idt_handler_array) i = 0 .rept NUM_EXCEPTION_VECTORS .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 @@ -289,9 +288,9 @@ ENTRY(early_idt_handler_array) .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr UNWIND_HINT_IRET_REGS offset=16 -END(early_idt_handler_array) +SYM_CODE_END(early_idt_handler_array) -early_idt_handler_common: +SYM_CODE_START_LOCAL(early_idt_handler_common) /* * The stack is the hardware frame, an error code or zero, and the * vector number. @@ -333,17 +332,11 @@ early_idt_handler_common: 20: decl early_recursion_flag(%rip) jmp restore_regs_and_return_to_kernel -END(early_idt_handler_common) +SYM_CODE_END(early_idt_handler_common) - __INITDATA - .balign 4 -GLOBAL(early_recursion_flag) - .long 0 - -#define NEXT_PAGE(name) \ - .balign PAGE_SIZE; \ -GLOBAL(name) +#define SYM_DATA_START_PAGE_ALIGNED(name) \ + SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE) #ifdef CONFIG_PAGE_TABLE_ISOLATION /* @@ -358,11 +351,11 @@ GLOBAL(name) */ #define PTI_USER_PGD_FILL 512 /* This ensures they are 8k-aligned: */ -#define NEXT_PGD_PAGE(name) \ - .balign 2 * PAGE_SIZE; \ -GLOBAL(name) +#define SYM_DATA_START_PTI_ALIGNED(name) \ + SYM_START(name, SYM_L_GLOBAL, .balign 2 * PAGE_SIZE) #else -#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) +#define SYM_DATA_START_PTI_ALIGNED(name) \ + SYM_DATA_START_PAGE_ALIGNED(name) #define PTI_USER_PGD_FILL 0 #endif @@ -375,17 +368,23 @@ GLOBAL(name) .endr __INITDATA -NEXT_PGD_PAGE(early_top_pgt) + .balign 4 + +SYM_DATA_START_PTI_ALIGNED(early_top_pgt) .fill 512,8,0 .fill PTI_USER_PGD_FILL,8,0 +SYM_DATA_END(early_top_pgt) -NEXT_PAGE(early_dynamic_pgts) +SYM_DATA_START_PAGE_ALIGNED(early_dynamic_pgts) .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 +SYM_DATA_END(early_dynamic_pgts) + +SYM_DATA(early_recursion_flag, .long 0) .data #if defined(CONFIG_XEN_PV) || defined(CONFIG_PVH) -NEXT_PGD_PAGE(init_top_pgt) +SYM_DATA_START_PTI_ALIGNED(init_top_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + L4_PAGE_OFFSET*8, 0 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC @@ -393,11 +392,13 @@ NEXT_PGD_PAGE(init_top_pgt) /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC .fill PTI_USER_PGD_FILL,8,0 +SYM_DATA_END(init_top_pgt) -NEXT_PAGE(level3_ident_pgt) +SYM_DATA_START_PAGE_ALIGNED(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .fill 511, 8, 0 -NEXT_PAGE(level2_ident_pgt) +SYM_DATA_END(level3_ident_pgt) +SYM_DATA_START_PAGE_ALIGNED(level2_ident_pgt) /* * Since I easily can, map the first 1G. * Don't set NX because code runs from these pages. @@ -407,25 +408,29 @@ NEXT_PAGE(level2_ident_pgt) * the CPU should ignore the bit. */ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) +SYM_DATA_END(level2_ident_pgt) #else -NEXT_PGD_PAGE(init_top_pgt) +SYM_DATA_START_PTI_ALIGNED(init_top_pgt) .fill 512,8,0 .fill PTI_USER_PGD_FILL,8,0 +SYM_DATA_END(init_top_pgt) #endif #ifdef CONFIG_X86_5LEVEL -NEXT_PAGE(level4_kernel_pgt) +SYM_DATA_START_PAGE_ALIGNED(level4_kernel_pgt) .fill 511,8,0 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC +SYM_DATA_END(level4_kernel_pgt) #endif -NEXT_PAGE(level3_kernel_pgt) +SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC +SYM_DATA_END(level3_kernel_pgt) -NEXT_PAGE(level2_kernel_pgt) +SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt) /* * 512 MB kernel mapping. We spend a full page on this pagetable * anyway. @@ -442,8 +447,9 @@ NEXT_PAGE(level2_kernel_pgt) */ PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) +SYM_DATA_END(level2_kernel_pgt) -NEXT_PAGE(level2_fixmap_pgt) +SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt) .fill (512 - 4 - FIXMAP_PMD_NUM),8,0 pgtno = 0 .rept (FIXMAP_PMD_NUM) @@ -453,31 +459,32 @@ NEXT_PAGE(level2_fixmap_pgt) .endr /* 6 MB reserved space + a 2MB hole */ .fill 4,8,0 +SYM_DATA_END(level2_fixmap_pgt) -NEXT_PAGE(level1_fixmap_pgt) +SYM_DATA_START_PAGE_ALIGNED(level1_fixmap_pgt) .rept (FIXMAP_PMD_NUM) .fill 512,8,0 .endr +SYM_DATA_END(level1_fixmap_pgt) #undef PMDS .data .align 16 - .globl early_gdt_descr -early_gdt_descr: - .word GDT_ENTRIES*8-1 -early_gdt_descr_base: - .quad INIT_PER_CPU_VAR(gdt_page) - -ENTRY(phys_base) - /* This must match the first entry in level2_kernel_pgt */ - .quad 0x0000000000000000 + +SYM_DATA(early_gdt_descr, .word GDT_ENTRIES*8-1) +SYM_DATA_LOCAL(early_gdt_descr_base, .quad INIT_PER_CPU_VAR(gdt_page)) + + .align 16 +/* This must match the first entry in level2_kernel_pgt */ +SYM_DATA(phys_base, .quad 0x0) EXPORT_SYMBOL(phys_base) #include "../../x86/xen/xen-head.S" __PAGE_ALIGNED_BSS -NEXT_PAGE(empty_zero_page) +SYM_DATA_START_PAGE_ALIGNED(empty_zero_page) .skip PAGE_SIZE +SYM_DATA_END(empty_zero_page) EXPORT_SYMBOL(empty_zero_page) diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 61a89d3c0382..8abeee0dd7bf 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -3,32 +3,69 @@ * This contains the io-permission bitmap code - written by obz, with changes * by Linus. 32/64 bits code unification by Miguel Botón. */ - -#include <linux/sched.h> -#include <linux/sched/task_stack.h> -#include <linux/kernel.h> #include <linux/capability.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/ioport.h> #include <linux/security.h> -#include <linux/smp.h> -#include <linux/stddef.h> -#include <linux/slab.h> -#include <linux/thread_info.h> #include <linux/syscalls.h> #include <linux/bitmap.h> -#include <asm/syscalls.h> +#include <linux/ioport.h> +#include <linux/sched.h> +#include <linux/slab.h> + +#include <asm/io_bitmap.h> #include <asm/desc.h> +#ifdef CONFIG_X86_IOPL_IOPERM + +static atomic64_t io_bitmap_sequence; + +void io_bitmap_share(struct task_struct *tsk) +{ + /* Can be NULL when current->thread.iopl_emul == 3 */ + if (current->thread.io_bitmap) { + /* + * Take a refcount on current's bitmap. It can be used by + * both tasks as long as none of them changes the bitmap. + */ + refcount_inc(¤t->thread.io_bitmap->refcnt); + tsk->thread.io_bitmap = current->thread.io_bitmap; + } + set_tsk_thread_flag(tsk, TIF_IO_BITMAP); +} + +static void task_update_io_bitmap(void) +{ + struct thread_struct *t = ¤t->thread; + + if (t->iopl_emul == 3 || t->io_bitmap) { + /* TSS update is handled on exit to user space */ + set_thread_flag(TIF_IO_BITMAP); + } else { + clear_thread_flag(TIF_IO_BITMAP); + /* Invalidate TSS */ + preempt_disable(); + tss_update_io_bitmap(); + preempt_enable(); + } +} + +void io_bitmap_exit(void) +{ + struct io_bitmap *iobm = current->thread.io_bitmap; + + current->thread.io_bitmap = NULL; + task_update_io_bitmap(); + if (iobm && refcount_dec_and_test(&iobm->refcnt)) + kfree(iobm); +} + /* - * this changes the io permissions bitmap in the current task. + * This changes the io permissions bitmap in the current task. */ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) { struct thread_struct *t = ¤t->thread; - struct tss_struct *tss; - unsigned int i, max_long, bytes, bytes_updated; + unsigned int i, max_long; + struct io_bitmap *iobm; if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) return -EINVAL; @@ -41,59 +78,72 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) * IO bitmap up. ioperm() is much less timing critical than clone(), * this is why we delay this operation until now: */ - if (!t->io_bitmap_ptr) { - unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); - - if (!bitmap) + iobm = t->io_bitmap; + if (!iobm) { + /* No point to allocate a bitmap just to clear permissions */ + if (!turn_on) + return 0; + iobm = kmalloc(sizeof(*iobm), GFP_KERNEL); + if (!iobm) return -ENOMEM; - memset(bitmap, 0xff, IO_BITMAP_BYTES); - t->io_bitmap_ptr = bitmap; - set_thread_flag(TIF_IO_BITMAP); + memset(iobm->bitmap, 0xff, sizeof(iobm->bitmap)); + refcount_set(&iobm->refcnt, 1); + } - /* - * Now that we have an IO bitmap, we need our TSS limit to be - * correct. It's fine if we are preempted after doing this: - * with TIF_IO_BITMAP set, context switches will keep our TSS - * limit correct. - */ - preempt_disable(); - refresh_tss_limit(); - preempt_enable(); + /* + * If the bitmap is not shared, then nothing can take a refcount as + * current can obviously not fork at the same time. If it's shared + * duplicate it and drop the refcount on the original one. + */ + if (refcount_read(&iobm->refcnt) > 1) { + iobm = kmemdup(iobm, sizeof(*iobm), GFP_KERNEL); + if (!iobm) + return -ENOMEM; + refcount_set(&iobm->refcnt, 1); + io_bitmap_exit(); } /* - * do it in the per-thread copy and in the TSS ... - * - * Disable preemption via get_cpu() - we must not switch away - * because the ->io_bitmap_max value must match the bitmap - * contents: + * Store the bitmap pointer (might be the same if the task already + * head one). Must be done here so freeing the bitmap when all + * permissions are dropped has the pointer set up. */ - tss = &per_cpu(cpu_tss_rw, get_cpu()); + t->io_bitmap = iobm; + /* Mark it active for context switching and exit to user mode */ + set_thread_flag(TIF_IO_BITMAP); + /* + * Update the tasks bitmap. The update of the TSS bitmap happens on + * exit to user mode. So this needs no protection. + */ if (turn_on) - bitmap_clear(t->io_bitmap_ptr, from, num); + bitmap_clear(iobm->bitmap, from, num); else - bitmap_set(t->io_bitmap_ptr, from, num); + bitmap_set(iobm->bitmap, from, num); /* * Search for a (possibly new) maximum. This is simple and stupid, * to keep it obviously correct: */ - max_long = 0; - for (i = 0; i < IO_BITMAP_LONGS; i++) - if (t->io_bitmap_ptr[i] != ~0UL) + max_long = UINT_MAX; + for (i = 0; i < IO_BITMAP_LONGS; i++) { + if (iobm->bitmap[i] != ~0UL) max_long = i; + } + /* All permissions dropped? */ + if (max_long == UINT_MAX) { + io_bitmap_exit(); + return 0; + } - bytes = (max_long + 1) * sizeof(unsigned long); - bytes_updated = max(bytes, t->io_bitmap_max); - - t->io_bitmap_max = bytes; - - /* Update the TSS: */ - memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); + iobm->max = (max_long + 1) * sizeof(unsigned long); - put_cpu(); + /* + * Update the sequence number to force a TSS update on return to + * user mode. + */ + iobm->sequence = atomic64_add_return(1, &io_bitmap_sequence); return 0; } @@ -104,38 +154,61 @@ SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on) } /* - * sys_iopl has to be used when you want to access the IO ports - * beyond the 0x3ff range: to get the full 65536 ports bitmapped - * you'd need 8kB of bitmaps/process, which is a bit excessive. + * The sys_iopl functionality depends on the level argument, which if + * granted for the task is used to enable access to all 65536 I/O ports. + * + * This does not use the IOPL mechanism provided by the CPU as that would + * also allow the user space task to use the CLI/STI instructions. * - * Here we just change the flags value on the stack: we allow - * only the super-user to do it. This depends on the stack-layout - * on system-call entry - see also fork() and the signal handling - * code. + * Disabling interrupts in a user space task is dangerous as it might lock + * up the machine and the semantics vs. syscalls and exceptions is + * undefined. + * + * Setting IOPL to level 0-2 is disabling I/O permissions. Level 3 + * 3 enables them. + * + * IOPL is strictly per thread and inherited on fork. */ SYSCALL_DEFINE1(iopl, unsigned int, level) { - struct pt_regs *regs = current_pt_regs(); struct thread_struct *t = ¤t->thread; - - /* - * Careful: the IOPL bits in regs->flags are undefined under Xen PV - * and changing them has no effect. - */ - unsigned int old = t->iopl >> X86_EFLAGS_IOPL_BIT; + unsigned int old; if (level > 3) return -EINVAL; + + old = t->iopl_emul; + + /* No point in going further if nothing changes */ + if (level == old) + return 0; + /* Trying to gain more privileges? */ if (level > old) { if (!capable(CAP_SYS_RAWIO) || security_locked_down(LOCKDOWN_IOPORT)) return -EPERM; } - regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | - (level << X86_EFLAGS_IOPL_BIT); - t->iopl = level << X86_EFLAGS_IOPL_BIT; - set_iopl_mask(t->iopl); + + t->iopl_emul = level; + task_update_io_bitmap(); return 0; } + +#else /* CONFIG_X86_IOPL_IOPERM */ + +long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + return -ENOSYS; +} +SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on) +{ + return -ENOSYS; +} + +SYSCALL_DEFINE1(iopl, unsigned int, level) +{ + return -ENOSYS; +} +#endif diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S index ddeeaac8adda..0db0375235b4 100644 --- a/arch/x86/kernel/irqflags.S +++ b/arch/x86/kernel/irqflags.S @@ -7,20 +7,20 @@ /* * unsigned long native_save_fl(void) */ -ENTRY(native_save_fl) +SYM_FUNC_START(native_save_fl) pushf pop %_ASM_AX ret -ENDPROC(native_save_fl) +SYM_FUNC_END(native_save_fl) EXPORT_SYMBOL(native_save_fl) /* * void native_restore_fl(unsigned long flags) * %eax/%rdi: flags */ -ENTRY(native_restore_fl) +SYM_FUNC_START(native_restore_fl) push %_ASM_ARG1 popf ret -ENDPROC(native_restore_fl) +SYM_FUNC_END(native_restore_fl) EXPORT_SYMBOL(native_restore_fl) diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 3ad34f01de2a..6eb8b50ea07e 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -11,6 +11,7 @@ #include <linux/acpi_pmtmr.h> #include <linux/kernel.h> #include <linux/reboot.h> +#include <linux/serial_8250.h> #include <asm/apic.h> #include <asm/cpu.h> #include <asm/hypervisor.h> @@ -21,9 +22,24 @@ #include <asm/setup.h> #include <asm/jailhouse_para.h> -static __initdata struct jailhouse_setup_data setup_data; +static struct jailhouse_setup_data setup_data; +#define SETUP_DATA_V1_LEN (sizeof(setup_data.hdr) + sizeof(setup_data.v1)) +#define SETUP_DATA_V2_LEN (SETUP_DATA_V1_LEN + sizeof(setup_data.v2)) + static unsigned int precalibrated_tsc_khz; +static void jailhouse_setup_irq(unsigned int irq) +{ + struct mpc_intsrc mp_irq = { + .type = MP_INTSRC, + .irqtype = mp_INT, + .irqflag = MP_IRQPOL_ACTIVE_HIGH | MP_IRQTRIG_EDGE, + .srcbusirq = irq, + .dstirq = irq, + }; + mp_save_irq(&mp_irq); +} + static uint32_t jailhouse_cpuid_base(void) { if (boot_cpu_data.cpuid_level < 0 || @@ -45,7 +61,7 @@ static void jailhouse_get_wallclock(struct timespec64 *now) static void __init jailhouse_timer_init(void) { - lapic_timer_period = setup_data.apic_khz * (1000 / HZ); + lapic_timer_period = setup_data.v1.apic_khz * (1000 / HZ); } static unsigned long jailhouse_get_tsc(void) @@ -77,33 +93,28 @@ static void __init jailhouse_get_smp_config(unsigned int early) .type = IOAPIC_DOMAIN_STRICT, .ops = &mp_ioapic_irqdomain_ops, }; - struct mpc_intsrc mp_irq = { - .type = MP_INTSRC, - .irqtype = mp_INT, - .irqflag = MP_IRQPOL_ACTIVE_HIGH | MP_IRQTRIG_EDGE, - }; unsigned int cpu; jailhouse_x2apic_init(); register_lapic_address(0xfee00000); - for (cpu = 0; cpu < setup_data.num_cpus; cpu++) { - generic_processor_info(setup_data.cpu_ids[cpu], + for (cpu = 0; cpu < setup_data.v1.num_cpus; cpu++) { + generic_processor_info(setup_data.v1.cpu_ids[cpu], boot_cpu_apic_version); } smp_found_config = 1; - if (setup_data.standard_ioapic) { + if (setup_data.v1.standard_ioapic) { mp_register_ioapic(0, 0xfec00000, gsi_top, &ioapic_cfg); - /* Register 1:1 mapping for legacy UART IRQs 3 and 4 */ - mp_irq.srcbusirq = mp_irq.dstirq = 3; - mp_save_irq(&mp_irq); - - mp_irq.srcbusirq = mp_irq.dstirq = 4; - mp_save_irq(&mp_irq); + if (IS_ENABLED(CONFIG_SERIAL_8250) && + setup_data.hdr.version < 2) { + /* Register 1:1 mapping for legacy UART IRQs 3 and 4 */ + jailhouse_setup_irq(3); + jailhouse_setup_irq(4); + } } } @@ -126,9 +137,9 @@ static int __init jailhouse_pci_arch_init(void) pcibios_last_bus = 0xff; #ifdef CONFIG_PCI_MMCONFIG - if (setup_data.pci_mmconfig_base) { + if (setup_data.v1.pci_mmconfig_base) { pci_mmconfig_add(0, 0, pcibios_last_bus, - setup_data.pci_mmconfig_base); + setup_data.v1.pci_mmconfig_base); pci_mmcfg_arch_init(); } #endif @@ -136,9 +147,57 @@ static int __init jailhouse_pci_arch_init(void) return 0; } +#ifdef CONFIG_SERIAL_8250 +static inline bool jailhouse_uart_enabled(unsigned int uart_nr) +{ + return setup_data.v2.flags & BIT(uart_nr); +} + +static void jailhouse_serial_fixup(int port, struct uart_port *up, + u32 *capabilities) +{ + static const u16 pcuart_base[] = {0x3f8, 0x2f8, 0x3e8, 0x2e8}; + unsigned int n; + + for (n = 0; n < ARRAY_SIZE(pcuart_base); n++) { + if (pcuart_base[n] != up->iobase) + continue; + + if (jailhouse_uart_enabled(n)) { + pr_info("Enabling UART%u (port 0x%lx)\n", n, + up->iobase); + jailhouse_setup_irq(up->irq); + } else { + /* Deactivate UART if access isn't allowed */ + up->iobase = 0; + } + break; + } +} + +static void __init jailhouse_serial_workaround(void) +{ + /* + * There are flags inside setup_data that indicate availability of + * platform UARTs since setup data version 2. + * + * In case of version 1, we don't know which UARTs belong Linux. In + * this case, unconditionally register 1:1 mapping for legacy UART IRQs + * 3 and 4. + */ + if (setup_data.hdr.version > 1) + serial8250_set_isa_configurator(jailhouse_serial_fixup); +} +#else /* !CONFIG_SERIAL_8250 */ +static inline void jailhouse_serial_workaround(void) +{ +} +#endif /* CONFIG_SERIAL_8250 */ + static void __init jailhouse_init_platform(void) { u64 pa_data = boot_params.hdr.setup_data; + unsigned long setup_data_len; struct setup_data header; void *mapping; @@ -163,16 +222,8 @@ static void __init jailhouse_init_platform(void) memcpy(&header, mapping, sizeof(header)); early_memunmap(mapping, sizeof(header)); - if (header.type == SETUP_JAILHOUSE && - header.len >= sizeof(setup_data)) { - pa_data += offsetof(struct setup_data, data); - - mapping = early_memremap(pa_data, sizeof(setup_data)); - memcpy(&setup_data, mapping, sizeof(setup_data)); - early_memunmap(mapping, sizeof(setup_data)); - + if (header.type == SETUP_JAILHOUSE) break; - } pa_data = header.next; } @@ -180,13 +231,28 @@ static void __init jailhouse_init_platform(void) if (!pa_data) panic("Jailhouse: No valid setup data found"); - if (setup_data.compatible_version > JAILHOUSE_SETUP_REQUIRED_VERSION) - panic("Jailhouse: Unsupported setup data structure"); - - pmtmr_ioport = setup_data.pm_timer_address; + /* setup data must at least contain the header */ + if (header.len < sizeof(setup_data.hdr)) + goto unsupported; + + pa_data += offsetof(struct setup_data, data); + setup_data_len = min_t(unsigned long, sizeof(setup_data), + (unsigned long)header.len); + mapping = early_memremap(pa_data, setup_data_len); + memcpy(&setup_data, mapping, setup_data_len); + early_memunmap(mapping, setup_data_len); + + if (setup_data.hdr.version == 0 || + setup_data.hdr.compatible_version != + JAILHOUSE_SETUP_REQUIRED_VERSION || + (setup_data.hdr.version == 1 && header.len < SETUP_DATA_V1_LEN) || + (setup_data.hdr.version >= 2 && header.len < SETUP_DATA_V2_LEN)) + goto unsupported; + + pmtmr_ioport = setup_data.v1.pm_timer_address; pr_debug("Jailhouse: PM-Timer IO Port: %#x\n", pmtmr_ioport); - precalibrated_tsc_khz = setup_data.tsc_khz; + precalibrated_tsc_khz = setup_data.v1.tsc_khz; setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); pci_probe = 0; @@ -196,6 +262,12 @@ static void __init jailhouse_init_platform(void) * are none in a non-root cell. */ disable_acpi(); + + jailhouse_serial_workaround(); + return; + +unsupported: + panic("Jailhouse: Unsupported setup data structure"); } bool jailhouse_paravirt(void) diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 044053235302..c1a8b9e71408 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -89,8 +89,7 @@ static void __ref __jump_label_transform(struct jump_entry *entry, return; } - text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE, - (void *)jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); + text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE, NULL); } void arch_jump_label_transform(struct jump_entry *entry, @@ -147,11 +146,9 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, } __jump_label_set_jump_code(entry, type, - (union jump_code_union *) &tp->opcode, 0); + (union jump_code_union *)&tp->text, 0); - tp->addr = entry_code; - tp->detour = entry_code + JUMP_LABEL_NOP_SIZE; - tp->len = JUMP_LABEL_NOP_SIZE; + text_poke_loc_init(tp, entry_code, NULL, JUMP_LABEL_NOP_SIZE, NULL); tp_vec_nr++; diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index edaa30b20841..64b6da95af98 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -44,7 +44,12 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf, if (count > node->len - pos) count = node->len - pos; - pa = node->paddr + sizeof(struct setup_data) + pos; + pa = node->paddr + pos; + + /* Is it direct data or invalid indirect one? */ + if (!(node->type & SETUP_INDIRECT) || node->type == SETUP_INDIRECT) + pa += sizeof(struct setup_data); + p = memremap(pa, count, MEMREMAP_WB); if (!p) return -ENOMEM; @@ -108,9 +113,17 @@ static int __init create_setup_data_nodes(struct dentry *parent) goto err_dir; } - node->paddr = pa_data; - node->type = data->type; - node->len = data->len; + if (data->type == SETUP_INDIRECT && + ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { + node->paddr = ((struct setup_indirect *)data->data)->addr; + node->type = ((struct setup_indirect *)data->data)->type; + node->len = ((struct setup_indirect *)data->data)->len; + } else { + node->paddr = pa_data; + node->type = data->type; + node->len = data->len; + } + create_setup_data_node(d, no, node); pa_data = data->next; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 43fc13c831af..4f13af7cbcdb 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -351,6 +351,10 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) kernel_insn_init(insn, dest, MAX_INSN_SIZE); insn_get_length(insn); + /* We can not probe force emulate prefixed instruction */ + if (insn_has_emulate_prefix(insn)) + return 0; + /* Another subsystem puts a breakpoint, failed to recover */ if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION) return 0; diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index b348dd506d58..8900329c28a7 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -437,8 +437,7 @@ void arch_optimize_kprobes(struct list_head *oplist) insn_buff[0] = RELATIVEJUMP_OPCODE; *(s32 *)(&insn_buff[1]) = rel; - text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, - op->optinsn.insn); + text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, NULL); list_del_init(&op->list); } @@ -448,12 +447,18 @@ void arch_optimize_kprobes(struct list_head *oplist) void arch_unoptimize_kprobe(struct optimized_kprobe *op) { u8 insn_buff[RELATIVEJUMP_SIZE]; + u8 emulate_buff[RELATIVEJUMP_SIZE]; /* Set int3 to first byte for kprobes */ insn_buff[0] = BREAKPOINT_INSTRUCTION; memcpy(insn_buff + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + + emulate_buff[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&emulate_buff[1]) = (s32)((long)op->optinsn.insn - + ((long)op->kp.addr + RELATIVEJUMP_SIZE)); + text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, - op->optinsn.insn); + emulate_buff); } /* diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index 7969da939213..d0a19121c6a4 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -100,7 +100,12 @@ static int __init get_setup_data_size(int nr, size_t *size) if (!data) return -ENOMEM; if (nr == i) { - *size = data->len; + if (data->type == SETUP_INDIRECT && + ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) + *size = ((struct setup_indirect *)data->data)->len; + else + *size = data->len; + memunmap(data); return 0; } @@ -130,7 +135,10 @@ static ssize_t type_show(struct kobject *kobj, if (!data) return -ENOMEM; - ret = sprintf(buf, "0x%x\n", data->type); + if (data->type == SETUP_INDIRECT) + ret = sprintf(buf, "0x%x\n", ((struct setup_indirect *)data->data)->type); + else + ret = sprintf(buf, "0x%x\n", data->type); memunmap(data); return ret; } @@ -142,7 +150,7 @@ static ssize_t setup_data_data_read(struct file *fp, loff_t off, size_t count) { int nr, ret = 0; - u64 paddr; + u64 paddr, len; struct setup_data *data; void *p; @@ -157,19 +165,28 @@ static ssize_t setup_data_data_read(struct file *fp, if (!data) return -ENOMEM; - if (off > data->len) { + if (data->type == SETUP_INDIRECT && + ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { + paddr = ((struct setup_indirect *)data->data)->addr; + len = ((struct setup_indirect *)data->data)->len; + } else { + paddr += sizeof(*data); + len = data->len; + } + + if (off > len) { ret = -EINVAL; goto out; } - if (count > data->len - off) - count = data->len - off; + if (count > len - off) + count = len - off; if (!count) goto out; ret = count; - p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_WB); + p = memremap(paddr, len, MEMREMAP_WB); if (!p) { ret = -ENOMEM; goto out; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index e820568ed4d5..32ef1ee733b7 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -33,6 +33,7 @@ #include <asm/apicdef.h> #include <asm/hypervisor.h> #include <asm/tlb.h> +#include <asm/cpuidle_haltpoll.h> static int kvmapf = 1; diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 5dcd438ad8f2..16e125a50b33 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -298,48 +298,6 @@ static void load_segments(void) ); } -#ifdef CONFIG_KEXEC_FILE -/* Update purgatory as needed after various image segments have been prepared */ -static int arch_update_purgatory(struct kimage *image) -{ - int ret = 0; - - if (!image->file_mode) - return 0; - - /* Setup copying of backup region */ - if (image->type == KEXEC_TYPE_CRASH) { - ret = kexec_purgatory_get_set_symbol(image, - "purgatory_backup_dest", - &image->arch.backup_load_addr, - sizeof(image->arch.backup_load_addr), 0); - if (ret) - return ret; - - ret = kexec_purgatory_get_set_symbol(image, - "purgatory_backup_src", - &image->arch.backup_src_start, - sizeof(image->arch.backup_src_start), 0); - if (ret) - return ret; - - ret = kexec_purgatory_get_set_symbol(image, - "purgatory_backup_sz", - &image->arch.backup_src_sz, - sizeof(image->arch.backup_src_sz), 0); - if (ret) - return ret; - } - - return ret; -} -#else /* !CONFIG_KEXEC_FILE */ -static inline int arch_update_purgatory(struct kimage *image) -{ - return 0; -} -#endif /* CONFIG_KEXEC_FILE */ - int machine_kexec_prepare(struct kimage *image) { unsigned long start_pgtable; @@ -353,11 +311,6 @@ int machine_kexec_prepare(struct kimage *image) if (result) return result; - /* update purgatory as needed */ - result = arch_update_purgatory(image); - if (result) - return result; - return 0; } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 59d3d2763a9e..789f5e4f89de 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -341,8 +341,6 @@ struct paravirt_patch_template pv_ops = { .cpu.iret = native_iret, .cpu.swapgs = native_swapgs, - .cpu.set_iopl_mask = native_set_iopl_mask, - .cpu.start_context_switch = paravirt_nop, .cpu.end_context_switch = paravirt_nop, diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c deleted file mode 100644 index 23fdec030c37..000000000000 --- a/arch/x86/kernel/pci-calgary_64.c +++ /dev/null @@ -1,1586 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Derived from arch/powerpc/kernel/iommu.c - * - * Copyright IBM Corporation, 2006-2007 - * Copyright (C) 2006 Jon Mason <jdmason@kudzu.us> - * - * Author: Jon Mason <jdmason@kudzu.us> - * Author: Muli Ben-Yehuda <muli@il.ibm.com> - - */ - -#define pr_fmt(fmt) "Calgary: " fmt - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/mm.h> -#include <linux/spinlock.h> -#include <linux/string.h> -#include <linux/crash_dump.h> -#include <linux/dma-mapping.h> -#include <linux/dma-direct.h> -#include <linux/bitmap.h> -#include <linux/pci_ids.h> -#include <linux/pci.h> -#include <linux/delay.h> -#include <linux/scatterlist.h> -#include <linux/iommu-helper.h> - -#include <asm/iommu.h> -#include <asm/calgary.h> -#include <asm/tce.h> -#include <asm/pci-direct.h> -#include <asm/dma.h> -#include <asm/rio.h> -#include <asm/bios_ebda.h> -#include <asm/x86_init.h> -#include <asm/iommu_table.h> - -#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT -int use_calgary __read_mostly = 1; -#else -int use_calgary __read_mostly = 0; -#endif /* CONFIG_CALGARY_DEFAULT_ENABLED */ - -#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1 -#define PCI_DEVICE_ID_IBM_CALIOC2 0x0308 - -/* register offsets inside the host bridge space */ -#define CALGARY_CONFIG_REG 0x0108 -#define PHB_CSR_OFFSET 0x0110 /* Channel Status */ -#define PHB_PLSSR_OFFSET 0x0120 -#define PHB_CONFIG_RW_OFFSET 0x0160 -#define PHB_IOBASE_BAR_LOW 0x0170 -#define PHB_IOBASE_BAR_HIGH 0x0180 -#define PHB_MEM_1_LOW 0x0190 -#define PHB_MEM_1_HIGH 0x01A0 -#define PHB_IO_ADDR_SIZE 0x01B0 -#define PHB_MEM_1_SIZE 0x01C0 -#define PHB_MEM_ST_OFFSET 0x01D0 -#define PHB_AER_OFFSET 0x0200 -#define PHB_CONFIG_0_HIGH 0x0220 -#define PHB_CONFIG_0_LOW 0x0230 -#define PHB_CONFIG_0_END 0x0240 -#define PHB_MEM_2_LOW 0x02B0 -#define PHB_MEM_2_HIGH 0x02C0 -#define PHB_MEM_2_SIZE_HIGH 0x02D0 -#define PHB_MEM_2_SIZE_LOW 0x02E0 -#define PHB_DOSHOLE_OFFSET 0x08E0 - -/* CalIOC2 specific */ -#define PHB_SAVIOR_L2 0x0DB0 -#define PHB_PAGE_MIG_CTRL 0x0DA8 -#define PHB_PAGE_MIG_DEBUG 0x0DA0 -#define PHB_ROOT_COMPLEX_STATUS 0x0CB0 - -/* PHB_CONFIG_RW */ -#define PHB_TCE_ENABLE 0x20000000 -#define PHB_SLOT_DISABLE 0x1C000000 -#define PHB_DAC_DISABLE 0x01000000 -#define PHB_MEM2_ENABLE 0x00400000 -#define PHB_MCSR_ENABLE 0x00100000 -/* TAR (Table Address Register) */ -#define TAR_SW_BITS 0x0000ffffffff800fUL -#define TAR_VALID 0x0000000000000008UL -/* CSR (Channel/DMA Status Register) */ -#define CSR_AGENT_MASK 0xffe0ffff -/* CCR (Calgary Configuration Register) */ -#define CCR_2SEC_TIMEOUT 0x000000000000000EUL -/* PMCR/PMDR (Page Migration Control/Debug Registers */ -#define PMR_SOFTSTOP 0x80000000 -#define PMR_SOFTSTOPFAULT 0x40000000 -#define PMR_HARDSTOP 0x20000000 - -/* - * The maximum PHB bus number. - * x3950M2 (rare): 8 chassis, 48 PHBs per chassis = 384 - * x3950M2: 4 chassis, 48 PHBs per chassis = 192 - * x3950 (PCIE): 8 chassis, 32 PHBs per chassis = 256 - * x3950 (PCIX): 8 chassis, 16 PHBs per chassis = 128 - */ -#define MAX_PHB_BUS_NUM 256 - -#define PHBS_PER_CALGARY 4 - -/* register offsets in Calgary's internal register space */ -static const unsigned long tar_offsets[] = { - 0x0580 /* TAR0 */, - 0x0588 /* TAR1 */, - 0x0590 /* TAR2 */, - 0x0598 /* TAR3 */ -}; - -static const unsigned long split_queue_offsets[] = { - 0x4870 /* SPLIT QUEUE 0 */, - 0x5870 /* SPLIT QUEUE 1 */, - 0x6870 /* SPLIT QUEUE 2 */, - 0x7870 /* SPLIT QUEUE 3 */ -}; - -static const unsigned long phb_offsets[] = { - 0x8000 /* PHB0 */, - 0x9000 /* PHB1 */, - 0xA000 /* PHB2 */, - 0xB000 /* PHB3 */ -}; - -/* PHB debug registers */ - -static const unsigned long phb_debug_offsets[] = { - 0x4000 /* PHB 0 DEBUG */, - 0x5000 /* PHB 1 DEBUG */, - 0x6000 /* PHB 2 DEBUG */, - 0x7000 /* PHB 3 DEBUG */ -}; - -/* - * STUFF register for each debug PHB, - * byte 1 = start bus number, byte 2 = end bus number - */ - -#define PHB_DEBUG_STUFF_OFFSET 0x0020 - -unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; -static int translate_empty_slots __read_mostly = 0; -static int calgary_detected __read_mostly = 0; - -static struct rio_table_hdr *rio_table_hdr __initdata; -static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; -static struct rio_detail *rio_devs[MAX_NUMNODES * 4] __initdata; - -struct calgary_bus_info { - void *tce_space; - unsigned char translation_disabled; - signed char phbid; - void __iomem *bbar; -}; - -static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); -static void calgary_tce_cache_blast(struct iommu_table *tbl); -static void calgary_dump_error_regs(struct iommu_table *tbl); -static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); -static void calioc2_tce_cache_blast(struct iommu_table *tbl); -static void calioc2_dump_error_regs(struct iommu_table *tbl); -static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl); -static void get_tce_space_from_tar(void); - -static const struct cal_chipset_ops calgary_chip_ops = { - .handle_quirks = calgary_handle_quirks, - .tce_cache_blast = calgary_tce_cache_blast, - .dump_error_regs = calgary_dump_error_regs -}; - -static const struct cal_chipset_ops calioc2_chip_ops = { - .handle_quirks = calioc2_handle_quirks, - .tce_cache_blast = calioc2_tce_cache_blast, - .dump_error_regs = calioc2_dump_error_regs -}; - -static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; - -static inline int translation_enabled(struct iommu_table *tbl) -{ - /* only PHBs with translation enabled have an IOMMU table */ - return (tbl != NULL); -} - -static void iommu_range_reserve(struct iommu_table *tbl, - unsigned long start_addr, unsigned int npages) -{ - unsigned long index; - unsigned long end; - unsigned long flags; - - index = start_addr >> PAGE_SHIFT; - - /* bail out if we're asked to reserve a region we don't cover */ - if (index >= tbl->it_size) - return; - - end = index + npages; - if (end > tbl->it_size) /* don't go off the table */ - end = tbl->it_size; - - spin_lock_irqsave(&tbl->it_lock, flags); - - bitmap_set(tbl->it_map, index, npages); - - spin_unlock_irqrestore(&tbl->it_lock, flags); -} - -static unsigned long iommu_range_alloc(struct device *dev, - struct iommu_table *tbl, - unsigned int npages) -{ - unsigned long flags; - unsigned long offset; - unsigned long boundary_size; - - boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, - PAGE_SIZE) >> PAGE_SHIFT; - - BUG_ON(npages == 0); - - spin_lock_irqsave(&tbl->it_lock, flags); - - offset = iommu_area_alloc(tbl->it_map, tbl->it_size, tbl->it_hint, - npages, 0, boundary_size, 0); - if (offset == ~0UL) { - tbl->chip_ops->tce_cache_blast(tbl); - - offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0, - npages, 0, boundary_size, 0); - if (offset == ~0UL) { - pr_warn("IOMMU full\n"); - spin_unlock_irqrestore(&tbl->it_lock, flags); - if (panic_on_overflow) - panic("Calgary: fix the allocator.\n"); - else - return DMA_MAPPING_ERROR; - } - } - - tbl->it_hint = offset + npages; - BUG_ON(tbl->it_hint > tbl->it_size); - - spin_unlock_irqrestore(&tbl->it_lock, flags); - - return offset; -} - -static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, - void *vaddr, unsigned int npages, int direction) -{ - unsigned long entry; - dma_addr_t ret; - - entry = iommu_range_alloc(dev, tbl, npages); - if (unlikely(entry == DMA_MAPPING_ERROR)) { - pr_warn("failed to allocate %u pages in iommu %p\n", - npages, tbl); - return DMA_MAPPING_ERROR; - } - - /* set the return dma address */ - ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); - - /* put the TCEs in the HW table */ - tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, - direction); - return ret; -} - -static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, - unsigned int npages) -{ - unsigned long entry; - unsigned long flags; - - /* were we called with bad_dma_address? */ - if (unlikely(dma_addr == DMA_MAPPING_ERROR)) { - WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " - "address 0x%Lx\n", dma_addr); - return; - } - - entry = dma_addr >> PAGE_SHIFT; - - BUG_ON(entry + npages > tbl->it_size); - - tce_free(tbl, entry, npages); - - spin_lock_irqsave(&tbl->it_lock, flags); - - bitmap_clear(tbl->it_map, entry, npages); - - spin_unlock_irqrestore(&tbl->it_lock, flags); -} - -static inline struct iommu_table *find_iommu_table(struct device *dev) -{ - struct pci_dev *pdev; - struct pci_bus *pbus; - struct iommu_table *tbl; - - pdev = to_pci_dev(dev); - - /* search up the device tree for an iommu */ - pbus = pdev->bus; - do { - tbl = pci_iommu(pbus); - if (tbl && tbl->it_busno == pbus->number) - break; - tbl = NULL; - pbus = pbus->parent; - } while (pbus); - - BUG_ON(tbl && (tbl->it_busno != pbus->number)); - - return tbl; -} - -static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist, - int nelems,enum dma_data_direction dir, - unsigned long attrs) -{ - struct iommu_table *tbl = find_iommu_table(dev); - struct scatterlist *s; - int i; - - if (!translation_enabled(tbl)) - return; - - for_each_sg(sglist, s, nelems, i) { - unsigned int npages; - dma_addr_t dma = s->dma_address; - unsigned int dmalen = s->dma_length; - - if (dmalen == 0) - break; - - npages = iommu_num_pages(dma, dmalen, PAGE_SIZE); - iommu_free(tbl, dma, npages); - } -} - -static int calgary_map_sg(struct device *dev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir, - unsigned long attrs) -{ - struct iommu_table *tbl = find_iommu_table(dev); - struct scatterlist *s; - unsigned long vaddr; - unsigned int npages; - unsigned long entry; - int i; - - for_each_sg(sg, s, nelems, i) { - BUG_ON(!sg_page(s)); - - vaddr = (unsigned long) sg_virt(s); - npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); - - entry = iommu_range_alloc(dev, tbl, npages); - if (entry == DMA_MAPPING_ERROR) { - /* makes sure unmap knows to stop */ - s->dma_length = 0; - goto error; - } - - s->dma_address = (entry << PAGE_SHIFT) | s->offset; - - /* insert into HW table */ - tce_build(tbl, entry, npages, vaddr & PAGE_MASK, dir); - - s->dma_length = s->length; - } - - return nelems; -error: - calgary_unmap_sg(dev, sg, nelems, dir, 0); - for_each_sg(sg, s, nelems, i) { - sg->dma_address = DMA_MAPPING_ERROR; - sg->dma_length = 0; - } - return 0; -} - -static dma_addr_t calgary_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - void *vaddr = page_address(page) + offset; - unsigned long uaddr; - unsigned int npages; - struct iommu_table *tbl = find_iommu_table(dev); - - uaddr = (unsigned long)vaddr; - npages = iommu_num_pages(uaddr, size, PAGE_SIZE); - - return iommu_alloc(dev, tbl, vaddr, npages, dir); -} - -static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - struct iommu_table *tbl = find_iommu_table(dev); - unsigned int npages; - - npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); - iommu_free(tbl, dma_addr, npages); -} - -static void* calgary_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) -{ - void *ret = NULL; - dma_addr_t mapping; - unsigned int npages, order; - struct iommu_table *tbl = find_iommu_table(dev); - - size = PAGE_ALIGN(size); /* size rounded up to full pages */ - npages = size >> PAGE_SHIFT; - order = get_order(size); - - /* alloc enough pages (and possibly more) */ - ret = (void *)__get_free_pages(flag, order); - if (!ret) - goto error; - memset(ret, 0, size); - - /* set up tces to cover the allocated range */ - mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); - if (mapping == DMA_MAPPING_ERROR) - goto free; - *dma_handle = mapping; - return ret; -free: - free_pages((unsigned long)ret, get_order(size)); - ret = NULL; -error: - return ret; -} - -static void calgary_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle, - unsigned long attrs) -{ - unsigned int npages; - struct iommu_table *tbl = find_iommu_table(dev); - - size = PAGE_ALIGN(size); - npages = size >> PAGE_SHIFT; - - iommu_free(tbl, dma_handle, npages); - free_pages((unsigned long)vaddr, get_order(size)); -} - -static const struct dma_map_ops calgary_dma_ops = { - .alloc = calgary_alloc_coherent, - .free = calgary_free_coherent, - .map_sg = calgary_map_sg, - .unmap_sg = calgary_unmap_sg, - .map_page = calgary_map_page, - .unmap_page = calgary_unmap_page, - .dma_supported = dma_direct_supported, - .mmap = dma_common_mmap, - .get_sgtable = dma_common_get_sgtable, -}; - -static inline void __iomem * busno_to_bbar(unsigned char num) -{ - return bus_info[num].bbar; -} - -static inline int busno_to_phbid(unsigned char num) -{ - return bus_info[num].phbid; -} - -static inline unsigned long split_queue_offset(unsigned char num) -{ - size_t idx = busno_to_phbid(num); - - return split_queue_offsets[idx]; -} - -static inline unsigned long tar_offset(unsigned char num) -{ - size_t idx = busno_to_phbid(num); - - return tar_offsets[idx]; -} - -static inline unsigned long phb_offset(unsigned char num) -{ - size_t idx = busno_to_phbid(num); - - return phb_offsets[idx]; -} - -static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset) -{ - unsigned long target = ((unsigned long)bar) | offset; - return (void __iomem*)target; -} - -static inline int is_calioc2(unsigned short device) -{ - return (device == PCI_DEVICE_ID_IBM_CALIOC2); -} - -static inline int is_calgary(unsigned short device) -{ - return (device == PCI_DEVICE_ID_IBM_CALGARY); -} - -static inline int is_cal_pci_dev(unsigned short device) -{ - return (is_calgary(device) || is_calioc2(device)); -} - -static void calgary_tce_cache_blast(struct iommu_table *tbl) -{ - u64 val; - u32 aer; - int i = 0; - void __iomem *bbar = tbl->bbar; - void __iomem *target; - - /* disable arbitration on the bus */ - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); - aer = readl(target); - writel(0, target); - - /* read plssr to ensure it got there */ - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); - val = readl(target); - - /* poll split queues until all DMA activity is done */ - target = calgary_reg(bbar, split_queue_offset(tbl->it_busno)); - do { - val = readq(target); - i++; - } while ((val & 0xff) != 0xff && i < 100); - if (i == 100) - pr_warn("PCI bus not quiesced, continuing anyway\n"); - - /* invalidate TCE cache */ - target = calgary_reg(bbar, tar_offset(tbl->it_busno)); - writeq(tbl->tar_val, target); - - /* enable arbitration */ - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); - writel(aer, target); - (void)readl(target); /* flush */ -} - -static void calioc2_tce_cache_blast(struct iommu_table *tbl) -{ - void __iomem *bbar = tbl->bbar; - void __iomem *target; - u64 val64; - u32 val; - int i = 0; - int count = 1; - unsigned char bus = tbl->it_busno; - -begin: - printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast " - "sequence - count %d\n", bus, count); - - /* 1. using the Page Migration Control reg set SoftStop */ - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target); - val |= PMR_SOFTSTOP; - printk(KERN_DEBUG "1b. writing 0x%x [LE] to %p\n", val, target); - writel(cpu_to_be32(val), target); - - /* 2. poll split queues until all DMA activity is done */ - printk(KERN_DEBUG "2a. starting to poll split queues\n"); - target = calgary_reg(bbar, split_queue_offset(bus)); - do { - val64 = readq(target); - i++; - } while ((val64 & 0xff) != 0xff && i < 100); - if (i == 100) - pr_warn("CalIOC2: PCI bus not quiesced, continuing anyway\n"); - - /* 3. poll Page Migration DEBUG for SoftStopFault */ - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target); - - /* 4. if SoftStopFault - goto (1) */ - if (val & PMR_SOFTSTOPFAULT) { - if (++count < 100) - goto begin; - else { - pr_warn("CalIOC2: too many SoftStopFaults, aborting TCE cache flush sequence!\n"); - return; /* pray for the best */ - } - } - - /* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */ - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); - printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target); - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target); - - /* 6. invalidate TCE cache */ - printk(KERN_DEBUG "6. invalidating TCE cache\n"); - target = calgary_reg(bbar, tar_offset(bus)); - writeq(tbl->tar_val, target); - - /* 7. Re-read PMCR */ - printk(KERN_DEBUG "7a. Re-reading PMCR\n"); - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target); - - /* 8. Remove HardStop */ - printk(KERN_DEBUG "8a. removing HardStop from PMCR\n"); - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); - val = 0; - printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target); - writel(cpu_to_be32(val), target); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "8c. read 0x%x [LE] from %p\n", val, target); -} - -static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start, - u64 limit) -{ - unsigned int numpages; - - limit = limit | 0xfffff; - limit++; - - numpages = ((limit - start) >> PAGE_SHIFT); - iommu_range_reserve(pci_iommu(dev->bus), start, numpages); -} - -static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev) -{ - void __iomem *target; - u64 low, high, sizelow; - u64 start, limit; - struct iommu_table *tbl = pci_iommu(dev->bus); - unsigned char busnum = dev->bus->number; - void __iomem *bbar = tbl->bbar; - - /* peripheral MEM_1 region */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_LOW); - low = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH); - high = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE); - sizelow = be32_to_cpu(readl(target)); - - start = (high << 32) | low; - limit = sizelow; - - calgary_reserve_mem_region(dev, start, limit); -} - -static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev) -{ - void __iomem *target; - u32 val32; - u64 low, high, sizelow, sizehigh; - u64 start, limit; - struct iommu_table *tbl = pci_iommu(dev->bus); - unsigned char busnum = dev->bus->number; - void __iomem *bbar = tbl->bbar; - - /* is it enabled? */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); - val32 = be32_to_cpu(readl(target)); - if (!(val32 & PHB_MEM2_ENABLE)) - return; - - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW); - low = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH); - high = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW); - sizelow = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH); - sizehigh = be32_to_cpu(readl(target)); - - start = (high << 32) | low; - limit = (sizehigh << 32) | sizelow; - - calgary_reserve_mem_region(dev, start, limit); -} - -/* - * some regions of the IO address space do not get translated, so we - * must not give devices IO addresses in those regions. The regions - * are the 640KB-1MB region and the two PCI peripheral memory holes. - * Reserve all of them in the IOMMU bitmap to avoid giving them out - * later. - */ -static void __init calgary_reserve_regions(struct pci_dev *dev) -{ - unsigned int npages; - u64 start; - struct iommu_table *tbl = pci_iommu(dev->bus); - - /* avoid the BIOS/VGA first 640KB-1MB region */ - /* for CalIOC2 - avoid the entire first MB */ - if (is_calgary(dev->device)) { - start = (640 * 1024); - npages = ((1024 - 640) * 1024) >> PAGE_SHIFT; - } else { /* calioc2 */ - start = 0; - npages = (1 * 1024 * 1024) >> PAGE_SHIFT; - } - iommu_range_reserve(tbl, start, npages); - - /* reserve the two PCI peripheral memory regions in IO space */ - calgary_reserve_peripheral_mem_1(dev); - calgary_reserve_peripheral_mem_2(dev); -} - -static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) -{ - u64 val64; - u64 table_phys; - void __iomem *target; - int ret; - struct iommu_table *tbl; - - /* build TCE tables for each PHB */ - ret = build_tce_table(dev, bbar); - if (ret) - return ret; - - tbl = pci_iommu(dev->bus); - tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; - - if (is_kdump_kernel()) - calgary_init_bitmap_from_tce_table(tbl); - else - tce_free(tbl, 0, tbl->it_size); - - if (is_calgary(dev->device)) - tbl->chip_ops = &calgary_chip_ops; - else if (is_calioc2(dev->device)) - tbl->chip_ops = &calioc2_chip_ops; - else - BUG(); - - calgary_reserve_regions(dev); - - /* set TARs for each PHB */ - target = calgary_reg(bbar, tar_offset(dev->bus->number)); - val64 = be64_to_cpu(readq(target)); - - /* zero out all TAR bits under sw control */ - val64 &= ~TAR_SW_BITS; - table_phys = (u64)__pa(tbl->it_base); - - val64 |= table_phys; - - BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M); - val64 |= (u64) specified_table_size; - - tbl->tar_val = cpu_to_be64(val64); - - writeq(tbl->tar_val, target); - readq(target); /* flush */ - - return 0; -} - -static void __init calgary_free_bus(struct pci_dev *dev) -{ - u64 val64; - struct iommu_table *tbl = pci_iommu(dev->bus); - void __iomem *target; - unsigned int bitmapsz; - - target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number)); - val64 = be64_to_cpu(readq(target)); - val64 &= ~TAR_SW_BITS; - writeq(cpu_to_be64(val64), target); - readq(target); /* flush */ - - bitmapsz = tbl->it_size / BITS_PER_BYTE; - free_pages((unsigned long)tbl->it_map, get_order(bitmapsz)); - tbl->it_map = NULL; - - kfree(tbl); - - set_pci_iommu(dev->bus, NULL); - - /* Can't free bootmem allocated memory after system is up :-( */ - bus_info[dev->bus->number].tce_space = NULL; -} - -static void calgary_dump_error_regs(struct iommu_table *tbl) -{ - void __iomem *bbar = tbl->bbar; - void __iomem *target; - u32 csr, plssr; - - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); - csr = be32_to_cpu(readl(target)); - - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); - plssr = be32_to_cpu(readl(target)); - - /* If no error, the agent ID in the CSR is not valid */ - pr_emerg("DMA error on Calgary PHB 0x%x, 0x%08x@CSR 0x%08x@PLSSR\n", - tbl->it_busno, csr, plssr); -} - -static void calioc2_dump_error_regs(struct iommu_table *tbl) -{ - void __iomem *bbar = tbl->bbar; - u32 csr, csmr, plssr, mck, rcstat; - void __iomem *target; - unsigned long phboff = phb_offset(tbl->it_busno); - unsigned long erroff; - u32 errregs[7]; - int i; - - /* dump CSR */ - target = calgary_reg(bbar, phboff | PHB_CSR_OFFSET); - csr = be32_to_cpu(readl(target)); - /* dump PLSSR */ - target = calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET); - plssr = be32_to_cpu(readl(target)); - /* dump CSMR */ - target = calgary_reg(bbar, phboff | 0x290); - csmr = be32_to_cpu(readl(target)); - /* dump mck */ - target = calgary_reg(bbar, phboff | 0x800); - mck = be32_to_cpu(readl(target)); - - pr_emerg("DMA error on CalIOC2 PHB 0x%x\n", tbl->it_busno); - - pr_emerg("0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", - csr, plssr, csmr, mck); - - /* dump rest of error regs */ - pr_emerg(""); - for (i = 0; i < ARRAY_SIZE(errregs); i++) { - /* err regs are at 0x810 - 0x870 */ - erroff = (0x810 + (i * 0x10)); - target = calgary_reg(bbar, phboff | erroff); - errregs[i] = be32_to_cpu(readl(target)); - pr_cont("0x%08x@0x%lx ", errregs[i], erroff); - } - pr_cont("\n"); - - /* root complex status */ - target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS); - rcstat = be32_to_cpu(readl(target)); - printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat, - PHB_ROOT_COMPLEX_STATUS); -} - -static void calgary_watchdog(struct timer_list *t) -{ - struct iommu_table *tbl = from_timer(tbl, t, watchdog_timer); - void __iomem *bbar = tbl->bbar; - u32 val32; - void __iomem *target; - - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); - val32 = be32_to_cpu(readl(target)); - - /* If no error, the agent ID in the CSR is not valid */ - if (val32 & CSR_AGENT_MASK) { - tbl->chip_ops->dump_error_regs(tbl); - - /* reset error */ - writel(0, target); - - /* Disable bus that caused the error */ - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | - PHB_CONFIG_RW_OFFSET); - val32 = be32_to_cpu(readl(target)); - val32 |= PHB_SLOT_DISABLE; - writel(cpu_to_be32(val32), target); - readl(target); /* flush */ - } else { - /* Reset the timer */ - mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ); - } -} - -static void __init calgary_set_split_completion_timeout(void __iomem *bbar, - unsigned char busnum, unsigned long timeout) -{ - u64 val64; - void __iomem *target; - unsigned int phb_shift = ~0; /* silence gcc */ - u64 mask; - - switch (busno_to_phbid(busnum)) { - case 0: phb_shift = (63 - 19); - break; - case 1: phb_shift = (63 - 23); - break; - case 2: phb_shift = (63 - 27); - break; - case 3: phb_shift = (63 - 35); - break; - default: - BUG_ON(busno_to_phbid(busnum)); - } - - target = calgary_reg(bbar, CALGARY_CONFIG_REG); - val64 = be64_to_cpu(readq(target)); - - /* zero out this PHB's timer bits */ - mask = ~(0xFUL << phb_shift); - val64 &= mask; - val64 |= (timeout << phb_shift); - writeq(cpu_to_be64(val64), target); - readq(target); /* flush */ -} - -static void __init calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) -{ - unsigned char busnum = dev->bus->number; - void __iomem *bbar = tbl->bbar; - void __iomem *target; - u32 val; - - /* - * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1 - */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2); - val = cpu_to_be32(readl(target)); - val |= 0x00800000; - writel(cpu_to_be32(val), target); -} - -static void __init calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) -{ - unsigned char busnum = dev->bus->number; - - /* - * Give split completion a longer timeout on bus 1 for aic94xx - * http://bugzilla.kernel.org/show_bug.cgi?id=7180 - */ - if (is_calgary(dev->device) && (busnum == 1)) - calgary_set_split_completion_timeout(tbl->bbar, busnum, - CCR_2SEC_TIMEOUT); -} - -static void __init calgary_enable_translation(struct pci_dev *dev) -{ - u32 val32; - unsigned char busnum; - void __iomem *target; - void __iomem *bbar; - struct iommu_table *tbl; - - busnum = dev->bus->number; - tbl = pci_iommu(dev->bus); - bbar = tbl->bbar; - - /* enable TCE in PHB Config Register */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); - val32 = be32_to_cpu(readl(target)); - val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE; - - printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n", - (dev->device == PCI_DEVICE_ID_IBM_CALGARY) ? - "Calgary" : "CalIOC2", busnum); - printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this " - "bus.\n"); - - writel(cpu_to_be32(val32), target); - readl(target); /* flush */ - - timer_setup(&tbl->watchdog_timer, calgary_watchdog, 0); - mod_timer(&tbl->watchdog_timer, jiffies); -} - -static void __init calgary_disable_translation(struct pci_dev *dev) -{ - u32 val32; - unsigned char busnum; - void __iomem *target; - void __iomem *bbar; - struct iommu_table *tbl; - - busnum = dev->bus->number; - tbl = pci_iommu(dev->bus); - bbar = tbl->bbar; - - /* disable TCE in PHB Config Register */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); - val32 = be32_to_cpu(readl(target)); - val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE); - - printk(KERN_INFO "Calgary: disabling translation on PHB %#x!\n", busnum); - writel(cpu_to_be32(val32), target); - readl(target); /* flush */ - - del_timer_sync(&tbl->watchdog_timer); -} - -static void __init calgary_init_one_nontraslated(struct pci_dev *dev) -{ - pci_dev_get(dev); - set_pci_iommu(dev->bus, NULL); - - /* is the device behind a bridge? */ - if (dev->bus->parent) - dev->bus->parent->self = dev; - else - dev->bus->self = dev; -} - -static int __init calgary_init_one(struct pci_dev *dev) -{ - void __iomem *bbar; - struct iommu_table *tbl; - int ret; - - bbar = busno_to_bbar(dev->bus->number); - ret = calgary_setup_tar(dev, bbar); - if (ret) - goto done; - - pci_dev_get(dev); - - if (dev->bus->parent) { - if (dev->bus->parent->self) - printk(KERN_WARNING "Calgary: IEEEE, dev %p has " - "bus->parent->self!\n", dev); - dev->bus->parent->self = dev; - } else - dev->bus->self = dev; - - tbl = pci_iommu(dev->bus); - tbl->chip_ops->handle_quirks(tbl, dev); - - calgary_enable_translation(dev); - - return 0; - -done: - return ret; -} - -static int __init calgary_locate_bbars(void) -{ - int ret; - int rioidx, phb, bus; - void __iomem *bbar; - void __iomem *target; - unsigned long offset; - u8 start_bus, end_bus; - u32 val; - - ret = -ENODATA; - for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) { - struct rio_detail *rio = rio_devs[rioidx]; - - if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY)) - continue; - - /* map entire 1MB of Calgary config space */ - bbar = ioremap_nocache(rio->BBAR, 1024 * 1024); - if (!bbar) - goto error; - - for (phb = 0; phb < PHBS_PER_CALGARY; phb++) { - offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET; - target = calgary_reg(bbar, offset); - - val = be32_to_cpu(readl(target)); - - start_bus = (u8)((val & 0x00FF0000) >> 16); - end_bus = (u8)((val & 0x0000FF00) >> 8); - - if (end_bus) { - for (bus = start_bus; bus <= end_bus; bus++) { - bus_info[bus].bbar = bbar; - bus_info[bus].phbid = phb; - } - } else { - bus_info[start_bus].bbar = bbar; - bus_info[start_bus].phbid = phb; - } - } - } - - return 0; - -error: - /* scan bus_info and iounmap any bbars we previously ioremap'd */ - for (bus = 0; bus < ARRAY_SIZE(bus_info); bus++) - if (bus_info[bus].bbar) - iounmap(bus_info[bus].bbar); - - return ret; -} - -static int __init calgary_init(void) -{ - int ret; - struct pci_dev *dev = NULL; - struct calgary_bus_info *info; - - ret = calgary_locate_bbars(); - if (ret) - return ret; - - /* Purely for kdump kernel case */ - if (is_kdump_kernel()) - get_tce_space_from_tar(); - - do { - dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); - if (!dev) - break; - if (!is_cal_pci_dev(dev->device)) - continue; - - info = &bus_info[dev->bus->number]; - if (info->translation_disabled) { - calgary_init_one_nontraslated(dev); - continue; - } - - if (!info->tce_space && !translate_empty_slots) - continue; - - ret = calgary_init_one(dev); - if (ret) - goto error; - } while (1); - - dev = NULL; - for_each_pci_dev(dev) { - struct iommu_table *tbl; - - tbl = find_iommu_table(&dev->dev); - - if (translation_enabled(tbl)) - dev->dev.dma_ops = &calgary_dma_ops; - } - - return ret; - -error: - do { - dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); - if (!dev) - break; - if (!is_cal_pci_dev(dev->device)) - continue; - - info = &bus_info[dev->bus->number]; - if (info->translation_disabled) { - pci_dev_put(dev); - continue; - } - if (!info->tce_space && !translate_empty_slots) - continue; - - calgary_disable_translation(dev); - calgary_free_bus(dev); - pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ - dev->dev.dma_ops = NULL; - } while (1); - - return ret; -} - -static inline int __init determine_tce_table_size(void) -{ - int ret; - - if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED) - return specified_table_size; - - if (is_kdump_kernel() && saved_max_pfn) { - /* - * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to - * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each - * larger table size has twice as many entries, so shift the - * max ram address by 13 to divide by 8K and then look at the - * order of the result to choose between 0-7. - */ - ret = get_order((saved_max_pfn * PAGE_SIZE) >> 13); - if (ret > TCE_TABLE_SIZE_8M) - ret = TCE_TABLE_SIZE_8M; - } else { - /* - * Use 8M by default (suggested by Muli) if it's not - * kdump kernel and saved_max_pfn isn't set. - */ - ret = TCE_TABLE_SIZE_8M; - } - - return ret; -} - -static int __init build_detail_arrays(void) -{ - unsigned long ptr; - unsigned numnodes, i; - int scal_detail_size, rio_detail_size; - - numnodes = rio_table_hdr->num_scal_dev; - if (numnodes > MAX_NUMNODES){ - printk(KERN_WARNING - "Calgary: MAX_NUMNODES too low! Defined as %d, " - "but system has %d nodes.\n", - MAX_NUMNODES, numnodes); - return -ENODEV; - } - - switch (rio_table_hdr->version){ - case 2: - scal_detail_size = 11; - rio_detail_size = 13; - break; - case 3: - scal_detail_size = 12; - rio_detail_size = 15; - break; - default: - printk(KERN_WARNING - "Calgary: Invalid Rio Grande Table Version: %d\n", - rio_table_hdr->version); - return -EPROTO; - } - - ptr = ((unsigned long)rio_table_hdr) + 3; - for (i = 0; i < numnodes; i++, ptr += scal_detail_size) - scal_devs[i] = (struct scal_detail *)ptr; - - for (i = 0; i < rio_table_hdr->num_rio_dev; - i++, ptr += rio_detail_size) - rio_devs[i] = (struct rio_detail *)ptr; - - return 0; -} - -static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev) -{ - int dev; - u32 val; - - if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) { - /* - * FIXME: properly scan for devices across the - * PCI-to-PCI bridge on every CalIOC2 port. - */ - return 1; - } - - for (dev = 1; dev < 8; dev++) { - val = read_pci_config(bus, dev, 0, 0); - if (val != 0xffffffff) - break; - } - return (val != 0xffffffff); -} - -/* - * calgary_init_bitmap_from_tce_table(): - * Function for kdump case. In the second/kdump kernel initialize - * the bitmap based on the tce table entries obtained from first kernel - */ -static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) -{ - u64 *tp; - unsigned int index; - tp = ((u64 *)tbl->it_base); - for (index = 0 ; index < tbl->it_size; index++) { - if (*tp != 0x0) - set_bit(index, tbl->it_map); - tp++; - } -} - -/* - * get_tce_space_from_tar(): - * Function for kdump case. Get the tce tables from first kernel - * by reading the contents of the base address register of calgary iommu - */ -static void __init get_tce_space_from_tar(void) -{ - int bus; - void __iomem *target; - unsigned long tce_space; - - for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { - struct calgary_bus_info *info = &bus_info[bus]; - unsigned short pci_device; - u32 val; - - val = read_pci_config(bus, 0, 0, 0); - pci_device = (val & 0xFFFF0000) >> 16; - - if (!is_cal_pci_dev(pci_device)) - continue; - if (info->translation_disabled) - continue; - - if (calgary_bus_has_devices(bus, pci_device) || - translate_empty_slots) { - target = calgary_reg(bus_info[bus].bbar, - tar_offset(bus)); - tce_space = be64_to_cpu(readq(target)); - tce_space = tce_space & TAR_SW_BITS; - - tce_space = tce_space & (~specified_table_size); - info->tce_space = (u64 *)__va(tce_space); - } - } - return; -} - -static int __init calgary_iommu_init(void) -{ - int ret; - - /* ok, we're trying to use Calgary - let's roll */ - printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); - - ret = calgary_init(); - if (ret) { - printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " - "falling back to no_iommu\n", ret); - return ret; - } - - return 0; -} - -int __init detect_calgary(void) -{ - int bus; - void *tbl; - int calgary_found = 0; - unsigned long ptr; - unsigned int offset, prev_offset; - int ret; - - /* - * if the user specified iommu=off or iommu=soft or we found - * another HW IOMMU already, bail out. - */ - if (no_iommu || iommu_detected) - return -ENODEV; - - if (!use_calgary) - return -ENODEV; - - if (!early_pci_allowed()) - return -ENODEV; - - printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n"); - - ptr = (unsigned long)phys_to_virt(get_bios_ebda()); - - rio_table_hdr = NULL; - prev_offset = 0; - offset = 0x180; - /* - * The next offset is stored in the 1st word. - * Only parse up until the offset increases: - */ - while (offset > prev_offset) { - /* The block id is stored in the 2nd word */ - if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){ - /* set the pointer past the offset & block id */ - rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4); - break; - } - prev_offset = offset; - offset = *((unsigned short *)(ptr + offset)); - } - if (!rio_table_hdr) { - printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table " - "in EBDA - bailing!\n"); - return -ENODEV; - } - - ret = build_detail_arrays(); - if (ret) { - printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret); - return -ENOMEM; - } - - specified_table_size = determine_tce_table_size(); - - for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { - struct calgary_bus_info *info = &bus_info[bus]; - unsigned short pci_device; - u32 val; - - val = read_pci_config(bus, 0, 0, 0); - pci_device = (val & 0xFFFF0000) >> 16; - - if (!is_cal_pci_dev(pci_device)) - continue; - - if (info->translation_disabled) - continue; - - if (calgary_bus_has_devices(bus, pci_device) || - translate_empty_slots) { - /* - * If it is kdump kernel, find and use tce tables - * from first kernel, else allocate tce tables here - */ - if (!is_kdump_kernel()) { - tbl = alloc_tce_table(); - if (!tbl) - goto cleanup; - info->tce_space = tbl; - } - calgary_found = 1; - } - } - - printk(KERN_DEBUG "Calgary: finished detection, Calgary %s\n", - calgary_found ? "found" : "not found"); - - if (calgary_found) { - iommu_detected = 1; - calgary_detected = 1; - printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); - printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", - specified_table_size); - - x86_init.iommu.iommu_init = calgary_iommu_init; - } - return calgary_found; - -cleanup: - for (--bus; bus >= 0; --bus) { - struct calgary_bus_info *info = &bus_info[bus]; - - if (info->tce_space) - free_tce_table(info->tce_space); - } - return -ENOMEM; -} - -static int __init calgary_parse_options(char *p) -{ - unsigned int bridge; - unsigned long val; - size_t len; - ssize_t ret; - - while (*p) { - if (!strncmp(p, "64k", 3)) - specified_table_size = TCE_TABLE_SIZE_64K; - else if (!strncmp(p, "128k", 4)) - specified_table_size = TCE_TABLE_SIZE_128K; - else if (!strncmp(p, "256k", 4)) - specified_table_size = TCE_TABLE_SIZE_256K; - else if (!strncmp(p, "512k", 4)) - specified_table_size = TCE_TABLE_SIZE_512K; - else if (!strncmp(p, "1M", 2)) - specified_table_size = TCE_TABLE_SIZE_1M; - else if (!strncmp(p, "2M", 2)) - specified_table_size = TCE_TABLE_SIZE_2M; - else if (!strncmp(p, "4M", 2)) - specified_table_size = TCE_TABLE_SIZE_4M; - else if (!strncmp(p, "8M", 2)) - specified_table_size = TCE_TABLE_SIZE_8M; - - len = strlen("translate_empty_slots"); - if (!strncmp(p, "translate_empty_slots", len)) - translate_empty_slots = 1; - - len = strlen("disable"); - if (!strncmp(p, "disable", len)) { - p += len; - if (*p == '=') - ++p; - if (*p == '\0') - break; - ret = kstrtoul(p, 0, &val); - if (ret) - break; - - bridge = val; - if (bridge < MAX_PHB_BUS_NUM) { - printk(KERN_INFO "Calgary: disabling " - "translation for PHB %#x\n", bridge); - bus_info[bridge].translation_disabled = 1; - } - } - - p = strpbrk(p, ","); - if (!p) - break; - - p++; /* skip ',' */ - } - return 1; -} -__setup("calgary=", calgary_parse_options); - -static void __init calgary_fixup_one_tce_space(struct pci_dev *dev) -{ - struct iommu_table *tbl; - unsigned int npages; - int i; - - tbl = pci_iommu(dev->bus); - - for (i = 0; i < 4; i++) { - struct resource *r = &dev->resource[PCI_BRIDGE_RESOURCES + i]; - - /* Don't give out TCEs that map MEM resources */ - if (!(r->flags & IORESOURCE_MEM)) - continue; - - /* 0-based? we reserve the whole 1st MB anyway */ - if (!r->start) - continue; - - /* cover the whole region */ - npages = resource_size(r) >> PAGE_SHIFT; - npages++; - - iommu_range_reserve(tbl, r->start, npages); - } -} - -static int __init calgary_fixup_tce_spaces(void) -{ - struct pci_dev *dev = NULL; - struct calgary_bus_info *info; - - if (no_iommu || swiotlb || !calgary_detected) - return -ENODEV; - - printk(KERN_DEBUG "Calgary: fixing up tce spaces\n"); - - do { - dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); - if (!dev) - break; - if (!is_cal_pci_dev(dev->device)) - continue; - - info = &bus_info[dev->bus->number]; - if (info->translation_disabled) - continue; - - if (!info->tce_space) - continue; - - calgary_fixup_one_tce_space(dev); - - } while (1); - - return 0; -} - -/* - * We need to be call after pcibios_assign_resources (fs_initcall level) - * and before device_initcall. - */ -rootfs_initcall(calgary_fixup_tce_spaces); - -IOMMU_INIT_POST(detect_calgary); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index fa4352dce491..57de2ebff7e2 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -12,7 +12,6 @@ #include <asm/dma.h> #include <asm/iommu.h> #include <asm/gart.h> -#include <asm/calgary.h> #include <asm/x86_init.h> #include <asm/iommu_table.h> @@ -112,11 +111,6 @@ static __init int iommu_setup(char *p) gart_parse_options(p); -#ifdef CONFIG_CALGARY_IOMMU - if (!strncmp(p, "calgary", 7)) - use_calgary = 1; -#endif /* CONFIG_CALGARY_IOMMU */ - p += strcspn(p, ","); if (*p == ',') ++p; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 5e94c4354d4e..bd2a11ca5dd6 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -41,6 +41,7 @@ #include <asm/desc.h> #include <asm/prctl.h> #include <asm/spec-ctrl.h> +#include <asm/io_bitmap.h> #include <asm/proto.h> #include "process.h" @@ -72,18 +73,9 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { #ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, #endif + .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, }, -#ifdef CONFIG_X86_32 - /* - * Note that the .io_bitmap member must be extra-big. This is because - * the CPU will access an additional byte beyond the end of the IO - * permission bitmap. The extra byte must be all 1 bits, and must - * be within the limit. - */ - .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, -#endif }; EXPORT_PER_CPU_SYMBOL(cpu_tss_rw); @@ -110,28 +102,89 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) void exit_thread(struct task_struct *tsk) { struct thread_struct *t = &tsk->thread; - unsigned long *bp = t->io_bitmap_ptr; struct fpu *fpu = &t->fpu; - if (bp) { - struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu()); - - t->io_bitmap_ptr = NULL; - clear_thread_flag(TIF_IO_BITMAP); - /* - * Careful, clear this in the TSS too: - */ - memset(tss->io_bitmap, 0xff, t->io_bitmap_max); - t->io_bitmap_max = 0; - put_cpu(); - kfree(bp); - } + if (test_thread_flag(TIF_IO_BITMAP)) + io_bitmap_exit(); free_vm86(t); fpu__drop(fpu); } +static int set_new_tls(struct task_struct *p, unsigned long tls) +{ + struct user_desc __user *utls = (struct user_desc __user *)tls; + + if (in_ia32_syscall()) + return do_set_thread_area(p, -1, utls, 0); + else + return do_set_thread_area_64(p, ARCH_SET_FS, tls); +} + +int copy_thread_tls(unsigned long clone_flags, unsigned long sp, + unsigned long arg, struct task_struct *p, unsigned long tls) +{ + struct inactive_task_frame *frame; + struct fork_frame *fork_frame; + struct pt_regs *childregs; + int ret = 0; + + childregs = task_pt_regs(p); + fork_frame = container_of(childregs, struct fork_frame, regs); + frame = &fork_frame->frame; + + frame->bp = 0; + frame->ret_addr = (unsigned long) ret_from_fork; + p->thread.sp = (unsigned long) fork_frame; + p->thread.io_bitmap = NULL; + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); + +#ifdef CONFIG_X86_64 + savesegment(gs, p->thread.gsindex); + p->thread.gsbase = p->thread.gsindex ? 0 : current->thread.gsbase; + savesegment(fs, p->thread.fsindex); + p->thread.fsbase = p->thread.fsindex ? 0 : current->thread.fsbase; + savesegment(es, p->thread.es); + savesegment(ds, p->thread.ds); +#else + p->thread.sp0 = (unsigned long) (childregs + 1); + /* + * Clear all status flags including IF and set fixed bit. 64bit + * does not have this initialization as the frame does not contain + * flags. The flags consistency (especially vs. AC) is there + * ensured via objtool, which lacks 32bit support. + */ + frame->flags = X86_EFLAGS_FIXED; +#endif + + /* Kernel thread ? */ + if (unlikely(p->flags & PF_KTHREAD)) { + memset(childregs, 0, sizeof(struct pt_regs)); + kthread_frame_init(frame, sp, arg); + return 0; + } + + frame->bx = 0; + *childregs = *current_pt_regs(); + childregs->ax = 0; + if (sp) + childregs->sp = sp; + +#ifdef CONFIG_X86_32 + task_user_gs(p) = get_user_gs(current_pt_regs()); +#endif + + /* Set a new TLS for the child thread? */ + if (clone_flags & CLONE_SETTLS) + ret = set_new_tls(p, tls); + + if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP))) + io_bitmap_share(p); + + return ret; +} + void flush_thread(void) { struct task_struct *tsk = current; @@ -269,31 +322,96 @@ void arch_setup_new_exec(void) } } -static inline void switch_to_bitmap(struct thread_struct *prev, - struct thread_struct *next, - unsigned long tifp, unsigned long tifn) +#ifdef CONFIG_X86_IOPL_IOPERM +static inline void tss_invalidate_io_bitmap(struct tss_struct *tss) { - struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); + /* + * Invalidate the I/O bitmap by moving io_bitmap_base outside the + * TSS limit so any subsequent I/O access from user space will + * trigger a #GP. + * + * This is correct even when VMEXIT rewrites the TSS limit + * to 0x67 as the only requirement is that the base points + * outside the limit. + */ + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID; +} - if (tifn & _TIF_IO_BITMAP) { - /* - * Copy the relevant range of the IO bitmap. - * Normally this is 128 bytes or less: - */ - memcpy(tss->io_bitmap, next->io_bitmap_ptr, - max(prev->io_bitmap_max, next->io_bitmap_max)); +static inline void switch_to_bitmap(unsigned long tifp) +{ + /* + * Invalidate I/O bitmap if the previous task used it. This prevents + * any possible leakage of an active I/O bitmap. + * + * If the next task has an I/O bitmap it will handle it on exit to + * user mode. + */ + if (tifp & _TIF_IO_BITMAP) + tss_invalidate_io_bitmap(this_cpu_ptr(&cpu_tss_rw)); +} + +static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm) +{ + /* + * Copy at least the byte range of the incoming tasks bitmap which + * covers the permitted I/O ports. + * + * If the previous task which used an I/O bitmap had more bits + * permitted, then the copy needs to cover those as well so they + * get turned off. + */ + memcpy(tss->io_bitmap.bitmap, iobm->bitmap, + max(tss->io_bitmap.prev_max, iobm->max)); + + /* + * Store the new max and the sequence number of this bitmap + * and a pointer to the bitmap itself. + */ + tss->io_bitmap.prev_max = iobm->max; + tss->io_bitmap.prev_sequence = iobm->sequence; +} + +/** + * tss_update_io_bitmap - Update I/O bitmap before exiting to usermode + */ +void tss_update_io_bitmap(void) +{ + struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); + u16 *base = &tss->x86_tss.io_bitmap_base; + + if (test_thread_flag(TIF_IO_BITMAP)) { + struct thread_struct *t = ¤t->thread; + + if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) { + *base = IO_BITMAP_OFFSET_VALID_ALL; + } else { + struct io_bitmap *iobm = t->io_bitmap; + /* + * Only copy bitmap data when the sequence number + * differs. The update time is accounted to the + * incoming task. + */ + if (tss->io_bitmap.prev_sequence != iobm->sequence) + tss_copy_io_bitmap(tss, iobm); + + /* Enable the bitmap */ + *base = IO_BITMAP_OFFSET_VALID_MAP; + } /* - * Make sure that the TSS limit is correct for the CPU - * to notice the IO bitmap. + * Make sure that the TSS limit is covering the io bitmap. + * It might have been cut down by a VMEXIT to 0x67 which + * would cause a subsequent I/O access from user space to + * trigger a #GP because tbe bitmap is outside the TSS + * limit. */ refresh_tss_limit(); - } else if (tifp & _TIF_IO_BITMAP) { - /* - * Clear any possible leftover bits: - */ - memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); + } else { + tss_invalidate_io_bitmap(tss); } } +#else /* CONFIG_X86_IOPL_IOPERM */ +static inline void switch_to_bitmap(unsigned long tifp) { } +#endif #ifdef CONFIG_SMP @@ -505,7 +623,8 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) tifn = READ_ONCE(task_thread_info(next_p)->flags); tifp = READ_ONCE(task_thread_info(prev_p)->flags); - switch_to_bitmap(prev, next, tifp, tifn); + + switch_to_bitmap(tifp); propagate_user_return_notify(prev_p, next_p); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index b8ceec4974fe..323499f48858 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -112,74 +112,6 @@ void release_thread(struct task_struct *dead_task) release_vm86_irqs(dead_task); } -int copy_thread_tls(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p, unsigned long tls) -{ - struct pt_regs *childregs = task_pt_regs(p); - struct fork_frame *fork_frame = container_of(childregs, struct fork_frame, regs); - struct inactive_task_frame *frame = &fork_frame->frame; - struct task_struct *tsk; - int err; - - /* - * For a new task use the RESET flags value since there is no before. - * All the status flags are zero; DF and all the system flags must also - * be 0, specifically IF must be 0 because we context switch to the new - * task with interrupts disabled. - */ - frame->flags = X86_EFLAGS_FIXED; - frame->bp = 0; - frame->ret_addr = (unsigned long) ret_from_fork; - p->thread.sp = (unsigned long) fork_frame; - p->thread.sp0 = (unsigned long) (childregs+1); - memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); - - if (unlikely(p->flags & PF_KTHREAD)) { - /* kernel thread */ - memset(childregs, 0, sizeof(struct pt_regs)); - frame->bx = sp; /* function */ - frame->di = arg; - p->thread.io_bitmap_ptr = NULL; - return 0; - } - frame->bx = 0; - *childregs = *current_pt_regs(); - childregs->ax = 0; - if (sp) - childregs->sp = sp; - - task_user_gs(p) = get_user_gs(current_pt_regs()); - - p->thread.io_bitmap_ptr = NULL; - tsk = current; - err = -ENOMEM; - - if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { - p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, - IO_BITMAP_BYTES, GFP_KERNEL); - if (!p->thread.io_bitmap_ptr) { - p->thread.io_bitmap_max = 0; - return -ENOMEM; - } - set_tsk_thread_flag(p, TIF_IO_BITMAP); - } - - err = 0; - - /* - * Set a new TLS for the child thread? - */ - if (clone_flags & CLONE_SETTLS) - err = do_set_thread_area(p, -1, - (struct user_desc __user *)tls, 0); - - if (err && p->thread.io_bitmap_ptr) { - kfree(p->thread.io_bitmap_ptr); - p->thread.io_bitmap_max = 0; - } - return err; -} - void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { @@ -255,15 +187,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ load_TLS(next, cpu); - /* - * Restore IOPL if needed. In normal use, the flags restore - * in the switch assembly will handle this. But if the kernel - * is running virtualized at a non-zero CPL, the popf will - * not restore flags, so it must be done in a separate step. - */ - if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl)) - set_iopl_mask(next->iopl); - switch_to_extra(prev_p, next_p); /* diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index af64519b2695..506d66830d4d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -371,81 +371,6 @@ void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase) task->thread.gsbase = gsbase; } -int copy_thread_tls(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p, unsigned long tls) -{ - int err; - struct pt_regs *childregs; - struct fork_frame *fork_frame; - struct inactive_task_frame *frame; - struct task_struct *me = current; - - childregs = task_pt_regs(p); - fork_frame = container_of(childregs, struct fork_frame, regs); - frame = &fork_frame->frame; - - frame->bp = 0; - frame->ret_addr = (unsigned long) ret_from_fork; - p->thread.sp = (unsigned long) fork_frame; - p->thread.io_bitmap_ptr = NULL; - - savesegment(gs, p->thread.gsindex); - p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase; - savesegment(fs, p->thread.fsindex); - p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase; - savesegment(es, p->thread.es); - savesegment(ds, p->thread.ds); - memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); - - if (unlikely(p->flags & PF_KTHREAD)) { - /* kernel thread */ - memset(childregs, 0, sizeof(struct pt_regs)); - frame->bx = sp; /* function */ - frame->r12 = arg; - return 0; - } - frame->bx = 0; - *childregs = *current_pt_regs(); - - childregs->ax = 0; - if (sp) - childregs->sp = sp; - - err = -ENOMEM; - if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { - p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr, - IO_BITMAP_BYTES, GFP_KERNEL); - if (!p->thread.io_bitmap_ptr) { - p->thread.io_bitmap_max = 0; - return -ENOMEM; - } - set_tsk_thread_flag(p, TIF_IO_BITMAP); - } - - /* - * Set a new TLS for the child thread? - */ - if (clone_flags & CLONE_SETTLS) { -#ifdef CONFIG_IA32_EMULATION - if (in_ia32_syscall()) - err = do_set_thread_area(p, -1, - (struct user_desc __user *)tls, 0); - else -#endif - err = do_arch_prctl_64(p, ARCH_SET_FS, tls); - if (err) - goto out; - } - err = 0; -out: - if (err && p->thread.io_bitmap_ptr) { - kfree(p->thread.io_bitmap_ptr); - p->thread.io_bitmap_max = 0; - } - - return err; -} - static void start_thread_common(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp, @@ -572,17 +497,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) switch_to_extra(prev_p, next_p); -#ifdef CONFIG_XEN_PV - /* - * On Xen PV, IOPL bits in pt_regs->flags have no effect, and - * current_pt_regs()->flags may not match the current task's - * intended IOPL. We need to switch it manually. - */ - if (unlikely(static_cpu_has(X86_FEATURE_XENPV) && - prev->iopl != next->iopl)) - xen_set_iopl_mask(next->iopl); -#endif - if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) { /* * AMD CPUs have a misfeature: SYSRET sets the SS selector but diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 3c5bbe8e4120..066e5b01a7e0 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -42,6 +42,7 @@ #include <asm/traps.h> #include <asm/syscall.h> #include <asm/fsgsbase.h> +#include <asm/io_bitmap.h> #include "tls.h" @@ -697,7 +698,9 @@ static int ptrace_set_debugreg(struct task_struct *tsk, int n, static int ioperm_active(struct task_struct *target, const struct user_regset *regset) { - return target->thread.io_bitmap_max / regset->size; + struct io_bitmap *iobm = target->thread.io_bitmap; + + return iobm ? DIV_ROUND_UP(iobm->max, regset->size) : 0; } static int ioperm_get(struct task_struct *target, @@ -705,12 +708,13 @@ static int ioperm_get(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - if (!target->thread.io_bitmap_ptr) + struct io_bitmap *iobm = target->thread.io_bitmap; + + if (!iobm) return -ENXIO; return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - target->thread.io_bitmap_ptr, - 0, IO_BITMAP_BYTES); + iobm->bitmap, 0, IO_BITMAP_BYTES); } /* diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index ee26df08002e..94b33885f8d2 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -35,8 +35,7 @@ #define CP_PA_BACKUP_PAGES_MAP DATA(0x1c) .text - .globl relocate_kernel -relocate_kernel: +SYM_CODE_START_NOALIGN(relocate_kernel) /* Save the CPU context, used for jumping back */ pushl %ebx @@ -93,8 +92,9 @@ relocate_kernel: addl $(identity_mapped - relocate_kernel), %eax pushl %eax ret +SYM_CODE_END(relocate_kernel) -identity_mapped: +SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) /* set return address to 0 if not preserving context */ pushl $0 /* store the start address on the stack */ @@ -191,8 +191,9 @@ identity_mapped: addl $(virtual_mapped - relocate_kernel), %eax pushl %eax ret +SYM_CODE_END(identity_mapped) -virtual_mapped: +SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) movl CR4(%edi), %eax movl %eax, %cr4 movl CR3(%edi), %eax @@ -208,9 +209,10 @@ virtual_mapped: popl %esi popl %ebx ret +SYM_CODE_END(virtual_mapped) /* Do the copies */ -swap_pages: +SYM_CODE_START_LOCAL_NOALIGN(swap_pages) movl 8(%esp), %edx movl 4(%esp), %ecx pushl %ebp @@ -270,6 +272,7 @@ swap_pages: popl %ebx popl %ebp ret +SYM_CODE_END(swap_pages) .globl kexec_control_code_size .set kexec_control_code_size, . - relocate_kernel diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index c51ccff5cd01..ef3ba99068d3 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -38,8 +38,7 @@ .text .align PAGE_SIZE .code64 - .globl relocate_kernel -relocate_kernel: +SYM_CODE_START_NOALIGN(relocate_kernel) /* * %rdi indirection_page * %rsi page_list @@ -103,8 +102,9 @@ relocate_kernel: addq $(identity_mapped - relocate_kernel), %r8 pushq %r8 ret +SYM_CODE_END(relocate_kernel) -identity_mapped: +SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) /* set return address to 0 if not preserving context */ pushq $0 /* store the start address on the stack */ @@ -209,8 +209,9 @@ identity_mapped: movq $virtual_mapped, %rax pushq %rax ret +SYM_CODE_END(identity_mapped) -virtual_mapped: +SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) movq RSP(%r8), %rsp movq CR4(%r8), %rax movq %rax, %cr4 @@ -228,9 +229,10 @@ virtual_mapped: popq %rbp popq %rbx ret +SYM_CODE_END(virtual_mapped) /* Do the copies */ -swap_pages: +SYM_CODE_START_LOCAL_NOALIGN(swap_pages) movq %rdi, %rcx /* Put the page_list in %rcx */ xorl %edi, %edi xorl %esi, %esi @@ -283,6 +285,7 @@ swap_pages: jmp 0b 3: ret +SYM_CODE_END(swap_pages) .globl kexec_control_code_size .set kexec_control_code_size, . - relocate_kernel diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 77ea96b794bd..cedfe2077a69 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -143,6 +143,13 @@ struct boot_params boot_params; /* * Machine setup.. */ +static struct resource rodata_resource = { + .name = "Kernel rodata", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM +}; + static struct resource data_resource = { .name = "Kernel data", .start = 0, @@ -438,6 +445,12 @@ static void __init memblock_x86_reserve_range_setup_data(void) while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); memblock_reserve(pa_data, sizeof(*data) + data->len); + + if (data->type == SETUP_INDIRECT && + ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) + memblock_reserve(((struct setup_indirect *)data->data)->addr, + ((struct setup_indirect *)data->data)->len); + pa_data = data->next; early_memunmap(data, sizeof(*data)); } @@ -459,7 +472,7 @@ static void __init memblock_x86_reserve_range_setup_data(void) * due to mapping restrictions. * * On 64bit, kdump kernel need be restricted to be under 64TB, which is - * the upper limit of system RAM in 4-level paing mode. Since the kdump + * the upper limit of system RAM in 4-level paging mode. Since the kdump * jumping could be from 5-level to 4-level, the jumping will fail if * kernel is put above 64TB, and there's no way to detect the paging mode * of the kernel which will be loaded for dumping during the 1st kernel @@ -743,8 +756,8 @@ static void __init trim_bios_range(void) e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); /* - * special case: Some BIOSen report the PC BIOS - * area (640->1Mb) as ram even though it is not. + * special case: Some BIOSes report the PC BIOS + * area (640Kb -> 1Mb) as RAM even though it is not. * take them out. */ e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM, 1); @@ -951,7 +964,9 @@ void __init setup_arch(char **cmdline_p) code_resource.start = __pa_symbol(_text); code_resource.end = __pa_symbol(_etext)-1; - data_resource.start = __pa_symbol(_etext); + rodata_resource.start = __pa_symbol(__start_rodata); + rodata_resource.end = __pa_symbol(__end_rodata)-1; + data_resource.start = __pa_symbol(_sdata); data_resource.end = __pa_symbol(_edata)-1; bss_resource.start = __pa_symbol(__bss_start); bss_resource.end = __pa_symbol(__bss_stop)-1; @@ -1040,6 +1055,7 @@ void __init setup_arch(char **cmdline_p) /* after parse_early_param, so could debug it */ insert_resource(&iomem_resource, &code_resource); + insert_resource(&iomem_resource, &rodata_resource); insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); @@ -1122,17 +1138,15 @@ void __init setup_arch(char **cmdline_p) reserve_bios_regions(); - if (efi_enabled(EFI_MEMMAP)) { - efi_fake_memmap(); - efi_find_mirror(); - efi_esrt_init(); + efi_fake_memmap(); + efi_find_mirror(); + efi_esrt_init(); - /* - * The EFI specification says that boot service code won't be - * called after ExitBootServices(). This is, in fact, a lie. - */ - efi_reserve_boot_services(); - } + /* + * The EFI specification says that boot service code won't be + * called after ExitBootServices(). This is, in fact, a lie. + */ + efi_reserve_boot_services(); /* preallocate 4k for mptable mpc */ e820__memblock_alloc_reserved_mpc_new(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 86663874ef04..e6d7894ad127 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -207,8 +207,8 @@ void __init setup_per_cpu_areas(void) pcpu_cpu_distance, pcpu_fc_alloc, pcpu_fc_free); if (rc < 0) - pr_warning("%s allocator failed (%d), falling back to page size\n", - pcpu_fc_names[pcpu_chosen_fc], rc); + pr_warn("%s allocator failed (%d), falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); } if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index a49fe1dcb47e..4c61f0713832 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -57,7 +57,7 @@ void __init tboot_probe(void) */ if (!e820__mapped_any(boot_params.tboot_addr, boot_params.tboot_addr, E820_TYPE_RESERVED)) { - pr_warning("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n"); + pr_warn("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n"); return; } @@ -65,13 +65,12 @@ void __init tboot_probe(void) set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { - pr_warning("tboot at 0x%llx is invalid\n", - boot_params.tboot_addr); + pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr); tboot = NULL; return; } if (tboot->version < 5) { - pr_warning("tboot version is invalid: %u\n", tboot->version); + pr_warn("tboot version is invalid: %u\n", tboot->version); tboot = NULL; return; } @@ -289,7 +288,7 @@ static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) if (sleep_state >= ACPI_S_STATE_COUNT || acpi_shutdown_map[sleep_state] == -1) { - pr_warning("unsupported sleep state 0x%x\n", sleep_state); + pr_warn("unsupported sleep state 0x%x\n", sleep_state); return -1; } @@ -302,7 +301,7 @@ static int tboot_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b) if (!tboot_enabled()) return 0; - pr_warning("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)"); + pr_warn("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)"); return -ENODEV; } @@ -320,7 +319,7 @@ static int tboot_wait_for_aps(int num_aps) } if (timeout) - pr_warning("tboot wait for APs timeout\n"); + pr_warn("tboot wait for APs timeout\n"); return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps); } @@ -516,7 +515,7 @@ int tboot_force_iommu(void) return 1; if (no_iommu || swiotlb || dmar_disabled) - pr_warning("Forcing Intel-IOMMU to enabled\n"); + pr_warn("Forcing Intel-IOMMU to enabled\n"); dmar_disabled = 0; #ifdef CONFIG_SWIOTLB diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c deleted file mode 100644 index 6384be751eff..000000000000 --- a/arch/x86/kernel/tce_64.c +++ /dev/null @@ -1,177 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * This file manages the translation entries for the IBM Calgary IOMMU. - * - * Derived from arch/powerpc/platforms/pseries/iommu.c - * - * Copyright (C) IBM Corporation, 2006 - * - * Author: Jon Mason <jdmason@us.ibm.com> - * Author: Muli Ben-Yehuda <muli@il.ibm.com> - */ - -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/mm.h> -#include <linux/spinlock.h> -#include <linux/string.h> -#include <linux/pci.h> -#include <linux/dma-mapping.h> -#include <linux/memblock.h> -#include <asm/tce.h> -#include <asm/calgary.h> -#include <asm/proto.h> -#include <asm/cacheflush.h> - -/* flush a tce at 'tceaddr' to main memory */ -static inline void flush_tce(void* tceaddr) -{ - /* a single tce can't cross a cache line */ - if (boot_cpu_has(X86_FEATURE_CLFLUSH)) - clflush(tceaddr); - else - wbinvd(); -} - -void tce_build(struct iommu_table *tbl, unsigned long index, - unsigned int npages, unsigned long uaddr, int direction) -{ - u64* tp; - u64 t; - u64 rpn; - - t = (1 << TCE_READ_SHIFT); - if (direction != DMA_TO_DEVICE) - t |= (1 << TCE_WRITE_SHIFT); - - tp = ((u64*)tbl->it_base) + index; - - while (npages--) { - rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT; - t &= ~TCE_RPN_MASK; - t |= (rpn << TCE_RPN_SHIFT); - - *tp = cpu_to_be64(t); - flush_tce(tp); - - uaddr += PAGE_SIZE; - tp++; - } -} - -void tce_free(struct iommu_table *tbl, long index, unsigned int npages) -{ - u64* tp; - - tp = ((u64*)tbl->it_base) + index; - - while (npages--) { - *tp = cpu_to_be64(0); - flush_tce(tp); - tp++; - } -} - -static inline unsigned int table_size_to_number_of_entries(unsigned char size) -{ - /* - * size is the order of the table, 0-7 - * smallest table is 8K entries, so shift result by 13 to - * multiply by 8K - */ - return (1 << size) << 13; -} - -static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) -{ - unsigned int bitmapsz; - unsigned long bmppages; - int ret; - - tbl->it_busno = dev->bus->number; - - /* set the tce table size - measured in entries */ - tbl->it_size = table_size_to_number_of_entries(specified_table_size); - - /* - * number of bytes needed for the bitmap size in number of - * entries; we need one bit per entry - */ - bitmapsz = tbl->it_size / BITS_PER_BYTE; - bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz)); - if (!bmppages) { - printk(KERN_ERR "Calgary: cannot allocate bitmap\n"); - ret = -ENOMEM; - goto done; - } - - tbl->it_map = (unsigned long*)bmppages; - - memset(tbl->it_map, 0, bitmapsz); - - tbl->it_hint = 0; - - spin_lock_init(&tbl->it_lock); - - return 0; - -done: - return ret; -} - -int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar) -{ - struct iommu_table *tbl; - int ret; - - if (pci_iommu(dev->bus)) { - printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n", - dev, pci_iommu(dev->bus)); - BUG(); - } - - tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL); - if (!tbl) { - printk(KERN_ERR "Calgary: error allocating iommu_table\n"); - ret = -ENOMEM; - goto done; - } - - ret = tce_table_setparms(dev, tbl); - if (ret) - goto free_tbl; - - tbl->bbar = bbar; - - set_pci_iommu(dev->bus, tbl); - - return 0; - -free_tbl: - kfree(tbl); -done: - return ret; -} - -void * __init alloc_tce_table(void) -{ - unsigned int size; - - size = table_size_to_number_of_entries(specified_table_size); - size *= TCE_ENTRY_SIZE; - - return memblock_alloc_low(size, size); -} - -void __init free_tce_table(void *tbl) -{ - unsigned int size; - - if (!tbl) - return; - - size = table_size_to_number_of_entries(specified_table_size); - size *= TCE_ENTRY_SIZE; - - memblock_free(__pa(tbl), size); -} diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4bb0f8447112..c90312146da0 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -37,11 +37,6 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/io.h> - -#if defined(CONFIG_EDAC) -#include <linux/edac.h> -#endif - #include <asm/stacktrace.h> #include <asm/processor.h> #include <asm/debugreg.h> diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index ec534f978867..b8acf639abd1 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -364,12 +364,12 @@ retry: /* Force it to 0 if random warps brought us here */ atomic_set(&test_runs, 0); - pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n", + pr_warn("TSC synchronization [CPU#%d -> CPU#%d]:\n", smp_processor_id(), cpu); - pr_warning("Measured %Ld cycles TSC warp between CPUs, " - "turning off TSC clock.\n", max_warp); + pr_warn("Measured %Ld cycles TSC warp between CPUs, " + "turning off TSC clock.\n", max_warp); if (random_warps) - pr_warning("TSC warped randomly between CPUs\n"); + pr_warn("TSC warped randomly between CPUs\n"); mark_tsc_unstable("check_tsc_sync_source failed"); } diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 548fefed71ee..4d732a444711 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -1,6 +1,6 @@ /* - * umip.c Emulation for instruction protected by the Intel User-Mode - * Instruction Prevention feature + * umip.c Emulation for instruction protected by the User-Mode Instruction + * Prevention feature * * Copyright (c) 2017, Intel Corporation. * Ricardo Neri <ricardo.neri-calderon@linux.intel.com> @@ -18,10 +18,10 @@ /** DOC: Emulation for User-Mode Instruction Prevention (UMIP) * - * The feature User-Mode Instruction Prevention present in recent Intel - * processor prevents a group of instructions (SGDT, SIDT, SLDT, SMSW and STR) - * from being executed with CPL > 0. Otherwise, a general protection fault is - * issued. + * User-Mode Instruction Prevention is a security feature present in recent + * x86 processors that, when enabled, prevents a group of instructions (SGDT, + * SIDT, SLDT, SMSW and STR) from being run in user mode by issuing a general + * protection fault if the instruction is executed with CPL > 0. * * Rather than relaying to the user space the general protection fault caused by * the UMIP-protected instructions (in the form of a SIGSEGV signal), it can be @@ -91,7 +91,7 @@ const char * const umip_insns[5] = { #define umip_pr_err(regs, fmt, ...) \ umip_printk(regs, KERN_ERR, fmt, ##__VA_ARGS__) -#define umip_pr_warning(regs, fmt, ...) \ +#define umip_pr_warn(regs, fmt, ...) \ umip_printk(regs, KERN_WARNING, fmt, ##__VA_ARGS__) /** @@ -380,14 +380,14 @@ bool fixup_umip_exception(struct pt_regs *regs) if (umip_inst < 0) return false; - umip_pr_warning(regs, "%s instruction cannot be used by applications.\n", + umip_pr_warn(regs, "%s instruction cannot be used by applications.\n", umip_insns[umip_inst]); /* Do not emulate (spoof) SLDT or STR. */ if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT) return false; - umip_pr_warning(regs, "For now, expensive software emulation returns the result.\n"); + umip_pr_warn(regs, "For now, expensive software emulation returns the result.\n"); if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size, user_64bit_mode(regs))) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 8cd745ef8c7b..15e5aad8ac2c 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -842,8 +842,8 @@ static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) /** * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. + * @auprobe: the probepoint information. * @mm: the probed address space. - * @arch_uprobe: the probepoint information. * @addr: virtual address at which to install the probepoint * Return 0 on success or a -ve number on error. */ diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index a024c4f7ba56..641f0fe1e5b4 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -31,7 +31,7 @@ #include <asm/cpufeatures.h> #include <asm/msr-index.h> -ENTRY(verify_cpu) +SYM_FUNC_START_LOCAL(verify_cpu) pushf # Save caller passed flags push $0 # Kill any dangerous flags popf @@ -137,4 +137,4 @@ ENTRY(verify_cpu) popf # Restore caller passed flags xorl %eax, %eax ret -ENDPROC(verify_cpu) +SYM_FUNC_END(verify_cpu) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index e2feacf921a0..3a1a819da137 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -21,6 +21,9 @@ #define LOAD_OFFSET __START_KERNEL_map #endif +#define EMITS_PT_NOTE +#define RO_EXCEPTION_TABLE_ALIGN 16 + #include <asm-generic/vmlinux.lds.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> @@ -141,17 +144,12 @@ SECTIONS *(.text.__x86.indirect_thunk) __indirect_thunk_end = .; #endif + } :text =0xcccc - /* End of text section */ - _etext = .; - } :text = 0x9090 - - NOTES :text :note - - EXCEPTION_TABLE(16) :text = 0x9090 - - /* .text should occupy whole number of pages */ + /* End of text section, which should occupy whole number of pages */ + _etext = .; . = ALIGN(PAGE_SIZE); + X86_ALIGN_RODATA_BEGIN RO_DATA(PAGE_SIZE) X86_ALIGN_RODATA_END diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 18a799c8fa28..ce89430a7f80 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -31,6 +31,28 @@ static int __init iommu_init_noop(void) { return 0; } static void iommu_shutdown_noop(void) { } bool __init bool_x86_init_noop(void) { return false; } void x86_op_int_noop(int cpu) { } +static __init int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; } +static __init void get_rtc_noop(struct timespec64 *now) { } + +static __initconst const struct of_device_id of_cmos_match[] = { + { .compatible = "motorola,mc146818" }, + {} +}; + +/* + * Allow devicetree configured systems to disable the RTC by setting the + * corresponding DT node's status property to disabled. Code is optimized + * out for CONFIG_OF=n builds. + */ +static __init void x86_wallclock_init(void) +{ + struct device_node *node = of_find_matching_node(NULL, of_cmos_match); + + if (node && !of_device_is_available(node)) { + x86_platform.get_wallclock = get_rtc_noop; + x86_platform.set_wallclock = set_rtc_noop; + } +} /* * The platform setup functions are preset with the default functions @@ -73,7 +95,7 @@ struct x86_init_ops x86_init __initdata = { .timers = { .setup_percpu_clockev = setup_boot_APIC_clock, .timer_init = hpet_time_init, - .wallclock_init = x86_init_noop, + .wallclock_init = x86_wallclock_init, }, .iommu = { |