Diffstat (limited to 'arch/x86')
58 files changed, 951 insertions, 566 deletions
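One user-visible addition in the diff below lends itself to a quick illustration: the arch/x86/include/asm/mman.h hunk adds MAP_HUGE_2MB and MAP_HUGE_1GB, which encode log2 of the requested huge page size into the high mmap() flag bits (21 for 2 MB, 30 for 1 GB). A minimal userspace sketch — not part of the patch — assuming a kernel with hugetlb support and some 2 MB pages reserved; the fallback defines mirror the generic MAP_HUGE_SHIFT value of 26:

	/* Illustrative only: exercises the MAP_HUGE_2MB encoding added
	 * in the asm/mman.h hunk below.  Fails cleanly on kernels with
	 * no reserved 2 MB huge pages. */
	#include <stdio.h>
	#include <sys/mman.h>

	#ifndef MAP_HUGETLB
	#define MAP_HUGETLB	0x40000
	#endif
	#ifndef MAP_HUGE_SHIFT
	#define MAP_HUGE_SHIFT	26
	#endif
	#ifndef MAP_HUGE_2MB
	#define MAP_HUGE_2MB	(21 << MAP_HUGE_SHIFT)	/* log2(2 MB) == 21 */
	#endif

	int main(void)
	{
		size_t len = 2UL * 1024 * 1024;	/* one 2 MB huge page */
		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB,
			       -1, 0);

		if (p == MAP_FAILED) {
			perror("mmap");	/* e.g. no huge pages reserved */
			return 1;
		}
		munmap(p, len);
		return 0;
	}

The same encoding works for 1 GB pages via MAP_HUGE_1GB, matching the second define the patch introduces.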
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b1494bd92b65..6c304438b503 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -106,7 +106,7 @@ config X86 select KTIME_SCALAR if X86_32 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER - select HAVE_RCU_USER_QS if X86_64 + select HAVE_CONTEXT_TRACKING if X86_64 select HAVE_IRQ_TIME_ACCOUNTING select GENERIC_KERNEL_THREAD select GENERIC_KERNEL_EXECVE diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index c760e073963e..e87b0cac14b5 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -12,6 +12,8 @@ #include <asm/setup.h> #include <asm/desc.h> +#undef memcpy /* Use memcpy from misc.c */ + #include "eboot.h" static efi_system_table_t *sys_table; diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 2a017441b8b2..8c132a625b94 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -476,6 +476,3 @@ die: setup_corrupt: .byte 7 .string "No setup signature found...\n" - - .data -dummy: .long 0 diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 66e5f0ef0523..79fd8a3418f9 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -12,6 +12,7 @@ header-y += mce.h header-y += msr-index.h header-y += msr.h header-y += mtrr.h +header-y += perf_regs.h header-y += posix_types_32.h header-y += posix_types_64.h header-y += posix_types_x32.h @@ -19,8 +20,10 @@ header-y += prctl.h header-y += processor-flags.h header-y += ptrace-abi.h header-y += sigcontext32.h +header-y += svm.h header-y += ucontext.h header-y += vm86.h +header-y += vmx.h header-y += vsyscall.h genhdr-y += unistd_32.h diff --git a/arch/x86/include/asm/rcu.h b/arch/x86/include/asm/context_tracking.h index d1ac07a23979..1616562683e9 100644 --- a/arch/x86/include/asm/rcu.h +++ b/arch/x86/include/asm/context_tracking.h @@ -1,27 +1,26 @@ -#ifndef _ASM_X86_RCU_H -#define _ASM_X86_RCU_H +#ifndef _ASM_X86_CONTEXT_TRACKING_H +#define _ASM_X86_CONTEXT_TRACKING_H #ifndef __ASSEMBLY__ - -#include <linux/rcupdate.h> +#include <linux/context_tracking.h> #include <asm/ptrace.h> static inline void exception_enter(struct pt_regs *regs) { - rcu_user_exit(); + user_exit(); } static inline void exception_exit(struct pt_regs *regs) { -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_CONTEXT_TRACKING if (user_mode(regs)) - rcu_user_enter(); + user_enter(); #endif } #else /* __ASSEMBLY__ */ -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_CONTEXT_TRACKING # define SCHEDULE_USER call schedule_user #else # define SCHEDULE_USER call schedule diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index 93e1c55f14ab..03dd72957d2f 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -2,9 +2,6 @@ #define _ASM_X86_DEVICE_H struct dev_archdata { -#ifdef CONFIG_ACPI - void *acpi_handle; -#endif #ifdef CONFIG_X86_DEV_DMA_OPS struct dma_map_ops *dma_ops; #endif diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index c9dcc181d4d1..6e8fdf5ad113 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -35,7 +35,7 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); #define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ efi_call_virt(f, a1, a2, a3, a4, a5, a6) -#define efi_ioremap(addr, size, type) ioremap_cache(addr, size) +#define efi_ioremap(addr, size, type, attr) ioremap_cache(addr, size) #else /* !CONFIG_X86_32 */ @@ -89,7 +89,7 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3, 
(u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, - u32 type); + u32 type, u64 attribute); #endif /* CONFIG_X86_32 */ @@ -98,6 +98,8 @@ extern void efi_set_executable(efi_memory_desc_t *md, bool executable); extern int efi_memblock_x86_reserve_range(void); extern void efi_call_phys_prelog(void); extern void efi_call_phys_epilog(void); +extern void efi_unmap_memmap(void); +extern void efi_memory_uc(u64 addr, unsigned long size); #ifndef CONFIG_EFI /* diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 5939f44fe0c0..9c999c1674fa 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -354,12 +354,10 @@ static inline int mmap_is_ia32(void) return 0; } -/* The first two values are special, do not change. See align_addr() */ +/* Do not change the values. See get_align_mask() */ enum align_flags { ALIGN_VA_32 = BIT(0), ALIGN_VA_64 = BIT(1), - ALIGN_VDSO = BIT(2), - ALIGN_TOPDOWN = BIT(3), }; struct va_alignment { @@ -368,5 +366,5 @@ struct va_alignment { } ____cacheline_aligned; extern struct va_alignment va_align; -extern unsigned long align_addr(unsigned long, struct file *, enum align_flags); +extern unsigned long align_vdso_addr(unsigned long); #endif /* _ASM_X86_ELF_H */ diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 831dbb9c6c02..41ab26ea6564 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -399,14 +399,17 @@ static inline void drop_init_fpu(struct task_struct *tsk) typedef struct { int preload; } fpu_switch_t; /* - * FIXME! We could do a totally lazy restore, but we need to - * add a per-cpu "this was the task that last touched the FPU - * on this CPU" variable, and the task needs to have a "I last - * touched the FPU on this CPU" and check them. + * Must be run with preemption disabled: this clears the fpu_owner_task, + * on this CPU. * - * We don't do that yet, so "fpu_lazy_restore()" always returns - * false, but some day.. + * This will disable any lazy FPU state restore of the current FPU state, + * but if the current thread owns the FPU, it will still be saved by. */ +static inline void __cpu_disable_lazy_restore(unsigned int cpu) +{ + per_cpu(fpu_owner_task, cpu) = NULL; +} + static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) { return new == this_cpu_read_stable(fpu_owner_task) && diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h index 593e51d4643f..513b05f15bb4 100644 --- a/arch/x86/include/asm/mman.h +++ b/arch/x86/include/asm/mman.h @@ -3,6 +3,9 @@ #define MAP_32BIT 0x40 /* only give out 32bit addresses */ +#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) +#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) + #include <asm-generic/mman.h> #endif /* _ASM_X86_MMAN_H */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index dcfde52979c3..54d80fddb739 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -205,21 +205,14 @@ static inline bool user_64bit_mode(struct pt_regs *regs) } #endif -/* - * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode - * when it traps. The previous stack will be directly underneath the saved - * registers, and 'sp/ss' won't even have been saved. Thus the '&regs->sp'. - * - * This is valid only for kernel mode traps. 
- */ -static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) -{ #ifdef CONFIG_X86_32 - return (unsigned long)(&regs->sp); #else +extern unsigned long kernel_stack_pointer(struct pt_regs *regs); #else +static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) +{ return regs->sp; -#endif } +#endif #define GET_IP(regs) ((regs)->ip) #define GET_FP(regs) ((regs)->bp) @@ -246,6 +239,15 @@ static inline unsigned long regs_get_register(struct pt_regs *regs, { if (unlikely(offset > MAX_REG_OFFSET)) return 0; +#ifdef CONFIG_X86_32 + /* + * Traps from the kernel do not save sp and ss. + * Use the helper function to retrieve sp. + */ + if (offset == offsetof(struct pt_regs, sp) && + regs->cs == __KERNEL_CS) + return kernel_stack_pointer(regs); +#endif return *(unsigned long *)((unsigned long)regs + offset); } diff --git a/arch/x86/include/asm/trace_clock.h b/arch/x86/include/asm/trace_clock.h new file mode 100644 index 000000000000..beab86cc282d --- /dev/null +++ b/arch/x86/include/asm/trace_clock.h @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_TRACE_CLOCK_H +#define _ASM_X86_TRACE_CLOCK_H + +#include <linux/compiler.h> +#include <linux/types.h> + +#ifdef CONFIG_X86_TSC + +extern u64 notrace trace_clock_x86_tsc(void); + +# define ARCH_TRACE_CLOCKS \ + { trace_clock_x86_tsc, "x86-tsc", .in_ns = 0 }, + +#else /* !CONFIG_X86_TSC */ + +#define ARCH_TRACE_CLOCKS + +#endif + +#endif /* _ASM_X86_TRACE_CLOCK_H */ diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 59c226d120cd..c20d1ce62dc6 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -359,18 +359,14 @@ HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val, return _hypercall4(int, update_va_mapping, va, new_val.pte, new_val.pte >> 32, flags); } +extern int __must_check xen_event_channel_op_compat(int, void *); static inline int HYPERVISOR_event_channel_op(int cmd, void *arg) { int rc = _hypercall2(int, event_channel_op, cmd, arg); - if (unlikely(rc == -ENOSYS)) { - struct evtchn_op op; - op.cmd = cmd; - memcpy(&op.u, arg, sizeof(op.u)); - rc = _hypercall1(int, event_channel_op_compat, &op); - memcpy(arg, &op.u, sizeof(op.u)); - } + if (unlikely(rc == -ENOSYS)) + rc = xen_event_channel_op_compat(cmd, arg); return rc; } @@ -386,17 +382,14 @@ HYPERVISOR_console_io(int cmd, int count, char *str) return _hypercall3(int, console_io, cmd, count, str); } +extern int __must_check HYPERVISOR_physdev_op_compat(int, void *); + static inline int HYPERVISOR_physdev_op(int cmd, void *arg) { int rc = _hypercall2(int, physdev_op, cmd, arg); - if (unlikely(rc == -ENOSYS)) { - struct physdev_op op; - op.cmd = cmd; - memcpy(&op.u, arg, sizeof(op.u)); - rc = _hypercall1(int, physdev_op_compat, &op); - memcpy(arg, &op.u, sizeof(op.u)); - } + if (unlikely(rc == -ENOSYS)) + rc = HYPERVISOR_physdev_op_compat(cmd, arg); return rc; } diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 66d0fff1ee84..125f344f06a9 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -33,7 +33,6 @@ #ifndef _ASM_X86_XEN_HYPERVISOR_H #define _ASM_X86_XEN_HYPERVISOR_H -/* arch/i386/kernel/setup.c */ extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 91ce48f05f9f..34e923a53762 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -9,7 +9,6 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) 
ifdef CONFIG_FUNCTION_TRACER # Do not profile debug and lowlevel utilities CFLAGS_REMOVE_tsc.o = -pg -CFLAGS_REMOVE_rtc.o = -pg CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_pvclock.o = -pg CFLAGS_REMOVE_kvmclock.o = -pg @@ -62,6 +61,7 @@ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o +obj-$(CONFIG_X86_TSC) += trace_clock.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index e651f7a589ac..e48cafcf92ae 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -574,6 +574,12 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) return irq; } +EXPORT_SYMBOL_GPL(acpi_register_gsi); + +void acpi_unregister_gsi(u32 gsi) +{ +} +EXPORT_SYMBOL_GPL(acpi_unregister_gsi); void __init acpi_set_irq_model_pic(void) { diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 11676cf65aee..d5e0d717005a 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -101,6 +101,8 @@ static int __init acpi_sleep_setup(char *str) #endif if (strncmp(str, "nonvs", 5) == 0) acpi_nvs_nosave(); + if (strncmp(str, "nonvs_s3", 8) == 0) + acpi_nvs_nosave_s3(); if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); str = strchr(str, ','); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c265593ec2cd..1817fa911024 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2257,6 +2257,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) continue; cfg = irq_cfg(irq); + if (!cfg) + continue; + raw_spin_lock(&desc->lock); /* diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f7e98a2c0d12..1b7d1656a042 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -631,6 +631,20 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } } + /* + * The way access filter has a performance penalty on some workloads. + * Disable it on the affected CPUs. + */ + if ((c->x86 == 0x15) && + (c->x86_model >= 0x02) && (c->x86_model < 0x20)) { + u64 val; + + if (!rdmsrl_safe(0xc0011021, &val) && !(val & 0x1E)) { + val |= 0x1E; + wrmsrl_safe(0xc0011021, val); + } + } + cpu_detect_cache_sizes(c); /* Multi core CPU? */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 698b6ec12e0f..1ac581f38dfa 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -6,7 +6,7 @@ * * Written by Jacob Shin - AMD, Inc. 
* - * Support: borislav.petkov@amd.com + * Maintained by: Borislav Petkov <bp@alien8.de> * * April 2006 * - added support for AMD Family 0x10 processors diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 5f88abf07e9c..4f9a3cbfc4a3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -285,34 +285,39 @@ void cmci_clear(void) raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } +static long cmci_rediscover_work_func(void *arg) +{ + int banks; + + /* Recheck banks in case CPUs don't all have the same */ + if (cmci_supported(&banks)) + cmci_discover(banks); + + return 0; +} + /* * After a CPU went down cycle through all the others and rediscover * Must run in process context. */ void cmci_rediscover(int dying) { - int banks; - int cpu; - cpumask_var_t old; + int cpu, banks; if (!cmci_supported(&banks)) return; - if (!alloc_cpumask_var(&old, GFP_KERNEL)) - return; - cpumask_copy(old, &current->cpus_allowed); for_each_online_cpu(cpu) { if (cpu == dying) continue; - if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) + + if (cpu == smp_processor_id()) { + cmci_rediscover_work_func(NULL); continue; - /* Recheck banks in case CPUs don't all have the same */ - if (cmci_supported(&banks)) - cmci_discover(banks); - } + } - set_cpus_allowed_ptr(current, old); - free_cpumask_var(old); + work_on_cpu(cpu, cmci_rediscover_work_func, NULL); + } } /* diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 3373f84d1397..4428fd178bce 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -208,12 +208,14 @@ static bool check_hw_exists(void) } /* - * Now write a value and read it back to see if it matches, - * this is needed to detect certain hardware emulators (qemu/kvm) - * that don't trap on the MSR access and always return 0s. + * Read the current value, change it and read it back to see if it + * matches, this is needed to detect certain hardware emulators + * (qemu/kvm) that don't trap on the MSR access and always return 0s. */ - val = 0xabcdUL; reg = x86_pmu_event_addr(0); + if (rdmsrl_safe(reg, &val)) + goto msr_fail; + val ^= 0xffffUL; ret = wrmsrl_safe(reg, val); ret |= rdmsrl_safe(reg, &val_new); if (ret || val != val_new) @@ -1314,6 +1316,121 @@ static struct attribute_group x86_pmu_format_group = { .attrs = NULL, }; +struct perf_pmu_events_attr { + struct device_attribute attr; + u64 id; +}; + +/* + * Remove all undefined events (x86_pmu.event_map(id) == 0) + * out of events_attr attributes. + */ +static void __init filter_events(struct attribute **attrs) +{ + int i, j; + + for (i = 0; attrs[i]; i++) { + if (x86_pmu.event_map(i)) + continue; + + for (j = i; attrs[j]; j++) + attrs[j] = attrs[j + 1]; + + /* Check the shifted attr. 
*/ + i--; + } +} + +static ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_attr *pmu_attr = \ + container_of(attr, struct perf_pmu_events_attr, attr); + + u64 config = x86_pmu.event_map(pmu_attr->id); + return x86_pmu.events_sysfs_show(page, config); +} + +#define EVENT_VAR(_id) event_attr_##_id +#define EVENT_PTR(_id) &event_attr_##_id.attr.attr + +#define EVENT_ATTR(_name, _id) \ +static struct perf_pmu_events_attr EVENT_VAR(_id) = { \ + .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ + .id = PERF_COUNT_HW_##_id, \ +}; + +EVENT_ATTR(cpu-cycles, CPU_CYCLES ); +EVENT_ATTR(instructions, INSTRUCTIONS ); +EVENT_ATTR(cache-references, CACHE_REFERENCES ); +EVENT_ATTR(cache-misses, CACHE_MISSES ); +EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS ); +EVENT_ATTR(branch-misses, BRANCH_MISSES ); +EVENT_ATTR(bus-cycles, BUS_CYCLES ); +EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND ); +EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND ); +EVENT_ATTR(ref-cycles, REF_CPU_CYCLES ); + +static struct attribute *empty_attrs; + +static struct attribute *events_attr[] = { + EVENT_PTR(CPU_CYCLES), + EVENT_PTR(INSTRUCTIONS), + EVENT_PTR(CACHE_REFERENCES), + EVENT_PTR(CACHE_MISSES), + EVENT_PTR(BRANCH_INSTRUCTIONS), + EVENT_PTR(BRANCH_MISSES), + EVENT_PTR(BUS_CYCLES), + EVENT_PTR(STALLED_CYCLES_FRONTEND), + EVENT_PTR(STALLED_CYCLES_BACKEND), + EVENT_PTR(REF_CPU_CYCLES), + NULL, +}; + +static struct attribute_group x86_pmu_events_group = { + .name = "events", + .attrs = events_attr, +}; + +ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) +{ + u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; + u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; + bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE); + bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL); + bool any = (config & ARCH_PERFMON_EVENTSEL_ANY); + bool inv = (config & ARCH_PERFMON_EVENTSEL_INV); + ssize_t ret; + + /* + * We have whole page size to spend and just little data + * to write, so we can safely use sprintf. + */ + ret = sprintf(page, "event=0x%02llx", event); + + if (umask) + ret += sprintf(page + ret, ",umask=0x%02llx", umask); + + if (edge) + ret += sprintf(page + ret, ",edge"); + + if (pc) + ret += sprintf(page + ret, ",pc"); + + if (any) + ret += sprintf(page + ret, ",any"); + + if (inv) + ret += sprintf(page + ret, ",inv"); + + if (cmask) + ret += sprintf(page + ret, ",cmask=0x%02llx", cmask); + + ret += sprintf(page + ret, "\n"); + + return ret; +} + static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -1360,6 +1477,11 @@ static int __init init_hw_perf_events(void) x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ x86_pmu_format_group.attrs = x86_pmu.format_attrs; + if (!x86_pmu.events_sysfs_show) + x86_pmu_events_group.attrs = &empty_attrs; + else + filter_events(x86_pmu_events_group.attrs); + pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); pr_info("... 
generic registers: %d\n", x86_pmu.num_counters); @@ -1649,6 +1771,7 @@ static struct attribute_group x86_pmu_attr_group = { static const struct attribute_group *x86_pmu_attr_groups[] = { &x86_pmu_attr_group, &x86_pmu_format_group, + &x86_pmu_events_group, NULL, }; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 271d25700297..115c1ea97746 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -354,6 +354,8 @@ struct x86_pmu { int attr_rdpmc; struct attribute **format_attrs; + ssize_t (*events_sysfs_show)(char *page, u64 config); + /* * CPU Hotplug hooks */ @@ -536,6 +538,9 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) regs->ip = ip; } +ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); +ssize_t intel_event_sysfs_show(char *page, u64 config); + #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 4528ae7b6ec4..c93bc4e813a0 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -568,6 +568,14 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev } } +static ssize_t amd_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) | + (config & AMD64_EVENTSEL_EVENT) >> 24; + + return x86_event_sysfs_show(page, config, event); +} + static __initconst const struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = x86_pmu_handle_irq, @@ -591,6 +599,7 @@ static __initconst const struct x86_pmu amd_pmu = { .put_event_constraints = amd_put_event_constraints, .format_attrs = amd_format_attr, + .events_sysfs_show = amd_event_sysfs_show, .cpu_prepare = amd_pmu_cpu_prepare, .cpu_starting = amd_pmu_cpu_starting, diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 324bb523d9d9..93b9e1181f83 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1603,6 +1603,13 @@ static struct attribute *intel_arch_formats_attr[] = { NULL, }; +ssize_t intel_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT); + + return x86_event_sysfs_show(page, config, event); +} + static __initconst const struct x86_pmu core_pmu = { .name = "core", .handle_irq = x86_pmu_handle_irq, @@ -1628,6 +1635,7 @@ static __initconst const struct x86_pmu core_pmu = { .event_constraints = intel_core_event_constraints, .guest_get_msrs = core_guest_get_msrs, .format_attrs = intel_arch_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, }; struct intel_shared_regs *allocate_shared_regs(int cpu) @@ -1766,6 +1774,7 @@ static __initconst const struct x86_pmu intel_pmu = { .pebs_aliases = intel_pebs_aliases_core2, .format_attrs = intel_arch3_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, .cpu_prepare = intel_pmu_cpu_prepare, .cpu_starting = intel_pmu_cpu_starting, diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 5df8d32ba91e..3cf3d97cce3a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -118,22 +118,24 @@ static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box) { struct pci_dev *pdev = box->pci_dev; int box_ctl = uncore_pci_box_ctl(box); - u32 config; + u32 config = 0; - pci_read_config_dword(pdev, box_ctl, &config); - config |= 
SNBEP_PMON_BOX_CTL_FRZ; - pci_write_config_dword(pdev, box_ctl, config); + if (!pci_read_config_dword(pdev, box_ctl, &config)) { + config |= SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); + } } static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box) { struct pci_dev *pdev = box->pci_dev; int box_ctl = uncore_pci_box_ctl(box); - u32 config; + u32 config = 0; - pci_read_config_dword(pdev, box_ctl, &config); - config &= ~SNBEP_PMON_BOX_CTL_FRZ; - pci_write_config_dword(pdev, box_ctl, config); + if (!pci_read_config_dword(pdev, box_ctl, &config)) { + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); + } } static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event) @@ -156,7 +158,7 @@ static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct pe { struct pci_dev *pdev = box->pci_dev; struct hw_perf_event *hwc = &event->hw; - u64 count; + u64 count = 0; pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); @@ -603,11 +605,12 @@ static struct pci_driver snbep_uncore_pci_driver = { /* * build pci bus to socket mapping */ -static void snbep_pci2phy_map_init(void) +static int snbep_pci2phy_map_init(void) { struct pci_dev *ubox_dev = NULL; int i, bus, nodeid; - u32 config; + int err = 0; + u32 config = 0; while (1) { /* find the UBOX device */ @@ -618,10 +621,14 @@ static void snbep_pci2phy_map_init(void) break; bus = ubox_dev->bus->number; /* get the Node ID of the local register */ - pci_read_config_dword(ubox_dev, 0x40, &config); + err = pci_read_config_dword(ubox_dev, 0x40, &config); + if (err) + break; nodeid = config; /* get the Node ID mapping */ - pci_read_config_dword(ubox_dev, 0x54, &config); + err = pci_read_config_dword(ubox_dev, 0x54, &config); + if (err) + break; /* * every three bits in the Node ID mapping register maps * to a particular node. @@ -633,7 +640,11 @@ static void snbep_pci2phy_map_init(void) } } }; - return; + + if (ubox_dev) + pci_dev_put(ubox_dev); + + return err ? 
pcibios_err_to_errno(err) : 0; } /* end of Sandy Bridge-EP uncore support */ @@ -1547,7 +1558,6 @@ void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - int port; /* adjust the main event selector and extra register index */ if (reg1->idx % 2) { @@ -1559,7 +1569,6 @@ void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) } /* adjust extra register config */ - port = reg1->idx / 6 + box->pmu->pmu_idx * 4; switch (reg1->idx % 6) { case 2: /* shift the 8~15 bits to the 0~7 bits */ @@ -2578,9 +2587,11 @@ static int __init uncore_pci_init(void) switch (boot_cpu_data.x86_model) { case 45: /* Sandy Bridge-EP */ + ret = snbep_pci2phy_map_init(); + if (ret) + return ret; pci_uncores = snbep_pci_uncores; uncore_pci_driver = &snbep_uncore_pci_driver; - snbep_pci2phy_map_init(); break; default: return 0; diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c index 7c46bfdbc373..4b7731bf23a8 100644 --- a/arch/x86/kernel/cpu/perf_event_knc.c +++ b/arch/x86/kernel/cpu/perf_event_knc.c @@ -3,6 +3,8 @@ #include <linux/perf_event.h> #include <linux/types.h> +#include <asm/hardirq.h> + #include "perf_event.h" static const u64 knc_perfmon_event_map[] = @@ -173,30 +175,100 @@ static void knc_pmu_enable_all(int added) static inline void knc_pmu_disable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; u64 val; val = hwc->config; - if (cpuc->enabled) - val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); } static void knc_pmu_enable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; u64 val; val = hwc->config; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); } +static inline u64 knc_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void knc_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack); +} + +static int knc_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int handled = 0; + int bit, loops; + u64 status; + + cpuc = &__get_cpu_var(cpu_hw_events); + + knc_pmu_disable_all(); + + status = knc_pmu_get_status(); + if (!status) { + knc_pmu_enable_all(0); + return handled; + } + + loops = 0; +again: + knc_pmu_ack_status(status); + if (++loops > 100) { + WARN_ONCE(1, "perf: irq loop stuck!\n"); + perf_event_print_debug(); + goto done; + } + + inc_irq_stat(apic_perf_irqs); + + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + handled++; + + if (!test_bit(bit, cpuc->active_mask)) + continue; + + if (!intel_pmu_save_and_restart(event)) + continue; + + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + /* + * Repeat if there is more work to be done: + */ + status = knc_pmu_get_status(); + if (status) + goto again; + +done: + knc_pmu_enable_all(0); + + return handled; +} + + PMU_FORMAT_ATTR(event, "config:0-7" ); PMU_FORMAT_ATTR(umask, "config:8-15" ); PMU_FORMAT_ATTR(edge, 
"config:18" ); @@ -214,7 +286,7 @@ static struct attribute *intel_knc_formats_attr[] = { static __initconst struct x86_pmu knc_pmu = { .name = "knc", - .handle_irq = x86_pmu_handle_irq, + .handle_irq = knc_pmu_handle_irq, .disable_all = knc_pmu_disable_all, .enable_all = knc_pmu_enable_all, .enable = knc_pmu_enable_event, @@ -226,12 +298,11 @@ static __initconst struct x86_pmu knc_pmu = { .event_map = knc_pmu_event_map, .max_events = ARRAY_SIZE(knc_perfmon_event_map), .apic = 1, - .max_period = (1ULL << 31) - 1, + .max_period = (1ULL << 39) - 1, .version = 0, .num_counters = 2, - /* in theory 40 bits, early silicon is buggy though */ - .cntval_bits = 32, - .cntval_mask = (1ULL << 32) - 1, + .cntval_bits = 40, + .cntval_mask = (1ULL << 40) - 1, .get_event_constraints = x86_get_event_constraints, .event_constraints = knc_event_constraints, .format_attrs = intel_knc_formats_attr, diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index e4dd0f7a0453..f2af39f5dc3d 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -8,13 +8,106 @@ */ static const u64 p6_perfmon_event_map[] = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, + [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, /* CPU_CLK_UNHALTED */ + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, /* INST_RETIRED */ + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, /* L2_RQSTS:M:E:S:I */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, /* L2_RQSTS:I */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, /* BR_INST_RETIRED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, /* BR_MISS_PRED_RETIRED */ + [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, /* BUS_DRDY_CLOCKS */ + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2, /* RESOURCE_STALLS */ + +}; + +static __initconst u64 p6_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ + [ C(RESULT_MISS) ] = 0x0045, /* DCU_LINES_IN */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0f29, /* L2_LD:M:E:S:I */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ + [ C(RESULT_MISS) ] = 0x0f28, /* L2_IFETCH:M:E:S:I */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0025, /* L2_M_LINES_INM */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ + [ C(RESULT_MISS) ] = 0x0085, /* ITLB_MISS */ + }, + [ C(OP_WRITE) ] = { + [ 
C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED */ + [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISS_PRED_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, }; static u64 p6_pmu_event_map(int hw_event) @@ -34,7 +127,7 @@ static struct event_constraint p6_event_constraints[] = { INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ @@ -64,25 +157,25 @@ static void p6_pmu_enable_all(int added) static inline void p6_pmu_disable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; u64 val = P6_NOP_EVENT; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)wrmsrl_safe(hwc->config_base, val); } static void p6_pmu_enable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; u64 val; val = hwc->config; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL_ENABLE; + + /* + * p6 only has a global event enable, set on PerfEvtSel0 + * We "disable" events by programming P6_NOP_EVENT + * and we rely on p6_pmu_enable_all() being called + * to actually enable the events. + */ (void)wrmsrl_safe(hwc->config_base, val); } @@ -134,6 +227,8 @@ static __initconst const struct x86_pmu p6_pmu = { .event_constraints = p6_event_constraints, .format_attrs = intel_p6_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, + }; __init int p6_pmu_init(void) @@ -158,5 +253,9 @@ __init int p6_pmu_init(void) x86_pmu = p6_pmu; + memcpy(hw_cache_event_ids, p6_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + return 0; } diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index ed858e9e9a74..df06ade26bef 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1077,6 +1077,9 @@ void __init memblock_x86_fill(void) memblock_add(ei->addr, ei->size); } + /* throw away partial pages */ + memblock_trim_memory(PAGE_SIZE); + memblock_dump_all(); } diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b51b2c7ee51f..31b46128a63d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -56,7 +56,7 @@ #include <asm/ftrace.h> #include <asm/percpu.h> #include <asm/asm.h> -#include <asm/rcu.h> +#include <asm/context_tracking.h> #include <asm/smap.h> #include <linux/err.h> @@ -995,8 +995,8 @@ END(interrupt) */ .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: - ASM_CLAC XCPT_FRAME + ASM_CLAC addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ interrupt do_IRQ /* 0(%rsp): old_rsp-ARGOFFSET */ @@ -1135,8 +1135,8 @@ END(common_interrupt) */ .macro apicinterrupt num sym do_sym ENTRY(\sym) - ASM_CLAC INTR_FRAME + ASM_CLAC pushq_cfi $~(\num) .Lcommon_\sym: interrupt \do_sym @@ -1190,8 +1190,8 @@ apicinterrupt IRQ_WORK_VECTOR \ */ .macro zeroentry sym do_sym ENTRY(\sym) - ASM_CLAC INTR_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to 
restart */ subq $ORIG_RAX-R15, %rsp @@ -1208,8 +1208,8 @@ END(\sym) .macro paranoidzeroentry sym do_sym ENTRY(\sym) - ASM_CLAC INTR_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp @@ -1227,8 +1227,8 @@ END(\sym) #define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) .macro paranoidzeroentry_ist sym do_sym ist ENTRY(\sym) - ASM_CLAC INTR_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp @@ -1247,8 +1247,8 @@ END(\sym) .macro errorentry sym do_sym ENTRY(\sym) - ASM_CLAC XCPT_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 @@ -1266,8 +1266,8 @@ END(\sym) /* error code is on the stack already */ .macro paranoiderrorentry sym do_sym ENTRY(\sym) - ASM_CLAC XCPT_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 @@ -1699,9 +1699,10 @@ nested_nmi: 1: /* Set up the interrupted NMIs stack to jump to repeat_nmi */ - leaq -6*8(%rsp), %rdx + leaq -1*8(%rsp), %rdx movq %rdx, %rsp - CFI_ADJUST_CFA_OFFSET 6*8 + CFI_ADJUST_CFA_OFFSET 1*8 + leaq -10*8(%rsp), %rdx pushq_cfi $__KERNEL_DS pushq_cfi %rdx pushfq_cfi @@ -1709,8 +1710,8 @@ nested_nmi: pushq_cfi $repeat_nmi /* Put stack back */ - addq $(11*8), %rsp - CFI_ADJUST_CFA_OFFSET -11*8 + addq $(6*8), %rsp + CFI_ADJUST_CFA_OFFSET -6*8 nested_nmi_out: popq_cfi %rdx @@ -1736,18 +1737,18 @@ first_nmi: * +-------------------------+ * | NMI executing variable | * +-------------------------+ - * | Saved SS | - * | Saved Return RSP | - * | Saved RFLAGS | - * | Saved CS | - * | Saved RIP | - * +-------------------------+ * | copied SS | * | copied Return RSP | * | copied RFLAGS | * | copied CS | * | copied RIP | * +-------------------------+ + * | Saved SS | + * | Saved Return RSP | + * | Saved RFLAGS | + * | Saved CS | + * | Saved RIP | + * +-------------------------+ * | pt_regs | * +-------------------------+ * @@ -1763,9 +1764,14 @@ first_nmi: /* Set the NMI executing variable on the stack. */ pushq_cfi $1 + /* + * Leave room for the "copied" frame + */ + subq $(5*8), %rsp + /* Copy the stack frame to the Saved frame */ .rept 5 - pushq_cfi 6*8(%rsp) + pushq_cfi 11*8(%rsp) .endr CFI_DEF_CFA_OFFSET SS+8-RIP @@ -1786,12 +1792,15 @@ repeat_nmi: * is benign for the non-repeat case, where 1 was pushed just above * to this very stack slot). */ - movq $1, 5*8(%rsp) + movq $1, 10*8(%rsp) /* Make another copy, this one may be modified by nested NMIs */ + addq $(10*8), %rsp + CFI_ADJUST_CFA_OFFSET -10*8 .rept 5 - pushq_cfi 4*8(%rsp) + pushq_cfi -6*8(%rsp) .endr + subq $(5*8), %rsp CFI_DEF_CFA_OFFSET SS+8-RIP end_repeat_nmi: @@ -1842,8 +1851,12 @@ nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: RESTORE_ALL 8 + + /* Pop the extra iret frame */ + addq $(5*8), %rsp + /* Clear the NMI executing stack variable */ - movq $0, 10*8(%rsp) + movq $0, 5*8(%rsp) jmp irq_return CFI_ENDPROC END(nmi) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 957a47aec64e..4dac2f68ed4a 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -292,8 +292,8 @@ default_entry: * be using the global pages. * * NOTE! If we are on a 486 we may have no cr4 at all! - * Specifically, cr4 exists if and only if CPUID exists, - * which in turn exists if and only if EFLAGS.ID exists. + * Specifically, cr4 exists if and only if CPUID exists + * and has flags other than the FPU flag set. 
*/ movl $X86_EFLAGS_ID,%ecx pushl %ecx @@ -308,6 +308,11 @@ default_entry: testl %ecx,%eax jz 6f # No ID flag = no CPUID = no CR4 + movl $1,%eax + cpuid + andl $~1,%edx # Ignore CPUID.FPU + jz 6f # No flags or only CPUID.FPU = no CR4 + movl pa(mmu_cr4_features),%eax movl %eax,%cr4 diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 7720ff5a9ee2..efdec7cd8e01 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -8,8 +8,8 @@ * Tigran Aivazian <tigran@aivazian.fsnet.co.uk> * * Maintainers: - * Andreas Herrmann <andreas.herrmann3@amd.com> - * Borislav Petkov <borislav.petkov@amd.com> + * Andreas Herrmann <herrmann.der.user@googlemail.com> + * Borislav Petkov <bp@alien8.de> * * This driver allows to upgrade microcode on F10h AMD * CPUs and later. @@ -190,6 +190,7 @@ static unsigned int verify_patch_size(int cpu, u32 patch_size, #define F1XH_MPB_MAX_SIZE 2048 #define F14H_MPB_MAX_SIZE 1824 #define F15H_MPB_MAX_SIZE 4096 +#define F16H_MPB_MAX_SIZE 3458 switch (c->x86) { case 0x14: @@ -198,6 +199,9 @@ static unsigned int verify_patch_size(int cpu, u32 patch_size, case 0x15: max_size = F15H_MPB_MAX_SIZE; break; + case 0x16: + max_size = F16H_MPB_MAX_SIZE; + break; default: max_size = F1XH_MPB_MAX_SIZE; break; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index b00b33a18390..b629bbe0d9bd 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -22,6 +22,8 @@ #include <linux/perf_event.h> #include <linux/hw_breakpoint.h> #include <linux/rcupdate.h> +#include <linux/module.h> +#include <linux/context_tracking.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -166,6 +168,35 @@ static inline bool invalid_selector(u16 value) #define FLAG_MASK FLAG_MASK_32 +/* + * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode + * when it traps. The previous stack will be directly underneath the saved + * registers, and 'sp/ss' won't even have been saved. Thus the '&regs->sp'. + * + * Now, if the stack is empty, '&regs->sp' is out of range. In this + * case we try to take the previous stack. To always return a non-null + * stack pointer we fall back to regs as stack if no previous stack + * exists. + * + * This is valid only for kernel mode traps. + */ +unsigned long kernel_stack_pointer(struct pt_regs *regs) +{ + unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1); + unsigned long sp = (unsigned long)&regs->sp; + struct thread_info *tinfo; + + if (context == (sp & ~(THREAD_SIZE - 1))) + return sp; + + tinfo = (struct thread_info *)context; + if (tinfo->previous_esp) + return tinfo->previous_esp; + + return (unsigned long)regs; +} +EXPORT_SYMBOL_GPL(kernel_stack_pointer); + static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) { BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); @@ -1461,7 +1492,7 @@ long syscall_trace_enter(struct pt_regs *regs) { long ret = 0; - rcu_user_exit(); + user_exit(); /* * If we stepped into a sysenter/syscall insn, it trapped in @@ -1511,6 +1542,13 @@ void syscall_trace_leave(struct pt_regs *regs) { bool step; + /* + * We may come here right after calling schedule_user() + * or do_notify_resume(), in which case we can be in RCU + * user mode. 
+ */ + user_exit(); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) @@ -1527,5 +1565,5 @@ void syscall_trace_leave(struct pt_regs *regs) if (step || test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, step); - rcu_user_enter(); + user_enter(); } diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 4929c1be0ac0..801602b5d745 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -195,12 +195,6 @@ void read_persistent_clock(struct timespec *ts) ts->tv_nsec = 0; } -unsigned long long native_read_tsc(void) -{ - return __native_read_tsc(); -} -EXPORT_SYMBOL(native_read_tsc); - static struct resource rtc_resources[] = { [0] = { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 468e98dfd44e..ca45696f30fb 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -921,18 +921,19 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_64 if (max_pfn > max_low_pfn) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + unsigned long start, end; + unsigned long start_pfn, end_pfn; - if (ei->addr + ei->size <= 1UL << 32) - continue; + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, + NULL) { - if (ei->type == E820_RESERVED) + end = PFN_PHYS(end_pfn); + if (end <= (1UL<<32)) continue; + start = PFN_PHYS(start_pfn); max_pfn_mapped = init_memory_mapping( - ei->addr < 1UL << 32 ? 1UL << 32 : ei->addr, - ei->addr + ei->size); + max((1UL<<32), start), end); } /* can we preseve max_low_pfn ?*/ @@ -1048,6 +1049,18 @@ void __init setup_arch(char **cmdline_p) arch_init_ideal_nops(); register_refined_jiffies(CLOCK_TICK_RATE); + +#ifdef CONFIG_EFI + /* Once setup is done above, disable efi_enabled on mismatched + * firmware/kernel archtectures since there is no support for + * runtime services. 
+ */ + if (efi_enabled && IS_ENABLED(CONFIG_X86_64) != efi_64bit) { + pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n"); + efi_unmap_memmap(); + efi_enabled = 0; + } +#endif } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 70b27ee6118e..fbbb604313a2 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -22,6 +22,7 @@ #include <linux/uaccess.h> #include <linux/user-return-notifier.h> #include <linux/uprobes.h> +#include <linux/context_tracking.h> #include <asm/processor.h> #include <asm/ucontext.h> @@ -816,7 +817,7 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { - rcu_user_exit(); + user_exit(); #ifdef CONFIG_X86_MCE /* notify userspace of pending MCEs */ @@ -838,7 +839,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) fire_user_return_notifiers(); - rcu_user_enter(); + user_enter(); } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c80a33bc528b..f3e2ec878b8c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -68,6 +68,8 @@ #include <asm/mwait.h> #include <asm/apic.h> #include <asm/io_apic.h> +#include <asm/i387.h> +#include <asm/fpu-internal.h> #include <asm/setup.h> #include <asm/uv/uv.h> #include <linux/mc146818rtc.h> @@ -818,6 +820,9 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; + /* the FPU context is blank, nobody can own it */ + __cpu_disable_lazy_restore(cpu); + err = do_boot_cpu(apicid, cpu, tidle); if (err) { pr_debug("do_boot_cpu failed %d\n", err); diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index b4d3c3927dd8..97ef74b88e0f 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -21,37 +21,23 @@ /* * Align a virtual address to avoid aliasing in the I$ on AMD F15h. - * - * @flags denotes the allocation direction - bottomup or topdown - - * or vDSO; see call sites below. */ -unsigned long align_addr(unsigned long addr, struct file *filp, - enum align_flags flags) +static unsigned long get_align_mask(void) { - unsigned long tmp_addr; - /* handle 32- and 64-bit case with a single conditional */ if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32()))) - return addr; + return 0; if (!(current->flags & PF_RANDOMIZE)) - return addr; - - if (!((flags & ALIGN_VDSO) || filp)) - return addr; - - tmp_addr = addr; - - /* - * We need an address which is <= than the original - * one only when in topdown direction. 
- */ - if (!(flags & ALIGN_TOPDOWN)) - tmp_addr += va_align.mask; + return 0; - tmp_addr &= ~va_align.mask; + return va_align.mask; +} - return tmp_addr; +unsigned long align_vdso_addr(unsigned long addr) +{ + unsigned long align_mask = get_align_mask(); + return (addr + align_mask) & ~align_mask; } static int __init control_va_addr_alignment(char *str) @@ -126,7 +112,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long start_addr; + struct vm_unmapped_area_info info; unsigned long begin, end; if (flags & MAP_FIXED) @@ -144,50 +130,16 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, (!vma || addr + len <= vma->vm_start)) return addr; } - if (((flags & MAP_32BIT) || test_thread_flag(TIF_ADDR32)) - && len <= mm->cached_hole_size) { - mm->cached_hole_size = 0; - mm->free_area_cache = begin; - } - addr = mm->free_area_cache; - if (addr < begin) - addr = begin; - start_addr = addr; - -full_search: - - addr = align_addr(addr, filp, 0); - - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { - /* At this point: (!vma || addr < vma->vm_end). */ - if (end - len < addr) { - /* - * Start a new search - just in case we missed - * some holes. - */ - if (start_addr != begin) { - start_addr = addr = begin; - mm->cached_hole_size = 0; - goto full_search; - } - return -ENOMEM; - } - if (!vma || addr + len <= vma->vm_start) { - /* - * Remember the place where we stopped the search: - */ - mm->free_area_cache = addr + len; - return addr; - } - if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - addr = vma->vm_end; - addr = align_addr(addr, filp, 0); - } + info.flags = 0; + info.length = len; + info.low_limit = begin; + info.high_limit = end; + info.align_mask = filp ? 
get_align_mask() : 0; + info.align_offset = pgoff << PAGE_SHIFT; + return vm_unmapped_area(&info); } - unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, const unsigned long len, const unsigned long pgoff, @@ -195,7 +147,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - unsigned long addr = addr0, start_addr; + unsigned long addr = addr0; + struct vm_unmapped_area_info info; /* requested length too big for entire address space */ if (len > TASK_SIZE) @@ -217,51 +170,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; } - /* check if free_area_cache is useful for us */ - if (len <= mm->cached_hole_size) { - mm->cached_hole_size = 0; - mm->free_area_cache = mm->mmap_base; - } - -try_again: - /* either no address requested or can't fit in requested address hole */ - start_addr = addr = mm->free_area_cache; - - if (addr < len) - goto fail; - - addr -= len; - do { - addr = align_addr(addr, filp, ALIGN_TOPDOWN); - - /* - * Lookup failure means no vma is above this address, - * else if new region fits below vma->vm_start, - * return with success: - */ - vma = find_vma(mm, addr); - if (!vma || addr+len <= vma->vm_start) - /* remember the address as a hint for next time */ - return mm->free_area_cache = addr; - - /* remember the largest hole we saw so far */ - if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - - /* try just below the current vma->vm_start */ - addr = vma->vm_start-len; - } while (len < vma->vm_start); - -fail: - /* - * if hint left us with no space for the requested - * mapping then try again: - */ - if (start_addr != mm->mmap_base) { - mm->free_area_cache = mm->mmap_base; - mm->cached_hole_size = 0; - goto try_again; - } + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = mm->mmap_base; + info.align_mask = filp ? get_align_mask() : 0; + info.align_offset = pgoff << PAGE_SHIFT; + addr = vm_unmapped_area(&info); + if (!(addr & ~PAGE_MASK)) + return addr; + VM_BUG_ON(addr != -ENOMEM); bottomup: /* @@ -270,14 +188,5 @@ bottomup: * can happen with large stack limits and large mmap() * allocations. */ - mm->cached_hole_size = ~0UL; - mm->free_area_cache = TASK_UNMAPPED_BASE; - addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); - /* - * Restore the topdown base: - */ - mm->free_area_cache = mm->mmap_base; - mm->cached_hole_size = ~0UL; - - return addr; + return arch_get_unmapped_area(filp, addr0, len, pgoff, flags); } diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c new file mode 100644 index 000000000000..25b993729f9b --- /dev/null +++ b/arch/x86/kernel/trace_clock.c @@ -0,0 +1,21 @@ +/* + * X86 trace clocks + */ +#include <asm/trace_clock.h> +#include <asm/barrier.h> +#include <asm/msr.h> + +/* + * trace_clock_x86_tsc(): A clock that is just the cycle counter. + * + * Unlike the other clocks, this is not in nanoseconds. 
+ */ +u64 notrace trace_clock_x86_tsc(void) +{ + u64 ret; + + rdtsc_barrier(); + rdtscll(ret); + + return ret; +} diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 8276dc6794cc..eb8586693e0b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -55,7 +55,7 @@ #include <asm/i387.h> #include <asm/fpu-internal.h> #include <asm/mce.h> -#include <asm/rcu.h> +#include <asm/context_tracking.h> #include <asm/mach_traps.h> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index cfa5d4f7ca56..06ccb5073a3f 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -77,6 +77,12 @@ unsigned long long sched_clock(void) __attribute__((alias("native_sched_clock"))); #endif +unsigned long long native_read_tsc(void) +{ + return __native_read_tsc(); +} +EXPORT_SYMBOL(native_read_tsc); + int check_tsc_unstable(void) { return tsc_unstable; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index aafa5557b396..c71025b67462 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -478,6 +478,11 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) regs->ip = current->utask->xol_vaddr; pre_xol_rip_insn(auprobe, regs, autask); + autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF); + regs->flags |= X86_EFLAGS_TF; + if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) + set_task_blockstep(current, false); + return 0; } @@ -603,6 +608,16 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) if (auprobe->fixups & UPROBE_FIX_CALL) result = adjust_ret_addr(regs->sp, correction); + /* + * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP + * so we can get an extra SIGTRAP if we do not clear TF. We need + * to examine the opcode to make it right. + */ + if (utask->autask.saved_tf) + send_sig(SIGTRAP, current, 0); + else if (!(auprobe->fixups & UPROBE_FIX_SETF)) + regs->flags &= ~X86_EFLAGS_TF; + return result; } @@ -647,6 +662,10 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) current->thread.trap_nr = utask->autask.saved_trap_nr; handle_riprel_post_xol(auprobe, regs, NULL); instruction_pointer_set(regs, utask->vaddr); + + /* clear TF if it was set by us in arch_uprobe_pre_xol() */ + if (!utask->autask.saved_tf) + regs->flags &= ~X86_EFLAGS_TF; } /* @@ -676,38 +695,3 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) send_sig(SIGTRAP, current, 0); return ret; } - -void arch_uprobe_enable_step(struct arch_uprobe *auprobe) -{ - struct task_struct *task = current; - struct arch_uprobe_task *autask = &task->utask->autask; - struct pt_regs *regs = task_pt_regs(task); - - autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF); - - regs->flags |= X86_EFLAGS_TF; - if (test_tsk_thread_flag(task, TIF_BLOCKSTEP)) - set_task_blockstep(task, false); -} - -void arch_uprobe_disable_step(struct arch_uprobe *auprobe) -{ - struct task_struct *task = current; - struct arch_uprobe_task *autask = &task->utask->autask; - bool trapped = (task->utask->state == UTASK_SSTEP_TRAPPED); - struct pt_regs *regs = task_pt_regs(task); - /* - * The state of TIF_BLOCKSTEP was not saved so we can get an extra - * SIGTRAP if we do not clear TF. We need to examine the opcode to - * make it right. 
- */ - if (unlikely(trapped)) { - if (!autask->saved_tf) - regs->flags &= ~X86_EFLAGS_TF; - } else { - if (autask->saved_tf) - send_sig(SIGTRAP, task, 0); - else if (!(auprobe->fixups & UPROBE_FIX_SETF)) - regs->flags &= ~X86_EFLAGS_TF; - } -} diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index a10e46016851..58fc51488828 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -24,6 +24,9 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; + if (!static_cpu_has(X86_FEATURE_XSAVE)) + return 0; + best = kvm_find_cpuid_entry(vcpu, 1, 0); return best && (best->ecx & bit(X86_FEATURE_XSAVE)); } diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 39171cb307ea..bba39bfa1c4b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -426,8 +426,7 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) _ASM_EXTABLE(1b, 3b) \ : "=m" ((ctxt)->eflags), "=&r" (_tmp), \ "+a" (*rax), "+d" (*rdx), "+qm"(_ex) \ - : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val), \ - "a" (*rax), "d" (*rdx)); \ + : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val)); \ } while (0) /* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ad6b1dd06f8b..f85815945fc6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6549,19 +6549,22 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) } } - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); /* Exposing INVPCID only when PCID is exposed */ best = kvm_find_cpuid_entry(vcpu, 0x7, 0); if (vmx_invpcid_supported() && best && (best->ebx & bit(X86_FEATURE_INVPCID)) && guest_cpuid_has_pcid(vcpu)) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } else { - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); + if (cpu_has_secondary_exec_ctrls()) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); + } if (best) best->ebx &= ~bit(X86_FEATURE_INVPCID); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1eefebe5d727..4f7641756be2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3779,7 +3779,7 @@ static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, { struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0]; - memcpy(vcpu->run->mmio.data, frag->data, frag->len); + memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len)); return X86EMUL_CONTINUE; } @@ -3832,18 +3832,11 @@ mmio: bytes -= handled; val += handled; - while (bytes) { - unsigned now = min(bytes, 8U); - - frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; - frag->gpa = gpa; - frag->data = val; - frag->len = now; - - gpa += now; - val += now; - bytes -= now; - } + WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS); + frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; + frag->gpa = gpa; + frag->data = val; + frag->len = bytes; return X86EMUL_CONTINUE; } @@ -3890,7 +3883,7 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, vcpu->mmio_needed = 1; vcpu->mmio_cur_fragment = 0; - vcpu->run->mmio.len = vcpu->mmio_fragments[0].len; + vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len); vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write; vcpu->run->exit_reason = 
KVM_EXIT_MMIO; vcpu->run->mmio.phys_addr = gpa; @@ -5522,28 +5515,44 @@ static int complete_emulated_pio(struct kvm_vcpu *vcpu) * * read: * for each fragment - * write gpa, len - * exit - * copy data + * for each mmio piece in the fragment + * write gpa, len + * exit + * copy data * execute insn * * write: * for each fragment - * write gpa, len - * copy data - * exit + * for each mmio piece in the fragment + * write gpa, len + * copy data + * exit */ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; struct kvm_mmio_fragment *frag; + unsigned len; BUG_ON(!vcpu->mmio_needed); /* Complete previous fragment */ - frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++]; + frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment]; + len = min(8u, frag->len); if (!vcpu->mmio_is_write) - memcpy(frag->data, run->mmio.data, frag->len); + memcpy(frag->data, run->mmio.data, len); + + if (frag->len <= 8) { + /* Switch to the next fragment. */ + frag++; + vcpu->mmio_cur_fragment++; + } else { + /* Go forward to the next mmio piece. */ + frag->data += len; + frag->gpa += len; + frag->len -= len; + } + if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { vcpu->mmio_needed = 0; if (vcpu->mmio_is_write) @@ -5551,13 +5560,12 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) vcpu->mmio_read_completed = 1; return complete_emulated_io(vcpu); } - /* Initiate next fragment */ - ++frag; + run->exit_reason = KVM_EXIT_MMIO; run->mmio.phys_addr = frag->gpa; if (vcpu->mmio_is_write) - memcpy(run->mmio.data, frag->data, frag->len); - run->mmio.len = frag->len; + memcpy(run->mmio.data, frag->data, min(8u, frag->len)); + run->mmio.len = min(8u, frag->len); run->mmio.is_write = vcpu->mmio_is_write; vcpu->arch.complete_userspace_io = complete_emulated_mmio; return 0; @@ -5773,6 +5781,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int pending_vec, max_bits, idx; struct desc_ptr dt; + if (!guest_cpuid_has_xsave(vcpu) && (sregs->cr4 & X86_CR4_OSXSAVE)) + return -EINVAL; + dt.size = sregs->idt.limit; dt.address = sregs->idt.base; kvm_x86_ops->set_idt(vcpu, &dt); diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 6b34d04d096a..176cca67212b 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -5,91 +5,89 @@ #include <asm/alternative-asm.h> ALIGN -copy_page_c: +copy_page_rep: CFI_STARTPROC - movl $4096/8,%ecx - rep movsq + movl $4096/8, %ecx + rep movsq ret CFI_ENDPROC -ENDPROC(copy_page_c) +ENDPROC(copy_page_rep) -/* Don't use streaming store because it's better when the target - ends up in cache. */ - -/* Could vary the prefetch distance based on SMP/UP */ +/* + * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD. + * Could vary the prefetch distance based on SMP/UP. 
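The x86.c rework above stops requiring one mmio_fragment per userspace exit: kvm_run's mmio buffer carries at most 8 bytes, so a long fragment is now completed in 8-byte pieces, advancing data/gpa/len in place until the fragment is drained. A minimal sketch of that piece-walking loop (plain C model; names are illustrative, not the KVM structures):

#include <stdio.h>

struct frag { unsigned char *data; unsigned long gpa; unsigned len; };

static unsigned min_u(unsigned a, unsigned b) { return a < b ? a : b; }

/* one userspace exit; returns nonzero while more exits are needed */
static int complete_piece(struct frag *f)
{
    unsigned len = min_u(8u, f->len); /* run->mmio.data holds at most 8 bytes */

    printf("exit: gpa=%#lx len=%u\n", f->gpa, len);
    f->data += len;                   /* step forward inside the fragment */
    f->gpa  += len;
    f->len  -= len;
    return f->len != 0;
}

int main(void)
{
    unsigned char buf[20];
    struct frag f = { buf, 0xfee00000UL, sizeof(buf) };

    while (complete_piece(&f))
        ;                             /* 20 bytes -> exits of 8, 8, 4 */
    return 0;
}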
+*/ ENTRY(copy_page) CFI_STARTPROC - subq $2*8,%rsp + subq $2*8, %rsp CFI_ADJUST_CFA_OFFSET 2*8 - movq %rbx,(%rsp) + movq %rbx, (%rsp) CFI_REL_OFFSET rbx, 0 - movq %r12,1*8(%rsp) + movq %r12, 1*8(%rsp) CFI_REL_OFFSET r12, 1*8 - movl $(4096/64)-5,%ecx + movl $(4096/64)-5, %ecx .p2align 4 .Loop64: - dec %rcx - - movq (%rsi), %rax - movq 8 (%rsi), %rbx - movq 16 (%rsi), %rdx - movq 24 (%rsi), %r8 - movq 32 (%rsi), %r9 - movq 40 (%rsi), %r10 - movq 48 (%rsi), %r11 - movq 56 (%rsi), %r12 + dec %rcx + movq 0x8*0(%rsi), %rax + movq 0x8*1(%rsi), %rbx + movq 0x8*2(%rsi), %rdx + movq 0x8*3(%rsi), %r8 + movq 0x8*4(%rsi), %r9 + movq 0x8*5(%rsi), %r10 + movq 0x8*6(%rsi), %r11 + movq 0x8*7(%rsi), %r12 prefetcht0 5*64(%rsi) - movq %rax, (%rdi) - movq %rbx, 8 (%rdi) - movq %rdx, 16 (%rdi) - movq %r8, 24 (%rdi) - movq %r9, 32 (%rdi) - movq %r10, 40 (%rdi) - movq %r11, 48 (%rdi) - movq %r12, 56 (%rdi) + movq %rax, 0x8*0(%rdi) + movq %rbx, 0x8*1(%rdi) + movq %rdx, 0x8*2(%rdi) + movq %r8, 0x8*3(%rdi) + movq %r9, 0x8*4(%rdi) + movq %r10, 0x8*5(%rdi) + movq %r11, 0x8*6(%rdi) + movq %r12, 0x8*7(%rdi) - leaq 64 (%rsi), %rsi - leaq 64 (%rdi), %rdi + leaq 64 (%rsi), %rsi + leaq 64 (%rdi), %rdi - jnz .Loop64 + jnz .Loop64 - movl $5,%ecx + movl $5, %ecx .p2align 4 .Loop2: - decl %ecx - - movq (%rsi), %rax - movq 8 (%rsi), %rbx - movq 16 (%rsi), %rdx - movq 24 (%rsi), %r8 - movq 32 (%rsi), %r9 - movq 40 (%rsi), %r10 - movq 48 (%rsi), %r11 - movq 56 (%rsi), %r12 - - movq %rax, (%rdi) - movq %rbx, 8 (%rdi) - movq %rdx, 16 (%rdi) - movq %r8, 24 (%rdi) - movq %r9, 32 (%rdi) - movq %r10, 40 (%rdi) - movq %r11, 48 (%rdi) - movq %r12, 56 (%rdi) - - leaq 64(%rdi),%rdi - leaq 64(%rsi),%rsi - + decl %ecx + + movq 0x8*0(%rsi), %rax + movq 0x8*1(%rsi), %rbx + movq 0x8*2(%rsi), %rdx + movq 0x8*3(%rsi), %r8 + movq 0x8*4(%rsi), %r9 + movq 0x8*5(%rsi), %r10 + movq 0x8*6(%rsi), %r11 + movq 0x8*7(%rsi), %r12 + + movq %rax, 0x8*0(%rdi) + movq %rbx, 0x8*1(%rdi) + movq %rdx, 0x8*2(%rdi) + movq %r8, 0x8*3(%rdi) + movq %r9, 0x8*4(%rdi) + movq %r10, 0x8*5(%rdi) + movq %r11, 0x8*6(%rdi) + movq %r12, 0x8*7(%rdi) + + leaq 64(%rdi), %rdi + leaq 64(%rsi), %rsi jnz .Loop2 - movq (%rsp),%rbx + movq (%rsp), %rbx CFI_RESTORE rbx - movq 1*8(%rsp),%r12 + movq 1*8(%rsp), %r12 CFI_RESTORE r12 - addq $2*8,%rsp + addq $2*8, %rsp CFI_ADJUST_CFA_OFFSET -2*8 ret .Lcopy_page_end: @@ -103,7 +101,7 @@ ENDPROC(copy_page) .section .altinstr_replacement,"ax" 1: .byte 0xeb /* jmp <disp8> */ - .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */ + .byte (copy_page_rep - copy_page) - (2f - 1b) /* offset */ 2: .previous .section .altinstructions,"a" diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8e13ecb41bee..7a529cbab7ad 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -18,7 +18,7 @@ #include <asm/pgalloc.h> /* pgd_*(), ... */ #include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ #include <asm/fixmap.h> /* VSYSCALL_START */ -#include <asm/rcu.h> /* exception_enter(), ... */ +#include <asm/context_tracking.h> /* exception_enter(), ... 
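For reference, the copy_page loop that the cleanup above reformats moves one 4096-byte page in 64-byte chunks, eight quadword loads followed by eight stores, with a prefetcht0 five cache lines ahead; the altinstructions machinery patches in the plain rep-movsq variant (copy_page_rep) on CPUs with X86_FEATURE_REP_GOOD. A C analogue of the unrolled loop, for illustration only (the authoritative code is the assembly above):

#include <stdint.h>
#include <stddef.h>

#define PAGE_SIZE 4096

static void copy_page_unrolled(void *to, const void *from)
{
    uint64_t *d = to;
    const uint64_t *s = from;
    size_t i;

    for (i = 0; i < PAGE_SIZE / 64; i++) {
        /* eight loads, then eight stores, one 64-byte chunk per pass */
        uint64_t a = s[0], b = s[1], c = s[2], e = s[3];
        uint64_t f = s[4], g = s[5], h = s[6], j = s[7];

        d[0] = a; d[1] = b; d[2] = c; d[3] = e;
        d[4] = f; d[5] = g; d[6] = h; d[7] = j;
        s += 8;
        d += 8;
    }
}

int main(void)
{
    static uint64_t src[PAGE_SIZE / 8], dst[PAGE_SIZE / 8];

    src[0] = 0x1122334455667788ULL;
    copy_page_unrolled(dst, src);
    return dst[0] != src[0];          /* exit 0 on success */
}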
*/ /* * Page fault error code bits: diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 937bff5cdaa7..ae1aa71d0115 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -274,42 +274,15 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long start_addr; - - if (len > mm->cached_hole_size) { - start_addr = mm->free_area_cache; - } else { - start_addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; - } - -full_search: - addr = ALIGN(start_addr, huge_page_size(h)); - - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { - /* At this point: (!vma || addr < vma->vm_end). */ - if (TASK_SIZE - len < addr) { - /* - * Start a new search - just in case we missed - * some holes. - */ - if (start_addr != TASK_UNMAPPED_BASE) { - start_addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; - goto full_search; - } - return -ENOMEM; - } - if (!vma || addr + len <= vma->vm_start) { - mm->free_area_cache = addr + len; - return addr; - } - if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - addr = ALIGN(vma->vm_end, huge_page_size(h)); - } + struct vm_unmapped_area_info info; + + info.flags = 0; + info.length = len; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + return vm_unmapped_area(&info); } static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, @@ -317,83 +290,30 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long base = mm->mmap_base; - unsigned long addr = addr0; - unsigned long largest_hole = mm->cached_hole_size; - unsigned long start_addr; - - /* don't allow allocations above current base */ - if (mm->free_area_cache > base) - mm->free_area_cache = base; - - if (len <= largest_hole) { - largest_hole = 0; - mm->free_area_cache = base; - } -try_again: - start_addr = mm->free_area_cache; - - /* make sure it can fit in the remaining address space */ - if (mm->free_area_cache < len) - goto fail; - - /* either no address requested or can't fit in requested address hole */ - addr = (mm->free_area_cache - len) & huge_page_mask(h); - do { - /* - * Lookup failure means no vma is above this address, - * i.e. 
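The bottom-up hugetlb path above replaces an open-coded VMA walk with a vm_unmapped_area_info descriptor; the interesting field is align_mask = PAGE_MASK & ~huge_page_mask(h), which keeps exactly the bits that must be zero in a huge-page-aligned address. A small demonstration of that arithmetic, assuming 4K base pages, 2M huge pages, and align_offset == 0 (the round-up shown is the effect vm_unmapped_area() applies internally):

#include <stdio.h>

int main(void)
{
    unsigned long page_mask      = ~(4096UL - 1);        /* PAGE_MASK */
    unsigned long huge_page_mask = ~((2UL << 20) - 1);   /* 2M huge pages */
    unsigned long align_mask = page_mask & ~huge_page_mask; /* 0x1ff000 */
    unsigned long addr = 0x7f1234567000UL;               /* page-aligned candidate */
    unsigned long aligned = (addr + align_mask) & ~align_mask;

    printf("align_mask = %#lx\n", align_mask);
    printf("%#lx -> %#lx (2M aligned)\n", addr, aligned);
    return 0;
}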
return with success: - */ - vma = find_vma(mm, addr); - if (!vma) - return addr; + struct vm_unmapped_area_info info; + unsigned long addr; - if (addr + len <= vma->vm_start) { - /* remember the address as a hint for next time */ - mm->cached_hole_size = largest_hole; - return (mm->free_area_cache = addr); - } else if (mm->free_area_cache == vma->vm_end) { - /* pull free_area_cache down to the first hole */ - mm->free_area_cache = vma->vm_start; - mm->cached_hole_size = largest_hole; - } + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = current->mm->mmap_base; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + addr = vm_unmapped_area(&info); - /* remember the largest hole we saw so far */ - if (addr + largest_hole < vma->vm_start) - largest_hole = vma->vm_start - addr; - - /* try just below the current vma->vm_start */ - addr = (vma->vm_start - len) & huge_page_mask(h); - } while (len <= vma->vm_start); - -fail: - /* - * if hint left us with no space for the requested - * mapping then try again: - */ - if (start_addr != base) { - mm->free_area_cache = base; - largest_hole = 0; - goto try_again; - } /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. This scenario * can happen with large stack limits and large mmap() * allocations. */ - mm->free_area_cache = TASK_UNMAPPED_BASE; - mm->cached_hole_size = ~0UL; - addr = hugetlb_get_unmapped_area_bottomup(file, addr0, - len, pgoff, flags); - - /* - * Restore the topdown base: - */ - mm->free_area_cache = base; - mm->cached_hole_size = ~0UL; + if (addr & ~PAGE_MASK) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + addr = vm_unmapped_area(&info); + } return addr; } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index ab1f6a93b527..d7aea41563b3 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -35,40 +35,44 @@ struct map_range { unsigned page_size_mask; }; -static void __init find_early_table_space(struct map_range *mr, unsigned long end, - int use_pse, int use_gbpages) +/* + * First calculate space needed for kernel direct mapping page tables to cover + * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB + * pages. Then find enough contiguous space for those page tables. 
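The top-down conversion above keeps the old fallback policy: vm_unmapped_area() returns either a page-aligned address or a negative errno cast to unsigned long, so addr & ~PAGE_MASK detects failure (an errno value is never page aligned), and the search is retried bottom-up, which helps when a large stack limit leaves little room under mmap_base. A sketch of that error-detection idiom, with made-up values standing in for the allocator:

#include <stdio.h>

#define PAGE_MASK (~(4096UL - 1))
#define ENOMEM_UL ((unsigned long)-12)   /* -ENOMEM cast to unsigned long */

static unsigned long search(int bottomup)
{
    return bottomup ? 0x7f0000000000UL : ENOMEM_UL;
}

int main(void)
{
    unsigned long addr = search(0);      /* simulate top-down failure */

    if (addr & ~PAGE_MASK)               /* low bits set => errno, not address */
        addr = search(1);                /* retry bottom-up */
    printf("addr = %#lx\n", addr);
    return 0;
}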
+ */ +static void __init find_early_table_space(struct map_range *mr, int nr_range) { - unsigned long puds, pmds, ptes, tables, start = 0, good_end = end; + int i; + unsigned long puds = 0, pmds = 0, ptes = 0, tables; + unsigned long start = 0, good_end; phys_addr_t base; - puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); - - if (use_gbpages) { - unsigned long extra; - - extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); - pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; - } else - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + for (i = 0; i < nr_range; i++) { + unsigned long range, extra; - tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); + range = mr[i].end - mr[i].start; + puds += (range + PUD_SIZE - 1) >> PUD_SHIFT; - if (use_pse) { - unsigned long extra; + if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) { + extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT); + pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT; + } else { + pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT; + } - extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); + if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) { + extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT); #ifdef CONFIG_X86_32 - extra += PMD_SIZE; + extra += PMD_SIZE; #endif - /* The first 2/4M doesn't use large pages. */ - if (mr->start < PMD_SIZE) - extra += mr->end - mr->start; - - ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; - } else - ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; + } else { + ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT; + } + } + tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); #ifdef CONFIG_X86_32 @@ -86,7 +90,7 @@ static void __init find_early_table_space(struct map_range *mr, unsigned long en pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n", - end - 1, pgt_buf_start << PAGE_SHIFT, + mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT, (pgt_buf_top << PAGE_SHIFT) - 1); } @@ -267,7 +271,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * nodes are discovered. */ if (!after_bootmem) - find_early_table_space(&mr[0], end, use_pse, use_gbpages); + find_early_table_space(mr, nr_range); for (i = 0; i < nr_range; i++) ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 2b6b4a3c8beb..3baff255adac 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -386,7 +386,8 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, * these mappings are more intelligent. */ if (pte_val(*pte)) { - pages++; + if (!after_bootmem) + pages++; continue; } @@ -451,6 +452,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, * attributes. */ if (page_size_mask & (1 << PG_LEVEL_2M)) { + if (!after_bootmem) + pages++; last_map_addr = next; continue; } @@ -526,6 +529,8 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, * attributes. 
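find_early_table_space() above now walks every map_range and counts PUD/PMD/PTE entries per range, charging PTE pages only for the part of a range that 2M (or 1G) mappings cannot cover. A worked example of that accounting, assuming x86_64 constants (a PUD entry spans 1G, a PMD entry 2M, a PTE entry 4K):

#include <stdio.h>

#define PUD_SHIFT 30
#define PMD_SHIFT 21
#define PAGE_SHIFT 12
/* entries needed to span `range` bytes at the given granularity */
#define ENTRIES(range, shift) (((range) + (1UL << (shift)) - 1) >> (shift))

int main(void)
{
    unsigned long range = (1UL << 30) + (3UL << 20); /* 1G + 3M, 2M-mappable */
    unsigned long puds = ENTRIES(range, PUD_SHIFT);  /* 2 */
    unsigned long pmds = ENTRIES(range, PMD_SHIFT);  /* 514 */
    /* with PG_LEVEL_2M set, only the tail below 2M granularity needs PTEs */
    unsigned long extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT);
    unsigned long ptes = ENTRIES(extra, PAGE_SHIFT); /* 256 for the 1M tail */

    printf("puds=%lu pmds=%lu ptes=%lu\n", puds, pmds, ptes);
    /* each table is one 4K page of 512 eight-byte entries, rounded up */
    return 0;
}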
*/ if (page_size_mask & (1 << PG_LEVEL_1G)) { + if (!after_bootmem) + pages++; last_map_addr = next; continue; } diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 0777f042e400..60f926cd8b0e 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -197,7 +197,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, } if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 - || vmflag == VM_HUGETLB) { + || vmflag & VM_HUGETLB) { local_flush_tlb(); goto flush_all; } diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c index 41bd2a2d2c50..b914e20b5a00 100644 --- a/arch/x86/pci/ce4100.c +++ b/arch/x86/pci/ce4100.c @@ -115,6 +115,16 @@ static void sata_revid_read(struct sim_dev_reg *reg, u32 *value) reg_read(reg, value); } +static void reg_noirq_read(struct sim_dev_reg *reg, u32 *value) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&pci_config_lock, flags); + /* force interrupt pin value to 0 */ + *value = reg->sim_reg.value & 0xfff00ff; + raw_spin_unlock_irqrestore(&pci_config_lock, flags); +} + static struct sim_dev_reg bus1_fixups[] = { DEFINE_REG(2, 0, 0x10, (16*MB), reg_init, reg_read, reg_write) DEFINE_REG(2, 0, 0x14, (256), reg_init, reg_read, reg_write) @@ -144,6 +154,7 @@ static struct sim_dev_reg bus1_fixups[] = { DEFINE_REG(11, 5, 0x10, (64*KB), reg_init, reg_read, reg_write) DEFINE_REG(11, 6, 0x10, (256), reg_init, reg_read, reg_write) DEFINE_REG(11, 7, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(11, 7, 0x3c, 256, reg_init, reg_noirq_read, reg_write) DEFINE_REG(12, 0, 0x10, (128*KB), reg_init, reg_read, reg_write) DEFINE_REG(12, 0, 0x14, (256), reg_init, reg_read, reg_write) DEFINE_REG(12, 1, 0x10, (1024), reg_init, reg_read, reg_write) @@ -161,8 +172,10 @@ static struct sim_dev_reg bus1_fixups[] = { DEFINE_REG(16, 0, 0x10, (64*KB), reg_init, reg_read, reg_write) DEFINE_REG(16, 0, 0x14, (64*MB), reg_init, reg_read, reg_write) DEFINE_REG(16, 0, 0x18, (64*MB), reg_init, reg_read, reg_write) + DEFINE_REG(16, 0, 0x3c, 256, reg_init, reg_noirq_read, reg_write) DEFINE_REG(17, 0, 0x10, (128*KB), reg_init, reg_read, reg_write) DEFINE_REG(18, 0, 0x10, (1*KB), reg_init, reg_read, reg_write) + DEFINE_REG(18, 0, 0x3c, 256, reg_init, reg_noirq_read, reg_write) }; static void __init init_sim_regs(void) diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index 4c61b52191eb..f8ab4945892e 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -21,12 +21,25 @@ #include <asm/i8259.h> #include <asm/io.h> #include <asm/io_apic.h> +#include <asm/emergency-restart.h> static int ce4100_i8042_detect(void) { return 0; } +/* + * The CE4100 platform has an internal 8051 Microcontroller which is + * responsible for signaling to the external Power Management Unit the + * intention to reset, reboot or power off the system. This 8051 device has + * its command register mapped at I/O port 0xcf9 and the value 0x4 is used + * to power off the system. 
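The tlb.c hunk above is a one-character bitmask fix worth calling out: vmflag carries multiple VM_* bits, so vmflag == VM_HUGETLB only matched when the huge-page bit was the sole flag set and silently missed combinations such as VM_HUGETLB | VM_WRITE. A minimal illustration (flag values as in kernel headers of that era, shown here only for the demo):

#include <stdio.h>

#define VM_WRITE   0x00000002UL
#define VM_HUGETLB 0x00400000UL

int main(void)
{
    unsigned long vmflag = VM_HUGETLB | VM_WRITE;

    printf("==: %d\n", vmflag == VM_HUGETLB);    /* 0: misses combined flags */
    printf("&:  %d\n", !!(vmflag & VM_HUGETLB)); /* 1: correct bit test */
    return 0;
}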
+ */ +static void ce4100_power_off(void) +{ + outb(0x4, 0xcf9); +} + #ifdef CONFIG_SERIAL_8250 static unsigned int mem_serial_in(struct uart_port *p, int offset) @@ -92,8 +105,11 @@ static void ce4100_serial_fixup(int port, struct uart_port *up, up->membase = (void __iomem *)__fix_to_virt(FIX_EARLYCON_MEM_BASE); up->membase += up->mapbase & ~PAGE_MASK; + up->mapbase += port * 0x100; + up->membase += port * 0x100; up->iotype = UPIO_MEM32; up->regshift = 2; + up->irq = 4; } #endif up->iobase = 0; @@ -139,8 +155,19 @@ void __init x86_ce4100_early_setup(void) x86_init.mpparse.find_smp_config = x86_init_noop; x86_init.pci.init = ce4100_pci_init; + /* + * By default, the reboot method is ACPI which is supported by the + * CE4100 bootloader CEFDK using FADT.ResetReg Address and ResetValue + * the bootloader will however issue a system power off instead of + * reboot. By using BOOT_KBD we ensure proper system reboot as + * expected. + */ + reboot_type = BOOT_KBD; + #ifdef CONFIG_X86_IO_APIC x86_init.pci.init_irq = sdv_pci_init; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck; #endif + + pm_power_off = ce4100_power_off; } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index aded2a91162a..ad4439145f85 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -70,11 +70,15 @@ EXPORT_SYMBOL(efi); struct efi_memory_map memmap; bool efi_64bit; -static bool efi_native; static struct efi efi_phys __initdata; static efi_system_table_t efi_systab __initdata; +static inline bool efi_is_native(void) +{ + return IS_ENABLED(CONFIG_X86_64) == efi_64bit; +} + static int __init setup_noefi(char *arg) { efi_enabled = 0; @@ -420,7 +424,7 @@ void __init efi_reserve_boot_services(void) } } -static void __init efi_unmap_memmap(void) +void __init efi_unmap_memmap(void) { if (memmap.map) { early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); @@ -432,7 +436,7 @@ void __init efi_free_boot_services(void) { void *p; - if (!efi_native) + if (!efi_is_native()) return; for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { @@ -684,12 +688,10 @@ void __init efi_init(void) return; } efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; - efi_native = !efi_64bit; #else efi_phys.systab = (efi_system_table_t *) (boot_params.efi_info.efi_systab | ((__u64)boot_params.efi_info.efi_systab_hi<<32)); - efi_native = efi_64bit; #endif if (efi_systab_init(efi_phys.systab)) { @@ -723,7 +725,7 @@ void __init efi_init(void) * that doesn't match the kernel 32/64-bit mode. */ - if (!efi_native) + if (!efi_is_native()) pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n"); else if (efi_runtime_init()) { efi_enabled = 0; @@ -735,7 +737,7 @@ void __init efi_init(void) return; } #ifdef CONFIG_X86_32 - if (efi_native) { + if (efi_is_native()) { x86_platform.get_wallclock = efi_get_time; x86_platform.set_wallclock = efi_set_rtc_mmss; } @@ -810,6 +812,16 @@ void __iomem *efi_lookup_mapped_addr(u64 phys_addr) return NULL; } +void efi_memory_uc(u64 addr, unsigned long size) +{ + unsigned long page_shift = 1UL << EFI_PAGE_SHIFT; + u64 npages; + + npages = round_up(size, page_shift) / page_shift; + memrange_efi_to_native(&addr, &npages); + set_memory_uc(addr, npages); +} + /* * This function will switch the EFI runtime services to virtual mode. 
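efi_memory_uc() above rounds a byte count up to whole EFI pages before marking the range uncached; note the local is named page_shift but actually holds the page size (1UL << EFI_PAGE_SHIFT). The arithmetic, assuming EFI_PAGE_SHIFT == 12:

#include <stdio.h>

#define EFI_PAGE_SHIFT 12

int main(void)
{
    unsigned long page_size = 1UL << EFI_PAGE_SHIFT;           /* 4096 */
    unsigned long size = 0x2800;                               /* 10 KiB region */
    unsigned long npages = (size + page_size - 1) / page_size; /* round_up(size)/size */

    printf("npages = %lu\n", npages);                          /* 3 */
    return 0;
}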
* Essentially, look through the EFI memmap and map every region that @@ -823,7 +835,7 @@ void __init efi_enter_virtual_mode(void) efi_memory_desc_t *md, *prev_md = NULL; efi_status_t status; unsigned long size; - u64 end, systab, addr, npages, end_pfn; + u64 end, systab, end_pfn; void *p, *va, *new_memmap = NULL; int count = 0; @@ -834,7 +846,7 @@ void __init efi_enter_virtual_mode(void) * non-native EFI */ - if (!efi_native) { + if (!efi_is_native()) { efi_unmap_memmap(); return; } @@ -879,10 +891,14 @@ void __init efi_enter_virtual_mode(void) end_pfn = PFN_UP(end); if (end_pfn <= max_low_pfn_mapped || (end_pfn > (1UL << (32 - PAGE_SHIFT)) - && end_pfn <= max_pfn_mapped)) + && end_pfn <= max_pfn_mapped)) { va = __va(md->phys_addr); - else - va = efi_ioremap(md->phys_addr, size, md->type); + + if (!(md->attribute & EFI_MEMORY_WB)) + efi_memory_uc((u64)(unsigned long)va, size); + } else + va = efi_ioremap(md->phys_addr, size, + md->type, md->attribute); md->virt_addr = (u64) (unsigned long) va; @@ -892,13 +908,6 @@ void __init efi_enter_virtual_mode(void) continue; } - if (!(md->attribute & EFI_MEMORY_WB)) { - addr = md->virt_addr; - npages = md->num_pages; - memrange_efi_to_native(&addr, &npages); - set_memory_uc(addr, npages); - } - systab = (u64) (unsigned long) efi_phys.systab; if (md->phys_addr <= systab && systab < end) { systab += md->virt_addr - md->phys_addr; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index ac3aa54e2654..95fd505dfeb6 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -82,7 +82,7 @@ void __init efi_call_phys_epilog(void) } void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, - u32 type) + u32 type, u64 attribute) { unsigned long last_map_pfn; @@ -92,8 +92,11 @@ void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) { unsigned long top = last_map_pfn << PAGE_SHIFT; - efi_ioremap(top, size - (top - phys_addr), type); + efi_ioremap(top, size - (top - phys_addr), type, attribute); } + if (!(attribute & EFI_MEMORY_WB)) + efi_memory_uc((u64)(unsigned long)__va(phys_addr), size); + return (void __iomem *)__va(phys_addr); } diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 00aaf047b39f..431e87544411 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -141,7 +141,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) * unaligned here as a result of stack start randomization. */ addr = PAGE_ALIGN(addr); - addr = align_addr(addr, NULL, ALIGN_VDSO); + addr = align_vdso_addr(addr); return addr; } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 6226c99729b9..dcf5f2dd91ec 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1288,6 +1288,25 @@ unsigned long xen_read_cr2_direct(void) return this_cpu_read(xen_vcpu_info.arch.cr2); } +void xen_flush_tlb_all(void) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + trace_xen_mmu_flush_tlb_all(0); + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_TLB_FLUSH_ALL; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} static void xen_flush_tlb(void) { struct mmuext_op *op; @@ -2518,7 +2537,7 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, err = 0; out: - flush_tlb_all(); + xen_flush_tlb_all(); return err; } |
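Finally, the new xen_flush_tlb_all() queues an MMUEXT_TLB_FLUSH_ALL operation through Xen's multicall batching layer instead of the native flush_tlb_all(), which is what the remap path at the end of the diff now calls; under lazy MMU mode the flush can be coalesced with other pending operations. A stub-based sketch of the call shape (the real xen_mc_entry()/MULTI_mmuext_op()/xen_mc_issue() live in the Xen paravirt layer, and the constant's value here is illustrative, taken on the assumption it matches xen/interface/xen.h):

#include <stdio.h>

#define MMUEXT_TLB_FLUSH_ALL 10  /* illustrative; real value from xen/interface/xen.h */

struct mmuext_op { unsigned int cmd; };

/* stand-ins for the Xen multicall API, just to show the shape */
static struct mmuext_op slot;
static struct mmuext_op *xen_mc_entry(void) { return &slot; }
static void xen_mc_issue(void) { printf("issue: cmd=%u\n", slot.cmd); }

static void xen_flush_tlb_all_sketch(void)
{
    struct mmuext_op *op = xen_mc_entry(); /* reserve a slot in the batch */

    op->cmd = MMUEXT_TLB_FLUSH_ALL;        /* one hypercall flushes every vCPU */
    xen_mc_issue();                        /* flush now, or defer under lazy MMU */
}

int main(void)
{
    xen_flush_tlb_all_sketch();
    return 0;
}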