summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig23
-rw-r--r--arch/x86/Kconfig.debug19
-rw-r--r--arch/x86/crypto/chacha20_glue.c2
-rw-r--r--arch/x86/crypto/crc32c-intel_glue.c2
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c26
-rw-r--r--arch/x86/entry/calling.h15
-rw-r--r--arch/x86/entry/common.c6
-rw-r--r--arch/x86/entry/entry_32.S15
-rw-r--r--arch/x86/entry/entry_64.S8
-rw-r--r--arch/x86/entry/entry_64_compat.S40
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl1
-rw-r--r--arch/x86/entry/vdso/vclock_gettime.c151
-rw-r--r--arch/x86/entry/vdso/vdso-layout.lds.S3
-rw-r--r--arch/x86/entry/vdso/vdso2c.c3
-rw-r--r--arch/x86/entry/vdso/vdso32/system_call.S54
-rw-r--r--arch/x86/entry/vdso/vma.c14
-rw-r--r--arch/x86/include/asm/apic.h6
-rw-r--r--arch/x86/include/asm/atomic.h1
-rw-r--r--arch/x86/include/asm/atomic64_32.h1
-rw-r--r--arch/x86/include/asm/boot.h2
-rw-r--r--arch/x86/include/asm/calgary.h2
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h2
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h2
-rw-r--r--arch/x86/include/asm/cpu.h3
-rw-r--r--arch/x86/include/asm/cpufeature.h116
-rw-r--r--arch/x86/include/asm/fixmap.h5
-rw-r--r--arch/x86/include/asm/fpu/internal.h174
-rw-r--r--arch/x86/include/asm/fpu/xstate.h11
-rw-r--r--arch/x86/include/asm/intel_pt.h10
-rw-r--r--arch/x86/include/asm/iosf_mbi.h51
-rw-r--r--arch/x86/include/asm/ipi.h2
-rw-r--r--arch/x86/include/asm/jump_label.h63
-rw-r--r--arch/x86/include/asm/lguest.h4
-rw-r--r--arch/x86/include/asm/microcode.h39
-rw-r--r--arch/x86/include/asm/mmu_context.h34
-rw-r--r--arch/x86/include/asm/msi.h6
-rw-r--r--arch/x86/include/asm/msr-index.h1
-rw-r--r--arch/x86/include/asm/msr-trace.h57
-rw-r--r--arch/x86/include/asm/msr.h43
-rw-r--r--arch/x86/include/asm/page_types.h6
-rw-r--r--arch/x86/include/asm/paravirt.h44
-rw-r--r--arch/x86/include/asm/paravirt_types.h40
-rw-r--r--arch/x86/include/asm/pgtable.h15
-rw-r--r--arch/x86/include/asm/processor.h1
-rw-r--r--arch/x86/include/asm/pvclock.h14
-rw-r--r--arch/x86/include/asm/qspinlock_paravirt.h59
-rw-r--r--arch/x86/include/asm/reboot.h1
-rw-r--r--arch/x86/include/asm/smp.h12
-rw-r--r--arch/x86/include/asm/suspend_32.h1
-rw-r--r--arch/x86/include/asm/suspend_64.h1
-rw-r--r--arch/x86/include/asm/uaccess.h9
-rw-r--r--arch/x86/include/asm/vdso.h1
-rw-r--r--arch/x86/include/asm/x86_init.h2
-rw-r--r--arch/x86/include/asm/xen/hypercall.h6
-rw-r--r--arch/x86/include/asm/xor_32.h2
-rw-r--r--arch/x86/include/uapi/asm/mce.h2
-rw-r--r--arch/x86/kernel/apic/apic.c45
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c19
-rw-r--r--arch/x86/kernel/apic/apic_noop.c2
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c7
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c10
-rw-r--r--arch/x86/kernel/apic/ipi.c18
-rw-r--r--arch/x86/kernel/apic/msi.c8
-rw-r--r--arch/x86/kernel/apic/probe_32.c1
-rw-r--r--arch/x86/kernel/apic/vector.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c9
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c9
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c1
-rw-r--r--arch/x86/kernel/asm-offsets.c3
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/cpu/amd.c11
-rw-r--r--arch/x86/kernel/cpu/centaur.c2
-rw-r--r--arch/x86/kernel/cpu/common.c65
-rw-r--r--arch/x86/kernel/cpu/intel.c3
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c6
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c93
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c12
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c16
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c11
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event.c36
-rw-r--r--arch/x86/kernel/cpu/perf_event.h21
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c6
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_uncore.c11
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c115
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c39
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c42
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_pt.c9
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_rapl.c25
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c17
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.h3
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c635
-rw-r--r--arch/x86/kernel/cpu/rdrand.c25
-rw-r--r--arch/x86/kernel/cpu/scattered.c20
-rw-r--r--arch/x86/kernel/cpu/transmeta.c4
-rw-r--r--arch/x86/kernel/cpuid.c24
-rw-r--r--arch/x86/kernel/crash.c11
-rw-r--r--arch/x86/kernel/fpu/init.c182
-rw-r--r--arch/x86/kernel/fpu/xstate.c8
-rw-r--r--arch/x86/kernel/ftrace.c21
-rw-r--r--arch/x86/kernel/hw_breakpoint.c6
-rw-r--r--arch/x86/kernel/kvmclock.c11
-rw-r--r--arch/x86/kernel/livepatch.c29
-rw-r--r--arch/x86/kernel/msr.c24
-rw-r--r--arch/x86/kernel/nmi.c32
-rw-r--r--arch/x86/kernel/paravirt.c36
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c2
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c3
-rw-r--r--arch/x86/kernel/pci-calgary_64.c4
-rw-r--r--arch/x86/kernel/pci-swiotlb.c2
-rw-r--r--arch/x86/kernel/process_64.c2
-rw-r--r--arch/x86/kernel/ptrace.c15
-rw-r--r--arch/x86/kernel/pvclock.c24
-rw-r--r--arch/x86/kernel/reboot.c38
-rw-r--r--arch/x86/kernel/rtc.c3
-rw-r--r--arch/x86/kernel/setup.c2
-rw-r--r--arch/x86/kernel/smp.c4
-rw-r--r--arch/x86/kernel/smpboot.c9
-rw-r--r--arch/x86/kernel/tsc.c2
-rw-r--r--arch/x86/kernel/verify_cpu.S50
-rw-r--r--arch/x86/kernel/vm86_32.c4
-rw-r--r--arch/x86/kernel/x86_init.c1
-rw-r--r--arch/x86/kvm/cpuid.h42
-rw-r--r--arch/x86/kvm/i8254.c1
-rw-r--r--arch/x86/kvm/mtrr.c25
-rw-r--r--arch/x86/kvm/svm.c21
-rw-r--r--arch/x86/kvm/vmx.c7
-rw-r--r--arch/x86/kvm/x86.c13
-rw-r--r--arch/x86/lguest/boot.c2
-rw-r--r--arch/x86/lib/Makefile2
-rw-r--r--arch/x86/lib/cpu.c35
-rw-r--r--arch/x86/lib/msr.c26
-rw-r--r--arch/x86/mm/Makefile1
-rw-r--r--arch/x86/mm/debug_pagetables.c46
-rw-r--r--arch/x86/mm/dump_pagetables.c36
-rw-r--r--arch/x86/mm/init_64.c3
-rw-r--r--arch/x86/mm/ioremap.c4
-rw-r--r--arch/x86/mm/pageattr.c13
-rw-r--r--arch/x86/mm/pat.c12
-rw-r--r--arch/x86/mm/pat_rbtree.c52
-rw-r--r--arch/x86/mm/pgtable.c7
-rw-r--r--arch/x86/mm/setup_nx.c4
-rw-r--r--arch/x86/mm/srat.c2
-rw-r--r--arch/x86/mm/tlb.c29
-rw-r--r--arch/x86/net/bpf_jit_comp.c40
-rw-r--r--arch/x86/platform/atom/punit_atom_debug.c7
-rw-r--r--arch/x86/platform/intel-quark/imr.c28
-rw-r--r--arch/x86/platform/uv/uv_nmi.c1
-rw-r--r--arch/x86/power/cpu.c92
-rw-r--r--arch/x86/um/Makefile2
-rw-r--r--arch/x86/um/asm/syscall.h1
-rw-r--r--arch/x86/um/ptrace_32.c8
-rw-r--r--arch/x86/um/signal.c2
-rw-r--r--arch/x86/xen/apic.c2
-rw-r--r--arch/x86/xen/enlighten.c26
-rw-r--r--arch/x86/xen/mmu.c10
-rw-r--r--arch/x86/xen/suspend.c24
-rw-r--r--arch/x86/xen/time.c115
-rw-r--r--arch/x86/xen/xen-asm_32.S14
-rw-r--r--arch/x86/xen/xen-asm_64.S19
-rw-r--r--arch/x86/xen/xen-ops.h3
164 files changed, 2607 insertions, 1321 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index db3622f22b61..5d2293417946 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -24,6 +24,7 @@ config X86
select ARCH_DISCARD_MEMBLOCK
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
+ select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_GCOV_PROFILE_ALL
@@ -349,6 +350,17 @@ config X86_FEATURE_NAMES
If in doubt, say Y.
+config X86_FAST_FEATURE_TESTS
+ bool "Fast CPU feature tests" if EMBEDDED
+ default y
+ ---help---
+ Some fast-paths in the kernel depend on the capabilities of the CPU.
+ Say Y here for the kernel to patch in the appropriate code at runtime
+ based on the capabilities of the CPU. The infrastructure for patching
+ code at runtime takes up some additional space; space-constrained
+ embedded systems may wish to say N here to produce smaller, slightly
+ slower code.
+
config X86_X2APIC
bool "Support x2apic"
depends on X86_LOCAL_APIC && X86_64 && (IRQ_REMAP || HYPERVISOR_GUEST)
@@ -523,9 +535,10 @@ config X86_INTEL_QUARK
config X86_INTEL_LPSS
bool "Intel Low Power Subsystem Support"
- depends on ACPI
+ depends on X86 && ACPI
select COMMON_CLK
select PINCTRL
+ select IOSF_MBI
---help---
Select to build support for Intel Low Power Subsystem such as
found on Intel Lynxpoint PCH. Selecting this option enables
@@ -687,6 +700,14 @@ config PARAVIRT_SPINLOCKS
If you are unsure how to answer this question, answer Y.
+config QUEUED_LOCK_STAT
+ bool "Paravirt queued spinlock statistics"
+ depends on PARAVIRT_SPINLOCKS && DEBUG_FS && QUEUED_SPINLOCKS
+ ---help---
+ Enable the collection of statistical data on the slowpath
+ behavior of paravirtualized queued spinlocks and report
+ them on debugfs.
+
source "arch/x86/xen/Kconfig"
config KVM_GUEST
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 137dfa96aa14..9b18ed97a8a2 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -5,23 +5,6 @@ config TRACE_IRQFLAGS_SUPPORT
source "lib/Kconfig.debug"
-config STRICT_DEVMEM
- bool "Filter access to /dev/mem"
- ---help---
- If this option is disabled, you allow userspace (root) access to all
- of memory, including kernel and userspace memory. Accidental
- access to this is obviously disastrous, but specific access can
- be used by people debugging the kernel. Note that with PAT support
- enabled, even in this case there are restrictions on /dev/mem
- use due to the cache aliasing requirements.
-
- If this option is switched on, the /dev/mem file only allows
- userspace access to PCI space and the BIOS code and data regions.
- This is sufficient for dosemu and X and all common users of
- /dev/mem.
-
- If in doubt, say Y.
-
config X86_VERBOSE_BOOTUP
bool "Enable verbose x86 bootup info messages"
default y
@@ -69,7 +52,7 @@ config X86_PTDUMP_CORE
def_bool n
config X86_PTDUMP
- bool "Export kernel pagetable layout to userspace via debugfs"
+ tristate "Export kernel pagetable layout to userspace via debugfs"
depends on DEBUG_KERNEL
select DEBUG_FS
select X86_PTDUMP_CORE
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 722bacea040e..8baaff5af0b5 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -125,7 +125,7 @@ static struct crypto_alg alg = {
static int __init chacha20_simd_mod_init(void)
{
- if (!cpu_has_ssse3)
+ if (!boot_cpu_has(X86_FEATURE_SSSE3))
return -ENODEV;
#ifdef CONFIG_AS_AVX2
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index 81a595d75cf5..0e9871693f24 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -257,7 +257,7 @@ static int __init crc32c_intel_mod_init(void)
if (!x86_match_cpu(crc32c_cpu_id))
return -ENODEV;
#ifdef CONFIG_X86_64
- if (cpu_has_pclmulqdq) {
+ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
alg.update = crc32c_pcl_intel_update;
alg.finup = crc32c_pcl_intel_finup;
alg.digest = crc32c_pcl_intel_digest;
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 440df0c7a2ee..a69321a77783 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -219,6 +219,29 @@ static int ghash_async_final(struct ahash_request *req)
}
}
+static int ghash_async_import(struct ahash_request *req, const void *in)
+{
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ ghash_async_init(req);
+ memcpy(dctx, in, sizeof(*dctx));
+ return 0;
+
+}
+
+static int ghash_async_export(struct ahash_request *req, void *out)
+{
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ memcpy(out, dctx, sizeof(*dctx));
+ return 0;
+
+}
+
static int ghash_async_digest(struct ahash_request *req)
{
struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
@@ -288,8 +311,11 @@ static struct ahash_alg ghash_async_alg = {
.final = ghash_async_final,
.setkey = ghash_async_setkey,
.digest = ghash_async_digest,
+ .export = ghash_async_export,
+ .import = ghash_async_import,
.halg = {
.digestsize = GHASH_DIGEST_SIZE,
+ .statesize = sizeof(struct ghash_desc_ctx),
.base = {
.cra_name = "ghash",
.cra_driver_name = "ghash-clmulni",
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 3c71dd947c7b..e32206e09868 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -1,3 +1,5 @@
+#include <linux/jump_label.h>
+
/*
x86 function call convention, 64-bit:
@@ -232,3 +234,16 @@ For 32-bit we have the following conventions - kernel is built with
#endif /* CONFIG_X86_64 */
+/*
+ * This does 'call enter_from_user_mode' unless we can avoid it based on
+ * kernel config or using the static jump infrastructure.
+ */
+.macro CALL_enter_from_user_mode
+#ifdef CONFIG_CONTEXT_TRACKING
+#ifdef HAVE_JUMP_LABEL
+ STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0
+#endif
+ call enter_from_user_mode
+.Lafter_call_\@:
+#endif
+.endm
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index a89fdbc1f0be..03663740c866 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -421,7 +421,7 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
regs->ip = landing_pad;
/*
- * Fetch ECX from where the vDSO stashed it.
+ * Fetch EBP from where the vDSO stashed it.
*
* WARNING: We are in CONTEXT_USER and RCU isn't paying attention!
*/
@@ -432,10 +432,10 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
* Micro-optimization: the pointer we're following is explicitly
* 32 bits, so it can't be out of range.
*/
- __get_user(*(u32 *)&regs->cx,
+ __get_user(*(u32 *)&regs->bp,
(u32 __user __force *)(unsigned long)(u32)regs->sp)
#else
- get_user(*(u32 *)&regs->cx,
+ get_user(*(u32 *)&regs->bp,
(u32 __user __force *)(unsigned long)(u32)regs->sp)
#endif
) {
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 3eb572ed3d7a..77d8c5112900 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -292,7 +292,7 @@ ENTRY(entry_SYSENTER_32)
movl TSS_sysenter_sp0(%esp), %esp
sysenter_past_esp:
pushl $__USER_DS /* pt_regs->ss */
- pushl %ecx /* pt_regs->cx */
+ pushl %ebp /* pt_regs->sp (stashed in bp) */
pushfl /* pt_regs->flags (except IF = 0) */
orl $X86_EFLAGS_IF, (%esp) /* Fix IF */
pushl $__USER_CS /* pt_regs->cs */
@@ -308,8 +308,9 @@ sysenter_past_esp:
movl %esp, %eax
call do_fast_syscall_32
- testl %eax, %eax
- jz .Lsyscall_32_done
+ /* XEN PV guests always use IRET path */
+ ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
+ "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
/* Opportunistic SYSEXIT */
TRACE_IRQS_ON /* User mode traces as IRQs on. */
@@ -328,7 +329,8 @@ sysenter_past_esp:
* Return back to the vDSO, which will pop ecx and edx.
* Don't bother with DS and ES (they already contain __USER_DS).
*/
- ENABLE_INTERRUPTS_SYSEXIT
+ sti
+ sysexit
.pushsection .fixup, "ax"
2: movl $0, PT_FS(%esp)
@@ -551,11 +553,6 @@ ENTRY(native_iret)
iret
_ASM_EXTABLE(native_iret, iret_exc)
END(native_iret)
-
-ENTRY(native_irq_enable_sysexit)
- sti
- sysexit
-END(native_irq_enable_sysexit)
#endif
ENTRY(overflow)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index a55697d19824..9d34d3cfceb6 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -520,9 +520,7 @@ END(irq_entries_start)
*/
TRACE_IRQS_OFF
-#ifdef CONFIG_CONTEXT_TRACKING
- call enter_from_user_mode
-#endif
+ CALL_enter_from_user_mode
1:
/*
@@ -1066,9 +1064,7 @@ ENTRY(error_entry)
* (which can take locks).
*/
TRACE_IRQS_OFF
-#ifdef CONFIG_CONTEXT_TRACKING
- call enter_from_user_mode
-#endif
+ CALL_enter_from_user_mode
ret
.Lerror_entry_done:
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index c3201830a85e..ff1c6d61f332 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -18,13 +18,6 @@
.section .entry.text, "ax"
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_usergs_sysret32)
- swapgs
- sysretl
-ENDPROC(native_usergs_sysret32)
-#endif
-
/*
* 32-bit SYSENTER instruction entry.
*
@@ -63,7 +56,7 @@ ENTRY(entry_SYSENTER_compat)
/* Construct struct pt_regs on stack */
pushq $__USER32_DS /* pt_regs->ss */
- pushq %rcx /* pt_regs->sp */
+ pushq %rbp /* pt_regs->sp (stashed in bp) */
/*
* Push flags. This is nasty. First, interrupts are currently
@@ -82,14 +75,14 @@ ENTRY(entry_SYSENTER_compat)
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */
- pushq %rcx /* pt_regs->cx (will be overwritten) */
+ pushq %rcx /* pt_regs->cx */
pushq $-ENOSYS /* pt_regs->ax */
pushq %r8 /* pt_regs->r8 = 0 */
pushq %r8 /* pt_regs->r9 = 0 */
pushq %r8 /* pt_regs->r10 = 0 */
pushq %r8 /* pt_regs->r11 = 0 */
pushq %rbx /* pt_regs->rbx */
- pushq %rbp /* pt_regs->rbp */
+ pushq %rbp /* pt_regs->rbp (will be overwritten) */
pushq %r8 /* pt_regs->r12 = 0 */
pushq %r8 /* pt_regs->r13 = 0 */
pushq %r8 /* pt_regs->r14 = 0 */
@@ -103,15 +96,15 @@ ENTRY(entry_SYSENTER_compat)
* This needs to happen before enabling interrupts so that
* we don't get preempted with NT set.
*
- * NB.: sysenter_fix_flags is a label with the code under it moved
+ * NB.: .Lsysenter_fix_flags is a label with the code under it moved
* out-of-line as an optimization: NT is unlikely to be set in the
* majority of the cases and instead of polluting the I$ unnecessarily,
* we're keeping that code behind a branch which will predict as
* not-taken and therefore its instructions won't be fetched.
*/
testl $X86_EFLAGS_NT, EFLAGS(%rsp)
- jnz sysenter_fix_flags
-sysenter_flags_fixed:
+ jnz .Lsysenter_fix_flags
+.Lsysenter_flags_fixed:
/*
* User mode is traced as though IRQs are on, and SYSENTER
@@ -121,14 +114,15 @@ sysenter_flags_fixed:
movq %rsp, %rdi
call do_fast_syscall_32
- testl %eax, %eax
- jz .Lsyscall_32_done
+ /* XEN PV guests always use IRET path */
+ ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
+ "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
jmp sysret32_from_system_call
-sysenter_fix_flags:
+.Lsysenter_fix_flags:
pushq $X86_EFLAGS_FIXED
popfq
- jmp sysenter_flags_fixed
+ jmp .Lsysenter_flags_fixed
ENDPROC(entry_SYSENTER_compat)
/*
@@ -178,7 +172,7 @@ ENTRY(entry_SYSCALL_compat)
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */
- pushq %rcx /* pt_regs->cx (will be overwritten) */
+ pushq %rbp /* pt_regs->cx (stashed in bp) */
pushq $-ENOSYS /* pt_regs->ax */
xorq %r8,%r8
pushq %r8 /* pt_regs->r8 = 0 */
@@ -186,7 +180,7 @@ ENTRY(entry_SYSCALL_compat)
pushq %r8 /* pt_regs->r10 = 0 */
pushq %r8 /* pt_regs->r11 = 0 */
pushq %rbx /* pt_regs->rbx */
- pushq %rbp /* pt_regs->rbp */
+ pushq %rbp /* pt_regs->rbp (will be overwritten) */
pushq %r8 /* pt_regs->r12 = 0 */
pushq %r8 /* pt_regs->r13 = 0 */
pushq %r8 /* pt_regs->r14 = 0 */
@@ -200,8 +194,9 @@ ENTRY(entry_SYSCALL_compat)
movq %rsp, %rdi
call do_fast_syscall_32
- testl %eax, %eax
- jz .Lsyscall_32_done
+ /* XEN PV guests always use IRET path */
+ ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
+ "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
/* Opportunistic SYSRET */
sysret32_from_system_call:
@@ -236,7 +231,8 @@ sysret32_from_system_call:
xorq %r9, %r9
xorq %r10, %r10
movq RSP-ORIG_RAX(%rsp), %rsp
- USERGS_SYSRET32
+ swapgs
+ sysretl
END(entry_SYSCALL_compat)
/*
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index f17705e1332c..cb713df81180 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -383,3 +383,4 @@
374 i386 userfaultfd sys_userfaultfd
375 i386 membarrier sys_membarrier
376 i386 mlock2 sys_mlock2
+377 i386 copy_file_range sys_copy_file_range
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 314a90bfc09c..dc1040a50bdc 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -332,6 +332,7 @@
323 common userfaultfd sys_userfaultfd
324 common membarrier sys_membarrier
325 common mlock2 sys_mlock2
+326 common copy_file_range sys_copy_file_range
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index ca94fa649251..1a50e09c945b 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -17,8 +17,10 @@
#include <asm/vvar.h>
#include <asm/unistd.h>
#include <asm/msr.h>
+#include <asm/pvclock.h>
#include <linux/math64.h>
#include <linux/time.h>
+#include <linux/kernel.h>
#define gtod (&VVAR(vsyscall_gtod_data))
@@ -36,12 +38,12 @@ static notrace cycle_t vread_hpet(void)
}
#endif
-#ifndef BUILD_VDSO32
+#ifdef CONFIG_PARAVIRT_CLOCK
+extern u8 pvclock_page
+ __attribute__((visibility("hidden")));
+#endif
-#include <linux/kernel.h>
-#include <asm/vsyscall.h>
-#include <asm/fixmap.h>
-#include <asm/pvclock.h>
+#ifndef BUILD_VDSO32
notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
{
@@ -60,75 +62,6 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
return ret;
}
-#ifdef CONFIG_PARAVIRT_CLOCK
-
-static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
-{
- const struct pvclock_vsyscall_time_info *pvti_base;
- int idx = cpu / (PAGE_SIZE/PVTI_SIZE);
- int offset = cpu % (PAGE_SIZE/PVTI_SIZE);
-
- BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END);
-
- pvti_base = (struct pvclock_vsyscall_time_info *)
- __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx);
-
- return &pvti_base[offset];
-}
-
-static notrace cycle_t vread_pvclock(int *mode)
-{
- const struct pvclock_vsyscall_time_info *pvti;
- cycle_t ret;
- u64 last;
- u32 version;
- u8 flags;
- unsigned cpu, cpu1;
-
-
- /*
- * Note: hypervisor must guarantee that:
- * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
- * 2. that per-CPU pvclock time info is updated if the
- * underlying CPU changes.
- * 3. that version is increased whenever underlying CPU
- * changes.
- *
- */
- do {
- cpu = __getcpu() & VGETCPU_CPU_MASK;
- /* TODO: We can put vcpu id into higher bits of pvti.version.
- * This will save a couple of cycles by getting rid of
- * __getcpu() calls (Gleb).
- */
-
- pvti = get_pvti(cpu);
-
- version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
-
- /*
- * Test we're still on the cpu as well as the version.
- * We could have been migrated just after the first
- * vgetcpu but before fetching the version, so we
- * wouldn't notice a version change.
- */
- cpu1 = __getcpu() & VGETCPU_CPU_MASK;
- } while (unlikely(cpu != cpu1 ||
- (pvti->pvti.version & 1) ||
- pvti->pvti.version != version));
-
- if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
- *mode = VCLOCK_NONE;
-
- /* refer to tsc.c read_tsc() comment for rationale */
- last = gtod->cycle_last;
-
- if (likely(ret >= last))
- return ret;
-
- return last;
-}
-#endif
#else
@@ -162,15 +95,77 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
return ret;
}
+#endif
+
#ifdef CONFIG_PARAVIRT_CLOCK
+static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void)
+{
+ return (const struct pvclock_vsyscall_time_info *)&pvclock_page;
+}
static notrace cycle_t vread_pvclock(int *mode)
{
- *mode = VCLOCK_NONE;
- return 0;
-}
-#endif
+ const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti;
+ cycle_t ret;
+ u64 tsc, pvti_tsc;
+ u64 last, delta, pvti_system_time;
+ u32 version, pvti_tsc_to_system_mul, pvti_tsc_shift;
+ /*
+ * Note: The kernel and hypervisor must guarantee that cpu ID
+ * number maps 1:1 to per-CPU pvclock time info.
+ *
+ * Because the hypervisor is entirely unaware of guest userspace
+ * preemption, it cannot guarantee that per-CPU pvclock time
+ * info is updated if the underlying CPU changes or that that
+ * version is increased whenever underlying CPU changes.
+ *
+ * On KVM, we are guaranteed that pvti updates for any vCPU are
+ * atomic as seen by *all* vCPUs. This is an even stronger
+ * guarantee than we get with a normal seqlock.
+ *
+ * On Xen, we don't appear to have that guarantee, but Xen still
+ * supplies a valid seqlock using the version field.
+ *
+ * We only do pvclock vdso timing at all if
+ * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
+ * mean that all vCPUs have matching pvti and that the TSC is
+ * synced, so we can just look at vCPU 0's pvti.
+ */
+
+ do {
+ version = pvti->version;
+
+ smp_rmb();
+
+ if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
+ *mode = VCLOCK_NONE;
+ return 0;
+ }
+
+ tsc = rdtsc_ordered();
+ pvti_tsc_to_system_mul = pvti->tsc_to_system_mul;
+ pvti_tsc_shift = pvti->tsc_shift;
+ pvti_system_time = pvti->system_time;
+ pvti_tsc = pvti->tsc_timestamp;
+
+ /* Make sure that the version double-check is last. */
+ smp_rmb();
+ } while (unlikely((version & 1) || version != pvti->version));
+
+ delta = tsc - pvti_tsc;
+ ret = pvti_system_time +
+ pvclock_scale_delta(delta, pvti_tsc_to_system_mul,
+ pvti_tsc_shift);
+
+ /* refer to vread_tsc() comment for rationale */
+ last = gtod->cycle_last;
+
+ if (likely(ret >= last))
+ return ret;
+
+ return last;
+}
#endif
notrace static cycle_t vread_tsc(void)
diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
index de2c921025f5..4158acc17df0 100644
--- a/arch/x86/entry/vdso/vdso-layout.lds.S
+++ b/arch/x86/entry/vdso/vdso-layout.lds.S
@@ -25,7 +25,7 @@ SECTIONS
* segment.
*/
- vvar_start = . - 2 * PAGE_SIZE;
+ vvar_start = . - 3 * PAGE_SIZE;
vvar_page = vvar_start;
/* Place all vvars at the offsets in asm/vvar.h. */
@@ -36,6 +36,7 @@ SECTIONS
#undef EMIT_VVAR
hpet_page = vvar_start + PAGE_SIZE;
+ pvclock_page = vvar_start + 2 * PAGE_SIZE;
. = SIZEOF_HEADERS;
diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
index 785d9922b106..491020b2826d 100644
--- a/arch/x86/entry/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -73,6 +73,7 @@ enum {
sym_vvar_start,
sym_vvar_page,
sym_hpet_page,
+ sym_pvclock_page,
sym_VDSO_FAKE_SECTION_TABLE_START,
sym_VDSO_FAKE_SECTION_TABLE_END,
};
@@ -80,6 +81,7 @@ enum {
const int special_pages[] = {
sym_vvar_page,
sym_hpet_page,
+ sym_pvclock_page,
};
struct vdso_sym {
@@ -91,6 +93,7 @@ struct vdso_sym required_syms[] = {
[sym_vvar_start] = {"vvar_start", true},
[sym_vvar_page] = {"vvar_page", true},
[sym_hpet_page] = {"hpet_page", true},
+ [sym_pvclock_page] = {"pvclock_page", true},
[sym_VDSO_FAKE_SECTION_TABLE_START] = {
"VDSO_FAKE_SECTION_TABLE_START", false
},
diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S
index 93bd8452383f..3a1d9297074b 100644
--- a/arch/x86/entry/vdso/vdso32/system_call.S
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -1,5 +1,5 @@
/*
- * Code for the vDSO. This version uses the old int $0x80 method.
+ * AT_SYSINFO entry point
*/
#include <asm/dwarf2.h>
@@ -21,35 +21,67 @@ __kernel_vsyscall:
/*
* Reshuffle regs so that all of any of the entry instructions
* will preserve enough state.
+ *
+ * A really nice entry sequence would be:
+ * pushl %edx
+ * pushl %ecx
+ * movl %esp, %ecx
+ *
+ * Unfortunately, naughty Android versions between July and December
+ * 2015 actually hardcode the traditional Linux SYSENTER entry
+ * sequence. That is severely broken for a number of reasons (ask
+ * anyone with an AMD CPU, for example). Nonetheless, we try to keep
+ * it working approximately as well as it ever worked.
+ *
+ * This link may eludicate some of the history:
+ * https://android-review.googlesource.com/#/q/Iac3295376d61ef83e713ac9b528f3b50aa780cd7
+ * personally, I find it hard to understand what's going on there.
+ *
+ * Note to future user developers: DO NOT USE SYSENTER IN YOUR CODE.
+ * Execute an indirect call to the address in the AT_SYSINFO auxv
+ * entry. That is the ONLY correct way to make a fast 32-bit system
+ * call on Linux. (Open-coding int $0x80 is also fine, but it's
+ * slow.)
*/
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx, 0
pushl %edx
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET edx, 0
- pushl %ecx
+ pushl %ebp
CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ecx, 0
- movl %esp, %ecx
+ CFI_REL_OFFSET ebp, 0
+
+ #define SYSENTER_SEQUENCE "movl %esp, %ebp; sysenter"
+ #define SYSCALL_SEQUENCE "movl %ecx, %ebp; syscall"
#ifdef CONFIG_X86_64
/* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */
- ALTERNATIVE_2 "", "sysenter", X86_FEATURE_SYSENTER32, \
- "syscall", X86_FEATURE_SYSCALL32
+ ALTERNATIVE_2 "", SYSENTER_SEQUENCE, X86_FEATURE_SYSENTER32, \
+ SYSCALL_SEQUENCE, X86_FEATURE_SYSCALL32
#else
- ALTERNATIVE "", "sysenter", X86_FEATURE_SEP
+ ALTERNATIVE "", SYSENTER_SEQUENCE, X86_FEATURE_SEP
#endif
/* Enter using int $0x80 */
- movl (%esp), %ecx
int $0x80
GLOBAL(int80_landing_pad)
- /* Restore ECX and EDX in case they were clobbered. */
- popl %ecx
- CFI_RESTORE ecx
+ /*
+ * Restore EDX and ECX in case they were clobbered. EBP is not
+ * clobbered (the kernel restores it), but it's cleaner and
+ * probably faster to pop it than to adjust ESP using addl.
+ */
+ popl %ebp
+ CFI_RESTORE ebp
CFI_ADJUST_CFA_OFFSET -4
popl %edx
CFI_RESTORE edx
CFI_ADJUST_CFA_OFFSET -4
+ popl %ecx
+ CFI_RESTORE ecx
+ CFI_ADJUST_CFA_OFFSET -4
ret
CFI_ENDPROC
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 64df47148160..b8f69e264ac4 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -12,6 +12,7 @@
#include <linux/random.h>
#include <linux/elf.h>
#include <linux/cpu.h>
+#include <asm/pvclock.h>
#include <asm/vgtod.h>
#include <asm/proto.h>
#include <asm/vdso.h>
@@ -100,6 +101,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
.name = "[vvar]",
.pages = no_pages,
};
+ struct pvclock_vsyscall_time_info *pvti;
if (calculate_addr) {
addr = vdso_addr(current->mm->start_stack,
@@ -169,6 +171,18 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
}
#endif
+ pvti = pvclock_pvti_cpu0_va();
+ if (pvti && image->sym_pvclock_page) {
+ ret = remap_pfn_range(vma,
+ text_start + image->sym_pvclock_page,
+ __pa(pvti) >> PAGE_SHIFT,
+ PAGE_SIZE,
+ PAGE_READONLY);
+
+ if (ret)
+ goto up_fail;
+ }
+
up_fail:
if (ret)
current->mm->context.vdso = NULL;
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index a30316bf801a..c80f6b6f3da2 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -23,6 +23,11 @@
#define APIC_VERBOSE 1
#define APIC_DEBUG 2
+/* Macros for apic_extnmi which controls external NMI masking */
+#define APIC_EXTNMI_BSP 0 /* Default */
+#define APIC_EXTNMI_ALL 1
+#define APIC_EXTNMI_NONE 2
+
/*
* Define the default level of output to be very little
* This can be turned up by using apic=verbose for more
@@ -303,6 +308,7 @@ struct apic {
unsigned int *apicid);
/* ipi */
+ void (*send_IPI)(int cpu, int vector);
void (*send_IPI_mask)(const struct cpumask *mask, int vector);
void (*send_IPI_mask_allbutself)(const struct cpumask *mask,
int vector);
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index ae5fb83e6d91..3e8674288198 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -3,7 +3,6 @@
#include <linux/compiler.h>
#include <linux/types.h>
-#include <asm/processor.h>
#include <asm/alternative.h>
#include <asm/cmpxchg.h>
#include <asm/rmwcc.h>
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index a11c30b77fb5..a984111135b1 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -3,7 +3,6 @@
#include <linux/compiler.h>
#include <linux/types.h>
-#include <asm/processor.h>
//#include <asm/cmpxchg.h>
/* An 64bit atomic type */
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 4fa687a47a62..6b8d6e8cd449 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -27,7 +27,7 @@
#define BOOT_HEAP_SIZE 0x400000
#else /* !CONFIG_KERNEL_BZIP2 */
-#define BOOT_HEAP_SIZE 0x8000
+#define BOOT_HEAP_SIZE 0x10000
#endif /* !CONFIG_KERNEL_BZIP2 */
diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h
index 0d467b338835..a8303ebe089f 100644
--- a/arch/x86/include/asm/calgary.h
+++ b/arch/x86/include/asm/calgary.h
@@ -31,7 +31,7 @@
#include <asm/types.h>
struct iommu_table {
- struct cal_chipset_ops *chip_ops; /* chipset specific funcs */
+ const struct cal_chipset_ops *chip_ops; /* chipset specific funcs */
unsigned long it_base; /* mapped address of tce table */
unsigned long it_hint; /* Hint for next alloc */
unsigned long *it_map; /* A simple allocation bitmap for now */
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index f7e142926481..e4959d023af8 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -109,6 +109,6 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
#endif
-#define system_has_cmpxchg_double() cpu_has_cx8
+#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX8)
#endif /* _ASM_X86_CMPXCHG_32_H */
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 1af94697aae5..caa23a34c963 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -18,6 +18,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 val)
cmpxchg_local((ptr), (o), (n)); \
})
-#define system_has_cmpxchg_double() cpu_has_cx16
+#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16)
#endif /* _ASM_X86_CMPXCHG_64_H */
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index bf2caa1dedc5..678637ad7476 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -36,4 +36,7 @@ extern int _debug_hotplug_cpu(int cpu, int action);
int mwait_usable(const struct cpuinfo_x86 *);
+unsigned int x86_family(unsigned int sig);
+unsigned int x86_model(unsigned int sig);
+unsigned int x86_stepping(unsigned int sig);
#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index e4f8010f22e0..7ad8c9464297 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -12,7 +12,7 @@
#include <asm/disabled-features.h>
#endif
-#define NCAPINTS 14 /* N 32-bit words worth of info */
+#define NCAPINTS 16 /* N 32-bit words worth of info */
#define NBUGINTS 1 /* N 32-bit bug flags */
/*
@@ -181,22 +181,17 @@
/*
* Auxiliary flags: Linux defined - For features scattered in various
- * CPUID levels like 0x6, 0xA etc, word 7
+ * CPUID levels like 0x6, 0xA etc, word 7.
+ *
+ * Reuse free bits when adding new feature flags!
*/
-#define X86_FEATURE_IDA ( 7*32+ 0) /* Intel Dynamic Acceleration */
-#define X86_FEATURE_ARAT ( 7*32+ 1) /* Always Running APIC Timer */
+
#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
-#define X86_FEATURE_PLN ( 7*32+ 5) /* Intel Power Limit Notification */
-#define X86_FEATURE_PTS ( 7*32+ 6) /* Intel Package Thermal Status */
-#define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */
+
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
-#define X86_FEATURE_HWP ( 7*32+ 10) /* "hwp" Intel HWP */
-#define X86_FEATURE_HWP_NOTIFY ( 7*32+ 11) /* Intel HWP_NOTIFY */
-#define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */
-#define X86_FEATURE_HWP_EPP ( 7*32+13) /* Intel HWP_EPP */
-#define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */
+
#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
/* Virtualization flags: Linux defined, word 8 */
@@ -205,17 +200,9 @@
#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */
#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */
-#define X86_FEATURE_NPT ( 8*32+ 5) /* AMD Nested Page Table support */
-#define X86_FEATURE_LBRV ( 8*32+ 6) /* AMD LBR Virtualization support */
-#define X86_FEATURE_SVML ( 8*32+ 7) /* "svm_lock" AMD SVM locking MSR */
-#define X86_FEATURE_NRIPS ( 8*32+ 8) /* "nrip_save" AMD SVM next_rip save */
-#define X86_FEATURE_TSCRATEMSR ( 8*32+ 9) /* "tsc_scale" AMD TSC scaling support */
-#define X86_FEATURE_VMCBCLEAN ( 8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */
-#define X86_FEATURE_FLUSHBYASID ( 8*32+11) /* AMD flush-by-ASID support */
-#define X86_FEATURE_DECODEASSISTS ( 8*32+12) /* AMD Decode Assists support */
-#define X86_FEATURE_PAUSEFILTER ( 8*32+13) /* AMD filtered pause intercept */
-#define X86_FEATURE_PFTHRESHOLD ( 8*32+14) /* AMD pause filter threshold */
+
#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */
+#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
@@ -258,6 +245,30 @@
/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */
+/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */
+#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
+#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */
+#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */
+#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */
+#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */
+#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */
+#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */
+#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */
+#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */
+#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
+
+/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */
+#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */
+#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */
+#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */
+#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */
+#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */
+#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
+#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */
+#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */
+#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
+#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
+
/*
* BUG word(s)
*/
@@ -278,6 +289,26 @@
#include <asm/asm.h>
#include <linux/bitops.h>
+enum cpuid_leafs
+{
+ CPUID_1_EDX = 0,
+ CPUID_8000_0001_EDX,
+ CPUID_8086_0001_EDX,
+ CPUID_LNX_1,
+ CPUID_1_ECX,
+ CPUID_C000_0001_EDX,
+ CPUID_8000_0001_ECX,
+ CPUID_LNX_2,
+ CPUID_LNX_3,
+ CPUID_7_0_EBX,
+ CPUID_D_1_EAX,
+ CPUID_F_0_EDX,
+ CPUID_F_1_EDX,
+ CPUID_8000_0008_EBX,
+ CPUID_6_EAX,
+ CPUID_8000_000A_EDX,
+};
+
#ifdef CONFIG_X86_FEATURE_NAMES
extern const char * const x86_cap_flags[NCAPINTS*32];
extern const char * const x86_power_flags[32];
@@ -355,60 +386,31 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
} while (0)
#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU)
-#define cpu_has_de boot_cpu_has(X86_FEATURE_DE)
#define cpu_has_pse boot_cpu_has(X86_FEATURE_PSE)
#define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC)
#define cpu_has_pge boot_cpu_has(X86_FEATURE_PGE)
#define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC)
-#define cpu_has_sep boot_cpu_has(X86_FEATURE_SEP)
-#define cpu_has_mtrr boot_cpu_has(X86_FEATURE_MTRR)
-#define cpu_has_mmx boot_cpu_has(X86_FEATURE_MMX)
#define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR)
#define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM)
#define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2)
-#define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3)
-#define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3)
#define cpu_has_aes boot_cpu_has(X86_FEATURE_AES)
#define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX)
#define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2)
-#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT)
-#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX)
-#define cpu_has_xstore boot_cpu_has(X86_FEATURE_XSTORE)
-#define cpu_has_xstore_enabled boot_cpu_has(X86_FEATURE_XSTORE_EN)
-#define cpu_has_xcrypt boot_cpu_has(X86_FEATURE_XCRYPT)
-#define cpu_has_xcrypt_enabled boot_cpu_has(X86_FEATURE_XCRYPT_EN)
-#define cpu_has_ace2 boot_cpu_has(X86_FEATURE_ACE2)
-#define cpu_has_ace2_enabled boot_cpu_has(X86_FEATURE_ACE2_EN)
-#define cpu_has_phe boot_cpu_has(X86_FEATURE_PHE)
-#define cpu_has_phe_enabled boot_cpu_has(X86_FEATURE_PHE_EN)
-#define cpu_has_pmm boot_cpu_has(X86_FEATURE_PMM)
-#define cpu_has_pmm_enabled boot_cpu_has(X86_FEATURE_PMM_EN)
-#define cpu_has_ds boot_cpu_has(X86_FEATURE_DS)
-#define cpu_has_pebs boot_cpu_has(X86_FEATURE_PEBS)
#define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLUSH)
-#define cpu_has_bts boot_cpu_has(X86_FEATURE_BTS)
#define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES)
#define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON)
#define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT)
-#define cpu_has_xmm4_1 boot_cpu_has(X86_FEATURE_XMM4_1)
-#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2)
#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC)
#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
-#define cpu_has_xsaveopt boot_cpu_has(X86_FEATURE_XSAVEOPT)
#define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES)
#define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE)
#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
-#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)
-#define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE)
-#define cpu_has_perfctr_nb boot_cpu_has(X86_FEATURE_PERFCTR_NB)
-#define cpu_has_perfctr_l2 boot_cpu_has(X86_FEATURE_PERFCTR_L2)
-#define cpu_has_cx8 boot_cpu_has(X86_FEATURE_CX8)
-#define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16)
-#define cpu_has_eager_fpu boot_cpu_has(X86_FEATURE_EAGER_FPU)
-#define cpu_has_topoext boot_cpu_has(X86_FEATURE_TOPOEXT)
-#define cpu_has_bpext boot_cpu_has(X86_FEATURE_BPEXT)
-
-#if __GNUC__ >= 4
+/*
+ * Do not add any more of those clumsy macros - use static_cpu_has_safe() for
+ * fast paths and boot_cpu_has() otherwise!
+ */
+
+#if __GNUC__ >= 4 && defined(CONFIG_X86_FAST_FEATURE_TESTS)
extern void warn_pre_alternatives(void);
extern bool __static_cpu_has_safe(u16 bit);
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index f80d70009ff8..6d7d0e52ed5a 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -19,7 +19,6 @@
#include <asm/acpi.h>
#include <asm/apicdef.h>
#include <asm/page.h>
-#include <asm/pvclock.h>
#ifdef CONFIG_X86_32
#include <linux/threads.h>
#include <asm/kmap_types.h>
@@ -72,10 +71,6 @@ enum fixed_addresses {
#ifdef CONFIG_X86_VSYSCALL_EMULATION
VSYSCALL_PAGE = (FIXADDR_TOP - VSYSCALL_ADDR) >> PAGE_SHIFT,
#endif
-#ifdef CONFIG_PARAVIRT_CLOCK
- PVCLOCK_FIXMAP_BEGIN,
- PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1,
-#endif
#endif
FIX_DBGP_BASE,
FIX_EARLYCON_MEM_BASE,
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 3c3550c3a4a3..0fd440df63f1 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -42,6 +42,7 @@ extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system(struct cpuinfo_x86 *c);
extern void fpu__init_check_bugs(void);
extern void fpu__resume_cpu(void);
+extern u64 fpu__get_supported_xfeatures_mask(void);
/*
* Debugging facility:
@@ -224,18 +225,67 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f"
#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f"
-/* xstate instruction fault handler: */
-#define xstate_fault(__err) \
- \
- ".section .fixup,\"ax\"\n" \
- \
- "3: movl $-2,%[_err]\n" \
- " jmp 2b\n" \
- \
- ".previous\n" \
- \
- _ASM_EXTABLE(1b, 3b) \
- : [_err] "=r" (__err)
+#define XSTATE_OP(op, st, lmask, hmask, err) \
+ asm volatile("1:" op "\n\t" \
+ "xor %[err], %[err]\n" \
+ "2:\n\t" \
+ ".pushsection .fixup,\"ax\"\n\t" \
+ "3: movl $-2,%[err]\n\t" \
+ "jmp 2b\n\t" \
+ ".popsection\n\t" \
+ _ASM_EXTABLE(1b, 3b) \
+ : [err] "=r" (err) \
+ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : "memory")
+
+/*
+ * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact
+ * format and supervisor states in addition to modified optimization in
+ * XSAVEOPT.
+ *
+ * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT
+ * supports modified optimization which is not supported by XSAVE.
+ *
+ * We use XSAVE as a fallback.
+ *
+ * The 661 label is defined in the ALTERNATIVE* macros as the address of the
+ * original instruction which gets replaced. We need to use it here as the
+ * address of the instruction where we might get an exception at.
+ */
+#define XSTATE_XSAVE(st, lmask, hmask, err) \
+ asm volatile(ALTERNATIVE_2(XSAVE, \
+ XSAVEOPT, X86_FEATURE_XSAVEOPT, \
+ XSAVES, X86_FEATURE_XSAVES) \
+ "\n" \
+ "xor %[err], %[err]\n" \
+ "3:\n" \
+ ".pushsection .fixup,\"ax\"\n" \
+ "4: movl $-2, %[err]\n" \
+ "jmp 3b\n" \
+ ".popsection\n" \
+ _ASM_EXTABLE(661b, 4b) \
+ : [err] "=r" (err) \
+ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : "memory")
+
+/*
+ * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact
+ * XSAVE area format.
+ */
+#define XSTATE_XRESTORE(st, lmask, hmask, err) \
+ asm volatile(ALTERNATIVE(XRSTOR, \
+ XRSTORS, X86_FEATURE_XSAVES) \
+ "\n" \
+ "xor %[err], %[err]\n" \
+ "3:\n" \
+ ".pushsection .fixup,\"ax\"\n" \
+ "4: movl $-2, %[err]\n" \
+ "jmp 3b\n" \
+ ".popsection\n" \
+ _ASM_EXTABLE(661b, 4b) \
+ : [err] "=r" (err) \
+ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : "memory")
/*
* This function is called only during boot time when x86 caps are not set
@@ -246,22 +296,14 @@ static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate)
u64 mask = -1;
u32 lmask = mask;
u32 hmask = mask >> 32;
- int err = 0;
+ int err;
WARN_ON(system_state != SYSTEM_BOOTING);
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- asm volatile("1:"XSAVES"\n\t"
- "2:\n\t"
- xstate_fault(err)
- : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err)
- : "memory");
+ if (static_cpu_has_safe(X86_FEATURE_XSAVES))
+ XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
else
- asm volatile("1:"XSAVE"\n\t"
- "2:\n\t"
- xstate_fault(err)
- : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err)
- : "memory");
+ XSTATE_OP(XSAVE, xstate, lmask, hmask, err);
/* We should never fault when copying to a kernel buffer: */
WARN_ON_FPU(err);
@@ -276,22 +318,14 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate)
u64 mask = -1;
u32 lmask = mask;
u32 hmask = mask >> 32;
- int err = 0;
+ int err;
WARN_ON(system_state != SYSTEM_BOOTING);
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- asm volatile("1:"XRSTORS"\n\t"
- "2:\n\t"
- xstate_fault(err)
- : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err)
- : "memory");
+ if (static_cpu_has_safe(X86_FEATURE_XSAVES))
+ XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
else
- asm volatile("1:"XRSTOR"\n\t"
- "2:\n\t"
- xstate_fault(err)
- : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err)
- : "memory");
+ XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
/* We should never fault when copying from a kernel buffer: */
WARN_ON_FPU(err);
@@ -305,33 +339,11 @@ static inline void copy_xregs_to_kernel(struct xregs_state *xstate)
u64 mask = -1;
u32 lmask = mask;
u32 hmask = mask >> 32;
- int err = 0;
+ int err;
WARN_ON(!alternatives_patched);
- /*
- * If xsaves is enabled, xsaves replaces xsaveopt because
- * it supports compact format and supervisor states in addition to
- * modified optimization in xsaveopt.
- *
- * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave
- * because xsaveopt supports modified optimization which is not
- * supported by xsave.
- *
- * If none of xsaves and xsaveopt is enabled, use xsave.
- */
- alternative_input_2(
- "1:"XSAVE,
- XSAVEOPT,
- X86_FEATURE_XSAVEOPT,
- XSAVES,
- X86_FEATURE_XSAVES,
- [xstate] "D" (xstate), "a" (lmask), "d" (hmask) :
- "memory");
- asm volatile("2:\n\t"
- xstate_fault(err)
- : "0" (err)
- : "memory");
+ XSTATE_XSAVE(xstate, lmask, hmask, err);
/* We should never fault when copying to a kernel buffer: */
WARN_ON_FPU(err);
@@ -344,23 +356,9 @@ static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask)
{
u32 lmask = mask;
u32 hmask = mask >> 32;
- int err = 0;
+ int err;
- /*
- * Use xrstors to restore context if it is enabled. xrstors supports
- * compacted format of xsave area which is not supported by xrstor.
- */
- alternative_input(
- "1: " XRSTOR,
- XRSTORS,
- X86_FEATURE_XSAVES,
- "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask)
- : "memory");
-
- asm volatile("2:\n"
- xstate_fault(err)
- : "0" (err)
- : "memory");
+ XSTATE_XRESTORE(xstate, lmask, hmask, err);
/* We should never fault when copying from a kernel buffer: */
WARN_ON_FPU(err);
@@ -388,12 +386,10 @@ static inline int copy_xregs_to_user(struct xregs_state __user *buf)
if (unlikely(err))
return -EFAULT;
- __asm__ __volatile__(ASM_STAC "\n"
- "1:"XSAVE"\n"
- "2: " ASM_CLAC "\n"
- xstate_fault(err)
- : "D" (buf), "a" (-1), "d" (-1), "0" (err)
- : "memory");
+ stac();
+ XSTATE_OP(XSAVE, buf, -1, -1, err);
+ clac();
+
return err;
}
@@ -405,14 +401,12 @@ static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask)
struct xregs_state *xstate = ((__force struct xregs_state *)buf);
u32 lmask = mask;
u32 hmask = mask >> 32;
- int err = 0;
-
- __asm__ __volatile__(ASM_STAC "\n"
- "1:"XRSTOR"\n"
- "2: " ASM_CLAC "\n"
- xstate_fault(err)
- : "D" (xstate), "a" (lmask), "d" (hmask), "0" (err)
- : "memory"); /* memory required? */
+ int err;
+
+ stac();
+ XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
+ clac();
+
return err;
}
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 3a6c89b70307..af30fdeb140d 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -20,15 +20,16 @@
/* Supported features which support lazy state saving */
#define XFEATURE_MASK_LAZY (XFEATURE_MASK_FP | \
- XFEATURE_MASK_SSE | \
+ XFEATURE_MASK_SSE)
+
+/* Supported features which require eager state saving */
+#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | \
+ XFEATURE_MASK_BNDCSR | \
XFEATURE_MASK_YMM | \
- XFEATURE_MASK_OPMASK | \
+ XFEATURE_MASK_OPMASK | \
XFEATURE_MASK_ZMM_Hi256 | \
XFEATURE_MASK_Hi16_ZMM)
-/* Supported features which require eager state saving */
-#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)
-
/* All currently supported features */
#define XCNTXT_MASK (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER)
diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
new file mode 100644
index 000000000000..e1a411786bf5
--- /dev/null
+++ b/arch/x86/include/asm/intel_pt.h
@@ -0,0 +1,10 @@
+#ifndef _ASM_X86_INTEL_PT_H
+#define _ASM_X86_INTEL_PT_H
+
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
+void cpu_emergency_stop_pt(void);
+#else
+static inline void cpu_emergency_stop_pt(void) {}
+#endif
+
+#endif /* _ASM_X86_INTEL_PT_H */
diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h
index b72ad0faa6c5..b41ee164930a 100644
--- a/arch/x86/include/asm/iosf_mbi.h
+++ b/arch/x86/include/asm/iosf_mbi.h
@@ -1,5 +1,5 @@
/*
- * iosf_mbi.h: Intel OnChip System Fabric MailBox access support
+ * Intel OnChip System Fabric MailBox access support
*/
#ifndef IOSF_MBI_SYMS_H
@@ -16,6 +16,18 @@
#define MBI_MASK_LO 0x000000FF
#define MBI_ENABLE 0xF0
+/* IOSF SB read/write opcodes */
+#define MBI_MMIO_READ 0x00
+#define MBI_MMIO_WRITE 0x01
+#define MBI_CFG_READ 0x04
+#define MBI_CFG_WRITE 0x05
+#define MBI_CR_READ 0x06
+#define MBI_CR_WRITE 0x07
+#define MBI_REG_READ 0x10
+#define MBI_REG_WRITE 0x11
+#define MBI_ESRAM_READ 0x12
+#define MBI_ESRAM_WRITE 0x13
+
/* Baytrail available units */
#define BT_MBI_UNIT_AUNIT 0x00
#define BT_MBI_UNIT_SMC 0x01
@@ -28,50 +40,13 @@
#define BT_MBI_UNIT_SATA 0xA3
#define BT_MBI_UNIT_PCIE 0xA6
-/* Baytrail read/write opcodes */
-#define BT_MBI_AUNIT_READ 0x10
-#define BT_MBI_AUNIT_WRITE 0x11
-#define BT_MBI_SMC_READ 0x10
-#define BT_MBI_SMC_WRITE 0x11
-#define BT_MBI_CPU_READ 0x10
-#define BT_MBI_CPU_WRITE 0x11
-#define BT_MBI_BUNIT_READ 0x10
-#define BT_MBI_BUNIT_WRITE 0x11
-#define BT_MBI_PMC_READ 0x06
-#define BT_MBI_PMC_WRITE 0x07
-#define BT_MBI_GFX_READ 0x00
-#define BT_MBI_GFX_WRITE 0x01
-#define BT_MBI_SMIO_READ 0x06
-#define BT_MBI_SMIO_WRITE 0x07
-#define BT_MBI_USB_READ 0x06
-#define BT_MBI_USB_WRITE 0x07
-#define BT_MBI_SATA_READ 0x00
-#define BT_MBI_SATA_WRITE 0x01
-#define BT_MBI_PCIE_READ 0x00
-#define BT_MBI_PCIE_WRITE 0x01
-
/* Quark available units */
#define QRK_MBI_UNIT_HBA 0x00
#define QRK_MBI_UNIT_HB 0x03
#define QRK_MBI_UNIT_RMU 0x04
#define QRK_MBI_UNIT_MM 0x05
-#define QRK_MBI_UNIT_MMESRAM 0x05
#define QRK_MBI_UNIT_SOC 0x31
-/* Quark read/write opcodes */
-#define QRK_MBI_HBA_READ 0x10
-#define QRK_MBI_HBA_WRITE 0x11
-#define QRK_MBI_HB_READ 0x10
-#define QRK_MBI_HB_WRITE 0x11
-#define QRK_MBI_RMU_READ 0x10
-#define QRK_MBI_RMU_WRITE 0x11
-#define QRK_MBI_MM_READ 0x10
-#define QRK_MBI_MM_WRITE 0x11
-#define QRK_MBI_MMESRAM_READ 0x12
-#define QRK_MBI_MMESRAM_WRITE 0x13
-#define QRK_MBI_SOC_READ 0x06
-#define QRK_MBI_SOC_WRITE 0x07
-
#if IS_ENABLED(CONFIG_IOSF_MBI)
bool iosf_mbi_available(void);
diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
index 615fa9061b57..cfc9a0d2d07c 100644
--- a/arch/x86/include/asm/ipi.h
+++ b/arch/x86/include/asm/ipi.h
@@ -119,6 +119,8 @@ static inline void
native_apic_mem_write(APIC_ICR, cfg);
}
+extern void default_send_IPI_single(int cpu, int vector);
+extern void default_send_IPI_single_phys(int cpu, int vector);
extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask,
int vector);
extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 5daeca3d0f9e..adc54c12cbd1 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -1,12 +1,18 @@
#ifndef _ASM_X86_JUMP_LABEL_H
#define _ASM_X86_JUMP_LABEL_H
-#ifndef __ASSEMBLY__
-
-#include <linux/stringify.h>
-#include <linux/types.h>
-#include <asm/nops.h>
-#include <asm/asm.h>
+#ifndef HAVE_JUMP_LABEL
+/*
+ * For better or for worse, if jump labels (the gcc extension) are missing,
+ * then the entire static branch patching infrastructure is compiled out.
+ * If that happens, the code in here will malfunction. Raise a compiler
+ * error instead.
+ *
+ * In theory, jump labels and the static branch patching infrastructure
+ * could be decoupled to fix this.
+ */
+#error asm/jump_label.h included on a non-jump-label kernel
+#endif
#define JUMP_LABEL_NOP_SIZE 5
@@ -16,6 +22,14 @@
# define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC
#endif
+#include <asm/asm.h>
+#include <asm/nops.h>
+
+#ifndef __ASSEMBLY__
+
+#include <linux/stringify.h>
+#include <linux/types.h>
+
static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
{
asm_volatile_goto("1:"
@@ -59,5 +73,40 @@ struct jump_entry {
jump_label_t key;
};
-#endif /* __ASSEMBLY__ */
+#else /* __ASSEMBLY__ */
+
+.macro STATIC_JUMP_IF_TRUE target, key, def
+.Lstatic_jump_\@:
+ .if \def
+ /* Equivalent to "jmp.d32 \target" */
+ .byte 0xe9
+ .long \target - .Lstatic_jump_after_\@
+.Lstatic_jump_after_\@:
+ .else
+ .byte STATIC_KEY_INIT_NOP
+ .endif
+ .pushsection __jump_table, "aw"
+ _ASM_ALIGN
+ _ASM_PTR .Lstatic_jump_\@, \target, \key
+ .popsection
+.endm
+
+.macro STATIC_JUMP_IF_FALSE target, key, def
+.Lstatic_jump_\@:
+ .if \def
+ .byte STATIC_KEY_INIT_NOP
+ .else
+ /* Equivalent to "jmp.d32 \target" */
+ .byte 0xe9
+ .long \target - .Lstatic_jump_after_\@
+.Lstatic_jump_after_\@:
+ .endif
+ .pushsection __jump_table, "aw"
+ _ASM_ALIGN
+ _ASM_PTR .Lstatic_jump_\@, \target, \key + 1
+ .popsection
+.endm
+
+#endif /* __ASSEMBLY__ */
+
#endif
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 3bbc07a57a31..73d0c9b92087 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -12,7 +12,9 @@
#define GUEST_PL 1
/* Page for Switcher text itself, then two pages per cpu */
-#define TOTAL_SWITCHER_PAGES (1 + 2 * nr_cpu_ids)
+#define SWITCHER_TEXT_PAGES (1)
+#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids)
+#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES)
/* Where we map the Switcher, in both Host and Guest. */
extern unsigned long switcher_addr;
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 34e62b1dcfce..1e1b07a5a738 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -1,6 +1,7 @@
#ifndef _ASM_X86_MICROCODE_H
#define _ASM_X86_MICROCODE_H
+#include <asm/cpu.h>
#include <linux/earlycpio.h>
#define native_rdmsr(msr, val1, val2) \
@@ -95,14 +96,14 @@ static inline void __exit exit_amd_microcode(void) {}
/*
* In early loading microcode phase on BSP, boot_cpu_data is not set up yet.
- * x86_vendor() gets vendor id for BSP.
+ * x86_cpuid_vendor() gets vendor id for BSP.
*
* In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify
- * coding, we still use x86_vendor() to get vendor id for AP.
+ * coding, we still use x86_cpuid_vendor() to get vendor id for AP.
*
- * x86_vendor() gets vendor information directly from CPUID.
+ * x86_cpuid_vendor() gets vendor information directly from CPUID.
*/
-static inline int x86_vendor(void)
+static inline int x86_cpuid_vendor(void)
{
u32 eax = 0x00000000;
u32 ebx, ecx = 0, edx;
@@ -118,40 +119,14 @@ static inline int x86_vendor(void)
return X86_VENDOR_UNKNOWN;
}
-static inline unsigned int __x86_family(unsigned int sig)
-{
- unsigned int x86;
-
- x86 = (sig >> 8) & 0xf;
-
- if (x86 == 0xf)
- x86 += (sig >> 20) & 0xff;
-
- return x86;
-}
-
-static inline unsigned int x86_family(void)
+static inline unsigned int x86_cpuid_family(void)
{
u32 eax = 0x00000001;
u32 ebx, ecx = 0, edx;
native_cpuid(&eax, &ebx, &ecx, &edx);
- return __x86_family(eax);
-}
-
-static inline unsigned int x86_model(unsigned int sig)
-{
- unsigned int x86, model;
-
- x86 = __x86_family(sig);
-
- model = (sig >> 4) & 0xf;
-
- if (x86 == 0x6 || x86 == 0xf)
- model += ((sig >> 16) & 0xf) << 4;
-
- return model;
+ return x86_family(eax);
}
#ifdef CONFIG_MICROCODE
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 379cd3658799..bfd9b2a35a0b 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -116,8 +116,36 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
#endif
cpumask_set_cpu(cpu, mm_cpumask(next));
- /* Re-load page tables */
+ /*
+ * Re-load page tables.
+ *
+ * This logic has an ordering constraint:
+ *
+ * CPU 0: Write to a PTE for 'next'
+ * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
+ * CPU 1: set bit 1 in next's mm_cpumask
+ * CPU 1: load from the PTE that CPU 0 writes (implicit)
+ *
+ * We need to prevent an outcome in which CPU 1 observes
+ * the new PTE value and CPU 0 observes bit 1 clear in
+ * mm_cpumask. (If that occurs, then the IPI will never
+ * be sent, and CPU 0's TLB will contain a stale entry.)
+ *
+ * The bad outcome can occur if either CPU's load is
+ * reordered before that CPU's store, so both CPUs must
+ * execute full barriers to prevent this from happening.
+ *
+ * Thus, switch_mm needs a full barrier between the
+ * store to mm_cpumask and any operation that could load
+ * from next->pgd. TLB fills are special and can happen
+ * due to instruction fetches or for no reason at all,
+ * and neither LOCK nor MFENCE orders them.
+ * Fortunately, load_cr3() is serializing and gives the
+ * ordering guarantee we need.
+ *
+ */
load_cr3(next->pgd);
+
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
/* Stop flush ipis for the previous mm */
@@ -156,10 +184,14 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
* schedule, protecting us from simultaneous changes.
*/
cpumask_set_cpu(cpu, mm_cpumask(next));
+
/*
* We were in lazy tlb mode and leave_mm disabled
* tlb flush IPI delivery. We must reload CR3
* to make sure to use no freed page tables.
+ *
+ * As above, load_cr3() is serializing and orders TLB
+ * fills with respect to the mm_cpumask write.
*/
load_cr3(next->pgd);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h
index 93724cc62177..eb4b09b41df5 100644
--- a/arch/x86/include/asm/msi.h
+++ b/arch/x86/include/asm/msi.h
@@ -1,7 +1,13 @@
#ifndef _ASM_X86_MSI_H
#define _ASM_X86_MSI_H
#include <asm/hw_irq.h>
+#include <asm/irqdomain.h>
typedef struct irq_alloc_info msi_alloc_info_t;
+int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
+ msi_alloc_info_t *arg);
+
+void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc);
+
#endif /* _ASM_X86_MSI_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 690b4027e17c..b05402ef3b84 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -321,6 +321,7 @@
#define MSR_F15H_PERF_CTR 0xc0010201
#define MSR_F15H_NB_PERF_CTL 0xc0010240
#define MSR_F15H_NB_PERF_CTR 0xc0010241
+#define MSR_F15H_IC_CFG 0xc0011021
/* Fam 10h MSRs */
#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
diff --git a/arch/x86/include/asm/msr-trace.h b/arch/x86/include/asm/msr-trace.h
new file mode 100644
index 000000000000..7567225747d8
--- /dev/null
+++ b/arch/x86/include/asm/msr-trace.h
@@ -0,0 +1,57 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM msr
+
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE msr-trace
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH asm/
+
+#if !defined(_TRACE_MSR_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MSR_H
+
+#include <linux/tracepoint.h>
+
+/*
+ * Tracing for x86 model specific registers. Directly maps to the
+ * RDMSR/WRMSR instructions.
+ */
+
+DECLARE_EVENT_CLASS(msr_trace_class,
+ TP_PROTO(unsigned msr, u64 val, int failed),
+ TP_ARGS(msr, val, failed),
+ TP_STRUCT__entry(
+ __field( unsigned, msr )
+ __field( u64, val )
+ __field( int, failed )
+ ),
+ TP_fast_assign(
+ __entry->msr = msr;
+ __entry->val = val;
+ __entry->failed = failed;
+ ),
+ TP_printk("%x, value %llx%s",
+ __entry->msr,
+ __entry->val,
+ __entry->failed ? " #GP" : "")
+);
+
+DEFINE_EVENT(msr_trace_class, read_msr,
+ TP_PROTO(unsigned msr, u64 val, int failed),
+ TP_ARGS(msr, val, failed)
+);
+
+DEFINE_EVENT(msr_trace_class, write_msr,
+ TP_PROTO(unsigned msr, u64 val, int failed),
+ TP_ARGS(msr, val, failed)
+);
+
+DEFINE_EVENT(msr_trace_class, rdpmc,
+ TP_PROTO(unsigned msr, u64 val, int failed),
+ TP_ARGS(msr, val, failed)
+);
+
+#endif /* _TRACE_MSR_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 77d8b284e4a7..93fb7c1cffda 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -32,6 +32,16 @@ struct msr_regs_info {
int err;
};
+struct saved_msr {
+ bool valid;
+ struct msr_info info;
+};
+
+struct saved_msrs {
+ unsigned int num;
+ struct saved_msr *array;
+};
+
static inline unsigned long long native_read_tscp(unsigned int *aux)
{
unsigned long low, high;
@@ -57,11 +67,34 @@ static inline unsigned long long native_read_tscp(unsigned int *aux)
#define EAX_EDX_RET(val, low, high) "=A" (val)
#endif
+#ifdef CONFIG_TRACEPOINTS
+/*
+ * Be very careful with includes. This header is prone to include loops.
+ */
+#include <asm/atomic.h>
+#include <linux/tracepoint-defs.h>
+
+extern struct tracepoint __tracepoint_read_msr;
+extern struct tracepoint __tracepoint_write_msr;
+extern struct tracepoint __tracepoint_rdpmc;
+#define msr_tracepoint_active(t) static_key_false(&(t).key)
+extern void do_trace_write_msr(unsigned msr, u64 val, int failed);
+extern void do_trace_read_msr(unsigned msr, u64 val, int failed);
+extern void do_trace_rdpmc(unsigned msr, u64 val, int failed);
+#else
+#define msr_tracepoint_active(t) false
+static inline void do_trace_write_msr(unsigned msr, u64 val, int failed) {}
+static inline void do_trace_read_msr(unsigned msr, u64 val, int failed) {}
+static inline void do_trace_rdpmc(unsigned msr, u64 val, int failed) {}
+#endif
+
static inline unsigned long long native_read_msr(unsigned int msr)
{
DECLARE_ARGS(val, low, high);
asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr));
+ if (msr_tracepoint_active(__tracepoint_read_msr))
+ do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), 0);
return EAX_EDX_VAL(val, low, high);
}
@@ -78,6 +111,8 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr,
_ASM_EXTABLE(2b, 3b)
: [err] "=r" (*err), EAX_EDX_RET(val, low, high)
: "c" (msr), [fault] "i" (-EIO));
+ if (msr_tracepoint_active(__tracepoint_read_msr))
+ do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), *err);
return EAX_EDX_VAL(val, low, high);
}
@@ -85,6 +120,8 @@ static inline void native_write_msr(unsigned int msr,
unsigned low, unsigned high)
{
asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
+ if (msr_tracepoint_active(__tracepoint_read_msr))
+ do_trace_write_msr(msr, ((u64)high << 32 | low), 0);
}
/* Can be uninlined because referenced by paravirt */
@@ -102,6 +139,8 @@ notrace static inline int native_write_msr_safe(unsigned int msr,
: "c" (msr), "0" (low), "d" (high),
[fault] "i" (-EIO)
: "memory");
+ if (msr_tracepoint_active(__tracepoint_read_msr))
+ do_trace_write_msr(msr, ((u64)high << 32 | low), err);
return err;
}
@@ -160,6 +199,8 @@ static inline unsigned long long native_read_pmc(int counter)
DECLARE_ARGS(val, low, high);
asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter));
+ if (msr_tracepoint_active(__tracepoint_rdpmc))
+ do_trace_rdpmc(counter, EAX_EDX_VAL(val, low, high), 0);
return EAX_EDX_VAL(val, low, high);
}
@@ -190,7 +231,7 @@ static inline void wrmsr(unsigned msr, unsigned low, unsigned high)
static inline void wrmsrl(unsigned msr, u64 val)
{
- native_write_msr(msr, (u32)val, (u32)(val >> 32));
+ native_write_msr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32));
}
/* wrmsr with exception handling */
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index cc071c6f7d4d..7bd0099384ca 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -5,9 +5,9 @@
#include <linux/types.h>
/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT 12
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#define PAGE_SHIFT 12
+#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
+#define PAGE_MASK (~(PAGE_SIZE-1))
#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 10d0596433f8..f6192502149e 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -19,6 +19,12 @@ static inline int paravirt_enabled(void)
return pv_info.paravirt_enabled;
}
+static inline int paravirt_has_feature(unsigned int feature)
+{
+ WARN_ON_ONCE(!pv_info.paravirt_enabled);
+ return (pv_info.features & feature);
+}
+
static inline void load_sp0(struct tss_struct *tss,
struct thread_struct *thread)
{
@@ -285,15 +291,6 @@ static inline void slow_down_io(void)
#endif
}
-#ifdef CONFIG_SMP
-static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip,
- unsigned long start_esp)
-{
- PVOP_VCALL3(pv_apic_ops.startup_ipi_hook,
- phys_apicid, start_eip, start_esp);
-}
-#endif
-
static inline void paravirt_activate_mm(struct mm_struct *prev,
struct mm_struct *next)
{
@@ -375,23 +372,6 @@ static inline void pte_update(struct mm_struct *mm, unsigned long addr,
{
PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
}
-static inline void pmd_update(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp)
-{
- PVOP_VCALL3(pv_mmu_ops.pmd_update, mm, addr, pmdp);
-}
-
-static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
-{
- PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
-}
-
-static inline void pmd_update_defer(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp)
-{
- PVOP_VCALL3(pv_mmu_ops.pmd_update_defer, mm, addr, pmdp);
-}
static inline pte_t __pte(pteval_t val)
{
@@ -922,23 +902,11 @@ extern void default_banner(void);
call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \
PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
-#define USERGS_SYSRET32 \
- PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \
- CLBR_NONE, \
- jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret32))
-
#ifdef CONFIG_X86_32
#define GET_CR0_INTO_EAX \
push %ecx; push %edx; \
call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \
pop %edx; pop %ecx
-
-#define ENABLE_INTERRUPTS_SYSEXIT \
- PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \
- CLBR_NONE, \
- jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
-
-
#else /* !CONFIG_X86_32 */
/*
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 31247b5bff7c..77db5616a473 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -70,9 +70,14 @@ struct pv_info {
#endif
int paravirt_enabled;
+ unsigned int features; /* valid only if paravirt_enabled is set */
const char *name;
};
+#define paravirt_has(x) paravirt_has_feature(PV_SUPPORTED_##x)
+/* Supported features */
+#define PV_SUPPORTED_RTC (1<<0)
+
struct pv_init_ops {
/*
* Patch may replace one of the defined code sequences with
@@ -157,15 +162,6 @@ struct pv_cpu_ops {
u64 (*read_pmc)(int counter);
-#ifdef CONFIG_X86_32
- /*
- * Atomically enable interrupts and return to userspace. This
- * is only used in 32-bit kernels. 64-bit kernels use
- * usergs_sysret32 instead.
- */
- void (*irq_enable_sysexit)(void);
-#endif
-
/*
* Switch to usermode gs and return to 64-bit usermode using
* sysret. Only used in 64-bit kernels to return to 64-bit
@@ -174,14 +170,6 @@ struct pv_cpu_ops {
*/
void (*usergs_sysret64)(void);
- /*
- * Switch to usermode gs and return to 32-bit usermode using
- * sysret. Used to return to 32-on-64 compat processes.
- * Other usermode register state, including %esp, must already
- * be restored.
- */
- void (*usergs_sysret32)(void);
-
/* Normal iret. Jump to this with the standard iret stack
frame set up. */
void (*iret)(void);
@@ -215,14 +203,6 @@ struct pv_irq_ops {
#endif
};
-struct pv_apic_ops {
-#ifdef CONFIG_X86_LOCAL_APIC
- void (*startup_ipi_hook)(int phys_apicid,
- unsigned long start_eip,
- unsigned long start_esp);
-#endif
-};
-
struct pv_mmu_ops {
unsigned long (*read_cr2)(void);
void (*write_cr2)(unsigned long);
@@ -274,12 +254,6 @@ struct pv_mmu_ops {
pmd_t *pmdp, pmd_t pmdval);
void (*pte_update)(struct mm_struct *mm, unsigned long addr,
pte_t *ptep);
- void (*pte_update_defer)(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep);
- void (*pmd_update)(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp);
- void (*pmd_update_defer)(struct mm_struct *mm,
- unsigned long addr, pmd_t *pmdp);
pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
pte_t *ptep);
@@ -354,7 +328,6 @@ struct paravirt_patch_template {
struct pv_time_ops pv_time_ops;
struct pv_cpu_ops pv_cpu_ops;
struct pv_irq_ops pv_irq_ops;
- struct pv_apic_ops pv_apic_ops;
struct pv_mmu_ops pv_mmu_ops;
struct pv_lock_ops pv_lock_ops;
};
@@ -364,7 +337,6 @@ extern struct pv_init_ops pv_init_ops;
extern struct pv_time_ops pv_time_ops;
extern struct pv_cpu_ops pv_cpu_ops;
extern struct pv_irq_ops pv_irq_ops;
-extern struct pv_apic_ops pv_apic_ops;
extern struct pv_mmu_ops pv_mmu_ops;
extern struct pv_lock_ops pv_lock_ops;
@@ -402,10 +374,8 @@ extern struct pv_lock_ops pv_lock_ops;
__visible extern const char start_##ops##_##name[], end_##ops##_##name[]; \
asm(NATIVE_LABEL("start_", ops, name) code NATIVE_LABEL("end_", ops, name))
-unsigned paravirt_patch_nop(void);
unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len);
unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len);
-unsigned paravirt_patch_ignore(unsigned len);
unsigned paravirt_patch_call(void *insnbuf,
const void *target, u16 tgt_clobbers,
unsigned long addr, u16 site_clobbers,
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 6ec0c8b2e9df..d3eee663c41f 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -69,9 +69,6 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
#define pmd_clear(pmd) native_pmd_clear(pmd)
#define pte_update(mm, addr, ptep) do { } while (0)
-#define pte_update_defer(mm, addr, ptep) do { } while (0)
-#define pmd_update(mm, addr, ptep) do { } while (0)
-#define pmd_update_defer(mm, addr, ptep) do { } while (0)
#define pgd_val(x) native_pgd_val(x)
#define __pgd(x) native_make_pgd(x)
@@ -731,14 +728,9 @@ static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr,
* updates should either be sets, clears, or set_pte_atomic for P->P
* transitions, which means this hook should only be called for user PTEs.
* This hook implies a P->P protection or access change has taken place, which
- * requires a subsequent TLB flush. The notification can optionally be delayed
- * until the TLB flush event by using the pte_update_defer form of the
- * interface, but care must be taken to assure that the flush happens while
- * still holding the same page table lock so that the shadow and primary pages
- * do not become out of sync on SMP.
+ * requires a subsequent TLB flush.
*/
#define pte_update(mm, addr, ptep) do { } while (0)
-#define pte_update_defer(mm, addr, ptep) do { } while (0)
#endif
/*
@@ -830,9 +822,7 @@ static inline int pmd_write(pmd_t pmd)
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
{
- pmd_t pmd = native_pmdp_get_and_clear(pmdp);
- pmd_update(mm, addr, pmdp);
- return pmd;
+ return native_pmdp_get_and_clear(pmdp);
}
#define __HAVE_ARCH_PMDP_SET_WRPROTECT
@@ -840,7 +830,6 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp)
{
clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
- pmd_update(mm, addr, pmdp);
}
/*
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 67522256c7ff..2d5a50cb61a2 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -472,6 +472,7 @@ static inline unsigned long current_top_of_stack(void)
#else
#define __cpuid native_cpuid
#define paravirt_enabled() 0
+#define paravirt_has(x) 0
static inline void load_sp0(struct tss_struct *tss,
struct thread_struct *thread)
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 7a6bed5c08bc..fdcc04020636 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -4,6 +4,15 @@
#include <linux/clocksource.h>
#include <asm/pvclock-abi.h>
+#ifdef CONFIG_KVM_GUEST
+extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void);
+#else
+static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
+{
+ return NULL;
+}
+#endif
+
/* some helper functions for xen and kvm pv clock sources */
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
@@ -91,10 +100,5 @@ struct pvclock_vsyscall_time_info {
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
-#define PVCLOCK_VSYSCALL_NR_PAGES (((NR_CPUS-1)/(PAGE_SIZE/PVTI_SIZE))+1)
-
-int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
- int size);
-struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu);
#endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index b002e711ba88..9f92c180ed2f 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -1,6 +1,65 @@
#ifndef __ASM_QSPINLOCK_PARAVIRT_H
#define __ASM_QSPINLOCK_PARAVIRT_H
+/*
+ * For x86-64, PV_CALLEE_SAVE_REGS_THUNK() saves and restores 8 64-bit
+ * registers. For i386, however, only 1 32-bit register needs to be saved
+ * and restored. So an optimized version of __pv_queued_spin_unlock() is
+ * hand-coded for 64-bit, but it isn't worthwhile to do it for 32-bit.
+ */
+#ifdef CONFIG_64BIT
+
+PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath);
+#define __pv_queued_spin_unlock __pv_queued_spin_unlock
+#define PV_UNLOCK "__raw_callee_save___pv_queued_spin_unlock"
+#define PV_UNLOCK_SLOWPATH "__raw_callee_save___pv_queued_spin_unlock_slowpath"
+
+/*
+ * Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock
+ * which combines the registers saving trunk and the body of the following
+ * C code:
+ *
+ * void __pv_queued_spin_unlock(struct qspinlock *lock)
+ * {
+ * struct __qspinlock *l = (void *)lock;
+ * u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ *
+ * if (likely(lockval == _Q_LOCKED_VAL))
+ * return;
+ * pv_queued_spin_unlock_slowpath(lock, lockval);
+ * }
+ *
+ * For x86-64,
+ * rdi = lock (first argument)
+ * rsi = lockval (second argument)
+ * rdx = internal variable (set to 0)
+ */
+asm (".pushsection .text;"
+ ".globl " PV_UNLOCK ";"
+ ".align 4,0x90;"
+ PV_UNLOCK ": "
+ "push %rdx;"
+ "mov $0x1,%eax;"
+ "xor %edx,%edx;"
+ "lock cmpxchg %dl,(%rdi);"
+ "cmp $0x1,%al;"
+ "jne .slowpath;"
+ "pop %rdx;"
+ "ret;"
+ ".slowpath: "
+ "push %rsi;"
+ "movzbl %al,%esi;"
+ "call " PV_UNLOCK_SLOWPATH ";"
+ "pop %rsi;"
+ "pop %rdx;"
+ "ret;"
+ ".size " PV_UNLOCK ", .-" PV_UNLOCK ";"
+ ".popsection");
+
+#else /* CONFIG_64BIT */
+
+extern void __pv_queued_spin_unlock(struct qspinlock *lock);
PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock);
+#endif /* CONFIG_64BIT */
#endif
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index a82c4f1b4d83..2cb1cc253d51 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -25,5 +25,6 @@ void __noreturn machine_real_restart(unsigned int type);
typedef void (*nmi_shootdown_cb)(int, struct pt_regs*);
void nmi_shootdown_cpus(nmi_shootdown_cb callback);
+void run_crash_ipi_callback(struct pt_regs *regs);
#endif /* _ASM_X86_REBOOT_H */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 222a6a3ca2b5..dfcf0727623b 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -21,15 +21,6 @@
extern int smp_num_siblings;
extern unsigned int num_processors;
-static inline bool cpu_has_ht_siblings(void)
-{
- bool has_siblings = false;
-#ifdef CONFIG_SMP
- has_siblings = cpu_has_ht && smp_num_siblings > 1;
-#endif
- return has_siblings;
-}
-
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
/* cpus sharing the last level cache: */
@@ -74,9 +65,6 @@ struct smp_ops {
extern void set_cpu_sibling_map(int cpu);
#ifdef CONFIG_SMP
-#ifndef CONFIG_PARAVIRT
-#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
-#endif
extern struct smp_ops smp_ops;
static inline void smp_send_stop(void)
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index d1793f06854d..8e9dbe7b73a1 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -15,6 +15,7 @@ struct saved_context {
unsigned long cr0, cr2, cr3, cr4;
u64 misc_enable;
bool misc_enable_saved;
+ struct saved_msrs saved_msrs;
struct desc_ptr gdt_desc;
struct desc_ptr idt;
u16 ldt;
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
index 7ebf0ebe4e68..6136a18152af 100644
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -24,6 +24,7 @@ struct saved_context {
unsigned long cr0, cr2, cr3, cr4, cr8;
u64 misc_enable;
bool misc_enable_saved;
+ struct saved_msrs saved_msrs;
unsigned long efer;
u16 gdt_pad; /* Unused */
struct desc_ptr gdt_desc;
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 09b1b0ab94b7..660458af425d 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -745,5 +745,14 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
#undef __copy_from_user_overflow
#undef __copy_to_user_overflow
+/*
+ * We rely on the nested NMI work to allow atomic faults from the NMI path; the
+ * nested NMI paths are careful to preserve CR2.
+ *
+ * Caller must use pagefault_enable/disable, or run in interrupt context,
+ * and also do a uaccess_ok() check
+ */
+#define __copy_from_user_nmi __copy_from_user_inatomic
+
#endif /* _ASM_X86_UACCESS_H */
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 756de9190aec..deabaf9759b6 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -22,6 +22,7 @@ struct vdso_image {
long sym_vvar_page;
long sym_hpet_page;
+ long sym_pvclock_page;
long sym_VDSO32_NOTE_MASK;
long sym___kernel_sigreturn;
long sym___kernel_rt_sigreturn;
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index cd0fc0cc78bc..1ae89a2721d6 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -82,13 +82,11 @@ struct x86_init_paging {
* struct x86_init_timers - platform specific timer setup
* @setup_perpcu_clockev: set up the per cpu clock event device for the
* boot cpu
- * @tsc_pre_init: platform function called before TSC init
* @timer_init: initialize the platform timer (default PIT/HPET)
* @wallclock_init: init the wallclock device
*/
struct x86_init_timers {
void (*setup_percpu_clockev)(void);
- void (*tsc_pre_init)(void);
void (*timer_init)(void);
void (*wallclock_init)(void);
};
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 4c20dd333412..3bcdcc84259d 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -310,10 +310,10 @@ HYPERVISOR_mca(struct xen_mc *mc_op)
}
static inline int
-HYPERVISOR_dom0_op(struct xen_platform_op *platform_op)
+HYPERVISOR_platform_op(struct xen_platform_op *op)
{
- platform_op->interface_version = XENPF_INTERFACE_VERSION;
- return _hypercall1(int, dom0_op, platform_op);
+ op->interface_version = XENPF_INTERFACE_VERSION;
+ return _hypercall1(int, platform_op, op);
}
static inline int
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index 5a08bc8bff33..c54beb44c4c1 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -553,7 +553,7 @@ do { \
if (cpu_has_xmm) { \
xor_speed(&xor_block_pIII_sse); \
xor_speed(&xor_block_sse_pf64); \
- } else if (cpu_has_mmx) { \
+ } else if (boot_cpu_has(X86_FEATURE_MMX)) { \
xor_speed(&xor_block_pII_mmx); \
xor_speed(&xor_block_p5_mmx); \
} else { \
diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h
index 03429da2fa80..2184943341bf 100644
--- a/arch/x86/include/uapi/asm/mce.h
+++ b/arch/x86/include/uapi/asm/mce.h
@@ -16,7 +16,7 @@ struct mce {
__u8 cpuvendor; /* cpu vendor as encoded in system.h */
__u8 inject_flags; /* software inject flags */
__u8 severity;
- __u8 usable_addr;
+ __u8 pad;
__u32 cpuid; /* CPUID 1 EAX */
__u8 cs; /* code segment */
__u8 bank; /* machine check bank */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 2f69e3b184f6..8a5cddac7d44 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -82,6 +82,12 @@ physid_mask_t phys_cpu_present_map;
static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID;
/*
+ * This variable controls which CPUs receive external NMIs. By default,
+ * external NMIs are delivered only to the BSP.
+ */
+static int apic_extnmi = APIC_EXTNMI_BSP;
+
+/*
* Map cpu index to physical APIC ID
*/
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
@@ -1161,6 +1167,8 @@ void __init init_bsp_APIC(void)
value = APIC_DM_NMI;
if (!lapic_is_integrated()) /* 82489DX */
value |= APIC_LVT_LEVEL_TRIGGER;
+ if (apic_extnmi == APIC_EXTNMI_NONE)
+ value |= APIC_LVT_MASKED;
apic_write(APIC_LVT1, value);
}
@@ -1378,9 +1386,11 @@ void setup_local_APIC(void)
apic_write(APIC_LVT0, value);
/*
- * only the BP should see the LINT1 NMI signal, obviously.
+ * Only the BSP sees the LINT1 NMI signal by default. This can be
+ * modified by apic_extnmi= boot option.
*/
- if (!cpu)
+ if ((!cpu && apic_extnmi != APIC_EXTNMI_NONE) ||
+ apic_extnmi == APIC_EXTNMI_ALL)
value = APIC_DM_NMI;
else
value = APIC_DM_NMI | APIC_LVT_MASKED;
@@ -2270,6 +2280,7 @@ static struct {
unsigned int apic_tmict;
unsigned int apic_tdcr;
unsigned int apic_thmr;
+ unsigned int apic_cmci;
} apic_pm_state;
static int lapic_suspend(void)
@@ -2299,6 +2310,10 @@ static int lapic_suspend(void)
if (maxlvt >= 5)
apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
#endif
+#ifdef CONFIG_X86_MCE_INTEL
+ if (maxlvt >= 6)
+ apic_pm_state.apic_cmci = apic_read(APIC_LVTCMCI);
+#endif
local_irq_save(flags);
disable_local_APIC();
@@ -2355,10 +2370,14 @@ static void lapic_resume(void)
apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-#if defined(CONFIG_X86_MCE_INTEL)
+#ifdef CONFIG_X86_THERMAL_VECTOR
if (maxlvt >= 5)
apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
#endif
+#ifdef CONFIG_X86_MCE_INTEL
+ if (maxlvt >= 6)
+ apic_write(APIC_LVTCMCI, apic_pm_state.apic_cmci);
+#endif
if (maxlvt >= 4)
apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
@@ -2548,3 +2567,23 @@ static int __init apic_set_disabled_cpu_apicid(char *arg)
return 0;
}
early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid);
+
+static int __init apic_set_extnmi(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strncmp("all", arg, 3))
+ apic_extnmi = APIC_EXTNMI_ALL;
+ else if (!strncmp("none", arg, 4))
+ apic_extnmi = APIC_EXTNMI_NONE;
+ else if (!strncmp("bsp", arg, 3))
+ apic_extnmi = APIC_EXTNMI_BSP;
+ else {
+ pr_warn("Unknown external NMI delivery mode `%s' ignored\n", arg);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+early_param("apic_extnmi", apic_set_extnmi);
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index f92ab36979a2..9968f30cca3e 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -185,6 +185,7 @@ static struct apic apic_flat = {
.cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
+ .send_IPI = default_send_IPI_single,
.send_IPI_mask = flat_send_IPI_mask,
.send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
.send_IPI_allbutself = flat_send_IPI_allbutself,
@@ -230,17 +231,6 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return 0;
}
-static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
-{
- default_send_IPI_mask_sequence_phys(cpumask, vector);
-}
-
-static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
- int vector)
-{
- default_send_IPI_mask_allbutself_phys(cpumask, vector);
-}
-
static void physflat_send_IPI_allbutself(int vector)
{
default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
@@ -248,7 +238,7 @@ static void physflat_send_IPI_allbutself(int vector)
static void physflat_send_IPI_all(int vector)
{
- physflat_send_IPI_mask(cpu_online_mask, vector);
+ default_send_IPI_mask_sequence_phys(cpu_online_mask, vector);
}
static int physflat_probe(void)
@@ -292,8 +282,9 @@ static struct apic apic_physflat = {
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
- .send_IPI_mask = physflat_send_IPI_mask,
- .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
+ .send_IPI = default_send_IPI_single_phys,
+ .send_IPI_mask = default_send_IPI_mask_sequence_phys,
+ .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_phys,
.send_IPI_allbutself = physflat_send_IPI_allbutself,
.send_IPI_all = physflat_send_IPI_all,
.send_IPI_self = apic_send_IPI_self,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 0d96749cfcac..331a7a07c48f 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -30,6 +30,7 @@
#include <asm/e820.h>
static void noop_init_apic_ldr(void) { }
+static void noop_send_IPI(int cpu, int vector) { }
static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
static void noop_send_IPI_allbutself(int vector) { }
@@ -144,6 +145,7 @@ struct apic apic_noop = {
.cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
+ .send_IPI = noop_send_IPI,
.send_IPI_mask = noop_send_IPI_mask,
.send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
.send_IPI_allbutself = noop_send_IPI_allbutself,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 38dd5efdd04c..c80c02c6ec49 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -193,20 +193,17 @@ static int __init numachip_system_init(void)
case 1:
init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
numachip_apic_icr_write = numachip1_apic_icr_write;
- x86_init.pci.arch_init = pci_numachip_init;
break;
case 2:
init_extra_mapping_uc(NUMACHIP2_LCSR_BASE, NUMACHIP2_LCSR_SIZE);
numachip_apic_icr_write = numachip2_apic_icr_write;
-
- /* Use MCFG config cycles rather than locked CF8 cycles */
- raw_pci_ops = &pci_mmcfg;
break;
default:
return 0;
}
x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
+ x86_init.pci.arch_init = pci_numachip_init;
return 0;
}
@@ -276,6 +273,7 @@ static const struct apic apic_numachip1 __refconst = {
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .send_IPI = numachip_send_IPI_one,
.send_IPI_mask = numachip_send_IPI_mask,
.send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
.send_IPI_allbutself = numachip_send_IPI_allbutself,
@@ -327,6 +325,7 @@ static const struct apic apic_numachip2 __refconst = {
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .send_IPI = numachip_send_IPI_one,
.send_IPI_mask = numachip_send_IPI_mask,
.send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
.send_IPI_allbutself = numachip_send_IPI_allbutself,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 971cf8875939..cf9bd896c12d 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -96,11 +96,6 @@ static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
return cpuid_apic >> index_msb;
}
-static inline void bigsmp_send_IPI_mask(const struct cpumask *mask, int vector)
-{
- default_send_IPI_mask_sequence_phys(mask, vector);
-}
-
static void bigsmp_send_IPI_allbutself(int vector)
{
default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
@@ -108,7 +103,7 @@ static void bigsmp_send_IPI_allbutself(int vector)
static void bigsmp_send_IPI_all(int vector)
{
- bigsmp_send_IPI_mask(cpu_online_mask, vector);
+ default_send_IPI_mask_sequence_phys(cpu_online_mask, vector);
}
static int dmi_bigsmp; /* can be set by dmi scanners */
@@ -180,7 +175,8 @@ static struct apic apic_bigsmp = {
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
- .send_IPI_mask = bigsmp_send_IPI_mask,
+ .send_IPI = default_send_IPI_single_phys,
+ .send_IPI_mask = default_send_IPI_mask_sequence_phys,
.send_IPI_mask_allbutself = NULL,
.send_IPI_allbutself = bigsmp_send_IPI_allbutself,
.send_IPI_all = bigsmp_send_IPI_all,
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 62071569bd50..eb45fc9b6124 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -18,6 +18,16 @@
#include <asm/proto.h>
#include <asm/ipi.h>
+void default_send_IPI_single_phys(int cpu, int vector)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, cpu),
+ vector, APIC_DEST_PHYSICAL);
+ local_irq_restore(flags);
+}
+
void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector)
{
unsigned long query_cpu;
@@ -55,6 +65,14 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
local_irq_restore(flags);
}
+/*
+ * Helper function for APICs which insist on cpumasks
+ */
+void default_send_IPI_single(int cpu, int vector)
+{
+ apic->send_IPI_mask(cpumask_of(cpu), vector);
+}
+
#ifdef CONFIG_X86_32
void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 5f1feb6854af..ade25320df96 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -96,8 +96,8 @@ static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info,
return arg->msi_hwirq;
}
-static int pci_msi_prepare(struct irq_domain *domain, struct device *dev,
- int nvec, msi_alloc_info_t *arg)
+int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
+ msi_alloc_info_t *arg)
{
struct pci_dev *pdev = to_pci_dev(dev);
struct msi_desc *desc = first_pci_msi_entry(pdev);
@@ -113,11 +113,13 @@ static int pci_msi_prepare(struct irq_domain *domain, struct device *dev,
return 0;
}
+EXPORT_SYMBOL_GPL(pci_msi_prepare);
-static void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
+void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
{
arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc);
}
+EXPORT_SYMBOL_GPL(pci_msi_set_desc);
static struct msi_domain_ops pci_msi_domain_ops = {
.get_hwirq = pci_msi_get_hwirq,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 7694ae6c1199..f316e34abb42 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -105,6 +105,7 @@ static struct apic apic_default = {
.cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
+ .send_IPI = default_send_IPI_single,
.send_IPI_mask = default_send_IPI_mask_logical,
.send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical,
.send_IPI_allbutself = default_send_IPI_allbutself,
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 861bc59c8f25..908cb37da171 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -29,6 +29,7 @@ struct apic_chip_data {
};
struct irq_domain *x86_vector_domain;
+EXPORT_SYMBOL_GPL(x86_vector_domain);
static DEFINE_RAW_SPINLOCK(vector_lock);
static cpumask_var_t vector_cpumask;
static struct irq_chip lapic_controller;
@@ -66,6 +67,7 @@ struct irq_cfg *irqd_cfg(struct irq_data *irq_data)
return data ? &data->cfg : NULL;
}
+EXPORT_SYMBOL_GPL(irqd_cfg);
struct irq_cfg *irq_cfg(unsigned int irq)
{
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index cc8311c4d298..aca8b75c1552 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -23,6 +23,14 @@ static inline u32 x2apic_cluster(int cpu)
return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16;
}
+static void x2apic_send_IPI(int cpu, int vector)
+{
+ u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
+
+ x2apic_wrmsr_fence();
+ __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
+}
+
static void
__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
{
@@ -266,6 +274,7 @@ static struct apic apic_x2apic_cluster = {
.cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
+ .send_IPI = x2apic_send_IPI,
.send_IPI_mask = x2apic_send_IPI_mask,
.send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
.send_IPI_allbutself = x2apic_send_IPI_allbutself,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 662e9150ea6f..a1242e2c12e6 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -36,6 +36,14 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return x2apic_enabled() && (x2apic_phys || x2apic_fadt_phys());
}
+static void x2apic_send_IPI(int cpu, int vector)
+{
+ u32 dest = per_cpu(x86_cpu_to_apicid, cpu);
+
+ x2apic_wrmsr_fence();
+ __x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL);
+}
+
static void
__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
{
@@ -122,6 +130,7 @@ static struct apic apic_x2apic_phys = {
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .send_IPI = x2apic_send_IPI,
.send_IPI_mask = x2apic_send_IPI_mask,
.send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
.send_IPI_allbutself = x2apic_send_IPI_allbutself,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 4a139465f1d4..d760c6bb37b5 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -406,6 +406,7 @@ static struct apic __refdata apic_x2apic_uv_x = {
.cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
+ .send_IPI = uv_send_IPI_one,
.send_IPI_mask = uv_send_IPI_mask,
.send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
.send_IPI_allbutself = uv_send_IPI_allbutself,
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 439df975bc7a..84a7524b202c 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -65,9 +65,6 @@ void common(void) {
OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
-#ifdef CONFIG_X86_32
- OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
-#endif
OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
#endif
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index d8f42f902a0f..f2edafb5f24e 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -23,7 +23,6 @@ int main(void)
{
#ifdef CONFIG_PARAVIRT
OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
- OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32);
OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
BLANK();
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a8816b325162..a07956a08936 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -304,7 +304,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
int cpu = smp_processor_id();
/* get information required for multi-node processors */
- if (cpu_has_topoext) {
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
u32 eax, ebx, ecx, edx;
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
@@ -434,8 +434,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
*/
int ht_nodeid = c->initial_apicid;
- if (ht_nodeid >= 0 &&
- __apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+ if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
node = __apicid_to_node[ht_nodeid];
/* Pick a nearby node */
if (!node_online(node))
@@ -678,9 +677,9 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
* Disable it on the affected CPUs.
*/
if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
- if (!rdmsrl_safe(0xc0011021, &value) && !(value & 0x1E)) {
+ if (!rdmsrl_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) {
value |= 0x1E;
- wrmsrl_safe(0xc0011021, value);
+ wrmsrl_safe(MSR_F15H_IC_CFG, value);
}
}
}
@@ -922,7 +921,7 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
void set_dr_addr_mask(unsigned long mask, int dr)
{
- if (!cpu_has_bpext)
+ if (!boot_cpu_has(X86_FEATURE_BPEXT))
return;
switch (dr) {
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index d8fba5c15fbd..ae20be6e483c 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -43,7 +43,7 @@ static void init_c3(struct cpuinfo_x86 *c)
/* store Centaur Extended Feature Flags as
* word 5 of the CPU capability bit array
*/
- c->x86_capability[5] = cpuid_edx(0xC0000001);
+ c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001);
}
#ifdef CONFIG_X86_32
/* Cyrix III family needs CX8 & PGE explicitly enabled. */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c2b7522cbf35..37830de8f60a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -581,14 +581,9 @@ void cpu_detect(struct cpuinfo_x86 *c)
u32 junk, tfms, cap0, misc;
cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
- c->x86 = (tfms >> 8) & 0xf;
- c->x86_model = (tfms >> 4) & 0xf;
- c->x86_mask = tfms & 0xf;
-
- if (c->x86 == 0xf)
- c->x86 += (tfms >> 20) & 0xff;
- if (c->x86 >= 0x6)
- c->x86_model += ((tfms >> 16) & 0xf) << 4;
+ c->x86 = x86_family(tfms);
+ c->x86_model = x86_model(tfms);
+ c->x86_mask = x86_stepping(tfms);
if (cap0 & (1<<19)) {
c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
@@ -599,50 +594,47 @@ void cpu_detect(struct cpuinfo_x86 *c)
void get_cpu_cap(struct cpuinfo_x86 *c)
{
- u32 tfms, xlvl;
- u32 ebx;
+ u32 eax, ebx, ecx, edx;
/* Intel-defined flags: level 0x00000001 */
if (c->cpuid_level >= 0x00000001) {
- u32 capability, excap;
+ cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
- cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
- c->x86_capability[0] = capability;
- c->x86_capability[4] = excap;
+ c->x86_capability[CPUID_1_ECX] = ecx;
+ c->x86_capability[CPUID_1_EDX] = edx;
}
/* Additional Intel-defined flags: level 0x00000007 */
if (c->cpuid_level >= 0x00000007) {
- u32 eax, ebx, ecx, edx;
-
cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
- c->x86_capability[9] = ebx;
+ c->x86_capability[CPUID_7_0_EBX] = ebx;
+
+ c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006);
}
/* Extended state features: level 0x0000000d */
if (c->cpuid_level >= 0x0000000d) {
- u32 eax, ebx, ecx, edx;
-
cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx);
- c->x86_capability[10] = eax;
+ c->x86_capability[CPUID_D_1_EAX] = eax;
}
/* Additional Intel-defined flags: level 0x0000000F */
if (c->cpuid_level >= 0x0000000F) {
- u32 eax, ebx, ecx, edx;
/* QoS sub-leaf, EAX=0Fh, ECX=0 */
cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx);
- c->x86_capability[11] = edx;
+ c->x86_capability[CPUID_F_0_EDX] = edx;
+
if (cpu_has(c, X86_FEATURE_CQM_LLC)) {
/* will be overridden if occupancy monitoring exists */
c->x86_cache_max_rmid = ebx;
/* QoS sub-leaf, EAX=0Fh, ECX=1 */
cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
- c->x86_capability[12] = edx;
+ c->x86_capability[CPUID_F_1_EDX] = edx;
+
if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) {
c->x86_cache_max_rmid = ecx;
c->x86_cache_occ_scale = ebx;
@@ -654,22 +646,24 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
}
/* AMD-defined flags: level 0x80000001 */
- xlvl = cpuid_eax(0x80000000);
- c->extended_cpuid_level = xlvl;
+ eax = cpuid_eax(0x80000000);
+ c->extended_cpuid_level = eax;
- if ((xlvl & 0xffff0000) == 0x80000000) {
- if (xlvl >= 0x80000001) {
- c->x86_capability[1] = cpuid_edx(0x80000001);
- c->x86_capability[6] = cpuid_ecx(0x80000001);
+ if ((eax & 0xffff0000) == 0x80000000) {
+ if (eax >= 0x80000001) {
+ cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+
+ c->x86_capability[CPUID_8000_0001_ECX] = ecx;
+ c->x86_capability[CPUID_8000_0001_EDX] = edx;
}
}
if (c->extended_cpuid_level >= 0x80000008) {
- u32 eax = cpuid_eax(0x80000008);
+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
c->x86_virt_bits = (eax >> 8) & 0xff;
c->x86_phys_bits = eax & 0xff;
- c->x86_capability[13] = cpuid_ebx(0x80000008);
+ c->x86_capability[CPUID_8000_0008_EBX] = ebx;
}
#ifdef CONFIG_X86_32
else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
@@ -679,6 +673,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
if (c->extended_cpuid_level >= 0x80000007)
c->x86_power = cpuid_edx(0x80000007);
+ if (c->extended_cpuid_level >= 0x8000000a)
+ c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
+
init_scattered_cpuid_features(c);
}
@@ -1185,7 +1182,7 @@ void syscall_init(void)
* They both write to the same internal register. STAR allows to
* set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
*/
- wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
+ wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
#ifdef CONFIG_IA32_EMULATION
@@ -1443,7 +1440,9 @@ void cpu_init(void)
printk(KERN_INFO "Initializing CPU#%d\n", cpu);
- if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || cpu_has_de)
+ if (cpu_feature_enabled(X86_FEATURE_VME) ||
+ cpu_has_tsc ||
+ boot_cpu_has(X86_FEATURE_DE))
cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
load_current_idt();
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 209ac1e7d1f0..565648bc1a0a 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -445,7 +445,8 @@ static void init_intel(struct cpuinfo_x86 *c)
if (cpu_has_xmm2)
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
- if (cpu_has_ds) {
+
+ if (boot_cpu_has(X86_FEATURE_DS)) {
unsigned int l1;
rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
if (!(l1 & (1<<11)))
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index e38d338a6447..0b6c52388cf4 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -591,7 +591,7 @@ cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf)
unsigned edx;
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
- if (cpu_has_topoext)
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT))
cpuid_count(0x8000001d, index, &eax.full,
&ebx.full, &ecx.full, &edx);
else
@@ -637,7 +637,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c)
void init_amd_cacheinfo(struct cpuinfo_x86 *c)
{
- if (cpu_has_topoext) {
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
num_cache_leaves = find_num_cache_leaves(c);
} else if (c->extended_cpuid_level >= 0x80000006) {
if (cpuid_edx(0x80000006) & 0xf000)
@@ -809,7 +809,7 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
struct cacheinfo *this_leaf;
int i, sibling;
- if (cpu_has_topoext) {
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
unsigned int apicid, nshared, first, last;
this_leaf = this_cpu_ci->info_list + index;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index c5b0d562dbf5..a006f4cd792b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -114,7 +114,6 @@ static struct work_struct mce_work;
static struct irq_work mce_irq_work;
static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
-static int mce_usable_address(struct mce *m);
/*
* CPU/chipset specific EDAC code can register a notifier call here to print
@@ -475,6 +474,28 @@ static void mce_report_event(struct pt_regs *regs)
irq_work_queue(&mce_irq_work);
}
+/*
+ * Check if the address reported by the CPU is in a format we can parse.
+ * It would be possible to add code for most other cases, but all would
+ * be somewhat complicated (e.g. segment offset would require an instruction
+ * parser). So only support physical addresses up to page granuality for now.
+ */
+static int mce_usable_address(struct mce *m)
+{
+ if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
+ return 0;
+
+ /* Checks after this one are Intel-specific: */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 1;
+
+ if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
+ return 0;
+ if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
+ return 0;
+ return 1;
+}
+
static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
@@ -484,7 +505,7 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
if (!mce)
return NOTIFY_DONE;
- if (mce->usable_addr && (mce->severity == MCE_AO_SEVERITY)) {
+ if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
pfn = mce->addr >> PAGE_SHIFT;
memory_failure(pfn, MCE_VECTOR, 0);
}
@@ -522,10 +543,10 @@ static bool memory_error(struct mce *m)
struct cpuinfo_x86 *c = &boot_cpu_data;
if (c->x86_vendor == X86_VENDOR_AMD) {
- /*
- * coming soon
- */
- return false;
+ /* ErrCodeExt[20:16] */
+ u8 xec = (m->status >> 16) & 0x1f;
+
+ return (xec == 0x0 || xec == 0x8);
} else if (c->x86_vendor == X86_VENDOR_INTEL) {
/*
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
@@ -567,7 +588,7 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
*/
bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
- bool error_logged = false;
+ bool error_seen = false;
struct mce m;
int severity;
int i;
@@ -601,6 +622,8 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
(m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
continue;
+ error_seen = true;
+
mce_read_aux(&m, i);
if (!(flags & MCP_TIMESTAMP))
@@ -608,27 +631,24 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
- /*
- * In the cases where we don't have a valid address after all,
- * do not add it into the ring buffer.
- */
- if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) {
- if (m.status & MCI_STATUS_ADDRV) {
+ if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m))
+ if (m.status & MCI_STATUS_ADDRV)
m.severity = severity;
- m.usable_addr = mce_usable_address(&m);
-
- if (!mce_gen_pool_add(&m))
- mce_schedule_work();
- }
- }
/*
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
*/
- if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) {
- error_logged = true;
+ if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
mce_log(&m);
+ else if (mce_usable_address(&m)) {
+ /*
+ * Although we skipped logging this, we still want
+ * to take action. Add to the pool so the registered
+ * notifiers will see it.
+ */
+ if (!mce_gen_pool_add(&m))
+ mce_schedule_work();
}
/*
@@ -644,7 +664,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
sync_core();
- return error_logged;
+ return error_seen;
}
EXPORT_SYMBOL_GPL(machine_check_poll);
@@ -931,23 +951,6 @@ reset:
return ret;
}
-/*
- * Check if the address reported by the CPU is in a format we can parse.
- * It would be possible to add code for most other cases, but all would
- * be somewhat complicated (e.g. segment offset would require an instruction
- * parser). So only support physical addresses up to page granuality for now.
- */
-static int mce_usable_address(struct mce *m)
-{
- if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
- return 0;
- if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
- return 0;
- if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
- return 0;
- return 1;
-}
-
static void mce_clear_state(unsigned long *toclear)
{
int i;
@@ -999,6 +1002,17 @@ void do_machine_check(struct pt_regs *regs, long error_code)
int flags = MF_ACTION_REQUIRED;
int lmce = 0;
+ /* If this CPU is offline, just bail out. */
+ if (cpu_is_offline(smp_processor_id())) {
+ u64 mcgstatus;
+
+ mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ if (mcgstatus & MCG_STATUS_RIPV) {
+ mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+ return;
+ }
+ }
+
ist_enter(regs);
this_cpu_inc(mce_exception_count);
@@ -1089,7 +1103,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
/* assuming valid severity level != 0 */
m.severity = severity;
- m.usable_addr = mce_usable_address(&m);
mce_log(&m);
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index b3e94ef461fd..faec7120c508 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -129,8 +129,8 @@ void __init load_ucode_bsp(void)
if (!have_cpuid_p())
return;
- vendor = x86_vendor();
- family = x86_family();
+ vendor = x86_cpuid_vendor();
+ family = x86_cpuid_family();
switch (vendor) {
case X86_VENDOR_INTEL:
@@ -165,8 +165,8 @@ void load_ucode_ap(void)
if (!have_cpuid_p())
return;
- vendor = x86_vendor();
- family = x86_family();
+ vendor = x86_cpuid_vendor();
+ family = x86_cpuid_family();
switch (vendor) {
case X86_VENDOR_INTEL:
@@ -206,8 +206,8 @@ void reload_early_microcode(void)
{
int vendor, family;
- vendor = x86_vendor();
- family = x86_family();
+ vendor = x86_cpuid_vendor();
+ family = x86_cpuid_family();
switch (vendor) {
case X86_VENDOR_INTEL:
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index ce47402eb2f9..ee81c544ee0d 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -145,10 +145,10 @@ matching_model_microcode(struct microcode_header_intel *mc_header,
int ext_sigcount, i;
struct extended_signature *ext_sig;
- fam = __x86_family(sig);
+ fam = x86_family(sig);
model = x86_model(sig);
- fam_ucode = __x86_family(mc_header->sig);
+ fam_ucode = x86_family(mc_header->sig);
model_ucode = x86_model(mc_header->sig);
if (fam == fam_ucode && model == model_ucode)
@@ -163,7 +163,7 @@ matching_model_microcode(struct microcode_header_intel *mc_header,
ext_sigcount = ext_header->count;
for (i = 0; i < ext_sigcount; i++) {
- fam_ucode = __x86_family(ext_sig->sig);
+ fam_ucode = x86_family(ext_sig->sig);
model_ucode = x86_model(ext_sig->sig);
if (fam == fam_ucode && model == model_ucode)
@@ -365,7 +365,7 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
native_cpuid(&eax, &ebx, &ecx, &edx);
csig.sig = eax;
- family = __x86_family(csig.sig);
+ family = x86_family(csig.sig);
model = x86_model(csig.sig);
if ((model >= 5) || (family > 6)) {
@@ -521,16 +521,12 @@ static bool __init load_builtin_intel_microcode(struct cpio_data *cp)
{
#ifdef CONFIG_X86_64
unsigned int eax = 0x00000001, ebx, ecx = 0, edx;
- unsigned int family, model, stepping;
char name[30];
native_cpuid(&eax, &ebx, &ecx, &edx);
- family = __x86_family(eax);
- model = x86_model(eax);
- stepping = eax & 0xf;
-
- sprintf(name, "intel-ucode/%02x-%02x-%02x", family, model, stepping);
+ sprintf(name, "intel-ucode/%02x-%02x-%02x",
+ x86_family(eax), x86_model(eax), x86_stepping(eax));
return get_builtin_firmware(cp, name);
#else
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 70d7c93f4550..0d98503c2245 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -593,9 +593,16 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
unsigned long x_remove_base,
unsigned long x_remove_size, int i)
{
- static struct range range_new[RANGE_NUM];
+ /*
+ * range_new should really be an automatic variable, but
+ * putting 4096 bytes on the stack is frowned upon, to put it
+ * mildly. It is safe to make it a static __initdata variable,
+ * since mtrr_calc_range_state is only called during init and
+ * there's no way it will call itself recursively.
+ */
+ static struct range range_new[RANGE_NUM] __initdata;
unsigned long range_sums_new;
- static int nr_range_new;
+ int nr_range_new;
int num_reg;
/* Convert ranges to var ranges state: */
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 3b533cf37c74..c870af161008 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -349,7 +349,7 @@ static void get_fixed_ranges(mtrr_type *frs)
void mtrr_save_fixed_ranges(void *info)
{
- if (cpu_has_mtrr)
+ if (boot_cpu_has(X86_FEATURE_MTRR))
get_fixed_ranges(mtrr_state.fixed_ranges);
}
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index f891b4750f04..5c3d149ee91c 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -682,7 +682,7 @@ void __init mtrr_bp_init(void)
phys_addr = 32;
- if (cpu_has_mtrr) {
+ if (boot_cpu_has(X86_FEATURE_MTRR)) {
mtrr_if = &generic_mtrr_ops;
size_or_mask = SIZE_OR_MASK_BITS(36);
size_and_mask = 0x00f00000;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2bf79d7c97df..1b443db2db50 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -482,6 +482,9 @@ int x86_pmu_hw_config(struct perf_event *event)
/* Support for IP fixup */
if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
precise++;
+
+ if (x86_pmu.pebs_prec_dist)
+ precise++;
}
if (event->attr.precise_ip > precise)
@@ -1531,6 +1534,7 @@ static void __init filter_events(struct attribute **attrs)
{
struct device_attribute *d;
struct perf_pmu_events_attr *pmu_attr;
+ int offset = 0;
int i, j;
for (i = 0; attrs[i]; i++) {
@@ -1539,7 +1543,7 @@ static void __init filter_events(struct attribute **attrs)
/* str trumps id */
if (pmu_attr->event_str)
continue;
- if (x86_pmu.event_map(i))
+ if (x86_pmu.event_map(i + offset))
continue;
for (j = i; attrs[j]; j++)
@@ -1547,6 +1551,14 @@ static void __init filter_events(struct attribute **attrs)
/* Check the shifted attr. */
i--;
+
+ /*
+ * event_map() is index based, the attrs array is organized
+ * by increasing event index. If we shift the events, then
+ * we need to compensate for the event_map(), otherwise
+ * we are looking up the wrong event in the map
+ */
+ offset++;
}
}
@@ -2250,12 +2262,19 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
ss_base = get_segment_base(regs->ss);
fp = compat_ptr(ss_base + regs->bp);
+ pagefault_disable();
while (entry->nr < PERF_MAX_STACK_DEPTH) {
unsigned long bytes;
frame.next_frame = 0;
frame.return_address = 0;
- bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
+ if (!access_ok(VERIFY_READ, fp, 8))
+ break;
+
+ bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4);
+ if (bytes != 0)
+ break;
+ bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4);
if (bytes != 0)
break;
@@ -2265,6 +2284,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
perf_callchain_store(entry, cs_base + frame.return_address);
fp = compat_ptr(ss_base + frame.next_frame);
}
+ pagefault_enable();
return 1;
}
#else
@@ -2302,12 +2322,19 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
if (perf_callchain_user32(regs, entry))
return;
+ pagefault_disable();
while (entry->nr < PERF_MAX_STACK_DEPTH) {
unsigned long bytes;
frame.next_frame = NULL;
frame.return_address = 0;
- bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
+ if (!access_ok(VERIFY_READ, fp, 16))
+ break;
+
+ bytes = __copy_from_user_nmi(&frame.next_frame, fp, 8);
+ if (bytes != 0)
+ break;
+ bytes = __copy_from_user_nmi(&frame.return_address, fp+8, 8);
if (bytes != 0)
break;
@@ -2315,8 +2342,9 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
break;
perf_callchain_store(entry, frame.return_address);
- fp = frame.next_frame;
+ fp = (void __user *)frame.next_frame;
}
+ pagefault_enable();
}
/*
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index d0e35ebb2adb..7bb61e32fb29 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -14,17 +14,7 @@
#include <linux/perf_event.h>
-#if 0
-#undef wrmsrl
-#define wrmsrl(msr, val) \
-do { \
- unsigned int _msr = (msr); \
- u64 _val = (val); \
- trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr), \
- (unsigned long long)(_val)); \
- native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32)); \
-} while (0)
-#endif
+/* To enable MSR tracing please use the generic trace points. */
/*
* | NHM/WSM | SNB |
@@ -318,6 +308,10 @@ struct cpu_hw_events {
#define INTEL_UEVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
+/* Constraint on specific umask bit only + event */
+#define INTEL_UBIT_EVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|(c))
+
/* Like UEVENT_CONSTRAINT, but match flags too */
#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
@@ -589,7 +583,8 @@ struct x86_pmu {
bts_active :1,
pebs :1,
pebs_active :1,
- pebs_broken :1;
+ pebs_broken :1,
+ pebs_prec_dist :1;
int pebs_record_size;
void (*drain_pebs)(struct pt_regs *regs);
struct event_constraint *pebs_constraints;
@@ -907,6 +902,8 @@ void intel_pmu_lbr_init_hsw(void);
void intel_pmu_lbr_init_skl(void);
+void intel_pmu_lbr_init_knl(void);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
void intel_pt_interrupt(void);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 1cee5d2d7ece..58610539b048 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -18,7 +18,7 @@ static __initconst const u64 amd_hw_cache_event_ids
[ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */
},
[ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
+ [ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0,
},
[ C(OP_PREFETCH) ] = {
@@ -160,7 +160,7 @@ static inline int amd_pmu_addr_offset(int index, bool eventsel)
if (offset)
return offset;
- if (!cpu_has_perfctr_core)
+ if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
offset = index;
else
offset = index << 1;
@@ -652,7 +652,7 @@ static __initconst const struct x86_pmu amd_pmu = {
static int __init amd_core_pmu_init(void)
{
- if (!cpu_has_perfctr_core)
+ if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
return 0;
switch (boot_cpu_data.x86) {
diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
index cc6cedb8f25d..49742746a6c9 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
@@ -523,10 +523,10 @@ static int __init amd_uncore_init(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
goto fail_nodev;
- if (!cpu_has_topoext)
+ if (!boot_cpu_has(X86_FEATURE_TOPOEXT))
goto fail_nodev;
- if (cpu_has_perfctr_nb) {
+ if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) {
amd_uncore_nb = alloc_percpu(struct amd_uncore *);
if (!amd_uncore_nb) {
ret = -ENOMEM;
@@ -540,7 +540,7 @@ static int __init amd_uncore_init(void)
ret = 0;
}
- if (cpu_has_perfctr_l2) {
+ if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) {
amd_uncore_l2 = alloc_percpu(struct amd_uncore *);
if (!amd_uncore_l2) {
ret = -ENOMEM;
@@ -583,10 +583,11 @@ fail_online:
/* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */
amd_uncore_nb = amd_uncore_l2 = NULL;
- if (cpu_has_perfctr_l2)
+
+ if (boot_cpu_has(X86_FEATURE_PERFCTR_L2))
perf_pmu_unregister(&amd_l2_pmu);
fail_l2:
- if (cpu_has_perfctr_nb)
+ if (boot_cpu_has(X86_FEATURE_PERFCTR_NB))
perf_pmu_unregister(&amd_nb_pmu);
if (amd_uncore_l2)
free_percpu(amd_uncore_l2);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index e2a430021e46..a667078a5180 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -185,6 +185,14 @@ struct event_constraint intel_skl_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+static struct extra_reg intel_knl_extra_regs[] __read_mostly = {
+ INTEL_UEVENT_EXTRA_REG(0x01b7,
+ MSR_OFFCORE_RSP_0, 0x7f9ffbffffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x02b7,
+ MSR_OFFCORE_RSP_1, 0x3f9ffbffffull, RSP_1),
+ EVENT_EXTRA_END
+};
+
static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
@@ -255,7 +263,7 @@ struct event_constraint intel_bdw_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
- INTEL_UEVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */
+ INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */
EVENT_CONSTRAINT_END
};
@@ -1457,6 +1465,42 @@ static __initconst const u64 slm_hw_cache_event_ids
},
};
+#define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */
+#define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */
+#define KNL_MCDRAM_LOCAL BIT_ULL(21)
+#define KNL_MCDRAM_FAR BIT_ULL(22)
+#define KNL_DDR_LOCAL BIT_ULL(23)
+#define KNL_DDR_FAR BIT_ULL(24)
+#define KNL_DRAM_ANY (KNL_MCDRAM_LOCAL | KNL_MCDRAM_FAR | \
+ KNL_DDR_LOCAL | KNL_DDR_FAR)
+#define KNL_L2_READ SLM_DMND_READ
+#define KNL_L2_WRITE SLM_DMND_WRITE
+#define KNL_L2_PREFETCH SLM_DMND_PREFETCH
+#define KNL_L2_ACCESS SLM_LLC_ACCESS
+#define KNL_L2_MISS (KNL_OT_L2_HITE | KNL_OT_L2_HITF | \
+ KNL_DRAM_ANY | SNB_SNP_ANY | \
+ SNB_NON_DRAM)
+
+static __initconst const u64 knl_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+ [C(LL)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = KNL_L2_READ | KNL_L2_ACCESS,
+ [C(RESULT_MISS)] = 0,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = KNL_L2_WRITE | KNL_L2_ACCESS,
+ [C(RESULT_MISS)] = KNL_L2_WRITE | KNL_L2_MISS,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = KNL_L2_PREFETCH | KNL_L2_ACCESS,
+ [C(RESULT_MISS)] = KNL_L2_PREFETCH | KNL_L2_MISS,
+ },
+ },
+};
+
/*
* Use from PMIs where the LBRs are already disabled.
*/
@@ -2475,6 +2519,44 @@ static void intel_pebs_aliases_snb(struct perf_event *event)
}
}
+static void intel_pebs_aliases_precdist(struct perf_event *event)
+{
+ if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+ /*
+ * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+ * (0x003c) so that we can use it with PEBS.
+ *
+ * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+ * PEBS capable. However we can use INST_RETIRED.PREC_DIST
+ * (0x01c0), which is a PEBS capable event, to get the same
+ * count.
+ *
+ * The PREC_DIST event has special support to minimize sample
+ * shadowing effects. One drawback is that it can be
+ * only programmed on counter 1, but that seems like an
+ * acceptable trade off.
+ */
+ u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16);
+
+ alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+ event->hw.config = alt_config;
+ }
+}
+
+static void intel_pebs_aliases_ivb(struct perf_event *event)
+{
+ if (event->attr.precise_ip < 3)
+ return intel_pebs_aliases_snb(event);
+ return intel_pebs_aliases_precdist(event);
+}
+
+static void intel_pebs_aliases_skl(struct perf_event *event)
+{
+ if (event->attr.precise_ip < 3)
+ return intel_pebs_aliases_core2(event);
+ return intel_pebs_aliases_precdist(event);
+}
+
static unsigned long intel_pmu_free_running_flags(struct perf_event *event)
{
unsigned long flags = x86_pmu.free_running_flags;
@@ -3332,6 +3414,7 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_gen_event_constraints;
x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_core2;
pr_cont("Atom events, ");
break;
@@ -3431,7 +3514,8 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_ivb_event_constraints;
x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
- x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+ x86_pmu.pebs_prec_dist = true;
if (boot_cpu_data.x86_model == 62)
x86_pmu.extra_regs = intel_snbep_extra_regs;
else
@@ -3464,7 +3548,8 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_hsw_event_constraints;
x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
x86_pmu.extra_regs = intel_snbep_extra_regs;
- x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+ x86_pmu.pebs_prec_dist = true;
/* all extra regs are per-cpu when HT is on */
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
@@ -3499,7 +3584,8 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_bdw_event_constraints;
x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
x86_pmu.extra_regs = intel_snbep_extra_regs;
- x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+ x86_pmu.pebs_prec_dist = true;
/* all extra regs are per-cpu when HT is on */
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
@@ -3511,6 +3597,24 @@ __init int intel_pmu_init(void)
pr_cont("Broadwell events, ");
break;
+ case 87: /* Knights Landing Xeon Phi */
+ memcpy(hw_cache_event_ids,
+ slm_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs,
+ knl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ intel_pmu_lbr_init_knl();
+
+ x86_pmu.event_constraints = intel_slm_event_constraints;
+ x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_knl_extra_regs;
+
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+ pr_cont("Knights Landing events, ");
+ break;
+
case 78: /* 14nm Skylake Mobile */
case 94: /* 14nm Skylake Desktop */
x86_pmu.late_ack = true;
@@ -3521,7 +3625,8 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_skl_event_constraints;
x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
x86_pmu.extra_regs = intel_skl_extra_regs;
- x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_skl;
+ x86_pmu.pebs_prec_dist = true;
/* all extra regs are per-cpu when HT is on */
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 5db1c7755548..10602f0a438f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -620,6 +620,8 @@ struct event_constraint intel_atom_pebs_event_constraints[] = {
INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
/* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
EVENT_CONSTRAINT_END
};
@@ -686,6 +688,8 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = {
INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
@@ -700,6 +704,8 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
@@ -718,9 +724,10 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
struct event_constraint intel_skl_pebs_event_constraints[] = {
INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
- /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
- INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
+ /* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
INTEL_PLD_CONSTRAINT(0x1cd, 0xf), /* MEM_TRANS_RETIRED.* */
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
@@ -1101,6 +1108,13 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit)
void *at;
u64 pebs_status;
+ /*
+ * fmt0 does not have a status bitfield (does not use
+ * perf_record_nhm format)
+ */
+ if (x86_pmu.intel_cap.pebs_format < 1)
+ return base;
+
if (base == NULL)
return NULL;
@@ -1186,7 +1200,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
if (!event->attr.precise_ip)
return;
- n = (top - at) / x86_pmu.pebs_record_size;
+ n = top - at;
if (n <= 0)
return;
@@ -1230,12 +1244,21 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
pebs_status = p->status & cpuc->pebs_enabled;
pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1;
+ /*
+ * On some CPUs the PEBS status can be zero when PEBS is
+ * racing with clearing of GLOBAL_STATUS.
+ *
+ * Normally we would drop that record, but in the
+ * case when there is only a single active PEBS event
+ * we can assume it's for that event.
+ */
+ if (!pebs_status && cpuc->pebs_enabled &&
+ !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1)))
+ pebs_status = cpuc->pebs_enabled;
+
bit = find_first_bit((unsigned long *)&pebs_status,
x86_pmu.max_pebs_events);
- if (WARN(bit >= x86_pmu.max_pebs_events,
- "PEBS record without PEBS event! status=%Lx pebs_enabled=%Lx active_mask=%Lx",
- (unsigned long long)p->status, (unsigned long long)cpuc->pebs_enabled,
- *(unsigned long long *)cpuc->active_mask))
+ if (bit >= x86_pmu.max_pebs_events)
continue;
/*
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 659f01e165d5..653f88d25987 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -42,6 +42,13 @@ static enum {
#define LBR_FAR_BIT 8 /* do not capture far branches */
#define LBR_CALL_STACK_BIT 9 /* enable call stack */
+/*
+ * Following bit only exists in Linux; we mask it out before writing it to
+ * the actual MSR. But it helps the constraint perf code to understand
+ * that this is a separate configuration.
+ */
+#define LBR_NO_INFO_BIT 63 /* don't read LBR_INFO. */
+
#define LBR_KERNEL (1 << LBR_KERNEL_BIT)
#define LBR_USER (1 << LBR_USER_BIT)
#define LBR_JCC (1 << LBR_JCC_BIT)
@@ -52,6 +59,7 @@ static enum {
#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT)
#define LBR_FAR (1 << LBR_FAR_BIT)
#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT)
+#define LBR_NO_INFO (1ULL << LBR_NO_INFO_BIT)
#define LBR_PLM (LBR_KERNEL | LBR_USER)
@@ -152,8 +160,8 @@ static void __intel_pmu_lbr_enable(bool pmi)
* did not change.
*/
if (cpuc->lbr_sel)
- lbr_select = cpuc->lbr_sel->config;
- if (!pmi)
+ lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask;
+ if (!pmi && cpuc->lbr_sel)
wrmsrl(MSR_LBR_SELECT, lbr_select);
rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
@@ -422,6 +430,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
*/
static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
{
+ bool need_info = false;
unsigned long mask = x86_pmu.lbr_nr - 1;
int lbr_format = x86_pmu.intel_cap.lbr_format;
u64 tos = intel_pmu_lbr_tos();
@@ -429,8 +438,11 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
int out = 0;
int num = x86_pmu.lbr_nr;
- if (cpuc->lbr_sel->config & LBR_CALL_STACK)
- num = tos;
+ if (cpuc->lbr_sel) {
+ need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO);
+ if (cpuc->lbr_sel->config & LBR_CALL_STACK)
+ num = tos;
+ }
for (i = 0; i < num; i++) {
unsigned long lbr_idx = (tos - i) & mask;
@@ -442,7 +454,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
- if (lbr_format == LBR_FORMAT_INFO) {
+ if (lbr_format == LBR_FORMAT_INFO && need_info) {
u64 info;
rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info);
@@ -590,6 +602,7 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
if (v != LBR_IGN)
mask |= v;
}
+
reg = &event->hw.branch_reg;
reg->idx = EXTRA_REG_LBR;
@@ -600,6 +613,11 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
*/
reg->config = mask ^ x86_pmu.lbr_sel_mask;
+ if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) &&
+ (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) &&
+ (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO))
+ reg->config |= LBR_NO_INFO;
+
return 0;
}
@@ -1028,3 +1046,17 @@ void __init intel_pmu_lbr_init_atom(void)
*/
pr_cont("8-deep LBR, ");
}
+
+/* Knights Landing */
+void intel_pmu_lbr_init_knl(void)
+{
+ x86_pmu.lbr_nr = 8;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = snb_lbr_sel_map;
+
+ pr_cont("8-deep LBR, ");
+}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
index 868e1194337f..c0bbd1033b7c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -27,6 +27,7 @@
#include <asm/perf_event.h>
#include <asm/insn.h>
#include <asm/io.h>
+#include <asm/intel_pt.h>
#include "perf_event.h"
#include "intel_pt.h"
@@ -1122,6 +1123,14 @@ static int pt_event_init(struct perf_event *event)
return 0;
}
+void cpu_emergency_stop_pt(void)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+
+ if (pt->handle.event)
+ pt_event_stop(pt->handle.event, PERF_EF_UPDATE);
+}
+
static __init int pt_init(void)
{
int ret, cpu, prior_warn = 0;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index ed446bdcbf31..24a351ad628d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -63,7 +63,7 @@
#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */
#define NR_RAPL_DOMAINS 0x4
-static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
+static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
"pp0-core",
"package",
"dram",
@@ -109,11 +109,11 @@ static struct kobj_attribute format_attr_##_var = \
#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
-#define RAPL_EVENT_ATTR_STR(_name, v, str) \
-static struct perf_pmu_events_attr event_attr_##v = { \
- .attr = __ATTR(_name, 0444, rapl_sysfs_show, NULL), \
- .id = 0, \
- .event_str = str, \
+#define RAPL_EVENT_ATTR_STR(_name, v, str) \
+static struct perf_pmu_events_attr event_attr_##v = { \
+ .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
+ .id = 0, \
+ .event_str = str, \
};
struct rapl_pmu {
@@ -405,19 +405,6 @@ static struct attribute_group rapl_pmu_attr_group = {
.attrs = rapl_pmu_attrs,
};
-static ssize_t rapl_sysfs_show(struct device *dev,
- struct device_attribute *attr,
- char *page)
-{
- struct perf_pmu_events_attr *pmu_attr = \
- container_of(attr, struct perf_pmu_events_attr, attr);
-
- if (pmu_attr->event_str)
- return sprintf(page, "%s", pmu_attr->event_str);
-
- return 0;
-}
-
RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 61215a69b03d..f97f8075bf04 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -884,6 +884,15 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
* each box has a different function id.
*/
pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
+ /* Knights Landing uses a common PCI device ID for multiple instances of
+ * an uncore PMU device type. There is only one entry per device type in
+ * the knl_uncore_pci_ids table inspite of multiple devices present for
+ * some device types. Hence PCI device idx would be 0 for all devices.
+ * So increment pmu pointer to point to an unused array element.
+ */
+ if (boot_cpu_data.x86_model == 87)
+ while (pmu->func_id >= 0)
+ pmu++;
if (pmu->func_id < 0)
pmu->func_id = pdev->devfn;
else
@@ -966,6 +975,7 @@ static int __init uncore_pci_init(void)
case 63: /* Haswell-EP */
ret = hswep_uncore_pci_init();
break;
+ case 79: /* BDX-EP */
case 86: /* BDX-DE */
ret = bdx_uncore_pci_init();
break;
@@ -982,6 +992,9 @@ static int __init uncore_pci_init(void)
case 61: /* Broadwell */
ret = bdw_uncore_pci_init();
break;
+ case 87: /* Knights Landing */
+ ret = knl_uncore_pci_init();
+ break;
default:
return 0;
}
@@ -1287,9 +1300,13 @@ static int __init uncore_cpu_init(void)
case 63: /* Haswell-EP */
hswep_uncore_cpu_init();
break;
+ case 79: /* BDX-EP */
case 86: /* BDX-DE */
bdx_uncore_cpu_init();
break;
+ case 87: /* Knights Landing */
+ knl_uncore_cpu_init();
+ break;
default:
return 0;
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 2f0a4a98e16b..07aa2d6bd710 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -338,6 +338,7 @@ int hsw_uncore_pci_init(void);
int bdw_uncore_pci_init(void);
void snb_uncore_cpu_init(void);
void nhm_uncore_cpu_init(void);
+int snb_pci2phy_map_init(int devid);
/* perf_event_intel_uncore_snbep.c */
int snbep_uncore_pci_init(void);
@@ -348,6 +349,8 @@ int hswep_uncore_pci_init(void);
void hswep_uncore_cpu_init(void);
int bdx_uncore_pci_init(void);
void bdx_uncore_cpu_init(void);
+int knl_uncore_pci_init(void);
+void knl_uncore_cpu_init(void);
/* perf_event_intel_uncore_nhmex.c */
void nhmex_uncore_cpu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index 845256158a10..0b934820fafd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -417,7 +417,7 @@ static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
}
}
-static int snb_pci2phy_map_init(int devid)
+int snb_pci2phy_map_init(int devid)
{
struct pci_dev *dev = NULL;
struct pci2phy_map *map;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
index f0f4fcba252e..33acb884ccf1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -209,31 +209,98 @@
#define HSWEP_PCU_MSR_PMON_BOX_CTL 0x710
#define HSWEP_PCU_MSR_PMON_BOX_FILTER 0x715
+/* KNL Ubox */
+#define KNL_U_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_U_MSR_PMON_RAW_EVENT_MASK | \
+ SNBEP_CBO_PMON_CTL_TID_EN)
+/* KNL CHA */
+#define KNL_CHA_MSR_OFFSET 0xc
+#define KNL_CHA_MSR_PMON_CTL_QOR (1 << 16)
+#define KNL_CHA_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK | \
+ KNL_CHA_MSR_PMON_CTL_QOR)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_TID 0x1ff
+#define KNL_CHA_MSR_PMON_BOX_FILTER_STATE (7 << 18)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_OP (0xfffffe2aULL << 32)
+
+/* KNL EDC/MC UCLK */
+#define KNL_UCLK_MSR_PMON_CTR0_LOW 0x400
+#define KNL_UCLK_MSR_PMON_CTL0 0x420
+#define KNL_UCLK_MSR_PMON_BOX_CTL 0x430
+#define KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW 0x44c
+#define KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL 0x454
+#define KNL_PMON_FIXED_CTL_EN 0x1
+
+/* KNL EDC */
+#define KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW 0xa00
+#define KNL_EDC0_ECLK_MSR_PMON_CTL0 0xa20
+#define KNL_EDC0_ECLK_MSR_PMON_BOX_CTL 0xa30
+#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW 0xa3c
+#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL 0xa44
+
+/* KNL MC */
+#define KNL_MC0_CH0_MSR_PMON_CTR0_LOW 0xb00
+#define KNL_MC0_CH0_MSR_PMON_CTL0 0xb20
+#define KNL_MC0_CH0_MSR_PMON_BOX_CTL 0xb30
+#define KNL_MC0_CH0_MSR_PMON_FIXED_LOW 0xb3c
+#define KNL_MC0_CH0_MSR_PMON_FIXED_CTL 0xb44
+
+/* KNL IRP */
+#define KNL_IRP_PCI_PMON_BOX_CTL 0xf0
+#define KNL_IRP_PCI_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \
+ KNL_CHA_MSR_PMON_CTL_QOR)
+/* KNL PCU */
+#define KNL_PCU_PMON_CTL_EV_SEL_MASK 0x0000007f
+#define KNL_PCU_PMON_CTL_USE_OCC_CTR (1 << 7)
+#define KNL_PCU_MSR_PMON_CTL_TRESH_MASK 0x3f000000
+#define KNL_PCU_MSR_PMON_RAW_EVENT_MASK \
+ (KNL_PCU_PMON_CTL_EV_SEL_MASK | \
+ KNL_PCU_PMON_CTL_USE_OCC_CTR | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_CBO_PMON_CTL_TID_EN | \
+ SNBEP_PMON_CTL_EV_SEL_EXT | \
+ SNBEP_PMON_CTL_INVERT | \
+ KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6");
DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
+DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7");
DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16");
DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29");
DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
+DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31");
DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0");
DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid4, filter_tid, "config1:0-8");
DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5");
DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12");
DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20");
+DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37");
DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60");
DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62");
DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61");
DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63");
@@ -315,8 +382,9 @@ static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct pe
static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
{
struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
- pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, SNBEP_PMON_BOX_CTL_INT);
+ pci_write_config_dword(pdev, box_ctl, SNBEP_PMON_BOX_CTL_INT);
}
static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
@@ -1728,6 +1796,419 @@ int ivbep_uncore_pci_init(void)
}
/* end of IvyTown uncore support */
+/* KNL uncore support */
+static struct attribute *knl_uncore_ubox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh5.attr,
+ NULL,
+};
+
+static struct attribute_group knl_uncore_ubox_format_group = {
+ .name = "format",
+ .attrs = knl_uncore_ubox_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = HSWEP_U_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_U_MSR_PMON_CTL0,
+ .event_mask = KNL_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .ops = &snbep_uncore_msr_ops,
+ .format_group = &knl_uncore_ubox_format_group,
+};
+
+static struct attribute *knl_uncore_cha_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_qor.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid4.attr,
+ &format_attr_filter_link3.attr,
+ &format_attr_filter_state4.attr,
+ &format_attr_filter_local.attr,
+ &format_attr_filter_all_op.attr,
+ &format_attr_filter_nnm.attr,
+ &format_attr_filter_opc3.attr,
+ &format_attr_filter_nc.attr,
+ &format_attr_filter_isoc.attr,
+ NULL,
+};
+
+static struct attribute_group knl_uncore_cha_format_group = {
+ .name = "format",
+ .attrs = knl_uncore_cha_formats_attr,
+};
+
+static struct event_constraint knl_uncore_cha_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x1f, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg knl_uncore_cha_extra_regs[] = {
+ SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+ SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x3d, 0xff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x4),
+ EVENT_EXTRA_END
+};
+
+static u64 knl_cha_filter_mask(int fields)
+{
+ u64 mask = 0;
+
+ if (fields & 0x1)
+ mask |= KNL_CHA_MSR_PMON_BOX_FILTER_TID;
+ if (fields & 0x2)
+ mask |= KNL_CHA_MSR_PMON_BOX_FILTER_STATE;
+ if (fields & 0x4)
+ mask |= KNL_CHA_MSR_PMON_BOX_FILTER_OP;
+ return mask;
+}
+
+static struct event_constraint *
+knl_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return __snbep_cbox_get_constraint(box, event, knl_cha_filter_mask);
+}
+
+static int knl_cha_hw_config(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct extra_reg *er;
+ int idx = 0;
+
+ for (er = knl_uncore_cha_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ idx |= er->idx;
+ }
+
+ if (idx) {
+ reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
+ KNL_CHA_MSR_OFFSET * box->pmu->pmu_idx;
+ reg1->config = event->attr.config1 & knl_cha_filter_mask(idx);
+ reg1->idx = idx;
+ }
+ return 0;
+}
+
+static void hswep_cbox_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event);
+
+static struct intel_uncore_ops knl_uncore_cha_ops = {
+ .init_box = snbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = hswep_cbox_enable_event,
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = knl_cha_hw_config,
+ .get_constraint = knl_cha_get_constraint,
+ .put_constraint = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type knl_uncore_cha = {
+ .name = "cha",
+ .num_counters = 4,
+ .num_boxes = 38,
+ .perf_ctr_bits = 48,
+ .event_ctl = HSWEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_C0_MSR_PMON_CTR0,
+ .event_mask = KNL_CHA_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = KNL_CHA_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = knl_uncore_cha_constraints,
+ .ops = &knl_uncore_cha_ops,
+ .format_group = &knl_uncore_cha_format_group,
+};
+
+static struct attribute *knl_uncore_pcu_formats_attr[] = {
+ &format_attr_event2.attr,
+ &format_attr_use_occ_ctr.attr,
+ &format_attr_occ_sel.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh6.attr,
+ &format_attr_occ_invert.attr,
+ &format_attr_occ_edge_det.attr,
+ NULL,
+};
+
+static struct attribute_group knl_uncore_pcu_format_group = {
+ .name = "format",
+ .attrs = knl_uncore_pcu_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_PCU_MSR_PMON_CTL0,
+ .event_mask = KNL_PCU_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL,
+ .ops = &snbep_uncore_msr_ops,
+ .format_group = &knl_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *knl_msr_uncores[] = {
+ &knl_uncore_ubox,
+ &knl_uncore_cha,
+ &knl_uncore_pcu,
+ NULL,
+};
+
+void knl_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = knl_msr_uncores;
+}
+
+static void knl_uncore_imc_enable_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
+
+ pci_write_config_dword(pdev, box_ctl, 0);
+}
+
+static void knl_uncore_imc_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ if ((event->attr.config & SNBEP_PMON_CTL_EV_SEL_MASK)
+ == UNCORE_FIXED_EVENT)
+ pci_write_config_dword(pdev, hwc->config_base,
+ hwc->config | KNL_PMON_FIXED_CTL_EN);
+ else
+ pci_write_config_dword(pdev, hwc->config_base,
+ hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops knl_uncore_imc_ops = {
+ .init_box = snbep_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = knl_uncore_imc_enable_box,
+ .read_counter = snbep_uncore_pci_read_counter,
+ .enable_event = knl_uncore_imc_enable_event,
+ .disable_event = snbep_uncore_pci_disable_event,
+};
+
+static struct intel_uncore_type knl_uncore_imc_uclk = {
+ .name = "imc_uclk",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = KNL_UCLK_MSR_PMON_CTR0_LOW,
+ .event_ctl = KNL_UCLK_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
+ .fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
+ .box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL,
+ .ops = &knl_uncore_imc_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_imc_dclk = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 6,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = KNL_MC0_CH0_MSR_PMON_CTR0_LOW,
+ .event_ctl = KNL_MC0_CH0_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = KNL_MC0_CH0_MSR_PMON_FIXED_LOW,
+ .fixed_ctl = KNL_MC0_CH0_MSR_PMON_FIXED_CTL,
+ .box_ctl = KNL_MC0_CH0_MSR_PMON_BOX_CTL,
+ .ops = &knl_uncore_imc_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_edc_uclk = {
+ .name = "edc_uclk",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = KNL_UCLK_MSR_PMON_CTR0_LOW,
+ .event_ctl = KNL_UCLK_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
+ .fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
+ .box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL,
+ .ops = &knl_uncore_imc_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_edc_eclk = {
+ .name = "edc_eclk",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW,
+ .event_ctl = KNL_EDC0_ECLK_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW,
+ .fixed_ctl = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL,
+ .box_ctl = KNL_EDC0_ECLK_MSR_PMON_BOX_CTL,
+ .ops = &knl_uncore_imc_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct event_constraint knl_uncore_m2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type knl_uncore_m2pcie = {
+ .name = "m2pcie",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .constraints = knl_uncore_m2pcie_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct attribute *knl_uncore_irp_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_qor.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static struct attribute_group knl_uncore_irp_format_group = {
+ .name = "format",
+ .attrs = knl_uncore_irp_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_irp = {
+ .name = "irp",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = KNL_IRP_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = KNL_IRP_PCI_PMON_BOX_CTL,
+ .ops = &snbep_uncore_pci_ops,
+ .format_group = &knl_uncore_irp_format_group,
+};
+
+enum {
+ KNL_PCI_UNCORE_MC_UCLK,
+ KNL_PCI_UNCORE_MC_DCLK,
+ KNL_PCI_UNCORE_EDC_UCLK,
+ KNL_PCI_UNCORE_EDC_ECLK,
+ KNL_PCI_UNCORE_M2PCIE,
+ KNL_PCI_UNCORE_IRP,
+};
+
+static struct intel_uncore_type *knl_pci_uncores[] = {
+ [KNL_PCI_UNCORE_MC_UCLK] = &knl_uncore_imc_uclk,
+ [KNL_PCI_UNCORE_MC_DCLK] = &knl_uncore_imc_dclk,
+ [KNL_PCI_UNCORE_EDC_UCLK] = &knl_uncore_edc_uclk,
+ [KNL_PCI_UNCORE_EDC_ECLK] = &knl_uncore_edc_eclk,
+ [KNL_PCI_UNCORE_M2PCIE] = &knl_uncore_m2pcie,
+ [KNL_PCI_UNCORE_IRP] = &knl_uncore_irp,
+ NULL,
+};
+
+/*
+ * KNL uses a common PCI device ID for multiple instances of an Uncore PMU
+ * device type. prior to KNL, each instance of a PMU device type had a unique
+ * device ID.
+ *
+ * PCI Device ID Uncore PMU Devices
+ * ----------------------------------
+ * 0x7841 MC0 UClk, MC1 UClk
+ * 0x7843 MC0 DClk CH 0, MC0 DClk CH 1, MC0 DClk CH 2,
+ * MC1 DClk CH 0, MC1 DClk CH 1, MC1 DClk CH 2
+ * 0x7833 EDC0 UClk, EDC1 UClk, EDC2 UClk, EDC3 UClk,
+ * EDC4 UClk, EDC5 UClk, EDC6 UClk, EDC7 UClk
+ * 0x7835 EDC0 EClk, EDC1 EClk, EDC2 EClk, EDC3 EClk,
+ * EDC4 EClk, EDC5 EClk, EDC6 EClk, EDC7 EClk
+ * 0x7817 M2PCIe
+ * 0x7814 IRP
+*/
+
+static const struct pci_device_id knl_uncore_pci_ids[] = {
+ { /* MC UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841),
+ .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_UCLK, 0),
+ },
+ { /* MC DClk Channel */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+ .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_DCLK, 0),
+ },
+ { /* EDC UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_UCLK, 0),
+ },
+ { /* EDC EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_ECLK, 0),
+ },
+ { /* M2PCIe */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7817),
+ .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_M2PCIE, 0),
+ },
+ { /* IRP */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7814),
+ .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_IRP, 0),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver knl_uncore_pci_driver = {
+ .name = "knl_uncore",
+ .id_table = knl_uncore_pci_ids,
+};
+
+int knl_uncore_pci_init(void)
+{
+ int ret;
+
+ /* All KNL PCI based PMON units are on the same PCI bus except IRP */
+ ret = snb_pci2phy_map_init(0x7814); /* IRP */
+ if (ret)
+ return ret;
+ ret = snb_pci2phy_map_init(0x7817); /* M2PCIe */
+ if (ret)
+ return ret;
+ uncore_pci_uncores = knl_pci_uncores;
+ uncore_pci_driver = &knl_uncore_pci_driver;
+ return 0;
+}
+
+/* end of KNL uncore support */
+
/* Haswell-EP uncore support */
static struct attribute *hswep_uncore_ubox_formats_attr[] = {
&format_attr_event.attr,
@@ -2338,7 +2819,7 @@ int hswep_uncore_pci_init(void)
}
/* end of Haswell-EP uncore support */
-/* BDX-DE uncore support */
+/* BDX uncore support */
static struct intel_uncore_type bdx_uncore_ubox = {
.name = "ubox",
@@ -2360,13 +2841,14 @@ static struct event_constraint bdx_uncore_cbox_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x3e, 0x1),
EVENT_CONSTRAINT_END
};
static struct intel_uncore_type bdx_uncore_cbox = {
.name = "cbox",
.num_counters = 4,
- .num_boxes = 8,
+ .num_boxes = 24,
.perf_ctr_bits = 48,
.event_ctl = HSWEP_C0_MSR_PMON_CTL0,
.perf_ctr = HSWEP_C0_MSR_PMON_CTR0,
@@ -2379,9 +2861,24 @@ static struct intel_uncore_type bdx_uncore_cbox = {
.format_group = &hswep_uncore_cbox_format_group,
};
+static struct intel_uncore_type bdx_uncore_sbox = {
+ .name = "sbox",
+ .num_counters = 4,
+ .num_boxes = 4,
+ .perf_ctr_bits = 48,
+ .event_ctl = HSWEP_S0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_S0_MSR_PMON_CTR0,
+ .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL,
+ .msr_offset = HSWEP_SBOX_MSR_OFFSET,
+ .ops = &hswep_uncore_sbox_msr_ops,
+ .format_group = &hswep_uncore_sbox_format_group,
+};
+
static struct intel_uncore_type *bdx_msr_uncores[] = {
&bdx_uncore_ubox,
&bdx_uncore_cbox,
+ &bdx_uncore_sbox,
&hswep_uncore_pcu,
NULL,
};
@@ -2396,7 +2893,7 @@ void bdx_uncore_cpu_init(void)
static struct intel_uncore_type bdx_uncore_ha = {
.name = "ha",
.num_counters = 4,
- .num_boxes = 1,
+ .num_boxes = 2,
.perf_ctr_bits = 48,
SNBEP_UNCORE_PCI_COMMON_INIT(),
};
@@ -2404,7 +2901,7 @@ static struct intel_uncore_type bdx_uncore_ha = {
static struct intel_uncore_type bdx_uncore_imc = {
.name = "imc",
.num_counters = 5,
- .num_boxes = 2,
+ .num_boxes = 8,
.perf_ctr_bits = 48,
.fixed_ctr_bits = 48,
.fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
@@ -2424,6 +2921,19 @@ static struct intel_uncore_type bdx_uncore_irp = {
.format_group = &snbep_uncore_format_group,
};
+static struct intel_uncore_type bdx_uncore_qpi = {
+ .name = "qpi",
+ .num_counters = 4,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &snbep_uncore_qpi_ops,
+ .format_group = &snbep_uncore_qpi_format_group,
+};
static struct event_constraint bdx_uncore_r2pcie_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
@@ -2432,6 +2942,8 @@ static struct event_constraint bdx_uncore_r2pcie_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
EVENT_CONSTRAINT_END
};
@@ -2445,18 +2957,65 @@ static struct intel_uncore_type bdx_uncore_r2pcie = {
SNBEP_UNCORE_PCI_COMMON_INIT(),
};
+static struct event_constraint bdx_uncore_r3qpi_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x01, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x07, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x08, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x09, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x0a, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x0e, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x15, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x1f, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_r3qpi = {
+ .name = "r3qpi",
+ .num_counters = 3,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .constraints = bdx_uncore_r3qpi_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
enum {
BDX_PCI_UNCORE_HA,
BDX_PCI_UNCORE_IMC,
BDX_PCI_UNCORE_IRP,
+ BDX_PCI_UNCORE_QPI,
BDX_PCI_UNCORE_R2PCIE,
+ BDX_PCI_UNCORE_R3QPI,
};
static struct intel_uncore_type *bdx_pci_uncores[] = {
[BDX_PCI_UNCORE_HA] = &bdx_uncore_ha,
[BDX_PCI_UNCORE_IMC] = &bdx_uncore_imc,
[BDX_PCI_UNCORE_IRP] = &bdx_uncore_irp,
+ [BDX_PCI_UNCORE_QPI] = &bdx_uncore_qpi,
[BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie,
+ [BDX_PCI_UNCORE_R3QPI] = &bdx_uncore_r3qpi,
NULL,
};
@@ -2465,6 +3024,10 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30),
.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0),
},
+ { /* Home Agent 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f38),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 1),
+ },
{ /* MC0 Channel 0 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0),
.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0),
@@ -2473,14 +3036,74 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1),
.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1),
},
+ { /* MC0 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb4),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 2),
+ },
+ { /* MC0 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb5),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 3),
+ },
+ { /* MC1 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd0),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 4),
+ },
+ { /* MC1 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd1),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 5),
+ },
+ { /* MC1 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd4),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 6),
+ },
+ { /* MC1 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd5),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 7),
+ },
{ /* IRP */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39),
.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0),
},
+ { /* QPI0 Port 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f32),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 0),
+ },
+ { /* QPI0 Port 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f33),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 1),
+ },
+ { /* QPI1 Port 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3a),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 2),
+ },
{ /* R2PCIe */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34),
.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0),
},
+ { /* R3QPI0 Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f36),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 0),
+ },
+ { /* R3QPI0 Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f37),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 1),
+ },
+ { /* R3QPI1 Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3e),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 2),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f86),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 0),
+ },
+ { /* QPI Port 1 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f96),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 1),
+ },
+ { /* QPI Port 2 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2),
+ },
{ /* end: all zeroes */ }
};
@@ -2500,4 +3123,4 @@ int bdx_uncore_pci_init(void)
return 0;
}
-/* end of BDX-DE uncore support */
+/* end of BDX uncore support */
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index 136ac74dee82..819d94982e07 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -33,28 +33,27 @@ static int __init x86_rdrand_setup(char *s)
__setup("nordrand", x86_rdrand_setup);
/*
- * Force a reseed cycle; we are architecturally guaranteed a reseed
- * after no more than 512 128-bit chunks of random data. This also
- * acts as a test of the CPU capability.
+ * RDRAND has Built-In-Self-Test (BIST) that runs on every invocation.
+ * Run the instruction a few times as a sanity check.
+ * If it fails, it is simple to disable RDRAND here.
*/
-#define RESEED_LOOP ((512*128)/sizeof(unsigned long))
+#define SANITY_CHECK_LOOPS 8
void x86_init_rdrand(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_ARCH_RANDOM
unsigned long tmp;
- int i, count, ok;
+ int i;
if (!cpu_has(c, X86_FEATURE_RDRAND))
- return; /* Nothing to do */
+ return;
- for (count = i = 0; i < RESEED_LOOP; i++) {
- ok = rdrand_long(&tmp);
- if (ok)
- count++;
+ for (i = 0; i < SANITY_CHECK_LOOPS; i++) {
+ if (!rdrand_long(&tmp)) {
+ clear_cpu_cap(c, X86_FEATURE_RDRAND);
+ printk_once(KERN_WARNING "rdrand: disabled\n");
+ return;
+ }
}
-
- if (count != RESEED_LOOP)
- clear_cpu_cap(c, X86_FEATURE_RDRAND);
#endif
}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 608fb26c7254..8cb57df9398d 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -31,32 +31,12 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
const struct cpuid_bit *cb;
static const struct cpuid_bit cpuid_bits[] = {
- { X86_FEATURE_DTHERM, CR_EAX, 0, 0x00000006, 0 },
- { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 },
- { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 },
- { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
- { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 },
- { X86_FEATURE_HWP, CR_EAX, 7, 0x00000006, 0 },
- { X86_FEATURE_HWP_NOTIFY, CR_EAX, 8, 0x00000006, 0 },
- { X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 },
- { X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 },
- { X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 },
{ X86_FEATURE_INTEL_PT, CR_EBX,25, 0x00000007, 0 },
{ X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
{ X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CR_EDX,11, 0x80000007, 0 },
- { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 },
- { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
- { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
- { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 },
- { X86_FEATURE_TSCRATEMSR, CR_EDX, 4, 0x8000000a, 0 },
- { X86_FEATURE_VMCBCLEAN, CR_EDX, 5, 0x8000000a, 0 },
- { X86_FEATURE_FLUSHBYASID, CR_EDX, 6, 0x8000000a, 0 },
- { X86_FEATURE_DECODEASSISTS, CR_EDX, 7, 0x8000000a, 0 },
- { X86_FEATURE_PAUSEFILTER, CR_EDX,10, 0x8000000a, 0 },
- { X86_FEATURE_PFTHRESHOLD, CR_EDX,12, 0x8000000a, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index 3fa0e5ad86b4..252da7aceca6 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -12,7 +12,7 @@ static void early_init_transmeta(struct cpuinfo_x86 *c)
xlvl = cpuid_eax(0x80860000);
if ((xlvl & 0xffff0000) == 0x80860000) {
if (xlvl >= 0x80860001)
- c->x86_capability[2] = cpuid_edx(0x80860001);
+ c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001);
}
}
@@ -82,7 +82,7 @@ static void init_transmeta(struct cpuinfo_x86 *c)
/* Unhide possibly hidden capability flags */
rdmsr(0x80860004, cap_mask, uk);
wrmsr(0x80860004, ~0, uk);
- c->x86_capability[0] = cpuid_edx(0x00000001);
+ c->x86_capability[CPUID_1_EDX] = cpuid_edx(0x00000001);
wrmsr(0x80860004, cap_mask, uk);
/* All Transmeta CPUs have a constant TSC */
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index bd3507da39f0..2836de390f95 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -58,28 +58,6 @@ static void cpuid_smp_cpuid(void *cmd_block)
&cmd->eax, &cmd->ebx, &cmd->ecx, &cmd->edx);
}
-static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
-{
- loff_t ret;
- struct inode *inode = file->f_mapping->host;
-
- mutex_lock(&inode->i_mutex);
- switch (orig) {
- case 0:
- file->f_pos = offset;
- ret = file->f_pos;
- break;
- case 1:
- file->f_pos += offset;
- ret = file->f_pos;
- break;
- default:
- ret = -EINVAL;
- }
- mutex_unlock(&inode->i_mutex);
- return ret;
-}
-
static ssize_t cpuid_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
@@ -132,7 +110,7 @@ static int cpuid_open(struct inode *inode, struct file *file)
*/
static const struct file_operations cpuid_fops = {
.owner = THIS_MODULE,
- .llseek = cpuid_seek,
+ .llseek = no_seek_end_llseek,
.read = cpuid_read,
.open = cpuid_open,
};
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 2c1910f6717e..58f34319b29a 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -35,6 +35,7 @@
#include <asm/cpu.h>
#include <asm/reboot.h>
#include <asm/virtext.h>
+#include <asm/intel_pt.h>
/* Alignment required for elf header segment */
#define ELF_CORE_HEADER_ALIGN 4096
@@ -125,6 +126,11 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
cpu_emergency_vmxoff();
cpu_emergency_svm_disable();
+ /*
+ * Disable Intel PT to stop its logging
+ */
+ cpu_emergency_stop_pt();
+
disable_local_APIC();
}
@@ -169,6 +175,11 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
cpu_emergency_vmxoff();
cpu_emergency_svm_disable();
+ /*
+ * Disable Intel PT to stop its logging
+ */
+ cpu_emergency_stop_pt();
+
#ifdef CONFIG_X86_IO_APIC
/* Prevent crash_kexec() from deadlocking on ioapic_lock. */
ioapic_zap_locks();
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index be39b5fde4b9..6d9f0a7ef4c8 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -3,8 +3,11 @@
*/
#include <asm/fpu/internal.h>
#include <asm/tlbflush.h>
+#include <asm/setup.h>
+#include <asm/cmdline.h>
#include <linux/sched.h>
+#include <linux/init.h>
/*
* Initialize the TS bit in CR0 according to the style of context-switches
@@ -12,7 +15,7 @@
*/
static void fpu__init_cpu_ctx_switch(void)
{
- if (!cpu_has_eager_fpu)
+ if (!boot_cpu_has(X86_FEATURE_EAGER_FPU))
stts();
else
clts();
@@ -143,9 +146,18 @@ static void __init fpu__init_system_generic(void)
unsigned int xstate_size;
EXPORT_SYMBOL_GPL(xstate_size);
-/* Enforce that 'MEMBER' is the last field of 'TYPE': */
+/* Get alignment of the TYPE. */
+#define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test)
+
+/*
+ * Enforce that 'MEMBER' is the last field of 'TYPE'.
+ *
+ * Align the computed size with alignment of the TYPE,
+ * because that's how C aligns structs.
+ */
#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \
- BUILD_BUG_ON(sizeof(TYPE) != offsetofend(TYPE, MEMBER))
+ BUILD_BUG_ON(sizeof(TYPE) != ALIGN(offsetofend(TYPE, MEMBER), \
+ TYPE_ALIGN(TYPE)))
/*
* We append the 'struct fpu' to the task_struct:
@@ -188,7 +200,7 @@ static void __init fpu__init_task_struct_size(void)
*/
static void __init fpu__init_system_xstate_size_legacy(void)
{
- static int on_boot_cpu = 1;
+ static int on_boot_cpu __initdata = 1;
WARN_ON_FPU(!on_boot_cpu);
on_boot_cpu = 0;
@@ -261,24 +273,56 @@ static void __init fpu__init_system_xstate_size_legacy(void)
*/
static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
-static int __init eager_fpu_setup(char *s)
+/*
+ * Find supported xfeatures based on cpu features and command-line input.
+ * This must be called after fpu__init_parse_early_param() is called and
+ * xfeatures_mask is enumerated.
+ */
+u64 __init fpu__get_supported_xfeatures_mask(void)
{
- if (!strcmp(s, "on"))
- eagerfpu = ENABLE;
- else if (!strcmp(s, "off"))
- eagerfpu = DISABLE;
- else if (!strcmp(s, "auto"))
- eagerfpu = AUTO;
- return 1;
+ /* Support all xfeatures known to us */
+ if (eagerfpu != DISABLE)
+ return XCNTXT_MASK;
+
+ /* Warning of xfeatures being disabled for no eagerfpu mode */
+ if (xfeatures_mask & XFEATURE_MASK_EAGER) {
+ pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
+ xfeatures_mask & XFEATURE_MASK_EAGER);
+ }
+
+ /* Return a mask that masks out all features requiring eagerfpu mode */
+ return ~XFEATURE_MASK_EAGER;
+}
+
+/*
+ * Disable features dependent on eagerfpu.
+ */
+static void __init fpu__clear_eager_fpu_features(void)
+{
+ setup_clear_cpu_cap(X86_FEATURE_MPX);
+ setup_clear_cpu_cap(X86_FEATURE_AVX);
+ setup_clear_cpu_cap(X86_FEATURE_AVX2);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512F);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
}
-__setup("eagerfpu=", eager_fpu_setup);
/*
* Pick the FPU context switching strategy:
+ *
+ * When eagerfpu is AUTO or ENABLE, we ensure it is ENABLE if either of
+ * the following is true:
+ *
+ * (1) the cpu has xsaveopt, as it has the optimization and doing eager
+ * FPU switching has a relatively low cost compared to a plain xsave;
+ * (2) the cpu has xsave features (e.g. MPX) that depend on eager FPU
+ * switching. Should the kernel boot with noxsaveopt, we support MPX
+ * with eager FPU switching at a higher cost.
*/
static void __init fpu__init_system_ctx_switch(void)
{
- static bool on_boot_cpu = 1;
+ static bool on_boot_cpu __initdata = 1;
WARN_ON_FPU(!on_boot_cpu);
on_boot_cpu = 0;
@@ -286,19 +330,11 @@ static void __init fpu__init_system_ctx_switch(void)
WARN_ON_FPU(current->thread.fpu.fpstate_active);
current_thread_info()->status = 0;
- /* Auto enable eagerfpu for xsaveopt */
- if (cpu_has_xsaveopt && eagerfpu != DISABLE)
+ if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE)
eagerfpu = ENABLE;
- if (xfeatures_mask & XFEATURE_MASK_EAGER) {
- if (eagerfpu == DISABLE) {
- pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
- xfeatures_mask & XFEATURE_MASK_EAGER);
- xfeatures_mask &= ~XFEATURE_MASK_EAGER;
- } else {
- eagerfpu = ENABLE;
- }
- }
+ if (xfeatures_mask & XFEATURE_MASK_EAGER)
+ eagerfpu = ENABLE;
if (eagerfpu == ENABLE)
setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
@@ -307,11 +343,48 @@ static void __init fpu__init_system_ctx_switch(void)
}
/*
+ * We parse fpu parameters early because fpu__init_system() is executed
+ * before parse_early_param().
+ */
+static void __init fpu__init_parse_early_param(void)
+{
+ /*
+ * No need to check "eagerfpu=auto" again, since it is the
+ * initial default.
+ */
+ if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) {
+ eagerfpu = DISABLE;
+ fpu__clear_eager_fpu_features();
+ } else if (cmdline_find_option_bool(boot_command_line, "eagerfpu=on")) {
+ eagerfpu = ENABLE;
+ }
+
+ if (cmdline_find_option_bool(boot_command_line, "no387"))
+ setup_clear_cpu_cap(X86_FEATURE_FPU);
+
+ if (cmdline_find_option_bool(boot_command_line, "nofxsr")) {
+ setup_clear_cpu_cap(X86_FEATURE_FXSR);
+ setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
+ setup_clear_cpu_cap(X86_FEATURE_XMM);
+ }
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsave"))
+ fpu__xstate_clear_all_cpu_caps();
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaveopt"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+}
+
+/*
* Called on the boot CPU once per system bootup, to set up the initial
* FPU state that is later cloned into all processes:
*/
void __init fpu__init_system(struct cpuinfo_x86 *c)
{
+ fpu__init_parse_early_param();
fpu__init_system_early_generic(c);
/*
@@ -335,62 +408,3 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
fpu__init_system_ctx_switch();
}
-
-/*
- * Boot parameter to turn off FPU support and fall back to math-emu:
- */
-static int __init no_387(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_FPU);
- return 1;
-}
-__setup("no387", no_387);
-
-/*
- * Disable all xstate CPU features:
- */
-static int __init x86_noxsave_setup(char *s)
-{
- if (strlen(s))
- return 0;
-
- fpu__xstate_clear_all_cpu_caps();
-
- return 1;
-}
-__setup("noxsave", x86_noxsave_setup);
-
-/*
- * Disable the XSAVEOPT instruction specifically:
- */
-static int __init x86_noxsaveopt_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-
- return 1;
-}
-__setup("noxsaveopt", x86_noxsaveopt_setup);
-
-/*
- * Disable the XSAVES instruction:
- */
-static int __init x86_noxsaves_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_XSAVES);
-
- return 1;
-}
-__setup("noxsaves", x86_noxsaves_setup);
-
-/*
- * Disable FX save/restore and SSE support:
- */
-static int __init x86_nofxsr_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_FXSR);
- setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
- setup_clear_cpu_cap(X86_FEATURE_XMM);
-
- return 1;
-}
-__setup("nofxsr", x86_nofxsr_setup);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 70fc312221fc..d425cda5ae6d 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -52,6 +52,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
setup_clear_cpu_cap(X86_FEATURE_MPX);
+ setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
}
/*
@@ -297,7 +298,7 @@ static void __init setup_xstate_comp(void)
*/
static void __init setup_init_fpu_buf(void)
{
- static int on_boot_cpu = 1;
+ static int on_boot_cpu __initdata = 1;
WARN_ON_FPU(!on_boot_cpu);
on_boot_cpu = 0;
@@ -608,7 +609,7 @@ static void fpu__init_disable_system_xstate(void)
void __init fpu__init_system_xstate(void)
{
unsigned int eax, ebx, ecx, edx;
- static int on_boot_cpu = 1;
+ static int on_boot_cpu __initdata = 1;
int err;
WARN_ON_FPU(!on_boot_cpu);
@@ -632,8 +633,7 @@ void __init fpu__init_system_xstate(void)
BUG();
}
- /* Support only the state known to the OS: */
- xfeatures_mask = xfeatures_mask & XCNTXT_MASK;
+ xfeatures_mask &= fpu__get_supported_xfeatures_mask();
/* Enable xstate instructions to be able to continue with initialization: */
fpu__init_cpu_xstate();
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 311bcf338f07..29408d6d6626 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -105,14 +105,14 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
{
unsigned char replaced[MCOUNT_INSN_SIZE];
+ ftrace_expected = old_code;
+
/*
- * Note: Due to modules and __init, code can
- * disappear and change, we need to protect against faulting
- * as well as code changing. We do this by using the
- * probe_kernel_* functions.
- *
- * No real locking needed, this code is run through
- * kstop_machine, or before SMP starts.
+ * Note:
+ * We are paranoid about modifying text, as if a bug was to happen, it
+ * could cause us to read or write to someplace that could cause harm.
+ * Carefully read and modify the code with probe_kernel_*(), and make
+ * sure what we read is what we expected it to be before modifying it.
*/
/* read the text we want to modify */
@@ -154,6 +154,8 @@ int ftrace_make_nop(struct module *mod,
if (addr == MCOUNT_ADDR)
return ftrace_modify_code_direct(rec->ip, old, new);
+ ftrace_expected = NULL;
+
/* Normal cases use add_brk_on_nop */
WARN_ONCE(1, "invalid use of ftrace_make_nop");
return -EINVAL;
@@ -220,6 +222,7 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
unsigned long addr)
{
WARN_ON(1);
+ ftrace_expected = NULL;
return -EINVAL;
}
@@ -314,6 +317,8 @@ static int add_break(unsigned long ip, const char *old)
if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
return -EFAULT;
+ ftrace_expected = old;
+
/* Make sure it is what we expect it to be */
if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
return -EINVAL;
@@ -413,6 +418,8 @@ static int remove_breakpoint(struct dyn_ftrace *rec)
ftrace_addr = ftrace_get_addr_curr(rec);
nop = ftrace_call_replace(ip, ftrace_addr);
+ ftrace_expected = nop;
+
if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
return -EINVAL;
}
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 50a3fad5b89f..2bcfb5f2bc44 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -300,6 +300,10 @@ static int arch_build_bp_info(struct perf_event *bp)
return -EINVAL;
if (bp->attr.bp_addr & (bp->attr.bp_len - 1))
return -EINVAL;
+
+ if (!boot_cpu_has(X86_FEATURE_BPEXT))
+ return -EOPNOTSUPP;
+
/*
* It's impossible to use a range breakpoint to fake out
* user vs kernel detection because bp_len - 1 can't
@@ -307,8 +311,6 @@ static int arch_build_bp_info(struct perf_event *bp)
* breakpoints, then we'll have to check for kprobe-blacklisted
* addresses anywhere in the range.
*/
- if (!cpu_has_bpext)
- return -EOPNOTSUPP;
info->mask = bp->attr.bp_len - 1;
info->len = X86_BREAKPOINT_LEN_1;
}
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 2bd81e302427..72cef58693c7 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -45,6 +45,11 @@ early_param("no-kvmclock", parse_no_kvmclock);
static struct pvclock_vsyscall_time_info *hv_clock;
static struct pvclock_wall_clock wall_clock;
+struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
+{
+ return hv_clock;
+}
+
/*
* The wallclock is the time of day when we booted. Since then, some time may
* have elapsed since the hypervisor wrote the data. So we try to account for
@@ -305,7 +310,6 @@ int __init kvm_setup_vsyscall_timeinfo(void)
{
#ifdef CONFIG_X86_64
int cpu;
- int ret;
u8 flags;
struct pvclock_vcpu_time_info *vcpu_time;
unsigned int size;
@@ -325,11 +329,6 @@ int __init kvm_setup_vsyscall_timeinfo(void)
return 1;
}
- if ((ret = pvclock_init_vsyscall(hv_clock, size))) {
- put_cpu();
- return ret;
- }
-
put_cpu();
kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c
index d1d35ccffed3..92fc1a51f994 100644
--- a/arch/x86/kernel/livepatch.c
+++ b/arch/x86/kernel/livepatch.c
@@ -20,8 +20,6 @@
#include <linux/module.h>
#include <linux/uaccess.h>
-#include <asm/cacheflush.h>
-#include <asm/page_types.h>
#include <asm/elf.h>
#include <asm/livepatch.h>
@@ -38,11 +36,10 @@
int klp_write_module_reloc(struct module *mod, unsigned long type,
unsigned long loc, unsigned long value)
{
- int ret, numpages, size = 4;
- bool readonly;
+ size_t size = 4;
unsigned long val;
- unsigned long core = (unsigned long)mod->module_core;
- unsigned long core_size = mod->core_size;
+ unsigned long core = (unsigned long)mod->core_layout.base;
+ unsigned long core_size = mod->core_layout.size;
switch (type) {
case R_X86_64_NONE:
@@ -69,23 +66,5 @@ int klp_write_module_reloc(struct module *mod, unsigned long type,
/* loc does not point to any symbol inside the module */
return -EINVAL;
- readonly = false;
-
-#ifdef CONFIG_DEBUG_SET_MODULE_RONX
- if (loc < core + mod->core_ro_size)
- readonly = true;
-#endif
-
- /* determine if the relocation spans a page boundary */
- numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 1 : 2;
-
- if (readonly)
- set_memory_rw(loc & PAGE_MASK, numpages);
-
- ret = probe_kernel_write((void *)loc, &val, size);
-
- if (readonly)
- set_memory_ro(loc & PAGE_MASK, numpages);
-
- return ret;
+ return probe_kernel_write((void *)loc, &val, size);
}
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 113e70784854..64f9616f93f1 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -45,28 +45,6 @@
static struct class *msr_class;
-static loff_t msr_seek(struct file *file, loff_t offset, int orig)
-{
- loff_t ret;
- struct inode *inode = file_inode(file);
-
- mutex_lock(&inode->i_mutex);
- switch (orig) {
- case SEEK_SET:
- file->f_pos = offset;
- ret = file->f_pos;
- break;
- case SEEK_CUR:
- file->f_pos += offset;
- ret = file->f_pos;
- break;
- default:
- ret = -EINVAL;
- }
- mutex_unlock(&inode->i_mutex);
- return ret;
-}
-
static ssize_t msr_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
@@ -194,7 +172,7 @@ static int msr_open(struct inode *inode, struct file *file)
*/
static const struct file_operations msr_fops = {
.owner = THIS_MODULE,
- .llseek = msr_seek,
+ .llseek = no_seek_end_llseek,
.read = msr_read,
.write = msr_write,
.open = msr_open,
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 697f90db0e37..8a2cdd736fa4 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -29,6 +29,7 @@
#include <asm/mach_traps.h>
#include <asm/nmi.h>
#include <asm/x86_init.h>
+#include <asm/reboot.h>
#define CREATE_TRACE_POINTS
#include <trace/events/nmi.h>
@@ -231,7 +232,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
#endif
if (panic_on_unrecovered_nmi)
- panic("NMI: Not continuing");
+ nmi_panic(regs, "NMI: Not continuing");
pr_emerg("Dazed and confused, but trying to continue\n");
@@ -255,8 +256,16 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
reason, smp_processor_id());
show_regs(regs);
- if (panic_on_io_nmi)
- panic("NMI IOCK error: Not continuing");
+ if (panic_on_io_nmi) {
+ nmi_panic(regs, "NMI IOCK error: Not continuing");
+
+ /*
+ * If we end up here, it means we have received an NMI while
+ * processing panic(). Simply return without delaying and
+ * re-enabling NMIs.
+ */
+ return;
+ }
/* Re-enable the IOCK line, wait for a few seconds */
reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
@@ -297,7 +306,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
pr_emerg("Do you have a strange power saving mode enabled?\n");
if (unknown_nmi_panic || panic_on_unrecovered_nmi)
- panic("NMI: Not continuing");
+ nmi_panic(regs, "NMI: Not continuing");
pr_emerg("Dazed and confused, but trying to continue\n");
}
@@ -348,8 +357,19 @@ static void default_do_nmi(struct pt_regs *regs)
return;
}
- /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
- raw_spin_lock(&nmi_reason_lock);
+ /*
+ * Non-CPU-specific NMI: NMI sources can be processed on any CPU.
+ *
+ * Another CPU may be processing panic routines while holding
+ * nmi_reason_lock. Check if the CPU issued the IPI for crash dumping,
+ * and if so, call its callback directly. If there is no CPU preparing
+ * crash dump, we simply loop here.
+ */
+ while (!raw_spin_trylock(&nmi_reason_lock)) {
+ run_crash_ipi_callback(regs);
+ cpu_relax();
+ }
+
reason = x86_platform.get_nmi_reason();
if (reason & NMI_REASON_MASK) {
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c2130aef3f9d..f08ac28b8136 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -74,16 +74,6 @@ void __init default_banner(void)
/* Undefined instruction for dealing with missing ops pointers. */
static const unsigned char ud2a[] = { 0x0f, 0x0b };
-unsigned paravirt_patch_nop(void)
-{
- return 0;
-}
-
-unsigned paravirt_patch_ignore(unsigned len)
-{
- return len;
-}
-
struct branch {
unsigned char opcode;
u32 delta;
@@ -133,7 +123,6 @@ static void *get_call_destination(u8 type)
.pv_time_ops = pv_time_ops,
.pv_cpu_ops = pv_cpu_ops,
.pv_irq_ops = pv_irq_ops,
- .pv_apic_ops = pv_apic_ops,
.pv_mmu_ops = pv_mmu_ops,
#ifdef CONFIG_PARAVIRT_SPINLOCKS
.pv_lock_ops = pv_lock_ops,
@@ -152,8 +141,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
/* If there's no function, patch it with a ud2a (BUG) */
ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
else if (opfunc == _paravirt_nop)
- /* If the operation is a nop, then nop the callsite */
- ret = paravirt_patch_nop();
+ ret = 0;
/* identity functions just return their single argument */
else if (opfunc == _paravirt_ident_32)
@@ -162,10 +150,6 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
ret = paravirt_patch_ident_64(insnbuf, len);
else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
-#ifdef CONFIG_X86_32
- type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
-#endif
- type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64))
/* If operation requires a jmp, then jmp */
ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
@@ -220,8 +204,6 @@ static u64 native_steal_clock(int cpu)
/* These are in entry.S */
extern void native_iret(void);
-extern void native_irq_enable_sysexit(void);
-extern void native_usergs_sysret32(void);
extern void native_usergs_sysret64(void);
static struct resource reserve_ioports = {
@@ -379,13 +361,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.load_sp0 = native_load_sp0,
-#if defined(CONFIG_X86_32)
- .irq_enable_sysexit = native_irq_enable_sysexit,
-#endif
#ifdef CONFIG_X86_64
-#ifdef CONFIG_IA32_EMULATION
- .usergs_sysret32 = native_usergs_sysret32,
-#endif
.usergs_sysret64 = native_usergs_sysret64,
#endif
.iret = native_iret,
@@ -403,12 +379,6 @@ NOKPROBE_SYMBOL(native_get_debugreg);
NOKPROBE_SYMBOL(native_set_debugreg);
NOKPROBE_SYMBOL(native_load_idt);
-struct pv_apic_ops pv_apic_ops = {
-#ifdef CONFIG_X86_LOCAL_APIC
- .startup_ipi_hook = paravirt_nop,
-#endif
-};
-
#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
/* 32-bit pagetable entries */
#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_32)
@@ -444,9 +414,6 @@ struct pv_mmu_ops pv_mmu_ops = {
.set_pmd = native_set_pmd,
.set_pmd_at = native_set_pmd_at,
.pte_update = paravirt_nop,
- .pte_update_defer = paravirt_nop,
- .pmd_update = paravirt_nop,
- .pmd_update_defer = paravirt_nop,
.ptep_modify_prot_start = __ptep_modify_prot_start,
.ptep_modify_prot_commit = __ptep_modify_prot_commit,
@@ -492,6 +459,5 @@ struct pv_mmu_ops pv_mmu_ops = {
EXPORT_SYMBOL_GPL(pv_time_ops);
EXPORT_SYMBOL (pv_cpu_ops);
EXPORT_SYMBOL (pv_mmu_ops);
-EXPORT_SYMBOL_GPL(pv_apic_ops);
EXPORT_SYMBOL_GPL(pv_info);
EXPORT_SYMBOL (pv_irq_ops);
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index c89f50a76e97..158dc0650d5d 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -5,7 +5,6 @@ DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
DEF_NATIVE(pv_cpu_ops, iret, "iret");
-DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit");
DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
@@ -46,7 +45,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_irq_ops, restore_fl);
PATCH_SITE(pv_irq_ops, save_fl);
PATCH_SITE(pv_cpu_ops, iret);
- PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
PATCH_SITE(pv_mmu_ops, read_cr2);
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 8aa05583bc42..e70087a04cc8 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -13,9 +13,7 @@ DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
DEF_NATIVE(pv_cpu_ops, clts, "clts");
DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
-DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "swapgs; sti; sysexit");
DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
-DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl");
DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
DEF_NATIVE(, mov32, "mov %edi, %eax");
@@ -55,7 +53,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_irq_ops, save_fl);
PATCH_SITE(pv_irq_ops, irq_enable);
PATCH_SITE(pv_irq_ops, irq_disable);
- PATCH_SITE(pv_cpu_ops, usergs_sysret32);
PATCH_SITE(pv_cpu_ops, usergs_sysret64);
PATCH_SITE(pv_cpu_ops, swapgs);
PATCH_SITE(pv_mmu_ops, read_cr2);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 0497f719977d..833b1d329c47 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -180,13 +180,13 @@ static void calioc2_dump_error_regs(struct iommu_table *tbl);
static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl);
static void get_tce_space_from_tar(void);
-static struct cal_chipset_ops calgary_chip_ops = {
+static const struct cal_chipset_ops calgary_chip_ops = {
.handle_quirks = calgary_handle_quirks,
.tce_cache_blast = calgary_tce_cache_blast,
.dump_error_regs = calgary_dump_error_regs
};
-static struct cal_chipset_ops calioc2_chip_ops = {
+static const struct cal_chipset_ops calioc2_chip_ops = {
.handle_quirks = calioc2_handle_quirks,
.tce_cache_blast = calioc2_tce_cache_blast,
.dump_error_regs = calioc2_dump_error_regs
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index adf0392d549a..7c577a178859 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -88,7 +88,7 @@ int __init pci_swiotlb_detect_4gb(void)
{
/* don't initialize swiotlb if iommu=off (no_iommu=1) */
#ifdef CONFIG_X86_64
- if (!no_iommu && max_pfn > MAX_DMA32_PFN)
+ if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN)
swiotlb = 1;
#endif
return swiotlb;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index e835d263a33b..b9d99e0f82c4 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -125,7 +125,7 @@ void release_thread(struct task_struct *dead_task)
if (dead_task->mm->context.ldt) {
pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
dead_task->comm,
- dead_task->mm->context.ldt,
+ dead_task->mm->context.ldt->entries,
dead_task->mm->context.ldt->size);
BUG();
}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 558f50edebca..32e9d9cbb884 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -124,21 +124,6 @@ const char *regs_query_register_name(unsigned int offset)
return NULL;
}
-static const int arg_offs_table[] = {
-#ifdef CONFIG_X86_32
- [0] = offsetof(struct pt_regs, ax),
- [1] = offsetof(struct pt_regs, dx),
- [2] = offsetof(struct pt_regs, cx)
-#else /* CONFIG_X86_64 */
- [0] = offsetof(struct pt_regs, di),
- [1] = offsetof(struct pt_regs, si),
- [2] = offsetof(struct pt_regs, dx),
- [3] = offsetof(struct pt_regs, cx),
- [4] = offsetof(struct pt_regs, r8),
- [5] = offsetof(struct pt_regs, r9)
-#endif
-};
-
/*
* does not yet catch signals sent when the child dies.
* in exit.c or in signal.c.
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 2f355d229a58..99bfc025111d 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -140,27 +140,3 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
-
-#ifdef CONFIG_X86_64
-/*
- * Initialize the generic pvclock vsyscall state. This will allocate
- * a/some page(s) for the per-vcpu pvclock information, set up a
- * fixmap mapping for the page(s)
- */
-
-int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
- int size)
-{
- int idx;
-
- WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
-
- for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
- __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
- __pa(i) + (idx*PAGE_SIZE),
- PAGE_KERNEL_VVAR);
- }
-
- return 0;
-}
-#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 02693dd9a079..ab0adc0fa5db 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -182,6 +182,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
},
},
+ { /* Handle problems with rebooting on the iMac10,1. */
+ .callback = set_pci_reboot,
+ .ident = "Apple iMac10,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "iMac10,1"),
+ },
+ },
/* ASRock */
{ /* Handle problems with rebooting on ASRock Q1900DC-ITX */
@@ -718,6 +726,7 @@ static int crashing_cpu;
static nmi_shootdown_cb shootdown_callback;
static atomic_t waiting_for_crash_ipi;
+static int crash_ipi_issued;
static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
{
@@ -780,6 +789,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
smp_send_nmi_allbutself();
+ /* Kick CPUs looping in NMI context. */
+ WRITE_ONCE(crash_ipi_issued, 1);
+
msecs = 1000; /* Wait at most a second for the other cpus to stop */
while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
mdelay(1);
@@ -788,9 +800,35 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
/* Leave the nmi callback set */
}
+
+/*
+ * Check if the crash dumping IPI got issued and if so, call its callback
+ * directly. This function is used when we have already been in NMI handler.
+ * It doesn't return.
+ */
+void run_crash_ipi_callback(struct pt_regs *regs)
+{
+ if (crash_ipi_issued)
+ crash_nmi_callback(0, regs);
+}
+
+/* Override the weak function in kernel/panic.c */
+void nmi_panic_self_stop(struct pt_regs *regs)
+{
+ while (1) {
+ /* If no CPU is preparing crash dump, we simply loop here. */
+ run_crash_ipi_callback(regs);
+ cpu_relax();
+ }
+}
+
#else /* !CONFIG_SMP */
void nmi_shootdown_cpus(nmi_shootdown_cb callback)
{
/* No other CPUs to shoot down */
}
+
+void run_crash_ipi_callback(struct pt_regs *regs)
+{
+}
#endif
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index cd9685235df9..4af8d063fb36 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -200,6 +200,9 @@ static __init int add_rtc_cmos(void)
}
#endif
+ if (paravirt_enabled() && !paravirt_has(RTC))
+ return -ENODEV;
+
platform_device_register(&rtc_device);
dev_info(&rtc_device.dev,
"registered platform RTC device (no PNP device found)\n");
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d2bbe343fda7..d3d80e6d42a2 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1048,6 +1048,8 @@ void __init setup_arch(char **cmdline_p)
if (mtrr_trim_uncached_memory(max_pfn))
max_pfn = e820_end_of_ram_pfn();
+ max_possible_pfn = max_pfn;
+
#ifdef CONFIG_X86_32
/* max_low_pfn get updated here */
find_low_pfn_range();
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 12c8286206ce..658777cf3851 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -125,12 +125,12 @@ static void native_smp_send_reschedule(int cpu)
WARN_ON(1);
return;
}
- apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
+ apic->send_IPI(cpu, RESCHEDULE_VECTOR);
}
void native_send_call_func_single_ipi(int cpu)
{
- apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
+ apic->send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR);
}
void native_send_call_func_ipi(const struct cpumask *mask)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index fbabe4fcc7fb..24d57f77b3c1 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -304,7 +304,7 @@ do { \
static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
- if (cpu_has_topoext) {
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
if (c->phys_proc_id == o->phys_proc_id &&
@@ -630,13 +630,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
num_starts = 0;
/*
- * Paravirt / VMI wants a startup IPI hook here to set up the
- * target processor state.
- */
- startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
- stack_start);
-
- /*
* Run STARTUP IPI loop.
*/
pr_debug("#startup loops: %d\n", num_starts);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index c7c4d9c51e99..3d743da828d3 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1185,8 +1185,6 @@ void __init tsc_init(void)
u64 lpj;
int cpu;
- x86_init.timers.tsc_pre_init();
-
if (!cpu_has_tsc) {
setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
return;
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 4cf401f581e7..07efb35ee4bc 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -48,31 +48,31 @@ verify_cpu:
pushfl
popl %eax
cmpl %eax,%ebx
- jz verify_cpu_no_longmode # cpu has no cpuid
+ jz .Lverify_cpu_no_longmode # cpu has no cpuid
#endif
movl $0x0,%eax # See if cpuid 1 is implemented
cpuid
cmpl $0x1,%eax
- jb verify_cpu_no_longmode # no cpuid 1
+ jb .Lverify_cpu_no_longmode # no cpuid 1
xor %di,%di
cmpl $0x68747541,%ebx # AuthenticAMD
- jnz verify_cpu_noamd
+ jnz .Lverify_cpu_noamd
cmpl $0x69746e65,%edx
- jnz verify_cpu_noamd
+ jnz .Lverify_cpu_noamd
cmpl $0x444d4163,%ecx
- jnz verify_cpu_noamd
+ jnz .Lverify_cpu_noamd
mov $1,%di # cpu is from AMD
- jmp verify_cpu_check
+ jmp .Lverify_cpu_check
-verify_cpu_noamd:
+.Lverify_cpu_noamd:
cmpl $0x756e6547,%ebx # GenuineIntel?
- jnz verify_cpu_check
+ jnz .Lverify_cpu_check
cmpl $0x49656e69,%edx
- jnz verify_cpu_check
+ jnz .Lverify_cpu_check
cmpl $0x6c65746e,%ecx
- jnz verify_cpu_check
+ jnz .Lverify_cpu_check
# only call IA32_MISC_ENABLE when:
# family > 6 || (family == 6 && model >= 0xd)
@@ -83,59 +83,59 @@ verify_cpu_noamd:
andl $0x0ff00f00, %eax # mask family and extended family
shrl $8, %eax
cmpl $6, %eax
- ja verify_cpu_clear_xd # family > 6, ok
- jb verify_cpu_check # family < 6, skip
+ ja .Lverify_cpu_clear_xd # family > 6, ok
+ jb .Lverify_cpu_check # family < 6, skip
andl $0x000f00f0, %ecx # mask model and extended model
shrl $4, %ecx
cmpl $0xd, %ecx
- jb verify_cpu_check # family == 6, model < 0xd, skip
+ jb .Lverify_cpu_check # family == 6, model < 0xd, skip
-verify_cpu_clear_xd:
+.Lverify_cpu_clear_xd:
movl $MSR_IA32_MISC_ENABLE, %ecx
rdmsr
btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE
- jnc verify_cpu_check # only write MSR if bit was changed
+ jnc .Lverify_cpu_check # only write MSR if bit was changed
wrmsr
-verify_cpu_check:
+.Lverify_cpu_check:
movl $0x1,%eax # Does the cpu have what it takes
cpuid
andl $REQUIRED_MASK0,%edx
xorl $REQUIRED_MASK0,%edx
- jnz verify_cpu_no_longmode
+ jnz .Lverify_cpu_no_longmode
movl $0x80000000,%eax # See if extended cpuid is implemented
cpuid
cmpl $0x80000001,%eax
- jb verify_cpu_no_longmode # no extended cpuid
+ jb .Lverify_cpu_no_longmode # no extended cpuid
movl $0x80000001,%eax # Does the cpu have what it takes
cpuid
andl $REQUIRED_MASK1,%edx
xorl $REQUIRED_MASK1,%edx
- jnz verify_cpu_no_longmode
+ jnz .Lverify_cpu_no_longmode
-verify_cpu_sse_test:
+.Lverify_cpu_sse_test:
movl $1,%eax
cpuid
andl $SSE_MASK,%edx
cmpl $SSE_MASK,%edx
- je verify_cpu_sse_ok
+ je .Lverify_cpu_sse_ok
test %di,%di
- jz verify_cpu_no_longmode # only try to force SSE on AMD
+ jz .Lverify_cpu_no_longmode # only try to force SSE on AMD
movl $MSR_K7_HWCR,%ecx
rdmsr
btr $15,%eax # enable SSE
wrmsr
xor %di,%di # don't loop
- jmp verify_cpu_sse_test # try again
+ jmp .Lverify_cpu_sse_test # try again
-verify_cpu_no_longmode:
+.Lverify_cpu_no_longmode:
popf # Restore caller passed flags
movl $1,%eax
ret
-verify_cpu_sse_ok:
+.Lverify_cpu_sse_ok:
popf # Restore caller passed flags
xorl %eax, %eax
ret
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 524619351961..483231ebbb0b 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -357,8 +357,10 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
tss = &per_cpu(cpu_tss, get_cpu());
/* make room for real-mode segments */
tsk->thread.sp0 += 16;
- if (cpu_has_sep)
+
+ if (static_cpu_has_safe(X86_FEATURE_SEP))
tsk->thread.sysenter_cs = 0;
+
load_sp0(tss, &tsk->thread);
put_cpu();
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 3839628d962e..dad5fe9633a3 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -68,7 +68,6 @@ struct x86_init_ops x86_init __initdata = {
.timers = {
.setup_percpu_clockev = setup_boot_APIC_clock,
- .tsc_pre_init = x86_init_noop,
.timer_init = hpet_time_init,
.wallclock_init = x86_init_noop,
},
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 06332cb7e7d1..c8eda1498121 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -2,6 +2,7 @@
#define ARCH_X86_KVM_CPUID_H
#include "x86.h"
+#include <asm/cpu.h>
int kvm_update_cpuid(struct kvm_vcpu *vcpu);
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
@@ -38,6 +39,14 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
return best && (best->ecx & bit(X86_FEATURE_XSAVE));
}
+static inline bool guest_cpuid_has_mtrr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ return best && (best->edx & bit(X86_FEATURE_MTRR));
+}
+
static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
@@ -170,4 +179,37 @@ static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu)
}
#undef BIT_NRIPS
+static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ if (!best)
+ return -1;
+
+ return x86_family(best->eax);
+}
+
+static inline int guest_cpuid_model(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ if (!best)
+ return -1;
+
+ return x86_model(best->eax);
+}
+
+static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ if (!best)
+ return -1;
+
+ return x86_stepping(best->eax);
+}
+
#endif
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 08116ff227cc..b0ea42b78ccd 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -420,6 +420,7 @@ void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_s
u8 saved_mode;
if (hpet_legacy_start) {
/* save existing mode for later reenablement */
+ WARN_ON(channel != 0);
saved_mode = kvm->arch.vpit->pit_state.channels[0].mode;
kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */
pit_load_count(kvm, channel, val);
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index 9e8bf13572e6..3f8c732117ec 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -120,14 +120,22 @@ static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state)
return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK;
}
-static u8 mtrr_disabled_type(void)
+static u8 mtrr_disabled_type(struct kvm_vcpu *vcpu)
{
/*
* Intel SDM 11.11.2.2: all MTRRs are disabled when
* IA32_MTRR_DEF_TYPE.E bit is cleared, and the UC
* memory type is applied to all of physical memory.
+ *
+ * However, virtual machines can be run with CPUID such that
+ * there are no MTRRs. In that case, the firmware will never
+ * enable MTRRs and it is obviously undesirable to run the
+ * guest entirely with UC memory and we use WB.
*/
- return MTRR_TYPE_UNCACHABLE;
+ if (guest_cpuid_has_mtrr(vcpu))
+ return MTRR_TYPE_UNCACHABLE;
+ else
+ return MTRR_TYPE_WRBACK;
}
/*
@@ -267,7 +275,7 @@ static int fixed_mtrr_addr_to_seg(u64 addr)
for (seg = 0; seg < seg_num; seg++) {
mtrr_seg = &fixed_seg_table[seg];
- if (mtrr_seg->start >= addr && addr < mtrr_seg->end)
+ if (mtrr_seg->start <= addr && addr < mtrr_seg->end)
return seg;
}
@@ -300,7 +308,6 @@ static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end)
*start = range->base & PAGE_MASK;
mask = range->mask & PAGE_MASK;
- mask |= ~0ULL << boot_cpu_data.x86_phys_bits;
/* This cannot overflow because writing to the reserved bits of
* variable MTRRs causes a #GP.
@@ -356,10 +363,14 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (var_mtrr_range_is_valid(cur))
list_del(&mtrr_state->var_ranges[index].node);
+ /* Extend the mask with all 1 bits to the left, since those
+ * bits must implicitly be 0. The bits are then cleared
+ * when reading them.
+ */
if (!is_mtrr_mask)
cur->base = data;
else
- cur->mask = data;
+ cur->mask = data | (-1LL << cpuid_maxphyaddr(vcpu));
/* add it to the list if it's enabled. */
if (var_mtrr_range_is_valid(cur)) {
@@ -426,6 +437,8 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
*pdata = vcpu->arch.mtrr_state.var_ranges[index].base;
else
*pdata = vcpu->arch.mtrr_state.var_ranges[index].mask;
+
+ *pdata &= (1ULL << cpuid_maxphyaddr(vcpu)) - 1;
}
return 0;
@@ -670,7 +683,7 @@ u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
}
if (iter.mtrr_disabled)
- return mtrr_disabled_type();
+ return mtrr_disabled_type(vcpu);
/* not contained in any MTRRs. */
if (type == -1)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index af342150558a..c13a64b7d789 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3063,6 +3063,23 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_UCODE_REV:
msr_info->data = 0x01000065;
break;
+ case MSR_F15H_IC_CFG: {
+
+ int family, model;
+
+ family = guest_cpuid_family(vcpu);
+ model = guest_cpuid_model(vcpu);
+
+ if (family < 0 || model < 0)
+ return kvm_get_msr_common(vcpu, msr_info);
+
+ msr_info->data = 0;
+
+ if (family == 0x15 &&
+ (model >= 0x2 && model < 0x20))
+ msr_info->data = 0x1E;
+ }
+ break;
default:
return kvm_get_msr_common(vcpu, msr_info);
}
@@ -3444,6 +3461,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
struct kvm_run *kvm_run = vcpu->run;
u32 exit_code = svm->vmcb->control.exit_code;
+ trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
+
if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
vcpu->arch.cr0 = svm->vmcb->save.cr0;
if (npt_enabled)
@@ -3918,8 +3937,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
- trace_kvm_exit(svm->vmcb->control.exit_code, vcpu, KVM_ISA_SVM);
-
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_before_handle_nmi(&svm->vcpu);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index be3f1735d5eb..04d61d496b14 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2868,7 +2868,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vcpu->arch.ia32_xss;
break;
case MSR_TSC_AUX:
- if (!guest_cpuid_has_rdtscp(vcpu))
+ if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated)
return 1;
/* Otherwise falls through */
default:
@@ -2974,7 +2974,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
break;
case MSR_TSC_AUX:
- if (!guest_cpuid_has_rdtscp(vcpu))
+ if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated)
return 1;
/* Check reserved bit, higher 32 bits should be zero */
if ((data >> 32) != 0)
@@ -8101,6 +8101,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
u32 exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
+ trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
+
/*
* Flush logged GPAs PML buffer, this will make dirty_bitmap more
* updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
@@ -8726,7 +8728,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->launched = 1;
vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
- trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
/*
* the KVM_REQ_EVENT optimization bit is only on for one entry, and if
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fad1d096919e..f53f5b13c677 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3613,9 +3613,11 @@ static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
+ int i;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
- kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(kvm, i, ps->channels[i].count, 0);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return 0;
}
@@ -3634,6 +3636,7 @@ static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
{
int start = 0;
+ int i;
u32 prev_legacy, cur_legacy;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
@@ -3643,7 +3646,9 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
sizeof(kvm->arch.vpit->pit_state.channels));
kvm->arch.vpit->pit_state.flags = ps->flags;
- kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count,
+ start && i == 0);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return 0;
}
@@ -6590,6 +6595,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (req_immediate_exit)
smp_send_reschedule(vcpu->cpu);
+ trace_kvm_entry(vcpu->vcpu_id);
+ wait_lapic_expire(vcpu);
__kvm_guest_enter();
if (unlikely(vcpu->arch.switch_db_regs)) {
@@ -6602,8 +6609,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
}
- trace_kvm_entry(vcpu->vcpu_id);
- wait_lapic_expire(vcpu);
kvm_x86_ops->run(vcpu);
/*
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index a0d09f6c6533..4ba229ac3f4f 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1414,6 +1414,7 @@ __init void lguest_init(void)
pv_info.kernel_rpl = 1;
/* Everyone except Xen runs with this set. */
pv_info.shared_kernel_pmd = 1;
+ pv_info.features = 0;
/*
* We set up all the lguest overrides for sensitive operations. These
@@ -1472,7 +1473,6 @@ __init void lguest_init(void)
pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu;
pv_mmu_ops.pte_update = lguest_pte_update;
- pv_mmu_ops.pte_update_defer = lguest_pte_update;
#ifdef CONFIG_X86_LOCAL_APIC
/* APIC read/write intercepts */
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index f2587888d987..a501fa25da41 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -16,7 +16,7 @@ clean-files := inat-tables.c
obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
-lib-y := delay.o misc.o cmdline.o
+lib-y := delay.o misc.o cmdline.o cpu.o
lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
lib-y += memcpy_$(BITS).o
lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
diff --git a/arch/x86/lib/cpu.c b/arch/x86/lib/cpu.c
new file mode 100644
index 000000000000..aa417a97511c
--- /dev/null
+++ b/arch/x86/lib/cpu.c
@@ -0,0 +1,35 @@
+#include <linux/module.h>
+
+unsigned int x86_family(unsigned int sig)
+{
+ unsigned int x86;
+
+ x86 = (sig >> 8) & 0xf;
+
+ if (x86 == 0xf)
+ x86 += (sig >> 20) & 0xff;
+
+ return x86;
+}
+EXPORT_SYMBOL_GPL(x86_family);
+
+unsigned int x86_model(unsigned int sig)
+{
+ unsigned int fam, model;
+
+ fam = x86_family(sig);
+
+ model = (sig >> 4) & 0xf;
+
+ if (fam >= 0x6)
+ model += ((sig >> 16) & 0xf) << 4;
+
+ return model;
+}
+EXPORT_SYMBOL_GPL(x86_model);
+
+unsigned int x86_stepping(unsigned int sig)
+{
+ return sig & 0xf;
+}
+EXPORT_SYMBOL_GPL(x86_stepping);
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 43623739c7cf..004c861b1648 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -1,6 +1,8 @@
#include <linux/module.h>
#include <linux/preempt.h>
#include <asm/msr.h>
+#define CREATE_TRACE_POINTS
+#include <asm/msr-trace.h>
struct msr *msrs_alloc(void)
{
@@ -108,3 +110,27 @@ int msr_clear_bit(u32 msr, u8 bit)
{
return __flip_bit(msr, bit, false);
}
+
+#ifdef CONFIG_TRACEPOINTS
+void do_trace_write_msr(unsigned msr, u64 val, int failed)
+{
+ trace_write_msr(msr, val, failed);
+}
+EXPORT_SYMBOL(do_trace_write_msr);
+EXPORT_TRACEPOINT_SYMBOL(write_msr);
+
+void do_trace_read_msr(unsigned msr, u64 val, int failed)
+{
+ trace_read_msr(msr, val, failed);
+}
+EXPORT_SYMBOL(do_trace_read_msr);
+EXPORT_TRACEPOINT_SYMBOL(read_msr);
+
+void do_trace_rdpmc(unsigned counter, u64 val, int failed)
+{
+ trace_rdpmc(counter, val, failed);
+}
+EXPORT_SYMBOL(do_trace_rdpmc);
+EXPORT_TRACEPOINT_SYMBOL(rdpmc);
+
+#endif
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 65c47fda26fc..f9d38a48e3c8 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_X86_PTDUMP_CORE) += dump_pagetables.o
+obj-$(CONFIG_X86_PTDUMP) += debug_pagetables.o
obj-$(CONFIG_HIGHMEM) += highmem_32.o
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
new file mode 100644
index 000000000000..bfcffdf6c577
--- /dev/null
+++ b/arch/x86/mm/debug_pagetables.c
@@ -0,0 +1,46 @@
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <asm/pgtable.h>
+
+static int ptdump_show(struct seq_file *m, void *v)
+{
+ ptdump_walk_pgd_level(m, NULL);
+ return 0;
+}
+
+static int ptdump_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, ptdump_show, NULL);
+}
+
+static const struct file_operations ptdump_fops = {
+ .owner = THIS_MODULE,
+ .open = ptdump_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *pe;
+
+static int __init pt_dump_debug_init(void)
+{
+ pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL,
+ &ptdump_fops);
+ if (!pe)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __exit pt_dump_debug_exit(void)
+{
+ debugfs_remove_recursive(pe);
+}
+
+module_init(pt_dump_debug_init);
+module_exit(pt_dump_debug_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
+MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables");
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index a035c2aa7801..4a6f1d9b5106 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -89,7 +89,7 @@ static struct addr_marker address_markers[] = {
{ 0/* VMALLOC_START */, "vmalloc() Area" },
{ 0/*VMALLOC_END*/, "vmalloc() End" },
# ifdef CONFIG_HIGHMEM
- { 0/*PKMAP_BASE*/, "Persisent kmap() Area" },
+ { 0/*PKMAP_BASE*/, "Persistent kmap() Area" },
# endif
{ 0/*FIXADDR_START*/, "Fixmap Area" },
#endif
@@ -426,38 +426,15 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
{
ptdump_walk_pgd_level_core(m, pgd, false);
}
+EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);
void ptdump_walk_pgd_level_checkwx(void)
{
ptdump_walk_pgd_level_core(NULL, NULL, true);
}
-#ifdef CONFIG_X86_PTDUMP
-static int ptdump_show(struct seq_file *m, void *v)
+static int __init pt_dump_init(void)
{
- ptdump_walk_pgd_level(m, NULL);
- return 0;
-}
-
-static int ptdump_open(struct inode *inode, struct file *filp)
-{
- return single_open(filp, ptdump_show, NULL);
-}
-
-static const struct file_operations ptdump_fops = {
- .open = ptdump_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-#endif
-
-static int pt_dump_init(void)
-{
-#ifdef CONFIG_X86_PTDUMP
- struct dentry *pe;
-#endif
-
#ifdef CONFIG_X86_32
/* Not a compile-time constant on x86-32 */
address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
@@ -468,13 +445,6 @@ static int pt_dump_init(void)
address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
#endif
-#ifdef CONFIG_X86_PTDUMP
- pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL,
- &ptdump_fops);
- if (!pe)
- return -ENOMEM;
-#endif
-
return 0;
}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ec081fe0ce2c..8829482d69ec 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -814,8 +814,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
if (phys_addr < (phys_addr_t)0x40000000)
return;
- if (IS_ALIGNED(addr, PAGE_SIZE) &&
- IS_ALIGNED(next, PAGE_SIZE)) {
+ if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
/*
* Do not free direct mapping pages since they were
* freed when offlining, or simplely not in use.
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index b9c78f3bcd67..0d8d53d1f5cc 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -194,8 +194,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
* Check if the request spans more than any BAR in the iomem resource
* tree.
*/
- WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size),
- KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
+ if (iomem_map_sanity_check(unaligned_phys_addr, unaligned_size))
+ pr_warn("caller %pS mapping multiple BARs\n", caller);
return ret_addr;
err_free_area:
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a3137a4feed1..fc6a4c8f6e2a 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -66,6 +66,9 @@ void update_page_count(int level, unsigned long pages)
static void split_page_count(int level)
{
+ if (direct_pages_count[level] == 0)
+ return;
+
direct_pages_count[level]--;
direct_pages_count[level - 1] += PTRS_PER_PTE;
}
@@ -129,14 +132,16 @@ within(unsigned long addr, unsigned long start, unsigned long end)
*/
void clflush_cache_range(void *vaddr, unsigned int size)
{
- unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
+ const unsigned long clflush_size = boot_cpu_data.x86_clflush_size;
+ void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1));
void *vend = vaddr + size;
- void *p;
+
+ if (p >= vend)
+ return;
mb();
- for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
- p < vend; p += boot_cpu_data.x86_clflush_size)
+ for (; p < vend; p += clflush_size)
clflushopt(p);
mb();
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 188e3e07eeeb..031782e74231 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -586,7 +586,7 @@ int free_memtype(u64 start, u64 end)
entry = rbt_memtype_erase(start, end);
spin_unlock(&memtype_lock);
- if (!entry) {
+ if (IS_ERR(entry)) {
pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
current->comm, current->pid, start, end - 1);
return -EINVAL;
@@ -992,6 +992,16 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
vma->vm_flags &= ~VM_PAT;
}
+/*
+ * untrack_pfn_moved is called, while mremapping a pfnmap for a new region,
+ * with the old vma after its pfnmap page table has been removed. The new
+ * vma has a new pfnmap to the same pfn & cache type with VM_PAT set.
+ */
+void untrack_pfn_moved(struct vm_area_struct *vma)
+{
+ vma->vm_flags &= ~VM_PAT;
+}
+
pgprot_t pgprot_writecombine(pgprot_t prot)
{
return __pgprot(pgprot_val(prot) |
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index 63931080366a..2f7702253ccf 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -98,8 +98,13 @@ static struct memtype *memtype_rb_lowest_match(struct rb_root *root,
return last_lower; /* Returns NULL if there is no overlap */
}
-static struct memtype *memtype_rb_exact_match(struct rb_root *root,
- u64 start, u64 end)
+enum {
+ MEMTYPE_EXACT_MATCH = 0,
+ MEMTYPE_END_MATCH = 1
+};
+
+static struct memtype *memtype_rb_match(struct rb_root *root,
+ u64 start, u64 end, int match_type)
{
struct memtype *match;
@@ -107,7 +112,12 @@ static struct memtype *memtype_rb_exact_match(struct rb_root *root,
while (match != NULL && match->start < end) {
struct rb_node *node;
- if (match->start == start && match->end == end)
+ if ((match_type == MEMTYPE_EXACT_MATCH) &&
+ (match->start == start) && (match->end == end))
+ return match;
+
+ if ((match_type == MEMTYPE_END_MATCH) &&
+ (match->start < start) && (match->end == end))
return match;
node = rb_next(&match->rb);
@@ -117,7 +127,7 @@ static struct memtype *memtype_rb_exact_match(struct rb_root *root,
match = NULL;
}
- return NULL; /* Returns NULL if there is no exact match */
+ return NULL; /* Returns NULL if there is no match */
}
static int memtype_rb_check_conflict(struct rb_root *root,
@@ -210,12 +220,36 @@ struct memtype *rbt_memtype_erase(u64 start, u64 end)
{
struct memtype *data;
- data = memtype_rb_exact_match(&memtype_rbroot, start, end);
- if (!data)
- goto out;
+ /*
+ * Since the memtype_rbroot tree allows overlapping ranges,
+ * rbt_memtype_erase() checks with EXACT_MATCH first, i.e. free
+ * a whole node for the munmap case. If no such entry is found,
+ * it then checks with END_MATCH, i.e. shrink the size of a node
+ * from the end for the mremap case.
+ */
+ data = memtype_rb_match(&memtype_rbroot, start, end,
+ MEMTYPE_EXACT_MATCH);
+ if (!data) {
+ data = memtype_rb_match(&memtype_rbroot, start, end,
+ MEMTYPE_END_MATCH);
+ if (!data)
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (data->start == start) {
+ /* munmap: erase this node */
+ rb_erase_augmented(&data->rb, &memtype_rbroot,
+ &memtype_rb_augment_cb);
+ } else {
+ /* mremap: update the end value of this node */
+ rb_erase_augmented(&data->rb, &memtype_rbroot,
+ &memtype_rb_augment_cb);
+ data->end = start;
+ data->subtree_max_end = data->end;
+ memtype_rb_insert(&memtype_rbroot, data);
+ return NULL;
+ }
- rb_erase_augmented(&data->rb, &memtype_rbroot, &memtype_rb_augment_cb);
-out:
return data;
}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index fb0a9dd1d6e4..ee9c2e3a7199 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -414,7 +414,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
if (changed && dirty) {
*ptep = entry;
- pte_update_defer(vma->vm_mm, address, ptep);
+ pte_update(vma->vm_mm, address, ptep);
}
return changed;
@@ -431,7 +431,6 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
if (changed && dirty) {
*pmdp = entry;
- pmd_update_defer(vma->vm_mm, address, pmdp);
/*
* We had a write-protection fault here and changed the pmd
* to to more permissive. No need to flush the TLB for that,
@@ -469,9 +468,6 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
(unsigned long *)pmdp);
- if (ret)
- pmd_update(vma->vm_mm, addr, pmdp);
-
return ret;
}
#endif
@@ -518,7 +514,6 @@ void pmdp_splitting_flush(struct vm_area_struct *vma,
set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
(unsigned long *)pmdp);
if (set) {
- pmd_update(vma->vm_mm, address, pmdp);
/* need tlb flush only to serialize against gup-fast */
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
}
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index 90555bf60aa4..92e2eacb3321 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -31,7 +31,7 @@ early_param("noexec", noexec_setup);
void x86_configure_nx(void)
{
- if (cpu_has_nx && !disable_nx)
+ if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx)
__supported_pte_mask |= _PAGE_NX;
else
__supported_pte_mask &= ~_PAGE_NX;
@@ -39,7 +39,7 @@ void x86_configure_nx(void)
void __init x86_report_nx(void)
{
- if (!cpu_has_nx) {
+ if (!boot_cpu_has(X86_FEATURE_NX)) {
printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
"missing in CPU!\n");
} else {
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index c2aea63bee20..b5f821881465 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -203,6 +203,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n",
(unsigned long long)start, (unsigned long long)end - 1);
+ max_possible_pfn = max(max_possible_pfn, PFN_UP(end - 1));
+
return 0;
out_err_bad_srat:
bad_srat();
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 8ddb5d0d66fb..8f4cc3dfac32 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -161,7 +161,10 @@ void flush_tlb_current_task(void)
preempt_disable();
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+
+ /* This is an implicit full barrier that synchronizes with switch_mm. */
local_flush_tlb();
+
trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
@@ -188,17 +191,29 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
preempt_disable();
- if (current->active_mm != mm)
+ if (current->active_mm != mm) {
+ /* Synchronize with switch_mm. */
+ smp_mb();
+
goto out;
+ }
if (!current->mm) {
leave_mm(smp_processor_id());
+
+ /* Synchronize with switch_mm. */
+ smp_mb();
+
goto out;
}
if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
base_pages_to_flush = (end - start) >> PAGE_SHIFT;
+ /*
+ * Both branches below are implicit full barriers (MOV to CR or
+ * INVLPG) that synchronize with switch_mm.
+ */
if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
base_pages_to_flush = TLB_FLUSH_ALL;
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
@@ -228,10 +243,18 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
preempt_disable();
if (current->active_mm == mm) {
- if (current->mm)
+ if (current->mm) {
+ /*
+ * Implicit full barrier (INVLPG) that synchronizes
+ * with switch_mm.
+ */
__flush_tlb_one(start);
- else
+ } else {
leave_mm(smp_processor_id());
+
+ /* Synchronize with switch_mm. */
+ smp_mb();
+ }
}
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 75991979f667..4286f3618bd0 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -193,7 +193,7 @@ struct jit_context {
32 /* space for rbx, r13, r14, r15 */ + \
8 /* space for skb_copy_bits() buffer */)
-#define PROLOGUE_SIZE 51
+#define PROLOGUE_SIZE 48
/* emit x64 prologue code for BPF program and check it's size.
* bpf_tail_call helper will skip it while jumping into another program
@@ -229,11 +229,15 @@ static void emit_prologue(u8 **pprog)
/* mov qword ptr [rbp-X],r15 */
EMIT3_off32(0x4C, 0x89, 0xBD, -STACKSIZE + 24);
- /* clear A and X registers */
- EMIT2(0x31, 0xc0); /* xor eax, eax */
- EMIT3(0x4D, 0x31, 0xED); /* xor r13, r13 */
+ /* Clear the tail call counter (tail_call_cnt): for eBPF tail calls
+ * we need to reset the counter to 0. It's done in two instructions,
+ * resetting rax register to 0 (xor on eax gets 0 extended), and
+ * moving it to the counter location.
+ */
- /* clear tail_cnt: mov qword ptr [rbp-X], rax */
+ /* xor eax, eax */
+ EMIT2(0x31, 0xc0);
+ /* mov qword ptr [rbp-X], rax */
EMIT3_off32(0x48, 0x89, 0x85, -STACKSIZE + 32);
BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
@@ -455,6 +459,18 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
}
case BPF_ALU | BPF_MOV | BPF_K:
+ /* optimization: if imm32 is zero, use 'xor <dst>,<dst>'
+ * to save 3 bytes.
+ */
+ if (imm32 == 0) {
+ if (is_ereg(dst_reg))
+ EMIT1(add_2mod(0x40, dst_reg, dst_reg));
+ b2 = 0x31; /* xor */
+ b3 = 0xC0;
+ EMIT2(b2, add_2reg(b3, dst_reg, dst_reg));
+ break;
+ }
+
/* mov %eax, imm32 */
if (is_ereg(dst_reg))
EMIT1(add_1mod(0x40, dst_reg));
@@ -469,6 +485,20 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
return -EINVAL;
}
+ /* optimization: if imm64 is zero, use 'xor <dst>,<dst>'
+ * to save 7 bytes.
+ */
+ if (insn[0].imm == 0 && insn[1].imm == 0) {
+ b1 = add_2mod(0x48, dst_reg, dst_reg);
+ b2 = 0x31; /* xor */
+ b3 = 0xC0;
+ EMIT3(b1, b2, add_2reg(b3, dst_reg, dst_reg));
+
+ insn++;
+ i++;
+ break;
+ }
+
/* movabsq %rax, imm64 */
EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg));
EMIT(insn[0].imm, 4);
diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c
index 5ca8ead91579..81c769e80614 100644
--- a/arch/x86/platform/atom/punit_atom_debug.c
+++ b/arch/x86/platform/atom/punit_atom_debug.c
@@ -25,8 +25,6 @@
#include <asm/cpu_device_id.h>
#include <asm/iosf_mbi.h>
-/* Side band Interface port */
-#define PUNIT_PORT 0x04
/* Power gate status reg */
#define PWRGT_STATUS 0x61
/* Subsystem config/status Video processor */
@@ -85,9 +83,8 @@ static int punit_dev_state_show(struct seq_file *seq_file, void *unused)
seq_puts(seq_file, "\n\nPUNIT NORTH COMPLEX DEVICES :\n");
while (punit_devp->name) {
- status = iosf_mbi_read(PUNIT_PORT, BT_MBI_PMC_READ,
- punit_devp->reg,
- &punit_pwr_status);
+ status = iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_REG_READ,
+ punit_devp->reg, &punit_pwr_status);
if (status) {
seq_printf(seq_file, "%9s : Read Failed\n",
punit_devp->name);
diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c
index 0ee619f9fcb7..c1bdafaac3ca 100644
--- a/arch/x86/platform/intel-quark/imr.c
+++ b/arch/x86/platform/intel-quark/imr.c
@@ -111,23 +111,19 @@ static int imr_read(struct imr_device *idev, u32 imr_id, struct imr_regs *imr)
u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base;
int ret;
- ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ,
- reg++, &imr->addr_lo);
+ ret = iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->addr_lo);
if (ret)
return ret;
- ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ,
- reg++, &imr->addr_hi);
+ ret = iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->addr_hi);
if (ret)
return ret;
- ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ,
- reg++, &imr->rmask);
+ ret = iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->rmask);
if (ret)
return ret;
- return iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ,
- reg++, &imr->wmask);
+ return iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->wmask);
}
/**
@@ -151,31 +147,27 @@ static int imr_write(struct imr_device *idev, u32 imr_id,
local_irq_save(flags);
- ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, reg++,
- imr->addr_lo);
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->addr_lo);
if (ret)
goto failed;
- ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE,
- reg++, imr->addr_hi);
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->addr_hi);
if (ret)
goto failed;
- ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE,
- reg++, imr->rmask);
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->rmask);
if (ret)
goto failed;
- ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE,
- reg++, imr->wmask);
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->wmask);
if (ret)
goto failed;
/* Lock bit must be set separately to addr_lo address bits. */
if (lock) {
imr->addr_lo |= IMR_LOCK;
- ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE,
- reg - IMR_NUM_REGS, imr->addr_lo);
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE,
+ reg - IMR_NUM_REGS, imr->addr_lo);
if (ret)
goto failed;
}
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index 327f21c3bde1..8dd80050d705 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -28,6 +28,7 @@
#include <linux/nmi.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/clocksource.h>
#include <asm/apic.h>
#include <asm/current.h>
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 9ab52791fed5..d5f64996394a 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -23,6 +23,7 @@
#include <asm/debugreg.h>
#include <asm/cpu.h>
#include <asm/mmu_context.h>
+#include <linux/dmi.h>
#ifdef CONFIG_X86_32
__visible unsigned long saved_context_ebx;
@@ -32,6 +33,29 @@ __visible unsigned long saved_context_eflags;
#endif
struct saved_context saved_context;
+static void msr_save_context(struct saved_context *ctxt)
+{
+ struct saved_msr *msr = ctxt->saved_msrs.array;
+ struct saved_msr *end = msr + ctxt->saved_msrs.num;
+
+ while (msr < end) {
+ msr->valid = !rdmsrl_safe(msr->info.msr_no, &msr->info.reg.q);
+ msr++;
+ }
+}
+
+static void msr_restore_context(struct saved_context *ctxt)
+{
+ struct saved_msr *msr = ctxt->saved_msrs.array;
+ struct saved_msr *end = msr + ctxt->saved_msrs.num;
+
+ while (msr < end) {
+ if (msr->valid)
+ wrmsrl(msr->info.msr_no, msr->info.reg.q);
+ msr++;
+ }
+}
+
/**
* __save_processor_state - save CPU registers before creating a
* hibernation image and before restoring the memory state from it
@@ -111,6 +135,7 @@ static void __save_processor_state(struct saved_context *ctxt)
#endif
ctxt->misc_enable_saved = !rdmsrl_safe(MSR_IA32_MISC_ENABLE,
&ctxt->misc_enable);
+ msr_save_context(ctxt);
}
/* Needed by apm.c */
@@ -229,6 +254,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
x86_platform.restore_sched_clock_state();
mtrr_bp_restore();
perf_restore_debug_store();
+ msr_restore_context(ctxt);
}
/* Needed by apm.c */
@@ -320,3 +346,69 @@ static int __init bsp_pm_check_init(void)
}
core_initcall(bsp_pm_check_init);
+
+static int msr_init_context(const u32 *msr_id, const int total_num)
+{
+ int i = 0;
+ struct saved_msr *msr_array;
+
+ if (saved_context.saved_msrs.array || saved_context.saved_msrs.num > 0) {
+ pr_err("x86/pm: MSR quirk already applied, please check your DMI match table.\n");
+ return -EINVAL;
+ }
+
+ msr_array = kmalloc_array(total_num, sizeof(struct saved_msr), GFP_KERNEL);
+ if (!msr_array) {
+ pr_err("x86/pm: Can not allocate memory to save/restore MSRs during suspend.\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < total_num; i++) {
+ msr_array[i].info.msr_no = msr_id[i];
+ msr_array[i].valid = false;
+ msr_array[i].info.reg.q = 0;
+ }
+ saved_context.saved_msrs.num = total_num;
+ saved_context.saved_msrs.array = msr_array;
+
+ return 0;
+}
+
+/*
+ * The following section is a quirk framework for problematic BIOSen:
+ * Sometimes MSRs are modified by the BIOSen after suspended to
+ * RAM, this might cause unexpected behavior after wakeup.
+ * Thus we save/restore these specified MSRs across suspend/resume
+ * in order to work around it.
+ *
+ * For any further problematic BIOSen/platforms,
+ * please add your own function similar to msr_initialize_bdw.
+ */
+static int msr_initialize_bdw(const struct dmi_system_id *d)
+{
+ /* Add any extra MSR ids into this array. */
+ u32 bdw_msr_id[] = { MSR_IA32_THERM_CONTROL };
+
+ pr_info("x86/pm: %s detected, MSR saving is needed during suspending.\n", d->ident);
+ return msr_init_context(bdw_msr_id, ARRAY_SIZE(bdw_msr_id));
+}
+
+static struct dmi_system_id msr_save_dmi_table[] = {
+ {
+ .callback = msr_initialize_bdw,
+ .ident = "BROADWELL BDX_EP",
+ .matches = {
+ DMI_MATCH(DMI_PRODUCT_NAME, "GRANTLEY"),
+ DMI_MATCH(DMI_PRODUCT_VERSION, "E63448-400"),
+ },
+ },
+ {}
+};
+
+static int pm_check_save_msr(void)
+{
+ dmi_check_system(msr_save_dmi_table);
+ return 0;
+}
+
+device_initcall(pm_check_save_msr);
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index a8fecc226946..3ee2bb6b440b 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -17,7 +17,7 @@ obj-y = bug.o bugs_$(BITS).o delay.o fault.o ksyms.o ldt.o \
ifeq ($(CONFIG_X86_32),y)
obj-y += checksum_32.o
-obj-$(CONFIG_BINFMT_ELF) += elfcore.o
+obj-$(CONFIG_ELF_CORE) += elfcore.o
subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o
subarch-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += ../lib/rwsem.o
diff --git a/arch/x86/um/asm/syscall.h b/arch/x86/um/asm/syscall.h
index 81d6562ce01d..11ab90dc5f14 100644
--- a/arch/x86/um/asm/syscall.h
+++ b/arch/x86/um/asm/syscall.h
@@ -1,6 +1,7 @@
#ifndef __UM_ASM_SYSCALL_H
#define __UM_ASM_SYSCALL_H
+#include <asm/syscall-generic.h>
#include <uapi/linux/audit.h>
typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long,
diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c
index a29756f2d940..47c78d5e5c32 100644
--- a/arch/x86/um/ptrace_32.c
+++ b/arch/x86/um/ptrace_32.c
@@ -68,6 +68,7 @@ static const int reg_offsets[] = {
[EFL] = HOST_EFLAGS,
[UESP] = HOST_SP,
[SS] = HOST_SS,
+ [ORIG_EAX] = HOST_ORIG_AX,
};
int putreg(struct task_struct *child, int regno, unsigned long value)
@@ -83,6 +84,7 @@ int putreg(struct task_struct *child, int regno, unsigned long value)
case EAX:
case EIP:
case UESP:
+ case ORIG_EAX:
break;
case FS:
if (value && (value & 3) != 3)
@@ -108,9 +110,6 @@ int putreg(struct task_struct *child, int regno, unsigned long value)
value &= FLAG_MASK;
child->thread.regs.regs.gp[HOST_EFLAGS] |= value;
return 0;
- case ORIG_EAX:
- child->thread.regs.regs.syscall = value;
- return 0;
default :
panic("Bad register in putreg() : %d\n", regno);
}
@@ -143,8 +142,6 @@ unsigned long getreg(struct task_struct *child, int regno)
regno >>= 2;
switch (regno) {
- case ORIG_EAX:
- return child->thread.regs.regs.syscall;
case FS:
case GS:
case DS:
@@ -163,6 +160,7 @@ unsigned long getreg(struct task_struct *child, int regno)
case EDI:
case EBP:
case EFL:
+ case ORIG_EAX:
break;
default:
panic("Bad register in getreg() : %d\n", regno);
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index e5f854ce2d72..14fcd01ed992 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -470,7 +470,7 @@ long sys_sigreturn(void)
struct sigcontext __user *sc = &frame->sc;
int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long);
- if (copy_from_user(&set.sig[0], (void *)sc->oldmask, sizeof(set.sig[0])) ||
+ if (copy_from_user(&set.sig[0], &sc->oldmask, sizeof(set.sig[0])) ||
copy_from_user(&set.sig[1], frame->extramask, sig_size))
goto segfault;
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index acda713ab5be..abf4901c917b 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -64,7 +64,7 @@ static u32 xen_apic_read(u32 reg)
if (reg != APIC_ID)
return 0;
- ret = HYPERVISOR_dom0_op(&op);
+ ret = HYPERVISOR_platform_op(&op);
if (ret)
return 0;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5774800ff583..d09e4c9d7cc5 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -415,7 +415,7 @@ static bool __init xen_check_mwait(void)
set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
- if ((HYPERVISOR_dom0_op(&op) == 0) &&
+ if ((HYPERVISOR_platform_op(&op) == 0) &&
(buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) {
cpuid_leaf5_ecx_val = cx;
cpuid_leaf5_edx_val = dx;
@@ -1192,7 +1192,7 @@ static const struct pv_info xen_info __initconst = {
#ifdef CONFIG_X86_64
.extra_user_64bit_cs = FLAT_USER_CS64,
#endif
-
+ .features = 0,
.name = "Xen",
};
@@ -1229,10 +1229,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.iret = xen_iret,
#ifdef CONFIG_X86_64
- .usergs_sysret32 = xen_sysret32,
.usergs_sysret64 = xen_sysret64,
-#else
- .irq_enable_sysexit = xen_sysexit,
#endif
.load_tr_desc = paravirt_nop,
@@ -1265,12 +1262,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.end_context_switch = xen_end_context_switch,
};
-static const struct pv_apic_ops xen_apic_ops __initconst = {
-#ifdef CONFIG_X86_LOCAL_APIC
- .startup_ipi_hook = paravirt_nop,
-#endif
-};
-
static void xen_reboot(int reason)
{
struct sched_shutdown r = { .reason = reason };
@@ -1374,7 +1365,7 @@ static void __init xen_boot_params_init_edd(void)
info->params.length = sizeof(info->params);
set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params,
&info->params);
- ret = HYPERVISOR_dom0_op(&op);
+ ret = HYPERVISOR_platform_op(&op);
if (ret)
break;
@@ -1392,7 +1383,7 @@ static void __init xen_boot_params_init_edd(void)
op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE;
for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) {
op.u.firmware_info.index = nr;
- ret = HYPERVISOR_dom0_op(&op);
+ ret = HYPERVISOR_platform_op(&op);
if (ret)
break;
mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature;
@@ -1535,8 +1526,9 @@ asmlinkage __visible void __init xen_start_kernel(void)
/* Install Xen paravirt ops */
pv_info = xen_info;
+ if (xen_initial_domain())
+ pv_info.features |= PV_SUPPORTED_RTC;
pv_init_ops = xen_init_ops;
- pv_apic_ops = xen_apic_ops;
if (!xen_pvh_domain()) {
pv_cpu_ops = xen_cpu_ops;
@@ -1698,7 +1690,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
xen_start_info->console.domU.mfn = 0;
xen_start_info->console.domU.evtchn = 0;
- if (HYPERVISOR_dom0_op(&op) == 0)
+ if (HYPERVISOR_platform_op(&op) == 0)
boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags;
/* Make sure ACS will be enabled */
@@ -1886,8 +1878,10 @@ EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);
static void xen_set_cpu_features(struct cpuinfo_x86 *c)
{
- if (xen_pv_domain())
+ if (xen_pv_domain()) {
clear_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+ set_cpu_cap(c, X86_FEATURE_XENPV);
+ }
}
const struct hypervisor_x86 x86_hyper_xen = {
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index ac161db63388..c913ca4f6958 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -2436,7 +2436,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.flush_tlb_others = xen_flush_tlb_others,
.pte_update = paravirt_nop,
- .pte_update_defer = paravirt_nop,
.pgd_alloc = xen_pgd_alloc,
.pgd_free = xen_pgd_free,
@@ -2495,14 +2494,9 @@ void __init xen_init_mmu_ops(void)
{
x86_init.paging.pagetable_init = xen_pagetable_init;
- /* Optimization - we can use the HVM one but it has no idea which
- * VCPUs are descheduled - which means that it will needlessly IPI
- * them. Xen knows so let it do the job.
- */
- if (xen_feature(XENFEAT_auto_translated_physmap)) {
- pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;
+ if (xen_feature(XENFEAT_auto_translated_physmap))
return;
- }
+
pv_mmu_ops = xen_mmu_ops;
memset(dummy_mapping, 0xff, PAGE_SIZE);
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index feddabdab448..7f664c416faf 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -1,6 +1,7 @@
#include <linux/types.h>
#include <linux/tick.h>
+#include <xen/xen.h>
#include <xen/interface/xen.h>
#include <xen/grant_table.h>
#include <xen/events.h>
@@ -33,7 +34,8 @@ static void xen_hvm_post_suspend(int suspend_cancelled)
{
#ifdef CONFIG_XEN_PVHVM
int cpu;
- xen_hvm_init_shared_info();
+ if (!suspend_cancelled)
+ xen_hvm_init_shared_info();
xen_callback_vector();
xen_unplug_emulated_devices();
if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
@@ -68,26 +70,16 @@ static void xen_pv_post_suspend(int suspend_cancelled)
void xen_arch_pre_suspend(void)
{
- int cpu;
-
- for_each_online_cpu(cpu)
- xen_pmu_finish(cpu);
-
if (xen_pv_domain())
xen_pv_pre_suspend();
}
void xen_arch_post_suspend(int cancelled)
{
- int cpu;
-
if (xen_pv_domain())
xen_pv_post_suspend(cancelled);
else
xen_hvm_post_suspend(cancelled);
-
- for_each_online_cpu(cpu)
- xen_pmu_init(cpu);
}
static void xen_vcpu_notify_restore(void *data)
@@ -106,10 +98,20 @@ static void xen_vcpu_notify_suspend(void *data)
void xen_arch_resume(void)
{
+ int cpu;
+
on_each_cpu(xen_vcpu_notify_restore, NULL, 1);
+
+ for_each_online_cpu(cpu)
+ xen_pmu_init(cpu);
}
void xen_arch_suspend(void)
{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ xen_pmu_finish(cpu);
+
on_each_cpu(xen_vcpu_notify_suspend, NULL, 1);
}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index f1ba6a092854..a0a4e554c6f1 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -16,6 +16,7 @@
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/pvclock_gtod.h>
+#include <linux/timekeeper_internal.h>
#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
@@ -32,86 +33,12 @@
#define TIMER_SLOP 100000
#define NS_PER_TICK (1000000000LL / HZ)
-/* runstate info updated by Xen */
-static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
-
/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot);
/* unused ns of stolen time */
static DEFINE_PER_CPU(u64, xen_residual_stolen);
-/* return an consistent snapshot of 64-bit time/counter value */
-static u64 get64(const u64 *p)
-{
- u64 ret;
-
- if (BITS_PER_LONG < 64) {
- u32 *p32 = (u32 *)p;
- u32 h, l;
-
- /*
- * Read high then low, and then make sure high is
- * still the same; this will only loop if low wraps
- * and carries into high.
- * XXX some clean way to make this endian-proof?
- */
- do {
- h = p32[1];
- barrier();
- l = p32[0];
- barrier();
- } while (p32[1] != h);
-
- ret = (((u64)h) << 32) | l;
- } else
- ret = *p;
-
- return ret;
-}
-
-/*
- * Runstate accounting
- */
-static void get_runstate_snapshot(struct vcpu_runstate_info *res)
-{
- u64 state_time;
- struct vcpu_runstate_info *state;
-
- BUG_ON(preemptible());
-
- state = this_cpu_ptr(&xen_runstate);
-
- /*
- * The runstate info is always updated by the hypervisor on
- * the current CPU, so there's no need to use anything
- * stronger than a compiler barrier when fetching it.
- */
- do {
- state_time = get64(&state->state_entry_time);
- barrier();
- *res = *state;
- barrier();
- } while (get64(&state->state_entry_time) != state_time);
-}
-
-/* return true when a vcpu could run but has no real cpu to run on */
-bool xen_vcpu_stolen(int vcpu)
-{
- return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable;
-}
-
-void xen_setup_runstate_info(int cpu)
-{
- struct vcpu_register_runstate_memory_area area;
-
- area.addr.v = &per_cpu(xen_runstate, cpu);
-
- if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
- cpu, &area))
- BUG();
-}
-
static void do_stolen_accounting(void)
{
struct vcpu_runstate_info state;
@@ -119,7 +46,7 @@ static void do_stolen_accounting(void)
s64 runnable, offline, stolen;
cputime_t ticks;
- get_runstate_snapshot(&state);
+ xen_get_runstate_snapshot(&state);
WARN_ON(state.state != RUNSTATE_running);
@@ -194,26 +121,46 @@ static int xen_pvclock_gtod_notify(struct notifier_block *nb,
unsigned long was_set, void *priv)
{
/* Protected by the calling core code serialization */
- static struct timespec next_sync;
+ static struct timespec64 next_sync;
struct xen_platform_op op;
- struct timespec now;
+ struct timespec64 now;
+ struct timekeeper *tk = priv;
+ static bool settime64_supported = true;
+ int ret;
- now = __current_kernel_time();
+ now.tv_sec = tk->xtime_sec;
+ now.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
/*
* We only take the expensive HV call when the clock was set
* or when the 11 minutes RTC synchronization time elapsed.
*/
- if (!was_set && timespec_compare(&now, &next_sync) < 0)
+ if (!was_set && timespec64_compare(&now, &next_sync) < 0)
return NOTIFY_OK;
- op.cmd = XENPF_settime;
- op.u.settime.secs = now.tv_sec;
- op.u.settime.nsecs = now.tv_nsec;
- op.u.settime.system_time = xen_clocksource_read();
+again:
+ if (settime64_supported) {
+ op.cmd = XENPF_settime64;
+ op.u.settime64.mbz = 0;
+ op.u.settime64.secs = now.tv_sec;
+ op.u.settime64.nsecs = now.tv_nsec;
+ op.u.settime64.system_time = xen_clocksource_read();
+ } else {
+ op.cmd = XENPF_settime32;
+ op.u.settime32.secs = now.tv_sec;
+ op.u.settime32.nsecs = now.tv_nsec;
+ op.u.settime32.system_time = xen_clocksource_read();
+ }
+
+ ret = HYPERVISOR_platform_op(&op);
- (void)HYPERVISOR_dom0_op(&op);
+ if (ret == -ENOSYS && settime64_supported) {
+ settime64_supported = false;
+ goto again;
+ }
+ if (ret < 0)
+ return NOTIFY_BAD;
/*
* Move the next drift compensation time 11 minutes
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index fd92a64d748e..feb6d40a0860 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -35,20 +35,6 @@ check_events:
ret
/*
- * We can't use sysexit directly, because we're not running in ring0.
- * But we can easily fake it up using iret. Assuming xen_sysexit is
- * jumped to with a standard stack frame, we can just strip it back to
- * a standard iret frame and use iret.
- */
-ENTRY(xen_sysexit)
- movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */
- orl $X86_EFLAGS_IF, PT_EFLAGS(%esp)
- lea PT_EIP(%esp), %esp
-
- jmp xen_iret
-ENDPROC(xen_sysexit)
-
-/*
* This is run where a normal iret would be run, with the same stack setup:
* 8: eflags
* 4: cs
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index f22667abf7b9..cc8acc410ddb 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -68,25 +68,6 @@ ENTRY(xen_sysret64)
ENDPATCH(xen_sysret64)
RELOC(xen_sysret64, 1b+1)
-ENTRY(xen_sysret32)
- /*
- * We're already on the usermode stack at this point, but
- * still with the kernel gs, so we can easily switch back
- */
- movq %rsp, PER_CPU_VAR(rsp_scratch)
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
-
- pushq $__USER32_DS
- pushq PER_CPU_VAR(rsp_scratch)
- pushq %r11
- pushq $__USER32_CS
- pushq %rcx
-
- pushq $0
-1: jmp hypercall_iret
-ENDPATCH(xen_sysret32)
-RELOC(xen_sysret32, 1b+1)
-
/*
* Xen handles syscall callbacks much like ordinary exceptions, which
* means we have:
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 1399423f3418..4140b070f2e9 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -139,9 +139,6 @@ DECL_ASM(void, xen_restore_fl_direct, unsigned long);
/* These are not functions, and cannot be called normally */
__visible void xen_iret(void);
-#ifdef CONFIG_X86_32
-__visible void xen_sysexit(void);
-#endif
__visible void xen_sysret32(void);
__visible void xen_sysret64(void);
__visible void xen_adjust_exception_frame(void);