summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig68
-rw-r--r--arch/x86/boot/compressed/aslr.c5
-rw-r--r--arch/x86/boot/compressed/head_32.S3
-rw-r--r--arch/x86/boot/compressed/head_64.S5
-rw-r--r--arch/x86/boot/compressed/misc.c5
-rw-r--r--arch/x86/boot/compressed/misc.h6
-rw-r--r--arch/x86/boot/string.c2
-rw-r--r--arch/x86/boot/video-mode.c4
-rw-r--r--arch/x86/boot/video.c2
-rw-r--r--arch/x86/boot/video.h1
-rw-r--r--arch/x86/configs/i386_defconfig2
-rw-r--r--arch/x86/configs/x86_64_defconfig2
-rw-r--r--arch/x86/crypto/crc32c-pcl-intel-asm_64.S2
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64.S4
-rw-r--r--arch/x86/ia32/Makefile1
-rw-r--r--arch/x86/ia32/ia32_signal.c19
-rw-r--r--arch/x86/ia32/ia32entry.S485
-rw-r--r--arch/x86/ia32/nosyscall.c7
-rw-r--r--arch/x86/ia32/sys_ia32.c14
-rw-r--r--arch/x86/ia32/syscall_ia32.c25
-rw-r--r--arch/x86/include/asm/alternative-asm.h53
-rw-r--r--arch/x86/include/asm/alternative.h73
-rw-r--r--arch/x86/include/asm/apic.h3
-rw-r--r--arch/x86/include/asm/barrier.h6
-rw-r--r--arch/x86/include/asm/calling.h284
-rw-r--r--arch/x86/include/asm/compat.h2
-rw-r--r--arch/x86/include/asm/cpu.h2
-rw-r--r--arch/x86/include/asm/cpufeature.h42
-rw-r--r--arch/x86/include/asm/desc.h7
-rw-r--r--arch/x86/include/asm/dwarf2.h24
-rw-r--r--arch/x86/include/asm/e820.h8
-rw-r--r--arch/x86/include/asm/efi.h6
-rw-r--r--arch/x86/include/asm/elf.h11
-rw-r--r--arch/x86/include/asm/fpu-internal.h130
-rw-r--r--arch/x86/include/asm/hw_irq.h5
-rw-r--r--arch/x86/include/asm/insn.h2
-rw-r--r--arch/x86/include/asm/iommu_table.h11
-rw-r--r--arch/x86/include/asm/irqflags.h49
-rw-r--r--arch/x86/include/asm/jump_label.h5
-rw-r--r--arch/x86/include/asm/kvm_host.h28
-rw-r--r--arch/x86/include/asm/kvm_para.h2
-rw-r--r--arch/x86/include/asm/livepatch.h4
-rw-r--r--arch/x86/include/asm/mce.h16
-rw-r--r--arch/x86/include/asm/microcode.h73
-rw-r--r--arch/x86/include/asm/microcode_intel.h13
-rw-r--r--arch/x86/include/asm/mwait.h8
-rw-r--r--arch/x86/include/asm/page_types.h2
-rw-r--r--arch/x86/include/asm/paravirt.h13
-rw-r--r--arch/x86/include/asm/paravirt_types.h8
-rw-r--r--arch/x86/include/asm/pgalloc.h8
-rw-r--r--arch/x86/include/asm/pgtable-2level_types.h1
-rw-r--r--arch/x86/include/asm/pgtable-3level_types.h2
-rw-r--r--arch/x86/include/asm/pgtable.h8
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h1
-rw-r--r--arch/x86/include/asm/pgtable_types.h4
-rw-r--r--arch/x86/include/asm/processor.h110
-rw-r--r--arch/x86/include/asm/ptrace.h45
-rw-r--r--arch/x86/include/asm/pvclock.h1
-rw-r--r--arch/x86/include/asm/segment.h289
-rw-r--r--arch/x86/include/asm/setup.h5
-rw-r--r--arch/x86/include/asm/sigcontext.h6
-rw-r--r--arch/x86/include/asm/sighandling.h4
-rw-r--r--arch/x86/include/asm/smap.h30
-rw-r--r--arch/x86/include/asm/smp.h3
-rw-r--r--arch/x86/include/asm/special_insns.h24
-rw-r--r--arch/x86/include/asm/thread_info.h74
-rw-r--r--arch/x86/include/asm/uaccess_64.h2
-rw-r--r--arch/x86/include/uapi/asm/bootparam.h1
-rw-r--r--arch/x86/include/uapi/asm/msr-index.h18
-rw-r--r--arch/x86/include/uapi/asm/ptrace-abi.h16
-rw-r--r--arch/x86/include/uapi/asm/ptrace.h13
-rw-r--r--arch/x86/include/uapi/asm/sigcontext.h21
-rw-r--r--arch/x86/include/uapi/asm/vmx.h1
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/alternative.c163
-rw-r--r--arch/x86/kernel/apic/apic.c62
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c8
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c89
-rw-r--r--arch/x86/kernel/asm-offsets_32.c2
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/cpu/Makefile3
-rw-r--r--arch/x86/kernel/cpu/amd.c9
-rw-r--r--arch/x86/kernel/cpu/common.c126
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c715
-rw-r--r--arch/x86/kernel/cpu/intel_pt.h131
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h11
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c66
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c154
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c11
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c63
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c1
-rw-r--r--arch/x86/kernel/cpu/microcode/core_early.c75
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c4
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_early.c345
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_lib.c22
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.sh2
-rw-r--r--arch/x86/kernel/cpu/perf_event.c223
-rw-r--r--arch/x86/kernel/cpu/perf_event.h167
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c9
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_ibs.c12
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c908
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_bts.c525
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_cqm.c1379
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c31
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c321
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_pt.c1103
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c3
-rw-r--r--arch/x86/kernel/cpu/scattered.c1
-rw-r--r--arch/x86/kernel/crash.c2
-rw-r--r--arch/x86/kernel/devicetree.c4
-rw-r--r--arch/x86/kernel/dumpstack.c15
-rw-r--r--arch/x86/kernel/dumpstack_32.c13
-rw-r--r--arch/x86/kernel/dumpstack_64.c11
-rw-r--r--arch/x86/kernel/e820.c2
-rw-r--r--arch/x86/kernel/early_printk.c32
-rw-r--r--arch/x86/kernel/entry_32.S93
-rw-r--r--arch/x86/kernel/entry_64.S978
-rw-r--r--arch/x86/kernel/head64.c3
-rw-r--r--arch/x86/kernel/head_32.S3
-rw-r--r--arch/x86/kernel/head_64.S6
-rw-r--r--arch/x86/kernel/i387.c56
-rw-r--r--arch/x86/kernel/ioport.c2
-rw-r--r--arch/x86/kernel/irq.c4
-rw-r--r--arch/x86/kernel/irq_32.c2
-rw-r--r--arch/x86/kernel/irq_64.c2
-rw-r--r--arch/x86/kernel/irqinit.c3
-rw-r--r--arch/x86/kernel/kgdb.c4
-rw-r--r--arch/x86/kernel/kprobes/core.c13
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/module.c11
-rw-r--r--arch/x86/kernel/paravirt.c6
-rw-r--r--arch/x86/kernel/perf_regs.c40
-rw-r--r--arch/x86/kernel/process.c106
-rw-r--r--arch/x86/kernel/process_32.c27
-rw-r--r--arch/x86/kernel/process_64.c24
-rw-r--r--arch/x86/kernel/ptrace.c12
-rw-r--r--arch/x86/kernel/pvclock.c44
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S8
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S16
-rw-r--r--arch/x86/kernel/setup.c21
-rw-r--r--arch/x86/kernel/signal.c52
-rw-r--r--arch/x86/kernel/smpboot.c77
-rw-r--r--arch/x86/kernel/sys_x86_64.c30
-rw-r--r--arch/x86/kernel/syscall_32.c16
-rw-r--r--arch/x86/kernel/test_rodata.c2
-rw-r--r--arch/x86/kernel/time.c2
-rw-r--r--arch/x86/kernel/traps.c62
-rw-r--r--arch/x86/kernel/uprobes.c2
-rw-r--r--arch/x86/kernel/vm86_32.c4
-rw-r--r--arch/x86/kernel/vsyscall_gtod.c24
-rw-r--r--arch/x86/kernel/xsave.c39
-rw-r--r--arch/x86/kvm/Makefile2
-rw-r--r--arch/x86/kvm/cpuid.c33
-rw-r--r--arch/x86/kvm/cpuid.h8
-rw-r--r--arch/x86/kvm/emulate.c193
-rw-r--r--arch/x86/kvm/i8254.c14
-rw-r--r--arch/x86/kvm/i8254.h2
-rw-r--r--arch/x86/kvm/i8259.c12
-rw-r--r--arch/x86/kvm/ioapic.c22
-rw-r--r--arch/x86/kvm/ioapic.h11
-rw-r--r--arch/x86/kvm/irq.h2
-rw-r--r--arch/x86/kvm/lapic.c147
-rw-r--r--arch/x86/kvm/lapic.h17
-rw-r--r--arch/x86/kvm/mmu.c73
-rw-r--r--arch/x86/kvm/pmu.c2
-rw-r--r--arch/x86/kvm/svm.c43
-rw-r--r--arch/x86/kvm/vmx.c146
-rw-r--r--arch/x86/kvm/x86.c171
-rw-r--r--arch/x86/lguest/boot.c4
-rw-r--r--arch/x86/lib/atomic64_cx8_32.S50
-rw-r--r--arch/x86/lib/checksum_32.S64
-rw-r--r--arch/x86/lib/clear_page_64.S66
-rw-r--r--arch/x86/lib/copy_page_64.S37
-rw-r--r--arch/x86/lib/copy_user_64.S46
-rw-r--r--arch/x86/lib/csum-copy_64.S2
-rw-r--r--arch/x86/lib/insn.c13
-rw-r--r--arch/x86/lib/memcpy_64.S68
-rw-r--r--arch/x86/lib/memmove_64.S19
-rw-r--r--arch/x86/lib/memset_64.S61
-rw-r--r--arch/x86/lib/msr-reg.S24
-rw-r--r--arch/x86/lib/rwsem.S44
-rw-r--r--arch/x86/lib/thunk_32.S18
-rw-r--r--arch/x86/lib/thunk_64.S28
-rw-r--r--arch/x86/lib/usercopy_64.c15
-rw-r--r--arch/x86/lib/x86-opcode-map.txt9
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/fault.c8
-rw-r--r--arch/x86/mm/init.c69
-rw-r--r--arch/x86/mm/init_64.c14
-rw-r--r--arch/x86/mm/ioremap.c23
-rw-r--r--arch/x86/mm/memtest.c118
-rw-r--r--arch/x86/mm/mmap.c38
-rw-r--r--arch/x86/mm/numa.c11
-rw-r--r--arch/x86/mm/pageattr.c4
-rw-r--r--arch/x86/mm/pat.c6
-rw-r--r--arch/x86/mm/pgtable.c160
-rw-r--r--arch/x86/oprofile/backtrace.c2
-rw-r--r--arch/x86/pci/common.c2
-rw-r--r--arch/x86/platform/efi/efi-bgrt.c4
-rw-r--r--arch/x86/platform/efi/efi.c17
-rw-r--r--arch/x86/platform/efi/efi_32.c22
-rw-r--r--arch/x86/platform/efi/efi_64.c29
-rw-r--r--arch/x86/platform/intel-quark/imr_selftest.c10
-rw-r--r--arch/x86/platform/olpc/olpc-xo1-sci.c4
-rw-r--r--arch/x86/platform/olpc/olpc-xo15-sci.c4
-rw-r--r--arch/x86/platform/uv/tlb_uv.c6
-rw-r--r--arch/x86/power/cpu.c2
-rw-r--r--arch/x86/syscalls/syscall_32.tbl4
-rw-r--r--arch/x86/syscalls/syscall_64.tbl2
-rw-r--r--arch/x86/um/asm/barrier.h4
-rw-r--r--arch/x86/um/sys_call_table_64.c2
-rw-r--r--arch/x86/vdso/Makefile4
-rw-r--r--arch/x86/vdso/vclock_gettime.c34
-rw-r--r--arch/x86/vdso/vdso32/syscall.S2
-rw-r--r--arch/x86/xen/enlighten.c1
-rw-r--r--arch/x86/xen/mmu.c14
-rw-r--r--arch/x86/xen/smp.c60
-rw-r--r--arch/x86/xen/suspend.c11
-rw-r--r--arch/x86/xen/xen-asm_64.S8
219 files changed, 8858 insertions, 4169 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b7d31ca55187..d43e7e1c784b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -87,7 +87,7 @@ config X86
select HAVE_ARCH_KMEMCHECK
select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
select HAVE_USER_RETURN_NOTIFIER
- select ARCH_BINFMT_ELF_RANDOMIZE_PIE
+ select ARCH_HAS_ELF_RANDOMIZE
select HAVE_ARCH_JUMP_LABEL
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select SPARSE_IRQ
@@ -99,6 +99,7 @@ config X86
select IRQ_FORCED_THREADING
select HAVE_BPF_JIT if X86_64
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ select HAVE_ARCH_HUGE_VMAP if X86_64 || (X86_32 && X86_PAE)
select ARCH_HAS_SG_CHAIN
select CLKEVT_I8253
select ARCH_HAVE_NMI_SAFE_CMPXCHG
@@ -235,12 +236,10 @@ config ARCH_WANT_GENERAL_HUGETLB
def_bool y
config ZONE_DMA32
- bool
- default X86_64
+ def_bool y if X86_64
config AUDIT_ARCH
- bool
- default X86_64
+ def_bool y if X86_64
config ARCH_SUPPORTS_OPTIMIZED_INLINING
def_bool y
@@ -279,6 +278,12 @@ config ARCH_SUPPORTS_UPROBES
config FIX_EARLYCON_MEM
def_bool y
+config PGTABLE_LEVELS
+ int
+ default 4 if X86_64
+ default 3 if X86_PAE
+ default 2
+
source "init/Kconfig"
source "kernel/Kconfig.freezer"
@@ -716,17 +721,6 @@ endif #HYPERVISOR_GUEST
config NO_BOOTMEM
def_bool y
-config MEMTEST
- bool "Memtest"
- ---help---
- This option adds a kernel parameter 'memtest', which allows memtest
- to be set.
- memtest=0, mean disabled; -- default
- memtest=1, mean do 1 test pattern;
- ...
- memtest=4, mean do 4 test patterns.
- If you are unsure how to answer this question, answer N.
-
source "arch/x86/Kconfig.cpu"
config HPET_TIMER
@@ -891,7 +885,8 @@ config UP_LATE_INIT
depends on !SMP && X86_LOCAL_APIC
config X86_UP_APIC
- bool "Local APIC support on uniprocessors"
+ bool "Local APIC support on uniprocessors" if !PCI_MSI
+ default PCI_MSI
depends on X86_32 && !SMP && !X86_32_NON_STANDARD
---help---
A local APIC (Advanced Programmable Interrupt Controller) is an
@@ -903,10 +898,6 @@ config X86_UP_APIC
performance counters), and the NMI watchdog which detects hard
lockups.
-config X86_UP_APIC_MSI
- def_bool y
- select X86_UP_APIC if X86_32 && !SMP && !X86_32_NON_STANDARD && PCI_MSI
-
config X86_UP_IOAPIC
bool "IO-APIC support on uniprocessors"
depends on X86_UP_APIC
@@ -925,8 +916,8 @@ config X86_LOCAL_APIC
select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
config X86_IO_APIC
- def_bool X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC
- depends on X86_LOCAL_APIC
+ def_bool y
+ depends on X86_LOCAL_APIC || X86_UP_IOAPIC
select IRQ_DOMAIN
config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
@@ -1145,10 +1136,10 @@ config MICROCODE_OLD_INTERFACE
depends on MICROCODE
config MICROCODE_INTEL_EARLY
- def_bool n
+ bool
config MICROCODE_AMD_EARLY
- def_bool n
+ bool
config MICROCODE_EARLY
bool "Early load microcode"
@@ -1300,14 +1291,14 @@ config ARCH_DMA_ADDR_T_64BIT
def_bool y
depends on X86_64 || HIGHMEM64G
-config DIRECT_GBPAGES
- bool "Enable 1GB pages for kernel pagetables" if EXPERT
- default y
- depends on X86_64
+config X86_DIRECT_GBPAGES
+ def_bool y
+ depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK
---help---
- Allow the kernel linear mapping to use 1GB pages on CPUs that
- support it. This can improve the kernel's performance a tiny bit by
- reducing TLB pressure. If in doubt, say "Y".
+ Certain kernel features effectively disable kernel
+ linear 1 GB mappings (even if the CPU otherwise
+ supports them), so don't confuse the user by printing
+ that we have them enabled.
# Common NUMA Features
config NUMA
@@ -1747,14 +1738,11 @@ config KEXEC_VERIFY_SIG
depends on KEXEC_FILE
---help---
This option makes kernel signature verification mandatory for
- kexec_file_load() syscall. If kernel is signature can not be
- verified, kexec_file_load() will fail.
-
- This option enforces signature verification at generic level.
- One needs to enable signature verification for type of kernel
- image being loaded to make sure it works. For example, enable
- bzImage signature verification option to be able to load and
- verify signatures of bzImage. Otherwise kernel loading will fail.
+ the kexec_file_load() syscall.
+
+ In addition to that option, you need to enable signature
+ verification for the corresponding kernel image type being
+ loaded in order for this to work.
config KEXEC_BZIMAGE_VERIFY_SIG
bool "Enable bzImage signature verification support"
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index bb1376381985..d7b1f655b3ef 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -295,7 +295,8 @@ static unsigned long find_random_addr(unsigned long minimum,
return slots_fetch_random();
}
-unsigned char *choose_kernel_location(unsigned char *input,
+unsigned char *choose_kernel_location(struct boot_params *boot_params,
+ unsigned char *input,
unsigned long input_size,
unsigned char *output,
unsigned long output_size)
@@ -315,6 +316,8 @@ unsigned char *choose_kernel_location(unsigned char *input,
}
#endif
+ boot_params->hdr.loadflags |= KASLR_FLAG;
+
/* Record the various known unsafe memory ranges. */
mem_avoid_init((unsigned long)input, input_size,
(unsigned long)output, output_size);
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 1d7fbbcc196d..8ef964ddc18e 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -29,6 +29,7 @@
#include <asm/page_types.h>
#include <asm/boot.h>
#include <asm/asm-offsets.h>
+#include <asm/bootparam.h>
__HEAD
ENTRY(startup_32)
@@ -102,7 +103,7 @@ preferred_addr:
* Test KEEP_SEGMENTS flag to see if the bootloader is asking
* us to not reload segments
*/
- testb $(1<<6), BP_loadflags(%esi)
+ testb $KEEP_SEGMENTS, BP_loadflags(%esi)
jnz 1f
cli
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 6b1766c6c082..b0c0d16ef58d 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -31,6 +31,7 @@
#include <asm/msr.h>
#include <asm/processor-flags.h>
#include <asm/asm-offsets.h>
+#include <asm/bootparam.h>
__HEAD
.code32
@@ -46,7 +47,7 @@ ENTRY(startup_32)
* Test KEEP_SEGMENTS flag to see if the bootloader is asking
* us to not reload segments
*/
- testb $(1<<6), BP_loadflags(%esi)
+ testb $KEEP_SEGMENTS, BP_loadflags(%esi)
jnz 1f
cli
@@ -164,7 +165,7 @@ ENTRY(startup_32)
/* After gdt is loaded */
xorl %eax, %eax
lldt %ax
- movl $0x20, %eax
+ movl $__BOOT_TSS, %eax
ltr %ax
/*
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index a950864a64da..a107b935e22f 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -377,6 +377,9 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
real_mode = rmode;
+ /* Clear it for solely in-kernel use */
+ real_mode->hdr.loadflags &= ~KASLR_FLAG;
+
sanitize_boot_params(real_mode);
if (real_mode->screen_info.orig_video_mode == 7) {
@@ -401,7 +404,7 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
* the entire decompressed kernel plus relocation table, or the
* entire decompressed kernel plus .bss and .brk sections.
*/
- output = choose_kernel_location(input_data, input_len, output,
+ output = choose_kernel_location(real_mode, input_data, input_len, output,
output_len > run_size ? output_len
: run_size);
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 04477d68403f..89dd0d78013a 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -57,7 +57,8 @@ int cmdline_find_option_bool(const char *option);
#if CONFIG_RANDOMIZE_BASE
/* aslr.c */
-unsigned char *choose_kernel_location(unsigned char *input,
+unsigned char *choose_kernel_location(struct boot_params *boot_params,
+ unsigned char *input,
unsigned long input_size,
unsigned char *output,
unsigned long output_size);
@@ -65,7 +66,8 @@ unsigned char *choose_kernel_location(unsigned char *input,
bool has_cpuflag(int flag);
#else
static inline
-unsigned char *choose_kernel_location(unsigned char *input,
+unsigned char *choose_kernel_location(struct boot_params *boot_params,
+ unsigned char *input,
unsigned long input_size,
unsigned char *output,
unsigned long output_size)
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 493f3fd9f139..318b8465d302 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -30,7 +30,7 @@ int strcmp(const char *str1, const char *str2)
int delta = 0;
while (*s1 || *s2) {
- delta = *s2 - *s1;
+ delta = *s1 - *s2;
if (delta)
return delta;
s1++;
diff --git a/arch/x86/boot/video-mode.c b/arch/x86/boot/video-mode.c
index 748e8d06290a..aa8a96b052e3 100644
--- a/arch/x86/boot/video-mode.c
+++ b/arch/x86/boot/video-mode.c
@@ -22,10 +22,8 @@
/*
* Common variables
*/
-int adapter; /* 0=CGA/MDA/HGC, 1=EGA, 2=VGA+ */
-u16 video_segment;
+int adapter; /* 0=CGA/MDA/HGC, 1=EGA, 2=VGA+ */
int force_x, force_y; /* Don't query the BIOS for cols/rows */
-
int do_restore; /* Screen contents changed during mode flip */
int graphic_mode; /* Graphic mode with linear frame buffer */
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index 43eda284d27f..05111bb8d018 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -17,6 +17,8 @@
#include "video.h"
#include "vesa.h"
+static u16 video_segment;
+
static void store_cursor_position(void)
{
struct biosregs ireg, oreg;
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h
index 0bb25491262d..b54e0328c449 100644
--- a/arch/x86/boot/video.h
+++ b/arch/x86/boot/video.h
@@ -91,7 +91,6 @@ int mode_defined(u16 mode); /* video.c */
#define ADAPTER_VGA 2
extern int adapter;
-extern u16 video_segment;
extern int force_x, force_y; /* Don't query the BIOS for cols/rows */
extern int do_restore; /* Restore screen contents */
extern int graphic_mode; /* Graphics mode with linear frame buffer */
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 419819d6dab3..aaa1118bf01e 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -248,7 +248,7 @@ CONFIG_USB=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
CONFIG_USB_MON=y
CONFIG_USB_EHCI_HCD=y
-# CONFIG_USB_EHCI_TT_NEWSCHED is not set
+CONFIG_USB_EHCI_TT_NEWSCHED=y
CONFIG_USB_OHCI_HCD=y
CONFIG_USB_UHCI_HCD=y
CONFIG_USB_PRINTER=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 4c311ddd973b..315b86106572 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -243,7 +243,7 @@ CONFIG_USB=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
CONFIG_USB_MON=y
CONFIG_USB_EHCI_HCD=y
-# CONFIG_USB_EHCI_TT_NEWSCHED is not set
+CONFIG_USB_EHCI_TT_NEWSCHED=y
CONFIG_USB_OHCI_HCD=y
CONFIG_USB_UHCI_HCD=y
CONFIG_USB_PRINTER=y
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 26d49ebae040..225be06edc80 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -178,7 +178,7 @@ continue_block:
## 2a) PROCESS FULL BLOCKS:
################################################################
full_block:
- movq $128,%rax
+ movl $128,%eax
lea 128*8*2(block_0), block_1
lea 128*8*3(block_0), block_2
add $128*8*1, block_0
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S
index a039d21986a2..a350c990dc86 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -264,7 +264,7 @@ ENTRY(twofish_enc_blk)
movq R1, 8(%rsi)
popq R1
- movq $1,%rax
+ movl $1,%eax
ret
ENDPROC(twofish_enc_blk)
@@ -316,6 +316,6 @@ ENTRY(twofish_dec_blk)
movq R1, 8(%rsi)
popq R1
- movq $1,%rax
+ movl $1,%eax
ret
ENDPROC(twofish_dec_blk)
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index e785b422b766..bb635c641869 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -3,7 +3,6 @@
#
obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
-obj-$(CONFIG_IA32_EMULATION) += nosyscall.o syscall_ia32.o
obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index d0165c9a2932..c81d35e6c7f1 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -161,8 +161,7 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
}
static int ia32_restore_sigcontext(struct pt_regs *regs,
- struct sigcontext_ia32 __user *sc,
- unsigned int *pax)
+ struct sigcontext_ia32 __user *sc)
{
unsigned int tmpflags, err = 0;
void __user *buf;
@@ -184,7 +183,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
RELOAD_SEG(es);
COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
- COPY(dx); COPY(cx); COPY(ip);
+ COPY(dx); COPY(cx); COPY(ip); COPY(ax);
/* Don't touch extended registers */
COPY_SEG_CPL3(cs);
@@ -197,12 +196,12 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
get_user_ex(tmp, &sc->fpstate);
buf = compat_ptr(tmp);
-
- get_user_ex(*pax, &sc->ax);
} get_user_catch(err);
err |= restore_xstate_sig(buf, 1);
+ force_iret();
+
return err;
}
@@ -211,7 +210,6 @@ asmlinkage long sys32_sigreturn(void)
struct pt_regs *regs = current_pt_regs();
struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8);
sigset_t set;
- unsigned int ax;
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
goto badframe;
@@ -224,9 +222,9 @@ asmlinkage long sys32_sigreturn(void)
set_current_blocked(&set);
- if (ia32_restore_sigcontext(regs, &frame->sc, &ax))
+ if (ia32_restore_sigcontext(regs, &frame->sc))
goto badframe;
- return ax;
+ return regs->ax;
badframe:
signal_fault(regs, frame, "32bit sigreturn");
@@ -238,7 +236,6 @@ asmlinkage long sys32_rt_sigreturn(void)
struct pt_regs *regs = current_pt_regs();
struct rt_sigframe_ia32 __user *frame;
sigset_t set;
- unsigned int ax;
frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4);
@@ -249,13 +246,13 @@ asmlinkage long sys32_rt_sigreturn(void)
set_current_blocked(&set);
- if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
+ if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext))
goto badframe;
if (compat_restore_altstack(&frame->uc.uc_stack))
goto badframe;
- return ax;
+ return regs->ax;
badframe:
signal_fault(regs, frame, "32bit rt sigreturn");
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 156ebcab4ada..a821b1cd4fa7 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -30,24 +30,13 @@
.section .entry.text, "ax"
- .macro IA32_ARG_FIXUP noebp=0
- movl %edi,%r8d
- .if \noebp
- .else
- movl %ebp,%r9d
- .endif
- xchg %ecx,%esi
- movl %ebx,%edi
- movl %edx,%edx /* zero extension */
- .endm
-
- /* clobbers %eax */
- .macro CLEAR_RREGS offset=0, _r9=rax
+ /* clobbers %rax */
+ .macro CLEAR_RREGS _r9=rax
xorl %eax,%eax
- movq %rax,\offset+R11(%rsp)
- movq %rax,\offset+R10(%rsp)
- movq %\_r9,\offset+R9(%rsp)
- movq %rax,\offset+R8(%rsp)
+ movq %rax,R11(%rsp)
+ movq %rax,R10(%rsp)
+ movq %\_r9,R9(%rsp)
+ movq %rax,R8(%rsp)
.endm
/*
@@ -60,14 +49,14 @@
* If it's -1 to make us punt the syscall, then (u32)-1 is still
* an appropriately invalid value.
*/
- .macro LOAD_ARGS32 offset, _r9=0
+ .macro LOAD_ARGS32 _r9=0
.if \_r9
- movl \offset+16(%rsp),%r9d
+ movl R9(%rsp),%r9d
.endif
- movl \offset+40(%rsp),%ecx
- movl \offset+48(%rsp),%edx
- movl \offset+56(%rsp),%esi
- movl \offset+64(%rsp),%edi
+ movl RCX(%rsp),%ecx
+ movl RDX(%rsp),%edx
+ movl RSI(%rsp),%esi
+ movl RDI(%rsp),%edi
movl %eax,%eax /* zero extension */
.endm
@@ -99,54 +88,69 @@ ENDPROC(native_irq_enable_sysexit)
/*
* 32bit SYSENTER instruction entry.
*
+ * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
+ * IF and VM in rflags are cleared (IOW: interrupts are off).
+ * SYSENTER does not save anything on the stack,
+ * and does not save old rip (!!!) and rflags.
+ *
* Arguments:
- * %eax System call number.
- * %ebx Arg1
- * %ecx Arg2
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp user stack
- * 0(%ebp) Arg6
- *
- * Interrupts off.
- *
+ * eax system call number
+ * ebx arg1
+ * ecx arg2
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * ebp user stack
+ * 0(%ebp) arg6
+ *
* This is purely a fast path. For anything complicated we use the int 0x80
- * path below. Set up a complete hardware stack frame to share code
+ * path below. We set up a complete hardware stack frame to share code
* with the int 0x80 path.
- */
+ */
ENTRY(ia32_sysenter_target)
CFI_STARTPROC32 simple
CFI_SIGNAL_FRAME
CFI_DEF_CFA rsp,0
CFI_REGISTER rsp,rbp
- SWAPGS_UNSAFE_STACK
- movq PER_CPU_VAR(kernel_stack), %rsp
- addq $(KERNEL_STACK_OFFSET),%rsp
+
/*
- * No need to follow this irqs on/off section: the syscall
- * disabled irqs, here we enable it straight after entry:
+ * Interrupts are off on entry.
+ * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+ * it is too small to ever cause noticeable irq latency.
*/
+ SWAPGS_UNSAFE_STACK
+ movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
ENABLE_INTERRUPTS(CLBR_NONE)
- movl %ebp,%ebp /* zero extension */
- pushq_cfi $__USER32_DS
- /*CFI_REL_OFFSET ss,0*/
- pushq_cfi %rbp
- CFI_REL_OFFSET rsp,0
- pushfq_cfi
- /*CFI_REL_OFFSET rflags,0*/
- movl TI_sysenter_return+THREAD_INFO(%rsp,3*8-KERNEL_STACK_OFFSET),%r10d
- CFI_REGISTER rip,r10
- pushq_cfi $__USER32_CS
- /*CFI_REL_OFFSET cs,0*/
+
+ /* Zero-extending 32-bit regs, do not remove */
+ movl %ebp, %ebp
movl %eax, %eax
- pushq_cfi %r10
- CFI_REL_OFFSET rip,0
- pushq_cfi %rax
+
+ movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d
+ CFI_REGISTER rip,r10
+
+ /* Construct struct pt_regs on stack */
+ pushq_cfi $__USER32_DS /* pt_regs->ss */
+ pushq_cfi %rbp /* pt_regs->sp */
+ CFI_REL_OFFSET rsp,0
+ pushfq_cfi /* pt_regs->flags */
+ pushq_cfi $__USER32_CS /* pt_regs->cs */
+ pushq_cfi %r10 /* pt_regs->ip = thread_info->sysenter_return */
+ CFI_REL_OFFSET rip,0
+ pushq_cfi_reg rax /* pt_regs->orig_ax */
+ pushq_cfi_reg rdi /* pt_regs->di */
+ pushq_cfi_reg rsi /* pt_regs->si */
+ pushq_cfi_reg rdx /* pt_regs->dx */
+ pushq_cfi_reg rcx /* pt_regs->cx */
+ pushq_cfi_reg rax /* pt_regs->ax */
cld
- SAVE_ARGS 0,1,0
- /* no need to do an access_ok check here because rbp has been
- 32bit zero extended */
+ sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
+ CFI_ADJUST_CFA_OFFSET 10*8
+
+ /*
+ * no need to do an access_ok check here because rbp has been
+ * 32bit zero extended
+ */
ASM_STAC
1: movl (%rbp),%ebp
_ASM_EXTABLE(1b,ia32_badarg)
@@ -157,42 +161,80 @@ ENTRY(ia32_sysenter_target)
* ourselves. To save a few cycles, we can check whether
* NT was set instead of doing an unconditional popfq.
*/
- testl $X86_EFLAGS_NT,EFLAGS-ARGOFFSET(%rsp)
+ testl $X86_EFLAGS_NT,EFLAGS(%rsp)
jnz sysenter_fix_flags
sysenter_flags_fixed:
- orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+ testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
CFI_REMEMBER_STATE
jnz sysenter_tracesys
cmpq $(IA32_NR_syscalls-1),%rax
ja ia32_badsys
sysenter_do_call:
- IA32_ARG_FIXUP
+ /* 32bit syscall -> 64bit C ABI argument conversion */
+ movl %edi,%r8d /* arg5 */
+ movl %ebp,%r9d /* arg6 */
+ xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
+ movl %ebx,%edi /* arg1 */
+ movl %edx,%edx /* arg3 (zero extension) */
sysenter_dispatch:
call *ia32_sys_call_table(,%rax,8)
- movq %rax,RAX-ARGOFFSET(%rsp)
+ movq %rax,RAX(%rsp)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz sysexit_audit
sysexit_from_sys_call:
- andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
- /* clear IF, that popfq doesn't enable interrupts early */
- andl $~0x200,EFLAGS-ARGOFFSET(%rsp)
- movl RIP-ARGOFFSET(%rsp),%edx /* User %eip */
- CFI_REGISTER rip,rdx
- RESTORE_ARGS 0,24,0,0,0,0
+ /*
+ * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an
+ * NMI between STI and SYSEXIT has poorly specified behavior,
+ * and and NMI followed by an IRQ with usergs is fatal. So
+ * we just pretend we're using SYSEXIT but we really use
+ * SYSRETL instead.
+ *
+ * This code path is still called 'sysexit' because it pairs
+ * with 'sysenter' and it uses the SYSENTER calling convention.
+ */
+ andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+ movl RIP(%rsp),%ecx /* User %eip */
+ CFI_REGISTER rip,rcx
+ RESTORE_RSI_RDI
+ xorl %edx,%edx /* avoid info leaks */
xorq %r8,%r8
xorq %r9,%r9
xorq %r10,%r10
- xorq %r11,%r11
- popfq_cfi
+ movl EFLAGS(%rsp),%r11d /* User eflags */
/*CFI_RESTORE rflags*/
- popq_cfi %rcx /* User %esp */
- CFI_REGISTER rsp,rcx
TRACE_IRQS_ON
- ENABLE_INTERRUPTS_SYSEXIT32
+
+ /*
+ * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT,
+ * since it avoids a dicey window with interrupts enabled.
+ */
+ movl RSP(%rsp),%esp
+
+ /*
+ * USERGS_SYSRET32 does:
+ * gsbase = user's gs base
+ * eip = ecx
+ * rflags = r11
+ * cs = __USER32_CS
+ * ss = __USER_DS
+ *
+ * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does:
+ *
+ * pop %ebp
+ * pop %edx
+ * pop %ecx
+ *
+ * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to
+ * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's
+ * address (already known to user code), and R12-R15 are
+ * callee-saved and therefore don't contain any interesting
+ * kernel data.
+ */
+ USERGS_SYSRET32
CFI_RESTORE_STATE
@@ -205,18 +247,18 @@ sysexit_from_sys_call:
movl %ebx,%esi /* 2nd arg: 1st syscall arg */
movl %eax,%edi /* 1st arg: syscall number */
call __audit_syscall_entry
- movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
+ movl RAX(%rsp),%eax /* reload syscall number */
cmpq $(IA32_NR_syscalls-1),%rax
ja ia32_badsys
movl %ebx,%edi /* reload 1st syscall arg */
- movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */
- movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */
- movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */
- movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
+ movl RCX(%rsp),%esi /* reload 2nd syscall arg */
+ movl RDX(%rsp),%edx /* reload 3rd syscall arg */
+ movl RSI(%rsp),%ecx /* reload 4th syscall arg */
+ movl RDI(%rsp),%r8d /* reload 5th syscall arg */
.endm
.macro auditsys_exit exit
- testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz ia32_ret_from_sys_call
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
@@ -227,13 +269,13 @@ sysexit_from_sys_call:
1: setbe %al /* 1 if error, 0 if not */
movzbl %al,%edi /* zero-extend that into %edi */
call __audit_syscall_exit
- movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */
+ movq RAX(%rsp),%rax /* reload syscall return value */
movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jz \exit
- CLEAR_RREGS -ARGOFFSET
+ CLEAR_RREGS
jmp int_with_check
.endm
@@ -253,16 +295,16 @@ sysenter_fix_flags:
sysenter_tracesys:
#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jz sysenter_auditsys
#endif
- SAVE_REST
+ SAVE_EXTRA_REGS
CLEAR_RREGS
movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
movq %rsp,%rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
- LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
- RESTORE_REST
+ LOAD_ARGS32 /* reload args from stack in case ptrace changed it */
+ RESTORE_EXTRA_REGS
cmpq $(IA32_NR_syscalls-1),%rax
ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
jmp sysenter_do_call
@@ -272,94 +314,128 @@ ENDPROC(ia32_sysenter_target)
/*
* 32bit SYSCALL instruction entry.
*
+ * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * then loads new ss, cs, and rip from previously programmed MSRs.
+ * rflags gets masked by a value from another MSR (so CLD and CLAC
+ * are not needed). SYSCALL does not save anything on the stack
+ * and does not change rsp.
+ *
+ * Note: rflags saving+masking-with-MSR happens only in Long mode
+ * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it).
+ * Don't get confused: rflags saving+masking depends on Long Mode Active bit
+ * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
+ * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
+ *
* Arguments:
- * %eax System call number.
- * %ebx Arg1
- * %ecx return EIP
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp Arg2 [note: not saved in the stack frame, should not be touched]
- * %esp user stack
- * 0(%esp) Arg6
- *
- * Interrupts off.
- *
+ * eax system call number
+ * ecx return address
+ * ebx arg1
+ * ebp arg2 (note: not saved in the stack frame, should not be touched)
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * esp user stack
+ * 0(%esp) arg6
+ *
* This is purely a fast path. For anything complicated we use the int 0x80
- * path below. Set up a complete hardware stack frame to share code
- * with the int 0x80 path.
- */
+ * path below. We set up a complete hardware stack frame to share code
+ * with the int 0x80 path.
+ */
ENTRY(ia32_cstar_target)
CFI_STARTPROC32 simple
CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET
+ CFI_DEF_CFA rsp,0
CFI_REGISTER rip,rcx
/*CFI_REGISTER rflags,r11*/
+
+ /*
+ * Interrupts are off on entry.
+ * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+ * it is too small to ever cause noticeable irq latency.
+ */
SWAPGS_UNSAFE_STACK
movl %esp,%r8d
CFI_REGISTER rsp,r8
movq PER_CPU_VAR(kernel_stack),%rsp
- /*
- * No need to follow this irqs on/off section: the syscall
- * disabled irqs and here we enable it straight after entry:
- */
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 8,0,0
- movl %eax,%eax /* zero extension */
- movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
- movq %rcx,RIP-ARGOFFSET(%rsp)
- CFI_REL_OFFSET rip,RIP-ARGOFFSET
- movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
+
+ /* Zero-extending 32-bit regs, do not remove */
+ movl %eax,%eax
+
+ /* Construct struct pt_regs on stack */
+ pushq_cfi $__USER32_DS /* pt_regs->ss */
+ pushq_cfi %r8 /* pt_regs->sp */
+ CFI_REL_OFFSET rsp,0
+ pushq_cfi %r11 /* pt_regs->flags */
+ pushq_cfi $__USER32_CS /* pt_regs->cs */
+ pushq_cfi %rcx /* pt_regs->ip */
+ CFI_REL_OFFSET rip,0
+ pushq_cfi_reg rax /* pt_regs->orig_ax */
+ pushq_cfi_reg rdi /* pt_regs->di */
+ pushq_cfi_reg rsi /* pt_regs->si */
+ pushq_cfi_reg rdx /* pt_regs->dx */
+ pushq_cfi_reg rbp /* pt_regs->cx */
movl %ebp,%ecx
- movq $__USER32_CS,CS-ARGOFFSET(%rsp)
- movq $__USER32_DS,SS-ARGOFFSET(%rsp)
- movq %r11,EFLAGS-ARGOFFSET(%rsp)
- /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
- movq %r8,RSP-ARGOFFSET(%rsp)
- CFI_REL_OFFSET rsp,RSP-ARGOFFSET
- /* no need to do an access_ok check here because r8 has been
- 32bit zero extended */
- /* hardware stack frame is complete now */
+ pushq_cfi_reg rax /* pt_regs->ax */
+ sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
+ CFI_ADJUST_CFA_OFFSET 10*8
+
+ /*
+ * no need to do an access_ok check here because r8 has been
+ * 32bit zero extended
+ */
ASM_STAC
1: movl (%r8),%r9d
_ASM_EXTABLE(1b,ia32_badarg)
ASM_CLAC
- orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+ testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
CFI_REMEMBER_STATE
jnz cstar_tracesys
cmpq $IA32_NR_syscalls-1,%rax
ja ia32_badsys
cstar_do_call:
- IA32_ARG_FIXUP 1
+ /* 32bit syscall -> 64bit C ABI argument conversion */
+ movl %edi,%r8d /* arg5 */
+ /* r9 already loaded */ /* arg6 */
+ xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
+ movl %ebx,%edi /* arg1 */
+ movl %edx,%edx /* arg3 (zero extension) */
cstar_dispatch:
call *ia32_sys_call_table(,%rax,8)
- movq %rax,RAX-ARGOFFSET(%rsp)
+ movq %rax,RAX(%rsp)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz sysretl_audit
sysretl_from_sys_call:
- andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
- RESTORE_ARGS 0,-ARG_SKIP,0,0,0
- movl RIP-ARGOFFSET(%rsp),%ecx
+ andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+ RESTORE_RSI_RDI_RDX
+ movl RIP(%rsp),%ecx
CFI_REGISTER rip,rcx
- movl EFLAGS-ARGOFFSET(%rsp),%r11d
+ movl EFLAGS(%rsp),%r11d
/*CFI_REGISTER rflags,r11*/
xorq %r10,%r10
xorq %r9,%r9
xorq %r8,%r8
TRACE_IRQS_ON
- movl RSP-ARGOFFSET(%rsp),%esp
+ movl RSP(%rsp),%esp
CFI_RESTORE rsp
+ /*
+ * 64bit->32bit SYSRET restores eip from ecx,
+ * eflags from r11 (but RF and VM bits are forced to 0),
+ * cs and ss are loaded from MSRs.
+ * (Note: 32bit->32bit SYSRET is different: since r11
+ * does not exist, it merely sets eflags.IF=1).
+ */
USERGS_SYSRET32
-
+
#ifdef CONFIG_AUDITSYSCALL
cstar_auditsys:
CFI_RESTORE_STATE
- movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */
+ movl %r9d,R9(%rsp) /* register to be clobbered by call */
auditsys_entry_common
- movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */
+ movl R9(%rsp),%r9d /* reload 6th syscall arg */
jmp cstar_dispatch
sysretl_audit:
@@ -368,17 +444,17 @@ sysretl_audit:
cstar_tracesys:
#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jz cstar_auditsys
#endif
xchgl %r9d,%ebp
- SAVE_REST
- CLEAR_RREGS 0, r9
+ SAVE_EXTRA_REGS
+ CLEAR_RREGS r9
movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
movq %rsp,%rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
- LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */
- RESTORE_REST
+ LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */
+ RESTORE_EXTRA_REGS
xchgl %ebp,%r9d
cmpq $(IA32_NR_syscalls-1),%rax
ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
@@ -391,78 +467,94 @@ ia32_badarg:
jmp ia32_sysret
CFI_ENDPROC
-/*
- * Emulated IA32 system calls via int 0x80.
+/*
+ * Emulated IA32 system calls via int 0x80.
*
- * Arguments:
- * %eax System call number.
- * %ebx Arg1
- * %ecx Arg2
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp Arg6 [note: not saved in the stack frame, should not be touched]
+ * Arguments:
+ * eax system call number
+ * ebx arg1
+ * ecx arg2
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * ebp arg6 (note: not saved in the stack frame, should not be touched)
*
* Notes:
- * Uses the same stack frame as the x86-64 version.
- * All registers except %eax must be saved (but ptrace may violate that)
+ * Uses the same stack frame as the x86-64 version.
+ * All registers except eax must be saved (but ptrace may violate that).
* Arguments are zero extended. For system calls that want sign extension and
* take long arguments a wrapper is needed. Most calls can just be called
* directly.
- * Assumes it is only called from user space and entered with interrupts off.
- */
+ * Assumes it is only called from user space and entered with interrupts off.
+ */
ENTRY(ia32_syscall)
CFI_STARTPROC32 simple
CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SS+8-RIP
- /*CFI_REL_OFFSET ss,SS-RIP*/
- CFI_REL_OFFSET rsp,RSP-RIP
- /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/
- /*CFI_REL_OFFSET cs,CS-RIP*/
- CFI_REL_OFFSET rip,RIP-RIP
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- SWAPGS
+ CFI_DEF_CFA rsp,5*8
+ /*CFI_REL_OFFSET ss,4*8 */
+ CFI_REL_OFFSET rsp,3*8
+ /*CFI_REL_OFFSET rflags,2*8 */
+ /*CFI_REL_OFFSET cs,1*8 */
+ CFI_REL_OFFSET rip,0*8
+
/*
- * No need to follow this irqs on/off section: the syscall
- * disabled irqs and here we enable it straight after entry:
+ * Interrupts are off on entry.
+ * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+ * it is too small to ever cause noticeable irq latency.
*/
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
+ SWAPGS
ENABLE_INTERRUPTS(CLBR_NONE)
- movl %eax,%eax
- pushq_cfi %rax
+
+ /* Zero-extending 32-bit regs, do not remove */
+ movl %eax,%eax
+
+ /* Construct struct pt_regs on stack (iret frame is already on stack) */
+ pushq_cfi_reg rax /* pt_regs->orig_ax */
+ pushq_cfi_reg rdi /* pt_regs->di */
+ pushq_cfi_reg rsi /* pt_regs->si */
+ pushq_cfi_reg rdx /* pt_regs->dx */
+ pushq_cfi_reg rcx /* pt_regs->cx */
+ pushq_cfi_reg rax /* pt_regs->ax */
cld
- /* note the registers are not zero extended to the sf.
- this could be a problem. */
- SAVE_ARGS 0,1,0
- orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
+ CFI_ADJUST_CFA_OFFSET 10*8
+
+ orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+ testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz ia32_tracesys
cmpq $(IA32_NR_syscalls-1),%rax
ja ia32_badsys
ia32_do_call:
- IA32_ARG_FIXUP
+ /* 32bit syscall -> 64bit C ABI argument conversion */
+ movl %edi,%r8d /* arg5 */
+ movl %ebp,%r9d /* arg6 */
+ xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
+ movl %ebx,%edi /* arg1 */
+ movl %edx,%edx /* arg3 (zero extension) */
call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
ia32_sysret:
- movq %rax,RAX-ARGOFFSET(%rsp)
+ movq %rax,RAX(%rsp)
ia32_ret_from_sys_call:
- CLEAR_RREGS -ARGOFFSET
- jmp int_ret_from_sys_call
+ CLEAR_RREGS
+ jmp int_ret_from_sys_call
-ia32_tracesys:
- SAVE_REST
+ia32_tracesys:
+ SAVE_EXTRA_REGS
CLEAR_RREGS
movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
movq %rsp,%rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
- LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
- RESTORE_REST
+ LOAD_ARGS32 /* reload args from stack in case ptrace changed it */
+ RESTORE_EXTRA_REGS
cmpq $(IA32_NR_syscalls-1),%rax
ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */
jmp ia32_do_call
END(ia32_syscall)
ia32_badsys:
- movq $0,ORIG_RAX-ARGOFFSET(%rsp)
+ movq $0,ORIG_RAX(%rsp)
movq $-ENOSYS,%rax
jmp ia32_sysret
@@ -479,8 +571,6 @@ GLOBAL(\label)
PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
PTREGSCALL stub32_sigreturn, sys32_sigreturn
- PTREGSCALL stub32_execve, compat_sys_execve
- PTREGSCALL stub32_execveat, compat_sys_execveat
PTREGSCALL stub32_fork, sys_fork
PTREGSCALL stub32_vfork, sys_vfork
@@ -492,24 +582,23 @@ GLOBAL(stub32_clone)
ALIGN
ia32_ptregs_common:
- popq %r11
CFI_ENDPROC
CFI_STARTPROC32 simple
CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SS+8-ARGOFFSET
- CFI_REL_OFFSET rax,RAX-ARGOFFSET
- CFI_REL_OFFSET rcx,RCX-ARGOFFSET
- CFI_REL_OFFSET rdx,RDX-ARGOFFSET
- CFI_REL_OFFSET rsi,RSI-ARGOFFSET
- CFI_REL_OFFSET rdi,RDI-ARGOFFSET
- CFI_REL_OFFSET rip,RIP-ARGOFFSET
-/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/
-/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
- CFI_REL_OFFSET rsp,RSP-ARGOFFSET
-/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/
- SAVE_REST
+ CFI_DEF_CFA rsp,SIZEOF_PTREGS
+ CFI_REL_OFFSET rax,RAX
+ CFI_REL_OFFSET rcx,RCX
+ CFI_REL_OFFSET rdx,RDX
+ CFI_REL_OFFSET rsi,RSI
+ CFI_REL_OFFSET rdi,RDI
+ CFI_REL_OFFSET rip,RIP
+/* CFI_REL_OFFSET cs,CS*/
+/* CFI_REL_OFFSET rflags,EFLAGS*/
+ CFI_REL_OFFSET rsp,RSP
+/* CFI_REL_OFFSET ss,SS*/
+ SAVE_EXTRA_REGS 8
call *%rax
- RESTORE_REST
- jmp ia32_sysret /* misbalances the return cache */
+ RESTORE_EXTRA_REGS 8
+ ret
CFI_ENDPROC
END(ia32_ptregs_common)
diff --git a/arch/x86/ia32/nosyscall.c b/arch/x86/ia32/nosyscall.c
deleted file mode 100644
index 51ecd5b4e787..000000000000
--- a/arch/x86/ia32/nosyscall.c
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/errno.h>
-
-long compat_ni_syscall(void)
-{
- return -ENOSYS;
-}
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 8e0ceecdc957..719cd702b0a4 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -201,20 +201,6 @@ long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high,
advice);
}
-long sys32_vm86_warning(void)
-{
- struct task_struct *me = current;
- static char lastcomm[sizeof(me->comm)];
-
- if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
- compat_printk(KERN_INFO
- "%s: vm86 mode not supported on 64 bit kernel\n",
- me->comm);
- strncpy(lastcomm, me->comm, sizeof(lastcomm));
- }
- return -ENOSYS;
-}
-
asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi,
size_t count)
{
diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c
deleted file mode 100644
index 4754ba0f5d9f..000000000000
--- a/arch/x86/ia32/syscall_ia32.c
+++ /dev/null
@@ -1,25 +0,0 @@
-/* System call table for ia32 emulation. */
-
-#include <linux/linkage.h>
-#include <linux/sys.h>
-#include <linux/cache.h>
-#include <asm/asm-offsets.h>
-
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void compat(void) ;
-#include <asm/syscalls_32.h>
-#undef __SYSCALL_I386
-
-#define __SYSCALL_I386(nr, sym, compat) [nr] = compat,
-
-typedef void (*sys_call_ptr_t)(void);
-
-extern void compat_ni_syscall(void);
-
-const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
- /*
- * Smells like a compiler bug -- it doesn't work
- * when the & below is removed.
- */
- [0 ... __NR_ia32_syscall_max] = &compat_ni_syscall,
-#include <asm/syscalls_32.h>
-};
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index 372231c22a47..bdf02eeee765 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -18,12 +18,63 @@
.endm
#endif
-.macro altinstruction_entry orig alt feature orig_len alt_len
+.macro altinstruction_entry orig alt feature orig_len alt_len pad_len
.long \orig - .
.long \alt - .
.word \feature
.byte \orig_len
.byte \alt_len
+ .byte \pad_len
+.endm
+
+.macro ALTERNATIVE oldinstr, newinstr, feature
+140:
+ \oldinstr
+141:
+ .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
+142:
+
+ .pushsection .altinstructions,"a"
+ altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b
+ .popsection
+
+ .pushsection .altinstr_replacement,"ax"
+143:
+ \newinstr
+144:
+ .popsection
+.endm
+
+#define old_len 141b-140b
+#define new_len1 144f-143f
+#define new_len2 145f-144f
+
+/*
+ * max without conditionals. Idea adapted from:
+ * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
+ */
+#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
+
+.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
+140:
+ \oldinstr
+141:
+ .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \
+ (alt_max_short(new_len1, new_len2) - (old_len)),0x90
+142:
+
+ .pushsection .altinstructions,"a"
+ altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b
+ altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b
+ .popsection
+
+ .pushsection .altinstr_replacement,"ax"
+143:
+ \newinstr1
+144:
+ \newinstr2
+145:
+ .popsection
.endm
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 473bdbee378a..ba32af062f61 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -48,8 +48,9 @@ struct alt_instr {
s32 repl_offset; /* offset to replacement instruction */
u16 cpuid; /* cpuid bit set for replacement */
u8 instrlen; /* length of original instruction */
- u8 replacementlen; /* length of new instruction, <= instrlen */
-};
+ u8 replacementlen; /* length of new instruction */
+ u8 padlen; /* length of build-time padding */
+} __packed;
extern void alternative_instructions(void);
extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
@@ -76,50 +77,69 @@ static inline int alternatives_text_reserved(void *start, void *end)
}
#endif /* CONFIG_SMP */
-#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n"
+#define b_replacement(num) "664"#num
+#define e_replacement(num) "665"#num
-#define b_replacement(number) "663"#number
-#define e_replacement(number) "664"#number
+#define alt_end_marker "663"
+#define alt_slen "662b-661b"
+#define alt_pad_len alt_end_marker"b-662b"
+#define alt_total_slen alt_end_marker"b-661b"
+#define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f"
-#define alt_slen "662b-661b"
-#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f"
+#define __OLDINSTR(oldinstr, num) \
+ "661:\n\t" oldinstr "\n662:\n" \
+ ".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * " \
+ "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n"
-#define ALTINSTR_ENTRY(feature, number) \
+#define OLDINSTR(oldinstr, num) \
+ __OLDINSTR(oldinstr, num) \
+ alt_end_marker ":\n"
+
+/*
+ * max without conditionals. Idea adapted from:
+ * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
+ *
+ * The additional "-" is needed because gas works with s32s.
+ */
+#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))"
+
+/*
+ * Pad the second replacement alternative with additional NOPs if it is
+ * additionally longer than the first replacement alternative.
+ */
+#define OLDINSTR_2(oldinstr, num1, num2) \
+ "661:\n\t" oldinstr "\n662:\n" \
+ ".skip -((" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")) > 0) * " \
+ "(" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")), 0x90\n" \
+ alt_end_marker ":\n"
+
+#define ALTINSTR_ENTRY(feature, num) \
" .long 661b - .\n" /* label */ \
- " .long " b_replacement(number)"f - .\n" /* new instruction */ \
+ " .long " b_replacement(num)"f - .\n" /* new instruction */ \
" .word " __stringify(feature) "\n" /* feature bit */ \
- " .byte " alt_slen "\n" /* source len */ \
- " .byte " alt_rlen(number) "\n" /* replacement len */
-
-#define DISCARD_ENTRY(number) /* rlen <= slen */ \
- " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n"
+ " .byte " alt_total_slen "\n" /* source len */ \
+ " .byte " alt_rlen(num) "\n" /* replacement len */ \
+ " .byte " alt_pad_len "\n" /* pad len */
-#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \
- b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t"
+#define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \
+ b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t"
/* alternative assembly primitive: */
#define ALTERNATIVE(oldinstr, newinstr, feature) \
- OLDINSTR(oldinstr) \
+ OLDINSTR(oldinstr, 1) \
".pushsection .altinstructions,\"a\"\n" \
ALTINSTR_ENTRY(feature, 1) \
".popsection\n" \
- ".pushsection .discard,\"aw\",@progbits\n" \
- DISCARD_ENTRY(1) \
- ".popsection\n" \
".pushsection .altinstr_replacement, \"ax\"\n" \
ALTINSTR_REPLACEMENT(newinstr, feature, 1) \
".popsection"
#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
- OLDINSTR(oldinstr) \
+ OLDINSTR_2(oldinstr, 1, 2) \
".pushsection .altinstructions,\"a\"\n" \
ALTINSTR_ENTRY(feature1, 1) \
ALTINSTR_ENTRY(feature2, 2) \
".popsection\n" \
- ".pushsection .discard,\"aw\",@progbits\n" \
- DISCARD_ENTRY(1) \
- DISCARD_ENTRY(2) \
- ".popsection\n" \
".pushsection .altinstr_replacement, \"ax\"\n" \
ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \
ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \
@@ -146,6 +166,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
#define alternative(oldinstr, newinstr, feature) \
asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
+#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
+ asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory")
+
/*
* Alternative inline assembly with input.
*
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index efc3b22d896e..976b86a325e5 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -91,7 +91,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v)
{
volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
- alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP,
+ alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP,
ASM_OUTPUT2("=r" (v), "=m" (*addr)),
ASM_OUTPUT2("0" (v), "m" (*addr)));
}
@@ -204,7 +204,6 @@ extern void clear_local_APIC(void);
extern void disconnect_bsp_APIC(int virt_wire_setup);
extern void disable_local_APIC(void);
extern void lapic_shutdown(void);
-extern int verify_local_APIC(void);
extern void sync_Arb_IDs(void);
extern void init_bsp_APIC(void);
extern void setup_local_APIC(void);
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 2ab1eb33106e..959e45b81fe2 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -95,13 +95,11 @@ do { \
* Stop RDTSC speculation. This is needed when you need to use RDTSC
* (or get_cycles or vread that possibly accesses the TSC) in a defined
* code region.
- *
- * (Could use an alternative three way for this if there was one.)
*/
static __always_inline void rdtsc_barrier(void)
{
- alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
- alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+ alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
+ "lfence", X86_FEATURE_LFENCE_RDTSC);
}
#endif /* _ASM_X86_BARRIER_H */
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 1f1297b46f83..1c8b50edb2db 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -55,143 +55,157 @@ For 32-bit we have the following conventions - kernel is built with
* for assembly code:
*/
-#define R15 0
-#define R14 8
-#define R13 16
-#define R12 24
-#define RBP 32
-#define RBX 40
-
-/* arguments: interrupts/non tracing syscalls only save up to here: */
-#define R11 48
-#define R10 56
-#define R9 64
-#define R8 72
-#define RAX 80
-#define RCX 88
-#define RDX 96
-#define RSI 104
-#define RDI 112
-#define ORIG_RAX 120 /* + error_code */
-/* end of arguments */
-
-/* cpu exception frame or undefined in case of fast syscall: */
-#define RIP 128
-#define CS 136
-#define EFLAGS 144
-#define RSP 152
-#define SS 160
-
-#define ARGOFFSET R11
-
- .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0
- subq $9*8+\addskip, %rsp
- CFI_ADJUST_CFA_OFFSET 9*8+\addskip
- movq_cfi rdi, 8*8
- movq_cfi rsi, 7*8
- movq_cfi rdx, 6*8
-
- .if \save_rcx
- movq_cfi rcx, 5*8
- .endif
+/* The layout forms the "struct pt_regs" on the stack: */
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
+#define R15 0*8
+#define R14 1*8
+#define R13 2*8
+#define R12 3*8
+#define RBP 4*8
+#define RBX 5*8
+/* These regs are callee-clobbered. Always saved on kernel entry. */
+#define R11 6*8
+#define R10 7*8
+#define R9 8*8
+#define R8 9*8
+#define RAX 10*8
+#define RCX 11*8
+#define RDX 12*8
+#define RSI 13*8
+#define RDI 14*8
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
+#define ORIG_RAX 15*8
+/* Return frame for iretq */
+#define RIP 16*8
+#define CS 17*8
+#define EFLAGS 18*8
+#define RSP 19*8
+#define SS 20*8
+
+#define SIZEOF_PTREGS 21*8
+
+ .macro ALLOC_PT_GPREGS_ON_STACK addskip=0
+ subq $15*8+\addskip, %rsp
+ CFI_ADJUST_CFA_OFFSET 15*8+\addskip
+ .endm
- .if \rax_enosys
- movq $-ENOSYS, 4*8(%rsp)
- .else
- movq_cfi rax, 4*8
+ .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1
+ .if \r11
+ movq_cfi r11, 6*8+\offset
.endif
-
- .if \save_r891011
- movq_cfi r8, 3*8
- movq_cfi r9, 2*8
- movq_cfi r10, 1*8
- movq_cfi r11, 0*8
+ .if \r8910
+ movq_cfi r10, 7*8+\offset
+ movq_cfi r9, 8*8+\offset
+ movq_cfi r8, 9*8+\offset
+ .endif
+ .if \rax
+ movq_cfi rax, 10*8+\offset
+ .endif
+ .if \rcx
+ movq_cfi rcx, 11*8+\offset
.endif
+ movq_cfi rdx, 12*8+\offset
+ movq_cfi rsi, 13*8+\offset
+ movq_cfi rdi, 14*8+\offset
+ .endm
+ .macro SAVE_C_REGS offset=0
+ SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
+ .endm
+ .macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0
+ SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1
+ .endm
+ .macro SAVE_C_REGS_EXCEPT_R891011
+ SAVE_C_REGS_HELPER 0, 1, 1, 0, 0
+ .endm
+ .macro SAVE_C_REGS_EXCEPT_RCX_R891011
+ SAVE_C_REGS_HELPER 0, 1, 0, 0, 0
+ .endm
+ .macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11
+ SAVE_C_REGS_HELPER 0, 0, 0, 1, 0
+ .endm
+
+ .macro SAVE_EXTRA_REGS offset=0
+ movq_cfi r15, 0*8+\offset
+ movq_cfi r14, 1*8+\offset
+ movq_cfi r13, 2*8+\offset
+ movq_cfi r12, 3*8+\offset
+ movq_cfi rbp, 4*8+\offset
+ movq_cfi rbx, 5*8+\offset
+ .endm
+ .macro SAVE_EXTRA_REGS_RBP offset=0
+ movq_cfi rbp, 4*8+\offset
+ .endm
+ .macro RESTORE_EXTRA_REGS offset=0
+ movq_cfi_restore 0*8+\offset, r15
+ movq_cfi_restore 1*8+\offset, r14
+ movq_cfi_restore 2*8+\offset, r13
+ movq_cfi_restore 3*8+\offset, r12
+ movq_cfi_restore 4*8+\offset, rbp
+ movq_cfi_restore 5*8+\offset, rbx
.endm
-#define ARG_SKIP (9*8)
+ .macro ZERO_EXTRA_REGS
+ xorl %r15d, %r15d
+ xorl %r14d, %r14d
+ xorl %r13d, %r13d
+ xorl %r12d, %r12d
+ xorl %ebp, %ebp
+ xorl %ebx, %ebx
+ .endm
- .macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \
- rstor_r8910=1, rstor_rdx=1
+ .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
.if \rstor_r11
- movq_cfi_restore 0*8, r11
+ movq_cfi_restore 6*8, r11
.endif
-
.if \rstor_r8910
- movq_cfi_restore 1*8, r10
- movq_cfi_restore 2*8, r9
- movq_cfi_restore 3*8, r8
+ movq_cfi_restore 7*8, r10
+ movq_cfi_restore 8*8, r9
+ movq_cfi_restore 9*8, r8
.endif
-
.if \rstor_rax
- movq_cfi_restore 4*8, rax
+ movq_cfi_restore 10*8, rax
.endif
-
.if \rstor_rcx
- movq_cfi_restore 5*8, rcx
+ movq_cfi_restore 11*8, rcx
.endif
-
.if \rstor_rdx
- movq_cfi_restore 6*8, rdx
- .endif
-
- movq_cfi_restore 7*8, rsi
- movq_cfi_restore 8*8, rdi
-
- .if ARG_SKIP+\addskip > 0
- addq $ARG_SKIP+\addskip, %rsp
- CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip)
+ movq_cfi_restore 12*8, rdx
.endif
+ movq_cfi_restore 13*8, rsi
+ movq_cfi_restore 14*8, rdi
.endm
-
- .macro LOAD_ARGS offset, skiprax=0
- movq \offset(%rsp), %r11
- movq \offset+8(%rsp), %r10
- movq \offset+16(%rsp), %r9
- movq \offset+24(%rsp), %r8
- movq \offset+40(%rsp), %rcx
- movq \offset+48(%rsp), %rdx
- movq \offset+56(%rsp), %rsi
- movq \offset+64(%rsp), %rdi
- .if \skiprax
- .else
- movq \offset+72(%rsp), %rax
- .endif
+ .macro RESTORE_C_REGS
+ RESTORE_C_REGS_HELPER 1,1,1,1,1
.endm
-
-#define REST_SKIP (6*8)
-
- .macro SAVE_REST
- subq $REST_SKIP, %rsp
- CFI_ADJUST_CFA_OFFSET REST_SKIP
- movq_cfi rbx, 5*8
- movq_cfi rbp, 4*8
- movq_cfi r12, 3*8
- movq_cfi r13, 2*8
- movq_cfi r14, 1*8
- movq_cfi r15, 0*8
+ .macro RESTORE_C_REGS_EXCEPT_RAX
+ RESTORE_C_REGS_HELPER 0,1,1,1,1
.endm
-
- .macro RESTORE_REST
- movq_cfi_restore 0*8, r15
- movq_cfi_restore 1*8, r14
- movq_cfi_restore 2*8, r13
- movq_cfi_restore 3*8, r12
- movq_cfi_restore 4*8, rbp
- movq_cfi_restore 5*8, rbx
- addq $REST_SKIP, %rsp
- CFI_ADJUST_CFA_OFFSET -(REST_SKIP)
+ .macro RESTORE_C_REGS_EXCEPT_RCX
+ RESTORE_C_REGS_HELPER 1,0,1,1,1
.endm
-
- .macro SAVE_ALL
- SAVE_ARGS
- SAVE_REST
+ .macro RESTORE_C_REGS_EXCEPT_R11
+ RESTORE_C_REGS_HELPER 1,1,0,1,1
+ .endm
+ .macro RESTORE_C_REGS_EXCEPT_RCX_R11
+ RESTORE_C_REGS_HELPER 1,0,0,1,1
+ .endm
+ .macro RESTORE_RSI_RDI
+ RESTORE_C_REGS_HELPER 0,0,0,0,0
+ .endm
+ .macro RESTORE_RSI_RDI_RDX
+ RESTORE_C_REGS_HELPER 0,0,0,0,1
.endm
- .macro RESTORE_ALL addskip=0
- RESTORE_REST
- RESTORE_ARGS 1, \addskip
+ .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
+ addq $15*8+\addskip, %rsp
+ CFI_ADJUST_CFA_OFFSET -(15*8+\addskip)
.endm
.macro icebp
@@ -210,37 +224,23 @@ For 32-bit we have the following conventions - kernel is built with
*/
.macro SAVE_ALL
- pushl_cfi %eax
- CFI_REL_OFFSET eax, 0
- pushl_cfi %ebp
- CFI_REL_OFFSET ebp, 0
- pushl_cfi %edi
- CFI_REL_OFFSET edi, 0
- pushl_cfi %esi
- CFI_REL_OFFSET esi, 0
- pushl_cfi %edx
- CFI_REL_OFFSET edx, 0
- pushl_cfi %ecx
- CFI_REL_OFFSET ecx, 0
- pushl_cfi %ebx
- CFI_REL_OFFSET ebx, 0
+ pushl_cfi_reg eax
+ pushl_cfi_reg ebp
+ pushl_cfi_reg edi
+ pushl_cfi_reg esi
+ pushl_cfi_reg edx
+ pushl_cfi_reg ecx
+ pushl_cfi_reg ebx
.endm
.macro RESTORE_ALL
- popl_cfi %ebx
- CFI_RESTORE ebx
- popl_cfi %ecx
- CFI_RESTORE ecx
- popl_cfi %edx
- CFI_RESTORE edx
- popl_cfi %esi
- CFI_RESTORE esi
- popl_cfi %edi
- CFI_RESTORE edi
- popl_cfi %ebp
- CFI_RESTORE ebp
- popl_cfi %eax
- CFI_RESTORE eax
+ popl_cfi_reg ebx
+ popl_cfi_reg ecx
+ popl_cfi_reg edx
+ popl_cfi_reg esi
+ popl_cfi_reg edi
+ popl_cfi_reg ebp
+ popl_cfi_reg eax
.endm
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 59c6c401f79f..acdee09228b3 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -301,7 +301,7 @@ static inline void __user *arch_compat_alloc_user_space(long len)
sp = task_pt_regs(current)->sp;
} else {
/* -128 for the x32 ABI redzone */
- sp = this_cpu_read(old_rsp) - 128;
+ sp = task_pt_regs(current)->sp - 128;
}
return (void __user *)round_down(sp - len, 16);
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index d2b12988d2ed..bf2caa1dedc5 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -34,8 +34,6 @@ extern int _debug_hotplug_cpu(int cpu, int action);
#endif
#endif
-DECLARE_PER_CPU(int, cpu_state);
-
int mwait_usable(const struct cpuinfo_x86 *);
#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 90a54851aedc..7ee9b94d9921 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -12,7 +12,7 @@
#include <asm/disabled-features.h>
#endif
-#define NCAPINTS 11 /* N 32-bit words worth of info */
+#define NCAPINTS 13 /* N 32-bit words worth of info */
#define NBUGINTS 1 /* N 32-bit bug flags */
/*
@@ -195,6 +195,7 @@
#define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */
#define X86_FEATURE_HWP_EPP ( 7*32+13) /* Intel HWP_EPP */
#define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */
+#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
@@ -226,12 +227,15 @@
#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */
#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */
#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
+#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */
#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
+#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */
#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
+#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */
#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */
@@ -242,6 +246,12 @@
#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */
#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */
+/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */
+#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */
+
+/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
+#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
+
/*
* BUG word(s)
*/
@@ -418,6 +428,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
" .word %P0\n" /* 1: do replace */
" .byte 2b - 1b\n" /* source len */
" .byte 0\n" /* replacement len */
+ " .byte 0\n" /* pad len */
".previous\n"
/* skipping size check since replacement size = 0 */
: : "i" (X86_FEATURE_ALWAYS) : : t_warn);
@@ -432,6 +443,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
" .word %P0\n" /* feature bit */
" .byte 2b - 1b\n" /* source len */
" .byte 0\n" /* replacement len */
+ " .byte 0\n" /* pad len */
".previous\n"
/* skipping size check since replacement size = 0 */
: : "i" (bit) : : t_no);
@@ -457,6 +469,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
" .word %P1\n" /* feature bit */
" .byte 2b - 1b\n" /* source len */
" .byte 4f - 3f\n" /* replacement len */
+ " .byte 0\n" /* pad len */
".previous\n"
".section .discard,\"aw\",@progbits\n"
" .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -483,31 +496,30 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
{
#ifdef CC_HAVE_ASM_GOTO
-/*
- * We need to spell the jumps to the compiler because, depending on the offset,
- * the replacement jump can be bigger than the original jump, and this we cannot
- * have. Thus, we force the jump to the widest, 4-byte, signed relative
- * offset even though the last would often fit in less bytes.
- */
- asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n"
+ asm_volatile_goto("1: jmp %l[t_dynamic]\n"
"2:\n"
+ ".skip -(((5f-4f) - (2b-1b)) > 0) * "
+ "((5f-4f) - (2b-1b)),0x90\n"
+ "3:\n"
".section .altinstructions,\"a\"\n"
" .long 1b - .\n" /* src offset */
- " .long 3f - .\n" /* repl offset */
+ " .long 4f - .\n" /* repl offset */
" .word %P1\n" /* always replace */
- " .byte 2b - 1b\n" /* src len */
- " .byte 4f - 3f\n" /* repl len */
+ " .byte 3b - 1b\n" /* src len */
+ " .byte 5f - 4f\n" /* repl len */
+ " .byte 3b - 2b\n" /* pad len */
".previous\n"
".section .altinstr_replacement,\"ax\"\n"
- "3: .byte 0xe9\n .long %l[t_no] - 2b\n"
- "4:\n"
+ "4: jmp %l[t_no]\n"
+ "5:\n"
".previous\n"
".section .altinstructions,\"a\"\n"
" .long 1b - .\n" /* src offset */
" .long 0\n" /* no replacement */
" .word %P0\n" /* feature bit */
- " .byte 2b - 1b\n" /* src len */
+ " .byte 3b - 1b\n" /* src len */
" .byte 0\n" /* repl len */
+ " .byte 0\n" /* pad len */
".previous\n"
: : "i" (bit), "i" (X86_FEATURE_ALWAYS)
: : t_dynamic, t_no);
@@ -527,6 +539,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
" .word %P2\n" /* always replace */
" .byte 2b - 1b\n" /* source len */
" .byte 4f - 3f\n" /* replacement len */
+ " .byte 0\n" /* pad len */
".previous\n"
".section .discard,\"aw\",@progbits\n"
" .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -541,6 +554,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
" .word %P1\n" /* feature bit */
" .byte 4b - 3b\n" /* src len */
" .byte 6f - 5f\n" /* repl len */
+ " .byte 0\n" /* pad len */
".previous\n"
".section .discard,\"aw\",@progbits\n"
" .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index a94b82e8f156..a0bf89fd2647 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -376,11 +376,16 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
* Pentium F0 0F bugfix can have resulted in the mapped
* IDT being write-protected.
*/
-#define set_intr_gate(n, addr) \
+#define set_intr_gate_notrace(n, addr) \
do { \
BUG_ON((unsigned)n > 0xFF); \
_set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \
__KERNEL_CS); \
+ } while (0)
+
+#define set_intr_gate(n, addr) \
+ do { \
+ set_intr_gate_notrace(n, addr); \
_trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\
0, 0, __KERNEL_CS); \
} while (0)
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index f6f15986df6c..de1cdaf4d743 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -86,11 +86,23 @@
CFI_ADJUST_CFA_OFFSET 8
.endm
+ .macro pushq_cfi_reg reg
+ pushq %\reg
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET \reg, 0
+ .endm
+
.macro popq_cfi reg
popq \reg
CFI_ADJUST_CFA_OFFSET -8
.endm
+ .macro popq_cfi_reg reg
+ popq %\reg
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE \reg
+ .endm
+
.macro pushfq_cfi
pushfq
CFI_ADJUST_CFA_OFFSET 8
@@ -116,11 +128,23 @@
CFI_ADJUST_CFA_OFFSET 4
.endm
+ .macro pushl_cfi_reg reg
+ pushl %\reg
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET \reg, 0
+ .endm
+
.macro popl_cfi reg
popl \reg
CFI_ADJUST_CFA_OFFSET -4
.endm
+ .macro popl_cfi_reg reg
+ popl %\reg
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE \reg
+ .endm
+
.macro pushfl_cfi
pushfl
CFI_ADJUST_CFA_OFFSET 4
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 779c2efe2e97..3ab0537872fb 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -40,14 +40,6 @@ static inline void e820_mark_nosave_regions(unsigned long limit_pfn)
}
#endif
-#ifdef CONFIG_MEMTEST
-extern void early_memtest(unsigned long start, unsigned long end);
-#else
-static inline void early_memtest(unsigned long start, unsigned long end)
-{
-}
-#endif
-
extern unsigned long e820_end_of_ram_pfn(void);
extern unsigned long e820_end_of_low_ram_pfn(void);
extern u64 early_reserve_e820(u64 sizet, u64 align);
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 25bce45c6fc4..3738b138b843 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -2,6 +2,8 @@
#define _ASM_X86_EFI_H
#include <asm/i387.h>
+#include <asm/pgtable.h>
+
/*
* We map the EFI regions needed for runtime services non-contiguously,
* with preserved alignment on virtual addresses starting from -4G down
@@ -89,8 +91,8 @@ extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size,
extern struct efi_scratch efi_scratch;
extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable);
extern int __init efi_memblock_x86_reserve_range(void);
-extern void __init efi_call_phys_prolog(void);
-extern void __init efi_call_phys_epilog(void);
+extern pgd_t * __init efi_call_phys_prolog(void);
+extern void __init efi_call_phys_epilog(pgd_t *save_pgd);
extern void __init efi_unmap_memmap(void);
extern void __init efi_memory_uc(u64 addr, unsigned long size);
extern void __init efi_map_region(efi_memory_desc_t *md);
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index ca3347a9dab5..f161c189c27b 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -171,10 +171,11 @@ do { \
static inline void elf_common_init(struct thread_struct *t,
struct pt_regs *regs, const u16 ds)
{
- regs->ax = regs->bx = regs->cx = regs->dx = 0;
- regs->si = regs->di = regs->bp = 0;
+ /* Commented-out registers are cleared in stub_execve */
+ /*regs->ax = regs->bx =*/ regs->cx = regs->dx = 0;
+ regs->si = regs->di /*= regs->bp*/ = 0;
regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0;
- regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
+ /*regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;*/
t->fs = t->gs = 0;
t->fsindex = t->gsindex = 0;
t->ds = t->es = ds;
@@ -338,9 +339,6 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
int uses_interp);
#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages
-extern unsigned long arch_randomize_brk(struct mm_struct *mm);
-#define arch_randomize_brk arch_randomize_brk
-
/*
* True on X86_32 or when emulating IA32 on X86_64
*/
@@ -365,6 +363,7 @@ enum align_flags {
struct va_alignment {
int flags;
unsigned long mask;
+ unsigned long bits;
} ____cacheline_aligned;
extern struct va_alignment va_align;
diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index 72ba21a8b5fc..da5e96756570 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -67,6 +67,34 @@ extern void finit_soft_fpu(struct i387_soft_struct *soft);
static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
#endif
+/*
+ * Must be run with preemption disabled: this clears the fpu_owner_task,
+ * on this CPU.
+ *
+ * This will disable any lazy FPU state restore of the current FPU state,
+ * but if the current thread owns the FPU, it will still be saved by.
+ */
+static inline void __cpu_disable_lazy_restore(unsigned int cpu)
+{
+ per_cpu(fpu_owner_task, cpu) = NULL;
+}
+
+/*
+ * Used to indicate that the FPU state in memory is newer than the FPU
+ * state in registers, and the FPU state should be reloaded next time the
+ * task is run. Only safe on the current task, or non-running tasks.
+ */
+static inline void task_disable_lazy_fpu_restore(struct task_struct *tsk)
+{
+ tsk->thread.fpu.last_cpu = ~0;
+}
+
+static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
+{
+ return new == this_cpu_read_stable(fpu_owner_task) &&
+ cpu == new->thread.fpu.last_cpu;
+}
+
static inline int is_ia32_compat_frame(void)
{
return config_enabled(CONFIG_IA32_EMULATION) &&
@@ -107,7 +135,6 @@ static __always_inline __pure bool use_fxsr(void)
static inline void fx_finit(struct i387_fxsave_struct *fx)
{
- memset(fx, 0, xstate_size);
fx->cwd = 0x37f;
fx->mxcsr = MXCSR_DEFAULT;
}
@@ -351,8 +378,14 @@ static inline void __thread_fpu_begin(struct task_struct *tsk)
__thread_set_has_fpu(tsk);
}
-static inline void __drop_fpu(struct task_struct *tsk)
+static inline void drop_fpu(struct task_struct *tsk)
{
+ /*
+ * Forget coprocessor state..
+ */
+ preempt_disable();
+ tsk->thread.fpu_counter = 0;
+
if (__thread_has_fpu(tsk)) {
/* Ignore delayed exceptions from user space */
asm volatile("1: fwait\n"
@@ -360,30 +393,29 @@ static inline void __drop_fpu(struct task_struct *tsk)
_ASM_EXTABLE(1b, 2b));
__thread_fpu_end(tsk);
}
-}
-static inline void drop_fpu(struct task_struct *tsk)
-{
- /*
- * Forget coprocessor state..
- */
- preempt_disable();
- tsk->thread.fpu_counter = 0;
- __drop_fpu(tsk);
clear_stopped_child_used_math(tsk);
preempt_enable();
}
-static inline void drop_init_fpu(struct task_struct *tsk)
+static inline void restore_init_xstate(void)
+{
+ if (use_xsave())
+ xrstor_state(init_xstate_buf, -1);
+ else
+ fxrstor_checking(&init_xstate_buf->i387);
+}
+
+/*
+ * Reset the FPU state in the eager case and drop it in the lazy case (later use
+ * will reinit it).
+ */
+static inline void fpu_reset_state(struct task_struct *tsk)
{
if (!use_eager_fpu())
drop_fpu(tsk);
- else {
- if (use_xsave())
- xrstor_state(init_xstate_buf, -1);
- else
- fxrstor_checking(&init_xstate_buf->i387);
- }
+ else
+ restore_init_xstate();
}
/*
@@ -400,24 +432,6 @@ static inline void drop_init_fpu(struct task_struct *tsk)
*/
typedef struct { int preload; } fpu_switch_t;
-/*
- * Must be run with preemption disabled: this clears the fpu_owner_task,
- * on this CPU.
- *
- * This will disable any lazy FPU state restore of the current FPU state,
- * but if the current thread owns the FPU, it will still be saved by.
- */
-static inline void __cpu_disable_lazy_restore(unsigned int cpu)
-{
- per_cpu(fpu_owner_task, cpu) = NULL;
-}
-
-static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
-{
- return new == this_cpu_read_stable(fpu_owner_task) &&
- cpu == new->thread.fpu.last_cpu;
-}
-
static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu)
{
fpu_switch_t fpu;
@@ -426,13 +440,17 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta
* If the task has used the math, pre-load the FPU on xsave processors
* or if the past 5 consecutive context-switches used math.
*/
- fpu.preload = tsk_used_math(new) && (use_eager_fpu() ||
- new->thread.fpu_counter > 5);
+ fpu.preload = tsk_used_math(new) &&
+ (use_eager_fpu() || new->thread.fpu_counter > 5);
+
if (__thread_has_fpu(old)) {
if (!__save_init_fpu(old))
- cpu = ~0;
- old->thread.fpu.last_cpu = cpu;
- old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */
+ task_disable_lazy_fpu_restore(old);
+ else
+ old->thread.fpu.last_cpu = cpu;
+
+ /* But leave fpu_owner_task! */
+ old->thread.fpu.has_fpu = 0;
/* Don't change CR0.TS if we just switch! */
if (fpu.preload) {
@@ -443,10 +461,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta
stts();
} else {
old->thread.fpu_counter = 0;
- old->thread.fpu.last_cpu = ~0;
+ task_disable_lazy_fpu_restore(old);
if (fpu.preload) {
new->thread.fpu_counter++;
- if (!use_eager_fpu() && fpu_lazy_restore(new, cpu))
+ if (fpu_lazy_restore(new, cpu))
fpu.preload = 0;
else
prefetch(new->thread.fpu.state);
@@ -466,7 +484,7 @@ static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu)
{
if (fpu.preload) {
if (unlikely(restore_fpu_checking(new)))
- drop_init_fpu(new);
+ fpu_reset_state(new);
}
}
@@ -495,10 +513,12 @@ static inline int restore_xstate_sig(void __user *buf, int ia32_frame)
}
/*
- * Need to be preemption-safe.
+ * Needs to be preemption-safe.
*
* NOTE! user_fpu_begin() must be used only immediately before restoring
- * it. This function does not do any save/restore on their own.
+ * the save state. It does not do any saving/restoring on its own. In
+ * lazy FPU mode, it is just an optimization to avoid a #NM exception,
+ * the task can lose the FPU right after preempt_enable().
*/
static inline void user_fpu_begin(void)
{
@@ -520,24 +540,6 @@ static inline void __save_fpu(struct task_struct *tsk)
}
/*
- * These disable preemption on their own and are safe
- */
-static inline void save_init_fpu(struct task_struct *tsk)
-{
- WARN_ON_ONCE(!__thread_has_fpu(tsk));
-
- if (use_eager_fpu()) {
- __save_fpu(tsk);
- return;
- }
-
- preempt_disable();
- __save_init_fpu(tsk);
- __thread_fpu_end(tsk);
- preempt_enable();
-}
-
-/*
* i387 state interaction
*/
static inline unsigned short get_fpu_cwd(struct task_struct *tsk)
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 9662290e0b20..e9571ddabc4f 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -181,10 +181,9 @@ extern __visible void smp_call_function_single_interrupt(struct pt_regs *);
extern __visible void smp_invalidate_interrupt(struct pt_regs *);
#endif
-extern void (*__initconst interrupt[FIRST_SYSTEM_VECTOR
- - FIRST_EXTERNAL_VECTOR])(void);
+extern char irq_entries_start[];
#ifdef CONFIG_TRACING
-#define trace_interrupt interrupt
+#define trace_irq_entries_start irq_entries_start
#endif
#define VECTOR_UNDEFINED (-1)
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 47f29b1d1846..e7814b74caf8 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -69,7 +69,7 @@ struct insn {
const insn_byte_t *next_byte;
};
-#define MAX_INSN_SIZE 16
+#define MAX_INSN_SIZE 15
#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h
index f42a04735a0a..e37d6b3ad983 100644
--- a/arch/x86/include/asm/iommu_table.h
+++ b/arch/x86/include/asm/iommu_table.h
@@ -79,11 +79,12 @@ struct iommu_table_entry {
* d). Similar to the 'init', except that this gets called from pci_iommu_init
* where we do have a memory allocator.
*
- * The standard vs the _FINISH differs in that the _FINISH variant will
- * continue detecting other IOMMUs in the call list after the
- * the detection routine returns a positive number. The _FINISH will
- * stop the execution chain. Both will still call the 'init' and
- * 'late_init' functions if they are set.
+ * The standard IOMMU_INIT differs from the IOMMU_INIT_FINISH variant
+ * in that the former will continue detecting other IOMMUs in the call
+ * list after the detection routine returns a positive number, while the
+ * latter will stop the execution chain upon first successful detection.
+ * Both variants will still call the 'init' and 'late_init' functions if
+ * they are set.
*/
#define IOMMU_INIT_FINISH(_detect, _depend, _init, _late_init) \
__IOMMU_INIT(_detect, _depend, _init, _late_init, 1)
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 0a8b519226b8..b77f5edb03b0 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -136,10 +136,6 @@ static inline notrace unsigned long arch_local_irq_save(void)
#define USERGS_SYSRET32 \
swapgs; \
sysretl
-#define ENABLE_INTERRUPTS_SYSEXIT32 \
- swapgs; \
- sti; \
- sysexit
#else
#define INTERRUPT_RETURN iret
@@ -163,22 +159,27 @@ static inline int arch_irqs_disabled(void)
return arch_irqs_disabled_flags(flags);
}
+#endif /* !__ASSEMBLY__ */
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_TRACE_IRQFLAGS
+# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
+# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
#else
-
-#ifdef CONFIG_X86_64
-#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
-#define ARCH_LOCKDEP_SYS_EXIT_IRQ \
+# define TRACE_IRQS_ON
+# define TRACE_IRQS_OFF
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# ifdef CONFIG_X86_64
+# define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
+# define LOCKDEP_SYS_EXIT_IRQ \
TRACE_IRQS_ON; \
sti; \
- SAVE_REST; \
- LOCKDEP_SYS_EXIT; \
- RESTORE_REST; \
+ call lockdep_sys_exit_thunk; \
cli; \
TRACE_IRQS_OFF;
-
-#else
-#define ARCH_LOCKDEP_SYS_EXIT \
+# else
+# define LOCKDEP_SYS_EXIT \
pushl %eax; \
pushl %ecx; \
pushl %edx; \
@@ -186,24 +187,12 @@ static inline int arch_irqs_disabled(void)
popl %edx; \
popl %ecx; \
popl %eax;
-
-#define ARCH_LOCKDEP_SYS_EXIT_IRQ
-#endif
-
-#ifdef CONFIG_TRACE_IRQFLAGS
-# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
-# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
+# define LOCKDEP_SYS_EXIT_IRQ
+# endif
#else
-# define TRACE_IRQS_ON
-# define TRACE_IRQS_OFF
-#endif
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT
-# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ
-# else
# define LOCKDEP_SYS_EXIT
# define LOCKDEP_SYS_EXIT_IRQ
-# endif
-
+#endif
#endif /* __ASSEMBLY__ */
+
#endif
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 6a2cefb4395a..a4c1cf7e93f8 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -1,7 +1,7 @@
#ifndef _ASM_X86_JUMP_LABEL_H
#define _ASM_X86_JUMP_LABEL_H
-#ifdef __KERNEL__
+#ifndef __ASSEMBLY__
#include <linux/stringify.h>
#include <linux/types.h>
@@ -30,8 +30,6 @@ l_yes:
return true;
}
-#endif /* __KERNEL__ */
-
#ifdef CONFIG_X86_64
typedef u64 jump_label_t;
#else
@@ -44,4 +42,5 @@ struct jump_entry {
jump_label_t key;
};
+#endif /* __ASSEMBLY__ */
#endif
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a236e39cc385..dea2e7e962e3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -81,11 +81,6 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
(base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
}
-#define SELECTOR_TI_MASK (1 << 2)
-#define SELECTOR_RPL_MASK 0x03
-
-#define IOPL_SHIFT 12
-
#define KVM_PERMILLE_MMU_PAGES 20
#define KVM_MIN_ALLOC_MMU_PAGES 64
#define KVM_MMU_HASH_SHIFT 10
@@ -345,6 +340,7 @@ struct kvm_pmu {
enum {
KVM_DEBUGREG_BP_ENABLED = 1,
KVM_DEBUGREG_WONT_EXIT = 2,
+ KVM_DEBUGREG_RELOAD = 4,
};
struct kvm_vcpu_arch {
@@ -431,6 +427,9 @@ struct kvm_vcpu_arch {
int cpuid_nent;
struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
+
+ int maxphyaddr;
+
/* emulate context */
struct x86_emulate_ctxt emulate_ctxt;
@@ -550,11 +549,20 @@ struct kvm_arch_memory_slot {
struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
};
+/*
+ * We use as the mode the number of bits allocated in the LDR for the
+ * logical processor ID. It happens that these are all powers of two.
+ * This makes it is very easy to detect cases where the APICs are
+ * configured for multiple modes; in that case, we cannot use the map and
+ * hence cannot use kvm_irq_delivery_to_apic_fast either.
+ */
+#define KVM_APIC_MODE_XAPIC_CLUSTER 4
+#define KVM_APIC_MODE_XAPIC_FLAT 8
+#define KVM_APIC_MODE_X2APIC 16
+
struct kvm_apic_map {
struct rcu_head rcu;
- u8 ldr_bits;
- /* fields bellow are used to decode ldr values in different modes */
- u32 cid_shift, cid_mask, lid_mask, broadcast;
+ u8 mode;
struct kvm_lapic *phys_map[256];
/* first index is cluster id second is cpu id in a cluster */
struct kvm_lapic *logical_map[16][16];
@@ -859,6 +867,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
struct kvm_memory_slot *memslot);
+void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ struct kvm_memory_slot *memslot);
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
struct kvm_memory_slot *memslot);
void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
@@ -933,6 +943,7 @@ struct x86_emulate_ctxt;
int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
+int kvm_vcpu_halt(struct kvm_vcpu *vcpu);
int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
@@ -1128,7 +1139,6 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index e62cf897f781..c1adf33fdd0d 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -115,7 +115,7 @@ static inline void kvm_spinlock_init(void)
static inline bool kvm_para_available(void)
{
- return 0;
+ return false;
}
static inline unsigned int kvm_arch_para_features(void)
diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h
index a455a53d789a..2d29197bd2fb 100644
--- a/arch/x86/include/asm/livepatch.h
+++ b/arch/x86/include/asm/livepatch.h
@@ -32,8 +32,8 @@ static inline int klp_check_compiler_support(void)
#endif
return 0;
}
-extern int klp_write_module_reloc(struct module *mod, unsigned long type,
- unsigned long loc, unsigned long value);
+int klp_write_module_reloc(struct module *mod, unsigned long type,
+ unsigned long loc, unsigned long value);
static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip)
{
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 9b3de99dc004..1f5a86d518db 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -116,6 +116,12 @@ struct mca_config {
u32 rip_msr;
};
+struct mce_vendor_flags {
+ __u64 overflow_recov : 1, /* cpuid_ebx(80000007) */
+ __reserved_0 : 63;
+};
+extern struct mce_vendor_flags mce_flags;
+
extern struct mca_config mca_cfg;
extern void mce_register_decode_chain(struct notifier_block *nb);
extern void mce_unregister_decode_chain(struct notifier_block *nb);
@@ -128,9 +134,11 @@ extern int mce_p5_enabled;
#ifdef CONFIG_X86_MCE
int mcheck_init(void);
void mcheck_cpu_init(struct cpuinfo_x86 *c);
+void mcheck_vendor_init_severity(void);
#else
static inline int mcheck_init(void) { return 0; }
static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
+static inline void mcheck_vendor_init_severity(void) {}
#endif
#ifdef CONFIG_X86_ANCIENT_MCE
@@ -183,11 +191,11 @@ typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
enum mcp_flags {
- MCP_TIMESTAMP = (1 << 0), /* log time stamp */
- MCP_UC = (1 << 1), /* log uncorrected errors */
- MCP_DONTLOG = (1 << 2), /* only clear, don't log */
+ MCP_TIMESTAMP = BIT(0), /* log time stamp */
+ MCP_UC = BIT(1), /* log uncorrected errors */
+ MCP_DONTLOG = BIT(2), /* only clear, don't log */
};
-void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
+bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
int mce_notify_irq(void);
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 201b520521ed..2fb20d6f7e23 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -75,6 +75,79 @@ static inline void __exit exit_amd_microcode(void) {}
#ifdef CONFIG_MICROCODE_EARLY
#define MAX_UCODE_COUNT 128
+
+#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
+#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
+#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
+#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
+#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
+#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
+#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
+
+#define CPUID_IS(a, b, c, ebx, ecx, edx) \
+ (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c))))
+
+/*
+ * In early loading microcode phase on BSP, boot_cpu_data is not set up yet.
+ * x86_vendor() gets vendor id for BSP.
+ *
+ * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify
+ * coding, we still use x86_vendor() to get vendor id for AP.
+ *
+ * x86_vendor() gets vendor information directly from CPUID.
+ */
+static inline int x86_vendor(void)
+{
+ u32 eax = 0x00000000;
+ u32 ebx, ecx = 0, edx;
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
+ return X86_VENDOR_INTEL;
+
+ if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
+ return X86_VENDOR_AMD;
+
+ return X86_VENDOR_UNKNOWN;
+}
+
+static inline unsigned int __x86_family(unsigned int sig)
+{
+ unsigned int x86;
+
+ x86 = (sig >> 8) & 0xf;
+
+ if (x86 == 0xf)
+ x86 += (sig >> 20) & 0xff;
+
+ return x86;
+}
+
+static inline unsigned int x86_family(void)
+{
+ u32 eax = 0x00000001;
+ u32 ebx, ecx = 0, edx;
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ return __x86_family(eax);
+}
+
+static inline unsigned int x86_model(unsigned int sig)
+{
+ unsigned int x86, model;
+
+ x86 = __x86_family(sig);
+
+ model = (sig >> 4) & 0xf;
+
+ if (x86 == 0x6 || x86 == 0xf)
+ model += ((sig >> 16) & 0xf) << 4;
+
+ return model;
+}
+
extern void __init load_ucode_bsp(void);
extern void load_ucode_ap(void);
extern int __init save_microcode_in_initrd(void);
diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
index dd4c20043ce7..2b9209c46ca9 100644
--- a/arch/x86/include/asm/microcode_intel.h
+++ b/arch/x86/include/asm/microcode_intel.h
@@ -56,12 +56,15 @@ struct extended_sigtable {
#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
-extern int
-get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev);
+extern int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc);
extern int microcode_sanity_check(void *mc, int print_err);
-extern int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev);
-extern int
-update_match_revision(struct microcode_header_intel *mc_header, int rev);
+extern int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc);
+
+static inline int
+revision_is_newer(struct microcode_header_intel *mc_header, int rev)
+{
+ return (mc_header->rev <= rev) ? 0 : 1;
+}
#ifdef CONFIG_MICROCODE_INTEL_EARLY
extern void __init load_ucode_intel_bsp(void);
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index a1410db38a1a..653dfa7662e1 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -30,6 +30,14 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
:: "a" (eax), "c" (ecx));
}
+static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+{
+ trace_hardirqs_on();
+ /* "mwait %eax, %ecx;" */
+ asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
+ :: "a" (eax), "c" (ecx));
+}
+
/*
* This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
* which can obviate IPI to trigger checking of need_resched.
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index f97fbe3abb67..c7c712f2648b 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,8 +40,10 @@
#ifdef CONFIG_X86_64
#include <asm/page_64_types.h>
+#define IOREMAP_MAX_ORDER (PUD_SHIFT)
#else
#include <asm/page_32_types.h>
+#define IOREMAP_MAX_ORDER (PMD_SHIFT)
#endif /* CONFIG_X86_64 */
#ifndef __ASSEMBLY__
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 965c47d254aa..8957810ad7d1 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -545,7 +545,7 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val);
}
-#if PAGETABLE_LEVELS >= 3
+#if CONFIG_PGTABLE_LEVELS >= 3
static inline pmd_t __pmd(pmdval_t val)
{
pmdval_t ret;
@@ -585,7 +585,7 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
PVOP_VCALL2(pv_mmu_ops.set_pud, pudp,
val);
}
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
static inline pud_t __pud(pudval_t val)
{
pudval_t ret;
@@ -636,9 +636,9 @@ static inline void pud_clear(pud_t *pudp)
set_pud(pudp, __pud(0));
}
-#endif /* PAGETABLE_LEVELS == 4 */
+#endif /* CONFIG_PGTABLE_LEVELS == 4 */
-#endif /* PAGETABLE_LEVELS >= 3 */
+#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
#ifdef CONFIG_X86_PAE
/* Special-case pte-setting operations for PAE, which can't update a
@@ -976,11 +976,6 @@ extern void default_banner(void);
PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
CLBR_NONE, \
jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
-
-#define ENABLE_INTERRUPTS_SYSEXIT32 \
- PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \
- CLBR_NONE, \
- jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
#endif /* CONFIG_X86_32 */
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 7549b8b369e4..f7b0b5c112f2 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -294,7 +294,7 @@ struct pv_mmu_ops {
struct paravirt_callee_save pgd_val;
struct paravirt_callee_save make_pgd;
-#if PAGETABLE_LEVELS >= 3
+#if CONFIG_PGTABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
@@ -308,13 +308,13 @@ struct pv_mmu_ops {
struct paravirt_callee_save pmd_val;
struct paravirt_callee_save make_pmd;
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
struct paravirt_callee_save pud_val;
struct paravirt_callee_save make_pud;
void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
-#endif /* PAGETABLE_LEVELS == 4 */
-#endif /* PAGETABLE_LEVELS >= 3 */
+#endif /* CONFIG_PGTABLE_LEVELS == 4 */
+#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
struct pv_lazy_ops lazy_mode;
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index c4412e972bbd..bf7f8b55b0f9 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -77,7 +77,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
#define pmd_pgtable(pmd) pmd_page(pmd)
-#if PAGETABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
{
struct page *page;
@@ -116,7 +116,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
}
#endif /* CONFIG_X86_PAE */
-#if PAGETABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
{
paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
@@ -142,7 +142,7 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
___pud_free_tlb(tlb, pud);
}
-#endif /* PAGETABLE_LEVELS > 3 */
-#endif /* PAGETABLE_LEVELS > 2 */
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */
+#endif /* CONFIG_PGTABLE_LEVELS > 2 */
#endif /* _ASM_X86_PGALLOC_H */
diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h
index daacc23e3fb9..392576433e77 100644
--- a/arch/x86/include/asm/pgtable-2level_types.h
+++ b/arch/x86/include/asm/pgtable-2level_types.h
@@ -17,7 +17,6 @@ typedef union {
#endif /* !__ASSEMBLY__ */
#define SHARED_KERNEL_PMD 0
-#define PAGETABLE_LEVELS 2
/*
* traditional i386 two-level paging structure:
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index 1bd5876c8649..bcc89625ebe5 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -24,8 +24,6 @@ typedef union {
#define SHARED_KERNEL_PMD 1
#endif
-#define PAGETABLE_LEVELS 3
-
/*
* PGDIR_SHIFT determines what a top-level page table entry can map
*/
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a0c35bf6cb92..fe57e7a98839 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -551,7 +551,7 @@ static inline unsigned long pages_to_mb(unsigned long npg)
return npg >> (20 - PAGE_SHIFT);
}
-#if PAGETABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
static inline int pud_none(pud_t pud)
{
return native_pud_val(pud) == 0;
@@ -594,9 +594,9 @@ static inline int pud_large(pud_t pud)
{
return 0;
}
-#endif /* PAGETABLE_LEVELS > 2 */
+#endif /* CONFIG_PGTABLE_LEVELS > 2 */
-#if PAGETABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
static inline int pgd_present(pgd_t pgd)
{
return pgd_flags(pgd) & _PAGE_PRESENT;
@@ -633,7 +633,7 @@ static inline int pgd_none(pgd_t pgd)
{
return !native_pgd_val(pgd);
}
-#endif /* PAGETABLE_LEVELS > 3 */
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 602b6028c5b6..e6844dfb4471 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -20,7 +20,6 @@ typedef struct { pteval_t pte; } pte_t;
#endif /* !__ASSEMBLY__ */
#define SHARED_KERNEL_PMD 0
-#define PAGETABLE_LEVELS 4
/*
* PGDIR_SHIFT determines what a top-level page table entry can map
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 8c7c10802e9c..78f0c8cbe316 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -234,7 +234,7 @@ static inline pgdval_t pgd_flags(pgd_t pgd)
return native_pgd_val(pgd) & PTE_FLAGS_MASK;
}
-#if PAGETABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
typedef struct { pudval_t pud; } pud_t;
static inline pud_t native_make_pud(pmdval_t val)
@@ -255,7 +255,7 @@ static inline pudval_t native_pud_val(pud_t pud)
}
#endif
-#if PAGETABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
typedef struct { pmdval_t pmd; } pmd_t;
static inline pmd_t native_make_pmd(pmdval_t val)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index ec1c93588cef..23ba6765b718 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -109,6 +109,9 @@ struct cpuinfo_x86 {
/* in KB - valid for CPUS which support this call: */
int x86_cache_size;
int x86_cache_alignment; /* In bytes */
+ /* Cache QoS architectural values: */
+ int x86_cache_max_rmid; /* max index */
+ int x86_cache_occ_scale; /* scale to bytes */
int x86_power;
unsigned long loops_per_jiffy;
/* cpuid returned max cores value: */
@@ -210,8 +213,23 @@ struct x86_hw_tss {
unsigned long sp0;
unsigned short ss0, __ss0h;
unsigned long sp1;
- /* ss1 caches MSR_IA32_SYSENTER_CS: */
- unsigned short ss1, __ss1h;
+
+ /*
+ * We don't use ring 1, so ss1 is a convenient scratch space in
+ * the same cacheline as sp0. We use ss1 to cache the value in
+ * MSR_IA32_SYSENTER_CS. When we context switch
+ * MSR_IA32_SYSENTER_CS, we first check if the new value being
+ * written matches ss1, and, if it's not, then we wrmsr the new
+ * value and update ss1.
+ *
+ * The only reason we context switch MSR_IA32_SYSENTER_CS is
+ * that we set it to zero in vm86 tasks to avoid corrupting the
+ * stack if we were to go through the sysenter path from vm86
+ * mode.
+ */
+ unsigned short ss1; /* MSR_IA32_SYSENTER_CS */
+
+ unsigned short __ss1h;
unsigned long sp2;
unsigned short ss2, __ss2h;
unsigned long __cr3;
@@ -276,13 +294,17 @@ struct tss_struct {
unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
/*
- * .. and then another 0x100 bytes for the emergency kernel stack:
+ * Space for the temporary SYSENTER stack:
*/
- unsigned long stack[64];
+ unsigned long SYSENTER_stack[64];
} ____cacheline_aligned;
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+
+#ifdef CONFIG_X86_32
+DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
+#endif
/*
* Save the original ist values for checking stack pointers during debugging
@@ -474,7 +496,6 @@ struct thread_struct {
#ifdef CONFIG_X86_32
unsigned long sysenter_cs;
#else
- unsigned long usersp; /* Copy from PDA */
unsigned short es;
unsigned short ds;
unsigned short fsindex;
@@ -564,6 +585,16 @@ static inline void native_swapgs(void)
#endif
}
+static inline unsigned long current_top_of_stack(void)
+{
+#ifdef CONFIG_X86_64
+ return this_cpu_read_stable(cpu_tss.x86_tss.sp0);
+#else
+ /* sp0 on x86_32 is special in and around vm86 mode. */
+ return this_cpu_read_stable(cpu_current_top_of_stack);
+#endif
+}
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
@@ -761,10 +792,10 @@ extern char ignore_fpu_irq;
#define ARCH_HAS_SPINLOCK_PREFETCH
#ifdef CONFIG_X86_32
-# define BASE_PREFETCH ASM_NOP4
+# define BASE_PREFETCH ""
# define ARCH_HAS_PREFETCH
#else
-# define BASE_PREFETCH "prefetcht0 (%1)"
+# define BASE_PREFETCH "prefetcht0 %P1"
#endif
/*
@@ -775,10 +806,9 @@ extern char ignore_fpu_irq;
*/
static inline void prefetch(const void *x)
{
- alternative_input(BASE_PREFETCH,
- "prefetchnta (%1)",
+ alternative_input(BASE_PREFETCH, "prefetchnta %P1",
X86_FEATURE_XMM,
- "r" (x));
+ "m" (*(const char *)x));
}
/*
@@ -788,10 +818,9 @@ static inline void prefetch(const void *x)
*/
static inline void prefetchw(const void *x)
{
- alternative_input(BASE_PREFETCH,
- "prefetchw (%1)",
- X86_FEATURE_3DNOW,
- "r" (x));
+ alternative_input(BASE_PREFETCH, "prefetchw %P1",
+ X86_FEATURE_3DNOWPREFETCH,
+ "m" (*(const char *)x));
}
static inline void spin_lock_prefetch(const void *x)
@@ -799,6 +828,9 @@ static inline void spin_lock_prefetch(const void *x)
prefetchw(x);
}
+#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
+ TOP_OF_KERNEL_STACK_PADDING)
+
#ifdef CONFIG_X86_32
/*
* User space process size: 3GB (default).
@@ -809,39 +841,16 @@ static inline void spin_lock_prefetch(const void *x)
#define STACK_TOP_MAX STACK_TOP
#define INIT_THREAD { \
- .sp0 = sizeof(init_stack) + (long)&init_stack, \
+ .sp0 = TOP_OF_INIT_STACK, \
.vm86_info = NULL, \
.sysenter_cs = __KERNEL_CS, \
.io_bitmap_ptr = NULL, \
}
-/*
- * Note that the .io_bitmap member must be extra-big. This is because
- * the CPU will access an additional byte beyond the end of the IO
- * permission bitmap. The extra byte must be all 1 bits, and must
- * be within the limit.
- */
-#define INIT_TSS { \
- .x86_tss = { \
- .sp0 = sizeof(init_stack) + (long)&init_stack, \
- .ss0 = __KERNEL_DS, \
- .ss1 = __KERNEL_CS, \
- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
- }, \
- .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
-}
-
extern unsigned long thread_saved_pc(struct task_struct *tsk);
-#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
-#define KSTK_TOP(info) \
-({ \
- unsigned long *__ptr = (unsigned long *)(info); \
- (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
-})
-
/*
- * The below -8 is to reserve 8 bytes on top of the ring0 stack.
+ * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack.
* This is necessary to guarantee that the entire "struct pt_regs"
* is accessible even if the CPU haven't stored the SS/ESP registers
* on the stack (interrupt gate does not save these registers
@@ -850,11 +859,11 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
* "struct pt_regs" is possible, but they may contain the
* completely wrong values.
*/
-#define task_pt_regs(task) \
-({ \
- struct pt_regs *__regs__; \
- __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
- __regs__ - 1; \
+#define task_pt_regs(task) \
+({ \
+ unsigned long __ptr = (unsigned long)task_stack_page(task); \
+ __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \
+ ((struct pt_regs *)__ptr) - 1; \
})
#define KSTK_ESP(task) (task_pt_regs(task)->sp)
@@ -886,11 +895,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
#define STACK_TOP_MAX TASK_SIZE_MAX
#define INIT_THREAD { \
- .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
-}
-
-#define INIT_TSS { \
- .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+ .sp0 = TOP_OF_INIT_STACK \
}
/*
@@ -902,11 +907,6 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
extern unsigned long KSTK_ESP(struct task_struct *task);
-/*
- * User space RSP while inside the SYSCALL fast path
- */
-DECLARE_PER_CPU(unsigned long, old_rsp);
-
#endif /* CONFIG_X86_64 */
extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 86fc2bb82287..19507ffa5d28 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -31,13 +31,17 @@ struct pt_regs {
#else /* __i386__ */
struct pt_regs {
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
unsigned long r15;
unsigned long r14;
unsigned long r13;
unsigned long r12;
unsigned long bp;
unsigned long bx;
-/* arguments: non interrupts/non tracing syscalls only save up to here*/
+/* These regs are callee-clobbered. Always saved on kernel entry. */
unsigned long r11;
unsigned long r10;
unsigned long r9;
@@ -47,9 +51,12 @@ struct pt_regs {
unsigned long dx;
unsigned long si;
unsigned long di;
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
unsigned long orig_ax;
-/* end of arguments */
-/* cpu exception frame or undefined */
+/* Return frame for iretq */
unsigned long ip;
unsigned long cs;
unsigned long flags;
@@ -89,11 +96,13 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
}
/*
- * user_mode_vm(regs) determines whether a register set came from user mode.
- * This is true if V8086 mode was enabled OR if the register set was from
- * protected mode with RPL-3 CS value. This tricky test checks that with
- * one comparison. Many places in the kernel can bypass this full check
- * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
+ * user_mode(regs) determines whether a register set came from user
+ * mode. On x86_32, this is true if V8086 mode was enabled OR if the
+ * register set was from protected mode with RPL-3 CS value. This
+ * tricky test checks that with one comparison.
+ *
+ * On x86_64, vm86 mode is mercifully nonexistent, and we don't need
+ * the extra check.
*/
static inline int user_mode(struct pt_regs *regs)
{
@@ -104,16 +113,6 @@ static inline int user_mode(struct pt_regs *regs)
#endif
}
-static inline int user_mode_vm(struct pt_regs *regs)
-{
-#ifdef CONFIG_X86_32
- return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >=
- USER_RPL;
-#else
- return user_mode(regs);
-#endif
-}
-
static inline int v8086_mode(struct pt_regs *regs)
{
#ifdef CONFIG_X86_32
@@ -138,12 +137,8 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
#endif
}
-#define current_user_stack_pointer() this_cpu_read(old_rsp)
-/* ia32 vs. x32 difference */
-#define compat_user_stack_pointer() \
- (test_thread_flag(TIF_IA32) \
- ? current_pt_regs()->sp \
- : this_cpu_read(old_rsp))
+#define current_user_stack_pointer() current_pt_regs()->sp
+#define compat_user_stack_pointer() current_pt_regs()->sp
#endif
#ifdef CONFIG_X86_32
@@ -248,7 +243,7 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
*/
#define arch_ptrace_stop_needed(code, info) \
({ \
- set_thread_flag(TIF_NOTIFY_RESUME); \
+ force_iret(); \
false; \
})
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index d6b078e9fa28..25b1cc07d496 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -95,6 +95,7 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
struct pvclock_vsyscall_time_info {
struct pvclock_vcpu_time_info pvti;
+ u32 migrate_count;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index db257a58571f..5a9856eb12ba 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -3,8 +3,10 @@
#include <linux/const.h>
-/* Constructor for a conventional segment GDT (or LDT) entry */
-/* This is a macro so it can be used in initializers */
+/*
+ * Constructor for a conventional segment GDT (or LDT) entry.
+ * This is a macro so it can be used in initializers.
+ */
#define GDT_ENTRY(flags, base, limit) \
((((base) & _AC(0xff000000,ULL)) << (56-24)) | \
(((flags) & _AC(0x0000f0ff,ULL)) << 40) | \
@@ -12,198 +14,228 @@
(((base) & _AC(0x00ffffff,ULL)) << 16) | \
(((limit) & _AC(0x0000ffff,ULL))))
-/* Simple and small GDT entries for booting only */
+/* Simple and small GDT entries for booting only: */
#define GDT_ENTRY_BOOT_CS 2
-#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
+#define GDT_ENTRY_BOOT_DS 3
+#define GDT_ENTRY_BOOT_TSS 4
+#define __BOOT_CS (GDT_ENTRY_BOOT_CS*8)
+#define __BOOT_DS (GDT_ENTRY_BOOT_DS*8)
+#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS*8)
+
+/*
+ * Bottom two bits of selector give the ring
+ * privilege level
+ */
+#define SEGMENT_RPL_MASK 0x3
-#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
-#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
+/* User mode is privilege level 3: */
+#define USER_RPL 0x3
-#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2)
-#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8)
+/* Bit 2 is Table Indicator (TI): selects between LDT or GDT */
+#define SEGMENT_TI_MASK 0x4
+/* LDT segment has TI set ... */
+#define SEGMENT_LDT 0x4
+/* ... GDT has it cleared */
+#define SEGMENT_GDT 0x0
-#define SEGMENT_RPL_MASK 0x3 /*
- * Bottom two bits of selector give the ring
- * privilege level
- */
-#define SEGMENT_TI_MASK 0x4 /* Bit 2 is table indicator (LDT/GDT) */
-#define USER_RPL 0x3 /* User mode is privilege level 3 */
-#define SEGMENT_LDT 0x4 /* LDT segment has TI set... */
-#define SEGMENT_GDT 0x0 /* ... GDT has it cleared */
+#define GDT_ENTRY_INVALID_SEG 0
#ifdef CONFIG_X86_32
/*
* The layout of the per-CPU GDT under Linux:
*
- * 0 - null
+ * 0 - null <=== cacheline #1
* 1 - reserved
* 2 - reserved
* 3 - reserved
*
- * 4 - unused <==== new cacheline
+ * 4 - unused <=== cacheline #2
* 5 - unused
*
* ------- start of TLS (Thread-Local Storage) segments:
*
* 6 - TLS segment #1 [ glibc's TLS segment ]
* 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
- * 8 - TLS segment #3
+ * 8 - TLS segment #3 <=== cacheline #3
* 9 - reserved
* 10 - reserved
* 11 - reserved
*
* ------- start of kernel segments:
*
- * 12 - kernel code segment <==== new cacheline
+ * 12 - kernel code segment <=== cacheline #4
* 13 - kernel data segment
* 14 - default user CS
* 15 - default user DS
- * 16 - TSS
+ * 16 - TSS <=== cacheline #5
* 17 - LDT
* 18 - PNPBIOS support (16->32 gate)
* 19 - PNPBIOS support
- * 20 - PNPBIOS support
+ * 20 - PNPBIOS support <=== cacheline #6
* 21 - PNPBIOS support
* 22 - PNPBIOS support
* 23 - APM BIOS support
- * 24 - APM BIOS support
+ * 24 - APM BIOS support <=== cacheline #7
* 25 - APM BIOS support
*
* 26 - ESPFIX small SS
* 27 - per-cpu [ offset to per-cpu data area ]
- * 28 - stack_canary-20 [ for stack protector ]
+ * 28 - stack_canary-20 [ for stack protector ] <=== cacheline #8
* 29 - unused
* 30 - unused
* 31 - TSS for double fault handler
*/
-#define GDT_ENTRY_TLS_MIN 6
-#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
+#define GDT_ENTRY_TLS_MIN 6
+#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
+#define GDT_ENTRY_KERNEL_CS 12
+#define GDT_ENTRY_KERNEL_DS 13
#define GDT_ENTRY_DEFAULT_USER_CS 14
-
#define GDT_ENTRY_DEFAULT_USER_DS 15
+#define GDT_ENTRY_TSS 16
+#define GDT_ENTRY_LDT 17
+#define GDT_ENTRY_PNPBIOS_CS32 18
+#define GDT_ENTRY_PNPBIOS_CS16 19
+#define GDT_ENTRY_PNPBIOS_DS 20
+#define GDT_ENTRY_PNPBIOS_TS1 21
+#define GDT_ENTRY_PNPBIOS_TS2 22
+#define GDT_ENTRY_APMBIOS_BASE 23
+
+#define GDT_ENTRY_ESPFIX_SS 26
+#define GDT_ENTRY_PERCPU 27
+#define GDT_ENTRY_STACK_CANARY 28
+
+#define GDT_ENTRY_DOUBLEFAULT_TSS 31
-#define GDT_ENTRY_KERNEL_BASE (12)
+/*
+ * Number of entries in the GDT table:
+ */
+#define GDT_ENTRIES 32
-#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE+0)
+/*
+ * Segment selector values corresponding to the above entries:
+ */
-#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE+1)
+#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)
+#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
+#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
+#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
+#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8)
-#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE+4)
-#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE+5)
+/* segment for calling fn: */
+#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32*8)
+/* code segment for BIOS: */
+#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16*8)
-#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE+6)
-#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE+11)
+/* "Is this PNP code selector (PNP_CS32 or PNP_CS16)?" */
+#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == PNP_CS32)
-#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE+14)
-#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8)
+/* data segment for BIOS: */
+#define PNP_DS (GDT_ENTRY_PNPBIOS_DS*8)
+/* transfer data segment: */
+#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1*8)
+/* another data segment: */
+#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2*8)
-#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE+15)
#ifdef CONFIG_SMP
-#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
+# define __KERNEL_PERCPU (GDT_ENTRY_PERCPU*8)
#else
-#define __KERNEL_PERCPU 0
+# define __KERNEL_PERCPU 0
#endif
-#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE+16)
#ifdef CONFIG_CC_STACKPROTECTOR
-#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8)
+# define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8)
#else
-#define __KERNEL_STACK_CANARY 0
+# define __KERNEL_STACK_CANARY 0
#endif
-#define GDT_ENTRY_DOUBLEFAULT_TSS 31
-
-/*
- * The GDT has 32 entries
- */
-#define GDT_ENTRIES 32
+#else /* 64-bit: */
-/* The PnP BIOS entries in the GDT */
-#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
-#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
-#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
-#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
-#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
-
-/* The PnP BIOS selectors */
-#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
-#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
-#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
-#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
-#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
+#include <asm/cache.h>
+#define GDT_ENTRY_KERNEL32_CS 1
+#define GDT_ENTRY_KERNEL_CS 2
+#define GDT_ENTRY_KERNEL_DS 3
/*
- * Matching rules for certain types of segments.
+ * We cannot use the same code segment descriptor for user and kernel mode,
+ * not even in long flat mode, because of different DPL.
+ *
+ * GDT layout to get 64-bit SYSCALL/SYSRET support right. SYSRET hardcodes
+ * selectors:
+ *
+ * if returning to 32-bit userspace: cs = STAR.SYSRET_CS,
+ * if returning to 64-bit userspace: cs = STAR.SYSRET_CS+16,
+ *
+ * ss = STAR.SYSRET_CS+8 (in either case)
+ *
+ * thus USER_DS should be between 32-bit and 64-bit code selectors:
*/
+#define GDT_ENTRY_DEFAULT_USER32_CS 4
+#define GDT_ENTRY_DEFAULT_USER_DS 5
+#define GDT_ENTRY_DEFAULT_USER_CS 6
-/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
-#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
-
+/* Needs two entries */
+#define GDT_ENTRY_TSS 8
+/* Needs two entries */
+#define GDT_ENTRY_LDT 10
-#else
-#include <asm/cache.h>
-
-#define GDT_ENTRY_KERNEL32_CS 1
-#define GDT_ENTRY_KERNEL_CS 2
-#define GDT_ENTRY_KERNEL_DS 3
+#define GDT_ENTRY_TLS_MIN 12
+#define GDT_ENTRY_TLS_MAX 14
-#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8)
+/* Abused to load per CPU data from limit */
+#define GDT_ENTRY_PER_CPU 15
/*
- * we cannot use the same code segment descriptor for user and kernel
- * -- not even in the long flat mode, because of different DPL /kkeil
- * The segment offset needs to contain a RPL. Grr. -AK
- * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
+ * Number of entries in the GDT table:
*/
-#define GDT_ENTRY_DEFAULT_USER32_CS 4
-#define GDT_ENTRY_DEFAULT_USER_DS 5
-#define GDT_ENTRY_DEFAULT_USER_CS 6
-#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8+3)
-#define __USER32_DS __USER_DS
-
-#define GDT_ENTRY_TSS 8 /* needs two entries */
-#define GDT_ENTRY_LDT 10 /* needs two entries */
-#define GDT_ENTRY_TLS_MIN 12
-#define GDT_ENTRY_TLS_MAX 14
-
-#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */
-#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3)
+#define GDT_ENTRIES 16
-/* TLS indexes for 64bit - hardcoded in arch_prctl */
-#define FS_TLS 0
-#define GS_TLS 1
-
-#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
-#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
-
-#define GDT_ENTRIES 16
+/*
+ * Segment selector values corresponding to the above entries:
+ *
+ * Note, selectors also need to have a correct RPL,
+ * expressed with the +3 value for user-space selectors:
+ */
+#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS*8)
+#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)
+#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
+#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3)
+#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
+#define __USER32_DS __USER_DS
+#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
+#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU*8 + 3)
+
+/* TLS indexes for 64-bit - hardcoded in arch_prctl(): */
+#define FS_TLS 0
+#define GS_TLS 1
+
+#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
+#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
#endif
-#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)
-#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
-#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3)
-#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3)
#ifndef CONFIG_PARAVIRT
-#define get_kernel_rpl() 0
+# define get_kernel_rpl() 0
#endif
-#define IDT_ENTRIES 256
-#define NUM_EXCEPTION_VECTORS 32
-/* Bitmask of exception vectors which push an error code on the stack */
-#define EXCEPTION_ERRCODE_MASK 0x00027d00
-#define GDT_SIZE (GDT_ENTRIES * 8)
-#define GDT_ENTRY_TLS_ENTRIES 3
-#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
+#define IDT_ENTRIES 256
+#define NUM_EXCEPTION_VECTORS 32
+
+/* Bitmask of exception vectors which push an error code on the stack: */
+#define EXCEPTION_ERRCODE_MASK 0x00027d00
+
+#define GDT_SIZE (GDT_ENTRIES*8)
+#define GDT_ENTRY_TLS_ENTRIES 3
+#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8)
#ifdef __KERNEL__
#ifndef __ASSEMBLY__
+
extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5];
#ifdef CONFIG_TRACING
-#define trace_early_idt_handlers early_idt_handlers
+# define trace_early_idt_handlers early_idt_handlers
#endif
/*
@@ -228,37 +260,30 @@ do { \
} while (0)
/*
- * Save a segment register away
+ * Save a segment register away:
*/
#define savesegment(seg, value) \
asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
/*
- * x86_32 user gs accessors.
+ * x86-32 user GS accessors:
*/
#ifdef CONFIG_X86_32
-#ifdef CONFIG_X86_32_LAZY_GS
-#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;})
-#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v))
-#define task_user_gs(tsk) ((tsk)->thread.gs)
-#define lazy_save_gs(v) savesegment(gs, (v))
-#define lazy_load_gs(v) loadsegment(gs, (v))
-#else /* X86_32_LAZY_GS */
-#define get_user_gs(regs) (u16)((regs)->gs)
-#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0)
-#define task_user_gs(tsk) (task_pt_regs(tsk)->gs)
-#define lazy_save_gs(v) do { } while (0)
-#define lazy_load_gs(v) do { } while (0)
-#endif /* X86_32_LAZY_GS */
+# ifdef CONFIG_X86_32_LAZY_GS
+# define get_user_gs(regs) (u16)({ unsigned long v; savesegment(gs, v); v; })
+# define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v))
+# define task_user_gs(tsk) ((tsk)->thread.gs)
+# define lazy_save_gs(v) savesegment(gs, (v))
+# define lazy_load_gs(v) loadsegment(gs, (v))
+# else /* X86_32_LAZY_GS */
+# define get_user_gs(regs) (u16)((regs)->gs)
+# define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0)
+# define task_user_gs(tsk) (task_pt_regs(tsk)->gs)
+# define lazy_save_gs(v) do { } while (0)
+# define lazy_load_gs(v) do { } while (0)
+# endif /* X86_32_LAZY_GS */
#endif /* X86_32 */
-static inline unsigned long get_limit(unsigned long segment)
-{
- unsigned long __limit;
- asm("lsll %1,%0" : "=r" (__limit) : "r" (segment));
- return __limit + 1;
-}
-
#endif /* !__ASSEMBLY__ */
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index ff4e7b236e21..f69e06b283fb 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -66,6 +66,11 @@ static inline void x86_ce4100_early_setup(void) { }
*/
extern struct boot_params boot_params;
+static inline bool kaslr_enabled(void)
+{
+ return !!(boot_params.hdr.loadflags & KASLR_FLAG);
+}
+
/*
* Do NOT EVER look at the BIOS memory size location.
* It does not work on many machines.
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
index 9dfce4e0417d..6fe6b182c998 100644
--- a/arch/x86/include/asm/sigcontext.h
+++ b/arch/x86/include/asm/sigcontext.h
@@ -57,9 +57,9 @@ struct sigcontext {
unsigned long ip;
unsigned long flags;
unsigned short cs;
- unsigned short gs;
- unsigned short fs;
- unsigned short __pad0;
+ unsigned short __pad2; /* Was called gs, but was always zero. */
+ unsigned short __pad1; /* Was called fs, but was always zero. */
+ unsigned short ss;
unsigned long err;
unsigned long trapno;
unsigned long oldmask;
diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h
index 7a958164088c..89db46752a8f 100644
--- a/arch/x86/include/asm/sighandling.h
+++ b/arch/x86/include/asm/sighandling.h
@@ -13,9 +13,7 @@
X86_EFLAGS_CF | X86_EFLAGS_RF)
void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
-
-int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
- unsigned long *pax);
+int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc);
int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
struct pt_regs *regs, unsigned long mask);
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h
index 8d3120f4e270..ba665ebd17bb 100644
--- a/arch/x86/include/asm/smap.h
+++ b/arch/x86/include/asm/smap.h
@@ -27,23 +27,11 @@
#ifdef CONFIG_X86_SMAP
-#define ASM_CLAC \
- 661: ASM_NOP3 ; \
- .pushsection .altinstr_replacement, "ax" ; \
- 662: __ASM_CLAC ; \
- .popsection ; \
- .pushsection .altinstructions, "a" ; \
- altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \
- .popsection
-
-#define ASM_STAC \
- 661: ASM_NOP3 ; \
- .pushsection .altinstr_replacement, "ax" ; \
- 662: __ASM_STAC ; \
- .popsection ; \
- .pushsection .altinstructions, "a" ; \
- altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \
- .popsection
+#define ASM_CLAC \
+ ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP
+
+#define ASM_STAC \
+ ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP
#else /* CONFIG_X86_SMAP */
@@ -61,20 +49,20 @@
static __always_inline void clac(void)
{
/* Note: a barrier is implicit in alternative() */
- alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP);
+ alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP);
}
static __always_inline void stac(void)
{
/* Note: a barrier is implicit in alternative() */
- alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP);
+ alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP);
}
/* These macros can be used in asm() statements */
#define ASM_CLAC \
- ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP)
+ ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP)
#define ASM_STAC \
- ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP)
+ ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP)
#else /* CONFIG_X86_SMAP */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 8cd1cc3bc835..17a8dced12da 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -150,12 +150,13 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
}
void cpu_disable_common(void);
-void cpu_die_common(unsigned int cpu);
void native_smp_prepare_boot_cpu(void);
void native_smp_prepare_cpus(unsigned int max_cpus);
void native_smp_cpus_done(unsigned int max_cpus);
+void common_cpu_up(unsigned int cpunum, struct task_struct *tidle);
int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
int native_cpu_disable(void);
+int common_cpu_die(unsigned int cpu);
void native_cpu_die(unsigned int cpu);
void native_play_dead(void);
void play_dead_common(void);
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 6a4b00fafb00..aeb4666e0c0a 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -4,6 +4,8 @@
#ifdef __KERNEL__
+#include <asm/nops.h>
+
static inline void native_clts(void)
{
asm volatile("clts");
@@ -199,6 +201,28 @@ static inline void clflushopt(volatile void *__p)
"+m" (*(volatile char __force *)__p));
}
+static inline void clwb(volatile void *__p)
+{
+ volatile struct { char x[64]; } *p = __p;
+
+ asm volatile(ALTERNATIVE_2(
+ ".byte " __stringify(NOP_DS_PREFIX) "; clflush (%[pax])",
+ ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */
+ X86_FEATURE_CLFLUSHOPT,
+ ".byte 0x66, 0x0f, 0xae, 0x30", /* clwb (%%rax) */
+ X86_FEATURE_CLWB)
+ : [p] "+m" (*p)
+ : [pax] "a" (p));
+}
+
+static inline void pcommit_sfence(void)
+{
+ alternative(ASM_NOP7,
+ ".byte 0x66, 0x0f, 0xae, 0xf8\n\t" /* pcommit */
+ "sfence",
+ X86_FEATURE_PCOMMIT);
+}
+
#define nop() asm volatile ("nop")
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 1d4e4f279a32..ea2dbe82cba3 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -13,6 +13,33 @@
#include <asm/types.h>
/*
+ * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we
+ * reserve at the top of the kernel stack. We do it because of a nasty
+ * 32-bit corner case. On x86_32, the hardware stack frame is
+ * variable-length. Except for vm86 mode, struct pt_regs assumes a
+ * maximum-length frame. If we enter from CPL 0, the top 8 bytes of
+ * pt_regs don't actually exist. Ordinarily this doesn't matter, but it
+ * does in at least one case:
+ *
+ * If we take an NMI early enough in SYSENTER, then we can end up with
+ * pt_regs that extends above sp0. On the way out, in the espfix code,
+ * we can read the saved SS value, but that value will be above sp0.
+ * Without this offset, that can result in a page fault. (We are
+ * careful that, in this case, the value we read doesn't matter.)
+ *
+ * In vm86 mode, the hardware frame is much longer still, but we neither
+ * access the extra members from NMI context, nor do we write such a
+ * frame at sp0 at all.
+ *
+ * x86_64 has a fixed-length stack frame.
+ */
+#ifdef CONFIG_X86_32
+# define TOP_OF_KERNEL_STACK_PADDING 8
+#else
+# define TOP_OF_KERNEL_STACK_PADDING 0
+#endif
+
+/*
* low level task data that entry.S needs immediate access to
* - this struct should fit entirely inside of one cache line
* - this struct shares the supervisor stack pages
@@ -145,7 +172,6 @@ struct thread_info {
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
#define STACK_WARN (THREAD_SIZE/8)
-#define KERNEL_STACK_OFFSET (5*(BITS_PER_LONG/8))
/*
* macros/functions for gaining access to the thread information structure
@@ -158,10 +184,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack);
static inline struct thread_info *current_thread_info(void)
{
- struct thread_info *ti;
- ti = (void *)(this_cpu_read_stable(kernel_stack) +
- KERNEL_STACK_OFFSET - THREAD_SIZE);
- return ti;
+ return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE);
}
static inline unsigned long current_stack_pointer(void)
@@ -177,16 +200,37 @@ static inline unsigned long current_stack_pointer(void)
#else /* !__ASSEMBLY__ */
-/* how to get the thread information struct from ASM */
+/* Load thread_info address into "reg" */
#define GET_THREAD_INFO(reg) \
_ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \
- _ASM_SUB $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg ;
+ _ASM_SUB $(THREAD_SIZE),reg ;
/*
- * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in
- * a certain register (to be used in assembler memory operands).
+ * ASM operand which evaluates to a 'thread_info' address of
+ * the current task, if it is known that "reg" is exactly "off"
+ * bytes below the top of the stack currently.
+ *
+ * ( The kernel stack's size is known at build time, it is usually
+ * 2 or 4 pages, and the bottom of the kernel stack contains
+ * the thread_info structure. So to access the thread_info very
+ * quickly from assembly code we can calculate down from the
+ * top of the kernel stack to the bottom, using constant,
+ * build-time calculations only. )
+ *
+ * For example, to fetch the current thread_info->flags value into %eax
+ * on x86-64 defconfig kernels, in syscall entry code where RSP is
+ * currently at exactly SIZEOF_PTREGS bytes away from the top of the
+ * stack:
+ *
+ * mov ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax
+ *
+ * will translate to:
+ *
+ * 8b 84 24 b8 c0 ff ff mov -0x3f48(%rsp), %eax
+ *
+ * which is below the current RSP by almost 16K.
*/
-#define THREAD_INFO(reg, off) KERNEL_STACK_OFFSET+(off)-THREAD_SIZE(reg)
+#define ASM_THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg)
#endif
@@ -236,6 +280,16 @@ static inline bool is_ia32_task(void)
#endif
return false;
}
+
+/*
+ * Force syscall return via IRET by making it look as if there was
+ * some work pending. IRET is our most capable (but slowest) syscall
+ * return path, which is able to restore modified SS, CS and certain
+ * EFLAGS values that other (fast) syscall return instructions
+ * are not able to restore properly.
+ */
+#define force_iret() set_thread_flag(TIF_NOTIFY_RESUME)
+
#endif /* !__ASSEMBLY__ */
#ifndef __ASSEMBLY__
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 12a26b979bf1..f2f9b39b274a 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -231,6 +231,6 @@ __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
}
unsigned long
-copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest);
+copy_user_handle_tail(char *to, char *from, unsigned len);
#endif /* _ASM_X86_UACCESS_64_H */
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index 225b0988043a..ab456dc233b5 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -15,6 +15,7 @@
/* loadflags */
#define LOADED_HIGH (1<<0)
+#define KASLR_FLAG (1<<1)
#define QUIET_FLAG (1<<5)
#define KEEP_SEGMENTS (1<<6)
#define CAN_USE_HEAP (1<<7)
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index 3ce079136c11..1a4eae695ca8 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -74,6 +74,24 @@
#define MSR_IA32_PERF_CAPABILITIES 0x00000345
#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
+#define MSR_IA32_RTIT_CTL 0x00000570
+#define RTIT_CTL_TRACEEN BIT(0)
+#define RTIT_CTL_OS BIT(2)
+#define RTIT_CTL_USR BIT(3)
+#define RTIT_CTL_CR3EN BIT(7)
+#define RTIT_CTL_TOPA BIT(8)
+#define RTIT_CTL_TSC_EN BIT(10)
+#define RTIT_CTL_DISRETC BIT(11)
+#define RTIT_CTL_BRANCH_EN BIT(13)
+#define MSR_IA32_RTIT_STATUS 0x00000571
+#define RTIT_STATUS_CONTEXTEN BIT(1)
+#define RTIT_STATUS_TRIGGEREN BIT(2)
+#define RTIT_STATUS_ERROR BIT(4)
+#define RTIT_STATUS_STOPPED BIT(5)
+#define MSR_IA32_RTIT_CR3_MATCH 0x00000572
+#define MSR_IA32_RTIT_OUTPUT_BASE 0x00000560
+#define MSR_IA32_RTIT_OUTPUT_MASK 0x00000561
+
#define MSR_MTRRfix64K_00000 0x00000250
#define MSR_MTRRfix16K_80000 0x00000258
#define MSR_MTRRfix16K_A0000 0x00000259
diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h
index 7b0a55a88851..580aee3072e0 100644
--- a/arch/x86/include/uapi/asm/ptrace-abi.h
+++ b/arch/x86/include/uapi/asm/ptrace-abi.h
@@ -25,13 +25,17 @@
#else /* __i386__ */
#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS)
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
#define R15 0
#define R14 8
#define R13 16
#define R12 24
#define RBP 32
#define RBX 40
-/* arguments: interrupts/non tracing syscalls only save up to here*/
+/* These regs are callee-clobbered. Always saved on kernel entry. */
#define R11 48
#define R10 56
#define R9 64
@@ -41,15 +45,17 @@
#define RDX 96
#define RSI 104
#define RDI 112
-#define ORIG_RAX 120 /* = ERROR */
-/* end of arguments */
-/* cpu exception frame or undefined in case of fast syscall. */
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
+#define ORIG_RAX 120
+/* Return frame for iretq */
#define RIP 128
#define CS 136
#define EFLAGS 144
#define RSP 152
#define SS 160
-#define ARGOFFSET R11
#endif /* __ASSEMBLY__ */
/* top of stack page */
diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h
index ac4b9aa4d999..bc16115af39b 100644
--- a/arch/x86/include/uapi/asm/ptrace.h
+++ b/arch/x86/include/uapi/asm/ptrace.h
@@ -41,13 +41,17 @@ struct pt_regs {
#ifndef __KERNEL__
struct pt_regs {
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
unsigned long r15;
unsigned long r14;
unsigned long r13;
unsigned long r12;
unsigned long rbp;
unsigned long rbx;
-/* arguments: non interrupts/non tracing syscalls only save up to here*/
+/* These regs are callee-clobbered. Always saved on kernel entry. */
unsigned long r11;
unsigned long r10;
unsigned long r9;
@@ -57,9 +61,12 @@ struct pt_regs {
unsigned long rdx;
unsigned long rsi;
unsigned long rdi;
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
unsigned long orig_rax;
-/* end of arguments */
-/* cpu exception frame or undefined */
+/* Return frame for iretq */
unsigned long rip;
unsigned long cs;
unsigned long eflags;
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h
index d8b9f9081e86..16dc4e8a2cd3 100644
--- a/arch/x86/include/uapi/asm/sigcontext.h
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -177,9 +177,24 @@ struct sigcontext {
__u64 rip;
__u64 eflags; /* RFLAGS */
__u16 cs;
- __u16 gs;
- __u16 fs;
- __u16 __pad0;
+
+ /*
+ * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"),
+ * Linux saved and restored fs and gs in these slots. This
+ * was counterproductive, as fsbase and gsbase were never
+ * saved, so arch_prctl was presumably unreliable.
+ *
+ * If these slots are ever needed for any other purpose, there
+ * is some risk that very old 64-bit binaries could get
+ * confused. I doubt that many such binaries still work,
+ * though, since the same patch in 2.5.64 also removed the
+ * 64-bit set_thread_area syscall, so it appears that there is
+ * no TLS API that works in both pre- and post-2.5.64 kernels.
+ */
+ __u16 __pad2; /* Was gs. */
+ __u16 __pad1; /* Was fs. */
+
+ __u16 ss;
__u64 err;
__u64 trapno;
__u64 oldmask;
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index c5f1a1deb91a..1fe92181ee9e 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -67,6 +67,7 @@
#define EXIT_REASON_EPT_VIOLATION 48
#define EXIT_REASON_EPT_MISCONFIG 49
#define EXIT_REASON_INVEPT 50
+#define EXIT_REASON_RDTSCP 51
#define EXIT_REASON_PREEMPTION_TIMER 52
#define EXIT_REASON_INVVPID 53
#define EXIT_REASON_WBINVD 54
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index cdb1b70ddad0..c887cd944f0c 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_X86_32) += i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
obj-$(CONFIG_X86_64) += mcount_64.o
obj-y += syscall_$(BITS).o vsyscall_gtod.o
+obj-$(CONFIG_IA32_EMULATION) += syscall_32.o
obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o
obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
obj-$(CONFIG_SYSFS) += ksysfs.o
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 703130f469ec..aef653193160 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -52,10 +52,25 @@ static int __init setup_noreplace_paravirt(char *str)
__setup("noreplace-paravirt", setup_noreplace_paravirt);
#endif
-#define DPRINTK(fmt, ...) \
-do { \
- if (debug_alternative) \
- printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+#define DPRINTK(fmt, args...) \
+do { \
+ if (debug_alternative) \
+ printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \
+} while (0)
+
+#define DUMP_BYTES(buf, len, fmt, args...) \
+do { \
+ if (unlikely(debug_alternative)) { \
+ int j; \
+ \
+ if (!(len)) \
+ break; \
+ \
+ printk(KERN_DEBUG fmt, ##args); \
+ for (j = 0; j < (len) - 1; j++) \
+ printk(KERN_CONT "%02hhx ", buf[j]); \
+ printk(KERN_CONT "%02hhx\n", buf[j]); \
+ } \
} while (0)
/*
@@ -243,12 +258,89 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void *text_poke_early(void *addr, const void *opcode, size_t len);
-/* Replace instructions with better alternatives for this CPU type.
- This runs before SMP is initialized to avoid SMP problems with
- self modifying code. This implies that asymmetric systems where
- APs have less capabilities than the boot processor are not handled.
- Tough. Make sure you disable such features by hand. */
+/*
+ * Are we looking at a near JMP with a 1 or 4-byte displacement.
+ */
+static inline bool is_jmp(const u8 opcode)
+{
+ return opcode == 0xeb || opcode == 0xe9;
+}
+
+static void __init_or_module
+recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
+{
+ u8 *next_rip, *tgt_rip;
+ s32 n_dspl, o_dspl;
+ int repl_len;
+
+ if (a->replacementlen != 5)
+ return;
+
+ o_dspl = *(s32 *)(insnbuf + 1);
+
+ /* next_rip of the replacement JMP */
+ next_rip = repl_insn + a->replacementlen;
+ /* target rip of the replacement JMP */
+ tgt_rip = next_rip + o_dspl;
+ n_dspl = tgt_rip - orig_insn;
+
+ DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);
+
+ if (tgt_rip - orig_insn >= 0) {
+ if (n_dspl - 2 <= 127)
+ goto two_byte_jmp;
+ else
+ goto five_byte_jmp;
+ /* negative offset */
+ } else {
+ if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
+ goto two_byte_jmp;
+ else
+ goto five_byte_jmp;
+ }
+
+two_byte_jmp:
+ n_dspl -= 2;
+
+ insnbuf[0] = 0xeb;
+ insnbuf[1] = (s8)n_dspl;
+ add_nops(insnbuf + 2, 3);
+
+ repl_len = 2;
+ goto done;
+
+five_byte_jmp:
+ n_dspl -= 5;
+
+ insnbuf[0] = 0xe9;
+ *(s32 *)&insnbuf[1] = n_dspl;
+ repl_len = 5;
+
+done:
+
+ DPRINTK("final displ: 0x%08x, JMP 0x%lx",
+ n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
+}
+
+static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
+{
+ if (instr[0] != 0x90)
+ return;
+
+ add_nops(instr + (a->instrlen - a->padlen), a->padlen);
+
+ DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
+ instr, a->instrlen - a->padlen, a->padlen);
+}
+
+/*
+ * Replace instructions with better alternatives for this CPU type. This runs
+ * before SMP is initialized to avoid SMP problems with self modifying code.
+ * This implies that asymmetric systems where APs have less capabilities than
+ * the boot processor are not handled. Tough. Make sure you disable such
+ * features by hand.
+ */
void __init_or_module apply_alternatives(struct alt_instr *start,
struct alt_instr *end)
{
@@ -256,10 +348,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
u8 *instr, *replacement;
u8 insnbuf[MAX_PATCH_LEN];
- DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
+ DPRINTK("alt table %p -> %p", start, end);
/*
* The scan order should be from start to end. A later scanned
- * alternative code can overwrite a previous scanned alternative code.
+ * alternative code can overwrite previously scanned alternative code.
* Some kernel functions (e.g. memcpy, memset, etc) use this order to
* patch code.
*
@@ -267,29 +359,54 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
* order.
*/
for (a = start; a < end; a++) {
+ int insnbuf_sz = 0;
+
instr = (u8 *)&a->instr_offset + a->instr_offset;
replacement = (u8 *)&a->repl_offset + a->repl_offset;
- BUG_ON(a->replacementlen > a->instrlen);
BUG_ON(a->instrlen > sizeof(insnbuf));
BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
- if (!boot_cpu_has(a->cpuid))
+ if (!boot_cpu_has(a->cpuid)) {
+ if (a->padlen > 1)
+ optimize_nops(a, instr);
+
continue;
+ }
+
+ DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d",
+ a->cpuid >> 5,
+ a->cpuid & 0x1f,
+ instr, a->instrlen,
+ replacement, a->replacementlen, a->padlen);
+
+ DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
+ DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
memcpy(insnbuf, replacement, a->replacementlen);
+ insnbuf_sz = a->replacementlen;
/* 0xe8 is a relative jump; fix the offset. */
- if (*insnbuf == 0xe8 && a->replacementlen == 5)
- *(s32 *)(insnbuf + 1) += replacement - instr;
+ if (*insnbuf == 0xe8 && a->replacementlen == 5) {
+ *(s32 *)(insnbuf + 1) += replacement - instr;
+ DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
+ *(s32 *)(insnbuf + 1),
+ (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
+ }
+
+ if (a->replacementlen && is_jmp(replacement[0]))
+ recompute_jump(a, instr, replacement, insnbuf);
- add_nops(insnbuf + a->replacementlen,
- a->instrlen - a->replacementlen);
+ if (a->instrlen > a->replacementlen) {
+ add_nops(insnbuf + a->replacementlen,
+ a->instrlen - a->replacementlen);
+ insnbuf_sz += a->instrlen - a->replacementlen;
+ }
+ DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
- text_poke_early(instr, insnbuf, a->instrlen);
+ text_poke_early(instr, insnbuf, insnbuf_sz);
}
}
#ifdef CONFIG_SMP
-
static void alternatives_smp_lock(const s32 *start, const s32 *end,
u8 *text, u8 *text_end)
{
@@ -371,8 +488,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
smp->locks_end = locks_end;
smp->text = text;
smp->text_end = text_end;
- DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
- __func__, smp->locks, smp->locks_end,
+ DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
+ smp->locks, smp->locks_end,
smp->text, smp->text_end, smp->name);
list_add_tail(&smp->next, &smp_alt_modules);
@@ -440,7 +557,7 @@ int alternatives_text_reserved(void *start, void *end)
return 0;
}
-#endif
+#endif /* CONFIG_SMP */
#ifdef CONFIG_PARAVIRT
void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
@@ -601,7 +718,7 @@ int poke_int3_handler(struct pt_regs *regs)
if (likely(!bp_patching_in_progress))
return 0;
- if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr)
+ if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
return 0;
/* set up the specified breakpoint handler */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index ad3639ae1b9b..dcb52850a28f 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1084,67 +1084,6 @@ void lapic_shutdown(void)
local_irq_restore(flags);
}
-/*
- * This is to verify that we're looking at a real local APIC.
- * Check these against your board if the CPUs aren't getting
- * started for no apparent reason.
- */
-int __init verify_local_APIC(void)
-{
- unsigned int reg0, reg1;
-
- /*
- * The version register is read-only in a real APIC.
- */
- reg0 = apic_read(APIC_LVR);
- apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
- apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
- reg1 = apic_read(APIC_LVR);
- apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
-
- /*
- * The two version reads above should print the same
- * numbers. If the second one is different, then we
- * poke at a non-APIC.
- */
- if (reg1 != reg0)
- return 0;
-
- /*
- * Check if the version looks reasonably.
- */
- reg1 = GET_APIC_VERSION(reg0);
- if (reg1 == 0x00 || reg1 == 0xff)
- return 0;
- reg1 = lapic_get_maxlvt();
- if (reg1 < 0x02 || reg1 == 0xff)
- return 0;
-
- /*
- * The ID register is read/write in a real APIC.
- */
- reg0 = apic_read(APIC_ID);
- apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
- apic_write(APIC_ID, reg0 ^ apic->apic_id_mask);
- reg1 = apic_read(APIC_ID);
- apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
- apic_write(APIC_ID, reg0);
- if (reg1 != (reg0 ^ apic->apic_id_mask))
- return 0;
-
- /*
- * The next two are just to see if we have sane values.
- * They're only really relevant if we're in Virtual Wire
- * compatibility mode, but most boxes are anymore.
- */
- reg0 = apic_read(APIC_LVT0);
- apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
- reg1 = apic_read(APIC_LVT1);
- apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
-
- return 1;
-}
-
/**
* sync_Arb_IDs - synchronize APIC bus arbitration IDs
*/
@@ -2283,7 +2222,6 @@ int __init APIC_init_uniprocessor(void)
disable_ioapic_support();
default_setup_apic_routing();
- verify_local_APIC();
apic_bsp_setup(true);
return 0;
}
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index e658f21681c8..d9d0bd2faaf4 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -135,12 +135,12 @@ static void init_x2apic_ldr(void)
per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR);
- __cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
+ cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
for_each_online_cpu(cpu) {
if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
continue;
- __cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu));
- __cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu));
+ cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu));
+ cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu));
}
}
@@ -195,7 +195,7 @@ static int x2apic_init_cpu_notifier(void)
BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
- __cpu_set(cpu, per_cpu(cpus_in_cluster, cpu));
+ cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu));
register_hotcpu_notifier(&x2apic_cpu_notifier);
return 1;
}
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 8e9dcfd630e4..c8d92950bc04 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -144,33 +144,60 @@ static void __init uv_set_apicid_hibit(void)
static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
- int pnodeid, is_uv1, is_uv2, is_uv3;
-
- is_uv1 = !strcmp(oem_id, "SGI");
- is_uv2 = !strcmp(oem_id, "SGI2");
- is_uv3 = !strncmp(oem_id, "SGI3", 4); /* there are varieties of UV3 */
- if (is_uv1 || is_uv2 || is_uv3) {
- uv_hub_info->hub_revision =
- (is_uv1 ? UV1_HUB_REVISION_BASE :
- (is_uv2 ? UV2_HUB_REVISION_BASE :
- UV3_HUB_REVISION_BASE));
- pnodeid = early_get_pnodeid();
- early_get_apic_pnode_shift();
- x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
- x86_platform.nmi_init = uv_nmi_init;
- if (!strcmp(oem_table_id, "UVL"))
- uv_system_type = UV_LEGACY_APIC;
- else if (!strcmp(oem_table_id, "UVX"))
- uv_system_type = UV_X2APIC;
- else if (!strcmp(oem_table_id, "UVH")) {
- __this_cpu_write(x2apic_extra_bits,
- pnodeid << uvh_apicid.s.pnode_shift);
- uv_system_type = UV_NON_UNIQUE_APIC;
- uv_set_apicid_hibit();
- return 1;
- }
+ int pnodeid;
+ int uv_apic;
+
+ if (strncmp(oem_id, "SGI", 3) != 0)
+ return 0;
+
+ /*
+ * Determine UV arch type.
+ * SGI: UV100/1000
+ * SGI2: UV2000/3000
+ * SGI3: UV300 (truncated to 4 chars because of different varieties)
+ */
+ uv_hub_info->hub_revision =
+ !strncmp(oem_id, "SGI3", 4) ? UV3_HUB_REVISION_BASE :
+ !strcmp(oem_id, "SGI2") ? UV2_HUB_REVISION_BASE :
+ !strcmp(oem_id, "SGI") ? UV1_HUB_REVISION_BASE : 0;
+
+ if (uv_hub_info->hub_revision == 0)
+ goto badbios;
+
+ pnodeid = early_get_pnodeid();
+ early_get_apic_pnode_shift();
+ x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
+ x86_platform.nmi_init = uv_nmi_init;
+
+ if (!strcmp(oem_table_id, "UVX")) { /* most common */
+ uv_system_type = UV_X2APIC;
+ uv_apic = 0;
+
+ } else if (!strcmp(oem_table_id, "UVH")) { /* only UV1 systems */
+ uv_system_type = UV_NON_UNIQUE_APIC;
+ __this_cpu_write(x2apic_extra_bits,
+ pnodeid << uvh_apicid.s.pnode_shift);
+ uv_set_apicid_hibit();
+ uv_apic = 1;
+
+ } else if (!strcmp(oem_table_id, "UVL")) { /* only used for */
+ uv_system_type = UV_LEGACY_APIC; /* very small systems */
+ uv_apic = 0;
+
+ } else {
+ goto badbios;
}
- return 0;
+
+ pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n",
+ oem_id, oem_table_id, uv_system_type,
+ uv_min_hub_revision_id, uv_apic);
+
+ return uv_apic;
+
+badbios:
+ pr_err("UV: OEM_ID:%s OEM_TABLE_ID:%s\n", oem_id, oem_table_id);
+ pr_err("Current BIOS not supported, update kernel and/or BIOS\n");
+ BUG();
}
enum uv_system_type get_uv_system_type(void)
@@ -854,10 +881,14 @@ void __init uv_system_init(void)
unsigned long mmr_base, present, paddr;
unsigned short pnode_mask;
unsigned char n_lshift;
- char *hub = (is_uv1_hub() ? "UV1" :
- (is_uv2_hub() ? "UV2" :
- "UV3"));
+ char *hub = (is_uv1_hub() ? "UV100/1000" :
+ (is_uv2_hub() ? "UV2000/3000" :
+ (is_uv3_hub() ? "UV300" : NULL)));
+ if (!hub) {
+ pr_err("UV: Unknown/unsupported UV hub\n");
+ return;
+ }
pr_info("UV: Found %s hub\n", hub);
map_low_mmrs();
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 3b3b9d33ac1d..47703aed74cf 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -68,7 +68,7 @@ void foo(void)
/* Offset from the sysenter stack to tss.sp0 */
DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
- sizeof(struct tss_struct));
+ offsetofend(struct tss_struct, SYSENTER_stack));
#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
BLANK();
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index fdcbb4d27c9f..5ce6f2da8763 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -81,6 +81,7 @@ int main(void)
#undef ENTRY
OFFSET(TSS_ist, tss_struct, x86_tss.ist);
+ OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
BLANK();
DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 80091ae54c2b..9bff68798836 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -39,7 +39,8 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o
endif
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o perf_event_intel_bts.o
obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
perf_event_intel_uncore_snb.o \
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a220239cea65..fd470ebf924e 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
#include <linux/io.h>
#include <linux/sched.h>
+#include <linux/random.h>
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cpu.h>
@@ -488,6 +489,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
va_align.mask = (upperbit - 1) & PAGE_MASK;
va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
+
+ /* A random value per boot for bit slice [12:upper_bit) */
+ va_align.bits = get_random_int() & va_align.mask;
}
}
@@ -711,6 +715,11 @@ static void init_amd(struct cpuinfo_x86 *c)
set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+
+ /* 3DNow or LM implies PREFETCHW */
+ if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
+ if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
+ set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2346c95c6ab1..a62cf04dac8a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -646,6 +646,30 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[10] = eax;
}
+ /* Additional Intel-defined flags: level 0x0000000F */
+ if (c->cpuid_level >= 0x0000000F) {
+ u32 eax, ebx, ecx, edx;
+
+ /* QoS sub-leaf, EAX=0Fh, ECX=0 */
+ cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[11] = edx;
+ if (cpu_has(c, X86_FEATURE_CQM_LLC)) {
+ /* will be overridden if occupancy monitoring exists */
+ c->x86_cache_max_rmid = ebx;
+
+ /* QoS sub-leaf, EAX=0Fh, ECX=1 */
+ cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[12] = edx;
+ if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) {
+ c->x86_cache_max_rmid = ecx;
+ c->x86_cache_occ_scale = ebx;
+ }
+ } else {
+ c->x86_cache_max_rmid = -1;
+ c->x86_cache_occ_scale = -1;
+ }
+ }
+
/* AMD-defined flags: level 0x80000001 */
xlvl = cpuid_eax(0x80000000);
c->extended_cpuid_level = xlvl;
@@ -834,6 +858,20 @@ static void generic_identify(struct cpuinfo_x86 *c)
detect_nopl(c);
}
+static void x86_init_cache_qos(struct cpuinfo_x86 *c)
+{
+ /*
+ * The heavy lifting of max_rmid and cache_occ_scale are handled
+ * in get_cpu_cap(). Here we just set the max_rmid for the boot_cpu
+ * in case CQM bits really aren't there in this CPU.
+ */
+ if (c != &boot_cpu_data) {
+ boot_cpu_data.x86_cache_max_rmid =
+ min(boot_cpu_data.x86_cache_max_rmid,
+ c->x86_cache_max_rmid);
+ }
+}
+
/*
* This does the hard work of actually picking apart the CPU stuff...
*/
@@ -923,6 +961,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
init_hypervisor(c);
x86_init_rdrand(c);
+ x86_init_cache_qos(c);
/*
* Clear/Set all flags overriden by options, need do it
@@ -959,38 +998,37 @@ static void identify_cpu(struct cpuinfo_x86 *c)
#endif
}
-#ifdef CONFIG_X86_64
-#ifdef CONFIG_IA32_EMULATION
-/* May not be __init: called during resume */
-static void syscall32_cpu_init(void)
-{
- /* Load these always in case some future AMD CPU supports
- SYSENTER from compat mode too. */
- wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
- wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
-
- wrmsrl(MSR_CSTAR, ia32_cstar_target);
-}
-#endif /* CONFIG_IA32_EMULATION */
-#endif /* CONFIG_X86_64 */
-
+/*
+ * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions
+ * on 32-bit kernels:
+ */
#ifdef CONFIG_X86_32
void enable_sep_cpu(void)
{
- int cpu = get_cpu();
- struct tss_struct *tss = &per_cpu(init_tss, cpu);
+ struct tss_struct *tss;
+ int cpu;
- if (!boot_cpu_has(X86_FEATURE_SEP)) {
- put_cpu();
- return;
- }
+ cpu = get_cpu();
+ tss = &per_cpu(cpu_tss, cpu);
+
+ if (!boot_cpu_has(X86_FEATURE_SEP))
+ goto out;
+
+ /*
+ * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
+ * see the big comment in struct x86_hw_tss's definition.
+ */
tss->x86_tss.ss1 = __KERNEL_CS;
- tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss;
- wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
- wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
- wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
+ wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
+
+ wrmsr(MSR_IA32_SYSENTER_ESP,
+ (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
+ 0);
+
+ wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0);
+
+out:
put_cpu();
}
#endif
@@ -1118,7 +1156,7 @@ static __init int setup_disablecpuid(char *arg)
__setup("clearcpuid=", setup_disablecpuid);
DEFINE_PER_CPU(unsigned long, kernel_stack) =
- (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
+ (unsigned long)&init_thread_union + THREAD_SIZE;
EXPORT_PER_CPU_SYMBOL(kernel_stack);
#ifdef CONFIG_X86_64
@@ -1130,8 +1168,8 @@ DEFINE_PER_CPU_FIRST(union irq_stack_union,
irq_stack_union) __aligned(PAGE_SIZE) __visible;
/*
- * The following four percpu variables are hot. Align current_task to
- * cacheline size such that all four fall in the same cacheline.
+ * The following percpu variables are hot. Align current_task to
+ * cacheline size such that they fall in the same cacheline.
*/
DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
&init_task;
@@ -1171,10 +1209,23 @@ void syscall_init(void)
*/
wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
wrmsrl(MSR_LSTAR, system_call);
- wrmsrl(MSR_CSTAR, ignore_sysret);
#ifdef CONFIG_IA32_EMULATION
- syscall32_cpu_init();
+ wrmsrl(MSR_CSTAR, ia32_cstar_target);
+ /*
+ * This only works on Intel CPUs.
+ * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
+ * This does not cause SYSENTER to jump to the wrong location, because
+ * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
+ */
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
+#else
+ wrmsrl(MSR_CSTAR, ignore_sysret);
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
#endif
/* Flags to clear on syscall */
@@ -1226,6 +1277,15 @@ DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+/*
+ * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
+ * the top of the kernel stack. Use an extra percpu variable to track the
+ * top of the kernel stack directly.
+ */
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
+ (unsigned long)&init_thread_union + THREAD_SIZE;
+EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
+
#ifdef CONFIG_CC_STACKPROTECTOR
DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
#endif
@@ -1307,7 +1367,7 @@ void cpu_init(void)
*/
load_ucode_ap();
- t = &per_cpu(init_tss, cpu);
+ t = &per_cpu(cpu_tss, cpu);
oist = &per_cpu(orig_ist, cpu);
#ifdef CONFIG_NUMA
@@ -1391,7 +1451,7 @@ void cpu_init(void)
{
int cpu = smp_processor_id();
struct task_struct *curr = current;
- struct tss_struct *t = &per_cpu(init_tss, cpu);
+ struct tss_struct *t = &per_cpu(cpu_tss, cpu);
struct thread_struct *thread = &curr->thread;
wait_for_master_cpu(cpu);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 659643376dbf..edcb0e28c336 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -7,16 +7,14 @@
* Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
*/
-#include <linux/init.h>
#include <linux/slab.h>
-#include <linux/device.h>
-#include <linux/compiler.h>
+#include <linux/cacheinfo.h>
#include <linux/cpu.h>
#include <linux/sched.h>
+#include <linux/sysfs.h>
#include <linux/pci.h>
#include <asm/processor.h>
-#include <linux/smp.h>
#include <asm/amd_nb.h>
#include <asm/smp.h>
@@ -116,10 +114,10 @@ static const struct _cache_table cache_table[] =
enum _cache_type {
- CACHE_TYPE_NULL = 0,
- CACHE_TYPE_DATA = 1,
- CACHE_TYPE_INST = 2,
- CACHE_TYPE_UNIFIED = 3
+ CTYPE_NULL = 0,
+ CTYPE_DATA = 1,
+ CTYPE_INST = 2,
+ CTYPE_UNIFIED = 3
};
union _cpuid4_leaf_eax {
@@ -159,11 +157,6 @@ struct _cpuid4_info_regs {
struct amd_northbridge *nb;
};
-struct _cpuid4_info {
- struct _cpuid4_info_regs base;
- DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
-};
-
unsigned short num_cache_leaves;
/* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -220,6 +213,13 @@ static const unsigned short assocs[] = {
static const unsigned char levels[] = { 1, 1, 2, 3 };
static const unsigned char types[] = { 1, 2, 3, 3 };
+static const enum cache_type cache_type_map[] = {
+ [CTYPE_NULL] = CACHE_TYPE_NOCACHE,
+ [CTYPE_DATA] = CACHE_TYPE_DATA,
+ [CTYPE_INST] = CACHE_TYPE_INST,
+ [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED,
+};
+
static void
amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
union _cpuid4_leaf_ebx *ebx,
@@ -291,14 +291,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
(ebx->split.ways_of_associativity + 1) - 1;
}
-struct _cache_attr {
- struct attribute attr;
- ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int);
- ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count,
- unsigned int);
-};
-
#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS)
+
/*
* L3 cache descriptors
*/
@@ -325,20 +319,6 @@ static void amd_calc_l3_indices(struct amd_northbridge *nb)
l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
}
-static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
-{
- int node;
-
- /* only for L3, and not in virtualized environments */
- if (index < 3)
- return;
-
- node = amd_get_nb_id(smp_processor_id());
- this_leaf->nb = node_to_amd_nb(node);
- if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
- amd_calc_l3_indices(this_leaf->nb);
-}
-
/*
* check whether a slot used for disabling an L3 index is occupied.
* @l3: L3 cache descriptor
@@ -359,15 +339,13 @@ int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot)
return -1;
}
-static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
+static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf,
unsigned int slot)
{
int index;
+ struct amd_northbridge *nb = this_leaf->priv;
- if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
- return -EINVAL;
-
- index = amd_get_l3_disable_slot(this_leaf->base.nb, slot);
+ index = amd_get_l3_disable_slot(nb, slot);
if (index >= 0)
return sprintf(buf, "%d\n", index);
@@ -376,9 +354,10 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
#define SHOW_CACHE_DISABLE(slot) \
static ssize_t \
-show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \
- unsigned int cpu) \
+cache_disable_##slot##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
{ \
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev); \
return show_cache_disable(this_leaf, buf, slot); \
}
SHOW_CACHE_DISABLE(0)
@@ -446,25 +425,23 @@ int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot,
return 0;
}
-static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
- const char *buf, size_t count,
- unsigned int slot)
+static ssize_t store_cache_disable(struct cacheinfo *this_leaf,
+ const char *buf, size_t count,
+ unsigned int slot)
{
unsigned long val = 0;
int cpu, err = 0;
+ struct amd_northbridge *nb = this_leaf->priv;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
- return -EINVAL;
-
- cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+ cpu = cpumask_first(&this_leaf->shared_cpu_map);
if (kstrtoul(buf, 10, &val) < 0)
return -EINVAL;
- err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val);
+ err = amd_set_l3_disable_slot(nb, cpu, slot, val);
if (err) {
if (err == -EEXIST)
pr_warning("L3 slot %d in use/index already disabled!\n",
@@ -476,41 +453,36 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
#define STORE_CACHE_DISABLE(slot) \
static ssize_t \
-store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \
- const char *buf, size_t count, \
- unsigned int cpu) \
+cache_disable_##slot##_store(struct device *dev, \
+ struct device_attribute *attr, \
+ const char *buf, size_t count) \
{ \
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev); \
return store_cache_disable(this_leaf, buf, count, slot); \
}
STORE_CACHE_DISABLE(0)
STORE_CACHE_DISABLE(1)
-static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
- show_cache_disable_0, store_cache_disable_0);
-static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
- show_cache_disable_1, store_cache_disable_1);
-
-static ssize_t
-show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu)
+static ssize_t subcaches_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
- if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- return -EINVAL;
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+ int cpu = cpumask_first(&this_leaf->shared_cpu_map);
return sprintf(buf, "%x\n", amd_get_subcaches(cpu));
}
-static ssize_t
-store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
- unsigned int cpu)
+static ssize_t subcaches_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
{
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+ int cpu = cpumask_first(&this_leaf->shared_cpu_map);
unsigned long val;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- return -EINVAL;
-
if (kstrtoul(buf, 16, &val) < 0)
return -EINVAL;
@@ -520,9 +492,92 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
return count;
}
-static struct _cache_attr subcaches =
- __ATTR(subcaches, 0644, show_subcaches, store_subcaches);
+static DEVICE_ATTR_RW(cache_disable_0);
+static DEVICE_ATTR_RW(cache_disable_1);
+static DEVICE_ATTR_RW(subcaches);
+
+static umode_t
+cache_private_attrs_is_visible(struct kobject *kobj,
+ struct attribute *attr, int unused)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+ umode_t mode = attr->mode;
+
+ if (!this_leaf->priv)
+ return 0;
+
+ if ((attr == &dev_attr_subcaches.attr) &&
+ amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ return mode;
+
+ if ((attr == &dev_attr_cache_disable_0.attr ||
+ attr == &dev_attr_cache_disable_1.attr) &&
+ amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+ return mode;
+
+ return 0;
+}
+
+static struct attribute_group cache_private_group = {
+ .is_visible = cache_private_attrs_is_visible,
+};
+
+static void init_amd_l3_attrs(void)
+{
+ int n = 1;
+ static struct attribute **amd_l3_attrs;
+
+ if (amd_l3_attrs) /* already initialized */
+ return;
+
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+ n += 2;
+ if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ n += 1;
+
+ amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL);
+ if (!amd_l3_attrs)
+ return;
+
+ n = 0;
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
+ amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr;
+ amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr;
+ }
+ if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ amd_l3_attrs[n++] = &dev_attr_subcaches.attr;
+ cache_private_group.attrs = amd_l3_attrs;
+}
+
+const struct attribute_group *
+cache_get_priv_group(struct cacheinfo *this_leaf)
+{
+ struct amd_northbridge *nb = this_leaf->priv;
+
+ if (this_leaf->level < 3 || !nb)
+ return NULL;
+
+ if (nb && nb->l3_cache.indices)
+ init_amd_l3_attrs();
+
+ return &cache_private_group;
+}
+
+static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
+{
+ int node;
+
+ /* only for L3, and not in virtualized environments */
+ if (index < 3)
+ return;
+
+ node = amd_get_nb_id(smp_processor_id());
+ this_leaf->nb = node_to_amd_nb(node);
+ if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
+ amd_calc_l3_indices(this_leaf->nb);
+}
#else
#define amd_init_l3_cache(x, y)
#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */
@@ -546,7 +601,7 @@ cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf)
cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
}
- if (eax.split.type == CACHE_TYPE_NULL)
+ if (eax.split.type == CTYPE_NULL)
return -EIO; /* better error ? */
this_leaf->eax = eax;
@@ -575,7 +630,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c)
/* Do cpuid(op) loop to find out num_cache_leaves */
cpuid_count(op, i, &eax, &ebx, &ecx, &edx);
cache_eax.full = eax;
- } while (cache_eax.split.type != CACHE_TYPE_NULL);
+ } while (cache_eax.split.type != CTYPE_NULL);
return i;
}
@@ -626,9 +681,9 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
switch (this_leaf.eax.split.level) {
case 1:
- if (this_leaf.eax.split.type == CACHE_TYPE_DATA)
+ if (this_leaf.eax.split.type == CTYPE_DATA)
new_l1d = this_leaf.size/1024;
- else if (this_leaf.eax.split.type == CACHE_TYPE_INST)
+ else if (this_leaf.eax.split.type == CTYPE_INST)
new_l1i = this_leaf.size/1024;
break;
case 2:
@@ -747,55 +802,52 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
return l2;
}
-#ifdef CONFIG_SYSFS
-
-/* pointer to _cpuid4_info array (for each cache leaf) */
-static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
-#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y]))
-
-#ifdef CONFIG_SMP
-
-static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
+static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
+ struct _cpuid4_info_regs *base)
{
- struct _cpuid4_info *this_leaf;
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cacheinfo *this_leaf;
int i, sibling;
if (cpu_has_topoext) {
unsigned int apicid, nshared, first, last;
- if (!per_cpu(ici_cpuid4_info, cpu))
- return 0;
-
- this_leaf = CPUID4_INFO_IDX(cpu, index);
- nshared = this_leaf->base.eax.split.num_threads_sharing + 1;
+ this_leaf = this_cpu_ci->info_list + index;
+ nshared = base->eax.split.num_threads_sharing + 1;
apicid = cpu_data(cpu).apicid;
first = apicid - (apicid % nshared);
last = first + nshared - 1;
for_each_online_cpu(i) {
+ this_cpu_ci = get_cpu_cacheinfo(i);
+ if (!this_cpu_ci->info_list)
+ continue;
+
apicid = cpu_data(i).apicid;
if ((apicid < first) || (apicid > last))
continue;
- if (!per_cpu(ici_cpuid4_info, i))
- continue;
- this_leaf = CPUID4_INFO_IDX(i, index);
+
+ this_leaf = this_cpu_ci->info_list + index;
for_each_online_cpu(sibling) {
apicid = cpu_data(sibling).apicid;
if ((apicid < first) || (apicid > last))
continue;
- set_bit(sibling, this_leaf->shared_cpu_map);
+ cpumask_set_cpu(sibling,
+ &this_leaf->shared_cpu_map);
}
}
} else if (index == 3) {
for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
- if (!per_cpu(ici_cpuid4_info, i))
+ this_cpu_ci = get_cpu_cacheinfo(i);
+ if (!this_cpu_ci->info_list)
continue;
- this_leaf = CPUID4_INFO_IDX(i, index);
+ this_leaf = this_cpu_ci->info_list + index;
for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
if (!cpu_online(sibling))
continue;
- set_bit(sibling, this_leaf->shared_cpu_map);
+ cpumask_set_cpu(sibling,
+ &this_leaf->shared_cpu_map);
}
}
} else
@@ -804,457 +856,86 @@ static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
return 1;
}
-static void cache_shared_cpu_map_setup(unsigned int cpu, int index)
+static void __cache_cpumap_setup(unsigned int cpu, int index,
+ struct _cpuid4_info_regs *base)
{
- struct _cpuid4_info *this_leaf, *sibling_leaf;
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cacheinfo *this_leaf, *sibling_leaf;
unsigned long num_threads_sharing;
int index_msb, i;
struct cpuinfo_x86 *c = &cpu_data(cpu);
if (c->x86_vendor == X86_VENDOR_AMD) {
- if (cache_shared_amd_cpu_map_setup(cpu, index))
+ if (__cache_amd_cpumap_setup(cpu, index, base))
return;
}
- this_leaf = CPUID4_INFO_IDX(cpu, index);
- num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing;
+ this_leaf = this_cpu_ci->info_list + index;
+ num_threads_sharing = 1 + base->eax.split.num_threads_sharing;
+ cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map);
if (num_threads_sharing == 1)
- cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map));
- else {
- index_msb = get_count_order(num_threads_sharing);
-
- for_each_online_cpu(i) {
- if (cpu_data(i).apicid >> index_msb ==
- c->apicid >> index_msb) {
- cpumask_set_cpu(i,
- to_cpumask(this_leaf->shared_cpu_map));
- if (i != cpu && per_cpu(ici_cpuid4_info, i)) {
- sibling_leaf =
- CPUID4_INFO_IDX(i, index);
- cpumask_set_cpu(cpu, to_cpumask(
- sibling_leaf->shared_cpu_map));
- }
- }
- }
- }
-}
-static void cache_remove_shared_cpu_map(unsigned int cpu, int index)
-{
- struct _cpuid4_info *this_leaf, *sibling_leaf;
- int sibling;
-
- this_leaf = CPUID4_INFO_IDX(cpu, index);
- for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) {
- sibling_leaf = CPUID4_INFO_IDX(sibling, index);
- cpumask_clear_cpu(cpu,
- to_cpumask(sibling_leaf->shared_cpu_map));
- }
-}
-#else
-static void cache_shared_cpu_map_setup(unsigned int cpu, int index)
-{
-}
-
-static void cache_remove_shared_cpu_map(unsigned int cpu, int index)
-{
-}
-#endif
-
-static void free_cache_attributes(unsigned int cpu)
-{
- int i;
-
- for (i = 0; i < num_cache_leaves; i++)
- cache_remove_shared_cpu_map(cpu, i);
-
- kfree(per_cpu(ici_cpuid4_info, cpu));
- per_cpu(ici_cpuid4_info, cpu) = NULL;
-}
-
-static void get_cpu_leaves(void *_retval)
-{
- int j, *retval = _retval, cpu = smp_processor_id();
+ return;
- /* Do cpuid and store the results */
- for (j = 0; j < num_cache_leaves; j++) {
- struct _cpuid4_info *this_leaf = CPUID4_INFO_IDX(cpu, j);
+ index_msb = get_count_order(num_threads_sharing);
- *retval = cpuid4_cache_lookup_regs(j, &this_leaf->base);
- if (unlikely(*retval < 0)) {
- int i;
+ for_each_online_cpu(i)
+ if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) {
+ struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i);
- for (i = 0; i < j; i++)
- cache_remove_shared_cpu_map(cpu, i);
- break;
+ if (i == cpu || !sib_cpu_ci->info_list)
+ continue;/* skip if itself or no cacheinfo */
+ sibling_leaf = sib_cpu_ci->info_list + index;
+ cpumask_set_cpu(i, &this_leaf->shared_cpu_map);
+ cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map);
}
- cache_shared_cpu_map_setup(cpu, j);
- }
}
-static int detect_cache_attributes(unsigned int cpu)
+static void ci_leaf_init(struct cacheinfo *this_leaf,
+ struct _cpuid4_info_regs *base)
{
- int retval;
-
- if (num_cache_leaves == 0)
- return -ENOENT;
-
- per_cpu(ici_cpuid4_info, cpu) = kzalloc(
- sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
- if (per_cpu(ici_cpuid4_info, cpu) == NULL)
- return -ENOMEM;
-
- smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
- if (retval) {
- kfree(per_cpu(ici_cpuid4_info, cpu));
- per_cpu(ici_cpuid4_info, cpu) = NULL;
- }
-
- return retval;
+ this_leaf->level = base->eax.split.level;
+ this_leaf->type = cache_type_map[base->eax.split.type];
+ this_leaf->coherency_line_size =
+ base->ebx.split.coherency_line_size + 1;
+ this_leaf->ways_of_associativity =
+ base->ebx.split.ways_of_associativity + 1;
+ this_leaf->size = base->size;
+ this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1;
+ this_leaf->physical_line_partition =
+ base->ebx.split.physical_line_partition + 1;
+ this_leaf->priv = base->nb;
}
-#include <linux/kobject.h>
-#include <linux/sysfs.h>
-#include <linux/cpu.h>
-
-/* pointer to kobject for cpuX/cache */
-static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
-
-struct _index_kobject {
- struct kobject kobj;
- unsigned int cpu;
- unsigned short index;
-};
-
-/* pointer to array of kobjects for cpuX/cache/indexY */
-static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject);
-#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y]))
-
-#define show_one_plus(file_name, object, val) \
-static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \
- unsigned int cpu) \
-{ \
- return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \
-}
-
-show_one_plus(level, base.eax.split.level, 0);
-show_one_plus(coherency_line_size, base.ebx.split.coherency_line_size, 1);
-show_one_plus(physical_line_partition, base.ebx.split.physical_line_partition, 1);
-show_one_plus(ways_of_associativity, base.ebx.split.ways_of_associativity, 1);
-show_one_plus(number_of_sets, base.ecx.split.number_of_sets, 1);
-
-static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf,
- unsigned int cpu)
-{
- return sprintf(buf, "%luK\n", this_leaf->base.size / 1024);
-}
-
-static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
- int type, char *buf)
-{
- const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
- int ret;
-
- if (type)
- ret = scnprintf(buf, PAGE_SIZE - 1, "%*pbl",
- cpumask_pr_args(mask));
- else
- ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb",
- cpumask_pr_args(mask));
- buf[ret++] = '\n';
- buf[ret] = '\0';
- return ret;
-}
-
-static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf,
- unsigned int cpu)
+static int __init_cache_level(unsigned int cpu)
{
- return show_shared_cpu_map_func(leaf, 0, buf);
-}
-
-static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf,
- unsigned int cpu)
-{
- return show_shared_cpu_map_func(leaf, 1, buf);
-}
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
-static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf,
- unsigned int cpu)
-{
- switch (this_leaf->base.eax.split.type) {
- case CACHE_TYPE_DATA:
- return sprintf(buf, "Data\n");
- case CACHE_TYPE_INST:
- return sprintf(buf, "Instruction\n");
- case CACHE_TYPE_UNIFIED:
- return sprintf(buf, "Unified\n");
- default:
- return sprintf(buf, "Unknown\n");
- }
-}
-
-#define to_object(k) container_of(k, struct _index_kobject, kobj)
-#define to_attr(a) container_of(a, struct _cache_attr, attr)
-
-#define define_one_ro(_name) \
-static struct _cache_attr _name = \
- __ATTR(_name, 0444, show_##_name, NULL)
-
-define_one_ro(level);
-define_one_ro(type);
-define_one_ro(coherency_line_size);
-define_one_ro(physical_line_partition);
-define_one_ro(ways_of_associativity);
-define_one_ro(number_of_sets);
-define_one_ro(size);
-define_one_ro(shared_cpu_map);
-define_one_ro(shared_cpu_list);
-
-static struct attribute *default_attrs[] = {
- &type.attr,
- &level.attr,
- &coherency_line_size.attr,
- &physical_line_partition.attr,
- &ways_of_associativity.attr,
- &number_of_sets.attr,
- &size.attr,
- &shared_cpu_map.attr,
- &shared_cpu_list.attr,
- NULL
-};
-
-#ifdef CONFIG_AMD_NB
-static struct attribute **amd_l3_attrs(void)
-{
- static struct attribute **attrs;
- int n;
-
- if (attrs)
- return attrs;
-
- n = ARRAY_SIZE(default_attrs);
-
- if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
- n += 2;
-
- if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- n += 1;
-
- attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL);
- if (attrs == NULL)
- return attrs = default_attrs;
-
- for (n = 0; default_attrs[n]; n++)
- attrs[n] = default_attrs[n];
-
- if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
- attrs[n++] = &cache_disable_0.attr;
- attrs[n++] = &cache_disable_1.attr;
- }
-
- if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- attrs[n++] = &subcaches.attr;
-
- return attrs;
-}
-#endif
-
-static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
-{
- struct _cache_attr *fattr = to_attr(attr);
- struct _index_kobject *this_leaf = to_object(kobj);
- ssize_t ret;
-
- ret = fattr->show ?
- fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
- buf, this_leaf->cpu) :
- 0;
- return ret;
-}
-
-static ssize_t store(struct kobject *kobj, struct attribute *attr,
- const char *buf, size_t count)
-{
- struct _cache_attr *fattr = to_attr(attr);
- struct _index_kobject *this_leaf = to_object(kobj);
- ssize_t ret;
-
- ret = fattr->store ?
- fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
- buf, count, this_leaf->cpu) :
- 0;
- return ret;
-}
-
-static const struct sysfs_ops sysfs_ops = {
- .show = show,
- .store = store,
-};
-
-static struct kobj_type ktype_cache = {
- .sysfs_ops = &sysfs_ops,
- .default_attrs = default_attrs,
-};
-
-static struct kobj_type ktype_percpu_entry = {
- .sysfs_ops = &sysfs_ops,
-};
-
-static void cpuid4_cache_sysfs_exit(unsigned int cpu)
-{
- kfree(per_cpu(ici_cache_kobject, cpu));
- kfree(per_cpu(ici_index_kobject, cpu));
- per_cpu(ici_cache_kobject, cpu) = NULL;
- per_cpu(ici_index_kobject, cpu) = NULL;
- free_cache_attributes(cpu);
-}
-
-static int cpuid4_cache_sysfs_init(unsigned int cpu)
-{
- int err;
-
- if (num_cache_leaves == 0)
+ if (!num_cache_leaves)
return -ENOENT;
-
- err = detect_cache_attributes(cpu);
- if (err)
- return err;
-
- /* Allocate all required memory */
- per_cpu(ici_cache_kobject, cpu) =
- kzalloc(sizeof(struct kobject), GFP_KERNEL);
- if (unlikely(per_cpu(ici_cache_kobject, cpu) == NULL))
- goto err_out;
-
- per_cpu(ici_index_kobject, cpu) = kzalloc(
- sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL);
- if (unlikely(per_cpu(ici_index_kobject, cpu) == NULL))
- goto err_out;
-
+ if (!this_cpu_ci)
+ return -EINVAL;
+ this_cpu_ci->num_levels = 3;
+ this_cpu_ci->num_leaves = num_cache_leaves;
return 0;
-
-err_out:
- cpuid4_cache_sysfs_exit(cpu);
- return -ENOMEM;
}
-static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
-
-/* Add/Remove cache interface for CPU device */
-static int cache_add_dev(struct device *dev)
+static int __populate_cache_leaves(unsigned int cpu)
{
- unsigned int cpu = dev->id;
- unsigned long i, j;
- struct _index_kobject *this_object;
- struct _cpuid4_info *this_leaf;
- int retval;
-
- retval = cpuid4_cache_sysfs_init(cpu);
- if (unlikely(retval < 0))
- return retval;
-
- retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
- &ktype_percpu_entry,
- &dev->kobj, "%s", "cache");
- if (retval < 0) {
- cpuid4_cache_sysfs_exit(cpu);
- return retval;
- }
+ unsigned int idx, ret;
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cacheinfo *this_leaf = this_cpu_ci->info_list;
+ struct _cpuid4_info_regs id4_regs = {};
- for (i = 0; i < num_cache_leaves; i++) {
- this_object = INDEX_KOBJECT_PTR(cpu, i);
- this_object->cpu = cpu;
- this_object->index = i;
-
- this_leaf = CPUID4_INFO_IDX(cpu, i);
-
- ktype_cache.default_attrs = default_attrs;
-#ifdef CONFIG_AMD_NB
- if (this_leaf->base.nb)
- ktype_cache.default_attrs = amd_l3_attrs();
-#endif
- retval = kobject_init_and_add(&(this_object->kobj),
- &ktype_cache,
- per_cpu(ici_cache_kobject, cpu),
- "index%1lu", i);
- if (unlikely(retval)) {
- for (j = 0; j < i; j++)
- kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj));
- kobject_put(per_cpu(ici_cache_kobject, cpu));
- cpuid4_cache_sysfs_exit(cpu);
- return retval;
- }
- kobject_uevent(&(this_object->kobj), KOBJ_ADD);
+ for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) {
+ ret = cpuid4_cache_lookup_regs(idx, &id4_regs);
+ if (ret)
+ return ret;
+ ci_leaf_init(this_leaf++, &id4_regs);
+ __cache_cpumap_setup(cpu, idx, &id4_regs);
}
- cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
-
- kobject_uevent(per_cpu(ici_cache_kobject, cpu), KOBJ_ADD);
return 0;
}
-static void cache_remove_dev(struct device *dev)
-{
- unsigned int cpu = dev->id;
- unsigned long i;
-
- if (per_cpu(ici_cpuid4_info, cpu) == NULL)
- return;
- if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
- return;
- cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));
-
- for (i = 0; i < num_cache_leaves; i++)
- kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj));
- kobject_put(per_cpu(ici_cache_kobject, cpu));
- cpuid4_cache_sysfs_exit(cpu);
-}
-
-static int cacheinfo_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- struct device *dev;
-
- dev = get_cpu_device(cpu);
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- cache_add_dev(dev);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- cache_remove_dev(dev);
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block cacheinfo_cpu_notifier = {
- .notifier_call = cacheinfo_cpu_callback,
-};
-
-static int __init cache_sysfs_init(void)
-{
- int i, err = 0;
-
- if (num_cache_leaves == 0)
- return 0;
-
- cpu_notifier_register_begin();
- for_each_online_cpu(i) {
- struct device *dev = get_cpu_device(i);
-
- err = cache_add_dev(dev);
- if (err)
- goto out;
- }
- __register_hotcpu_notifier(&cacheinfo_cpu_notifier);
-
-out:
- cpu_notifier_register_done();
- return err;
-}
-
-device_initcall(cache_sysfs_init);
-
-#endif
+DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level)
+DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves)
diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h
new file mode 100644
index 000000000000..1c338b0eba05
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_pt.h
@@ -0,0 +1,131 @@
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#ifndef __INTEL_PT_H__
+#define __INTEL_PT_H__
+
+/*
+ * Single-entry ToPA: when this close to region boundary, switch
+ * buffers to avoid losing data.
+ */
+#define TOPA_PMI_MARGIN 512
+
+/*
+ * Table of Physical Addresses bits
+ */
+enum topa_sz {
+ TOPA_4K = 0,
+ TOPA_8K,
+ TOPA_16K,
+ TOPA_32K,
+ TOPA_64K,
+ TOPA_128K,
+ TOPA_256K,
+ TOPA_512K,
+ TOPA_1MB,
+ TOPA_2MB,
+ TOPA_4MB,
+ TOPA_8MB,
+ TOPA_16MB,
+ TOPA_32MB,
+ TOPA_64MB,
+ TOPA_128MB,
+ TOPA_SZ_END,
+};
+
+static inline unsigned int sizes(enum topa_sz tsz)
+{
+ return 1 << (tsz + 12);
+};
+
+struct topa_entry {
+ u64 end : 1;
+ u64 rsvd0 : 1;
+ u64 intr : 1;
+ u64 rsvd1 : 1;
+ u64 stop : 1;
+ u64 rsvd2 : 1;
+ u64 size : 4;
+ u64 rsvd3 : 2;
+ u64 base : 36;
+ u64 rsvd4 : 16;
+};
+
+#define TOPA_SHIFT 12
+#define PT_CPUID_LEAVES 2
+
+enum pt_capabilities {
+ PT_CAP_max_subleaf = 0,
+ PT_CAP_cr3_filtering,
+ PT_CAP_topa_output,
+ PT_CAP_topa_multiple_entries,
+ PT_CAP_payloads_lip,
+};
+
+struct pt_pmu {
+ struct pmu pmu;
+ u32 caps[4 * PT_CPUID_LEAVES];
+};
+
+/**
+ * struct pt_buffer - buffer configuration; one buffer per task_struct or
+ * cpu, depending on perf event configuration
+ * @cpu: cpu for per-cpu allocation
+ * @tables: list of ToPA tables in this buffer
+ * @first: shorthand for first topa table
+ * @last: shorthand for last topa table
+ * @cur: current topa table
+ * @nr_pages: buffer size in pages
+ * @cur_idx: current output region's index within @cur table
+ * @output_off: offset within the current output region
+ * @data_size: running total of the amount of data in this buffer
+ * @lost: if data was lost/truncated
+ * @head: logical write offset inside the buffer
+ * @snapshot: if this is for a snapshot/overwrite counter
+ * @stop_pos: STOP topa entry in the buffer
+ * @intr_pos: INT topa entry in the buffer
+ * @data_pages: array of pages from perf
+ * @topa_index: table of topa entries indexed by page offset
+ */
+struct pt_buffer {
+ int cpu;
+ struct list_head tables;
+ struct topa *first, *last, *cur;
+ unsigned int cur_idx;
+ size_t output_off;
+ unsigned long nr_pages;
+ local_t data_size;
+ local_t lost;
+ local64_t head;
+ bool snapshot;
+ unsigned long stop_pos, intr_pos;
+ void **data_pages;
+ struct topa_entry *topa_index[0];
+};
+
+/**
+ * struct pt - per-cpu pt context
+ * @handle: perf output handle
+ * @handle_nmi: do handle PT PMI on this cpu, there's an active event
+ */
+struct pt {
+ struct perf_output_handle handle;
+ int handle_nmi;
+};
+
+#endif /* __INTEL_PT_H__ */
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 10b46906767f..fe32074b865b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -14,6 +14,7 @@ enum severity_level {
};
#define ATTR_LEN 16
+#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */
/* One object for each MCE bank, shared by all CPUs */
struct mce_bank {
@@ -23,20 +24,20 @@ struct mce_bank {
char attrname[ATTR_LEN]; /* attribute name */
};
-int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp);
+extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
struct dentry *mce_get_debugfs_dir(void);
extern struct mce_bank *mce_banks;
extern mce_banks_t mce_banks_ce_disabled;
#ifdef CONFIG_X86_MCE_INTEL
-unsigned long mce_intel_adjust_timer(unsigned long interval);
-void mce_intel_cmci_poll(void);
+unsigned long cmci_intel_adjust_timer(unsigned long interval);
+bool mce_intel_cmci_poll(void);
void mce_intel_hcpu_update(unsigned long cpu);
void cmci_disable_bank(int bank);
#else
-# define mce_intel_adjust_timer mce_adjust_timer_default
-static inline void mce_intel_cmci_poll(void) { }
+# define cmci_intel_adjust_timer mce_adjust_timer_default
+static inline bool mce_intel_cmci_poll(void) { return false; }
static inline void mce_intel_hcpu_update(unsigned long cpu) { }
static inline void cmci_disable_bank(int bank) { }
#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 8bb433043a7f..9c682c222071 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -186,7 +186,61 @@ static int error_context(struct mce *m)
return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
}
-int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
+/*
+ * See AMD Error Scope Hierarchy table in a newer BKDG. For example
+ * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
+ */
+static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp)
+{
+ enum context ctx = error_context(m);
+
+ /* Processor Context Corrupt, no need to fumble too much, die! */
+ if (m->status & MCI_STATUS_PCC)
+ return MCE_PANIC_SEVERITY;
+
+ if (m->status & MCI_STATUS_UC) {
+
+ /*
+ * On older systems where overflow_recov flag is not present, we
+ * should simply panic if an error overflow occurs. If
+ * overflow_recov flag is present and set, then software can try
+ * to at least kill process to prolong system operation.
+ */
+ if (mce_flags.overflow_recov) {
+ /* software can try to contain */
+ if (!(m->mcgstatus & MCG_STATUS_RIPV) && (ctx == IN_KERNEL))
+ return MCE_PANIC_SEVERITY;
+
+ /* kill current process */
+ return MCE_AR_SEVERITY;
+ } else {
+ /* at least one error was not logged */
+ if (m->status & MCI_STATUS_OVER)
+ return MCE_PANIC_SEVERITY;
+ }
+
+ /*
+ * For any other case, return MCE_UC_SEVERITY so that we log the
+ * error and exit #MC handler.
+ */
+ return MCE_UC_SEVERITY;
+ }
+
+ /*
+ * deferred error: poll handler catches these and adds to mce_ring so
+ * memory-failure can take recovery actions.
+ */
+ if (m->status & MCI_STATUS_DEFERRED)
+ return MCE_DEFERRED_SEVERITY;
+
+ /*
+ * corrected error: poll handler catches these and passes responsibility
+ * of decoding the error to EDAC
+ */
+ return MCE_KEEP_SEVERITY;
+}
+
+static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp)
{
enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
enum context ctx = error_context(m);
@@ -216,6 +270,16 @@ int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
}
}
+/* Default to mce_severity_intel */
+int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) =
+ mce_severity_intel;
+
+void __init mcheck_vendor_init_severity(void)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ mce_severity = mce_severity_amd;
+}
+
#ifdef CONFIG_DEBUG_FS
static void *s_start(struct seq_file *f, loff_t *pos)
{
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 3c036cb4a370..e535533d5ab8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -60,11 +60,12 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>
-#define SPINUNIT 100 /* 100ns */
+#define SPINUNIT 100 /* 100ns */
DEFINE_PER_CPU(unsigned, mce_exception_count);
struct mce_bank *mce_banks __read_mostly;
+struct mce_vendor_flags mce_flags __read_mostly;
struct mca_config mca_cfg __read_mostly = {
.bootlog = -1,
@@ -89,9 +90,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;
-/* CMCI storm detection filter */
-static DEFINE_PER_CPU(unsigned long, mce_polled_error);
-
/*
* MCA banks polled by the period polling timer for corrected events.
* With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
@@ -622,8 +620,9 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
* is already totally * confused. In this case it's likely it will
* not fully execute the machine check handler either.
*/
-void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
+ bool error_logged = false;
struct mce m;
int severity;
int i;
@@ -646,7 +645,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
if (!(m.status & MCI_STATUS_VAL))
continue;
- this_cpu_write(mce_polled_error, 1);
+
/*
* Uncorrected or signalled events are handled by the exception
* handler when it is enabled, so don't process those here.
@@ -679,8 +678,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
*/
- if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
+ if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) {
+ error_logged = true;
mce_log(&m);
+ }
/*
* Clear state for this bank.
@@ -694,6 +695,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
*/
sync_core();
+
+ return error_logged;
}
EXPORT_SYMBOL_GPL(machine_check_poll);
@@ -813,7 +816,7 @@ static void mce_reign(void)
* other CPUs.
*/
if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
- mce_panic("Fatal Machine check", m, msg);
+ mce_panic("Fatal machine check", m, msg);
/*
* For UC somewhere we let the CPU who detects it handle it.
@@ -826,7 +829,7 @@ static void mce_reign(void)
* source or one CPU is hung. Panic.
*/
if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
- mce_panic("Machine check from unknown source", NULL, NULL);
+ mce_panic("Fatal machine check from unknown source", NULL, NULL);
/*
* Now clear all the mces_seen so that they don't reappear on
@@ -1258,7 +1261,7 @@ void mce_log_therm_throt_event(__u64 status)
* poller finds an MCE, poll 2x faster. When the poller finds no more
* errors, poll 2x slower (up to check_interval seconds).
*/
-static unsigned long check_interval = 5 * 60; /* 5 minutes */
+static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
@@ -1268,49 +1271,57 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
return interval;
}
-static unsigned long (*mce_adjust_timer)(unsigned long interval) =
- mce_adjust_timer_default;
+static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
-static int cmc_error_seen(void)
+static void __restart_timer(struct timer_list *t, unsigned long interval)
{
- unsigned long *v = this_cpu_ptr(&mce_polled_error);
+ unsigned long when = jiffies + interval;
+ unsigned long flags;
+
+ local_irq_save(flags);
- return test_and_clear_bit(0, v);
+ if (timer_pending(t)) {
+ if (time_before(when, t->expires))
+ mod_timer_pinned(t, when);
+ } else {
+ t->expires = round_jiffies(when);
+ add_timer_on(t, smp_processor_id());
+ }
+
+ local_irq_restore(flags);
}
static void mce_timer_fn(unsigned long data)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
+ int cpu = smp_processor_id();
unsigned long iv;
- int notify;
- WARN_ON(smp_processor_id() != data);
+ WARN_ON(cpu != data);
+
+ iv = __this_cpu_read(mce_next_interval);
if (mce_available(this_cpu_ptr(&cpu_info))) {
- machine_check_poll(MCP_TIMESTAMP,
- this_cpu_ptr(&mce_poll_banks));
- mce_intel_cmci_poll();
+ machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
+
+ if (mce_intel_cmci_poll()) {
+ iv = mce_adjust_timer(iv);
+ goto done;
+ }
}
/*
- * Alert userspace if needed. If we logged an MCE, reduce the
- * polling interval, otherwise increase the polling interval.
+ * Alert userspace if needed. If we logged an MCE, reduce the polling
+ * interval, otherwise increase the polling interval.
*/
- iv = __this_cpu_read(mce_next_interval);
- notify = mce_notify_irq();
- notify |= cmc_error_seen();
- if (notify) {
+ if (mce_notify_irq())
iv = max(iv / 2, (unsigned long) HZ/100);
- } else {
+ else
iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
- iv = mce_adjust_timer(iv);
- }
+
+done:
__this_cpu_write(mce_next_interval, iv);
- /* Might have become 0 after CMCI storm subsided */
- if (iv) {
- t->expires = jiffies + iv;
- add_timer_on(t, smp_processor_id());
- }
+ __restart_timer(t, iv);
}
/*
@@ -1319,16 +1330,10 @@ static void mce_timer_fn(unsigned long data)
void mce_timer_kick(unsigned long interval)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
- unsigned long when = jiffies + interval;
unsigned long iv = __this_cpu_read(mce_next_interval);
- if (timer_pending(t)) {
- if (time_before(when, t->expires))
- mod_timer_pinned(t, when);
- } else {
- t->expires = round_jiffies(when);
- add_timer_on(t, smp_processor_id());
- }
+ __restart_timer(t, interval);
+
if (interval < iv)
__this_cpu_write(mce_next_interval, interval);
}
@@ -1525,45 +1530,46 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
* Various K7s with broken bank 0 around. Always disable
* by default.
*/
- if (c->x86 == 6 && cfg->banks > 0)
+ if (c->x86 == 6 && cfg->banks > 0)
mce_banks[0].ctl = 0;
- /*
- * Turn off MC4_MISC thresholding banks on those models since
- * they're not supported there.
- */
- if (c->x86 == 0x15 &&
- (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
- int i;
- u64 val, hwcr;
- bool need_toggle;
- u32 msrs[] = {
+ /*
+ * overflow_recov is supported for F15h Models 00h-0fh
+ * even though we don't have a CPUID bit for it.
+ */
+ if (c->x86 == 0x15 && c->x86_model <= 0xf)
+ mce_flags.overflow_recov = 1;
+
+ /*
+ * Turn off MC4_MISC thresholding banks on those models since
+ * they're not supported there.
+ */
+ if (c->x86 == 0x15 &&
+ (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
+ int i;
+ u64 hwcr;
+ bool need_toggle;
+ u32 msrs[] = {
0x00000413, /* MC4_MISC0 */
0xc0000408, /* MC4_MISC1 */
- };
+ };
- rdmsrl(MSR_K7_HWCR, hwcr);
+ rdmsrl(MSR_K7_HWCR, hwcr);
- /* McStatusWrEn has to be set */
- need_toggle = !(hwcr & BIT(18));
+ /* McStatusWrEn has to be set */
+ need_toggle = !(hwcr & BIT(18));
- if (need_toggle)
- wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
- for (i = 0; i < ARRAY_SIZE(msrs); i++) {
- rdmsrl(msrs[i], val);
+ /* Clear CntP bit safely */
+ for (i = 0; i < ARRAY_SIZE(msrs); i++)
+ msr_clear_bit(msrs[i], 62);
- /* CntP bit set? */
- if (val & BIT_64(62)) {
- val &= ~BIT_64(62);
- wrmsrl(msrs[i], val);
- }
- }
-
- /* restore old settings */
- if (need_toggle)
- wrmsrl(MSR_K7_HWCR, hwcr);
- }
+ /* restore old settings */
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr);
+ }
}
if (c->x86_vendor == X86_VENDOR_INTEL) {
@@ -1629,10 +1635,11 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
mce_intel_feature_init(c);
- mce_adjust_timer = mce_intel_adjust_timer;
+ mce_adjust_timer = cmci_intel_adjust_timer;
break;
case X86_VENDOR_AMD:
mce_amd_feature_init(c);
+ mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
break;
default:
break;
@@ -2017,6 +2024,7 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
mcheck_intel_therm_init();
+ mcheck_vendor_init_severity();
return 0;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index f1c3769bbd64..55ad9b37cae8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -79,7 +79,7 @@ static inline bool is_shared_bank(int bank)
return (bank == 4);
}
-static const char * const bank4_names(struct threshold_block *b)
+static const char *bank4_names(const struct threshold_block *b)
{
switch (b->address) {
/* MSR4_MISC0 */
@@ -250,6 +250,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
if (!b.interrupt_capable)
goto init;
+ b.interrupt_enable = 1;
new = (high & MASK_LVTOFF_HI) >> 20;
offset = setup_APIC_mce(offset, new);
@@ -322,6 +323,8 @@ static void amd_threshold_interrupt(void)
log:
mce_setup(&m);
rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status);
+ if (!(m.status & MCI_STATUS_VAL))
+ return;
m.misc = ((u64)high << 32) | low;
m.bank = bank;
mce_log(&m);
@@ -497,10 +500,12 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
b->interrupt_capable = lvt_interrupt_supported(bank, high);
b->threshold_limit = THRESHOLD_MAX;
- if (b->interrupt_capable)
+ if (b->interrupt_capable) {
threshold_ktype.default_attrs[2] = &interrupt_enable.attr;
- else
+ b->interrupt_enable = 1;
+ } else {
threshold_ktype.default_attrs[2] = NULL;
+ }
INIT_LIST_HEAD(&b->miscj);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index b3c97bafc123..b4a41cf030ed 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -39,6 +39,15 @@
static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
/*
+ * CMCI storm detection backoff counter
+ *
+ * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've
+ * encountered an error. If not, we decrement it by one. We signal the end of
+ * the CMCI storm when it reaches 0.
+ */
+static DEFINE_PER_CPU(int, cmci_backoff_cnt);
+
+/*
* cmci_discover_lock protects against parallel discovery attempts
* which could race against each other.
*/
@@ -46,7 +55,7 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
#define CMCI_THRESHOLD 1
#define CMCI_POLL_INTERVAL (30 * HZ)
-#define CMCI_STORM_INTERVAL (1 * HZ)
+#define CMCI_STORM_INTERVAL (HZ)
#define CMCI_STORM_THRESHOLD 15
static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
@@ -82,11 +91,21 @@ static int cmci_supported(int *banks)
return !!(cap & MCG_CMCI_P);
}
-void mce_intel_cmci_poll(void)
+bool mce_intel_cmci_poll(void)
{
if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
- return;
- machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
+ return false;
+
+ /*
+ * Reset the counter if we've logged an error in the last poll
+ * during the storm.
+ */
+ if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)))
+ this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
+ else
+ this_cpu_dec(cmci_backoff_cnt);
+
+ return true;
}
void mce_intel_hcpu_update(unsigned long cpu)
@@ -97,31 +116,32 @@ void mce_intel_hcpu_update(unsigned long cpu)
per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
}
-unsigned long mce_intel_adjust_timer(unsigned long interval)
+unsigned long cmci_intel_adjust_timer(unsigned long interval)
{
- int r;
-
- if (interval < CMCI_POLL_INTERVAL)
- return interval;
+ if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
+ (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
+ mce_notify_irq();
+ return CMCI_STORM_INTERVAL;
+ }
switch (__this_cpu_read(cmci_storm_state)) {
case CMCI_STORM_ACTIVE:
+
/*
* We switch back to interrupt mode once the poll timer has
- * silenced itself. That means no events recorded and the
- * timer interval is back to our poll interval.
+ * silenced itself. That means no events recorded and the timer
+ * interval is back to our poll interval.
*/
__this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
- r = atomic_sub_return(1, &cmci_storm_on_cpus);
- if (r == 0)
+ if (!atomic_sub_return(1, &cmci_storm_on_cpus))
pr_notice("CMCI storm subsided: switching to interrupt mode\n");
+
/* FALLTHROUGH */
case CMCI_STORM_SUBSIDED:
/*
- * We wait for all cpus to go back to SUBSIDED
- * state. When that happens we switch back to
- * interrupt mode.
+ * We wait for all CPUs to go back to SUBSIDED state. When that
+ * happens we switch back to interrupt mode.
*/
if (!atomic_read(&cmci_storm_on_cpus)) {
__this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
@@ -130,10 +150,8 @@ unsigned long mce_intel_adjust_timer(unsigned long interval)
}
return CMCI_POLL_INTERVAL;
default:
- /*
- * We have shiny weather. Let the poll do whatever it
- * thinks.
- */
+
+ /* We have shiny weather. Let the poll do whatever it thinks. */
return interval;
}
}
@@ -178,7 +196,8 @@ static bool cmci_storm_detect(void)
cmci_storm_disable_banks();
__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
r = atomic_add_return(1, &cmci_storm_on_cpus);
- mce_timer_kick(CMCI_POLL_INTERVAL);
+ mce_timer_kick(CMCI_STORM_INTERVAL);
+ this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
if (r == 1)
pr_notice("CMCI storm detected: switching to poll mode\n");
@@ -195,6 +214,7 @@ static void intel_threshold_interrupt(void)
{
if (cmci_storm_detect())
return;
+
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
mce_notify_irq();
}
@@ -286,6 +306,7 @@ void cmci_recheck(void)
if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
return;
+
local_irq_save(flags);
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
local_irq_restore(flags);
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index bfbbe6195e2d..12829c3ced3c 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -21,7 +21,6 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/firmware.h>
-#include <linux/pci_ids.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/kernel.h>
diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c
index d45df4bd16ab..a413a69cbd74 100644
--- a/arch/x86/kernel/cpu/microcode/core_early.c
+++ b/arch/x86/kernel/cpu/microcode/core_early.c
@@ -23,57 +23,6 @@
#include <asm/processor.h>
#include <asm/cmdline.h>
-#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
-#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
-#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
-#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
-#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
-#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
-#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
-
-#define CPUID_IS(a, b, c, ebx, ecx, edx) \
- (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c))))
-
-/*
- * In early loading microcode phase on BSP, boot_cpu_data is not set up yet.
- * x86_vendor() gets vendor id for BSP.
- *
- * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify
- * coding, we still use x86_vendor() to get vendor id for AP.
- *
- * x86_vendor() gets vendor information directly through cpuid.
- */
-static int x86_vendor(void)
-{
- u32 eax = 0x00000000;
- u32 ebx, ecx = 0, edx;
-
- native_cpuid(&eax, &ebx, &ecx, &edx);
-
- if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
- return X86_VENDOR_INTEL;
-
- if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
- return X86_VENDOR_AMD;
-
- return X86_VENDOR_UNKNOWN;
-}
-
-static int x86_family(void)
-{
- u32 eax = 0x00000001;
- u32 ebx, ecx = 0, edx;
- int x86;
-
- native_cpuid(&eax, &ebx, &ecx, &edx);
-
- x86 = (eax >> 8) & 0xf;
- if (x86 == 15)
- x86 += (eax >> 20) & 0xff;
-
- return x86;
-}
-
static bool __init check_loader_disabled_bsp(void)
{
#ifdef CONFIG_X86_32
@@ -96,7 +45,7 @@ static bool __init check_loader_disabled_bsp(void)
void __init load_ucode_bsp(void)
{
- int vendor, x86;
+ int vendor, family;
if (check_loader_disabled_bsp())
return;
@@ -105,15 +54,15 @@ void __init load_ucode_bsp(void)
return;
vendor = x86_vendor();
- x86 = x86_family();
+ family = x86_family();
switch (vendor) {
case X86_VENDOR_INTEL:
- if (x86 >= 6)
+ if (family >= 6)
load_ucode_intel_bsp();
break;
case X86_VENDOR_AMD:
- if (x86 >= 0x10)
+ if (family >= 0x10)
load_ucode_amd_bsp();
break;
default:
@@ -132,7 +81,7 @@ static bool check_loader_disabled_ap(void)
void load_ucode_ap(void)
{
- int vendor, x86;
+ int vendor, family;
if (check_loader_disabled_ap())
return;
@@ -141,15 +90,15 @@ void load_ucode_ap(void)
return;
vendor = x86_vendor();
- x86 = x86_family();
+ family = x86_family();
switch (vendor) {
case X86_VENDOR_INTEL:
- if (x86 >= 6)
+ if (family >= 6)
load_ucode_intel_ap();
break;
case X86_VENDOR_AMD:
- if (x86 >= 0x10)
+ if (family >= 0x10)
load_ucode_amd_ap();
break;
default:
@@ -179,18 +128,18 @@ int __init save_microcode_in_initrd(void)
void reload_early_microcode(void)
{
- int vendor, x86;
+ int vendor, family;
vendor = x86_vendor();
- x86 = x86_family();
+ family = x86_family();
switch (vendor) {
case X86_VENDOR_INTEL:
- if (x86 >= 6)
+ if (family >= 6)
reload_ucode_intel();
break;
case X86_VENDOR_AMD:
- if (x86 >= 0x10)
+ if (family >= 0x10)
reload_ucode_amd();
break;
default:
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 746e7fd08aad..a41beadb3db9 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -124,7 +124,7 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
cpf = cpu_sig.pf;
crev = cpu_sig.rev;
- return get_matching_microcode(csig, cpf, mc_intel, crev);
+ return get_matching_microcode(csig, cpf, crev, mc_intel);
}
static int apply_microcode_intel(int cpu)
@@ -226,7 +226,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
csig = uci->cpu_sig.sig;
cpf = uci->cpu_sig.pf;
- if (get_matching_microcode(csig, cpf, mc, new_rev)) {
+ if (get_matching_microcode(csig, cpf, new_rev, mc)) {
vfree(new_mc);
new_rev = mc_header.rev;
new_mc = mc;
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 420eb933189c..2f49ab4ac0ae 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -16,6 +16,14 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
+
+/*
+ * This needs to be before all headers so that pr_debug in printk.h doesn't turn
+ * printk calls into no_printk().
+ *
+ *#define DEBUG
+ */
+
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/slab.h>
@@ -28,6 +36,9 @@
#include <asm/tlbflush.h>
#include <asm/setup.h>
+#undef pr_fmt
+#define pr_fmt(fmt) "microcode: " fmt
+
static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
static struct mc_saved_data {
unsigned int mc_saved_count;
@@ -35,50 +46,45 @@ static struct mc_saved_data {
} mc_saved_data;
static enum ucode_state
-generic_load_microcode_early(struct microcode_intel **mc_saved_p,
- unsigned int mc_saved_count,
- struct ucode_cpu_info *uci)
+load_microcode_early(struct microcode_intel **saved,
+ unsigned int num_saved, struct ucode_cpu_info *uci)
{
struct microcode_intel *ucode_ptr, *new_mc = NULL;
- int new_rev = uci->cpu_sig.rev;
- enum ucode_state state = UCODE_OK;
- unsigned int mc_size;
- struct microcode_header_intel *mc_header;
- unsigned int csig = uci->cpu_sig.sig;
- unsigned int cpf = uci->cpu_sig.pf;
- int i;
+ struct microcode_header_intel *mc_hdr;
+ int new_rev, ret, i;
- for (i = 0; i < mc_saved_count; i++) {
- ucode_ptr = mc_saved_p[i];
+ new_rev = uci->cpu_sig.rev;
- mc_header = (struct microcode_header_intel *)ucode_ptr;
- mc_size = get_totalsize(mc_header);
- if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) {
- new_rev = mc_header->rev;
- new_mc = ucode_ptr;
- }
- }
+ for (i = 0; i < num_saved; i++) {
+ ucode_ptr = saved[i];
+ mc_hdr = (struct microcode_header_intel *)ucode_ptr;
- if (!new_mc) {
- state = UCODE_NFOUND;
- goto out;
+ ret = get_matching_microcode(uci->cpu_sig.sig,
+ uci->cpu_sig.pf,
+ new_rev,
+ ucode_ptr);
+ if (!ret)
+ continue;
+
+ new_rev = mc_hdr->rev;
+ new_mc = ucode_ptr;
}
+ if (!new_mc)
+ return UCODE_NFOUND;
+
uci->mc = (struct microcode_intel *)new_mc;
-out:
- return state;
+ return UCODE_OK;
}
-static void
-microcode_pointer(struct microcode_intel **mc_saved,
- unsigned long *mc_saved_in_initrd,
- unsigned long initrd_start, int mc_saved_count)
+static inline void
+copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd,
+ unsigned long off, int num_saved)
{
int i;
- for (i = 0; i < mc_saved_count; i++)
- mc_saved[i] = (struct microcode_intel *)
- (mc_saved_in_initrd[i] + initrd_start);
+ for (i = 0; i < num_saved; i++)
+ mc_saved[i] = (struct microcode_intel *)(initrd[i] + off);
}
#ifdef CONFIG_X86_32
@@ -102,55 +108,27 @@ microcode_phys(struct microcode_intel **mc_saved_tmp,
#endif
static enum ucode_state
-load_microcode(struct mc_saved_data *mc_saved_data,
- unsigned long *mc_saved_in_initrd,
- unsigned long initrd_start,
- struct ucode_cpu_info *uci)
+load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
+ unsigned long initrd_start, struct ucode_cpu_info *uci)
{
struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
unsigned int count = mc_saved_data->mc_saved_count;
if (!mc_saved_data->mc_saved) {
- microcode_pointer(mc_saved_tmp, mc_saved_in_initrd,
- initrd_start, count);
+ copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count);
- return generic_load_microcode_early(mc_saved_tmp, count, uci);
+ return load_microcode_early(mc_saved_tmp, count, uci);
} else {
#ifdef CONFIG_X86_32
microcode_phys(mc_saved_tmp, mc_saved_data);
- return generic_load_microcode_early(mc_saved_tmp, count, uci);
+ return load_microcode_early(mc_saved_tmp, count, uci);
#else
- return generic_load_microcode_early(mc_saved_data->mc_saved,
+ return load_microcode_early(mc_saved_data->mc_saved,
count, uci);
#endif
}
}
-static u8 get_x86_family(unsigned long sig)
-{
- u8 x86;
-
- x86 = (sig >> 8) & 0xf;
-
- if (x86 == 0xf)
- x86 += (sig >> 20) & 0xff;
-
- return x86;
-}
-
-static u8 get_x86_model(unsigned long sig)
-{
- u8 x86, x86_model;
-
- x86 = get_x86_family(sig);
- x86_model = (sig >> 4) & 0xf;
-
- if (x86 == 0x6 || x86 == 0xf)
- x86_model += ((sig >> 16) & 0xf) << 4;
-
- return x86_model;
-}
-
/*
* Given CPU signature and a microcode patch, this function finds if the
* microcode patch has matching family and model with the CPU.
@@ -159,42 +137,40 @@ static enum ucode_state
matching_model_microcode(struct microcode_header_intel *mc_header,
unsigned long sig)
{
- u8 x86, x86_model;
- u8 x86_ucode, x86_model_ucode;
+ unsigned int fam, model;
+ unsigned int fam_ucode, model_ucode;
struct extended_sigtable *ext_header;
unsigned long total_size = get_totalsize(mc_header);
unsigned long data_size = get_datasize(mc_header);
int ext_sigcount, i;
struct extended_signature *ext_sig;
- x86 = get_x86_family(sig);
- x86_model = get_x86_model(sig);
+ fam = __x86_family(sig);
+ model = x86_model(sig);
- x86_ucode = get_x86_family(mc_header->sig);
- x86_model_ucode = get_x86_model(mc_header->sig);
+ fam_ucode = __x86_family(mc_header->sig);
+ model_ucode = x86_model(mc_header->sig);
- if (x86 == x86_ucode && x86_model == x86_model_ucode)
+ if (fam == fam_ucode && model == model_ucode)
return UCODE_OK;
/* Look for ext. headers: */
if (total_size <= data_size + MC_HEADER_SIZE)
return UCODE_NFOUND;
- ext_header = (struct extended_sigtable *)
- mc_header + data_size + MC_HEADER_SIZE;
+ ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE;
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
ext_sigcount = ext_header->count;
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
for (i = 0; i < ext_sigcount; i++) {
- x86_ucode = get_x86_family(ext_sig->sig);
- x86_model_ucode = get_x86_model(ext_sig->sig);
+ fam_ucode = __x86_family(ext_sig->sig);
+ model_ucode = x86_model(ext_sig->sig);
- if (x86 == x86_ucode && x86_model == x86_model_ucode)
+ if (fam == fam_ucode && model == model_ucode)
return UCODE_OK;
ext_sig++;
}
-
return UCODE_NFOUND;
}
@@ -204,7 +180,7 @@ save_microcode(struct mc_saved_data *mc_saved_data,
unsigned int mc_saved_count)
{
int i, j;
- struct microcode_intel **mc_saved_p;
+ struct microcode_intel **saved_ptr;
int ret;
if (!mc_saved_count)
@@ -213,39 +189,45 @@ save_microcode(struct mc_saved_data *mc_saved_data,
/*
* Copy new microcode data.
*/
- mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *),
- GFP_KERNEL);
- if (!mc_saved_p)
+ saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL);
+ if (!saved_ptr)
return -ENOMEM;
for (i = 0; i < mc_saved_count; i++) {
- struct microcode_intel *mc = mc_saved_src[i];
- struct microcode_header_intel *mc_header = &mc->hdr;
- unsigned long mc_size = get_totalsize(mc_header);
- mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL);
- if (!mc_saved_p[i]) {
- ret = -ENOMEM;
- goto err;
- }
+ struct microcode_header_intel *mc_hdr;
+ struct microcode_intel *mc;
+ unsigned long size;
+
if (!mc_saved_src[i]) {
ret = -EINVAL;
goto err;
}
- memcpy(mc_saved_p[i], mc, mc_size);
+
+ mc = mc_saved_src[i];
+ mc_hdr = &mc->hdr;
+ size = get_totalsize(mc_hdr);
+
+ saved_ptr[i] = kmalloc(size, GFP_KERNEL);
+ if (!saved_ptr[i]) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ memcpy(saved_ptr[i], mc, size);
}
/*
* Point to newly saved microcode.
*/
- mc_saved_data->mc_saved = mc_saved_p;
+ mc_saved_data->mc_saved = saved_ptr;
mc_saved_data->mc_saved_count = mc_saved_count;
return 0;
err:
for (j = 0; j <= i; j++)
- kfree(mc_saved_p[j]);
- kfree(mc_saved_p);
+ kfree(saved_ptr[j]);
+ kfree(saved_ptr);
return ret;
}
@@ -257,48 +239,45 @@ err:
* - or if it is a newly discovered microcode patch.
*
* The microcode patch should have matching model with CPU.
+ *
+ * Returns: The updated number @num_saved of saved microcode patches.
*/
-static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr,
- unsigned int *mc_saved_count_p)
+static unsigned int _save_mc(struct microcode_intel **mc_saved,
+ u8 *ucode_ptr, unsigned int num_saved)
{
- int i;
- int found = 0;
- unsigned int mc_saved_count = *mc_saved_count_p;
- struct microcode_header_intel *mc_header;
+ struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
+ unsigned int sig, pf, new_rev;
+ int found = 0, i;
+
+ mc_hdr = (struct microcode_header_intel *)ucode_ptr;
+
+ for (i = 0; i < num_saved; i++) {
+ mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i];
+ sig = mc_saved_hdr->sig;
+ pf = mc_saved_hdr->pf;
+ new_rev = mc_hdr->rev;
+
+ if (!get_matching_sig(sig, pf, new_rev, ucode_ptr))
+ continue;
+
+ found = 1;
+
+ if (!revision_is_newer(mc_hdr, new_rev))
+ continue;
- mc_header = (struct microcode_header_intel *)ucode_ptr;
- for (i = 0; i < mc_saved_count; i++) {
- unsigned int sig, pf;
- unsigned int new_rev;
- struct microcode_header_intel *mc_saved_header =
- (struct microcode_header_intel *)mc_saved[i];
- sig = mc_saved_header->sig;
- pf = mc_saved_header->pf;
- new_rev = mc_header->rev;
-
- if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) {
- found = 1;
- if (update_match_revision(mc_header, new_rev)) {
- /*
- * Found an older ucode saved before.
- * Replace the older one with this newer
- * one.
- */
- mc_saved[i] =
- (struct microcode_intel *)ucode_ptr;
- break;
- }
- }
- }
- if (i >= mc_saved_count && !found)
/*
- * This ucode is first time discovered in ucode file.
- * Save it to memory.
+ * Found an older ucode saved earlier. Replace it with
+ * this newer one.
*/
- mc_saved[mc_saved_count++] =
- (struct microcode_intel *)ucode_ptr;
+ mc_saved[i] = (struct microcode_intel *)ucode_ptr;
+ break;
+ }
+
+ /* Newly detected microcode, save it to memory. */
+ if (i >= num_saved && !found)
+ mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr;
- *mc_saved_count_p = mc_saved_count;
+ return num_saved;
}
/*
@@ -346,7 +325,7 @@ get_matching_model_microcode(int cpu, unsigned long start,
continue;
}
- _save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count);
+ mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count);
ucode_ptr += mc_size;
}
@@ -372,7 +351,7 @@ out:
static int collect_cpu_info_early(struct ucode_cpu_info *uci)
{
unsigned int val[2];
- u8 x86, x86_model;
+ unsigned int family, model;
struct cpu_signature csig;
unsigned int eax, ebx, ecx, edx;
@@ -387,10 +366,10 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
native_cpuid(&eax, &ebx, &ecx, &edx);
csig.sig = eax;
- x86 = get_x86_family(csig.sig);
- x86_model = get_x86_model(csig.sig);
+ family = __x86_family(csig.sig);
+ model = x86_model(csig.sig);
- if ((x86_model >= 5) || (x86 > 6)) {
+ if ((model >= 5) || (family > 6)) {
/* get processor flags from MSR 0x17 */
native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
csig.pf = 1 << ((val[1] >> 18) & 7);
@@ -429,8 +408,7 @@ static void __ref show_saved_mc(void)
sig = uci.cpu_sig.sig;
pf = uci.cpu_sig.pf;
rev = uci.cpu_sig.rev;
- pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n",
- smp_processor_id(), sig, pf, rev);
+ pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
for (i = 0; i < mc_saved_data.mc_saved_count; i++) {
struct microcode_header_intel *mc_saved_header;
@@ -457,8 +435,7 @@ static void __ref show_saved_mc(void)
if (total_size <= data_size + MC_HEADER_SIZE)
continue;
- ext_header = (struct extended_sigtable *)
- mc_saved_header + data_size + MC_HEADER_SIZE;
+ ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE;
ext_sigcount = ext_header->count;
ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
@@ -515,8 +492,7 @@ int save_mc_for_early(u8 *mc)
* Save the microcode patch mc in mc_save_tmp structure if it's a newer
* version.
*/
-
- _save_mc(mc_saved_tmp, mc, &mc_saved_count);
+ mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count);
/*
* Save the mc_save_tmp in global mc_saved_data.
@@ -548,12 +524,10 @@ EXPORT_SYMBOL_GPL(save_mc_for_early);
static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
static __init enum ucode_state
-scan_microcode(unsigned long start, unsigned long end,
- struct mc_saved_data *mc_saved_data,
- unsigned long *mc_saved_in_initrd,
- struct ucode_cpu_info *uci)
+scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
+ unsigned long start, unsigned long size,
+ struct ucode_cpu_info *uci)
{
- unsigned int size = end - start + 1;
struct cpio_data cd;
long offset = 0;
#ifdef CONFIG_X86_32
@@ -569,10 +543,8 @@ scan_microcode(unsigned long start, unsigned long end,
if (!cd.data)
return UCODE_ERROR;
-
return get_matching_model_microcode(0, start, cd.data, cd.size,
- mc_saved_data, mc_saved_in_initrd,
- uci);
+ mc_saved_data, initrd, uci);
}
/*
@@ -704,7 +676,7 @@ int __init save_microcode_in_initrd_intel(void)
if (count == 0)
return ret;
- microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count);
+ copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count);
ret = save_microcode(&mc_saved_data, mc_saved, count);
if (ret)
pr_err("Cannot save microcode patches from initrd.\n");
@@ -716,52 +688,44 @@ int __init save_microcode_in_initrd_intel(void)
static void __init
_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
- unsigned long *mc_saved_in_initrd,
- unsigned long initrd_start_early,
- unsigned long initrd_end_early,
- struct ucode_cpu_info *uci)
+ unsigned long *initrd,
+ unsigned long start, unsigned long size)
{
+ struct ucode_cpu_info uci;
enum ucode_state ret;
- collect_cpu_info_early(uci);
- scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data,
- mc_saved_in_initrd, uci);
+ collect_cpu_info_early(&uci);
- ret = load_microcode(mc_saved_data, mc_saved_in_initrd,
- initrd_start_early, uci);
+ ret = scan_microcode(mc_saved_data, initrd, start, size, &uci);
+ if (ret != UCODE_OK)
+ return;
- if (ret == UCODE_OK)
- apply_microcode_early(uci, true);
+ ret = load_microcode(mc_saved_data, initrd, start, &uci);
+ if (ret != UCODE_OK)
+ return;
+
+ apply_microcode_early(&uci, true);
}
-void __init
-load_ucode_intel_bsp(void)
+void __init load_ucode_intel_bsp(void)
{
- u64 ramdisk_image, ramdisk_size;
- unsigned long initrd_start_early, initrd_end_early;
- struct ucode_cpu_info uci;
+ u64 start, size;
#ifdef CONFIG_X86_32
- struct boot_params *boot_params_p;
+ struct boot_params *p;
- boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params);
- ramdisk_image = boot_params_p->hdr.ramdisk_image;
- ramdisk_size = boot_params_p->hdr.ramdisk_size;
- initrd_start_early = ramdisk_image;
- initrd_end_early = initrd_start_early + ramdisk_size;
+ p = (struct boot_params *)__pa_nodebug(&boot_params);
+ start = p->hdr.ramdisk_image;
+ size = p->hdr.ramdisk_size;
_load_ucode_intel_bsp(
- (struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
- (unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
- initrd_start_early, initrd_end_early, &uci);
+ (struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
+ (unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
+ start, size);
#else
- ramdisk_image = boot_params.hdr.ramdisk_image;
- ramdisk_size = boot_params.hdr.ramdisk_size;
- initrd_start_early = ramdisk_image + PAGE_OFFSET;
- initrd_end_early = initrd_start_early + ramdisk_size;
-
- _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd,
- initrd_start_early, initrd_end_early,
- &uci);
+ start = boot_params.hdr.ramdisk_image + PAGE_OFFSET;
+ size = boot_params.hdr.ramdisk_size;
+
+ _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size);
#endif
}
@@ -771,6 +735,7 @@ void load_ucode_intel_ap(void)
struct ucode_cpu_info uci;
unsigned long *mc_saved_in_initrd_p;
unsigned long initrd_start_addr;
+ enum ucode_state ret;
#ifdef CONFIG_X86_32
unsigned long *initrd_start_p;
@@ -793,8 +758,12 @@ void load_ucode_intel_ap(void)
return;
collect_cpu_info_early(&uci);
- load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
- initrd_start_addr, &uci);
+ ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
+ initrd_start_addr, &uci);
+
+ if (ret != UCODE_OK)
+ return;
+
apply_microcode_early(&uci, true);
}
@@ -808,8 +777,8 @@ void reload_ucode_intel(void)
collect_cpu_info_early(&uci);
- ret = generic_load_microcode_early(mc_saved_data.mc_saved,
- mc_saved_data.mc_saved_count, &uci);
+ ret = load_microcode_early(mc_saved_data.mc_saved,
+ mc_saved_data.mc_saved_count, &uci);
if (ret != UCODE_OK)
return;
diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
index ce69320d0179..cd47a510a3f1 100644
--- a/arch/x86/kernel/cpu/microcode/intel_lib.c
+++ b/arch/x86/kernel/cpu/microcode/intel_lib.c
@@ -38,12 +38,6 @@ update_match_cpu(unsigned int csig, unsigned int cpf,
return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1;
}
-int
-update_match_revision(struct microcode_header_intel *mc_header, int rev)
-{
- return (mc_header->rev <= rev) ? 0 : 1;
-}
-
int microcode_sanity_check(void *mc, int print_err)
{
unsigned long total_size, data_size, ext_table_size;
@@ -128,10 +122,9 @@ int microcode_sanity_check(void *mc, int print_err)
EXPORT_SYMBOL_GPL(microcode_sanity_check);
/*
- * return 0 - no update found
- * return 1 - found update
+ * Returns 1 if update has been found, 0 otherwise.
*/
-int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev)
+int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc)
{
struct microcode_header_intel *mc_header = mc;
struct extended_sigtable *ext_header;
@@ -159,16 +152,15 @@ int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev)
}
/*
- * return 0 - no update found
- * return 1 - found update
+ * Returns 1 if update has been found, 0 otherwise.
*/
-int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev)
+int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc)
{
- struct microcode_header_intel *mc_header = mc;
+ struct microcode_header_intel *mc_hdr = mc;
- if (!update_match_revision(mc_header, rev))
+ if (!revision_is_newer(mc_hdr, rev))
return 0;
- return get_matching_sig(csig, cpf, mc, rev);
+ return get_matching_sig(csig, cpf, rev, mc);
}
EXPORT_SYMBOL_GPL(get_matching_microcode);
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index 36d99a337b49..3f20710a5b23 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -6,7 +6,7 @@
IN=$1
OUT=$2
-function dump_array()
+dump_array()
{
ARRAY=$1
SIZE=$2
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b71a7f86d68a..87848ebe2bb7 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -263,6 +263,14 @@ static void hw_perf_event_destroy(struct perf_event *event)
}
}
+void hw_perf_lbr_event_destroy(struct perf_event *event)
+{
+ hw_perf_event_destroy(event);
+
+ /* undo the lbr/bts event accounting */
+ x86_del_exclusive(x86_lbr_exclusive_lbr);
+}
+
static inline int x86_pmu_initialized(void)
{
return x86_pmu.handle_irq != NULL;
@@ -302,6 +310,35 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
return x86_pmu_extra_regs(val, event);
}
+/*
+ * Check if we can create event of a certain type (that no conflicting events
+ * are present).
+ */
+int x86_add_exclusive(unsigned int what)
+{
+ int ret = -EBUSY, i;
+
+ if (atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what]))
+ return 0;
+
+ mutex_lock(&pmc_reserve_mutex);
+ for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++)
+ if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
+ goto out;
+
+ atomic_inc(&x86_pmu.lbr_exclusive[what]);
+ ret = 0;
+
+out:
+ mutex_unlock(&pmc_reserve_mutex);
+ return ret;
+}
+
+void x86_del_exclusive(unsigned int what)
+{
+ atomic_dec(&x86_pmu.lbr_exclusive[what]);
+}
+
int x86_setup_perfctr(struct perf_event *event)
{
struct perf_event_attr *attr = &event->attr;
@@ -346,6 +383,12 @@ int x86_setup_perfctr(struct perf_event *event)
/* BTS is currently only allowed for user-mode. */
if (!attr->exclude_kernel)
return -EOPNOTSUPP;
+
+ /* disallow bts if conflicting events are present */
+ if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+ return -EBUSY;
+
+ event->destroy = hw_perf_lbr_event_destroy;
}
hwc->config |= config;
@@ -399,39 +442,41 @@ int x86_pmu_hw_config(struct perf_event *event)
if (event->attr.precise_ip > precise)
return -EOPNOTSUPP;
- /*
- * check that PEBS LBR correction does not conflict with
- * whatever the user is asking with attr->branch_sample_type
- */
- if (event->attr.precise_ip > 1 &&
- x86_pmu.intel_cap.pebs_format < 2) {
- u64 *br_type = &event->attr.branch_sample_type;
-
- if (has_branch_stack(event)) {
- if (!precise_br_compat(event))
- return -EOPNOTSUPP;
-
- /* branch_sample_type is compatible */
-
- } else {
- /*
- * user did not specify branch_sample_type
- *
- * For PEBS fixups, we capture all
- * the branches at the priv level of the
- * event.
- */
- *br_type = PERF_SAMPLE_BRANCH_ANY;
-
- if (!event->attr.exclude_user)
- *br_type |= PERF_SAMPLE_BRANCH_USER;
-
- if (!event->attr.exclude_kernel)
- *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
- }
+ }
+ /*
+ * check that PEBS LBR correction does not conflict with
+ * whatever the user is asking with attr->branch_sample_type
+ */
+ if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
+ u64 *br_type = &event->attr.branch_sample_type;
+
+ if (has_branch_stack(event)) {
+ if (!precise_br_compat(event))
+ return -EOPNOTSUPP;
+
+ /* branch_sample_type is compatible */
+
+ } else {
+ /*
+ * user did not specify branch_sample_type
+ *
+ * For PEBS fixups, we capture all
+ * the branches at the priv level of the
+ * event.
+ */
+ *br_type = PERF_SAMPLE_BRANCH_ANY;
+
+ if (!event->attr.exclude_user)
+ *br_type |= PERF_SAMPLE_BRANCH_USER;
+
+ if (!event->attr.exclude_kernel)
+ *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
}
}
+ if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
+ event->attach_state |= PERF_ATTACH_TASK_DATA;
+
/*
* Generate PMC IRQs:
* (keep 'enabled' bit clear for now)
@@ -449,6 +494,12 @@ int x86_pmu_hw_config(struct perf_event *event)
if (event->attr.type == PERF_TYPE_RAW)
event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
+ if (event->attr.sample_period && x86_pmu.limit_period) {
+ if (x86_pmu.limit_period(event, event->attr.sample_period) >
+ event->attr.sample_period)
+ return -EINVAL;
+ }
+
return x86_setup_perfctr(event);
}
@@ -728,14 +779,17 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
struct event_constraint *c;
unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
struct perf_event *e;
- int i, wmin, wmax, num = 0;
+ int i, wmin, wmax, unsched = 0;
struct hw_perf_event *hwc;
bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+ if (x86_pmu.start_scheduling)
+ x86_pmu.start_scheduling(cpuc);
+
for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
hwc = &cpuc->event_list[i]->hw;
- c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
+ c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
hwc->constraint = c;
wmin = min(wmin, c->weight);
@@ -768,24 +822,30 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
/* slow path */
if (i != n)
- num = perf_assign_events(cpuc->event_list, n, wmin,
- wmax, assign);
+ unsched = perf_assign_events(cpuc->event_list, n, wmin,
+ wmax, assign);
/*
- * Mark the event as committed, so we do not put_constraint()
- * in case new events are added and fail scheduling.
+ * In case of success (unsched = 0), mark events as committed,
+ * so we do not put_constraint() in case new events are added
+ * and fail to be scheduled
+ *
+ * We invoke the lower level commit callback to lock the resource
+ *
+ * We do not need to do all of this in case we are called to
+ * validate an event group (assign == NULL)
*/
- if (!num && assign) {
+ if (!unsched && assign) {
for (i = 0; i < n; i++) {
e = cpuc->event_list[i];
e->hw.flags |= PERF_X86_EVENT_COMMITTED;
+ if (x86_pmu.commit_scheduling)
+ x86_pmu.commit_scheduling(cpuc, e, assign[i]);
}
}
- /*
- * scheduling failed or is just a simulation,
- * free resources if necessary
- */
- if (!assign || num) {
+
+ if (!assign || unsched) {
+
for (i = 0; i < n; i++) {
e = cpuc->event_list[i];
/*
@@ -795,11 +855,18 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
continue;
+ /*
+ * release events that failed scheduling
+ */
if (x86_pmu.put_event_constraints)
x86_pmu.put_event_constraints(cpuc, e);
}
}
- return num ? -EINVAL : 0;
+
+ if (x86_pmu.stop_scheduling)
+ x86_pmu.stop_scheduling(cpuc);
+
+ return unsched ? -EINVAL : 0;
}
/*
@@ -986,6 +1053,9 @@ int x86_perf_event_set_period(struct perf_event *event)
if (left > x86_pmu.max_period)
left = x86_pmu.max_period;
+ if (x86_pmu.limit_period)
+ left = x86_pmu.limit_period(event, left);
+
per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
/*
@@ -1033,7 +1103,6 @@ static int x86_pmu_add(struct perf_event *event, int flags)
hwc = &event->hw;
- perf_pmu_disable(event->pmu);
n0 = cpuc->n_events;
ret = n = collect_events(cpuc, event, false);
if (ret < 0)
@@ -1071,7 +1140,6 @@ done_collect:
ret = 0;
out:
- perf_pmu_enable(event->pmu);
return ret;
}
@@ -1103,7 +1171,7 @@ static void x86_pmu_start(struct perf_event *event, int flags)
void perf_event_print_debug(void)
{
u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
- u64 pebs;
+ u64 pebs, debugctl;
struct cpu_hw_events *cpuc;
unsigned long flags;
int cpu, idx;
@@ -1121,14 +1189,20 @@ void perf_event_print_debug(void)
rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
- rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
pr_info("\n");
pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
pr_info("CPU#%d: status: %016llx\n", cpu, status);
pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
- pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
+ if (x86_pmu.pebs_constraints) {
+ rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
+ pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
+ }
+ if (x86_pmu.lbr_nr) {
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl);
+ }
}
pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
@@ -1321,11 +1395,12 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
unsigned int cpu = (long)hcpu;
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- int ret = NOTIFY_OK;
+ int i, ret = NOTIFY_OK;
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- cpuc->kfree_on_online = NULL;
+ for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
+ cpuc->kfree_on_online[i] = NULL;
if (x86_pmu.cpu_prepare)
ret = x86_pmu.cpu_prepare(cpu);
break;
@@ -1336,7 +1411,10 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
break;
case CPU_ONLINE:
- kfree(cpuc->kfree_on_online);
+ for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
+ kfree(cpuc->kfree_on_online[i]);
+ cpuc->kfree_on_online[i] = NULL;
+ }
break;
case CPU_DYING:
@@ -1712,7 +1790,7 @@ static int validate_event(struct perf_event *event)
if (IS_ERR(fake_cpuc))
return PTR_ERR(fake_cpuc);
- c = x86_pmu.get_event_constraints(fake_cpuc, event);
+ c = x86_pmu.get_event_constraints(fake_cpuc, -1, event);
if (!c || !c->weight)
ret = -EINVAL;
@@ -1914,10 +1992,10 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
NULL,
};
-static void x86_pmu_flush_branch_stack(void)
+static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
{
- if (x86_pmu.flush_branch_stack)
- x86_pmu.flush_branch_stack();
+ if (x86_pmu.sched_task)
+ x86_pmu.sched_task(ctx, sched_in);
}
void perf_check_microcode(void)
@@ -1949,7 +2027,8 @@ static struct pmu pmu = {
.commit_txn = x86_pmu_commit_txn,
.event_idx = x86_pmu_event_idx,
- .flush_branch_stack = x86_pmu_flush_branch_stack,
+ .sched_task = x86_pmu_sched_task,
+ .task_ctx_size = sizeof(struct x86_perf_task_context),
};
void arch_perf_update_userpage(struct perf_event *event,
@@ -1968,13 +2047,23 @@ void arch_perf_update_userpage(struct perf_event *event,
data = cyc2ns_read_begin();
+ /*
+ * Internal timekeeping for enabled/running/stopped times
+ * is always in the local_clock domain.
+ */
userpg->cap_user_time = 1;
userpg->time_mult = data->cyc2ns_mul;
userpg->time_shift = data->cyc2ns_shift;
userpg->time_offset = data->cyc2ns_offset - now;
- userpg->cap_user_time_zero = 1;
- userpg->time_zero = data->cyc2ns_offset;
+ /*
+ * cap_user_time_zero doesn't make sense when we're using a different
+ * time base for the records.
+ */
+ if (event->clock == &local_clock) {
+ userpg->cap_user_time_zero = 1;
+ userpg->time_zero = data->cyc2ns_offset;
+ }
cyc2ns_read_end(data);
}
@@ -2147,24 +2236,24 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
static unsigned long code_segment_base(struct pt_regs *regs)
{
/*
+ * For IA32 we look at the GDT/LDT segment base to convert the
+ * effective IP to a linear address.
+ */
+
+#ifdef CONFIG_X86_32
+ /*
* If we are in VM86 mode, add the segment offset to convert to a
* linear address.
*/
if (regs->flags & X86_VM_MASK)
return 0x10 * regs->cs;
- /*
- * For IA32 we look at the GDT/LDT segment base to convert the
- * effective IP to a linear address.
- */
-#ifdef CONFIG_X86_32
if (user_mode(regs) && regs->cs != __USER_CS)
return get_segment_base(regs->cs);
#else
- if (test_thread_flag(TIF_IA32)) {
- if (user_mode(regs) && regs->cs != __USER32_CS)
- return get_segment_base(regs->cs);
- }
+ if (user_mode(regs) && !user_64bit_mode(regs) &&
+ regs->cs != __USER32_CS)
+ return get_segment_base(regs->cs);
#endif
return 0;
}
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index df525d2be1e8..329f0356ad4a 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -71,6 +71,8 @@ struct event_constraint {
#define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */
#define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */
#define PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */
+#define PERF_X86_EVENT_EXCL 0x40 /* HT exclusivity on counter */
+#define PERF_X86_EVENT_DYNAMIC 0x80 /* dynamic alloc'd constraint */
#define PERF_X86_EVENT_RDPMC_ALLOWED 0x40 /* grant rdpmc permission */
@@ -123,8 +125,37 @@ struct intel_shared_regs {
unsigned core_id; /* per-core: core id */
};
+enum intel_excl_state_type {
+ INTEL_EXCL_UNUSED = 0, /* counter is unused */
+ INTEL_EXCL_SHARED = 1, /* counter can be used by both threads */
+ INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */
+};
+
+struct intel_excl_states {
+ enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
+ enum intel_excl_state_type state[X86_PMC_IDX_MAX];
+ int num_alloc_cntrs;/* #counters allocated */
+ int max_alloc_cntrs;/* max #counters allowed */
+ bool sched_started; /* true if scheduling has started */
+};
+
+struct intel_excl_cntrs {
+ raw_spinlock_t lock;
+
+ struct intel_excl_states states[2];
+
+ int refcnt; /* per-core: #HT threads */
+ unsigned core_id; /* per-core: core id */
+};
+
#define MAX_LBR_ENTRIES 16
+enum {
+ X86_PERF_KFREE_SHARED = 0,
+ X86_PERF_KFREE_EXCL = 1,
+ X86_PERF_KFREE_MAX
+};
+
struct cpu_hw_events {
/*
* Generic x86 PMC bits
@@ -179,6 +210,12 @@ struct cpu_hw_events {
* used on Intel NHM/WSM/SNB
*/
struct intel_shared_regs *shared_regs;
+ /*
+ * manage exclusive counter access between hyperthread
+ */
+ struct event_constraint *constraint_list; /* in enable order */
+ struct intel_excl_cntrs *excl_cntrs;
+ int excl_thread_id; /* 0 or 1 */
/*
* AMD specific bits
@@ -187,7 +224,7 @@ struct cpu_hw_events {
/* Inverted mask of bits to clear in the perf_ctr ctrl registers */
u64 perf_ctr_virt_mask;
- void *kfree_on_online;
+ void *kfree_on_online[X86_PERF_KFREE_MAX];
};
#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\
@@ -202,6 +239,10 @@ struct cpu_hw_events {
#define EVENT_CONSTRAINT(c, n, m) \
__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)
+#define INTEL_EXCLEVT_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\
+ 0, PERF_X86_EVENT_EXCL)
+
/*
* The overlap flag marks event constraints with overlapping counter
* masks. This is the case if the counter mask of such an event is not
@@ -259,6 +300,10 @@ struct cpu_hw_events {
#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
+#define INTEL_EXCLUEVT_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_EXCL)
+
#define INTEL_PLD_CONSTRAINT(c, n) \
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
@@ -283,22 +328,40 @@ struct cpu_hw_events {
/* Check flags and event code, and set the HSW load flag */
#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \
- __EVENT_CONSTRAINT(code, n, \
+ __EVENT_CONSTRAINT(code, n, \
ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, \
+ PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
/* Check flags and event code/umask, and set the HSW store flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \
__EVENT_CONSTRAINT(code, n, \
INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, \
+ PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL)
+
/* Check flags and event code/umask, and set the HSW load flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \
__EVENT_CONSTRAINT(code, n, \
INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, \
+ PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
/* Check flags and event code/umask, and set the HSW N/A flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
__EVENT_CONSTRAINT(code, n, \
@@ -408,6 +471,13 @@ union x86_pmu_config {
#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
+enum {
+ x86_lbr_exclusive_lbr,
+ x86_lbr_exclusive_bts,
+ x86_lbr_exclusive_pt,
+ x86_lbr_exclusive_max,
+};
+
/*
* struct x86_pmu - generic x86 pmu
*/
@@ -443,14 +513,25 @@ struct x86_pmu {
u64 max_period;
struct event_constraint *
(*get_event_constraints)(struct cpu_hw_events *cpuc,
+ int idx,
struct perf_event *event);
void (*put_event_constraints)(struct cpu_hw_events *cpuc,
struct perf_event *event);
+
+ void (*commit_scheduling)(struct cpu_hw_events *cpuc,
+ struct perf_event *event,
+ int cntr);
+
+ void (*start_scheduling)(struct cpu_hw_events *cpuc);
+
+ void (*stop_scheduling)(struct cpu_hw_events *cpuc);
+
struct event_constraint *event_constraints;
struct x86_pmu_quirk *quirks;
int perfctr_second_write;
bool late_ack;
+ unsigned (*limit_period)(struct perf_event *event, unsigned l);
/*
* sysfs attrs
@@ -472,7 +553,8 @@ struct x86_pmu {
void (*cpu_dead)(int cpu);
void (*check_microcode)(void);
- void (*flush_branch_stack)(void);
+ void (*sched_task)(struct perf_event_context *ctx,
+ bool sched_in);
/*
* Intel Arch Perfmon v2+
@@ -504,10 +586,15 @@ struct x86_pmu {
bool lbr_double_abort; /* duplicated lbr aborts */
/*
+ * Intel PT/LBR/BTS are exclusive
+ */
+ atomic_t lbr_exclusive[x86_lbr_exclusive_max];
+
+ /*
* Extra registers for events
*/
struct extra_reg *extra_regs;
- unsigned int er_flags;
+ unsigned int flags;
/*
* Intel host/guest support (KVM)
@@ -515,6 +602,13 @@ struct x86_pmu {
struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
};
+struct x86_perf_task_context {
+ u64 lbr_from[MAX_LBR_ENTRIES];
+ u64 lbr_to[MAX_LBR_ENTRIES];
+ int lbr_callstack_users;
+ int lbr_stack_state;
+};
+
#define x86_add_quirk(func_) \
do { \
static struct x86_pmu_quirk __quirk __initdata = { \
@@ -524,8 +618,13 @@ do { \
x86_pmu.quirks = &__quirk; \
} while (0)
-#define ERF_NO_HT_SHARING 1
-#define ERF_HAS_RSP_1 2
+/*
+ * x86_pmu flags
+ */
+#define PMU_FL_NO_HT_SHARING 0x1 /* no hyper-threading resource sharing */
+#define PMU_FL_HAS_RSP_1 0x2 /* has 2 equivalent offcore_rsp regs */
+#define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */
+#define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */
#define EVENT_VAR(_id) event_attr_##_id
#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
@@ -546,6 +645,12 @@ static struct perf_pmu_events_attr event_attr_##v = { \
extern struct x86_pmu x86_pmu __read_mostly;
+static inline bool x86_pmu_has_lbr_callstack(void)
+{
+ return x86_pmu.lbr_sel_map &&
+ x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
+}
+
DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
int x86_perf_event_set_period(struct perf_event *event);
@@ -588,6 +693,12 @@ static inline int x86_pmu_rdpmc_index(int index)
return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
}
+int x86_add_exclusive(unsigned int what);
+
+void x86_del_exclusive(unsigned int what);
+
+void hw_perf_lbr_event_destroy(struct perf_event *event);
+
int x86_setup_perfctr(struct perf_event *event);
int x86_pmu_hw_config(struct perf_event *event);
@@ -674,10 +785,34 @@ static inline int amd_pmu_init(void)
#ifdef CONFIG_CPU_SUP_INTEL
+static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
+{
+ /* user explicitly requested branch sampling */
+ if (has_branch_stack(event))
+ return true;
+
+ /* implicit branch sampling to correct PEBS skid */
+ if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
+ x86_pmu.intel_cap.pebs_format < 2)
+ return true;
+
+ return false;
+}
+
+static inline bool intel_pmu_has_bts(struct perf_event *event)
+{
+ if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
+ !event->attr.freq && event->hw.sample_period == 1)
+ return true;
+
+ return false;
+}
+
int intel_pmu_save_and_restart(struct perf_event *event);
struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event);
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event);
struct intel_shared_regs *allocate_shared_regs(int cpu);
@@ -727,13 +862,15 @@ void intel_pmu_pebs_disable_all(void);
void intel_ds_init(void);
+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
+
void intel_pmu_lbr_reset(void);
void intel_pmu_lbr_enable(struct perf_event *event);
void intel_pmu_lbr_disable(struct perf_event *event);
-void intel_pmu_lbr_enable_all(void);
+void intel_pmu_lbr_enable_all(bool pmi);
void intel_pmu_lbr_disable_all(void);
@@ -747,8 +884,18 @@ void intel_pmu_lbr_init_atom(void);
void intel_pmu_lbr_init_snb(void);
+void intel_pmu_lbr_init_hsw(void);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
+void intel_pt_interrupt(void);
+
+int intel_bts_interrupt(void);
+
+void intel_bts_enable_local(void);
+
+void intel_bts_disable_local(void);
+
int p4_pmu_init(void);
int p6_pmu_init(void);
@@ -758,6 +905,10 @@ int knc_pmu_init(void);
ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
char *page);
+static inline int is_ht_workaround_enabled(void)
+{
+ return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED);
+}
#else /* CONFIG_CPU_SUP_INTEL */
static inline void reserve_ds_buffers(void)
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 28926311aac1..1cee5d2d7ece 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -382,6 +382,7 @@ static int amd_pmu_cpu_prepare(int cpu)
static void amd_pmu_cpu_starting(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
struct amd_nb *nb;
int i, nb_id;
@@ -399,7 +400,7 @@ static void amd_pmu_cpu_starting(int cpu)
continue;
if (nb->nb_id == nb_id) {
- cpuc->kfree_on_online = cpuc->amd_nb;
+ *onln = cpuc->amd_nb;
cpuc->amd_nb = nb;
break;
}
@@ -429,7 +430,8 @@ static void amd_pmu_cpu_dead(int cpu)
}
static struct event_constraint *
-amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
{
/*
* if not NB event or no NB, then no constraints
@@ -537,7 +539,8 @@ static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
static struct event_constraint *
-amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
+amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
unsigned int event_code = amd_get_event_code(hwc);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index a61f5c6911da..989d3c215d2b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -796,7 +796,7 @@ static int setup_ibs_ctl(int ibs_eilvt_off)
* the IBS interrupt vector is handled by perf_ibs_cpu_notifier that
* is using the new offset.
*/
-static int force_ibs_eilvt_setup(void)
+static void force_ibs_eilvt_setup(void)
{
int offset;
int ret;
@@ -811,26 +811,24 @@ static int force_ibs_eilvt_setup(void)
if (offset == APIC_EILVT_NR_MAX) {
printk(KERN_DEBUG "No EILVT entry available\n");
- return -EBUSY;
+ return;
}
ret = setup_ibs_ctl(offset);
if (ret)
goto out;
- if (!ibs_eilvt_valid()) {
- ret = -EFAULT;
+ if (!ibs_eilvt_valid())
goto out;
- }
pr_info("IBS: LVT offset %d assigned\n", offset);
- return 0;
+ return;
out:
preempt_disable();
put_eilvt(offset);
preempt_enable();
- return ret;
+ return;
}
static void ibs_eilvt_setup(void)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 258990688a5e..9da2400c2ec3 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -12,6 +12,7 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/watchdog.h>
#include <asm/cpufeature.h>
#include <asm/hardirq.h>
@@ -113,6 +114,12 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
EVENT_CONSTRAINT_END
};
@@ -131,15 +138,12 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
- /*
- * Errata BV98 -- MEM_*_RETIRED events can leak between counters of SMT
- * siblings; disable these events because they can corrupt unrelated
- * counters.
- */
- INTEL_EVENT_CONSTRAINT(0xd0, 0x0), /* MEM_UOPS_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd1, 0x0), /* MEM_LOAD_UOPS_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd2, 0x0), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd3, 0x0), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
EVENT_CONSTRAINT_END
};
@@ -217,6 +221,21 @@ static struct event_constraint intel_hsw_event_constraints[] = {
INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
/* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),
+
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_bdw_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
+ INTEL_EVENT_CONSTRAINT(0xa3, 0x4), /* CYCLE_ACTIVITY.* */
EVENT_CONSTRAINT_END
};
@@ -415,6 +434,202 @@ static __initconst const u64 snb_hw_cache_event_ids
};
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts because they are not
+ * reliably counted.
+ */
+
+#define HSW_DEMAND_DATA_RD BIT_ULL(0)
+#define HSW_DEMAND_RFO BIT_ULL(1)
+#define HSW_ANY_RESPONSE BIT_ULL(16)
+#define HSW_SUPPLIER_NONE BIT_ULL(17)
+#define HSW_L3_MISS_LOCAL_DRAM BIT_ULL(22)
+#define HSW_L3_MISS_REMOTE_HOP0 BIT_ULL(27)
+#define HSW_L3_MISS_REMOTE_HOP1 BIT_ULL(28)
+#define HSW_L3_MISS_REMOTE_HOP2P BIT_ULL(29)
+#define HSW_L3_MISS (HSW_L3_MISS_LOCAL_DRAM| \
+ HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+ HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_SNOOP_NONE BIT_ULL(31)
+#define HSW_SNOOP_NOT_NEEDED BIT_ULL(32)
+#define HSW_SNOOP_MISS BIT_ULL(33)
+#define HSW_SNOOP_HIT_NO_FWD BIT_ULL(34)
+#define HSW_SNOOP_HIT_WITH_FWD BIT_ULL(35)
+#define HSW_SNOOP_HITM BIT_ULL(36)
+#define HSW_SNOOP_NON_DRAM BIT_ULL(37)
+#define HSW_ANY_SNOOP (HSW_SNOOP_NONE| \
+ HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
+ HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
+ HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
+#define HSW_SNOOP_DRAM (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)
+#define HSW_DEMAND_READ HSW_DEMAND_DATA_RD
+#define HSW_DEMAND_WRITE HSW_DEMAND_RFO
+#define HSW_L3_MISS_REMOTE (HSW_L3_MISS_REMOTE_HOP0|\
+ HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_LLC_ACCESS HSW_ANY_RESPONSE
+
+#define BDW_L3_MISS_LOCAL BIT(26)
+#define BDW_L3_MISS (BDW_L3_MISS_LOCAL| \
+ HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+ HSW_L3_MISS_REMOTE_HOP2P)
+
+
+static __initconst const u64 hsw_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */
+ [ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
+static __initconst const u64 hsw_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+ HSW_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS|HSW_ANY_SNOOP,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+ HSW_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS|HSW_ANY_SNOOP,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS_LOCAL_DRAM|
+ HSW_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS_REMOTE|
+ HSW_SNOOP_DRAM,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS_LOCAL_DRAM|
+ HSW_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS_REMOTE|
+ HSW_SNOOP_DRAM,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
static __initconst const u64 westmere_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -1029,21 +1244,10 @@ static __initconst const u64 slm_hw_cache_event_ids
},
};
-static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
-{
- /* user explicitly requested branch sampling */
- if (has_branch_stack(event))
- return true;
-
- /* implicit branch sampling to correct PEBS skid */
- if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
- x86_pmu.intel_cap.pebs_format < 2)
- return true;
-
- return false;
-}
-
-static void intel_pmu_disable_all(void)
+/*
+ * Use from PMIs where the LBRs are already disabled.
+ */
+static void __intel_pmu_disable_all(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1051,17 +1255,24 @@ static void intel_pmu_disable_all(void)
if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
intel_pmu_disable_bts();
+ else
+ intel_bts_disable_local();
intel_pmu_pebs_disable_all();
+}
+
+static void intel_pmu_disable_all(void)
+{
+ __intel_pmu_disable_all();
intel_pmu_lbr_disable_all();
}
-static void intel_pmu_enable_all(int added)
+static void __intel_pmu_enable_all(int added, bool pmi)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
intel_pmu_pebs_enable_all();
- intel_pmu_lbr_enable_all();
+ intel_pmu_lbr_enable_all(pmi);
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
@@ -1073,7 +1284,13 @@ static void intel_pmu_enable_all(int added)
return;
intel_pmu_enable_bts(event->hw.config);
- }
+ } else
+ intel_bts_enable_local();
+}
+
+static void intel_pmu_enable_all(int added)
+{
+ __intel_pmu_enable_all(added, false);
}
/*
@@ -1207,7 +1424,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
* must disable before any actual event
* because any event may be combined with LBR
*/
- if (intel_pmu_needs_lbr_smpl(event))
+ if (needs_branch_stack(event))
intel_pmu_lbr_disable(event);
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
@@ -1268,7 +1485,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
* must enabled before any actual event
* because any event may be combined with LBR
*/
- if (intel_pmu_needs_lbr_smpl(event))
+ if (needs_branch_stack(event))
intel_pmu_lbr_enable(event);
if (event->attr.exclude_host)
@@ -1334,6 +1551,18 @@ static void intel_pmu_reset(void)
if (ds)
ds->bts_index = ds->bts_buffer_base;
+ /* Ack all overflows and disable fixed counters */
+ if (x86_pmu.version >= 2) {
+ intel_pmu_ack_status(intel_pmu_get_status());
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+ }
+
+ /* Reset LBRs and LBR freezing */
+ if (x86_pmu.lbr_nr) {
+ update_debugctlmsr(get_debugctlmsr() &
+ ~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR));
+ }
+
local_irq_restore(flags);
}
@@ -1357,8 +1586,9 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
*/
if (!x86_pmu.late_ack)
apic_write(APIC_LVTPC, APIC_DM_NMI);
- intel_pmu_disable_all();
+ __intel_pmu_disable_all();
handled = intel_pmu_drain_bts_buffer();
+ handled += intel_bts_interrupt();
status = intel_pmu_get_status();
if (!status)
goto done;
@@ -1399,6 +1629,14 @@ again:
}
/*
+ * Intel PT
+ */
+ if (__test_and_clear_bit(55, (unsigned long *)&status)) {
+ handled++;
+ intel_pt_interrupt();
+ }
+
+ /*
* Checkpointed counters can lead to 'spurious' PMIs because the
* rollback caused by the PMI will have cleared the overflow status
* bit. Therefore always force probe these counters.
@@ -1433,7 +1671,7 @@ again:
goto again;
done:
- intel_pmu_enable_all(0);
+ __intel_pmu_enable_all(0, true);
/*
* Only unmask the NMI after the overflow counters
* have been reset. This avoids spurious NMIs on
@@ -1464,7 +1702,7 @@ intel_bts_constraints(struct perf_event *event)
static int intel_alt_er(int idx)
{
- if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
+ if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
return idx;
if (idx == EXTRA_REG_RSP_0)
@@ -1624,7 +1862,8 @@ intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
}
struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
{
struct event_constraint *c;
@@ -1641,7 +1880,8 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
}
static struct event_constraint *
-intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
{
struct event_constraint *c;
@@ -1657,7 +1897,278 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
if (c)
return c;
- return x86_get_event_constraints(cpuc, event);
+ return x86_get_event_constraints(cpuc, idx, event);
+}
+
+static void
+intel_start_scheduling(struct cpu_hw_events *cpuc)
+{
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct intel_excl_states *xl, *xlo;
+ int tid = cpuc->excl_thread_id;
+ int o_tid = 1 - tid; /* sibling thread */
+
+ /*
+ * nothing needed if in group validation mode
+ */
+ if (cpuc->is_fake || !is_ht_workaround_enabled())
+ return;
+
+ /*
+ * no exclusion needed
+ */
+ if (!excl_cntrs)
+ return;
+
+ xlo = &excl_cntrs->states[o_tid];
+ xl = &excl_cntrs->states[tid];
+
+ xl->sched_started = true;
+ xl->num_alloc_cntrs = 0;
+ /*
+ * lock shared state until we are done scheduling
+ * in stop_event_scheduling()
+ * makes scheduling appear as a transaction
+ */
+ WARN_ON_ONCE(!irqs_disabled());
+ raw_spin_lock(&excl_cntrs->lock);
+
+ /*
+ * save initial state of sibling thread
+ */
+ memcpy(xlo->init_state, xlo->state, sizeof(xlo->init_state));
+}
+
+static void
+intel_stop_scheduling(struct cpu_hw_events *cpuc)
+{
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct intel_excl_states *xl, *xlo;
+ int tid = cpuc->excl_thread_id;
+ int o_tid = 1 - tid; /* sibling thread */
+
+ /*
+ * nothing needed if in group validation mode
+ */
+ if (cpuc->is_fake || !is_ht_workaround_enabled())
+ return;
+ /*
+ * no exclusion needed
+ */
+ if (!excl_cntrs)
+ return;
+
+ xlo = &excl_cntrs->states[o_tid];
+ xl = &excl_cntrs->states[tid];
+
+ /*
+ * make new sibling thread state visible
+ */
+ memcpy(xlo->state, xlo->init_state, sizeof(xlo->state));
+
+ xl->sched_started = false;
+ /*
+ * release shared state lock (acquired in intel_start_scheduling())
+ */
+ raw_spin_unlock(&excl_cntrs->lock);
+}
+
+static struct event_constraint *
+intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+ int idx, struct event_constraint *c)
+{
+ struct event_constraint *cx;
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct intel_excl_states *xl, *xlo;
+ int is_excl, i;
+ int tid = cpuc->excl_thread_id;
+ int o_tid = 1 - tid; /* alternate */
+
+ /*
+ * validating a group does not require
+ * enforcing cross-thread exclusion
+ */
+ if (cpuc->is_fake || !is_ht_workaround_enabled())
+ return c;
+
+ /*
+ * no exclusion needed
+ */
+ if (!excl_cntrs)
+ return c;
+ /*
+ * event requires exclusive counter access
+ * across HT threads
+ */
+ is_excl = c->flags & PERF_X86_EVENT_EXCL;
+
+ /*
+ * xl = state of current HT
+ * xlo = state of sibling HT
+ */
+ xl = &excl_cntrs->states[tid];
+ xlo = &excl_cntrs->states[o_tid];
+
+ /*
+ * do not allow scheduling of more than max_alloc_cntrs
+ * which is set to half the available generic counters.
+ * this helps avoid counter starvation of sibling thread
+ * by ensuring at most half the counters cannot be in
+ * exclusive mode. There is not designated counters for the
+ * limits. Any N/2 counters can be used. This helps with
+ * events with specifix counter constraints
+ */
+ if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs)
+ return &emptyconstraint;
+
+ cx = c;
+
+ /*
+ * because we modify the constraint, we need
+ * to make a copy. Static constraints come
+ * from static const tables.
+ *
+ * only needed when constraint has not yet
+ * been cloned (marked dynamic)
+ */
+ if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
+
+ /* sanity check */
+ if (idx < 0)
+ return &emptyconstraint;
+
+ /*
+ * grab pre-allocated constraint entry
+ */
+ cx = &cpuc->constraint_list[idx];
+
+ /*
+ * initialize dynamic constraint
+ * with static constraint
+ */
+ memcpy(cx, c, sizeof(*cx));
+
+ /*
+ * mark constraint as dynamic, so we
+ * can free it later on
+ */
+ cx->flags |= PERF_X86_EVENT_DYNAMIC;
+ }
+
+ /*
+ * From here on, the constraint is dynamic.
+ * Either it was just allocated above, or it
+ * was allocated during a earlier invocation
+ * of this function
+ */
+
+ /*
+ * Modify static constraint with current dynamic
+ * state of thread
+ *
+ * EXCLUSIVE: sibling counter measuring exclusive event
+ * SHARED : sibling counter measuring non-exclusive event
+ * UNUSED : sibling counter unused
+ */
+ for_each_set_bit(i, cx->idxmsk, X86_PMC_IDX_MAX) {
+ /*
+ * exclusive event in sibling counter
+ * our corresponding counter cannot be used
+ * regardless of our event
+ */
+ if (xl->state[i] == INTEL_EXCL_EXCLUSIVE)
+ __clear_bit(i, cx->idxmsk);
+ /*
+ * if measuring an exclusive event, sibling
+ * measuring non-exclusive, then counter cannot
+ * be used
+ */
+ if (is_excl && xl->state[i] == INTEL_EXCL_SHARED)
+ __clear_bit(i, cx->idxmsk);
+ }
+
+ /*
+ * recompute actual bit weight for scheduling algorithm
+ */
+ cx->weight = hweight64(cx->idxmsk64);
+
+ /*
+ * if we return an empty mask, then switch
+ * back to static empty constraint to avoid
+ * the cost of freeing later on
+ */
+ if (cx->weight == 0)
+ cx = &emptyconstraint;
+
+ return cx;
+}
+
+static struct event_constraint *
+intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c1 = event->hw.constraint;
+ struct event_constraint *c2;
+
+ /*
+ * first time only
+ * - static constraint: no change across incremental scheduling calls
+ * - dynamic constraint: handled by intel_get_excl_constraints()
+ */
+ c2 = __intel_get_event_constraints(cpuc, idx, event);
+ if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) {
+ bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX);
+ c1->weight = c2->weight;
+ c2 = c1;
+ }
+
+ if (cpuc->excl_cntrs)
+ return intel_get_excl_constraints(cpuc, event, idx, c2);
+
+ return c2;
+}
+
+static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct intel_excl_states *xlo, *xl;
+ unsigned long flags = 0; /* keep compiler happy */
+ int tid = cpuc->excl_thread_id;
+ int o_tid = 1 - tid;
+
+ /*
+ * nothing needed if in group validation mode
+ */
+ if (cpuc->is_fake)
+ return;
+
+ WARN_ON_ONCE(!excl_cntrs);
+
+ if (!excl_cntrs)
+ return;
+
+ xl = &excl_cntrs->states[tid];
+ xlo = &excl_cntrs->states[o_tid];
+
+ /*
+ * put_constraint may be called from x86_schedule_events()
+ * which already has the lock held so here make locking
+ * conditional
+ */
+ if (!xl->sched_started)
+ raw_spin_lock_irqsave(&excl_cntrs->lock, flags);
+
+ /*
+ * if event was actually assigned, then mark the
+ * counter state as unused now
+ */
+ if (hwc->idx >= 0)
+ xlo->state[hwc->idx] = INTEL_EXCL_UNUSED;
+
+ if (!xl->sched_started)
+ raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags);
}
static void
@@ -1678,7 +2189,57 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
struct perf_event *event)
{
+ struct event_constraint *c = event->hw.constraint;
+
intel_put_shared_regs_event_constraints(cpuc, event);
+
+ /*
+ * is PMU has exclusive counter restrictions, then
+ * all events are subject to and must call the
+ * put_excl_constraints() routine
+ */
+ if (c && cpuc->excl_cntrs)
+ intel_put_excl_constraints(cpuc, event);
+
+ /* cleanup dynamic constraint */
+ if (c && (c->flags & PERF_X86_EVENT_DYNAMIC))
+ event->hw.constraint = NULL;
+}
+
+static void intel_commit_scheduling(struct cpu_hw_events *cpuc,
+ struct perf_event *event, int cntr)
+{
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct event_constraint *c = event->hw.constraint;
+ struct intel_excl_states *xlo, *xl;
+ int tid = cpuc->excl_thread_id;
+ int o_tid = 1 - tid;
+ int is_excl;
+
+ if (cpuc->is_fake || !c)
+ return;
+
+ is_excl = c->flags & PERF_X86_EVENT_EXCL;
+
+ if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
+ return;
+
+ WARN_ON_ONCE(!excl_cntrs);
+
+ if (!excl_cntrs)
+ return;
+
+ xl = &excl_cntrs->states[tid];
+ xlo = &excl_cntrs->states[o_tid];
+
+ WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock));
+
+ if (cntr >= 0) {
+ if (is_excl)
+ xlo->init_state[cntr] = INTEL_EXCL_EXCLUSIVE;
+ else
+ xlo->init_state[cntr] = INTEL_EXCL_SHARED;
+ }
}
static void intel_pebs_aliases_core2(struct perf_event *event)
@@ -1747,10 +2308,21 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (event->attr.precise_ip && x86_pmu.pebs_aliases)
x86_pmu.pebs_aliases(event);
- if (intel_pmu_needs_lbr_smpl(event)) {
+ if (needs_branch_stack(event)) {
ret = intel_pmu_setup_lbr_filter(event);
if (ret)
return ret;
+
+ /*
+ * BTS is set up earlier in this path, so don't account twice
+ */
+ if (!intel_pmu_has_bts(event)) {
+ /* disallow lbr if conflicting events are present */
+ if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+ return -EBUSY;
+
+ event->destroy = hw_perf_lbr_event_destroy;
+ }
}
if (event->attr.type != PERF_TYPE_RAW)
@@ -1891,9 +2463,12 @@ static struct event_constraint counter2_constraint =
EVENT_CONSTRAINT(0, 0x4, 0);
static struct event_constraint *
-hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
{
- struct event_constraint *c = intel_get_event_constraints(cpuc, event);
+ struct event_constraint *c;
+
+ c = intel_get_event_constraints(cpuc, idx, event);
/* Handle special quirk on in_tx_checkpointed only in counter 2 */
if (event->hw.config & HSW_IN_TX_CHECKPOINTED) {
@@ -1905,6 +2480,32 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
return c;
}
+/*
+ * Broadwell:
+ *
+ * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared
+ * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
+ * the two to enforce a minimum period of 128 (the smallest value that has bits
+ * 0-5 cleared and >= 100).
+ *
+ * Because of how the code in x86_perf_event_set_period() works, the truncation
+ * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
+ * to make up for the 'lost' events due to carrying the 'error' in period_left.
+ *
+ * Therefore the effective (average) period matches the requested period,
+ * despite coarser hardware granularity.
+ */
+static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
+{
+ if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+ X86_CONFIG(.event=0xc0, .umask=0x01)) {
+ if (left < 128)
+ left = 128;
+ left &= ~0x3fu;
+ }
+ return left;
+}
+
PMU_FORMAT_ATTR(event, "config:0-7" );
PMU_FORMAT_ATTR(umask, "config:8-15" );
PMU_FORMAT_ATTR(edge, "config:18" );
@@ -1979,16 +2580,52 @@ struct intel_shared_regs *allocate_shared_regs(int cpu)
return regs;
}
+static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
+{
+ struct intel_excl_cntrs *c;
+ int i;
+
+ c = kzalloc_node(sizeof(struct intel_excl_cntrs),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (c) {
+ raw_spin_lock_init(&c->lock);
+ for (i = 0; i < X86_PMC_IDX_MAX; i++) {
+ c->states[0].state[i] = INTEL_EXCL_UNUSED;
+ c->states[0].init_state[i] = INTEL_EXCL_UNUSED;
+
+ c->states[1].state[i] = INTEL_EXCL_UNUSED;
+ c->states[1].init_state[i] = INTEL_EXCL_UNUSED;
+ }
+ c->core_id = -1;
+ }
+ return c;
+}
+
static int intel_pmu_cpu_prepare(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map))
- return NOTIFY_OK;
+ if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
+ cpuc->shared_regs = allocate_shared_regs(cpu);
+ if (!cpuc->shared_regs)
+ return NOTIFY_BAD;
+ }
- cpuc->shared_regs = allocate_shared_regs(cpu);
- if (!cpuc->shared_regs)
- return NOTIFY_BAD;
+ if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+ size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
+
+ cpuc->constraint_list = kzalloc(sz, GFP_KERNEL);
+ if (!cpuc->constraint_list)
+ return NOTIFY_BAD;
+
+ cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
+ if (!cpuc->excl_cntrs) {
+ kfree(cpuc->constraint_list);
+ kfree(cpuc->shared_regs);
+ return NOTIFY_BAD;
+ }
+ cpuc->excl_thread_id = 0;
+ }
return NOTIFY_OK;
}
@@ -2010,13 +2647,15 @@ static void intel_pmu_cpu_starting(int cpu)
if (!cpuc->shared_regs)
return;
- if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) {
+ if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
+ void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
+
for_each_cpu(i, topology_thread_cpumask(cpu)) {
struct intel_shared_regs *pc;
pc = per_cpu(cpu_hw_events, i).shared_regs;
if (pc && pc->core_id == core_id) {
- cpuc->kfree_on_online = cpuc->shared_regs;
+ *onln = cpuc->shared_regs;
cpuc->shared_regs = pc;
break;
}
@@ -2027,6 +2666,44 @@ static void intel_pmu_cpu_starting(int cpu)
if (x86_pmu.lbr_sel_map)
cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
+
+ if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+ int h = x86_pmu.num_counters >> 1;
+
+ for_each_cpu(i, topology_thread_cpumask(cpu)) {
+ struct intel_excl_cntrs *c;
+
+ c = per_cpu(cpu_hw_events, i).excl_cntrs;
+ if (c && c->core_id == core_id) {
+ cpuc->kfree_on_online[1] = cpuc->excl_cntrs;
+ cpuc->excl_cntrs = c;
+ cpuc->excl_thread_id = 1;
+ break;
+ }
+ }
+ cpuc->excl_cntrs->core_id = core_id;
+ cpuc->excl_cntrs->refcnt++;
+ /*
+ * set hard limit to half the number of generic counters
+ */
+ cpuc->excl_cntrs->states[0].max_alloc_cntrs = h;
+ cpuc->excl_cntrs->states[1].max_alloc_cntrs = h;
+ }
+}
+
+static void free_excl_cntrs(int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ struct intel_excl_cntrs *c;
+
+ c = cpuc->excl_cntrs;
+ if (c) {
+ if (c->core_id == -1 || --c->refcnt == 0)
+ kfree(c);
+ cpuc->excl_cntrs = NULL;
+ kfree(cpuc->constraint_list);
+ cpuc->constraint_list = NULL;
+ }
}
static void intel_pmu_cpu_dying(int cpu)
@@ -2041,19 +2718,9 @@ static void intel_pmu_cpu_dying(int cpu)
cpuc->shared_regs = NULL;
}
- fini_debug_store_on_cpu(cpu);
-}
+ free_excl_cntrs(cpu);
-static void intel_pmu_flush_branch_stack(void)
-{
- /*
- * Intel LBR does not tag entries with the
- * PID of the current task, then we need to
- * flush it on ctxsw
- * For now, we simply reset it
- */
- if (x86_pmu.lbr_nr)
- intel_pmu_lbr_reset();
+ fini_debug_store_on_cpu(cpu);
}
PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
@@ -2107,7 +2774,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.cpu_starting = intel_pmu_cpu_starting,
.cpu_dying = intel_pmu_cpu_dying,
.guest_get_msrs = intel_guest_get_msrs,
- .flush_branch_stack = intel_pmu_flush_branch_stack,
+ .sched_task = intel_pmu_lbr_sched_task,
};
static __init void intel_clovertown_quirk(void)
@@ -2264,6 +2931,27 @@ static __init void intel_nehalem_quirk(void)
}
}
+/*
+ * enable software workaround for errata:
+ * SNB: BJ122
+ * IVB: BV98
+ * HSW: HSD29
+ *
+ * Only needed when HT is enabled. However detecting
+ * if HT is enabled is difficult (model specific). So instead,
+ * we enable the workaround in the early boot, and verify if
+ * it is needed in a later initcall phase once we have valid
+ * topology information to check if HT is actually enabled
+ */
+static __init void intel_ht_bug(void)
+{
+ x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
+
+ x86_pmu.commit_scheduling = intel_commit_scheduling;
+ x86_pmu.start_scheduling = intel_start_scheduling;
+ x86_pmu.stop_scheduling = intel_stop_scheduling;
+}
+
EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3");
EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82")
@@ -2443,7 +3131,7 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_slm_event_constraints;
x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
x86_pmu.extra_regs = intel_slm_extra_regs;
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
pr_cont("Silvermont events, ");
break;
@@ -2461,7 +3149,7 @@ __init int intel_pmu_init(void)
x86_pmu.enable_all = intel_pmu_nhm_enable_all;
x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
x86_pmu.extra_regs = intel_westmere_extra_regs;
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.cpu_events = nhm_events_attrs;
@@ -2478,6 +3166,7 @@ __init int intel_pmu_init(void)
case 42: /* 32nm SandyBridge */
case 45: /* 32nm SandyBridge-E/EN/EP */
x86_add_quirk(intel_sandybridge_quirk);
+ x86_add_quirk(intel_ht_bug);
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
@@ -2492,9 +3181,11 @@ __init int intel_pmu_init(void)
x86_pmu.extra_regs = intel_snbep_extra_regs;
else
x86_pmu.extra_regs = intel_snb_extra_regs;
+
+
/* all extra regs are per-cpu when HT is on */
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
- x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
x86_pmu.cpu_events = snb_events_attrs;
@@ -2510,6 +3201,7 @@ __init int intel_pmu_init(void)
case 58: /* 22nm IvyBridge */
case 62: /* 22nm IvyBridge-EP/EX */
+ x86_add_quirk(intel_ht_bug);
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
/* dTLB-load-misses on IVB is different than SNB */
@@ -2528,8 +3220,8 @@ __init int intel_pmu_init(void)
else
x86_pmu.extra_regs = intel_snb_extra_regs;
/* all extra regs are per-cpu when HT is on */
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
- x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
x86_pmu.cpu_events = snb_events_attrs;
@@ -2545,19 +3237,20 @@ __init int intel_pmu_init(void)
case 63: /* 22nm Haswell Server */
case 69: /* 22nm Haswell ULT */
case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+ x86_add_quirk(intel_ht_bug);
x86_pmu.late_ack = true;
- memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
- memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
- intel_pmu_lbr_init_snb();
+ intel_pmu_lbr_init_hsw();
x86_pmu.event_constraints = intel_hsw_event_constraints;
x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
x86_pmu.extra_regs = intel_snbep_extra_regs;
x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
/* all extra regs are per-cpu when HT is on */
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
- x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = hsw_get_event_constraints;
@@ -2566,6 +3259,39 @@ __init int intel_pmu_init(void)
pr_cont("Haswell events, ");
break;
+ case 61: /* 14nm Broadwell Core-M */
+ case 86: /* 14nm Broadwell Xeon D */
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+ /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
+ hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
+ BDW_L3_MISS|HSW_SNOOP_DRAM;
+ hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
+ HSW_SNOOP_DRAM;
+ hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
+ BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+ hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
+ BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+
+ intel_pmu_lbr_init_snb();
+
+ x86_pmu.event_constraints = intel_bdw_event_constraints;
+ x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_snbep_extra_regs;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.cpu_events = hsw_events_attrs;
+ x86_pmu.limit_period = bdw_limit_period;
+ pr_cont("Broadwell events, ");
+ break;
+
default:
switch (x86_pmu.version) {
case 1:
@@ -2651,3 +3377,47 @@ __init int intel_pmu_init(void)
return 0;
}
+
+/*
+ * HT bug: phase 2 init
+ * Called once we have valid topology information to check
+ * whether or not HT is enabled
+ * If HT is off, then we disable the workaround
+ */
+static __init int fixup_ht_bug(void)
+{
+ int cpu = smp_processor_id();
+ int w, c;
+ /*
+ * problem not present on this CPU model, nothing to do
+ */
+ if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
+ return 0;
+
+ w = cpumask_weight(topology_thread_cpumask(cpu));
+ if (w > 1) {
+ pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
+ return 0;
+ }
+
+ watchdog_nmi_disable_all();
+
+ x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
+
+ x86_pmu.commit_scheduling = NULL;
+ x86_pmu.start_scheduling = NULL;
+ x86_pmu.stop_scheduling = NULL;
+
+ watchdog_nmi_enable_all();
+
+ get_online_cpus();
+
+ for_each_online_cpu(c) {
+ free_excl_cntrs(c);
+ }
+
+ put_online_cpus();
+ pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
+ return 0;
+}
+subsys_initcall(fixup_ht_bug)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c
new file mode 100644
index 000000000000..ac1f0c55f379
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c
@@ -0,0 +1,525 @@
+/*
+ * BTS PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/coredump.h>
+
+#include <asm-generic/sizes.h>
+#include <asm/perf_event.h>
+
+#include "perf_event.h"
+
+struct bts_ctx {
+ struct perf_output_handle handle;
+ struct debug_store ds_back;
+ int started;
+};
+
+static DEFINE_PER_CPU(struct bts_ctx, bts_ctx);
+
+#define BTS_RECORD_SIZE 24
+#define BTS_SAFETY_MARGIN 4080
+
+struct bts_phys {
+ struct page *page;
+ unsigned long size;
+ unsigned long offset;
+ unsigned long displacement;
+};
+
+struct bts_buffer {
+ size_t real_size; /* multiple of BTS_RECORD_SIZE */
+ unsigned int nr_pages;
+ unsigned int nr_bufs;
+ unsigned int cur_buf;
+ bool snapshot;
+ local_t data_size;
+ local_t lost;
+ local_t head;
+ unsigned long end;
+ void **data_pages;
+ struct bts_phys buf[0];
+};
+
+struct pmu bts_pmu;
+
+void intel_pmu_enable_bts(u64 config);
+void intel_pmu_disable_bts(void);
+
+static size_t buf_size(struct page *page)
+{
+ return 1 << (PAGE_SHIFT + page_private(page));
+}
+
+static void *
+bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite)
+{
+ struct bts_buffer *buf;
+ struct page *page;
+ int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+ unsigned long offset;
+ size_t size = nr_pages << PAGE_SHIFT;
+ int pg, nbuf, pad;
+
+ /* count all the high order buffers */
+ for (pg = 0, nbuf = 0; pg < nr_pages;) {
+ page = virt_to_page(pages[pg]);
+ if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1))
+ return NULL;
+ pg += 1 << page_private(page);
+ nbuf++;
+ }
+
+ /*
+ * to avoid interrupts in overwrite mode, only allow one physical
+ */
+ if (overwrite && nbuf > 1)
+ return NULL;
+
+ buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node);
+ if (!buf)
+ return NULL;
+
+ buf->nr_pages = nr_pages;
+ buf->nr_bufs = nbuf;
+ buf->snapshot = overwrite;
+ buf->data_pages = pages;
+ buf->real_size = size - size % BTS_RECORD_SIZE;
+
+ for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) {
+ unsigned int __nr_pages;
+
+ page = virt_to_page(pages[pg]);
+ __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1;
+ buf->buf[nbuf].page = page;
+ buf->buf[nbuf].offset = offset;
+ buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
+ buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement;
+ pad = buf->buf[nbuf].size % BTS_RECORD_SIZE;
+ buf->buf[nbuf].size -= pad;
+
+ pg += __nr_pages;
+ offset += __nr_pages << PAGE_SHIFT;
+ }
+
+ return buf;
+}
+
+static void bts_buffer_free_aux(void *data)
+{
+ kfree(data);
+}
+
+static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx)
+{
+ return buf->buf[idx].offset + buf->buf[idx].displacement;
+}
+
+static void
+bts_config_buffer(struct bts_buffer *buf)
+{
+ int cpu = raw_smp_processor_id();
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+ struct bts_phys *phys = &buf->buf[buf->cur_buf];
+ unsigned long index, thresh = 0, end = phys->size;
+ struct page *page = phys->page;
+
+ index = local_read(&buf->head);
+
+ if (!buf->snapshot) {
+ if (buf->end < phys->offset + buf_size(page))
+ end = buf->end - phys->offset - phys->displacement;
+
+ index -= phys->offset + phys->displacement;
+
+ if (end - index > BTS_SAFETY_MARGIN)
+ thresh = end - BTS_SAFETY_MARGIN;
+ else if (end - index > BTS_RECORD_SIZE)
+ thresh = end - BTS_RECORD_SIZE;
+ else
+ thresh = end;
+ }
+
+ ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
+ ds->bts_index = ds->bts_buffer_base + index;
+ ds->bts_absolute_maximum = ds->bts_buffer_base + end;
+ ds->bts_interrupt_threshold = !buf->snapshot
+ ? ds->bts_buffer_base + thresh
+ : ds->bts_absolute_maximum + BTS_RECORD_SIZE;
+}
+
+static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
+{
+ unsigned long index = head - phys->offset;
+
+ memset(page_address(phys->page) + index, 0, phys->size - index);
+}
+
+static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts)
+{
+ if (buf->snapshot)
+ return false;
+
+ if (local_read(&buf->data_size) >= bts->handle.size ||
+ bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE)
+ return true;
+
+ return false;
+}
+
+static void bts_update(struct bts_ctx *bts)
+{
+ int cpu = raw_smp_processor_id();
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+ struct bts_buffer *buf = perf_get_aux(&bts->handle);
+ unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;
+
+ if (!buf)
+ return;
+
+ head = index + bts_buffer_offset(buf, buf->cur_buf);
+ old = local_xchg(&buf->head, head);
+
+ if (!buf->snapshot) {
+ if (old == head)
+ return;
+
+ if (ds->bts_index >= ds->bts_absolute_maximum)
+ local_inc(&buf->lost);
+
+ /*
+ * old and head are always in the same physical buffer, so we
+ * can subtract them to get the data size.
+ */
+ local_add(head - old, &buf->data_size);
+ } else {
+ local_set(&buf->data_size, head);
+ }
+}
+
+static void __bts_event_start(struct perf_event *event)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct bts_buffer *buf = perf_get_aux(&bts->handle);
+ u64 config = 0;
+
+ if (!buf || bts_buffer_is_full(buf, bts))
+ return;
+
+ event->hw.state = 0;
+
+ if (!buf->snapshot)
+ config |= ARCH_PERFMON_EVENTSEL_INT;
+ if (!event->attr.exclude_kernel)
+ config |= ARCH_PERFMON_EVENTSEL_OS;
+ if (!event->attr.exclude_user)
+ config |= ARCH_PERFMON_EVENTSEL_USR;
+
+ bts_config_buffer(buf);
+
+ /*
+ * local barrier to make sure that ds configuration made it
+ * before we enable BTS
+ */
+ wmb();
+
+ intel_pmu_enable_bts(config);
+}
+
+static void bts_event_start(struct perf_event *event, int flags)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+ __bts_event_start(event);
+
+ /* PMI handler: this counter is running and likely generating PMIs */
+ ACCESS_ONCE(bts->started) = 1;
+}
+
+static void __bts_event_stop(struct perf_event *event)
+{
+ /*
+ * No extra synchronization is mandated by the documentation to have
+ * BTS data stores globally visible.
+ */
+ intel_pmu_disable_bts();
+
+ if (event->hw.state & PERF_HES_STOPPED)
+ return;
+
+ ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED;
+}
+
+static void bts_event_stop(struct perf_event *event, int flags)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+ /* PMI handler: don't restart this counter */
+ ACCESS_ONCE(bts->started) = 0;
+
+ __bts_event_stop(event);
+
+ if (flags & PERF_EF_UPDATE)
+ bts_update(bts);
+}
+
+void intel_bts_enable_local(void)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+ if (bts->handle.event && bts->started)
+ __bts_event_start(bts->handle.event);
+}
+
+void intel_bts_disable_local(void)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+ if (bts->handle.event)
+ __bts_event_stop(bts->handle.event);
+}
+
+static int
+bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
+{
+ unsigned long head, space, next_space, pad, gap, skip, wakeup;
+ unsigned int next_buf;
+ struct bts_phys *phys, *next_phys;
+ int ret;
+
+ if (buf->snapshot)
+ return 0;
+
+ head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1);
+ if (WARN_ON_ONCE(head != local_read(&buf->head)))
+ return -EINVAL;
+
+ phys = &buf->buf[buf->cur_buf];
+ space = phys->offset + phys->displacement + phys->size - head;
+ pad = space;
+ if (space > handle->size) {
+ space = handle->size;
+ space -= space % BTS_RECORD_SIZE;
+ }
+ if (space <= BTS_SAFETY_MARGIN) {
+ /* See if next phys buffer has more space */
+ next_buf = buf->cur_buf + 1;
+ if (next_buf >= buf->nr_bufs)
+ next_buf = 0;
+ next_phys = &buf->buf[next_buf];
+ gap = buf_size(phys->page) - phys->displacement - phys->size +
+ next_phys->displacement;
+ skip = pad + gap;
+ if (handle->size >= skip) {
+ next_space = next_phys->size;
+ if (next_space + skip > handle->size) {
+ next_space = handle->size - skip;
+ next_space -= next_space % BTS_RECORD_SIZE;
+ }
+ if (next_space > space || !space) {
+ if (pad)
+ bts_buffer_pad_out(phys, head);
+ ret = perf_aux_output_skip(handle, skip);
+ if (ret)
+ return ret;
+ /* Advance to next phys buffer */
+ phys = next_phys;
+ space = next_space;
+ head = phys->offset + phys->displacement;
+ /*
+ * After this, cur_buf and head won't match ds
+ * anymore, so we must not be racing with
+ * bts_update().
+ */
+ buf->cur_buf = next_buf;
+ local_set(&buf->head, head);
+ }
+ }
+ }
+
+ /* Don't go far beyond wakeup watermark */
+ wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
+ handle->head;
+ if (space > wakeup) {
+ space = wakeup;
+ space -= space % BTS_RECORD_SIZE;
+ }
+
+ buf->end = head + space;
+
+ /*
+ * If we have no space, the lost notification would have been sent when
+ * we hit absolute_maximum - see bts_update()
+ */
+ if (!space)
+ return -ENOSPC;
+
+ return 0;
+}
+
+int intel_bts_interrupt(void)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct perf_event *event = bts->handle.event;
+ struct bts_buffer *buf;
+ s64 old_head;
+ int err;
+
+ if (!event || !bts->started)
+ return 0;
+
+ buf = perf_get_aux(&bts->handle);
+ /*
+ * Skip snapshot counters: they don't use the interrupt, but
+ * there's no other way of telling, because the pointer will
+ * keep moving
+ */
+ if (!buf || buf->snapshot)
+ return 0;
+
+ old_head = local_read(&buf->head);
+ bts_update(bts);
+
+ /* no new data */
+ if (old_head == local_read(&buf->head))
+ return 0;
+
+ perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
+ !!local_xchg(&buf->lost, 0));
+
+ buf = perf_aux_output_begin(&bts->handle, event);
+ if (!buf)
+ return 1;
+
+ err = bts_buffer_reset(buf, &bts->handle);
+ if (err)
+ perf_aux_output_end(&bts->handle, 0, false);
+
+ return 1;
+}
+
+static void bts_event_del(struct perf_event *event, int mode)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct bts_buffer *buf = perf_get_aux(&bts->handle);
+
+ bts_event_stop(event, PERF_EF_UPDATE);
+
+ if (buf) {
+ if (buf->snapshot)
+ bts->handle.head =
+ local_xchg(&buf->data_size,
+ buf->nr_pages << PAGE_SHIFT);
+ perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
+ !!local_xchg(&buf->lost, 0));
+ }
+
+ cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
+ cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
+ cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
+ cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
+}
+
+static int bts_event_add(struct perf_event *event, int mode)
+{
+ struct bts_buffer *buf;
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ int ret = -EBUSY;
+
+ event->hw.state = PERF_HES_STOPPED;
+
+ if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+ return -EBUSY;
+
+ if (bts->handle.event)
+ return -EBUSY;
+
+ buf = perf_aux_output_begin(&bts->handle, event);
+ if (!buf)
+ return -EINVAL;
+
+ ret = bts_buffer_reset(buf, &bts->handle);
+ if (ret) {
+ perf_aux_output_end(&bts->handle, 0, false);
+ return ret;
+ }
+
+ bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
+ bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
+ bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;
+
+ if (mode & PERF_EF_START) {
+ bts_event_start(event, 0);
+ if (hwc->state & PERF_HES_STOPPED) {
+ bts_event_del(event, 0);
+ return -EBUSY;
+ }
+ }
+
+ return 0;
+}
+
+static void bts_event_destroy(struct perf_event *event)
+{
+ x86_del_exclusive(x86_lbr_exclusive_bts);
+}
+
+static int bts_event_init(struct perf_event *event)
+{
+ if (event->attr.type != bts_pmu.type)
+ return -ENOENT;
+
+ if (x86_add_exclusive(x86_lbr_exclusive_bts))
+ return -EBUSY;
+
+ event->destroy = bts_event_destroy;
+
+ return 0;
+}
+
+static void bts_event_read(struct perf_event *event)
+{
+}
+
+static __init int bts_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
+ return -ENODEV;
+
+ bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE;
+ bts_pmu.task_ctx_nr = perf_sw_context;
+ bts_pmu.event_init = bts_event_init;
+ bts_pmu.add = bts_event_add;
+ bts_pmu.del = bts_event_del;
+ bts_pmu.start = bts_event_start;
+ bts_pmu.stop = bts_event_stop;
+ bts_pmu.read = bts_event_read;
+ bts_pmu.setup_aux = bts_buffer_setup_aux;
+ bts_pmu.free_aux = bts_buffer_free_aux;
+
+ return perf_pmu_register(&bts_pmu, "intel_bts", -1);
+}
+
+module_init(bts_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
new file mode 100644
index 000000000000..e4d1b8b738fa
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -0,0 +1,1379 @@
+/*
+ * Intel Cache Quality-of-Service Monitoring (CQM) support.
+ *
+ * Based very, very heavily on work by Peter Zijlstra.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/slab.h>
+#include <asm/cpu_device_id.h>
+#include "perf_event.h"
+
+#define MSR_IA32_PQR_ASSOC 0x0c8f
+#define MSR_IA32_QM_CTR 0x0c8e
+#define MSR_IA32_QM_EVTSEL 0x0c8d
+
+static unsigned int cqm_max_rmid = -1;
+static unsigned int cqm_l3_scale; /* supposedly cacheline size */
+
+struct intel_cqm_state {
+ raw_spinlock_t lock;
+ int rmid;
+ int cnt;
+};
+
+static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);
+
+/*
+ * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
+ * Also protects event->hw.cqm_rmid
+ *
+ * Hold either for stability, both for modification of ->hw.cqm_rmid.
+ */
+static DEFINE_MUTEX(cache_mutex);
+static DEFINE_RAW_SPINLOCK(cache_lock);
+
+/*
+ * Groups of events that have the same target(s), one RMID per group.
+ */
+static LIST_HEAD(cache_groups);
+
+/*
+ * Mask of CPUs for reading CQM values. We only need one per-socket.
+ */
+static cpumask_t cqm_cpumask;
+
+#define RMID_VAL_ERROR (1ULL << 63)
+#define RMID_VAL_UNAVAIL (1ULL << 62)
+
+#define QOS_L3_OCCUP_EVENT_ID (1 << 0)
+
+#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
+
+/*
+ * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
+ *
+ * This rmid is always free and is guaranteed to have an associated
+ * near-zero occupancy value, i.e. no cachelines are tagged with this
+ * RMID, once __intel_cqm_rmid_rotate() returns.
+ */
+static unsigned int intel_cqm_rotation_rmid;
+
+#define INVALID_RMID (-1)
+
+/*
+ * Is @rmid valid for programming the hardware?
+ *
+ * rmid 0 is reserved by the hardware for all non-monitored tasks, which
+ * means that we should never come across an rmid with that value.
+ * Likewise, an rmid value of -1 is used to indicate "no rmid currently
+ * assigned" and is used as part of the rotation code.
+ */
+static inline bool __rmid_valid(unsigned int rmid)
+{
+ if (!rmid || rmid == INVALID_RMID)
+ return false;
+
+ return true;
+}
+
+static u64 __rmid_read(unsigned int rmid)
+{
+ u64 val;
+
+ /*
+ * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
+ * it just says that to increase confusion.
+ */
+ wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
+ rdmsrl(MSR_IA32_QM_CTR, val);
+
+ /*
+ * Aside from the ERROR and UNAVAIL bits, assume this thing returns
+ * the number of cachelines tagged with @rmid.
+ */
+ return val;
+}
+
+enum rmid_recycle_state {
+ RMID_YOUNG = 0,
+ RMID_AVAILABLE,
+ RMID_DIRTY,
+};
+
+struct cqm_rmid_entry {
+ unsigned int rmid;
+ enum rmid_recycle_state state;
+ struct list_head list;
+ unsigned long queue_time;
+};
+
+/*
+ * cqm_rmid_free_lru - A least recently used list of RMIDs.
+ *
+ * Oldest entry at the head, newest (most recently used) entry at the
+ * tail. This list is never traversed, it's only used to keep track of
+ * the lru order. That is, we only pick entries of the head or insert
+ * them on the tail.
+ *
+ * All entries on the list are 'free', and their RMIDs are not currently
+ * in use. To mark an RMID as in use, remove its entry from the lru
+ * list.
+ *
+ *
+ * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
+ *
+ * This list is contains RMIDs that no one is currently using but that
+ * may have a non-zero occupancy value associated with them. The
+ * rotation worker moves RMIDs from the limbo list to the free list once
+ * the occupancy value drops below __intel_cqm_threshold.
+ *
+ * Both lists are protected by cache_mutex.
+ */
+static LIST_HEAD(cqm_rmid_free_lru);
+static LIST_HEAD(cqm_rmid_limbo_lru);
+
+/*
+ * We use a simple array of pointers so that we can lookup a struct
+ * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
+ * and __put_rmid() from having to worry about dealing with struct
+ * cqm_rmid_entry - they just deal with rmids, i.e. integers.
+ *
+ * Once this array is initialized it is read-only. No locks are required
+ * to access it.
+ *
+ * All entries for all RMIDs can be looked up in the this array at all
+ * times.
+ */
+static struct cqm_rmid_entry **cqm_rmid_ptrs;
+
+static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
+{
+ struct cqm_rmid_entry *entry;
+
+ entry = cqm_rmid_ptrs[rmid];
+ WARN_ON(entry->rmid != rmid);
+
+ return entry;
+}
+
+/*
+ * Returns < 0 on fail.
+ *
+ * We expect to be called with cache_mutex held.
+ */
+static int __get_rmid(void)
+{
+ struct cqm_rmid_entry *entry;
+
+ lockdep_assert_held(&cache_mutex);
+
+ if (list_empty(&cqm_rmid_free_lru))
+ return INVALID_RMID;
+
+ entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
+ list_del(&entry->list);
+
+ return entry->rmid;
+}
+
+static void __put_rmid(unsigned int rmid)
+{
+ struct cqm_rmid_entry *entry;
+
+ lockdep_assert_held(&cache_mutex);
+
+ WARN_ON(!__rmid_valid(rmid));
+ entry = __rmid_entry(rmid);
+
+ entry->queue_time = jiffies;
+ entry->state = RMID_YOUNG;
+
+ list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
+}
+
+static int intel_cqm_setup_rmid_cache(void)
+{
+ struct cqm_rmid_entry *entry;
+ unsigned int nr_rmids;
+ int r = 0;
+
+ nr_rmids = cqm_max_rmid + 1;
+ cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
+ nr_rmids, GFP_KERNEL);
+ if (!cqm_rmid_ptrs)
+ return -ENOMEM;
+
+ for (; r <= cqm_max_rmid; r++) {
+ struct cqm_rmid_entry *entry;
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ goto fail;
+
+ INIT_LIST_HEAD(&entry->list);
+ entry->rmid = r;
+ cqm_rmid_ptrs[r] = entry;
+
+ list_add_tail(&entry->list, &cqm_rmid_free_lru);
+ }
+
+ /*
+ * RMID 0 is special and is always allocated. It's used for all
+ * tasks that are not monitored.
+ */
+ entry = __rmid_entry(0);
+ list_del(&entry->list);
+
+ mutex_lock(&cache_mutex);
+ intel_cqm_rotation_rmid = __get_rmid();
+ mutex_unlock(&cache_mutex);
+
+ return 0;
+fail:
+ while (r--)
+ kfree(cqm_rmid_ptrs[r]);
+
+ kfree(cqm_rmid_ptrs);
+ return -ENOMEM;
+}
+
+/*
+ * Determine if @a and @b measure the same set of tasks.
+ *
+ * If @a and @b measure the same set of tasks then we want to share a
+ * single RMID.
+ */
+static bool __match_event(struct perf_event *a, struct perf_event *b)
+{
+ /* Per-cpu and task events don't mix */
+ if ((a->attach_state & PERF_ATTACH_TASK) !=
+ (b->attach_state & PERF_ATTACH_TASK))
+ return false;
+
+#ifdef CONFIG_CGROUP_PERF
+ if (a->cgrp != b->cgrp)
+ return false;
+#endif
+
+ /* If not task event, we're machine wide */
+ if (!(b->attach_state & PERF_ATTACH_TASK))
+ return true;
+
+ /*
+ * Events that target same task are placed into the same cache group.
+ */
+ if (a->hw.target == b->hw.target)
+ return true;
+
+ /*
+ * Are we an inherited event?
+ */
+ if (b->parent == a)
+ return true;
+
+ return false;
+}
+
+#ifdef CONFIG_CGROUP_PERF
+static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
+{
+ if (event->attach_state & PERF_ATTACH_TASK)
+ return perf_cgroup_from_task(event->hw.target);
+
+ return event->cgrp;
+}
+#endif
+
+/*
+ * Determine if @a's tasks intersect with @b's tasks
+ *
+ * There are combinations of events that we explicitly prohibit,
+ *
+ * PROHIBITS
+ * system-wide -> cgroup and task
+ * cgroup -> system-wide
+ * -> task in cgroup
+ * task -> system-wide
+ * -> task in cgroup
+ *
+ * Call this function before allocating an RMID.
+ */
+static bool __conflict_event(struct perf_event *a, struct perf_event *b)
+{
+#ifdef CONFIG_CGROUP_PERF
+ /*
+ * We can have any number of cgroups but only one system-wide
+ * event at a time.
+ */
+ if (a->cgrp && b->cgrp) {
+ struct perf_cgroup *ac = a->cgrp;
+ struct perf_cgroup *bc = b->cgrp;
+
+ /*
+ * This condition should have been caught in
+ * __match_event() and we should be sharing an RMID.
+ */
+ WARN_ON_ONCE(ac == bc);
+
+ if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+ cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+ return true;
+
+ return false;
+ }
+
+ if (a->cgrp || b->cgrp) {
+ struct perf_cgroup *ac, *bc;
+
+ /*
+ * cgroup and system-wide events are mutually exclusive
+ */
+ if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
+ (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
+ return true;
+
+ /*
+ * Ensure neither event is part of the other's cgroup
+ */
+ ac = event_to_cgroup(a);
+ bc = event_to_cgroup(b);
+ if (ac == bc)
+ return true;
+
+ /*
+ * Must have cgroup and non-intersecting task events.
+ */
+ if (!ac || !bc)
+ return false;
+
+ /*
+ * We have cgroup and task events, and the task belongs
+ * to a cgroup. Check for for overlap.
+ */
+ if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+ cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+ return true;
+
+ return false;
+ }
+#endif
+ /*
+ * If one of them is not a task, same story as above with cgroups.
+ */
+ if (!(a->attach_state & PERF_ATTACH_TASK) ||
+ !(b->attach_state & PERF_ATTACH_TASK))
+ return true;
+
+ /*
+ * Must be non-overlapping.
+ */
+ return false;
+}
+
+struct rmid_read {
+ unsigned int rmid;
+ atomic64_t value;
+};
+
+static void __intel_cqm_event_count(void *info);
+
+/*
+ * Exchange the RMID of a group of events.
+ */
+static unsigned int
+intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid)
+{
+ struct perf_event *event;
+ unsigned int old_rmid = group->hw.cqm_rmid;
+ struct list_head *head = &group->hw.cqm_group_entry;
+
+ lockdep_assert_held(&cache_mutex);
+
+ /*
+ * If our RMID is being deallocated, perform a read now.
+ */
+ if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
+ struct rmid_read rr = {
+ .value = ATOMIC64_INIT(0),
+ .rmid = old_rmid,
+ };
+
+ on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
+ &rr, 1);
+ local64_set(&group->count, atomic64_read(&rr.value));
+ }
+
+ raw_spin_lock_irq(&cache_lock);
+
+ group->hw.cqm_rmid = rmid;
+ list_for_each_entry(event, head, hw.cqm_group_entry)
+ event->hw.cqm_rmid = rmid;
+
+ raw_spin_unlock_irq(&cache_lock);
+
+ return old_rmid;
+}
+
+/*
+ * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
+ * cachelines are still tagged with RMIDs in limbo, we progressively
+ * increment the threshold until we find an RMID in limbo with <=
+ * __intel_cqm_threshold lines tagged. This is designed to mitigate the
+ * problem where cachelines tagged with an RMID are not steadily being
+ * evicted.
+ *
+ * On successful rotations we decrease the threshold back towards zero.
+ *
+ * __intel_cqm_max_threshold provides an upper bound on the threshold,
+ * and is measured in bytes because it's exposed to userland.
+ */
+static unsigned int __intel_cqm_threshold;
+static unsigned int __intel_cqm_max_threshold;
+
+/*
+ * Test whether an RMID has a zero occupancy value on this cpu.
+ */
+static void intel_cqm_stable(void *arg)
+{
+ struct cqm_rmid_entry *entry;
+
+ list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
+ if (entry->state != RMID_AVAILABLE)
+ break;
+
+ if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
+ entry->state = RMID_DIRTY;
+ }
+}
+
+/*
+ * If we have group events waiting for an RMID that don't conflict with
+ * events already running, assign @rmid.
+ */
+static bool intel_cqm_sched_in_event(unsigned int rmid)
+{
+ struct perf_event *leader, *event;
+
+ lockdep_assert_held(&cache_mutex);
+
+ leader = list_first_entry(&cache_groups, struct perf_event,
+ hw.cqm_groups_entry);
+ event = leader;
+
+ list_for_each_entry_continue(event, &cache_groups,
+ hw.cqm_groups_entry) {
+ if (__rmid_valid(event->hw.cqm_rmid))
+ continue;
+
+ if (__conflict_event(event, leader))
+ continue;
+
+ intel_cqm_xchg_rmid(event, rmid);
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Initially use this constant for both the limbo queue time and the
+ * rotation timer interval, pmu::hrtimer_interval_ms.
+ *
+ * They don't need to be the same, but the two are related since if you
+ * rotate faster than you recycle RMIDs, you may run out of available
+ * RMIDs.
+ */
+#define RMID_DEFAULT_QUEUE_TIME 250 /* ms */
+
+static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
+
+/*
+ * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
+ * @nr_available: number of freeable RMIDs on the limbo list
+ *
+ * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
+ * cachelines are tagged with those RMIDs. After this we can reuse them
+ * and know that the current set of active RMIDs is stable.
+ *
+ * Return %true or %false depending on whether stabilization needs to be
+ * reattempted.
+ *
+ * If we return %true then @nr_available is updated to indicate the
+ * number of RMIDs on the limbo list that have been queued for the
+ * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
+ * are above __intel_cqm_threshold.
+ */
+static bool intel_cqm_rmid_stabilize(unsigned int *available)
+{
+ struct cqm_rmid_entry *entry, *tmp;
+
+ lockdep_assert_held(&cache_mutex);
+
+ *available = 0;
+ list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
+ unsigned long min_queue_time;
+ unsigned long now = jiffies;
+
+ /*
+ * We hold RMIDs placed into limbo for a minimum queue
+ * time. Before the minimum queue time has elapsed we do
+ * not recycle RMIDs.
+ *
+ * The reasoning is that until a sufficient time has
+ * passed since we stopped using an RMID, any RMID
+ * placed onto the limbo list will likely still have
+ * data tagged in the cache, which means we'll probably
+ * fail to recycle it anyway.
+ *
+ * We can save ourselves an expensive IPI by skipping
+ * any RMIDs that have not been queued for the minimum
+ * time.
+ */
+ min_queue_time = entry->queue_time +
+ msecs_to_jiffies(__rmid_queue_time_ms);
+
+ if (time_after(min_queue_time, now))
+ break;
+
+ entry->state = RMID_AVAILABLE;
+ (*available)++;
+ }
+
+ /*
+ * Fast return if none of the RMIDs on the limbo list have been
+ * sitting on the queue for the minimum queue time.
+ */
+ if (!*available)
+ return false;
+
+ /*
+ * Test whether an RMID is free for each package.
+ */
+ on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
+
+ list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
+ /*
+ * Exhausted all RMIDs that have waited min queue time.
+ */
+ if (entry->state == RMID_YOUNG)
+ break;
+
+ if (entry->state == RMID_DIRTY)
+ continue;
+
+ list_del(&entry->list); /* remove from limbo */
+
+ /*
+ * The rotation RMID gets priority if it's
+ * currently invalid. In which case, skip adding
+ * the RMID to the the free lru.
+ */
+ if (!__rmid_valid(intel_cqm_rotation_rmid)) {
+ intel_cqm_rotation_rmid = entry->rmid;
+ continue;
+ }
+
+ /*
+ * If we have groups waiting for RMIDs, hand
+ * them one now provided they don't conflict.
+ */
+ if (intel_cqm_sched_in_event(entry->rmid))
+ continue;
+
+ /*
+ * Otherwise place it onto the free list.
+ */
+ list_add_tail(&entry->list, &cqm_rmid_free_lru);
+ }
+
+
+ return __rmid_valid(intel_cqm_rotation_rmid);
+}
+
+/*
+ * Pick a victim group and move it to the tail of the group list.
+ * @next: The first group without an RMID
+ */
+static void __intel_cqm_pick_and_rotate(struct perf_event *next)
+{
+ struct perf_event *rotor;
+ unsigned int rmid;
+
+ lockdep_assert_held(&cache_mutex);
+
+ rotor = list_first_entry(&cache_groups, struct perf_event,
+ hw.cqm_groups_entry);
+
+ /*
+ * The group at the front of the list should always have a valid
+ * RMID. If it doesn't then no groups have RMIDs assigned and we
+ * don't need to rotate the list.
+ */
+ if (next == rotor)
+ return;
+
+ rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
+ __put_rmid(rmid);
+
+ list_rotate_left(&cache_groups);
+}
+
+/*
+ * Deallocate the RMIDs from any events that conflict with @event, and
+ * place them on the back of the group list.
+ */
+static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
+{
+ struct perf_event *group, *g;
+ unsigned int rmid;
+
+ lockdep_assert_held(&cache_mutex);
+
+ list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
+ if (group == event)
+ continue;
+
+ rmid = group->hw.cqm_rmid;
+
+ /*
+ * Skip events that don't have a valid RMID.
+ */
+ if (!__rmid_valid(rmid))
+ continue;
+
+ /*
+ * No conflict? No problem! Leave the event alone.
+ */
+ if (!__conflict_event(group, event))
+ continue;
+
+ intel_cqm_xchg_rmid(group, INVALID_RMID);
+ __put_rmid(rmid);
+ }
+}
+
+/*
+ * Attempt to rotate the groups and assign new RMIDs.
+ *
+ * We rotate for two reasons,
+ * 1. To handle the scheduling of conflicting events
+ * 2. To recycle RMIDs
+ *
+ * Rotating RMIDs is complicated because the hardware doesn't give us
+ * any clues.
+ *
+ * There's problems with the hardware interface; when you change the
+ * task:RMID map cachelines retain their 'old' tags, giving a skewed
+ * picture. In order to work around this, we must always keep one free
+ * RMID - intel_cqm_rotation_rmid.
+ *
+ * Rotation works by taking away an RMID from a group (the old RMID),
+ * and assigning the free RMID to another group (the new RMID). We must
+ * then wait for the old RMID to not be used (no cachelines tagged).
+ * This ensure that all cachelines are tagged with 'active' RMIDs. At
+ * this point we can start reading values for the new RMID and treat the
+ * old RMID as the free RMID for the next rotation.
+ *
+ * Return %true or %false depending on whether we did any rotating.
+ */
+static bool __intel_cqm_rmid_rotate(void)
+{
+ struct perf_event *group, *start = NULL;
+ unsigned int threshold_limit;
+ unsigned int nr_needed = 0;
+ unsigned int nr_available;
+ bool rotated = false;
+
+ mutex_lock(&cache_mutex);
+
+again:
+ /*
+ * Fast path through this function if there are no groups and no
+ * RMIDs that need cleaning.
+ */
+ if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
+ goto out;
+
+ list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
+ if (!__rmid_valid(group->hw.cqm_rmid)) {
+ if (!start)
+ start = group;
+ nr_needed++;
+ }
+ }
+
+ /*
+ * We have some event groups, but they all have RMIDs assigned
+ * and no RMIDs need cleaning.
+ */
+ if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
+ goto out;
+
+ if (!nr_needed)
+ goto stabilize;
+
+ /*
+ * We have more event groups without RMIDs than available RMIDs,
+ * or we have event groups that conflict with the ones currently
+ * scheduled.
+ *
+ * We force deallocate the rmid of the group at the head of
+ * cache_groups. The first event group without an RMID then gets
+ * assigned intel_cqm_rotation_rmid. This ensures we always make
+ * forward progress.
+ *
+ * Rotate the cache_groups list so the previous head is now the
+ * tail.
+ */
+ __intel_cqm_pick_and_rotate(start);
+
+ /*
+ * If the rotation is going to succeed, reduce the threshold so
+ * that we don't needlessly reuse dirty RMIDs.
+ */
+ if (__rmid_valid(intel_cqm_rotation_rmid)) {
+ intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
+ intel_cqm_rotation_rmid = __get_rmid();
+
+ intel_cqm_sched_out_conflicting_events(start);
+
+ if (__intel_cqm_threshold)
+ __intel_cqm_threshold--;
+ }
+
+ rotated = true;
+
+stabilize:
+ /*
+ * We now need to stablize the RMID we freed above (if any) to
+ * ensure that the next time we rotate we have an RMID with zero
+ * occupancy value.
+ *
+ * Alternatively, if we didn't need to perform any rotation,
+ * we'll have a bunch of RMIDs in limbo that need stabilizing.
+ */
+ threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;
+
+ while (intel_cqm_rmid_stabilize(&nr_available) &&
+ __intel_cqm_threshold < threshold_limit) {
+ unsigned int steal_limit;
+
+ /*
+ * Don't spin if nobody is actively waiting for an RMID,
+ * the rotation worker will be kicked as soon as an
+ * event needs an RMID anyway.
+ */
+ if (!nr_needed)
+ break;
+
+ /* Allow max 25% of RMIDs to be in limbo. */
+ steal_limit = (cqm_max_rmid + 1) / 4;
+
+ /*
+ * We failed to stabilize any RMIDs so our rotation
+ * logic is now stuck. In order to make forward progress
+ * we have a few options:
+ *
+ * 1. rotate ("steal") another RMID
+ * 2. increase the threshold
+ * 3. do nothing
+ *
+ * We do both of 1. and 2. until we hit the steal limit.
+ *
+ * The steal limit prevents all RMIDs ending up on the
+ * limbo list. This can happen if every RMID has a
+ * non-zero occupancy above threshold_limit, and the
+ * occupancy values aren't dropping fast enough.
+ *
+ * Note that there is prioritisation at work here - we'd
+ * rather increase the number of RMIDs on the limbo list
+ * than increase the threshold, because increasing the
+ * threshold skews the event data (because we reuse
+ * dirty RMIDs) - threshold bumps are a last resort.
+ */
+ if (nr_available < steal_limit)
+ goto again;
+
+ __intel_cqm_threshold++;
+ }
+
+out:
+ mutex_unlock(&cache_mutex);
+ return rotated;
+}
+
+static void intel_cqm_rmid_rotate(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);
+
+static struct pmu intel_cqm_pmu;
+
+static void intel_cqm_rmid_rotate(struct work_struct *work)
+{
+ unsigned long delay;
+
+ __intel_cqm_rmid_rotate();
+
+ delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
+ schedule_delayed_work(&intel_cqm_rmid_work, delay);
+}
+
+/*
+ * Find a group and setup RMID.
+ *
+ * If we're part of a group, we use the group's RMID.
+ */
+static void intel_cqm_setup_event(struct perf_event *event,
+ struct perf_event **group)
+{
+ struct perf_event *iter;
+ unsigned int rmid;
+ bool conflict = false;
+
+ list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
+ rmid = iter->hw.cqm_rmid;
+
+ if (__match_event(iter, event)) {
+ /* All tasks in a group share an RMID */
+ event->hw.cqm_rmid = rmid;
+ *group = iter;
+ return;
+ }
+
+ /*
+ * We only care about conflicts for events that are
+ * actually scheduled in (and hence have a valid RMID).
+ */
+ if (__conflict_event(iter, event) && __rmid_valid(rmid))
+ conflict = true;
+ }
+
+ if (conflict)
+ rmid = INVALID_RMID;
+ else
+ rmid = __get_rmid();
+
+ event->hw.cqm_rmid = rmid;
+}
+
+static void intel_cqm_event_read(struct perf_event *event)
+{
+ unsigned long flags;
+ unsigned int rmid;
+ u64 val;
+
+ /*
+ * Task events are handled by intel_cqm_event_count().
+ */
+ if (event->cpu == -1)
+ return;
+
+ raw_spin_lock_irqsave(&cache_lock, flags);
+ rmid = event->hw.cqm_rmid;
+
+ if (!__rmid_valid(rmid))
+ goto out;
+
+ val = __rmid_read(rmid);
+
+ /*
+ * Ignore this reading on error states and do not update the value.
+ */
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ goto out;
+
+ local64_set(&event->count, val);
+out:
+ raw_spin_unlock_irqrestore(&cache_lock, flags);
+}
+
+static void __intel_cqm_event_count(void *info)
+{
+ struct rmid_read *rr = info;
+ u64 val;
+
+ val = __rmid_read(rr->rmid);
+
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return;
+
+ atomic64_add(val, &rr->value);
+}
+
+static inline bool cqm_group_leader(struct perf_event *event)
+{
+ return !list_empty(&event->hw.cqm_groups_entry);
+}
+
+static u64 intel_cqm_event_count(struct perf_event *event)
+{
+ unsigned long flags;
+ struct rmid_read rr = {
+ .value = ATOMIC64_INIT(0),
+ };
+
+ /*
+ * We only need to worry about task events. System-wide events
+ * are handled like usual, i.e. entirely with
+ * intel_cqm_event_read().
+ */
+ if (event->cpu != -1)
+ return __perf_event_count(event);
+
+ /*
+ * Only the group leader gets to report values. This stops us
+ * reporting duplicate values to userspace, and gives us a clear
+ * rule for which task gets to report the values.
+ *
+ * Note that it is impossible to attribute these values to
+ * specific packages - we forfeit that ability when we create
+ * task events.
+ */
+ if (!cqm_group_leader(event))
+ return 0;
+
+ /*
+ * Notice that we don't perform the reading of an RMID
+ * atomically, because we can't hold a spin lock across the
+ * IPIs.
+ *
+ * Speculatively perform the read, since @event might be
+ * assigned a different (possibly invalid) RMID while we're
+ * busying performing the IPI calls. It's therefore necessary to
+ * check @event's RMID afterwards, and if it has changed,
+ * discard the result of the read.
+ */
+ rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);
+
+ if (!__rmid_valid(rr.rmid))
+ goto out;
+
+ on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+
+ raw_spin_lock_irqsave(&cache_lock, flags);
+ if (event->hw.cqm_rmid == rr.rmid)
+ local64_set(&event->count, atomic64_read(&rr.value));
+ raw_spin_unlock_irqrestore(&cache_lock, flags);
+out:
+ return __perf_event_count(event);
+}
+
+static void intel_cqm_event_start(struct perf_event *event, int mode)
+{
+ struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
+ unsigned int rmid = event->hw.cqm_rmid;
+ unsigned long flags;
+
+ if (!(event->hw.cqm_state & PERF_HES_STOPPED))
+ return;
+
+ event->hw.cqm_state &= ~PERF_HES_STOPPED;
+
+ raw_spin_lock_irqsave(&state->lock, flags);
+
+ if (state->cnt++)
+ WARN_ON_ONCE(state->rmid != rmid);
+ else
+ WARN_ON_ONCE(state->rmid);
+
+ state->rmid = rmid;
+ wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid);
+
+ raw_spin_unlock_irqrestore(&state->lock, flags);
+}
+
+static void intel_cqm_event_stop(struct perf_event *event, int mode)
+{
+ struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
+ unsigned long flags;
+
+ if (event->hw.cqm_state & PERF_HES_STOPPED)
+ return;
+
+ event->hw.cqm_state |= PERF_HES_STOPPED;
+
+ raw_spin_lock_irqsave(&state->lock, flags);
+ intel_cqm_event_read(event);
+
+ if (!--state->cnt) {
+ state->rmid = 0;
+ wrmsrl(MSR_IA32_PQR_ASSOC, 0);
+ } else {
+ WARN_ON_ONCE(!state->rmid);
+ }
+
+ raw_spin_unlock_irqrestore(&state->lock, flags);
+}
+
+static int intel_cqm_event_add(struct perf_event *event, int mode)
+{
+ unsigned long flags;
+ unsigned int rmid;
+
+ raw_spin_lock_irqsave(&cache_lock, flags);
+
+ event->hw.cqm_state = PERF_HES_STOPPED;
+ rmid = event->hw.cqm_rmid;
+
+ if (__rmid_valid(rmid) && (mode & PERF_EF_START))
+ intel_cqm_event_start(event, mode);
+
+ raw_spin_unlock_irqrestore(&cache_lock, flags);
+
+ return 0;
+}
+
+static void intel_cqm_event_del(struct perf_event *event, int mode)
+{
+ intel_cqm_event_stop(event, mode);
+}
+
+static void intel_cqm_event_destroy(struct perf_event *event)
+{
+ struct perf_event *group_other = NULL;
+
+ mutex_lock(&cache_mutex);
+
+ /*
+ * If there's another event in this group...
+ */
+ if (!list_empty(&event->hw.cqm_group_entry)) {
+ group_other = list_first_entry(&event->hw.cqm_group_entry,
+ struct perf_event,
+ hw.cqm_group_entry);
+ list_del(&event->hw.cqm_group_entry);
+ }
+
+ /*
+ * And we're the group leader..
+ */
+ if (cqm_group_leader(event)) {
+ /*
+ * If there was a group_other, make that leader, otherwise
+ * destroy the group and return the RMID.
+ */
+ if (group_other) {
+ list_replace(&event->hw.cqm_groups_entry,
+ &group_other->hw.cqm_groups_entry);
+ } else {
+ unsigned int rmid = event->hw.cqm_rmid;
+
+ if (__rmid_valid(rmid))
+ __put_rmid(rmid);
+ list_del(&event->hw.cqm_groups_entry);
+ }
+ }
+
+ mutex_unlock(&cache_mutex);
+}
+
+static int intel_cqm_event_init(struct perf_event *event)
+{
+ struct perf_event *group = NULL;
+ bool rotate = false;
+
+ if (event->attr.type != intel_cqm_pmu.type)
+ return -ENOENT;
+
+ if (event->attr.config & ~QOS_EVENT_MASK)
+ return -EINVAL;
+
+ /* unsupported modes and filters */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ INIT_LIST_HEAD(&event->hw.cqm_group_entry);
+ INIT_LIST_HEAD(&event->hw.cqm_groups_entry);
+
+ event->destroy = intel_cqm_event_destroy;
+
+ mutex_lock(&cache_mutex);
+
+ /* Will also set rmid */
+ intel_cqm_setup_event(event, &group);
+
+ if (group) {
+ list_add_tail(&event->hw.cqm_group_entry,
+ &group->hw.cqm_group_entry);
+ } else {
+ list_add_tail(&event->hw.cqm_groups_entry,
+ &cache_groups);
+
+ /*
+ * All RMIDs are either in use or have recently been
+ * used. Kick the rotation worker to clean/free some.
+ *
+ * We only do this for the group leader, rather than for
+ * every event in a group to save on needless work.
+ */
+ if (!__rmid_valid(event->hw.cqm_rmid))
+ rotate = true;
+ }
+
+ mutex_unlock(&cache_mutex);
+
+ if (rotate)
+ schedule_delayed_work(&intel_cqm_rmid_work, 0);
+
+ return 0;
+}
+
+EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
+EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
+EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
+EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
+EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
+
+static struct attribute *intel_cqm_events_attr[] = {
+ EVENT_PTR(intel_cqm_llc),
+ EVENT_PTR(intel_cqm_llc_pkg),
+ EVENT_PTR(intel_cqm_llc_unit),
+ EVENT_PTR(intel_cqm_llc_scale),
+ EVENT_PTR(intel_cqm_llc_snapshot),
+ NULL,
+};
+
+static struct attribute_group intel_cqm_events_group = {
+ .name = "events",
+ .attrs = intel_cqm_events_attr,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+static struct attribute *intel_cqm_formats_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group intel_cqm_format_group = {
+ .name = "format",
+ .attrs = intel_cqm_formats_attr,
+};
+
+static ssize_t
+max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
+ char *page)
+{
+ ssize_t rv;
+
+ mutex_lock(&cache_mutex);
+ rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
+ mutex_unlock(&cache_mutex);
+
+ return rv;
+}
+
+static ssize_t
+max_recycle_threshold_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned int bytes, cachelines;
+ int ret;
+
+ ret = kstrtouint(buf, 0, &bytes);
+ if (ret)
+ return ret;
+
+ mutex_lock(&cache_mutex);
+
+ __intel_cqm_max_threshold = bytes;
+ cachelines = bytes / cqm_l3_scale;
+
+ /*
+ * The new maximum takes effect immediately.
+ */
+ if (__intel_cqm_threshold > cachelines)
+ __intel_cqm_threshold = cachelines;
+
+ mutex_unlock(&cache_mutex);
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(max_recycle_threshold);
+
+static struct attribute *intel_cqm_attrs[] = {
+ &dev_attr_max_recycle_threshold.attr,
+ NULL,
+};
+
+static const struct attribute_group intel_cqm_group = {
+ .attrs = intel_cqm_attrs,
+};
+
+static const struct attribute_group *intel_cqm_attr_groups[] = {
+ &intel_cqm_events_group,
+ &intel_cqm_format_group,
+ &intel_cqm_group,
+ NULL,
+};
+
+static struct pmu intel_cqm_pmu = {
+ .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
+ .attr_groups = intel_cqm_attr_groups,
+ .task_ctx_nr = perf_sw_context,
+ .event_init = intel_cqm_event_init,
+ .add = intel_cqm_event_add,
+ .del = intel_cqm_event_del,
+ .start = intel_cqm_event_start,
+ .stop = intel_cqm_event_stop,
+ .read = intel_cqm_event_read,
+ .count = intel_cqm_event_count,
+};
+
+static inline void cqm_pick_event_reader(int cpu)
+{
+ int phys_id = topology_physical_package_id(cpu);
+ int i;
+
+ for_each_cpu(i, &cqm_cpumask) {
+ if (phys_id == topology_physical_package_id(i))
+ return; /* already got reader for this socket */
+ }
+
+ cpumask_set_cpu(cpu, &cqm_cpumask);
+}
+
+static void intel_cqm_cpu_prepare(unsigned int cpu)
+{
+ struct intel_cqm_state *state = &per_cpu(cqm_state, cpu);
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ raw_spin_lock_init(&state->lock);
+ state->rmid = 0;
+ state->cnt = 0;
+
+ WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
+ WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
+}
+
+static void intel_cqm_cpu_exit(unsigned int cpu)
+{
+ int phys_id = topology_physical_package_id(cpu);
+ int i;
+
+ /*
+ * Is @cpu a designated cqm reader?
+ */
+ if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
+ return;
+
+ for_each_online_cpu(i) {
+ if (i == cpu)
+ continue;
+
+ if (phys_id == topology_physical_package_id(i)) {
+ cpumask_set_cpu(i, &cqm_cpumask);
+ break;
+ }
+ }
+}
+
+static int intel_cqm_cpu_notifier(struct notifier_block *nb,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_UP_PREPARE:
+ intel_cqm_cpu_prepare(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ intel_cqm_cpu_exit(cpu);
+ break;
+ case CPU_STARTING:
+ cqm_pick_event_reader(cpu);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static const struct x86_cpu_id intel_cqm_match[] = {
+ { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
+ {}
+};
+
+static int __init intel_cqm_init(void)
+{
+ char *str, scale[20];
+ int i, cpu, ret;
+
+ if (!x86_match_cpu(intel_cqm_match))
+ return -ENODEV;
+
+ cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
+
+ /*
+ * It's possible that not all resources support the same number
+ * of RMIDs. Instead of making scheduling much more complicated
+ * (where we have to match a task's RMID to a cpu that supports
+ * that many RMIDs) just find the minimum RMIDs supported across
+ * all cpus.
+ *
+ * Also, check that the scales match on all cpus.
+ */
+ cpu_notifier_register_begin();
+
+ for_each_online_cpu(cpu) {
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->x86_cache_max_rmid < cqm_max_rmid)
+ cqm_max_rmid = c->x86_cache_max_rmid;
+
+ if (c->x86_cache_occ_scale != cqm_l3_scale) {
+ pr_err("Multiple LLC scale values, disabling\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ /*
+ * A reasonable upper limit on the max threshold is the number
+ * of lines tagged per RMID if all RMIDs have the same number of
+ * lines tagged in the LLC.
+ *
+ * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+ */
+ __intel_cqm_max_threshold =
+ boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
+
+ snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
+ str = kstrdup(scale, GFP_KERNEL);
+ if (!str) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ event_attr_intel_cqm_llc_scale.event_str = str;
+
+ ret = intel_cqm_setup_rmid_cache();
+ if (ret)
+ goto out;
+
+ for_each_online_cpu(i) {
+ intel_cqm_cpu_prepare(i);
+ cqm_pick_event_reader(i);
+ }
+
+ __perf_cpu_notifier(intel_cqm_cpu_notifier);
+
+ ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
+ if (ret)
+ pr_err("Intel CQM perf registration failed: %d\n", ret);
+ else
+ pr_info("Intel CQM monitoring enabled\n");
+
+out:
+ cpu_notifier_register_done();
+
+ return ret;
+}
+device_initcall(intel_cqm_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 073983398364..ca69ea56c712 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -461,7 +461,8 @@ void intel_pmu_enable_bts(u64 config)
debugctlmsr |= DEBUGCTLMSR_TR;
debugctlmsr |= DEBUGCTLMSR_BTS;
- debugctlmsr |= DEBUGCTLMSR_BTINT;
+ if (config & ARCH_PERFMON_EVENTSEL_INT)
+ debugctlmsr |= DEBUGCTLMSR_BTINT;
if (!(config & ARCH_PERFMON_EVENTSEL_OS))
debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
@@ -611,6 +612,10 @@ struct event_constraint intel_snb_pebs_event_constraints[] = {
INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
/* Allow all events as PEBS with no flags */
INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
EVENT_CONSTRAINT_END
@@ -622,6 +627,10 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = {
INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
/* Allow all events as PEBS with no flags */
INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
EVENT_CONSTRAINT_END
@@ -633,16 +642,16 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
- INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
- INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
- INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
/* Allow all events as PEBS with no flags */
INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
EVENT_CONSTRAINT_END
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 58f1a94beaf0..94e5b506caa6 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -39,6 +39,7 @@ static enum {
#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */
#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */
#define LBR_FAR_BIT 8 /* do not capture far branches */
+#define LBR_CALL_STACK_BIT 9 /* enable call stack */
#define LBR_KERNEL (1 << LBR_KERNEL_BIT)
#define LBR_USER (1 << LBR_USER_BIT)
@@ -49,6 +50,7 @@ static enum {
#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT)
#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT)
#define LBR_FAR (1 << LBR_FAR_BIT)
+#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT)
#define LBR_PLM (LBR_KERNEL | LBR_USER)
@@ -69,33 +71,31 @@ static enum {
#define LBR_FROM_FLAG_IN_TX (1ULL << 62)
#define LBR_FROM_FLAG_ABORT (1ULL << 61)
-#define for_each_branch_sample_type(x) \
- for ((x) = PERF_SAMPLE_BRANCH_USER; \
- (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
-
/*
* x86control flow change classification
* x86control flow changes include branches, interrupts, traps, faults
*/
enum {
- X86_BR_NONE = 0, /* unknown */
-
- X86_BR_USER = 1 << 0, /* branch target is user */
- X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
-
- X86_BR_CALL = 1 << 2, /* call */
- X86_BR_RET = 1 << 3, /* return */
- X86_BR_SYSCALL = 1 << 4, /* syscall */
- X86_BR_SYSRET = 1 << 5, /* syscall return */
- X86_BR_INT = 1 << 6, /* sw interrupt */
- X86_BR_IRET = 1 << 7, /* return from interrupt */
- X86_BR_JCC = 1 << 8, /* conditional */
- X86_BR_JMP = 1 << 9, /* jump */
- X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
- X86_BR_IND_CALL = 1 << 11,/* indirect calls */
- X86_BR_ABORT = 1 << 12,/* transaction abort */
- X86_BR_IN_TX = 1 << 13,/* in transaction */
- X86_BR_NO_TX = 1 << 14,/* not in transaction */
+ X86_BR_NONE = 0, /* unknown */
+
+ X86_BR_USER = 1 << 0, /* branch target is user */
+ X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
+
+ X86_BR_CALL = 1 << 2, /* call */
+ X86_BR_RET = 1 << 3, /* return */
+ X86_BR_SYSCALL = 1 << 4, /* syscall */
+ X86_BR_SYSRET = 1 << 5, /* syscall return */
+ X86_BR_INT = 1 << 6, /* sw interrupt */
+ X86_BR_IRET = 1 << 7, /* return from interrupt */
+ X86_BR_JCC = 1 << 8, /* conditional */
+ X86_BR_JMP = 1 << 9, /* jump */
+ X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
+ X86_BR_IND_CALL = 1 << 11,/* indirect calls */
+ X86_BR_ABORT = 1 << 12,/* transaction abort */
+ X86_BR_IN_TX = 1 << 13,/* in transaction */
+ X86_BR_NO_TX = 1 << 14,/* not in transaction */
+ X86_BR_ZERO_CALL = 1 << 15,/* zero length call */
+ X86_BR_CALL_STACK = 1 << 16,/* call stack */
};
#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -112,13 +112,15 @@ enum {
X86_BR_JMP |\
X86_BR_IRQ |\
X86_BR_ABORT |\
- X86_BR_IND_CALL)
+ X86_BR_IND_CALL |\
+ X86_BR_ZERO_CALL)
#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
#define X86_BR_ANY_CALL \
(X86_BR_CALL |\
X86_BR_IND_CALL |\
+ X86_BR_ZERO_CALL |\
X86_BR_SYSCALL |\
X86_BR_IRQ |\
X86_BR_INT)
@@ -130,17 +132,32 @@ static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
* otherwise it becomes near impossible to get a reliable stack.
*/
-static void __intel_pmu_lbr_enable(void)
+static void __intel_pmu_lbr_enable(bool pmi)
{
- u64 debugctl;
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u64 debugctl, lbr_select = 0, orig_debugctl;
- if (cpuc->lbr_sel)
- wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
+ /*
+ * No need to reprogram LBR_SELECT in a PMI, as it
+ * did not change.
+ */
+ if (cpuc->lbr_sel && !pmi) {
+ lbr_select = cpuc->lbr_sel->config;
+ wrmsrl(MSR_LBR_SELECT, lbr_select);
+ }
rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
- debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
- wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ orig_debugctl = debugctl;
+ debugctl |= DEBUGCTLMSR_LBR;
+ /*
+ * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
+ * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions
+ * may cause superfluous increase/decrease of LBR_TOS.
+ */
+ if (!(lbr_select & LBR_CALL_STACK))
+ debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+ if (orig_debugctl != debugctl)
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
}
static void __intel_pmu_lbr_disable(void)
@@ -181,9 +198,116 @@ void intel_pmu_lbr_reset(void)
intel_pmu_lbr_reset_64();
}
+/*
+ * TOS = most recently recorded branch
+ */
+static inline u64 intel_pmu_lbr_tos(void)
+{
+ u64 tos;
+
+ rdmsrl(x86_pmu.lbr_tos, tos);
+ return tos;
+}
+
+enum {
+ LBR_NONE,
+ LBR_VALID,
+};
+
+static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
+{
+ int i;
+ unsigned lbr_idx, mask;
+ u64 tos;
+
+ if (task_ctx->lbr_callstack_users == 0 ||
+ task_ctx->lbr_stack_state == LBR_NONE) {
+ intel_pmu_lbr_reset();
+ return;
+ }
+
+ mask = x86_pmu.lbr_nr - 1;
+ tos = intel_pmu_lbr_tos();
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ lbr_idx = (tos - i) & mask;
+ wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+ wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+ }
+ task_ctx->lbr_stack_state = LBR_NONE;
+}
+
+static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
+{
+ int i;
+ unsigned lbr_idx, mask;
+ u64 tos;
+
+ if (task_ctx->lbr_callstack_users == 0) {
+ task_ctx->lbr_stack_state = LBR_NONE;
+ return;
+ }
+
+ mask = x86_pmu.lbr_nr - 1;
+ tos = intel_pmu_lbr_tos();
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ lbr_idx = (tos - i) & mask;
+ rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+ rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+ }
+ task_ctx->lbr_stack_state = LBR_VALID;
+}
+
+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct x86_perf_task_context *task_ctx;
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ /*
+ * If LBR callstack feature is enabled and the stack was saved when
+ * the task was scheduled out, restore the stack. Otherwise flush
+ * the LBR stack.
+ */
+ task_ctx = ctx ? ctx->task_ctx_data : NULL;
+ if (task_ctx) {
+ if (sched_in) {
+ __intel_pmu_lbr_restore(task_ctx);
+ cpuc->lbr_context = ctx;
+ } else {
+ __intel_pmu_lbr_save(task_ctx);
+ }
+ return;
+ }
+
+ /*
+ * When sampling the branck stack in system-wide, it may be
+ * necessary to flush the stack on context switch. This happens
+ * when the branch stack does not tag its entries with the pid
+ * of the current task. Otherwise it becomes impossible to
+ * associate a branch entry with a task. This ambiguity is more
+ * likely to appear when the branch stack supports priv level
+ * filtering and the user sets it to monitor only at the user
+ * level (which could be a useful measurement in system-wide
+ * mode). In that case, the risk is high of having a branch
+ * stack with branch from multiple tasks.
+ */
+ if (sched_in) {
+ intel_pmu_lbr_reset();
+ cpuc->lbr_context = ctx;
+ }
+}
+
+static inline bool branch_user_callstack(unsigned br_sel)
+{
+ return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
+}
+
void intel_pmu_lbr_enable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct x86_perf_task_context *task_ctx;
if (!x86_pmu.lbr_nr)
return;
@@ -198,18 +322,33 @@ void intel_pmu_lbr_enable(struct perf_event *event)
}
cpuc->br_sel = event->hw.branch_reg.reg;
+ if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
+ event->ctx->task_ctx_data) {
+ task_ctx = event->ctx->task_ctx_data;
+ task_ctx->lbr_callstack_users++;
+ }
+
cpuc->lbr_users++;
+ perf_sched_cb_inc(event->ctx->pmu);
}
void intel_pmu_lbr_disable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct x86_perf_task_context *task_ctx;
if (!x86_pmu.lbr_nr)
return;
+ if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
+ event->ctx->task_ctx_data) {
+ task_ctx = event->ctx->task_ctx_data;
+ task_ctx->lbr_callstack_users--;
+ }
+
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
+ perf_sched_cb_dec(event->ctx->pmu);
if (cpuc->enabled && !cpuc->lbr_users) {
__intel_pmu_lbr_disable();
@@ -218,12 +357,12 @@ void intel_pmu_lbr_disable(struct perf_event *event)
}
}
-void intel_pmu_lbr_enable_all(void)
+void intel_pmu_lbr_enable_all(bool pmi)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (cpuc->lbr_users)
- __intel_pmu_lbr_enable();
+ __intel_pmu_lbr_enable(pmi);
}
void intel_pmu_lbr_disable_all(void)
@@ -234,18 +373,6 @@ void intel_pmu_lbr_disable_all(void)
__intel_pmu_lbr_disable();
}
-/*
- * TOS = most recently recorded branch
- */
-static inline u64 intel_pmu_lbr_tos(void)
-{
- u64 tos;
-
- rdmsrl(x86_pmu.lbr_tos, tos);
-
- return tos;
-}
-
static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
{
unsigned long mask = x86_pmu.lbr_nr - 1;
@@ -350,7 +477,7 @@ void intel_pmu_lbr_read(void)
* - in case there is no HW filter
* - in case the HW filter has errata or limitations
*/
-static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
+static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
{
u64 br_type = event->attr.branch_sample_type;
int mask = 0;
@@ -387,11 +514,21 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
if (br_type & PERF_SAMPLE_BRANCH_COND)
mask |= X86_BR_JCC;
+ if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) {
+ if (!x86_pmu_has_lbr_callstack())
+ return -EOPNOTSUPP;
+ if (mask & ~(X86_BR_USER | X86_BR_KERNEL))
+ return -EINVAL;
+ mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET |
+ X86_BR_CALL_STACK;
+ }
+
/*
* stash actual user request into reg, it may
* be used by fixup code for some CPU
*/
event->hw.branch_reg.reg = mask;
+ return 0;
}
/*
@@ -403,14 +540,14 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
{
struct hw_perf_event_extra *reg;
u64 br_type = event->attr.branch_sample_type;
- u64 mask = 0, m;
- u64 v;
+ u64 mask = 0, v;
+ int i;
- for_each_branch_sample_type(m) {
- if (!(br_type & m))
+ for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) {
+ if (!(br_type & (1ULL << i)))
continue;
- v = x86_pmu.lbr_sel_map[m];
+ v = x86_pmu.lbr_sel_map[i];
if (v == LBR_NOT_SUPP)
return -EOPNOTSUPP;
@@ -420,8 +557,12 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
reg = &event->hw.branch_reg;
reg->idx = EXTRA_REG_LBR;
- /* LBR_SELECT operates in suppress mode so invert mask */
- reg->config = ~mask & x86_pmu.lbr_sel_mask;
+ /*
+ * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
+ * in suppress mode. So LBR_SELECT should be set to
+ * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
+ */
+ reg->config = mask ^ x86_pmu.lbr_sel_mask;
return 0;
}
@@ -439,7 +580,9 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event)
/*
* setup SW LBR filter
*/
- intel_pmu_setup_sw_lbr_filter(event);
+ ret = intel_pmu_setup_sw_lbr_filter(event);
+ if (ret)
+ return ret;
/*
* setup HW LBR filter, if any
@@ -568,6 +711,12 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
ret = X86_BR_INT;
break;
case 0xe8: /* call near rel */
+ insn_get_immediate(&insn);
+ if (insn.immediate1.value == 0) {
+ /* zero length call */
+ ret = X86_BR_ZERO_CALL;
+ break;
+ }
case 0x9a: /* call far absolute */
ret = X86_BR_CALL;
break;
@@ -678,35 +827,49 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
/*
* Map interface branch filters onto LBR filters
*/
-static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
- [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
- [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
- [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
- [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
- [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP
- | LBR_IND_JMP | LBR_FAR,
+static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_REL_JMP
+ | LBR_IND_JMP | LBR_FAR,
/*
* NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
*/
- [PERF_SAMPLE_BRANCH_ANY_CALL] =
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] =
LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
/*
* NHM/WSM erratum: must include IND_JMP to capture IND_CALL
*/
- [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
- [PERF_SAMPLE_BRANCH_COND] = LBR_JCC,
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
+ [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
};
-static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
- [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
- [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
- [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
- [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
- [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR,
- [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL
- | LBR_FAR,
- [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL,
- [PERF_SAMPLE_BRANCH_COND] = LBR_JCC,
+static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
+ | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
+ [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
+};
+
+static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
+ | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
+ [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
+ [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
+ | LBR_RETURN | LBR_CALL_STACK,
};
/* core */
@@ -765,6 +928,20 @@ void __init intel_pmu_lbr_init_snb(void)
pr_cont("16-deep LBR, ");
}
+/* haswell */
+void intel_pmu_lbr_init_hsw(void)
+{
+ x86_pmu.lbr_nr = 16;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
+
+ pr_cont("16-deep LBR, ");
+}
+
/* atom */
void __init intel_pmu_lbr_init_atom(void)
{
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
new file mode 100644
index 000000000000..f2770641c0fd
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -0,0 +1,1103 @@
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+
+#include <asm/perf_event.h>
+#include <asm/insn.h>
+#include <asm/io.h>
+
+#include "perf_event.h"
+#include "intel_pt.h"
+
+static DEFINE_PER_CPU(struct pt, pt_ctx);
+
+static struct pt_pmu pt_pmu;
+
+enum cpuid_regs {
+ CR_EAX = 0,
+ CR_ECX,
+ CR_EDX,
+ CR_EBX
+};
+
+/*
+ * Capabilities of Intel PT hardware, such as number of address bits or
+ * supported output schemes, are cached and exported to userspace as "caps"
+ * attribute group of pt pmu device
+ * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
+ * relevant bits together with intel_pt traces.
+ *
+ * These are necessary for both trace decoding (payloads_lip, contains address
+ * width encoded in IP-related packets), and event configuration (bitmasks with
+ * permitted values for certain bit fields).
+ */
+#define PT_CAP(_n, _l, _r, _m) \
+ [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l, \
+ .reg = _r, .mask = _m }
+
+static struct pt_cap_desc {
+ const char *name;
+ u32 leaf;
+ u8 reg;
+ u32 mask;
+} pt_caps[] = {
+ PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff),
+ PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)),
+ PT_CAP(topa_output, 0, CR_ECX, BIT(0)),
+ PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)),
+ PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)),
+};
+
+static u32 pt_cap_get(enum pt_capabilities cap)
+{
+ struct pt_cap_desc *cd = &pt_caps[cap];
+ u32 c = pt_pmu.caps[cd->leaf * 4 + cd->reg];
+ unsigned int shift = __ffs(cd->mask);
+
+ return (c & cd->mask) >> shift;
+}
+
+static ssize_t pt_cap_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct dev_ext_attribute *ea =
+ container_of(attr, struct dev_ext_attribute, attr);
+ enum pt_capabilities cap = (long)ea->var;
+
+ return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
+}
+
+static struct attribute_group pt_cap_group = {
+ .name = "caps",
+};
+
+PMU_FORMAT_ATTR(tsc, "config:10" );
+PMU_FORMAT_ATTR(noretcomp, "config:11" );
+
+static struct attribute *pt_formats_attr[] = {
+ &format_attr_tsc.attr,
+ &format_attr_noretcomp.attr,
+ NULL,
+};
+
+static struct attribute_group pt_format_group = {
+ .name = "format",
+ .attrs = pt_formats_attr,
+};
+
+static const struct attribute_group *pt_attr_groups[] = {
+ &pt_cap_group,
+ &pt_format_group,
+ NULL,
+};
+
+static int __init pt_pmu_hw_init(void)
+{
+ struct dev_ext_attribute *de_attrs;
+ struct attribute **attrs;
+ size_t size;
+ int ret;
+ long i;
+
+ attrs = NULL;
+ ret = -ENODEV;
+ if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT))
+ goto fail;
+
+ for (i = 0; i < PT_CPUID_LEAVES; i++) {
+ cpuid_count(20, i,
+ &pt_pmu.caps[CR_EAX + i*4],
+ &pt_pmu.caps[CR_EBX + i*4],
+ &pt_pmu.caps[CR_ECX + i*4],
+ &pt_pmu.caps[CR_EDX + i*4]);
+ }
+
+ ret = -ENOMEM;
+ size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
+ attrs = kzalloc(size, GFP_KERNEL);
+ if (!attrs)
+ goto fail;
+
+ size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
+ de_attrs = kzalloc(size, GFP_KERNEL);
+ if (!de_attrs)
+ goto fail;
+
+ for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
+ struct dev_ext_attribute *de_attr = de_attrs + i;
+
+ de_attr->attr.attr.name = pt_caps[i].name;
+
+ sysfs_attr_init(&de_attrs->attr.attr);
+
+ de_attr->attr.attr.mode = S_IRUGO;
+ de_attr->attr.show = pt_cap_show;
+ de_attr->var = (void *)i;
+
+ attrs[i] = &de_attr->attr.attr;
+ }
+
+ pt_cap_group.attrs = attrs;
+
+ return 0;
+
+fail:
+ kfree(attrs);
+
+ return ret;
+}
+
+#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC)
+
+static bool pt_event_valid(struct perf_event *event)
+{
+ u64 config = event->attr.config;
+
+ if ((config & PT_CONFIG_MASK) != config)
+ return false;
+
+ return true;
+}
+
+/*
+ * PT configuration helpers
+ * These all are cpu affine and operate on a local PT
+ */
+
+static bool pt_is_running(void)
+{
+ u64 ctl;
+
+ rdmsrl(MSR_IA32_RTIT_CTL, ctl);
+
+ return !!(ctl & RTIT_CTL_TRACEEN);
+}
+
+static void pt_config(struct perf_event *event)
+{
+ u64 reg;
+
+ reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
+
+ if (!event->attr.exclude_kernel)
+ reg |= RTIT_CTL_OS;
+ if (!event->attr.exclude_user)
+ reg |= RTIT_CTL_USR;
+
+ reg |= (event->attr.config & PT_CONFIG_MASK);
+
+ wrmsrl(MSR_IA32_RTIT_CTL, reg);
+}
+
+static void pt_config_start(bool start)
+{
+ u64 ctl;
+
+ rdmsrl(MSR_IA32_RTIT_CTL, ctl);
+ if (start)
+ ctl |= RTIT_CTL_TRACEEN;
+ else
+ ctl &= ~RTIT_CTL_TRACEEN;
+ wrmsrl(MSR_IA32_RTIT_CTL, ctl);
+
+ /*
+ * A wrmsr that disables trace generation serializes other PT
+ * registers and causes all data packets to be written to memory,
+ * but a fence is required for the data to become globally visible.
+ *
+ * The below WMB, separating data store and aux_head store matches
+ * the consumer's RMB that separates aux_head load and data load.
+ */
+ if (!start)
+ wmb();
+}
+
+static void pt_config_buffer(void *buf, unsigned int topa_idx,
+ unsigned int output_off)
+{
+ u64 reg;
+
+ wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));
+
+ reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);
+
+ wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
+}
+
+/*
+ * Keep ToPA table-related metadata on the same page as the actual table,
+ * taking up a few words from the top
+ */
+
+#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1)
+
+/**
+ * struct topa - page-sized ToPA table with metadata at the top
+ * @table: actual ToPA table entries, as understood by PT hardware
+ * @list: linkage to struct pt_buffer's list of tables
+ * @phys: physical address of this page
+ * @offset: offset of the first entry in this table in the buffer
+ * @size: total size of all entries in this table
+ * @last: index of the last initialized entry in this table
+ */
+struct topa {
+ struct topa_entry table[TENTS_PER_PAGE];
+ struct list_head list;
+ u64 phys;
+ u64 offset;
+ size_t size;
+ int last;
+};
+
+/* make -1 stand for the last table entry */
+#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)])
+
+/**
+ * topa_alloc() - allocate page-sized ToPA table
+ * @cpu: CPU on which to allocate.
+ * @gfp: Allocation flags.
+ *
+ * Return: On success, return the pointer to ToPA table page.
+ */
+static struct topa *topa_alloc(int cpu, gfp_t gfp)
+{
+ int node = cpu_to_node(cpu);
+ struct topa *topa;
+ struct page *p;
+
+ p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
+ if (!p)
+ return NULL;
+
+ topa = page_address(p);
+ topa->last = 0;
+ topa->phys = page_to_phys(p);
+
+ /*
+ * In case of singe-entry ToPA, always put the self-referencing END
+ * link as the 2nd entry in the table
+ */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
+ TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
+ TOPA_ENTRY(topa, 1)->end = 1;
+ }
+
+ return topa;
+}
+
+/**
+ * topa_free() - free a page-sized ToPA table
+ * @topa: Table to deallocate.
+ */
+static void topa_free(struct topa *topa)
+{
+ free_page((unsigned long)topa);
+}
+
+/**
+ * topa_insert_table() - insert a ToPA table into a buffer
+ * @buf: PT buffer that's being extended.
+ * @topa: New topa table to be inserted.
+ *
+ * If it's the first table in this buffer, set up buffer's pointers
+ * accordingly; otherwise, add a END=1 link entry to @topa to the current
+ * "last" table and adjust the last table pointer to @topa.
+ */
+static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
+{
+ struct topa *last = buf->last;
+
+ list_add_tail(&topa->list, &buf->tables);
+
+ if (!buf->first) {
+ buf->first = buf->last = buf->cur = topa;
+ return;
+ }
+
+ topa->offset = last->offset + last->size;
+ buf->last = topa;
+
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+ return;
+
+ BUG_ON(last->last != TENTS_PER_PAGE - 1);
+
+ TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT;
+ TOPA_ENTRY(last, -1)->end = 1;
+}
+
+/**
+ * topa_table_full() - check if a ToPA table is filled up
+ * @topa: ToPA table.
+ */
+static bool topa_table_full(struct topa *topa)
+{
+ /* single-entry ToPA is a special case */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+ return !!topa->last;
+
+ return topa->last == TENTS_PER_PAGE - 1;
+}
+
+/**
+ * topa_insert_pages() - create a list of ToPA tables
+ * @buf: PT buffer being initialized.
+ * @gfp: Allocation flags.
+ *
+ * This initializes a list of ToPA tables with entries from
+ * the data_pages provided by rb_alloc_aux().
+ *
+ * Return: 0 on success or error code.
+ */
+static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
+{
+ struct topa *topa = buf->last;
+ int order = 0;
+ struct page *p;
+
+ p = virt_to_page(buf->data_pages[buf->nr_pages]);
+ if (PagePrivate(p))
+ order = page_private(p);
+
+ if (topa_table_full(topa)) {
+ topa = topa_alloc(buf->cpu, gfp);
+ if (!topa)
+ return -ENOMEM;
+
+ topa_insert_table(buf, topa);
+ }
+
+ TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
+ TOPA_ENTRY(topa, -1)->size = order;
+ if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) {
+ TOPA_ENTRY(topa, -1)->intr = 1;
+ TOPA_ENTRY(topa, -1)->stop = 1;
+ }
+
+ topa->last++;
+ topa->size += sizes(order);
+
+ buf->nr_pages += 1ul << order;
+
+ return 0;
+}
+
+/**
+ * pt_topa_dump() - print ToPA tables and their entries
+ * @buf: PT buffer.
+ */
+static void pt_topa_dump(struct pt_buffer *buf)
+{
+ struct topa *topa;
+
+ list_for_each_entry(topa, &buf->tables, list) {
+ int i;
+
+ pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table,
+ topa->phys, topa->offset, topa->size);
+ for (i = 0; i < TENTS_PER_PAGE; i++) {
+ pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
+ &topa->table[i],
+ (unsigned long)topa->table[i].base << TOPA_SHIFT,
+ sizes(topa->table[i].size),
+ topa->table[i].end ? 'E' : ' ',
+ topa->table[i].intr ? 'I' : ' ',
+ topa->table[i].stop ? 'S' : ' ',
+ *(u64 *)&topa->table[i]);
+ if ((pt_cap_get(PT_CAP_topa_multiple_entries) &&
+ topa->table[i].stop) ||
+ topa->table[i].end)
+ break;
+ }
+ }
+}
+
+/**
+ * pt_buffer_advance() - advance to the next output region
+ * @buf: PT buffer.
+ *
+ * Advance the current pointers in the buffer to the next ToPA entry.
+ */
+static void pt_buffer_advance(struct pt_buffer *buf)
+{
+ buf->output_off = 0;
+ buf->cur_idx++;
+
+ if (buf->cur_idx == buf->cur->last) {
+ if (buf->cur == buf->last)
+ buf->cur = buf->first;
+ else
+ buf->cur = list_entry(buf->cur->list.next, struct topa,
+ list);
+ buf->cur_idx = 0;
+ }
+}
+
+/**
+ * pt_update_head() - calculate current offsets and sizes
+ * @pt: Per-cpu pt context.
+ *
+ * Update buffer's current write pointer position and data size.
+ */
+static void pt_update_head(struct pt *pt)
+{
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+ u64 topa_idx, base, old;
+
+ /* offset of the first region in this table from the beginning of buf */
+ base = buf->cur->offset + buf->output_off;
+
+ /* offset of the current output region within this table */
+ for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
+ base += sizes(buf->cur->table[topa_idx].size);
+
+ if (buf->snapshot) {
+ local_set(&buf->data_size, base);
+ } else {
+ old = (local64_xchg(&buf->head, base) &
+ ((buf->nr_pages << PAGE_SHIFT) - 1));
+ if (base < old)
+ base += buf->nr_pages << PAGE_SHIFT;
+
+ local_add(base - old, &buf->data_size);
+ }
+}
+
+/**
+ * pt_buffer_region() - obtain current output region's address
+ * @buf: PT buffer.
+ */
+static void *pt_buffer_region(struct pt_buffer *buf)
+{
+ return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT);
+}
+
+/**
+ * pt_buffer_region_size() - obtain current output region's size
+ * @buf: PT buffer.
+ */
+static size_t pt_buffer_region_size(struct pt_buffer *buf)
+{
+ return sizes(buf->cur->table[buf->cur_idx].size);
+}
+
+/**
+ * pt_handle_status() - take care of possible status conditions
+ * @pt: Per-cpu pt context.
+ */
+static void pt_handle_status(struct pt *pt)
+{
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+ int advance = 0;
+ u64 status;
+
+ rdmsrl(MSR_IA32_RTIT_STATUS, status);
+
+ if (status & RTIT_STATUS_ERROR) {
+ pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
+ pt_topa_dump(buf);
+ status &= ~RTIT_STATUS_ERROR;
+ }
+
+ if (status & RTIT_STATUS_STOPPED) {
+ status &= ~RTIT_STATUS_STOPPED;
+
+ /*
+ * On systems that only do single-entry ToPA, hitting STOP
+ * means we are already losing data; need to let the decoder
+ * know.
+ */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
+ buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
+ local_inc(&buf->lost);
+ advance++;
+ }
+ }
+
+ /*
+ * Also on single-entry ToPA implementations, interrupt will come
+ * before the output reaches its output region's boundary.
+ */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot &&
+ pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
+ void *head = pt_buffer_region(buf);
+
+ /* everything within this margin needs to be zeroed out */
+ memset(head + buf->output_off, 0,
+ pt_buffer_region_size(buf) -
+ buf->output_off);
+ advance++;
+ }
+
+ if (advance)
+ pt_buffer_advance(buf);
+
+ wrmsrl(MSR_IA32_RTIT_STATUS, status);
+}
+
+/**
+ * pt_read_offset() - translate registers into buffer pointers
+ * @buf: PT buffer.
+ *
+ * Set buffer's output pointers from MSR values.
+ */
+static void pt_read_offset(struct pt_buffer *buf)
+{
+ u64 offset, base_topa;
+
+ rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
+ buf->cur = phys_to_virt(base_topa);
+
+ rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
+ /* offset within current output region */
+ buf->output_off = offset >> 32;
+ /* index of current output region within this table */
+ buf->cur_idx = (offset & 0xffffff80) >> 7;
+}
+
+/**
+ * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry
+ * @buf: PT buffer.
+ * @pg: Page offset in the buffer.
+ *
+ * When advancing to the next output region (ToPA entry), given a page offset
+ * into the buffer, we need to find the offset of the first page in the next
+ * region.
+ */
+static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
+{
+ struct topa_entry *te = buf->topa_index[pg];
+
+ /* one region */
+ if (buf->first == buf->last && buf->first->last == 1)
+ return pg;
+
+ do {
+ pg++;
+ pg &= buf->nr_pages - 1;
+ } while (buf->topa_index[pg] == te);
+
+ return pg;
+}
+
+/**
+ * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
+ * @buf: PT buffer.
+ * @handle: Current output handle.
+ *
+ * Place INT and STOP marks to prevent overwriting old data that the consumer
+ * hasn't yet collected.
+ */
+static int pt_buffer_reset_markers(struct pt_buffer *buf,
+ struct perf_output_handle *handle)
+
+{
+ unsigned long idx, npages, end;
+
+ if (buf->snapshot)
+ return 0;
+
+ /* can't stop in the middle of an output region */
+ if (buf->output_off + handle->size + 1 <
+ sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
+ return -EINVAL;
+
+
+ /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+ return 0;
+
+ /* clear STOP and INT from current entry */
+ buf->topa_index[buf->stop_pos]->stop = 0;
+ buf->topa_index[buf->intr_pos]->intr = 0;
+
+ if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
+ npages = (handle->size + 1) >> PAGE_SHIFT;
+ end = (local64_read(&buf->head) >> PAGE_SHIFT) + npages;
+ /*if (end > handle->wakeup >> PAGE_SHIFT)
+ end = handle->wakeup >> PAGE_SHIFT;*/
+ idx = end & (buf->nr_pages - 1);
+ buf->stop_pos = idx;
+ idx = (local64_read(&buf->head) >> PAGE_SHIFT) + npages - 1;
+ idx &= buf->nr_pages - 1;
+ buf->intr_pos = idx;
+ }
+
+ buf->topa_index[buf->stop_pos]->stop = 1;
+ buf->topa_index[buf->intr_pos]->intr = 1;
+
+ return 0;
+}
+
+/**
+ * pt_buffer_setup_topa_index() - build topa_index[] table of regions
+ * @buf: PT buffer.
+ *
+ * topa_index[] references output regions indexed by offset into the
+ * buffer for purposes of quick reverse lookup.
+ */
+static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
+{
+ struct topa *cur = buf->first, *prev = buf->last;
+ struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
+ *te_prev = TOPA_ENTRY(prev, prev->last - 1);
+ int pg = 0, idx = 0, ntopa = 0;
+
+ while (pg < buf->nr_pages) {
+ int tidx;
+
+ /* pages within one topa entry */
+ for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++)
+ buf->topa_index[pg] = te_prev;
+
+ te_prev = te_cur;
+
+ if (idx == cur->last - 1) {
+ /* advance to next topa table */
+ idx = 0;
+ cur = list_entry(cur->list.next, struct topa, list);
+ ntopa++;
+ } else
+ idx++;
+ te_cur = TOPA_ENTRY(cur, idx);
+ }
+
+}
+
+/**
+ * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
+ * @buf: PT buffer.
+ * @head: Write pointer (aux_head) from AUX buffer.
+ *
+ * Find the ToPA table and entry corresponding to given @head and set buffer's
+ * "current" pointers accordingly.
+ */
+static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
+{
+ int pg;
+
+ if (buf->snapshot)
+ head &= (buf->nr_pages << PAGE_SHIFT) - 1;
+
+ pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
+ pg = pt_topa_next_entry(buf, pg);
+
+ buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK);
+ buf->cur_idx = ((unsigned long)buf->topa_index[pg] -
+ (unsigned long)buf->cur) / sizeof(struct topa_entry);
+ buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1);
+
+ local64_set(&buf->head, head);
+ local_set(&buf->data_size, 0);
+}
+
+/**
+ * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
+ * @buf: PT buffer.
+ */
+static void pt_buffer_fini_topa(struct pt_buffer *buf)
+{
+ struct topa *topa, *iter;
+
+ list_for_each_entry_safe(topa, iter, &buf->tables, list) {
+ /*
+ * right now, this is in free_aux() path only, so
+ * no need to unlink this table from the list
+ */
+ topa_free(topa);
+ }
+}
+
+/**
+ * pt_buffer_init_topa() - initialize ToPA table for pt buffer
+ * @buf: PT buffer.
+ * @size: Total size of all regions within this ToPA.
+ * @gfp: Allocation flags.
+ */
+static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
+ gfp_t gfp)
+{
+ struct topa *topa;
+ int err;
+
+ topa = topa_alloc(buf->cpu, gfp);
+ if (!topa)
+ return -ENOMEM;
+
+ topa_insert_table(buf, topa);
+
+ while (buf->nr_pages < nr_pages) {
+ err = topa_insert_pages(buf, gfp);
+ if (err) {
+ pt_buffer_fini_topa(buf);
+ return -ENOMEM;
+ }
+ }
+
+ pt_buffer_setup_topa_index(buf);
+
+ /* link last table to the first one, unless we're double buffering */
+ if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
+ TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT;
+ TOPA_ENTRY(buf->last, -1)->end = 1;
+ }
+
+ pt_topa_dump(buf);
+ return 0;
+}
+
+/**
+ * pt_buffer_setup_aux() - set up topa tables for a PT buffer
+ * @cpu: Cpu on which to allocate, -1 means current.
+ * @pages: Array of pointers to buffer pages passed from perf core.
+ * @nr_pages: Number of pages in the buffer.
+ * @snapshot: If this is a snapshot/overwrite counter.
+ *
+ * This is a pmu::setup_aux callback that sets up ToPA tables and all the
+ * bookkeeping for an AUX buffer.
+ *
+ * Return: Our private PT buffer structure.
+ */
+static void *
+pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot)
+{
+ struct pt_buffer *buf;
+ int node, ret;
+
+ if (!nr_pages)
+ return NULL;
+
+ if (cpu == -1)
+ cpu = raw_smp_processor_id();
+ node = cpu_to_node(cpu);
+
+ buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]),
+ GFP_KERNEL, node);
+ if (!buf)
+ return NULL;
+
+ buf->cpu = cpu;
+ buf->snapshot = snapshot;
+ buf->data_pages = pages;
+
+ INIT_LIST_HEAD(&buf->tables);
+
+ ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL);
+ if (ret) {
+ kfree(buf);
+ return NULL;
+ }
+
+ return buf;
+}
+
+/**
+ * pt_buffer_free_aux() - perf AUX deallocation path callback
+ * @data: PT buffer.
+ */
+static void pt_buffer_free_aux(void *data)
+{
+ struct pt_buffer *buf = data;
+
+ pt_buffer_fini_topa(buf);
+ kfree(buf);
+}
+
+/**
+ * pt_buffer_is_full() - check if the buffer is full
+ * @buf: PT buffer.
+ * @pt: Per-cpu pt handle.
+ *
+ * If the user hasn't read data from the output region that aux_head
+ * points to, the buffer is considered full: the user needs to read at
+ * least this region and update aux_tail to point past it.
+ */
+static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt)
+{
+ if (buf->snapshot)
+ return false;
+
+ if (local_read(&buf->data_size) >= pt->handle.size)
+ return true;
+
+ return false;
+}
+
+/**
+ * intel_pt_interrupt() - PT PMI handler
+ */
+void intel_pt_interrupt(void)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf;
+ struct perf_event *event = pt->handle.event;
+
+ /*
+ * There may be a dangling PT bit in the interrupt status register
+ * after PT has been disabled by pt_event_stop(). Make sure we don't
+ * do anything (particularly, re-enable) for this event here.
+ */
+ if (!ACCESS_ONCE(pt->handle_nmi))
+ return;
+
+ pt_config_start(false);
+
+ if (!event)
+ return;
+
+ buf = perf_get_aux(&pt->handle);
+ if (!buf)
+ return;
+
+ pt_read_offset(buf);
+
+ pt_handle_status(pt);
+
+ pt_update_head(pt);
+
+ perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
+ local_xchg(&buf->lost, 0));
+
+ if (!event->hw.state) {
+ int ret;
+
+ buf = perf_aux_output_begin(&pt->handle, event);
+ if (!buf) {
+ event->hw.state = PERF_HES_STOPPED;
+ return;
+ }
+
+ pt_buffer_reset_offsets(buf, pt->handle.head);
+ ret = pt_buffer_reset_markers(buf, &pt->handle);
+ if (ret) {
+ perf_aux_output_end(&pt->handle, 0, true);
+ return;
+ }
+
+ pt_config_buffer(buf->cur->table, buf->cur_idx,
+ buf->output_off);
+ wrmsrl(MSR_IA32_RTIT_STATUS, 0);
+ pt_config(event);
+ }
+}
+
+/*
+ * PMU callbacks
+ */
+
+static void pt_event_start(struct perf_event *event, int mode)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+ if (pt_is_running() || !buf || pt_buffer_is_full(buf, pt)) {
+ event->hw.state = PERF_HES_STOPPED;
+ return;
+ }
+
+ ACCESS_ONCE(pt->handle_nmi) = 1;
+ event->hw.state = 0;
+
+ pt_config_buffer(buf->cur->table, buf->cur_idx,
+ buf->output_off);
+ wrmsrl(MSR_IA32_RTIT_STATUS, 0);
+ pt_config(event);
+}
+
+static void pt_event_stop(struct perf_event *event, int mode)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+
+ /*
+ * Protect against the PMI racing with disabling wrmsr,
+ * see comment in intel_pt_interrupt().
+ */
+ ACCESS_ONCE(pt->handle_nmi) = 0;
+ pt_config_start(false);
+
+ if (event->hw.state == PERF_HES_STOPPED)
+ return;
+
+ event->hw.state = PERF_HES_STOPPED;
+
+ if (mode & PERF_EF_UPDATE) {
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+ if (!buf)
+ return;
+
+ if (WARN_ON_ONCE(pt->handle.event != event))
+ return;
+
+ pt_read_offset(buf);
+
+ pt_handle_status(pt);
+
+ pt_update_head(pt);
+ }
+}
+
+static void pt_event_del(struct perf_event *event, int mode)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf;
+
+ pt_event_stop(event, PERF_EF_UPDATE);
+
+ buf = perf_get_aux(&pt->handle);
+
+ if (buf) {
+ if (buf->snapshot)
+ pt->handle.head =
+ local_xchg(&buf->data_size,
+ buf->nr_pages << PAGE_SHIFT);
+ perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
+ local_xchg(&buf->lost, 0));
+ }
+}
+
+static int pt_event_add(struct perf_event *event, int mode)
+{
+ struct pt_buffer *buf;
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct hw_perf_event *hwc = &event->hw;
+ int ret = -EBUSY;
+
+ if (pt->handle.event)
+ goto out;
+
+ buf = perf_aux_output_begin(&pt->handle, event);
+ if (!buf) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ pt_buffer_reset_offsets(buf, pt->handle.head);
+ if (!buf->snapshot) {
+ ret = pt_buffer_reset_markers(buf, &pt->handle);
+ if (ret) {
+ perf_aux_output_end(&pt->handle, 0, true);
+ goto out;
+ }
+ }
+
+ if (mode & PERF_EF_START) {
+ pt_event_start(event, 0);
+ if (hwc->state == PERF_HES_STOPPED) {
+ pt_event_del(event, 0);
+ ret = -EBUSY;
+ }
+ } else {
+ hwc->state = PERF_HES_STOPPED;
+ }
+
+ ret = 0;
+out:
+
+ if (ret)
+ hwc->state = PERF_HES_STOPPED;
+
+ return ret;
+}
+
+static void pt_event_read(struct perf_event *event)
+{
+}
+
+static void pt_event_destroy(struct perf_event *event)
+{
+ x86_del_exclusive(x86_lbr_exclusive_pt);
+}
+
+static int pt_event_init(struct perf_event *event)
+{
+ if (event->attr.type != pt_pmu.pmu.type)
+ return -ENOENT;
+
+ if (!pt_event_valid(event))
+ return -EINVAL;
+
+ if (x86_add_exclusive(x86_lbr_exclusive_pt))
+ return -EBUSY;
+
+ event->destroy = pt_event_destroy;
+
+ return 0;
+}
+
+static __init int pt_init(void)
+{
+ int ret, cpu, prior_warn = 0;
+
+ BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ u64 ctl;
+
+ ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
+ if (!ret && (ctl & RTIT_CTL_TRACEEN))
+ prior_warn++;
+ }
+ put_online_cpus();
+
+ if (prior_warn) {
+ x86_add_exclusive(x86_lbr_exclusive_pt);
+ pr_warn("PT is enabled at boot time, doing nothing\n");
+
+ return -EBUSY;
+ }
+
+ ret = pt_pmu_hw_init();
+ if (ret)
+ return ret;
+
+ if (!pt_cap_get(PT_CAP_topa_output)) {
+ pr_warn("ToPA output is not supported on this CPU\n");
+ return -ENODEV;
+ }
+
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+ pt_pmu.pmu.capabilities =
+ PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;
+
+ pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
+ pt_pmu.pmu.attr_groups = pt_attr_groups;
+ pt_pmu.pmu.task_ctx_nr = perf_sw_context;
+ pt_pmu.pmu.event_init = pt_event_init;
+ pt_pmu.pmu.add = pt_event_add;
+ pt_pmu.pmu.del = pt_event_del;
+ pt_pmu.pmu.start = pt_event_start;
+ pt_pmu.pmu.stop = pt_event_stop;
+ pt_pmu.pmu.read = pt_event_read;
+ pt_pmu.pmu.setup_aux = pt_buffer_setup_aux;
+ pt_pmu.pmu.free_aux = pt_buffer_free_aux;
+ ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
+
+ return ret;
+}
+
+module_init(pt_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
index 21af6149edf2..12d9548457e7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -1132,8 +1132,7 @@ static int snbep_pci2phy_map_init(int devid)
}
}
- if (ubox_dev)
- pci_dev_put(ubox_dev);
+ pci_dev_put(ubox_dev);
return err ? pcibios_err_to_errno(err) : 0;
}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 60639093d536..3d423a101fae 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -41,6 +41,7 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{ X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 },
{ X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 },
{ X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 },
+ { X86_FEATURE_INTEL_PT, CR_EBX,25, 0x00000007, 0 },
{ X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
{ X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 },
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index aceb2f90c716..c76d3e37c6e1 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -105,7 +105,7 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
#ifdef CONFIG_X86_32
struct pt_regs fixed_regs;
- if (!user_mode_vm(regs)) {
+ if (!user_mode(regs)) {
crash_fixup_ss_esp(&fixed_regs, regs);
regs = &fixed_regs;
}
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 3d3503351242..6367a780cc8c 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -286,13 +286,13 @@ static void __init x86_flattree_get_config(void)
initial_boot_params = dt = early_memremap(initial_dtb, map_len);
size = of_get_flat_dt_size();
if (map_len < size) {
- early_iounmap(dt, map_len);
+ early_memunmap(dt, map_len);
initial_boot_params = dt = early_memremap(initial_dtb, size);
map_len = size;
}
unflatten_and_copy_device_tree();
- early_iounmap(dt, map_len);
+ early_memunmap(dt, map_len);
}
#else
static inline void x86_flattree_get_config(void) { }
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index cf3df1d8d039..9c30acfadae2 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -25,10 +25,12 @@ unsigned int code_bytes = 64;
int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
static int die_counter;
-static void printk_stack_address(unsigned long address, int reliable)
+static void printk_stack_address(unsigned long address, int reliable,
+ void *data)
{
- pr_cont(" [<%p>] %s%pB\n",
- (void *)address, reliable ? "" : "? ", (void *)address);
+ printk("%s [<%p>] %s%pB\n",
+ (char *)data, (void *)address, reliable ? "" : "? ",
+ (void *)address);
}
void printk_address(unsigned long address)
@@ -155,8 +157,7 @@ static int print_trace_stack(void *data, char *name)
static void print_trace_address(void *data, unsigned long addr, int reliable)
{
touch_nmi_watchdog();
- printk(data);
- printk_stack_address(addr, reliable);
+ printk_stack_address(addr, reliable, data);
}
static const struct stacktrace_ops print_trace_ops = {
@@ -278,7 +279,7 @@ int __die(const char *str, struct pt_regs *regs, long err)
print_modules();
show_regs(regs);
#ifdef CONFIG_X86_32
- if (user_mode_vm(regs)) {
+ if (user_mode(regs)) {
sp = regs->sp;
ss = regs->ss & 0xffff;
} else {
@@ -307,7 +308,7 @@ void die(const char *str, struct pt_regs *regs, long err)
unsigned long flags = oops_begin();
int sig = SIGSEGV;
- if (!user_mode_vm(regs))
+ if (!user_mode(regs))
report_bug(regs->ip, regs);
if (__die(str, regs, err))
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 5abd4cd4230c..464ffd69b92e 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -108,9 +108,12 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
for (i = 0; i < kstack_depth_to_print; i++) {
if (kstack_end(stack))
break;
- if (i && ((i % STACKSLOTS_PER_LINE) == 0))
- pr_cont("\n");
- pr_cont(" %08lx", *stack++);
+ if ((i % STACKSLOTS_PER_LINE) == 0) {
+ if (i != 0)
+ pr_cont("\n");
+ printk("%s %08lx", log_lvl, *stack++);
+ } else
+ pr_cont(" %08lx", *stack++);
touch_nmi_watchdog();
}
pr_cont("\n");
@@ -123,13 +126,13 @@ void show_regs(struct pt_regs *regs)
int i;
show_regs_print_info(KERN_EMERG);
- __show_regs(regs, !user_mode_vm(regs));
+ __show_regs(regs, !user_mode(regs));
/*
* When in-kernel, we also print out the stack and code at the
* time of the fault..
*/
- if (!user_mode_vm(regs)) {
+ if (!user_mode(regs)) {
unsigned int code_prologue = code_bytes * 43 / 64;
unsigned int code_len = code_bytes;
unsigned char c;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index ff86f19b5758..5f1c6266eb30 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -280,12 +280,15 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
pr_cont(" <EOI> ");
}
} else {
- if (((long) stack & (THREAD_SIZE-1)) == 0)
+ if (kstack_end(stack))
break;
}
- if (i && ((i % STACKSLOTS_PER_LINE) == 0))
- pr_cont("\n");
- pr_cont(" %016lx", *stack++);
+ if ((i % STACKSLOTS_PER_LINE) == 0) {
+ if (i != 0)
+ pr_cont("\n");
+ printk("%s %016lx", log_lvl, *stack++);
+ } else
+ pr_cont(" %016lx", *stack++);
touch_nmi_watchdog();
}
preempt_enable();
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 46201deee923..7d46bb260334 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -661,7 +661,7 @@ void __init parse_e820_ext(u64 phys_addr, u32 data_len)
extmap = (struct e820entry *)(sdata->data);
__append_e820_map(extmap, entries);
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
- early_iounmap(sdata, data_len);
+ early_memunmap(sdata, data_len);
printk(KERN_INFO "e820: extended physical RAM map:\n");
e820_print_map("extended");
}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index a62536a1be88..49ff55ef9b26 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -95,20 +95,6 @@ static unsigned long early_serial_base = 0x3f8; /* ttyS0 */
#define DLL 0 /* Divisor Latch Low */
#define DLH 1 /* Divisor latch High */
-static void mem32_serial_out(unsigned long addr, int offset, int value)
-{
- uint32_t *vaddr = (uint32_t *)addr;
- /* shift implied by pointer type */
- writel(value, vaddr + offset);
-}
-
-static unsigned int mem32_serial_in(unsigned long addr, int offset)
-{
- uint32_t *vaddr = (uint32_t *)addr;
- /* shift implied by pointer type */
- return readl(vaddr + offset);
-}
-
static unsigned int io_serial_in(unsigned long addr, int offset)
{
return inb(addr + offset);
@@ -205,6 +191,20 @@ static __init void early_serial_init(char *s)
}
#ifdef CONFIG_PCI
+static void mem32_serial_out(unsigned long addr, int offset, int value)
+{
+ u32 *vaddr = (u32 *)addr;
+ /* shift implied by pointer type */
+ writel(value, vaddr + offset);
+}
+
+static unsigned int mem32_serial_in(unsigned long addr, int offset)
+{
+ u32 *vaddr = (u32 *)addr;
+ /* shift implied by pointer type */
+ return readl(vaddr + offset);
+}
+
/*
* early_pci_serial_init()
*
@@ -217,8 +217,8 @@ static __init void early_pci_serial_init(char *s)
unsigned divisor;
unsigned long baud = DEFAULT_BAUD;
u8 bus, slot, func;
- uint32_t classcode, bar0;
- uint16_t cmdreg;
+ u32 classcode, bar0;
+ u16 cmdreg;
char *e;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 31e2d5bf3e38..1c309763e321 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -395,10 +395,13 @@ sysenter_past_esp:
/*CFI_REL_OFFSET cs, 0*/
/*
* Push current_thread_info()->sysenter_return to the stack.
- * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
- * pushed above; +8 corresponds to copy_thread's esp0 setting.
+ * A tiny bit of offset fixup is necessary: TI_sysenter_return
+ * is relative to thread_info, which is at the bottom of the
+ * kernel stack page. 4*4 means the 4 words pushed above;
+ * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack;
+ * and THREAD_SIZE takes us to the bottom.
*/
- pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp)
+ pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
CFI_REL_OFFSET eip, 0
pushl_cfi %eax
@@ -432,7 +435,7 @@ sysenter_after_call:
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
testl $_TIF_ALLWORK_MASK, %ecx
- jne sysexit_audit
+ jnz sysexit_audit
sysenter_exit:
/* if something modifies registers it must also disable sysexit */
movl PT_EIP(%esp), %edx
@@ -460,7 +463,7 @@ sysenter_audit:
sysexit_audit:
testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
- jne syscall_exit_work
+ jnz syscall_exit_work
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY)
movl %eax,%edx /* second arg, syscall return value */
@@ -472,7 +475,7 @@ sysexit_audit:
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
- jne syscall_exit_work
+ jnz syscall_exit_work
movl PT_EAX(%esp),%eax /* reload syscall return value */
jmp sysenter_exit
#endif
@@ -510,7 +513,7 @@ syscall_exit:
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
testl $_TIF_ALLWORK_MASK, %ecx # current->work
- jne syscall_exit_work
+ jnz syscall_exit_work
restore_all:
TRACE_IRQS_IRET
@@ -612,7 +615,7 @@ work_notifysig: # deal with pending signals and
#ifdef CONFIG_VM86
testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
movl %esp, %eax
- jne work_notifysig_v86 # returning to kernel-space or
+ jnz work_notifysig_v86 # returning to kernel-space or
# vm86-space
1:
#else
@@ -720,43 +723,22 @@ END(sysenter_badsys)
.endm
/*
- * Build the entry stubs and pointer table with some assembler magic.
- * We pack 7 stubs into a single 32-byte chunk, which will fit in a
- * single cache line on all modern x86 implementations.
+ * Build the entry stubs with some assembler magic.
+ * We pack 1 stub into every 8-byte block.
*/
-.section .init.rodata,"a"
-ENTRY(interrupt)
-.section .entry.text, "ax"
- .p2align 5
- .p2align CONFIG_X86_L1_CACHE_SHIFT
+ .align 8
ENTRY(irq_entries_start)
RING0_INT_FRAME
-vector=FIRST_EXTERNAL_VECTOR
-.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7
- .balign 32
- .rept 7
- .if vector < FIRST_SYSTEM_VECTOR
- .if vector <> FIRST_EXTERNAL_VECTOR
+ vector=FIRST_EXTERNAL_VECTOR
+ .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+ pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */
+ vector=vector+1
+ jmp common_interrupt
CFI_ADJUST_CFA_OFFSET -4
- .endif
-1: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */
- .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
- jmp 2f
- .endif
- .previous
- .long 1b
- .section .entry.text, "ax"
-vector=vector+1
- .endif
- .endr
-2: jmp common_interrupt
-.endr
+ .align 8
+ .endr
END(irq_entries_start)
-.previous
-END(interrupt)
-.previous
-
/*
* the CPU automatically disables interrupts when executing an IRQ vector,
* so IRQ-flags tracing has to follow that:
@@ -816,15 +798,9 @@ ENTRY(simd_coprocessor_error)
pushl_cfi $0
#ifdef CONFIG_X86_INVD_BUG
/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
-661: pushl_cfi $do_general_protection
-662:
-.section .altinstructions,"a"
- altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f
-.previous
-.section .altinstr_replacement,"ax"
-663: pushl $do_simd_coprocessor_error
-664:
-.previous
+ ALTERNATIVE "pushl_cfi $do_general_protection", \
+ "pushl $do_simd_coprocessor_error", \
+ X86_FEATURE_XMM
#else
pushl_cfi $do_simd_coprocessor_error
#endif
@@ -1240,20 +1216,13 @@ error_code:
/*CFI_REL_OFFSET es, 0*/
pushl_cfi %ds
/*CFI_REL_OFFSET ds, 0*/
- pushl_cfi %eax
- CFI_REL_OFFSET eax, 0
- pushl_cfi %ebp
- CFI_REL_OFFSET ebp, 0
- pushl_cfi %edi
- CFI_REL_OFFSET edi, 0
- pushl_cfi %esi
- CFI_REL_OFFSET esi, 0
- pushl_cfi %edx
- CFI_REL_OFFSET edx, 0
- pushl_cfi %ecx
- CFI_REL_OFFSET ecx, 0
- pushl_cfi %ebx
- CFI_REL_OFFSET ebx, 0
+ pushl_cfi_reg eax
+ pushl_cfi_reg ebp
+ pushl_cfi_reg edi
+ pushl_cfi_reg esi
+ pushl_cfi_reg edx
+ pushl_cfi_reg ecx
+ pushl_cfi_reg ebx
cld
movl $(__KERNEL_PERCPU), %ecx
movl %ecx, %fs
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index f0095a76c182..c7b238494b31 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -14,27 +14,14 @@
* NOTE: This code handles signal-recognition, which happens every time
* after an interrupt and after each system call.
*
- * Normal syscalls and interrupts don't save a full stack frame, this is
- * only done for syscall tracing, signals or fork/exec et.al.
- *
* A note on terminology:
- * - top of stack: Architecture defined interrupt frame from SS to RIP
+ * - iret frame: Architecture defined interrupt frame from SS to RIP
* at the top of the kernel process stack.
- * - partial stack frame: partially saved registers up to R11.
- * - full stack frame: Like partial stack frame, but all register saved.
*
* Some macro usage:
* - CFI macros are used to generate dwarf2 unwind information for better
* backtraces. They don't change any code.
- * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
- * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
- * There are unfortunately lots of special cases where some registers
- * not touched. The macro is a big mess that should be cleaned up.
- * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
- * Gives a full stack frame.
* - ENTRY/END Define functions in the symbol table.
- * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
- * frame that is otherwise undefined after a SYSCALL
* - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
* - idtentry - Define exception entry points.
*/
@@ -70,10 +57,6 @@
.section .entry.text, "ax"
-#ifndef CONFIG_PREEMPT
-#define retint_kernel retint_restore_args
-#endif
-
#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret64)
swapgs
@@ -82,9 +65,9 @@ ENDPROC(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT */
-.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
+.macro TRACE_IRQS_IRETQ
#ifdef CONFIG_TRACE_IRQFLAGS
- bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
+ bt $9,EFLAGS(%rsp) /* interrupts off? */
jnc 1f
TRACE_IRQS_ON
1:
@@ -116,8 +99,8 @@ ENDPROC(native_usergs_sysret64)
call debug_stack_reset
.endm
-.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET
- bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
+.macro TRACE_IRQS_IRETQ_DEBUG
+ bt $9,EFLAGS(%rsp) /* interrupts off? */
jnc 1f
TRACE_IRQS_ON_DEBUG
1:
@@ -130,34 +113,7 @@ ENDPROC(native_usergs_sysret64)
#endif
/*
- * C code is not supposed to know about undefined top of stack. Every time
- * a C function with an pt_regs argument is called from the SYSCALL based
- * fast path FIXUP_TOP_OF_STACK is needed.
- * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
- * manipulation.
- */
-
- /* %rsp:at FRAMEEND */
- .macro FIXUP_TOP_OF_STACK tmp offset=0
- movq PER_CPU_VAR(old_rsp),\tmp
- movq \tmp,RSP+\offset(%rsp)
- movq $__USER_DS,SS+\offset(%rsp)
- movq $__USER_CS,CS+\offset(%rsp)
- movq RIP+\offset(%rsp),\tmp /* get rip */
- movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */
- movq R11+\offset(%rsp),\tmp /* get eflags */
- movq \tmp,EFLAGS+\offset(%rsp)
- .endm
-
- .macro RESTORE_TOP_OF_STACK tmp offset=0
- movq RSP+\offset(%rsp),\tmp
- movq \tmp,PER_CPU_VAR(old_rsp)
- movq EFLAGS+\offset(%rsp),\tmp
- movq \tmp,R11+\offset(%rsp)
- .endm
-
-/*
- * initial frame state for interrupts (and exceptions without error code)
+ * empty frame
*/
.macro EMPTY_FRAME start=1 offset=0
.if \start
@@ -173,12 +129,12 @@ ENDPROC(native_usergs_sysret64)
* initial frame state for interrupts (and exceptions without error code)
*/
.macro INTR_FRAME start=1 offset=0
- EMPTY_FRAME \start, SS+8+\offset-RIP
- /*CFI_REL_OFFSET ss, SS+\offset-RIP*/
- CFI_REL_OFFSET rsp, RSP+\offset-RIP
- /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
- /*CFI_REL_OFFSET cs, CS+\offset-RIP*/
- CFI_REL_OFFSET rip, RIP+\offset-RIP
+ EMPTY_FRAME \start, 5*8+\offset
+ /*CFI_REL_OFFSET ss, 4*8+\offset*/
+ CFI_REL_OFFSET rsp, 3*8+\offset
+ /*CFI_REL_OFFSET rflags, 2*8+\offset*/
+ /*CFI_REL_OFFSET cs, 1*8+\offset*/
+ CFI_REL_OFFSET rip, 0*8+\offset
.endm
/*
@@ -186,30 +142,23 @@ ENDPROC(native_usergs_sysret64)
* with vector already pushed)
*/
.macro XCPT_FRAME start=1 offset=0
- INTR_FRAME \start, RIP+\offset-ORIG_RAX
- .endm
-
-/*
- * frame that enables calling into C.
- */
- .macro PARTIAL_FRAME start=1 offset=0
- XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
- CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
- CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
- CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
- CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
- CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
- CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
- CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
- CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
- CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
+ INTR_FRAME \start, 1*8+\offset
.endm
/*
* frame that enables passing a complete pt_regs to a C function.
*/
.macro DEFAULT_FRAME start=1 offset=0
- PARTIAL_FRAME \start, R11+\offset-R15
+ XCPT_FRAME \start, ORIG_RAX+\offset
+ CFI_REL_OFFSET rdi, RDI+\offset
+ CFI_REL_OFFSET rsi, RSI+\offset
+ CFI_REL_OFFSET rdx, RDX+\offset
+ CFI_REL_OFFSET rcx, RCX+\offset
+ CFI_REL_OFFSET rax, RAX+\offset
+ CFI_REL_OFFSET r8, R8+\offset
+ CFI_REL_OFFSET r9, R9+\offset
+ CFI_REL_OFFSET r10, R10+\offset
+ CFI_REL_OFFSET r11, R11+\offset
CFI_REL_OFFSET rbx, RBX+\offset
CFI_REL_OFFSET rbp, RBP+\offset
CFI_REL_OFFSET r12, R12+\offset
@@ -218,105 +167,30 @@ ENDPROC(native_usergs_sysret64)
CFI_REL_OFFSET r15, R15+\offset
.endm
-ENTRY(save_paranoid)
- XCPT_FRAME 1 RDI+8
- cld
- movq %rdi, RDI+8(%rsp)
- movq %rsi, RSI+8(%rsp)
- movq_cfi rdx, RDX+8
- movq_cfi rcx, RCX+8
- movq_cfi rax, RAX+8
- movq %r8, R8+8(%rsp)
- movq %r9, R9+8(%rsp)
- movq %r10, R10+8(%rsp)
- movq %r11, R11+8(%rsp)
- movq_cfi rbx, RBX+8
- movq %rbp, RBP+8(%rsp)
- movq %r12, R12+8(%rsp)
- movq %r13, R13+8(%rsp)
- movq %r14, R14+8(%rsp)
- movq %r15, R15+8(%rsp)
- movl $1,%ebx
- movl $MSR_GS_BASE,%ecx
- rdmsr
- testl %edx,%edx
- js 1f /* negative -> in kernel */
- SWAPGS
- xorl %ebx,%ebx
-1: ret
- CFI_ENDPROC
-END(save_paranoid)
-
/*
- * A newly forked process directly context switches into this address.
+ * 64bit SYSCALL instruction entry. Up to 6 arguments in registers.
*
- * rdi: prev task we switched from
- */
-ENTRY(ret_from_fork)
- DEFAULT_FRAME
-
- LOCK ; btr $TIF_FORK,TI_flags(%r8)
-
- pushq_cfi $0x0002
- popfq_cfi # reset kernel eflags
-
- call schedule_tail # rdi: 'prev' task parameter
-
- GET_THREAD_INFO(%rcx)
-
- RESTORE_REST
-
- testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
- jz 1f
-
- /*
- * By the time we get here, we have no idea whether our pt_regs,
- * ti flags, and ti status came from the 64-bit SYSCALL fast path,
- * the slow path, or one of the ia32entry paths.
- * Use int_ret_from_sys_call to return, since it can safely handle
- * all of the above.
- */
- jmp int_ret_from_sys_call
-
-1:
- subq $REST_SKIP, %rsp # leave space for volatiles
- CFI_ADJUST_CFA_OFFSET REST_SKIP
- movq %rbp, %rdi
- call *%rbx
- movl $0, RAX(%rsp)
- RESTORE_REST
- jmp int_ret_from_sys_call
- CFI_ENDPROC
-END(ret_from_fork)
-
-/*
- * System call entry. Up to 6 arguments in registers are supported.
+ * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * then loads new ss, cs, and rip from previously programmed MSRs.
+ * rflags gets masked by a value from another MSR (so CLD and CLAC
+ * are not needed). SYSCALL does not save anything on the stack
+ * and does not change rsp.
*
- * SYSCALL does not save anything on the stack and does not change the
- * stack pointer. However, it does mask the flags register for us, so
- * CLD and CLAC are not needed.
- */
-
-/*
- * Register setup:
+ * Registers on entry:
* rax system call number
+ * rcx return address
+ * r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
* rdi arg0
- * rcx return address for syscall/sysret, C arg3
* rsi arg1
* rdx arg2
- * r10 arg3 (--> moved to rcx for C)
+ * r10 arg3 (needs to be moved to rcx to conform to C ABI)
* r8 arg4
* r9 arg5
- * r11 eflags for syscall/sysret, temporary for C
- * r12-r15,rbp,rbx saved by C code, not touched.
+ * (note: r12-r15,rbp,rbx are callee-preserved in C ABI)
*
- * Interrupts are off on entry.
* Only called from user space.
*
- * XXX if we had a free scratch register we could save the RSP into the stack frame
- * and report it properly in ps. Unfortunately we haven't.
- *
- * When user can change the frames always force IRET. That is because
+ * When user can change pt_regs->foo always force IRET. That is because
* it deals with uncanonical addresses better. SYSRET has trouble
* with them due to bugs in both AMD and Intel CPUs.
*/
@@ -324,9 +198,15 @@ END(ret_from_fork)
ENTRY(system_call)
CFI_STARTPROC simple
CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET
+ CFI_DEF_CFA rsp,0
CFI_REGISTER rip,rcx
/*CFI_REGISTER rflags,r11*/
+
+ /*
+ * Interrupts are off on entry.
+ * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+ * it is too small to ever cause noticeable irq latency.
+ */
SWAPGS_UNSAFE_STACK
/*
* A hypervisor implementation might want to use a label
@@ -335,18 +215,38 @@ ENTRY(system_call)
*/
GLOBAL(system_call_after_swapgs)
- movq %rsp,PER_CPU_VAR(old_rsp)
+ movq %rsp,PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(kernel_stack),%rsp
+
+ /* Construct struct pt_regs on stack */
+ pushq_cfi $__USER_DS /* pt_regs->ss */
+ pushq_cfi PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
/*
- * No need to follow this irqs off/on section - it's straight
- * and short:
+ * Re-enable interrupts.
+ * We use 'rsp_scratch' as a scratch space, hence irq-off block above
+ * must execute atomically in the face of possible interrupt-driven
+ * task preemption. We must enable interrupts only after we're done
+ * with using rsp_scratch:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 8, 0, rax_enosys=1
- movq_cfi rax,(ORIG_RAX-ARGOFFSET)
- movq %rcx,RIP-ARGOFFSET(%rsp)
- CFI_REL_OFFSET rip,RIP-ARGOFFSET
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ pushq_cfi %r11 /* pt_regs->flags */
+ pushq_cfi $__USER_CS /* pt_regs->cs */
+ pushq_cfi %rcx /* pt_regs->ip */
+ CFI_REL_OFFSET rip,0
+ pushq_cfi_reg rax /* pt_regs->orig_ax */
+ pushq_cfi_reg rdi /* pt_regs->di */
+ pushq_cfi_reg rsi /* pt_regs->si */
+ pushq_cfi_reg rdx /* pt_regs->dx */
+ pushq_cfi_reg rcx /* pt_regs->cx */
+ pushq_cfi $-ENOSYS /* pt_regs->ax */
+ pushq_cfi_reg r8 /* pt_regs->r8 */
+ pushq_cfi_reg r9 /* pt_regs->r9 */
+ pushq_cfi_reg r10 /* pt_regs->r10 */
+ pushq_cfi_reg r11 /* pt_regs->r11 */
+ sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */
+ CFI_ADJUST_CFA_OFFSET 6*8
+
+ testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz tracesys
system_call_fastpath:
#if __SYSCALL_MASK == ~0
@@ -355,18 +255,21 @@ system_call_fastpath:
andl $__SYSCALL_MASK,%eax
cmpl $__NR_syscall_max,%eax
#endif
- ja ret_from_sys_call /* and return regs->ax */
+ ja 1f /* return -ENOSYS (already in pt_regs->ax) */
movq %r10,%rcx
- call *sys_call_table(,%rax,8) # XXX: rip relative
- movq %rax,RAX-ARGOFFSET(%rsp)
+ call *sys_call_table(,%rax,8)
+ movq %rax,RAX(%rsp)
+1:
/*
- * Syscall return path ending with SYSRET (fast path)
- * Has incomplete stack frame and undefined top of stack.
+ * Syscall return path ending with SYSRET (fast path).
+ * Has incompletely filled pt_regs.
*/
-ret_from_sys_call:
LOCKDEP_SYS_EXIT
+ /*
+ * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+ * it is too small to ever cause noticeable irq latency.
+ */
DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
/*
* We must check ti flags with interrupts (or at least preemption)
@@ -376,72 +279,73 @@ ret_from_sys_call:
* flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
* very bad.
*/
- testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
- jnz int_ret_from_sys_call_fixup /* Go the the slow path */
+ testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */
CFI_REMEMBER_STATE
- /*
- * sysretq will re-enable interrupts:
- */
- TRACE_IRQS_ON
- movq RIP-ARGOFFSET(%rsp),%rcx
+
+ RESTORE_C_REGS_EXCEPT_RCX_R11
+ movq RIP(%rsp),%rcx
CFI_REGISTER rip,rcx
- RESTORE_ARGS 1,-ARG_SKIP,0
+ movq EFLAGS(%rsp),%r11
/*CFI_REGISTER rflags,r11*/
- movq PER_CPU_VAR(old_rsp), %rsp
+ movq RSP(%rsp),%rsp
+ /*
+ * 64bit SYSRET restores rip from rcx,
+ * rflags from r11 (but RF and VM bits are forced to 0),
+ * cs and ss are loaded from MSRs.
+ * Restoration of rflags re-enables interrupts.
+ */
USERGS_SYSRET64
CFI_RESTORE_STATE
-int_ret_from_sys_call_fixup:
- FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
- jmp int_ret_from_sys_call_irqs_off
-
- /* Do syscall tracing */
+ /* Do syscall entry tracing */
tracesys:
- leaq -REST_SKIP(%rsp), %rdi
- movq $AUDIT_ARCH_X86_64, %rsi
+ movq %rsp, %rdi
+ movl $AUDIT_ARCH_X86_64, %esi
call syscall_trace_enter_phase1
test %rax, %rax
jnz tracesys_phase2 /* if needed, run the slow path */
- LOAD_ARGS 0 /* else restore clobbered regs */
+ RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */
+ movq ORIG_RAX(%rsp), %rax
jmp system_call_fastpath /* and return to the fast path */
tracesys_phase2:
- SAVE_REST
- FIXUP_TOP_OF_STACK %rdi
+ SAVE_EXTRA_REGS
movq %rsp, %rdi
- movq $AUDIT_ARCH_X86_64, %rsi
+ movl $AUDIT_ARCH_X86_64, %esi
movq %rax,%rdx
call syscall_trace_enter_phase2
/*
- * Reload arg registers from stack in case ptrace changed them.
+ * Reload registers from stack in case ptrace changed them.
* We don't reload %rax because syscall_trace_entry_phase2() returned
* the value it wants us to use in the table lookup.
*/
- LOAD_ARGS ARGOFFSET, 1
- RESTORE_REST
+ RESTORE_C_REGS_EXCEPT_RAX
+ RESTORE_EXTRA_REGS
#if __SYSCALL_MASK == ~0
cmpq $__NR_syscall_max,%rax
#else
andl $__SYSCALL_MASK,%eax
cmpl $__NR_syscall_max,%eax
#endif
- ja int_ret_from_sys_call /* RAX(%rsp) is already set */
+ ja 1f /* return -ENOSYS (already in pt_regs->ax) */
movq %r10,%rcx /* fixup for C */
call *sys_call_table(,%rax,8)
- movq %rax,RAX-ARGOFFSET(%rsp)
- /* Use IRET because user could have changed frame */
+ movq %rax,RAX(%rsp)
+1:
+ /* Use IRET because user could have changed pt_regs->foo */
/*
* Syscall return path ending with IRET.
- * Has correct top of stack, but partial stack frame.
+ * Has correct iret frame.
*/
GLOBAL(int_ret_from_sys_call)
DISABLE_INTERRUPTS(CLBR_NONE)
+int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */
TRACE_IRQS_OFF
-int_ret_from_sys_call_irqs_off:
movl $_TIF_ALLWORK_MASK,%edi
/* edi: mask to check */
GLOBAL(int_with_check)
@@ -450,8 +354,8 @@ GLOBAL(int_with_check)
movl TI_flags(%rcx),%edx
andl %edi,%edx
jnz int_careful
- andl $~TS_COMPAT,TI_status(%rcx)
- jmp retint_swapgs
+ andl $~TS_COMPAT,TI_status(%rcx)
+ jmp syscall_return
/* Either reschedule or signal or syscall exit tracking needed. */
/* First do a reschedule test. */
@@ -468,12 +372,11 @@ int_careful:
TRACE_IRQS_OFF
jmp int_with_check
- /* handle signals and tracing -- both require a full stack frame */
+ /* handle signals and tracing -- both require a full pt_regs */
int_very_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
-int_check_syscall_exit_work:
- SAVE_REST
+ SAVE_EXTRA_REGS
/* Check for syscall exit trace */
testl $_TIF_WORK_SYSCALL_EXIT,%edx
jz int_signal
@@ -492,86 +395,192 @@ int_signal:
call do_notify_resume
1: movl $_TIF_WORK_MASK,%edi
int_restore_rest:
- RESTORE_REST
+ RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check
+
+syscall_return:
+ /* The IRETQ could re-enable interrupts: */
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_IRETQ
+
+ /*
+ * Try to use SYSRET instead of IRET if we're returning to
+ * a completely clean 64-bit userspace context.
+ */
+ movq RCX(%rsp),%rcx
+ cmpq %rcx,RIP(%rsp) /* RCX == RIP */
+ jne opportunistic_sysret_failed
+
+ /*
+ * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP. It's not worth
+ * testing for canonicalness exactly -- this check detects any
+ * of the 17 high bits set, which is true for non-canonical
+ * or kernel addresses. (This will pessimize vsyscall=native.
+ * Big deal.)
+ *
+ * If virtual addresses ever become wider, this will need
+ * to be updated to remain correct on both old and new CPUs.
+ */
+ .ifne __VIRTUAL_MASK_SHIFT - 47
+ .error "virtual address width changed -- SYSRET checks need update"
+ .endif
+ shr $__VIRTUAL_MASK_SHIFT, %rcx
+ jnz opportunistic_sysret_failed
+
+ cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */
+ jne opportunistic_sysret_failed
+
+ movq R11(%rsp),%r11
+ cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */
+ jne opportunistic_sysret_failed
+
+ /*
+ * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
+ * restoring TF results in a trap from userspace immediately after
+ * SYSRET. This would cause an infinite loop whenever #DB happens
+ * with register state that satisfies the opportunistic SYSRET
+ * conditions. For example, single-stepping this user code:
+ *
+ * movq $stuck_here,%rcx
+ * pushfq
+ * popq %r11
+ * stuck_here:
+ *
+ * would never get past 'stuck_here'.
+ */
+ testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
+ jnz opportunistic_sysret_failed
+
+ /* nothing to check for RSP */
+
+ cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */
+ jne opportunistic_sysret_failed
+
+ /*
+ * We win! This label is here just for ease of understanding
+ * perf profiles. Nothing jumps here.
+ */
+syscall_return_via_sysret:
+ CFI_REMEMBER_STATE
+ /* r11 is already restored (see code above) */
+ RESTORE_C_REGS_EXCEPT_R11
+ movq RSP(%rsp),%rsp
+ USERGS_SYSRET64
+ CFI_RESTORE_STATE
+
+opportunistic_sysret_failed:
+ SWAPGS
+ jmp restore_c_regs_and_iret
CFI_ENDPROC
END(system_call)
+
.macro FORK_LIKE func
ENTRY(stub_\func)
CFI_STARTPROC
- popq %r11 /* save return address */
- PARTIAL_FRAME 0
- SAVE_REST
- pushq %r11 /* put it back on stack */
- FIXUP_TOP_OF_STACK %r11, 8
- DEFAULT_FRAME 0 8 /* offset 8: return address */
- call sys_\func
- RESTORE_TOP_OF_STACK %r11, 8
- ret $REST_SKIP /* pop extended registers */
+ DEFAULT_FRAME 0, 8 /* offset 8: return address */
+ SAVE_EXTRA_REGS 8
+ jmp sys_\func
CFI_ENDPROC
END(stub_\func)
.endm
- .macro FIXED_FRAME label,func
-ENTRY(\label)
- CFI_STARTPROC
- PARTIAL_FRAME 0 8 /* offset 8: return address */
- FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
- call \func
- RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
- ret
- CFI_ENDPROC
-END(\label)
- .endm
-
FORK_LIKE clone
FORK_LIKE fork
FORK_LIKE vfork
- FIXED_FRAME stub_iopl, sys_iopl
ENTRY(stub_execve)
CFI_STARTPROC
- addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
- FIXUP_TOP_OF_STACK %r11
- call sys_execve
- movq %rax,RAX(%rsp)
- RESTORE_REST
- jmp int_ret_from_sys_call
+ DEFAULT_FRAME 0, 8
+ call sys_execve
+return_from_execve:
+ testl %eax, %eax
+ jz 1f
+ /* exec failed, can use fast SYSRET code path in this case */
+ ret
+1:
+ /* must use IRET code path (pt_regs->cs may have changed) */
+ addq $8, %rsp
+ CFI_ADJUST_CFA_OFFSET -8
+ ZERO_EXTRA_REGS
+ movq %rax,RAX(%rsp)
+ jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_execve)
-
-ENTRY(stub_execveat)
+/*
+ * Remaining execve stubs are only 7 bytes long.
+ * ENTRY() often aligns to 16 bytes, which in this case has no benefits.
+ */
+ .align 8
+GLOBAL(stub_execveat)
CFI_STARTPROC
- addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
- FIXUP_TOP_OF_STACK %r11
- call sys_execveat
- RESTORE_TOP_OF_STACK %r11
- movq %rax,RAX(%rsp)
- RESTORE_REST
- jmp int_ret_from_sys_call
+ DEFAULT_FRAME 0, 8
+ call sys_execveat
+ jmp return_from_execve
CFI_ENDPROC
END(stub_execveat)
+#ifdef CONFIG_X86_X32_ABI
+ .align 8
+GLOBAL(stub_x32_execve)
+ CFI_STARTPROC
+ DEFAULT_FRAME 0, 8
+ call compat_sys_execve
+ jmp return_from_execve
+ CFI_ENDPROC
+END(stub_x32_execve)
+ .align 8
+GLOBAL(stub_x32_execveat)
+ CFI_STARTPROC
+ DEFAULT_FRAME 0, 8
+ call compat_sys_execveat
+ jmp return_from_execve
+ CFI_ENDPROC
+END(stub_x32_execveat)
+#endif
+
+#ifdef CONFIG_IA32_EMULATION
+ .align 8
+GLOBAL(stub32_execve)
+ CFI_STARTPROC
+ call compat_sys_execve
+ jmp return_from_execve
+ CFI_ENDPROC
+END(stub32_execve)
+ .align 8
+GLOBAL(stub32_execveat)
+ CFI_STARTPROC
+ call compat_sys_execveat
+ jmp return_from_execve
+ CFI_ENDPROC
+END(stub32_execveat)
+#endif
+
/*
* sigreturn is special because it needs to restore all registers on return.
* This cannot be done with SYSRET, so use the IRET return path instead.
*/
ENTRY(stub_rt_sigreturn)
CFI_STARTPROC
- addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
- FIXUP_TOP_OF_STACK %r11
+ DEFAULT_FRAME 0, 8
+ /*
+ * SAVE_EXTRA_REGS result is not normally needed:
+ * sigreturn overwrites all pt_regs->GPREGS.
+ * But sigreturn can fail (!), and there is no easy way to detect that.
+ * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error,
+ * we SAVE_EXTRA_REGS here.
+ */
+ SAVE_EXTRA_REGS 8
call sys_rt_sigreturn
- movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
- RESTORE_REST
+return_from_stub:
+ addq $8, %rsp
+ CFI_ADJUST_CFA_OFFSET -8
+ RESTORE_EXTRA_REGS
+ movq %rax,RAX(%rsp)
jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_rt_sigreturn)
@@ -579,86 +588,70 @@ END(stub_rt_sigreturn)
#ifdef CONFIG_X86_X32_ABI
ENTRY(stub_x32_rt_sigreturn)
CFI_STARTPROC
- addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
- FIXUP_TOP_OF_STACK %r11
+ DEFAULT_FRAME 0, 8
+ SAVE_EXTRA_REGS 8
call sys32_x32_rt_sigreturn
- movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
- RESTORE_REST
- jmp int_ret_from_sys_call
+ jmp return_from_stub
CFI_ENDPROC
END(stub_x32_rt_sigreturn)
+#endif
-ENTRY(stub_x32_execve)
- CFI_STARTPROC
- addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
- FIXUP_TOP_OF_STACK %r11
- call compat_sys_execve
- RESTORE_TOP_OF_STACK %r11
- movq %rax,RAX(%rsp)
- RESTORE_REST
- jmp int_ret_from_sys_call
- CFI_ENDPROC
-END(stub_x32_execve)
+/*
+ * A newly forked process directly context switches into this address.
+ *
+ * rdi: prev task we switched from
+ */
+ENTRY(ret_from_fork)
+ DEFAULT_FRAME
-ENTRY(stub_x32_execveat)
- CFI_STARTPROC
- addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
- FIXUP_TOP_OF_STACK %r11
- call compat_sys_execveat
- RESTORE_TOP_OF_STACK %r11
- movq %rax,RAX(%rsp)
- RESTORE_REST
+ LOCK ; btr $TIF_FORK,TI_flags(%r8)
+
+ pushq_cfi $0x0002
+ popfq_cfi # reset kernel eflags
+
+ call schedule_tail # rdi: 'prev' task parameter
+
+ RESTORE_EXTRA_REGS
+
+ testl $3,CS(%rsp) # from kernel_thread?
+
+ /*
+ * By the time we get here, we have no idea whether our pt_regs,
+ * ti flags, and ti status came from the 64-bit SYSCALL fast path,
+ * the slow path, or one of the ia32entry paths.
+ * Use IRET code path to return, since it can safely handle
+ * all of the above.
+ */
+ jnz int_ret_from_sys_call
+
+ /* We came from kernel_thread */
+ /* nb: we depend on RESTORE_EXTRA_REGS above */
+ movq %rbp, %rdi
+ call *%rbx
+ movl $0, RAX(%rsp)
+ RESTORE_EXTRA_REGS
jmp int_ret_from_sys_call
CFI_ENDPROC
-END(stub_x32_execveat)
-
-#endif
+END(ret_from_fork)
/*
- * Build the entry stubs and pointer table with some assembler magic.
- * We pack 7 stubs into a single 32-byte chunk, which will fit in a
- * single cache line on all modern x86 implementations.
+ * Build the entry stubs with some assembler magic.
+ * We pack 1 stub into every 8-byte block.
*/
- .section .init.rodata,"a"
-ENTRY(interrupt)
- .section .entry.text
- .p2align 5
- .p2align CONFIG_X86_L1_CACHE_SHIFT
+ .align 8
ENTRY(irq_entries_start)
INTR_FRAME
-vector=FIRST_EXTERNAL_VECTOR
-.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7
- .balign 32
- .rept 7
- .if vector < FIRST_SYSTEM_VECTOR
- .if vector <> FIRST_EXTERNAL_VECTOR
+ vector=FIRST_EXTERNAL_VECTOR
+ .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+ pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */
+ vector=vector+1
+ jmp common_interrupt
CFI_ADJUST_CFA_OFFSET -8
- .endif
-1: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */
- .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
- jmp 2f
- .endif
- .previous
- .quad 1b
- .section .entry.text
-vector=vector+1
- .endif
- .endr
-2: jmp common_interrupt
-.endr
+ .align 8
+ .endr
CFI_ENDPROC
END(irq_entries_start)
-.previous
-END(interrupt)
-.previous
-
/*
* Interrupt entry/exit.
*
@@ -669,47 +662,45 @@ END(interrupt)
/* 0(%rsp): ~(interrupt number) */
.macro interrupt func
- /* reserve pt_regs for scratch regs and rbp */
- subq $ORIG_RAX-RBP, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
cld
- /* start from rbp in pt_regs and jump over */
- movq_cfi rdi, (RDI-RBP)
- movq_cfi rsi, (RSI-RBP)
- movq_cfi rdx, (RDX-RBP)
- movq_cfi rcx, (RCX-RBP)
- movq_cfi rax, (RAX-RBP)
- movq_cfi r8, (R8-RBP)
- movq_cfi r9, (R9-RBP)
- movq_cfi r10, (R10-RBP)
- movq_cfi r11, (R11-RBP)
-
- /* Save rbp so that we can unwind from get_irq_regs() */
- movq_cfi rbp, 0
-
- /* Save previous stack value */
- movq %rsp, %rsi
+ /*
+ * Since nothing in interrupt handling code touches r12...r15 members
+ * of "struct pt_regs", and since interrupts can nest, we can save
+ * four stack slots and simultaneously provide
+ * an unwind-friendly stack layout by saving "truncated" pt_regs
+ * exactly up to rbp slot, without these members.
+ */
+ ALLOC_PT_GPREGS_ON_STACK -RBP
+ SAVE_C_REGS -RBP
+ /* this goes to 0(%rsp) for unwinder, not for saving the value: */
+ SAVE_EXTRA_REGS_RBP -RBP
- leaq -RBP(%rsp),%rdi /* arg1 for handler */
- testl $3, CS-RBP(%rsi)
+ leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */
+
+ testl $3, CS-RBP(%rsp)
je 1f
SWAPGS
+1:
/*
+ * Save previous stack pointer, optionally switch to interrupt stack.
* irq_count is used to check if a CPU is already on an interrupt stack
* or not. While this is essentially redundant with preempt_count it is
* a little cheaper to use a separate counter in the PDA (short of
* moving irq_enter into assembly, which would be too much work)
*/
-1: incl PER_CPU_VAR(irq_count)
+ movq %rsp, %rsi
+ incl PER_CPU_VAR(irq_count)
cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
CFI_DEF_CFA_REGISTER rsi
-
- /* Store previous stack value */
pushq %rsi
+ /*
+ * For debugger:
+ * "CFA (Current Frame Address) is the value on stack + offset"
+ */
CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
- 0x77 /* DW_OP_breg7 */, 0, \
+ 0x77 /* DW_OP_breg7 (rsp) */, 0, \
0x06 /* DW_OP_deref */, \
- 0x08 /* DW_OP_const1u */, SS+8-RBP, \
+ 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \
0x22 /* DW_OP_plus */
/* We entered an interrupt context - irqs are off: */
TRACE_IRQS_OFF
@@ -727,7 +718,7 @@ common_interrupt:
ASM_CLAC
addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
interrupt do_IRQ
- /* 0(%rsp): old_rsp-ARGOFFSET */
+ /* 0(%rsp): old RSP */
ret_from_intr:
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
@@ -735,19 +726,18 @@ ret_from_intr:
/* Restore saved previous stack */
popq %rsi
- CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */
- leaq ARGOFFSET-RBP(%rsi), %rsp
+ CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */
+ /* return code expects complete pt_regs - adjust rsp accordingly: */
+ leaq -RBP(%rsi),%rsp
CFI_DEF_CFA_REGISTER rsp
- CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET
+ CFI_ADJUST_CFA_OFFSET RBP
-exit_intr:
- GET_THREAD_INFO(%rcx)
- testl $3,CS-ARGOFFSET(%rsp)
+ testl $3,CS(%rsp)
je retint_kernel
-
/* Interrupt came from user space */
+
+ GET_THREAD_INFO(%rcx)
/*
- * Has a correct top of stack, but a partial stack frame
* %rcx: thread info. Interrupts off.
*/
retint_with_reschedule:
@@ -766,84 +756,34 @@ retint_swapgs: /* return to user-space */
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_IRETQ
- /*
- * Try to use SYSRET instead of IRET if we're returning to
- * a completely clean 64-bit userspace context.
- */
- movq (RCX-R11)(%rsp), %rcx
- cmpq %rcx,(RIP-R11)(%rsp) /* RCX == RIP */
- jne opportunistic_sysret_failed
-
- /*
- * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
- * in kernel space. This essentially lets the user take over
- * the kernel, since userspace controls RSP. It's not worth
- * testing for canonicalness exactly -- this check detects any
- * of the 17 high bits set, which is true for non-canonical
- * or kernel addresses. (This will pessimize vsyscall=native.
- * Big deal.)
- *
- * If virtual addresses ever become wider, this will need
- * to be updated to remain correct on both old and new CPUs.
- */
- .ifne __VIRTUAL_MASK_SHIFT - 47
- .error "virtual address width changed -- sysret checks need update"
- .endif
- shr $__VIRTUAL_MASK_SHIFT, %rcx
- jnz opportunistic_sysret_failed
-
- cmpq $__USER_CS,(CS-R11)(%rsp) /* CS must match SYSRET */
- jne opportunistic_sysret_failed
-
- movq (R11-ARGOFFSET)(%rsp), %r11
- cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp) /* R11 == RFLAGS */
- jne opportunistic_sysret_failed
-
- /*
- * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
- * restoring TF results in a trap from userspace immediately after
- * SYSRET. This would cause an infinite loop whenever #DB happens
- * with register state that satisfies the opportunistic SYSRET
- * conditions. For example, single-stepping this user code:
- *
- * movq $stuck_here,%rcx
- * pushfq
- * popq %r11
- * stuck_here:
- *
- * would never get past 'stuck_here'.
- */
- testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
- jnz opportunistic_sysret_failed
-
- /* nothing to check for RSP */
-
- cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp) /* SS must match SYSRET */
- jne opportunistic_sysret_failed
-
- /*
- * We win! This label is here just for ease of understanding
- * perf profiles. Nothing jumps here.
- */
-irq_return_via_sysret:
- CFI_REMEMBER_STATE
- RESTORE_ARGS 1,8,1
- movq (RSP-RIP)(%rsp),%rsp
- USERGS_SYSRET64
- CFI_RESTORE_STATE
-
-opportunistic_sysret_failed:
SWAPGS
- jmp restore_args
+ jmp restore_c_regs_and_iret
-retint_restore_args: /* return to kernel space */
- DISABLE_INTERRUPTS(CLBR_ANY)
+/* Returning to kernel space */
+retint_kernel:
+#ifdef CONFIG_PREEMPT
+ /* Interrupts are off */
+ /* Check if we need preemption */
+ bt $9,EFLAGS(%rsp) /* interrupts were off? */
+ jnc 1f
+0: cmpl $0,PER_CPU_VAR(__preempt_count)
+ jnz 1f
+ call preempt_schedule_irq
+ jmp 0b
+1:
+#endif
/*
* The iretq could re-enable interrupts:
*/
TRACE_IRQS_IRETQ
-restore_args:
- RESTORE_ARGS 1,8,1
+
+/*
+ * At this label, code paths which return to kernel and to user,
+ * which come from interrupts/exception and from syscalls, merge.
+ */
+restore_c_regs_and_iret:
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
irq_return:
INTERRUPT_RETURN
@@ -914,28 +854,17 @@ retint_signal:
jz retint_swapgs
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_REST
+ SAVE_EXTRA_REGS
movq $-1,ORIG_RAX(%rsp)
xorl %esi,%esi # oldset
movq %rsp,%rdi # &pt_regs
call do_notify_resume
- RESTORE_REST
+ RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
GET_THREAD_INFO(%rcx)
jmp retint_with_reschedule
-#ifdef CONFIG_PREEMPT
- /* Returning to kernel space. Check if we need preemption */
- /* rcx: threadinfo. interrupts off. */
-ENTRY(retint_kernel)
- cmpl $0,PER_CPU_VAR(__preempt_count)
- jnz retint_restore_args
- bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
- jnc retint_restore_args
- call preempt_schedule_irq
- jmp exit_intr
-#endif
CFI_ENDPROC
END(common_interrupt)
@@ -1024,7 +953,7 @@ apicinterrupt IRQ_WORK_VECTOR \
/*
* Exception entry points.
*/
-#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
+#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
@@ -1046,8 +975,7 @@ ENTRY(\sym)
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
.endif
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+ ALLOC_PT_GPREGS_ON_STACK
.if \paranoid
.if \paranoid == 1
@@ -1055,10 +983,11 @@ ENTRY(\sym)
testl $3, CS(%rsp) /* If coming from userspace, switch */
jnz 1f /* stacks. */
.endif
- call save_paranoid
+ call paranoid_entry
.else
call error_entry
.endif
+ /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
DEFAULT_FRAME 0
@@ -1080,19 +1009,20 @@ ENTRY(\sym)
.endif
.if \shift_ist != -1
- subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+ subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
.endif
call \do_sym
.if \shift_ist != -1
- addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+ addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
.endif
+ /* these procedures expect "no swapgs" flag in ebx */
.if \paranoid
- jmp paranoid_exit /* %ebx: no swapgs flag */
+ jmp paranoid_exit
.else
- jmp error_exit /* %ebx: no swapgs flag */
+ jmp error_exit
.endif
.if \paranoid == 1
@@ -1296,7 +1226,9 @@ ENTRY(xen_failsafe_callback)
addq $0x30,%rsp
CFI_ADJUST_CFA_OFFSET -0x30
pushq_cfi $-1 /* orig_ax = -1 => not a system call */
- SAVE_ALL
+ ALLOC_PT_GPREGS_ON_STACK
+ SAVE_C_REGS
+ SAVE_EXTRA_REGS
jmp error_exit
CFI_ENDPROC
END(xen_failsafe_callback)
@@ -1328,59 +1260,66 @@ idtentry async_page_fault do_async_page_fault has_error_code=1
idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
#endif
- /*
- * "Paranoid" exit path from exception stack. This is invoked
- * only on return from non-NMI IST interrupts that came
- * from kernel space.
- *
- * We may be returning to very strange contexts (e.g. very early
- * in syscall entry), so checking for preemption here would
- * be complicated. Fortunately, we there's no good reason
- * to try to handle preemption here.
- */
+/*
+ * Save all registers in pt_regs, and switch gs if needed.
+ * Use slow, but surefire "are we in kernel?" check.
+ * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ */
+ENTRY(paranoid_entry)
+ XCPT_FRAME 1 15*8
+ cld
+ SAVE_C_REGS 8
+ SAVE_EXTRA_REGS 8
+ movl $1,%ebx
+ movl $MSR_GS_BASE,%ecx
+ rdmsr
+ testl %edx,%edx
+ js 1f /* negative -> in kernel */
+ SWAPGS
+ xorl %ebx,%ebx
+1: ret
+ CFI_ENDPROC
+END(paranoid_entry)
- /* ebx: no swapgs flag */
+/*
+ * "Paranoid" exit path from exception stack. This is invoked
+ * only on return from non-NMI IST interrupts that came
+ * from kernel space.
+ *
+ * We may be returning to very strange contexts (e.g. very early
+ * in syscall entry), so checking for preemption here would
+ * be complicated. Fortunately, we there's no good reason
+ * to try to handle preemption here.
+ */
+/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
ENTRY(paranoid_exit)
DEFAULT_FRAME
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF_DEBUG
testl %ebx,%ebx /* swapgs needed? */
- jnz paranoid_restore
- TRACE_IRQS_IRETQ 0
+ jnz paranoid_exit_no_swapgs
+ TRACE_IRQS_IRETQ
SWAPGS_UNSAFE_STACK
- RESTORE_ALL 8
- INTERRUPT_RETURN
-paranoid_restore:
- TRACE_IRQS_IRETQ_DEBUG 0
- RESTORE_ALL 8
+ jmp paranoid_exit_restore
+paranoid_exit_no_swapgs:
+ TRACE_IRQS_IRETQ_DEBUG
+paranoid_exit_restore:
+ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
INTERRUPT_RETURN
CFI_ENDPROC
END(paranoid_exit)
/*
- * Exception entry point. This expects an error code/orig_rax on the stack.
- * returns in "no swapgs flag" in %ebx.
+ * Save all registers in pt_regs, and switch gs if needed.
+ * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
*/
ENTRY(error_entry)
- XCPT_FRAME
- CFI_ADJUST_CFA_OFFSET 15*8
- /* oldrax contains error code */
+ XCPT_FRAME 1 15*8
cld
- movq %rdi, RDI+8(%rsp)
- movq %rsi, RSI+8(%rsp)
- movq %rdx, RDX+8(%rsp)
- movq %rcx, RCX+8(%rsp)
- movq %rax, RAX+8(%rsp)
- movq %r8, R8+8(%rsp)
- movq %r9, R9+8(%rsp)
- movq %r10, R10+8(%rsp)
- movq %r11, R11+8(%rsp)
- movq_cfi rbx, RBX+8
- movq %rbp, RBP+8(%rsp)
- movq %r12, R12+8(%rsp)
- movq %r13, R13+8(%rsp)
- movq %r14, R14+8(%rsp)
- movq %r15, R15+8(%rsp)
+ SAVE_C_REGS 8
+ SAVE_EXTRA_REGS 8
xorl %ebx,%ebx
testl $3,CS+8(%rsp)
je error_kernelspace
@@ -1390,12 +1329,12 @@ error_sti:
TRACE_IRQS_OFF
ret
-/*
- * There are two places in the kernel that can potentially fault with
- * usergs. Handle them here. B stepping K8s sometimes report a
- * truncated RIP for IRET exceptions returning to compat mode. Check
- * for these here too.
- */
+ /*
+ * There are two places in the kernel that can potentially fault with
+ * usergs. Handle them here. B stepping K8s sometimes report a
+ * truncated RIP for IRET exceptions returning to compat mode. Check
+ * for these here too.
+ */
error_kernelspace:
CFI_REL_OFFSET rcx, RCX+8
incl %ebx
@@ -1425,11 +1364,11 @@ error_bad_iret:
END(error_entry)
-/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
+/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
ENTRY(error_exit)
DEFAULT_FRAME
movl %ebx,%eax
- RESTORE_REST
+ RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
GET_THREAD_INFO(%rcx)
@@ -1444,19 +1383,7 @@ ENTRY(error_exit)
CFI_ENDPROC
END(error_exit)
-/*
- * Test if a given stack is an NMI stack or not.
- */
- .macro test_in_nmi reg stack nmi_ret normal_ret
- cmpq %\reg, \stack
- ja \normal_ret
- subq $EXCEPTION_STKSZ, %\reg
- cmpq %\reg, \stack
- jb \normal_ret
- jmp \nmi_ret
- .endm
-
- /* runs on exception stack */
+/* Runs on exception stack */
ENTRY(nmi)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
@@ -1492,7 +1419,7 @@ ENTRY(nmi)
* NMI.
*/
- /* Use %rdx as out temp variable throughout */
+ /* Use %rdx as our temp variable throughout */
pushq_cfi %rdx
CFI_REL_OFFSET rdx, 0
@@ -1517,8 +1444,17 @@ ENTRY(nmi)
* We check the variable because the first NMI could be in a
* breakpoint routine using a breakpoint stack.
*/
- lea 6*8(%rsp), %rdx
- test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
+ lea 6*8(%rsp), %rdx
+ /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
+ cmpq %rdx, 4*8(%rsp)
+ /* If the stack pointer is above the NMI stack, this is a normal NMI */
+ ja first_nmi
+ subq $EXCEPTION_STKSZ, %rdx
+ cmpq %rdx, 4*8(%rsp)
+ /* If it is below the NMI stack, it is a normal NMI */
+ jb first_nmi
+ /* Ah, it is within the NMI stack, treat it as nested */
+
CFI_REMEMBER_STATE
nested_nmi:
@@ -1611,7 +1547,7 @@ first_nmi:
.rept 5
pushq_cfi 11*8(%rsp)
.endr
- CFI_DEF_CFA_OFFSET SS+8-RIP
+ CFI_DEF_CFA_OFFSET 5*8
/* Everything up to here is safe from nested NMIs */
@@ -1639,7 +1575,7 @@ repeat_nmi:
pushq_cfi -6*8(%rsp)
.endr
subq $(5*8), %rsp
- CFI_DEF_CFA_OFFSET SS+8-RIP
+ CFI_DEF_CFA_OFFSET 5*8
end_repeat_nmi:
/*
@@ -1648,16 +1584,16 @@ end_repeat_nmi:
* so that we repeat another NMI.
*/
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+ ALLOC_PT_GPREGS_ON_STACK
+
/*
- * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
+ * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
* as we should not be calling schedule in NMI context.
* Even with normal interrupts enabled. An NMI should not be
* setting NEED_RESCHED or anything that normal interrupts and
* exceptions might do.
*/
- call save_paranoid
+ call paranoid_entry
DEFAULT_FRAME 0
/*
@@ -1688,8 +1624,10 @@ end_repeat_nmi:
nmi_swapgs:
SWAPGS_UNSAFE_STACK
nmi_restore:
+ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS
/* Pop the extra iret frame at once */
- RESTORE_ALL 6*8
+ REMOVE_PT_GPREGS_FROM_STACK 6*8
/* Clear the NMI executing stack variable */
movq $0, 5*8(%rsp)
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index c4f8d4659070..2b55ee6db053 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -177,9 +177,6 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
*/
load_ucode_bsp();
- if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG)
- early_printk("Kernel alive\n");
-
clear_page(init_level4_pgt);
/* set init_level4_pgt kernel high mapping*/
init_level4_pgt[511] = early_level4_pgt[511];
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f36bd42d6f0c..d031bad9e07e 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -22,6 +22,7 @@
#include <asm/cpufeature.h>
#include <asm/percpu.h>
#include <asm/nops.h>
+#include <asm/bootparam.h>
/* Physical address */
#define pa(X) ((X) - __PAGE_OFFSET)
@@ -90,7 +91,7 @@ ENTRY(startup_32)
/* test KEEP_SEGMENTS flag to see if the bootloader is asking
us to not reload segments */
- testb $(1<<6), BP_loadflags(%esi)
+ testb $KEEP_SEGMENTS, BP_loadflags(%esi)
jnz 2f
/*
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 6fd514d9f69a..ae6588b301c2 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -1,5 +1,5 @@
/*
- * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
+ * linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit
*
* Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
* Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
@@ -56,7 +56,7 @@ startup_64:
* %rsi holds a physical pointer to real_mode_data.
*
* We come here either directly from a 64bit bootloader, or from
- * arch/x86_64/boot/compressed/head.S.
+ * arch/x86/boot/compressed/head_64.S.
*
* We only come here initially at boot nothing else comes here.
*
@@ -146,7 +146,7 @@ startup_64:
leaq level2_kernel_pgt(%rip), %rdi
leaq 4096(%rdi), %r8
/* See if it is a valid page table entry */
-1: testq $1, 0(%rdi)
+1: testb $1, 0(%rdi)
jz 2f
addq %rbp, 0(%rdi)
/* Go to the next page */
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index d5651fce0b71..367f39d35e9c 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -42,8 +42,8 @@ void kernel_fpu_enable(void)
* be set (so that the clts/stts pair does nothing that is
* visible in the interrupted kernel thread).
*
- * Except for the eagerfpu case when we return 1 unless we've already
- * been eager and saved the state in kernel_fpu_begin().
+ * Except for the eagerfpu case when we return true; in the likely case
+ * the thread has FPU but we are not going to set/clear TS.
*/
static inline bool interrupted_kernel_fpu_idle(void)
{
@@ -51,7 +51,7 @@ static inline bool interrupted_kernel_fpu_idle(void)
return false;
if (use_eager_fpu())
- return __thread_has_fpu(current);
+ return true;
return !__thread_has_fpu(current) &&
(read_cr0() & X86_CR0_TS);
@@ -68,7 +68,7 @@ static inline bool interrupted_kernel_fpu_idle(void)
static inline bool interrupted_user_mode(void)
{
struct pt_regs *regs = get_irq_regs();
- return regs && user_mode_vm(regs);
+ return regs && user_mode(regs);
}
/*
@@ -94,9 +94,10 @@ void __kernel_fpu_begin(void)
if (__thread_has_fpu(me)) {
__save_init_fpu(me);
- } else if (!use_eager_fpu()) {
+ } else {
this_cpu_write(fpu_owner_task, NULL);
- clts();
+ if (!use_eager_fpu())
+ clts();
}
}
EXPORT_SYMBOL(__kernel_fpu_begin);
@@ -107,7 +108,7 @@ void __kernel_fpu_end(void)
if (__thread_has_fpu(me)) {
if (WARN_ON(restore_fpu_checking(me)))
- drop_init_fpu(me);
+ fpu_reset_state(me);
} else if (!use_eager_fpu()) {
stts();
}
@@ -120,10 +121,13 @@ void unlazy_fpu(struct task_struct *tsk)
{
preempt_disable();
if (__thread_has_fpu(tsk)) {
- __save_init_fpu(tsk);
- __thread_fpu_end(tsk);
- } else
- tsk->thread.fpu_counter = 0;
+ if (use_eager_fpu()) {
+ __save_fpu(tsk);
+ } else {
+ __save_init_fpu(tsk);
+ __thread_fpu_end(tsk);
+ }
+ }
preempt_enable();
}
EXPORT_SYMBOL(unlazy_fpu);
@@ -221,11 +225,12 @@ void fpu_finit(struct fpu *fpu)
return;
}
+ memset(fpu->state, 0, xstate_size);
+
if (cpu_has_fxsr) {
fx_finit(&fpu->state->fxsave);
} else {
struct i387_fsave_struct *fp = &fpu->state->fsave;
- memset(fp, 0, xstate_size);
fp->cwd = 0xffff037fu;
fp->swd = 0xffff0000u;
fp->twd = 0xffffffffu;
@@ -247,7 +252,7 @@ int init_fpu(struct task_struct *tsk)
if (tsk_used_math(tsk)) {
if (cpu_has_fpu && tsk == current)
unlazy_fpu(tsk);
- tsk->thread.fpu.last_cpu = ~0;
+ task_disable_lazy_fpu_restore(tsk);
return 0;
}
@@ -336,6 +341,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
unsigned int pos, unsigned int count,
void *kbuf, void __user *ubuf)
{
+ struct xsave_struct *xsave = &target->thread.fpu.state->xsave;
int ret;
if (!cpu_has_xsave)
@@ -350,14 +356,12 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
* memory layout in the thread struct, so that we can copy the entire
* xstateregs to the user using one user_regset_copyout().
*/
- memcpy(&target->thread.fpu.state->fxsave.sw_reserved,
- xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
-
+ memcpy(&xsave->i387.sw_reserved,
+ xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
/*
* Copy the xstate memory layout.
*/
- ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &target->thread.fpu.state->xsave, 0, -1);
+ ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
return ret;
}
@@ -365,8 +369,8 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
unsigned int pos, unsigned int count,
const void *kbuf, const void __user *ubuf)
{
+ struct xsave_struct *xsave = &target->thread.fpu.state->xsave;
int ret;
- struct xsave_hdr_struct *xsave_hdr;
if (!cpu_has_xsave)
return -ENODEV;
@@ -375,22 +379,16 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
if (ret)
return ret;
- ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &target->thread.fpu.state->xsave, 0, -1);
-
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
/*
* mxcsr reserved bits must be masked to zero for security reasons.
*/
- target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
-
- xsave_hdr = &target->thread.fpu.state->xsave.xsave_hdr;
-
- xsave_hdr->xstate_bv &= pcntxt_mask;
+ xsave->i387.mxcsr &= mxcsr_feature_mask;
+ xsave->xsave_hdr.xstate_bv &= pcntxt_mask;
/*
* These bits must be zero.
*/
- memset(xsave_hdr->reserved, 0, 48);
-
+ memset(&xsave->xsave_hdr.reserved, 0, 48);
return ret;
}
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 4ddaf66ea35f..37dae792dbbe 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -54,7 +54,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
* because the ->io_bitmap_max value must match the bitmap
* contents:
*/
- tss = &per_cpu(init_tss, get_cpu());
+ tss = &per_cpu(cpu_tss, get_cpu());
if (turn_on)
bitmap_clear(t->io_bitmap_ptr, from, num);
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 67b1cbe0093a..e5952c225532 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -295,7 +295,7 @@ int check_irq_vectors_for_cpu_disable(void)
this_cpu = smp_processor_id();
cpumask_copy(&online_new, cpu_online_mask);
- cpu_clear(this_cpu, online_new);
+ cpumask_clear_cpu(this_cpu, &online_new);
this_count = 0;
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
@@ -307,7 +307,7 @@ int check_irq_vectors_for_cpu_disable(void)
data = irq_desc_get_irq_data(desc);
cpumask_copy(&affinity_new, data->affinity);
- cpu_clear(this_cpu, affinity_new);
+ cpumask_clear_cpu(this_cpu, &affinity_new);
/* Do not count inactive or per-cpu irqs. */
if (!irq_has_action(irq) || irqd_is_per_cpu(data))
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 28d28f5eb8f4..f9fd86a7fcc7 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -165,7 +165,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
if (unlikely(!desc))
return false;
- if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
+ if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
if (unlikely(overflow))
print_stack_overflow();
desc->handle_irq(irq, desc);
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index e4b503d5558c..394e643d7830 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -44,7 +44,7 @@ static inline void stack_overflow_check(struct pt_regs *regs)
u64 estack_top, estack_bottom;
u64 curbase = (u64)task_stack_page(current);
- if (user_mode_vm(regs))
+ if (user_mode(regs))
return;
if (regs->sp >= curbase + sizeof(struct thread_info) +
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 70e181ea1eac..cd10a6437264 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -178,7 +178,8 @@ void __init native_init_IRQ(void)
#endif
for_each_clear_bit_from(i, used_vectors, first_system_vector) {
/* IA32_SYSCALL_VECTOR could be used in trap_init already. */
- set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
+ set_intr_gate(i, irq_entries_start +
+ 8 * (i - FIRST_EXTERNAL_VECTOR));
}
#ifdef CONFIG_X86_LOCAL_APIC
for_each_clear_bit_from(i, used_vectors, NR_VECTORS)
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 25ecd56cefa8..d6178d9791db 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -126,11 +126,11 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
#ifdef CONFIG_X86_32
switch (regno) {
case GDB_SS:
- if (!user_mode_vm(regs))
+ if (!user_mode(regs))
*(unsigned long *)mem = __KERNEL_DS;
break;
case GDB_SP:
- if (!user_mode_vm(regs))
+ if (!user_mode(regs))
*(unsigned long *)mem = kernel_stack_pointer(regs);
break;
case GDB_GS:
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 4e3d5a9621fe..1deffe6cc873 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -354,6 +354,7 @@ int __copy_instruction(u8 *dest, u8 *src)
{
struct insn insn;
kprobe_opcode_t buf[MAX_INSN_SIZE];
+ int length;
unsigned long recovered_insn =
recover_probed_instruction(buf, (unsigned long)src);
@@ -361,16 +362,18 @@ int __copy_instruction(u8 *dest, u8 *src)
return 0;
kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
insn_get_length(&insn);
+ length = insn.length;
+
/* Another subsystem puts a breakpoint, failed to recover */
if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
return 0;
- memcpy(dest, insn.kaddr, insn.length);
+ memcpy(dest, insn.kaddr, length);
#ifdef CONFIG_X86_64
if (insn_rip_relative(&insn)) {
s64 newdisp;
u8 *disp;
- kernel_insn_init(&insn, dest, insn.length);
+ kernel_insn_init(&insn, dest, length);
insn_get_displacement(&insn);
/*
* The copied instruction uses the %rip-relative addressing
@@ -394,7 +397,7 @@ int __copy_instruction(u8 *dest, u8 *src)
*(s32 *) disp = (s32) newdisp;
}
#endif
- return insn.length;
+ return length;
}
static int arch_copy_kprobe(struct kprobe *p)
@@ -602,7 +605,7 @@ int kprobe_int3_handler(struct pt_regs *regs)
struct kprobe *p;
struct kprobe_ctlblk *kcb;
- if (user_mode_vm(regs))
+ if (user_mode(regs))
return 0;
addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
@@ -1007,7 +1010,7 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
struct die_args *args = data;
int ret = NOTIFY_DONE;
- if (args->regs && user_mode_vm(args->regs))
+ if (args->regs && user_mode(args->regs))
return ret;
if (val == DIE_GPF) {
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index e354cc6446ab..9435620062df 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -513,7 +513,7 @@ void __init kvm_guest_init(void)
* can get false positives too easily, for example if the host is
* overcommitted.
*/
- watchdog_enable_hardlockup_detector(false);
+ hardlockup_detector_disable();
}
static noinline uint32_t __kvm_cpuid_base(void)
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index d1ac80b72c72..005c03e93fc5 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -33,6 +33,7 @@
#include <asm/page.h>
#include <asm/pgtable.h>
+#include <asm/setup.h>
#if 0
#define DEBUGP(fmt, ...) \
@@ -47,21 +48,13 @@ do { \
#ifdef CONFIG_RANDOMIZE_BASE
static unsigned long module_load_offset;
-static int randomize_modules = 1;
/* Mutex protects the module_load_offset. */
static DEFINE_MUTEX(module_kaslr_mutex);
-static int __init parse_nokaslr(char *p)
-{
- randomize_modules = 0;
- return 0;
-}
-early_param("nokaslr", parse_nokaslr);
-
static unsigned long int get_module_load_offset(void)
{
- if (randomize_modules) {
+ if (kaslr_enabled()) {
mutex_lock(&module_kaslr_mutex);
/*
* Calculate the module_load_offset the first time this
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 548d25f00c90..c614dd492f5f 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -443,7 +443,7 @@ struct pv_mmu_ops pv_mmu_ops = {
.ptep_modify_prot_start = __ptep_modify_prot_start,
.ptep_modify_prot_commit = __ptep_modify_prot_commit,
-#if PAGETABLE_LEVELS >= 3
+#if CONFIG_PGTABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
.set_pte_atomic = native_set_pte_atomic,
.pte_clear = native_pte_clear,
@@ -454,13 +454,13 @@ struct pv_mmu_ops pv_mmu_ops = {
.pmd_val = PTE_IDENT,
.make_pmd = PTE_IDENT,
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
.pud_val = PTE_IDENT,
.make_pud = PTE_IDENT,
.set_pgd = native_set_pgd,
#endif
-#endif /* PAGETABLE_LEVELS >= 3 */
+#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
.pte_val = PTE_IDENT,
.pgd_val = PTE_IDENT,
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index 781861cc5ee8..da8cb987b973 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -131,10 +131,11 @@ void perf_get_regs_user(struct perf_regs *regs_user,
}
/*
- * RIP, flags, and the argument registers are usually saved.
- * orig_ax is probably okay, too.
+ * These registers are always saved on 64-bit syscall entry.
+ * On 32-bit entry points, they are saved too except r8..r11.
*/
regs_user_copy->ip = user_regs->ip;
+ regs_user_copy->ax = user_regs->ax;
regs_user_copy->cx = user_regs->cx;
regs_user_copy->dx = user_regs->dx;
regs_user_copy->si = user_regs->si;
@@ -145,9 +146,12 @@ void perf_get_regs_user(struct perf_regs *regs_user,
regs_user_copy->r11 = user_regs->r11;
regs_user_copy->orig_ax = user_regs->orig_ax;
regs_user_copy->flags = user_regs->flags;
+ regs_user_copy->sp = user_regs->sp;
+ regs_user_copy->cs = user_regs->cs;
+ regs_user_copy->ss = user_regs->ss;
/*
- * Don't even try to report the "rest" regs.
+ * Most system calls don't save these registers, don't report them.
*/
regs_user_copy->bx = -1;
regs_user_copy->bp = -1;
@@ -158,37 +162,13 @@ void perf_get_regs_user(struct perf_regs *regs_user,
/*
* For this to be at all useful, we need a reasonable guess for
- * sp and the ABI. Be careful: we're in NMI context, and we're
+ * the ABI. Be careful: we're in NMI context, and we're
* considering current to be the current task, so we should
* be careful not to look at any other percpu variables that might
* change during context switches.
*/
- if (IS_ENABLED(CONFIG_IA32_EMULATION) &&
- task_thread_info(current)->status & TS_COMPAT) {
- /* Easy case: we're in a compat syscall. */
- regs_user->abi = PERF_SAMPLE_REGS_ABI_32;
- regs_user_copy->sp = user_regs->sp;
- regs_user_copy->cs = user_regs->cs;
- regs_user_copy->ss = user_regs->ss;
- } else if (user_regs->orig_ax != -1) {
- /*
- * We're probably in a 64-bit syscall.
- * Warning: this code is severely racy. At least it's better
- * than just blindly copying user_regs.
- */
- regs_user->abi = PERF_SAMPLE_REGS_ABI_64;
- regs_user_copy->sp = this_cpu_read(old_rsp);
- regs_user_copy->cs = __USER_CS;
- regs_user_copy->ss = __USER_DS;
- regs_user_copy->cx = -1; /* usually contains garbage */
- } else {
- /* We're probably in an interrupt or exception. */
- regs_user->abi = user_64bit_mode(user_regs) ?
- PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32;
- regs_user_copy->sp = user_regs->sp;
- regs_user_copy->cs = user_regs->cs;
- regs_user_copy->ss = user_regs->ss;
- }
+ regs_user->abi = user_64bit_mode(user_regs) ?
+ PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32;
regs_user->regs = regs_user_copy;
}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 046e2d620bbe..8213da62b1b7 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,7 +9,7 @@
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
-#include <linux/clockchips.h>
+#include <linux/tick.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
@@ -24,6 +24,7 @@
#include <asm/syscalls.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
+#include <asm/mwait.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/debugreg.h>
@@ -37,7 +38,26 @@
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+ .x86_tss = {
+ .sp0 = TOP_OF_INIT_STACK,
+#ifdef CONFIG_X86_32
+ .ss0 = __KERNEL_DS,
+ .ss1 = __KERNEL_CS,
+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+#endif
+ },
+#ifdef CONFIG_X86_32
+ /*
+ * Note that the .io_bitmap member must be extra-big. This is because
+ * the CPU will access an additional byte beyond the end of the IO
+ * permission bitmap. The extra byte must be all 1 bits, and must
+ * be within the limit.
+ */
+ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
+#endif
+};
+EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss);
#ifdef CONFIG_X86_64
static DEFINE_PER_CPU(unsigned char, is_idle);
@@ -69,8 +89,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
dst->thread.fpu_counter = 0;
dst->thread.fpu.has_fpu = 0;
- dst->thread.fpu.last_cpu = ~0;
dst->thread.fpu.state = NULL;
+ task_disable_lazy_fpu_restore(dst);
if (tsk_used_math(src)) {
int err = fpu_alloc(&dst->thread.fpu);
if (err)
@@ -109,7 +129,7 @@ void exit_thread(void)
unsigned long *bp = t->io_bitmap_ptr;
if (bp) {
- struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
+ struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
t->io_bitmap_ptr = NULL;
clear_thread_flag(TIF_IO_BITMAP);
@@ -131,13 +151,18 @@ void flush_thread(void)
flush_ptrace_hw_breakpoint(tsk);
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
- drop_init_fpu(tsk);
- /*
- * Free the FPU state for non xsave platforms. They get reallocated
- * lazily at the first use.
- */
- if (!use_eager_fpu())
+
+ if (!use_eager_fpu()) {
+ /* FPU state will be reallocated lazily at the first use. */
+ drop_fpu(tsk);
free_thread_xstate(tsk);
+ } else if (!used_math()) {
+ /* kthread execs. TODO: cleanup this horror. */
+ if (WARN_ON(init_fpu(tsk)))
+ force_sig(SIGKILL, tsk);
+ user_fpu_begin();
+ restore_init_xstate();
+ }
}
static void hard_disable_TSC(void)
@@ -377,14 +402,11 @@ static void amd_e400_idle(void)
if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
cpumask_set_cpu(cpu, amd_e400_c1e_mask);
- /*
- * Force broadcast so ACPI can not interfere.
- */
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
- &cpu);
+ /* Force broadcast so ACPI can not interfere. */
+ tick_broadcast_force();
pr_info("Switch to broadcast mode on CPU%d\n", cpu);
}
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
+ tick_broadcast_enter();
default_idle();
@@ -393,12 +415,59 @@ static void amd_e400_idle(void)
* called with interrupts disabled.
*/
local_irq_disable();
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+ tick_broadcast_exit();
local_irq_enable();
} else
default_idle();
}
+/*
+ * Intel Core2 and older machines prefer MWAIT over HALT for C1.
+ * We can't rely on cpuidle installing MWAIT, because it will not load
+ * on systems that support only C1 -- so the boot default must be MWAIT.
+ *
+ * Some AMD machines are the opposite, they depend on using HALT.
+ *
+ * So for default C1, which is used during boot until cpuidle loads,
+ * use MWAIT-C1 on Intel HW that has it, else use HALT.
+ */
+static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
+{
+ if (c->x86_vendor != X86_VENDOR_INTEL)
+ return 0;
+
+ if (!cpu_has(c, X86_FEATURE_MWAIT))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * MONITOR/MWAIT with no hints, used for default default C1 state.
+ * This invokes MWAIT with interrutps enabled and no flags,
+ * which is backwards compatible with the original MWAIT implementation.
+ */
+
+static void mwait_idle(void)
+{
+ if (!current_set_polling_and_test()) {
+ if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
+ smp_mb(); /* quirk */
+ clflush((void *)&current_thread_info()->flags);
+ smp_mb(); /* quirk */
+ }
+
+ __monitor((void *)&current_thread_info()->flags, 0, 0);
+ if (!need_resched())
+ __sti_mwait(0, 0);
+ else
+ local_irq_enable();
+ } else {
+ local_irq_enable();
+ }
+ __current_clr_polling();
+}
+
void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
@@ -412,6 +481,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
/* E400: APIC timer interrupt does not wake up CPU from C1e */
pr_info("using AMD E400 aware idle routine\n");
x86_idle = amd_e400_idle;
+ } else if (prefer_mwait_c1_over_halt(c)) {
+ pr_info("using mwait in idle threads\n");
+ x86_idle = mwait_idle;
} else
x86_idle = default_idle;
}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 603c4f99cb5a..8ed2106b06da 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -73,7 +73,7 @@ void __show_regs(struct pt_regs *regs, int all)
unsigned long sp;
unsigned short ss, gs;
- if (user_mode_vm(regs)) {
+ if (user_mode(regs)) {
sp = regs->sp;
ss = regs->ss & 0xffff;
gs = get_user_gs(regs);
@@ -206,11 +206,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
regs->ip = new_ip;
regs->sp = new_sp;
regs->flags = X86_EFLAGS_IF;
- /*
- * force it to the iret return path by making it look as if there was
- * some work pending.
- */
- set_thread_flag(TIF_NOTIFY_RESUME);
+ force_iret();
}
EXPORT_SYMBOL_GPL(start_thread);
@@ -248,7 +244,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
int cpu = smp_processor_id();
- struct tss_struct *tss = &per_cpu(init_tss, cpu);
+ struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
fpu_switch_t fpu;
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
@@ -256,11 +252,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
fpu = switch_fpu_prepare(prev_p, next_p, cpu);
/*
- * Reload esp0.
- */
- load_sp0(tss, next);
-
- /*
* Save away %gs. No need to save %fs, as it was saved on the
* stack on entry. No need to save %es and %ds, as those are
* always kernel segments while inside the kernel. Doing this
@@ -310,9 +301,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
arch_end_context_switch(next_p);
+ /*
+ * Reload esp0, kernel_stack, and current_top_of_stack. This changes
+ * current_thread_info().
+ */
+ load_sp0(tss, next);
this_cpu_write(kernel_stack,
- (unsigned long)task_stack_page(next_p) +
- THREAD_SIZE - KERNEL_STACK_OFFSET);
+ (unsigned long)task_stack_page(next_p) +
+ THREAD_SIZE);
+ this_cpu_write(cpu_current_top_of_stack,
+ (unsigned long)task_stack_page(next_p) +
+ THREAD_SIZE);
/*
* Restore %gs if needed (which is common)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 67fcc43577d2..4baaa972f52a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -52,7 +52,7 @@
asmlinkage extern void ret_from_fork(void);
-__visible DEFINE_PER_CPU(unsigned long, old_rsp);
+__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
@@ -161,7 +161,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
childregs = task_pt_regs(p);
p->thread.sp = (unsigned long) childregs;
- p->thread.usersp = me->thread.usersp;
set_tsk_thread_flag(p, TIF_FORK);
p->thread.io_bitmap_ptr = NULL;
@@ -207,7 +206,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
*/
if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
- if (test_thread_flag(TIF_IA32))
+ if (is_ia32_task())
err = do_set_thread_area(p, -1,
(struct user_desc __user *)childregs->si, 0);
else
@@ -235,13 +234,12 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
loadsegment(es, _ds);
loadsegment(ds, _ds);
load_gs_index(0);
- current->thread.usersp = new_sp;
regs->ip = new_ip;
regs->sp = new_sp;
- this_cpu_write(old_rsp, new_sp);
regs->cs = _cs;
regs->ss = _ss;
regs->flags = X86_EFLAGS_IF;
+ force_iret();
}
void
@@ -277,15 +275,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct thread_struct *prev = &prev_p->thread;
struct thread_struct *next = &next_p->thread;
int cpu = smp_processor_id();
- struct tss_struct *tss = &per_cpu(init_tss, cpu);
+ struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
unsigned fsindex, gsindex;
fpu_switch_t fpu;
fpu = switch_fpu_prepare(prev_p, next_p, cpu);
- /* Reload esp0 and ss1. */
- load_sp0(tss, next);
-
/* We must save %fs and %gs before load_TLS() because
* %fs and %gs may be cleared by load_TLS().
*
@@ -401,8 +396,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Switch the PDA and FPU contexts.
*/
- prev->usersp = this_cpu_read(old_rsp);
- this_cpu_write(old_rsp, next->usersp);
this_cpu_write(current_task, next_p);
/*
@@ -413,9 +406,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
+ /* Reload esp0 and ss1. This changes current_thread_info(). */
+ load_sp0(tss, next);
+
this_cpu_write(kernel_stack,
- (unsigned long)task_stack_page(next_p) +
- THREAD_SIZE - KERNEL_STACK_OFFSET);
+ (unsigned long)task_stack_page(next_p) + THREAD_SIZE);
/*
* Now maybe reload the debug registers and handle I/O bitmaps
@@ -602,6 +597,5 @@ long sys_arch_prctl(int code, unsigned long addr)
unsigned long KSTK_ESP(struct task_struct *task)
{
- return (test_tsk_thread_flag(task, TIF_IA32)) ?
- (task_pt_regs(task)->sp) : ((task)->thread.usersp);
+ return task_pt_regs(task)->sp;
}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index e510618b2e91..a7bc79480719 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -364,18 +364,12 @@ static int set_segment_reg(struct task_struct *task,
case offsetof(struct user_regs_struct,cs):
if (unlikely(value == 0))
return -EIO;
-#ifdef CONFIG_IA32_EMULATION
- if (test_tsk_thread_flag(task, TIF_IA32))
- task_pt_regs(task)->cs = value;
-#endif
+ task_pt_regs(task)->cs = value;
break;
case offsetof(struct user_regs_struct,ss):
if (unlikely(value == 0))
return -EIO;
-#ifdef CONFIG_IA32_EMULATION
- if (test_tsk_thread_flag(task, TIF_IA32))
- task_pt_regs(task)->ss = value;
-#endif
+ task_pt_regs(task)->ss = value;
break;
}
@@ -1421,7 +1415,7 @@ static void fill_sigtrap_info(struct task_struct *tsk,
memset(info, 0, sizeof(*info));
info->si_signo = SIGTRAP;
info->si_code = si_code;
- info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL;
+ info->si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
}
void user_single_step_siginfo(struct task_struct *tsk,
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 2f355d229a58..e5ecd20e72dd 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -141,7 +141,46 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
+static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
+
+static struct pvclock_vsyscall_time_info *
+pvclock_get_vsyscall_user_time_info(int cpu)
+{
+ if (!pvclock_vdso_info) {
+ BUG();
+ return NULL;
+ }
+
+ return &pvclock_vdso_info[cpu];
+}
+
+struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
+{
+ return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
+}
+
#ifdef CONFIG_X86_64
+static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
+ void *v)
+{
+ struct task_migration_notifier *mn = v;
+ struct pvclock_vsyscall_time_info *pvti;
+
+ pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
+
+ /* this is NULL when pvclock vsyscall is not initialized */
+ if (unlikely(pvti == NULL))
+ return NOTIFY_DONE;
+
+ pvti->migrate_count++;
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block pvclock_migrate = {
+ .notifier_call = pvclock_task_migrate,
+};
+
/*
* Initialize the generic pvclock vsyscall state. This will allocate
* a/some page(s) for the per-vcpu pvclock information, set up a
@@ -155,12 +194,17 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
+ pvclock_vdso_info = i;
+
for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
__set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
__pa(i) + (idx*PAGE_SIZE),
PAGE_KERNEL_VVAR);
}
+
+ register_task_migration_notifier(&pvclock_migrate);
+
return 0;
}
#endif
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index e13f8e7c22a6..77630d57e7bf 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -226,23 +226,23 @@ swap_pages:
movl (%ebx), %ecx
addl $4, %ebx
1:
- testl $0x1, %ecx /* is it a destination page */
+ testb $0x1, %cl /* is it a destination page */
jz 2f
movl %ecx, %edi
andl $0xfffff000, %edi
jmp 0b
2:
- testl $0x2, %ecx /* is it an indirection page */
+ testb $0x2, %cl /* is it an indirection page */
jz 2f
movl %ecx, %ebx
andl $0xfffff000, %ebx
jmp 0b
2:
- testl $0x4, %ecx /* is it the done indicator */
+ testb $0x4, %cl /* is it the done indicator */
jz 2f
jmp 3f
2:
- testl $0x8, %ecx /* is it the source indicator */
+ testb $0x8, %cl /* is it the source indicator */
jz 0b /* Ignore it otherwise */
movl %ecx, %esi /* For every source page do a copy */
andl $0xfffff000, %esi
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 3fd2c693e475..98111b38ebfd 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -123,7 +123,7 @@ identity_mapped:
* Set cr4 to a known state:
* - physical address extension enabled
*/
- movq $X86_CR4_PAE, %rax
+ movl $X86_CR4_PAE, %eax
movq %rax, %cr4
jmp 1f
@@ -221,23 +221,23 @@ swap_pages:
movq (%rbx), %rcx
addq $8, %rbx
1:
- testq $0x1, %rcx /* is it a destination page? */
+ testb $0x1, %cl /* is it a destination page? */
jz 2f
movq %rcx, %rdi
andq $0xfffffffffffff000, %rdi
jmp 0b
2:
- testq $0x2, %rcx /* is it an indirection page? */
+ testb $0x2, %cl /* is it an indirection page? */
jz 2f
movq %rcx, %rbx
andq $0xfffffffffffff000, %rbx
jmp 0b
2:
- testq $0x4, %rcx /* is it the done indicator? */
+ testb $0x4, %cl /* is it the done indicator? */
jz 2f
jmp 3f
2:
- testq $0x8, %rcx /* is it the source indicator? */
+ testb $0x8, %cl /* is it the source indicator? */
jz 0b /* Ignore it otherwise */
movq %rcx, %rsi /* For ever source page do a copy */
andq $0xfffffffffffff000, %rsi
@@ -246,17 +246,17 @@ swap_pages:
movq %rsi, %rax
movq %r10, %rdi
- movq $512, %rcx
+ movl $512, %ecx
rep ; movsq
movq %rax, %rdi
movq %rdx, %rsi
- movq $512, %rcx
+ movl $512, %ecx
rep ; movsq
movq %rdx, %rdi
movq %r10, %rsi
- movq $512, %rcx
+ movl $512, %ecx
rep ; movsq
lea PAGE_SIZE(%rax), %rsi
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0a2421cca01f..d74ac33290ae 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -354,7 +354,7 @@ static void __init relocate_initrd(void)
mapaddr = ramdisk_image & PAGE_MASK;
p = early_memremap(mapaddr, clen+slop);
memcpy(q, p+slop, clen);
- early_iounmap(p, clen+slop);
+ early_memunmap(p, clen+slop);
q += clen;
ramdisk_image += clen;
ramdisk_size -= clen;
@@ -438,7 +438,7 @@ static void __init parse_setup_data(void)
data_len = data->len + sizeof(struct setup_data);
data_type = data->type;
pa_next = data->next;
- early_iounmap(data, sizeof(*data));
+ early_memunmap(data, sizeof(*data));
switch (data_type) {
case SETUP_E820_EXT:
@@ -470,7 +470,7 @@ static void __init e820_reserve_setup_data(void)
E820_RAM, E820_RESERVED_KERN);
found = 1;
pa_data = data->next;
- early_iounmap(data, sizeof(*data));
+ early_memunmap(data, sizeof(*data));
}
if (!found)
return;
@@ -491,7 +491,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
data = early_memremap(pa_data, sizeof(*data));
memblock_reserve(pa_data, sizeof(*data) + data->len);
pa_data = data->next;
- early_iounmap(data, sizeof(*data));
+ early_memunmap(data, sizeof(*data));
}
}
@@ -832,10 +832,15 @@ static void __init trim_low_memory_range(void)
static int
dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
{
- pr_emerg("Kernel Offset: 0x%lx from 0x%lx "
- "(relocation range: 0x%lx-0x%lx)\n",
- (unsigned long)&_text - __START_KERNEL, __START_KERNEL,
- __START_KERNEL_map, MODULES_VADDR-1);
+ if (kaslr_enabled()) {
+ pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
+ (unsigned long)&_text - __START_KERNEL,
+ __START_KERNEL,
+ __START_KERNEL_map,
+ MODULES_VADDR-1);
+ } else {
+ pr_emerg("Kernel Offset: disabled\n");
+ }
return 0;
}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index e5042463c1bc..3e581865c8e2 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -61,8 +61,7 @@
regs->seg = GET_SEG(seg) | 3; \
} while (0)
-int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
- unsigned long *pax)
+int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
{
void __user *buf;
unsigned int tmpflags;
@@ -81,7 +80,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
#endif /* CONFIG_X86_32 */
COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
- COPY(dx); COPY(cx); COPY(ip);
+ COPY(dx); COPY(cx); COPY(ip); COPY(ax);
#ifdef CONFIG_X86_64
COPY(r8);
@@ -94,27 +93,20 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
COPY(r15);
#endif /* CONFIG_X86_64 */
-#ifdef CONFIG_X86_32
COPY_SEG_CPL3(cs);
COPY_SEG_CPL3(ss);
-#else /* !CONFIG_X86_32 */
- /* Kernel saves and restores only the CS segment register on signals,
- * which is the bare minimum needed to allow mixed 32/64-bit code.
- * App's signal handler can save/restore other segments if needed. */
- COPY_SEG_CPL3(cs);
-#endif /* CONFIG_X86_32 */
get_user_ex(tmpflags, &sc->flags);
regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
regs->orig_ax = -1; /* disable syscall checks */
get_user_ex(buf, &sc->fpstate);
-
- get_user_ex(*pax, &sc->ax);
} get_user_catch(err);
err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32));
+ force_iret();
+
return err;
}
@@ -162,8 +154,9 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
#else /* !CONFIG_X86_32 */
put_user_ex(regs->flags, &sc->flags);
put_user_ex(regs->cs, &sc->cs);
- put_user_ex(0, &sc->gs);
- put_user_ex(0, &sc->fs);
+ put_user_ex(0, &sc->__pad2);
+ put_user_ex(0, &sc->__pad1);
+ put_user_ex(regs->ss, &sc->ss);
#endif /* CONFIG_X86_32 */
put_user_ex(fpstate, &sc->fpstate);
@@ -457,9 +450,19 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
regs->sp = (unsigned long)frame;
- /* Set up the CS register to run signal handlers in 64-bit mode,
- even if the handler happens to be interrupting 32-bit code. */
+ /*
+ * Set up the CS and SS registers to run signal handlers in
+ * 64-bit mode, even if the handler happens to be interrupting
+ * 32-bit or 16-bit code.
+ *
+ * SS is subtle. In 64-bit mode, we don't need any particular
+ * SS descriptor, but we do need SS to be valid. It's possible
+ * that the old SS is entirely bogus -- this can happen if the
+ * signal we're trying to deliver is #GP or #SS caused by a bad
+ * SS value.
+ */
regs->cs = __USER_CS;
+ regs->ss = __USER_DS;
return 0;
}
@@ -539,7 +542,6 @@ asmlinkage unsigned long sys_sigreturn(void)
{
struct pt_regs *regs = current_pt_regs();
struct sigframe __user *frame;
- unsigned long ax;
sigset_t set;
frame = (struct sigframe __user *)(regs->sp - 8);
@@ -553,9 +555,9 @@ asmlinkage unsigned long sys_sigreturn(void)
set_current_blocked(&set);
- if (restore_sigcontext(regs, &frame->sc, &ax))
+ if (restore_sigcontext(regs, &frame->sc))
goto badframe;
- return ax;
+ return regs->ax;
badframe:
signal_fault(regs, frame, "sigreturn");
@@ -568,7 +570,6 @@ asmlinkage long sys_rt_sigreturn(void)
{
struct pt_regs *regs = current_pt_regs();
struct rt_sigframe __user *frame;
- unsigned long ax;
sigset_t set;
frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
@@ -579,13 +580,13 @@ asmlinkage long sys_rt_sigreturn(void)
set_current_blocked(&set);
- if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
+ if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
goto badframe;
if (restore_altstack(&frame->uc.uc_stack))
goto badframe;
- return ax;
+ return regs->ax;
badframe:
signal_fault(regs, frame, "rt_sigreturn");
@@ -679,7 +680,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
* Ensure the signal handler starts with the new fpu state.
*/
if (used_math())
- drop_init_fpu(current);
+ fpu_reset_state(current);
}
signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP));
}
@@ -780,7 +781,6 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
struct pt_regs *regs = current_pt_regs();
struct rt_sigframe_x32 __user *frame;
sigset_t set;
- unsigned long ax;
frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
@@ -791,13 +791,13 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
set_current_blocked(&set);
- if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
+ if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
goto badframe;
if (compat_restore_altstack(&frame->uc.uc_stack))
goto badframe;
- return ax;
+ return regs->ax;
badframe:
signal_fault(regs, frame, "x32 rt_sigreturn");
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index febc6aabc72e..50e547eac8cd 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -77,9 +77,6 @@
#include <asm/realmode.h>
#include <asm/misc.h>
-/* State of each CPU */
-DEFINE_PER_CPU(int, cpu_state) = { 0 };
-
/* Number of siblings per CPU package */
int smp_num_siblings = 1;
EXPORT_SYMBOL(smp_num_siblings);
@@ -257,7 +254,7 @@ static void notrace start_secondary(void *unused)
lock_vector_lock();
set_cpu_online(smp_processor_id(), true);
unlock_vector_lock();
- per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+ cpu_set_state_online(smp_processor_id());
x86_platform.nmi_init();
/* enable local interrupts */
@@ -779,6 +776,26 @@ out:
return boot_error;
}
+void common_cpu_up(unsigned int cpu, struct task_struct *idle)
+{
+ /* Just in case we booted with a single CPU. */
+ alternatives_enable_smp();
+
+ per_cpu(current_task, cpu) = idle;
+
+#ifdef CONFIG_X86_32
+ /* Stack for startup_32 can be just as for start_secondary onwards */
+ irq_ctx_init(cpu);
+ per_cpu(cpu_current_top_of_stack, cpu) =
+ (unsigned long)task_stack_page(idle) + THREAD_SIZE;
+#else
+ clear_tsk_thread_flag(idle, TIF_FORK);
+ initial_gs = per_cpu_offset(cpu);
+#endif
+ per_cpu(kernel_stack, cpu) =
+ (unsigned long)task_stack_page(idle) + THREAD_SIZE;
+}
+
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
* (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -796,23 +813,9 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
int cpu0_nmi_registered = 0;
unsigned long timeout;
- /* Just in case we booted with a single CPU. */
- alternatives_enable_smp();
-
idle->thread.sp = (unsigned long) (((struct pt_regs *)
(THREAD_SIZE + task_stack_page(idle))) - 1);
- per_cpu(current_task, cpu) = idle;
-#ifdef CONFIG_X86_32
- /* Stack for startup_32 can be just as for start_secondary onwards */
- irq_ctx_init(cpu);
-#else
- clear_tsk_thread_flag(idle, TIF_FORK);
- initial_gs = per_cpu_offset(cpu);
-#endif
- per_cpu(kernel_stack, cpu) =
- (unsigned long)task_stack_page(idle) -
- KERNEL_STACK_OFFSET + THREAD_SIZE;
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
stack_start = idle->thread.sp;
@@ -948,11 +951,16 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
*/
mtrr_save_state();
- per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+ /* x86 CPUs take themselves offline, so delayed offline is OK. */
+ err = cpu_check_up_prepare(cpu);
+ if (err && err != -EBUSY)
+ return err;
/* the FPU context is blank, nobody can own it */
__cpu_disable_lazy_restore(cpu);
+ common_cpu_up(cpu, tidle);
+
err = do_boot_cpu(apicid, cpu, tidle);
if (err) {
pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
@@ -1086,8 +1094,6 @@ static int __init smp_sanity_check(unsigned max_cpus)
return SMP_NO_APIC;
}
- verify_local_APIC();
-
/*
* If SMP should be disabled, then really disable it!
*/
@@ -1191,7 +1197,7 @@ void __init native_smp_prepare_boot_cpu(void)
switch_to_new_gdt(me);
/* already set me in cpu_online_mask in boot_cpu_init() */
cpumask_set_cpu(me, cpu_callout_mask);
- per_cpu(cpu_state, me) = CPU_ONLINE;
+ cpu_set_state_online(me);
}
void __init native_smp_cpus_done(unsigned int max_cpus)
@@ -1318,14 +1324,10 @@ static void __ref remove_cpu_from_maps(int cpu)
numa_remove_cpu(cpu);
}
-static DEFINE_PER_CPU(struct completion, die_complete);
-
void cpu_disable_common(void)
{
int cpu = smp_processor_id();
- init_completion(&per_cpu(die_complete, smp_processor_id()));
-
remove_siblinginfo(cpu);
/* It's now safe to remove this processor from the online map */
@@ -1349,24 +1351,27 @@ int native_cpu_disable(void)
return 0;
}
-void cpu_die_common(unsigned int cpu)
+int common_cpu_die(unsigned int cpu)
{
- wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ);
-}
+ int ret = 0;
-void native_cpu_die(unsigned int cpu)
-{
/* We don't do anything here: idle task is faking death itself. */
- cpu_die_common(cpu);
-
/* They ack this in play_dead() by setting CPU_DEAD */
- if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+ if (cpu_wait_death(cpu, 5)) {
if (system_state == SYSTEM_RUNNING)
pr_info("CPU %u is now offline\n", cpu);
} else {
pr_err("CPU %u didn't die...\n", cpu);
+ ret = -1;
}
+
+ return ret;
+}
+
+void native_cpu_die(unsigned int cpu)
+{
+ common_cpu_die(cpu);
}
void play_dead_common(void)
@@ -1375,10 +1380,8 @@ void play_dead_common(void)
reset_lazy_tlbstate();
amd_e400_remove_cpu(raw_smp_processor_id());
- mb();
/* Ack it */
- __this_cpu_write(cpu_state, CPU_DEAD);
- complete(&per_cpu(die_complete, smp_processor_id()));
+ (void)cpu_report_death();
/*
* With physical CPU hotplug, we should halt the cpu
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 30277e27431a..10e0272d789a 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -34,10 +34,26 @@ static unsigned long get_align_mask(void)
return va_align.mask;
}
+/*
+ * To avoid aliasing in the I$ on AMD F15h, the bits defined by the
+ * va_align.bits, [12:upper_bit), are set to a random value instead of
+ * zeroing them. This random value is computed once per boot. This form
+ * of ASLR is known as "per-boot ASLR".
+ *
+ * To achieve this, the random value is added to the info.align_offset
+ * value before calling vm_unmapped_area() or ORed directly to the
+ * address.
+ */
+static unsigned long get_align_bits(void)
+{
+ return va_align.bits & get_align_mask();
+}
+
unsigned long align_vdso_addr(unsigned long addr)
{
unsigned long align_mask = get_align_mask();
- return (addr + align_mask) & ~align_mask;
+ addr = (addr + align_mask) & ~align_mask;
+ return addr | get_align_bits();
}
static int __init control_va_addr_alignment(char *str)
@@ -135,8 +151,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
info.length = len;
info.low_limit = begin;
info.high_limit = end;
- info.align_mask = filp ? get_align_mask() : 0;
+ info.align_mask = 0;
info.align_offset = pgoff << PAGE_SHIFT;
+ if (filp) {
+ info.align_mask = get_align_mask();
+ info.align_offset += get_align_bits();
+ }
return vm_unmapped_area(&info);
}
@@ -174,8 +194,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
info.length = len;
info.low_limit = PAGE_SIZE;
info.high_limit = mm->mmap_base;
- info.align_mask = filp ? get_align_mask() : 0;
+ info.align_mask = 0;
info.align_offset = pgoff << PAGE_SHIFT;
+ if (filp) {
+ info.align_mask = get_align_mask();
+ info.align_offset += get_align_bits();
+ }
addr = vm_unmapped_area(&info);
if (!(addr & ~PAGE_MASK))
return addr;
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c
index e9bcd57d8a9e..3777189c4a19 100644
--- a/arch/x86/kernel/syscall_32.c
+++ b/arch/x86/kernel/syscall_32.c
@@ -5,21 +5,29 @@
#include <linux/cache.h>
#include <asm/asm-offsets.h>
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ;
+#ifdef CONFIG_IA32_EMULATION
+#define SYM(sym, compat) compat
+#else
+#define SYM(sym, compat) sym
+#define ia32_sys_call_table sys_call_table
+#define __NR_ia32_syscall_max __NR_syscall_max
+#endif
+
+#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ;
#include <asm/syscalls_32.h>
#undef __SYSCALL_I386
-#define __SYSCALL_I386(nr, sym, compat) [nr] = sym,
+#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat),
typedef asmlinkage void (*sys_call_ptr_t)(void);
extern asmlinkage void sys_ni_syscall(void);
-__visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
+__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
/*
* Smells like a compiler bug -- it doesn't work
* when the & below is removed.
*/
- [0 ... __NR_syscall_max] = &sys_ni_syscall,
+ [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall,
#include <asm/syscalls_32.h>
};
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
index b79133abda48..5ecbfe5099da 100644
--- a/arch/x86/kernel/test_rodata.c
+++ b/arch/x86/kernel/test_rodata.c
@@ -57,7 +57,7 @@ int rodata_test(void)
/* test 3: check the value hasn't changed */
/* If this test fails, we managed to overwrite the data */
if (!rodata_test_data) {
- printk(KERN_ERR "rodata_test: Test 3 failes (end data)\n");
+ printk(KERN_ERR "rodata_test: Test 3 fails (end data)\n");
return -ENODEV;
}
/* test 4: check if the rodata section is 4Kb aligned */
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 25adc0e16eaa..d39c09119db6 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -30,7 +30,7 @@ unsigned long profile_pc(struct pt_regs *regs)
{
unsigned long pc = instruction_pointer(regs);
- if (!user_mode_vm(regs) && in_lock_functions(pc)) {
+ if (!user_mode(regs) && in_lock_functions(pc)) {
#ifdef CONFIG_FRAME_POINTER
return *(unsigned long *)(regs->bp + sizeof(long));
#else
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4ff5d162ff9f..324ab5247687 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -112,7 +112,7 @@ enum ctx_state ist_enter(struct pt_regs *regs)
{
enum ctx_state prev_state;
- if (user_mode_vm(regs)) {
+ if (user_mode(regs)) {
/* Other than that, we're just an exception. */
prev_state = exception_enter();
} else {
@@ -123,7 +123,7 @@ enum ctx_state ist_enter(struct pt_regs *regs)
* but we need to notify RCU.
*/
rcu_nmi_enter();
- prev_state = IN_KERNEL; /* the value is irrelevant. */
+ prev_state = CONTEXT_KERNEL; /* the value is irrelevant. */
}
/*
@@ -146,7 +146,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
/* Must be before exception_exit. */
preempt_count_sub(HARDIRQ_OFFSET);
- if (user_mode_vm(regs))
+ if (user_mode(regs))
return exception_exit(prev_state);
else
rcu_nmi_exit();
@@ -158,7 +158,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
*
* IST exception handlers normally cannot schedule. As a special
* exception, if the exception interrupted userspace code (i.e.
- * user_mode_vm(regs) would return true) and the exception was not
+ * user_mode(regs) would return true) and the exception was not
* a double fault, it can be safe to schedule. ist_begin_non_atomic()
* begins a non-atomic section within an ist_enter()/ist_exit() region.
* Callers are responsible for enabling interrupts themselves inside
@@ -167,15 +167,15 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
*/
void ist_begin_non_atomic(struct pt_regs *regs)
{
- BUG_ON(!user_mode_vm(regs));
+ BUG_ON(!user_mode(regs));
/*
* Sanity check: we need to be on the normal thread stack. This
* will catch asm bugs and any attempt to use ist_preempt_enable
* from double_fault.
*/
- BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack))
- & ~(THREAD_SIZE - 1)) != 0);
+ BUG_ON((unsigned long)(current_top_of_stack() -
+ current_stack_pointer()) >= THREAD_SIZE);
preempt_count_sub(HARDIRQ_OFFSET);
}
@@ -194,8 +194,7 @@ static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
struct pt_regs *regs, long error_code)
{
-#ifdef CONFIG_X86_32
- if (regs->flags & X86_VM_MASK) {
+ if (v8086_mode(regs)) {
/*
* Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
* On nmi (interrupt 2), do_trap should not be called.
@@ -207,7 +206,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
}
return -1;
}
-#endif
+
if (!user_mode(regs)) {
if (!fixup_exception(regs)) {
tsk->thread.error_code = error_code;
@@ -384,7 +383,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
goto exit;
conditional_sti(regs);
- if (!user_mode_vm(regs))
+ if (!user_mode(regs))
die("bounds", regs, error_code);
if (!cpu_feature_enabled(X86_FEATURE_MPX)) {
@@ -462,13 +461,11 @@ do_general_protection(struct pt_regs *regs, long error_code)
prev_state = exception_enter();
conditional_sti(regs);
-#ifdef CONFIG_X86_32
- if (regs->flags & X86_VM_MASK) {
+ if (v8086_mode(regs)) {
local_irq_enable();
handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
goto exit;
}
-#endif
tsk = current;
if (!user_mode(regs)) {
@@ -587,7 +584,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
/* Copy the remainder of the stack from the current stack. */
memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
- BUG_ON(!user_mode_vm(&new_stack->regs));
+ BUG_ON(!user_mode(&new_stack->regs));
return new_stack;
}
NOKPROBE_SYMBOL(fixup_bad_iret);
@@ -637,7 +634,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
* then it's very likely the result of an icebp/int01 trap.
* User wants a sigtrap for that.
*/
- if (!dr6 && user_mode_vm(regs))
+ if (!dr6 && user_mode(regs))
user_icebp = 1;
/* Catch kmemcheck conditions first of all! */
@@ -673,7 +670,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
/* It's safe to allow irq's after DR6 has been saved */
preempt_conditional_sti(regs);
- if (regs->flags & X86_VM_MASK) {
+ if (v8086_mode(regs)) {
handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
X86_TRAP_DB);
preempt_conditional_cli(regs);
@@ -721,7 +718,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
return;
conditional_sti(regs);
- if (!user_mode_vm(regs))
+ if (!user_mode(regs))
{
if (!fixup_exception(regs)) {
task->thread.error_code = error_code;
@@ -734,7 +731,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
/*
* Save the info for the exception handler and clear the error.
*/
- save_init_fpu(task);
+ unlazy_fpu(task);
task->thread.trap_nr = trapnr;
task->thread.error_code = error_code;
info.si_signo = SIGFPE;
@@ -863,7 +860,7 @@ void math_state_restore(void)
kernel_fpu_disable();
__thread_fpu_begin(tsk);
if (unlikely(restore_fpu_checking(tsk))) {
- drop_init_fpu(tsk);
+ fpu_reset_state(tsk);
force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
} else {
tsk->thread.fpu_counter++;
@@ -925,9 +922,21 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
/* Set of traps needed for early debugging. */
void __init early_trap_init(void)
{
- set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
+ /*
+ * Don't use IST to set DEBUG_STACK as it doesn't work until TSS
+ * is ready in cpu_init() <-- trap_init(). Before trap_init(),
+ * CPU runs at ring 0 so it is impossible to hit an invalid
+ * stack. Using the original stack works well enough at this
+ * early stage. DEBUG_STACK will be equipped after cpu_init() in
+ * trap_init().
+ *
+ * We don't need to set trace_idt_table like set_intr_gate(),
+ * since we don't have trace_debug and it will be reset to
+ * 'debug' in trap_init() by set_intr_gate_ist().
+ */
+ set_intr_gate_notrace(X86_TRAP_DB, debug);
/* int3 can be called from all */
- set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+ set_system_intr_gate(X86_TRAP_BP, &int3);
#ifdef CONFIG_X86_32
set_intr_gate(X86_TRAP_PF, page_fault);
#endif
@@ -1005,6 +1014,15 @@ void __init trap_init(void)
*/
cpu_init();
+ /*
+ * X86_TRAP_DB and X86_TRAP_BP have been set
+ * in early_trap_init(). However, ITS works only after
+ * cpu_init() loads TSS. See comments in early_trap_init().
+ */
+ set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
+ /* int3 can be called from all */
+ set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+
x86_init.irqs.trap_init();
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 81f8adb0679e..0b81ad67da07 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -912,7 +912,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
int ret = NOTIFY_DONE;
/* We are only interested in userspace traps */
- if (regs && !user_mode_vm(regs))
+ if (regs && !user_mode(regs))
return NOTIFY_DONE;
switch (val) {
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index e8edcf52e069..fc9db6ef2a95 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -150,7 +150,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
do_exit(SIGSEGV);
}
- tss = &per_cpu(init_tss, get_cpu());
+ tss = &per_cpu(cpu_tss, get_cpu());
current->thread.sp0 = current->thread.saved_sp0;
current->thread.sysenter_cs = __KERNEL_CS;
load_sp0(tss, &current->thread);
@@ -318,7 +318,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
tsk->thread.saved_fs = info->regs32->fs;
tsk->thread.saved_gs = get_user_gs(info->regs32);
- tss = &per_cpu(init_tss, get_cpu());
+ tss = &per_cpu(cpu_tss, get_cpu());
tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
if (cpu_has_sep)
tsk->thread.sysenter_cs = 0;
diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c
index c7d791f32b98..51e330416995 100644
--- a/arch/x86/kernel/vsyscall_gtod.c
+++ b/arch/x86/kernel/vsyscall_gtod.c
@@ -31,30 +31,30 @@ void update_vsyscall(struct timekeeper *tk)
gtod_write_begin(vdata);
/* copy vsyscall data */
- vdata->vclock_mode = tk->tkr.clock->archdata.vclock_mode;
- vdata->cycle_last = tk->tkr.cycle_last;
- vdata->mask = tk->tkr.mask;
- vdata->mult = tk->tkr.mult;
- vdata->shift = tk->tkr.shift;
+ vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
+ vdata->cycle_last = tk->tkr_mono.cycle_last;
+ vdata->mask = tk->tkr_mono.mask;
+ vdata->mult = tk->tkr_mono.mult;
+ vdata->shift = tk->tkr_mono.shift;
vdata->wall_time_sec = tk->xtime_sec;
- vdata->wall_time_snsec = tk->tkr.xtime_nsec;
+ vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec;
vdata->monotonic_time_sec = tk->xtime_sec
+ tk->wall_to_monotonic.tv_sec;
- vdata->monotonic_time_snsec = tk->tkr.xtime_nsec
+ vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec
+ ((u64)tk->wall_to_monotonic.tv_nsec
- << tk->tkr.shift);
+ << tk->tkr_mono.shift);
while (vdata->monotonic_time_snsec >=
- (((u64)NSEC_PER_SEC) << tk->tkr.shift)) {
+ (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
vdata->monotonic_time_snsec -=
- ((u64)NSEC_PER_SEC) << tk->tkr.shift;
+ ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
vdata->monotonic_time_sec++;
}
vdata->wall_time_coarse_sec = tk->xtime_sec;
- vdata->wall_time_coarse_nsec = (long)(tk->tkr.xtime_nsec >>
- tk->tkr.shift);
+ vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >>
+ tk->tkr_mono.shift);
vdata->monotonic_time_coarse_sec =
vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index cdc6cf903078..87a815b85f3e 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -342,7 +342,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
config_enabled(CONFIG_IA32_EMULATION));
if (!buf) {
- drop_init_fpu(tsk);
+ fpu_reset_state(tsk);
return 0;
}
@@ -416,7 +416,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
*/
user_fpu_begin();
if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) {
- drop_init_fpu(tsk);
+ fpu_reset_state(tsk);
return -1;
}
}
@@ -678,19 +678,13 @@ void xsave_init(void)
this_func();
}
-static inline void __init eager_fpu_init_bp(void)
-{
- current->thread.fpu.state =
- alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct));
- if (!init_xstate_buf)
- setup_init_fpu_buf();
-}
-
-void eager_fpu_init(void)
+/*
+ * setup_init_fpu_buf() is __init and it is OK to call it here because
+ * init_xstate_buf will be unset only once during boot.
+ */
+void __init_refok eager_fpu_init(void)
{
- static __refdata void (*boot_func)(void) = eager_fpu_init_bp;
-
- clear_used_math();
+ WARN_ON(used_math());
current_thread_info()->status = 0;
if (eagerfpu == ENABLE)
@@ -701,21 +695,8 @@ void eager_fpu_init(void)
return;
}
- if (boot_func) {
- boot_func();
- boot_func = NULL;
- }
-
- /*
- * This is same as math_state_restore(). But use_xsave() is
- * not yet patched to use math_state_restore().
- */
- init_fpu(current);
- __thread_fpu_begin(current);
- if (cpu_has_xsave)
- xrstor_state(init_xstate_buf, -1);
- else
- fxrstor_checking(&init_xstate_buf->i387);
+ if (!init_xstate_buf)
+ setup_init_fpu_buf();
}
/*
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 08f790dfadc9..16e8f962eaad 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,5 +1,5 @@
-ccflags-y += -Ivirt/kvm -Iarch/x86/kvm
+ccflags-y += -Iarch/x86/kvm
CFLAGS_x86.o := -I.
CFLAGS_svm.o := -I.
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 8a80737ee6e6..59b69f6a2844 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -104,6 +104,9 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
((best->eax & 0xff00) >> 8) != 0)
return -EINVAL;
+ /* Update physical-address width */
+ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+
kvm_pmu_cpuid_update(vcpu);
return 0;
}
@@ -135,6 +138,21 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
}
}
+int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
+ if (!best || best->eax < 0x80000008)
+ goto not_found;
+ best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+ if (best)
+ return best->eax & 0xff;
+not_found:
+ return 36;
+}
+EXPORT_SYMBOL_GPL(cpuid_query_maxphyaddr);
+
/* when an old userspace process fills a new kernel module */
int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
struct kvm_cpuid *cpuid,
@@ -757,21 +775,6 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
}
EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
-int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
- if (!best || best->eax < 0x80000008)
- goto not_found;
- best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
- if (best)
- return best->eax & 0xff;
-not_found:
- return 36;
-}
-EXPORT_SYMBOL_GPL(cpuid_maxphyaddr);
-
/*
* If no match is found, check whether we exceed the vCPU's limit
* and return the content of the highest valid _standard_ leaf instead.
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 4452eedfaedd..c3b1ad9fca81 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -20,13 +20,19 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
struct kvm_cpuid_entry2 __user *entries);
void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
+int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
+
+static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.maxphyaddr;
+}
static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
if (!static_cpu_has(X86_FEATURE_XSAVE))
- return 0;
+ return false;
best = kvm_find_cpuid_entry(vcpu, 1, 0);
return best && (best->ecx & bit(X86_FEATURE_XSAVE));
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 106c01557f2b..630bcb0d7a04 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -248,27 +248,7 @@ struct mode_dual {
struct opcode mode64;
};
-/* EFLAGS bit definitions. */
-#define EFLG_ID (1<<21)
-#define EFLG_VIP (1<<20)
-#define EFLG_VIF (1<<19)
-#define EFLG_AC (1<<18)
-#define EFLG_VM (1<<17)
-#define EFLG_RF (1<<16)
-#define EFLG_IOPL (3<<12)
-#define EFLG_NT (1<<14)
-#define EFLG_OF (1<<11)
-#define EFLG_DF (1<<10)
-#define EFLG_IF (1<<9)
-#define EFLG_TF (1<<8)
-#define EFLG_SF (1<<7)
-#define EFLG_ZF (1<<6)
-#define EFLG_AF (1<<4)
-#define EFLG_PF (1<<2)
-#define EFLG_CF (1<<0)
-
#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
-#define EFLG_RESERVED_ONE_MASK 2
enum x86_transfer_type {
X86_TRANSFER_NONE,
@@ -317,7 +297,8 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
* These EFLAGS bits are restored from saved value during emulation, and
* any changes are written back to the saved value after emulation.
*/
-#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
+#define EFLAGS_MASK (X86_EFLAGS_OF|X86_EFLAGS_SF|X86_EFLAGS_ZF|X86_EFLAGS_AF|\
+ X86_EFLAGS_PF|X86_EFLAGS_CF)
#ifdef CONFIG_X86_64
#define ON64(x) x
@@ -478,6 +459,25 @@ static void assign_masked(ulong *dest, ulong src, ulong mask)
*dest = (*dest & ~mask) | (src & mask);
}
+static void assign_register(unsigned long *reg, u64 val, int bytes)
+{
+ /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
+ switch (bytes) {
+ case 1:
+ *(u8 *)reg = (u8)val;
+ break;
+ case 2:
+ *(u16 *)reg = (u16)val;
+ break;
+ case 4:
+ *reg = (u32)val;
+ break; /* 64b: zero-extend */
+ case 8:
+ *reg = val;
+ break;
+ }
+}
+
static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
{
return (1UL << (ctxt->ad_bytes << 3)) - 1;
@@ -943,6 +943,22 @@ FASTOP2(xadd);
FASTOP2R(cmp, cmp_r);
+static int em_bsf_c(struct x86_emulate_ctxt *ctxt)
+{
+ /* If src is zero, do not writeback, but update flags */
+ if (ctxt->src.val == 0)
+ ctxt->dst.type = OP_NONE;
+ return fastop(ctxt, em_bsf);
+}
+
+static int em_bsr_c(struct x86_emulate_ctxt *ctxt)
+{
+ /* If src is zero, do not writeback, but update flags */
+ if (ctxt->src.val == 0)
+ ctxt->dst.type = OP_NONE;
+ return fastop(ctxt, em_bsr);
+}
+
static u8 test_cc(unsigned int condition, unsigned long flags)
{
u8 rc;
@@ -1399,7 +1415,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
unsigned int in_page, n;
unsigned int count = ctxt->rep_prefix ?
address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1;
- in_page = (ctxt->eflags & EFLG_DF) ?
+ in_page = (ctxt->eflags & X86_EFLAGS_DF) ?
offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) :
PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI));
n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count);
@@ -1412,7 +1428,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
}
if (ctxt->rep_prefix && (ctxt->d & String) &&
- !(ctxt->eflags & EFLG_DF)) {
+ !(ctxt->eflags & X86_EFLAGS_DF)) {
ctxt->dst.data = rc->data + rc->pos;
ctxt->dst.type = OP_MEM_STR;
ctxt->dst.count = (rc->end - rc->pos) / size;
@@ -1691,21 +1707,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
static void write_register_operand(struct operand *op)
{
- /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
- switch (op->bytes) {
- case 1:
- *(u8 *)op->addr.reg = (u8)op->val;
- break;
- case 2:
- *(u16 *)op->addr.reg = (u16)op->val;
- break;
- case 4:
- *op->addr.reg = (u32)op->val;
- break; /* 64b: zero-extend */
- case 8:
- *op->addr.reg = op->val;
- break;
- }
+ return assign_register(op->addr.reg, op->val, op->bytes);
}
static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
@@ -1792,32 +1794,34 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
{
int rc;
unsigned long val, change_mask;
- int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
+ int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
int cpl = ctxt->ops->cpl(ctxt);
rc = emulate_pop(ctxt, &val, len);
if (rc != X86EMUL_CONTINUE)
return rc;
- change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF
- | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_AC | EFLG_ID;
+ change_mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF |
+ X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_NT |
+ X86_EFLAGS_AC | X86_EFLAGS_ID;
switch(ctxt->mode) {
case X86EMUL_MODE_PROT64:
case X86EMUL_MODE_PROT32:
case X86EMUL_MODE_PROT16:
if (cpl == 0)
- change_mask |= EFLG_IOPL;
+ change_mask |= X86_EFLAGS_IOPL;
if (cpl <= iopl)
- change_mask |= EFLG_IF;
+ change_mask |= X86_EFLAGS_IF;
break;
case X86EMUL_MODE_VM86:
if (iopl < 3)
return emulate_gp(ctxt, 0);
- change_mask |= EFLG_IF;
+ change_mask |= X86_EFLAGS_IF;
break;
default: /* real mode */
- change_mask |= (EFLG_IOPL | EFLG_IF);
+ change_mask |= (X86_EFLAGS_IOPL | X86_EFLAGS_IF);
break;
}
@@ -1918,7 +1922,7 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt)
static int em_pushf(struct x86_emulate_ctxt *ctxt)
{
- ctxt->src.val = (unsigned long)ctxt->eflags & ~EFLG_VM;
+ ctxt->src.val = (unsigned long)ctxt->eflags & ~X86_EFLAGS_VM;
return em_push(ctxt);
}
@@ -1926,6 +1930,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
{
int rc = X86EMUL_CONTINUE;
int reg = VCPU_REGS_RDI;
+ u32 val;
while (reg >= VCPU_REGS_RAX) {
if (reg == VCPU_REGS_RSP) {
@@ -1933,9 +1938,10 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
--reg;
}
- rc = emulate_pop(ctxt, reg_rmw(ctxt, reg), ctxt->op_bytes);
+ rc = emulate_pop(ctxt, &val, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
break;
+ assign_register(reg_rmw(ctxt, reg), val, ctxt->op_bytes);
--reg;
}
return rc;
@@ -1956,7 +1962,7 @@ static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
if (rc != X86EMUL_CONTINUE)
return rc;
- ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
+ ctxt->eflags &= ~(X86_EFLAGS_IF | X86_EFLAGS_TF | X86_EFLAGS_AC);
ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
rc = em_push(ctxt);
@@ -2022,10 +2028,14 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
unsigned long temp_eip = 0;
unsigned long temp_eflags = 0;
unsigned long cs = 0;
- unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF |
- EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF |
- EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */
- unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP;
+ unsigned long mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_TF |
+ X86_EFLAGS_IF | X86_EFLAGS_DF | X86_EFLAGS_OF |
+ X86_EFLAGS_IOPL | X86_EFLAGS_NT | X86_EFLAGS_RF |
+ X86_EFLAGS_AC | X86_EFLAGS_ID |
+ X86_EFLAGS_FIXED;
+ unsigned long vm86_mask = X86_EFLAGS_VM | X86_EFLAGS_VIF |
+ X86_EFLAGS_VIP;
/* TODO: Add stack limit check */
@@ -2054,7 +2064,6 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
ctxt->_eip = temp_eip;
-
if (ctxt->op_bytes == 4)
ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
else if (ctxt->op_bytes == 2) {
@@ -2063,7 +2072,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
}
ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
- ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
+ ctxt->eflags |= X86_EFLAGS_FIXED;
ctxt->ops->set_nmi_mask(ctxt, false);
return rc;
@@ -2145,12 +2154,12 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
*reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
*reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32);
- ctxt->eflags &= ~EFLG_ZF;
+ ctxt->eflags &= ~X86_EFLAGS_ZF;
} else {
ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) |
(u32) reg_read(ctxt, VCPU_REGS_RBX);
- ctxt->eflags |= EFLG_ZF;
+ ctxt->eflags |= X86_EFLAGS_ZF;
}
return X86EMUL_CONTINUE;
}
@@ -2222,7 +2231,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
ctxt->src.val = ctxt->dst.orig_val;
fastop(ctxt, em_cmp);
- if (ctxt->eflags & EFLG_ZF) {
+ if (ctxt->eflags & X86_EFLAGS_ZF) {
/* Success: write back to memory; no update of EAX */
ctxt->src.type = OP_NONE;
ctxt->dst.val = ctxt->src.orig_val;
@@ -2381,14 +2390,14 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
ctxt->eflags &= ~msr_data;
- ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
+ ctxt->eflags |= X86_EFLAGS_FIXED;
#endif
} else {
/* legacy mode */
ops->get_msr(ctxt, MSR_STAR, &msr_data);
ctxt->_eip = (u32)msr_data;
- ctxt->eflags &= ~(EFLG_VM | EFLG_IF);
+ ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
}
return X86EMUL_CONTINUE;
@@ -2425,8 +2434,8 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
if ((msr_data & 0xfffc) == 0x0)
return emulate_gp(ctxt, 0);
- ctxt->eflags &= ~(EFLG_VM | EFLG_IF);
- cs_sel = (u16)msr_data & ~SELECTOR_RPL_MASK;
+ ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
+ cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK;
ss_sel = cs_sel + 8;
if (efer & EFER_LMA) {
cs.d = 0;
@@ -2493,8 +2502,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
return emulate_gp(ctxt, 0);
break;
}
- cs_sel |= SELECTOR_RPL_MASK;
- ss_sel |= SELECTOR_RPL_MASK;
+ cs_sel |= SEGMENT_RPL_MASK;
+ ss_sel |= SEGMENT_RPL_MASK;
ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
@@ -2512,7 +2521,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
return false;
if (ctxt->mode == X86EMUL_MODE_VM86)
return true;
- iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
+ iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
return ctxt->ops->cpl(ctxt) > iopl;
}
@@ -2782,10 +2791,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
return ret;
ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl,
X86_TRANSFER_TASK_SWITCH, NULL);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- return X86EMUL_CONTINUE;
+ return ret;
}
static int task_switch_32(struct x86_emulate_ctxt *ctxt,
@@ -2954,7 +2961,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
struct operand *op)
{
- int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count;
+ int df = (ctxt->eflags & X86_EFLAGS_DF) ? -op->count : op->count;
register_address_increment(ctxt, reg, df * op->bytes);
op->addr.mem.ea = register_address(ctxt, reg);
@@ -3323,7 +3330,7 @@ static int em_clts(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static int em_vmcall(struct x86_emulate_ctxt *ctxt)
+static int em_hypercall(struct x86_emulate_ctxt *ctxt)
{
int rc = ctxt->ops->fix_hypercall(ctxt);
@@ -3395,17 +3402,6 @@ static int em_lgdt(struct x86_emulate_ctxt *ctxt)
return em_lgdt_lidt(ctxt, true);
}
-static int em_vmmcall(struct x86_emulate_ctxt *ctxt)
-{
- int rc;
-
- rc = ctxt->ops->fix_hypercall(ctxt);
-
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- return rc;
-}
-
static int em_lidt(struct x86_emulate_ctxt *ctxt)
{
return em_lgdt_lidt(ctxt, false);
@@ -3504,7 +3500,8 @@ static int em_sahf(struct x86_emulate_ctxt *ctxt)
{
u32 flags;
- flags = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF;
+ flags = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
+ X86_EFLAGS_SF;
flags &= *reg_rmw(ctxt, VCPU_REGS_RAX) >> 8;
ctxt->eflags &= ~0xffUL;
@@ -3769,7 +3766,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
static const struct opcode group7_rm0[] = {
N,
- I(SrcNone | Priv | EmulateOnUD, em_vmcall),
+ I(SrcNone | Priv | EmulateOnUD, em_hypercall),
N, N, N, N, N, N,
};
@@ -3781,7 +3778,7 @@ static const struct opcode group7_rm1[] = {
static const struct opcode group7_rm3[] = {
DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa),
- II(SrcNone | Prot | EmulateOnUD, em_vmmcall, vmmcall),
+ II(SrcNone | Prot | EmulateOnUD, em_hypercall, vmmcall),
DIP(SrcNone | Prot | Priv, vmload, check_svme_pa),
DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa),
DIP(SrcNone | Prot | Priv, stgi, check_svme),
@@ -4192,7 +4189,8 @@ static const struct opcode twobyte_table[256] = {
N, N,
G(BitOp, group8),
F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
- F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr),
+ I(DstReg | SrcMem | ModRM, em_bsf_c),
+ I(DstReg | SrcMem | ModRM, em_bsr_c),
D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
/* 0xC0 - 0xC7 */
F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd),
@@ -4759,9 +4757,9 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) ||
(ctxt->b == 0xae) || (ctxt->b == 0xaf))
&& (((ctxt->rep_prefix == REPE_PREFIX) &&
- ((ctxt->eflags & EFLG_ZF) == 0))
+ ((ctxt->eflags & X86_EFLAGS_ZF) == 0))
|| ((ctxt->rep_prefix == REPNE_PREFIX) &&
- ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))))
+ ((ctxt->eflags & X86_EFLAGS_ZF) == X86_EFLAGS_ZF))))
return true;
return false;
@@ -4913,7 +4911,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
/* All REP prefixes have the same first termination condition */
if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
ctxt->eip = ctxt->_eip;
- ctxt->eflags &= ~EFLG_RF;
+ ctxt->eflags &= ~X86_EFLAGS_RF;
goto done;
}
}
@@ -4963,9 +4961,9 @@ special_insn:
}
if (ctxt->rep_prefix && (ctxt->d & String))
- ctxt->eflags |= EFLG_RF;
+ ctxt->eflags |= X86_EFLAGS_RF;
else
- ctxt->eflags &= ~EFLG_RF;
+ ctxt->eflags &= ~X86_EFLAGS_RF;
if (ctxt->execute) {
if (ctxt->d & Fastop) {
@@ -5014,7 +5012,7 @@ special_insn:
rc = emulate_int(ctxt, ctxt->src.val);
break;
case 0xce: /* into */
- if (ctxt->eflags & EFLG_OF)
+ if (ctxt->eflags & X86_EFLAGS_OF)
rc = emulate_int(ctxt, 4);
break;
case 0xe9: /* jmp rel */
@@ -5027,19 +5025,19 @@ special_insn:
break;
case 0xf5: /* cmc */
/* complement carry flag from eflags reg */
- ctxt->eflags ^= EFLG_CF;
+ ctxt->eflags ^= X86_EFLAGS_CF;
break;
case 0xf8: /* clc */
- ctxt->eflags &= ~EFLG_CF;
+ ctxt->eflags &= ~X86_EFLAGS_CF;
break;
case 0xf9: /* stc */
- ctxt->eflags |= EFLG_CF;
+ ctxt->eflags |= X86_EFLAGS_CF;
break;
case 0xfc: /* cld */
- ctxt->eflags &= ~EFLG_DF;
+ ctxt->eflags &= ~X86_EFLAGS_DF;
break;
case 0xfd: /* std */
- ctxt->eflags |= EFLG_DF;
+ ctxt->eflags |= X86_EFLAGS_DF;
break;
default:
goto cannot_emulate;
@@ -5100,7 +5098,7 @@ writeback:
}
goto done; /* skip rip writeback */
}
- ctxt->eflags &= ~EFLG_RF;
+ ctxt->eflags &= ~X86_EFLAGS_RF;
}
ctxt->eip = ctxt->_eip;
@@ -5137,8 +5135,7 @@ twobyte_insn:
case 0x40 ... 0x4f: /* cmov */
if (test_cc(ctxt->b, ctxt->eflags))
ctxt->dst.val = ctxt->src.val;
- else if (ctxt->mode != X86EMUL_MODE_PROT64 ||
- ctxt->op_bytes != 4)
+ else if (ctxt->op_bytes != 4)
ctxt->dst.type = OP_NONE; /* no writeback */
break;
case 0x80 ... 0x8f: /* jnz rel, etc*/
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 298781d4cfb4..4dce6f8b6129 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -443,7 +443,8 @@ static inline int pit_in_range(gpa_t addr)
(addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
}
-static int pit_ioport_write(struct kvm_io_device *this,
+static int pit_ioport_write(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
gpa_t addr, int len, const void *data)
{
struct kvm_pit *pit = dev_to_pit(this);
@@ -519,7 +520,8 @@ static int pit_ioport_write(struct kvm_io_device *this,
return 0;
}
-static int pit_ioport_read(struct kvm_io_device *this,
+static int pit_ioport_read(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
gpa_t addr, int len, void *data)
{
struct kvm_pit *pit = dev_to_pit(this);
@@ -589,7 +591,8 @@ static int pit_ioport_read(struct kvm_io_device *this,
return 0;
}
-static int speaker_ioport_write(struct kvm_io_device *this,
+static int speaker_ioport_write(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
gpa_t addr, int len, const void *data)
{
struct kvm_pit *pit = speaker_to_pit(this);
@@ -606,8 +609,9 @@ static int speaker_ioport_write(struct kvm_io_device *this,
return 0;
}
-static int speaker_ioport_read(struct kvm_io_device *this,
- gpa_t addr, int len, void *data)
+static int speaker_ioport_read(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
+ gpa_t addr, int len, void *data)
{
struct kvm_pit *pit = speaker_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index dd1b16b611b0..c84990b42b5b 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -3,7 +3,7 @@
#include <linux/kthread.h>
-#include "iodev.h"
+#include <kvm/iodev.h>
struct kvm_kpit_channel_state {
u32 count; /* can be 65536 */
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 9541ba34126b..fef922ff2635 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -529,42 +529,42 @@ static int picdev_read(struct kvm_pic *s,
return 0;
}
-static int picdev_master_write(struct kvm_io_device *dev,
+static int picdev_master_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
gpa_t addr, int len, const void *val)
{
return picdev_write(container_of(dev, struct kvm_pic, dev_master),
addr, len, val);
}
-static int picdev_master_read(struct kvm_io_device *dev,
+static int picdev_master_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
gpa_t addr, int len, void *val)
{
return picdev_read(container_of(dev, struct kvm_pic, dev_master),
addr, len, val);
}
-static int picdev_slave_write(struct kvm_io_device *dev,
+static int picdev_slave_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
gpa_t addr, int len, const void *val)
{
return picdev_write(container_of(dev, struct kvm_pic, dev_slave),
addr, len, val);
}
-static int picdev_slave_read(struct kvm_io_device *dev,
+static int picdev_slave_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
gpa_t addr, int len, void *val)
{
return picdev_read(container_of(dev, struct kvm_pic, dev_slave),
addr, len, val);
}
-static int picdev_eclr_write(struct kvm_io_device *dev,
+static int picdev_eclr_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
gpa_t addr, int len, const void *val)
{
return picdev_write(container_of(dev, struct kvm_pic, dev_eclr),
addr, len, val);
}
-static int picdev_eclr_read(struct kvm_io_device *dev,
+static int picdev_eclr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
gpa_t addr, int len, void *val)
{
return picdev_read(container_of(dev, struct kvm_pic, dev_eclr),
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 46d4449772bc..28146f03c514 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -206,6 +206,8 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
old_irr = ioapic->irr;
ioapic->irr |= mask;
+ if (edge)
+ ioapic->irr_delivered &= ~mask;
if ((edge && old_irr == ioapic->irr) ||
(!edge && entry.fields.remote_irr)) {
ret = 0;
@@ -349,7 +351,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
irqe.shorthand = 0;
if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
- ioapic->irr &= ~(1 << irq);
+ ioapic->irr_delivered |= 1 << irq;
if (irq == RTC_GSI && line_status) {
/*
@@ -473,13 +475,6 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
}
}
-bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
-{
- struct kvm_ioapic *ioapic = kvm->arch.vioapic;
- smp_rmb();
- return test_bit(vector, ioapic->handled_vectors);
-}
-
void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode)
{
struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
@@ -500,8 +495,8 @@ static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr)
(addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
}
-static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
- void *val)
+static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+ gpa_t addr, int len, void *val)
{
struct kvm_ioapic *ioapic = to_ioapic(this);
u32 result;
@@ -543,8 +538,8 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
return 0;
}
-static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
- const void *val)
+static int ioapic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+ gpa_t addr, int len, const void *val)
{
struct kvm_ioapic *ioapic = to_ioapic(this);
u32 data;
@@ -599,6 +594,7 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
ioapic->ioregsel = 0;
ioapic->irr = 0;
+ ioapic->irr_delivered = 0;
ioapic->id = 0;
memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
rtc_irq_eoi_tracking_reset(ioapic);
@@ -656,6 +652,7 @@ int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
spin_lock(&ioapic->lock);
memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
+ state->irr &= ~ioapic->irr_delivered;
spin_unlock(&ioapic->lock);
return 0;
}
@@ -669,6 +666,7 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
spin_lock(&ioapic->lock);
memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
ioapic->irr = 0;
+ ioapic->irr_delivered = 0;
update_handled_vectors(ioapic);
kvm_vcpu_request_scan_ioapic(kvm);
kvm_ioapic_inject_all(ioapic, state->irr);
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index c2e36d934af4..ca0b0b4e6256 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -3,7 +3,7 @@
#include <linux/kvm_host.h>
-#include "iodev.h"
+#include <kvm/iodev.h>
struct kvm;
struct kvm_vcpu;
@@ -77,6 +77,7 @@ struct kvm_ioapic {
struct rtc_status rtc_status;
struct delayed_work eoi_inject;
u32 irq_eoi[IOAPIC_NUM_PINS];
+ u32 irr_delivered;
};
#ifdef DEBUG
@@ -97,13 +98,19 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
return kvm->arch.vioapic;
}
+static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ smp_rmb();
+ return test_bit(vector, ioapic->handled_vectors);
+}
+
void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int short_hand, unsigned int dest, int dest_mode);
int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
int trigger_mode);
-bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector);
int kvm_ioapic_init(struct kvm *kvm);
void kvm_ioapic_destroy(struct kvm *kvm);
int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 2d03568e9498..ad68c73008c5 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -27,7 +27,7 @@
#include <linux/kvm_host.h>
#include <linux/spinlock.h>
-#include "iodev.h"
+#include <kvm/iodev.h>
#include "ioapic.h"
#include "lapic.h"
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4ee827d7bf36..d67206a7b99a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -133,6 +133,28 @@ static inline int kvm_apic_id(struct kvm_lapic *apic)
return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
}
+/* The logical map is definitely wrong if we have multiple
+ * modes at the same time. (Physical map is always right.)
+ */
+static inline bool kvm_apic_logical_map_valid(struct kvm_apic_map *map)
+{
+ return !(map->mode & (map->mode - 1));
+}
+
+static inline void
+apic_logical_id(struct kvm_apic_map *map, u32 dest_id, u16 *cid, u16 *lid)
+{
+ unsigned lid_bits;
+
+ BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_CLUSTER != 4);
+ BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_FLAT != 8);
+ BUILD_BUG_ON(KVM_APIC_MODE_X2APIC != 16);
+ lid_bits = map->mode;
+
+ *cid = dest_id >> lid_bits;
+ *lid = dest_id & ((1 << lid_bits) - 1);
+}
+
static void recalculate_apic_map(struct kvm *kvm)
{
struct kvm_apic_map *new, *old = NULL;
@@ -146,48 +168,6 @@ static void recalculate_apic_map(struct kvm *kvm)
if (!new)
goto out;
- new->ldr_bits = 8;
- /* flat mode is default */
- new->cid_shift = 8;
- new->cid_mask = 0;
- new->lid_mask = 0xff;
- new->broadcast = APIC_BROADCAST;
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- if (!kvm_apic_present(vcpu))
- continue;
-
- if (apic_x2apic_mode(apic)) {
- new->ldr_bits = 32;
- new->cid_shift = 16;
- new->cid_mask = new->lid_mask = 0xffff;
- new->broadcast = X2APIC_BROADCAST;
- } else if (kvm_apic_get_reg(apic, APIC_LDR)) {
- if (kvm_apic_get_reg(apic, APIC_DFR) ==
- APIC_DFR_CLUSTER) {
- new->cid_shift = 4;
- new->cid_mask = 0xf;
- new->lid_mask = 0xf;
- } else {
- new->cid_shift = 8;
- new->cid_mask = 0;
- new->lid_mask = 0xff;
- }
- }
-
- /*
- * All APICs have to be configured in the same mode by an OS.
- * We take advatage of this while building logical id loockup
- * table. After reset APICs are in software disabled mode, so if
- * we find apic with different setting we assume this is the mode
- * OS wants all apics to be in; build lookup table accordingly.
- */
- if (kvm_apic_sw_enabled(apic))
- break;
- }
-
kvm_for_each_vcpu(i, vcpu, kvm) {
struct kvm_lapic *apic = vcpu->arch.apic;
u16 cid, lid;
@@ -198,11 +178,25 @@ static void recalculate_apic_map(struct kvm *kvm)
aid = kvm_apic_id(apic);
ldr = kvm_apic_get_reg(apic, APIC_LDR);
- cid = apic_cluster_id(new, ldr);
- lid = apic_logical_id(new, ldr);
if (aid < ARRAY_SIZE(new->phys_map))
new->phys_map[aid] = apic;
+
+ if (apic_x2apic_mode(apic)) {
+ new->mode |= KVM_APIC_MODE_X2APIC;
+ } else if (ldr) {
+ ldr = GET_APIC_LOGICAL_ID(ldr);
+ if (kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT)
+ new->mode |= KVM_APIC_MODE_XAPIC_FLAT;
+ else
+ new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER;
+ }
+
+ if (!kvm_apic_logical_map_valid(new))
+ continue;
+
+ apic_logical_id(new, ldr, &cid, &lid);
+
if (lid && cid < ARRAY_SIZE(new->logical_map))
new->logical_map[cid][ffs(lid) - 1] = apic;
}
@@ -588,15 +582,23 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
apic_update_ppr(apic);
}
-static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest)
+static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda)
{
- return dest == (apic_x2apic_mode(apic) ?
- X2APIC_BROADCAST : APIC_BROADCAST);
+ if (apic_x2apic_mode(apic))
+ return mda == X2APIC_BROADCAST;
+
+ return GET_APIC_DEST_FIELD(mda) == APIC_BROADCAST;
}
-static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest)
+static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
{
- return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest);
+ if (kvm_apic_broadcast(apic, mda))
+ return true;
+
+ if (apic_x2apic_mode(apic))
+ return mda == kvm_apic_id(apic);
+
+ return mda == SET_APIC_DEST_FIELD(kvm_apic_id(apic));
}
static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
@@ -613,6 +615,7 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
&& (logical_id & mda & 0xffff) != 0;
logical_id = GET_APIC_LOGICAL_ID(logical_id);
+ mda = GET_APIC_DEST_FIELD(mda);
switch (kvm_apic_get_reg(apic, APIC_DFR)) {
case APIC_DFR_FLAT:
@@ -627,10 +630,27 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
}
}
+/* KVM APIC implementation has two quirks
+ * - dest always begins at 0 while xAPIC MDA has offset 24,
+ * - IOxAPIC messages have to be delivered (directly) to x2APIC.
+ */
+static u32 kvm_apic_mda(unsigned int dest_id, struct kvm_lapic *source,
+ struct kvm_lapic *target)
+{
+ bool ipi = source != NULL;
+ bool x2apic_mda = apic_x2apic_mode(ipi ? source : target);
+
+ if (!ipi && dest_id == APIC_BROADCAST && x2apic_mda)
+ return X2APIC_BROADCAST;
+
+ return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id);
+}
+
bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int short_hand, unsigned int dest, int dest_mode)
{
struct kvm_lapic *target = vcpu->arch.apic;
+ u32 mda = kvm_apic_mda(dest, source, target);
apic_debug("target %p, source %p, dest 0x%x, "
"dest_mode 0x%x, short_hand 0x%x\n",
@@ -640,9 +660,9 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
switch (short_hand) {
case APIC_DEST_NOSHORT:
if (dest_mode == APIC_DEST_PHYSICAL)
- return kvm_apic_match_physical_addr(target, dest);
+ return kvm_apic_match_physical_addr(target, mda);
else
- return kvm_apic_match_logical_addr(target, dest);
+ return kvm_apic_match_logical_addr(target, mda);
case APIC_DEST_SELF:
return target == source;
case APIC_DEST_ALLINC:
@@ -664,6 +684,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic **dst;
int i;
bool ret = false;
+ bool x2apic_ipi = src && apic_x2apic_mode(src);
*r = -1;
@@ -675,15 +696,15 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
if (irq->shorthand)
return false;
+ if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST))
+ return false;
+
rcu_read_lock();
map = rcu_dereference(kvm->arch.apic_map);
if (!map)
goto out;
- if (irq->dest_id == map->broadcast)
- goto out;
-
ret = true;
if (irq->dest_mode == APIC_DEST_PHYSICAL) {
@@ -692,16 +713,20 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
dst = &map->phys_map[irq->dest_id];
} else {
- u32 mda = irq->dest_id << (32 - map->ldr_bits);
- u16 cid = apic_cluster_id(map, mda);
+ u16 cid;
+
+ if (!kvm_apic_logical_map_valid(map)) {
+ ret = false;
+ goto out;
+ }
+
+ apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
if (cid >= ARRAY_SIZE(map->logical_map))
goto out;
dst = map->logical_map[cid];
- bitmap = apic_logical_id(map, mda);
-
if (irq->delivery_mode == APIC_DM_LOWEST) {
int l = -1;
for_each_set_bit(i, &bitmap, 16) {
@@ -1037,7 +1062,7 @@ static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
addr < apic->base_address + LAPIC_MMIO_LENGTH;
}
-static int apic_mmio_read(struct kvm_io_device *this,
+static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
gpa_t address, int len, void *data)
{
struct kvm_lapic *apic = to_lapic(this);
@@ -1357,7 +1382,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
return ret;
}
-static int apic_mmio_write(struct kvm_io_device *this,
+static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
gpa_t address, int len, const void *data)
{
struct kvm_lapic *apic = to_lapic(this);
@@ -1497,8 +1522,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
return;
}
- if (!kvm_vcpu_is_bsp(apic->vcpu))
- value &= ~MSR_IA32_APICBASE_BSP;
vcpu->arch.apic_base = value;
/* update jump label if enable bit changes */
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 0bc6c656625b..9d28383fc1e7 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -1,7 +1,7 @@
#ifndef __KVM_X86_LAPIC_H
#define __KVM_X86_LAPIC_H
-#include "iodev.h"
+#include <kvm/iodev.h>
#include <linux/kvm_host.h>
@@ -148,21 +148,6 @@ static inline bool kvm_apic_vid_enabled(struct kvm *kvm)
return kvm_x86_ops->vm_has_apicv(kvm);
}
-static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
-{
- u16 cid;
- ldr >>= 32 - map->ldr_bits;
- cid = (ldr >> map->cid_shift) & map->cid_mask;
-
- return cid;
-}
-
-static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
-{
- ldr >>= (32 - map->ldr_bits);
- return ldr & map->lid_mask;
-}
-
static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
{
return vcpu->arch.apic->pending_events;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cee759299a35..146f295ee322 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4465,6 +4465,79 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
kvm_flush_remote_tlbs(kvm);
}
+static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
+ unsigned long *rmapp)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ int need_tlb_flush = 0;
+ pfn_t pfn;
+ struct kvm_mmu_page *sp;
+
+ for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
+ BUG_ON(!(*sptep & PT_PRESENT_MASK));
+
+ sp = page_header(__pa(sptep));
+ pfn = spte_to_pfn(*sptep);
+
+ /*
+ * Only EPT supported for now; otherwise, one would need to
+ * find out efficiently whether the guest page tables are
+ * also using huge pages.
+ */
+ if (sp->role.direct &&
+ !kvm_is_reserved_pfn(pfn) &&
+ PageTransCompound(pfn_to_page(pfn))) {
+ drop_spte(kvm, sptep);
+ sptep = rmap_get_first(*rmapp, &iter);
+ need_tlb_flush = 1;
+ } else
+ sptep = rmap_get_next(&iter);
+ }
+
+ return need_tlb_flush;
+}
+
+void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
+{
+ bool flush = false;
+ unsigned long *rmapp;
+ unsigned long last_index, index;
+ gfn_t gfn_start, gfn_end;
+
+ spin_lock(&kvm->mmu_lock);
+
+ gfn_start = memslot->base_gfn;
+ gfn_end = memslot->base_gfn + memslot->npages - 1;
+
+ if (gfn_start >= gfn_end)
+ goto out;
+
+ rmapp = memslot->arch.rmap[0];
+ last_index = gfn_to_index(gfn_end, memslot->base_gfn,
+ PT_PAGE_TABLE_LEVEL);
+
+ for (index = 0; index <= last_index; ++index, ++rmapp) {
+ if (*rmapp)
+ flush |= kvm_mmu_zap_collapsible_spte(kvm, rmapp);
+
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ if (flush) {
+ kvm_flush_remote_tlbs(kvm);
+ flush = false;
+ }
+ cond_resched_lock(&kvm->mmu_lock);
+ }
+ }
+
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+
+out:
+ spin_unlock(&kvm->mmu_lock);
+}
+
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 8e6b7d869d2f..29fbf9dfdc54 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -38,7 +38,7 @@ static struct kvm_arch_event_perf_mapping {
};
/* mapping between fixed pmc index and arch_events array */
-int fixed_pmc_events[] = {1, 0, 7};
+static int fixed_pmc_events[] = {1, 0, 7};
static bool pmc_is_gp(struct kvm_pmc *pmc)
{
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index cc618c882f90..ce741b8650f6 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1261,7 +1261,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
MSR_IA32_APICBASE_ENABLE;
- if (kvm_vcpu_is_bsp(&svm->vcpu))
+ if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
svm_init_osvw(&svm->vcpu);
@@ -1929,14 +1929,12 @@ static int nop_on_interception(struct vcpu_svm *svm)
static int halt_interception(struct vcpu_svm *svm)
{
svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
- skip_emulated_instruction(&svm->vcpu);
return kvm_emulate_halt(&svm->vcpu);
}
static int vmmcall_interception(struct vcpu_svm *svm)
{
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
kvm_emulate_hypercall(&svm->vcpu);
return 1;
}
@@ -2757,11 +2755,11 @@ static int invlpga_interception(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
- trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX],
- vcpu->arch.regs[VCPU_REGS_RAX]);
+ trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX),
+ kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
/* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
- kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
+ kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
@@ -2770,12 +2768,18 @@ static int invlpga_interception(struct vcpu_svm *svm)
static int skinit_interception(struct vcpu_svm *svm)
{
- trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]);
+ trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
}
+static int wbinvd_interception(struct vcpu_svm *svm)
+{
+ kvm_emulate_wbinvd(&svm->vcpu);
+ return 1;
+}
+
static int xsetbv_interception(struct vcpu_svm *svm)
{
u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
@@ -2902,7 +2906,8 @@ static int rdpmc_interception(struct vcpu_svm *svm)
return 1;
}
-bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val)
+static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
+ unsigned long val)
{
unsigned long cr0 = svm->vcpu.arch.cr0;
bool ret = false;
@@ -2940,7 +2945,10 @@ static int cr_interception(struct vcpu_svm *svm)
return emulate_on_interception(svm);
reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
- cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
+ if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
+ cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
+ else
+ cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
err = 0;
if (cr >= 16) { /* mov to cr */
@@ -3133,7 +3141,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
static int rdmsr_interception(struct vcpu_svm *svm)
{
- u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
u64 data;
if (svm_get_msr(&svm->vcpu, ecx, &data)) {
@@ -3142,8 +3150,8 @@ static int rdmsr_interception(struct vcpu_svm *svm)
} else {
trace_kvm_msr_read(ecx, data);
- svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
- svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, data & 0xffffffff);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, data >> 32);
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
skip_emulated_instruction(&svm->vcpu);
}
@@ -3246,9 +3254,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
static int wrmsr_interception(struct vcpu_svm *svm)
{
struct msr_data msr;
- u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
- | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
+ u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
+ u64 data = kvm_read_edx_eax(&svm->vcpu);
msr.data = data;
msr.index = ecx;
@@ -3325,7 +3332,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_READ_CR3] = cr_interception,
[SVM_EXIT_READ_CR4] = cr_interception,
[SVM_EXIT_READ_CR8] = cr_interception,
- [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
+ [SVM_EXIT_CR0_SEL_WRITE] = cr_interception,
[SVM_EXIT_WRITE_CR0] = cr_interception,
[SVM_EXIT_WRITE_CR3] = cr_interception,
[SVM_EXIT_WRITE_CR4] = cr_interception,
@@ -3376,7 +3383,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_STGI] = stgi_interception,
[SVM_EXIT_CLGI] = clgi_interception,
[SVM_EXIT_SKINIT] = skinit_interception,
- [SVM_EXIT_WBINVD] = emulate_on_interception,
+ [SVM_EXIT_WBINVD] = wbinvd_interception,
[SVM_EXIT_MONITOR] = monitor_interception,
[SVM_EXIT_MWAIT] = mwait_interception,
[SVM_EXIT_XSETBV] = xsetbv_interception,
@@ -3555,7 +3562,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
|| !svm_exit_handlers[exit_code]) {
- WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_code);
+ WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code);
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ae4f6d35d19c..f5e8dce8046c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2470,6 +2470,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
vmx->nested.nested_vmx_secondary_ctls_low = 0;
vmx->nested.nested_vmx_secondary_ctls_high &=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_RDTSCP |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
@@ -3268,8 +3269,8 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
* default value.
*/
if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
- save->selector &= ~SELECTOR_RPL_MASK;
- save->dpl = save->selector & SELECTOR_RPL_MASK;
+ save->selector &= ~SEGMENT_RPL_MASK;
+ save->dpl = save->selector & SEGMENT_RPL_MASK;
save->s = 1;
}
vmx_set_segment(vcpu, save, seg);
@@ -3842,7 +3843,7 @@ static bool code_segment_valid(struct kvm_vcpu *vcpu)
unsigned int cs_rpl;
vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
- cs_rpl = cs.selector & SELECTOR_RPL_MASK;
+ cs_rpl = cs.selector & SEGMENT_RPL_MASK;
if (cs.unusable)
return false;
@@ -3870,7 +3871,7 @@ static bool stack_segment_valid(struct kvm_vcpu *vcpu)
unsigned int ss_rpl;
vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
- ss_rpl = ss.selector & SELECTOR_RPL_MASK;
+ ss_rpl = ss.selector & SEGMENT_RPL_MASK;
if (ss.unusable)
return true;
@@ -3892,7 +3893,7 @@ static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
unsigned int rpl;
vmx_get_segment(vcpu, &var, seg);
- rpl = var.selector & SELECTOR_RPL_MASK;
+ rpl = var.selector & SEGMENT_RPL_MASK;
if (var.unusable)
return true;
@@ -3919,7 +3920,7 @@ static bool tr_valid(struct kvm_vcpu *vcpu)
if (tr.unusable)
return false;
- if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */
+ if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */
return false;
if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
return false;
@@ -3937,7 +3938,7 @@ static bool ldtr_valid(struct kvm_vcpu *vcpu)
if (ldtr.unusable)
return true;
- if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */
+ if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */
return false;
if (ldtr.type != 2)
return false;
@@ -3954,8 +3955,8 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
- return ((cs.selector & SELECTOR_RPL_MASK) ==
- (ss.selector & SELECTOR_RPL_MASK));
+ return ((cs.selector & SEGMENT_RPL_MASK) ==
+ (ss.selector & SEGMENT_RPL_MASK));
}
/*
@@ -4711,7 +4712,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
kvm_set_cr8(&vmx->vcpu, 0);
apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
- if (kvm_vcpu_is_bsp(&vmx->vcpu))
+ if (kvm_vcpu_is_reset_bsp(&vmx->vcpu))
apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
apic_base_msr.host_initiated = true;
kvm_set_apic_base(&vmx->vcpu, &apic_base_msr);
@@ -5006,7 +5007,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
if (emulate_instruction(vcpu, 0) == EMULATE_DONE) {
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0;
- return kvm_emulate_halt(vcpu);
+ return kvm_vcpu_halt(vcpu);
}
return 1;
}
@@ -5071,6 +5072,10 @@ static int handle_exception(struct kvm_vcpu *vcpu)
}
if (is_invalid_opcode(intr_info)) {
+ if (is_guest_mode(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
if (er != EMULATE_DONE)
kvm_queue_exception(vcpu, UD_VECTOR);
@@ -5090,9 +5095,10 @@ static int handle_exception(struct kvm_vcpu *vcpu)
!(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
- vcpu->run->internal.ndata = 2;
+ vcpu->run->internal.ndata = 3;
vcpu->run->internal.data[0] = vect_info;
vcpu->run->internal.data[1] = intr_info;
+ vcpu->run->internal.data[2] = error_code;
return 0;
}
@@ -5533,13 +5539,11 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
static int handle_halt(struct kvm_vcpu *vcpu)
{
- skip_emulated_instruction(vcpu);
return kvm_emulate_halt(vcpu);
}
static int handle_vmcall(struct kvm_vcpu *vcpu)
{
- skip_emulated_instruction(vcpu);
kvm_emulate_hypercall(vcpu);
return 1;
}
@@ -5570,7 +5574,6 @@ static int handle_rdpmc(struct kvm_vcpu *vcpu)
static int handle_wbinvd(struct kvm_vcpu *vcpu)
{
- skip_emulated_instruction(vcpu);
kvm_emulate_wbinvd(vcpu);
return 1;
}
@@ -5828,7 +5831,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
gpa_t gpa;
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
- if (!kvm_io_bus_write(vcpu->kvm, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
+ if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
skip_emulated_instruction(vcpu);
return 1;
}
@@ -5909,7 +5912,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0;
- ret = kvm_emulate_halt(vcpu);
+ ret = kvm_vcpu_halt(vcpu);
goto out;
}
@@ -7318,21 +7321,21 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
else if (port < 0x10000)
bitmap = vmcs12->io_bitmap_b;
else
- return 1;
+ return true;
bitmap += (port & 0x7fff) / 8;
if (last_bitmap != bitmap)
if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1))
- return 1;
+ return true;
if (b & (1 << (port & 7)))
- return 1;
+ return true;
port++;
size--;
last_bitmap = bitmap;
}
- return 0;
+ return false;
}
/*
@@ -7348,7 +7351,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
gpa_t bitmap;
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
- return 1;
+ return true;
/*
* The MSR_BITMAP page is divided into four 1024-byte bitmaps,
@@ -7367,10 +7370,10 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
if (msr_index < 1024*8) {
unsigned char b;
if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1))
- return 1;
+ return true;
return 1 & (b >> (msr_index & 7));
} else
- return 1; /* let L1 handle the wrong parameter */
+ return true; /* let L1 handle the wrong parameter */
}
/*
@@ -7392,7 +7395,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
case 0:
if (vmcs12->cr0_guest_host_mask &
(val ^ vmcs12->cr0_read_shadow))
- return 1;
+ return true;
break;
case 3:
if ((vmcs12->cr3_target_count >= 1 &&
@@ -7403,37 +7406,37 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
vmcs12->cr3_target_value2 == val) ||
(vmcs12->cr3_target_count >= 4 &&
vmcs12->cr3_target_value3 == val))
- return 0;
+ return false;
if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
- return 1;
+ return true;
break;
case 4:
if (vmcs12->cr4_guest_host_mask &
(vmcs12->cr4_read_shadow ^ val))
- return 1;
+ return true;
break;
case 8:
if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
- return 1;
+ return true;
break;
}
break;
case 2: /* clts */
if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
(vmcs12->cr0_read_shadow & X86_CR0_TS))
- return 1;
+ return true;
break;
case 1: /* mov from cr */
switch (cr) {
case 3:
if (vmcs12->cpu_based_vm_exec_control &
CPU_BASED_CR3_STORE_EXITING)
- return 1;
+ return true;
break;
case 8:
if (vmcs12->cpu_based_vm_exec_control &
CPU_BASED_CR8_STORE_EXITING)
- return 1;
+ return true;
break;
}
break;
@@ -7444,14 +7447,14 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
*/
if (vmcs12->cr0_guest_host_mask & 0xe &
(val ^ vmcs12->cr0_read_shadow))
- return 1;
+ return true;
if ((vmcs12->cr0_guest_host_mask & 0x1) &&
!(vmcs12->cr0_read_shadow & 0x1) &&
(val & 0x1))
- return 1;
+ return true;
break;
}
- return 0;
+ return false;
}
/*
@@ -7474,48 +7477,48 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
KVM_ISA_VMX);
if (vmx->nested.nested_run_pending)
- return 0;
+ return false;
if (unlikely(vmx->fail)) {
pr_info_ratelimited("%s failed vm entry %x\n", __func__,
vmcs_read32(VM_INSTRUCTION_ERROR));
- return 1;
+ return true;
}
switch (exit_reason) {
case EXIT_REASON_EXCEPTION_NMI:
if (!is_exception(intr_info))
- return 0;
+ return false;
else if (is_page_fault(intr_info))
return enable_ept;
else if (is_no_device(intr_info) &&
!(vmcs12->guest_cr0 & X86_CR0_TS))
- return 0;
+ return false;
return vmcs12->exception_bitmap &
(1u << (intr_info & INTR_INFO_VECTOR_MASK));
case EXIT_REASON_EXTERNAL_INTERRUPT:
- return 0;
+ return false;
case EXIT_REASON_TRIPLE_FAULT:
- return 1;
+ return true;
case EXIT_REASON_PENDING_INTERRUPT:
return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
case EXIT_REASON_NMI_WINDOW:
return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
case EXIT_REASON_TASK_SWITCH:
- return 1;
+ return true;
case EXIT_REASON_CPUID:
if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa)
- return 0;
- return 1;
+ return false;
+ return true;
case EXIT_REASON_HLT:
return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
case EXIT_REASON_INVD:
- return 1;
+ return true;
case EXIT_REASON_INVLPG:
return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
case EXIT_REASON_RDPMC:
return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
- case EXIT_REASON_RDTSC:
+ case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
@@ -7527,7 +7530,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
* VMX instructions trap unconditionally. This allows L1 to
* emulate them for its L2 guest, i.e., allows 3-level nesting!
*/
- return 1;
+ return true;
case EXIT_REASON_CR_ACCESS:
return nested_vmx_exit_handled_cr(vcpu, vmcs12);
case EXIT_REASON_DR_ACCESS:
@@ -7538,7 +7541,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
case EXIT_REASON_MSR_WRITE:
return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
case EXIT_REASON_INVALID_STATE:
- return 1;
+ return true;
case EXIT_REASON_MWAIT_INSTRUCTION:
return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
case EXIT_REASON_MONITOR_INSTRUCTION:
@@ -7548,7 +7551,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
nested_cpu_has2(vmcs12,
SECONDARY_EXEC_PAUSE_LOOP_EXITING);
case EXIT_REASON_MCE_DURING_VMENTRY:
- return 0;
+ return false;
case EXIT_REASON_TPR_BELOW_THRESHOLD:
return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
case EXIT_REASON_APIC_ACCESS:
@@ -7557,7 +7560,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
case EXIT_REASON_APIC_WRITE:
case EXIT_REASON_EOI_INDUCED:
/* apic_write and eoi_induced should exit unconditionally. */
- return 1;
+ return true;
case EXIT_REASON_EPT_VIOLATION:
/*
* L0 always deals with the EPT violation. If nested EPT is
@@ -7565,7 +7568,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
* missing in the guest EPT table (EPT12), the EPT violation
* will be injected with nested_ept_inject_page_fault()
*/
- return 0;
+ return false;
case EXIT_REASON_EPT_MISCONFIG:
/*
* L2 never uses directly L1's EPT, but rather L0's own EPT
@@ -7573,11 +7576,11 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
* (EPT on EPT). So any problems with the structure of the
* table is L0's fault.
*/
- return 0;
+ return false;
case EXIT_REASON_WBINVD:
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
case EXIT_REASON_XSETBV:
- return 1;
+ return true;
case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
/*
* This should never happen, since it is not possible to
@@ -7587,7 +7590,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
*/
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
default:
- return 1;
+ return true;
}
}
@@ -8522,6 +8525,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
exec_control);
}
}
+ if (nested && !vmx->rdtscp_enabled)
+ vmx->nested.nested_vmx_secondary_ctls_high &=
+ ~SECONDARY_EXEC_RDTSCP;
}
/* Exposing INVPCID only when PCID is exposed */
@@ -8622,10 +8628,11 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int maxphyaddr = cpuid_maxphyaddr(vcpu);
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
- /* TODO: Also verify bits beyond physical address width are 0 */
- if (!PAGE_ALIGNED(vmcs12->apic_access_addr))
+ if (!PAGE_ALIGNED(vmcs12->apic_access_addr) ||
+ vmcs12->apic_access_addr >> maxphyaddr)
return false;
/*
@@ -8641,8 +8648,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
}
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
- /* TODO: Also verify bits beyond physical address width are 0 */
- if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr))
+ if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) ||
+ vmcs12->virtual_apic_page_addr >> maxphyaddr)
return false;
if (vmx->nested.virtual_apic_page) /* shouldn't happen */
@@ -8665,7 +8672,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
}
if (nested_cpu_has_posted_intr(vmcs12)) {
- if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64))
+ if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) ||
+ vmcs12->posted_intr_desc_addr >> maxphyaddr)
return false;
if (vmx->nested.pi_desc_page) { /* shouldn't happen */
@@ -8864,9 +8872,9 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
unsigned long count_field,
- unsigned long addr_field,
- int maxphyaddr)
+ unsigned long addr_field)
{
+ int maxphyaddr;
u64 count, addr;
if (vmcs12_read_any(vcpu, count_field, &count) ||
@@ -8876,6 +8884,7 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
}
if (count == 0)
return 0;
+ maxphyaddr = cpuid_maxphyaddr(vcpu);
if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
(addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
pr_warn_ratelimited(
@@ -8889,19 +8898,16 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
- int maxphyaddr;
-
if (vmcs12->vm_exit_msr_load_count == 0 &&
vmcs12->vm_exit_msr_store_count == 0 &&
vmcs12->vm_entry_msr_load_count == 0)
return 0; /* Fast path */
- maxphyaddr = cpuid_maxphyaddr(vcpu);
if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
- VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) ||
+ VM_EXIT_MSR_LOAD_ADDR) ||
nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
- VM_EXIT_MSR_STORE_ADDR, maxphyaddr) ||
+ VM_EXIT_MSR_STORE_ADDR) ||
nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
- VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr))
+ VM_ENTRY_MSR_LOAD_ADDR))
return -EINVAL;
return 0;
}
@@ -9151,8 +9157,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
exec_control &= ~SECONDARY_EXEC_RDTSCP;
/* Take the following fields only from vmcs12 */
exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_RDTSCP |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
- SECONDARY_EXEC_APIC_REGISTER_VIRT);
+ SECONDARY_EXEC_APIC_REGISTER_VIRT);
if (nested_cpu_has(vmcs12,
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
exec_control |= vmcs12->secondary_vm_exec_control;
@@ -9385,7 +9392,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
}
if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
- /*TODO: Also verify bits beyond physical address width are 0*/
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
}
@@ -9524,7 +9530,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
vmcs12->launch_state = 1;
if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
- return kvm_emulate_halt(vcpu);
+ return kvm_vcpu_halt(vcpu);
vmx->nested.nested_run_pending = 1;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 32bf19ef3115..e1a81267f3f6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -801,6 +801,17 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);
+static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
+{
+ int i;
+
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
+ for (i = 0; i < KVM_NR_DB_REGS; i++)
+ vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
+ }
+}
+
static void kvm_update_dr6(struct kvm_vcpu *vcpu)
{
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
@@ -1070,19 +1081,19 @@ static void update_pvclock_gtod(struct timekeeper *tk)
struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
u64 boot_ns;
- boot_ns = ktime_to_ns(ktime_add(tk->tkr.base_mono, tk->offs_boot));
+ boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
write_seqcount_begin(&vdata->seq);
/* copy pvclock gtod data */
- vdata->clock.vclock_mode = tk->tkr.clock->archdata.vclock_mode;
- vdata->clock.cycle_last = tk->tkr.cycle_last;
- vdata->clock.mask = tk->tkr.mask;
- vdata->clock.mult = tk->tkr.mult;
- vdata->clock.shift = tk->tkr.shift;
+ vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
+ vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
+ vdata->clock.mask = tk->tkr_mono.mask;
+ vdata->clock.mult = tk->tkr_mono.mult;
+ vdata->clock.shift = tk->tkr_mono.shift;
vdata->boot_ns = boot_ns;
- vdata->nsec_base = tk->tkr.xtime_nsec;
+ vdata->nsec_base = tk->tkr_mono.xtime_nsec;
write_seqcount_end(&vdata->seq);
}
@@ -3149,6 +3160,7 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
return -EINVAL;
memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
+ kvm_update_dr0123(vcpu);
vcpu->arch.dr6 = dbgregs->dr6;
kvm_update_dr6(vcpu);
vcpu->arch.dr7 = dbgregs->dr7;
@@ -4114,8 +4126,8 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
do {
n = min(len, 8);
if (!(vcpu->arch.apic &&
- !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v))
- && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
+ !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
+ && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
break;
handled += n;
addr += n;
@@ -4134,8 +4146,9 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
do {
n = min(len, 8);
if (!(vcpu->arch.apic &&
- !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v))
- && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
+ !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
+ addr, n, v))
+ && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
break;
trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
handled += n;
@@ -4475,7 +4488,8 @@ mmio:
return X86EMUL_CONTINUE;
}
-int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
+static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
void *val, unsigned int bytes,
struct x86_exception *exception,
const struct read_write_emulator_ops *ops)
@@ -4538,7 +4552,7 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
exception, &read_emultor);
}
-int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
+static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
unsigned long addr,
const void *val,
unsigned int bytes,
@@ -4629,10 +4643,10 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
int r;
if (vcpu->arch.pio.in)
- r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
+ r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
vcpu->arch.pio.size, pd);
else
- r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
+ r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
vcpu->arch.pio.port, vcpu->arch.pio.size,
pd);
return r;
@@ -4705,7 +4719,7 @@ static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
}
-int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
+int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
{
if (!need_emulate_wbinvd(vcpu))
return X86EMUL_CONTINUE;
@@ -4722,19 +4736,29 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
wbinvd();
return X86EMUL_CONTINUE;
}
+
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+ return kvm_emulate_wbinvd_noskip(vcpu);
+}
EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
+
+
static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
{
- kvm_emulate_wbinvd(emul_to_vcpu(ctxt));
+ kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
}
-int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
+static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
+ unsigned long *dest)
{
return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
}
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
+static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
+ unsigned long value)
{
return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
@@ -5816,7 +5840,7 @@ void kvm_arch_exit(void)
free_percpu(shared_msrs);
}
-int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
{
++vcpu->stat.halt_exits;
if (irqchip_in_kernel(vcpu->kvm)) {
@@ -5827,6 +5851,13 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
return 0;
}
}
+EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
+
+int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+ return kvm_vcpu_halt(vcpu);
+}
EXPORT_SYMBOL_GPL(kvm_emulate_halt);
int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
@@ -5903,7 +5934,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
lapic_irq.dest_id = apicid;
lapic_irq.delivery_mode = APIC_DM_REMRD;
- kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL);
+ kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
}
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
@@ -5911,6 +5942,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
unsigned long nr, a0, a1, a2, a3, ret;
int op_64_bit, r = 1;
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+
if (kvm_hv_hypercall_enabled(vcpu->kvm))
return kvm_hv_hypercall(vcpu);
@@ -6164,7 +6197,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
}
/*
- * Returns 1 to let __vcpu_run() continue the guest execution loop without
+ * Returns 1 to let vcpu_run() continue the guest execution loop without
* exiting to the userspace. Otherwise, the value will be returned to the
* userspace.
*/
@@ -6301,6 +6334,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
set_debugreg(vcpu->arch.eff_db[2], 2);
set_debugreg(vcpu->arch.eff_db[3], 3);
set_debugreg(vcpu->arch.dr6, 6);
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
}
trace_kvm_entry(vcpu->vcpu_id);
@@ -6382,42 +6416,47 @@ out:
return r;
}
+static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
+{
+ if (!kvm_arch_vcpu_runnable(vcpu)) {
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kvm_vcpu_block(vcpu);
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
+ return 1;
+ }
+
+ kvm_apic_accept_events(vcpu);
+ switch(vcpu->arch.mp_state) {
+ case KVM_MP_STATE_HALTED:
+ vcpu->arch.pv.pv_unhalted = false;
+ vcpu->arch.mp_state =
+ KVM_MP_STATE_RUNNABLE;
+ case KVM_MP_STATE_RUNNABLE:
+ vcpu->arch.apf.halted = false;
+ break;
+ case KVM_MP_STATE_INIT_RECEIVED:
+ break;
+ default:
+ return -EINTR;
+ break;
+ }
+ return 1;
+}
-static int __vcpu_run(struct kvm_vcpu *vcpu)
+static int vcpu_run(struct kvm_vcpu *vcpu)
{
int r;
struct kvm *kvm = vcpu->kvm;
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
- r = 1;
- while (r > 0) {
+ for (;;) {
if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted)
r = vcpu_enter_guest(vcpu);
- else {
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
- kvm_vcpu_block(vcpu);
- vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
- if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
- kvm_apic_accept_events(vcpu);
- switch(vcpu->arch.mp_state) {
- case KVM_MP_STATE_HALTED:
- vcpu->arch.pv.pv_unhalted = false;
- vcpu->arch.mp_state =
- KVM_MP_STATE_RUNNABLE;
- case KVM_MP_STATE_RUNNABLE:
- vcpu->arch.apf.halted = false;
- break;
- case KVM_MP_STATE_INIT_RECEIVED:
- break;
- default:
- r = -EINTR;
- break;
- }
- }
- }
-
+ else
+ r = vcpu_block(kvm, vcpu);
if (r <= 0)
break;
@@ -6429,6 +6468,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
r = -EINTR;
vcpu->run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.request_irq_exits;
+ break;
}
kvm_check_async_pf_completion(vcpu);
@@ -6437,6 +6477,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
r = -EINTR;
vcpu->run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.signal_exits;
+ break;
}
if (need_resched()) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
@@ -6568,7 +6609,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
} else
WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
- r = __vcpu_run(vcpu);
+ r = vcpu_run(vcpu);
out:
post_kvm_run_save(vcpu);
@@ -7075,11 +7116,14 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
kvm_clear_exception_queue(vcpu);
memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
+ kvm_update_dr0123(vcpu);
vcpu->arch.dr6 = DR6_INIT;
kvm_update_dr6(vcpu);
vcpu->arch.dr7 = DR7_FIXED_1;
kvm_update_dr7(vcpu);
+ vcpu->arch.cr2 = 0;
+
kvm_make_request(KVM_REQ_EVENT, vcpu);
vcpu->arch.apf.msr_val = 0;
vcpu->arch.st.msr_val = 0;
@@ -7240,7 +7284,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
vcpu->arch.pv.pv_unhalted = false;
vcpu->arch.emulate_ctxt.ops = &emulate_ops;
- if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
+ if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
@@ -7288,6 +7332,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
vcpu->arch.guest_supported_xcr0 = 0;
vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
+ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+
kvm_async_pf_hash_reset(vcpu);
kvm_pmu_init(vcpu);
@@ -7428,7 +7474,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
- kvm_kvfree(free->arch.rmap[i]);
+ kvfree(free->arch.rmap[i]);
free->arch.rmap[i] = NULL;
}
if (i == 0)
@@ -7436,7 +7482,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
if (!dont || free->arch.lpage_info[i - 1] !=
dont->arch.lpage_info[i - 1]) {
- kvm_kvfree(free->arch.lpage_info[i - 1]);
+ kvfree(free->arch.lpage_info[i - 1]);
free->arch.lpage_info[i - 1] = NULL;
}
}
@@ -7490,12 +7536,12 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
out_free:
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
- kvm_kvfree(slot->arch.rmap[i]);
+ kvfree(slot->arch.rmap[i]);
slot->arch.rmap[i] = NULL;
if (i == 0)
continue;
- kvm_kvfree(slot->arch.lpage_info[i - 1]);
+ kvfree(slot->arch.lpage_info[i - 1]);
slot->arch.lpage_info[i - 1] = NULL;
}
return -ENOMEM;
@@ -7618,6 +7664,23 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
new = id_to_memslot(kvm->memslots, mem->slot);
/*
+ * Dirty logging tracks sptes in 4k granularity, meaning that large
+ * sptes have to be split. If live migration is successful, the guest
+ * in the source machine will be destroyed and large sptes will be
+ * created in the destination. However, if the guest continues to run
+ * in the source machine (for example if live migration fails), small
+ * sptes will remain around and cause bad performance.
+ *
+ * Scan sptes if dirty logging has been stopped, dropping those
+ * which can be collapsed into a single large-page spte. Later
+ * page faults will create the large-page sptes.
+ */
+ if ((change != KVM_MR_DELETE) &&
+ (old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
+ !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
+ kvm_mmu_zap_collapsible_sptes(kvm, new);
+
+ /*
* Set up write protection and/or dirty logging for the new slot.
*
* For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index ac4453d8520e..717908b16037 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -868,7 +868,8 @@ static void __init lguest_init_IRQ(void)
/* Some systems map "vectors" to interrupts weirdly. Not us! */
__this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR);
if (i != SYSCALL_VECTOR)
- set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
+ set_intr_gate(i, irq_entries_start +
+ 8 * (i - FIRST_EXTERNAL_VECTOR));
}
/*
@@ -1076,6 +1077,7 @@ static void lguest_load_sp0(struct tss_struct *tss,
{
lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
THREAD_SIZE / PAGE_SIZE);
+ tss->x86_tss.sp0 = thread->sp0;
}
/* Let's just say, I wouldn't do debugging under a Guest. */
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index f5cc9eb1d51b..082a85167a5b 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -13,16 +13,6 @@
#include <asm/alternative-asm.h>
#include <asm/dwarf2.h>
-.macro SAVE reg
- pushl_cfi %\reg
- CFI_REL_OFFSET \reg, 0
-.endm
-
-.macro RESTORE reg
- popl_cfi %\reg
- CFI_RESTORE \reg
-.endm
-
.macro read64 reg
movl %ebx, %eax
movl %ecx, %edx
@@ -67,10 +57,10 @@ ENDPROC(atomic64_xchg_cx8)
.macro addsub_return func ins insc
ENTRY(atomic64_\func\()_return_cx8)
CFI_STARTPROC
- SAVE ebp
- SAVE ebx
- SAVE esi
- SAVE edi
+ pushl_cfi_reg ebp
+ pushl_cfi_reg ebx
+ pushl_cfi_reg esi
+ pushl_cfi_reg edi
movl %eax, %esi
movl %edx, %edi
@@ -89,10 +79,10 @@ ENTRY(atomic64_\func\()_return_cx8)
10:
movl %ebx, %eax
movl %ecx, %edx
- RESTORE edi
- RESTORE esi
- RESTORE ebx
- RESTORE ebp
+ popl_cfi_reg edi
+ popl_cfi_reg esi
+ popl_cfi_reg ebx
+ popl_cfi_reg ebp
ret
CFI_ENDPROC
ENDPROC(atomic64_\func\()_return_cx8)
@@ -104,7 +94,7 @@ addsub_return sub sub sbb
.macro incdec_return func ins insc
ENTRY(atomic64_\func\()_return_cx8)
CFI_STARTPROC
- SAVE ebx
+ pushl_cfi_reg ebx
read64 %esi
1:
@@ -119,7 +109,7 @@ ENTRY(atomic64_\func\()_return_cx8)
10:
movl %ebx, %eax
movl %ecx, %edx
- RESTORE ebx
+ popl_cfi_reg ebx
ret
CFI_ENDPROC
ENDPROC(atomic64_\func\()_return_cx8)
@@ -130,7 +120,7 @@ incdec_return dec sub sbb
ENTRY(atomic64_dec_if_positive_cx8)
CFI_STARTPROC
- SAVE ebx
+ pushl_cfi_reg ebx
read64 %esi
1:
@@ -146,18 +136,18 @@ ENTRY(atomic64_dec_if_positive_cx8)
2:
movl %ebx, %eax
movl %ecx, %edx
- RESTORE ebx
+ popl_cfi_reg ebx
ret
CFI_ENDPROC
ENDPROC(atomic64_dec_if_positive_cx8)
ENTRY(atomic64_add_unless_cx8)
CFI_STARTPROC
- SAVE ebp
- SAVE ebx
+ pushl_cfi_reg ebp
+ pushl_cfi_reg ebx
/* these just push these two parameters on the stack */
- SAVE edi
- SAVE ecx
+ pushl_cfi_reg edi
+ pushl_cfi_reg ecx
movl %eax, %ebp
movl %edx, %edi
@@ -179,8 +169,8 @@ ENTRY(atomic64_add_unless_cx8)
3:
addl $8, %esp
CFI_ADJUST_CFA_OFFSET -8
- RESTORE ebx
- RESTORE ebp
+ popl_cfi_reg ebx
+ popl_cfi_reg ebp
ret
4:
cmpl %edx, 4(%esp)
@@ -192,7 +182,7 @@ ENDPROC(atomic64_add_unless_cx8)
ENTRY(atomic64_inc_not_zero_cx8)
CFI_STARTPROC
- SAVE ebx
+ pushl_cfi_reg ebx
read64 %esi
1:
@@ -209,7 +199,7 @@ ENTRY(atomic64_inc_not_zero_cx8)
movl $1, %eax
3:
- RESTORE ebx
+ popl_cfi_reg ebx
ret
CFI_ENDPROC
ENDPROC(atomic64_inc_not_zero_cx8)
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index e78b8eee6615..9bc944a91274 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -51,10 +51,8 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
*/
ENTRY(csum_partial)
CFI_STARTPROC
- pushl_cfi %esi
- CFI_REL_OFFSET esi, 0
- pushl_cfi %ebx
- CFI_REL_OFFSET ebx, 0
+ pushl_cfi_reg esi
+ pushl_cfi_reg ebx
movl 20(%esp),%eax # Function arg: unsigned int sum
movl 16(%esp),%ecx # Function arg: int len
movl 12(%esp),%esi # Function arg: unsigned char *buff
@@ -127,14 +125,12 @@ ENTRY(csum_partial)
6: addl %ecx,%eax
adcl $0, %eax
7:
- testl $1, 12(%esp)
+ testb $1, 12(%esp)
jz 8f
roll $8, %eax
8:
- popl_cfi %ebx
- CFI_RESTORE ebx
- popl_cfi %esi
- CFI_RESTORE esi
+ popl_cfi_reg ebx
+ popl_cfi_reg esi
ret
CFI_ENDPROC
ENDPROC(csum_partial)
@@ -145,10 +141,8 @@ ENDPROC(csum_partial)
ENTRY(csum_partial)
CFI_STARTPROC
- pushl_cfi %esi
- CFI_REL_OFFSET esi, 0
- pushl_cfi %ebx
- CFI_REL_OFFSET ebx, 0
+ pushl_cfi_reg esi
+ pushl_cfi_reg ebx
movl 20(%esp),%eax # Function arg: unsigned int sum
movl 16(%esp),%ecx # Function arg: int len
movl 12(%esp),%esi # Function arg: const unsigned char *buf
@@ -251,14 +245,12 @@ ENTRY(csum_partial)
addl %ebx,%eax
adcl $0,%eax
80:
- testl $1, 12(%esp)
+ testb $1, 12(%esp)
jz 90f
roll $8, %eax
90:
- popl_cfi %ebx
- CFI_RESTORE ebx
- popl_cfi %esi
- CFI_RESTORE esi
+ popl_cfi_reg ebx
+ popl_cfi_reg esi
ret
CFI_ENDPROC
ENDPROC(csum_partial)
@@ -298,12 +290,9 @@ ENTRY(csum_partial_copy_generic)
CFI_STARTPROC
subl $4,%esp
CFI_ADJUST_CFA_OFFSET 4
- pushl_cfi %edi
- CFI_REL_OFFSET edi, 0
- pushl_cfi %esi
- CFI_REL_OFFSET esi, 0
- pushl_cfi %ebx
- CFI_REL_OFFSET ebx, 0
+ pushl_cfi_reg edi
+ pushl_cfi_reg esi
+ pushl_cfi_reg ebx
movl ARGBASE+16(%esp),%eax # sum
movl ARGBASE+12(%esp),%ecx # len
movl ARGBASE+4(%esp),%esi # src
@@ -412,12 +401,9 @@ DST( movb %cl, (%edi) )
.previous
- popl_cfi %ebx
- CFI_RESTORE ebx
- popl_cfi %esi
- CFI_RESTORE esi
- popl_cfi %edi
- CFI_RESTORE edi
+ popl_cfi_reg ebx
+ popl_cfi_reg esi
+ popl_cfi_reg edi
popl_cfi %ecx # equivalent to addl $4,%esp
ret
CFI_ENDPROC
@@ -441,12 +427,9 @@ ENDPROC(csum_partial_copy_generic)
ENTRY(csum_partial_copy_generic)
CFI_STARTPROC
- pushl_cfi %ebx
- CFI_REL_OFFSET ebx, 0
- pushl_cfi %edi
- CFI_REL_OFFSET edi, 0
- pushl_cfi %esi
- CFI_REL_OFFSET esi, 0
+ pushl_cfi_reg ebx
+ pushl_cfi_reg edi
+ pushl_cfi_reg esi
movl ARGBASE+4(%esp),%esi #src
movl ARGBASE+8(%esp),%edi #dst
movl ARGBASE+12(%esp),%ecx #len
@@ -506,12 +489,9 @@ DST( movb %dl, (%edi) )
jmp 7b
.previous
- popl_cfi %esi
- CFI_RESTORE esi
- popl_cfi %edi
- CFI_RESTORE edi
- popl_cfi %ebx
- CFI_RESTORE ebx
+ popl_cfi_reg esi
+ popl_cfi_reg edi
+ popl_cfi_reg ebx
ret
CFI_ENDPROC
ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index f2145cfa12a6..e67e579c93bd 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,31 +1,35 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
/*
- * Zero a page.
- * rdi page
- */
-ENTRY(clear_page_c)
+ * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
+ * recommended to use this when possible and we do use them by default.
+ * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
+ * Otherwise, use original.
+ */
+
+/*
+ * Zero a page.
+ * %rdi - page
+ */
+ENTRY(clear_page)
CFI_STARTPROC
+
+ ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
+ "jmp clear_page_c_e", X86_FEATURE_ERMS
+
movl $4096/8,%ecx
xorl %eax,%eax
rep stosq
ret
CFI_ENDPROC
-ENDPROC(clear_page_c)
+ENDPROC(clear_page)
-ENTRY(clear_page_c_e)
+ENTRY(clear_page_orig)
CFI_STARTPROC
- movl $4096,%ecx
- xorl %eax,%eax
- rep stosb
- ret
- CFI_ENDPROC
-ENDPROC(clear_page_c_e)
-ENTRY(clear_page)
- CFI_STARTPROC
xorl %eax,%eax
movl $4096/64,%ecx
.p2align 4
@@ -45,29 +49,13 @@ ENTRY(clear_page)
nop
ret
CFI_ENDPROC
-.Lclear_page_end:
-ENDPROC(clear_page)
-
- /*
- * Some CPUs support enhanced REP MOVSB/STOSB instructions.
- * It is recommended to use this when possible.
- * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
- * Otherwise, use original function.
- *
- */
+ENDPROC(clear_page_orig)
-#include <asm/cpufeature.h>
-
- .section .altinstr_replacement,"ax"
-1: .byte 0xeb /* jmp <disp8> */
- .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */
-2: .byte 0xeb /* jmp <disp8> */
- .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */
-3:
- .previous
- .section .altinstructions,"a"
- altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
- .Lclear_page_end-clear_page, 2b-1b
- altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \
- .Lclear_page_end-clear_page,3b-2b
- .previous
+ENTRY(clear_page_c_e)
+ CFI_STARTPROC
+ movl $4096,%ecx
+ xorl %eax,%eax
+ rep stosb
+ ret
+ CFI_ENDPROC
+ENDPROC(clear_page_c_e)
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 176cca67212b..8239dbcbf984 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -2,23 +2,26 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
+/*
+ * Some CPUs run faster using the string copy instructions (sane microcode).
+ * It is also a lot simpler. Use this when possible. But, don't use streaming
+ * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the
+ * prefetch distance based on SMP/UP.
+ */
ALIGN
-copy_page_rep:
+ENTRY(copy_page)
CFI_STARTPROC
+ ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
movl $4096/8, %ecx
rep movsq
ret
CFI_ENDPROC
-ENDPROC(copy_page_rep)
-
-/*
- * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD.
- * Could vary the prefetch distance based on SMP/UP.
-*/
+ENDPROC(copy_page)
-ENTRY(copy_page)
+ENTRY(copy_page_regs)
CFI_STARTPROC
subq $2*8, %rsp
CFI_ADJUST_CFA_OFFSET 2*8
@@ -90,21 +93,5 @@ ENTRY(copy_page)
addq $2*8, %rsp
CFI_ADJUST_CFA_OFFSET -2*8
ret
-.Lcopy_page_end:
CFI_ENDPROC
-ENDPROC(copy_page)
-
- /* Some CPUs run faster using the string copy instructions.
- It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
- .section .altinstr_replacement,"ax"
-1: .byte 0xeb /* jmp <disp8> */
- .byte (copy_page_rep - copy_page) - (2f - 1b) /* offset */
-2:
- .previous
- .section .altinstructions,"a"
- altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \
- .Lcopy_page_end-copy_page, 2b-1b
- .previous
+ENDPROC(copy_page_regs)
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index dee945d55594..fa997dfaef24 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -8,9 +8,6 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
-
-#define FIX_ALIGNMENT 1
-
#include <asm/current.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
@@ -19,33 +16,7 @@
#include <asm/asm.h>
#include <asm/smap.h>
-/*
- * By placing feature2 after feature1 in altinstructions section, we logically
- * implement:
- * If CPU has feature2, jmp to alt2 is used
- * else if CPU has feature1, jmp to alt1 is used
- * else jmp to orig is used.
- */
- .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
-0:
- .byte 0xe9 /* 32bit jump */
- .long \orig-1f /* by default jump to orig */
-1:
- .section .altinstr_replacement,"ax"
-2: .byte 0xe9 /* near jump with 32bit immediate */
- .long \alt1-1b /* offset */ /* or alternatively to alt1 */
-3: .byte 0xe9 /* near jump with 32bit immediate */
- .long \alt2-1b /* offset */ /* or alternatively to alt2 */
- .previous
-
- .section .altinstructions,"a"
- altinstruction_entry 0b,2b,\feature1,5,5
- altinstruction_entry 0b,3b,\feature2,5,5
- .previous
- .endm
-
.macro ALIGN_DESTINATION
-#ifdef FIX_ALIGNMENT
/* check for bad alignment of destination */
movl %edi,%ecx
andl $7,%ecx
@@ -67,7 +38,6 @@
_ASM_EXTABLE(100b,103b)
_ASM_EXTABLE(101b,103b)
-#endif
.endm
/* Standard copy_to_user with segment limit checking */
@@ -79,9 +49,11 @@ ENTRY(_copy_to_user)
jc bad_to_user
cmpq TI_addr_limit(%rax),%rcx
ja bad_to_user
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
- copy_user_generic_unrolled,copy_user_generic_string, \
- copy_user_enhanced_fast_string
+ ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \
+ "jmp copy_user_generic_string", \
+ X86_FEATURE_REP_GOOD, \
+ "jmp copy_user_enhanced_fast_string", \
+ X86_FEATURE_ERMS
CFI_ENDPROC
ENDPROC(_copy_to_user)
@@ -94,9 +66,11 @@ ENTRY(_copy_from_user)
jc bad_from_user
cmpq TI_addr_limit(%rax),%rcx
ja bad_from_user
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
- copy_user_generic_unrolled,copy_user_generic_string, \
- copy_user_enhanced_fast_string
+ ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \
+ "jmp copy_user_generic_string", \
+ X86_FEATURE_REP_GOOD, \
+ "jmp copy_user_enhanced_fast_string", \
+ X86_FEATURE_ERMS
CFI_ENDPROC
ENDPROC(_copy_from_user)
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index 2419d5fefae3..9734182966f3 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -196,7 +196,7 @@ ENTRY(csum_partial_copy_generic)
/* handle last odd byte */
.Lhandle_1:
- testl $1, %r10d
+ testb $1, %r10b
jz .Lende
xorl %ebx, %ebx
source
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 1313ae6b478b..8f72b334aea0 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -52,6 +52,13 @@
*/
void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64)
{
+ /*
+ * Instructions longer than MAX_INSN_SIZE (15 bytes) are invalid
+ * even if the input buffer is long enough to hold them.
+ */
+ if (buf_len > MAX_INSN_SIZE)
+ buf_len = MAX_INSN_SIZE;
+
memset(insn, 0, sizeof(*insn));
insn->kaddr = kaddr;
insn->end_kaddr = kaddr + buf_len;
@@ -164,6 +171,12 @@ found:
/* VEX.W overrides opnd_size */
insn->opnd_bytes = 8;
} else {
+ /*
+ * For VEX2, fake VEX3-like byte#2.
+ * Makes it easier to decode vex.W, vex.vvvv,
+ * vex.L and vex.pp. Masking with 0x7f sets vex.W == 0.
+ */
+ insn->vex_prefix.bytes[2] = b2 & 0x7f;
insn->vex_prefix.nbytes = 2;
insn->next_byte += 2;
}
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 89b53c9968e7..b046664f5a1c 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,12 +1,20 @@
/* Copyright 2002 Andi Kleen */
#include <linux/linkage.h>
-
#include <asm/cpufeature.h>
#include <asm/dwarf2.h>
#include <asm/alternative-asm.h>
/*
+ * We build a jump to memcpy_orig by default which gets NOPped out on
+ * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
+ * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
+ * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
+ */
+
+.weak memcpy
+
+/*
* memcpy - Copy a memory block.
*
* Input:
@@ -17,15 +25,11 @@
* Output:
* rax original destination
*/
+ENTRY(__memcpy)
+ENTRY(memcpy)
+ ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
+ "jmp memcpy_erms", X86_FEATURE_ERMS
-/*
- * memcpy_c() - fast string ops (REP MOVSQ) based variant.
- *
- * This gets patched over the unrolled variant (below) via the
- * alternative instructions framework:
- */
- .section .altinstr_replacement, "ax", @progbits
-.Lmemcpy_c:
movq %rdi, %rax
movq %rdx, %rcx
shrq $3, %rcx
@@ -34,29 +38,21 @@
movl %edx, %ecx
rep movsb
ret
-.Lmemcpy_e:
- .previous
+ENDPROC(memcpy)
+ENDPROC(__memcpy)
/*
- * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
- * memcpy_c. Use memcpy_c_e when possible.
- *
- * This gets patched over the unrolled variant (below) via the
- * alternative instructions framework:
+ * memcpy_erms() - enhanced fast string memcpy. This is faster and
+ * simpler than memcpy. Use memcpy_erms when possible.
*/
- .section .altinstr_replacement, "ax", @progbits
-.Lmemcpy_c_e:
+ENTRY(memcpy_erms)
movq %rdi, %rax
movq %rdx, %rcx
rep movsb
ret
-.Lmemcpy_e_e:
- .previous
-
-.weak memcpy
+ENDPROC(memcpy_erms)
-ENTRY(__memcpy)
-ENTRY(memcpy)
+ENTRY(memcpy_orig)
CFI_STARTPROC
movq %rdi, %rax
@@ -183,26 +179,4 @@ ENTRY(memcpy)
.Lend:
retq
CFI_ENDPROC
-ENDPROC(memcpy)
-ENDPROC(__memcpy)
-
- /*
- * Some CPUs are adding enhanced REP MOVSB/STOSB feature
- * If the feature is supported, memcpy_c_e() is the first choice.
- * If enhanced rep movsb copy is not available, use fast string copy
- * memcpy_c() when possible. This is faster and code is simpler than
- * original memcpy().
- * Otherwise, original memcpy() is used.
- * In .altinstructions section, ERMS feature is placed after REG_GOOD
- * feature to implement the right patch order.
- *
- * Replace only beginning, memcpy is used to apply alternatives,
- * so it is silly to overwrite itself with nops - reboot is the
- * only outcome...
- */
- .section .altinstructions, "a"
- altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
- .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
- altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
- .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
- .previous
+ENDPROC(memcpy_orig)
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 9c4b530575da..0f8a0d0331b9 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -5,7 +5,6 @@
* This assembly file is re-written from memmove_64.c file.
* - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
*/
-#define _STRING_C
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeature.h>
@@ -44,6 +43,8 @@ ENTRY(__memmove)
jg 2f
.Lmemmove_begin_forward:
+ ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
+
/*
* movsq instruction have many startup latency
* so we handle small size by general register.
@@ -207,21 +208,5 @@ ENTRY(__memmove)
13:
retq
CFI_ENDPROC
-
- .section .altinstr_replacement,"ax"
-.Lmemmove_begin_forward_efs:
- /* Forward moving data. */
- movq %rdx, %rcx
- rep movsb
- retq
-.Lmemmove_end_forward_efs:
- .previous
-
- .section .altinstructions,"a"
- altinstruction_entry .Lmemmove_begin_forward, \
- .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \
- .Lmemmove_end_forward-.Lmemmove_begin_forward, \
- .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
- .previous
ENDPROC(__memmove)
ENDPROC(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 6f44935c6a60..93118fb23976 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -5,19 +5,30 @@
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
+.weak memset
+
/*
* ISO C memset - set a memory block to a byte value. This function uses fast
* string to get better performance than the original function. The code is
* simpler and shorter than the orignal function as well.
- *
+ *
* rdi destination
- * rsi value (char)
- * rdx count (bytes)
- *
+ * rsi value (char)
+ * rdx count (bytes)
+ *
* rax original destination
- */
- .section .altinstr_replacement, "ax", @progbits
-.Lmemset_c:
+ */
+ENTRY(memset)
+ENTRY(__memset)
+ /*
+ * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
+ * to use it when possible. If not available, use fast string instructions.
+ *
+ * Otherwise, use original memset function.
+ */
+ ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
+ "jmp memset_erms", X86_FEATURE_ERMS
+
movq %rdi,%r9
movq %rdx,%rcx
andl $7,%edx
@@ -31,8 +42,8 @@
rep stosb
movq %r9,%rax
ret
-.Lmemset_e:
- .previous
+ENDPROC(memset)
+ENDPROC(__memset)
/*
* ISO C memset - set a memory block to a byte value. This function uses
@@ -45,21 +56,16 @@
*
* rax original destination
*/
- .section .altinstr_replacement, "ax", @progbits
-.Lmemset_c_e:
+ENTRY(memset_erms)
movq %rdi,%r9
movb %sil,%al
movq %rdx,%rcx
rep stosb
movq %r9,%rax
ret
-.Lmemset_e_e:
- .previous
-
-.weak memset
+ENDPROC(memset_erms)
-ENTRY(memset)
-ENTRY(__memset)
+ENTRY(memset_orig)
CFI_STARTPROC
movq %rdi,%r10
@@ -134,23 +140,4 @@ ENTRY(__memset)
jmp .Lafter_bad_alignment
.Lfinal:
CFI_ENDPROC
-ENDPROC(memset)
-ENDPROC(__memset)
-
- /* Some CPUs support enhanced REP MOVSB/STOSB feature.
- * It is recommended to use this when possible.
- *
- * If enhanced REP MOVSB/STOSB feature is not available, use fast string
- * instructions.
- *
- * Otherwise, use original memset function.
- *
- * In .altinstructions section, ERMS feature is placed after REG_GOOD
- * feature to implement the right patch order.
- */
- .section .altinstructions,"a"
- altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
- .Lfinal-__memset,.Lmemset_e-.Lmemset_c
- altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
- .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e
- .previous
+ENDPROC(memset_orig)
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index f6d13eefad10..3ca5218fbece 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -14,8 +14,8 @@
.macro op_safe_regs op
ENTRY(\op\()_safe_regs)
CFI_STARTPROC
- pushq_cfi %rbx
- pushq_cfi %rbp
+ pushq_cfi_reg rbx
+ pushq_cfi_reg rbp
movq %rdi, %r10 /* Save pointer */
xorl %r11d, %r11d /* Return value */
movl (%rdi), %eax
@@ -35,8 +35,8 @@ ENTRY(\op\()_safe_regs)
movl %ebp, 20(%r10)
movl %esi, 24(%r10)
movl %edi, 28(%r10)
- popq_cfi %rbp
- popq_cfi %rbx
+ popq_cfi_reg rbp
+ popq_cfi_reg rbx
ret
3:
CFI_RESTORE_STATE
@@ -53,10 +53,10 @@ ENDPROC(\op\()_safe_regs)
.macro op_safe_regs op
ENTRY(\op\()_safe_regs)
CFI_STARTPROC
- pushl_cfi %ebx
- pushl_cfi %ebp
- pushl_cfi %esi
- pushl_cfi %edi
+ pushl_cfi_reg ebx
+ pushl_cfi_reg ebp
+ pushl_cfi_reg esi
+ pushl_cfi_reg edi
pushl_cfi $0 /* Return value */
pushl_cfi %eax
movl 4(%eax), %ecx
@@ -80,10 +80,10 @@ ENTRY(\op\()_safe_regs)
movl %esi, 24(%eax)
movl %edi, 28(%eax)
popl_cfi %eax
- popl_cfi %edi
- popl_cfi %esi
- popl_cfi %ebp
- popl_cfi %ebx
+ popl_cfi_reg edi
+ popl_cfi_reg esi
+ popl_cfi_reg ebp
+ popl_cfi_reg ebx
ret
3:
CFI_RESTORE_STATE
diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S
index 5dff5f042468..2322abe4da3b 100644
--- a/arch/x86/lib/rwsem.S
+++ b/arch/x86/lib/rwsem.S
@@ -34,10 +34,10 @@
*/
#define save_common_regs \
- pushl_cfi %ecx; CFI_REL_OFFSET ecx, 0
+ pushl_cfi_reg ecx
#define restore_common_regs \
- popl_cfi %ecx; CFI_RESTORE ecx
+ popl_cfi_reg ecx
/* Avoid uglifying the argument copying x86-64 needs to do. */
.macro movq src, dst
@@ -64,22 +64,22 @@
*/
#define save_common_regs \
- pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \
- pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \
- pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \
- pushq_cfi %r8; CFI_REL_OFFSET r8, 0; \
- pushq_cfi %r9; CFI_REL_OFFSET r9, 0; \
- pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \
- pushq_cfi %r11; CFI_REL_OFFSET r11, 0
+ pushq_cfi_reg rdi; \
+ pushq_cfi_reg rsi; \
+ pushq_cfi_reg rcx; \
+ pushq_cfi_reg r8; \
+ pushq_cfi_reg r9; \
+ pushq_cfi_reg r10; \
+ pushq_cfi_reg r11
#define restore_common_regs \
- popq_cfi %r11; CFI_RESTORE r11; \
- popq_cfi %r10; CFI_RESTORE r10; \
- popq_cfi %r9; CFI_RESTORE r9; \
- popq_cfi %r8; CFI_RESTORE r8; \
- popq_cfi %rcx; CFI_RESTORE rcx; \
- popq_cfi %rsi; CFI_RESTORE rsi; \
- popq_cfi %rdi; CFI_RESTORE rdi
+ popq_cfi_reg r11; \
+ popq_cfi_reg r10; \
+ popq_cfi_reg r9; \
+ popq_cfi_reg r8; \
+ popq_cfi_reg rcx; \
+ popq_cfi_reg rsi; \
+ popq_cfi_reg rdi
#endif
@@ -87,12 +87,10 @@
ENTRY(call_rwsem_down_read_failed)
CFI_STARTPROC
save_common_regs
- __ASM_SIZE(push,_cfi) %__ASM_REG(dx)
- CFI_REL_OFFSET __ASM_REG(dx), 0
+ __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx)
movq %rax,%rdi
call rwsem_down_read_failed
- __ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
- CFI_RESTORE __ASM_REG(dx)
+ __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx)
restore_common_regs
ret
CFI_ENDPROC
@@ -124,12 +122,10 @@ ENDPROC(call_rwsem_wake)
ENTRY(call_rwsem_downgrade_wake)
CFI_STARTPROC
save_common_regs
- __ASM_SIZE(push,_cfi) %__ASM_REG(dx)
- CFI_REL_OFFSET __ASM_REG(dx), 0
+ __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx)
movq %rax,%rdi
call rwsem_downgrade_wake
- __ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
- CFI_RESTORE __ASM_REG(dx)
+ __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx)
restore_common_regs
ret
CFI_ENDPROC
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
index e28cdaf5ac2c..5eb715087b80 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/lib/thunk_32.S
@@ -13,12 +13,9 @@
.globl \name
\name:
CFI_STARTPROC
- pushl_cfi %eax
- CFI_REL_OFFSET eax, 0
- pushl_cfi %ecx
- CFI_REL_OFFSET ecx, 0
- pushl_cfi %edx
- CFI_REL_OFFSET edx, 0
+ pushl_cfi_reg eax
+ pushl_cfi_reg ecx
+ pushl_cfi_reg edx
.if \put_ret_addr_in_eax
/* Place EIP in the arg1 */
@@ -26,12 +23,9 @@
.endif
call \func
- popl_cfi %edx
- CFI_RESTORE edx
- popl_cfi %ecx
- CFI_RESTORE ecx
- popl_cfi %eax
- CFI_RESTORE eax
+ popl_cfi_reg edx
+ popl_cfi_reg ecx
+ popl_cfi_reg eax
ret
CFI_ENDPROC
_ASM_NOKPROBE(\name)
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index b30b5ebd614a..f89ba4e93025 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -17,9 +17,18 @@
CFI_STARTPROC
/* this one pushes 9 elems, the next one would be %rIP */
- SAVE_ARGS
+ pushq_cfi_reg rdi
+ pushq_cfi_reg rsi
+ pushq_cfi_reg rdx
+ pushq_cfi_reg rcx
+ pushq_cfi_reg rax
+ pushq_cfi_reg r8
+ pushq_cfi_reg r9
+ pushq_cfi_reg r10
+ pushq_cfi_reg r11
.if \put_ret_addr_in_rdi
+ /* 9*8(%rsp) is return addr on stack */
movq_cfi_restore 9*8, rdi
.endif
@@ -45,11 +54,22 @@
#endif
#endif
- /* SAVE_ARGS below is used only for the .cfi directives it contains. */
+#if defined(CONFIG_TRACE_IRQFLAGS) \
+ || defined(CONFIG_DEBUG_LOCK_ALLOC) \
+ || defined(CONFIG_PREEMPT)
CFI_STARTPROC
- SAVE_ARGS
+ CFI_ADJUST_CFA_OFFSET 9*8
restore:
- RESTORE_ARGS
+ popq_cfi_reg r11
+ popq_cfi_reg r10
+ popq_cfi_reg r9
+ popq_cfi_reg r8
+ popq_cfi_reg rax
+ popq_cfi_reg rcx
+ popq_cfi_reg rdx
+ popq_cfi_reg rsi
+ popq_cfi_reg rdi
ret
CFI_ENDPROC
_ASM_NOKPROBE(restore)
+#endif
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index c905e89e19fe..1f33b3d1fd68 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -69,21 +69,20 @@ EXPORT_SYMBOL(copy_in_user);
* it is not necessary to optimize tail handling.
*/
__visible unsigned long
-copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest)
+copy_user_handle_tail(char *to, char *from, unsigned len)
{
- char c;
- unsigned zero_len;
-
for (; len; --len, to++) {
+ char c;
+
if (__get_user_nocheck(c, from++, sizeof(char)))
break;
if (__put_user_nocheck(c, to, sizeof(char)))
break;
}
-
- for (c = 0, zero_len = len; zerorest && zero_len; --zero_len)
- if (__put_user_nocheck(c, to++, sizeof(char)))
- break;
clac();
+
+ /* If the destination is a kernel buffer, we always clear the end */
+ if ((unsigned long)to >= TASK_SIZE_MAX)
+ memset(to, 0, len);
return len;
}
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 1a2be7c6895d..816488c0b97e 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -273,6 +273,9 @@ dd: ESC
de: ESC
df: ESC
# 0xe0 - 0xef
+# Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix
+# in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation
+# to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD.
e0: LOOPNE/LOOPNZ Jb (f64)
e1: LOOPE/LOOPZ Jb (f64)
e2: LOOP Jb (f64)
@@ -281,6 +284,10 @@ e4: IN AL,Ib
e5: IN eAX,Ib
e6: OUT Ib,AL
e7: OUT Ib,eAX
+# With 0x66 prefix in 64-bit mode, for AMD CPUs immediate offset
+# in "near" jumps and calls is 16-bit. For CALL,
+# push of return address is 16-bit wide, RSP is decremented by 2
+# but is not truncated to 16 bits, unlike RIP.
e8: CALL Jz (f64)
e9: JMP-near Jz (f64)
ea: JMP-far Ap (i64)
@@ -456,6 +463,7 @@ AVXcode: 1
7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1)
7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3)
# 0x0f 0x80-0x8f
+# Note: "forced64" is Intel CPU behavior (see comment about CALL insn).
80: JO Jz (f64)
81: JNO Jz (f64)
82: JB/JC/JNAE Jz (f64)
@@ -842,6 +850,7 @@ EndTable
GrpTable: Grp5
0: INC Ev
1: DEC Ev
+# Note: "forced64" is Intel CPU behavior (see comment about CALL insn).
2: CALLN Ev (f64)
3: CALLF Ep
4: JMPN Ev (f64)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index c4cc74006c61..a482d105172b 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -32,6 +32,4 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o
obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
-obj-$(CONFIG_MEMTEST) += memtest.o
-
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index ede025fb46f1..181c53bac3a7 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -59,7 +59,7 @@ static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
int ret = 0;
/* kprobe_running() needs smp_processor_id() */
- if (kprobes_built_in() && !user_mode_vm(regs)) {
+ if (kprobes_built_in() && !user_mode(regs)) {
preempt_disable();
if (kprobe_running() && kprobe_fault_handler(regs, 14))
ret = 1;
@@ -148,7 +148,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
instr = (void *)convert_ip_to_linear(current, regs);
max_instr = instr + 15;
- if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
+ if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
return 0;
while (instr < max_instr) {
@@ -1035,7 +1035,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
if (error_code & PF_USER)
return false;
- if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC))
+ if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
return false;
return true;
@@ -1140,7 +1140,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* User-mode registers count as a user access even for any
* potential system fault or CPU buglet:
*/
- if (user_mode_vm(regs)) {
+ if (user_mode(regs)) {
local_irq_enable();
error_code |= PF_USER;
flags |= FAULT_FLAG_USER;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index a110efca6d06..1d553186c434 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -29,29 +29,33 @@
/*
* Tables translating between page_cache_type_t and pte encoding.
- * Minimal supported modes are defined statically, modified if more supported
- * cache modes are available.
- * Index into __cachemode2pte_tbl is the cachemode.
- * Index into __pte2cachemode_tbl are the caching attribute bits of the pte
- * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
+ *
+ * Minimal supported modes are defined statically, they are modified
+ * during bootup if more supported cache modes are available.
+ *
+ * Index into __cachemode2pte_tbl[] is the cachemode.
+ *
+ * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte
+ * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
*/
uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
- [_PAGE_CACHE_MODE_WB] = 0,
- [_PAGE_CACHE_MODE_WC] = _PAGE_PWT,
- [_PAGE_CACHE_MODE_UC_MINUS] = _PAGE_PCD,
- [_PAGE_CACHE_MODE_UC] = _PAGE_PCD | _PAGE_PWT,
- [_PAGE_CACHE_MODE_WT] = _PAGE_PCD,
- [_PAGE_CACHE_MODE_WP] = _PAGE_PCD,
+ [_PAGE_CACHE_MODE_WB ] = 0 | 0 ,
+ [_PAGE_CACHE_MODE_WC ] = _PAGE_PWT | 0 ,
+ [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD,
+ [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD,
+ [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD,
+ [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD,
};
EXPORT_SYMBOL(__cachemode2pte_tbl);
+
uint8_t __pte2cachemode_tbl[8] = {
- [__pte2cm_idx(0)] = _PAGE_CACHE_MODE_WB,
- [__pte2cm_idx(_PAGE_PWT)] = _PAGE_CACHE_MODE_WC,
- [__pte2cm_idx(_PAGE_PCD)] = _PAGE_CACHE_MODE_UC_MINUS,
- [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD)] = _PAGE_CACHE_MODE_UC,
- [__pte2cm_idx(_PAGE_PAT)] = _PAGE_CACHE_MODE_WB,
- [__pte2cm_idx(_PAGE_PWT | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC,
- [__pte2cm_idx(_PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
+ [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB,
+ [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_WC,
+ [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS,
+ [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC,
+ [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB,
+ [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC,
+ [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
};
EXPORT_SYMBOL(__pte2cachemode_tbl);
@@ -131,21 +135,7 @@ void __init early_alloc_pgt_buf(void)
int after_bootmem;
-int direct_gbpages
-#ifdef CONFIG_DIRECT_GBPAGES
- = 1
-#endif
-;
-
-static void __init init_gbpages(void)
-{
-#ifdef CONFIG_X86_64
- if (direct_gbpages && cpu_has_gbpages)
- printk(KERN_INFO "Using GB pages for direct mapping\n");
- else
- direct_gbpages = 0;
-#endif
-}
+early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES);
struct map_range {
unsigned long start;
@@ -157,16 +147,12 @@ static int page_size_mask;
static void __init probe_page_size_mask(void)
{
- init_gbpages();
-
#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
/*
* For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
* This will simplify cpa(), which otherwise needs to support splitting
* large pages into small in interrupt context, etc.
*/
- if (direct_gbpages)
- page_size_mask |= 1 << PG_LEVEL_1G;
if (cpu_has_pse)
page_size_mask |= 1 << PG_LEVEL_2M;
#endif
@@ -179,6 +165,15 @@ static void __init probe_page_size_mask(void)
if (cpu_has_pge) {
cr4_set_bits_and_update_boot(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
+ } else
+ __supported_pte_mask &= ~_PAGE_GLOBAL;
+
+ /* Enable 1 GB linear kernel mappings if available: */
+ if (direct_gbpages && cpu_has_gbpages) {
+ printk(KERN_INFO "Using GB pages for direct mapping\n");
+ page_size_mask |= 1 << PG_LEVEL_1G;
+ } else {
+ direct_gbpages = 0;
}
}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 30eb05ae7061..3fba623e3ba5 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -130,20 +130,6 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
return 0;
}
-static int __init parse_direct_gbpages_off(char *arg)
-{
- direct_gbpages = 0;
- return 0;
-}
-early_param("nogbpages", parse_direct_gbpages_off);
-
-static int __init parse_direct_gbpages_on(char *arg)
-{
- direct_gbpages = 1;
- return 0;
-}
-early_param("gbpages", parse_direct_gbpages_on);
-
/*
* NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
* physical space so we can cache the place of the first one and move
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index fdf617c00e2f..5ead4d6cf3a7 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -67,8 +67,13 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
/*
* Remap an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access high addresses
- * directly.
+ * address space. It transparently creates kernel huge I/O mapping when
+ * the physical address is aligned by a huge page size (1GB or 2MB) and
+ * the requested size is at least the huge page size.
+ *
+ * NOTE: MTRRs can override PAT memory types with a 4KB granularity.
+ * Therefore, the mapping code falls back to use a smaller page toward 4KB
+ * when a mapping range is covered by non-WB type of MTRRs.
*
* NOTE! We need to allow non-page-aligned mappings too: we will obviously
* have to convert them into an offset in a page-aligned mapping, but the
@@ -326,6 +331,20 @@ void iounmap(volatile void __iomem *addr)
}
EXPORT_SYMBOL(iounmap);
+int arch_ioremap_pud_supported(void)
+{
+#ifdef CONFIG_X86_64
+ return cpu_has_gbpages;
+#else
+ return 0;
+#endif
+}
+
+int arch_ioremap_pmd_supported(void)
+{
+ return cpu_has_pse;
+}
+
/*
* Convert a physical pointer to a virtual kernel pointer for /dev/mem
* access
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
deleted file mode 100644
index 1e9da795767a..000000000000
--- a/arch/x86/mm/memtest.c
+++ /dev/null
@@ -1,118 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/init.h>
-#include <linux/pfn.h>
-#include <linux/memblock.h>
-
-static u64 patterns[] __initdata = {
- /* The first entry has to be 0 to leave memtest with zeroed memory */
- 0,
- 0xffffffffffffffffULL,
- 0x5555555555555555ULL,
- 0xaaaaaaaaaaaaaaaaULL,
- 0x1111111111111111ULL,
- 0x2222222222222222ULL,
- 0x4444444444444444ULL,
- 0x8888888888888888ULL,
- 0x3333333333333333ULL,
- 0x6666666666666666ULL,
- 0x9999999999999999ULL,
- 0xccccccccccccccccULL,
- 0x7777777777777777ULL,
- 0xbbbbbbbbbbbbbbbbULL,
- 0xddddddddddddddddULL,
- 0xeeeeeeeeeeeeeeeeULL,
- 0x7a6c7258554e494cULL, /* yeah ;-) */
-};
-
-static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
-{
- printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n",
- (unsigned long long) pattern,
- (unsigned long long) start_bad,
- (unsigned long long) end_bad);
- memblock_reserve(start_bad, end_bad - start_bad);
-}
-
-static void __init memtest(u64 pattern, u64 start_phys, u64 size)
-{
- u64 *p, *start, *end;
- u64 start_bad, last_bad;
- u64 start_phys_aligned;
- const size_t incr = sizeof(pattern);
-
- start_phys_aligned = ALIGN(start_phys, incr);
- start = __va(start_phys_aligned);
- end = start + (size - (start_phys_aligned - start_phys)) / incr;
- start_bad = 0;
- last_bad = 0;
-
- for (p = start; p < end; p++)
- *p = pattern;
-
- for (p = start; p < end; p++, start_phys_aligned += incr) {
- if (*p == pattern)
- continue;
- if (start_phys_aligned == last_bad + incr) {
- last_bad += incr;
- continue;
- }
- if (start_bad)
- reserve_bad_mem(pattern, start_bad, last_bad + incr);
- start_bad = last_bad = start_phys_aligned;
- }
- if (start_bad)
- reserve_bad_mem(pattern, start_bad, last_bad + incr);
-}
-
-static void __init do_one_pass(u64 pattern, u64 start, u64 end)
-{
- u64 i;
- phys_addr_t this_start, this_end;
-
- for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
- this_start = clamp_t(phys_addr_t, this_start, start, end);
- this_end = clamp_t(phys_addr_t, this_end, start, end);
- if (this_start < this_end) {
- printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
- (unsigned long long)this_start,
- (unsigned long long)this_end,
- (unsigned long long)cpu_to_be64(pattern));
- memtest(pattern, this_start, this_end - this_start);
- }
- }
-}
-
-/* default is disabled */
-static int memtest_pattern __initdata;
-
-static int __init parse_memtest(char *arg)
-{
- if (arg)
- memtest_pattern = simple_strtoul(arg, NULL, 0);
- else
- memtest_pattern = ARRAY_SIZE(patterns);
-
- return 0;
-}
-
-early_param("memtest", parse_memtest);
-
-void __init early_memtest(unsigned long start, unsigned long end)
-{
- unsigned int i;
- unsigned int idx = 0;
-
- if (!memtest_pattern)
- return;
-
- printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
- for (i = memtest_pattern-1; i < UINT_MAX; --i) {
- idx = i % ARRAY_SIZE(patterns);
- do_one_pass(patterns[idx], start, end);
- }
-}
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index df4552bd239e..9d518d693b4b 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -65,24 +65,23 @@ static int mmap_is_legacy(void)
return sysctl_legacy_va_layout;
}
-static unsigned long mmap_rnd(void)
+unsigned long arch_mmap_rnd(void)
{
- unsigned long rnd = 0;
+ unsigned long rnd;
/*
- * 8 bits of randomness in 32bit mmaps, 20 address space bits
- * 28 bits of randomness in 64bit mmaps, 40 address space bits
- */
- if (current->flags & PF_RANDOMIZE) {
- if (mmap_is_ia32())
- rnd = get_random_int() % (1<<8);
- else
- rnd = get_random_int() % (1<<28);
- }
+ * 8 bits of randomness in 32bit mmaps, 20 address space bits
+ * 28 bits of randomness in 64bit mmaps, 40 address space bits
+ */
+ if (mmap_is_ia32())
+ rnd = (unsigned long)get_random_int() % (1<<8);
+ else
+ rnd = (unsigned long)get_random_int() % (1<<28);
+
return rnd << PAGE_SHIFT;
}
-static unsigned long mmap_base(void)
+static unsigned long mmap_base(unsigned long rnd)
{
unsigned long gap = rlimit(RLIMIT_STACK);
@@ -91,19 +90,19 @@ static unsigned long mmap_base(void)
else if (gap > MAX_GAP)
gap = MAX_GAP;
- return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
+ return PAGE_ALIGN(TASK_SIZE - gap - rnd);
}
/*
* Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
* does, but not when emulating X86_32
*/
-static unsigned long mmap_legacy_base(void)
+static unsigned long mmap_legacy_base(unsigned long rnd)
{
if (mmap_is_ia32())
return TASK_UNMAPPED_BASE;
else
- return TASK_UNMAPPED_BASE + mmap_rnd();
+ return TASK_UNMAPPED_BASE + rnd;
}
/*
@@ -112,13 +111,18 @@ static unsigned long mmap_legacy_base(void)
*/
void arch_pick_mmap_layout(struct mm_struct *mm)
{
- mm->mmap_legacy_base = mmap_legacy_base();
- mm->mmap_base = mmap_base();
+ unsigned long random_factor = 0UL;
+
+ if (current->flags & PF_RANDOMIZE)
+ random_factor = arch_mmap_rnd();
+
+ mm->mmap_legacy_base = mmap_legacy_base(random_factor);
if (mmap_is_legacy()) {
mm->mmap_base = mm->mmap_legacy_base;
mm->get_unmapped_area = arch_get_unmapped_area;
} else {
+ mm->mmap_base = mmap_base(random_factor);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index cd4785bbacb9..4053bb58bf92 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -482,9 +482,16 @@ static void __init numa_clear_kernel_node_hotplug(void)
&memblock.reserved, mb->nid);
}
- /* Mark all kernel nodes. */
+ /*
+ * Mark all kernel nodes.
+ *
+ * When booting with mem=nn[kMG] or in a kdump kernel, numa_meminfo
+ * may not include all the memblock.reserved memory ranges because
+ * trim_snb_memory() reserves specific pages for Sandy Bridge graphics.
+ */
for_each_memblock(reserved, r)
- node_set(r->nid, numa_kernel_nodes);
+ if (r->nid != MAX_NUMNODES)
+ node_set(r->nid, numa_kernel_nodes);
/* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
for (i = 0; i < numa_meminfo.nr_blks; i++) {
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 536ea2fb6e33..89af288ec674 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -81,11 +81,9 @@ void arch_report_meminfo(struct seq_file *m)
seq_printf(m, "DirectMap4M: %8lu kB\n",
direct_pages_count[PG_LEVEL_2M] << 12);
#endif
-#ifdef CONFIG_X86_64
if (direct_gbpages)
seq_printf(m, "DirectMap1G: %8lu kB\n",
direct_pages_count[PG_LEVEL_1G] << 20);
-#endif
}
#else
static inline void split_page_count(int level) { }
@@ -1654,13 +1652,11 @@ int set_memory_ro(unsigned long addr, int numpages)
{
return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
}
-EXPORT_SYMBOL_GPL(set_memory_ro);
int set_memory_rw(unsigned long addr, int numpages)
{
return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
}
-EXPORT_SYMBOL_GPL(set_memory_rw);
int set_memory_np(unsigned long addr, int numpages)
{
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 7ac68698406c..35af6771a95a 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -610,7 +610,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
}
#ifdef CONFIG_STRICT_DEVMEM
-/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
+/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
return 1;
@@ -628,8 +628,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
while (cursor < to) {
if (!devmem_is_allowed(pfn)) {
- printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n",
- current->comm, from, to - 1);
+ printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
+ current->comm, from, to - 1);
return 0;
}
cursor += PAGE_SIZE;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 7b22adaad4f1..0b97d2c75df3 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,6 +4,7 @@
#include <asm/pgtable.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
+#include <asm/mtrr.h>
#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
@@ -58,7 +59,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
tlb_remove_page(tlb, pte);
}
-#if PAGETABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
struct page *page = virt_to_page(pmd);
@@ -74,14 +75,14 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
tlb_remove_page(tlb, page);
}
-#if PAGETABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pud));
}
-#endif /* PAGETABLE_LEVELS > 3 */
-#endif /* PAGETABLE_LEVELS > 2 */
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */
+#endif /* CONFIG_PGTABLE_LEVELS > 2 */
static inline void pgd_list_add(pgd_t *pgd)
{
@@ -117,9 +118,9 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
/* If the pgd points to a shared pagetable level (either the
ptes in non-PAE, or shared PMD in PAE), then just copy the
references from swapper_pg_dir. */
- if (PAGETABLE_LEVELS == 2 ||
- (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
- PAGETABLE_LEVELS == 4) {
+ if (CONFIG_PGTABLE_LEVELS == 2 ||
+ (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
+ CONFIG_PGTABLE_LEVELS == 4) {
clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
@@ -275,12 +276,87 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
}
}
+/*
+ * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also
+ * assumes that pgd should be in one page.
+ *
+ * But kernel with PAE paging that is not running as a Xen domain
+ * only needs to allocate 32 bytes for pgd instead of one page.
+ */
+#ifdef CONFIG_X86_PAE
+
+#include <linux/slab.h>
+
+#define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t))
+#define PGD_ALIGN 32
+
+static struct kmem_cache *pgd_cache;
+
+static int __init pgd_cache_init(void)
+{
+ /*
+ * When PAE kernel is running as a Xen domain, it does not use
+ * shared kernel pmd. And this requires a whole page for pgd.
+ */
+ if (!SHARED_KERNEL_PMD)
+ return 0;
+
+ /*
+ * when PAE kernel is not running as a Xen domain, it uses
+ * shared kernel pmd. Shared kernel pmd does not require a whole
+ * page for pgd. We are able to just allocate a 32-byte for pgd.
+ * During boot time, we create a 32-byte slab for pgd table allocation.
+ */
+ pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
+ SLAB_PANIC, NULL);
+ if (!pgd_cache)
+ return -ENOMEM;
+
+ return 0;
+}
+core_initcall(pgd_cache_init);
+
+static inline pgd_t *_pgd_alloc(void)
+{
+ /*
+ * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain.
+ * We allocate one page for pgd.
+ */
+ if (!SHARED_KERNEL_PMD)
+ return (pgd_t *)__get_free_page(PGALLOC_GFP);
+
+ /*
+ * Now PAE kernel is not running as a Xen domain. We can allocate
+ * a 32-byte slab for pgd to save memory space.
+ */
+ return kmem_cache_alloc(pgd_cache, PGALLOC_GFP);
+}
+
+static inline void _pgd_free(pgd_t *pgd)
+{
+ if (!SHARED_KERNEL_PMD)
+ free_page((unsigned long)pgd);
+ else
+ kmem_cache_free(pgd_cache, pgd);
+}
+#else
+static inline pgd_t *_pgd_alloc(void)
+{
+ return (pgd_t *)__get_free_page(PGALLOC_GFP);
+}
+
+static inline void _pgd_free(pgd_t *pgd)
+{
+ free_page((unsigned long)pgd);
+}
+#endif /* CONFIG_X86_PAE */
+
pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *pgd;
pmd_t *pmds[PREALLOCATED_PMDS];
- pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
+ pgd = _pgd_alloc();
if (pgd == NULL)
goto out;
@@ -310,7 +386,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
out_free_pmds:
free_pmds(mm, pmds);
out_free_pgd:
- free_page((unsigned long)pgd);
+ _pgd_free(pgd);
out:
return NULL;
}
@@ -320,7 +396,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
pgd_mop_up_pmds(mm, pgd);
pgd_dtor(pgd);
paravirt_pgd_free(mm, pgd);
- free_page((unsigned long)pgd);
+ _pgd_free(pgd);
}
/*
@@ -485,3 +561,67 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
{
__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
+{
+ u8 mtrr;
+
+ /*
+ * Do not use a huge page when the range is covered by non-WB type
+ * of MTRRs.
+ */
+ mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE);
+ if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+ return 0;
+
+ prot = pgprot_4k_2_large(prot);
+
+ set_pte((pte_t *)pud, pfn_pte(
+ (u64)addr >> PAGE_SHIFT,
+ __pgprot(pgprot_val(prot) | _PAGE_PSE)));
+
+ return 1;
+}
+
+int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
+{
+ u8 mtrr;
+
+ /*
+ * Do not use a huge page when the range is covered by non-WB type
+ * of MTRRs.
+ */
+ mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE);
+ if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+ return 0;
+
+ prot = pgprot_4k_2_large(prot);
+
+ set_pte((pte_t *)pmd, pfn_pte(
+ (u64)addr >> PAGE_SHIFT,
+ __pgprot(pgprot_val(prot) | _PAGE_PSE)));
+
+ return 1;
+}
+
+int pud_clear_huge(pud_t *pud)
+{
+ if (pud_large(*pud)) {
+ pud_clear(pud);
+ return 1;
+ }
+
+ return 0;
+}
+
+int pmd_clear_huge(pmd_t *pmd)
+{
+ if (pmd_large(*pmd)) {
+ pmd_clear(pmd);
+ return 1;
+ }
+
+ return 0;
+}
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 5d04be5efb64..4e664bdb535a 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -111,7 +111,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
{
struct stack_frame *head = (struct stack_frame *)frame_pointer(regs);
- if (!user_mode_vm(regs)) {
+ if (!user_mode(regs)) {
unsigned long stack = kernel_stack_pointer(regs);
if (depth)
dump_trace(NULL, regs, (unsigned long *)stack, 0,
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 2fb384724ebb..8fd6f44aee83 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -490,7 +490,9 @@ void pcibios_scan_root(int busnum)
if (!bus) {
pci_free_resource_list(&resources);
kfree(sd);
+ return;
}
+ pci_bus_add_devices(bus);
}
void __init pcibios_set_cache_line_size(void)
diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c
index d143d216d52b..d7f997f7c26d 100644
--- a/arch/x86/platform/efi/efi-bgrt.c
+++ b/arch/x86/platform/efi/efi-bgrt.c
@@ -67,7 +67,7 @@ void __init efi_bgrt_init(void)
image = efi_lookup_mapped_addr(bgrt_tab->image_address);
if (!image) {
- image = early_memremap(bgrt_tab->image_address,
+ image = early_ioremap(bgrt_tab->image_address,
sizeof(bmp_header));
ioremapped = true;
if (!image) {
@@ -89,7 +89,7 @@ void __init efi_bgrt_init(void)
}
if (ioremapped) {
- image = early_memremap(bgrt_tab->image_address,
+ image = early_ioremap(bgrt_tab->image_address,
bmp_header.size);
if (!image) {
pr_err("Ignoring BGRT: failed to map image memory\n");
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index dbc8627a5cdf..02744df576d5 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -85,12 +85,20 @@ static efi_status_t __init phys_efi_set_virtual_address_map(
efi_memory_desc_t *virtual_map)
{
efi_status_t status;
+ unsigned long flags;
+ pgd_t *save_pgd;
- efi_call_phys_prolog();
+ save_pgd = efi_call_phys_prolog();
+
+ /* Disable interrupts around EFI calls: */
+ local_irq_save(flags);
status = efi_call_phys(efi_phys.set_virtual_address_map,
memory_map_size, descriptor_size,
descriptor_version, virtual_map);
- efi_call_phys_epilog();
+ local_irq_restore(flags);
+
+ efi_call_phys_epilog(save_pgd);
+
return status;
}
@@ -491,7 +499,8 @@ void __init efi_init(void)
if (efi_memmap_init())
return;
- print_efi_memmap();
+ if (efi_enabled(EFI_DBG))
+ print_efi_memmap();
}
void __init efi_late_init(void)
@@ -939,6 +948,8 @@ static int __init arch_parse_efi_cmdline(char *str)
{
if (parse_option_str(str, "old_map"))
set_bit(EFI_OLD_MEMMAP, &efi.flags);
+ if (parse_option_str(str, "debug"))
+ set_bit(EFI_DBG, &efi.flags);
return 0;
}
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index 40e7cda52936..ed5b67338294 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -33,11 +33,10 @@
/*
* To make EFI call EFI runtime service in physical addressing mode we need
- * prolog/epilog before/after the invocation to disable interrupt, to
- * claim EFI runtime service handler exclusively and to duplicate a memory in
- * low memory space say 0 - 3G.
+ * prolog/epilog before/after the invocation to claim the EFI runtime service
+ * handler exclusively and to duplicate a memory mapping in low memory space,
+ * say 0 - 3G.
*/
-static unsigned long efi_rt_eflags;
void efi_sync_low_kernel_mappings(void) {}
void __init efi_dump_pagetable(void) {}
@@ -57,21 +56,24 @@ void __init efi_map_region(efi_memory_desc_t *md)
void __init efi_map_region_fixed(efi_memory_desc_t *md) {}
void __init parse_efi_setup(u64 phys_addr, u32 data_len) {}
-void __init efi_call_phys_prolog(void)
+pgd_t * __init efi_call_phys_prolog(void)
{
struct desc_ptr gdt_descr;
+ pgd_t *save_pgd;
- local_irq_save(efi_rt_eflags);
-
+ /* Current pgd is swapper_pg_dir, we'll restore it later: */
+ save_pgd = swapper_pg_dir;
load_cr3(initial_page_table);
__flush_tlb_all();
gdt_descr.address = __pa(get_cpu_gdt_table(0));
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
+
+ return save_pgd;
}
-void __init efi_call_phys_epilog(void)
+void __init efi_call_phys_epilog(pgd_t *save_pgd)
{
struct desc_ptr gdt_descr;
@@ -79,10 +81,8 @@ void __init efi_call_phys_epilog(void)
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
- load_cr3(swapper_pg_dir);
+ load_cr3(save_pgd);
__flush_tlb_all();
-
- local_irq_restore(efi_rt_eflags);
}
void __init efi_runtime_mkexec(void)
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 17e80d829df0..a0ac0f9c307f 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -41,9 +41,6 @@
#include <asm/realmode.h>
#include <asm/time.h>
-static pgd_t *save_pgd __initdata;
-static unsigned long efi_flags __initdata;
-
/*
* We allocate runtime services regions bottom-up, starting from -4G, i.e.
* 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G.
@@ -78,17 +75,18 @@ static void __init early_code_mapping_set_exec(int executable)
}
}
-void __init efi_call_phys_prolog(void)
+pgd_t * __init efi_call_phys_prolog(void)
{
unsigned long vaddress;
+ pgd_t *save_pgd;
+
int pgd;
int n_pgds;
if (!efi_enabled(EFI_OLD_MEMMAP))
- return;
+ return NULL;
early_code_mapping_set_exec(1);
- local_irq_save(efi_flags);
n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE);
save_pgd = kmalloc(n_pgds * sizeof(pgd_t), GFP_KERNEL);
@@ -99,24 +97,29 @@ void __init efi_call_phys_prolog(void)
set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress));
}
__flush_tlb_all();
+
+ return save_pgd;
}
-void __init efi_call_phys_epilog(void)
+void __init efi_call_phys_epilog(pgd_t *save_pgd)
{
/*
* After the lock is released, the original page table is restored.
*/
- int pgd;
- int n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
+ int pgd_idx;
+ int nr_pgds;
- if (!efi_enabled(EFI_OLD_MEMMAP))
+ if (!save_pgd)
return;
- for (pgd = 0; pgd < n_pgds; pgd++)
- set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]);
+ nr_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
+
+ for (pgd_idx = 0; pgd_idx < nr_pgds; pgd_idx++)
+ set_pgd(pgd_offset_k(pgd_idx * PGDIR_SIZE), save_pgd[pgd_idx]);
+
kfree(save_pgd);
+
__flush_tlb_all();
- local_irq_restore(efi_flags);
early_code_mapping_set_exec(0);
}
diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c
index c9a0838890e2..278e4da4222f 100644
--- a/arch/x86/platform/intel-quark/imr_selftest.c
+++ b/arch/x86/platform/intel-quark/imr_selftest.c
@@ -11,6 +11,7 @@
*/
#include <asm-generic/sections.h>
+#include <asm/cpu_device_id.h>
#include <asm/imr.h>
#include <linux/init.h>
#include <linux/mm.h>
@@ -101,6 +102,12 @@ static void __init imr_self_test(void)
}
}
+static const struct x86_cpu_id imr_ids[] __initconst = {
+ { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, imr_ids);
+
/**
* imr_self_test_init - entry point for IMR driver.
*
@@ -108,7 +115,8 @@ static void __init imr_self_test(void)
*/
static int __init imr_self_test_init(void)
{
- imr_self_test();
+ if (x86_match_cpu(imr_ids))
+ imr_self_test();
return 0;
}
diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c
index 9a2e590dd202..7fa8b3b53bc0 100644
--- a/arch/x86/platform/olpc/olpc-xo1-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo1-sci.c
@@ -61,7 +61,7 @@ static void battery_status_changed(void)
if (psy) {
power_supply_changed(psy);
- put_device(psy->dev);
+ power_supply_put(psy);
}
}
@@ -71,7 +71,7 @@ static void ac_status_changed(void)
if (psy) {
power_supply_changed(psy);
- put_device(psy->dev);
+ power_supply_put(psy);
}
}
diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
index 08e350e757dc..55130846ac87 100644
--- a/arch/x86/platform/olpc/olpc-xo15-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -83,7 +83,7 @@ static void battery_status_changed(void)
if (psy) {
power_supply_changed(psy);
- put_device(psy->dev);
+ power_supply_put(psy);
}
}
@@ -93,7 +93,7 @@ static void ac_status_changed(void)
if (psy) {
power_supply_changed(psy);
- put_device(psy->dev);
+ power_supply_put(psy);
}
}
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 994798548b1a..3b6ec42718e4 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -415,7 +415,7 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
struct reset_args reset_args;
reset_args.sender = sender;
- cpus_clear(*mask);
+ cpumask_clear(mask);
/* find a single cpu for each uvhub in this distribution mask */
maskbits = sizeof(struct pnmask) * BITSPERBYTE;
/* each bit is a pnode relative to the partition base pnode */
@@ -425,7 +425,7 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
continue;
apnode = pnode + bcp->partition_base_pnode;
cpu = pnode_to_first_cpu(apnode, smaster);
- cpu_set(cpu, *mask);
+ cpumask_set_cpu(cpu, mask);
}
/* IPI all cpus; preemption is already disabled */
@@ -1126,7 +1126,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
/* don't actually do a shootdown of the local cpu */
cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
- if (cpu_isset(cpu, *cpumask))
+ if (cpumask_test_cpu(cpu, cpumask))
stat->s_ntargself++;
bau_desc = bcp->descriptor_base;
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 3e32ed5648a0..757678fb26e1 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -134,7 +134,7 @@ static void do_fpu_end(void)
static void fix_processor_context(void)
{
int cpu = smp_processor_id();
- struct tss_struct *t = &per_cpu(init_tss, cpu);
+ struct tss_struct *t = &per_cpu(cpu_tss, cpu);
#ifdef CONFIG_X86_64
struct desc_struct *desc = get_cpu_gdt_table(cpu);
tss_desc tss;
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index b3560ece1c9f..ef8187f9d28d 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -119,7 +119,7 @@
110 i386 iopl sys_iopl
111 i386 vhangup sys_vhangup
112 i386 idle
-113 i386 vm86old sys_vm86old sys32_vm86_warning
+113 i386 vm86old sys_vm86old sys_ni_syscall
114 i386 wait4 sys_wait4 compat_sys_wait4
115 i386 swapoff sys_swapoff
116 i386 sysinfo sys_sysinfo compat_sys_sysinfo
@@ -172,7 +172,7 @@
163 i386 mremap sys_mremap
164 i386 setresuid sys_setresuid16
165 i386 getresuid sys_getresuid16
-166 i386 vm86 sys_vm86 sys32_vm86_warning
+166 i386 vm86 sys_vm86 sys_ni_syscall
167 i386 query_module
168 i386 poll sys_poll
169 i386 nfsservctl
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 8d656fbb57aa..9ef32d5f1b19 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -178,7 +178,7 @@
169 common reboot sys_reboot
170 common sethostname sys_sethostname
171 common setdomainname sys_setdomainname
-172 common iopl stub_iopl
+172 common iopl sys_iopl
173 common ioperm sys_ioperm
174 64 create_module
175 common init_module sys_init_module
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 2d7d9a1f5b53..8ffd2146fa6a 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -64,8 +64,8 @@
*/
static inline void rdtsc_barrier(void)
{
- alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
- alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+ alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
+ "lfence", X86_FEATURE_LFENCE_RDTSC);
}
#endif
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index 5cdfa9db2217..a75d8700472a 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -16,7 +16,7 @@
*/
/* Not going to be implemented by UML, since we have no hardware. */
-#define stub_iopl sys_ni_syscall
+#define sys_iopl sys_ni_syscall
#define sys_ioperm sys_ni_syscall
/*
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 7b9be9822724..275a3a8b78af 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -51,7 +51,7 @@ VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
$(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE
$(call if_changed,vdso)
-HOST_EXTRACFLAGS += -I$(srctree)/tools/include
+HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi
hostprogs-y += vdso2c
quiet_cmd_vdso2c = VDSO2C $@
@@ -206,4 +206,4 @@ $(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE
PHONY += vdso_install $(vdso_img_insttargets)
vdso_install: $(vdso_img_insttargets) FORCE
-clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64*
+clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* vdso-image-*.c vdsox32.so*
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 9793322751e0..40d2473836c9 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -82,18 +82,15 @@ static notrace cycle_t vread_pvclock(int *mode)
cycle_t ret;
u64 last;
u32 version;
+ u32 migrate_count;
u8 flags;
unsigned cpu, cpu1;
/*
- * Note: hypervisor must guarantee that:
- * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
- * 2. that per-CPU pvclock time info is updated if the
- * underlying CPU changes.
- * 3. that version is increased whenever underlying CPU
- * changes.
- *
+ * When looping to get a consistent (time-info, tsc) pair, we
+ * also need to deal with the possibility we can switch vcpus,
+ * so make sure we always re-fetch time-info for the current vcpu.
*/
do {
cpu = __getcpu() & VGETCPU_CPU_MASK;
@@ -102,20 +99,27 @@ static notrace cycle_t vread_pvclock(int *mode)
* __getcpu() calls (Gleb).
*/
- pvti = get_pvti(cpu);
+ /* Make sure migrate_count will change if we leave the VCPU. */
+ do {
+ pvti = get_pvti(cpu);
+ migrate_count = pvti->migrate_count;
+
+ cpu1 = cpu;
+ cpu = __getcpu() & VGETCPU_CPU_MASK;
+ } while (unlikely(cpu != cpu1));
version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
/*
* Test we're still on the cpu as well as the version.
- * We could have been migrated just after the first
- * vgetcpu but before fetching the version, so we
- * wouldn't notice a version change.
+ * - We must read TSC of pvti's VCPU.
+ * - KVM doesn't follow the versioning protocol, so data could
+ * change before version if we left the VCPU.
*/
- cpu1 = __getcpu() & VGETCPU_CPU_MASK;
- } while (unlikely(cpu != cpu1 ||
- (pvti->pvti.version & 1) ||
- pvti->pvti.version != version));
+ smp_rmb();
+ } while (unlikely((pvti->pvti.version & 1) ||
+ pvti->pvti.version != version ||
+ pvti->migrate_count != migrate_count));
if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
*mode = VCLOCK_NONE;
diff --git a/arch/x86/vdso/vdso32/syscall.S b/arch/x86/vdso/vdso32/syscall.S
index 5415b5613d55..6b286bb5251c 100644
--- a/arch/x86/vdso/vdso32/syscall.S
+++ b/arch/x86/vdso/vdso32/syscall.S
@@ -19,8 +19,6 @@ __kernel_vsyscall:
.Lpush_ebp:
movl %ecx, %ebp
syscall
- movl $__USER32_DS, %ecx
- movl %ecx, %ss
movl %ebp, %ecx
popl %ebp
.Lpop_ebp:
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5240f563076d..81665c9f2132 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -912,6 +912,7 @@ static void xen_load_sp0(struct tss_struct *tss,
mcs = xen_mc_entry(0);
MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
xen_mc_issue(PARAVIRT_LAZY_CPU);
+ tss->x86_tss.sp0 = thread->sp0;
}
static void xen_set_iopl_mask(unsigned mask)
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index adca9e2b6553..65083ad63b6f 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -502,7 +502,7 @@ __visible pmd_t xen_make_pmd(pmdval_t pmd)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
__visible pudval_t xen_pud_val(pud_t pud)
{
return pte_mfn_to_pfn(pud.pud);
@@ -589,7 +589,7 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val)
xen_mc_issue(PARAVIRT_LAZY_MMU);
}
-#endif /* PAGETABLE_LEVELS == 4 */
+#endif /* CONFIG_PGTABLE_LEVELS == 4 */
/*
* (Yet another) pagetable walker. This one is intended for pinning a
@@ -1628,7 +1628,7 @@ static void xen_release_pmd(unsigned long pfn)
xen_release_ptpage(pfn, PT_PMD);
}
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PUD);
@@ -2046,7 +2046,7 @@ static void __init xen_post_allocator_init(void)
pv_mmu_ops.set_pte = xen_set_pte;
pv_mmu_ops.set_pmd = xen_set_pmd;
pv_mmu_ops.set_pud = xen_set_pud;
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
pv_mmu_ops.set_pgd = xen_set_pgd;
#endif
@@ -2056,7 +2056,7 @@ static void __init xen_post_allocator_init(void)
pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
pv_mmu_ops.release_pte = xen_release_pte;
pv_mmu_ops.release_pmd = xen_release_pmd;
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
pv_mmu_ops.alloc_pud = xen_alloc_pud;
pv_mmu_ops.release_pud = xen_release_pud;
#endif
@@ -2122,14 +2122,14 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
.pud_val = PV_CALLEE_SAVE(xen_pud_val),
.make_pud = PV_CALLEE_SAVE(xen_make_pud),
.set_pgd = xen_set_pgd_hyper,
.alloc_pud = xen_alloc_pmd_init,
.release_pud = xen_release_pmd_init,
-#endif /* PAGETABLE_LEVELS == 4 */
+#endif /* CONFIG_PGTABLE_LEVELS == 4 */
.activate_mm = xen_activate_mm,
.dup_mmap = xen_dup_mmap,
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 08e8489c47f1..86484384492e 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -90,14 +90,10 @@ static void cpu_bringup(void)
set_cpu_online(cpu, true);
- this_cpu_write(cpu_state, CPU_ONLINE);
-
- wmb();
+ cpu_set_state_online(cpu); /* Implies full memory barrier. */
/* We can take interrupts now: we're officially "up". */
local_irq_enable();
-
- wmb(); /* make sure everything is out */
}
/*
@@ -445,21 +441,19 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
{
int rc;
- per_cpu(current_task, cpu) = idle;
-#ifdef CONFIG_X86_32
- irq_ctx_init(cpu);
-#else
- clear_tsk_thread_flag(idle, TIF_FORK);
-#endif
- per_cpu(kernel_stack, cpu) =
- (unsigned long)task_stack_page(idle) -
- KERNEL_STACK_OFFSET + THREAD_SIZE;
+ common_cpu_up(cpu, idle);
xen_setup_runstate_info(cpu);
xen_setup_timer(cpu);
xen_init_lock_cpu(cpu);
- per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+ /*
+ * PV VCPUs are always successfully taken down (see 'while' loop
+ * in xen_cpu_die()), so -EBUSY is an error.
+ */
+ rc = cpu_check_up_prepare(cpu);
+ if (rc)
+ return rc;
/* make sure interrupts start blocked */
per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -468,10 +462,6 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
if (rc)
return rc;
- if (num_online_cpus() == 1)
- /* Just in case we booted with a single CPU. */
- alternatives_enable_smp();
-
rc = xen_smp_intr_init(cpu);
if (rc)
return rc;
@@ -479,10 +469,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
BUG_ON(rc);
- while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
+ while (cpu_report_state(cpu) != CPU_ONLINE)
HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
- barrier();
- }
return 0;
}
@@ -511,11 +499,11 @@ static void xen_cpu_die(unsigned int cpu)
schedule_timeout(HZ/10);
}
- cpu_die_common(cpu);
-
- xen_smp_intr_free(cpu);
- xen_uninit_lock_cpu(cpu);
- xen_teardown_timer(cpu);
+ if (common_cpu_die(cpu) == 0) {
+ xen_smp_intr_free(cpu);
+ xen_uninit_lock_cpu(cpu);
+ xen_teardown_timer(cpu);
+ }
}
static void xen_play_dead(void) /* used only with HOTPLUG_CPU */
@@ -747,6 +735,16 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
{
int rc;
+
+ /*
+ * This can happen if CPU was offlined earlier and
+ * offlining timed out in common_cpu_die().
+ */
+ if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) {
+ xen_smp_intr_free(cpu);
+ xen_uninit_lock_cpu(cpu);
+ }
+
/*
* xen_smp_intr_init() needs to run before native_cpu_up()
* so that IPI vectors are set up on the booting CPU before
@@ -768,12 +766,6 @@ static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
return rc;
}
-static void xen_hvm_cpu_die(unsigned int cpu)
-{
- xen_cpu_die(cpu);
- native_cpu_die(cpu);
-}
-
void __init xen_hvm_smp_init(void)
{
if (!xen_have_vector_callback)
@@ -781,7 +773,7 @@ void __init xen_hvm_smp_init(void)
smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
smp_ops.cpu_up = xen_hvm_cpu_up;
- smp_ops.cpu_die = xen_hvm_cpu_die;
+ smp_ops.cpu_die = xen_cpu_die;
smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu;
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index c4df9dbd63b7..d9497698645a 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -1,5 +1,5 @@
#include <linux/types.h>
-#include <linux/clockchips.h>
+#include <linux/tick.h>
#include <xen/interface/xen.h>
#include <xen/grant_table.h>
@@ -81,17 +81,14 @@ void xen_arch_post_suspend(int cancelled)
static void xen_vcpu_notify_restore(void *data)
{
- unsigned long reason = (unsigned long)data;
-
/* Boot processor notified via generic timekeeping_resume() */
- if ( smp_processor_id() == 0)
+ if (smp_processor_id() == 0)
return;
- clockevents_notify(reason, NULL);
+ tick_resume_local();
}
void xen_arch_resume(void)
{
- on_each_cpu(xen_vcpu_notify_restore,
- (void *)CLOCK_EVT_NOTIFY_RESUME, 1);
+ on_each_cpu(xen_vcpu_notify_restore, NULL, 1);
}
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 53adefda4275..985fc3ee0973 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -68,11 +68,11 @@ ENTRY(xen_sysret64)
* We're already on the usermode stack at this point, but
* still with the kernel gs, so we can easily switch back
*/
- movq %rsp, PER_CPU_VAR(old_rsp)
+ movq %rsp, PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(kernel_stack), %rsp
pushq $__USER_DS
- pushq PER_CPU_VAR(old_rsp)
+ pushq PER_CPU_VAR(rsp_scratch)
pushq %r11
pushq $__USER_CS
pushq %rcx
@@ -87,11 +87,11 @@ ENTRY(xen_sysret32)
* We're already on the usermode stack at this point, but
* still with the kernel gs, so we can easily switch back
*/
- movq %rsp, PER_CPU_VAR(old_rsp)
+ movq %rsp, PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(kernel_stack), %rsp
pushq $__USER32_DS
- pushq PER_CPU_VAR(old_rsp)
+ pushq PER_CPU_VAR(rsp_scratch)
pushq %r11
pushq $__USER32_CS
pushq %rcx