diff options
Diffstat (limited to 'arch/x86')
154 files changed, 4126 insertions, 3699 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 3942f74c92d7..1538562cc720 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -1,3 +1,6 @@ + +obj-y += entry/ + obj-$(CONFIG_KVM) += kvm/ # Xen paravirtualization support @@ -11,7 +14,7 @@ obj-y += kernel/ obj-y += mm/ obj-y += crypto/ -obj-y += vdso/ + obj-$(CONFIG_IA32_EMULATION) += ia32/ obj-y += platform/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6bbb991d0f3c..7e39f9b22705 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -9,140 +9,141 @@ config 64BIT config X86_32 def_bool y depends on !64BIT - select CLKSRC_I8253 - select HAVE_UID16 config X86_64 def_bool y depends on 64BIT - select X86_DEV_DMA_OPS - select ARCH_USE_CMPXCHG_LOCKREF - select HAVE_LIVEPATCH ### Arch settings config X86 def_bool y - select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI - select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI + select ACPI_LEGACY_TABLES_LOOKUP if ACPI + select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI + select ANON_INODES + select ARCH_CLOCKSOURCE_DATA + select ARCH_DISCARD_MEMBLOCK + select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_SG_CHAIN + select ARCH_HAVE_NMI_SAFE_CMPXCHG + select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO - select HAVE_AOUT if X86_32 - select HAVE_UNSTABLE_SCHED_CLOCK - select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 - select ARCH_SUPPORTS_INT128 if X86_64 - select HAVE_IDE - select HAVE_OPROFILE - select HAVE_PCSPKR_PLATFORM - select HAVE_PERF_EVENTS - select HAVE_IOREMAP_PROT - select HAVE_KPROBES - select HAVE_MEMBLOCK - select HAVE_MEMBLOCK_NODE_MAP - select ARCH_DISCARD_MEMBLOCK - select ARCH_WANT_OPTIONAL_GPIOLIB + select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_INT128 if X86_64 + select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF if X86_64 + select ARCH_USE_QUEUED_RWLOCKS + select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_FRAME_POINTERS - select HAVE_DMA_ATTRS - select HAVE_DMA_CONTIGUOUS - select HAVE_KRETPROBES + select ARCH_WANT_IPC_PARSE_VERSION if X86_32 + select ARCH_WANT_OPTIONAL_GPIOLIB + select BUILDTIME_EXTABLE_SORT + select CLKEVT_I8253 + select CLKSRC_I8253 if X86_32 + select CLOCKSOURCE_VALIDATE_LAST_CYCLE + select CLOCKSOURCE_WATCHDOG + select CLONE_BACKWARDS if X86_32 + select COMPAT_OLD_SIGACTION if IA32_EMULATION + select DCACHE_WORD_ACCESS + select GENERIC_CLOCKEVENTS + select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) + select GENERIC_CLOCKEVENTS_MIN_ADJUST + select GENERIC_CMOS_UPDATE + select GENERIC_CPU_AUTOPROBE select GENERIC_EARLY_IOREMAP - select HAVE_OPTPROBES - select HAVE_KPROBES_ON_FTRACE - select HAVE_FTRACE_MCOUNT_RECORD - select HAVE_FENTRY if X86_64 + select GENERIC_FIND_FIRST_BIT + select GENERIC_IOMAP + select GENERIC_IRQ_PROBE + select GENERIC_IRQ_SHOW + select GENERIC_PENDING_IRQ if SMP + select GENERIC_SMP_IDLE_THREAD + select GENERIC_STRNCPY_FROM_USER + select GENERIC_STRNLEN_USER + select GENERIC_TIME_VSYSCALL + select HAVE_ACPI_APEI if ACPI + select HAVE_ACPI_APEI_NMI if ACPI + select HAVE_ALIGNED_STRUCT_PAGE if SLUB + select HAVE_AOUT if X86_32 + select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE + select HAVE_ARCH_JUMP_LABEL + select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP + select HAVE_ARCH_KGDB + select HAVE_ARCH_KMEMCHECK + select HAVE_ARCH_SECCOMP_FILTER + select HAVE_ARCH_SOFT_DIRTY if X86_64 + select HAVE_ARCH_TRACEHOOK + select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_BPF_JIT if X86_64 + select HAVE_CC_STACKPROTECTOR + select HAVE_CMPXCHG_DOUBLE + select HAVE_CMPXCHG_LOCAL + select HAVE_CONTEXT_TRACKING if X86_64 select HAVE_C_RECORDMCOUNT + select HAVE_DEBUG_KMEMLEAK + select HAVE_DEBUG_STACKOVERFLOW + select HAVE_DMA_API_DEBUG + select HAVE_DMA_ATTRS + select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS - select HAVE_FUNCTION_TRACER - select HAVE_FUNCTION_GRAPH_TRACER - select HAVE_FUNCTION_GRAPH_FP_TEST - select HAVE_SYSCALL_TRACEPOINTS - select SYSCTL_EXCEPTION_TRACE - select HAVE_KVM - select HAVE_ARCH_KGDB - select HAVE_ARCH_TRACEHOOK - select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS - select USER_STACKTRACE_SUPPORT - select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_DMA_API_DEBUG - select HAVE_KERNEL_GZIP + select HAVE_FENTRY if X86_64 + select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_FUNCTION_GRAPH_FP_TEST + select HAVE_FUNCTION_GRAPH_TRACER + select HAVE_FUNCTION_TRACER + select HAVE_GENERIC_DMA_COHERENT if X86_32 + select HAVE_HW_BREAKPOINT + select HAVE_IDE + select HAVE_IOREMAP_PROT + select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64 + select HAVE_IRQ_TIME_ACCOUNTING select HAVE_KERNEL_BZIP2 + select HAVE_KERNEL_GZIP + select HAVE_KERNEL_LZ4 select HAVE_KERNEL_LZMA - select HAVE_KERNEL_XZ select HAVE_KERNEL_LZO - select HAVE_KERNEL_LZ4 - select HAVE_HW_BREAKPOINT + select HAVE_KERNEL_XZ + select HAVE_KPROBES + select HAVE_KPROBES_ON_FTRACE + select HAVE_KRETPROBES + select HAVE_KVM + select HAVE_LIVEPATCH if X86_64 + select HAVE_MEMBLOCK + select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS - select PERF_EVENTS + select HAVE_OPROFILE + select HAVE_OPTPROBES + select HAVE_PCSPKR_PLATFORM + select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP - select HAVE_DEBUG_KMEMLEAK - select ANON_INODES - select HAVE_ALIGNED_STRUCT_PAGE if SLUB - select HAVE_CMPXCHG_LOCAL - select HAVE_CMPXCHG_DOUBLE - select HAVE_ARCH_KMEMCHECK - select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP + select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_SYSCALL_TRACEPOINTS + select HAVE_UID16 if X86_32 + select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER - select ARCH_HAS_ELF_RANDOMIZE - select HAVE_ARCH_JUMP_LABEL - select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE - select SPARSE_IRQ - select GENERIC_FIND_FIRST_BIT - select GENERIC_IRQ_PROBE - select GENERIC_PENDING_IRQ if SMP - select GENERIC_IRQ_SHOW - select GENERIC_CLOCKEVENTS_MIN_ADJUST select IRQ_FORCED_THREADING - select HAVE_BPF_JIT if X86_64 - select HAVE_ARCH_TRANSPARENT_HUGEPAGE - select HAVE_ARCH_HUGE_VMAP if X86_64 || (X86_32 && X86_PAE) - select ARCH_HAS_SG_CHAIN - select CLKEVT_I8253 - select ARCH_HAVE_NMI_SAFE_CMPXCHG - select GENERIC_IOMAP - select DCACHE_WORD_ACCESS - select GENERIC_SMP_IDLE_THREAD - select ARCH_WANT_IPC_PARSE_VERSION if X86_32 - select HAVE_ARCH_SECCOMP_FILTER - select BUILDTIME_EXTABLE_SORT - select GENERIC_CMOS_UPDATE - select HAVE_ARCH_SOFT_DIRTY if X86_64 - select CLOCKSOURCE_WATCHDOG - select GENERIC_CLOCKEVENTS - select ARCH_CLOCKSOURCE_DATA - select CLOCKSOURCE_VALIDATE_LAST_CYCLE - select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) - select GENERIC_TIME_VSYSCALL - select GENERIC_STRNCPY_FROM_USER - select GENERIC_STRNLEN_USER - select HAVE_CONTEXT_TRACKING if X86_64 - select HAVE_IRQ_TIME_ACCOUNTING - select VIRT_TO_BUS - select MODULES_USE_ELF_REL if X86_32 - select MODULES_USE_ELF_RELA if X86_64 - select CLONE_BACKWARDS if X86_32 - select ARCH_USE_BUILTIN_BSWAP - select ARCH_USE_QUEUE_RWLOCK - select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION - select OLD_SIGACTION if X86_32 - select COMPAT_OLD_SIGACTION if IA32_EMULATION + select MODULES_USE_ELF_RELA if X86_64 + select MODULES_USE_ELF_REL if X86_32 + select OLD_SIGACTION if X86_32 + select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION + select PERF_EVENTS select RTC_LIB - select HAVE_DEBUG_STACKOVERFLOW - select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64 - select HAVE_CC_STACKPROTECTOR - select GENERIC_CPU_AUTOPROBE - select HAVE_ARCH_AUDITSYSCALL - select ARCH_SUPPORTS_ATOMIC_RMW - select HAVE_ACPI_APEI if ACPI - select HAVE_ACPI_APEI_NMI if ACPI - select ACPI_LEGACY_TABLES_LOOKUP if ACPI - select X86_FEATURE_NAMES if PROC_FS + select SPARSE_IRQ select SRCU + select SYSCTL_EXCEPTION_TRACE + select USER_STACKTRACE_SUPPORT + select VIRT_TO_BUS + select X86_DEV_DMA_OPS if X86_64 + select X86_FEATURE_NAMES if PROC_FS config INSTRUCTION_DECODER def_bool y @@ -260,10 +261,6 @@ config X86_64_SMP def_bool y depends on X86_64 && SMP -config X86_HT - def_bool y - depends on SMP - config X86_32_LAZY_GS def_bool y depends on X86_32 && !CC_STACKPROTECTOR @@ -441,6 +438,7 @@ config X86_UV depends on X86_EXTENDED_PLATFORM depends on NUMA depends on X86_X2APIC + depends on PCI ---help--- This option is needed in order to support SGI Ultraviolet systems. If you don't have one of these, you should say N here. @@ -665,7 +663,7 @@ config PARAVIRT_DEBUG config PARAVIRT_SPINLOCKS bool "Paravirtualization layer for spinlocks" depends on PARAVIRT && SMP - select UNINLINE_SPIN_UNLOCK + select UNINLINE_SPIN_UNLOCK if !QUEUED_SPINLOCKS ---help--- Paravirtualized spinlocks allow a pvops backend to replace the spinlock implementation with something virtualization-friendly @@ -850,11 +848,12 @@ config NR_CPUS default "1" if !SMP default "8192" if MAXSMP default "32" if SMP && X86_BIGSMP - default "8" if SMP + default "8" if SMP && X86_32 + default "64" if SMP ---help--- This allows you to specify the maximum number of CPUs which this kernel will support. If CPUMASK_OFFSTACK is enabled, the maximum - supported value is 4096, otherwise the maximum value is 512. The + supported value is 8192, otherwise the maximum value is 512. The minimum value which makes sense is 2. This is purely to save memory - each supported CPU adds @@ -862,7 +861,7 @@ config NR_CPUS config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" - depends on X86_HT + depends on SMP ---help--- SMT scheduler support improves the CPU scheduler's decision making when dealing with Intel Pentium 4 chips with HyperThreading at a @@ -872,7 +871,7 @@ config SCHED_SMT config SCHED_MC def_bool y prompt "Multi-core scheduler support" - depends on X86_HT + depends on SMP ---help--- Multi-core scheduler support improves the CPU scheduler's decision making when dealing with multi-core CPU chips at a cost of slightly diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 72484a645f05..a5973f851750 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -332,4 +332,15 @@ config X86_DEBUG_STATIC_CPU_HAS If unsure, say N. +config PUNIT_ATOM_DEBUG + tristate "ATOM Punit debug driver" + select DEBUG_FS + select IOSF_MBI + ---help--- + This is a debug driver, which gets the power states + of all Punit North Complex devices. The power states of + each device is exposed as part of the debugfs interface. + The current power state can be read from + /sys/kernel/debug/punit_atom/dev_power_state + endmenu diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 57996ee840dd..118e6debc483 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -149,12 +149,6 @@ endif sp-$(CONFIG_X86_32) := esp sp-$(CONFIG_X86_64) := rsp -# do binutils support CFI? -cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1) -# is .cfi_signal_frame supported too? -cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1) -cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1) - # does binutils support specific instructions? asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1) @@ -162,8 +156,8 @@ asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1) avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1) -KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) -KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) +KBUILD_AFLAGS += $(asinstr) $(avx_instr) $(avx2_instr) +KBUILD_CFLAGS += $(asinstr) $(avx_instr) $(avx2_instr) LDFLAGS := -m elf_$(UTS_MACHINE) @@ -187,7 +181,7 @@ archscripts: scripts_basic # Syscall table generation archheaders: - $(Q)$(MAKE) $(build)=arch/x86/syscalls all + $(Q)$(MAKE) $(build)=arch/x86/entry/syscalls all archprepare: ifeq ($(CONFIG_KEXEC_FILE),y) @@ -250,7 +244,7 @@ install: PHONY += vdso_install vdso_install: - $(Q)$(MAKE) $(build)=arch/x86/vdso $@ + $(Q)$(MAKE) $(build)=arch/x86/entry/vdso $@ archclean: $(Q)rm -rf $(objtree)/arch/i386 diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile new file mode 100644 index 000000000000..7a144971db79 --- /dev/null +++ b/arch/x86/entry/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for the x86 low level entry code +# +obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o + +obj-y += vdso/ +obj-y += vsyscall/ + +obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o + diff --git a/arch/x86/include/asm/calling.h b/arch/x86/entry/calling.h index 1c8b50edb2db..f4e6308c4200 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/entry/calling.h @@ -46,8 +46,6 @@ For 32-bit we have the following conventions - kernel is built with */ -#include <asm/dwarf2.h> - #ifdef CONFIG_X86_64 /* @@ -91,28 +89,27 @@ For 32-bit we have the following conventions - kernel is built with #define SIZEOF_PTREGS 21*8 .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 - subq $15*8+\addskip, %rsp - CFI_ADJUST_CFA_OFFSET 15*8+\addskip + addq $-(15*8+\addskip), %rsp .endm .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1 .if \r11 - movq_cfi r11, 6*8+\offset + movq %r11, 6*8+\offset(%rsp) .endif .if \r8910 - movq_cfi r10, 7*8+\offset - movq_cfi r9, 8*8+\offset - movq_cfi r8, 9*8+\offset + movq %r10, 7*8+\offset(%rsp) + movq %r9, 8*8+\offset(%rsp) + movq %r8, 9*8+\offset(%rsp) .endif .if \rax - movq_cfi rax, 10*8+\offset + movq %rax, 10*8+\offset(%rsp) .endif .if \rcx - movq_cfi rcx, 11*8+\offset + movq %rcx, 11*8+\offset(%rsp) .endif - movq_cfi rdx, 12*8+\offset - movq_cfi rsi, 13*8+\offset - movq_cfi rdi, 14*8+\offset + movq %rdx, 12*8+\offset(%rsp) + movq %rsi, 13*8+\offset(%rsp) + movq %rdi, 14*8+\offset(%rsp) .endm .macro SAVE_C_REGS offset=0 SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 @@ -131,24 +128,24 @@ For 32-bit we have the following conventions - kernel is built with .endm .macro SAVE_EXTRA_REGS offset=0 - movq_cfi r15, 0*8+\offset - movq_cfi r14, 1*8+\offset - movq_cfi r13, 2*8+\offset - movq_cfi r12, 3*8+\offset - movq_cfi rbp, 4*8+\offset - movq_cfi rbx, 5*8+\offset + movq %r15, 0*8+\offset(%rsp) + movq %r14, 1*8+\offset(%rsp) + movq %r13, 2*8+\offset(%rsp) + movq %r12, 3*8+\offset(%rsp) + movq %rbp, 4*8+\offset(%rsp) + movq %rbx, 5*8+\offset(%rsp) .endm .macro SAVE_EXTRA_REGS_RBP offset=0 - movq_cfi rbp, 4*8+\offset + movq %rbp, 4*8+\offset(%rsp) .endm .macro RESTORE_EXTRA_REGS offset=0 - movq_cfi_restore 0*8+\offset, r15 - movq_cfi_restore 1*8+\offset, r14 - movq_cfi_restore 2*8+\offset, r13 - movq_cfi_restore 3*8+\offset, r12 - movq_cfi_restore 4*8+\offset, rbp - movq_cfi_restore 5*8+\offset, rbx + movq 0*8+\offset(%rsp), %r15 + movq 1*8+\offset(%rsp), %r14 + movq 2*8+\offset(%rsp), %r13 + movq 3*8+\offset(%rsp), %r12 + movq 4*8+\offset(%rsp), %rbp + movq 5*8+\offset(%rsp), %rbx .endm .macro ZERO_EXTRA_REGS @@ -162,24 +159,24 @@ For 32-bit we have the following conventions - kernel is built with .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 .if \rstor_r11 - movq_cfi_restore 6*8, r11 + movq 6*8(%rsp), %r11 .endif .if \rstor_r8910 - movq_cfi_restore 7*8, r10 - movq_cfi_restore 8*8, r9 - movq_cfi_restore 9*8, r8 + movq 7*8(%rsp), %r10 + movq 8*8(%rsp), %r9 + movq 9*8(%rsp), %r8 .endif .if \rstor_rax - movq_cfi_restore 10*8, rax + movq 10*8(%rsp), %rax .endif .if \rstor_rcx - movq_cfi_restore 11*8, rcx + movq 11*8(%rsp), %rcx .endif .if \rstor_rdx - movq_cfi_restore 12*8, rdx + movq 12*8(%rsp), %rdx .endif - movq_cfi_restore 13*8, rsi - movq_cfi_restore 14*8, rdi + movq 13*8(%rsp), %rsi + movq 14*8(%rsp), %rdi .endm .macro RESTORE_C_REGS RESTORE_C_REGS_HELPER 1,1,1,1,1 @@ -204,8 +201,7 @@ For 32-bit we have the following conventions - kernel is built with .endm .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 - addq $15*8+\addskip, %rsp - CFI_ADJUST_CFA_OFFSET -(15*8+\addskip) + subq $-(15*8+\addskip), %rsp .endm .macro icebp @@ -224,23 +220,23 @@ For 32-bit we have the following conventions - kernel is built with */ .macro SAVE_ALL - pushl_cfi_reg eax - pushl_cfi_reg ebp - pushl_cfi_reg edi - pushl_cfi_reg esi - pushl_cfi_reg edx - pushl_cfi_reg ecx - pushl_cfi_reg ebx + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx .endm .macro RESTORE_ALL - popl_cfi_reg ebx - popl_cfi_reg ecx - popl_cfi_reg edx - popl_cfi_reg esi - popl_cfi_reg edi - popl_cfi_reg ebp - popl_cfi_reg eax + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax .endm #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S new file mode 100644 index 000000000000..21dc60a60b5f --- /dev/null +++ b/arch/x86/entry/entry_32.S @@ -0,0 +1,1248 @@ +/* + * Copyright (C) 1991,1992 Linus Torvalds + * + * entry_32.S contains the system-call and low-level fault and trap handling routines. + * + * Stack layout in 'syscall_exit': + * ptrace needs to have all registers on the stack. + * If the order here is changed, it needs to be + * updated in fork.c:copy_process(), signal.c:do_signal(), + * ptrace.c and ptrace.h + * + * 0(%esp) - %ebx + * 4(%esp) - %ecx + * 8(%esp) - %edx + * C(%esp) - %esi + * 10(%esp) - %edi + * 14(%esp) - %ebp + * 18(%esp) - %eax + * 1C(%esp) - %ds + * 20(%esp) - %es + * 24(%esp) - %fs + * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS + * 2C(%esp) - orig_eax + * 30(%esp) - %eip + * 34(%esp) - %cs + * 38(%esp) - %eflags + * 3C(%esp) - %oldesp + * 40(%esp) - %oldss + */ + +#include <linux/linkage.h> +#include <linux/err.h> +#include <asm/thread_info.h> +#include <asm/irqflags.h> +#include <asm/errno.h> +#include <asm/segment.h> +#include <asm/smp.h> +#include <asm/page_types.h> +#include <asm/percpu.h> +#include <asm/processor-flags.h> +#include <asm/ftrace.h> +#include <asm/irq_vectors.h> +#include <asm/cpufeature.h> +#include <asm/alternative-asm.h> +#include <asm/asm.h> +#include <asm/smap.h> + +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ +#include <linux/elf-em.h> +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +# define sysenter_audit syscall_trace_entry +# define sysexit_audit syscall_exit_work +#endif + + .section .entry.text, "ax" + +/* + * We use macros for low-level operations which need to be overridden + * for paravirtualization. The following will never clobber any registers: + * INTERRUPT_RETURN (aka. "iret") + * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). + * + * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must + * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). + * Allowing a register to be clobbered can shrink the paravirt replacement + * enough to patch inline, increasing performance. + */ + +#ifdef CONFIG_PREEMPT +# define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF +#else +# define preempt_stop(clobbers) +# define resume_kernel restore_all +#endif + +.macro TRACE_IRQS_IRET +#ifdef CONFIG_TRACE_IRQFLAGS + testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off? + jz 1f + TRACE_IRQS_ON +1: +#endif +.endm + +/* + * User gs save/restore + * + * %gs is used for userland TLS and kernel only uses it for stack + * canary which is required to be at %gs:20 by gcc. Read the comment + * at the top of stackprotector.h for more info. + * + * Local labels 98 and 99 are used. + */ +#ifdef CONFIG_X86_32_LAZY_GS + + /* unfortunately push/pop can't be no-op */ +.macro PUSH_GS + pushl $0 +.endm +.macro POP_GS pop=0 + addl $(4 + \pop), %esp +.endm +.macro POP_GS_EX +.endm + + /* all the rest are no-op */ +.macro PTGS_TO_GS +.endm +.macro PTGS_TO_GS_EX +.endm +.macro GS_TO_REG reg +.endm +.macro REG_TO_PTGS reg +.endm +.macro SET_KERNEL_GS reg +.endm + +#else /* CONFIG_X86_32_LAZY_GS */ + +.macro PUSH_GS + pushl %gs +.endm + +.macro POP_GS pop=0 +98: popl %gs + .if \pop <> 0 + add $\pop, %esp + .endif +.endm +.macro POP_GS_EX +.pushsection .fixup, "ax" +99: movl $0, (%esp) + jmp 98b +.popsection + _ASM_EXTABLE(98b, 99b) +.endm + +.macro PTGS_TO_GS +98: mov PT_GS(%esp), %gs +.endm +.macro PTGS_TO_GS_EX +.pushsection .fixup, "ax" +99: movl $0, PT_GS(%esp) + jmp 98b +.popsection + _ASM_EXTABLE(98b, 99b) +.endm + +.macro GS_TO_REG reg + movl %gs, \reg +.endm +.macro REG_TO_PTGS reg + movl \reg, PT_GS(%esp) +.endm +.macro SET_KERNEL_GS reg + movl $(__KERNEL_STACK_CANARY), \reg + movl \reg, %gs +.endm + +#endif /* CONFIG_X86_32_LAZY_GS */ + +.macro SAVE_ALL + cld + PUSH_GS + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + movl $(__USER_DS), %edx + movl %edx, %ds + movl %edx, %es + movl $(__KERNEL_PERCPU), %edx + movl %edx, %fs + SET_KERNEL_GS %edx +.endm + +.macro RESTORE_INT_REGS + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax +.endm + +.macro RESTORE_REGS pop=0 + RESTORE_INT_REGS +1: popl %ds +2: popl %es +3: popl %fs + POP_GS \pop +.pushsection .fixup, "ax" +4: movl $0, (%esp) + jmp 1b +5: movl $0, (%esp) + jmp 2b +6: movl $0, (%esp) + jmp 3b +.popsection + _ASM_EXTABLE(1b, 4b) + _ASM_EXTABLE(2b, 5b) + _ASM_EXTABLE(3b, 6b) + POP_GS_EX +.endm + +ENTRY(ret_from_fork) + pushl %eax + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + pushl $0x0202 # Reset kernel eflags + popfl + jmp syscall_exit +END(ret_from_fork) + +ENTRY(ret_from_kernel_thread) + pushl %eax + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + pushl $0x0202 # Reset kernel eflags + popfl + movl PT_EBP(%esp), %eax + call *PT_EBX(%esp) + movl $0, PT_EAX(%esp) + jmp syscall_exit +ENDPROC(ret_from_kernel_thread) + +/* + * Return to user mode is not as complex as all this looks, + * but we want the default path for a system call return to + * go as quickly as possible which is why some of this is + * less clear than it otherwise should be. + */ + + # userspace resumption stub bypassing syscall exit tracing + ALIGN +ret_from_exception: + preempt_stop(CLBR_ANY) +ret_from_intr: + GET_THREAD_INFO(%ebp) +#ifdef CONFIG_VM86 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax +#else + /* + * We can be coming here from child spawned by kernel_thread(). + */ + movl PT_CS(%esp), %eax + andl $SEGMENT_RPL_MASK, %eax +#endif + cmpl $USER_RPL, %eax + jb resume_kernel # not returning to v8086 or userspace + +ENTRY(resume_userspace) + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on + # int/exception return? + jne work_pending + jmp restore_all +END(ret_from_exception) + +#ifdef CONFIG_PREEMPT +ENTRY(resume_kernel) + DISABLE_INTERRUPTS(CLBR_ANY) +need_resched: + cmpl $0, PER_CPU_VAR(__preempt_count) + jnz restore_all + testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? + jz restore_all + call preempt_schedule_irq + jmp need_resched +END(resume_kernel) +#endif + +/* + * SYSENTER_RETURN points to after the SYSENTER instruction + * in the vsyscall page. See vsyscall-sysentry.S, which defines + * the symbol. + */ + + # SYSENTER call handler stub +ENTRY(entry_SYSENTER_32) + movl TSS_sysenter_sp0(%esp), %esp +sysenter_past_esp: + /* + * Interrupts are disabled here, but we can't trace it until + * enough kernel state to call TRACE_IRQS_OFF can be called - but + * we immediately enable interrupts at that point anyway. + */ + pushl $__USER_DS + pushl %ebp + pushfl + orl $X86_EFLAGS_IF, (%esp) + pushl $__USER_CS + /* + * Push current_thread_info()->sysenter_return to the stack. + * A tiny bit of offset fixup is necessary: TI_sysenter_return + * is relative to thread_info, which is at the bottom of the + * kernel stack page. 4*4 means the 4 words pushed above; + * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; + * and THREAD_SIZE takes us to the bottom. + */ + pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) + + pushl %eax + SAVE_ALL + ENABLE_INTERRUPTS(CLBR_NONE) + +/* + * Load the potential sixth argument from user stack. + * Careful about security. + */ + cmpl $__PAGE_OFFSET-3, %ebp + jae syscall_fault + ASM_STAC +1: movl (%ebp), %ebp + ASM_CLAC + movl %ebp, PT_EBP(%esp) + _ASM_EXTABLE(1b, syscall_fault) + + GET_THREAD_INFO(%ebp) + + testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp) + jnz sysenter_audit +sysenter_do_call: + cmpl $(NR_syscalls), %eax + jae sysenter_badsys + call *sys_call_table(, %eax, 4) +sysenter_after_call: + movl %eax, PT_EAX(%esp) + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx + jnz sysexit_audit +sysenter_exit: +/* if something modifies registers it must also disable sysexit */ + movl PT_EIP(%esp), %edx + movl PT_OLDESP(%esp), %ecx + xorl %ebp, %ebp + TRACE_IRQS_ON +1: mov PT_FS(%esp), %fs + PTGS_TO_GS + ENABLE_INTERRUPTS_SYSEXIT + +#ifdef CONFIG_AUDITSYSCALL +sysenter_audit: + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), TI_flags(%ebp) + jnz syscall_trace_entry + /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */ + movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */ + /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */ + pushl PT_ESI(%esp) /* a3: 5th arg */ + pushl PT_EDX+4(%esp) /* a2: 4th arg */ + call __audit_syscall_entry + popl %ecx /* get that remapped edx off the stack */ + popl %ecx /* get that remapped esi off the stack */ + movl PT_EAX(%esp), %eax /* reload syscall number */ + jmp sysenter_do_call + +sysexit_audit: + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jnz syscall_exit_work + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) + movl %eax, %edx /* second arg, syscall return value */ + cmpl $-MAX_ERRNO, %eax /* is it an error ? */ + setbe %al /* 1 if so, 0 if not */ + movzbl %al, %eax /* zero-extend that */ + call __audit_syscall_exit + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jnz syscall_exit_work + movl PT_EAX(%esp), %eax /* reload syscall return value */ + jmp sysenter_exit +#endif + +.pushsection .fixup, "ax" +2: movl $0, PT_FS(%esp) + jmp 1b +.popsection + _ASM_EXTABLE(1b, 2b) + PTGS_TO_GS_EX +ENDPROC(entry_SYSENTER_32) + + # system call handler stub +ENTRY(entry_INT80_32) + ASM_CLAC + pushl %eax # save orig_eax + SAVE_ALL + GET_THREAD_INFO(%ebp) + # system call tracing in operation / emulation + testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(NR_syscalls), %eax + jae syscall_badsys +syscall_call: + call *sys_call_table(, %eax, 4) +syscall_after_call: + movl %eax, PT_EAX(%esp) # store the return value +syscall_exit: + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx # current->work + jnz syscall_exit_work + +restore_all: + TRACE_IRQS_IRET +restore_all_notrace: +#ifdef CONFIG_X86_ESPFIX32 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + /* + * Warning: PT_OLDSS(%esp) contains the wrong/random values if we + * are returning to the kernel. + * See comments in process.c:copy_thread() for details. + */ + movb PT_OLDSS(%esp), %ah + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax + je ldt_ss # returning to user-space with LDT SS +#endif +restore_nocheck: + RESTORE_REGS 4 # skip orig_eax/error_code +irq_return: + INTERRUPT_RETURN +.section .fixup, "ax" +ENTRY(iret_exc ) + pushl $0 # no error code + pushl $do_iret_error + jmp error_code +.previous + _ASM_EXTABLE(irq_return, iret_exc) + +#ifdef CONFIG_X86_ESPFIX32 +ldt_ss: +#ifdef CONFIG_PARAVIRT + /* + * The kernel can't run on a non-flat stack if paravirt mode + * is active. Rather than try to fixup the high bits of + * ESP, bypass this code entirely. This may break DOSemu + * and/or Wine support in a paravirt VM, although the option + * is still available to implement the setting of the high + * 16-bits in the INTERRUPT_RETURN paravirt-op. + */ + cmpl $0, pv_info+PARAVIRT_enabled + jne restore_nocheck +#endif + +/* + * Setup and switch to ESPFIX stack + * + * We're returning to userspace with a 16 bit stack. The CPU will not + * restore the high word of ESP for us on executing iret... This is an + * "official" bug of all the x86-compatible CPUs, which we can work + * around to make dosemu and wine happy. We do this by preloading the + * high word of ESP with the high word of the userspace ESP while + * compensating for the offset by changing to the ESPFIX segment with + * a base address that matches for the difference. + */ +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) + mov %esp, %edx /* load kernel esp */ + mov PT_OLDESP(%esp), %eax /* load userspace esp */ + mov %dx, %ax /* eax: new kernel esp */ + sub %eax, %edx /* offset (low word is 0) */ + shr $16, %edx + mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ + mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ + pushl $__ESPFIX_SS + pushl %eax /* new kernel esp */ + /* + * Disable interrupts, but do not irqtrace this section: we + * will soon execute iret and the tracer was already set to + * the irqstate after the IRET: + */ + DISABLE_INTERRUPTS(CLBR_EAX) + lss (%esp), %esp /* switch to espfix segment */ + jmp restore_nocheck +#endif +ENDPROC(entry_INT80_32) + + # perform work that needs to be done immediately before resumption + ALIGN +work_pending: + testb $_TIF_NEED_RESCHED, %cl + jz work_notifysig +work_resched: + call schedule + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other + # than syscall tracing? + jz restore_all + testb $_TIF_NEED_RESCHED, %cl + jnz work_resched + +work_notifysig: # deal with pending signals and + # notify-resume requests +#ifdef CONFIG_VM86 + testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) + movl %esp, %eax + jnz work_notifysig_v86 # returning to kernel-space or + # vm86-space +1: +#else + movl %esp, %eax +#endif + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + movb PT_CS(%esp), %bl + andb $SEGMENT_RPL_MASK, %bl + cmpb $USER_RPL, %bl + jb resume_kernel + xorl %edx, %edx + call do_notify_resume + jmp resume_userspace + +#ifdef CONFIG_VM86 + ALIGN +work_notifysig_v86: + pushl %ecx # save ti_flags for do_notify_resume + call save_v86_state # %eax contains pt_regs pointer + popl %ecx + movl %eax, %esp + jmp 1b +#endif +END(work_pending) + + # perform syscall exit tracing + ALIGN +syscall_trace_entry: + movl $-ENOSYS, PT_EAX(%esp) + movl %esp, %eax + call syscall_trace_enter + /* What it returned is what we'll actually use. */ + cmpl $(NR_syscalls), %eax + jnae syscall_call + jmp syscall_exit +END(syscall_trace_entry) + + # perform syscall exit tracing + ALIGN +syscall_exit_work: + testl $_TIF_WORK_SYSCALL_EXIT, %ecx + jz work_pending + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call + # schedule() instead + movl %esp, %eax + call syscall_trace_leave + jmp resume_userspace +END(syscall_exit_work) + +syscall_fault: + ASM_CLAC + GET_THREAD_INFO(%ebp) + movl $-EFAULT, PT_EAX(%esp) + jmp resume_userspace +END(syscall_fault) + +syscall_badsys: + movl $-ENOSYS, %eax + jmp syscall_after_call +END(syscall_badsys) + +sysenter_badsys: + movl $-ENOSYS, %eax + jmp sysenter_after_call +END(sysenter_badsys) + +.macro FIXUP_ESPFIX_STACK +/* + * Switch back for ESPFIX stack to the normal zerobased stack + * + * We can't call C functions using the ESPFIX stack. This code reads + * the high word of the segment base from the GDT and swiches to the + * normal stack and adjusts ESP with the matching offset. + */ +#ifdef CONFIG_X86_ESPFIX32 + /* fixup the stack */ + mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ + mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ + shl $16, %eax + addl %esp, %eax /* the adjusted stack pointer */ + pushl $__KERNEL_DS + pushl %eax + lss (%esp), %esp /* switch to the normal stack segment */ +#endif +.endm +.macro UNWIND_ESPFIX_STACK +#ifdef CONFIG_X86_ESPFIX32 + movl %ss, %eax + /* see if on espfix stack */ + cmpw $__ESPFIX_SS, %ax + jne 27f + movl $__KERNEL_DS, %eax + movl %eax, %ds + movl %eax, %es + /* switch to normal stack */ + FIXUP_ESPFIX_STACK +27: +#endif +.endm + +/* + * Build the entry stubs with some assembler magic. + * We pack 1 stub into every 8-byte block. + */ + .align 8 +ENTRY(irq_entries_start) + vector=FIRST_EXTERNAL_VECTOR + .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + pushl $(~vector+0x80) /* Note: always in signed byte range */ + vector=vector+1 + jmp common_interrupt + .align 8 + .endr +END(irq_entries_start) + +/* + * the CPU automatically disables interrupts when executing an IRQ vector, + * so IRQ-flags tracing has to follow that: + */ + .p2align CONFIG_X86_L1_CACHE_SHIFT +common_interrupt: + ASM_CLAC + addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ + SAVE_ALL + TRACE_IRQS_OFF + movl %esp, %eax + call do_IRQ + jmp ret_from_intr +ENDPROC(common_interrupt) + +#define BUILD_INTERRUPT3(name, nr, fn) \ +ENTRY(name) \ + ASM_CLAC; \ + pushl $~(nr); \ + SAVE_ALL; \ + TRACE_IRQS_OFF \ + movl %esp, %eax; \ + call fn; \ + jmp ret_from_intr; \ +ENDPROC(name) + + +#ifdef CONFIG_TRACING +# define TRACE_BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) +#else +# define TRACE_BUILD_INTERRUPT(name, nr) +#endif + +#define BUILD_INTERRUPT(name, nr) \ + BUILD_INTERRUPT3(name, nr, smp_##name); \ + TRACE_BUILD_INTERRUPT(name, nr) + +/* The include is where all of the SMP etc. interrupts come from */ +#include <asm/entry_arch.h> + +ENTRY(coprocessor_error) + ASM_CLAC + pushl $0 + pushl $do_coprocessor_error + jmp error_code +END(coprocessor_error) + +ENTRY(simd_coprocessor_error) + ASM_CLAC + pushl $0 +#ifdef CONFIG_X86_INVD_BUG + /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ + ALTERNATIVE "pushl $do_general_protection", \ + "pushl $do_simd_coprocessor_error", \ + X86_FEATURE_XMM +#else + pushl $do_simd_coprocessor_error +#endif + jmp error_code +END(simd_coprocessor_error) + +ENTRY(device_not_available) + ASM_CLAC + pushl $-1 # mark this as an int + pushl $do_device_not_available + jmp error_code +END(device_not_available) + +#ifdef CONFIG_PARAVIRT +ENTRY(native_iret) + iret + _ASM_EXTABLE(native_iret, iret_exc) +END(native_iret) + +ENTRY(native_irq_enable_sysexit) + sti + sysexit +END(native_irq_enable_sysexit) +#endif + +ENTRY(overflow) + ASM_CLAC + pushl $0 + pushl $do_overflow + jmp error_code +END(overflow) + +ENTRY(bounds) + ASM_CLAC + pushl $0 + pushl $do_bounds + jmp error_code +END(bounds) + +ENTRY(invalid_op) + ASM_CLAC + pushl $0 + pushl $do_invalid_op + jmp error_code +END(invalid_op) + +ENTRY(coprocessor_segment_overrun) + ASM_CLAC + pushl $0 + pushl $do_coprocessor_segment_overrun + jmp error_code +END(coprocessor_segment_overrun) + +ENTRY(invalid_TSS) + ASM_CLAC + pushl $do_invalid_TSS + jmp error_code +END(invalid_TSS) + +ENTRY(segment_not_present) + ASM_CLAC + pushl $do_segment_not_present + jmp error_code +END(segment_not_present) + +ENTRY(stack_segment) + ASM_CLAC + pushl $do_stack_segment + jmp error_code +END(stack_segment) + +ENTRY(alignment_check) + ASM_CLAC + pushl $do_alignment_check + jmp error_code +END(alignment_check) + +ENTRY(divide_error) + ASM_CLAC + pushl $0 # no error code + pushl $do_divide_error + jmp error_code +END(divide_error) + +#ifdef CONFIG_X86_MCE +ENTRY(machine_check) + ASM_CLAC + pushl $0 + pushl machine_check_vector + jmp error_code +END(machine_check) +#endif + +ENTRY(spurious_interrupt_bug) + ASM_CLAC + pushl $0 + pushl $do_spurious_interrupt_bug + jmp error_code +END(spurious_interrupt_bug) + +#ifdef CONFIG_XEN +/* + * Xen doesn't set %esp to be precisely what the normal SYSENTER + * entry point expects, so fix it up before using the normal path. + */ +ENTRY(xen_sysenter_target) + addl $5*4, %esp /* remove xen-provided frame */ + jmp sysenter_past_esp + +ENTRY(xen_hypervisor_callback) + pushl $-1 /* orig_ax = -1 => not a system call */ + SAVE_ALL + TRACE_IRQS_OFF + + /* + * Check to see if we got the event in the critical + * region in xen_iret_direct, after we've reenabled + * events and checked for pending events. This simulates + * iret instruction's behaviour where it delivers a + * pending interrupt when enabling interrupts: + */ + movl PT_EIP(%esp), %eax + cmpl $xen_iret_start_crit, %eax + jb 1f + cmpl $xen_iret_end_crit, %eax + jae 1f + + jmp xen_iret_crit_fixup + +ENTRY(xen_do_upcall) +1: mov %esp, %eax + call xen_evtchn_do_upcall +#ifndef CONFIG_PREEMPT + call xen_maybe_preempt_hcall +#endif + jmp ret_from_intr +ENDPROC(xen_hypervisor_callback) + +/* + * Hypervisor uses this for application faults while it executes. + * We get here for two reasons: + * 1. Fault while reloading DS, ES, FS or GS + * 2. Fault while executing IRET + * Category 1 we fix up by reattempting the load, and zeroing the segment + * register if the load fails. + * Category 2 we fix up by jumping to do_iret_error. We cannot use the + * normal Linux return path in this case because if we use the IRET hypercall + * to pop the stack frame we end up in an infinite loop of failsafe callbacks. + * We distinguish between categories by maintaining a status value in EAX. + */ +ENTRY(xen_failsafe_callback) + pushl %eax + movl $1, %eax +1: mov 4(%esp), %ds +2: mov 8(%esp), %es +3: mov 12(%esp), %fs +4: mov 16(%esp), %gs + /* EAX == 0 => Category 1 (Bad segment) + EAX != 0 => Category 2 (Bad IRET) */ + testl %eax, %eax + popl %eax + lea 16(%esp), %esp + jz 5f + jmp iret_exc +5: pushl $-1 /* orig_ax = -1 => not a system call */ + SAVE_ALL + jmp ret_from_exception + +.section .fixup, "ax" +6: xorl %eax, %eax + movl %eax, 4(%esp) + jmp 1b +7: xorl %eax, %eax + movl %eax, 8(%esp) + jmp 2b +8: xorl %eax, %eax + movl %eax, 12(%esp) + jmp 3b +9: xorl %eax, %eax + movl %eax, 16(%esp) + jmp 4b +.previous + _ASM_EXTABLE(1b, 6b) + _ASM_EXTABLE(2b, 7b) + _ASM_EXTABLE(3b, 8b) + _ASM_EXTABLE(4b, 9b) +ENDPROC(xen_failsafe_callback) + +BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, + xen_evtchn_do_upcall) + +#endif /* CONFIG_XEN */ + +#if IS_ENABLED(CONFIG_HYPERV) + +BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, + hyperv_vector_handler) + +#endif /* CONFIG_HYPERV */ + +#ifdef CONFIG_FUNCTION_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(mcount) + ret +END(mcount) + +ENTRY(ftrace_caller) + pushl %eax + pushl %ecx + pushl %edx + pushl $0 /* Pass NULL as regs pointer */ + movl 4*4(%esp), %eax + movl 0x4(%ebp), %edx + movl function_trace_op, %ecx + subl $MCOUNT_INSN_SIZE, %eax + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + addl $4, %esp /* skip NULL pointer */ + popl %edx + popl %ecx + popl %eax +ftrace_ret: +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif + +.globl ftrace_stub +ftrace_stub: + ret +END(ftrace_caller) + +ENTRY(ftrace_regs_caller) + pushf /* push flags before compare (in cs location) */ + + /* + * i386 does not save SS and ESP when coming from kernel. + * Instead, to get sp, ®s->sp is used (see ptrace.h). + * Unfortunately, that means eflags must be at the same location + * as the current return ip is. We move the return ip into the + * ip location, and move flags into the return ip location. + */ + pushl 4(%esp) /* save return ip into ip slot */ + + pushl $0 /* Load 0 into orig_ax */ + pushl %gs + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + + movl 13*4(%esp), %eax /* Get the saved flags */ + movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */ + /* clobbering return ip */ + movl $__KERNEL_CS, 13*4(%esp) + + movl 12*4(%esp), %eax /* Load ip (1st parameter) */ + subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ + movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ + movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ + pushl %esp /* Save pt_regs as 4th parameter */ + +GLOBAL(ftrace_regs_call) + call ftrace_stub + + addl $4, %esp /* Skip pt_regs */ + movl 14*4(%esp), %eax /* Move flags back into cs */ + movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ + movl 12*4(%esp), %eax /* Get return ip from regs->ip */ + movl %eax, 14*4(%esp) /* Put return ip back for ret */ + + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax + popl %ds + popl %es + popl %fs + popl %gs + addl $8, %esp /* Skip orig_ax and ip */ + popf /* Pop flags at end (no addl to corrupt flags) */ + jmp ftrace_ret + + popf + jmp ftrace_stub +#else /* ! CONFIG_DYNAMIC_FTRACE */ + +ENTRY(mcount) + cmpl $__PAGE_OFFSET, %esp + jb ftrace_stub /* Paging not enabled yet? */ + + cmpl $ftrace_stub, ftrace_trace_function + jnz trace +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif +.globl ftrace_stub +ftrace_stub: + ret + + /* taken from glibc */ +trace: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + lea 0x4(%ebp), %edx + movl (%ebp), %ecx + subl $MCOUNT_INSN_SIZE, %eax + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax + ret +END(ftrace_graph_caller) + +.globl return_to_handler +return_to_handler: + pushl %eax + pushl %edx + movl %ebp, %eax + call ftrace_return_to_handler + movl %eax, %ecx + popl %edx + popl %eax + jmp *%ecx +#endif + +#ifdef CONFIG_TRACING +ENTRY(trace_page_fault) + ASM_CLAC + pushl $trace_do_page_fault + jmp error_code +END(trace_page_fault) +#endif + +ENTRY(page_fault) + ASM_CLAC + pushl $do_page_fault + ALIGN +error_code: + /* the function address is in %gs's slot on the stack */ + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + cld + movl $(__KERNEL_PERCPU), %ecx + movl %ecx, %fs + UNWIND_ESPFIX_STACK + GS_TO_REG %ecx + movl PT_GS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + REG_TO_PTGS %ecx + SET_KERNEL_GS %ecx + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + TRACE_IRQS_OFF + movl %esp, %eax # pt_regs pointer + call *%edi + jmp ret_from_exception +END(page_fault) + +/* + * Debug traps and NMI can happen at the one SYSENTER instruction + * that sets up the real kernel stack. Check here, since we can't + * allow the wrong stack to be used. + * + * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. + */ +.macro FIX_STACK offset ok label + cmpw $__KERNEL_CS, 4(%esp) + jne \ok +\label: + movl TSS_sysenter_sp0 + \offset(%esp), %esp + pushfl + pushl $__KERNEL_CS + pushl $sysenter_past_esp +.endm + +ENTRY(debug) + ASM_CLAC + cmpl $entry_SYSENTER_32, (%esp) + jne debug_stack_correct + FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn +debug_stack_correct: + pushl $-1 # mark this as an int + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx, %edx # error code 0 + movl %esp, %eax # pt_regs pointer + call do_debug + jmp ret_from_exception +END(debug) + +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +ENTRY(nmi) + ASM_CLAC +#ifdef CONFIG_X86_ESPFIX32 + pushl %eax + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl %eax + je nmi_espfix_stack +#endif + cmpl $entry_SYSENTER_32, (%esp) + je nmi_stack_fixup + pushl %eax + movl %esp, %eax + /* + * Do not access memory above the end of our stack page, + * it might not exist. + */ + andl $(THREAD_SIZE-1), %eax + cmpl $(THREAD_SIZE-20), %eax + popl %eax + jae nmi_stack_correct + cmpl $entry_SYSENTER_32, 12(%esp) + je nmi_debug_stack_check +nmi_stack_correct: + pushl %eax + SAVE_ALL + xorl %edx, %edx # zero error code + movl %esp, %eax # pt_regs pointer + call do_nmi + jmp restore_all_notrace + +nmi_stack_fixup: + FIX_STACK 12, nmi_stack_correct, 1 + jmp nmi_stack_correct + +nmi_debug_stack_check: + cmpw $__KERNEL_CS, 16(%esp) + jne nmi_stack_correct + cmpl $debug, (%esp) + jb nmi_stack_correct + cmpl $debug_esp_fix_insn, (%esp) + ja nmi_stack_correct + FIX_STACK 24, nmi_stack_correct, 1 + jmp nmi_stack_correct + +#ifdef CONFIG_X86_ESPFIX32 +nmi_espfix_stack: + /* + * create the pointer to lss back + */ + pushl %ss + pushl %esp + addl $4, (%esp) + /* copy the iret frame of 12 bytes */ + .rept 3 + pushl 16(%esp) + .endr + pushl %eax + SAVE_ALL + FIXUP_ESPFIX_STACK # %eax == %esp + xorl %edx, %edx # zero error code + call do_nmi + RESTORE_REGS + lss 12+4(%esp), %esp # back to espfix stack + jmp irq_return +#endif +END(nmi) + +ENTRY(int3) + ASM_CLAC + pushl $-1 # mark this as an int + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx, %edx # zero error code + movl %esp, %eax # pt_regs pointer + call do_int3 + jmp ret_from_exception +END(int3) + +ENTRY(general_protection) + pushl $do_general_protection + jmp error_code +END(general_protection) + +#ifdef CONFIG_KVM_GUEST +ENTRY(async_page_fault) + ASM_CLAC + pushl $do_async_page_fault + jmp error_code +END(async_page_fault) +#endif diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/entry/entry_64.S index 22aadc917868..3bb2c4302df1 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -4,34 +4,25 @@ * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> - */ - -/* + * * entry.S contains the system-call and fault low-level handling routines. * * Some of this is documented in Documentation/x86/entry_64.txt * - * NOTE: This code handles signal-recognition, which happens every time - * after an interrupt and after each system call. - * * A note on terminology: - * - iret frame: Architecture defined interrupt frame from SS to RIP - * at the top of the kernel process stack. + * - iret frame: Architecture defined interrupt frame from SS to RIP + * at the top of the kernel process stack. * * Some macro usage: - * - CFI macros are used to generate dwarf2 unwind information for better - * backtraces. They don't change any code. - * - ENTRY/END Define functions in the symbol table. - * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. - * - idtentry - Define exception entry points. + * - ENTRY/END: Define functions in the symbol table. + * - TRACE_IRQ_*: Trace hardirq state for lock debugging. + * - idtentry: Define exception entry points. */ - #include <linux/linkage.h> #include <asm/segment.h> #include <asm/cache.h> #include <asm/errno.h> -#include <asm/dwarf2.h> -#include <asm/calling.h> +#include "calling.h" #include <asm/asm-offsets.h> #include <asm/msr.h> #include <asm/unistd.h> @@ -49,13 +40,12 @@ /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ #include <linux/elf-em.h> -#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_64BIT 0x80000000 -#define __AUDIT_ARCH_LE 0x40000000 - - .code64 - .section .entry.text, "ax" +#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_64BIT 0x80000000 +#define __AUDIT_ARCH_LE 0x40000000 +.code64 +.section .entry.text, "ax" #ifdef CONFIG_PARAVIRT ENTRY(native_usergs_sysret64) @@ -64,11 +54,10 @@ ENTRY(native_usergs_sysret64) ENDPROC(native_usergs_sysret64) #endif /* CONFIG_PARAVIRT */ - .macro TRACE_IRQS_IRETQ #ifdef CONFIG_TRACE_IRQFLAGS - bt $9,EFLAGS(%rsp) /* interrupts off? */ - jnc 1f + bt $9, EFLAGS(%rsp) /* interrupts off? */ + jnc 1f TRACE_IRQS_ON 1: #endif @@ -88,89 +77,34 @@ ENDPROC(native_usergs_sysret64) #if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS) .macro TRACE_IRQS_OFF_DEBUG - call debug_stack_set_zero + call debug_stack_set_zero TRACE_IRQS_OFF - call debug_stack_reset + call debug_stack_reset .endm .macro TRACE_IRQS_ON_DEBUG - call debug_stack_set_zero + call debug_stack_set_zero TRACE_IRQS_ON - call debug_stack_reset + call debug_stack_reset .endm .macro TRACE_IRQS_IRETQ_DEBUG - bt $9,EFLAGS(%rsp) /* interrupts off? */ - jnc 1f + bt $9, EFLAGS(%rsp) /* interrupts off? */ + jnc 1f TRACE_IRQS_ON_DEBUG 1: .endm #else -# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF -# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON -# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ +# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF +# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON +# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ #endif /* - * empty frame - */ - .macro EMPTY_FRAME start=1 offset=0 - .if \start - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,8+\offset - .else - CFI_DEF_CFA_OFFSET 8+\offset - .endif - .endm - -/* - * initial frame state for interrupts (and exceptions without error code) - */ - .macro INTR_FRAME start=1 offset=0 - EMPTY_FRAME \start, 5*8+\offset - /*CFI_REL_OFFSET ss, 4*8+\offset*/ - CFI_REL_OFFSET rsp, 3*8+\offset - /*CFI_REL_OFFSET rflags, 2*8+\offset*/ - /*CFI_REL_OFFSET cs, 1*8+\offset*/ - CFI_REL_OFFSET rip, 0*8+\offset - .endm - -/* - * initial frame state for exceptions with error code (and interrupts - * with vector already pushed) - */ - .macro XCPT_FRAME start=1 offset=0 - INTR_FRAME \start, 1*8+\offset - .endm - -/* - * frame that enables passing a complete pt_regs to a C function. - */ - .macro DEFAULT_FRAME start=1 offset=0 - XCPT_FRAME \start, ORIG_RAX+\offset - CFI_REL_OFFSET rdi, RDI+\offset - CFI_REL_OFFSET rsi, RSI+\offset - CFI_REL_OFFSET rdx, RDX+\offset - CFI_REL_OFFSET rcx, RCX+\offset - CFI_REL_OFFSET rax, RAX+\offset - CFI_REL_OFFSET r8, R8+\offset - CFI_REL_OFFSET r9, R9+\offset - CFI_REL_OFFSET r10, R10+\offset - CFI_REL_OFFSET r11, R11+\offset - CFI_REL_OFFSET rbx, RBX+\offset - CFI_REL_OFFSET rbp, RBP+\offset - CFI_REL_OFFSET r12, R12+\offset - CFI_REL_OFFSET r13, R13+\offset - CFI_REL_OFFSET r14, R14+\offset - CFI_REL_OFFSET r15, R15+\offset - .endm - -/* - * 64bit SYSCALL instruction entry. Up to 6 arguments in registers. + * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. * - * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, * then loads new ss, cs, and rip from previously programmed MSRs. * rflags gets masked by a value from another MSR (so CLD and CLAC * are not needed). SYSCALL does not save anything on the stack @@ -186,7 +120,7 @@ ENDPROC(native_usergs_sysret64) * r10 arg3 (needs to be moved to rcx to conform to C ABI) * r8 arg4 * r9 arg5 - * (note: r12-r15,rbp,rbx are callee-preserved in C ABI) + * (note: r12-r15, rbp, rbx are callee-preserved in C ABI) * * Only called from user space. * @@ -195,13 +129,7 @@ ENDPROC(native_usergs_sysret64) * with them due to bugs in both AMD and Intel CPUs. */ -ENTRY(system_call) - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,0 - CFI_REGISTER rip,rcx - /*CFI_REGISTER rflags,r11*/ - +ENTRY(entry_SYSCALL_64) /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -213,14 +141,14 @@ ENTRY(system_call) * after the swapgs, so that it can do the swapgs * for the guest and jump here on syscall. */ -GLOBAL(system_call_after_swapgs) +GLOBAL(entry_SYSCALL_64_after_swapgs) - movq %rsp,PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp + movq %rsp, PER_CPU_VAR(rsp_scratch) + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* Construct struct pt_regs on stack */ - pushq_cfi $__USER_DS /* pt_regs->ss */ - pushq_cfi PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ + pushq $__USER_DS /* pt_regs->ss */ + pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ /* * Re-enable interrupts. * We use 'rsp_scratch' as a scratch space, hence irq-off block above @@ -229,36 +157,34 @@ GLOBAL(system_call_after_swapgs) * with using rsp_scratch: */ ENABLE_INTERRUPTS(CLBR_NONE) - pushq_cfi %r11 /* pt_regs->flags */ - pushq_cfi $__USER_CS /* pt_regs->cs */ - pushq_cfi %rcx /* pt_regs->ip */ - CFI_REL_OFFSET rip,0 - pushq_cfi_reg rax /* pt_regs->orig_ax */ - pushq_cfi_reg rdi /* pt_regs->di */ - pushq_cfi_reg rsi /* pt_regs->si */ - pushq_cfi_reg rdx /* pt_regs->dx */ - pushq_cfi_reg rcx /* pt_regs->cx */ - pushq_cfi $-ENOSYS /* pt_regs->ax */ - pushq_cfi_reg r8 /* pt_regs->r8 */ - pushq_cfi_reg r9 /* pt_regs->r9 */ - pushq_cfi_reg r10 /* pt_regs->r10 */ - pushq_cfi_reg r11 /* pt_regs->r11 */ - sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ - CFI_ADJUST_CFA_OFFSET 6*8 - - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz tracesys -system_call_fastpath: + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + pushq %r9 /* pt_regs->r9 */ + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ + sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ + + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz tracesys +entry_SYSCALL_64_fastpath: #if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max,%rax + cmpq $__NR_syscall_max, %rax #else - andl $__SYSCALL_MASK,%eax - cmpl $__NR_syscall_max,%eax + andl $__SYSCALL_MASK, %eax + cmpl $__NR_syscall_max, %eax #endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ - movq %r10,%rcx - call *sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) + ja 1f /* return -ENOSYS (already in pt_regs->ax) */ + movq %r10, %rcx + call *sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) 1: /* * Syscall return path ending with SYSRET (fast path). @@ -279,19 +205,15 @@ system_call_fastpath: * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is * very bad. */ - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ - - CFI_REMEMBER_STATE + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RIP(%rsp),%rcx - CFI_REGISTER rip,rcx - movq EFLAGS(%rsp),%r11 - /*CFI_REGISTER rflags,r11*/ - movq RSP(%rsp),%rsp + movq RIP(%rsp), %rcx + movq EFLAGS(%rsp), %r11 + movq RSP(%rsp), %rsp /* - * 64bit SYSRET restores rip from rcx, + * 64-bit SYSRET restores rip from rcx, * rflags from r11 (but RF and VM bits are forced to 0), * cs and ss are loaded from MSRs. * Restoration of rflags re-enables interrupts. @@ -307,25 +229,23 @@ system_call_fastpath: */ USERGS_SYSRET64 - CFI_RESTORE_STATE - /* Do syscall entry tracing */ tracesys: - movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - call syscall_trace_enter_phase1 - test %rax, %rax - jnz tracesys_phase2 /* if needed, run the slow path */ - RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ - movq ORIG_RAX(%rsp), %rax - jmp system_call_fastpath /* and return to the fast path */ + movq %rsp, %rdi + movl $AUDIT_ARCH_X86_64, %esi + call syscall_trace_enter_phase1 + test %rax, %rax + jnz tracesys_phase2 /* if needed, run the slow path */ + RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ + movq ORIG_RAX(%rsp), %rax + jmp entry_SYSCALL_64_fastpath /* and return to the fast path */ tracesys_phase2: SAVE_EXTRA_REGS - movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - movq %rax,%rdx - call syscall_trace_enter_phase2 + movq %rsp, %rdi + movl $AUDIT_ARCH_X86_64, %esi + movq %rax, %rdx + call syscall_trace_enter_phase2 /* * Reload registers from stack in case ptrace changed them. @@ -335,15 +255,15 @@ tracesys_phase2: RESTORE_C_REGS_EXCEPT_RAX RESTORE_EXTRA_REGS #if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max,%rax + cmpq $__NR_syscall_max, %rax #else - andl $__SYSCALL_MASK,%eax - cmpl $__NR_syscall_max,%eax + andl $__SYSCALL_MASK, %eax + cmpl $__NR_syscall_max, %eax #endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ - movq %r10,%rcx /* fixup for C */ - call *sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) + ja 1f /* return -ENOSYS (already in pt_regs->ax) */ + movq %r10, %rcx /* fixup for C */ + call *sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) 1: /* Use IRET because user could have changed pt_regs->foo */ @@ -355,31 +275,33 @@ GLOBAL(int_ret_from_sys_call) DISABLE_INTERRUPTS(CLBR_NONE) int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */ TRACE_IRQS_OFF - movl $_TIF_ALLWORK_MASK,%edi + movl $_TIF_ALLWORK_MASK, %edi /* edi: mask to check */ GLOBAL(int_with_check) LOCKDEP_SYS_EXIT_IRQ GET_THREAD_INFO(%rcx) - movl TI_flags(%rcx),%edx - andl %edi,%edx - jnz int_careful - andl $~TS_COMPAT,TI_status(%rcx) + movl TI_flags(%rcx), %edx + andl %edi, %edx + jnz int_careful + andl $~TS_COMPAT, TI_status(%rcx) jmp syscall_return - /* Either reschedule or signal or syscall exit tracking needed. */ - /* First do a reschedule test. */ - /* edx: work, edi: workmask */ + /* + * Either reschedule or signal or syscall exit tracking needed. + * First do a reschedule test. + * edx: work, edi: workmask + */ int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful + bt $TIF_NEED_RESCHED, %edx + jnc int_very_careful TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq_cfi %rdi + pushq %rdi SCHEDULE_USER - popq_cfi %rdi + popq %rdi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - jmp int_with_check + jmp int_with_check /* handle signals and tracing -- both require a full pt_regs */ int_very_careful: @@ -387,27 +309,27 @@ int_very_careful: ENABLE_INTERRUPTS(CLBR_NONE) SAVE_EXTRA_REGS /* Check for syscall exit trace */ - testl $_TIF_WORK_SYSCALL_EXIT,%edx - jz int_signal - pushq_cfi %rdi - leaq 8(%rsp),%rdi # &ptregs -> arg1 - call syscall_trace_leave - popq_cfi %rdi - andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi - jmp int_restore_rest + testl $_TIF_WORK_SYSCALL_EXIT, %edx + jz int_signal + pushq %rdi + leaq 8(%rsp), %rdi /* &ptregs -> arg1 */ + call syscall_trace_leave + popq %rdi + andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU), %edi + jmp int_restore_rest int_signal: - testl $_TIF_DO_NOTIFY_MASK,%edx - jz 1f - movq %rsp,%rdi # &ptregs -> arg1 - xorl %esi,%esi # oldset -> arg2 - call do_notify_resume -1: movl $_TIF_WORK_MASK,%edi + testl $_TIF_DO_NOTIFY_MASK, %edx + jz 1f + movq %rsp, %rdi /* &ptregs -> arg1 */ + xorl %esi, %esi /* oldset -> arg2 */ + call do_notify_resume +1: movl $_TIF_WORK_MASK, %edi int_restore_rest: RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - jmp int_with_check + jmp int_with_check syscall_return: /* The IRETQ could re-enable interrupts: */ @@ -418,10 +340,10 @@ syscall_return: * Try to use SYSRET instead of IRET if we're returning to * a completely clean 64-bit userspace context. */ - movq RCX(%rsp),%rcx - movq RIP(%rsp),%r11 - cmpq %rcx,%r11 /* RCX == RIP */ - jne opportunistic_sysret_failed + movq RCX(%rsp), %rcx + movq RIP(%rsp), %r11 + cmpq %rcx, %r11 /* RCX == RIP */ + jne opportunistic_sysret_failed /* * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP @@ -434,19 +356,21 @@ syscall_return: .ifne __VIRTUAL_MASK_SHIFT - 47 .error "virtual address width changed -- SYSRET checks need update" .endif + /* Change top 16 bits to be the sign-extension of 47th bit */ shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx + /* If this changed %rcx, it was not canonical */ cmpq %rcx, %r11 jne opportunistic_sysret_failed - cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ - jne opportunistic_sysret_failed + cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */ + jne opportunistic_sysret_failed - movq R11(%rsp),%r11 - cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */ - jne opportunistic_sysret_failed + movq R11(%rsp), %r11 + cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */ + jne opportunistic_sysret_failed /* * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, @@ -455,47 +379,41 @@ syscall_return: * with register state that satisfies the opportunistic SYSRET * conditions. For example, single-stepping this user code: * - * movq $stuck_here,%rcx + * movq $stuck_here, %rcx * pushfq * popq %r11 * stuck_here: * * would never get past 'stuck_here'. */ - testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 - jnz opportunistic_sysret_failed + testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 + jnz opportunistic_sysret_failed /* nothing to check for RSP */ - cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */ - jne opportunistic_sysret_failed + cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */ + jne opportunistic_sysret_failed /* - * We win! This label is here just for ease of understanding - * perf profiles. Nothing jumps here. + * We win! This label is here just for ease of understanding + * perf profiles. Nothing jumps here. */ syscall_return_via_sysret: - CFI_REMEMBER_STATE /* rcx and r11 are already restored (see code above) */ RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RSP(%rsp),%rsp + movq RSP(%rsp), %rsp USERGS_SYSRET64 - CFI_RESTORE_STATE opportunistic_sysret_failed: SWAPGS jmp restore_c_regs_and_iret - CFI_ENDPROC -END(system_call) +END(entry_SYSCALL_64) .macro FORK_LIKE func ENTRY(stub_\func) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 /* offset 8: return address */ SAVE_EXTRA_REGS 8 - jmp sys_\func - CFI_ENDPROC + jmp sys_\func END(stub_\func) .endm @@ -504,8 +422,6 @@ END(stub_\func) FORK_LIKE vfork ENTRY(stub_execve) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 call sys_execve return_from_execve: testl %eax, %eax @@ -515,11 +431,9 @@ return_from_execve: 1: /* must use IRET code path (pt_regs->cs may have changed) */ addq $8, %rsp - CFI_ADJUST_CFA_OFFSET -8 ZERO_EXTRA_REGS - movq %rax,RAX(%rsp) + movq %rax, RAX(%rsp) jmp int_ret_from_sys_call - CFI_ENDPROC END(stub_execve) /* * Remaining execve stubs are only 7 bytes long. @@ -527,32 +441,23 @@ END(stub_execve) */ .align 8 GLOBAL(stub_execveat) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 call sys_execveat jmp return_from_execve - CFI_ENDPROC END(stub_execveat) #if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) .align 8 GLOBAL(stub_x32_execve) GLOBAL(stub32_execve) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 call compat_sys_execve jmp return_from_execve - CFI_ENDPROC END(stub32_execve) END(stub_x32_execve) .align 8 GLOBAL(stub_x32_execveat) GLOBAL(stub32_execveat) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 call compat_sys_execveat jmp return_from_execve - CFI_ENDPROC END(stub32_execveat) END(stub_x32_execveat) #endif @@ -562,8 +467,6 @@ END(stub_x32_execveat) * This cannot be done with SYSRET, so use the IRET return path instead. */ ENTRY(stub_rt_sigreturn) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 /* * SAVE_EXTRA_REGS result is not normally needed: * sigreturn overwrites all pt_regs->GPREGS. @@ -572,24 +475,19 @@ ENTRY(stub_rt_sigreturn) * we SAVE_EXTRA_REGS here. */ SAVE_EXTRA_REGS 8 - call sys_rt_sigreturn + call sys_rt_sigreturn return_from_stub: addq $8, %rsp - CFI_ADJUST_CFA_OFFSET -8 RESTORE_EXTRA_REGS - movq %rax,RAX(%rsp) - jmp int_ret_from_sys_call - CFI_ENDPROC + movq %rax, RAX(%rsp) + jmp int_ret_from_sys_call END(stub_rt_sigreturn) #ifdef CONFIG_X86_X32_ABI ENTRY(stub_x32_rt_sigreturn) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 SAVE_EXTRA_REGS 8 - call sys32_x32_rt_sigreturn - jmp return_from_stub - CFI_ENDPROC + call sys32_x32_rt_sigreturn + jmp return_from_stub END(stub_x32_rt_sigreturn) #endif @@ -599,36 +497,36 @@ END(stub_x32_rt_sigreturn) * rdi: prev task we switched from */ ENTRY(ret_from_fork) - DEFAULT_FRAME - LOCK ; btr $TIF_FORK,TI_flags(%r8) + LOCK ; btr $TIF_FORK, TI_flags(%r8) - pushq_cfi $0x0002 - popfq_cfi # reset kernel eflags + pushq $0x0002 + popfq /* reset kernel eflags */ - call schedule_tail # rdi: 'prev' task parameter + call schedule_tail /* rdi: 'prev' task parameter */ RESTORE_EXTRA_REGS - testb $3, CS(%rsp) # from kernel_thread? + testb $3, CS(%rsp) /* from kernel_thread? */ /* * By the time we get here, we have no idea whether our pt_regs, * ti flags, and ti status came from the 64-bit SYSCALL fast path, - * the slow path, or one of the ia32entry paths. + * the slow path, or one of the 32-bit compat paths. * Use IRET code path to return, since it can safely handle * all of the above. */ jnz int_ret_from_sys_call - /* We came from kernel_thread */ - /* nb: we depend on RESTORE_EXTRA_REGS above */ - movq %rbp, %rdi - call *%rbx - movl $0, RAX(%rsp) + /* + * We came from kernel_thread + * nb: we depend on RESTORE_EXTRA_REGS above + */ + movq %rbp, %rdi + call *%rbx + movl $0, RAX(%rsp) RESTORE_EXTRA_REGS - jmp int_ret_from_sys_call - CFI_ENDPROC + jmp int_ret_from_sys_call END(ret_from_fork) /* @@ -637,16 +535,13 @@ END(ret_from_fork) */ .align 8 ENTRY(irq_entries_start) - INTR_FRAME vector=FIRST_EXTERNAL_VECTOR .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ + pushq $(~vector+0x80) /* Note: always in signed byte range */ vector=vector+1 jmp common_interrupt - CFI_ADJUST_CFA_OFFSET -8 .align 8 .endr - CFI_ENDPROC END(irq_entries_start) /* @@ -672,7 +567,7 @@ END(irq_entries_start) /* this goes to 0(%rsp) for unwinder, not for saving the value: */ SAVE_EXTRA_REGS_RBP -RBP - leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ + leaq -RBP(%rsp), %rdi /* arg1 for \func (pointer to pt_regs) */ testb $3, CS-RBP(%rsp) jz 1f @@ -685,24 +580,14 @@ END(irq_entries_start) * a little cheaper to use a separate counter in the PDA (short of * moving irq_enter into assembly, which would be too much work) */ - movq %rsp, %rsi - incl PER_CPU_VAR(irq_count) - cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp - CFI_DEF_CFA_REGISTER rsi - pushq %rsi - /* - * For debugger: - * "CFA (Current Frame Address) is the value on stack + offset" - */ - CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ - 0x77 /* DW_OP_breg7 (rsp) */, 0, \ - 0x06 /* DW_OP_deref */, \ - 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \ - 0x22 /* DW_OP_plus */ + movq %rsp, %rsi + incl PER_CPU_VAR(irq_count) + cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp + pushq %rsi /* We entered an interrupt context - irqs are off: */ TRACE_IRQS_OFF - call \func + call \func .endm /* @@ -711,42 +596,36 @@ END(irq_entries_start) */ .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: - XCPT_FRAME ASM_CLAC - addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ + addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */ interrupt do_IRQ /* 0(%rsp): old RSP */ ret_from_intr: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - decl PER_CPU_VAR(irq_count) + decl PER_CPU_VAR(irq_count) /* Restore saved previous stack */ - popq %rsi - CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */ + popq %rsi /* return code expects complete pt_regs - adjust rsp accordingly: */ - leaq -RBP(%rsi),%rsp - CFI_DEF_CFA_REGISTER rsp - CFI_ADJUST_CFA_OFFSET RBP + leaq -RBP(%rsi), %rsp testb $3, CS(%rsp) jz retint_kernel /* Interrupt came from user space */ - +retint_user: GET_THREAD_INFO(%rcx) - /* - * %rcx: thread info. Interrupts off. - */ + + /* %rcx: thread info. Interrupts are off. */ retint_with_reschedule: - movl $_TIF_WORK_MASK,%edi + movl $_TIF_WORK_MASK, %edi retint_check: LOCKDEP_SYS_EXIT_IRQ - movl TI_flags(%rcx),%edx - andl %edi,%edx - CFI_REMEMBER_STATE - jnz retint_careful + movl TI_flags(%rcx), %edx + andl %edi, %edx + jnz retint_careful -retint_swapgs: /* return to user-space */ +retint_swapgs: /* return to user-space */ /* * The iretq could re-enable interrupts: */ @@ -761,9 +640,9 @@ retint_kernel: #ifdef CONFIG_PREEMPT /* Interrupts are off */ /* Check if we need preemption */ - bt $9,EFLAGS(%rsp) /* interrupts were off? */ + bt $9, EFLAGS(%rsp) /* were interrupts off? */ jnc 1f -0: cmpl $0,PER_CPU_VAR(__preempt_count) +0: cmpl $0, PER_CPU_VAR(__preempt_count) jnz 1f call preempt_schedule_irq jmp 0b @@ -781,8 +660,6 @@ retint_kernel: restore_c_regs_and_iret: RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 - -irq_return: INTERRUPT_RETURN ENTRY(native_iret) @@ -791,8 +668,8 @@ ENTRY(native_iret) * 64-bit mode SS:RSP on the exception stack is always valid. */ #ifdef CONFIG_X86_ESPFIX64 - testb $4,(SS-RIP)(%rsp) - jnz native_irq_return_ldt + testb $4, (SS-RIP)(%rsp) + jnz native_irq_return_ldt #endif .global native_irq_return_iret @@ -807,62 +684,60 @@ native_irq_return_iret: #ifdef CONFIG_X86_ESPFIX64 native_irq_return_ldt: - pushq_cfi %rax - pushq_cfi %rdi + pushq %rax + pushq %rdi SWAPGS - movq PER_CPU_VAR(espfix_waddr),%rdi - movq %rax,(0*8)(%rdi) /* RAX */ - movq (2*8)(%rsp),%rax /* RIP */ - movq %rax,(1*8)(%rdi) - movq (3*8)(%rsp),%rax /* CS */ - movq %rax,(2*8)(%rdi) - movq (4*8)(%rsp),%rax /* RFLAGS */ - movq %rax,(3*8)(%rdi) - movq (6*8)(%rsp),%rax /* SS */ - movq %rax,(5*8)(%rdi) - movq (5*8)(%rsp),%rax /* RSP */ - movq %rax,(4*8)(%rdi) - andl $0xffff0000,%eax - popq_cfi %rdi - orq PER_CPU_VAR(espfix_stack),%rax + movq PER_CPU_VAR(espfix_waddr), %rdi + movq %rax, (0*8)(%rdi) /* RAX */ + movq (2*8)(%rsp), %rax /* RIP */ + movq %rax, (1*8)(%rdi) + movq (3*8)(%rsp), %rax /* CS */ + movq %rax, (2*8)(%rdi) + movq (4*8)(%rsp), %rax /* RFLAGS */ + movq %rax, (3*8)(%rdi) + movq (6*8)(%rsp), %rax /* SS */ + movq %rax, (5*8)(%rdi) + movq (5*8)(%rsp), %rax /* RSP */ + movq %rax, (4*8)(%rdi) + andl $0xffff0000, %eax + popq %rdi + orq PER_CPU_VAR(espfix_stack), %rax SWAPGS - movq %rax,%rsp - popq_cfi %rax - jmp native_irq_return_iret + movq %rax, %rsp + popq %rax + jmp native_irq_return_iret #endif /* edi: workmask, edx: work */ retint_careful: - CFI_RESTORE_STATE - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal + bt $TIF_NEED_RESCHED, %edx + jnc retint_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq_cfi %rdi + pushq %rdi SCHEDULE_USER - popq_cfi %rdi + popq %rdi GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - jmp retint_check + jmp retint_check retint_signal: - testl $_TIF_DO_NOTIFY_MASK,%edx - jz retint_swapgs + testl $_TIF_DO_NOTIFY_MASK, %edx + jz retint_swapgs TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) SAVE_EXTRA_REGS - movq $-1,ORIG_RAX(%rsp) - xorl %esi,%esi # oldset - movq %rsp,%rdi # &pt_regs - call do_notify_resume + movq $-1, ORIG_RAX(%rsp) + xorl %esi, %esi /* oldset */ + movq %rsp, %rdi /* &pt_regs */ + call do_notify_resume RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) - jmp retint_with_reschedule + jmp retint_with_reschedule - CFI_ENDPROC END(common_interrupt) /* @@ -870,13 +745,11 @@ END(common_interrupt) */ .macro apicinterrupt3 num sym do_sym ENTRY(\sym) - INTR_FRAME ASM_CLAC - pushq_cfi $~(\num) + pushq $~(\num) .Lcommon_\sym: interrupt \do_sym - jmp ret_from_intr - CFI_ENDPROC + jmp ret_from_intr END(\sym) .endm @@ -898,55 +771,45 @@ trace_apicinterrupt \num \sym .endm #ifdef CONFIG_SMP -apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR \ - irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt -apicinterrupt3 REBOOT_VECTOR \ - reboot_interrupt smp_reboot_interrupt +apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt +apicinterrupt3 REBOOT_VECTOR reboot_interrupt smp_reboot_interrupt #endif #ifdef CONFIG_X86_UV -apicinterrupt3 UV_BAU_MESSAGE \ - uv_bau_message_intr1 uv_bau_message_interrupt +apicinterrupt3 UV_BAU_MESSAGE uv_bau_message_intr1 uv_bau_message_interrupt #endif -apicinterrupt LOCAL_TIMER_VECTOR \ - apic_timer_interrupt smp_apic_timer_interrupt -apicinterrupt X86_PLATFORM_IPI_VECTOR \ - x86_platform_ipi smp_x86_platform_ipi + +apicinterrupt LOCAL_TIMER_VECTOR apic_timer_interrupt smp_apic_timer_interrupt +apicinterrupt X86_PLATFORM_IPI_VECTOR x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_HAVE_KVM -apicinterrupt3 POSTED_INTR_VECTOR \ - kvm_posted_intr_ipi smp_kvm_posted_intr_ipi -apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR \ - kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi +apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi +apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi #endif #ifdef CONFIG_X86_MCE_THRESHOLD -apicinterrupt THRESHOLD_APIC_VECTOR \ - threshold_interrupt smp_threshold_interrupt +apicinterrupt THRESHOLD_APIC_VECTOR threshold_interrupt smp_threshold_interrupt +#endif + +#ifdef CONFIG_X86_MCE_AMD +apicinterrupt DEFERRED_ERROR_VECTOR deferred_error_interrupt smp_deferred_error_interrupt #endif #ifdef CONFIG_X86_THERMAL_VECTOR -apicinterrupt THERMAL_APIC_VECTOR \ - thermal_interrupt smp_thermal_interrupt +apicinterrupt THERMAL_APIC_VECTOR thermal_interrupt smp_thermal_interrupt #endif #ifdef CONFIG_SMP -apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ - call_function_single_interrupt smp_call_function_single_interrupt -apicinterrupt CALL_FUNCTION_VECTOR \ - call_function_interrupt smp_call_function_interrupt -apicinterrupt RESCHEDULE_VECTOR \ - reschedule_interrupt smp_reschedule_interrupt +apicinterrupt CALL_FUNCTION_SINGLE_VECTOR call_function_single_interrupt smp_call_function_single_interrupt +apicinterrupt CALL_FUNCTION_VECTOR call_function_interrupt smp_call_function_interrupt +apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt #endif -apicinterrupt ERROR_APIC_VECTOR \ - error_interrupt smp_error_interrupt -apicinterrupt SPURIOUS_APIC_VECTOR \ - spurious_interrupt smp_spurious_interrupt +apicinterrupt ERROR_APIC_VECTOR error_interrupt smp_error_interrupt +apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt #ifdef CONFIG_IRQ_WORK -apicinterrupt IRQ_WORK_VECTOR \ - irq_work_interrupt smp_irq_work_interrupt +apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt #endif /* @@ -961,100 +824,87 @@ ENTRY(\sym) .error "using shift_ist requires paranoid=1" .endif - .if \has_error_code - XCPT_FRAME - .else - INTR_FRAME - .endif - ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME .ifeq \has_error_code - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + pushq $-1 /* ORIG_RAX: no syscall to restart */ .endif ALLOC_PT_GPREGS_ON_STACK .if \paranoid .if \paranoid == 1 - CFI_REMEMBER_STATE - testb $3, CS(%rsp) /* If coming from userspace, switch */ - jnz 1f /* stacks. */ + testb $3, CS(%rsp) /* If coming from userspace, switch stacks */ + jnz 1f .endif - call paranoid_entry + call paranoid_entry .else - call error_entry + call error_entry .endif /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ - DEFAULT_FRAME 0 - .if \paranoid .if \shift_ist != -1 - TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ + TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ .else TRACE_IRQS_OFF .endif .endif - movq %rsp,%rdi /* pt_regs pointer */ + movq %rsp, %rdi /* pt_regs pointer */ .if \has_error_code - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + movq ORIG_RAX(%rsp), %rsi /* get error code */ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ .else - xorl %esi,%esi /* no error code */ + xorl %esi, %esi /* no error code */ .endif .if \shift_ist != -1 - subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) .endif - call \do_sym + call \do_sym .if \shift_ist != -1 - addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) .endif /* these procedures expect "no swapgs" flag in ebx */ .if \paranoid - jmp paranoid_exit + jmp paranoid_exit .else - jmp error_exit + jmp error_exit .endif .if \paranoid == 1 - CFI_RESTORE_STATE /* * Paranoid entry from userspace. Switch stacks and treat it * as a normal entry. This means that paranoid handlers * run in real process context if user_mode(regs). */ 1: - call error_entry + call error_entry - DEFAULT_FRAME 0 - movq %rsp,%rdi /* pt_regs pointer */ - call sync_regs - movq %rax,%rsp /* switch stack */ + movq %rsp, %rdi /* pt_regs pointer */ + call sync_regs + movq %rax, %rsp /* switch stack */ - movq %rsp,%rdi /* pt_regs pointer */ + movq %rsp, %rdi /* pt_regs pointer */ .if \has_error_code - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + movq ORIG_RAX(%rsp), %rsi /* get error code */ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ .else - xorl %esi,%esi /* no error code */ + xorl %esi, %esi /* no error code */ .endif - call \do_sym + call \do_sym - jmp error_exit /* %ebx: no swapgs flag */ + jmp error_exit /* %ebx: no swapgs flag */ .endif - - CFI_ENDPROC END(\sym) .endm @@ -1069,65 +919,58 @@ idtentry \sym \do_sym has_error_code=\has_error_code .endm #endif -idtentry divide_error do_divide_error has_error_code=0 -idtentry overflow do_overflow has_error_code=0 -idtentry bounds do_bounds has_error_code=0 -idtentry invalid_op do_invalid_op has_error_code=0 -idtentry device_not_available do_device_not_available has_error_code=0 -idtentry double_fault do_double_fault has_error_code=1 paranoid=2 -idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 -idtentry invalid_TSS do_invalid_TSS has_error_code=1 -idtentry segment_not_present do_segment_not_present has_error_code=1 -idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 -idtentry coprocessor_error do_coprocessor_error has_error_code=0 -idtentry alignment_check do_alignment_check has_error_code=1 -idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 - - - /* Reload gs selector with exception handling */ - /* edi: new selector */ +idtentry divide_error do_divide_error has_error_code=0 +idtentry overflow do_overflow has_error_code=0 +idtentry bounds do_bounds has_error_code=0 +idtentry invalid_op do_invalid_op has_error_code=0 +idtentry device_not_available do_device_not_available has_error_code=0 +idtentry double_fault do_double_fault has_error_code=1 paranoid=2 +idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 +idtentry invalid_TSS do_invalid_TSS has_error_code=1 +idtentry segment_not_present do_segment_not_present has_error_code=1 +idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 +idtentry coprocessor_error do_coprocessor_error has_error_code=0 +idtentry alignment_check do_alignment_check has_error_code=1 +idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 + + + /* + * Reload gs selector with exception handling + * edi: new selector + */ ENTRY(native_load_gs_index) - CFI_STARTPROC - pushfq_cfi + pushfq DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS gs_change: - movl %edi,%gs -2: mfence /* workaround */ + movl %edi, %gs +2: mfence /* workaround */ SWAPGS - popfq_cfi + popfq ret - CFI_ENDPROC END(native_load_gs_index) - _ASM_EXTABLE(gs_change,bad_gs) - .section .fixup,"ax" + _ASM_EXTABLE(gs_change, bad_gs) + .section .fixup, "ax" /* running with kernelgs */ bad_gs: - SWAPGS /* switch back to user gs */ - xorl %eax,%eax - movl %eax,%gs - jmp 2b + SWAPGS /* switch back to user gs */ + xorl %eax, %eax + movl %eax, %gs + jmp 2b .previous /* Call softirq on interrupt stack. Interrupts are off. */ ENTRY(do_softirq_own_stack) - CFI_STARTPROC - pushq_cfi %rbp - CFI_REL_OFFSET rbp,0 - mov %rsp,%rbp - CFI_DEF_CFA_REGISTER rbp - incl PER_CPU_VAR(irq_count) - cmove PER_CPU_VAR(irq_stack_ptr),%rsp - push %rbp # backlink for old unwinder - call __do_softirq + pushq %rbp + mov %rsp, %rbp + incl PER_CPU_VAR(irq_count) + cmove PER_CPU_VAR(irq_stack_ptr), %rsp + push %rbp /* frame pointer backlink */ + call __do_softirq leaveq - CFI_RESTORE rbp - CFI_DEF_CFA_REGISTER rsp - CFI_ADJUST_CFA_OFFSET -8 - decl PER_CPU_VAR(irq_count) + decl PER_CPU_VAR(irq_count) ret - CFI_ENDPROC END(do_softirq_own_stack) #ifdef CONFIG_XEN @@ -1146,29 +989,24 @@ idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 * existing activation in its critical region -- if so, we pop the current * activation and restart the handler using the previous one. */ -ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) - CFI_STARTPROC +ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */ + /* * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will * see the correct pointer to the pt_regs */ - movq %rdi, %rsp # we don't return, adjust the stack frame - CFI_ENDPROC - DEFAULT_FRAME -11: incl PER_CPU_VAR(irq_count) - movq %rsp,%rbp - CFI_DEF_CFA_REGISTER rbp - cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp - pushq %rbp # backlink for old unwinder - call xen_evtchn_do_upcall - popq %rsp - CFI_DEF_CFA_REGISTER rsp - decl PER_CPU_VAR(irq_count) + movq %rdi, %rsp /* we don't return, adjust the stack frame */ +11: incl PER_CPU_VAR(irq_count) + movq %rsp, %rbp + cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp + pushq %rbp /* frame pointer backlink */ + call xen_evtchn_do_upcall + popq %rsp + decl PER_CPU_VAR(irq_count) #ifndef CONFIG_PREEMPT - call xen_maybe_preempt_hcall + call xen_maybe_preempt_hcall #endif - jmp error_exit - CFI_ENDPROC + jmp error_exit END(xen_do_hypervisor_callback) /* @@ -1185,51 +1023,35 @@ END(xen_do_hypervisor_callback) * with its current contents: any discrepancy means we in category 1. */ ENTRY(xen_failsafe_callback) - INTR_FRAME 1 (6*8) - /*CFI_REL_OFFSET gs,GS*/ - /*CFI_REL_OFFSET fs,FS*/ - /*CFI_REL_OFFSET es,ES*/ - /*CFI_REL_OFFSET ds,DS*/ - CFI_REL_OFFSET r11,8 - CFI_REL_OFFSET rcx,0 - movl %ds,%ecx - cmpw %cx,0x10(%rsp) - CFI_REMEMBER_STATE - jne 1f - movl %es,%ecx - cmpw %cx,0x18(%rsp) - jne 1f - movl %fs,%ecx - cmpw %cx,0x20(%rsp) - jne 1f - movl %gs,%ecx - cmpw %cx,0x28(%rsp) - jne 1f + movl %ds, %ecx + cmpw %cx, 0x10(%rsp) + jne 1f + movl %es, %ecx + cmpw %cx, 0x18(%rsp) + jne 1f + movl %fs, %ecx + cmpw %cx, 0x20(%rsp) + jne 1f + movl %gs, %ecx + cmpw %cx, 0x28(%rsp) + jne 1f /* All segments match their saved values => Category 2 (Bad IRET). */ - movq (%rsp),%rcx - CFI_RESTORE rcx - movq 8(%rsp),%r11 - CFI_RESTORE r11 - addq $0x30,%rsp - CFI_ADJUST_CFA_OFFSET -0x30 - pushq_cfi $0 /* RIP */ - pushq_cfi %r11 - pushq_cfi %rcx - jmp general_protection - CFI_RESTORE_STATE + movq (%rsp), %rcx + movq 8(%rsp), %r11 + addq $0x30, %rsp + pushq $0 /* RIP */ + pushq %r11 + pushq %rcx + jmp general_protection 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ - movq (%rsp),%rcx - CFI_RESTORE rcx - movq 8(%rsp),%r11 - CFI_RESTORE r11 - addq $0x30,%rsp - CFI_ADJUST_CFA_OFFSET -0x30 - pushq_cfi $-1 /* orig_ax = -1 => not a system call */ + movq (%rsp), %rcx + movq 8(%rsp), %r11 + addq $0x30, %rsp + pushq $-1 /* orig_ax = -1 => not a system call */ ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS SAVE_EXTRA_REGS - jmp error_exit - CFI_ENDPROC + jmp error_exit END(xen_failsafe_callback) apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ @@ -1242,21 +1064,25 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ hyperv_callback_vector hyperv_vector_handler #endif /* CONFIG_HYPERV */ -idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK -idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK -idtentry stack_segment do_stack_segment has_error_code=1 +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry stack_segment do_stack_segment has_error_code=1 + #ifdef CONFIG_XEN -idtentry xen_debug do_debug has_error_code=0 -idtentry xen_int3 do_int3 has_error_code=0 -idtentry xen_stack_segment do_stack_segment has_error_code=1 +idtentry xen_debug do_debug has_error_code=0 +idtentry xen_int3 do_int3 has_error_code=0 +idtentry xen_stack_segment do_stack_segment has_error_code=1 #endif -idtentry general_protection do_general_protection has_error_code=1 -trace_idtentry page_fault do_page_fault has_error_code=1 + +idtentry general_protection do_general_protection has_error_code=1 +trace_idtentry page_fault do_page_fault has_error_code=1 + #ifdef CONFIG_KVM_GUEST -idtentry async_page_fault do_async_page_fault has_error_code=1 +idtentry async_page_fault do_async_page_fault has_error_code=1 #endif + #ifdef CONFIG_X86_MCE -idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) +idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) #endif /* @@ -1265,19 +1091,17 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector( * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(paranoid_entry) - XCPT_FRAME 1 15*8 cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 - movl $1,%ebx - movl $MSR_GS_BASE,%ecx + movl $1, %ebx + movl $MSR_GS_BASE, %ecx rdmsr - testl %edx,%edx - js 1f /* negative -> in kernel */ + testl %edx, %edx + js 1f /* negative -> in kernel */ SWAPGS - xorl %ebx,%ebx + xorl %ebx, %ebx 1: ret - CFI_ENDPROC END(paranoid_entry) /* @@ -1289,17 +1113,17 @@ END(paranoid_entry) * in syscall entry), so checking for preemption here would * be complicated. Fortunately, we there's no good reason * to try to handle preemption here. + * + * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ -/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) - DEFAULT_FRAME DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF_DEBUG - testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_exit_no_swapgs + testl %ebx, %ebx /* swapgs needed? */ + jnz paranoid_exit_no_swapgs TRACE_IRQS_IRETQ SWAPGS_UNSAFE_STACK - jmp paranoid_exit_restore + jmp paranoid_exit_restore paranoid_exit_no_swapgs: TRACE_IRQS_IRETQ_DEBUG paranoid_exit_restore: @@ -1307,24 +1131,24 @@ paranoid_exit_restore: RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 INTERRUPT_RETURN - CFI_ENDPROC END(paranoid_exit) /* * Save all registers in pt_regs, and switch gs if needed. - * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + * Return: EBX=0: came from user mode; EBX=1: otherwise */ ENTRY(error_entry) - XCPT_FRAME 1 15*8 cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 - xorl %ebx,%ebx + xorl %ebx, %ebx testb $3, CS+8(%rsp) jz error_kernelspace -error_swapgs: + + /* We entered from user mode */ SWAPGS -error_sti: + +error_entry_done: TRACE_IRQS_OFF ret @@ -1335,56 +1159,66 @@ error_sti: * for these here too. */ error_kernelspace: - CFI_REL_OFFSET rcx, RCX+8 - incl %ebx - leaq native_irq_return_iret(%rip),%rcx - cmpq %rcx,RIP+8(%rsp) - je error_bad_iret - movl %ecx,%eax /* zero extend */ - cmpq %rax,RIP+8(%rsp) - je bstep_iret - cmpq $gs_change,RIP+8(%rsp) - je error_swapgs - jmp error_sti + incl %ebx + leaq native_irq_return_iret(%rip), %rcx + cmpq %rcx, RIP+8(%rsp) + je error_bad_iret + movl %ecx, %eax /* zero extend */ + cmpq %rax, RIP+8(%rsp) + je bstep_iret + cmpq $gs_change, RIP+8(%rsp) + jne error_entry_done + + /* + * hack: gs_change can fail with user gsbase. If this happens, fix up + * gsbase and proceed. We'll fix up the exception and land in + * gs_change's error handler with kernel gsbase. + */ + SWAPGS + jmp error_entry_done bstep_iret: /* Fix truncated RIP */ - movq %rcx,RIP+8(%rsp) + movq %rcx, RIP+8(%rsp) /* fall through */ error_bad_iret: + /* + * We came from an IRET to user mode, so we have user gsbase. + * Switch to kernel gsbase: + */ SWAPGS - mov %rsp,%rdi - call fixup_bad_iret - mov %rax,%rsp - decl %ebx /* Return to usergs */ - jmp error_sti - CFI_ENDPROC + + /* + * Pretend that the exception came from user mode: set up pt_regs + * as if we faulted immediately after IRET and clear EBX so that + * error_exit knows that we will be returning to user mode. + */ + mov %rsp, %rdi + call fixup_bad_iret + mov %rax, %rsp + decl %ebx + jmp error_entry_done END(error_entry) -/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ +/* + * On entry, EBS is a "return to kernel mode" flag: + * 1: already in kernel mode, don't need SWAPGS + * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode + */ ENTRY(error_exit) - DEFAULT_FRAME - movl %ebx,%eax + movl %ebx, %eax RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - GET_THREAD_INFO(%rcx) - testl %eax,%eax - jnz retint_kernel - LOCKDEP_SYS_EXIT_IRQ - movl TI_flags(%rcx),%edx - movl $_TIF_WORK_MASK,%edi - andl %edi,%edx - jnz retint_careful - jmp retint_swapgs - CFI_ENDPROC + testl %eax, %eax + jnz retint_kernel + jmp retint_user END(error_exit) /* Runs on exception stack */ ENTRY(nmi) - INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME /* * We allow breakpoints in NMIs. If a breakpoint occurs, then @@ -1419,22 +1253,21 @@ ENTRY(nmi) */ /* Use %rdx as our temp variable throughout */ - pushq_cfi %rdx - CFI_REL_OFFSET rdx, 0 + pushq %rdx /* * If %cs was not the kernel segment, then the NMI triggered in user * space, which means it is definitely not nested. */ - cmpl $__KERNEL_CS, 16(%rsp) - jne first_nmi + cmpl $__KERNEL_CS, 16(%rsp) + jne first_nmi /* * Check the special variable on the stack to see if NMIs are * executing. */ - cmpl $1, -8(%rsp) - je nested_nmi + cmpl $1, -8(%rsp) + je nested_nmi /* * Now test if the previous stack was an NMI stack. @@ -1448,51 +1281,46 @@ ENTRY(nmi) cmpq %rdx, 4*8(%rsp) /* If the stack pointer is above the NMI stack, this is a normal NMI */ ja first_nmi + subq $EXCEPTION_STKSZ, %rdx cmpq %rdx, 4*8(%rsp) /* If it is below the NMI stack, it is a normal NMI */ jb first_nmi /* Ah, it is within the NMI stack, treat it as nested */ - CFI_REMEMBER_STATE - nested_nmi: /* * Do nothing if we interrupted the fixup in repeat_nmi. * It's about to repeat the NMI handler, so we are fine * with ignoring this one. */ - movq $repeat_nmi, %rdx - cmpq 8(%rsp), %rdx - ja 1f - movq $end_repeat_nmi, %rdx - cmpq 8(%rsp), %rdx - ja nested_nmi_out + movq $repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja 1f + movq $end_repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja nested_nmi_out 1: /* Set up the interrupted NMIs stack to jump to repeat_nmi */ - leaq -1*8(%rsp), %rdx - movq %rdx, %rsp - CFI_ADJUST_CFA_OFFSET 1*8 - leaq -10*8(%rsp), %rdx - pushq_cfi $__KERNEL_DS - pushq_cfi %rdx - pushfq_cfi - pushq_cfi $__KERNEL_CS - pushq_cfi $repeat_nmi + leaq -1*8(%rsp), %rdx + movq %rdx, %rsp + leaq -10*8(%rsp), %rdx + pushq $__KERNEL_DS + pushq %rdx + pushfq + pushq $__KERNEL_CS + pushq $repeat_nmi /* Put stack back */ - addq $(6*8), %rsp - CFI_ADJUST_CFA_OFFSET -6*8 + addq $(6*8), %rsp nested_nmi_out: - popq_cfi %rdx - CFI_RESTORE rdx + popq %rdx /* No need to check faults here */ INTERRUPT_RETURN - CFI_RESTORE_STATE first_nmi: /* * Because nested NMIs will use the pushed location that we @@ -1530,23 +1358,18 @@ first_nmi: * is also used by nested NMIs and can not be trusted on exit. */ /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ - movq (%rsp), %rdx - CFI_RESTORE rdx + movq (%rsp), %rdx /* Set the NMI executing variable on the stack. */ - pushq_cfi $1 + pushq $1 - /* - * Leave room for the "copied" frame - */ - subq $(5*8), %rsp - CFI_ADJUST_CFA_OFFSET 5*8 + /* Leave room for the "copied" frame */ + subq $(5*8), %rsp /* Copy the stack frame to the Saved frame */ .rept 5 - pushq_cfi 11*8(%rsp) + pushq 11*8(%rsp) .endr - CFI_DEF_CFA_OFFSET 5*8 /* Everything up to here is safe from nested NMIs */ @@ -1565,16 +1388,14 @@ repeat_nmi: * is benign for the non-repeat case, where 1 was pushed just above * to this very stack slot). */ - movq $1, 10*8(%rsp) + movq $1, 10*8(%rsp) /* Make another copy, this one may be modified by nested NMIs */ - addq $(10*8), %rsp - CFI_ADJUST_CFA_OFFSET -10*8 + addq $(10*8), %rsp .rept 5 - pushq_cfi -6*8(%rsp) + pushq -6*8(%rsp) .endr - subq $(5*8), %rsp - CFI_DEF_CFA_OFFSET 5*8 + subq $(5*8), %rsp end_repeat_nmi: /* @@ -1582,7 +1403,7 @@ end_repeat_nmi: * NMI if the first NMI took an exception and reset our iret stack * so that we repeat another NMI. */ - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + pushq $-1 /* ORIG_RAX: no syscall to restart */ ALLOC_PT_GPREGS_ON_STACK /* @@ -1592,8 +1413,7 @@ end_repeat_nmi: * setting NEED_RESCHED or anything that normal interrupts and * exceptions might do. */ - call paranoid_entry - DEFAULT_FRAME 0 + call paranoid_entry /* * Save off the CR2 register. If we take a page fault in the NMI then @@ -1604,21 +1424,21 @@ end_repeat_nmi: * origin fault. Save it off and restore it if it changes. * Use the r12 callee-saved register. */ - movq %cr2, %r12 + movq %cr2, %r12 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ - movq %rsp,%rdi - movq $-1,%rsi - call do_nmi + movq %rsp, %rdi + movq $-1, %rsi + call do_nmi /* Did the NMI take a page fault? Restore cr2 if it did */ - movq %cr2, %rcx - cmpq %rcx, %r12 - je 1f - movq %r12, %cr2 + movq %cr2, %rcx + cmpq %rcx, %r12 + je 1f + movq %r12, %cr2 1: - testl %ebx,%ebx /* swapgs needed? */ - jnz nmi_restore + testl %ebx, %ebx /* swapgs needed? */ + jnz nmi_restore nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: @@ -1628,15 +1448,11 @@ nmi_restore: REMOVE_PT_GPREGS_FROM_STACK 6*8 /* Clear the NMI executing stack variable */ - movq $0, 5*8(%rsp) - jmp irq_return - CFI_ENDPROC + movq $0, 5*8(%rsp) + INTERRUPT_RETURN END(nmi) ENTRY(ignore_sysret) - CFI_STARTPROC - mov $-ENOSYS,%eax + mov $-ENOSYS, %eax sysret - CFI_ENDPROC END(ignore_sysret) - diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S new file mode 100644 index 000000000000..bb187a6a877c --- /dev/null +++ b/arch/x86/entry/entry_64_compat.S @@ -0,0 +1,556 @@ +/* + * Compatibility mode system call entry point for x86-64. + * + * Copyright 2000-2002 Andi Kleen, SuSE Labs. + */ +#include "calling.h" +#include <asm/asm-offsets.h> +#include <asm/current.h> +#include <asm/errno.h> +#include <asm/ia32_unistd.h> +#include <asm/thread_info.h> +#include <asm/segment.h> +#include <asm/irqflags.h> +#include <asm/asm.h> +#include <asm/smap.h> +#include <linux/linkage.h> +#include <linux/err.h> + +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ +#include <linux/elf-em.h> +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +# define sysexit_audit ia32_ret_from_sys_call +# define sysretl_audit ia32_ret_from_sys_call +#endif + + .section .entry.text, "ax" + +#ifdef CONFIG_PARAVIRT +ENTRY(native_usergs_sysret32) + swapgs + sysretl +ENDPROC(native_usergs_sysret32) +#endif + +/* + * 32-bit SYSENTER instruction entry. + * + * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. + * IF and VM in rflags are cleared (IOW: interrupts are off). + * SYSENTER does not save anything on the stack, + * and does not save old rip (!!!) and rflags. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp user stack + * 0(%ebp) arg6 + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. We set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(entry_SYSENTER_compat) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + SWAPGS_UNSAFE_STACK + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %ebp, %ebp + movl %eax, %eax + + movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d + + /* Construct struct pt_regs on stack */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %rbp /* pt_regs->sp */ + pushfq /* pt_regs->flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + cld + sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ + + /* + * no need to do an access_ok check here because rbp has been + * 32-bit zero extended + */ + ASM_STAC +1: movl (%rbp), %ebp + _ASM_EXTABLE(1b, ia32_badarg) + ASM_CLAC + + /* + * Sysenter doesn't filter flags, so we need to clear NT + * ourselves. To save a few cycles, we can check whether + * NT was set instead of doing an unconditional popfq. + */ + testl $X86_EFLAGS_NT, EFLAGS(%rsp) + jnz sysenter_fix_flags +sysenter_flags_fixed: + + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysenter_tracesys + +sysenter_do_call: + /* 32-bit syscall -> 64-bit C ABI argument conversion */ + movl %edi, %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ + xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ + movl %ebx, %edi /* arg1 */ + movl %edx, %edx /* arg3 (zero extension) */ +sysenter_dispatch: + cmpq $(IA32_NR_syscalls-1), %rax + ja 1f + call *ia32_sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) +1: + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysexit_audit +sysexit_from_sys_call: + /* + * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an + * NMI between STI and SYSEXIT has poorly specified behavior, + * and and NMI followed by an IRQ with usergs is fatal. So + * we just pretend we're using SYSEXIT but we really use + * SYSRETL instead. + * + * This code path is still called 'sysexit' because it pairs + * with 'sysenter' and it uses the SYSENTER calling convention. + */ + andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + movl RIP(%rsp), %ecx /* User %eip */ + RESTORE_RSI_RDI + xorl %edx, %edx /* Do not leak kernel information */ + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r10 + movl EFLAGS(%rsp), %r11d /* User eflags */ + TRACE_IRQS_ON + + /* + * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, + * since it avoids a dicey window with interrupts enabled. + */ + movl RSP(%rsp), %esp + + /* + * USERGS_SYSRET32 does: + * gsbase = user's gs base + * eip = ecx + * rflags = r11 + * cs = __USER32_CS + * ss = __USER_DS + * + * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does: + * + * pop %ebp + * pop %edx + * pop %ecx + * + * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to + * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's + * address (already known to user code), and R12-R15 are + * callee-saved and therefore don't contain any interesting + * kernel data. + */ + USERGS_SYSRET32 + +#ifdef CONFIG_AUDITSYSCALL + .macro auditsys_entry_common + /* + * At this point, registers hold syscall args in the 32-bit syscall ABI: + * EAX is syscall number, the 6 args are in EBX,ECX,EDX,ESI,EDI,EBP. + * + * We want to pass them to __audit_syscall_entry(), which is a 64-bit + * C function with 5 parameters, so shuffle them to match what + * the function expects: RDI,RSI,RDX,RCX,R8. + */ + movl %esi, %r8d /* arg5 (R8 ) <= 4th syscall arg (ESI) */ + xchg %ecx, %edx /* arg4 (RCX) <= 3rd syscall arg (EDX) */ + /* arg3 (RDX) <= 2nd syscall arg (ECX) */ + movl %ebx, %esi /* arg2 (RSI) <= 1st syscall arg (EBX) */ + movl %eax, %edi /* arg1 (RDI) <= syscall number (EAX) */ + call __audit_syscall_entry + + /* + * We are going to jump back to the syscall dispatch code. + * Prepare syscall args as required by the 64-bit C ABI. + * Registers clobbered by __audit_syscall_entry() are + * loaded from pt_regs on stack: + */ + movl ORIG_RAX(%rsp), %eax /* syscall number */ + movl %ebx, %edi /* arg1 */ + movl RCX(%rsp), %esi /* arg2 */ + movl RDX(%rsp), %edx /* arg3 */ + movl RSI(%rsp), %ecx /* arg4 */ + movl RDI(%rsp), %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ + .endm + + .macro auditsys_exit exit + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_ret_from_sys_call + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + movl %eax, %esi /* second arg, syscall return value */ + cmpl $-MAX_ERRNO, %eax /* is it an error ? */ + jbe 1f + movslq %eax, %rsi /* if error sign extend to 64 bits */ +1: setbe %al /* 1 if error, 0 if not */ + movzbl %al, %edi /* zero-extend that into %edi */ + call __audit_syscall_exit + movq RAX(%rsp), %rax /* reload syscall return value */ + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz \exit + xorl %eax, %eax /* Do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) + jmp int_with_check + .endm + +sysenter_auditsys: + auditsys_entry_common + jmp sysenter_dispatch + +sysexit_audit: + auditsys_exit sysexit_from_sys_call +#endif + +sysenter_fix_flags: + pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) + popfq + jmp sysenter_flags_fixed + +sysenter_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz sysenter_auditsys +#endif + SAVE_EXTRA_REGS + xorl %eax, %eax /* Do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) + movq %rsp, %rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + + /* Reload arg registers from stack. (see sysenter_tracesys) */ + movl RCX(%rsp), %ecx + movl RDX(%rsp), %edx + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi + movl %eax, %eax /* zero extension */ + + RESTORE_EXTRA_REGS + jmp sysenter_do_call +ENDPROC(entry_SYSENTER_compat) + +/* + * 32-bit SYSCALL instruction entry. + * + * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * then loads new ss, cs, and rip from previously programmed MSRs. + * rflags gets masked by a value from another MSR (so CLD and CLAC + * are not needed). SYSCALL does not save anything on the stack + * and does not change rsp. + * + * Note: rflags saving+masking-with-MSR happens only in Long mode + * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). + * Don't get confused: rflags saving+masking depends on Long Mode Active bit + * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes + * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). + * + * Arguments: + * eax system call number + * ecx return address + * ebx arg1 + * ebp arg2 (note: not saved in the stack frame, should not be touched) + * edx arg3 + * esi arg4 + * edi arg5 + * esp user stack + * 0(%esp) arg6 + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. We set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(entry_SYSCALL_compat) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + SWAPGS_UNSAFE_STACK + movl %esp, %r8d + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %eax, %eax + + /* Construct struct pt_regs on stack */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %r8 /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rbp /* pt_regs->cx */ + movl %ebp, %ecx + pushq $-ENOSYS /* pt_regs->ax */ + sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ + + /* + * No need to do an access_ok check here because r8 has been + * 32-bit zero extended: + */ + ASM_STAC +1: movl (%r8), %ebp + _ASM_EXTABLE(1b, ia32_badarg) + ASM_CLAC + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz cstar_tracesys + +cstar_do_call: + /* 32-bit syscall -> 64-bit C ABI argument conversion */ + movl %edi, %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ + xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ + movl %ebx, %edi /* arg1 */ + movl %edx, %edx /* arg3 (zero extension) */ + +cstar_dispatch: + cmpq $(IA32_NR_syscalls-1), %rax + ja 1f + + call *ia32_sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) +1: + movl RCX(%rsp), %ebp + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysretl_audit + +sysretl_from_sys_call: + andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + RESTORE_RSI_RDI_RDX + movl RIP(%rsp), %ecx + movl EFLAGS(%rsp), %r11d + xorq %r10, %r10 + xorq %r9, %r9 + xorq %r8, %r8 + TRACE_IRQS_ON + movl RSP(%rsp), %esp + /* + * 64-bit->32-bit SYSRET restores eip from ecx, + * eflags from r11 (but RF and VM bits are forced to 0), + * cs and ss are loaded from MSRs. + * (Note: 32-bit->32-bit SYSRET is different: since r11 + * does not exist, it merely sets eflags.IF=1). + * + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss + * descriptor is not reinitialized. This means that we must + * avoid SYSRET with SS == NULL, which could happen if we schedule, + * exit the kernel, and re-enter using an interrupt vector. (All + * interrupt entries on x86_64 set SS to NULL.) We prevent that + * from happening by reloading SS in __switch_to. + */ + USERGS_SYSRET32 + +#ifdef CONFIG_AUDITSYSCALL +cstar_auditsys: + auditsys_entry_common + jmp cstar_dispatch + +sysretl_audit: + auditsys_exit sysretl_from_sys_call +#endif + +cstar_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz cstar_auditsys +#endif + SAVE_EXTRA_REGS + xorl %eax, %eax /* Do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) + movq %rsp, %rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + + /* Reload arg registers from stack. (see sysenter_tracesys) */ + movl RCX(%rsp), %ecx + movl RDX(%rsp), %edx + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi + movl %eax, %eax /* zero extension */ + + RESTORE_EXTRA_REGS + jmp cstar_do_call +END(entry_SYSCALL_compat) + +ia32_badarg: + ASM_CLAC + movq $-EFAULT, RAX(%rsp) +ia32_ret_from_sys_call: + xorl %eax, %eax /* Do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) + jmp int_ret_from_sys_call + +/* + * Emulated IA32 system calls via int 0x80. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp arg6 (note: not saved in the stack frame, should not be touched) + * + * Notes: + * Uses the same stack frame as the x86-64 version. + * All registers except eax must be saved (but ptrace may violate that). + * Arguments are zero extended. For system calls that want sign extension and + * take long arguments a wrapper is needed. Most calls can just be called + * directly. + * Assumes it is only called from user space and entered with interrupts off. + */ + +ENTRY(entry_INT80_compat) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + PARAVIRT_ADJUST_EXCEPTION_FRAME + SWAPGS + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %eax, %eax + + /* Construct struct pt_regs on stack (iret frame is already on stack) */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + pushq $0 /* pt_regs->r8 */ + pushq $0 /* pt_regs->r9 */ + pushq $0 /* pt_regs->r10 */ + pushq $0 /* pt_regs->r11 */ + cld + sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ + + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_tracesys + +ia32_do_call: + /* 32-bit syscall -> 64-bit C ABI argument conversion */ + movl %edi, %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ + xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ + movl %ebx, %edi /* arg1 */ + movl %edx, %edx /* arg3 (zero extension) */ + cmpq $(IA32_NR_syscalls-1), %rax + ja 1f + + call *ia32_sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) +1: + jmp int_ret_from_sys_call + +ia32_tracesys: + SAVE_EXTRA_REGS + movq %rsp, %rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + /* + * Reload arg registers from stack in case ptrace changed them. + * Don't reload %eax because syscall_trace_enter() returned + * the %rax value we should see. But do truncate it to 32 bits. + * If it's -1 to make us punt the syscall, then (u32)-1 is still + * an appropriately invalid value. + */ + movl RCX(%rsp), %ecx + movl RDX(%rsp), %edx + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi + movl %eax, %eax /* zero extension */ + RESTORE_EXTRA_REGS + jmp ia32_do_call +END(entry_INT80_compat) + + .macro PTREGSCALL label, func + ALIGN +GLOBAL(\label) + leaq \func(%rip), %rax + jmp ia32_ptregs_common + .endm + + PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn + PTREGSCALL stub32_sigreturn, sys32_sigreturn + PTREGSCALL stub32_fork, sys_fork + PTREGSCALL stub32_vfork, sys_vfork + + ALIGN +GLOBAL(stub32_clone) + leaq sys_clone(%rip), %rax + /* + * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr). + * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val). + * + * The native 64-bit kernel's sys_clone() implements the latter, + * so we need to swap arguments here before calling it: + */ + xchg %r8, %rcx + jmp ia32_ptregs_common + + ALIGN +ia32_ptregs_common: + SAVE_EXTRA_REGS 8 + call *%rax + RESTORE_EXTRA_REGS 8 + ret +END(ia32_ptregs_common) diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/entry/syscall_32.c index 3777189c4a19..8ea34f94e973 100644 --- a/arch/x86/kernel/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -10,7 +10,7 @@ #else #define SYM(sym, compat) sym #define ia32_sys_call_table sys_call_table -#define __NR_ia32_syscall_max __NR_syscall_max +#define __NR_syscall_compat_max __NR_syscall_max #endif #define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; @@ -23,11 +23,11 @@ typedef asmlinkage void (*sys_call_ptr_t)(void); extern asmlinkage void sys_ni_syscall(void); -__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { +__visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ - [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall, + [0 ... __NR_syscall_compat_max] = &sys_ni_syscall, #include <asm/syscalls_32.h> }; diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/entry/syscall_64.c index 4ac730b37f0b..4ac730b37f0b 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/entry/syscall_64.c diff --git a/arch/x86/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile index a55abb9f6c5e..57aa59fd140c 100644 --- a/arch/x86/syscalls/Makefile +++ b/arch/x86/entry/syscalls/Makefile @@ -1,5 +1,5 @@ -out := $(obj)/../include/generated/asm -uapi := $(obj)/../include/generated/uapi/asm +out := $(obj)/../../include/generated/asm +uapi := $(obj)/../../include/generated/uapi/asm # Create output directory if not already present _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \ diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index ef8187f9d28d..ef8187f9d28d 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 9ef32d5f1b19..9ef32d5f1b19 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl diff --git a/arch/x86/syscalls/syscallhdr.sh b/arch/x86/entry/syscalls/syscallhdr.sh index 31fd5f1f38f7..31fd5f1f38f7 100644 --- a/arch/x86/syscalls/syscallhdr.sh +++ b/arch/x86/entry/syscalls/syscallhdr.sh diff --git a/arch/x86/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index 0e7f8ec071e7..0e7f8ec071e7 100644 --- a/arch/x86/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/entry/thunk_32.S index 5eb715087b80..e9acf5f4fc92 100644 --- a/arch/x86/lib/thunk_32.S +++ b/arch/x86/entry/thunk_32.S @@ -6,16 +6,14 @@ */ #include <linux/linkage.h> #include <asm/asm.h> - #include <asm/dwarf2.h> /* put return address in eax (arg1) */ .macro THUNK name, func, put_ret_addr_in_eax=0 .globl \name \name: - CFI_STARTPROC - pushl_cfi_reg eax - pushl_cfi_reg ecx - pushl_cfi_reg edx + pushl %eax + pushl %ecx + pushl %edx .if \put_ret_addr_in_eax /* Place EIP in the arg1 */ @@ -23,11 +21,10 @@ .endif call \func - popl_cfi_reg edx - popl_cfi_reg ecx - popl_cfi_reg eax + popl %edx + popl %ecx + popl %eax ret - CFI_ENDPROC _ASM_NOKPROBE(\name) .endm diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/entry/thunk_64.S index f89ba4e93025..3e95681b4e2d 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -6,35 +6,32 @@ * Subject to the GNU public license, v.2. No warranty of any kind. */ #include <linux/linkage.h> -#include <asm/dwarf2.h> -#include <asm/calling.h> +#include "calling.h" #include <asm/asm.h> /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ .macro THUNK name, func, put_ret_addr_in_rdi=0 .globl \name \name: - CFI_STARTPROC /* this one pushes 9 elems, the next one would be %rIP */ - pushq_cfi_reg rdi - pushq_cfi_reg rsi - pushq_cfi_reg rdx - pushq_cfi_reg rcx - pushq_cfi_reg rax - pushq_cfi_reg r8 - pushq_cfi_reg r9 - pushq_cfi_reg r10 - pushq_cfi_reg r11 + pushq %rdi + pushq %rsi + pushq %rdx + pushq %rcx + pushq %rax + pushq %r8 + pushq %r9 + pushq %r10 + pushq %r11 .if \put_ret_addr_in_rdi /* 9*8(%rsp) is return addr on stack */ - movq_cfi_restore 9*8, rdi + movq 9*8(%rsp), %rdi .endif call \func jmp restore - CFI_ENDPROC _ASM_NOKPROBE(\name) .endm @@ -57,19 +54,16 @@ #if defined(CONFIG_TRACE_IRQFLAGS) \ || defined(CONFIG_DEBUG_LOCK_ALLOC) \ || defined(CONFIG_PREEMPT) - CFI_STARTPROC - CFI_ADJUST_CFA_OFFSET 9*8 restore: - popq_cfi_reg r11 - popq_cfi_reg r10 - popq_cfi_reg r9 - popq_cfi_reg r8 - popq_cfi_reg rax - popq_cfi_reg rcx - popq_cfi_reg rdx - popq_cfi_reg rsi - popq_cfi_reg rdi + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + popq %rdi ret - CFI_ENDPROC _ASM_NOKPROBE(restore) #endif diff --git a/arch/x86/vdso/.gitignore b/arch/x86/entry/vdso/.gitignore index aae8ffdd5880..aae8ffdd5880 100644 --- a/arch/x86/vdso/.gitignore +++ b/arch/x86/entry/vdso/.gitignore diff --git a/arch/x86/vdso/Makefile b/arch/x86/entry/vdso/Makefile index e97032069f88..e97032069f88 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile diff --git a/arch/x86/vdso/checkundef.sh b/arch/x86/entry/vdso/checkundef.sh index 7ee90a9b549d..7ee90a9b549d 100755 --- a/arch/x86/vdso/checkundef.sh +++ b/arch/x86/entry/vdso/checkundef.sh diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 9793322751e0..9793322751e0 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index de2c921025f5..de2c921025f5 100644 --- a/arch/x86/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S diff --git a/arch/x86/vdso/vdso-note.S b/arch/x86/entry/vdso/vdso-note.S index 79a071e4357e..79a071e4357e 100644 --- a/arch/x86/vdso/vdso-note.S +++ b/arch/x86/entry/vdso/vdso-note.S diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S index 6807932643c2..6807932643c2 100644 --- a/arch/x86/vdso/vdso.lds.S +++ b/arch/x86/entry/vdso/vdso.lds.S diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 8627db24a7f6..8627db24a7f6 100644 --- a/arch/x86/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index 0224987556ce..0224987556ce 100644 --- a/arch/x86/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index e904c270573b..e904c270573b 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c diff --git a/arch/x86/vdso/vdso32/.gitignore b/arch/x86/entry/vdso/vdso32/.gitignore index e45fba9d0ced..e45fba9d0ced 100644 --- a/arch/x86/vdso/vdso32/.gitignore +++ b/arch/x86/entry/vdso/vdso32/.gitignore diff --git a/arch/x86/vdso/vdso32/int80.S b/arch/x86/entry/vdso/vdso32/int80.S index b15b7c01aedb..b15b7c01aedb 100644 --- a/arch/x86/vdso/vdso32/int80.S +++ b/arch/x86/entry/vdso/vdso32/int80.S diff --git a/arch/x86/vdso/vdso32/note.S b/arch/x86/entry/vdso/vdso32/note.S index c83f25734696..c83f25734696 100644 --- a/arch/x86/vdso/vdso32/note.S +++ b/arch/x86/entry/vdso/vdso32/note.S diff --git a/arch/x86/vdso/vdso32/sigreturn.S b/arch/x86/entry/vdso/vdso32/sigreturn.S index d7ec4e251c0a..d7ec4e251c0a 100644 --- a/arch/x86/vdso/vdso32/sigreturn.S +++ b/arch/x86/entry/vdso/vdso32/sigreturn.S diff --git a/arch/x86/vdso/vdso32/syscall.S b/arch/x86/entry/vdso/vdso32/syscall.S index 6b286bb5251c..6b286bb5251c 100644 --- a/arch/x86/vdso/vdso32/syscall.S +++ b/arch/x86/entry/vdso/vdso32/syscall.S diff --git a/arch/x86/vdso/vdso32/sysenter.S b/arch/x86/entry/vdso/vdso32/sysenter.S index e354bceee0e0..e354bceee0e0 100644 --- a/arch/x86/vdso/vdso32/sysenter.S +++ b/arch/x86/entry/vdso/vdso32/sysenter.S diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c index 175cc72c0f68..175cc72c0f68 100644 --- a/arch/x86/vdso/vdso32/vclock_gettime.c +++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c diff --git a/arch/x86/vdso/vdso32/vdso-fakesections.c b/arch/x86/entry/vdso/vdso32/vdso-fakesections.c index 541468e25265..541468e25265 100644 --- a/arch/x86/vdso/vdso32/vdso-fakesections.c +++ b/arch/x86/entry/vdso/vdso32/vdso-fakesections.c diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S index 31056cf294bf..31056cf294bf 100644 --- a/arch/x86/vdso/vdso32/vdso32.lds.S +++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S diff --git a/arch/x86/vdso/vdsox32.lds.S b/arch/x86/entry/vdso/vdsox32.lds.S index 697c11ece90c..697c11ece90c 100644 --- a/arch/x86/vdso/vdsox32.lds.S +++ b/arch/x86/entry/vdso/vdsox32.lds.S diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c index 8ec3d1f4ce9a..8ec3d1f4ce9a 100644 --- a/arch/x86/vdso/vgetcpu.c +++ b/arch/x86/entry/vdso/vgetcpu.c diff --git a/arch/x86/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 1c9f750c3859..1c9f750c3859 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c diff --git a/arch/x86/entry/vsyscall/Makefile b/arch/x86/entry/vsyscall/Makefile new file mode 100644 index 000000000000..a9f4856f622a --- /dev/null +++ b/arch/x86/entry/vsyscall/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the x86 low level vsyscall code +# +obj-y := vsyscall_gtod.o + +obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o + diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 2dcc6ff6fdcc..2dcc6ff6fdcc 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/entry/vsyscall/vsyscall_emu_64.S index c9596a9af159..c9596a9af159 100644 --- a/arch/x86/kernel/vsyscall_emu_64.S +++ b/arch/x86/entry/vsyscall/vsyscall_emu_64.S diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c index 51e330416995..51e330416995 100644 --- a/arch/x86/kernel/vsyscall_gtod.c +++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c diff --git a/arch/x86/kernel/vsyscall_trace.h b/arch/x86/entry/vsyscall/vsyscall_trace.h index a8b2edec54fe..9dd7359a38a8 100644 --- a/arch/x86/kernel/vsyscall_trace.h +++ b/arch/x86/entry/vsyscall/vsyscall_trace.h @@ -24,6 +24,6 @@ TRACE_EVENT(emulate_vsyscall, #endif #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../arch/x86/kernel +#define TRACE_INCLUDE_PATH ../../arch/x86/entry/vsyscall/ #define TRACE_INCLUDE_FILE vsyscall_trace #include <trace/define_trace.h> diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile index bb635c641869..cd4339bae066 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile @@ -2,7 +2,7 @@ # Makefile for the ia32 kernel emulation subsystem. # -obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o +obj-$(CONFIG_IA32_EMULATION) := sys_ia32.o ia32_signal.o obj-$(CONFIG_IA32_AOUT) += ia32_aout.o diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S deleted file mode 100644 index 63450a596800..000000000000 --- a/arch/x86/ia32/ia32entry.S +++ /dev/null @@ -1,591 +0,0 @@ -/* - * Compatibility mode system call entry point for x86-64. - * - * Copyright 2000-2002 Andi Kleen, SuSE Labs. - */ - -#include <asm/dwarf2.h> -#include <asm/calling.h> -#include <asm/asm-offsets.h> -#include <asm/current.h> -#include <asm/errno.h> -#include <asm/ia32_unistd.h> -#include <asm/thread_info.h> -#include <asm/segment.h> -#include <asm/irqflags.h> -#include <asm/asm.h> -#include <asm/smap.h> -#include <linux/linkage.h> -#include <linux/err.h> - -/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ -#include <linux/elf-em.h> -#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_LE 0x40000000 - -#ifndef CONFIG_AUDITSYSCALL -#define sysexit_audit ia32_ret_from_sys_call -#define sysretl_audit ia32_ret_from_sys_call -#endif - - .section .entry.text, "ax" - - /* clobbers %rax */ - .macro CLEAR_RREGS _r9=rax - xorl %eax,%eax - movq %rax,R11(%rsp) - movq %rax,R10(%rsp) - movq %\_r9,R9(%rsp) - movq %rax,R8(%rsp) - .endm - - /* - * Reload arg registers from stack in case ptrace changed them. - * We don't reload %eax because syscall_trace_enter() returned - * the %rax value we should see. Instead, we just truncate that - * value to 32 bits again as we did on entry from user mode. - * If it's a new value set by user_regset during entry tracing, - * this matches the normal truncation of the user-mode value. - * If it's -1 to make us punt the syscall, then (u32)-1 is still - * an appropriately invalid value. - */ - .macro LOAD_ARGS32 _r9=0 - .if \_r9 - movl R9(%rsp),%r9d - .endif - movl RCX(%rsp),%ecx - movl RDX(%rsp),%edx - movl RSI(%rsp),%esi - movl RDI(%rsp),%edi - movl %eax,%eax /* zero extension */ - .endm - - .macro CFI_STARTPROC32 simple - CFI_STARTPROC \simple - CFI_UNDEFINED r8 - CFI_UNDEFINED r9 - CFI_UNDEFINED r10 - CFI_UNDEFINED r11 - CFI_UNDEFINED r12 - CFI_UNDEFINED r13 - CFI_UNDEFINED r14 - CFI_UNDEFINED r15 - .endm - -#ifdef CONFIG_PARAVIRT -ENTRY(native_usergs_sysret32) - swapgs - sysretl -ENDPROC(native_usergs_sysret32) -#endif - -/* - * 32bit SYSENTER instruction entry. - * - * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. - * IF and VM in rflags are cleared (IOW: interrupts are off). - * SYSENTER does not save anything on the stack, - * and does not save old rip (!!!) and rflags. - * - * Arguments: - * eax system call number - * ebx arg1 - * ecx arg2 - * edx arg3 - * esi arg4 - * edi arg5 - * ebp user stack - * 0(%ebp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. - */ -ENTRY(ia32_sysenter_target) - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,0 - CFI_REGISTER rsp,rbp - - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - SWAPGS_UNSAFE_STACK - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %ebp, %ebp - movl %eax, %eax - - movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d - CFI_REGISTER rip,r10 - - /* Construct struct pt_regs on stack */ - pushq_cfi $__USER32_DS /* pt_regs->ss */ - pushq_cfi %rbp /* pt_regs->sp */ - CFI_REL_OFFSET rsp,0 - pushfq_cfi /* pt_regs->flags */ - pushq_cfi $__USER32_CS /* pt_regs->cs */ - pushq_cfi %r10 /* pt_regs->ip = thread_info->sysenter_return */ - CFI_REL_OFFSET rip,0 - pushq_cfi_reg rax /* pt_regs->orig_ax */ - pushq_cfi_reg rdi /* pt_regs->di */ - pushq_cfi_reg rsi /* pt_regs->si */ - pushq_cfi_reg rdx /* pt_regs->dx */ - pushq_cfi_reg rcx /* pt_regs->cx */ - pushq_cfi $-ENOSYS /* pt_regs->ax */ - cld - sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ - CFI_ADJUST_CFA_OFFSET 10*8 - - /* - * no need to do an access_ok check here because rbp has been - * 32bit zero extended - */ - ASM_STAC -1: movl (%rbp),%ebp - _ASM_EXTABLE(1b,ia32_badarg) - ASM_CLAC - - /* - * Sysenter doesn't filter flags, so we need to clear NT - * ourselves. To save a few cycles, we can check whether - * NT was set instead of doing an unconditional popfq. - */ - testl $X86_EFLAGS_NT,EFLAGS(%rsp) - jnz sysenter_fix_flags -sysenter_flags_fixed: - - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - CFI_REMEMBER_STATE - jnz sysenter_tracesys -sysenter_do_call: - /* 32bit syscall -> 64bit C ABI argument conversion */ - movl %edi,%r8d /* arg5 */ - movl %ebp,%r9d /* arg6 */ - xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ - movl %ebx,%edi /* arg1 */ - movl %edx,%edx /* arg3 (zero extension) */ -sysenter_dispatch: - cmpq $(IA32_NR_syscalls-1),%rax - ja 1f - call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) -1: - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysexit_audit -sysexit_from_sys_call: - /* - * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an - * NMI between STI and SYSEXIT has poorly specified behavior, - * and and NMI followed by an IRQ with usergs is fatal. So - * we just pretend we're using SYSEXIT but we really use - * SYSRETL instead. - * - * This code path is still called 'sysexit' because it pairs - * with 'sysenter' and it uses the SYSENTER calling convention. - */ - andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - movl RIP(%rsp),%ecx /* User %eip */ - CFI_REGISTER rip,rcx - RESTORE_RSI_RDI - xorl %edx,%edx /* avoid info leaks */ - xorq %r8,%r8 - xorq %r9,%r9 - xorq %r10,%r10 - movl EFLAGS(%rsp),%r11d /* User eflags */ - /*CFI_RESTORE rflags*/ - TRACE_IRQS_ON - - /* - * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, - * since it avoids a dicey window with interrupts enabled. - */ - movl RSP(%rsp),%esp - - /* - * USERGS_SYSRET32 does: - * gsbase = user's gs base - * eip = ecx - * rflags = r11 - * cs = __USER32_CS - * ss = __USER_DS - * - * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does: - * - * pop %ebp - * pop %edx - * pop %ecx - * - * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to - * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's - * address (already known to user code), and R12-R15 are - * callee-saved and therefore don't contain any interesting - * kernel data. - */ - USERGS_SYSRET32 - - CFI_RESTORE_STATE - -#ifdef CONFIG_AUDITSYSCALL - .macro auditsys_entry_common - movl %esi,%r8d /* 5th arg: 4th syscall arg */ - movl %ecx,%r9d /*swap with edx*/ - movl %edx,%ecx /* 4th arg: 3rd syscall arg */ - movl %r9d,%edx /* 3rd arg: 2nd syscall arg */ - movl %ebx,%esi /* 2nd arg: 1st syscall arg */ - movl %eax,%edi /* 1st arg: syscall number */ - call __audit_syscall_entry - movl ORIG_RAX(%rsp),%eax /* reload syscall number */ - movl %ebx,%edi /* reload 1st syscall arg */ - movl RCX(%rsp),%esi /* reload 2nd syscall arg */ - movl RDX(%rsp),%edx /* reload 3rd syscall arg */ - movl RSI(%rsp),%ecx /* reload 4th syscall arg */ - movl RDI(%rsp),%r8d /* reload 5th syscall arg */ - .endm - - .macro auditsys_exit exit - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_ret_from_sys_call - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - movl %eax,%esi /* second arg, syscall return value */ - cmpl $-MAX_ERRNO,%eax /* is it an error ? */ - jbe 1f - movslq %eax, %rsi /* if error sign extend to 64 bits */ -1: setbe %al /* 1 if error, 0 if not */ - movzbl %al,%edi /* zero-extend that into %edi */ - call __audit_syscall_exit - movq RAX(%rsp),%rax /* reload syscall return value */ - movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz \exit - CLEAR_RREGS - jmp int_with_check - .endm - -sysenter_auditsys: - auditsys_entry_common - movl %ebp,%r9d /* reload 6th syscall arg */ - jmp sysenter_dispatch - -sysexit_audit: - auditsys_exit sysexit_from_sys_call -#endif - -sysenter_fix_flags: - pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) - popfq_cfi - jmp sysenter_flags_fixed - -sysenter_tracesys: -#ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz sysenter_auditsys -#endif - SAVE_EXTRA_REGS - CLEAR_RREGS - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ - RESTORE_EXTRA_REGS - jmp sysenter_do_call - CFI_ENDPROC -ENDPROC(ia32_sysenter_target) - -/* - * 32bit SYSCALL instruction entry. - * - * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, - * then loads new ss, cs, and rip from previously programmed MSRs. - * rflags gets masked by a value from another MSR (so CLD and CLAC - * are not needed). SYSCALL does not save anything on the stack - * and does not change rsp. - * - * Note: rflags saving+masking-with-MSR happens only in Long mode - * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it). - * Don't get confused: rflags saving+masking depends on Long Mode Active bit - * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes - * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). - * - * Arguments: - * eax system call number - * ecx return address - * ebx arg1 - * ebp arg2 (note: not saved in the stack frame, should not be touched) - * edx arg3 - * esi arg4 - * edi arg5 - * esp user stack - * 0(%esp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. - */ -ENTRY(ia32_cstar_target) - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,0 - CFI_REGISTER rip,rcx - /*CFI_REGISTER rflags,r11*/ - - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - SWAPGS_UNSAFE_STACK - movl %esp,%r8d - CFI_REGISTER rsp,r8 - movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %eax,%eax - - /* Construct struct pt_regs on stack */ - pushq_cfi $__USER32_DS /* pt_regs->ss */ - pushq_cfi %r8 /* pt_regs->sp */ - CFI_REL_OFFSET rsp,0 - pushq_cfi %r11 /* pt_regs->flags */ - pushq_cfi $__USER32_CS /* pt_regs->cs */ - pushq_cfi %rcx /* pt_regs->ip */ - CFI_REL_OFFSET rip,0 - pushq_cfi_reg rax /* pt_regs->orig_ax */ - pushq_cfi_reg rdi /* pt_regs->di */ - pushq_cfi_reg rsi /* pt_regs->si */ - pushq_cfi_reg rdx /* pt_regs->dx */ - pushq_cfi_reg rbp /* pt_regs->cx */ - movl %ebp,%ecx - pushq_cfi $-ENOSYS /* pt_regs->ax */ - sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ - CFI_ADJUST_CFA_OFFSET 10*8 - - /* - * no need to do an access_ok check here because r8 has been - * 32bit zero extended - */ - ASM_STAC -1: movl (%r8),%r9d - _ASM_EXTABLE(1b,ia32_badarg) - ASM_CLAC - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - CFI_REMEMBER_STATE - jnz cstar_tracesys -cstar_do_call: - /* 32bit syscall -> 64bit C ABI argument conversion */ - movl %edi,%r8d /* arg5 */ - /* r9 already loaded */ /* arg6 */ - xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ - movl %ebx,%edi /* arg1 */ - movl %edx,%edx /* arg3 (zero extension) */ -cstar_dispatch: - cmpq $(IA32_NR_syscalls-1),%rax - ja 1f - call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) -1: - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysretl_audit -sysretl_from_sys_call: - andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - RESTORE_RSI_RDI_RDX - movl RIP(%rsp),%ecx - CFI_REGISTER rip,rcx - movl EFLAGS(%rsp),%r11d - /*CFI_REGISTER rflags,r11*/ - xorq %r10,%r10 - xorq %r9,%r9 - xorq %r8,%r8 - TRACE_IRQS_ON - movl RSP(%rsp),%esp - CFI_RESTORE rsp - /* - * 64bit->32bit SYSRET restores eip from ecx, - * eflags from r11 (but RF and VM bits are forced to 0), - * cs and ss are loaded from MSRs. - * (Note: 32bit->32bit SYSRET is different: since r11 - * does not exist, it merely sets eflags.IF=1). - * - * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss - * descriptor is not reinitialized. This means that we must - * avoid SYSRET with SS == NULL, which could happen if we schedule, - * exit the kernel, and re-enter using an interrupt vector. (All - * interrupt entries on x86_64 set SS to NULL.) We prevent that - * from happening by reloading SS in __switch_to. - */ - USERGS_SYSRET32 - -#ifdef CONFIG_AUDITSYSCALL -cstar_auditsys: - CFI_RESTORE_STATE - movl %r9d,R9(%rsp) /* register to be clobbered by call */ - auditsys_entry_common - movl R9(%rsp),%r9d /* reload 6th syscall arg */ - jmp cstar_dispatch - -sysretl_audit: - auditsys_exit sysretl_from_sys_call -#endif - -cstar_tracesys: -#ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz cstar_auditsys -#endif - xchgl %r9d,%ebp - SAVE_EXTRA_REGS - CLEAR_RREGS r9 - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */ - RESTORE_EXTRA_REGS - xchgl %ebp,%r9d - jmp cstar_do_call -END(ia32_cstar_target) - -ia32_badarg: - ASM_CLAC - movq $-EFAULT,%rax - jmp ia32_sysret - CFI_ENDPROC - -/* - * Emulated IA32 system calls via int 0x80. - * - * Arguments: - * eax system call number - * ebx arg1 - * ecx arg2 - * edx arg3 - * esi arg4 - * edi arg5 - * ebp arg6 (note: not saved in the stack frame, should not be touched) - * - * Notes: - * Uses the same stack frame as the x86-64 version. - * All registers except eax must be saved (but ptrace may violate that). - * Arguments are zero extended. For system calls that want sign extension and - * take long arguments a wrapper is needed. Most calls can just be called - * directly. - * Assumes it is only called from user space and entered with interrupts off. - */ - -ENTRY(ia32_syscall) - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,5*8 - /*CFI_REL_OFFSET ss,4*8 */ - CFI_REL_OFFSET rsp,3*8 - /*CFI_REL_OFFSET rflags,2*8 */ - /*CFI_REL_OFFSET cs,1*8 */ - CFI_REL_OFFSET rip,0*8 - - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - PARAVIRT_ADJUST_EXCEPTION_FRAME - SWAPGS - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %eax,%eax - - /* Construct struct pt_regs on stack (iret frame is already on stack) */ - pushq_cfi_reg rax /* pt_regs->orig_ax */ - pushq_cfi_reg rdi /* pt_regs->di */ - pushq_cfi_reg rsi /* pt_regs->si */ - pushq_cfi_reg rdx /* pt_regs->dx */ - pushq_cfi_reg rcx /* pt_regs->cx */ - pushq_cfi $-ENOSYS /* pt_regs->ax */ - cld - sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ - CFI_ADJUST_CFA_OFFSET 10*8 - - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_tracesys -ia32_do_call: - /* 32bit syscall -> 64bit C ABI argument conversion */ - movl %edi,%r8d /* arg5 */ - movl %ebp,%r9d /* arg6 */ - xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ - movl %ebx,%edi /* arg1 */ - movl %edx,%edx /* arg3 (zero extension) */ - cmpq $(IA32_NR_syscalls-1),%rax - ja 1f - call *ia32_sys_call_table(,%rax,8) # xxx: rip relative -ia32_sysret: - movq %rax,RAX(%rsp) -1: -ia32_ret_from_sys_call: - CLEAR_RREGS - jmp int_ret_from_sys_call - -ia32_tracesys: - SAVE_EXTRA_REGS - CLEAR_RREGS - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ - RESTORE_EXTRA_REGS - jmp ia32_do_call - CFI_ENDPROC -END(ia32_syscall) - - .macro PTREGSCALL label, func - ALIGN -GLOBAL(\label) - leaq \func(%rip),%rax - jmp ia32_ptregs_common - .endm - - CFI_STARTPROC32 - - PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn - PTREGSCALL stub32_sigreturn, sys32_sigreturn - PTREGSCALL stub32_fork, sys_fork - PTREGSCALL stub32_vfork, sys_vfork - - ALIGN -GLOBAL(stub32_clone) - leaq sys_clone(%rip),%rax - mov %r8, %rcx - jmp ia32_ptregs_common - - ALIGN -ia32_ptregs_common: - CFI_ENDPROC - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SIZEOF_PTREGS - CFI_REL_OFFSET rax,RAX - CFI_REL_OFFSET rcx,RCX - CFI_REL_OFFSET rdx,RDX - CFI_REL_OFFSET rsi,RSI - CFI_REL_OFFSET rdi,RDI - CFI_REL_OFFSET rip,RIP -/* CFI_REL_OFFSET cs,CS*/ -/* CFI_REL_OFFSET rflags,EFLAGS*/ - CFI_REL_OFFSET rsp,RSP -/* CFI_REL_OFFSET ss,SS*/ - SAVE_EXTRA_REGS 8 - call *%rax - RESTORE_EXTRA_REGS 8 - ret - CFI_ENDPROC -END(ia32_ptregs_common) diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 959e45b81fe2..e51a8f803f55 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -35,12 +35,12 @@ #define smp_mb() mb() #define smp_rmb() dma_rmb() #define smp_wmb() barrier() -#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) +#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) #else /* !SMP */ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() -#define set_mb(var, value) do { var = value; barrier(); } while (0) +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0) #endif /* SMP */ #define read_barrier_depends() do { } while (0) diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 47c8e32f621a..b6f7457d12e4 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -8,7 +8,7 @@ /* * The set_memory_* API can be used to change various attributes of a virtual * address range. The attributes include: - * Cachability : UnCached, WriteCombining, WriteBack + * Cachability : UnCached, WriteCombining, WriteThrough, WriteBack * Executability : eXeutable, NoteXecutable * Read/Write : ReadOnly, ReadWrite * Presence : NotPresent @@ -35,9 +35,11 @@ int _set_memory_uc(unsigned long addr, int numpages); int _set_memory_wc(unsigned long addr, int numpages); +int _set_memory_wt(unsigned long addr, int numpages); int _set_memory_wb(unsigned long addr, int numpages); int set_memory_uc(unsigned long addr, int numpages); int set_memory_wc(unsigned long addr, int numpages); +int set_memory_wt(unsigned long addr, int numpages); int set_memory_wb(unsigned long addr, int numpages); int set_memory_x(unsigned long addr, int numpages); int set_memory_nx(unsigned long addr, int numpages); @@ -48,10 +50,12 @@ int set_memory_4k(unsigned long addr, int numpages); int set_memory_array_uc(unsigned long *addr, int addrinarray); int set_memory_array_wc(unsigned long *addr, int addrinarray); +int set_memory_array_wt(unsigned long *addr, int addrinarray); int set_memory_array_wb(unsigned long *addr, int addrinarray); int set_pages_array_uc(struct page **pages, int addrinarray); int set_pages_array_wc(struct page **pages, int addrinarray); +int set_pages_array_wt(struct page **pages, int addrinarray); int set_pages_array_wb(struct page **pages, int addrinarray); /* diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 99c105d78b7e..ad19841eddfe 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -4,8 +4,6 @@ #include <linux/compiler.h> #include <asm/alternative.h> /* Provides LOCK_PREFIX */ -#define __HAVE_ARCH_CMPXCHG 1 - /* * Non-existant functions to indicate usage errors at link time * (or compile-time if the compiler implements __compiletime_error(). diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h deleted file mode 100644 index de1cdaf4d743..000000000000 --- a/arch/x86/include/asm/dwarf2.h +++ /dev/null @@ -1,170 +0,0 @@ -#ifndef _ASM_X86_DWARF2_H -#define _ASM_X86_DWARF2_H - -#ifndef __ASSEMBLY__ -#warning "asm/dwarf2.h should be only included in pure assembly files" -#endif - -/* - * Macros for dwarf2 CFI unwind table entries. - * See "as.info" for details on these pseudo ops. Unfortunately - * they are only supported in very new binutils, so define them - * away for older version. - */ - -#ifdef CONFIG_AS_CFI - -#define CFI_STARTPROC .cfi_startproc -#define CFI_ENDPROC .cfi_endproc -#define CFI_DEF_CFA .cfi_def_cfa -#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register -#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset -#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset -#define CFI_OFFSET .cfi_offset -#define CFI_REL_OFFSET .cfi_rel_offset -#define CFI_REGISTER .cfi_register -#define CFI_RESTORE .cfi_restore -#define CFI_REMEMBER_STATE .cfi_remember_state -#define CFI_RESTORE_STATE .cfi_restore_state -#define CFI_UNDEFINED .cfi_undefined -#define CFI_ESCAPE .cfi_escape - -#ifdef CONFIG_AS_CFI_SIGNAL_FRAME -#define CFI_SIGNAL_FRAME .cfi_signal_frame -#else -#define CFI_SIGNAL_FRAME -#endif - -#if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__) - /* - * Emit CFI data in .debug_frame sections, not .eh_frame sections. - * The latter we currently just discard since we don't do DWARF - * unwinding at runtime. So only the offline DWARF information is - * useful to anyone. Note we should not use this directive if this - * file is used in the vDSO assembly, or if vmlinux.lds.S gets - * changed so it doesn't discard .eh_frame. - */ - .cfi_sections .debug_frame -#endif - -#else - -/* - * Due to the structure of pre-exisiting code, don't use assembler line - * comment character # to ignore the arguments. Instead, use a dummy macro. - */ -.macro cfi_ignore a=0, b=0, c=0, d=0 -.endm - -#define CFI_STARTPROC cfi_ignore -#define CFI_ENDPROC cfi_ignore -#define CFI_DEF_CFA cfi_ignore -#define CFI_DEF_CFA_REGISTER cfi_ignore -#define CFI_DEF_CFA_OFFSET cfi_ignore -#define CFI_ADJUST_CFA_OFFSET cfi_ignore -#define CFI_OFFSET cfi_ignore -#define CFI_REL_OFFSET cfi_ignore -#define CFI_REGISTER cfi_ignore -#define CFI_RESTORE cfi_ignore -#define CFI_REMEMBER_STATE cfi_ignore -#define CFI_RESTORE_STATE cfi_ignore -#define CFI_UNDEFINED cfi_ignore -#define CFI_ESCAPE cfi_ignore -#define CFI_SIGNAL_FRAME cfi_ignore - -#endif - -/* - * An attempt to make CFI annotations more or less - * correct and shorter. It is implied that you know - * what you're doing if you use them. - */ -#ifdef __ASSEMBLY__ -#ifdef CONFIG_X86_64 - .macro pushq_cfi reg - pushq \reg - CFI_ADJUST_CFA_OFFSET 8 - .endm - - .macro pushq_cfi_reg reg - pushq %\reg - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET \reg, 0 - .endm - - .macro popq_cfi reg - popq \reg - CFI_ADJUST_CFA_OFFSET -8 - .endm - - .macro popq_cfi_reg reg - popq %\reg - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE \reg - .endm - - .macro pushfq_cfi - pushfq - CFI_ADJUST_CFA_OFFSET 8 - .endm - - .macro popfq_cfi - popfq - CFI_ADJUST_CFA_OFFSET -8 - .endm - - .macro movq_cfi reg offset=0 - movq %\reg, \offset(%rsp) - CFI_REL_OFFSET \reg, \offset - .endm - - .macro movq_cfi_restore offset reg - movq \offset(%rsp), %\reg - CFI_RESTORE \reg - .endm -#else /*!CONFIG_X86_64*/ - .macro pushl_cfi reg - pushl \reg - CFI_ADJUST_CFA_OFFSET 4 - .endm - - .macro pushl_cfi_reg reg - pushl %\reg - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET \reg, 0 - .endm - - .macro popl_cfi reg - popl \reg - CFI_ADJUST_CFA_OFFSET -4 - .endm - - .macro popl_cfi_reg reg - popl %\reg - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE \reg - .endm - - .macro pushfl_cfi - pushfl - CFI_ADJUST_CFA_OFFSET 4 - .endm - - .macro popfl_cfi - popfl - CFI_ADJUST_CFA_OFFSET -4 - .endm - - .macro movl_cfi reg offset=0 - movl %\reg, \offset(%esp) - CFI_REL_OFFSET \reg, \offset - .endm - - .macro movl_cfi_restore offset reg - movl \offset(%esp), %\reg - CFI_RESTORE \reg - .endm -#endif /*!CONFIG_X86_64*/ -#endif /*__ASSEMBLY__*/ - -#endif /* _ASM_X86_DWARF2_H */ diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 27ca0afcccd7..df002992d8fd 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -52,4 +52,7 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) #endif +#ifdef CONFIG_X86_MCE_AMD +BUILD_INTERRUPT(deferred_error_interrupt, DEFERRED_ERROR_VECTOR) +#endif #endif diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h index 3b629f47eb65..793179cf8e21 100644 --- a/arch/x86/include/asm/frame.h +++ b/arch/x86/include/asm/frame.h @@ -1,20 +1,17 @@ #ifdef __ASSEMBLY__ #include <asm/asm.h> -#include <asm/dwarf2.h> /* The annotation hides the frame from the unwinder and makes it look like a ordinary ebp save/restore. This avoids some special cases for frame pointer later */ #ifdef CONFIG_FRAME_POINTER .macro FRAME - __ASM_SIZE(push,_cfi) %__ASM_REG(bp) - CFI_REL_OFFSET __ASM_REG(bp), 0 + __ASM_SIZE(push,) %__ASM_REG(bp) __ASM_SIZE(mov) %__ASM_REG(sp), %__ASM_REG(bp) .endm .macro ENDFRAME - __ASM_SIZE(pop,_cfi) %__ASM_REG(bp) - CFI_RESTORE __ASM_REG(bp) + __ASM_SIZE(pop,) %__ASM_REG(bp) .endm #else .macro FRAME diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 986606539395..7178043b0e1d 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -34,6 +34,9 @@ typedef struct { #ifdef CONFIG_X86_MCE_THRESHOLD unsigned int irq_threshold_count; #endif +#ifdef CONFIG_X86_MCE_AMD + unsigned int irq_deferred_error_count; +#endif #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) unsigned int irq_hv_callback_count; #endif diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 10c80d4f8386..6615032e19c8 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -40,6 +40,7 @@ extern asmlinkage void reschedule_interrupt(void); extern asmlinkage void irq_move_cleanup_interrupt(void); extern asmlinkage void reboot_interrupt(void); extern asmlinkage void threshold_interrupt(void); +extern asmlinkage void deferred_error_interrupt(void); extern asmlinkage void call_function_interrupt(void); extern asmlinkage void call_function_single_interrupt(void); @@ -54,6 +55,7 @@ extern void trace_spurious_interrupt(void); extern void trace_thermal_interrupt(void); extern void trace_reschedule_interrupt(void); extern void trace_threshold_interrupt(void); +extern void trace_deferred_error_interrupt(void); extern void trace_call_function_interrupt(void); extern void trace_call_function_single_interrupt(void); #define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 34a5b93704d3..83ec9b1d77cc 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -35,11 +35,13 @@ */ #define ARCH_HAS_IOREMAP_WC +#define ARCH_HAS_IOREMAP_WT #include <linux/string.h> #include <linux/compiler.h> #include <asm/page.h> #include <asm/early_ioremap.h> +#include <asm/pgtable_types.h> #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -177,6 +179,7 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) * look at pci_iomap(). */ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); @@ -197,8 +200,6 @@ extern void set_iounmap_nonlazy(void); #include <asm-generic/iomap.h> -#include <linux/vmalloc.h> - /* * Convert a virtual cached pointer to an uncached pointer */ @@ -320,6 +321,7 @@ extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, enum page_cache_mode pcm); extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size); extern bool is_early_ioremap_ptep(pte_t *ptep); @@ -338,6 +340,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, #define IO_SPACE_LIMIT 0xffff #ifdef CONFIG_MTRR +extern int __must_check arch_phys_wc_index(int handle); +#define arch_phys_wc_index arch_phys_wc_index + extern int __must_check arch_phys_wc_add(unsigned long base, unsigned long size); extern void arch_phys_wc_del(int handle); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 0ed29ac13a9d..4c2d2eb2060a 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -83,22 +83,23 @@ */ #define X86_PLATFORM_IPI_VECTOR 0xf7 -/* Vector for KVM to deliver posted interrupt IPI */ -#ifdef CONFIG_HAVE_KVM -#define POSTED_INTR_VECTOR 0xf2 #define POSTED_INTR_WAKEUP_VECTOR 0xf1 -#endif - /* * IRQ work vector: */ #define IRQ_WORK_VECTOR 0xf6 #define UV_BAU_MESSAGE 0xf5 +#define DEFERRED_ERROR_VECTOR 0xf4 /* Vector on which hypervisor callbacks will be delivered */ #define HYPERVISOR_CALLBACK_VECTOR 0xf3 +/* Vector for KVM to deliver posted interrupt IPI */ +#ifdef CONFIG_HAVE_KVM +#define POSTED_INTR_VECTOR 0xf2 +#endif + /* * Local APIC timer IRQ vector is on a different priority level, * to work around the 'lost local interrupt if more than 2 IRQ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dea2e7e962e3..f4a555beef19 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -207,6 +207,7 @@ union kvm_mmu_page_role { unsigned nxe:1; unsigned cr0_wp:1; unsigned smep_andnot_wp:1; + unsigned smap_andnot_wp:1; }; }; @@ -400,6 +401,7 @@ struct kvm_vcpu_arch { struct kvm_mmu_memory_cache mmu_page_header_cache; struct fpu guest_fpu; + bool eager_fpu; u64 xcr0; u64 guest_supported_xcr0; u32 guest_xstate_size; @@ -743,6 +745,7 @@ struct kvm_x86_ops { void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + void (*fpu_activate)(struct kvm_vcpu *vcpu); void (*fpu_deactivate)(struct kvm_vcpu *vcpu); void (*tlb_flush)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 1f5a86d518db..982dfc3679ad 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -17,11 +17,16 @@ #define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT) #define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */ #define MCG_ELOG_P (1ULL<<26) /* Extended error log supported */ +#define MCG_LMCE_P (1ULL<<27) /* Local machine check supported */ /* MCG_STATUS register defines */ #define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ #define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */ #define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */ +#define MCG_STATUS_LMCES (1ULL<<3) /* LMCE signaled */ + +/* MCG_EXT_CTL register defines */ +#define MCG_EXT_CTL_LMCE_EN (1ULL<<0) /* Enable LMCE */ /* MCi_STATUS register defines */ #define MCI_STATUS_VAL (1ULL<<63) /* valid error */ @@ -104,6 +109,7 @@ struct mce_log { struct mca_config { bool dont_log_ce; bool cmci_disabled; + bool lmce_disabled; bool ignore_ce; bool disabled; bool ser; @@ -117,8 +123,19 @@ struct mca_config { }; struct mce_vendor_flags { - __u64 overflow_recov : 1, /* cpuid_ebx(80000007) */ - __reserved_0 : 63; + /* + * overflow recovery cpuid bit indicates that overflow + * conditions are not fatal + */ + __u64 overflow_recov : 1, + + /* + * SUCCOR stands for S/W UnCorrectable error COntainment + * and Recovery. It indicates support for data poisoning + * in HW and deferred error interrupts. + */ + succor : 1, + __reserved_0 : 62; }; extern struct mce_vendor_flags mce_flags; @@ -168,12 +185,16 @@ void cmci_clear(void); void cmci_reenable(void); void cmci_rediscover(void); void cmci_recheck(void); +void lmce_clear(void); +void lmce_enable(void); #else static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } static inline void cmci_clear(void) {} static inline void cmci_reenable(void) {} static inline void cmci_rediscover(void) {} static inline void cmci_recheck(void) {} +static inline void lmce_clear(void) {} +static inline void lmce_enable(void) {} #endif #ifdef CONFIG_X86_MCE_AMD @@ -223,6 +244,9 @@ void do_machine_check(struct pt_regs *, long); extern void (*mce_threshold_vector)(void); extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); +/* Deferred error interrupt handler */ +extern void (*deferred_error_int_vector)(void); + /* * Thermal handler */ diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index c469490db4a8..9ebc3d009373 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -56,6 +56,7 @@ #define MSR_IA32_MCG_CAP 0x00000179 #define MSR_IA32_MCG_STATUS 0x0000017a #define MSR_IA32_MCG_CTL 0x0000017b +#define MSR_IA32_MCG_EXT_CTL 0x000004d0 #define MSR_OFFCORE_RSP_0 0x000001a6 #define MSR_OFFCORE_RSP_1 0x000001a7 @@ -140,6 +141,7 @@ #define MSR_CORE_C3_RESIDENCY 0x000003fc #define MSR_CORE_C6_RESIDENCY 0x000003fd #define MSR_CORE_C7_RESIDENCY 0x000003fe +#define MSR_KNL_CORE_C6_RESIDENCY 0x000003ff #define MSR_PKG_C2_RESIDENCY 0x0000060d #define MSR_PKG_C8_RESIDENCY 0x00000630 #define MSR_PKG_C9_RESIDENCY 0x00000631 @@ -379,6 +381,7 @@ #define FEATURE_CONTROL_LOCKED (1<<0) #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) #define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2) +#define FEATURE_CONTROL_LMCE (1<<20) #define MSR_IA32_APICBASE 0x0000001b #define MSR_IA32_APICBASE_BSP (1<<8) diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index de36f22eb0b9..e6a707eb5081 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -1,13 +1,14 @@ #ifndef _ASM_X86_MSR_H #define _ASM_X86_MSR_H -#include <uapi/asm/msr.h> +#include "msr-index.h" #ifndef __ASSEMBLY__ #include <asm/asm.h> #include <asm/errno.h> #include <asm/cpumask.h> +#include <uapi/asm/msr.h> struct msr { union { @@ -205,8 +206,13 @@ do { \ #endif /* !CONFIG_PARAVIRT */ -#define wrmsrl_safe(msr, val) wrmsr_safe((msr), (u32)(val), \ - (u32)((val) >> 32)) +/* + * 64-bit version of wrmsr_safe(): + */ +static inline int wrmsrl_safe(u32 msr, u64 val) +{ + return wrmsr_safe(msr, (u32)val, (u32)(val >> 32)); +} #define write_tsc(low, high) wrmsr(MSR_IA32_TSC, (low), (high)) diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index f768f6298419..b94f6f64e23d 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -31,7 +31,7 @@ * arch_phys_wc_add and arch_phys_wc_del. */ # ifdef CONFIG_MTRR -extern u8 mtrr_type_lookup(u64 addr, u64 end); +extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform); extern void mtrr_save_fixed_ranges(void *); extern void mtrr_save_state(void); extern int mtrr_add(unsigned long base, unsigned long size, @@ -48,14 +48,13 @@ extern void mtrr_aps_init(void); extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); -extern int phys_wc_to_mtrr_index(int handle); # else -static inline u8 mtrr_type_lookup(u64 addr, u64 end) +static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform) { /* * Return no-MTRRs: */ - return 0xff; + return MTRR_TYPE_INVALID; } #define mtrr_save_fixed_ranges(arg) do {} while (0) #define mtrr_save_state() do {} while (0) @@ -84,10 +83,6 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn) static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { } -static inline int phys_wc_to_mtrr_index(int handle) -{ - return -1; -} #define mtrr_ap_init() do {} while (0) #define mtrr_bp_init() do {} while (0) @@ -127,4 +122,8 @@ struct mtrr_gentry32 { _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry32) #endif /* CONFIG_COMPAT */ +/* Bit fields for enabled in struct mtrr_state_type */ +#define MTRR_STATE_MTRR_FIXED_ENABLED 0x01 +#define MTRR_STATE_MTRR_ENABLED 0x02 + #endif /* _ASM_X86_MTRR_H */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 8957810ad7d1..d143bfad45d7 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -712,6 +712,31 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) +#ifdef CONFIG_QUEUED_SPINLOCKS + +static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock, + u32 val) +{ + PVOP_VCALL2(pv_lock_ops.queued_spin_lock_slowpath, lock, val); +} + +static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock) +{ + PVOP_VCALLEE1(pv_lock_ops.queued_spin_unlock, lock); +} + +static __always_inline void pv_wait(u8 *ptr, u8 val) +{ + PVOP_VCALL2(pv_lock_ops.wait, ptr, val); +} + +static __always_inline void pv_kick(int cpu) +{ + PVOP_VCALL1(pv_lock_ops.kick, cpu); +} + +#else /* !CONFIG_QUEUED_SPINLOCKS */ + static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, __ticket_t ticket) { @@ -724,7 +749,9 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock, PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket); } -#endif +#endif /* CONFIG_QUEUED_SPINLOCKS */ + +#endif /* SMP && PARAVIRT_SPINLOCKS */ #ifdef CONFIG_X86_32 #define PV_SAVE_REGS "pushl %ecx; pushl %edx;" diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 344c646e7f06..a6b8f9fadb06 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -334,9 +334,19 @@ struct arch_spinlock; typedef u16 __ticket_t; #endif +struct qspinlock; + struct pv_lock_ops { +#ifdef CONFIG_QUEUED_SPINLOCKS + void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val); + struct paravirt_callee_save queued_spin_unlock; + + void (*wait)(u8 *ptr, u8 val); + void (*kick)(int cpu); +#else /* !CONFIG_QUEUED_SPINLOCKS */ struct paravirt_callee_save lock_spinning; void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket); +#endif /* !CONFIG_QUEUED_SPINLOCKS */ }; /* This contains all the paravirt structures: we get a convenient diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 91bc4ba95f91..ca6c228d5e62 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -4,14 +4,9 @@ #include <linux/types.h> #include <asm/pgtable_types.h> -#ifdef CONFIG_X86_PAT -extern int pat_enabled; -#else -static const int pat_enabled; -#endif - +bool pat_enabled(void); extern void pat_init(void); -void pat_init_cache_modes(void); +void pat_init_cache_modes(u64); extern int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index fe57e7a98839..2562e303405b 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -398,11 +398,17 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, * requested memtype: * - request is uncached, return cannot be write-back * - request is write-combine, return cannot be write-back + * - request is write-through, return cannot be write-back + * - request is write-through, return cannot be write-combine */ if ((pcm == _PAGE_CACHE_MODE_UC_MINUS && new_pcm == _PAGE_CACHE_MODE_WB) || (pcm == _PAGE_CACHE_MODE_WC && - new_pcm == _PAGE_CACHE_MODE_WB)) { + new_pcm == _PAGE_CACHE_MODE_WB) || + (pcm == _PAGE_CACHE_MODE_WT && + new_pcm == _PAGE_CACHE_MODE_WB) || + (pcm == _PAGE_CACHE_MODE_WT && + new_pcm == _PAGE_CACHE_MODE_WC)) { return 0; } diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 78f0c8cbe316..13f310bfc09a 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -367,6 +367,9 @@ extern int nx_enabled; #define pgprot_writecombine pgprot_writecombine extern pgprot_t pgprot_writecombine(pgprot_t prot); +#define pgprot_writethrough pgprot_writethrough +extern pgprot_t pgprot_writethrough(pgprot_t prot); + /* Indicate that x86 has its own track and untrack pfn vma functions */ #define __HAVE_PFNMAP_TRACKING diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index a90f8972dad5..a4a77286cb1d 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -5,12 +5,14 @@ /* misc architecture specific prototypes */ -void system_call(void); void syscall_init(void); -void ia32_syscall(void); -void ia32_cstar_target(void); -void ia32_sysenter_target(void); +void entry_SYSCALL_64(void); +void entry_SYSCALL_compat(void); +void entry_INT80_32(void); +void entry_INT80_compat(void); +void entry_SYSENTER_32(void); +void entry_SYSENTER_compat(void); void x86_configure_nx(void); void x86_report_nx(void); diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h new file mode 100644 index 000000000000..9d51fae1cba3 --- /dev/null +++ b/arch/x86/include/asm/qspinlock.h @@ -0,0 +1,57 @@ +#ifndef _ASM_X86_QSPINLOCK_H +#define _ASM_X86_QSPINLOCK_H + +#include <asm/cpufeature.h> +#include <asm-generic/qspinlock_types.h> +#include <asm/paravirt.h> + +#define queued_spin_unlock queued_spin_unlock +/** + * queued_spin_unlock - release a queued spinlock + * @lock : Pointer to queued spinlock structure + * + * A smp_store_release() on the least-significant byte. + */ +static inline void native_queued_spin_unlock(struct qspinlock *lock) +{ + smp_store_release((u8 *)lock, 0); +} + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +extern void __pv_init_lock_hash(void); +extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock); + +static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) +{ + pv_queued_spin_lock_slowpath(lock, val); +} + +static inline void queued_spin_unlock(struct qspinlock *lock) +{ + pv_queued_spin_unlock(lock); +} +#else +static inline void queued_spin_unlock(struct qspinlock *lock) +{ + native_queued_spin_unlock(lock); +} +#endif + +#define virt_queued_spin_lock virt_queued_spin_lock + +static inline bool virt_queued_spin_lock(struct qspinlock *lock) +{ + if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) + return false; + + while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0) + cpu_relax(); + + return true; +} + +#include <asm-generic/qspinlock.h> + +#endif /* _ASM_X86_QSPINLOCK_H */ diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h new file mode 100644 index 000000000000..b002e711ba88 --- /dev/null +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -0,0 +1,6 @@ +#ifndef __ASM_QSPINLOCK_PARAVIRT_H +#define __ASM_QSPINLOCK_PARAVIRT_H + +PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock); + +#endif diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 5a9856eb12ba..7d5a1929d76b 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -231,11 +231,21 @@ #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8) #ifdef __KERNEL__ + +/* + * early_idt_handler_array is an array of entry points referenced in the + * early IDT. For simplicity, it's a real array with one entry point + * every nine bytes. That leaves room for an optional 'push $0' if the + * vector has no error code (two bytes), a 'push $vector_number' (two + * bytes), and a jump to the common entry code (up to five bytes). + */ +#define EARLY_IDT_HANDLER_SIZE 9 + #ifndef __ASSEMBLY__ -extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5]; +extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; #ifdef CONFIG_TRACING -# define trace_early_idt_handlers early_idt_handlers +# define trace_early_idt_handler_array early_idt_handler_array #endif /* diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index aeb4666e0c0a..2270e41b32fd 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -215,6 +215,44 @@ static inline void clwb(volatile void *__p) : [pax] "a" (p)); } +/** + * pcommit_sfence() - persistent commit and fence + * + * The PCOMMIT instruction ensures that data that has been flushed from the + * processor's cache hierarchy with CLWB, CLFLUSHOPT or CLFLUSH is accepted to + * memory and is durable on the DIMM. The primary use case for this is + * persistent memory. + * + * This function shows how to properly use CLWB/CLFLUSHOPT/CLFLUSH and PCOMMIT + * with appropriate fencing. + * + * Example: + * void flush_and_commit_buffer(void *vaddr, unsigned int size) + * { + * unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1; + * void *vend = vaddr + size; + * void *p; + * + * for (p = (void *)((unsigned long)vaddr & ~clflush_mask); + * p < vend; p += boot_cpu_data.x86_clflush_size) + * clwb(p); + * + * // SFENCE to order CLWB/CLFLUSHOPT/CLFLUSH cache flushes + * // MFENCE via mb() also works + * wmb(); + * + * // PCOMMIT and the required SFENCE for ordering + * pcommit_sfence(); + * } + * + * After this function completes the data pointed to by 'vaddr' has been + * accepted to memory and will be durable if the 'vaddr' points to persistent + * memory. + * + * PCOMMIT must always be ordered by an MFENCE or SFENCE, so to help simplify + * things we include both the PCOMMIT and the required SFENCE in the + * alternatives generated by pcommit_sfence(). + */ static inline void pcommit_sfence(void) { alternative(ASM_NOP7, diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 64b611782ef0..be0a05913b91 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -42,6 +42,10 @@ extern struct static_key paravirt_ticketlocks_enabled; static __always_inline bool static_key_false(struct static_key *key); +#ifdef CONFIG_QUEUED_SPINLOCKS +#include <asm/qspinlock.h> +#else + #ifdef CONFIG_PARAVIRT_SPINLOCKS static inline void __ticket_enter_slowpath(arch_spinlock_t *lock) @@ -196,6 +200,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) cpu_relax(); } } +#endif /* CONFIG_QUEUED_SPINLOCKS */ /* * Read-write spinlocks, allowing multiple readers diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index 5f9d7572d82b..65c3e37f879a 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -23,6 +23,9 @@ typedef u32 __ticketpair_t; #define TICKET_SHIFT (sizeof(__ticket_t) * 8) +#ifdef CONFIG_QUEUED_SPINLOCKS +#include <asm-generic/qspinlock_types.h> +#else typedef struct arch_spinlock { union { __ticketpair_t head_tail; @@ -33,6 +36,7 @@ typedef struct arch_spinlock { } arch_spinlock_t; #define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } +#endif /* CONFIG_QUEUED_SPINLOCKS */ #include <asm-generic/qrwlock_types.h> diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 0e8f04f2c26f..8d717faeed22 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -26,7 +26,7 @@ #define _ASM_X86_TOPOLOGY_H #ifdef CONFIG_X86_32 -# ifdef CONFIG_X86_HT +# ifdef CONFIG_SMP # define ENABLE_TOPO_DEFINES # endif #else diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 4cab890007a7..38a09a13a9bc 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -101,6 +101,12 @@ DEFINE_IRQ_VECTOR_EVENT(call_function_single); DEFINE_IRQ_VECTOR_EVENT(threshold_apic); /* + * deferred_error_apic - called when entering/exiting a deferred apic interrupt + * vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic); + +/* * thermal_apic - called when entering/exiting a thermal apic interrupt * vector handler */ diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 4e49d7dff78e..c5380bea2a36 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -108,7 +108,8 @@ extern int panic_on_unrecovered_nmi; void math_emulate(struct math_emu_info *); #ifndef CONFIG_X86_32 asmlinkage void smp_thermal_interrupt(void); -asmlinkage void mce_threshold_interrupt(void); +asmlinkage void smp_threshold_interrupt(void); +asmlinkage void smp_deferred_error_interrupt(void); #endif extern enum ctx_state ist_enter(struct pt_regs *regs); diff --git a/arch/x86/include/uapi/asm/msr.h b/arch/x86/include/uapi/asm/msr.h index 155e51048fa4..c41f4fe25483 100644 --- a/arch/x86/include/uapi/asm/msr.h +++ b/arch/x86/include/uapi/asm/msr.h @@ -1,8 +1,6 @@ #ifndef _UAPI_ASM_X86_MSR_H #define _UAPI_ASM_X86_MSR_H -#include <asm/msr-index.h> - #ifndef __ASSEMBLY__ #include <linux/types.h> diff --git a/arch/x86/include/uapi/asm/mtrr.h b/arch/x86/include/uapi/asm/mtrr.h index d0acb658c8f4..7528dcf59691 100644 --- a/arch/x86/include/uapi/asm/mtrr.h +++ b/arch/x86/include/uapi/asm/mtrr.h @@ -103,7 +103,7 @@ struct mtrr_state_type { #define MTRRIOC_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry) #define MTRRIOC_KILL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry) -/* These are the region types */ +/* MTRR memory types, which are defined in SDM */ #define MTRR_TYPE_UNCACHABLE 0 #define MTRR_TYPE_WRCOMB 1 /*#define MTRR_TYPE_ 2*/ @@ -113,5 +113,11 @@ struct mtrr_state_type { #define MTRR_TYPE_WRBACK 6 #define MTRR_NUM_TYPES 7 +/* + * Invalid MTRR memory type. mtrr_type_lookup() returns this value when + * MTRRs are disabled. Note, this value is allocated from the reserved + * values (0x7-0xff) of the MTRR memory types. + */ +#define MTRR_TYPE_INVALID 0xff #endif /* _UAPI_ASM_X86_MTRR_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9bcd0b56ca17..01663ee5f1b7 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -22,7 +22,7 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n CFLAGS_irq.o := -I$(src)/../include/asm/trace -obj-y := process_$(BITS).o signal.o entry_$(BITS).o +obj-y := process_$(BITS).o signal.o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o @@ -31,9 +31,6 @@ obj-y += probe_roms.o obj-$(CONFIG_X86_32) += i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += mcount_64.o -obj-y += syscall_$(BITS).o vsyscall_gtod.o -obj-$(CONFIG_IA32_EMULATION) += syscall_32.o -obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index dcaab87da629..d8f42f902a0f 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -66,7 +66,7 @@ int main(void) DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); DEFINE(NR_syscalls, sizeof(syscalls_64)); - DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1); + DEFINE(__NR_syscall_compat_max, sizeof(syscalls_ia32) - 1); DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32)); return 0; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e4cf63301ff4..eb4f01269b5d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -288,7 +288,7 @@ static int nearby_node(int apicid) * Assumption: Number of cores in each internal node is the same. * (2) AMD processors supporting compute units */ -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP static void amd_get_topology(struct cpuinfo_x86 *c) { u32 nodes, cores_per_cu = 1; @@ -341,7 +341,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c) */ static void amd_detect_cmp(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP unsigned bits; int cpu = smp_processor_id(); @@ -420,7 +420,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c) static void early_init_amd_mc(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP unsigned bits, ecx; /* Multi core CPU? */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6bec0b55863e..cc7f753e571d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -508,7 +508,7 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c) void detect_ht(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP u32 eax, ebx, ecx, edx; int index_msb, core_bits; static bool printed; @@ -844,7 +844,7 @@ static void generic_identify(struct cpuinfo_x86 *c) if (c->cpuid_level >= 0x00000001) { c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; #ifdef CONFIG_X86_32 -# ifdef CONFIG_X86_HT +# ifdef CONFIG_SMP c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); # else c->apicid = c->initial_apicid; @@ -1026,7 +1026,7 @@ void enable_sep_cpu(void) (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack), 0); - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0); + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); out: put_cpu(); @@ -1204,10 +1204,10 @@ void syscall_init(void) * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. */ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); - wrmsrl(MSR_LSTAR, system_call); + wrmsrl(MSR_LSTAR, entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION - wrmsrl(MSR_CSTAR, ia32_cstar_target); + wrmsrl(MSR_CSTAR, entry_SYSCALL_compat); /* * This only works on Intel CPUs. * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. @@ -1216,7 +1216,7 @@ void syscall_init(void) */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else wrmsrl(MSR_CSTAR, ignore_sysret); wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index edcb0e28c336..be4febc58b94 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -654,7 +654,7 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP unsigned int cpu = c->cpu_index; #endif @@ -773,19 +773,19 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) if (new_l2) { l2 = new_l2; -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP per_cpu(cpu_llc_id, cpu) = l2_id; #endif } if (new_l3) { l3 = new_l3; -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP per_cpu(cpu_llc_id, cpu) = l3_id; #endif } -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP /* * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in * turns means that the only possibility is SMT (as indicated in diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e535533d5ab8..5b974c97e31e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -708,6 +708,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, struct pt_regs *regs) { int i, ret = 0; + char *tmp; for (i = 0; i < mca_cfg.banks; i++) { m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); @@ -716,9 +717,11 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, if (quirk_no_way_out) quirk_no_way_out(i, m, regs); } - if (mce_severity(m, mca_cfg.tolerant, msg, true) >= - MCE_PANIC_SEVERITY) + + if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { + *msg = tmp; ret = 1; + } } return ret; } @@ -1047,6 +1050,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) char *msg = "Unknown"; u64 recover_paddr = ~0ull; int flags = MF_ACTION_REQUIRED; + int lmce = 0; prev_state = ist_enter(regs); @@ -1074,11 +1078,20 @@ void do_machine_check(struct pt_regs *regs, long error_code) kill_it = 1; /* - * Go through all the banks in exclusion of the other CPUs. - * This way we don't report duplicated events on shared banks - * because the first one to see it will clear it. + * Check if this MCE is signaled to only this logical processor */ - order = mce_start(&no_way_out); + if (m.mcgstatus & MCG_STATUS_LMCES) + lmce = 1; + else { + /* + * Go through all the banks in exclusion of the other CPUs. + * This way we don't report duplicated events on shared banks + * because the first one to see it will clear it. + * If this is a Local MCE, then no need to perform rendezvous. + */ + order = mce_start(&no_way_out); + } + for (i = 0; i < cfg->banks; i++) { __clear_bit(i, toclear); if (!test_bit(i, valid_banks)) @@ -1155,8 +1168,18 @@ void do_machine_check(struct pt_regs *regs, long error_code) * Do most of the synchronization with other CPUs. * When there's any problem use only local no_way_out state. */ - if (mce_end(order) < 0) - no_way_out = worst >= MCE_PANIC_SEVERITY; + if (!lmce) { + if (mce_end(order) < 0) + no_way_out = worst >= MCE_PANIC_SEVERITY; + } else { + /* + * Local MCE skipped calling mce_reign() + * If we found a fatal error, we need to panic here. + */ + if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) + mce_panic("Machine check from unknown source", + NULL, NULL); + } /* * At insane "tolerant" levels we take no action. Otherwise @@ -1637,10 +1660,16 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) mce_intel_feature_init(c); mce_adjust_timer = cmci_intel_adjust_timer; break; - case X86_VENDOR_AMD: + + case X86_VENDOR_AMD: { + u32 ebx = cpuid_ebx(0x80000007); + mce_amd_feature_init(c); - mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1; + mce_flags.overflow_recov = !!(ebx & BIT(0)); + mce_flags.succor = !!(ebx & BIT(1)); break; + } + default: break; } @@ -1976,6 +2005,7 @@ void mce_disable_bank(int bank) /* * mce=off Disables machine check * mce=no_cmci Disables CMCI + * mce=no_lmce Disables LMCE * mce=dont_log_ce Clears corrected events silently, no log created for CEs. * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) @@ -1999,6 +2029,8 @@ static int __init mcheck_enable(char *str) cfg->disabled = true; else if (!strcmp(str, "no_cmci")) cfg->cmci_disabled = true; + else if (!strcmp(str, "no_lmce")) + cfg->lmce_disabled = true; else if (!strcmp(str, "dont_log_ce")) cfg->dont_log_ce = true; else if (!strcmp(str, "ignore_ce")) @@ -2008,11 +2040,8 @@ static int __init mcheck_enable(char *str) else if (!strcmp(str, "bios_cmci_threshold")) cfg->bios_cmci_threshold = true; else if (isdigit(str[0])) { - get_option(&str, &(cfg->tolerant)); - if (*str == ',') { - ++str; + if (get_option(&str, &cfg->tolerant) == 2) get_option(&str, &(cfg->monarch_timeout)); - } } else { pr_info("mce argument %s ignored. Please use /sys\n", str); return 0; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 55ad9b37cae8..e99b15077e94 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -1,19 +1,13 @@ /* - * (c) 2005-2012 Advanced Micro Devices, Inc. + * (c) 2005-2015 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. See "COPYING" or * http://www.gnu.org/licenses/gpl.html * * Written by Jacob Shin - AMD, Inc. - * * Maintained by: Borislav Petkov <bp@alien8.de> * - * April 2006 - * - added support for AMD Family 0x10 processors - * May 2012 - * - major scrubbing - * - * All MC4_MISCi registers are shared between multi-cores + * All MC4_MISCi registers are shared between cores on a node. */ #include <linux/interrupt.h> #include <linux/notifier.h> @@ -32,6 +26,7 @@ #include <asm/idle.h> #include <asm/mce.h> #include <asm/msr.h> +#include <asm/trace/irq_vectors.h> #define NR_BLOCKS 9 #define THRESHOLD_MAX 0xFFF @@ -47,6 +42,13 @@ #define MASK_BLKPTR_LO 0xFF000000 #define MCG_XBLK_ADDR 0xC0000400 +/* Deferred error settings */ +#define MSR_CU_DEF_ERR 0xC0000410 +#define MASK_DEF_LVTOFF 0x000000F0 +#define MASK_DEF_INT_TYPE 0x00000006 +#define DEF_LVT_OFF 0x2 +#define DEF_INT_TYPE_APIC 0x2 + static const char * const th_names[] = { "load_store", "insn_fetch", @@ -60,6 +62,13 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ static void amd_threshold_interrupt(void); +static void amd_deferred_error_interrupt(void); + +static void default_deferred_error_interrupt(void) +{ + pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR); +} +void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; /* * CPU Initialization @@ -196,7 +205,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset) threshold_restart_bank(&tr); }; -static int setup_APIC_mce(int reserved, int new) +static int setup_APIC_mce_threshold(int reserved, int new) { if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR, APIC_EILVT_MSG_FIX, 0)) @@ -205,6 +214,39 @@ static int setup_APIC_mce(int reserved, int new) return reserved; } +static int setup_APIC_deferred_error(int reserved, int new) +{ + if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR, + APIC_EILVT_MSG_FIX, 0)) + return new; + + return reserved; +} + +static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) +{ + u32 low = 0, high = 0; + int def_offset = -1, def_new; + + if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high)) + return; + + def_new = (low & MASK_DEF_LVTOFF) >> 4; + if (!(low & MASK_DEF_LVTOFF)) { + pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n"); + def_new = DEF_LVT_OFF; + low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4); + } + + def_offset = setup_APIC_deferred_error(def_offset, def_new); + if ((def_offset == def_new) && + (deferred_error_int_vector != amd_deferred_error_interrupt)) + deferred_error_int_vector = amd_deferred_error_interrupt; + + low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC; + wrmsr(MSR_CU_DEF_ERR, low, high); +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { @@ -252,7 +294,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) b.interrupt_enable = 1; new = (high & MASK_LVTOFF_HI) >> 20; - offset = setup_APIC_mce(offset, new); + offset = setup_APIC_mce_threshold(offset, new); if ((offset == new) && (mce_threshold_vector != amd_threshold_interrupt)) @@ -262,6 +304,73 @@ init: mce_threshold_block_init(&b, offset); } } + + if (mce_flags.succor) + deferred_error_interrupt_enable(c); +} + +static void __log_error(unsigned int bank, bool threshold_err, u64 misc) +{ + struct mce m; + u64 status; + + rdmsrl(MSR_IA32_MCx_STATUS(bank), status); + if (!(status & MCI_STATUS_VAL)) + return; + + mce_setup(&m); + + m.status = status; + m.bank = bank; + + if (threshold_err) + m.misc = misc; + + if (m.status & MCI_STATUS_ADDRV) + rdmsrl(MSR_IA32_MCx_ADDR(bank), m.addr); + + mce_log(&m); + wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); +} + +static inline void __smp_deferred_error_interrupt(void) +{ + inc_irq_stat(irq_deferred_error_count); + deferred_error_int_vector(); +} + +asmlinkage __visible void smp_deferred_error_interrupt(void) +{ + entering_irq(); + __smp_deferred_error_interrupt(); + exiting_ack_irq(); +} + +asmlinkage __visible void smp_trace_deferred_error_interrupt(void) +{ + entering_irq(); + trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); + __smp_deferred_error_interrupt(); + trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); + exiting_ack_irq(); +} + +/* APIC interrupt handler for deferred errors */ +static void amd_deferred_error_interrupt(void) +{ + u64 status; + unsigned int bank; + + for (bank = 0; bank < mca_cfg.banks; ++bank) { + rdmsrl(MSR_IA32_MCx_STATUS(bank), status); + + if (!(status & MCI_STATUS_VAL) || + !(status & MCI_STATUS_DEFERRED)) + continue; + + __log_error(bank, false, 0); + break; + } } /* @@ -273,12 +382,12 @@ init: * the interrupt goes off when error_count reaches threshold_limit. * the handler will simply log mcelog w/ software defined bank number. */ + static void amd_threshold_interrupt(void) { u32 low = 0, high = 0, address = 0; int cpu = smp_processor_id(); unsigned int bank, block; - struct mce m; /* assume first bank caused it */ for (bank = 0; bank < mca_cfg.banks; ++bank) { @@ -321,15 +430,7 @@ static void amd_threshold_interrupt(void) return; log: - mce_setup(&m); - rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status); - if (!(m.status & MCI_STATUS_VAL)) - return; - m.misc = ((u64)high << 32) | low; - m.bank = bank; - mce_log(&m); - - wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); + __log_error(bank, true, ((u64)high << 32) | low); } /* diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index b4a41cf030ed..844f56c5616d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -91,6 +91,36 @@ static int cmci_supported(int *banks) return !!(cap & MCG_CMCI_P); } +static bool lmce_supported(void) +{ + u64 tmp; + + if (mca_cfg.lmce_disabled) + return false; + + rdmsrl(MSR_IA32_MCG_CAP, tmp); + + /* + * LMCE depends on recovery support in the processor. Hence both + * MCG_SER_P and MCG_LMCE_P should be present in MCG_CAP. + */ + if ((tmp & (MCG_SER_P | MCG_LMCE_P)) != + (MCG_SER_P | MCG_LMCE_P)) + return false; + + /* + * BIOS should indicate support for LMCE by setting bit 20 in + * IA32_FEATURE_CONTROL without which touching MCG_EXT_CTL will + * generate a #GP fault. + */ + rdmsrl(MSR_IA32_FEATURE_CONTROL, tmp); + if ((tmp & (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) == + (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) + return true; + + return false; +} + bool mce_intel_cmci_poll(void) { if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) @@ -405,8 +435,22 @@ static void intel_init_cmci(void) cmci_recheck(); } +void intel_init_lmce(void) +{ + u64 val; + + if (!lmce_supported()) + return; + + rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + + if (!(val & MCG_EXT_CTL_LMCE_EN)) + wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); intel_init_cmci(); + intel_init_lmce(); } diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 5f90b85ff22e..70d7c93f4550 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -98,7 +98,8 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, continue; base = range_state[i].base_pfn; if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed && - (mtrr_state.enabled & 1)) { + (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { /* Var MTRR contains UC entry below 1M? Skip it: */ printk(BIOS_BUG_MSG, i); if (base + size <= (1<<(20-PAGE_SHIFT))) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 7d74f7b3c6ba..3b533cf37c74 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -102,59 +102,76 @@ static int check_type_overlap(u8 *prev, u8 *curr) return 0; } -/* - * Error/Semi-error returns: - * 0xFF - when MTRR is not enabled - * *repeat == 1 implies [start:end] spanned across MTRR range and type returned - * corresponds only to [start:*partial_end]. - * Caller has to lookup again for [*partial_end:end]. +/** + * mtrr_type_lookup_fixed - look up memory type in MTRR fixed entries + * + * Return the MTRR fixed memory type of 'start'. + * + * MTRR fixed entries are divided into the following ways: + * 0x00000 - 0x7FFFF : This range is divided into eight 64KB sub-ranges + * 0x80000 - 0xBFFFF : This range is divided into sixteen 16KB sub-ranges + * 0xC0000 - 0xFFFFF : This range is divided into sixty-four 4KB sub-ranges + * + * Return Values: + * MTRR_TYPE_(type) - Matched memory type + * MTRR_TYPE_INVALID - Unmatched + */ +static u8 mtrr_type_lookup_fixed(u64 start, u64 end) +{ + int idx; + + if (start >= 0x100000) + return MTRR_TYPE_INVALID; + + /* 0x0 - 0x7FFFF */ + if (start < 0x80000) { + idx = 0; + idx += (start >> 16); + return mtrr_state.fixed_ranges[idx]; + /* 0x80000 - 0xBFFFF */ + } else if (start < 0xC0000) { + idx = 1 * 8; + idx += ((start - 0x80000) >> 14); + return mtrr_state.fixed_ranges[idx]; + } + + /* 0xC0000 - 0xFFFFF */ + idx = 3 * 8; + idx += ((start - 0xC0000) >> 12); + return mtrr_state.fixed_ranges[idx]; +} + +/** + * mtrr_type_lookup_variable - look up memory type in MTRR variable entries + * + * Return Value: + * MTRR_TYPE_(type) - Matched memory type or default memory type (unmatched) + * + * Output Arguments: + * repeat - Set to 1 when [start:end] spanned across MTRR range and type + * returned corresponds only to [start:*partial_end]. Caller has + * to lookup again for [*partial_end:end]. + * + * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the + * region is fully covered by a single MTRR entry or the default + * type. */ -static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) +static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, + int *repeat, u8 *uniform) { int i; u64 base, mask; u8 prev_match, curr_match; *repeat = 0; - if (!mtrr_state_set) - return 0xFF; - - if (!mtrr_state.enabled) - return 0xFF; + *uniform = 1; - /* Make end inclusive end, instead of exclusive */ + /* Make end inclusive instead of exclusive */ end--; - /* Look in fixed ranges. Just return the type as per start */ - if (mtrr_state.have_fixed && (start < 0x100000)) { - int idx; - - if (start < 0x80000) { - idx = 0; - idx += (start >> 16); - return mtrr_state.fixed_ranges[idx]; - } else if (start < 0xC0000) { - idx = 1 * 8; - idx += ((start - 0x80000) >> 14); - return mtrr_state.fixed_ranges[idx]; - } else if (start < 0x1000000) { - idx = 3 * 8; - idx += ((start - 0xC0000) >> 12); - return mtrr_state.fixed_ranges[idx]; - } - } - - /* - * Look in variable ranges - * Look of multiple ranges matching this address and pick type - * as per MTRR precedence - */ - if (!(mtrr_state.enabled & 2)) - return mtrr_state.def_type; - - prev_match = 0xFF; + prev_match = MTRR_TYPE_INVALID; for (i = 0; i < num_var_ranges; ++i) { - unsigned short start_state, end_state; + unsigned short start_state, end_state, inclusive; if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11))) continue; @@ -166,20 +183,29 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) start_state = ((start & mask) == (base & mask)); end_state = ((end & mask) == (base & mask)); + inclusive = ((start < base) && (end > base)); - if (start_state != end_state) { + if ((start_state != end_state) || inclusive) { /* * We have start:end spanning across an MTRR. - * We split the region into - * either - * (start:mtrr_end) (mtrr_end:end) - * or - * (start:mtrr_start) (mtrr_start:end) + * We split the region into either + * + * - start_state:1 + * (start:mtrr_end)(mtrr_end:end) + * - end_state:1 + * (start:mtrr_start)(mtrr_start:end) + * - inclusive:1 + * (start:mtrr_start)(mtrr_start:mtrr_end)(mtrr_end:end) + * * depending on kind of overlap. - * Return the type for first region and a pointer to - * the start of second region so that caller will - * lookup again on the second region. - * Note: This way we handle multiple overlaps as well. + * + * Return the type of the first region and a pointer + * to the start of next region so that caller will be + * advised to lookup again after having adjusted start + * and end. + * + * Note: This way we handle overlaps with multiple + * entries and the default type properly. */ if (start_state) *partial_end = base + get_mtrr_size(mask); @@ -193,59 +219,94 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) end = *partial_end - 1; /* end is inclusive */ *repeat = 1; + *uniform = 0; } if ((start & mask) != (base & mask)) continue; curr_match = mtrr_state.var_ranges[i].base_lo & 0xff; - if (prev_match == 0xFF) { + if (prev_match == MTRR_TYPE_INVALID) { prev_match = curr_match; continue; } + *uniform = 0; if (check_type_overlap(&prev_match, &curr_match)) return curr_match; } - if (mtrr_tom2) { - if (start >= (1ULL<<32) && (end < mtrr_tom2)) - return MTRR_TYPE_WRBACK; - } - - if (prev_match != 0xFF) + if (prev_match != MTRR_TYPE_INVALID) return prev_match; return mtrr_state.def_type; } -/* - * Returns the effective MTRR type for the region - * Error return: - * 0xFF - when MTRR is not enabled +/** + * mtrr_type_lookup - look up memory type in MTRR + * + * Return Values: + * MTRR_TYPE_(type) - The effective MTRR type for the region + * MTRR_TYPE_INVALID - MTRR is disabled + * + * Output Argument: + * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the + * region is fully covered by a single MTRR entry or the default + * type. */ -u8 mtrr_type_lookup(u64 start, u64 end) +u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform) { - u8 type, prev_type; + u8 type, prev_type, is_uniform = 1, dummy; int repeat; u64 partial_end; - type = __mtrr_type_lookup(start, end, &partial_end, &repeat); + if (!mtrr_state_set) + return MTRR_TYPE_INVALID; + + if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)) + return MTRR_TYPE_INVALID; + + /* + * Look up the fixed ranges first, which take priority over + * the variable ranges. + */ + if ((start < 0x100000) && + (mtrr_state.have_fixed) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { + is_uniform = 0; + type = mtrr_type_lookup_fixed(start, end); + goto out; + } + + /* + * Look up the variable ranges. Look of multiple ranges matching + * this address and pick type as per MTRR precedence. + */ + type = mtrr_type_lookup_variable(start, end, &partial_end, + &repeat, &is_uniform); /* * Common path is with repeat = 0. * However, we can have cases where [start:end] spans across some - * MTRR range. Do repeated lookups for that case here. + * MTRR ranges and/or the default type. Do repeated lookups for + * that case here. */ while (repeat) { prev_type = type; start = partial_end; - type = __mtrr_type_lookup(start, end, &partial_end, &repeat); + is_uniform = 0; + type = mtrr_type_lookup_variable(start, end, &partial_end, + &repeat, &dummy); if (check_type_overlap(&prev_type, &type)) - return type; + goto out; } + if (mtrr_tom2 && (start >= (1ULL<<32)) && (end < mtrr_tom2)) + type = MTRR_TYPE_WRBACK; + +out: + *uniform = is_uniform; return type; } @@ -347,7 +408,9 @@ static void __init print_mtrr_state(void) mtrr_attrib_to_str(mtrr_state.def_type)); if (mtrr_state.have_fixed) { pr_debug("MTRR fixed ranges %sabled:\n", - mtrr_state.enabled & 1 ? "en" : "dis"); + ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ? + "en" : "dis"); print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); for (i = 0; i < 2; ++i) print_fixed(0x80000 + i * 0x20000, 0x04000, @@ -360,7 +423,7 @@ static void __init print_mtrr_state(void) print_fixed_last(); } pr_debug("MTRR variable ranges %sabled:\n", - mtrr_state.enabled & 2 ? "en" : "dis"); + mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis"); high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { @@ -382,7 +445,7 @@ static void __init print_mtrr_state(void) } /* Grab all of the MTRR state for this CPU into *state */ -void __init get_mtrr_state(void) +bool __init get_mtrr_state(void) { struct mtrr_var_range *vrs; unsigned long flags; @@ -426,6 +489,8 @@ void __init get_mtrr_state(void) post_set(); local_irq_restore(flags); + + return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED); } /* Some BIOS's are messed up and don't set all MTRRs the same! */ diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index ea5f363a1948..e7ed0d8ebacb 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -59,6 +59,12 @@ #define MTRR_TO_PHYS_WC_OFFSET 1000 u32 num_var_ranges; +static bool __mtrr_enabled; + +static bool mtrr_enabled(void) +{ + return __mtrr_enabled; +} unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); @@ -286,7 +292,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, int i, replace, error; mtrr_type ltype; - if (!mtrr_if) + if (!mtrr_enabled()) return -ENXIO; error = mtrr_if->validate_add_page(base, size, type); @@ -435,6 +441,8 @@ static int mtrr_check(unsigned long base, unsigned long size) int mtrr_add(unsigned long base, unsigned long size, unsigned int type, bool increment) { + if (!mtrr_enabled()) + return -ENODEV; if (mtrr_check(base, size)) return -EINVAL; return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, @@ -463,8 +471,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) unsigned long lbase, lsize; int error = -EINVAL; - if (!mtrr_if) - return -ENXIO; + if (!mtrr_enabled()) + return -ENODEV; max = num_var_ranges; /* No CPU hotplug when we change MTRR entries */ @@ -523,6 +531,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) */ int mtrr_del(int reg, unsigned long base, unsigned long size) { + if (!mtrr_enabled()) + return -ENODEV; if (mtrr_check(base, size)) return -EINVAL; return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); @@ -538,6 +548,9 @@ EXPORT_SYMBOL(mtrr_del); * attempts to add a WC MTRR covering size bytes starting at base and * logs an error if this fails. * + * The called should provide a power of two size on an equivalent + * power of two boundary. + * * Drivers must store the return value to pass to mtrr_del_wc_if_needed, * but drivers should not try to interpret that return value. */ @@ -545,7 +558,7 @@ int arch_phys_wc_add(unsigned long base, unsigned long size) { int ret; - if (pat_enabled) + if (pat_enabled() || !mtrr_enabled()) return 0; /* Success! (We don't need to do anything.) */ ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true); @@ -577,7 +590,7 @@ void arch_phys_wc_del(int handle) EXPORT_SYMBOL(arch_phys_wc_del); /* - * phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value + * arch_phys_wc_index - translates arch_phys_wc_add's return value * @handle: Return value from arch_phys_wc_add * * This will turn the return value from arch_phys_wc_add into an mtrr @@ -587,14 +600,14 @@ EXPORT_SYMBOL(arch_phys_wc_del); * in printk line. Alas there is an illegitimate use in some ancient * drm ioctls. */ -int phys_wc_to_mtrr_index(int handle) +int arch_phys_wc_index(int handle) { if (handle < MTRR_TO_PHYS_WC_OFFSET) return -1; else return handle - MTRR_TO_PHYS_WC_OFFSET; } -EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index); +EXPORT_SYMBOL_GPL(arch_phys_wc_index); /* * HACK ALERT! @@ -734,10 +747,12 @@ void __init mtrr_bp_init(void) } if (mtrr_if) { + __mtrr_enabled = true; set_num_var_ranges(); init_table(); if (use_intel()) { - get_mtrr_state(); + /* BIOS may override */ + __mtrr_enabled = get_mtrr_state(); if (mtrr_cleanup(phys_addr)) { changed_by_mtrr_cleanup = 1; @@ -745,10 +760,16 @@ void __init mtrr_bp_init(void) } } } + + if (!mtrr_enabled()) + pr_info("MTRR: Disabled\n"); } void mtrr_ap_init(void) { + if (!mtrr_enabled()) + return; + if (!use_intel() || mtrr_aps_delayed_init) return; /* @@ -774,6 +795,9 @@ void mtrr_save_state(void) { int first_cpu; + if (!mtrr_enabled()) + return; + get_online_cpus(); first_cpu = cpumask_first(cpu_online_mask); smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); @@ -782,6 +806,8 @@ void mtrr_save_state(void) void set_mtrr_aps_delayed_init(void) { + if (!mtrr_enabled()) + return; if (!use_intel()) return; @@ -793,7 +819,7 @@ void set_mtrr_aps_delayed_init(void) */ void mtrr_aps_init(void) { - if (!use_intel()) + if (!use_intel() || !mtrr_enabled()) return; /* @@ -810,7 +836,7 @@ void mtrr_aps_init(void) void mtrr_bp_restore(void) { - if (!use_intel()) + if (!use_intel() || !mtrr_enabled()) return; mtrr_if->set_all(); @@ -818,7 +844,7 @@ void mtrr_bp_restore(void) static int __init mtrr_init_finialize(void) { - if (!mtrr_if) + if (!mtrr_enabled()) return 0; if (use_intel()) { diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index df5e41f31a27..951884dcc433 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -51,7 +51,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); -void get_mtrr_state(void); +bool get_mtrr_state(void); extern void set_mtrr_ops(const struct mtrr_ops *ops); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index c76d3e37c6e1..e068d6683dba 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -22,6 +22,7 @@ #include <linux/elfcore.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/vmalloc.h> #include <asm/processor.h> #include <asm/hardirq.h> diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index fe9f0b79a18b..5cb9a4d6f623 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -627,8 +627,12 @@ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, QFLAG_APPLY_ONCE, intel_graphics_stolen }, /* - * HPET on current version of Baytrail platform has accuracy - * problems, disable it for now: + * HPET on the current version of the Baytrail platform has accuracy + * problems: it will halt in deep idle state - so we disable it. + * + * More details can be found in section 18.10.1.3 of the datasheet: + * + * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S deleted file mode 100644 index 1c309763e321..000000000000 --- a/arch/x86/kernel/entry_32.S +++ /dev/null @@ -1,1401 +0,0 @@ -/* - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -/* - * entry.S contains the system-call and fault low-level handling routines. - * This also contains the timer-interrupt handler, as well as all interrupts - * and faults that can result in a task-switch. - * - * NOTE: This code handles signal-recognition, which happens every time - * after a timer-interrupt and after each system call. - * - * I changed all the .align's to 4 (16 byte alignment), as that's faster - * on a 486. - * - * Stack layout in 'syscall_exit': - * ptrace needs to have all regs on the stack. - * if the order here is changed, it needs to be - * updated in fork.c:copy_process, signal.c:do_signal, - * ptrace.c and ptrace.h - * - * 0(%esp) - %ebx - * 4(%esp) - %ecx - * 8(%esp) - %edx - * C(%esp) - %esi - * 10(%esp) - %edi - * 14(%esp) - %ebp - * 18(%esp) - %eax - * 1C(%esp) - %ds - * 20(%esp) - %es - * 24(%esp) - %fs - * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS - * 2C(%esp) - orig_eax - * 30(%esp) - %eip - * 34(%esp) - %cs - * 38(%esp) - %eflags - * 3C(%esp) - %oldesp - * 40(%esp) - %oldss - * - * "current" is in register %ebx during any slow entries. - */ - -#include <linux/linkage.h> -#include <linux/err.h> -#include <asm/thread_info.h> -#include <asm/irqflags.h> -#include <asm/errno.h> -#include <asm/segment.h> -#include <asm/smp.h> -#include <asm/page_types.h> -#include <asm/percpu.h> -#include <asm/dwarf2.h> -#include <asm/processor-flags.h> -#include <asm/ftrace.h> -#include <asm/irq_vectors.h> -#include <asm/cpufeature.h> -#include <asm/alternative-asm.h> -#include <asm/asm.h> -#include <asm/smap.h> - -/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ -#include <linux/elf-em.h> -#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_LE 0x40000000 - -#ifndef CONFIG_AUDITSYSCALL -#define sysenter_audit syscall_trace_entry -#define sysexit_audit syscall_exit_work -#endif - - .section .entry.text, "ax" - -/* - * We use macros for low-level operations which need to be overridden - * for paravirtualization. The following will never clobber any registers: - * INTERRUPT_RETURN (aka. "iret") - * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") - * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). - * - * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must - * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). - * Allowing a register to be clobbered can shrink the paravirt replacement - * enough to patch inline, increasing performance. - */ - -#ifdef CONFIG_PREEMPT -#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF -#else -#define preempt_stop(clobbers) -#define resume_kernel restore_all -#endif - -.macro TRACE_IRQS_IRET -#ifdef CONFIG_TRACE_IRQFLAGS - testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off? - jz 1f - TRACE_IRQS_ON -1: -#endif -.endm - -/* - * User gs save/restore - * - * %gs is used for userland TLS and kernel only uses it for stack - * canary which is required to be at %gs:20 by gcc. Read the comment - * at the top of stackprotector.h for more info. - * - * Local labels 98 and 99 are used. - */ -#ifdef CONFIG_X86_32_LAZY_GS - - /* unfortunately push/pop can't be no-op */ -.macro PUSH_GS - pushl_cfi $0 -.endm -.macro POP_GS pop=0 - addl $(4 + \pop), %esp - CFI_ADJUST_CFA_OFFSET -(4 + \pop) -.endm -.macro POP_GS_EX -.endm - - /* all the rest are no-op */ -.macro PTGS_TO_GS -.endm -.macro PTGS_TO_GS_EX -.endm -.macro GS_TO_REG reg -.endm -.macro REG_TO_PTGS reg -.endm -.macro SET_KERNEL_GS reg -.endm - -#else /* CONFIG_X86_32_LAZY_GS */ - -.macro PUSH_GS - pushl_cfi %gs - /*CFI_REL_OFFSET gs, 0*/ -.endm - -.macro POP_GS pop=0 -98: popl_cfi %gs - /*CFI_RESTORE gs*/ - .if \pop <> 0 - add $\pop, %esp - CFI_ADJUST_CFA_OFFSET -\pop - .endif -.endm -.macro POP_GS_EX -.pushsection .fixup, "ax" -99: movl $0, (%esp) - jmp 98b -.popsection - _ASM_EXTABLE(98b,99b) -.endm - -.macro PTGS_TO_GS -98: mov PT_GS(%esp), %gs -.endm -.macro PTGS_TO_GS_EX -.pushsection .fixup, "ax" -99: movl $0, PT_GS(%esp) - jmp 98b -.popsection - _ASM_EXTABLE(98b,99b) -.endm - -.macro GS_TO_REG reg - movl %gs, \reg - /*CFI_REGISTER gs, \reg*/ -.endm -.macro REG_TO_PTGS reg - movl \reg, PT_GS(%esp) - /*CFI_REL_OFFSET gs, PT_GS*/ -.endm -.macro SET_KERNEL_GS reg - movl $(__KERNEL_STACK_CANARY), \reg - movl \reg, %gs -.endm - -#endif /* CONFIG_X86_32_LAZY_GS */ - -.macro SAVE_ALL - cld - PUSH_GS - pushl_cfi %fs - /*CFI_REL_OFFSET fs, 0;*/ - pushl_cfi %es - /*CFI_REL_OFFSET es, 0;*/ - pushl_cfi %ds - /*CFI_REL_OFFSET ds, 0;*/ - pushl_cfi %eax - CFI_REL_OFFSET eax, 0 - pushl_cfi %ebp - CFI_REL_OFFSET ebp, 0 - pushl_cfi %edi - CFI_REL_OFFSET edi, 0 - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %edx - CFI_REL_OFFSET edx, 0 - pushl_cfi %ecx - CFI_REL_OFFSET ecx, 0 - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 - movl $(__USER_DS), %edx - movl %edx, %ds - movl %edx, %es - movl $(__KERNEL_PERCPU), %edx - movl %edx, %fs - SET_KERNEL_GS %edx -.endm - -.macro RESTORE_INT_REGS - popl_cfi %ebx - CFI_RESTORE ebx - popl_cfi %ecx - CFI_RESTORE ecx - popl_cfi %edx - CFI_RESTORE edx - popl_cfi %esi - CFI_RESTORE esi - popl_cfi %edi - CFI_RESTORE edi - popl_cfi %ebp - CFI_RESTORE ebp - popl_cfi %eax - CFI_RESTORE eax -.endm - -.macro RESTORE_REGS pop=0 - RESTORE_INT_REGS -1: popl_cfi %ds - /*CFI_RESTORE ds;*/ -2: popl_cfi %es - /*CFI_RESTORE es;*/ -3: popl_cfi %fs - /*CFI_RESTORE fs;*/ - POP_GS \pop -.pushsection .fixup, "ax" -4: movl $0, (%esp) - jmp 1b -5: movl $0, (%esp) - jmp 2b -6: movl $0, (%esp) - jmp 3b -.popsection - _ASM_EXTABLE(1b,4b) - _ASM_EXTABLE(2b,5b) - _ASM_EXTABLE(3b,6b) - POP_GS_EX -.endm - -.macro RING0_INT_FRAME - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, 3*4 - /*CFI_OFFSET cs, -2*4;*/ - CFI_OFFSET eip, -3*4 -.endm - -.macro RING0_EC_FRAME - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, 4*4 - /*CFI_OFFSET cs, -2*4;*/ - CFI_OFFSET eip, -3*4 -.endm - -.macro RING0_PTREGS_FRAME - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, PT_OLDESP-PT_EBX - /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/ - CFI_OFFSET eip, PT_EIP-PT_OLDESP - /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/ - /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/ - CFI_OFFSET eax, PT_EAX-PT_OLDESP - CFI_OFFSET ebp, PT_EBP-PT_OLDESP - CFI_OFFSET edi, PT_EDI-PT_OLDESP - CFI_OFFSET esi, PT_ESI-PT_OLDESP - CFI_OFFSET edx, PT_EDX-PT_OLDESP - CFI_OFFSET ecx, PT_ECX-PT_OLDESP - CFI_OFFSET ebx, PT_EBX-PT_OLDESP -.endm - -ENTRY(ret_from_fork) - CFI_STARTPROC - pushl_cfi %eax - call schedule_tail - GET_THREAD_INFO(%ebp) - popl_cfi %eax - pushl_cfi $0x0202 # Reset kernel eflags - popfl_cfi - jmp syscall_exit - CFI_ENDPROC -END(ret_from_fork) - -ENTRY(ret_from_kernel_thread) - CFI_STARTPROC - pushl_cfi %eax - call schedule_tail - GET_THREAD_INFO(%ebp) - popl_cfi %eax - pushl_cfi $0x0202 # Reset kernel eflags - popfl_cfi - movl PT_EBP(%esp),%eax - call *PT_EBX(%esp) - movl $0,PT_EAX(%esp) - jmp syscall_exit - CFI_ENDPROC -ENDPROC(ret_from_kernel_thread) - -/* - * Return to user mode is not as complex as all this looks, - * but we want the default path for a system call return to - * go as quickly as possible which is why some of this is - * less clear than it otherwise should be. - */ - - # userspace resumption stub bypassing syscall exit tracing - ALIGN - RING0_PTREGS_FRAME -ret_from_exception: - preempt_stop(CLBR_ANY) -ret_from_intr: - GET_THREAD_INFO(%ebp) -#ifdef CONFIG_VM86 - movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS - movb PT_CS(%esp), %al - andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax -#else - /* - * We can be coming here from child spawned by kernel_thread(). - */ - movl PT_CS(%esp), %eax - andl $SEGMENT_RPL_MASK, %eax -#endif - cmpl $USER_RPL, %eax - jb resume_kernel # not returning to v8086 or userspace - -ENTRY(resume_userspace) - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - andl $_TIF_WORK_MASK, %ecx # is there any work to be done on - # int/exception return? - jne work_pending - jmp restore_all -END(ret_from_exception) - -#ifdef CONFIG_PREEMPT -ENTRY(resume_kernel) - DISABLE_INTERRUPTS(CLBR_ANY) -need_resched: - cmpl $0,PER_CPU_VAR(__preempt_count) - jnz restore_all - testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? - jz restore_all - call preempt_schedule_irq - jmp need_resched -END(resume_kernel) -#endif - CFI_ENDPROC - -/* SYSENTER_RETURN points to after the "sysenter" instruction in - the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ - - # sysenter call handler stub -ENTRY(ia32_sysenter_target) - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, 0 - CFI_REGISTER esp, ebp - movl TSS_sysenter_sp0(%esp),%esp -sysenter_past_esp: - /* - * Interrupts are disabled here, but we can't trace it until - * enough kernel state to call TRACE_IRQS_OFF can be called - but - * we immediately enable interrupts at that point anyway. - */ - pushl_cfi $__USER_DS - /*CFI_REL_OFFSET ss, 0*/ - pushl_cfi %ebp - CFI_REL_OFFSET esp, 0 - pushfl_cfi - orl $X86_EFLAGS_IF, (%esp) - pushl_cfi $__USER_CS - /*CFI_REL_OFFSET cs, 0*/ - /* - * Push current_thread_info()->sysenter_return to the stack. - * A tiny bit of offset fixup is necessary: TI_sysenter_return - * is relative to thread_info, which is at the bottom of the - * kernel stack page. 4*4 means the 4 words pushed above; - * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; - * and THREAD_SIZE takes us to the bottom. - */ - pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) - CFI_REL_OFFSET eip, 0 - - pushl_cfi %eax - SAVE_ALL - ENABLE_INTERRUPTS(CLBR_NONE) - -/* - * Load the potential sixth argument from user stack. - * Careful about security. - */ - cmpl $__PAGE_OFFSET-3,%ebp - jae syscall_fault - ASM_STAC -1: movl (%ebp),%ebp - ASM_CLAC - movl %ebp,PT_EBP(%esp) - _ASM_EXTABLE(1b,syscall_fault) - - GET_THREAD_INFO(%ebp) - - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) - jnz sysenter_audit -sysenter_do_call: - cmpl $(NR_syscalls), %eax - jae sysenter_badsys - call *sys_call_table(,%eax,4) -sysenter_after_call: - movl %eax,PT_EAX(%esp) - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $_TIF_ALLWORK_MASK, %ecx - jnz sysexit_audit -sysenter_exit: -/* if something modifies registers it must also disable sysexit */ - movl PT_EIP(%esp), %edx - movl PT_OLDESP(%esp), %ecx - xorl %ebp,%ebp - TRACE_IRQS_ON -1: mov PT_FS(%esp), %fs - PTGS_TO_GS - ENABLE_INTERRUPTS_SYSEXIT - -#ifdef CONFIG_AUDITSYSCALL -sysenter_audit: - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) - jnz syscall_trace_entry - /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */ - movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */ - /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */ - pushl_cfi PT_ESI(%esp) /* a3: 5th arg */ - pushl_cfi PT_EDX+4(%esp) /* a2: 4th arg */ - call __audit_syscall_entry - popl_cfi %ecx /* get that remapped edx off the stack */ - popl_cfi %ecx /* get that remapped esi off the stack */ - movl PT_EAX(%esp),%eax /* reload syscall number */ - jmp sysenter_do_call - -sysexit_audit: - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jnz syscall_exit_work - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) - movl %eax,%edx /* second arg, syscall return value */ - cmpl $-MAX_ERRNO,%eax /* is it an error ? */ - setbe %al /* 1 if so, 0 if not */ - movzbl %al,%eax /* zero-extend that */ - call __audit_syscall_exit - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jnz syscall_exit_work - movl PT_EAX(%esp),%eax /* reload syscall return value */ - jmp sysenter_exit -#endif - - CFI_ENDPROC -.pushsection .fixup,"ax" -2: movl $0,PT_FS(%esp) - jmp 1b -.popsection - _ASM_EXTABLE(1b,2b) - PTGS_TO_GS_EX -ENDPROC(ia32_sysenter_target) - - # system call handler stub -ENTRY(system_call) - RING0_INT_FRAME # can't unwind into user space anyway - ASM_CLAC - pushl_cfi %eax # save orig_eax - SAVE_ALL - GET_THREAD_INFO(%ebp) - # system call tracing in operation / emulation - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) - jnz syscall_trace_entry - cmpl $(NR_syscalls), %eax - jae syscall_badsys -syscall_call: - call *sys_call_table(,%eax,4) -syscall_after_call: - movl %eax,PT_EAX(%esp) # store the return value -syscall_exit: - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $_TIF_ALLWORK_MASK, %ecx # current->work - jnz syscall_exit_work - -restore_all: - TRACE_IRQS_IRET -restore_all_notrace: -#ifdef CONFIG_X86_ESPFIX32 - movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS - # Warning: PT_OLDSS(%esp) contains the wrong/random values if we - # are returning to the kernel. - # See comments in process.c:copy_thread() for details. - movb PT_OLDSS(%esp), %ah - movb PT_CS(%esp), %al - andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax - cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax - CFI_REMEMBER_STATE - je ldt_ss # returning to user-space with LDT SS -#endif -restore_nocheck: - RESTORE_REGS 4 # skip orig_eax/error_code -irq_return: - INTERRUPT_RETURN -.section .fixup,"ax" -ENTRY(iret_exc) - pushl $0 # no error code - pushl $do_iret_error - jmp error_code -.previous - _ASM_EXTABLE(irq_return,iret_exc) - -#ifdef CONFIG_X86_ESPFIX32 - CFI_RESTORE_STATE -ldt_ss: -#ifdef CONFIG_PARAVIRT - /* - * The kernel can't run on a non-flat stack if paravirt mode - * is active. Rather than try to fixup the high bits of - * ESP, bypass this code entirely. This may break DOSemu - * and/or Wine support in a paravirt VM, although the option - * is still available to implement the setting of the high - * 16-bits in the INTERRUPT_RETURN paravirt-op. - */ - cmpl $0, pv_info+PARAVIRT_enabled - jne restore_nocheck -#endif - -/* - * Setup and switch to ESPFIX stack - * - * We're returning to userspace with a 16 bit stack. The CPU will not - * restore the high word of ESP for us on executing iret... This is an - * "official" bug of all the x86-compatible CPUs, which we can work - * around to make dosemu and wine happy. We do this by preloading the - * high word of ESP with the high word of the userspace ESP while - * compensating for the offset by changing to the ESPFIX segment with - * a base address that matches for the difference. - */ -#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) - mov %esp, %edx /* load kernel esp */ - mov PT_OLDESP(%esp), %eax /* load userspace esp */ - mov %dx, %ax /* eax: new kernel esp */ - sub %eax, %edx /* offset (low word is 0) */ - shr $16, %edx - mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ - mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ - pushl_cfi $__ESPFIX_SS - pushl_cfi %eax /* new kernel esp */ - /* Disable interrupts, but do not irqtrace this section: we - * will soon execute iret and the tracer was already set to - * the irqstate after the iret */ - DISABLE_INTERRUPTS(CLBR_EAX) - lss (%esp), %esp /* switch to espfix segment */ - CFI_ADJUST_CFA_OFFSET -8 - jmp restore_nocheck -#endif - CFI_ENDPROC -ENDPROC(system_call) - - # perform work that needs to be done immediately before resumption - ALIGN - RING0_PTREGS_FRAME # can't unwind into user space anyway -work_pending: - testb $_TIF_NEED_RESCHED, %cl - jz work_notifysig -work_resched: - call schedule - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - andl $_TIF_WORK_MASK, %ecx # is there any work to be done other - # than syscall tracing? - jz restore_all - testb $_TIF_NEED_RESCHED, %cl - jnz work_resched - -work_notifysig: # deal with pending signals and - # notify-resume requests -#ifdef CONFIG_VM86 - testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) - movl %esp, %eax - jnz work_notifysig_v86 # returning to kernel-space or - # vm86-space -1: -#else - movl %esp, %eax -#endif - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - movb PT_CS(%esp), %bl - andb $SEGMENT_RPL_MASK, %bl - cmpb $USER_RPL, %bl - jb resume_kernel - xorl %edx, %edx - call do_notify_resume - jmp resume_userspace - -#ifdef CONFIG_VM86 - ALIGN -work_notifysig_v86: - pushl_cfi %ecx # save ti_flags for do_notify_resume - call save_v86_state # %eax contains pt_regs pointer - popl_cfi %ecx - movl %eax, %esp - jmp 1b -#endif -END(work_pending) - - # perform syscall exit tracing - ALIGN -syscall_trace_entry: - movl $-ENOSYS,PT_EAX(%esp) - movl %esp, %eax - call syscall_trace_enter - /* What it returned is what we'll actually use. */ - cmpl $(NR_syscalls), %eax - jnae syscall_call - jmp syscall_exit -END(syscall_trace_entry) - - # perform syscall exit tracing - ALIGN -syscall_exit_work: - testl $_TIF_WORK_SYSCALL_EXIT, %ecx - jz work_pending - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call - # schedule() instead - movl %esp, %eax - call syscall_trace_leave - jmp resume_userspace -END(syscall_exit_work) - CFI_ENDPROC - - RING0_INT_FRAME # can't unwind into user space anyway -syscall_fault: - ASM_CLAC - GET_THREAD_INFO(%ebp) - movl $-EFAULT,PT_EAX(%esp) - jmp resume_userspace -END(syscall_fault) - -syscall_badsys: - movl $-ENOSYS,%eax - jmp syscall_after_call -END(syscall_badsys) - -sysenter_badsys: - movl $-ENOSYS,%eax - jmp sysenter_after_call -END(sysenter_badsys) - CFI_ENDPROC - -.macro FIXUP_ESPFIX_STACK -/* - * Switch back for ESPFIX stack to the normal zerobased stack - * - * We can't call C functions using the ESPFIX stack. This code reads - * the high word of the segment base from the GDT and swiches to the - * normal stack and adjusts ESP with the matching offset. - */ -#ifdef CONFIG_X86_ESPFIX32 - /* fixup the stack */ - mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ - mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ - shl $16, %eax - addl %esp, %eax /* the adjusted stack pointer */ - pushl_cfi $__KERNEL_DS - pushl_cfi %eax - lss (%esp), %esp /* switch to the normal stack segment */ - CFI_ADJUST_CFA_OFFSET -8 -#endif -.endm -.macro UNWIND_ESPFIX_STACK -#ifdef CONFIG_X86_ESPFIX32 - movl %ss, %eax - /* see if on espfix stack */ - cmpw $__ESPFIX_SS, %ax - jne 27f - movl $__KERNEL_DS, %eax - movl %eax, %ds - movl %eax, %es - /* switch to normal stack */ - FIXUP_ESPFIX_STACK -27: -#endif -.endm - -/* - * Build the entry stubs with some assembler magic. - * We pack 1 stub into every 8-byte block. - */ - .align 8 -ENTRY(irq_entries_start) - RING0_INT_FRAME - vector=FIRST_EXTERNAL_VECTOR - .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ - vector=vector+1 - jmp common_interrupt - CFI_ADJUST_CFA_OFFSET -4 - .align 8 - .endr -END(irq_entries_start) - -/* - * the CPU automatically disables interrupts when executing an IRQ vector, - * so IRQ-flags tracing has to follow that: - */ - .p2align CONFIG_X86_L1_CACHE_SHIFT -common_interrupt: - ASM_CLAC - addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ - SAVE_ALL - TRACE_IRQS_OFF - movl %esp,%eax - call do_IRQ - jmp ret_from_intr -ENDPROC(common_interrupt) - CFI_ENDPROC - -#define BUILD_INTERRUPT3(name, nr, fn) \ -ENTRY(name) \ - RING0_INT_FRAME; \ - ASM_CLAC; \ - pushl_cfi $~(nr); \ - SAVE_ALL; \ - TRACE_IRQS_OFF \ - movl %esp,%eax; \ - call fn; \ - jmp ret_from_intr; \ - CFI_ENDPROC; \ -ENDPROC(name) - - -#ifdef CONFIG_TRACING -#define TRACE_BUILD_INTERRUPT(name, nr) \ - BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) -#else -#define TRACE_BUILD_INTERRUPT(name, nr) -#endif - -#define BUILD_INTERRUPT(name, nr) \ - BUILD_INTERRUPT3(name, nr, smp_##name); \ - TRACE_BUILD_INTERRUPT(name, nr) - -/* The include is where all of the SMP etc. interrupts come from */ -#include <asm/entry_arch.h> - -ENTRY(coprocessor_error) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_coprocessor_error - jmp error_code - CFI_ENDPROC -END(coprocessor_error) - -ENTRY(simd_coprocessor_error) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 -#ifdef CONFIG_X86_INVD_BUG - /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ - ALTERNATIVE "pushl_cfi $do_general_protection", \ - "pushl $do_simd_coprocessor_error", \ - X86_FEATURE_XMM -#else - pushl_cfi $do_simd_coprocessor_error -#endif - jmp error_code - CFI_ENDPROC -END(simd_coprocessor_error) - -ENTRY(device_not_available) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $-1 # mark this as an int - pushl_cfi $do_device_not_available - jmp error_code - CFI_ENDPROC -END(device_not_available) - -#ifdef CONFIG_PARAVIRT -ENTRY(native_iret) - iret - _ASM_EXTABLE(native_iret, iret_exc) -END(native_iret) - -ENTRY(native_irq_enable_sysexit) - sti - sysexit -END(native_irq_enable_sysexit) -#endif - -ENTRY(overflow) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_overflow - jmp error_code - CFI_ENDPROC -END(overflow) - -ENTRY(bounds) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_bounds - jmp error_code - CFI_ENDPROC -END(bounds) - -ENTRY(invalid_op) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_invalid_op - jmp error_code - CFI_ENDPROC -END(invalid_op) - -ENTRY(coprocessor_segment_overrun) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_coprocessor_segment_overrun - jmp error_code - CFI_ENDPROC -END(coprocessor_segment_overrun) - -ENTRY(invalid_TSS) - RING0_EC_FRAME - ASM_CLAC - pushl_cfi $do_invalid_TSS - jmp error_code - CFI_ENDPROC -END(invalid_TSS) - -ENTRY(segment_not_present) - RING0_EC_FRAME - ASM_CLAC - pushl_cfi $do_segment_not_present - jmp error_code - CFI_ENDPROC -END(segment_not_present) - -ENTRY(stack_segment) - RING0_EC_FRAME - ASM_CLAC - pushl_cfi $do_stack_segment - jmp error_code - CFI_ENDPROC -END(stack_segment) - -ENTRY(alignment_check) - RING0_EC_FRAME - ASM_CLAC - pushl_cfi $do_alignment_check - jmp error_code - CFI_ENDPROC -END(alignment_check) - -ENTRY(divide_error) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 # no error code - pushl_cfi $do_divide_error - jmp error_code - CFI_ENDPROC -END(divide_error) - -#ifdef CONFIG_X86_MCE -ENTRY(machine_check) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 - pushl_cfi machine_check_vector - jmp error_code - CFI_ENDPROC -END(machine_check) -#endif - -ENTRY(spurious_interrupt_bug) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_spurious_interrupt_bug - jmp error_code - CFI_ENDPROC -END(spurious_interrupt_bug) - -#ifdef CONFIG_XEN -/* Xen doesn't set %esp to be precisely what the normal sysenter - entrypoint expects, so fix it up before using the normal path. */ -ENTRY(xen_sysenter_target) - RING0_INT_FRAME - addl $5*4, %esp /* remove xen-provided frame */ - CFI_ADJUST_CFA_OFFSET -5*4 - jmp sysenter_past_esp - CFI_ENDPROC - -ENTRY(xen_hypervisor_callback) - CFI_STARTPROC - pushl_cfi $-1 /* orig_ax = -1 => not a system call */ - SAVE_ALL - TRACE_IRQS_OFF - - /* Check to see if we got the event in the critical - region in xen_iret_direct, after we've reenabled - events and checked for pending events. This simulates - iret instruction's behaviour where it delivers a - pending interrupt when enabling interrupts. */ - movl PT_EIP(%esp),%eax - cmpl $xen_iret_start_crit,%eax - jb 1f - cmpl $xen_iret_end_crit,%eax - jae 1f - - jmp xen_iret_crit_fixup - -ENTRY(xen_do_upcall) -1: mov %esp, %eax - call xen_evtchn_do_upcall -#ifndef CONFIG_PREEMPT - call xen_maybe_preempt_hcall -#endif - jmp ret_from_intr - CFI_ENDPROC -ENDPROC(xen_hypervisor_callback) - -# Hypervisor uses this for application faults while it executes. -# We get here for two reasons: -# 1. Fault while reloading DS, ES, FS or GS -# 2. Fault while executing IRET -# Category 1 we fix up by reattempting the load, and zeroing the segment -# register if the load fails. -# Category 2 we fix up by jumping to do_iret_error. We cannot use the -# normal Linux return path in this case because if we use the IRET hypercall -# to pop the stack frame we end up in an infinite loop of failsafe callbacks. -# We distinguish between categories by maintaining a status value in EAX. -ENTRY(xen_failsafe_callback) - CFI_STARTPROC - pushl_cfi %eax - movl $1,%eax -1: mov 4(%esp),%ds -2: mov 8(%esp),%es -3: mov 12(%esp),%fs -4: mov 16(%esp),%gs - /* EAX == 0 => Category 1 (Bad segment) - EAX != 0 => Category 2 (Bad IRET) */ - testl %eax,%eax - popl_cfi %eax - lea 16(%esp),%esp - CFI_ADJUST_CFA_OFFSET -16 - jz 5f - jmp iret_exc -5: pushl_cfi $-1 /* orig_ax = -1 => not a system call */ - SAVE_ALL - jmp ret_from_exception - CFI_ENDPROC - -.section .fixup,"ax" -6: xorl %eax,%eax - movl %eax,4(%esp) - jmp 1b -7: xorl %eax,%eax - movl %eax,8(%esp) - jmp 2b -8: xorl %eax,%eax - movl %eax,12(%esp) - jmp 3b -9: xorl %eax,%eax - movl %eax,16(%esp) - jmp 4b -.previous - _ASM_EXTABLE(1b,6b) - _ASM_EXTABLE(2b,7b) - _ASM_EXTABLE(3b,8b) - _ASM_EXTABLE(4b,9b) -ENDPROC(xen_failsafe_callback) - -BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - xen_evtchn_do_upcall) - -#endif /* CONFIG_XEN */ - -#if IS_ENABLED(CONFIG_HYPERV) - -BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - hyperv_vector_handler) - -#endif /* CONFIG_HYPERV */ - -#ifdef CONFIG_FUNCTION_TRACER -#ifdef CONFIG_DYNAMIC_FTRACE - -ENTRY(mcount) - ret -END(mcount) - -ENTRY(ftrace_caller) - pushl %eax - pushl %ecx - pushl %edx - pushl $0 /* Pass NULL as regs pointer */ - movl 4*4(%esp), %eax - movl 0x4(%ebp), %edx - movl function_trace_op, %ecx - subl $MCOUNT_INSN_SIZE, %eax - -.globl ftrace_call -ftrace_call: - call ftrace_stub - - addl $4,%esp /* skip NULL pointer */ - popl %edx - popl %ecx - popl %eax -ftrace_ret: -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -.globl ftrace_graph_call -ftrace_graph_call: - jmp ftrace_stub -#endif - -.globl ftrace_stub -ftrace_stub: - ret -END(ftrace_caller) - -ENTRY(ftrace_regs_caller) - pushf /* push flags before compare (in cs location) */ - - /* - * i386 does not save SS and ESP when coming from kernel. - * Instead, to get sp, ®s->sp is used (see ptrace.h). - * Unfortunately, that means eflags must be at the same location - * as the current return ip is. We move the return ip into the - * ip location, and move flags into the return ip location. - */ - pushl 4(%esp) /* save return ip into ip slot */ - - pushl $0 /* Load 0 into orig_ax */ - pushl %gs - pushl %fs - pushl %es - pushl %ds - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - - movl 13*4(%esp), %eax /* Get the saved flags */ - movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */ - /* clobbering return ip */ - movl $__KERNEL_CS,13*4(%esp) - - movl 12*4(%esp), %eax /* Load ip (1st parameter) */ - subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ - movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ - movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ - pushl %esp /* Save pt_regs as 4th parameter */ - -GLOBAL(ftrace_regs_call) - call ftrace_stub - - addl $4, %esp /* Skip pt_regs */ - movl 14*4(%esp), %eax /* Move flags back into cs */ - movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ - movl 12*4(%esp), %eax /* Get return ip from regs->ip */ - movl %eax, 14*4(%esp) /* Put return ip back for ret */ - - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %eax - popl %ds - popl %es - popl %fs - popl %gs - addl $8, %esp /* Skip orig_ax and ip */ - popf /* Pop flags at end (no addl to corrupt flags) */ - jmp ftrace_ret - - popf - jmp ftrace_stub -#else /* ! CONFIG_DYNAMIC_FTRACE */ - -ENTRY(mcount) - cmpl $__PAGE_OFFSET, %esp - jb ftrace_stub /* Paging not enabled yet? */ - - cmpl $ftrace_stub, ftrace_trace_function - jnz trace -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - cmpl $ftrace_stub, ftrace_graph_return - jnz ftrace_graph_caller - - cmpl $ftrace_graph_entry_stub, ftrace_graph_entry - jnz ftrace_graph_caller -#endif -.globl ftrace_stub -ftrace_stub: - ret - - /* taken from glibc */ -trace: - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - movl 0x4(%ebp), %edx - subl $MCOUNT_INSN_SIZE, %eax - - call *ftrace_trace_function - - popl %edx - popl %ecx - popl %eax - jmp ftrace_stub -END(mcount) -#endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FUNCTION_TRACER */ - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(ftrace_graph_caller) - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - lea 0x4(%ebp), %edx - movl (%ebp), %ecx - subl $MCOUNT_INSN_SIZE, %eax - call prepare_ftrace_return - popl %edx - popl %ecx - popl %eax - ret -END(ftrace_graph_caller) - -.globl return_to_handler -return_to_handler: - pushl %eax - pushl %edx - movl %ebp, %eax - call ftrace_return_to_handler - movl %eax, %ecx - popl %edx - popl %eax - jmp *%ecx -#endif - -#ifdef CONFIG_TRACING -ENTRY(trace_page_fault) - RING0_EC_FRAME - ASM_CLAC - pushl_cfi $trace_do_page_fault - jmp error_code - CFI_ENDPROC -END(trace_page_fault) -#endif - -ENTRY(page_fault) - RING0_EC_FRAME - ASM_CLAC - pushl_cfi $do_page_fault - ALIGN -error_code: - /* the function address is in %gs's slot on the stack */ - pushl_cfi %fs - /*CFI_REL_OFFSET fs, 0*/ - pushl_cfi %es - /*CFI_REL_OFFSET es, 0*/ - pushl_cfi %ds - /*CFI_REL_OFFSET ds, 0*/ - pushl_cfi_reg eax - pushl_cfi_reg ebp - pushl_cfi_reg edi - pushl_cfi_reg esi - pushl_cfi_reg edx - pushl_cfi_reg ecx - pushl_cfi_reg ebx - cld - movl $(__KERNEL_PERCPU), %ecx - movl %ecx, %fs - UNWIND_ESPFIX_STACK - GS_TO_REG %ecx - movl PT_GS(%esp), %edi # get the function address - movl PT_ORIG_EAX(%esp), %edx # get the error code - movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart - REG_TO_PTGS %ecx - SET_KERNEL_GS %ecx - movl $(__USER_DS), %ecx - movl %ecx, %ds - movl %ecx, %es - TRACE_IRQS_OFF - movl %esp,%eax # pt_regs pointer - call *%edi - jmp ret_from_exception - CFI_ENDPROC -END(page_fault) - -/* - * Debug traps and NMI can happen at the one SYSENTER instruction - * that sets up the real kernel stack. Check here, since we can't - * allow the wrong stack to be used. - * - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have - * already pushed 3 words if it hits on the sysenter instruction: - * eflags, cs and eip. - * - * We just load the right stack, and push the three (known) values - * by hand onto the new stack - while updating the return eip past - * the instruction that would have done it for sysenter. - */ -.macro FIX_STACK offset ok label - cmpw $__KERNEL_CS, 4(%esp) - jne \ok -\label: - movl TSS_sysenter_sp0 + \offset(%esp), %esp - CFI_DEF_CFA esp, 0 - CFI_UNDEFINED eip - pushfl_cfi - pushl_cfi $__KERNEL_CS - pushl_cfi $sysenter_past_esp - CFI_REL_OFFSET eip, 0 -.endm - -ENTRY(debug) - RING0_INT_FRAME - ASM_CLAC - cmpl $ia32_sysenter_target,(%esp) - jne debug_stack_correct - FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn -debug_stack_correct: - pushl_cfi $-1 # mark this as an int - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # error code 0 - movl %esp,%eax # pt_regs pointer - call do_debug - jmp ret_from_exception - CFI_ENDPROC -END(debug) - -/* - * NMI is doubly nasty. It can happen _while_ we're handling - * a debug fault, and the debug fault hasn't yet been able to - * clear up the stack. So we first check whether we got an - * NMI on the sysenter entry path, but after that we need to - * check whether we got an NMI on the debug path where the debug - * fault happened on the sysenter path. - */ -ENTRY(nmi) - RING0_INT_FRAME - ASM_CLAC -#ifdef CONFIG_X86_ESPFIX32 - pushl_cfi %eax - movl %ss, %eax - cmpw $__ESPFIX_SS, %ax - popl_cfi %eax - je nmi_espfix_stack -#endif - cmpl $ia32_sysenter_target,(%esp) - je nmi_stack_fixup - pushl_cfi %eax - movl %esp,%eax - /* Do not access memory above the end of our stack page, - * it might not exist. - */ - andl $(THREAD_SIZE-1),%eax - cmpl $(THREAD_SIZE-20),%eax - popl_cfi %eax - jae nmi_stack_correct - cmpl $ia32_sysenter_target,12(%esp) - je nmi_debug_stack_check -nmi_stack_correct: - /* We have a RING0_INT_FRAME here */ - pushl_cfi %eax - SAVE_ALL - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_nmi - jmp restore_all_notrace - CFI_ENDPROC - -nmi_stack_fixup: - RING0_INT_FRAME - FIX_STACK 12, nmi_stack_correct, 1 - jmp nmi_stack_correct - -nmi_debug_stack_check: - /* We have a RING0_INT_FRAME here */ - cmpw $__KERNEL_CS,16(%esp) - jne nmi_stack_correct - cmpl $debug,(%esp) - jb nmi_stack_correct - cmpl $debug_esp_fix_insn,(%esp) - ja nmi_stack_correct - FIX_STACK 24, nmi_stack_correct, 1 - jmp nmi_stack_correct - -#ifdef CONFIG_X86_ESPFIX32 -nmi_espfix_stack: - /* We have a RING0_INT_FRAME here. - * - * create the pointer to lss back - */ - pushl_cfi %ss - pushl_cfi %esp - addl $4, (%esp) - /* copy the iret frame of 12 bytes */ - .rept 3 - pushl_cfi 16(%esp) - .endr - pushl_cfi %eax - SAVE_ALL - FIXUP_ESPFIX_STACK # %eax == %esp - xorl %edx,%edx # zero error code - call do_nmi - RESTORE_REGS - lss 12+4(%esp), %esp # back to espfix stack - CFI_ADJUST_CFA_OFFSET -24 - jmp irq_return -#endif - CFI_ENDPROC -END(nmi) - -ENTRY(int3) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $-1 # mark this as an int - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_int3 - jmp ret_from_exception - CFI_ENDPROC -END(int3) - -ENTRY(general_protection) - RING0_EC_FRAME - pushl_cfi $do_general_protection - jmp error_code - CFI_ENDPROC -END(general_protection) - -#ifdef CONFIG_KVM_GUEST -ENTRY(async_page_fault) - RING0_EC_FRAME - ASM_CLAC - pushl_cfi $do_async_page_fault - jmp error_code - CFI_ENDPROC -END(async_page_fault) -#endif - diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 2b55ee6db053..5a4668136e98 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -167,7 +167,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_bss(); for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) - set_intr_gate(i, early_idt_handlers[i]); + set_intr_gate(i, early_idt_handler_array[i]); load_idt((const struct desc_ptr *)&idt_descr); copy_bootdata(__va(real_mode_data)); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 02d257256200..544dec4cc605 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -478,21 +478,22 @@ is486: __INIT setup_once: /* - * Set up a idt with 256 entries pointing to ignore_int, - * interrupt gates. It doesn't actually load idt - that needs - * to be done on each CPU. Interrupts are enabled elsewhere, - * when we can be relatively sure everything is ok. + * Set up a idt with 256 interrupt gates that push zero if there + * is no error code and then jump to early_idt_handler_common. + * It doesn't actually load the idt - that needs to be done on + * each CPU. Interrupts are enabled elsewhere, when we can be + * relatively sure everything is ok. */ movl $idt_table,%edi - movl $early_idt_handlers,%eax + movl $early_idt_handler_array,%eax movl $NUM_EXCEPTION_VECTORS,%ecx 1: movl %eax,(%edi) movl %eax,4(%edi) /* interrupt gate, dpl=0, present */ movl $(0x8E000000 + __KERNEL_CS),2(%edi) - addl $9,%eax + addl $EARLY_IDT_HANDLER_SIZE,%eax addl $8,%edi loop 1b @@ -524,26 +525,28 @@ setup_once: andl $0,setup_once_ref /* Once is enough, thanks */ ret -ENTRY(early_idt_handlers) +ENTRY(early_idt_handler_array) # 36(%esp) %eflags # 32(%esp) %cs # 28(%esp) %eip # 24(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .if (EXCEPTION_ERRCODE_MASK >> i) & 1 - ASM_NOP2 - .else + .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 pushl $0 # Dummy error code, to make stack frame uniform .endif pushl $i # 20(%esp) Vector number - jmp early_idt_handler + jmp early_idt_handler_common i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr -ENDPROC(early_idt_handlers) +ENDPROC(early_idt_handler_array) - /* This is global to keep gas from relaxing the jumps */ -ENTRY(early_idt_handler) +early_idt_handler_common: + /* + * The stack is the hardware frame, an error code or zero, and the + * vector number. + */ cld cmpl $2,(%esp) # X86_TRAP_NMI @@ -603,7 +606,7 @@ ex_entry: .Lis_nmi: addl $8,%esp /* drop vector number and error code */ iret -ENDPROC(early_idt_handler) +ENDPROC(early_idt_handler_common) /* This is the default interrupt "handler" :-) */ ALIGN diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 43eafc8afb69..e5c27f729a38 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -321,26 +321,28 @@ bad_address: jmp bad_address __INIT - .globl early_idt_handlers -early_idt_handlers: +ENTRY(early_idt_handler_array) # 104(%rsp) %rflags # 96(%rsp) %cs # 88(%rsp) %rip # 80(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .if (EXCEPTION_ERRCODE_MASK >> i) & 1 - ASM_NOP2 - .else + .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 pushq $0 # Dummy error code, to make stack frame uniform .endif pushq $i # 72(%rsp) Vector number - jmp early_idt_handler + jmp early_idt_handler_common i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr +ENDPROC(early_idt_handler_array) -/* This is global to keep gas from relaxing the jumps */ -ENTRY(early_idt_handler) +early_idt_handler_common: + /* + * The stack is the hardware frame, an error code or zero, and the + * vector number. + */ cld cmpl $2,(%rsp) # X86_TRAP_NMI @@ -412,7 +414,7 @@ ENTRY(early_idt_handler) .Lis_nmi: addq $16,%rsp # drop vector number and error code INTERRUPT_RETURN -ENDPROC(early_idt_handler) +ENDPROC(early_idt_handler_common) __INITDATA diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 009183276bb7..6185d3141219 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -173,6 +173,21 @@ static void init_thread_xstate(void) xstate_size = sizeof(struct i387_fxsave_struct); else xstate_size = sizeof(struct i387_fsave_struct); + + /* + * Quirk: we don't yet handle the XSAVES* instructions + * correctly, as we don't correctly convert between + * standard and compacted format when interfacing + * with user-space - so disable it for now. + * + * The difference is small: with recent CPUs the + * compacted format is only marginally smaller than + * the standard FPU state format. + * + * ( This is easy to backport while we are fixing + * XSAVES* support. ) + */ + setup_clear_cpu_cap(X86_FEATURE_XSAVES); } /* diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 7e10c8b4b318..88b366487b0e 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -122,6 +122,12 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_puts(p, " Threshold APIC interrupts\n"); #endif +#ifdef CONFIG_X86_MCE_AMD + seq_printf(p, "%*s: ", prec, "DFR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_deferred_error_count); + seq_puts(p, " Deferred Error APIC interrupts\n"); +#endif #ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 680723a8e4b6..a3a5e158ed69 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -135,6 +135,10 @@ static void __init apic_intr_init(void) alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif +#ifdef CONFIG_X86_MCE_AMD + alloc_intr_gate(DEFERRED_ERROR_VECTOR, deferred_error_interrupt); +#endif + #ifdef CONFIG_X86_LOCAL_APIC /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 9435620062df..1681504e44a4 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -584,6 +584,39 @@ static void kvm_kick_cpu(int cpu) kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); } + +#ifdef CONFIG_QUEUED_SPINLOCKS + +#include <asm/qspinlock.h> + +static void kvm_wait(u8 *ptr, u8 val) +{ + unsigned long flags; + + if (in_nmi()) + return; + + local_irq_save(flags); + + if (READ_ONCE(*ptr) != val) + goto out; + + /* + * halt until it's our turn and kicked. Note that we do safe halt + * for irq enabled case to avoid hang when lock info is overwritten + * in irq spinlock slowpath and no spurious interrupt occur to save us. + */ + if (arch_irqs_disabled_flags(flags)) + halt(); + else + safe_halt(); + +out: + local_irq_restore(flags); +} + +#else /* !CONFIG_QUEUED_SPINLOCKS */ + enum kvm_contention_stat { TAKEN_SLOW, TAKEN_SLOW_PICKUP, @@ -817,6 +850,8 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) } } +#endif /* !CONFIG_QUEUED_SPINLOCKS */ + /* * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. */ @@ -828,8 +863,16 @@ void __init kvm_spinlock_init(void) if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) return; +#ifdef CONFIG_QUEUED_SPINLOCKS + __pv_init_lock_hash(); + pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; + pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); + pv_lock_ops.wait = kvm_wait; + pv_lock_ops.kick = kvm_kick_cpu; +#else /* !CONFIG_QUEUED_SPINLOCKS */ pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning); pv_lock_ops.unlock_kick = kvm_unlock_kick; +#endif } static __init int kvm_spinlock_init_jump(void) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 415480d3ea84..11546b462fa6 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -17,6 +17,7 @@ #include <linux/ftrace.h> #include <linux/io.h> #include <linux/suspend.h> +#include <linux/vmalloc.h> #include <asm/init.h> #include <asm/pgtable.h> diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index bbb6c7316341..33ee3e0efd65 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -8,11 +8,33 @@ #include <asm/paravirt.h> +#ifdef CONFIG_QUEUED_SPINLOCKS +__visible void __native_queued_spin_unlock(struct qspinlock *lock) +{ + native_queued_spin_unlock(lock); +} + +PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock); + +bool pv_is_native_spin_unlock(void) +{ + return pv_lock_ops.queued_spin_unlock.func == + __raw_callee_save___native_queued_spin_unlock; +} +#endif + struct pv_lock_ops pv_lock_ops = { #ifdef CONFIG_SMP +#ifdef CONFIG_QUEUED_SPINLOCKS + .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, + .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock), + .wait = paravirt_nop, + .kick = paravirt_nop, +#else /* !CONFIG_QUEUED_SPINLOCKS */ .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop), .unlock_kick = paravirt_nop, -#endif +#endif /* !CONFIG_QUEUED_SPINLOCKS */ +#endif /* SMP */ }; EXPORT_SYMBOL(pv_lock_ops); diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index d9f32e6d6ab6..e1b013696dde 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -12,6 +12,10 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)"); +#endif + unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) { /* arg in %eax, return in %eax */ @@ -24,6 +28,8 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) return 0; } +extern bool pv_is_native_spin_unlock(void); + unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) { @@ -47,14 +53,22 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, write_cr3); PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_cpu_ops, read_tsc); - - patch_site: - ret = paravirt_patch_insns(ibuf, len, start, end); - break; +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) + case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): + if (pv_is_native_spin_unlock()) { + start = start_pv_lock_ops_queued_spin_unlock; + end = end_pv_lock_ops_queued_spin_unlock; + goto patch_site; + } +#endif default: ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); break; + +patch_site: + ret = paravirt_patch_insns(ibuf, len, start, end); + break; } #undef PATCH_SITE return ret; diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 0de21c62c348..8aa05583bc42 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -21,6 +21,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); DEF_NATIVE(, mov32, "mov %edi, %eax"); DEF_NATIVE(, mov64, "mov %rdi, %rax"); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)"); +#endif + unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) { return paravirt_patch_insns(insnbuf, len, @@ -33,6 +37,8 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) start__mov64, end__mov64); } +extern bool pv_is_native_spin_unlock(void); + unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) { @@ -58,14 +64,22 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); - - patch_site: - ret = paravirt_patch_insns(ibuf, len, start, end); - break; +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) + case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): + if (pv_is_native_spin_unlock()) { + start = start_pv_lock_ops_queued_spin_unlock; + end = end_pv_lock_ops_queued_spin_unlock; + goto patch_site; + } +#endif default: ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); break; + +patch_site: + ret = paravirt_patch_insns(ibuf, len, start, end); + break; } #undef PATCH_SITE return ret; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5e0791f9d3dc..de379366f6d1 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -72,8 +72,7 @@ gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss; #else #include <asm/processor-flags.h> #include <asm/setup.h> - -asmlinkage int system_call(void); +#include <asm/proto.h> #endif /* Must be page-aligned because the real IDT is used in a fixmap. */ @@ -813,18 +812,6 @@ dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { conditional_sti(regs); -#if 0 - /* No need to warn about this any longer. */ - pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); -#endif -} - -asmlinkage __visible void __attribute__((weak)) smp_thermal_interrupt(void) -{ -} - -asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void) -{ } /* @@ -992,12 +979,12 @@ void __init trap_init(void) set_bit(i, used_vectors); #ifdef CONFIG_IA32_EMULATION - set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); + set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_compat); set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif #ifdef CONFIG_X86_32 - set_system_trap_gate(IA32_SYSCALL_VECTOR, &system_call); + set_system_trap_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 59b69f6a2844..1d08ad3582d0 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -16,6 +16,8 @@ #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> +#include <asm/i387.h> /* For use_eager_fpu. Ugh! */ +#include <asm/fpu-internal.h> /* For use_eager_fpu. Ugh! */ #include <asm/user.h> #include <asm/xsave.h> #include "cpuid.h" @@ -95,6 +97,8 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); + vcpu->arch.eager_fpu = guest_cpuid_has_mpx(vcpu); + /* * The existing code assumes virtual address is 48-bit in the canonical * address checks; exit if it is ever changed. diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index c3b1ad9fca81..496b3695d3d3 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -117,4 +117,12 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 7, 0); return best && (best->ebx & bit(X86_FEATURE_RTM)); } + +static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_MPX)); +} #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d43867c33bc4..44a7d2515497 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3736,8 +3736,8 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, } } -void update_permission_bitmask(struct kvm_vcpu *vcpu, - struct kvm_mmu *mmu, bool ept) +static void update_permission_bitmask(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu, bool ept) { unsigned bit, byte, pfec; u8 map; @@ -3918,6 +3918,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) { bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); + bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); struct kvm_mmu *context = &vcpu->arch.mmu; MMU_WARN_ON(VALID_PAGE(context->root_hpa)); @@ -3936,6 +3937,8 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) context->base_role.cr0_wp = is_write_protection(vcpu); context->base_role.smep_andnot_wp = smep && !is_write_protection(vcpu); + context->base_role.smap_andnot_wp + = smap && !is_write_protection(vcpu); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); @@ -4207,12 +4210,18 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; - union kvm_mmu_page_role mask = { .word = 0 }; struct kvm_mmu_page *sp; LIST_HEAD(invalid_list); u64 entry, gentry, *spte; int npte; bool remote_flush, local_flush, zap_page; + union kvm_mmu_page_role mask = (union kvm_mmu_page_role) { + .cr0_wp = 1, + .cr4_pae = 1, + .nxe = 1, + .smep_andnot_wp = 1, + .smap_andnot_wp = 1, + }; /* * If we don't have indirect shadow pages, it means no page is @@ -4238,7 +4247,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); - mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (detect_write_misaligned(sp, gpa, bytes) || detect_write_flooding(sp)) { diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index c7d65637c851..0ada65ecddcf 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -71,8 +71,6 @@ enum { int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); -void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - bool ept); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { @@ -166,6 +164,8 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index = (pfec >> 1) + (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); + WARN_ON(pfec & PFERR_RSVD_MASK); + return (mmu->permissions[index] >> pte_access) & 1; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index fd49c867b25a..6e6d115fe9b5 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -718,6 +718,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, mmu_is_nested(vcpu)); if (likely(r != RET_MMIO_PF_INVALID)) return r; + + /* + * page fault with PFEC.RSVD = 1 is caused by shadow + * page fault, should not be used to walk guest page + * table. + */ + error_code &= ~PFERR_RSVD_MASK; }; r = mmu_topup_memory_caches(vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ce741b8650f6..9afa233b5482 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4381,6 +4381,7 @@ static struct kvm_x86_ops svm_x86_ops = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + .fpu_activate = svm_fpu_activate, .fpu_deactivate = svm_fpu_deactivate, .tlb_flush = svm_flush_tlb, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f7b61687bd79..2d73807f0d31 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10185,6 +10185,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + .fpu_activate = vmx_fpu_activate, .fpu_deactivate = vmx_fpu_deactivate, .tlb_flush = vmx_flush_tlb, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c73efcd03e29..ea306adbbc13 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -702,8 +702,9 @@ EXPORT_SYMBOL_GPL(kvm_set_xcr); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | - X86_CR4_PAE | X86_CR4_SMEP; + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | + X86_CR4_SMEP | X86_CR4_SMAP; + if (cr4 & CR4_RESERVED_BITS) return 1; @@ -744,9 +745,6 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); - if ((cr4 ^ old_cr4) & X86_CR4_SMAP) - update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false); - if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) kvm_update_cpuid(vcpu); @@ -6197,6 +6195,8 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) return; page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); + if (is_error_page(page)) + return; kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page)); /* @@ -7060,7 +7060,9 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) fpu_save_init(&vcpu->arch.guest_fpu); __kernel_fpu_end(); ++vcpu->stat.fpu_reload; - kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); + if (!vcpu->arch.eager_fpu) + kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); + trace_kvm_fpu(0); } @@ -7076,11 +7078,21 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { + struct kvm_vcpu *vcpu; + if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) printk_once(KERN_WARNING "kvm: SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); - return kvm_x86_ops->vcpu_create(kvm, id); + + vcpu = kvm_x86_ops->vcpu_create(kvm, id); + + /* + * Activate fpu unconditionally in case the guest needs eager FPU. It will be + * deactivated soon if it doesn't. + */ + kvm_x86_ops->fpu_activate(vcpu); + return vcpu; } int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 982989d282ff..f2587888d987 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -17,7 +17,6 @@ clean-files := inat-tables.c obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o lib-y := delay.o misc.o cmdline.o -lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S index 00933d5e992f..9b0ca8fe80fc 100644 --- a/arch/x86/lib/atomic64_386_32.S +++ b/arch/x86/lib/atomic64_386_32.S @@ -11,26 +11,23 @@ #include <linux/linkage.h> #include <asm/alternative-asm.h> -#include <asm/dwarf2.h> /* if you want SMP support, implement these with real spinlocks */ .macro LOCK reg - pushfl_cfi + pushfl cli .endm .macro UNLOCK reg - popfl_cfi + popfl .endm #define BEGIN(op) \ .macro endp; \ - CFI_ENDPROC; \ ENDPROC(atomic64_##op##_386); \ .purgem endp; \ .endm; \ ENTRY(atomic64_##op##_386); \ - CFI_STARTPROC; \ LOCK v; #define ENDP endp diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S index 082a85167a5b..db3ae85440ff 100644 --- a/arch/x86/lib/atomic64_cx8_32.S +++ b/arch/x86/lib/atomic64_cx8_32.S @@ -11,7 +11,6 @@ #include <linux/linkage.h> #include <asm/alternative-asm.h> -#include <asm/dwarf2.h> .macro read64 reg movl %ebx, %eax @@ -22,16 +21,11 @@ .endm ENTRY(atomic64_read_cx8) - CFI_STARTPROC - read64 %ecx ret - CFI_ENDPROC ENDPROC(atomic64_read_cx8) ENTRY(atomic64_set_cx8) - CFI_STARTPROC - 1: /* we don't need LOCK_PREFIX since aligned 64-bit writes * are atomic on 586 and newer */ @@ -39,28 +33,23 @@ ENTRY(atomic64_set_cx8) jne 1b ret - CFI_ENDPROC ENDPROC(atomic64_set_cx8) ENTRY(atomic64_xchg_cx8) - CFI_STARTPROC - 1: LOCK_PREFIX cmpxchg8b (%esi) jne 1b ret - CFI_ENDPROC ENDPROC(atomic64_xchg_cx8) .macro addsub_return func ins insc ENTRY(atomic64_\func\()_return_cx8) - CFI_STARTPROC - pushl_cfi_reg ebp - pushl_cfi_reg ebx - pushl_cfi_reg esi - pushl_cfi_reg edi + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi movl %eax, %esi movl %edx, %edi @@ -79,12 +68,11 @@ ENTRY(atomic64_\func\()_return_cx8) 10: movl %ebx, %eax movl %ecx, %edx - popl_cfi_reg edi - popl_cfi_reg esi - popl_cfi_reg ebx - popl_cfi_reg ebp + popl %edi + popl %esi + popl %ebx + popl %ebp ret - CFI_ENDPROC ENDPROC(atomic64_\func\()_return_cx8) .endm @@ -93,8 +81,7 @@ addsub_return sub sub sbb .macro incdec_return func ins insc ENTRY(atomic64_\func\()_return_cx8) - CFI_STARTPROC - pushl_cfi_reg ebx + pushl %ebx read64 %esi 1: @@ -109,9 +96,8 @@ ENTRY(atomic64_\func\()_return_cx8) 10: movl %ebx, %eax movl %ecx, %edx - popl_cfi_reg ebx + popl %ebx ret - CFI_ENDPROC ENDPROC(atomic64_\func\()_return_cx8) .endm @@ -119,8 +105,7 @@ incdec_return inc add adc incdec_return dec sub sbb ENTRY(atomic64_dec_if_positive_cx8) - CFI_STARTPROC - pushl_cfi_reg ebx + pushl %ebx read64 %esi 1: @@ -136,18 +121,16 @@ ENTRY(atomic64_dec_if_positive_cx8) 2: movl %ebx, %eax movl %ecx, %edx - popl_cfi_reg ebx + popl %ebx ret - CFI_ENDPROC ENDPROC(atomic64_dec_if_positive_cx8) ENTRY(atomic64_add_unless_cx8) - CFI_STARTPROC - pushl_cfi_reg ebp - pushl_cfi_reg ebx + pushl %ebp + pushl %ebx /* these just push these two parameters on the stack */ - pushl_cfi_reg edi - pushl_cfi_reg ecx + pushl %edi + pushl %ecx movl %eax, %ebp movl %edx, %edi @@ -168,21 +151,18 @@ ENTRY(atomic64_add_unless_cx8) movl $1, %eax 3: addl $8, %esp - CFI_ADJUST_CFA_OFFSET -8 - popl_cfi_reg ebx - popl_cfi_reg ebp + popl %ebx + popl %ebp ret 4: cmpl %edx, 4(%esp) jne 2b xorl %eax, %eax jmp 3b - CFI_ENDPROC ENDPROC(atomic64_add_unless_cx8) ENTRY(atomic64_inc_not_zero_cx8) - CFI_STARTPROC - pushl_cfi_reg ebx + pushl %ebx read64 %esi 1: @@ -199,7 +179,6 @@ ENTRY(atomic64_inc_not_zero_cx8) movl $1, %eax 3: - popl_cfi_reg ebx + popl %ebx ret - CFI_ENDPROC ENDPROC(atomic64_inc_not_zero_cx8) diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index 9bc944a91274..c1e623209853 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -26,7 +26,6 @@ */ #include <linux/linkage.h> -#include <asm/dwarf2.h> #include <asm/errno.h> #include <asm/asm.h> @@ -50,9 +49,8 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) * alignment for the unrolled loop. */ ENTRY(csum_partial) - CFI_STARTPROC - pushl_cfi_reg esi - pushl_cfi_reg ebx + pushl %esi + pushl %ebx movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len movl 12(%esp),%esi # Function arg: unsigned char *buff @@ -129,10 +127,9 @@ ENTRY(csum_partial) jz 8f roll $8, %eax 8: - popl_cfi_reg ebx - popl_cfi_reg esi + popl %ebx + popl %esi ret - CFI_ENDPROC ENDPROC(csum_partial) #else @@ -140,9 +137,8 @@ ENDPROC(csum_partial) /* Version for PentiumII/PPro */ ENTRY(csum_partial) - CFI_STARTPROC - pushl_cfi_reg esi - pushl_cfi_reg ebx + pushl %esi + pushl %ebx movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len movl 12(%esp),%esi # Function arg: const unsigned char *buf @@ -249,10 +245,9 @@ ENTRY(csum_partial) jz 90f roll $8, %eax 90: - popl_cfi_reg ebx - popl_cfi_reg esi + popl %ebx + popl %esi ret - CFI_ENDPROC ENDPROC(csum_partial) #endif @@ -287,12 +282,10 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst, #define FP 12 ENTRY(csum_partial_copy_generic) - CFI_STARTPROC subl $4,%esp - CFI_ADJUST_CFA_OFFSET 4 - pushl_cfi_reg edi - pushl_cfi_reg esi - pushl_cfi_reg ebx + pushl %edi + pushl %esi + pushl %ebx movl ARGBASE+16(%esp),%eax # sum movl ARGBASE+12(%esp),%ecx # len movl ARGBASE+4(%esp),%esi # src @@ -401,12 +394,11 @@ DST( movb %cl, (%edi) ) .previous - popl_cfi_reg ebx - popl_cfi_reg esi - popl_cfi_reg edi - popl_cfi %ecx # equivalent to addl $4,%esp + popl %ebx + popl %esi + popl %edi + popl %ecx # equivalent to addl $4,%esp ret - CFI_ENDPROC ENDPROC(csum_partial_copy_generic) #else @@ -426,10 +418,9 @@ ENDPROC(csum_partial_copy_generic) #define ARGBASE 12 ENTRY(csum_partial_copy_generic) - CFI_STARTPROC - pushl_cfi_reg ebx - pushl_cfi_reg edi - pushl_cfi_reg esi + pushl %ebx + pushl %edi + pushl %esi movl ARGBASE+4(%esp),%esi #src movl ARGBASE+8(%esp),%edi #dst movl ARGBASE+12(%esp),%ecx #len @@ -489,11 +480,10 @@ DST( movb %dl, (%edi) ) jmp 7b .previous - popl_cfi_reg esi - popl_cfi_reg edi - popl_cfi_reg ebx + popl %esi + popl %edi + popl %ebx ret - CFI_ENDPROC ENDPROC(csum_partial_copy_generic) #undef ROUND diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index e67e579c93bd..a2fe51b00cce 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,5 +1,4 @@ #include <linux/linkage.h> -#include <asm/dwarf2.h> #include <asm/cpufeature.h> #include <asm/alternative-asm.h> @@ -15,7 +14,6 @@ * %rdi - page */ ENTRY(clear_page) - CFI_STARTPROC ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \ "jmp clear_page_c_e", X86_FEATURE_ERMS @@ -24,11 +22,9 @@ ENTRY(clear_page) xorl %eax,%eax rep stosq ret - CFI_ENDPROC ENDPROC(clear_page) ENTRY(clear_page_orig) - CFI_STARTPROC xorl %eax,%eax movl $4096/64,%ecx @@ -48,14 +44,11 @@ ENTRY(clear_page_orig) jnz .Lloop nop ret - CFI_ENDPROC ENDPROC(clear_page_orig) ENTRY(clear_page_c_e) - CFI_STARTPROC movl $4096,%ecx xorl %eax,%eax rep stosb ret - CFI_ENDPROC ENDPROC(clear_page_c_e) diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S index 40a172541ee2..9b330242e740 100644 --- a/arch/x86/lib/cmpxchg16b_emu.S +++ b/arch/x86/lib/cmpxchg16b_emu.S @@ -6,7 +6,6 @@ * */ #include <linux/linkage.h> -#include <asm/dwarf2.h> #include <asm/percpu.h> .text @@ -21,7 +20,6 @@ * %al : Operation successful */ ENTRY(this_cpu_cmpxchg16b_emu) -CFI_STARTPROC # # Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not @@ -32,7 +30,7 @@ CFI_STARTPROC # *atomic* on a single cpu (as provided by the this_cpu_xx class of # macros). # - pushfq_cfi + pushfq cli cmpq PER_CPU_VAR((%rsi)), %rax @@ -43,17 +41,13 @@ CFI_STARTPROC movq %rbx, PER_CPU_VAR((%rsi)) movq %rcx, PER_CPU_VAR(8(%rsi)) - CFI_REMEMBER_STATE - popfq_cfi + popfq mov $1, %al ret - CFI_RESTORE_STATE .Lnot_same: - popfq_cfi + popfq xor %al,%al ret -CFI_ENDPROC - ENDPROC(this_cpu_cmpxchg16b_emu) diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S index b4807fce5177..ad5349778490 100644 --- a/arch/x86/lib/cmpxchg8b_emu.S +++ b/arch/x86/lib/cmpxchg8b_emu.S @@ -7,7 +7,6 @@ */ #include <linux/linkage.h> -#include <asm/dwarf2.h> .text @@ -20,14 +19,13 @@ * %ecx : high 32 bits of new value */ ENTRY(cmpxchg8b_emu) -CFI_STARTPROC # # Emulate 'cmpxchg8b (%esi)' on UP except we don't # set the whole ZF thing (caller will just compare # eax:edx with the expected value) # - pushfl_cfi + pushfl cli cmpl (%esi), %eax @@ -38,18 +36,15 @@ CFI_STARTPROC movl %ebx, (%esi) movl %ecx, 4(%esi) - CFI_REMEMBER_STATE - popfl_cfi + popfl ret - CFI_RESTORE_STATE .Lnot_same: movl (%esi), %eax .Lhalf_same: movl 4(%esi), %edx - popfl_cfi + popfl ret -CFI_ENDPROC ENDPROC(cmpxchg8b_emu) diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 8239dbcbf984..009f98216b7e 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -1,7 +1,6 @@ /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ #include <linux/linkage.h> -#include <asm/dwarf2.h> #include <asm/cpufeature.h> #include <asm/alternative-asm.h> @@ -13,22 +12,16 @@ */ ALIGN ENTRY(copy_page) - CFI_STARTPROC ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD movl $4096/8, %ecx rep movsq ret - CFI_ENDPROC ENDPROC(copy_page) ENTRY(copy_page_regs) - CFI_STARTPROC subq $2*8, %rsp - CFI_ADJUST_CFA_OFFSET 2*8 movq %rbx, (%rsp) - CFI_REL_OFFSET rbx, 0 movq %r12, 1*8(%rsp) - CFI_REL_OFFSET r12, 1*8 movl $(4096/64)-5, %ecx .p2align 4 @@ -87,11 +80,7 @@ ENTRY(copy_page_regs) jnz .Loop2 movq (%rsp), %rbx - CFI_RESTORE rbx movq 1*8(%rsp), %r12 - CFI_RESTORE r12 addq $2*8, %rsp - CFI_ADJUST_CFA_OFFSET -2*8 ret - CFI_ENDPROC ENDPROC(copy_page_regs) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index e4b3beee83bd..982ce34f4a9b 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -7,7 +7,6 @@ */ #include <linux/linkage.h> -#include <asm/dwarf2.h> #include <asm/current.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> @@ -18,7 +17,6 @@ /* Standard copy_to_user with segment limit checking */ ENTRY(_copy_to_user) - CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rdi,%rcx addq %rdx,%rcx @@ -30,12 +28,10 @@ ENTRY(_copy_to_user) X86_FEATURE_REP_GOOD, \ "jmp copy_user_enhanced_fast_string", \ X86_FEATURE_ERMS - CFI_ENDPROC ENDPROC(_copy_to_user) /* Standard copy_from_user with segment limit checking */ ENTRY(_copy_from_user) - CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rsi,%rcx addq %rdx,%rcx @@ -47,14 +43,12 @@ ENTRY(_copy_from_user) X86_FEATURE_REP_GOOD, \ "jmp copy_user_enhanced_fast_string", \ X86_FEATURE_ERMS - CFI_ENDPROC ENDPROC(_copy_from_user) .section .fixup,"ax" /* must zero dest */ ENTRY(bad_from_user) bad_from_user: - CFI_STARTPROC movl %edx,%ecx xorl %eax,%eax rep @@ -62,7 +56,6 @@ bad_from_user: bad_to_user: movl %edx,%eax ret - CFI_ENDPROC ENDPROC(bad_from_user) .previous @@ -80,7 +73,6 @@ ENDPROC(bad_from_user) * eax uncopied bytes or 0 if successful. */ ENTRY(copy_user_generic_unrolled) - CFI_STARTPROC ASM_STAC cmpl $8,%edx jb 20f /* less then 8 bytes, go to byte copy loop */ @@ -162,7 +154,6 @@ ENTRY(copy_user_generic_unrolled) _ASM_EXTABLE(19b,40b) _ASM_EXTABLE(21b,50b) _ASM_EXTABLE(22b,50b) - CFI_ENDPROC ENDPROC(copy_user_generic_unrolled) /* Some CPUs run faster using the string copy instructions. @@ -184,7 +175,6 @@ ENDPROC(copy_user_generic_unrolled) * eax uncopied bytes or 0 if successful. */ ENTRY(copy_user_generic_string) - CFI_STARTPROC ASM_STAC cmpl $8,%edx jb 2f /* less than 8 bytes, go to byte copy loop */ @@ -209,7 +199,6 @@ ENTRY(copy_user_generic_string) _ASM_EXTABLE(1b,11b) _ASM_EXTABLE(3b,12b) - CFI_ENDPROC ENDPROC(copy_user_generic_string) /* @@ -225,7 +214,6 @@ ENDPROC(copy_user_generic_string) * eax uncopied bytes or 0 if successful. */ ENTRY(copy_user_enhanced_fast_string) - CFI_STARTPROC ASM_STAC movl %edx,%ecx 1: rep @@ -240,7 +228,6 @@ ENTRY(copy_user_enhanced_fast_string) .previous _ASM_EXTABLE(1b,12b) - CFI_ENDPROC ENDPROC(copy_user_enhanced_fast_string) /* @@ -248,7 +235,6 @@ ENDPROC(copy_user_enhanced_fast_string) * This will force destination/source out of cache for more performance. */ ENTRY(__copy_user_nocache) - CFI_STARTPROC ASM_STAC cmpl $8,%edx jb 20f /* less then 8 bytes, go to byte copy loop */ @@ -332,5 +318,4 @@ ENTRY(__copy_user_nocache) _ASM_EXTABLE(19b,40b) _ASM_EXTABLE(21b,50b) _ASM_EXTABLE(22b,50b) - CFI_ENDPROC ENDPROC(__copy_user_nocache) diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S index 9734182966f3..7e48807b2fa1 100644 --- a/arch/x86/lib/csum-copy_64.S +++ b/arch/x86/lib/csum-copy_64.S @@ -6,7 +6,6 @@ * for more details. No warranty for anything given at all. */ #include <linux/linkage.h> -#include <asm/dwarf2.h> #include <asm/errno.h> #include <asm/asm.h> @@ -47,23 +46,16 @@ ENTRY(csum_partial_copy_generic) - CFI_STARTPROC cmpl $3*64, %edx jle .Lignore .Lignore: subq $7*8, %rsp - CFI_ADJUST_CFA_OFFSET 7*8 movq %rbx, 2*8(%rsp) - CFI_REL_OFFSET rbx, 2*8 movq %r12, 3*8(%rsp) - CFI_REL_OFFSET r12, 3*8 movq %r14, 4*8(%rsp) - CFI_REL_OFFSET r14, 4*8 movq %r13, 5*8(%rsp) - CFI_REL_OFFSET r13, 5*8 movq %rbp, 6*8(%rsp) - CFI_REL_OFFSET rbp, 6*8 movq %r8, (%rsp) movq %r9, 1*8(%rsp) @@ -206,22 +198,14 @@ ENTRY(csum_partial_copy_generic) addl %ebx, %eax adcl %r9d, %eax /* carry */ - CFI_REMEMBER_STATE .Lende: movq 2*8(%rsp), %rbx - CFI_RESTORE rbx movq 3*8(%rsp), %r12 - CFI_RESTORE r12 movq 4*8(%rsp), %r14 - CFI_RESTORE r14 movq 5*8(%rsp), %r13 - CFI_RESTORE r13 movq 6*8(%rsp), %rbp - CFI_RESTORE rbp addq $7*8, %rsp - CFI_ADJUST_CFA_OFFSET -7*8 ret - CFI_RESTORE_STATE /* Exception handlers. Very simple, zeroing is done in the wrappers */ .Lbad_source: @@ -237,5 +221,4 @@ ENTRY(csum_partial_copy_generic) jz .Lende movl $-EFAULT, (%rax) jmp .Lende - CFI_ENDPROC ENDPROC(csum_partial_copy_generic) diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index a4512359656a..46668cda4ffd 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -26,7 +26,6 @@ */ #include <linux/linkage.h> -#include <asm/dwarf2.h> #include <asm/page_types.h> #include <asm/errno.h> #include <asm/asm-offsets.h> @@ -36,7 +35,6 @@ .text ENTRY(__get_user_1) - CFI_STARTPROC GET_THREAD_INFO(%_ASM_DX) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user @@ -45,11 +43,9 @@ ENTRY(__get_user_1) xor %eax,%eax ASM_CLAC ret - CFI_ENDPROC ENDPROC(__get_user_1) ENTRY(__get_user_2) - CFI_STARTPROC add $1,%_ASM_AX jc bad_get_user GET_THREAD_INFO(%_ASM_DX) @@ -60,11 +56,9 @@ ENTRY(__get_user_2) xor %eax,%eax ASM_CLAC ret - CFI_ENDPROC ENDPROC(__get_user_2) ENTRY(__get_user_4) - CFI_STARTPROC add $3,%_ASM_AX jc bad_get_user GET_THREAD_INFO(%_ASM_DX) @@ -75,11 +69,9 @@ ENTRY(__get_user_4) xor %eax,%eax ASM_CLAC ret - CFI_ENDPROC ENDPROC(__get_user_4) ENTRY(__get_user_8) - CFI_STARTPROC #ifdef CONFIG_X86_64 add $7,%_ASM_AX jc bad_get_user @@ -104,28 +96,23 @@ ENTRY(__get_user_8) ASM_CLAC ret #endif - CFI_ENDPROC ENDPROC(__get_user_8) bad_get_user: - CFI_STARTPROC xor %edx,%edx mov $(-EFAULT),%_ASM_AX ASM_CLAC ret - CFI_ENDPROC END(bad_get_user) #ifdef CONFIG_X86_32 bad_get_user_8: - CFI_STARTPROC xor %edx,%edx xor %ecx,%ecx mov $(-EFAULT),%_ASM_AX ASM_CLAC ret - CFI_ENDPROC END(bad_get_user_8) #endif diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S index 05a95e713da8..33147fef3452 100644 --- a/arch/x86/lib/iomap_copy_64.S +++ b/arch/x86/lib/iomap_copy_64.S @@ -16,15 +16,12 @@ */ #include <linux/linkage.h> -#include <asm/dwarf2.h> /* * override generic version in lib/iomap_copy.c */ ENTRY(__iowrite32_copy) - CFI_STARTPROC movl %edx,%ecx rep movsd ret - CFI_ENDPROC ENDPROC(__iowrite32_copy) diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index b046664f5a1c..16698bba87de 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -2,7 +2,6 @@ #include <linux/linkage.h> #include <asm/cpufeature.h> -#include <asm/dwarf2.h> #include <asm/alternative-asm.h> /* @@ -53,7 +52,6 @@ ENTRY(memcpy_erms) ENDPROC(memcpy_erms) ENTRY(memcpy_orig) - CFI_STARTPROC movq %rdi, %rax cmpq $0x20, %rdx @@ -178,5 +176,4 @@ ENTRY(memcpy_orig) .Lend: retq - CFI_ENDPROC ENDPROC(memcpy_orig) diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 0f8a0d0331b9..ca2afdd6d98e 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -6,7 +6,6 @@ * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com> */ #include <linux/linkage.h> -#include <asm/dwarf2.h> #include <asm/cpufeature.h> #include <asm/alternative-asm.h> @@ -27,7 +26,6 @@ ENTRY(memmove) ENTRY(__memmove) - CFI_STARTPROC /* Handle more 32 bytes in loop */ mov %rdi, %rax @@ -207,6 +205,5 @@ ENTRY(__memmove) movb %r11b, (%rdi) 13: retq - CFI_ENDPROC ENDPROC(__memmove) ENDPROC(memmove) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 93118fb23976..2661fad05827 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -1,7 +1,6 @@ /* Copyright 2002 Andi Kleen, SuSE Labs */ #include <linux/linkage.h> -#include <asm/dwarf2.h> #include <asm/cpufeature.h> #include <asm/alternative-asm.h> @@ -66,7 +65,6 @@ ENTRY(memset_erms) ENDPROC(memset_erms) ENTRY(memset_orig) - CFI_STARTPROC movq %rdi,%r10 /* expand byte value */ @@ -78,7 +76,6 @@ ENTRY(memset_orig) movl %edi,%r9d andl $7,%r9d jnz .Lbad_alignment - CFI_REMEMBER_STATE .Lafter_bad_alignment: movq %rdx,%rcx @@ -128,7 +125,6 @@ ENTRY(memset_orig) movq %r10,%rax ret - CFI_RESTORE_STATE .Lbad_alignment: cmpq $7,%rdx jbe .Lhandle_7 @@ -139,5 +135,4 @@ ENTRY(memset_orig) subq %r8,%rdx jmp .Lafter_bad_alignment .Lfinal: - CFI_ENDPROC ENDPROC(memset_orig) diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S index 3ca5218fbece..c81556409bbb 100644 --- a/arch/x86/lib/msr-reg.S +++ b/arch/x86/lib/msr-reg.S @@ -1,6 +1,5 @@ #include <linux/linkage.h> #include <linux/errno.h> -#include <asm/dwarf2.h> #include <asm/asm.h> #include <asm/msr.h> @@ -13,9 +12,8 @@ */ .macro op_safe_regs op ENTRY(\op\()_safe_regs) - CFI_STARTPROC - pushq_cfi_reg rbx - pushq_cfi_reg rbp + pushq %rbx + pushq %rbp movq %rdi, %r10 /* Save pointer */ xorl %r11d, %r11d /* Return value */ movl (%rdi), %eax @@ -25,7 +23,6 @@ ENTRY(\op\()_safe_regs) movl 20(%rdi), %ebp movl 24(%rdi), %esi movl 28(%rdi), %edi - CFI_REMEMBER_STATE 1: \op 2: movl %eax, (%r10) movl %r11d, %eax /* Return value */ @@ -35,16 +32,14 @@ ENTRY(\op\()_safe_regs) movl %ebp, 20(%r10) movl %esi, 24(%r10) movl %edi, 28(%r10) - popq_cfi_reg rbp - popq_cfi_reg rbx + popq %rbp + popq %rbx ret 3: - CFI_RESTORE_STATE movl $-EIO, %r11d jmp 2b _ASM_EXTABLE(1b, 3b) - CFI_ENDPROC ENDPROC(\op\()_safe_regs) .endm @@ -52,13 +47,12 @@ ENDPROC(\op\()_safe_regs) .macro op_safe_regs op ENTRY(\op\()_safe_regs) - CFI_STARTPROC - pushl_cfi_reg ebx - pushl_cfi_reg ebp - pushl_cfi_reg esi - pushl_cfi_reg edi - pushl_cfi $0 /* Return value */ - pushl_cfi %eax + pushl %ebx + pushl %ebp + pushl %esi + pushl %edi + pushl $0 /* Return value */ + pushl %eax movl 4(%eax), %ecx movl 8(%eax), %edx movl 12(%eax), %ebx @@ -66,32 +60,28 @@ ENTRY(\op\()_safe_regs) movl 24(%eax), %esi movl 28(%eax), %edi movl (%eax), %eax - CFI_REMEMBER_STATE 1: \op -2: pushl_cfi %eax +2: pushl %eax movl 4(%esp), %eax - popl_cfi (%eax) + popl (%eax) addl $4, %esp - CFI_ADJUST_CFA_OFFSET -4 movl %ecx, 4(%eax) movl %edx, 8(%eax) movl %ebx, 12(%eax) movl %ebp, 20(%eax) movl %esi, 24(%eax) movl %edi, 28(%eax) - popl_cfi %eax - popl_cfi_reg edi - popl_cfi_reg esi - popl_cfi_reg ebp - popl_cfi_reg ebx + popl %eax + popl %edi + popl %esi + popl %ebp + popl %ebx ret 3: - CFI_RESTORE_STATE movl $-EIO, 4(%esp) jmp 2b _ASM_EXTABLE(1b, 3b) - CFI_ENDPROC ENDPROC(\op\()_safe_regs) .endm diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index fc6ba17a7eec..e0817a12d323 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -11,7 +11,6 @@ * return value. */ #include <linux/linkage.h> -#include <asm/dwarf2.h> #include <asm/thread_info.h> #include <asm/errno.h> #include <asm/asm.h> @@ -30,11 +29,9 @@ * as they get called from within inline assembly. */ -#define ENTER CFI_STARTPROC ; \ - GET_THREAD_INFO(%_ASM_BX) +#define ENTER GET_THREAD_INFO(%_ASM_BX) #define EXIT ASM_CLAC ; \ - ret ; \ - CFI_ENDPROC + ret .text ENTRY(__put_user_1) @@ -87,7 +84,6 @@ ENTRY(__put_user_8) ENDPROC(__put_user_8) bad_put_user: - CFI_STARTPROC movl $-EFAULT,%eax EXIT END(bad_put_user) diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S index 2322abe4da3b..40027db99140 100644 --- a/arch/x86/lib/rwsem.S +++ b/arch/x86/lib/rwsem.S @@ -15,7 +15,6 @@ #include <linux/linkage.h> #include <asm/alternative-asm.h> -#include <asm/dwarf2.h> #define __ASM_HALF_REG(reg) __ASM_SEL(reg, e##reg) #define __ASM_HALF_SIZE(inst) __ASM_SEL(inst##w, inst##l) @@ -34,10 +33,10 @@ */ #define save_common_regs \ - pushl_cfi_reg ecx + pushl %ecx #define restore_common_regs \ - popl_cfi_reg ecx + popl %ecx /* Avoid uglifying the argument copying x86-64 needs to do. */ .macro movq src, dst @@ -64,50 +63,45 @@ */ #define save_common_regs \ - pushq_cfi_reg rdi; \ - pushq_cfi_reg rsi; \ - pushq_cfi_reg rcx; \ - pushq_cfi_reg r8; \ - pushq_cfi_reg r9; \ - pushq_cfi_reg r10; \ - pushq_cfi_reg r11 + pushq %rdi; \ + pushq %rsi; \ + pushq %rcx; \ + pushq %r8; \ + pushq %r9; \ + pushq %r10; \ + pushq %r11 #define restore_common_regs \ - popq_cfi_reg r11; \ - popq_cfi_reg r10; \ - popq_cfi_reg r9; \ - popq_cfi_reg r8; \ - popq_cfi_reg rcx; \ - popq_cfi_reg rsi; \ - popq_cfi_reg rdi + popq %r11; \ + popq %r10; \ + popq %r9; \ + popq %r8; \ + popq %rcx; \ + popq %rsi; \ + popq %rdi #endif /* Fix up special calling conventions */ ENTRY(call_rwsem_down_read_failed) - CFI_STARTPROC save_common_regs - __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx) + __ASM_SIZE(push,) %__ASM_REG(dx) movq %rax,%rdi call rwsem_down_read_failed - __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx) + __ASM_SIZE(pop,) %__ASM_REG(dx) restore_common_regs ret - CFI_ENDPROC ENDPROC(call_rwsem_down_read_failed) ENTRY(call_rwsem_down_write_failed) - CFI_STARTPROC save_common_regs movq %rax,%rdi call rwsem_down_write_failed restore_common_regs ret - CFI_ENDPROC ENDPROC(call_rwsem_down_write_failed) ENTRY(call_rwsem_wake) - CFI_STARTPROC /* do nothing if still outstanding active readers */ __ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx) jnz 1f @@ -116,17 +110,14 @@ ENTRY(call_rwsem_wake) call rwsem_wake restore_common_regs 1: ret - CFI_ENDPROC ENDPROC(call_rwsem_wake) ENTRY(call_rwsem_downgrade_wake) - CFI_STARTPROC save_common_regs - __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx) + __ASM_SIZE(push,) %__ASM_REG(dx) movq %rax,%rdi call rwsem_downgrade_wake - __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx) + __ASM_SIZE(pop,) %__ASM_REG(dx) restore_common_regs ret - CFI_ENDPROC ENDPROC(call_rwsem_downgrade_wake) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 1d553186c434..8533b46e6bee 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -40,7 +40,7 @@ */ uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { [_PAGE_CACHE_MODE_WB ] = 0 | 0 , - [_PAGE_CACHE_MODE_WC ] = _PAGE_PWT | 0 , + [_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD, [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD, @@ -50,11 +50,11 @@ EXPORT_SYMBOL(__cachemode2pte_tbl); uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_WC, + [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC, [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC, + [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, }; diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 9ca35fc60cfe..a9dc7a37e6a2 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -77,13 +77,13 @@ void __iomem * iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) { /* - * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. - * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the - * MTRR is UC or WC. UC_MINUS gets the real intention, of the - * user, which is "WC if the MTRR is WC, UC if you can't do that." + * For non-PAT systems, translate non-WB request to UC- just in + * case the caller set the PWT bit to prot directly without using + * pgprot_writecombine(). UC- translates to uncached if the MTRR + * is UC or WC. UC- gets the real intention, of the user, which is + * "WC if the MTRR is WC, UC if you can't do that." */ - if (!pat_enabled && pgprot_val(prot) == - (__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_WC))) + if (!pat_enabled() && pgprot2cachemode(prot) != _PAGE_CACHE_MODE_WB) prot = __pgprot(__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 70e7444c6835..8405c0c6a535 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -42,6 +42,9 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, case _PAGE_CACHE_MODE_WC: err = _set_memory_wc(vaddr, nrpages); break; + case _PAGE_CACHE_MODE_WT: + err = _set_memory_wt(vaddr, nrpages); + break; case _PAGE_CACHE_MODE_WB: err = _set_memory_wb(vaddr, nrpages); break; @@ -172,6 +175,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, prot = __pgprot(pgprot_val(prot) | cachemode2protval(_PAGE_CACHE_MODE_WC)); break; + case _PAGE_CACHE_MODE_WT: + prot = __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_WT)); + break; case _PAGE_CACHE_MODE_WB: break; } @@ -234,10 +241,11 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) { /* * Ideally, this should be: - * pat_enabled ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS; + * pat_enabled() ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS; * * Till we fix all X drivers to use ioremap_wc(), we will use - * UC MINUS. + * UC MINUS. Drivers that are certain they need or can already + * be converted over to strong UC can use ioremap_uc(). */ enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS; @@ -247,6 +255,39 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) EXPORT_SYMBOL(ioremap_nocache); /** + * ioremap_uc - map bus memory into CPU space as strongly uncachable + * @phys_addr: bus address of the memory + * @size: size of the resource to map + * + * ioremap_uc performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * This version of ioremap ensures that the memory is marked with a strong + * preference as completely uncachable on the CPU when possible. For non-PAT + * systems this ends up setting page-attribute flags PCD=1, PWT=1. For PAT + * systems this will set the PAT entry for the pages as strong UC. This call + * will honor existing caching rules from things like the PCI bus. Note that + * there are other caches and buffers on many busses. In particular driver + * authors should read up on PCI writes. + * + * It's useful if some control registers are in such an area and + * write combining or read caching is not desirable: + * + * Must be freed with iounmap. + */ +void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size) +{ + enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC; + + return __ioremap_caller(phys_addr, size, pcm, + __builtin_return_address(0)); +} +EXPORT_SYMBOL_GPL(ioremap_uc); + +/** * ioremap_wc - map memory into CPU space write combined * @phys_addr: bus address of the memory * @size: size of the resource to map @@ -258,14 +299,28 @@ EXPORT_SYMBOL(ioremap_nocache); */ void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) { - if (pat_enabled) - return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC, + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC, __builtin_return_address(0)); - else - return ioremap_nocache(phys_addr, size); } EXPORT_SYMBOL(ioremap_wc); +/** + * ioremap_wt - map memory into CPU space write through + * @phys_addr: bus address of the memory + * @size: size of the resource to map + * + * This version of ioremap ensures that the memory is marked write through. + * Write through stores data into memory while keeping the cache up-to-date. + * + * Must be freed with iounmap. + */ +void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size) +{ + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_wt); + void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) { return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB, @@ -331,7 +386,7 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); -int arch_ioremap_pud_supported(void) +int __init arch_ioremap_pud_supported(void) { #ifdef CONFIG_X86_64 return cpu_has_gbpages; @@ -340,7 +395,7 @@ int arch_ioremap_pud_supported(void) #endif } -int arch_ioremap_pmd_supported(void) +int __init arch_ioremap_pmd_supported(void) { return cpu_has_pse; } diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 6629f397b467..8ff686aa7e8c 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -9,6 +9,7 @@ #include <linux/random.h> #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/vmalloc.h> #include <asm/cacheflush.h> #include <asm/pgtable.h> diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 89af288ec674..727158cb3b3c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -14,6 +14,7 @@ #include <linux/percpu.h> #include <linux/gfp.h> #include <linux/pci.h> +#include <linux/vmalloc.h> #include <asm/e820.h> #include <asm/processor.h> @@ -129,16 +130,15 @@ within(unsigned long addr, unsigned long start, unsigned long end) */ void clflush_cache_range(void *vaddr, unsigned int size) { - void *vend = vaddr + size - 1; + unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1; + void *vend = vaddr + size; + void *p; mb(); - for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size) - clflushopt(vaddr); - /* - * Flush any possible final partial cacheline: - */ - clflushopt(vend); + for (p = (void *)((unsigned long)vaddr & ~clflush_mask); + p < vend; p += boot_cpu_data.x86_clflush_size) + clflushopt(p); mb(); } @@ -418,13 +418,11 @@ phys_addr_t slow_virt_to_phys(void *__virt_addr) phys_addr_t phys_addr; unsigned long offset; enum pg_level level; - unsigned long psize; unsigned long pmask; pte_t *pte; pte = lookup_address(virt_addr, &level); BUG_ON(!pte); - psize = page_level_size(level); pmask = page_level_mask(level); offset = virt_addr & ~pmask; phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; @@ -1468,6 +1466,9 @@ int _set_memory_uc(unsigned long addr, int numpages) { /* * for now UC MINUS. see comments in ioremap_nocache() + * If you really need strong UC use ioremap_uc(), but note + * that you cannot override IO areas with set_memory_*() as + * these helpers cannot work with IO memory. */ return change_page_attr_set(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), @@ -1502,12 +1503,10 @@ EXPORT_SYMBOL(set_memory_uc); static int _set_memory_array(unsigned long *addr, int addrinarray, enum page_cache_mode new_type) { + enum page_cache_mode set_type; int i, j; int ret; - /* - * for now UC MINUS. see comments in ioremap_nocache() - */ for (i = 0; i < addrinarray; i++) { ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, new_type, NULL); @@ -1515,9 +1514,12 @@ static int _set_memory_array(unsigned long *addr, int addrinarray, goto out_free; } + /* If WC, set to UC- first and then WC */ + set_type = (new_type == _PAGE_CACHE_MODE_WC) ? + _PAGE_CACHE_MODE_UC_MINUS : new_type; + ret = change_page_attr_set(addr, addrinarray, - cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), - 1); + cachemode2pgprot(set_type), 1); if (!ret && new_type == _PAGE_CACHE_MODE_WC) ret = change_page_attr_set_clr(addr, addrinarray, @@ -1549,6 +1551,12 @@ int set_memory_array_wc(unsigned long *addr, int addrinarray) } EXPORT_SYMBOL(set_memory_array_wc); +int set_memory_array_wt(unsigned long *addr, int addrinarray) +{ + return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WT); +} +EXPORT_SYMBOL_GPL(set_memory_array_wt); + int _set_memory_wc(unsigned long addr, int numpages) { int ret; @@ -1571,27 +1579,42 @@ int set_memory_wc(unsigned long addr, int numpages) { int ret; - if (!pat_enabled) - return set_memory_uc(addr, numpages); - ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, _PAGE_CACHE_MODE_WC, NULL); if (ret) - goto out_err; + return ret; ret = _set_memory_wc(addr, numpages); if (ret) - goto out_free; - - return 0; + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); -out_free: - free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); -out_err: return ret; } EXPORT_SYMBOL(set_memory_wc); +int _set_memory_wt(unsigned long addr, int numpages) +{ + return change_page_attr_set(&addr, numpages, + cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0); +} + +int set_memory_wt(unsigned long addr, int numpages) +{ + int ret; + + ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, + _PAGE_CACHE_MODE_WT, NULL); + if (ret) + return ret; + + ret = _set_memory_wt(addr, numpages); + if (ret) + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); + + return ret; +} +EXPORT_SYMBOL_GPL(set_memory_wt); + int _set_memory_wb(unsigned long addr, int numpages) { /* WB cache mode is hard wired to all cache attribute bits being 0 */ @@ -1682,6 +1705,7 @@ static int _set_pages_array(struct page **pages, int addrinarray, { unsigned long start; unsigned long end; + enum page_cache_mode set_type; int i; int free_idx; int ret; @@ -1695,8 +1719,12 @@ static int _set_pages_array(struct page **pages, int addrinarray, goto err_out; } + /* If WC, set to UC- first and then WC */ + set_type = (new_type == _PAGE_CACHE_MODE_WC) ? + _PAGE_CACHE_MODE_UC_MINUS : new_type; + ret = cpa_set_pages_array(pages, addrinarray, - cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS)); + cachemode2pgprot(set_type)); if (!ret && new_type == _PAGE_CACHE_MODE_WC) ret = change_page_attr_set_clr(NULL, addrinarray, cachemode2pgprot( @@ -1730,6 +1758,12 @@ int set_pages_array_wc(struct page **pages, int addrinarray) } EXPORT_SYMBOL(set_pages_array_wc); +int set_pages_array_wt(struct page **pages, int addrinarray) +{ + return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WT); +} +EXPORT_SYMBOL_GPL(set_pages_array_wt); + int set_pages_wb(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 35af6771a95a..188e3e07eeeb 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -33,13 +33,17 @@ #include "pat_internal.h" #include "mm_internal.h" -#ifdef CONFIG_X86_PAT -int __read_mostly pat_enabled = 1; +#undef pr_fmt +#define pr_fmt(fmt) "" fmt + +static bool boot_cpu_done; + +static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT); static inline void pat_disable(const char *reason) { - pat_enabled = 0; - printk(KERN_INFO "%s\n", reason); + __pat_enabled = 0; + pr_info("x86/PAT: %s\n", reason); } static int __init nopat(char *str) @@ -48,13 +52,12 @@ static int __init nopat(char *str) return 0; } early_param("nopat", nopat); -#else -static inline void pat_disable(const char *reason) + +bool pat_enabled(void) { - (void)reason; + return !!__pat_enabled; } -#endif - +EXPORT_SYMBOL_GPL(pat_enabled); int pat_debug_enable; @@ -65,22 +68,24 @@ static int __init pat_debug_setup(char *str) } __setup("debugpat", pat_debug_setup); -static u64 __read_mostly boot_pat_state; - #ifdef CONFIG_X86_PAT /* - * X86 PAT uses page flags WC and Uncached together to keep track of - * memory type of pages that have backing page struct. X86 PAT supports 3 - * different memory types, _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC and - * _PAGE_CACHE_MODE_UC_MINUS and fourth state where page's memory type has not - * been changed from its default (value of -1 used to denote this). - * Note we do not support _PAGE_CACHE_MODE_UC here. + * X86 PAT uses page flags arch_1 and uncached together to keep track of + * memory type of pages that have backing page struct. + * + * X86 PAT supports 4 different memory types: + * - _PAGE_CACHE_MODE_WB + * - _PAGE_CACHE_MODE_WC + * - _PAGE_CACHE_MODE_UC_MINUS + * - _PAGE_CACHE_MODE_WT + * + * _PAGE_CACHE_MODE_WB is the default type. */ -#define _PGMT_DEFAULT 0 +#define _PGMT_WB 0 #define _PGMT_WC (1UL << PG_arch_1) #define _PGMT_UC_MINUS (1UL << PG_uncached) -#define _PGMT_WB (1UL << PG_uncached | 1UL << PG_arch_1) +#define _PGMT_WT (1UL << PG_uncached | 1UL << PG_arch_1) #define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1) #define _PGMT_CLEAR_MASK (~_PGMT_MASK) @@ -88,14 +93,14 @@ static inline enum page_cache_mode get_page_memtype(struct page *pg) { unsigned long pg_flags = pg->flags & _PGMT_MASK; - if (pg_flags == _PGMT_DEFAULT) - return -1; + if (pg_flags == _PGMT_WB) + return _PAGE_CACHE_MODE_WB; else if (pg_flags == _PGMT_WC) return _PAGE_CACHE_MODE_WC; else if (pg_flags == _PGMT_UC_MINUS) return _PAGE_CACHE_MODE_UC_MINUS; else - return _PAGE_CACHE_MODE_WB; + return _PAGE_CACHE_MODE_WT; } static inline void set_page_memtype(struct page *pg, @@ -112,11 +117,12 @@ static inline void set_page_memtype(struct page *pg, case _PAGE_CACHE_MODE_UC_MINUS: memtype_flags = _PGMT_UC_MINUS; break; - case _PAGE_CACHE_MODE_WB: - memtype_flags = _PGMT_WB; + case _PAGE_CACHE_MODE_WT: + memtype_flags = _PGMT_WT; break; + case _PAGE_CACHE_MODE_WB: default: - memtype_flags = _PGMT_DEFAULT; + memtype_flags = _PGMT_WB; break; } @@ -174,78 +180,154 @@ static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg) * configuration. * Using lower indices is preferred, so we start with highest index. */ -void pat_init_cache_modes(void) +void pat_init_cache_modes(u64 pat) { - int i; enum page_cache_mode cache; char pat_msg[33]; - u64 pat; + int i; - rdmsrl(MSR_IA32_CR_PAT, pat); pat_msg[32] = 0; for (i = 7; i >= 0; i--) { cache = pat_get_cache_mode((pat >> (i * 8)) & 7, pat_msg + 4 * i); update_cache_mode_entry(i, cache); } - pr_info("PAT configuration [0-7]: %s\n", pat_msg); + pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg); } #define PAT(x, y) ((u64)PAT_ ## y << ((x)*8)) -void pat_init(void) +static void pat_bsp_init(u64 pat) { - u64 pat; - bool boot_cpu = !boot_pat_state; + u64 tmp_pat; - if (!pat_enabled) + if (!cpu_has_pat) { + pat_disable("PAT not supported by CPU."); return; + } - if (!cpu_has_pat) { - if (!boot_pat_state) { - pat_disable("PAT not supported by CPU."); - return; - } else { - /* - * If this happens we are on a secondary CPU, but - * switched to PAT on the boot CPU. We have no way to - * undo PAT. - */ - printk(KERN_ERR "PAT enabled, " - "but not supported by secondary CPU\n"); - BUG(); - } + if (!pat_enabled()) + goto done; + + rdmsrl(MSR_IA32_CR_PAT, tmp_pat); + if (!tmp_pat) { + pat_disable("PAT MSR is 0, disabled."); + return; } - /* Set PWT to Write-Combining. All other bits stay the same */ - /* - * PTE encoding used in Linux: - * PAT - * |PCD - * ||PWT - * ||| - * 000 WB _PAGE_CACHE_WB - * 001 WC _PAGE_CACHE_WC - * 010 UC- _PAGE_CACHE_UC_MINUS - * 011 UC _PAGE_CACHE_UC - * PAT bit unused - */ - pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | - PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC); - - /* Boot CPU check */ - if (!boot_pat_state) { - rdmsrl(MSR_IA32_CR_PAT, boot_pat_state); - if (!boot_pat_state) { - pat_disable("PAT read returns always zero, disabled."); - return; - } + wrmsrl(MSR_IA32_CR_PAT, pat); + +done: + pat_init_cache_modes(pat); +} + +static void pat_ap_init(u64 pat) +{ + if (!pat_enabled()) + return; + + if (!cpu_has_pat) { + /* + * If this happens we are on a secondary CPU, but switched to + * PAT on the boot CPU. We have no way to undo PAT. + */ + panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n"); } wrmsrl(MSR_IA32_CR_PAT, pat); +} + +void pat_init(void) +{ + u64 pat; + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (!pat_enabled()) { + /* + * No PAT. Emulate the PAT table that corresponds to the two + * cache bits, PWT (Write Through) and PCD (Cache Disable). This + * setup is the same as the BIOS default setup when the system + * has PAT but the "nopat" boot option has been specified. This + * emulated PAT table is used when MSR_IA32_CR_PAT returns 0. + * + * PTE encoding: + * + * PCD + * |PWT PAT + * || slot + * 00 0 WB : _PAGE_CACHE_MODE_WB + * 01 1 WT : _PAGE_CACHE_MODE_WT + * 10 2 UC-: _PAGE_CACHE_MODE_UC_MINUS + * 11 3 UC : _PAGE_CACHE_MODE_UC + * + * NOTE: When WC or WP is used, it is redirected to UC- per + * the default setup in __cachemode2pte_tbl[]. + */ + pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) | + PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC); - if (boot_cpu) - pat_init_cache_modes(); + } else if ((c->x86_vendor == X86_VENDOR_INTEL) && + (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || + ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) { + /* + * PAT support with the lower four entries. Intel Pentium 2, + * 3, M, and 4 are affected by PAT errata, which makes the + * upper four entries unusable. To be on the safe side, we don't + * use those. + * + * PTE encoding: + * PAT + * |PCD + * ||PWT PAT + * ||| slot + * 000 0 WB : _PAGE_CACHE_MODE_WB + * 001 1 WC : _PAGE_CACHE_MODE_WC + * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS + * 011 3 UC : _PAGE_CACHE_MODE_UC + * PAT bit unused + * + * NOTE: When WT or WP is used, it is redirected to UC- per + * the default setup in __cachemode2pte_tbl[]. + */ + pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | + PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC); + } else { + /* + * Full PAT support. We put WT in slot 7 to improve + * robustness in the presence of errata that might cause + * the high PAT bit to be ignored. This way, a buggy slot 7 + * access will hit slot 3, and slot 3 is UC, so at worst + * we lose performance without causing a correctness issue. + * Pentium 4 erratum N46 is an example for such an erratum, + * although we try not to use PAT at all on affected CPUs. + * + * PTE encoding: + * PAT + * |PCD + * ||PWT PAT + * ||| slot + * 000 0 WB : _PAGE_CACHE_MODE_WB + * 001 1 WC : _PAGE_CACHE_MODE_WC + * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS + * 011 3 UC : _PAGE_CACHE_MODE_UC + * 100 4 WB : Reserved + * 101 5 WC : Reserved + * 110 6 UC-: Reserved + * 111 7 WT : _PAGE_CACHE_MODE_WT + * + * The reserved slots are unused, but mapped to their + * corresponding types in the presence of PAT errata. + */ + pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | + PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT); + } + + if (!boot_cpu_done) { + pat_bsp_init(pat); + boot_cpu_done = true; + } else { + pat_ap_init(pat); + } } #undef PAT @@ -267,9 +349,9 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, * request is for WB. */ if (req_type == _PAGE_CACHE_MODE_WB) { - u8 mtrr_type; + u8 mtrr_type, uniform; - mtrr_type = mtrr_type_lookup(start, end); + mtrr_type = mtrr_type_lookup(start, end, &uniform); if (mtrr_type != MTRR_TYPE_WRBACK) return _PAGE_CACHE_MODE_UC_MINUS; @@ -324,9 +406,14 @@ static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) /* * For RAM pages, we use page flags to mark the pages with appropriate type. - * Here we do two pass: - * - Find the memtype of all the pages in the range, look for any conflicts - * - In case of no conflicts, set the new memtype for pages in the range + * The page flags are limited to four types, WB (default), WC, WT and UC-. + * WP request fails with -EINVAL, and UC gets redirected to UC-. Setting + * a new memory type is only allowed for a page mapped with the default WB + * type. + * + * Here we do two passes: + * - Find the memtype of all the pages in the range, look for any conflicts. + * - In case of no conflicts, set the new memtype for pages in the range. */ static int reserve_ram_pages_type(u64 start, u64 end, enum page_cache_mode req_type, @@ -335,6 +422,12 @@ static int reserve_ram_pages_type(u64 start, u64 end, struct page *page; u64 pfn; + if (req_type == _PAGE_CACHE_MODE_WP) { + if (new_type) + *new_type = _PAGE_CACHE_MODE_UC_MINUS; + return -EINVAL; + } + if (req_type == _PAGE_CACHE_MODE_UC) { /* We do not support strong UC */ WARN_ON_ONCE(1); @@ -346,8 +439,8 @@ static int reserve_ram_pages_type(u64 start, u64 end, page = pfn_to_page(pfn); type = get_page_memtype(page); - if (type != -1) { - pr_info("reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n", + if (type != _PAGE_CACHE_MODE_WB) { + pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n", start, end - 1, type, req_type); if (new_type) *new_type = type; @@ -373,7 +466,7 @@ static int free_ram_pages_type(u64 start, u64 end) for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { page = pfn_to_page(pfn); - set_page_memtype(page, -1); + set_page_memtype(page, _PAGE_CACHE_MODE_WB); } return 0; } @@ -384,6 +477,7 @@ static int free_ram_pages_type(u64 start, u64 end) * - _PAGE_CACHE_MODE_WC * - _PAGE_CACHE_MODE_UC_MINUS * - _PAGE_CACHE_MODE_UC + * - _PAGE_CACHE_MODE_WT * * If new_type is NULL, function will return an error if it cannot reserve the * region with req_type. If new_type is non-NULL, function will return @@ -400,14 +494,10 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, BUG_ON(start >= end); /* end is exclusive */ - if (!pat_enabled) { + if (!pat_enabled()) { /* This is identical to page table setting without PAT */ - if (new_type) { - if (req_type == _PAGE_CACHE_MODE_WC) - *new_type = _PAGE_CACHE_MODE_UC_MINUS; - else - *new_type = req_type; - } + if (new_type) + *new_type = req_type; return 0; } @@ -451,9 +541,9 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, err = rbt_memtype_check_insert(new, new_type); if (err) { - printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n", - start, end - 1, - cattr_name(new->type), cattr_name(req_type)); + pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n", + start, end - 1, + cattr_name(new->type), cattr_name(req_type)); kfree(new); spin_unlock(&memtype_lock); @@ -475,7 +565,7 @@ int free_memtype(u64 start, u64 end) int is_range_ram; struct memtype *entry; - if (!pat_enabled) + if (!pat_enabled()) return 0; /* Low ISA region is always mapped WB. No need to track */ @@ -497,8 +587,8 @@ int free_memtype(u64 start, u64 end) spin_unlock(&memtype_lock); if (!entry) { - printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", - current->comm, current->pid, start, end - 1); + pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", + current->comm, current->pid, start, end - 1); return -EINVAL; } @@ -517,7 +607,7 @@ int free_memtype(u64 start, u64 end) * Only to be called when PAT is enabled * * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS - * or _PAGE_CACHE_MODE_UC + * or _PAGE_CACHE_MODE_WT. */ static enum page_cache_mode lookup_memtype(u64 paddr) { @@ -529,16 +619,9 @@ static enum page_cache_mode lookup_memtype(u64 paddr) if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { struct page *page; - page = pfn_to_page(paddr >> PAGE_SHIFT); - rettype = get_page_memtype(page); - /* - * -1 from get_page_memtype() implies RAM page is in its - * default state and not reserved, and hence of type WB - */ - if (rettype == -1) - rettype = _PAGE_CACHE_MODE_WB; - return rettype; + page = pfn_to_page(paddr >> PAGE_SHIFT); + return get_page_memtype(page); } spin_lock(&memtype_lock); @@ -623,13 +706,13 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) u64 to = from + size; u64 cursor = from; - if (!pat_enabled) + if (!pat_enabled()) return 1; while (cursor < to) { if (!devmem_is_allowed(pfn)) { - printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n", - current->comm, from, to - 1); + pr_info("x86/PAT: Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n", + current->comm, from, to - 1); return 0; } cursor += PAGE_SIZE; @@ -659,7 +742,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, * caching for the high addresses through the KEN pin, but * we maintain the tradition of paranoia in this code. */ - if (!pat_enabled && + if (!pat_enabled() && !(boot_cpu_has(X86_FEATURE_MTRR) || boot_cpu_has(X86_FEATURE_K6_MTRR) || boot_cpu_has(X86_FEATURE_CYRIX_ARR) || @@ -698,8 +781,7 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, size; if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) { - printk(KERN_INFO "%s:%d ioremap_change_attr failed %s " - "for [mem %#010Lx-%#010Lx]\n", + pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n", current->comm, current->pid, cattr_name(pcm), base, (unsigned long long)(base + size-1)); @@ -729,12 +811,12 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, * the type requested matches the type of first page in the range. */ if (is_ram) { - if (!pat_enabled) + if (!pat_enabled()) return 0; pcm = lookup_memtype(paddr); if (want_pcm != pcm) { - printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n", + pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n", current->comm, current->pid, cattr_name(want_pcm), (unsigned long long)paddr, @@ -755,13 +837,12 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, if (strict_prot || !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) { free_memtype(paddr, paddr + size); - printk(KERN_ERR "%s:%d map pfn expected mapping type %s" - " for [mem %#010Lx-%#010Lx], got %s\n", - current->comm, current->pid, - cattr_name(want_pcm), - (unsigned long long)paddr, - (unsigned long long)(paddr + size - 1), - cattr_name(pcm)); + pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n", + current->comm, current->pid, + cattr_name(want_pcm), + (unsigned long long)paddr, + (unsigned long long)(paddr + size - 1), + cattr_name(pcm)); return -EINVAL; } /* @@ -844,7 +925,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, return ret; } - if (!pat_enabled) + if (!pat_enabled()) return 0; /* @@ -872,7 +953,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, { enum page_cache_mode pcm; - if (!pat_enabled) + if (!pat_enabled()) return 0; /* Set prot based on lookup */ @@ -913,14 +994,18 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, pgprot_t pgprot_writecombine(pgprot_t prot) { - if (pat_enabled) - return __pgprot(pgprot_val(prot) | + return __pgprot(pgprot_val(prot) | cachemode2protval(_PAGE_CACHE_MODE_WC)); - else - return pgprot_noncached(prot); } EXPORT_SYMBOL_GPL(pgprot_writecombine); +pgprot_t pgprot_writethrough(pgprot_t prot) +{ + return __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_WT)); +} +EXPORT_SYMBOL_GPL(pgprot_writethrough); + #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) static struct memtype *memtype_get_idx(loff_t pos) @@ -996,7 +1081,7 @@ static const struct file_operations memtype_fops = { static int __init pat_memtype_list_init(void) { - if (pat_enabled) { + if (pat_enabled()) { debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir, NULL, &memtype_fops); } diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h index f6411620305d..a739bfc40690 100644 --- a/arch/x86/mm/pat_internal.h +++ b/arch/x86/mm/pat_internal.h @@ -4,7 +4,7 @@ extern int pat_debug_enable; #define dprintk(fmt, arg...) \ - do { if (pat_debug_enable) printk(KERN_INFO fmt, ##arg); } while (0) + do { if (pat_debug_enable) pr_info("x86/PAT: " fmt, ##arg); } while (0) struct memtype { u64 start; diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 6582adcc8bd9..63931080366a 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -160,9 +160,9 @@ success: return 0; failure: - printk(KERN_INFO "%s:%d conflicting memory types " - "%Lx-%Lx %s<->%s\n", current->comm, current->pid, start, - end, cattr_name(found_type), cattr_name(match->type)); + pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, start, end, + cattr_name(found_type), cattr_name(match->type)); return -EBUSY; } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 0b97d2c75df3..fb0a9dd1d6e4 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -563,16 +563,31 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, } #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +/** + * pud_set_huge - setup kernel PUD mapping + * + * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this + * function sets up a huge page only if any of the following conditions are met: + * + * - MTRRs are disabled, or + * + * - MTRRs are enabled and the range is completely covered by a single MTRR, or + * + * - MTRRs are enabled and the corresponding MTRR memory type is WB, which + * has no effect on the requested PAT memory type. + * + * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger + * page mapping attempt fails. + * + * Returns 1 on success and 0 on failure. + */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { - u8 mtrr; + u8 mtrr, uniform; - /* - * Do not use a huge page when the range is covered by non-WB type - * of MTRRs. - */ - mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE); - if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF)) + mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform); + if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) && + (mtrr != MTRR_TYPE_WRBACK)) return 0; prot = pgprot_4k_2_large(prot); @@ -584,17 +599,24 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) return 1; } +/** + * pmd_set_huge - setup kernel PMD mapping + * + * See text over pud_set_huge() above. + * + * Returns 1 on success and 0 on failure. + */ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { - u8 mtrr; + u8 mtrr, uniform; - /* - * Do not use a huge page when the range is covered by non-WB type - * of MTRRs. - */ - mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE); - if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF)) + mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform); + if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) && + (mtrr != MTRR_TYPE_WRBACK)) { + pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n", + __func__, addr, addr + PMD_SIZE); return 0; + } prot = pgprot_4k_2_large(prot); @@ -605,6 +627,11 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) return 1; } +/** + * pud_clear_huge - clear kernel PUD mapping when it is set + * + * Returns 1 on success and 0 on failure (no PUD map is found). + */ int pud_clear_huge(pud_t *pud) { if (pud_large(*pud)) { @@ -615,6 +642,11 @@ int pud_clear_huge(pud_t *pud) return 0; } +/** + * pmd_clear_huge - clear kernel PMD mapping when it is set + * + * Returns 1 on success and 0 on failure (no PMD map is found). + */ int pmd_clear_huge(pmd_t *pmd) { if (pmd_large(*pmd)) { diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S index 6440221ced0d..4093216b3791 100644 --- a/arch/x86/net/bpf_jit.S +++ b/arch/x86/net/bpf_jit.S @@ -8,7 +8,6 @@ * of the License. */ #include <linux/linkage.h> -#include <asm/dwarf2.h> /* * Calling convention : diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 99f76103c6b7..ddeff4844a10 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -966,7 +966,12 @@ void bpf_int_jit_compile(struct bpf_prog *prog) } ctx.cleanup_addr = proglen; - for (pass = 0; pass < 10; pass++) { + /* JITed image shrinks with every pass and the loop iterates + * until the image stops shrinking. Very large bpf programs + * may converge on the last pass. In such case do one more + * pass to emit the final image + */ + for (pass = 0; pass < 10 || image; pass++) { proglen = do_jit(prog, addrs, image, oldproglen, &ctx); if (proglen <= 0) { image = NULL; diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index d93963340c3c..14a63ed6fe09 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -482,9 +482,16 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) { - struct pci_sysdata *sd = bridge->bus->sysdata; - - ACPI_COMPANION_SET(&bridge->dev, sd->companion); + /* + * We pass NULL as parent to pci_create_root_bus(), so if it is not NULL + * here, pci_create_root_bus() has been called by someone else and + * sysdata is likely to be different from what we expect. Let it go in + * that case. + */ + if (!bridge->dev.parent) { + struct pci_sysdata *sd = bridge->bus->sysdata; + ACPI_COMPANION_SET(&bridge->dev, sd->companion); + } return 0; } diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 349c0d32cc0b..0a9f2caf358f 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -429,12 +429,12 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, * Caller can followup with UC MINUS request and add a WC mtrr if there * is a free mtrr slot. */ - if (!pat_enabled && write_combine) + if (!pat_enabled() && write_combine) return -EINVAL; - if (pat_enabled && write_combine) + if (pat_enabled() && write_combine) prot |= cachemode2protval(_PAGE_CACHE_MODE_WC); - else if (pat_enabled || boot_cpu_data.x86 > 3) + else if (pat_enabled() || boot_cpu_data.x86 > 3) /* * ioremap() and ioremap_nocache() defaults to UC MINUS for now. * To avoid attribute conflicts, request UC MINUS here diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index a62e0be3a2f1..f1a6c8e86ddd 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -1,4 +1,5 @@ # Platform specific code goes here +obj-y += atom/ obj-y += ce4100/ obj-y += efi/ obj-y += geode/ diff --git a/arch/x86/platform/atom/Makefile b/arch/x86/platform/atom/Makefile new file mode 100644 index 000000000000..0a3a40cbc794 --- /dev/null +++ b/arch/x86/platform/atom/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_PUNIT_ATOM_DEBUG) += punit_atom_debug.o diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c new file mode 100644 index 000000000000..5ca8ead91579 --- /dev/null +++ b/arch/x86/platform/atom/punit_atom_debug.c @@ -0,0 +1,183 @@ +/* + * Intel SOC Punit device state debug driver + * Punit controls power management for North Complex devices (Graphics + * blocks, Image Signal Processing, video processing, display, DSP etc.) + * + * Copyright (c) 2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/device.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/io.h> +#include <asm/cpu_device_id.h> +#include <asm/iosf_mbi.h> + +/* Side band Interface port */ +#define PUNIT_PORT 0x04 +/* Power gate status reg */ +#define PWRGT_STATUS 0x61 +/* Subsystem config/status Video processor */ +#define VED_SS_PM0 0x32 +/* Subsystem config/status ISP (Image Signal Processor) */ +#define ISP_SS_PM0 0x39 +/* Subsystem config/status Input/output controller */ +#define MIO_SS_PM 0x3B +/* Shift bits for getting status for video, isp and i/o */ +#define SSS_SHIFT 24 +/* Shift bits for getting status for graphics rendering */ +#define RENDER_POS 0 +/* Shift bits for getting status for media control */ +#define MEDIA_POS 2 +/* Shift bits for getting status for Valley View/Baytrail display */ +#define VLV_DISPLAY_POS 6 +/* Subsystem config/status display for Cherry Trail SOC */ +#define CHT_DSP_SSS 0x36 +/* Shift bits for getting status for display */ +#define CHT_DSP_SSS_POS 16 + +struct punit_device { + char *name; + int reg; + int sss_pos; +}; + +static const struct punit_device punit_device_byt[] = { + { "GFX RENDER", PWRGT_STATUS, RENDER_POS }, + { "GFX MEDIA", PWRGT_STATUS, MEDIA_POS }, + { "DISPLAY", PWRGT_STATUS, VLV_DISPLAY_POS }, + { "VED", VED_SS_PM0, SSS_SHIFT }, + { "ISP", ISP_SS_PM0, SSS_SHIFT }, + { "MIO", MIO_SS_PM, SSS_SHIFT }, + { NULL } +}; + +static const struct punit_device punit_device_cht[] = { + { "GFX RENDER", PWRGT_STATUS, RENDER_POS }, + { "GFX MEDIA", PWRGT_STATUS, MEDIA_POS }, + { "DISPLAY", CHT_DSP_SSS, CHT_DSP_SSS_POS }, + { "VED", VED_SS_PM0, SSS_SHIFT }, + { "ISP", ISP_SS_PM0, SSS_SHIFT }, + { "MIO", MIO_SS_PM, SSS_SHIFT }, + { NULL } +}; + +static const char * const dstates[] = {"D0", "D0i1", "D0i2", "D0i3"}; + +static int punit_dev_state_show(struct seq_file *seq_file, void *unused) +{ + u32 punit_pwr_status; + struct punit_device *punit_devp = seq_file->private; + int index; + int status; + + seq_puts(seq_file, "\n\nPUNIT NORTH COMPLEX DEVICES :\n"); + while (punit_devp->name) { + status = iosf_mbi_read(PUNIT_PORT, BT_MBI_PMC_READ, + punit_devp->reg, + &punit_pwr_status); + if (status) { + seq_printf(seq_file, "%9s : Read Failed\n", + punit_devp->name); + } else { + index = (punit_pwr_status >> punit_devp->sss_pos) & 3; + seq_printf(seq_file, "%9s : %s\n", punit_devp->name, + dstates[index]); + } + punit_devp++; + } + + return 0; +} + +static int punit_dev_state_open(struct inode *inode, struct file *file) +{ + return single_open(file, punit_dev_state_show, inode->i_private); +} + +static const struct file_operations punit_dev_state_ops = { + .open = punit_dev_state_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *punit_dbg_file; + +static int punit_dbgfs_register(struct punit_device *punit_device) +{ + static struct dentry *dev_state; + + punit_dbg_file = debugfs_create_dir("punit_atom", NULL); + if (!punit_dbg_file) + return -ENXIO; + + dev_state = debugfs_create_file("dev_power_state", S_IFREG | S_IRUGO, + punit_dbg_file, punit_device, + &punit_dev_state_ops); + if (!dev_state) { + pr_err("punit_dev_state register failed\n"); + debugfs_remove(punit_dbg_file); + return -ENXIO; + } + + return 0; +} + +static void punit_dbgfs_unregister(void) +{ + debugfs_remove_recursive(punit_dbg_file); +} + +#define ICPU(model, drv_data) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT,\ + (kernel_ulong_t)&drv_data } + +static const struct x86_cpu_id intel_punit_cpu_ids[] = { + ICPU(55, punit_device_byt), /* Valleyview, Bay Trail */ + ICPU(76, punit_device_cht), /* Braswell, Cherry Trail */ + {} +}; + +MODULE_DEVICE_TABLE(x86cpu, intel_punit_cpu_ids); + +static int __init punit_atom_debug_init(void) +{ + const struct x86_cpu_id *id; + int ret; + + id = x86_match_cpu(intel_punit_cpu_ids); + if (!id) + return -ENODEV; + + ret = punit_dbgfs_register((struct punit_device *)id->driver_data); + if (ret < 0) + return ret; + + return 0; +} + +static void __exit punit_atom_debug_exit(void) +{ + punit_dbgfs_unregister(); +} + +module_init(punit_atom_debug_init); +module_exit(punit_atom_debug_exit); + +MODULE_AUTHOR("Kumar P, Mahesh <mahesh.kumar.p@intel.com>"); +MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>"); +MODULE_DESCRIPTION("Driver for Punit devices states debugging"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index acb384d24669..a8fecc226946 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -26,7 +26,7 @@ else obj-y += syscalls_64.o vdso/ -subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../lib/thunk_64.o \ +subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../entry/thunk_64.o \ ../lib/rwsem.o endif diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 7e8a1a650435..b9531d343134 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -39,7 +39,8 @@ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() -#define set_mb(var, value) do { var = value; barrier(); } while (0) + +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0) #define read_barrier_depends() do { } while (0) #define smp_read_barrier_depends() do { } while (0) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index fe969ac1c65e..a8f57a94785a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1468,6 +1468,7 @@ asmlinkage __visible void __init xen_start_kernel(void) { struct physdev_set_iopl set_iopl; unsigned long initrd_start = 0; + u64 pat; int rc; if (!xen_start_info) @@ -1575,8 +1576,8 @@ asmlinkage __visible void __init xen_start_kernel(void) * Modify the cache mode translation tables to match Xen's PAT * configuration. */ - - pat_init_cache_modes(); + rdmsrl(MSR_IA32_CR_PAT, pat); + pat_init_cache_modes(pat); /* keep using Xen gdt for now; no urgent need to change it */ diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index b47124d4cd67..8b7f18e200aa 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -67,6 +67,7 @@ #include <linux/seq_file.h> #include <linux/bootmem.h> #include <linux/slab.h> +#include <linux/vmalloc.h> #include <asm/cache.h> #include <asm/setup.h> diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 956374c1edbc..9e2ba5c6e1dd 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -17,6 +17,56 @@ #include "xen-ops.h" #include "debugfs.h" +static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; +static DEFINE_PER_CPU(char *, irq_name); +static bool xen_pvspin = true; + +#ifdef CONFIG_QUEUED_SPINLOCKS + +#include <asm/qspinlock.h> + +static void xen_qlock_kick(int cpu) +{ + xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); +} + +/* + * Halt the current CPU & release it back to the host + */ +static void xen_qlock_wait(u8 *byte, u8 val) +{ + int irq = __this_cpu_read(lock_kicker_irq); + + /* If kicker interrupts not initialized yet, just spin */ + if (irq == -1) + return; + + /* clear pending */ + xen_clear_irq_pending(irq); + barrier(); + + /* + * We check the byte value after clearing pending IRQ to make sure + * that we won't miss a wakeup event because of the clearing. + * + * The sync_clear_bit() call in xen_clear_irq_pending() is atomic. + * So it is effectively a memory barrier for x86. + */ + if (READ_ONCE(*byte) != val) + return; + + /* + * If an interrupt happens here, it will leave the wakeup irq + * pending, which will cause xen_poll_irq() to return + * immediately. + */ + + /* Block until irq becomes pending (or perhaps a spurious wakeup) */ + xen_poll_irq(irq); +} + +#else /* CONFIG_QUEUED_SPINLOCKS */ + enum xen_contention_stat { TAKEN_SLOW, TAKEN_SLOW_PICKUP, @@ -100,12 +150,9 @@ struct xen_lock_waiting { __ticket_t want; }; -static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; -static DEFINE_PER_CPU(char *, irq_name); static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting); static cpumask_t waiting_cpus; -static bool xen_pvspin = true; __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) { int irq = __this_cpu_read(lock_kicker_irq); @@ -217,6 +264,7 @@ static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next) } } } +#endif /* CONFIG_QUEUED_SPINLOCKS */ static irqreturn_t dummy_handler(int irq, void *dev_id) { @@ -280,8 +328,16 @@ void __init xen_init_spinlocks(void) return; } printk(KERN_DEBUG "xen: PV spinlocks enabled\n"); +#ifdef CONFIG_QUEUED_SPINLOCKS + __pv_init_lock_hash(); + pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; + pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); + pv_lock_ops.wait = xen_qlock_wait; + pv_lock_ops.kick = xen_qlock_kick; +#else pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning); pv_lock_ops.unlock_kick = xen_unlock_kick; +#endif } /* @@ -310,7 +366,7 @@ static __init int xen_parse_nopvspin(char *arg) } early_param("xen_nopvspin", xen_parse_nopvspin); -#ifdef CONFIG_XEN_DEBUG_FS +#if defined(CONFIG_XEN_DEBUG_FS) && !defined(CONFIG_QUEUED_SPINLOCKS) static struct dentry *d_spin_debug; diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 04529e620559..f22667abf7b9 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -114,7 +114,7 @@ RELOC(xen_sysret32, 1b+1) /* Normal 64-bit system call target */ ENTRY(xen_syscall_target) undo_xen_syscall - jmp system_call_after_swapgs + jmp entry_SYSCALL_64_after_swapgs ENDPROC(xen_syscall_target) #ifdef CONFIG_IA32_EMULATION @@ -122,13 +122,13 @@ ENDPROC(xen_syscall_target) /* 32-bit compat syscall target */ ENTRY(xen_syscall32_target) undo_xen_syscall - jmp ia32_cstar_target + jmp entry_SYSCALL_compat ENDPROC(xen_syscall32_target) /* 32-bit compat sysenter target */ ENTRY(xen_sysenter_target) undo_xen_syscall - jmp ia32_sysenter_target + jmp entry_SYSENTER_compat ENDPROC(xen_sysenter_target) #else /* !CONFIG_IA32_EMULATION */ |