summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kbuild4
-rw-r--r--arch/x86/Kconfig88
-rw-r--r--arch/x86/Makefile17
-rw-r--r--arch/x86/boot/code16gcc.h24
-rw-r--r--arch/x86/boot/compressed/Makefile3
-rw-r--r--arch/x86/boot/compressed/aslr.c9
-rw-r--r--arch/x86/boot/compressed/eboot.c51
-rw-r--r--arch/x86/boot/compressed/eboot.h16
-rw-r--r--arch/x86/boot/compressed/head_64.S2
-rw-r--r--arch/x86/boot/compressed/string.c4
-rw-r--r--arch/x86/boot/header.S31
-rw-r--r--arch/x86/boot/string.c9
-rw-r--r--arch/x86/boot/tools/build.c38
-rw-r--r--arch/x86/configs/i386_defconfig1
-rw-r--r--arch/x86/configs/x86_64_defconfig1
-rw-r--r--arch/x86/crypto/Makefile4
-rw-r--r--arch/x86/crypto/aes_ctrby8_avx-x86_64.S546
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c40
-rw-r--r--arch/x86/crypto/crc32c-pcl-intel-asm_64.S281
-rw-r--r--arch/x86/crypto/des3_ede-asm_64.S805
-rw-r--r--arch/x86/crypto/des3_ede_glue.c509
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_asm.S4
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c12
-rw-r--r--arch/x86/crypto/sha512_ssse3_glue.c2
-rw-r--r--arch/x86/ia32/ia32_signal.c8
-rw-r--r--arch/x86/include/asm/Kbuild3
-rw-r--r--arch/x86/include/asm/acenv.h45
-rw-r--r--arch/x86/include/asm/acpi.h50
-rw-r--r--arch/x86/include/asm/apic.h2
-rw-r--r--arch/x86/include/asm/asm.h7
-rw-r--r--arch/x86/include/asm/atomic.h7
-rw-r--r--arch/x86/include/asm/barrier.h6
-rw-r--r--arch/x86/include/asm/bitops.h6
-rw-r--r--arch/x86/include/asm/checksum_64.h9
-rw-r--r--arch/x86/include/asm/cmdline.h6
-rw-r--r--arch/x86/include/asm/cmpxchg.h4
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h2
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h2
-rw-r--r--arch/x86/include/asm/cpufeature.h45
-rw-r--r--arch/x86/include/asm/crash.h9
-rw-r--r--arch/x86/include/asm/efi.h133
-rw-r--r--arch/x86/include/asm/elf.h35
-rw-r--r--arch/x86/include/asm/espfix.h16
-rw-r--r--arch/x86/include/asm/fixmap.h11
-rw-r--r--arch/x86/include/asm/fpu-internal.h10
-rw-r--r--arch/x86/include/asm/ftrace.h2
-rw-r--r--arch/x86/include/asm/hw_irq.h4
-rw-r--r--arch/x86/include/asm/io_apic.h2
-rw-r--r--arch/x86/include/asm/iosf_mbi.h55
-rw-r--r--arch/x86/include/asm/irq.h2
-rw-r--r--arch/x86/include/asm/irq_remapping.h3
-rw-r--r--arch/x86/include/asm/irqflags.h2
-rw-r--r--arch/x86/include/asm/kexec-bzimage64.h6
-rw-r--r--arch/x86/include/asm/kexec.h45
-rw-r--r--arch/x86/include/asm/kprobes.h2
-rw-r--r--arch/x86/include/asm/kvm_emulate.h34
-rw-r--r--arch/x86/include/asm/kvm_host.h26
-rw-r--r--arch/x86/include/asm/mc146818rtc.h2
-rw-r--r--arch/x86/include/asm/mce.h2
-rw-r--r--arch/x86/include/asm/microcode.h1
-rw-r--r--arch/x86/include/asm/mmu.h2
-rw-r--r--arch/x86/include/asm/mmu_context.h6
-rw-r--r--arch/x86/include/asm/mutex_32.h16
-rw-r--r--arch/x86/include/asm/mwait.h2
-rw-r--r--arch/x86/include/asm/page.h1
-rw-r--r--arch/x86/include/asm/page_64.h2
-rw-r--r--arch/x86/include/asm/page_64_types.h2
-rw-r--r--arch/x86/include/asm/pci.h1
-rw-r--r--arch/x86/include/asm/percpu.h3
-rw-r--r--arch/x86/include/asm/pgtable-2level.h59
-rw-r--r--arch/x86/include/asm/pgtable.h20
-rw-r--r--arch/x86/include/asm/pgtable_64.h8
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h2
-rw-r--r--arch/x86/include/asm/pgtable_types.h66
-rw-r--r--arch/x86/include/asm/platform_sst_audio.h78
-rw-r--r--arch/x86/include/asm/pmc_atom.h107
-rw-r--r--arch/x86/include/asm/processor.h3
-rw-r--r--arch/x86/include/asm/proto.h2
-rw-r--r--arch/x86/include/asm/ptrace.h16
-rw-r--r--arch/x86/include/asm/qrwlock.h17
-rw-r--r--arch/x86/include/asm/scatterlist.h8
-rw-r--r--arch/x86/include/asm/setup.h2
-rw-r--r--arch/x86/include/asm/signal.h6
-rw-r--r--arch/x86/include/asm/spinlock.h4
-rw-r--r--arch/x86/include/asm/spinlock_types.h4
-rw-r--r--arch/x86/include/asm/swiotlb.h7
-rw-r--r--arch/x86/include/asm/sync_bitops.h2
-rw-r--r--arch/x86/include/asm/thread_info.h4
-rw-r--r--arch/x86/include/asm/traps.h8
-rw-r--r--arch/x86/include/asm/unistd.h1
-rw-r--r--arch/x86/include/asm/uprobes.h20
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h19
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h12
-rw-r--r--arch/x86/include/asm/uv/uv_mmrs.h42
-rw-r--r--arch/x86/include/asm/vdso.h78
-rw-r--r--arch/x86/include/asm/vdso32.h11
-rw-r--r--arch/x86/include/asm/vga.h6
-rw-r--r--arch/x86/include/asm/vmx.h7
-rw-r--r--arch/x86/include/asm/vvar.h20
-rw-r--r--arch/x86/include/asm/xen/hypercall.h2
-rw-r--r--arch/x86/include/asm/xen/interface.h3
-rw-r--r--arch/x86/include/uapi/asm/Kbuild1
-rw-r--r--arch/x86/include/uapi/asm/kvm.h3
-rw-r--r--arch/x86/include/uapi/asm/kvm_perf.h16
-rw-r--r--arch/x86/include/uapi/asm/msr-index.h1
-rw-r--r--arch/x86/include/uapi/asm/vsyscall.h7
-rw-r--r--arch/x86/kernel/Makefile4
-rw-r--r--arch/x86/kernel/acpi/Makefile1
-rw-r--r--arch/x86/kernel/acpi/apei.c62
-rw-r--r--arch/x86/kernel/acpi/boot.c4
-rw-r--r--arch/x86/kernel/alternative.c3
-rw-r--r--arch/x86/kernel/amd_gart_64.c2
-rw-r--r--arch/x86/kernel/aperture_64.c59
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c23
-rw-r--r--arch/x86/kernel/apic/io_apic.c158
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c26
-rw-r--r--arch/x86/kernel/apm_32.c12
-rw-r--r--arch/x86/kernel/cpu/amd.c348
-rw-r--r--arch/x86/kernel/cpu/common.c50
-rw-r--r--arch/x86/kernel/cpu/intel.c52
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c16
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c72
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c18
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c6
-rw-r--r--arch/x86/kernel/cpu/microcode/core_early.c37
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.sh51
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c9
-rw-r--r--arch/x86/kernel/cpu/perf_event.c25
-rw-r--r--arch/x86/kernel/cpu/perf_event.h12
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_ibs.c3
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_uncore.c111
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c78
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c28
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c5
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c16
-rw-r--r--arch/x86/kernel/cpu/proc.c8
-rw-r--r--arch/x86/kernel/crash.c563
-rw-r--r--arch/x86/kernel/devicetree.c12
-rw-r--r--arch/x86/kernel/dumpstack.c9
-rw-r--r--arch/x86/kernel/early-quirks.c46
-rw-r--r--arch/x86/kernel/entry_32.S72
-rw-r--r--arch/x86/kernel/entry_64.S535
-rw-r--r--arch/x86/kernel/espfix_64.c208
-rw-r--r--arch/x86/kernel/ftrace.c59
-rw-r--r--arch/x86/kernel/head64.c2
-rw-r--r--arch/x86/kernel/hpet.c8
-rw-r--r--arch/x86/kernel/hw_breakpoint.c5
-rw-r--r--arch/x86/kernel/i8259.c20
-rw-r--r--arch/x86/kernel/iosf_mbi.c13
-rw-r--r--arch/x86/kernel/irq.c29
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c553
-rw-r--r--arch/x86/kernel/kprobes/core.c131
-rw-r--r--arch/x86/kernel/kprobes/ftrace.c17
-rw-r--r--arch/x86/kernel/kprobes/opt.c32
-rw-r--r--arch/x86/kernel/kvm.c6
-rw-r--r--arch/x86/kernel/ldt.c10
-rw-r--r--arch/x86/kernel/machine_kexec_64.c239
-rw-r--r--arch/x86/kernel/mcount_64.S206
-rw-r--r--arch/x86/kernel/nmi.c18
-rw-r--r--arch/x86/kernel/paravirt.c6
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c2
-rw-r--r--arch/x86/kernel/pci-dma.c11
-rw-r--r--arch/x86/kernel/pci-swiotlb.c9
-rw-r--r--arch/x86/kernel/pmc_atom.c321
-rw-r--r--arch/x86/kernel/process_64.c7
-rw-r--r--arch/x86/kernel/reboot.c24
-rw-r--r--arch/x86/kernel/resource.c8
-rw-r--r--arch/x86/kernel/setup.c6
-rw-r--r--arch/x86/kernel/signal.c6
-rw-r--r--arch/x86/kernel/smpboot.c12
-rw-r--r--arch/x86/kernel/traps.c140
-rw-r--r--arch/x86/kernel/tsc.c32
-rw-r--r--arch/x86/kernel/uprobes.c842
-rw-r--r--arch/x86/kernel/vsyscall_64.c23
-rw-r--r--arch/x86/kernel/vsyscall_gtod.c23
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/cpuid.c11
-rw-r--r--arch/x86/kvm/cpuid.h15
-rw-r--r--arch/x86/kvm/emulate.c587
-rw-r--r--arch/x86/kvm/irq.c3
-rw-r--r--arch/x86/kvm/lapic.c110
-rw-r--r--arch/x86/kvm/mmu.c84
-rw-r--r--arch/x86/kvm/mmu.h33
-rw-r--r--arch/x86/kvm/mmu_audit.c2
-rw-r--r--arch/x86/kvm/mmutrace.h4
-rw-r--r--arch/x86/kvm/paging_tmpl.h7
-rw-r--r--arch/x86/kvm/pmu.c16
-rw-r--r--arch/x86/kvm/svm.c121
-rw-r--r--arch/x86/kvm/trace.h26
-rw-r--r--arch/x86/kvm/vmx.c573
-rw-r--r--arch/x86/kvm/x86.c316
-rw-r--r--arch/x86/kvm/x86.h27
-rw-r--r--arch/x86/lib/Makefile2
-rw-r--r--arch/x86/lib/cmdline.c84
-rw-r--r--arch/x86/lib/thunk_32.S3
-rw-r--r--arch/x86/lib/thunk_64.S3
-rw-r--r--arch/x86/mm/dump_pagetables.c44
-rw-r--r--arch/x86/mm/fault.c43
-rw-r--r--arch/x86/mm/hugetlbpage.c10
-rw-r--r--arch/x86/mm/init.c7
-rw-r--r--arch/x86/mm/init_32.c3
-rw-r--r--arch/x86/mm/init_64.c62
-rw-r--r--arch/x86/mm/ioremap.c32
-rw-r--r--arch/x86/mm/numa.c6
-rw-r--r--arch/x86/mm/pageattr-test.c2
-rw-r--r--arch/x86/mm/pgtable.c27
-rw-r--r--arch/x86/mm/tlb.c103
-rw-r--r--arch/x86/net/bpf_jit.S77
-rw-r--r--arch/x86/net/bpf_jit_comp.c1403
-rw-r--r--arch/x86/pci/acpi.c6
-rw-r--r--arch/x86/pci/amd_bus.c83
-rw-r--r--arch/x86/pci/broadcom_bus.c4
-rw-r--r--arch/x86/pci/fixup.c39
-rw-r--r--arch/x86/pci/i386.c31
-rw-r--r--arch/x86/pci/sta2x11-fixup.c6
-rw-r--r--arch/x86/platform/efi/Makefile2
-rw-r--r--arch/x86/platform/efi/efi.c494
-rw-r--r--arch/x86/platform/efi/efi_stub_64.S81
-rw-r--r--arch/x86/platform/efi/quirks.c290
-rw-r--r--arch/x86/platform/intel-mid/device_libs/Makefile1
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_wdt.c72
-rw-r--r--arch/x86/platform/ts5500/ts5500.c94
-rw-r--r--arch/x86/platform/uv/bios_uv.c2
-rw-r--r--arch/x86/platform/uv/tlb_uv.c71
-rw-r--r--arch/x86/platform/uv/uv_irq.c10
-rw-r--r--arch/x86/platform/uv/uv_nmi.c2
-rw-r--r--arch/x86/power/cpu.c4
-rw-r--r--arch/x86/purgatory/Makefile30
-rw-r--r--arch/x86/purgatory/entry64.S101
-rw-r--r--arch/x86/purgatory/purgatory.c72
-rw-r--r--arch/x86/purgatory/setup-x86_64.S58
-rw-r--r--arch/x86/purgatory/sha256.c283
-rw-r--r--arch/x86/purgatory/sha256.h22
-rw-r--r--arch/x86/purgatory/stack.S19
-rw-r--r--arch/x86/purgatory/string.c13
-rw-r--r--arch/x86/realmode/rm/Makefile3
-rw-r--r--arch/x86/syscalls/syscall_32.tbl3
-rw-r--r--arch/x86/syscalls/syscall_64.tbl10
-rw-r--r--arch/x86/um/asm/elf.h1
-rw-r--r--arch/x86/um/asm/processor.h3
-rw-r--r--arch/x86/um/mem_64.c15
-rw-r--r--arch/x86/um/signal.c45
-rw-r--r--arch/x86/um/vdso/vma.c2
-rw-r--r--arch/x86/vdso/.gitignore5
-rw-r--r--arch/x86/vdso/Makefile145
-rw-r--r--arch/x86/vdso/vclock_gettime.c29
-rw-r--r--arch/x86/vdso/vdso-layout.lds.S84
-rw-r--r--arch/x86/vdso/vdso.S3
-rw-r--r--arch/x86/vdso/vdso.lds.S9
-rw-r--r--arch/x86/vdso/vdso2c.c253
-rw-r--r--arch/x86/vdso/vdso2c.h173
-rw-r--r--arch/x86/vdso/vdso32-setup.c230
-rw-r--r--arch/x86/vdso/vdso32.S9
-rw-r--r--arch/x86/vdso/vdso32/vdso-fakesections.c1
-rw-r--r--arch/x86/vdso/vdso32/vdso32.lds.S15
-rw-r--r--arch/x86/vdso/vdsox32.S3
-rw-r--r--arch/x86/vdso/vdsox32.lds.S9
-rw-r--r--arch/x86/vdso/vma.c244
-rw-r--r--arch/x86/xen/Makefile1
-rw-r--r--arch/x86/xen/efi.c43
-rw-r--r--arch/x86/xen/enlighten.c21
-rw-r--r--arch/x86/xen/grant-table.c106
-rw-r--r--arch/x86/xen/mmu.c133
-rw-r--r--arch/x86/xen/p2m.c179
-rw-r--r--arch/x86/xen/setup.c86
-rw-r--r--arch/x86/xen/suspend.c23
-rw-r--r--arch/x86/xen/xen-ops.h11
268 files changed, 11826 insertions, 5188 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index e5287d8517aa..61b6d51866f8 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -16,3 +16,7 @@ obj-$(CONFIG_IA32_EMULATION) += ia32/
obj-y += platform/
obj-y += net/
+
+ifeq ($(CONFIG_X86_64),y)
+obj-$(CONFIG_KEXEC) += purgatory/
+endif
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 25d2c6f7325e..4aafd322e21e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -21,12 +21,13 @@ config X86_64
### Arch settings
config X86
def_bool y
+ select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select HAVE_AOUT if X86_32
select HAVE_UNSTABLE_SCHED_CLOCK
- select ARCH_SUPPORTS_NUMA_BALANCING
+ select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
select ARCH_SUPPORTS_INT128 if X86_64
select ARCH_WANTS_PROT_NUMA_PROT_NONE
select HAVE_IDE
@@ -41,7 +42,7 @@ config X86
select ARCH_WANT_OPTIONAL_GPIOLIB
select ARCH_WANT_FRAME_POINTERS
select HAVE_DMA_ATTRS
- select HAVE_DMA_CONTIGUOUS if !SWIOTLB
+ select HAVE_DMA_CONTIGUOUS
select HAVE_KRETPROBES
select GENERIC_EARLY_IOREMAP
select HAVE_OPTPROBES
@@ -54,7 +55,6 @@ config X86
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_GRAPH_FP_TEST
- select HAVE_FUNCTION_TRACE_MCOUNT_TEST
select HAVE_SYSCALL_TRACEPOINTS
select SYSCTL_EXCEPTION_TRACE
select HAVE_KVM
@@ -96,6 +96,7 @@ config X86
select IRQ_FORCED_THREADING
select HAVE_BPF_JIT if X86_64
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ select ARCH_HAS_SG_CHAIN
select CLKEVT_I8253
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select GENERIC_IOMAP
@@ -105,13 +106,13 @@ config X86
select HAVE_ARCH_SECCOMP_FILTER
select BUILDTIME_EXTABLE_SORT
select GENERIC_CMOS_UPDATE
- select HAVE_ARCH_SOFT_DIRTY
+ select HAVE_ARCH_SOFT_DIRTY if X86_64
select CLOCKSOURCE_WATCHDOG
select GENERIC_CLOCKEVENTS
select ARCH_CLOCKSOURCE_DATA
+ select CLOCKSOURCE_VALIDATE_LAST_CYCLE
select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
select GENERIC_TIME_VSYSCALL
- select KTIME_SCALAR if X86_32
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
select HAVE_CONTEXT_TRACKING if X86_64
@@ -121,6 +122,7 @@ config X86
select MODULES_USE_ELF_RELA if X86_64
select CLONE_BACKWARDS if X86_32
select ARCH_USE_BUILTIN_BSWAP
+ select ARCH_USE_QUEUE_RWLOCK
select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION
select OLD_SIGACTION if X86_32
select COMPAT_OLD_SIGACTION if IA32_EMULATION
@@ -130,6 +132,10 @@ config X86
select HAVE_CC_STACKPROTECTOR
select GENERIC_CPU_AUTOPROBE
select HAVE_ARCH_AUDITSYSCALL
+ select ARCH_SUPPORTS_ATOMIC_RMW
+ select HAVE_ACPI_APEI if ACPI
+ select HAVE_ACPI_APEI_NMI if ACPI
+ select ACPI_LEGACY_TABLES_LOOKUP if ACPI
config INSTRUCTION_DECODER
def_bool y
@@ -261,6 +267,9 @@ config ARCH_HWEIGHT_CFLAGS
config ARCH_SUPPORTS_UPROBES
def_bool y
+config FIX_EARLYCON_MEM
+ def_bool y
+
source "init/Kconfig"
source "kernel/Kconfig.freezer"
@@ -415,7 +424,6 @@ config X86_UV
config X86_GOLDFISH
bool "Goldfish (Virtual Platform)"
- depends on X86_32
depends on X86_EXTENDED_PLATFORM
---help---
Enable support for the Goldfish virtual platform used primarily
@@ -831,6 +839,7 @@ config X86_LOCAL_APIC
config X86_IO_APIC
def_bool y
depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC || PCI_MSI
+ select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
bool "Reroute for broken boot IRQs"
@@ -909,10 +918,27 @@ config VM86
default y
depends on X86_32
---help---
- This option is required by programs like DOSEMU to run 16-bit legacy
- code on X86 processors. It also may be needed by software like
- XFree86 to initialize some video cards via BIOS. Disabling this
- option saves about 6k.
+ This option is required by programs like DOSEMU to run
+ 16-bit real mode legacy code on x86 processors. It also may
+ be needed by software like XFree86 to initialize some video
+ cards via BIOS. Disabling this option saves about 6K.
+
+config X86_16BIT
+ bool "Enable support for 16-bit segments" if EXPERT
+ default y
+ ---help---
+ This option is required by programs like Wine to run 16-bit
+ protected mode legacy code on x86 processors. Disabling
+ this option saves about 300 bytes on i386, or around 6K text
+ plus 16K runtime memory on x86-64,
+
+config X86_ESPFIX32
+ def_bool y
+ depends on X86_16BIT && X86_32
+
+config X86_ESPFIX64
+ def_bool y
+ depends on X86_16BIT && X86_64
config TOSHIBA
tristate "Toshiba Laptop support"
@@ -1501,6 +1527,7 @@ config EFI
bool "EFI runtime service support"
depends on ACPI
select UCS2_STRING
+ select EFI_RUNTIME_WRAPPERS
---help---
This enables the kernel to use EFI runtime services that are
available (such as the EFI variable services).
@@ -1555,6 +1582,9 @@ source kernel/Kconfig.hz
config KEXEC
bool "kexec system call"
+ select BUILD_BIN2C
+ select CRYPTO
+ select CRYPTO_SHA256
---help---
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
@@ -1569,6 +1599,28 @@ config KEXEC
interface is strongly in flux, so no good recommendation can be
made.
+config KEXEC_VERIFY_SIG
+ bool "Verify kernel signature during kexec_file_load() syscall"
+ depends on KEXEC
+ ---help---
+ This option makes kernel signature verification mandatory for
+ kexec_file_load() syscall. If kernel is signature can not be
+ verified, kexec_file_load() will fail.
+
+ This option enforces signature verification at generic level.
+ One needs to enable signature verification for type of kernel
+ image being loaded to make sure it works. For example, enable
+ bzImage signature verification option to be able to load and
+ verify signatures of bzImage. Otherwise kernel loading will fail.
+
+config KEXEC_BZIMAGE_VERIFY_SIG
+ bool "Enable bzImage signature verification support"
+ depends on KEXEC_VERIFY_SIG
+ depends on SIGNED_PE_FILE_VERIFICATION
+ select SYSTEM_TRUSTED_KEYRING
+ ---help---
+ Enable bzImage signature verification support.
+
config CRASH_DUMP
bool "kernel crash dumps"
depends on X86_64 || (X86_32 && HIGHMEM)
@@ -1651,7 +1703,6 @@ config RELOCATABLE
config RANDOMIZE_BASE
bool "Randomize the address of the kernel image"
depends on RELOCATABLE
- depends on !HIBERNATION
default n
---help---
Randomizes the physical and virtual address at which the
@@ -1871,6 +1922,10 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK
def_bool y
depends on X86_64 || X86_PAE
+config ARCH_ENABLE_HUGEPAGE_MIGRATION
+ def_bool y
+ depends on X86_64 && HUGETLB_PAGE && MIGRATION
+
menu "Power management and ACPI options"
config ARCH_HIBERNATION_HEADER
@@ -2375,12 +2430,13 @@ config X86_DMA_REMAP
depends on STA2X11
config IOSF_MBI
- bool
+ tristate
+ default m
depends on PCI
- ---help---
- To be selected by modules requiring access to the Intel OnChip System
- Fabric (IOSF) Sideband MailBox Interface (MBI). For MBI platforms
- enumerable by PCI.
+
+config PMC_ATOM
+ def_bool y
+ depends on PCI
source "net/Kconfig"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 33f71b01fd22..c1aa36887843 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -15,12 +15,9 @@ endif
# that way we can complain to the user if the CPU is insufficient.
#
# The -m16 option is supported by GCC >= 4.9 and clang >= 3.5. For
-# older versions of GCC, we need to play evil and unreliable tricks to
-# attempt to ensure that our asm(".code16gcc") is first in the asm
-# output.
-CODE16GCC_CFLAGS := -m32 -include $(srctree)/arch/x86/boot/code16gcc.h \
- $(call cc-option, -fno-toplevel-reorder,\
- $(call cc-option, -fno-unit-at-a-time))
+# older versions of GCC, include an *assembly* header to make sure that
+# gcc doesn't play any games behind our back.
+CODE16GCC_CFLAGS := -m32 -Wa,$(srctree)/arch/x86/boot/code16gcc.h
M16_CFLAGS := $(call cc-option, -m16, $(CODE16GCC_CFLAGS))
REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \
@@ -186,6 +183,14 @@ archscripts: scripts_basic
archheaders:
$(Q)$(MAKE) $(build)=arch/x86/syscalls all
+archprepare:
+ifeq ($(CONFIG_KEXEC),y)
+# Build only for 64bit. No loaders for 32bit yet.
+ ifeq ($(CONFIG_X86_64),y)
+ $(Q)$(MAKE) $(build)=arch/x86/purgatory arch/x86/purgatory/kexec-purgatory.c
+ endif
+endif
+
###
# Kernel objects
diff --git a/arch/x86/boot/code16gcc.h b/arch/x86/boot/code16gcc.h
index d93e48010b61..5ff426535397 100644
--- a/arch/x86/boot/code16gcc.h
+++ b/arch/x86/boot/code16gcc.h
@@ -1,15 +1,11 @@
-/*
- * code16gcc.h
- *
- * This file is -include'd when compiling 16-bit C code.
- * Note: this asm() needs to be emitted before gcc emits any code.
- * Depending on gcc version, this requires -fno-unit-at-a-time or
- * -fno-toplevel-reorder.
- *
- * Hopefully gcc will eventually have a real -m16 option so we can
- * drop this hack long term.
- */
+#
+# code16gcc.h
+#
+# This file is added to the assembler via -Wa when compiling 16-bit C code.
+# This is done this way instead via asm() to make sure gcc does not reorder
+# things around us.
+#
+# gcc 4.9+ has a real -m16 option so we can drop this hack long term.
+#
-#ifndef __ASSEMBLY__
-asm(".code16gcc");
-#endif
+ .code16gcc
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 0fcd9133790c..7a801a310e37 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -33,7 +33,8 @@ VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
$(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
ifeq ($(CONFIG_EFI_STUB), y)
- VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o
+ VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \
+ $(objtree)/drivers/firmware/efi/libstub/lib.a
endif
$(obj)/vmlinux: $(VMLINUX_OBJS) FORCE
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index 4dbf967da50d..fc6091abedb7 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -289,10 +289,17 @@ unsigned char *choose_kernel_location(unsigned char *input,
unsigned long choice = (unsigned long)output;
unsigned long random;
+#ifdef CONFIG_HIBERNATION
+ if (!cmdline_find_option_bool("kaslr")) {
+ debug_putstr("KASLR disabled by default...\n");
+ goto out;
+ }
+#else
if (cmdline_find_option_bool("nokaslr")) {
- debug_putstr("KASLR disabled...\n");
+ debug_putstr("KASLR disabled by cmdline...\n");
goto out;
}
+#endif
/* Record the various known unsafe memory ranges. */
mem_avoid_init((unsigned long)input, input_size,
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 4703a6c4b8e3..f277184e2ac1 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -19,10 +19,7 @@
static efi_system_table_t *sys_table;
-static struct efi_config *efi_early;
-
-#define efi_call_early(f, ...) \
- efi_early->call(efi_early->f, __VA_ARGS__);
+struct efi_config *efi_early;
#define BOOT_SERVICES(bits) \
static void setup_boot_services##bits(struct efi_config *c) \
@@ -48,8 +45,7 @@ static void setup_boot_services##bits(struct efi_config *c) \
BOOT_SERVICES(32);
BOOT_SERVICES(64);
-static void efi_printk(efi_system_table_t *, char *);
-static void efi_char16_printk(efi_system_table_t *, efi_char16_t *);
+void efi_char16_printk(efi_system_table_t *, efi_char16_t *);
static efi_status_t
__file_size32(void *__fh, efi_char16_t *filename_16,
@@ -156,7 +152,7 @@ grow:
return status;
}
-static efi_status_t
+efi_status_t
efi_file_size(efi_system_table_t *sys_table, void *__fh,
efi_char16_t *filename_16, void **handle, u64 *file_sz)
{
@@ -166,7 +162,7 @@ efi_file_size(efi_system_table_t *sys_table, void *__fh,
return __file_size32(__fh, filename_16, handle, file_sz);
}
-static inline efi_status_t
+efi_status_t
efi_file_read(void *handle, unsigned long *size, void *addr)
{
unsigned long func;
@@ -184,7 +180,7 @@ efi_file_read(void *handle, unsigned long *size, void *addr)
}
}
-static inline efi_status_t efi_file_close(void *handle)
+efi_status_t efi_file_close(void *handle)
{
if (efi_early->is64) {
efi_file_handle_64_t *fh = handle;
@@ -249,7 +245,7 @@ static inline efi_status_t __open_volume64(void *__image, void **__fh)
return status;
}
-static inline efi_status_t
+efi_status_t
efi_open_volume(efi_system_table_t *sys_table, void *__image, void **__fh)
{
if (efi_early->is64)
@@ -258,7 +254,7 @@ efi_open_volume(efi_system_table_t *sys_table, void *__image, void **__fh)
return __open_volume32(__image, __fh);
}
-static void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
+void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
{
unsigned long output_string;
size_t offset;
@@ -284,8 +280,6 @@ static void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
}
}
-#include "../../../../drivers/firmware/efi/efi-stub-helper.c"
-
static void find_bits(unsigned long mask, u8 *pos, u8 *size)
{
u8 first, len;
@@ -1038,6 +1032,7 @@ struct boot_params *make_boot_params(struct efi_config *c)
int i;
unsigned long ramdisk_addr;
unsigned long ramdisk_size;
+ unsigned long initrd_addr_max;
efi_early = c;
sys_table = (efi_system_table_t *)(unsigned long)efi_early->table;
@@ -1087,8 +1082,7 @@ struct boot_params *make_boot_params(struct efi_config *c)
hdr->type_of_loader = 0x21;
/* Convert unicode cmdline to ascii */
- cmdline_ptr = efi_convert_cmdline_to_ascii(sys_table, image,
- &options_size);
+ cmdline_ptr = efi_convert_cmdline(sys_table, image, &options_size);
if (!cmdline_ptr)
goto fail;
hdr->cmd_line_ptr = (unsigned long)cmdline_ptr;
@@ -1101,14 +1095,21 @@ struct boot_params *make_boot_params(struct efi_config *c)
memset(sdt, 0, sizeof(*sdt));
+ if (hdr->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G)
+ initrd_addr_max = -1UL;
+ else
+ initrd_addr_max = hdr->initrd_addr_max;
+
status = handle_cmdline_files(sys_table, image,
(char *)(unsigned long)hdr->cmd_line_ptr,
- "initrd=", hdr->initrd_addr_max,
+ "initrd=", initrd_addr_max,
&ramdisk_addr, &ramdisk_size);
if (status != EFI_SUCCESS)
goto fail2;
- hdr->ramdisk_image = ramdisk_addr;
- hdr->ramdisk_size = ramdisk_size;
+ hdr->ramdisk_image = ramdisk_addr & 0xffffffff;
+ hdr->ramdisk_size = ramdisk_size & 0xffffffff;
+ boot_params->ext_ramdisk_image = (u64)ramdisk_addr >> 32;
+ boot_params->ext_ramdisk_size = (u64)ramdisk_size >> 32;
return boot_params;
fail2:
@@ -1375,7 +1376,10 @@ struct boot_params *efi_main(struct efi_config *c,
setup_graphics(boot_params);
- setup_efi_pci(boot_params);
+ status = setup_efi_pci(boot_params);
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "setup_efi_pci() failed!\n");
+ }
status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
sizeof(*gdt), (void **)&gdt);
@@ -1402,16 +1406,20 @@ struct boot_params *efi_main(struct efi_config *c,
hdr->init_size, hdr->init_size,
hdr->pref_address,
hdr->kernel_alignment);
- if (status != EFI_SUCCESS)
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "efi_relocate_kernel() failed!\n");
goto fail;
+ }
hdr->pref_address = hdr->code32_start;
hdr->code32_start = bzimage_addr;
}
status = exit_boot(boot_params, handle, is64);
- if (status != EFI_SUCCESS)
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "exit_boot() failed!\n");
goto fail;
+ }
memset((char *)gdt->address, 0x0, gdt->size);
desc = (struct desc_struct *)gdt->address;
@@ -1471,5 +1479,6 @@ struct boot_params *efi_main(struct efi_config *c,
return boot_params;
fail:
+ efi_printk(sys_table, "efi_main() failed!\n");
return NULL;
}
diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h
index c88c31ecad12..d487e727f1ec 100644
--- a/arch/x86/boot/compressed/eboot.h
+++ b/arch/x86/boot/compressed/eboot.h
@@ -103,20 +103,4 @@ struct efi_uga_draw_protocol {
void *blt;
};
-struct efi_config {
- u64 image_handle;
- u64 table;
- u64 allocate_pool;
- u64 allocate_pages;
- u64 get_memory_map;
- u64 free_pool;
- u64 free_pages;
- u64 locate_handle;
- u64 handle_protocol;
- u64 exit_boot_services;
- u64 text_output;
- efi_status_t (*call)(unsigned long, ...);
- bool is64;
-} __packed;
-
#endif /* BOOT_COMPRESSED_EBOOT_H */
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 0d558ee899ae..2884e0c3e8a5 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -452,7 +452,7 @@ efi32_config:
.global efi64_config
efi64_config:
.fill 11,8,0
- .quad efi_call6
+ .quad efi_call
.byte 1
#endif /* CONFIG_EFI_STUB */
diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c
index f3c57e341402..00e788be1db9 100644
--- a/arch/x86/boot/compressed/string.c
+++ b/arch/x86/boot/compressed/string.c
@@ -1,9 +1,5 @@
-#include "misc.h"
#include "../string.c"
-/* misc.h might pull in string_32.h which has a macro for memcpy. undef that */
-#undef memcpy
-
#ifdef CONFIG_X86_32
void *memcpy(void *dest, const void *src, size_t n)
{
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 0ca9a5c362bc..16ef02596db2 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -91,10 +91,9 @@ bs_die:
.section ".bsdata", "a"
bugger_off_msg:
- .ascii "Direct floppy boot is not supported. "
- .ascii "Use a boot loader program instead.\r\n"
+ .ascii "Use a boot loader.\r\n"
.ascii "\n"
- .ascii "Remove disk and press any key to reboot ...\r\n"
+ .ascii "Remove disk and press any key to reboot...\r\n"
.byte 0
#ifdef CONFIG_EFI_STUB
@@ -108,7 +107,7 @@ coff_header:
#else
.word 0x8664 # x86-64
#endif
- .word 3 # nr_sections
+ .word 4 # nr_sections
.long 0 # TimeDateStamp
.long 0 # PointerToSymbolTable
.long 1 # NumberOfSymbols
@@ -155,7 +154,7 @@ extra_header_fields:
#else
.quad 0 # ImageBase
#endif
- .long 0x20 # SectionAlignment
+ .long CONFIG_PHYSICAL_ALIGN # SectionAlignment
.long 0x20 # FileAlignment
.word 0 # MajorOperatingSystemVersion
.word 0 # MinorOperatingSystemVersion
@@ -250,6 +249,25 @@ section_table:
.word 0 # NumberOfLineNumbers
.long 0x60500020 # Characteristics (section flags)
+ #
+ # The offset & size fields are filled in by build.c.
+ #
+ .ascii ".bss"
+ .byte 0
+ .byte 0
+ .byte 0
+ .byte 0
+ .long 0
+ .long 0x0
+ .long 0 # Size of initialized data
+ # on disk
+ .long 0x0
+ .long 0 # PointerToRelocations
+ .long 0 # PointerToLineNumbers
+ .word 0 # NumberOfRelocations
+ .word 0 # NumberOfLineNumbers
+ .long 0xc8000080 # Characteristics (section flags)
+
#endif /* CONFIG_EFI_STUB */
# Kernel attributes; used by setup. This is part 1 of the
@@ -375,8 +393,7 @@ xloadflags:
# define XLF0 0
#endif
-#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_X86_64) && \
- !defined(CONFIG_EFI_MIXED)
+#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_X86_64)
/* kernel/boot_param/ramdisk could be loaded above 4g */
# define XLF1 XLF_CAN_BE_LOADED_ABOVE_4G
#else
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 5339040ef86e..493f3fd9f139 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -12,14 +12,9 @@
* Very basic string functions
*/
-#include "boot.h"
+#include <linux/types.h>
+#include "ctype.h"
-/*
- * This file gets included in compressed/string.c which might pull in
- * string_32.h and which in turn maps memcmp to __builtin_memcmp(). Undo
- * that first.
- */
-#undef memcmp
int memcmp(const void *s1, const void *s2, size_t len)
{
u8 diff;
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index 1a2f2121cada..a7661c430cd9 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -143,7 +143,7 @@ static void usage(void)
#ifdef CONFIG_EFI_STUB
-static void update_pecoff_section_header(char *section_name, u32 offset, u32 size)
+static void update_pecoff_section_header_fields(char *section_name, u32 vma, u32 size, u32 datasz, u32 offset)
{
unsigned int pe_header;
unsigned short num_sections;
@@ -164,10 +164,10 @@ static void update_pecoff_section_header(char *section_name, u32 offset, u32 siz
put_unaligned_le32(size, section + 0x8);
/* section header vma field */
- put_unaligned_le32(offset, section + 0xc);
+ put_unaligned_le32(vma, section + 0xc);
/* section header 'size of initialised data' field */
- put_unaligned_le32(size, section + 0x10);
+ put_unaligned_le32(datasz, section + 0x10);
/* section header 'file offset' field */
put_unaligned_le32(offset, section + 0x14);
@@ -179,6 +179,11 @@ static void update_pecoff_section_header(char *section_name, u32 offset, u32 siz
}
}
+static void update_pecoff_section_header(char *section_name, u32 offset, u32 size)
+{
+ update_pecoff_section_header_fields(section_name, offset, size, size, offset);
+}
+
static void update_pecoff_setup_and_reloc(unsigned int size)
{
u32 setup_offset = 0x200;
@@ -203,9 +208,6 @@ static void update_pecoff_text(unsigned int text_start, unsigned int file_sz)
pe_header = get_unaligned_le32(&buf[0x3c]);
- /* Size of image */
- put_unaligned_le32(file_sz, &buf[pe_header + 0x50]);
-
/*
* Size of code: Subtract the size of the first sector (512 bytes)
* which includes the header.
@@ -220,6 +222,22 @@ static void update_pecoff_text(unsigned int text_start, unsigned int file_sz)
update_pecoff_section_header(".text", text_start, text_sz);
}
+static void update_pecoff_bss(unsigned int file_sz, unsigned int init_sz)
+{
+ unsigned int pe_header;
+ unsigned int bss_sz = init_sz - file_sz;
+
+ pe_header = get_unaligned_le32(&buf[0x3c]);
+
+ /* Size of uninitialized data */
+ put_unaligned_le32(bss_sz, &buf[pe_header + 0x24]);
+
+ /* Size of image */
+ put_unaligned_le32(init_sz, &buf[pe_header + 0x50]);
+
+ update_pecoff_section_header_fields(".bss", file_sz, bss_sz, 0, 0);
+}
+
static int reserve_pecoff_reloc_section(int c)
{
/* Reserve 0x20 bytes for .reloc section */
@@ -259,6 +277,8 @@ static void efi_stub_entry_update(void)
static inline void update_pecoff_setup_and_reloc(unsigned int size) {}
static inline void update_pecoff_text(unsigned int text_start,
unsigned int file_sz) {}
+static inline void update_pecoff_bss(unsigned int file_sz,
+ unsigned int init_sz) {}
static inline void efi_stub_defaults(void) {}
static inline void efi_stub_entry_update(void) {}
@@ -310,7 +330,7 @@ static void parse_zoffset(char *fname)
int main(int argc, char ** argv)
{
- unsigned int i, sz, setup_sectors;
+ unsigned int i, sz, setup_sectors, init_sz;
int c;
u32 sys_size;
struct stat sb;
@@ -376,7 +396,9 @@ int main(int argc, char ** argv)
buf[0x1f1] = setup_sectors-1;
put_unaligned_le32(sys_size, &buf[0x1f4]);
- update_pecoff_text(setup_sectors * 512, sz + i + ((sys_size * 16) - sz));
+ update_pecoff_text(setup_sectors * 512, i + (sys_size * 16));
+ init_sz = get_unaligned_le32(&buf[0x260]);
+ update_pecoff_bss(i + (sys_size * 16), init_sz);
efi_stub_entry_update();
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 619e7f7426c6..32d2e7056c87 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -244,7 +244,6 @@ CONFIG_HID_TOPSEED=y
CONFIG_HID_PID=y
CONFIG_USB_HIDDEV=y
CONFIG_USB=y
-CONFIG_USB_DEBUG=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
CONFIG_USB_MON=y
CONFIG_USB_EHCI_HCD=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 6181c69b786b..a481dd4755d5 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -239,7 +239,6 @@ CONFIG_HID_TOPSEED=y
CONFIG_HID_PID=y
CONFIG_USB_HIDDEV=y
CONFIG_USB=y
-CONFIG_USB_DEBUG=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
CONFIG_USB_MON=y
CONFIG_USB_EHCI_HCD=y
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 61d6e281898b..d551165a3159 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
+obj-$(CONFIG_CRYPTO_DES3_EDE_X86_64) += des3_ede-x86_64.o
obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
@@ -52,6 +53,7 @@ salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
+des3_ede-x86_64-y := des3_ede-asm_64.o des3_ede_glue.o
camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
@@ -76,7 +78,7 @@ ifeq ($(avx2_supported),yes)
endif
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
-aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o
+aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
ifeq ($(avx2_supported),yes)
diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
new file mode 100644
index 000000000000..f091f122ed24
--- /dev/null
+++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
@@ -0,0 +1,546 @@
+/*
+ * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
+ *
+ * This is AES128/192/256 CTR mode optimization implementation. It requires
+ * the support of Intel(R) AESNI and AVX instructions.
+ *
+ * This work was inspired by the AES CTR mode optimization published
+ * in Intel Optimized IPSEC Cryptograhpic library.
+ * Additional information on it can be found at:
+ * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford <james.guilford@intel.com>
+ * Sean Gulley <sean.m.gulley@intel.com>
+ * Chandramouli Narayanan <mouli@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+
+#define CONCAT(a,b) a##b
+#define VMOVDQ vmovdqu
+
+#define xdata0 %xmm0
+#define xdata1 %xmm1
+#define xdata2 %xmm2
+#define xdata3 %xmm3
+#define xdata4 %xmm4
+#define xdata5 %xmm5
+#define xdata6 %xmm6
+#define xdata7 %xmm7
+#define xcounter %xmm8
+#define xbyteswap %xmm9
+#define xkey0 %xmm10
+#define xkey3 %xmm11
+#define xkey6 %xmm12
+#define xkey9 %xmm13
+#define xkey4 %xmm11
+#define xkey8 %xmm12
+#define xkey12 %xmm13
+#define xkeyA %xmm14
+#define xkeyB %xmm15
+
+#define p_in %rdi
+#define p_iv %rsi
+#define p_keys %rdx
+#define p_out %rcx
+#define num_bytes %r8
+
+#define tmp %r10
+#define DDQ(i) CONCAT(ddq_add_,i)
+#define XMM(i) CONCAT(%xmm, i)
+#define DDQ_DATA 0
+#define XDATA 1
+#define KEY_128 1
+#define KEY_192 2
+#define KEY_256 3
+
+.section .rodata
+.align 16
+
+byteswap_const:
+ .octa 0x000102030405060708090A0B0C0D0E0F
+ddq_add_1:
+ .octa 0x00000000000000000000000000000001
+ddq_add_2:
+ .octa 0x00000000000000000000000000000002
+ddq_add_3:
+ .octa 0x00000000000000000000000000000003
+ddq_add_4:
+ .octa 0x00000000000000000000000000000004
+ddq_add_5:
+ .octa 0x00000000000000000000000000000005
+ddq_add_6:
+ .octa 0x00000000000000000000000000000006
+ddq_add_7:
+ .octa 0x00000000000000000000000000000007
+ddq_add_8:
+ .octa 0x00000000000000000000000000000008
+
+.text
+
+/* generate a unique variable for ddq_add_x */
+
+.macro setddq n
+ var_ddq_add = DDQ(\n)
+.endm
+
+/* generate a unique variable for xmm register */
+.macro setxdata n
+ var_xdata = XMM(\n)
+.endm
+
+/* club the numeric 'id' to the symbol 'name' */
+
+.macro club name, id
+.altmacro
+ .if \name == DDQ_DATA
+ setddq %\id
+ .elseif \name == XDATA
+ setxdata %\id
+ .endif
+.noaltmacro
+.endm
+
+/*
+ * do_aes num_in_par load_keys key_len
+ * This increments p_in, but not p_out
+ */
+.macro do_aes b, k, key_len
+ .set by, \b
+ .set load_keys, \k
+ .set klen, \key_len
+
+ .if (load_keys)
+ vmovdqa 0*16(p_keys), xkey0
+ .endif
+
+ vpshufb xbyteswap, xcounter, xdata0
+
+ .set i, 1
+ .rept (by - 1)
+ club DDQ_DATA, i
+ club XDATA, i
+ vpaddd var_ddq_add(%rip), xcounter, var_xdata
+ vpshufb xbyteswap, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 1*16(p_keys), xkeyA
+
+ vpxor xkey0, xdata0, xdata0
+ club DDQ_DATA, by
+ vpaddd var_ddq_add(%rip), xcounter, xcounter
+
+ .set i, 1
+ .rept (by - 1)
+ club XDATA, i
+ vpxor xkey0, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 2*16(p_keys), xkeyB
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 1 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_128)
+ .if (load_keys)
+ vmovdqa 3*16(p_keys), xkeyA
+ .endif
+ .else
+ vmovdqa 3*16(p_keys), xkeyA
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyB, var_xdata, var_xdata /* key 2 */
+ .set i, (i +1)
+ .endr
+
+ add $(16*by), p_in
+
+ .if (klen == KEY_128)
+ vmovdqa 4*16(p_keys), xkey4
+ .else
+ .if (load_keys)
+ vmovdqa 4*16(p_keys), xkey4
+ .endif
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 3 */
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 5*16(p_keys), xkeyA
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkey4, var_xdata, var_xdata /* key 4 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_128)
+ .if (load_keys)
+ vmovdqa 6*16(p_keys), xkeyB
+ .endif
+ .else
+ vmovdqa 6*16(p_keys), xkeyB
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 5 */
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 7*16(p_keys), xkeyA
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyB, var_xdata, var_xdata /* key 6 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_128)
+ vmovdqa 8*16(p_keys), xkey8
+ .else
+ .if (load_keys)
+ vmovdqa 8*16(p_keys), xkey8
+ .endif
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 7 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_128)
+ .if (load_keys)
+ vmovdqa 9*16(p_keys), xkeyA
+ .endif
+ .else
+ vmovdqa 9*16(p_keys), xkeyA
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkey8, var_xdata, var_xdata /* key 8 */
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 10*16(p_keys), xkeyB
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 9 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen != KEY_128)
+ vmovdqa 11*16(p_keys), xkeyA
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 10 */
+ .if (klen == KEY_128)
+ vaesenclast xkeyB, var_xdata, var_xdata
+ .else
+ vaesenc xkeyB, var_xdata, var_xdata
+ .endif
+ .set i, (i +1)
+ .endr
+
+ .if (klen != KEY_128)
+ .if (load_keys)
+ vmovdqa 12*16(p_keys), xkey12
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 11 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_256)
+ vmovdqa 13*16(p_keys), xkeyA
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ .if (klen == KEY_256)
+ /* key 12 */
+ vaesenc xkey12, var_xdata, var_xdata
+ .else
+ vaesenclast xkey12, var_xdata, var_xdata
+ .endif
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_256)
+ vmovdqa 14*16(p_keys), xkeyB
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 13 */
+ vaesenc xkeyA, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 14 */
+ vaesenclast xkeyB, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+ .endif
+ .endif
+
+ .set i, 0
+ .rept (by / 2)
+ .set j, (i+1)
+ VMOVDQ (i*16 - 16*by)(p_in), xkeyA
+ VMOVDQ (j*16 - 16*by)(p_in), xkeyB
+ club XDATA, i
+ vpxor xkeyA, var_xdata, var_xdata
+ club XDATA, j
+ vpxor xkeyB, var_xdata, var_xdata
+ .set i, (i+2)
+ .endr
+
+ .if (i < by)
+ VMOVDQ (i*16 - 16*by)(p_in), xkeyA
+ club XDATA, i
+ vpxor xkeyA, var_xdata, var_xdata
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ VMOVDQ var_xdata, i*16(p_out)
+ .set i, (i+1)
+ .endr
+.endm
+
+.macro do_aes_load val, key_len
+ do_aes \val, 1, \key_len
+.endm
+
+.macro do_aes_noload val, key_len
+ do_aes \val, 0, \key_len
+.endm
+
+/* main body of aes ctr load */
+
+.macro do_aes_ctrmain key_len
+
+ cmp $16, num_bytes
+ jb .Ldo_return2\key_len
+
+ vmovdqa byteswap_const(%rip), xbyteswap
+ vmovdqu (p_iv), xcounter
+ vpshufb xbyteswap, xcounter, xcounter
+
+ mov num_bytes, tmp
+ and $(7*16), tmp
+ jz .Lmult_of_8_blks\key_len
+
+ /* 1 <= tmp <= 7 */
+ cmp $(4*16), tmp
+ jg .Lgt4\key_len
+ je .Leq4\key_len
+
+.Llt4\key_len:
+ cmp $(2*16), tmp
+ jg .Leq3\key_len
+ je .Leq2\key_len
+
+.Leq1\key_len:
+ do_aes_load 1, \key_len
+ add $(1*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Leq2\key_len:
+ do_aes_load 2, \key_len
+ add $(2*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+
+.Leq3\key_len:
+ do_aes_load 3, \key_len
+ add $(3*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Leq4\key_len:
+ do_aes_load 4, \key_len
+ add $(4*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Lgt4\key_len:
+ cmp $(6*16), tmp
+ jg .Leq7\key_len
+ je .Leq6\key_len
+
+.Leq5\key_len:
+ do_aes_load 5, \key_len
+ add $(5*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Leq6\key_len:
+ do_aes_load 6, \key_len
+ add $(6*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Leq7\key_len:
+ do_aes_load 7, \key_len
+ add $(7*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Lmult_of_8_blks\key_len:
+ .if (\key_len != KEY_128)
+ vmovdqa 0*16(p_keys), xkey0
+ vmovdqa 4*16(p_keys), xkey4
+ vmovdqa 8*16(p_keys), xkey8
+ vmovdqa 12*16(p_keys), xkey12
+ .else
+ vmovdqa 0*16(p_keys), xkey0
+ vmovdqa 3*16(p_keys), xkey4
+ vmovdqa 6*16(p_keys), xkey8
+ vmovdqa 9*16(p_keys), xkey12
+ .endif
+.align 16
+.Lmain_loop2\key_len:
+ /* num_bytes is a multiple of 8 and >0 */
+ do_aes_noload 8, \key_len
+ add $(8*16), p_out
+ sub $(8*16), num_bytes
+ jne .Lmain_loop2\key_len
+
+.Ldo_return2\key_len:
+ /* return updated IV */
+ vpshufb xbyteswap, xcounter, xcounter
+ vmovdqu xcounter, (p_iv)
+ ret
+.endm
+
+/*
+ * routine to do AES128 CTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
+ * unsigned int num_bytes)
+ */
+ENTRY(aes_ctr_enc_128_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_128
+
+ENDPROC(aes_ctr_enc_128_avx_by8)
+
+/*
+ * routine to do AES192 CTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
+ * unsigned int num_bytes)
+ */
+ENTRY(aes_ctr_enc_192_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_192
+
+ENDPROC(aes_ctr_enc_192_avx_by8)
+
+/*
+ * routine to do AES256 CTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
+ * unsigned int num_bytes)
+ */
+ENTRY(aes_ctr_enc_256_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_256
+
+ENDPROC(aes_ctr_enc_256_avx_by8)
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 948ad0e77741..888950f29fd9 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -105,6 +105,9 @@ void crypto_fpu_exit(void);
#define AVX_GEN4_OPTSIZE 4096
#ifdef CONFIG_X86_64
+
+static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
@@ -155,6 +158,12 @@ asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
#ifdef CONFIG_AS_AVX
+asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
+ void *keys, u8 *out, unsigned int num_bytes);
+asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
+ void *keys, u8 *out, unsigned int num_bytes);
+asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
+ void *keys, u8 *out, unsigned int num_bytes);
/*
* asmlinkage void aesni_gcm_precomp_avx_gen2()
* gcm_data *my_ctx_data, context data
@@ -472,6 +481,25 @@ static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
crypto_inc(ctrblk, AES_BLOCK_SIZE);
}
+#ifdef CONFIG_AS_AVX
+static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv)
+{
+ /*
+ * based on key length, override with the by8 version
+ * of ctr mode encryption/decryption for improved performance
+ * aes_set_key_common() ensures that key length is one of
+ * {128,192,256}
+ */
+ if (ctx->key_length == AES_KEYSIZE_128)
+ aes_ctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len);
+ else if (ctx->key_length == AES_KEYSIZE_192)
+ aes_ctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len);
+ else
+ aes_ctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len);
+}
+#endif
+
static int ctr_crypt(struct blkcipher_desc *desc,
struct scatterlist *dst, struct scatterlist *src,
unsigned int nbytes)
@@ -486,8 +514,8 @@ static int ctr_crypt(struct blkcipher_desc *desc,
kernel_fpu_begin();
while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
- aesni_ctr_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes & AES_BLOCK_MASK, walk.iv);
+ aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes & AES_BLOCK_MASK, walk.iv);
nbytes &= AES_BLOCK_SIZE - 1;
err = blkcipher_walk_done(desc, &walk, nbytes);
}
@@ -1493,6 +1521,14 @@ static int __init aesni_init(void)
aesni_gcm_enc_tfm = aesni_gcm_enc;
aesni_gcm_dec_tfm = aesni_gcm_dec;
}
+ aesni_ctr_enc_tfm = aesni_ctr_enc;
+#ifdef CONFIG_AS_AVX
+ if (cpu_has_avx) {
+ /* optimize performance of ctr mode encryption transform */
+ aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm;
+ pr_info("AES CTR mode by8 optimization enabled\n");
+ }
+#endif
#endif
err = crypto_fpu_init();
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index dbc4339b5417..26d49ebae040 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -72,6 +72,7 @@
# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
+.text
ENTRY(crc_pcl)
#define bufp %rdi
#define bufp_dw %edi
@@ -216,15 +217,11 @@ LABEL crc_ %i
## 4) Combine three results:
################################################################
- lea (K_table-16)(%rip), bufp # first entry is for idx 1
+ lea (K_table-8)(%rip), bufp # first entry is for idx 1
shlq $3, %rax # rax *= 8
- subq %rax, tmp # tmp -= rax*8
- shlq $1, %rax
- subq %rax, tmp # tmp -= rax*16
- # (total tmp -= rax*24)
- addq %rax, bufp
-
- movdqa (bufp), %xmm0 # 2 consts: K1:K2
+ pmovzxdq (bufp,%rax), %xmm0 # 2 consts: K1:K2
+ leal (%eax,%eax,2), %eax # rax *= 3 (total *24)
+ subq %rax, tmp # tmp -= rax*24
movq crc_init, %xmm1 # CRC for block 1
PCLMULQDQ 0x00,%xmm0,%xmm1 # Multiply by K2
@@ -238,9 +235,9 @@ LABEL crc_ %i
mov crc2, crc_init
crc32 %rax, crc_init
-################################################################
-## 5) Check for end:
-################################################################
+ ################################################################
+ ## 5) Check for end:
+ ################################################################
LABEL crc_ 0
mov tmp, len
@@ -331,136 +328,136 @@ ENDPROC(crc_pcl)
################################################################
## PCLMULQDQ tables
- ## Table is 128 entries x 2 quad words each
+ ## Table is 128 entries x 2 words (8 bytes) each
################################################################
-.data
-.align 64
+.section .rotata, "a", %progbits
+.align 8
K_table:
- .quad 0x14cd00bd6,0x105ec76f0
- .quad 0x0ba4fc28e,0x14cd00bd6
- .quad 0x1d82c63da,0x0f20c0dfe
- .quad 0x09e4addf8,0x0ba4fc28e
- .quad 0x039d3b296,0x1384aa63a
- .quad 0x102f9b8a2,0x1d82c63da
- .quad 0x14237f5e6,0x01c291d04
- .quad 0x00d3b6092,0x09e4addf8
- .quad 0x0c96cfdc0,0x0740eef02
- .quad 0x18266e456,0x039d3b296
- .quad 0x0daece73e,0x0083a6eec
- .quad 0x0ab7aff2a,0x102f9b8a2
- .quad 0x1248ea574,0x1c1733996
- .quad 0x083348832,0x14237f5e6
- .quad 0x12c743124,0x02ad91c30
- .quad 0x0b9e02b86,0x00d3b6092
- .quad 0x018b33a4e,0x06992cea2
- .quad 0x1b331e26a,0x0c96cfdc0
- .quad 0x17d35ba46,0x07e908048
- .quad 0x1bf2e8b8a,0x18266e456
- .quad 0x1a3e0968a,0x11ed1f9d8
- .quad 0x0ce7f39f4,0x0daece73e
- .quad 0x061d82e56,0x0f1d0f55e
- .quad 0x0d270f1a2,0x0ab7aff2a
- .quad 0x1c3f5f66c,0x0a87ab8a8
- .quad 0x12ed0daac,0x1248ea574
- .quad 0x065863b64,0x08462d800
- .quad 0x11eef4f8e,0x083348832
- .quad 0x1ee54f54c,0x071d111a8
- .quad 0x0b3e32c28,0x12c743124
- .quad 0x0064f7f26,0x0ffd852c6
- .quad 0x0dd7e3b0c,0x0b9e02b86
- .quad 0x0f285651c,0x0dcb17aa4
- .quad 0x010746f3c,0x018b33a4e
- .quad 0x1c24afea4,0x0f37c5aee
- .quad 0x0271d9844,0x1b331e26a
- .quad 0x08e766a0c,0x06051d5a2
- .quad 0x093a5f730,0x17d35ba46
- .quad 0x06cb08e5c,0x11d5ca20e
- .quad 0x06b749fb2,0x1bf2e8b8a
- .quad 0x1167f94f2,0x021f3d99c
- .quad 0x0cec3662e,0x1a3e0968a
- .quad 0x19329634a,0x08f158014
- .quad 0x0e6fc4e6a,0x0ce7f39f4
- .quad 0x08227bb8a,0x1a5e82106
- .quad 0x0b0cd4768,0x061d82e56
- .quad 0x13c2b89c4,0x188815ab2
- .quad 0x0d7a4825c,0x0d270f1a2
- .quad 0x10f5ff2ba,0x105405f3e
- .quad 0x00167d312,0x1c3f5f66c
- .quad 0x0f6076544,0x0e9adf796
- .quad 0x026f6a60a,0x12ed0daac
- .quad 0x1a2adb74e,0x096638b34
- .quad 0x19d34af3a,0x065863b64
- .quad 0x049c3cc9c,0x1e50585a0
- .quad 0x068bce87a,0x11eef4f8e
- .quad 0x1524fa6c6,0x19f1c69dc
- .quad 0x16cba8aca,0x1ee54f54c
- .quad 0x042d98888,0x12913343e
- .quad 0x1329d9f7e,0x0b3e32c28
- .quad 0x1b1c69528,0x088f25a3a
- .quad 0x02178513a,0x0064f7f26
- .quad 0x0e0ac139e,0x04e36f0b0
- .quad 0x0170076fa,0x0dd7e3b0c
- .quad 0x141a1a2e2,0x0bd6f81f8
- .quad 0x16ad828b4,0x0f285651c
- .quad 0x041d17b64,0x19425cbba
- .quad 0x1fae1cc66,0x010746f3c
- .quad 0x1a75b4b00,0x18db37e8a
- .quad 0x0f872e54c,0x1c24afea4
- .quad 0x01e41e9fc,0x04c144932
- .quad 0x086d8e4d2,0x0271d9844
- .quad 0x160f7af7a,0x052148f02
- .quad 0x05bb8f1bc,0x08e766a0c
- .quad 0x0a90fd27a,0x0a3c6f37a
- .quad 0x0b3af077a,0x093a5f730
- .quad 0x04984d782,0x1d22c238e
- .quad 0x0ca6ef3ac,0x06cb08e5c
- .quad 0x0234e0b26,0x063ded06a
- .quad 0x1d88abd4a,0x06b749fb2
- .quad 0x04597456a,0x04d56973c
- .quad 0x0e9e28eb4,0x1167f94f2
- .quad 0x07b3ff57a,0x19385bf2e
- .quad 0x0c9c8b782,0x0cec3662e
- .quad 0x13a9cba9e,0x0e417f38a
- .quad 0x093e106a4,0x19329634a
- .quad 0x167001a9c,0x14e727980
- .quad 0x1ddffc5d4,0x0e6fc4e6a
- .quad 0x00df04680,0x0d104b8fc
- .quad 0x02342001e,0x08227bb8a
- .quad 0x00a2a8d7e,0x05b397730
- .quad 0x168763fa6,0x0b0cd4768
- .quad 0x1ed5a407a,0x0e78eb416
- .quad 0x0d2c3ed1a,0x13c2b89c4
- .quad 0x0995a5724,0x1641378f0
- .quad 0x19b1afbc4,0x0d7a4825c
- .quad 0x109ffedc0,0x08d96551c
- .quad 0x0f2271e60,0x10f5ff2ba
- .quad 0x00b0bf8ca,0x00bf80dd2
- .quad 0x123888b7a,0x00167d312
- .quad 0x1e888f7dc,0x18dcddd1c
- .quad 0x002ee03b2,0x0f6076544
- .quad 0x183e8d8fe,0x06a45d2b2
- .quad 0x133d7a042,0x026f6a60a
- .quad 0x116b0f50c,0x1dd3e10e8
- .quad 0x05fabe670,0x1a2adb74e
- .quad 0x130004488,0x0de87806c
- .quad 0x000bcf5f6,0x19d34af3a
- .quad 0x18f0c7078,0x014338754
- .quad 0x017f27698,0x049c3cc9c
- .quad 0x058ca5f00,0x15e3e77ee
- .quad 0x1af900c24,0x068bce87a
- .quad 0x0b5cfca28,0x0dd07448e
- .quad 0x0ded288f8,0x1524fa6c6
- .quad 0x059f229bc,0x1d8048348
- .quad 0x06d390dec,0x16cba8aca
- .quad 0x037170390,0x0a3e3e02c
- .quad 0x06353c1cc,0x042d98888
- .quad 0x0c4584f5c,0x0d73c7bea
- .quad 0x1f16a3418,0x1329d9f7e
- .quad 0x0531377e2,0x185137662
- .quad 0x1d8d9ca7c,0x1b1c69528
- .quad 0x0b25b29f2,0x18a08b5bc
- .quad 0x19fb2a8b0,0x02178513a
- .quad 0x1a08fe6ac,0x1da758ae0
- .quad 0x045cddf4e,0x0e0ac139e
- .quad 0x1a91647f2,0x169cf9eb0
- .quad 0x1a0f717c4,0x0170076fa
+ .long 0x493c7d27, 0x00000001
+ .long 0xba4fc28e, 0x493c7d27
+ .long 0xddc0152b, 0xf20c0dfe
+ .long 0x9e4addf8, 0xba4fc28e
+ .long 0x39d3b296, 0x3da6d0cb
+ .long 0x0715ce53, 0xddc0152b
+ .long 0x47db8317, 0x1c291d04
+ .long 0x0d3b6092, 0x9e4addf8
+ .long 0xc96cfdc0, 0x740eef02
+ .long 0x878a92a7, 0x39d3b296
+ .long 0xdaece73e, 0x083a6eec
+ .long 0xab7aff2a, 0x0715ce53
+ .long 0x2162d385, 0xc49f4f67
+ .long 0x83348832, 0x47db8317
+ .long 0x299847d5, 0x2ad91c30
+ .long 0xb9e02b86, 0x0d3b6092
+ .long 0x18b33a4e, 0x6992cea2
+ .long 0xb6dd949b, 0xc96cfdc0
+ .long 0x78d9ccb7, 0x7e908048
+ .long 0xbac2fd7b, 0x878a92a7
+ .long 0xa60ce07b, 0x1b3d8f29
+ .long 0xce7f39f4, 0xdaece73e
+ .long 0x61d82e56, 0xf1d0f55e
+ .long 0xd270f1a2, 0xab7aff2a
+ .long 0xc619809d, 0xa87ab8a8
+ .long 0x2b3cac5d, 0x2162d385
+ .long 0x65863b64, 0x8462d800
+ .long 0x1b03397f, 0x83348832
+ .long 0xebb883bd, 0x71d111a8
+ .long 0xb3e32c28, 0x299847d5
+ .long 0x064f7f26, 0xffd852c6
+ .long 0xdd7e3b0c, 0xb9e02b86
+ .long 0xf285651c, 0xdcb17aa4
+ .long 0x10746f3c, 0x18b33a4e
+ .long 0xc7a68855, 0xf37c5aee
+ .long 0x271d9844, 0xb6dd949b
+ .long 0x8e766a0c, 0x6051d5a2
+ .long 0x93a5f730, 0x78d9ccb7
+ .long 0x6cb08e5c, 0x18b0d4ff
+ .long 0x6b749fb2, 0xbac2fd7b
+ .long 0x1393e203, 0x21f3d99c
+ .long 0xcec3662e, 0xa60ce07b
+ .long 0x96c515bb, 0x8f158014
+ .long 0xe6fc4e6a, 0xce7f39f4
+ .long 0x8227bb8a, 0xa00457f7
+ .long 0xb0cd4768, 0x61d82e56
+ .long 0x39c7ff35, 0x8d6d2c43
+ .long 0xd7a4825c, 0xd270f1a2
+ .long 0x0ab3844b, 0x00ac29cf
+ .long 0x0167d312, 0xc619809d
+ .long 0xf6076544, 0xe9adf796
+ .long 0x26f6a60a, 0x2b3cac5d
+ .long 0xa741c1bf, 0x96638b34
+ .long 0x98d8d9cb, 0x65863b64
+ .long 0x49c3cc9c, 0xe0e9f351
+ .long 0x68bce87a, 0x1b03397f
+ .long 0x57a3d037, 0x9af01f2d
+ .long 0x6956fc3b, 0xebb883bd
+ .long 0x42d98888, 0x2cff42cf
+ .long 0x3771e98f, 0xb3e32c28
+ .long 0xb42ae3d9, 0x88f25a3a
+ .long 0x2178513a, 0x064f7f26
+ .long 0xe0ac139e, 0x4e36f0b0
+ .long 0x170076fa, 0xdd7e3b0c
+ .long 0x444dd413, 0xbd6f81f8
+ .long 0x6f345e45, 0xf285651c
+ .long 0x41d17b64, 0x91c9bd4b
+ .long 0xff0dba97, 0x10746f3c
+ .long 0xa2b73df1, 0x885f087b
+ .long 0xf872e54c, 0xc7a68855
+ .long 0x1e41e9fc, 0x4c144932
+ .long 0x86d8e4d2, 0x271d9844
+ .long 0x651bd98b, 0x52148f02
+ .long 0x5bb8f1bc, 0x8e766a0c
+ .long 0xa90fd27a, 0xa3c6f37a
+ .long 0xb3af077a, 0x93a5f730
+ .long 0x4984d782, 0xd7c0557f
+ .long 0xca6ef3ac, 0x6cb08e5c
+ .long 0x234e0b26, 0x63ded06a
+ .long 0xdd66cbbb, 0x6b749fb2
+ .long 0x4597456a, 0x4d56973c
+ .long 0xe9e28eb4, 0x1393e203
+ .long 0x7b3ff57a, 0x9669c9df
+ .long 0xc9c8b782, 0xcec3662e
+ .long 0x3f70cc6f, 0xe417f38a
+ .long 0x93e106a4, 0x96c515bb
+ .long 0x62ec6c6d, 0x4b9e0f71
+ .long 0xd813b325, 0xe6fc4e6a
+ .long 0x0df04680, 0xd104b8fc
+ .long 0x2342001e, 0x8227bb8a
+ .long 0x0a2a8d7e, 0x5b397730
+ .long 0x6d9a4957, 0xb0cd4768
+ .long 0xe8b6368b, 0xe78eb416
+ .long 0xd2c3ed1a, 0x39c7ff35
+ .long 0x995a5724, 0x61ff0e01
+ .long 0x9ef68d35, 0xd7a4825c
+ .long 0x0c139b31, 0x8d96551c
+ .long 0xf2271e60, 0x0ab3844b
+ .long 0x0b0bf8ca, 0x0bf80dd2
+ .long 0x2664fd8b, 0x0167d312
+ .long 0xed64812d, 0x8821abed
+ .long 0x02ee03b2, 0xf6076544
+ .long 0x8604ae0f, 0x6a45d2b2
+ .long 0x363bd6b3, 0x26f6a60a
+ .long 0x135c83fd, 0xd8d26619
+ .long 0x5fabe670, 0xa741c1bf
+ .long 0x35ec3279, 0xde87806c
+ .long 0x00bcf5f6, 0x98d8d9cb
+ .long 0x8ae00689, 0x14338754
+ .long 0x17f27698, 0x49c3cc9c
+ .long 0x58ca5f00, 0x5bd2011f
+ .long 0xaa7c7ad5, 0x68bce87a
+ .long 0xb5cfca28, 0xdd07448e
+ .long 0xded288f8, 0x57a3d037
+ .long 0x59f229bc, 0xdde8f5b9
+ .long 0x6d390dec, 0x6956fc3b
+ .long 0x37170390, 0xa3e3e02c
+ .long 0x6353c1cc, 0x42d98888
+ .long 0xc4584f5c, 0xd73c7bea
+ .long 0xf48642e9, 0x3771e98f
+ .long 0x531377e2, 0x80ff0093
+ .long 0xdd35bc8d, 0xb42ae3d9
+ .long 0xb25b29f2, 0x8fe4c34d
+ .long 0x9a5ede41, 0x2178513a
+ .long 0xa563905d, 0xdf99fc11
+ .long 0x45cddf4e, 0xe0ac139e
+ .long 0xacfa3103, 0x6c23e841
+ .long 0xa51b6135, 0x170076fa
diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
new file mode 100644
index 000000000000..038f6ae87c5e
--- /dev/null
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -0,0 +1,805 @@
+/*
+ * des3_ede-asm_64.S - x86-64 assembly implementation of 3DES cipher
+ *
+ * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/linkage.h>
+
+.file "des3_ede-asm_64.S"
+.text
+
+#define s1 .L_s1
+#define s2 ((s1) + (64*8))
+#define s3 ((s2) + (64*8))
+#define s4 ((s3) + (64*8))
+#define s5 ((s4) + (64*8))
+#define s6 ((s5) + (64*8))
+#define s7 ((s6) + (64*8))
+#define s8 ((s7) + (64*8))
+
+/* register macros */
+#define CTX %rdi
+
+#define RL0 %r8
+#define RL1 %r9
+#define RL2 %r10
+
+#define RL0d %r8d
+#define RL1d %r9d
+#define RL2d %r10d
+
+#define RR0 %r11
+#define RR1 %r12
+#define RR2 %r13
+
+#define RR0d %r11d
+#define RR1d %r12d
+#define RR2d %r13d
+
+#define RW0 %rax
+#define RW1 %rbx
+#define RW2 %rcx
+
+#define RW0d %eax
+#define RW1d %ebx
+#define RW2d %ecx
+
+#define RW0bl %al
+#define RW1bl %bl
+#define RW2bl %cl
+
+#define RW0bh %ah
+#define RW1bh %bh
+#define RW2bh %ch
+
+#define RT0 %r15
+#define RT1 %rbp
+#define RT2 %r14
+#define RT3 %rdx
+
+#define RT0d %r15d
+#define RT1d %ebp
+#define RT2d %r14d
+#define RT3d %edx
+
+/***********************************************************************
+ * 1-way 3DES
+ ***********************************************************************/
+#define do_permutation(a, b, offset, mask) \
+ movl a, RT0d; \
+ shrl $(offset), RT0d; \
+ xorl b, RT0d; \
+ andl $(mask), RT0d; \
+ xorl RT0d, b; \
+ shll $(offset), RT0d; \
+ xorl RT0d, a;
+
+#define expand_to_64bits(val, mask) \
+ movl val##d, RT0d; \
+ rorl $4, RT0d; \
+ shlq $32, RT0; \
+ orq RT0, val; \
+ andq mask, val;
+
+#define compress_to_64bits(val) \
+ movq val, RT0; \
+ shrq $32, RT0; \
+ roll $4, RT0d; \
+ orl RT0d, val##d;
+
+#define initial_permutation(left, right) \
+ do_permutation(left##d, right##d, 4, 0x0f0f0f0f); \
+ do_permutation(left##d, right##d, 16, 0x0000ffff); \
+ do_permutation(right##d, left##d, 2, 0x33333333); \
+ do_permutation(right##d, left##d, 8, 0x00ff00ff); \
+ movabs $0x3f3f3f3f3f3f3f3f, RT3; \
+ movl left##d, RW0d; \
+ roll $1, right##d; \
+ xorl right##d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, left##d; \
+ xorl RW0d, right##d; \
+ roll $1, left##d; \
+ expand_to_64bits(right, RT3); \
+ expand_to_64bits(left, RT3);
+
+#define final_permutation(left, right) \
+ compress_to_64bits(right); \
+ compress_to_64bits(left); \
+ movl right##d, RW0d; \
+ rorl $1, left##d; \
+ xorl left##d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, right##d; \
+ xorl RW0d, left##d; \
+ rorl $1, right##d; \
+ do_permutation(right##d, left##d, 8, 0x00ff00ff); \
+ do_permutation(right##d, left##d, 2, 0x33333333); \
+ do_permutation(left##d, right##d, 16, 0x0000ffff); \
+ do_permutation(left##d, right##d, 4, 0x0f0f0f0f);
+
+#define round1(n, from, to, load_next_key) \
+ xorq from, RW0; \
+ \
+ movzbl RW0bl, RT0d; \
+ movzbl RW0bh, RT1d; \
+ shrq $16, RW0; \
+ movzbl RW0bl, RT2d; \
+ movzbl RW0bh, RT3d; \
+ shrq $16, RW0; \
+ movq s8(, RT0, 8), RT0; \
+ xorq s6(, RT1, 8), to; \
+ movzbl RW0bl, RL1d; \
+ movzbl RW0bh, RT1d; \
+ shrl $16, RW0d; \
+ xorq s4(, RT2, 8), RT0; \
+ xorq s2(, RT3, 8), to; \
+ movzbl RW0bl, RT2d; \
+ movzbl RW0bh, RT3d; \
+ xorq s7(, RL1, 8), RT0; \
+ xorq s5(, RT1, 8), to; \
+ xorq s3(, RT2, 8), RT0; \
+ load_next_key(n, RW0); \
+ xorq RT0, to; \
+ xorq s1(, RT3, 8), to; \
+
+#define load_next_key(n, RWx) \
+ movq (((n) + 1) * 8)(CTX), RWx;
+
+#define dummy2(a, b) /*_*/
+
+#define read_block(io, left, right) \
+ movl (io), left##d; \
+ movl 4(io), right##d; \
+ bswapl left##d; \
+ bswapl right##d;
+
+#define write_block(io, left, right) \
+ bswapl left##d; \
+ bswapl right##d; \
+ movl left##d, (io); \
+ movl right##d, 4(io);
+
+ENTRY(des3_ede_x86_64_crypt_blk)
+ /* input:
+ * %rdi: round keys, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ pushq %rbp;
+ pushq %rbx;
+ pushq %r12;
+ pushq %r13;
+ pushq %r14;
+ pushq %r15;
+
+ read_block(%rdx, RL0, RR0);
+ initial_permutation(RL0, RR0);
+
+ movq (CTX), RW0;
+
+ round1(0, RR0, RL0, load_next_key);
+ round1(1, RL0, RR0, load_next_key);
+ round1(2, RR0, RL0, load_next_key);
+ round1(3, RL0, RR0, load_next_key);
+ round1(4, RR0, RL0, load_next_key);
+ round1(5, RL0, RR0, load_next_key);
+ round1(6, RR0, RL0, load_next_key);
+ round1(7, RL0, RR0, load_next_key);
+ round1(8, RR0, RL0, load_next_key);
+ round1(9, RL0, RR0, load_next_key);
+ round1(10, RR0, RL0, load_next_key);
+ round1(11, RL0, RR0, load_next_key);
+ round1(12, RR0, RL0, load_next_key);
+ round1(13, RL0, RR0, load_next_key);
+ round1(14, RR0, RL0, load_next_key);
+ round1(15, RL0, RR0, load_next_key);
+
+ round1(16+0, RL0, RR0, load_next_key);
+ round1(16+1, RR0, RL0, load_next_key);
+ round1(16+2, RL0, RR0, load_next_key);
+ round1(16+3, RR0, RL0, load_next_key);
+ round1(16+4, RL0, RR0, load_next_key);
+ round1(16+5, RR0, RL0, load_next_key);
+ round1(16+6, RL0, RR0, load_next_key);
+ round1(16+7, RR0, RL0, load_next_key);
+ round1(16+8, RL0, RR0, load_next_key);
+ round1(16+9, RR0, RL0, load_next_key);
+ round1(16+10, RL0, RR0, load_next_key);
+ round1(16+11, RR0, RL0, load_next_key);
+ round1(16+12, RL0, RR0, load_next_key);
+ round1(16+13, RR0, RL0, load_next_key);
+ round1(16+14, RL0, RR0, load_next_key);
+ round1(16+15, RR0, RL0, load_next_key);
+
+ round1(32+0, RR0, RL0, load_next_key);
+ round1(32+1, RL0, RR0, load_next_key);
+ round1(32+2, RR0, RL0, load_next_key);
+ round1(32+3, RL0, RR0, load_next_key);
+ round1(32+4, RR0, RL0, load_next_key);
+ round1(32+5, RL0, RR0, load_next_key);
+ round1(32+6, RR0, RL0, load_next_key);
+ round1(32+7, RL0, RR0, load_next_key);
+ round1(32+8, RR0, RL0, load_next_key);
+ round1(32+9, RL0, RR0, load_next_key);
+ round1(32+10, RR0, RL0, load_next_key);
+ round1(32+11, RL0, RR0, load_next_key);
+ round1(32+12, RR0, RL0, load_next_key);
+ round1(32+13, RL0, RR0, load_next_key);
+ round1(32+14, RR0, RL0, load_next_key);
+ round1(32+15, RL0, RR0, dummy2);
+
+ final_permutation(RR0, RL0);
+ write_block(%rsi, RR0, RL0);
+
+ popq %r15;
+ popq %r14;
+ popq %r13;
+ popq %r12;
+ popq %rbx;
+ popq %rbp;
+
+ ret;
+ENDPROC(des3_ede_x86_64_crypt_blk)
+
+/***********************************************************************
+ * 3-way 3DES
+ ***********************************************************************/
+#define expand_to_64bits(val, mask) \
+ movl val##d, RT0d; \
+ rorl $4, RT0d; \
+ shlq $32, RT0; \
+ orq RT0, val; \
+ andq mask, val;
+
+#define compress_to_64bits(val) \
+ movq val, RT0; \
+ shrq $32, RT0; \
+ roll $4, RT0d; \
+ orl RT0d, val##d;
+
+#define initial_permutation3(left, right) \
+ do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \
+ do_permutation(left##0d, right##0d, 16, 0x0000ffff); \
+ do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \
+ do_permutation(left##1d, right##1d, 16, 0x0000ffff); \
+ do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f); \
+ do_permutation(left##2d, right##2d, 16, 0x0000ffff); \
+ \
+ do_permutation(right##0d, left##0d, 2, 0x33333333); \
+ do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \
+ do_permutation(right##1d, left##1d, 2, 0x33333333); \
+ do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \
+ do_permutation(right##2d, left##2d, 2, 0x33333333); \
+ do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \
+ \
+ movabs $0x3f3f3f3f3f3f3f3f, RT3; \
+ \
+ movl left##0d, RW0d; \
+ roll $1, right##0d; \
+ xorl right##0d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, left##0d; \
+ xorl RW0d, right##0d; \
+ roll $1, left##0d; \
+ expand_to_64bits(right##0, RT3); \
+ expand_to_64bits(left##0, RT3); \
+ movl left##1d, RW1d; \
+ roll $1, right##1d; \
+ xorl right##1d, RW1d; \
+ andl $0xaaaaaaaa, RW1d; \
+ xorl RW1d, left##1d; \
+ xorl RW1d, right##1d; \
+ roll $1, left##1d; \
+ expand_to_64bits(right##1, RT3); \
+ expand_to_64bits(left##1, RT3); \
+ movl left##2d, RW2d; \
+ roll $1, right##2d; \
+ xorl right##2d, RW2d; \
+ andl $0xaaaaaaaa, RW2d; \
+ xorl RW2d, left##2d; \
+ xorl RW2d, right##2d; \
+ roll $1, left##2d; \
+ expand_to_64bits(right##2, RT3); \
+ expand_to_64bits(left##2, RT3);
+
+#define final_permutation3(left, right) \
+ compress_to_64bits(right##0); \
+ compress_to_64bits(left##0); \
+ movl right##0d, RW0d; \
+ rorl $1, left##0d; \
+ xorl left##0d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, right##0d; \
+ xorl RW0d, left##0d; \
+ rorl $1, right##0d; \
+ compress_to_64bits(right##1); \
+ compress_to_64bits(left##1); \
+ movl right##1d, RW1d; \
+ rorl $1, left##1d; \
+ xorl left##1d, RW1d; \
+ andl $0xaaaaaaaa, RW1d; \
+ xorl RW1d, right##1d; \
+ xorl RW1d, left##1d; \
+ rorl $1, right##1d; \
+ compress_to_64bits(right##2); \
+ compress_to_64bits(left##2); \
+ movl right##2d, RW2d; \
+ rorl $1, left##2d; \
+ xorl left##2d, RW2d; \
+ andl $0xaaaaaaaa, RW2d; \
+ xorl RW2d, right##2d; \
+ xorl RW2d, left##2d; \
+ rorl $1, right##2d; \
+ \
+ do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \
+ do_permutation(right##0d, left##0d, 2, 0x33333333); \
+ do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \
+ do_permutation(right##1d, left##1d, 2, 0x33333333); \
+ do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \
+ do_permutation(right##2d, left##2d, 2, 0x33333333); \
+ \
+ do_permutation(left##0d, right##0d, 16, 0x0000ffff); \
+ do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \
+ do_permutation(left##1d, right##1d, 16, 0x0000ffff); \
+ do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \
+ do_permutation(left##2d, right##2d, 16, 0x0000ffff); \
+ do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f);
+
+#define round3(n, from, to, load_next_key, do_movq) \
+ xorq from##0, RW0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ shrq $16, RW0; \
+ xorq s8(, RT3, 8), to##0; \
+ xorq s6(, RT1, 8), to##0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ shrq $16, RW0; \
+ xorq s4(, RT3, 8), to##0; \
+ xorq s2(, RT1, 8), to##0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ shrl $16, RW0d; \
+ xorq s7(, RT3, 8), to##0; \
+ xorq s5(, RT1, 8), to##0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ load_next_key(n, RW0); \
+ xorq s3(, RT3, 8), to##0; \
+ xorq s1(, RT1, 8), to##0; \
+ xorq from##1, RW1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ shrq $16, RW1; \
+ xorq s8(, RT3, 8), to##1; \
+ xorq s6(, RT1, 8), to##1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ shrq $16, RW1; \
+ xorq s4(, RT3, 8), to##1; \
+ xorq s2(, RT1, 8), to##1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ shrl $16, RW1d; \
+ xorq s7(, RT3, 8), to##1; \
+ xorq s5(, RT1, 8), to##1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ do_movq(RW0, RW1); \
+ xorq s3(, RT3, 8), to##1; \
+ xorq s1(, RT1, 8), to##1; \
+ xorq from##2, RW2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ shrq $16, RW2; \
+ xorq s8(, RT3, 8), to##2; \
+ xorq s6(, RT1, 8), to##2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ shrq $16, RW2; \
+ xorq s4(, RT3, 8), to##2; \
+ xorq s2(, RT1, 8), to##2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ shrl $16, RW2d; \
+ xorq s7(, RT3, 8), to##2; \
+ xorq s5(, RT1, 8), to##2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ do_movq(RW0, RW2); \
+ xorq s3(, RT3, 8), to##2; \
+ xorq s1(, RT1, 8), to##2;
+
+#define __movq(src, dst) \
+ movq src, dst;
+
+ENTRY(des3_ede_x86_64_crypt_blk_3way)
+ /* input:
+ * %rdi: ctx, round keys
+ * %rsi: dst (3 blocks)
+ * %rdx: src (3 blocks)
+ */
+
+ pushq %rbp;
+ pushq %rbx;
+ pushq %r12;
+ pushq %r13;
+ pushq %r14;
+ pushq %r15;
+
+ /* load input */
+ movl 0 * 4(%rdx), RL0d;
+ movl 1 * 4(%rdx), RR0d;
+ movl 2 * 4(%rdx), RL1d;
+ movl 3 * 4(%rdx), RR1d;
+ movl 4 * 4(%rdx), RL2d;
+ movl 5 * 4(%rdx), RR2d;
+
+ bswapl RL0d;
+ bswapl RR0d;
+ bswapl RL1d;
+ bswapl RR1d;
+ bswapl RL2d;
+ bswapl RR2d;
+
+ initial_permutation3(RL, RR);
+
+ movq 0(CTX), RW0;
+ movq RW0, RW1;
+ movq RW0, RW2;
+
+ round3(0, RR, RL, load_next_key, __movq);
+ round3(1, RL, RR, load_next_key, __movq);
+ round3(2, RR, RL, load_next_key, __movq);
+ round3(3, RL, RR, load_next_key, __movq);
+ round3(4, RR, RL, load_next_key, __movq);
+ round3(5, RL, RR, load_next_key, __movq);
+ round3(6, RR, RL, load_next_key, __movq);
+ round3(7, RL, RR, load_next_key, __movq);
+ round3(8, RR, RL, load_next_key, __movq);
+ round3(9, RL, RR, load_next_key, __movq);
+ round3(10, RR, RL, load_next_key, __movq);
+ round3(11, RL, RR, load_next_key, __movq);
+ round3(12, RR, RL, load_next_key, __movq);
+ round3(13, RL, RR, load_next_key, __movq);
+ round3(14, RR, RL, load_next_key, __movq);
+ round3(15, RL, RR, load_next_key, __movq);
+
+ round3(16+0, RL, RR, load_next_key, __movq);
+ round3(16+1, RR, RL, load_next_key, __movq);
+ round3(16+2, RL, RR, load_next_key, __movq);
+ round3(16+3, RR, RL, load_next_key, __movq);
+ round3(16+4, RL, RR, load_next_key, __movq);
+ round3(16+5, RR, RL, load_next_key, __movq);
+ round3(16+6, RL, RR, load_next_key, __movq);
+ round3(16+7, RR, RL, load_next_key, __movq);
+ round3(16+8, RL, RR, load_next_key, __movq);
+ round3(16+9, RR, RL, load_next_key, __movq);
+ round3(16+10, RL, RR, load_next_key, __movq);
+ round3(16+11, RR, RL, load_next_key, __movq);
+ round3(16+12, RL, RR, load_next_key, __movq);
+ round3(16+13, RR, RL, load_next_key, __movq);
+ round3(16+14, RL, RR, load_next_key, __movq);
+ round3(16+15, RR, RL, load_next_key, __movq);
+
+ round3(32+0, RR, RL, load_next_key, __movq);
+ round3(32+1, RL, RR, load_next_key, __movq);
+ round3(32+2, RR, RL, load_next_key, __movq);
+ round3(32+3, RL, RR, load_next_key, __movq);
+ round3(32+4, RR, RL, load_next_key, __movq);
+ round3(32+5, RL, RR, load_next_key, __movq);
+ round3(32+6, RR, RL, load_next_key, __movq);
+ round3(32+7, RL, RR, load_next_key, __movq);
+ round3(32+8, RR, RL, load_next_key, __movq);
+ round3(32+9, RL, RR, load_next_key, __movq);
+ round3(32+10, RR, RL, load_next_key, __movq);
+ round3(32+11, RL, RR, load_next_key, __movq);
+ round3(32+12, RR, RL, load_next_key, __movq);
+ round3(32+13, RL, RR, load_next_key, __movq);
+ round3(32+14, RR, RL, load_next_key, __movq);
+ round3(32+15, RL, RR, dummy2, dummy2);
+
+ final_permutation3(RR, RL);
+
+ bswapl RR0d;
+ bswapl RL0d;
+ bswapl RR1d;
+ bswapl RL1d;
+ bswapl RR2d;
+ bswapl RL2d;
+
+ movl RR0d, 0 * 4(%rsi);
+ movl RL0d, 1 * 4(%rsi);
+ movl RR1d, 2 * 4(%rsi);
+ movl RL1d, 3 * 4(%rsi);
+ movl RR2d, 4 * 4(%rsi);
+ movl RL2d, 5 * 4(%rsi);
+
+ popq %r15;
+ popq %r14;
+ popq %r13;
+ popq %r12;
+ popq %rbx;
+ popq %rbp;
+
+ ret;
+ENDPROC(des3_ede_x86_64_crypt_blk_3way)
+
+.data
+.align 16
+.L_s1:
+ .quad 0x0010100001010400, 0x0000000000000000
+ .quad 0x0000100000010000, 0x0010100001010404
+ .quad 0x0010100001010004, 0x0000100000010404
+ .quad 0x0000000000000004, 0x0000100000010000
+ .quad 0x0000000000000400, 0x0010100001010400
+ .quad 0x0010100001010404, 0x0000000000000400
+ .quad 0x0010000001000404, 0x0010100001010004
+ .quad 0x0010000001000000, 0x0000000000000004
+ .quad 0x0000000000000404, 0x0010000001000400
+ .quad 0x0010000001000400, 0x0000100000010400
+ .quad 0x0000100000010400, 0x0010100001010000
+ .quad 0x0010100001010000, 0x0010000001000404
+ .quad 0x0000100000010004, 0x0010000001000004
+ .quad 0x0010000001000004, 0x0000100000010004
+ .quad 0x0000000000000000, 0x0000000000000404
+ .quad 0x0000100000010404, 0x0010000001000000
+ .quad 0x0000100000010000, 0x0010100001010404
+ .quad 0x0000000000000004, 0x0010100001010000
+ .quad 0x0010100001010400, 0x0010000001000000
+ .quad 0x0010000001000000, 0x0000000000000400
+ .quad 0x0010100001010004, 0x0000100000010000
+ .quad 0x0000100000010400, 0x0010000001000004
+ .quad 0x0000000000000400, 0x0000000000000004
+ .quad 0x0010000001000404, 0x0000100000010404
+ .quad 0x0010100001010404, 0x0000100000010004
+ .quad 0x0010100001010000, 0x0010000001000404
+ .quad 0x0010000001000004, 0x0000000000000404
+ .quad 0x0000100000010404, 0x0010100001010400
+ .quad 0x0000000000000404, 0x0010000001000400
+ .quad 0x0010000001000400, 0x0000000000000000
+ .quad 0x0000100000010004, 0x0000100000010400
+ .quad 0x0000000000000000, 0x0010100001010004
+.L_s2:
+ .quad 0x0801080200100020, 0x0800080000000000
+ .quad 0x0000080000000000, 0x0001080200100020
+ .quad 0x0001000000100000, 0x0000000200000020
+ .quad 0x0801000200100020, 0x0800080200000020
+ .quad 0x0800000200000020, 0x0801080200100020
+ .quad 0x0801080000100000, 0x0800000000000000
+ .quad 0x0800080000000000, 0x0001000000100000
+ .quad 0x0000000200000020, 0x0801000200100020
+ .quad 0x0001080000100000, 0x0001000200100020
+ .quad 0x0800080200000020, 0x0000000000000000
+ .quad 0x0800000000000000, 0x0000080000000000
+ .quad 0x0001080200100020, 0x0801000000100000
+ .quad 0x0001000200100020, 0x0800000200000020
+ .quad 0x0000000000000000, 0x0001080000100000
+ .quad 0x0000080200000020, 0x0801080000100000
+ .quad 0x0801000000100000, 0x0000080200000020
+ .quad 0x0000000000000000, 0x0001080200100020
+ .quad 0x0801000200100020, 0x0001000000100000
+ .quad 0x0800080200000020, 0x0801000000100000
+ .quad 0x0801080000100000, 0x0000080000000000
+ .quad 0x0801000000100000, 0x0800080000000000
+ .quad 0x0000000200000020, 0x0801080200100020
+ .quad 0x0001080200100020, 0x0000000200000020
+ .quad 0x0000080000000000, 0x0800000000000000
+ .quad 0x0000080200000020, 0x0801080000100000
+ .quad 0x0001000000100000, 0x0800000200000020
+ .quad 0x0001000200100020, 0x0800080200000020
+ .quad 0x0800000200000020, 0x0001000200100020
+ .quad 0x0001080000100000, 0x0000000000000000
+ .quad 0x0800080000000000, 0x0000080200000020
+ .quad 0x0800000000000000, 0x0801000200100020
+ .quad 0x0801080200100020, 0x0001080000100000
+.L_s3:
+ .quad 0x0000002000000208, 0x0000202008020200
+ .quad 0x0000000000000000, 0x0000200008020008
+ .quad 0x0000002008000200, 0x0000000000000000
+ .quad 0x0000202000020208, 0x0000002008000200
+ .quad 0x0000200000020008, 0x0000000008000008
+ .quad 0x0000000008000008, 0x0000200000020000
+ .quad 0x0000202008020208, 0x0000200000020008
+ .quad 0x0000200008020000, 0x0000002000000208
+ .quad 0x0000000008000000, 0x0000000000000008
+ .quad 0x0000202008020200, 0x0000002000000200
+ .quad 0x0000202000020200, 0x0000200008020000
+ .quad 0x0000200008020008, 0x0000202000020208
+ .quad 0x0000002008000208, 0x0000202000020200
+ .quad 0x0000200000020000, 0x0000002008000208
+ .quad 0x0000000000000008, 0x0000202008020208
+ .quad 0x0000002000000200, 0x0000000008000000
+ .quad 0x0000202008020200, 0x0000000008000000
+ .quad 0x0000200000020008, 0x0000002000000208
+ .quad 0x0000200000020000, 0x0000202008020200
+ .quad 0x0000002008000200, 0x0000000000000000
+ .quad 0x0000002000000200, 0x0000200000020008
+ .quad 0x0000202008020208, 0x0000002008000200
+ .quad 0x0000000008000008, 0x0000002000000200
+ .quad 0x0000000000000000, 0x0000200008020008
+ .quad 0x0000002008000208, 0x0000200000020000
+ .quad 0x0000000008000000, 0x0000202008020208
+ .quad 0x0000000000000008, 0x0000202000020208
+ .quad 0x0000202000020200, 0x0000000008000008
+ .quad 0x0000200008020000, 0x0000002008000208
+ .quad 0x0000002000000208, 0x0000200008020000
+ .quad 0x0000202000020208, 0x0000000000000008
+ .quad 0x0000200008020008, 0x0000202000020200
+.L_s4:
+ .quad 0x1008020000002001, 0x1000020800002001
+ .quad 0x1000020800002001, 0x0000000800000000
+ .quad 0x0008020800002000, 0x1008000800000001
+ .quad 0x1008000000000001, 0x1000020000002001
+ .quad 0x0000000000000000, 0x0008020000002000
+ .quad 0x0008020000002000, 0x1008020800002001
+ .quad 0x1000000800000001, 0x0000000000000000
+ .quad 0x0008000800000000, 0x1008000000000001
+ .quad 0x1000000000000001, 0x0000020000002000
+ .quad 0x0008000000000000, 0x1008020000002001
+ .quad 0x0000000800000000, 0x0008000000000000
+ .quad 0x1000020000002001, 0x0000020800002000
+ .quad 0x1008000800000001, 0x1000000000000001
+ .quad 0x0000020800002000, 0x0008000800000000
+ .quad 0x0000020000002000, 0x0008020800002000
+ .quad 0x1008020800002001, 0x1000000800000001
+ .quad 0x0008000800000000, 0x1008000000000001
+ .quad 0x0008020000002000, 0x1008020800002001
+ .quad 0x1000000800000001, 0x0000000000000000
+ .quad 0x0000000000000000, 0x0008020000002000
+ .quad 0x0000020800002000, 0x0008000800000000
+ .quad 0x1008000800000001, 0x1000000000000001
+ .quad 0x1008020000002001, 0x1000020800002001
+ .quad 0x1000020800002001, 0x0000000800000000
+ .quad 0x1008020800002001, 0x1000000800000001
+ .quad 0x1000000000000001, 0x0000020000002000
+ .quad 0x1008000000000001, 0x1000020000002001
+ .quad 0x0008020800002000, 0x1008000800000001
+ .quad 0x1000020000002001, 0x0000020800002000
+ .quad 0x0008000000000000, 0x1008020000002001
+ .quad 0x0000000800000000, 0x0008000000000000
+ .quad 0x0000020000002000, 0x0008020800002000
+.L_s5:
+ .quad 0x0000001000000100, 0x0020001002080100
+ .quad 0x0020000002080000, 0x0420001002000100
+ .quad 0x0000000000080000, 0x0000001000000100
+ .quad 0x0400000000000000, 0x0020000002080000
+ .quad 0x0400001000080100, 0x0000000000080000
+ .quad 0x0020001002000100, 0x0400001000080100
+ .quad 0x0420001002000100, 0x0420000002080000
+ .quad 0x0000001000080100, 0x0400000000000000
+ .quad 0x0020000002000000, 0x0400000000080000
+ .quad 0x0400000000080000, 0x0000000000000000
+ .quad 0x0400001000000100, 0x0420001002080100
+ .quad 0x0420001002080100, 0x0020001002000100
+ .quad 0x0420000002080000, 0x0400001000000100
+ .quad 0x0000000000000000, 0x0420000002000000
+ .quad 0x0020001002080100, 0x0020000002000000
+ .quad 0x0420000002000000, 0x0000001000080100
+ .quad 0x0000000000080000, 0x0420001002000100
+ .quad 0x0000001000000100, 0x0020000002000000
+ .quad 0x0400000000000000, 0x0020000002080000
+ .quad 0x0420001002000100, 0x0400001000080100
+ .quad 0x0020001002000100, 0x0400000000000000
+ .quad 0x0420000002080000, 0x0020001002080100
+ .quad 0x0400001000080100, 0x0000001000000100
+ .quad 0x0020000002000000, 0x0420000002080000
+ .quad 0x0420001002080100, 0x0000001000080100
+ .quad 0x0420000002000000, 0x0420001002080100
+ .quad 0x0020000002080000, 0x0000000000000000
+ .quad 0x0400000000080000, 0x0420000002000000
+ .quad 0x0000001000080100, 0x0020001002000100
+ .quad 0x0400001000000100, 0x0000000000080000
+ .quad 0x0000000000000000, 0x0400000000080000
+ .quad 0x0020001002080100, 0x0400001000000100
+.L_s6:
+ .quad 0x0200000120000010, 0x0204000020000000
+ .quad 0x0000040000000000, 0x0204040120000010
+ .quad 0x0204000020000000, 0x0000000100000010
+ .quad 0x0204040120000010, 0x0004000000000000
+ .quad 0x0200040020000000, 0x0004040100000010
+ .quad 0x0004000000000000, 0x0200000120000010
+ .quad 0x0004000100000010, 0x0200040020000000
+ .quad 0x0200000020000000, 0x0000040100000010
+ .quad 0x0000000000000000, 0x0004000100000010
+ .quad 0x0200040120000010, 0x0000040000000000
+ .quad 0x0004040000000000, 0x0200040120000010
+ .quad 0x0000000100000010, 0x0204000120000010
+ .quad 0x0204000120000010, 0x0000000000000000
+ .quad 0x0004040100000010, 0x0204040020000000
+ .quad 0x0000040100000010, 0x0004040000000000
+ .quad 0x0204040020000000, 0x0200000020000000
+ .quad 0x0200040020000000, 0x0000000100000010
+ .quad 0x0204000120000010, 0x0004040000000000
+ .quad 0x0204040120000010, 0x0004000000000000
+ .quad 0x0000040100000010, 0x0200000120000010
+ .quad 0x0004000000000000, 0x0200040020000000
+ .quad 0x0200000020000000, 0x0000040100000010
+ .quad 0x0200000120000010, 0x0204040120000010
+ .quad 0x0004040000000000, 0x0204000020000000
+ .quad 0x0004040100000010, 0x0204040020000000
+ .quad 0x0000000000000000, 0x0204000120000010
+ .quad 0x0000000100000010, 0x0000040000000000
+ .quad 0x0204000020000000, 0x0004040100000010
+ .quad 0x0000040000000000, 0x0004000100000010
+ .quad 0x0200040120000010, 0x0000000000000000
+ .quad 0x0204040020000000, 0x0200000020000000
+ .quad 0x0004000100000010, 0x0200040120000010
+.L_s7:
+ .quad 0x0002000000200000, 0x2002000004200002
+ .quad 0x2000000004000802, 0x0000000000000000
+ .quad 0x0000000000000800, 0x2000000004000802
+ .quad 0x2002000000200802, 0x0002000004200800
+ .quad 0x2002000004200802, 0x0002000000200000
+ .quad 0x0000000000000000, 0x2000000004000002
+ .quad 0x2000000000000002, 0x0000000004000000
+ .quad 0x2002000004200002, 0x2000000000000802
+ .quad 0x0000000004000800, 0x2002000000200802
+ .quad 0x2002000000200002, 0x0000000004000800
+ .quad 0x2000000004000002, 0x0002000004200000
+ .quad 0x0002000004200800, 0x2002000000200002
+ .quad 0x0002000004200000, 0x0000000000000800
+ .quad 0x2000000000000802, 0x2002000004200802
+ .quad 0x0002000000200800, 0x2000000000000002
+ .quad 0x0000000004000000, 0x0002000000200800
+ .quad 0x0000000004000000, 0x0002000000200800
+ .quad 0x0002000000200000, 0x2000000004000802
+ .quad 0x2000000004000802, 0x2002000004200002
+ .quad 0x2002000004200002, 0x2000000000000002
+ .quad 0x2002000000200002, 0x0000000004000000
+ .quad 0x0000000004000800, 0x0002000000200000
+ .quad 0x0002000004200800, 0x2000000000000802
+ .quad 0x2002000000200802, 0x0002000004200800
+ .quad 0x2000000000000802, 0x2000000004000002
+ .quad 0x2002000004200802, 0x0002000004200000
+ .quad 0x0002000000200800, 0x0000000000000000
+ .quad 0x2000000000000002, 0x2002000004200802
+ .quad 0x0000000000000000, 0x2002000000200802
+ .quad 0x0002000004200000, 0x0000000000000800
+ .quad 0x2000000004000002, 0x0000000004000800
+ .quad 0x0000000000000800, 0x2002000000200002
+.L_s8:
+ .quad 0x0100010410001000, 0x0000010000001000
+ .quad 0x0000000000040000, 0x0100010410041000
+ .quad 0x0100000010000000, 0x0100010410001000
+ .quad 0x0000000400000000, 0x0100000010000000
+ .quad 0x0000000400040000, 0x0100000010040000
+ .quad 0x0100010410041000, 0x0000010000041000
+ .quad 0x0100010010041000, 0x0000010400041000
+ .quad 0x0000010000001000, 0x0000000400000000
+ .quad 0x0100000010040000, 0x0100000410000000
+ .quad 0x0100010010001000, 0x0000010400001000
+ .quad 0x0000010000041000, 0x0000000400040000
+ .quad 0x0100000410040000, 0x0100010010041000
+ .quad 0x0000010400001000, 0x0000000000000000
+ .quad 0x0000000000000000, 0x0100000410040000
+ .quad 0x0100000410000000, 0x0100010010001000
+ .quad 0x0000010400041000, 0x0000000000040000
+ .quad 0x0000010400041000, 0x0000000000040000
+ .quad 0x0100010010041000, 0x0000010000001000
+ .quad 0x0000000400000000, 0x0100000410040000
+ .quad 0x0000010000001000, 0x0000010400041000
+ .quad 0x0100010010001000, 0x0000000400000000
+ .quad 0x0100000410000000, 0x0100000010040000
+ .quad 0x0100000410040000, 0x0100000010000000
+ .quad 0x0000000000040000, 0x0100010410001000
+ .quad 0x0000000000000000, 0x0100010410041000
+ .quad 0x0000000400040000, 0x0100000410000000
+ .quad 0x0100000010040000, 0x0100010010001000
+ .quad 0x0100010410001000, 0x0000000000000000
+ .quad 0x0100010410041000, 0x0000010000041000
+ .quad 0x0000010000041000, 0x0000010400001000
+ .quad 0x0000010400001000, 0x0000000400040000
+ .quad 0x0100000010000000, 0x0100010010041000
diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c
new file mode 100644
index 000000000000..0e9c0668fe4e
--- /dev/null
+++ b/arch/x86/crypto/des3_ede_glue.c
@@ -0,0 +1,509 @@
+/*
+ * Glue Code for assembler optimized version of 3DES
+ *
+ * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ * CTR part based on code (crypto/ctr.c) by:
+ * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <asm/processor.h>
+#include <crypto/des.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <crypto/algapi.h>
+
+struct des3_ede_x86_ctx {
+ u32 enc_expkey[DES3_EDE_EXPKEY_WORDS];
+ u32 dec_expkey[DES3_EDE_EXPKEY_WORDS];
+};
+
+/* regular block cipher functions */
+asmlinkage void des3_ede_x86_64_crypt_blk(const u32 *expkey, u8 *dst,
+ const u8 *src);
+
+/* 3-way parallel cipher functions */
+asmlinkage void des3_ede_x86_64_crypt_blk_3way(const u32 *expkey, u8 *dst,
+ const u8 *src);
+
+static inline void des3_ede_enc_blk(struct des3_ede_x86_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ u32 *enc_ctx = ctx->enc_expkey;
+
+ des3_ede_x86_64_crypt_blk(enc_ctx, dst, src);
+}
+
+static inline void des3_ede_dec_blk(struct des3_ede_x86_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ u32 *dec_ctx = ctx->dec_expkey;
+
+ des3_ede_x86_64_crypt_blk(dec_ctx, dst, src);
+}
+
+static inline void des3_ede_enc_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ u32 *enc_ctx = ctx->enc_expkey;
+
+ des3_ede_x86_64_crypt_blk_3way(enc_ctx, dst, src);
+}
+
+static inline void des3_ede_dec_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ u32 *dec_ctx = ctx->dec_expkey;
+
+ des3_ede_x86_64_crypt_blk_3way(dec_ctx, dst, src);
+}
+
+static void des3_ede_x86_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ des3_ede_enc_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static void des3_ede_x86_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ des3_ede_dec_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
+ const u32 *expkey)
+{
+ unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+ unsigned int nbytes;
+ int err;
+
+ err = blkcipher_walk_virt(desc, walk);
+
+ while ((nbytes = walk->nbytes)) {
+ u8 *wsrc = walk->src.virt.addr;
+ u8 *wdst = walk->dst.virt.addr;
+
+ /* Process four block batch */
+ if (nbytes >= bsize * 3) {
+ do {
+ des3_ede_x86_64_crypt_blk_3way(expkey, wdst,
+ wsrc);
+
+ wsrc += bsize * 3;
+ wdst += bsize * 3;
+ nbytes -= bsize * 3;
+ } while (nbytes >= bsize * 3);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ /* Handle leftovers */
+ do {
+ des3_ede_x86_64_crypt_blk(expkey, wdst, wsrc);
+
+ wsrc += bsize;
+ wdst += bsize;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+done:
+ err = blkcipher_walk_done(desc, walk, nbytes);
+ }
+
+ return err;
+}
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ return ecb_crypt(desc, &walk, ctx->enc_expkey);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ return ecb_crypt(desc, &walk, ctx->dec_expkey);
+}
+
+static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u64 *src = (u64 *)walk->src.virt.addr;
+ u64 *dst = (u64 *)walk->dst.virt.addr;
+ u64 *iv = (u64 *)walk->iv;
+
+ do {
+ *dst = *src ^ *iv;
+ des3_ede_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
+ iv = dst;
+
+ src += 1;
+ dst += 1;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+ *(u64 *)walk->iv = *iv;
+ return nbytes;
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+
+ while ((nbytes = walk.nbytes)) {
+ nbytes = __cbc_encrypt(desc, &walk);
+ err = blkcipher_walk_done(desc, &walk, nbytes);
+ }
+
+ return err;
+}
+
+static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u64 *src = (u64 *)walk->src.virt.addr;
+ u64 *dst = (u64 *)walk->dst.virt.addr;
+ u64 ivs[3 - 1];
+ u64 last_iv;
+
+ /* Start of the last block. */
+ src += nbytes / bsize - 1;
+ dst += nbytes / bsize - 1;
+
+ last_iv = *src;
+
+ /* Process four block batch */
+ if (nbytes >= bsize * 3) {
+ do {
+ nbytes -= bsize * 3 - bsize;
+ src -= 3 - 1;
+ dst -= 3 - 1;
+
+ ivs[0] = src[0];
+ ivs[1] = src[1];
+
+ des3_ede_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src);
+
+ dst[1] ^= ivs[0];
+ dst[2] ^= ivs[1];
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ goto done;
+
+ *dst ^= *(src - 1);
+ src -= 1;
+ dst -= 1;
+ } while (nbytes >= bsize * 3);
+ }
+
+ /* Handle leftovers */
+ for (;;) {
+ des3_ede_dec_blk(ctx, (u8 *)dst, (u8 *)src);
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ break;
+
+ *dst ^= *(src - 1);
+ src -= 1;
+ dst -= 1;
+ }
+
+done:
+ *dst ^= *(u64 *)walk->iv;
+ *(u64 *)walk->iv = last_iv;
+
+ return nbytes;
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+
+ while ((nbytes = walk.nbytes)) {
+ nbytes = __cbc_decrypt(desc, &walk);
+ err = blkcipher_walk_done(desc, &walk, nbytes);
+ }
+
+ return err;
+}
+
+static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx,
+ struct blkcipher_walk *walk)
+{
+ u8 *ctrblk = walk->iv;
+ u8 keystream[DES3_EDE_BLOCK_SIZE];
+ u8 *src = walk->src.virt.addr;
+ u8 *dst = walk->dst.virt.addr;
+ unsigned int nbytes = walk->nbytes;
+
+ des3_ede_enc_blk(ctx, keystream, ctrblk);
+ crypto_xor(keystream, src, nbytes);
+ memcpy(dst, keystream, nbytes);
+
+ crypto_inc(ctrblk, DES3_EDE_BLOCK_SIZE);
+}
+
+static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ __be64 *src = (__be64 *)walk->src.virt.addr;
+ __be64 *dst = (__be64 *)walk->dst.virt.addr;
+ u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
+ __be64 ctrblocks[3];
+
+ /* Process four block batch */
+ if (nbytes >= bsize * 3) {
+ do {
+ /* create ctrblks for parallel encrypt */
+ ctrblocks[0] = cpu_to_be64(ctrblk++);
+ ctrblocks[1] = cpu_to_be64(ctrblk++);
+ ctrblocks[2] = cpu_to_be64(ctrblk++);
+
+ des3_ede_enc_blk_3way(ctx, (u8 *)ctrblocks,
+ (u8 *)ctrblocks);
+
+ dst[0] = src[0] ^ ctrblocks[0];
+ dst[1] = src[1] ^ ctrblocks[1];
+ dst[2] = src[2] ^ ctrblocks[2];
+
+ src += 3;
+ dst += 3;
+ } while ((nbytes -= bsize * 3) >= bsize * 3);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ /* Handle leftovers */
+ do {
+ ctrblocks[0] = cpu_to_be64(ctrblk++);
+
+ des3_ede_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
+
+ dst[0] = src[0] ^ ctrblocks[0];
+
+ src += 1;
+ dst += 1;
+ } while ((nbytes -= bsize) >= bsize);
+
+done:
+ *(__be64 *)walk->iv = cpu_to_be64(ctrblk);
+ return nbytes;
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt_block(desc, &walk, DES3_EDE_BLOCK_SIZE);
+
+ while ((nbytes = walk.nbytes) >= DES3_EDE_BLOCK_SIZE) {
+ nbytes = __ctr_crypt(desc, &walk);
+ err = blkcipher_walk_done(desc, &walk, nbytes);
+ }
+
+ if (walk.nbytes) {
+ ctr_crypt_final(crypto_blkcipher_ctx(desc->tfm), &walk);
+ err = blkcipher_walk_done(desc, &walk, 0);
+ }
+
+ return err;
+}
+
+static int des3_ede_x86_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct des3_ede_x86_ctx *ctx = crypto_tfm_ctx(tfm);
+ u32 i, j, tmp;
+ int err;
+
+ /* Generate encryption context using generic implementation. */
+ err = __des3_ede_setkey(ctx->enc_expkey, &tfm->crt_flags, key, keylen);
+ if (err < 0)
+ return err;
+
+ /* Fix encryption context for this implementation and form decryption
+ * context. */
+ j = DES3_EDE_EXPKEY_WORDS - 2;
+ for (i = 0; i < DES3_EDE_EXPKEY_WORDS; i += 2, j -= 2) {
+ tmp = ror32(ctx->enc_expkey[i + 1], 4);
+ ctx->enc_expkey[i + 1] = tmp;
+
+ ctx->dec_expkey[j + 0] = ctx->enc_expkey[i + 0];
+ ctx->dec_expkey[j + 1] = tmp;
+ }
+
+ return 0;
+}
+
+static struct crypto_alg des3_ede_algs[4] = { {
+ .cra_name = "des3_ede",
+ .cra_driver_name = "des3_ede-asm",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = DES3_EDE_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
+ .cra_alignmask = 0,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = DES3_EDE_KEY_SIZE,
+ .cia_max_keysize = DES3_EDE_KEY_SIZE,
+ .cia_setkey = des3_ede_x86_setkey,
+ .cia_encrypt = des3_ede_x86_encrypt,
+ .cia_decrypt = des3_ede_x86_decrypt,
+ }
+ }
+}, {
+ .cra_name = "ecb(des3_ede)",
+ .cra_driver_name = "ecb-des3_ede-asm",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = DES3_EDE_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = DES3_EDE_KEY_SIZE,
+ .max_keysize = DES3_EDE_KEY_SIZE,
+ .setkey = des3_ede_x86_setkey,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ },
+ },
+}, {
+ .cra_name = "cbc(des3_ede)",
+ .cra_driver_name = "cbc-des3_ede-asm",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = DES3_EDE_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = DES3_EDE_KEY_SIZE,
+ .max_keysize = DES3_EDE_KEY_SIZE,
+ .ivsize = DES3_EDE_BLOCK_SIZE,
+ .setkey = des3_ede_x86_setkey,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ },
+ },
+}, {
+ .cra_name = "ctr(des3_ede)",
+ .cra_driver_name = "ctr-des3_ede-asm",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = DES3_EDE_KEY_SIZE,
+ .max_keysize = DES3_EDE_KEY_SIZE,
+ .ivsize = DES3_EDE_BLOCK_SIZE,
+ .setkey = des3_ede_x86_setkey,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ },
+ },
+} };
+
+static bool is_blacklisted_cpu(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ if (boot_cpu_data.x86 == 0x0f) {
+ /*
+ * On Pentium 4, des3_ede-x86_64 is slower than generic C
+ * implementation because use of 64bit rotates (which are really
+ * slow on P4). Therefore blacklist P4s.
+ */
+ return true;
+ }
+
+ return false;
+}
+
+static int force;
+module_param(force, int, 0);
+MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
+
+static int __init des3_ede_x86_init(void)
+{
+ if (!force && is_blacklisted_cpu()) {
+ pr_info("des3_ede-x86_64: performance on this CPU would be suboptimal: disabling des3_ede-x86_64.\n");
+ return -ENODEV;
+ }
+
+ return crypto_register_algs(des3_ede_algs, ARRAY_SIZE(des3_ede_algs));
+}
+
+static void __exit des3_ede_x86_fini(void)
+{
+ crypto_unregister_algs(des3_ede_algs, ARRAY_SIZE(des3_ede_algs));
+}
+
+module_init(des3_ede_x86_init);
+module_exit(des3_ede_x86_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Triple DES EDE Cipher Algorithm, asm optimized");
+MODULE_ALIAS("des3_ede");
+MODULE_ALIAS("des3_ede-asm");
+MODULE_ALIAS("des");
+MODULE_ALIAS("des-asm");
+MODULE_AUTHOR("Jussi Kivilinna <jussi.kivilinna@iki.fi>");
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
index 185fad49d86f..5d1e0075ac24 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -92,7 +92,7 @@ __clmul_gf128mul_ble:
ret
ENDPROC(__clmul_gf128mul_ble)
-/* void clmul_ghash_mul(char *dst, const be128 *shash) */
+/* void clmul_ghash_mul(char *dst, const u128 *shash) */
ENTRY(clmul_ghash_mul)
movups (%rdi), DATA
movups (%rsi), SHASH
@@ -106,7 +106,7 @@ ENDPROC(clmul_ghash_mul)
/*
* void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
- * const be128 *shash);
+ * const u128 *shash);
*/
ENTRY(clmul_ghash_update)
cmp $16, %rdx
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index d785cf2c529c..88bb7ba8b175 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -25,17 +25,17 @@
#define GHASH_BLOCK_SIZE 16
#define GHASH_DIGEST_SIZE 16
-void clmul_ghash_mul(char *dst, const be128 *shash);
+void clmul_ghash_mul(char *dst, const u128 *shash);
void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
- const be128 *shash);
+ const u128 *shash);
struct ghash_async_ctx {
struct cryptd_ahash *cryptd_tfm;
};
struct ghash_ctx {
- be128 shash;
+ u128 shash;
};
struct ghash_desc_ctx {
@@ -68,11 +68,11 @@ static int ghash_setkey(struct crypto_shash *tfm,
a = be64_to_cpu(x->a);
b = be64_to_cpu(x->b);
- ctx->shash.a = (__be64)((b << 1) | (a >> 63));
- ctx->shash.b = (__be64)((a << 1) | (b >> 63));
+ ctx->shash.a = (b << 1) | (a >> 63);
+ ctx->shash.b = (a << 1) | (b >> 63);
if (a >> 63)
- ctx->shash.b ^= cpu_to_be64(0xc2);
+ ctx->shash.b ^= ((u64)0xc2) << 56;
return 0;
}
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index f30cd10293f0..8626b03e83b7 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -141,7 +141,7 @@ static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
/* save number of bits */
bits[1] = cpu_to_be64(sctx->count[0] << 3);
- bits[0] = cpu_to_be64(sctx->count[1] << 3) | sctx->count[0] >> 61;
+ bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
/* Pad out to 112 mod 128 and append length */
index = sctx->count[0] & 0x7f;
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 220675795e08..f9e181aaba97 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -383,8 +383,8 @@ int ia32_setup_frame(int sig, struct ksignal *ksig,
} else {
/* Return stub is in 32bit vsyscall page */
if (current->mm->context.vdso)
- restorer = VDSO32_SYMBOL(current->mm->context.vdso,
- sigreturn);
+ restorer = current->mm->context.vdso +
+ selected_vdso32->sym___kernel_sigreturn;
else
restorer = &frame->retcode;
}
@@ -462,8 +462,8 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
if (ksig->ka.sa.sa_flags & SA_RESTORER)
restorer = ksig->ka.sa.sa_restorer;
else
- restorer = VDSO32_SYMBOL(current->mm->context.vdso,
- rt_sigreturn);
+ restorer = current->mm->context.vdso +
+ selected_vdso32->sym___kernel_rt_sigreturn;
put_user_ex(ptr_to_compat(restorer), &frame->pretcode);
/*
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 3ca9762e1649..3bf000fab0ae 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -5,6 +5,7 @@ genhdr-y += unistd_64.h
genhdr-y += unistd_x32.h
generic-y += clkdev.h
-generic-y += early_ioremap.h
generic-y += cputime.h
+generic-y += early_ioremap.h
generic-y += mcs_spinlock.h
+generic-y += scatterlist.h
diff --git a/arch/x86/include/asm/acenv.h b/arch/x86/include/asm/acenv.h
new file mode 100644
index 000000000000..1b010a859b8b
--- /dev/null
+++ b/arch/x86/include/asm/acenv.h
@@ -0,0 +1,45 @@
+/*
+ * X86 specific ACPICA environments and implementation
+ *
+ * Copyright (C) 2014, Intel Corporation
+ * Author: Lv Zheng <lv.zheng@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_X86_ACENV_H
+#define _ASM_X86_ACENV_H
+
+#include <asm/special_insns.h>
+
+/* Asm macros */
+
+#define ACPI_FLUSH_CPU_CACHE() wbinvd()
+
+int __acpi_acquire_global_lock(unsigned int *lock);
+int __acpi_release_global_lock(unsigned int *lock);
+
+#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq) \
+ ((Acq) = __acpi_acquire_global_lock(&facs->global_lock))
+
+#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq) \
+ ((Acq) = __acpi_release_global_lock(&facs->global_lock))
+
+/*
+ * Math helper asm macros
+ */
+#define ACPI_DIV_64_BY_32(n_hi, n_lo, d32, q32, r32) \
+ asm("divl %2;" \
+ : "=a"(q32), "=d"(r32) \
+ : "r"(d32), \
+ "0"(n_lo), "1"(n_hi))
+
+#define ACPI_SHIFT_RIGHT_64(n_hi, n_lo) \
+ asm("shrl $1,%2 ;" \
+ "rcrl $1,%3;" \
+ : "=r"(n_hi), "=r"(n_lo) \
+ : "0"(n_hi), "1"(n_lo))
+
+#endif /* _ASM_X86_ACENV_H */
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index c8c1e700c26e..0ab4f9fd2687 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -32,51 +32,6 @@
#include <asm/mpspec.h>
#include <asm/realmode.h>
-#define COMPILER_DEPENDENT_INT64 long long
-#define COMPILER_DEPENDENT_UINT64 unsigned long long
-
-/*
- * Calling conventions:
- *
- * ACPI_SYSTEM_XFACE - Interfaces to host OS (handlers, threads)
- * ACPI_EXTERNAL_XFACE - External ACPI interfaces
- * ACPI_INTERNAL_XFACE - Internal ACPI interfaces
- * ACPI_INTERNAL_VAR_XFACE - Internal variable-parameter list interfaces
- */
-#define ACPI_SYSTEM_XFACE
-#define ACPI_EXTERNAL_XFACE
-#define ACPI_INTERNAL_XFACE
-#define ACPI_INTERNAL_VAR_XFACE
-
-/* Asm macros */
-
-#define ACPI_FLUSH_CPU_CACHE() wbinvd()
-
-int __acpi_acquire_global_lock(unsigned int *lock);
-int __acpi_release_global_lock(unsigned int *lock);
-
-#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq) \
- ((Acq) = __acpi_acquire_global_lock(&facs->global_lock))
-
-#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq) \
- ((Acq) = __acpi_release_global_lock(&facs->global_lock))
-
-/*
- * Math helper asm macros
- */
-#define ACPI_DIV_64_BY_32(n_hi, n_lo, d32, q32, r32) \
- asm("divl %2;" \
- : "=a"(q32), "=d"(r32) \
- : "r"(d32), \
- "0"(n_lo), "1"(n_hi))
-
-
-#define ACPI_SHIFT_RIGHT_64(n_hi, n_lo) \
- asm("shrl $1,%2 ;" \
- "rcrl $1,%3;" \
- : "=r"(n_hi), "=r"(n_lo) \
- : "0"(n_hi), "1"(n_lo))
-
#ifdef CONFIG_ACPI
extern int acpi_lapic;
extern int acpi_ioapic;
@@ -166,6 +121,11 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf)
buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
}
+static inline bool acpi_has_cpu_in_madt(void)
+{
+ return !!acpi_lapic;
+}
+
#else /* !CONFIG_ACPI */
#define acpi_lapic 0
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 19b0ebafcd3e..79752f2bdec5 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -99,7 +99,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v)
{
volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
- alternative_io("movl %0, %1", "xchgl %0, %1", X86_FEATURE_11AP,
+ alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP,
ASM_OUTPUT2("=r" (v), "=m" (*addr)),
ASM_OUTPUT2("0" (v), "m" (*addr)));
}
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 4582e8e1cd1a..7730c1c5c83a 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -57,6 +57,12 @@
.long (from) - . ; \
.long (to) - . + 0x7ffffff0 ; \
.popsection
+
+# define _ASM_NOKPROBE(entry) \
+ .pushsection "_kprobe_blacklist","aw" ; \
+ _ASM_ALIGN ; \
+ _ASM_PTR (entry); \
+ .popsection
#else
# define _ASM_EXTABLE(from,to) \
" .pushsection \"__ex_table\",\"a\"\n" \
@@ -71,6 +77,7 @@
" .long (" #from ") - .\n" \
" .long (" #to ") - . + 0x7ffffff0\n" \
" .popsection\n"
+/* For C file, we already have NOKPROBE_SYMBOL macro */
#endif
#endif /* _ASM_X86_ASM_H */
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index b17f4f48ecd7..6dd1c7dd0473 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -7,6 +7,7 @@
#include <asm/alternative.h>
#include <asm/cmpxchg.h>
#include <asm/rmwcc.h>
+#include <asm/barrier.h>
/*
* Atomic operations that C can't guarantee us. Useful for
@@ -243,12 +244,6 @@ static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
: : "r" ((unsigned)(mask)), "m" (*(addr)) \
: "memory")
-/* Atomic operations are already serializing on x86 */
-#define smp_mb__before_atomic_dec() barrier()
-#define smp_mb__after_atomic_dec() barrier()
-#define smp_mb__before_atomic_inc() barrier()
-#define smp_mb__after_atomic_inc() barrier()
-
#ifdef CONFIG_X86_32
# include <asm/atomic64_32.h>
#else
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 69bbb4845020..0f4460b5636d 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -99,7 +99,7 @@
#if defined(CONFIG_X86_PPRO_FENCE)
/*
- * For either of these options x86 doesn't have a strong TSO memory
+ * For this option x86 doesn't have a strong TSO memory
* model and we should fall back to full barriers.
*/
@@ -137,6 +137,10 @@ do { \
#endif
+/* Atomic operations are already serializing on x86 */
+#define smp_mb__before_atomic() barrier()
+#define smp_mb__after_atomic() barrier()
+
/*
* Stop RDTSC speculation. This is needed when you need to use RDTSC
* (or get_cycles or vread that possibly accesses the TSC) in a defined
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 9fc1af74dc83..afcd35d331de 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -15,6 +15,7 @@
#include <linux/compiler.h>
#include <asm/alternative.h>
#include <asm/rmwcc.h>
+#include <asm/barrier.h>
#if BITS_PER_LONG == 32
# define _BITOPS_LONG_SHIFT 5
@@ -102,7 +103,7 @@ static inline void __set_bit(long nr, volatile unsigned long *addr)
*
* clear_bit() is atomic and may not be reordered. However, it does
* not contain a memory barrier, so if it is used for locking purposes,
- * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
+ * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic()
* in order to ensure changes are visible on other processors.
*/
static __always_inline void
@@ -156,9 +157,6 @@ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
__clear_bit(nr, addr);
}
-#define smp_mb__before_clear_bit() barrier()
-#define smp_mb__after_clear_bit() barrier()
-
/**
* __change_bit - Toggle a bit in memory
* @nr: the bit to change
diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
index e6fd8a026c7b..cd00e1774491 100644
--- a/arch/x86/include/asm/checksum_64.h
+++ b/arch/x86/include/asm/checksum_64.h
@@ -184,8 +184,15 @@ static inline unsigned add32_with_carry(unsigned a, unsigned b)
asm("addl %2,%0\n\t"
"adcl $0,%0"
: "=r" (a)
- : "0" (a), "r" (b));
+ : "0" (a), "rm" (b));
return a;
}
+#define HAVE_ARCH_CSUM_ADD
+static inline __wsum csum_add(__wsum csum, __wsum addend)
+{
+ return (__force __wsum)add32_with_carry((__force unsigned)csum,
+ (__force unsigned)addend);
+}
+
#endif /* _ASM_X86_CHECKSUM_64_H */
diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
new file mode 100644
index 000000000000..e01f7f7ccb0c
--- /dev/null
+++ b/arch/x86/include/asm/cmdline.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_X86_CMDLINE_H
+#define _ASM_X86_CMDLINE_H
+
+int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
+
+#endif /* _ASM_X86_CMDLINE_H */
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index d47786acb016..99c105d78b7e 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -4,6 +4,8 @@
#include <linux/compiler.h>
#include <asm/alternative.h> /* Provides LOCK_PREFIX */
+#define __HAVE_ARCH_CMPXCHG 1
+
/*
* Non-existant functions to indicate usage errors at link time
* (or compile-time if the compiler implements __compiletime_error().
@@ -143,7 +145,6 @@ extern void __add_wrong_size(void)
# include <asm/cmpxchg_64.h>
#endif
-#ifdef __HAVE_ARCH_CMPXCHG
#define cmpxchg(ptr, old, new) \
__cmpxchg(ptr, old, new, sizeof(*(ptr)))
@@ -152,7 +153,6 @@ extern void __add_wrong_size(void)
#define cmpxchg_local(ptr, old, new) \
__cmpxchg_local(ptr, old, new, sizeof(*(ptr)))
-#endif
/*
* xadd() adds "inc" to "*ptr" and atomically returns the previous
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index f8bf2eecab86..f7e142926481 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -34,8 +34,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 value)
: "memory");
}
-#define __HAVE_ARCH_CMPXCHG 1
-
#ifdef CONFIG_X86_CMPXCHG64
#define cmpxchg64(ptr, o, n) \
((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 614be87f1a9b..1af94697aae5 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -6,8 +6,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 val)
*ptr = val;
}
-#define __HAVE_ARCH_CMPXCHG 1
-
#define cmpxchg64(ptr, o, n) \
({ \
BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index b82f95144a05..bb9b258d60e7 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -81,7 +81,7 @@
#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */
#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */
-#define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) /* "" FXSAVE leaks FOP/FIP/FOP */
+/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */
#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */
#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */
#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */
@@ -90,13 +90,13 @@
#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */
#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */
#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */
-#define X86_FEATURE_11AP ( 3*32+19) /* "" Bad local APIC aka 11AP */
+/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */
#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */
#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */
-#define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) /* "" clflush reqd with monitor */
+/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */
#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */
#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */
#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
@@ -239,8 +239,11 @@
#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */
#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */
#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */
-#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* AMD Erratum 383 */
-#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* AMD Erratum 400 */
+#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
+#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
+#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
+#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
+#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
@@ -250,6 +253,12 @@
extern const char * const x86_cap_flags[NCAPINTS*32];
extern const char * const x86_power_flags[32];
+/*
+ * In order to save room, we index into this array by doing
+ * X86_BUG_<name> - NCAPINTS*32.
+ */
+extern const char * const x86_bug_flags[NBUGINTS*32];
+
#define test_cpu_cap(c, bit) \
test_bit(bit, (unsigned long *)((c)->x86_capability))
@@ -306,7 +315,6 @@ extern const char * const x86_power_flags[32];
#define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX)
#define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2)
#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT)
-#define cpu_has_mp boot_cpu_has(X86_FEATURE_MP)
#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX)
#define cpu_has_k6_mtrr boot_cpu_has(X86_FEATURE_K6_MTRR)
#define cpu_has_cyrix_arr boot_cpu_has(X86_FEATURE_CYRIX_ARR)
@@ -353,9 +361,6 @@ extern const char * const x86_power_flags[32];
#undef cpu_has_pae
#define cpu_has_pae ___BUG___
-#undef cpu_has_mp
-#define cpu_has_mp 1
-
#undef cpu_has_k6_mtrr
#define cpu_has_k6_mtrr 0
@@ -545,20 +550,20 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
#define static_cpu_has_safe(bit) boot_cpu_has(bit)
#endif
-#define cpu_has_bug(c, bit) cpu_has(c, (bit))
-#define set_cpu_bug(c, bit) set_cpu_cap(c, (bit))
-#define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit));
+#define cpu_has_bug(c, bit) cpu_has(c, (bit))
+#define set_cpu_bug(c, bit) set_cpu_cap(c, (bit))
+#define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit))
-#define static_cpu_has_bug(bit) static_cpu_has((bit))
-#define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit))
+#define static_cpu_has_bug(bit) static_cpu_has((bit))
+#define static_cpu_has_bug_safe(bit) static_cpu_has_safe((bit))
+#define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit))
-#define MAX_CPU_FEATURES (NCAPINTS * 32)
-#define cpu_have_feature boot_cpu_has
+#define MAX_CPU_FEATURES (NCAPINTS * 32)
+#define cpu_have_feature boot_cpu_has
-#define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X"
-#define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \
- boot_cpu_data.x86_model
+#define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X"
+#define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \
+ boot_cpu_data.x86_model
#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
-
#endif /* _ASM_X86_CPUFEATURE_H */
diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h
new file mode 100644
index 000000000000..f498411f2500
--- /dev/null
+++ b/arch/x86/include/asm/crash.h
@@ -0,0 +1,9 @@
+#ifndef _ASM_X86_CRASH_H
+#define _ASM_X86_CRASH_H
+
+int crash_load_segments(struct kimage *image);
+int crash_copy_backup_region(struct kimage *image);
+int crash_setup_memmap_entries(struct kimage *image,
+ struct boot_params *params);
+
+#endif /* _ASM_X86_CRASH_H */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 0869434eaf72..044a2fd3c5fe 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -1,6 +1,7 @@
#ifndef _ASM_X86_EFI_H
#define _ASM_X86_EFI_H
+#include <asm/i387.h>
/*
* We map the EFI regions needed for runtime services non-contiguously,
* with preserved alignment on virtual addresses starting from -4G down
@@ -27,91 +28,58 @@
extern unsigned long asmlinkage efi_call_phys(void *, ...);
-#define efi_call_phys0(f) efi_call_phys(f)
-#define efi_call_phys1(f, a1) efi_call_phys(f, a1)
-#define efi_call_phys2(f, a1, a2) efi_call_phys(f, a1, a2)
-#define efi_call_phys3(f, a1, a2, a3) efi_call_phys(f, a1, a2, a3)
-#define efi_call_phys4(f, a1, a2, a3, a4) \
- efi_call_phys(f, a1, a2, a3, a4)
-#define efi_call_phys5(f, a1, a2, a3, a4, a5) \
- efi_call_phys(f, a1, a2, a3, a4, a5)
-#define efi_call_phys6(f, a1, a2, a3, a4, a5, a6) \
- efi_call_phys(f, a1, a2, a3, a4, a5, a6)
/*
* Wrap all the virtual calls in a way that forces the parameters on the stack.
*/
+/* Use this macro if your virtual returns a non-void value */
#define efi_call_virt(f, args...) \
- ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args)
-
-#define efi_call_virt0(f) efi_call_virt(f)
-#define efi_call_virt1(f, a1) efi_call_virt(f, a1)
-#define efi_call_virt2(f, a1, a2) efi_call_virt(f, a1, a2)
-#define efi_call_virt3(f, a1, a2, a3) efi_call_virt(f, a1, a2, a3)
-#define efi_call_virt4(f, a1, a2, a3, a4) \
- efi_call_virt(f, a1, a2, a3, a4)
-#define efi_call_virt5(f, a1, a2, a3, a4, a5) \
- efi_call_virt(f, a1, a2, a3, a4, a5)
-#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
- efi_call_virt(f, a1, a2, a3, a4, a5, a6)
+({ \
+ efi_status_t __s; \
+ kernel_fpu_begin(); \
+ __s = ((efi_##f##_t __attribute__((regparm(0)))*) \
+ efi.systab->runtime->f)(args); \
+ kernel_fpu_end(); \
+ __s; \
+})
+
+/* Use this macro if your virtual call does not return any value */
+#define __efi_call_virt(f, args...) \
+({ \
+ kernel_fpu_begin(); \
+ ((efi_##f##_t __attribute__((regparm(0)))*) \
+ efi.systab->runtime->f)(args); \
+ kernel_fpu_end(); \
+})
#define efi_ioremap(addr, size, type, attr) ioremap_cache(addr, size)
#else /* !CONFIG_X86_32 */
-extern u64 efi_call0(void *fp);
-extern u64 efi_call1(void *fp, u64 arg1);
-extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
-extern u64 efi_call3(void *fp, u64 arg1, u64 arg2, u64 arg3);
-extern u64 efi_call4(void *fp, u64 arg1, u64 arg2, u64 arg3, u64 arg4);
-extern u64 efi_call5(void *fp, u64 arg1, u64 arg2, u64 arg3,
- u64 arg4, u64 arg5);
-extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
- u64 arg4, u64 arg5, u64 arg6);
-
-#define efi_call_phys0(f) \
- efi_call0((f))
-#define efi_call_phys1(f, a1) \
- efi_call1((f), (u64)(a1))
-#define efi_call_phys2(f, a1, a2) \
- efi_call2((f), (u64)(a1), (u64)(a2))
-#define efi_call_phys3(f, a1, a2, a3) \
- efi_call3((f), (u64)(a1), (u64)(a2), (u64)(a3))
-#define efi_call_phys4(f, a1, a2, a3, a4) \
- efi_call4((f), (u64)(a1), (u64)(a2), (u64)(a3), \
- (u64)(a4))
-#define efi_call_phys5(f, a1, a2, a3, a4, a5) \
- efi_call5((f), (u64)(a1), (u64)(a2), (u64)(a3), \
- (u64)(a4), (u64)(a5))
-#define efi_call_phys6(f, a1, a2, a3, a4, a5, a6) \
- efi_call6((f), (u64)(a1), (u64)(a2), (u64)(a3), \
- (u64)(a4), (u64)(a5), (u64)(a6))
-
-#define _efi_call_virtX(x, f, ...) \
+#define EFI_LOADER_SIGNATURE "EL64"
+
+extern u64 asmlinkage efi_call(void *fp, ...);
+
+#define efi_call_phys(f, args...) efi_call((f), args)
+
+#define efi_call_virt(f, ...) \
({ \
efi_status_t __s; \
\
efi_sync_low_kernel_mappings(); \
preempt_disable(); \
- __s = efi_call##x((void *)efi.systab->runtime->f, __VA_ARGS__); \
+ __kernel_fpu_begin(); \
+ __s = efi_call((void *)efi.systab->runtime->f, __VA_ARGS__); \
+ __kernel_fpu_end(); \
preempt_enable(); \
__s; \
})
-#define efi_call_virt0(f) \
- _efi_call_virtX(0, f)
-#define efi_call_virt1(f, a1) \
- _efi_call_virtX(1, f, (u64)(a1))
-#define efi_call_virt2(f, a1, a2) \
- _efi_call_virtX(2, f, (u64)(a1), (u64)(a2))
-#define efi_call_virt3(f, a1, a2, a3) \
- _efi_call_virtX(3, f, (u64)(a1), (u64)(a2), (u64)(a3))
-#define efi_call_virt4(f, a1, a2, a3, a4) \
- _efi_call_virtX(4, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4))
-#define efi_call_virt5(f, a1, a2, a3, a4, a5) \
- _efi_call_virtX(5, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5))
-#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
- _efi_call_virtX(6, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
+/*
+ * All X86_64 virt calls return non-void values. Thus, use non-void call for
+ * virt calls that would be void on X86_32.
+ */
+#define __efi_call_virt(f, args...) efi_call_virt(f, args)
extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
u32 type, u64 attribute);
@@ -136,6 +104,8 @@ extern void __init runtime_code_page_mkexec(void);
extern void __init efi_runtime_mkexec(void);
extern void __init efi_dump_pagetable(void);
extern void __init efi_apply_memmap_quirks(void);
+extern int __init efi_reuse_config(u64 tables, int nr_tables);
+extern void efi_delete_dummy_variable(void);
struct efi_setup_data {
u64 fw_vendor;
@@ -188,6 +158,33 @@ static inline efi_status_t efi_thunk_set_virtual_address_map(
return EFI_SUCCESS;
}
#endif /* CONFIG_EFI_MIXED */
+
+
+/* arch specific definitions used by the stub code */
+
+struct efi_config {
+ u64 image_handle;
+ u64 table;
+ u64 allocate_pool;
+ u64 allocate_pages;
+ u64 get_memory_map;
+ u64 free_pool;
+ u64 free_pages;
+ u64 locate_handle;
+ u64 handle_protocol;
+ u64 exit_boot_services;
+ u64 text_output;
+ efi_status_t (*call)(unsigned long, ...);
+ bool is64;
+} __packed;
+
+extern struct efi_config *efi_early;
+
+#define efi_call_early(f, ...) \
+ efi_early->call(efi_early->f, __VA_ARGS__);
+
+extern bool efi_reboot_required(void);
+
#else
/*
* IF EFI is not configured, have the EFI calls return -ENOSYS.
@@ -200,6 +197,10 @@ static inline efi_status_t efi_thunk_set_virtual_address_map(
#define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS)
#define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS)
static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {}
+static inline bool efi_reboot_required(void)
+{
+ return false;
+}
#endif /* CONFIG_EFI */
#endif /* _ASM_X86_EFI_H */
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 2c71182d30ef..1a055c81d864 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -75,7 +75,12 @@ typedef struct user_fxsr_struct elf_fpxregset_t;
#include <asm/vdso.h>
-extern unsigned int vdso_enabled;
+#ifdef CONFIG_X86_64
+extern unsigned int vdso64_enabled;
+#endif
+#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
+extern unsigned int vdso32_enabled;
+#endif
/*
* This is used to ensure we don't load something for the wrong architecture.
@@ -269,9 +274,9 @@ extern int force_personality32;
struct task_struct;
-#define ARCH_DLINFO_IA32(vdso_enabled) \
+#define ARCH_DLINFO_IA32 \
do { \
- if (vdso_enabled) { \
+ if (vdso32_enabled) { \
NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \
NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \
} \
@@ -281,7 +286,7 @@ do { \
#define STACK_RND_MASK (0x7ff)
-#define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled)
+#define ARCH_DLINFO ARCH_DLINFO_IA32
/* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */
@@ -292,16 +297,17 @@ do { \
#define ARCH_DLINFO \
do { \
- if (vdso_enabled) \
+ if (vdso64_enabled) \
NEW_AUX_ENT(AT_SYSINFO_EHDR, \
- (unsigned long)current->mm->context.vdso); \
+ (unsigned long __force)current->mm->context.vdso); \
} while (0)
+/* As a historical oddity, the x32 and x86_64 vDSOs are controlled together. */
#define ARCH_DLINFO_X32 \
do { \
- if (vdso_enabled) \
+ if (vdso64_enabled) \
NEW_AUX_ENT(AT_SYSINFO_EHDR, \
- (unsigned long)current->mm->context.vdso); \
+ (unsigned long __force)current->mm->context.vdso); \
} while (0)
#define AT_SYSINFO 32
@@ -310,7 +316,7 @@ do { \
if (test_thread_flag(TIF_X32)) \
ARCH_DLINFO_X32; \
else \
- ARCH_DLINFO_IA32(sysctl_vsyscall32)
+ ARCH_DLINFO_IA32
#define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000)
@@ -319,18 +325,17 @@ else \
#define VDSO_CURRENT_BASE ((unsigned long)current->mm->context.vdso)
#define VDSO_ENTRY \
- ((unsigned long)VDSO32_SYMBOL(VDSO_CURRENT_BASE, vsyscall))
+ ((unsigned long)current->mm->context.vdso + \
+ selected_vdso32->sym___kernel_vsyscall)
struct linux_binprm;
#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
extern int arch_setup_additional_pages(struct linux_binprm *bprm,
int uses_interp);
-extern int x32_setup_additional_pages(struct linux_binprm *bprm,
- int uses_interp);
-
-extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
-#define compat_arch_setup_additional_pages syscall32_setup_pages
+extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
+ int uses_interp);
+#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages
extern unsigned long arch_randomize_brk(struct mm_struct *mm);
#define arch_randomize_brk arch_randomize_brk
diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h
new file mode 100644
index 000000000000..99efebb2f69d
--- /dev/null
+++ b/arch/x86/include/asm/espfix.h
@@ -0,0 +1,16 @@
+#ifndef _ASM_X86_ESPFIX_H
+#define _ASM_X86_ESPFIX_H
+
+#ifdef CONFIG_X86_64
+
+#include <asm/percpu.h>
+
+DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
+DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+
+extern void init_espfix_bsp(void);
+extern void init_espfix_ap(void);
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* _ASM_X86_ESPFIX_H */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 43f482a0db37..b0910f97a3ea 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -24,7 +24,7 @@
#include <linux/threads.h>
#include <asm/kmap_types.h>
#else
-#include <asm/vsyscall.h>
+#include <uapi/asm/vsyscall.h>
#endif
/*
@@ -41,7 +41,8 @@
extern unsigned long __FIXADDR_TOP;
#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
#else
-#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
+#define FIXADDR_TOP (round_up(VSYSCALL_ADDR + PAGE_SIZE, 1<<PMD_SHIFT) - \
+ PAGE_SIZE)
#endif
@@ -68,11 +69,7 @@ enum fixed_addresses {
#ifdef CONFIG_X86_32
FIX_HOLE,
#else
- VSYSCALL_LAST_PAGE,
- VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
- + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
- VVAR_PAGE,
- VSYSCALL_HPET,
+ VSYSCALL_PAGE = (FIXADDR_TOP - VSYSCALL_ADDR) >> PAGE_SHIFT,
#ifdef CONFIG_PARAVIRT_CLOCK
PVCLOCK_FIXMAP_BEGIN,
PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1,
diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index 6099c0e5b62e..412ececa00b9 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -87,22 +87,22 @@ static inline int is_x32_frame(void)
static __always_inline __pure bool use_eager_fpu(void)
{
- return static_cpu_has(X86_FEATURE_EAGER_FPU);
+ return static_cpu_has_safe(X86_FEATURE_EAGER_FPU);
}
static __always_inline __pure bool use_xsaveopt(void)
{
- return static_cpu_has(X86_FEATURE_XSAVEOPT);
+ return static_cpu_has_safe(X86_FEATURE_XSAVEOPT);
}
static __always_inline __pure bool use_xsave(void)
{
- return static_cpu_has(X86_FEATURE_XSAVE);
+ return static_cpu_has_safe(X86_FEATURE_XSAVE);
}
static __always_inline __pure bool use_fxsr(void)
{
- return static_cpu_has(X86_FEATURE_FXSR);
+ return static_cpu_has_safe(X86_FEATURE_FXSR);
}
static inline void fx_finit(struct i387_fxsave_struct *fx)
@@ -293,7 +293,7 @@ static inline int restore_fpu_checking(struct task_struct *tsk)
/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
is pending. Clear the x87 state here by setting it to fixed
values. "m" is a random variable that should be in L1 */
- if (unlikely(static_cpu_has(X86_FEATURE_FXSAVE_LEAK))) {
+ if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) {
asm volatile(
"fnclex\n\t"
"emms\n\t"
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 0525a8bdf65d..e1f7fecaa7d6 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -68,6 +68,8 @@ struct dyn_arch_ftrace {
int ftrace_int3_handler(struct pt_regs *regs);
+#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR
+
#endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* __ASSEMBLY__ */
#endif /* CONFIG_FUNCTION_TRACER */
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index a307b7530e54..4615906d83df 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -190,8 +190,8 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
#define trace_interrupt interrupt
#endif
-#define VECTOR_UNDEFINED -1
-#define VECTOR_RETRIGGERED -2
+#define VECTOR_UNDEFINED (-1)
+#define VECTOR_RETRIGGERED (-2)
typedef int vector_irq_t[NR_VECTORS];
DECLARE_PER_CPU(vector_irq_t, vector_irq);
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 459e50a424d1..90f97b4b9347 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -168,8 +168,6 @@ extern int save_ioapic_entries(void);
extern void mask_ioapic_entries(void);
extern int restore_ioapic_entries(void);
-extern int get_nr_irqs_gsi(void);
-
extern void setup_ioapic_ids_from_mpc(void);
extern void setup_ioapic_ids_from_mpc_nocheck(void);
diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h
index 8e71c7941767..57995f0596a6 100644
--- a/arch/x86/include/asm/iosf_mbi.h
+++ b/arch/x86/include/asm/iosf_mbi.h
@@ -50,6 +50,32 @@
#define BT_MBI_PCIE_READ 0x00
#define BT_MBI_PCIE_WRITE 0x01
+/* Quark available units */
+#define QRK_MBI_UNIT_HBA 0x00
+#define QRK_MBI_UNIT_HB 0x03
+#define QRK_MBI_UNIT_RMU 0x04
+#define QRK_MBI_UNIT_MM 0x05
+#define QRK_MBI_UNIT_MMESRAM 0x05
+#define QRK_MBI_UNIT_SOC 0x31
+
+/* Quark read/write opcodes */
+#define QRK_MBI_HBA_READ 0x10
+#define QRK_MBI_HBA_WRITE 0x11
+#define QRK_MBI_HB_READ 0x10
+#define QRK_MBI_HB_WRITE 0x11
+#define QRK_MBI_RMU_READ 0x10
+#define QRK_MBI_RMU_WRITE 0x11
+#define QRK_MBI_MM_READ 0x10
+#define QRK_MBI_MM_WRITE 0x11
+#define QRK_MBI_MMESRAM_READ 0x12
+#define QRK_MBI_MMESRAM_WRITE 0x13
+#define QRK_MBI_SOC_READ 0x06
+#define QRK_MBI_SOC_WRITE 0x07
+
+#if IS_ENABLED(CONFIG_IOSF_MBI)
+
+bool iosf_mbi_available(void);
+
/**
* iosf_mbi_read() - MailBox Interface read command
* @port: port indicating subunit being accessed
@@ -87,4 +113,33 @@ int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr);
*/
int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask);
+#else /* CONFIG_IOSF_MBI is not enabled */
+static inline
+bool iosf_mbi_available(void)
+{
+ return false;
+}
+
+static inline
+int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr)
+{
+ WARN(1, "IOSF_MBI driver not available");
+ return -EPERM;
+}
+
+static inline
+int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr)
+{
+ WARN(1, "IOSF_MBI driver not available");
+ return -EPERM;
+}
+
+static inline
+int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask)
+{
+ WARN(1, "IOSF_MBI driver not available");
+ return -EPERM;
+}
+#endif /* CONFIG_IOSF_MBI */
+
#endif /* IOSF_MBI_SYMS_H */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index cb6cfcd034cf..a80cbb88ea91 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -43,7 +43,7 @@ extern int vector_used_by_percpu_irq(unsigned int vector);
extern void init_ISA_irqs(void);
#ifdef CONFIG_X86_LOCAL_APIC
-void arch_trigger_all_cpu_backtrace(void);
+void arch_trigger_all_cpu_backtrace(bool);
#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
#endif
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index d806b228d2c0..b7747c4c2cf2 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -103,4 +103,7 @@ static inline bool setup_remapped_irq(int irq,
}
#endif /* CONFIG_IRQ_REMAP */
+#define dmar_alloc_hwirq() irq_alloc_hwirq(-1)
+#define dmar_free_hwirq irq_free_hwirq
+
#endif /* __X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index bba3cf88e624..0a8b519226b8 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -129,7 +129,7 @@ static inline notrace unsigned long arch_local_irq_save(void)
#define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */
-#define INTERRUPT_RETURN iretq
+#define INTERRUPT_RETURN jmp native_iret
#define USERGS_SYSRET64 \
swapgs; \
sysretq;
diff --git a/arch/x86/include/asm/kexec-bzimage64.h b/arch/x86/include/asm/kexec-bzimage64.h
new file mode 100644
index 000000000000..d1b5d194e31d
--- /dev/null
+++ b/arch/x86/include/asm/kexec-bzimage64.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_KEXEC_BZIMAGE64_H
+#define _ASM_KEXEC_BZIMAGE64_H
+
+extern struct kexec_file_ops kexec_bzImage64_ops;
+
+#endif /* _ASM_KEXE_BZIMAGE64_H */
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 17483a492f18..d2434c1cad05 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -23,6 +23,9 @@
#include <asm/page.h>
#include <asm/ptrace.h>
+#include <asm/bootparam.h>
+
+struct kimage;
/*
* KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
@@ -61,6 +64,10 @@
# define KEXEC_ARCH KEXEC_ARCH_X86_64
#endif
+/* Memory to backup during crash kdump */
+#define KEXEC_BACKUP_SRC_START (0UL)
+#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */
+
/*
* CPU does not save ss and sp on stack if execution is already
* running in kernel mode at the time of NMI occurrence. This code
@@ -160,6 +167,44 @@ struct kimage_arch {
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ /* Details of backup region */
+ unsigned long backup_src_start;
+ unsigned long backup_src_sz;
+
+ /* Physical address of backup segment */
+ unsigned long backup_load_addr;
+
+ /* Core ELF header buffer */
+ void *elf_headers;
+ unsigned long elf_headers_sz;
+ unsigned long elf_load_addr;
+};
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_X86_64
+/*
+ * Number of elements and order of elements in this structure should match
+ * with the ones in arch/x86/purgatory/entry64.S. If you make a change here
+ * make an appropriate change in purgatory too.
+ */
+struct kexec_entry64_regs {
+ uint64_t rax;
+ uint64_t rcx;
+ uint64_t rdx;
+ uint64_t rbx;
+ uint64_t rsp;
+ uint64_t rbp;
+ uint64_t rsi;
+ uint64_t rdi;
+ uint64_t r8;
+ uint64_t r9;
+ uint64_t r10;
+ uint64_t r11;
+ uint64_t r12;
+ uint64_t r13;
+ uint64_t r14;
+ uint64_t r15;
+ uint64_t rip;
};
#endif
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 9454c167629f..53cdfb2857ab 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -116,4 +116,6 @@ struct kprobe_ctlblk {
extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
extern int kprobe_exceptions_notify(struct notifier_block *self,
unsigned long val, void *data);
+extern int kprobe_int3_handler(struct pt_regs *regs);
+extern int kprobe_debug_handler(struct pt_regs *regs);
#endif /* _ASM_X86_KPROBES_H */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 24ec1216596e..eb181178fe0b 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -37,6 +37,7 @@ struct x86_instruction_info {
u8 modrm_reg; /* index of register used */
u8 modrm_rm; /* rm part of modrm */
u64 src_val; /* value of source operand */
+ u64 dst_val; /* value of destination operand */
u8 src_bytes; /* size of source operand */
u8 dst_bytes; /* size of destination operand */
u8 ad_bytes; /* size of src/dst address */
@@ -189,12 +190,12 @@ struct x86_emulate_ops {
void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
- void (*set_rflags)(struct x86_emulate_ctxt *ctxt, ulong val);
int (*cpl)(struct x86_emulate_ctxt *ctxt);
int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
+ int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc);
int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata);
void (*halt)(struct x86_emulate_ctxt *ctxt);
void (*wbinvd)(struct x86_emulate_ctxt *ctxt);
@@ -232,7 +233,7 @@ struct operand {
union {
unsigned long val;
u64 val64;
- char valptr[sizeof(unsigned long) + 2];
+ char valptr[sizeof(sse128_t)];
sse128_t vec_val;
u64 mm_val;
void *data;
@@ -241,8 +242,8 @@ struct operand {
struct fetch_cache {
u8 data[15];
- unsigned long start;
- unsigned long end;
+ u8 *ptr;
+ u8 *end;
};
struct read_cache {
@@ -287,30 +288,36 @@ struct x86_emulate_ctxt {
u8 opcode_len;
u8 b;
u8 intercept;
- u8 lock_prefix;
- u8 rep_prefix;
u8 op_bytes;
u8 ad_bytes;
- u8 rex_prefix;
struct operand src;
struct operand src2;
struct operand dst;
- bool has_seg_override;
- u8 seg_override;
- u64 d;
int (*execute)(struct x86_emulate_ctxt *ctxt);
int (*check_perm)(struct x86_emulate_ctxt *ctxt);
+ /*
+ * The following six fields are cleared together,
+ * the rest are initialized unconditionally in x86_decode_insn
+ * or elsewhere
+ */
+ bool rip_relative;
+ u8 rex_prefix;
+ u8 lock_prefix;
+ u8 rep_prefix;
+ /* bitmaps of registers in _regs[] that can be read */
+ u32 regs_valid;
+ /* bitmaps of registers in _regs[] that have been written */
+ u32 regs_dirty;
/* modrm */
u8 modrm;
u8 modrm_mod;
u8 modrm_reg;
u8 modrm_rm;
u8 modrm_seg;
- bool rip_relative;
+ u8 seg_override;
+ u64 d;
unsigned long _eip;
struct operand memop;
- u32 regs_valid; /* bitmaps of registers in _regs[] that can be read */
- u32 regs_dirty; /* bitmaps of registers in _regs[] that have been written */
/* Fields above regs are cleared together. */
unsigned long _regs[NR_VCPU_REGS];
struct operand *memopp;
@@ -408,6 +415,7 @@ bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt);
#define EMULATION_OK 0
#define EMULATION_RESTART 1
#define EMULATION_INTERCEPTED 2
+void init_decode_cache(struct x86_emulate_ctxt *ctxt);
int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
u16 tss_selector, int idt_index, int reason,
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7de069afb382..572460175ba5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -50,11 +50,7 @@
| X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
| X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
-#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
-#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
-#define CR3_PCID_ENABLED_RESERVED_BITS 0xFFFFFF0000000000ULL
-#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
- 0xFFFFFF0000000000ULL)
+#define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL
#define CR4_RESERVED_BITS \
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
| X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
@@ -99,7 +95,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
#define KVM_REFILL_PAGES 25
#define KVM_MAX_CPUID_ENTRIES 80
#define KVM_NR_FIXED_MTRR_REGION 88
-#define KVM_NR_VAR_MTRR 8
+#define KVM_NR_VAR_MTRR 10
#define ASYNC_PF_PER_VCPU 64
@@ -134,7 +130,6 @@ enum kvm_reg_ex {
VCPU_EXREG_PDPTR = NR_VCPU_REGS,
VCPU_EXREG_CR3,
VCPU_EXREG_RFLAGS,
- VCPU_EXREG_CPL,
VCPU_EXREG_SEGMENTS,
};
@@ -157,14 +152,16 @@ enum {
#define DR6_BD (1 << 13)
#define DR6_BS (1 << 14)
-#define DR6_FIXED_1 0xffff0ff0
-#define DR6_VOLATILE 0x0000e00f
+#define DR6_RTM (1 << 16)
+#define DR6_FIXED_1 0xfffe0ff0
+#define DR6_INIT 0xffff0ff0
+#define DR6_VOLATILE 0x0001e00f
#define DR7_BP_EN_MASK 0x000000ff
#define DR7_GE (1 << 9)
#define DR7_GD (1 << 13)
#define DR7_FIXED_1 0x00000400
-#define DR7_VOLATILE 0xffff23ff
+#define DR7_VOLATILE 0xffff2bff
/* apic attention bits */
#define KVM_APIC_CHECK_VAPIC 0
@@ -453,7 +450,7 @@ struct kvm_vcpu_arch {
u64 tsc_offset_adjustment;
u64 this_tsc_nsec;
u64 this_tsc_write;
- u8 this_tsc_generation;
+ u64 this_tsc_generation;
bool tsc_catchup;
bool tsc_always_catchup;
s8 virtual_tsc_shift;
@@ -466,7 +463,7 @@ struct kvm_vcpu_arch {
bool nmi_injected; /* Trying to inject an NMI this entry */
struct mtrr_state_type mtrr_state;
- u32 pat;
+ u64 pat;
unsigned switch_db_regs;
unsigned long db[KVM_NR_DB_REGS];
@@ -596,7 +593,7 @@ struct kvm_arch {
u64 cur_tsc_nsec;
u64 cur_tsc_write;
u64 cur_tsc_offset;
- u8 cur_tsc_generation;
+ u64 cur_tsc_generation;
int nr_vcpus_matched_tsc;
spinlock_t pvclock_gtod_sync_lock;
@@ -722,7 +719,7 @@ struct kvm_x86_ops {
int (*handle_exit)(struct kvm_vcpu *vcpu);
void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
- u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
+ u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu);
void (*patch_hypercall)(struct kvm_vcpu *vcpu,
unsigned char *hypercall_addr);
void (*set_irq)(struct kvm_vcpu *vcpu);
@@ -1075,6 +1072,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu);
bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr);
int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc);
int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
void kvm_handle_pmu_event(struct kvm_vcpu *vcpu);
void kvm_deliver_pmi(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h
index a55c7efcc4ed..0f555cc31984 100644
--- a/arch/x86/include/asm/mc146818rtc.h
+++ b/arch/x86/include/asm/mc146818rtc.h
@@ -13,7 +13,7 @@
#define RTC_ALWAYS_BCD 1 /* RTC operates in binary mode */
#endif
-#if defined(CONFIG_X86_32) && defined(__HAVE_ARCH_CMPXCHG)
+#if defined(CONFIG_X86_32)
/*
* This lock provides nmi access to the CMOS/RTC registers. It has some
* special properties. It is owned by a CPU and stores the index register
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 6e4ce2df87cf..958b90f761e5 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -176,8 +176,6 @@ int mce_available(struct cpuinfo_x86 *c);
DECLARE_PER_CPU(unsigned, mce_exception_count);
DECLARE_PER_CPU(unsigned, mce_poll_count);
-extern atomic_t mce_entry;
-
typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index b59827e76529..64dc362506b7 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -25,6 +25,7 @@ struct cpu_signature {
struct device;
enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND };
+extern bool dis_ucode_ldr;
struct microcode_ops {
enum ucode_state (*request_microcode_user) (int cpu,
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 5f55e6962769..876e74e8eec7 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -18,7 +18,7 @@ typedef struct {
#endif
struct mutex lock;
- void *vdso;
+ void __user *vdso;
} mm_context_t;
#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index be12c534fd59..166af2a8e865 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -3,6 +3,10 @@
#include <asm/desc.h>
#include <linux/atomic.h>
+#include <linux/mm_types.h>
+
+#include <trace/events/tlb.h>
+
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
@@ -44,6 +48,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
/* Re-load page tables */
load_cr3(next->pgd);
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
/* Stop flush ipis for the previous mm */
cpumask_clear_cpu(cpu, mm_cpumask(prev));
@@ -71,6 +76,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
* to make sure to use no freed page tables.
*/
load_cr3(next->pgd);
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
load_LDT_nolock(&next->context);
}
}
diff --git a/arch/x86/include/asm/mutex_32.h b/arch/x86/include/asm/mutex_32.h
index 0208c3c2cbc6..85e6cda45a02 100644
--- a/arch/x86/include/asm/mutex_32.h
+++ b/arch/x86/include/asm/mutex_32.h
@@ -100,23 +100,11 @@ do { \
static inline int __mutex_fastpath_trylock(atomic_t *count,
int (*fail_fn)(atomic_t *))
{
- /*
- * We have two variants here. The cmpxchg based one is the best one
- * because it never induce a false contention state. It is included
- * here because architectures using the inc/dec algorithms over the
- * xchg ones are much more likely to support cmpxchg natively.
- *
- * If not we fall back to the spinlock based variant - that is
- * just as efficient (and simpler) as a 'destructive' probing of
- * the mutex state would be.
- */
-#ifdef __HAVE_ARCH_CMPXCHG
+ /* cmpxchg because it never induces a false contention state. */
if (likely(atomic_cmpxchg(count, 1, 0) == 1))
return 1;
+
return 0;
-#else
- return fail_fn(count);
-#endif
}
#endif /* _ASM_X86_MUTEX_32_H */
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 1da25a5f96f9..a1410db38a1a 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -43,7 +43,7 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
if (!current_set_polling_and_test()) {
- if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
+ if (static_cpu_has_bug(X86_BUG_CLFLUSH_MONITOR)) {
mb();
clflush((void *)&current_thread_info()->flags);
mb();
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 775873d3be55..802dde30c928 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -70,7 +70,6 @@ extern bool __virt_addr_valid(unsigned long kaddr);
#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
-#define __HAVE_ARCH_GATE_AREA 1
#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 0f1ddee6a0ce..f408caf73430 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -39,4 +39,6 @@ void copy_page(void *to, void *from);
#endif /* !__ASSEMBLY__ */
+#define __HAVE_ARCH_GATE_AREA 1
+
#endif /* _ASM_X86_PAGE_64_H */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 8de6d9cf3b95..678205195ae1 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -1,7 +1,7 @@
#ifndef _ASM_X86_PAGE_64_DEFS_H
#define _ASM_X86_PAGE_64_DEFS_H
-#define THREAD_SIZE_ORDER 1
+#define THREAD_SIZE_ORDER 2
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
#define CURRENT_MASK (~(THREAD_SIZE - 1))
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 96ae4f4040bb..0892ea0e683f 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -68,7 +68,6 @@ void pcibios_config_init(void);
void pcibios_scan_root(int bus);
void pcibios_set_master(struct pci_dev *dev);
-void pcibios_penalize_isa_irq(int irq, int active);
struct irq_routing_table *pcibios_get_irq_routing_table(void);
int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 851bcdc5db04..fd472181a1d0 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -52,10 +52,9 @@
* Compared to the generic __my_cpu_offset version, the following
* saves one instruction and avoids clobbering a temp register.
*/
-#define raw_cpu_ptr(ptr) \
+#define arch_raw_cpu_ptr(ptr) \
({ \
unsigned long tcp_ptr__; \
- __verify_pcpu_ptr(ptr); \
asm volatile("add " __percpu_arg(1) ", %0" \
: "=r" (tcp_ptr__) \
: "m" (this_cpu_off), "0" (ptr)); \
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 0d193e234647..206a87fdd22d 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -62,66 +62,14 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi
return ((value >> rightshift) & mask) << leftshift;
}
-#ifdef CONFIG_MEM_SOFT_DIRTY
-
-/*
- * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE, _PAGE_BIT_SOFT_DIRTY and
- * _PAGE_BIT_PROTNONE are taken, split up the 28 bits of offset
- * into this range.
- */
-#define PTE_FILE_MAX_BITS 28
-#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1)
-#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1)
-#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1)
-#define PTE_FILE_SHIFT4 (_PAGE_BIT_SOFT_DIRTY + 1)
-#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1)
-#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
-#define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1)
-
-#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1)
-#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1)
-#define PTE_FILE_MASK3 ((1U << PTE_FILE_BITS3) - 1)
-
-#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1)
-#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2)
-#define PTE_FILE_LSHIFT4 (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)
-
-static __always_inline pgoff_t pte_to_pgoff(pte_t pte)
-{
- return (pgoff_t)
- (pte_bitop(pte.pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) +
- pte_bitop(pte.pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) +
- pte_bitop(pte.pte_low, PTE_FILE_SHIFT3, PTE_FILE_MASK3, PTE_FILE_LSHIFT3) +
- pte_bitop(pte.pte_low, PTE_FILE_SHIFT4, -1UL, PTE_FILE_LSHIFT4));
-}
-
-static __always_inline pte_t pgoff_to_pte(pgoff_t off)
-{
- return (pte_t){
- .pte_low =
- pte_bitop(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) +
- pte_bitop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) +
- pte_bitop(off, PTE_FILE_LSHIFT3, PTE_FILE_MASK3, PTE_FILE_SHIFT3) +
- pte_bitop(off, PTE_FILE_LSHIFT4, -1UL, PTE_FILE_SHIFT4) +
- _PAGE_FILE,
- };
-}
-
-#else /* CONFIG_MEM_SOFT_DIRTY */
-
/*
* Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
* split up the 29 bits of offset into this range.
*/
#define PTE_FILE_MAX_BITS 29
#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1)
-#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1)
#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1)
-#else
-#define PTE_FILE_SHIFT2 (_PAGE_BIT_PROTNONE + 1)
-#define PTE_FILE_SHIFT3 (_PAGE_BIT_FILE + 1)
-#endif
#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1)
#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
@@ -150,16 +98,9 @@ static __always_inline pte_t pgoff_to_pte(pgoff_t off)
};
}
-#endif /* CONFIG_MEM_SOFT_DIRTY */
-
/* Encode and de-code a swap entry */
-#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
-#else
-#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
-#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
-#endif
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index b459ddf27d64..0ec056012618 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -131,7 +131,8 @@ static inline int pte_exec(pte_t pte)
static inline int pte_special(pte_t pte)
{
- return pte_flags(pte) & _PAGE_SPECIAL;
+ return (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_SPECIAL)) ==
+ (_PAGE_PRESENT|_PAGE_SPECIAL);
}
static inline unsigned long pte_pfn(pte_t pte)
@@ -296,6 +297,7 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd)
return pmd_clear_flags(pmd, _PAGE_PRESENT);
}
+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
static inline int pte_soft_dirty(pte_t pte)
{
return pte_flags(pte) & _PAGE_SOFT_DIRTY;
@@ -331,6 +333,8 @@ static inline int pte_file_soft_dirty(pte_t pte)
return pte_flags(pte) & _PAGE_SOFT_DIRTY;
}
+#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
+
/*
* Mask out unsupported bits in a present pgprot. Non-present pgprots
* can use those bits for other purposes, so leave them be.
@@ -452,6 +456,12 @@ static inline int pte_present(pte_t a)
_PAGE_NUMA);
}
+#define pte_present_nonuma pte_present_nonuma
+static inline int pte_present_nonuma(pte_t a)
+{
+ return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
+}
+
#define pte_accessible pte_accessible
static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
{
@@ -858,23 +868,25 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
{
}
+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
{
- VM_BUG_ON(pte_present(pte));
+ VM_BUG_ON(pte_present_nonuma(pte));
return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
}
static inline int pte_swp_soft_dirty(pte_t pte)
{
- VM_BUG_ON(pte_present(pte));
+ VM_BUG_ON(pte_present_nonuma(pte));
return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
}
static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
- VM_BUG_ON(pte_present(pte));
+ VM_BUG_ON(pte_present_nonuma(pte));
return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
}
+#endif
#include <asm-generic/pgtable.h>
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index e22c1dbf7feb..5be9063545d2 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -143,12 +143,12 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
#define pte_unmap(pte) ((void)(pte))/* NOP */
/* Encode and de-code a swap entry */
-#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
-#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
+#ifdef CONFIG_NUMA_BALANCING
+/* Automatic NUMA balancing needs to be distinguishable from swap entries */
+#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2)
#else
-#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
-#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
+#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
#endif
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index c883bf726398..7166e25ecb57 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -61,6 +61,8 @@ typedef struct { pteval_t pte; } pte_t;
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
#define MODULES_END _AC(0xffffffffff000000, UL)
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
+#define ESPFIX_PGD_ENTRY _AC(-2, UL)
+#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT)
#define EARLY_DYNAMIC_PAGE_TABLES 64
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index eb3d44945133..f216963760e5 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -16,15 +16,26 @@
#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
#define _PAGE_BIT_PAT 7 /* on 4KB pages */
#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
-#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
-#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */
-#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */
+#define _PAGE_BIT_SOFTW1 9 /* available for programmer */
+#define _PAGE_BIT_SOFTW2 10 /* " */
+#define _PAGE_BIT_SOFTW3 11 /* " */
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
-#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
-#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
-#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */
+#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
+#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
+#define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
+#define _PAGE_BIT_IOMAP _PAGE_BIT_SOFTW2 /* flag used to indicate IO mapping */
+#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
+#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
+/*
+ * Swap offsets on configurations that allow automatic NUMA balancing use the
+ * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from
+ * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the
+ * maximum possible swap space from 16TB to 8TB.
+ */
+#define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1)
+
/* If _PAGE_BIT_PRESENT is clear, we use these: */
/* - if the user mapped it with PROT_NONE; pte_present gives true */
#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
@@ -40,7 +51,7 @@
#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
-#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
+#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
@@ -61,8 +72,6 @@
* they do not conflict with each other.
*/
-#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_HIDDEN
-
#ifdef CONFIG_MEM_SOFT_DIRTY
#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY)
#else
@@ -70,6 +79,21 @@
#endif
/*
+ * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page
+ * that is not present. The hinting fault gathers numa placement statistics
+ * (see pte_numa()). The bit is always zero when the PTE is not present.
+ *
+ * The bit picked must be always zero when the pmd is present and not
+ * present, so that we don't lose information when we set it while
+ * atomically clearing the present bit.
+ */
+#ifdef CONFIG_NUMA_BALANCING
+#define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA)
+#else
+#define _PAGE_NUMA (_AT(pteval_t, 0))
+#endif
+
+/*
* Tracking soft dirty bit when a page goes to a swap is tricky.
* We need a bit which can be stored in pte _and_ not conflict
* with swap entry format. On x86 bits 6 and 7 are *not* involved
@@ -94,26 +118,6 @@
#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
-/*
- * _PAGE_NUMA indicates that this page will trigger a numa hinting
- * minor page fault to gather numa placement statistics (see
- * pte_numa()). The bit picked (8) is within the range between
- * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't
- * require changes to the swp entry format because that bit is always
- * zero when the pte is not present.
- *
- * The bit picked must be always zero when the pmd is present and not
- * present, so that we don't lose information when we set it while
- * atomically clearing the present bit.
- *
- * Because we shared the same bit (8) with _PAGE_PROTNONE this can be
- * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE
- * couldn't reach, like handle_mm_fault() (see access_error in
- * arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for
- * handle_mm_fault() to be invoked).
- */
-#define _PAGE_NUMA _PAGE_PROTNONE
-
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_DIRTY)
#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
@@ -122,8 +126,8 @@
/* Set of bits not changed in pte_modify */
#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
_PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
- _PAGE_SOFT_DIRTY)
-#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
+ _PAGE_SOFT_DIRTY | _PAGE_NUMA)
+#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA)
#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
#define _PAGE_CACHE_WB (0)
diff --git a/arch/x86/include/asm/platform_sst_audio.h b/arch/x86/include/asm/platform_sst_audio.h
new file mode 100644
index 000000000000..0a4e140315b6
--- /dev/null
+++ b/arch/x86/include/asm/platform_sst_audio.h
@@ -0,0 +1,78 @@
+/*
+ * platform_sst_audio.h: sst audio platform data header file
+ *
+ * Copyright (C) 2012-14 Intel Corporation
+ * Author: Jeeja KP <jeeja.kp@intel.com>
+ * Omair Mohammed Abdullah <omair.m.abdullah@intel.com>
+ * Vinod Koul ,vinod.koul@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#ifndef _PLATFORM_SST_AUDIO_H_
+#define _PLATFORM_SST_AUDIO_H_
+
+#include <linux/sfi.h>
+
+enum sst_audio_task_id_mrfld {
+ SST_TASK_ID_NONE = 0,
+ SST_TASK_ID_SBA = 1,
+ SST_TASK_ID_MEDIA = 3,
+ SST_TASK_ID_MAX = SST_TASK_ID_MEDIA,
+};
+
+/* Device IDs for Merrifield are Pipe IDs,
+ * ref: DSP spec v0.75 */
+enum sst_audio_device_id_mrfld {
+ /* Output pipeline IDs */
+ PIPE_ID_OUT_START = 0x0,
+ PIPE_CODEC_OUT0 = 0x2,
+ PIPE_CODEC_OUT1 = 0x3,
+ PIPE_SPROT_LOOP_OUT = 0x4,
+ PIPE_MEDIA_LOOP1_OUT = 0x5,
+ PIPE_MEDIA_LOOP2_OUT = 0x6,
+ PIPE_VOIP_OUT = 0xC,
+ PIPE_PCM0_OUT = 0xD,
+ PIPE_PCM1_OUT = 0xE,
+ PIPE_PCM2_OUT = 0xF,
+ PIPE_MEDIA0_OUT = 0x12,
+ PIPE_MEDIA1_OUT = 0x13,
+/* Input Pipeline IDs */
+ PIPE_ID_IN_START = 0x80,
+ PIPE_CODEC_IN0 = 0x82,
+ PIPE_CODEC_IN1 = 0x83,
+ PIPE_SPROT_LOOP_IN = 0x84,
+ PIPE_MEDIA_LOOP1_IN = 0x85,
+ PIPE_MEDIA_LOOP2_IN = 0x86,
+ PIPE_VOIP_IN = 0x8C,
+ PIPE_PCM0_IN = 0x8D,
+ PIPE_PCM1_IN = 0x8E,
+ PIPE_MEDIA0_IN = 0x8F,
+ PIPE_MEDIA1_IN = 0x90,
+ PIPE_MEDIA2_IN = 0x91,
+ PIPE_RSVD = 0xFF,
+};
+
+/* The stream map for each platform consists of an array of the below
+ * stream map structure.
+ */
+struct sst_dev_stream_map {
+ u8 dev_num; /* device id */
+ u8 subdev_num; /* substream */
+ u8 direction;
+ u8 device_id; /* fw id */
+ u8 task_id; /* fw task */
+ u8 status;
+};
+
+struct sst_platform_data {
+ /* Intel software platform id*/
+ struct sst_dev_stream_map *pdev_strm_map;
+ unsigned int strm_map_size;
+};
+
+int add_sst_platform_device(void);
+#endif
+
diff --git a/arch/x86/include/asm/pmc_atom.h b/arch/x86/include/asm/pmc_atom.h
new file mode 100644
index 000000000000..fc7a17c05d35
--- /dev/null
+++ b/arch/x86/include/asm/pmc_atom.h
@@ -0,0 +1,107 @@
+/*
+ * Intel Atom SOC Power Management Controller Header File
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#ifndef PMC_ATOM_H
+#define PMC_ATOM_H
+
+/* ValleyView Power Control Unit PCI Device ID */
+#define PCI_DEVICE_ID_VLV_PMC 0x0F1C
+
+/* PMC Memory mapped IO registers */
+#define PMC_BASE_ADDR_OFFSET 0x44
+#define PMC_BASE_ADDR_MASK 0xFFFFFE00
+#define PMC_MMIO_REG_LEN 0x100
+#define PMC_REG_BIT_WIDTH 32
+
+/* BIOS uses FUNC_DIS to disable specific function */
+#define PMC_FUNC_DIS 0x34
+#define PMC_FUNC_DIS_2 0x38
+
+/* S0ix wake event control */
+#define PMC_S0IX_WAKE_EN 0x3C
+
+#define BIT_LPC_CLOCK_RUN BIT(4)
+#define BIT_SHARED_IRQ_GPSC BIT(5)
+#define BIT_ORED_DEDICATED_IRQ_GPSS BIT(18)
+#define BIT_ORED_DEDICATED_IRQ_GPSC BIT(19)
+#define BIT_SHARED_IRQ_GPSS BIT(20)
+
+#define PMC_WAKE_EN_SETTING ~(BIT_LPC_CLOCK_RUN | \
+ BIT_SHARED_IRQ_GPSC | \
+ BIT_ORED_DEDICATED_IRQ_GPSS | \
+ BIT_ORED_DEDICATED_IRQ_GPSC | \
+ BIT_SHARED_IRQ_GPSS)
+
+/* The timers acumulate time spent in sleep state */
+#define PMC_S0IR_TMR 0x80
+#define PMC_S0I1_TMR 0x84
+#define PMC_S0I2_TMR 0x88
+#define PMC_S0I3_TMR 0x8C
+#define PMC_S0_TMR 0x90
+/* Sleep state counter is in units of of 32us */
+#define PMC_TMR_SHIFT 5
+
+/* These registers reflect D3 status of functions */
+#define PMC_D3_STS_0 0xA0
+
+#define BIT_LPSS1_F0_DMA BIT(0)
+#define BIT_LPSS1_F1_PWM1 BIT(1)
+#define BIT_LPSS1_F2_PWM2 BIT(2)
+#define BIT_LPSS1_F3_HSUART1 BIT(3)
+#define BIT_LPSS1_F4_HSUART2 BIT(4)
+#define BIT_LPSS1_F5_SPI BIT(5)
+#define BIT_LPSS1_F6_XXX BIT(6)
+#define BIT_LPSS1_F7_XXX BIT(7)
+#define BIT_SCC_EMMC BIT(8)
+#define BIT_SCC_SDIO BIT(9)
+#define BIT_SCC_SDCARD BIT(10)
+#define BIT_SCC_MIPI BIT(11)
+#define BIT_HDA BIT(12)
+#define BIT_LPE BIT(13)
+#define BIT_OTG BIT(14)
+#define BIT_USH BIT(15)
+#define BIT_GBE BIT(16)
+#define BIT_SATA BIT(17)
+#define BIT_USB_EHCI BIT(18)
+#define BIT_SEC BIT(19)
+#define BIT_PCIE_PORT0 BIT(20)
+#define BIT_PCIE_PORT1 BIT(21)
+#define BIT_PCIE_PORT2 BIT(22)
+#define BIT_PCIE_PORT3 BIT(23)
+#define BIT_LPSS2_F0_DMA BIT(24)
+#define BIT_LPSS2_F1_I2C1 BIT(25)
+#define BIT_LPSS2_F2_I2C2 BIT(26)
+#define BIT_LPSS2_F3_I2C3 BIT(27)
+#define BIT_LPSS2_F4_I2C4 BIT(28)
+#define BIT_LPSS2_F5_I2C5 BIT(29)
+#define BIT_LPSS2_F6_I2C6 BIT(30)
+#define BIT_LPSS2_F7_I2C7 BIT(31)
+
+#define PMC_D3_STS_1 0xA4
+#define BIT_SMB BIT(0)
+#define BIT_OTG_SS_PHY BIT(1)
+#define BIT_USH_SS_PHY BIT(2)
+#define BIT_DFX BIT(3)
+
+/* PMC I/O Registers */
+#define ACPI_BASE_ADDR_OFFSET 0x40
+#define ACPI_BASE_ADDR_MASK 0xFFFFFE00
+#define ACPI_MMIO_REG_LEN 0x100
+
+#define PM1_CNT 0x4
+#define SLEEP_TYPE_MASK 0xFFFFECFF
+#define SLEEP_TYPE_S5 0x1C00
+#define SLEEP_ENABLE 0x2000
+#endif /* PMC_ATOM_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 2c8d3b8e7fcb..eb71ec794732 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -72,7 +72,6 @@ extern u16 __read_mostly tlb_lld_4k[NR_INFO];
extern u16 __read_mostly tlb_lld_2m[NR_INFO];
extern u16 __read_mostly tlb_lld_4m[NR_INFO];
extern u16 __read_mostly tlb_lld_1g[NR_INFO];
-extern s8 __read_mostly tlb_flushall_shift;
/*
* CPU type and hardware bug flags. Kept separately for each CPU.
@@ -696,6 +695,8 @@ static inline void cpu_relax(void)
rep_nop();
}
+#define cpu_relax_lowlatency() cpu_relax()
+
/* Stop speculative execution and prefetching of modified code. */
static inline void sync_core(void)
{
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 6fd3fd769796..a90f8972dad5 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -12,8 +12,6 @@ void ia32_syscall(void);
void ia32_cstar_target(void);
void ia32_sysenter_target(void);
-void syscall32_cpu_init(void);
-
void x86_configure_nx(void);
void x86_report_nx(void);
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 14fd6fd75a19..6205f0c434db 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -231,6 +231,22 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
#define ARCH_HAS_USER_SINGLE_STEP_INFO
+/*
+ * When hitting ptrace_stop(), we cannot return using SYSRET because
+ * that does not restore the full CPU state, only a minimal set. The
+ * ptracer can change arbitrary register values, which is usually okay
+ * because the usual ptrace stops run off the signal delivery path which
+ * forces IRET; however, ptrace_event() stops happen in arbitrary places
+ * in the kernel and don't force IRET path.
+ *
+ * So force IRET path after a ptrace stop.
+ */
+#define arch_ptrace_stop_needed(code, info) \
+({ \
+ set_thread_flag(TIF_NOTIFY_RESUME); \
+ false; \
+})
+
struct user_desc;
extern int do_get_thread_area(struct task_struct *p, int idx,
struct user_desc __user *info);
diff --git a/arch/x86/include/asm/qrwlock.h b/arch/x86/include/asm/qrwlock.h
new file mode 100644
index 000000000000..ae0e241e228b
--- /dev/null
+++ b/arch/x86/include/asm/qrwlock.h
@@ -0,0 +1,17 @@
+#ifndef _ASM_X86_QRWLOCK_H
+#define _ASM_X86_QRWLOCK_H
+
+#include <asm-generic/qrwlock_types.h>
+
+#ifndef CONFIG_X86_PPRO_FENCE
+#define queue_write_unlock queue_write_unlock
+static inline void queue_write_unlock(struct qrwlock *lock)
+{
+ barrier();
+ ACCESS_ONCE(*(u8 *)&lock->cnts) = 0;
+}
+#endif
+
+#include <asm-generic/qrwlock.h>
+
+#endif /* _ASM_X86_QRWLOCK_H */
diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h
deleted file mode 100644
index 4240878b9d76..000000000000
--- a/arch/x86/include/asm/scatterlist.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _ASM_X86_SCATTERLIST_H
-#define _ASM_X86_SCATTERLIST_H
-
-#include <asm-generic/scatterlist.h>
-
-#define ARCH_HAS_SG_CHAIN
-
-#endif /* _ASM_X86_SCATTERLIST_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 9264f04a4c55..ff4e7b236e21 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -59,6 +59,8 @@ static inline void x86_ce4100_early_setup(void) { }
#ifndef _SETUP
+#include <asm/espfix.h>
+
/*
* This is set up by the setup-routine at boot-time
*/
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 35e67a457182..31eab867e6d3 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -92,12 +92,6 @@ static inline int __gen_sigismember(sigset_t *set, int _sig)
? __const_sigismember((set), (sig)) \
: __gen_sigismember((set), (sig)))
-static inline int sigfindinword(unsigned long word)
-{
- asm("bsfl %1,%0" : "=r"(word) : "rm"(word) : "cc");
- return word;
-}
-
struct pt_regs;
#else /* __i386__ */
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 0f62f5482d91..54f1c8068c02 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -187,6 +187,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
cpu_relax();
}
+#ifndef CONFIG_QUEUE_RWLOCK
/*
* Read-write spinlocks, allowing multiple readers
* but only one writer.
@@ -269,6 +270,9 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
asm volatile(LOCK_PREFIX WRITE_LOCK_ADD(%1) "%0"
: "+m" (rw->write) : "i" (RW_LOCK_BIAS) : "memory");
}
+#else
+#include <asm/qrwlock.h>
+#endif /* CONFIG_QUEUE_RWLOCK */
#define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
#define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index 4f1bea19945b..73c4c007200f 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -34,6 +34,10 @@ typedef struct arch_spinlock {
#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } }
+#ifdef CONFIG_QUEUE_RWLOCK
+#include <asm-generic/qrwlock_types.h>
+#else
#include <asm/rwlock.h>
+#endif
#endif /* _ASM_X86_SPINLOCK_TYPES_H */
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
index 977f1761a25d..ab05d73e2bb7 100644
--- a/arch/x86/include/asm/swiotlb.h
+++ b/arch/x86/include/asm/swiotlb.h
@@ -29,4 +29,11 @@ static inline void pci_swiotlb_late_init(void)
static inline void dma_mark_clean(void *addr, size_t size) {}
+extern void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flags,
+ struct dma_attrs *attrs);
+extern void x86_swiotlb_free_coherent(struct device *dev, size_t size,
+ void *vaddr, dma_addr_t dma_addr,
+ struct dma_attrs *attrs);
+
#endif /* _ASM_X86_SWIOTLB_H */
diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h
index 05af3b31d522..f28a24b51dc7 100644
--- a/arch/x86/include/asm/sync_bitops.h
+++ b/arch/x86/include/asm/sync_bitops.h
@@ -41,7 +41,7 @@ static inline void sync_set_bit(long nr, volatile unsigned long *addr)
*
* sync_clear_bit() is atomic and may not be reordered. However, it does
* not contain a memory barrier, so if it is used for locking purposes,
- * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
+ * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic()
* in order to ensure changes are visible on other processors.
*/
static inline void sync_clear_bit(long nr, volatile unsigned long *addr)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 47e5de25ba79..854053889d4d 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,6 +83,7 @@ struct thread_info {
#define TIF_FORK 18 /* ret_from_fork */
#define TIF_NOHZ 19 /* in adaptive nohz mode */
#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
+#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
@@ -106,6 +107,7 @@ struct thread_info {
#define _TIF_IA32 (1 << TIF_IA32)
#define _TIF_FORK (1 << TIF_FORK)
#define _TIF_NOHZ (1 << TIF_NOHZ)
+#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP)
@@ -191,8 +193,6 @@ static inline struct thread_info *current_thread_info(void)
* have to worry about atomic accesses.
*/
#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
-#define TS_POLLING 0x0004 /* idle task polling need_resched,
- skip sending interrupt */
#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */
#ifndef __ASSEMBLY__
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 58d66fe06b61..bc8352e7010a 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -68,12 +68,17 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, long);
dotraplinkage void do_stack_segment(struct pt_regs *, long);
#ifdef CONFIG_X86_64
dotraplinkage void do_double_fault(struct pt_regs *, long);
-asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *);
+asmlinkage struct pt_regs *sync_regs(struct pt_regs *);
#endif
dotraplinkage void do_general_protection(struct pt_regs *, long);
dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
#ifdef CONFIG_TRACING
dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long);
+#else
+static inline void trace_do_page_fault(struct pt_regs *regs, unsigned long error)
+{
+ do_page_fault(regs, error);
+}
#endif
dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
dotraplinkage void do_coprocessor_error(struct pt_regs *, long);
@@ -98,7 +103,6 @@ static inline int get_si_code(unsigned long condition)
extern int panic_on_unrecovered_nmi;
-void math_error(struct pt_regs *, int, int);
void math_emulate(struct math_emu_info *);
#ifndef CONFIG_X86_32
asmlinkage void smp_thermal_interrupt(void);
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index 3f556c6a0157..2b19caa4081c 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -41,7 +41,6 @@
# define __ARCH_WANT_SYS_OLD_GETRLIMIT
# define __ARCH_WANT_SYS_OLD_UNAME
# define __ARCH_WANT_SYS_PAUSE
-# define __ARCH_WANT_SYS_SGETMASK
# define __ARCH_WANT_SYS_SIGNAL
# define __ARCH_WANT_SYS_SIGPENDING
# define __ARCH_WANT_SYS_SIGPROCMASK
diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 3087ea9c5f2e..74f4c2ff6427 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -33,15 +33,27 @@ typedef u8 uprobe_opcode_t;
#define UPROBE_SWBP_INSN 0xcc
#define UPROBE_SWBP_INSN_SIZE 1
+struct uprobe_xol_ops;
+
struct arch_uprobe {
- u16 fixups;
union {
u8 insn[MAX_UINSN_BYTES];
u8 ixol[MAX_UINSN_BYTES];
};
-#ifdef CONFIG_X86_64
- unsigned long rip_rela_target_address;
-#endif
+
+ const struct uprobe_xol_ops *ops;
+
+ union {
+ struct {
+ s32 offs;
+ u8 ilen;
+ u8 opc1;
+ } branch;
+ struct {
+ u8 fixups;
+ u8 ilen;
+ } defparam;
+ };
};
struct arch_uprobe_task {
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 0b46ef261c77..2d60a7813dfe 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -73,6 +73,7 @@
#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD (is_uv1_hub() ? \
UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD : \
UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD)
+/* assuming UV3 is the same */
#define BAU_MISC_CONTROL_MULT_MASK 3
@@ -93,6 +94,8 @@
#define SOFTACK_MSHIFT UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT
#define SOFTACK_PSHIFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT
#define SOFTACK_TIMEOUT_PERIOD UV_INTD_SOFT_ACK_TIMEOUT_PERIOD
+#define PREFETCH_HINT_SHFT UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_PREFETCH_HINT_SHFT
+#define SB_STATUS_SHFT UV3H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT
#define write_gmmr uv_write_global_mmr64
#define write_lmmr uv_write_local_mmr
#define read_lmmr uv_read_local_mmr
@@ -322,8 +325,9 @@ struct uv1_bau_msg_header {
/*
* UV2 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
* see figure 9-2 of harp_sys.pdf
+ * assuming UV3 is the same
*/
-struct uv2_bau_msg_header {
+struct uv2_3_bau_msg_header {
unsigned int base_dest_nasid:15; /* nasid of the first bit */
/* bits 14:0 */ /* in uvhub map */
unsigned int dest_subnodeid:5; /* must be 0x10, for the LB */
@@ -395,7 +399,7 @@ struct bau_desc {
*/
union bau_msg_header {
struct uv1_bau_msg_header uv1_hdr;
- struct uv2_bau_msg_header uv2_hdr;
+ struct uv2_3_bau_msg_header uv2_3_hdr;
} header;
struct bau_msg_payload payload;
@@ -631,11 +635,6 @@ struct bau_control {
struct hub_and_pnode *thp;
};
-static inline unsigned long read_mmr_uv2_status(void)
-{
- return read_lmmr(UV2H_LB_BAU_SB_ACTIVATION_STATUS_2);
-}
-
static inline void write_mmr_data_broadcast(int pnode, unsigned long mmr_image)
{
write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image);
@@ -760,7 +759,11 @@ static inline int atomic_read_short(const struct atomic_short *v)
*/
static inline int atom_asr(short i, struct atomic_short *v)
{
- return i + xadd(&v->counter, i);
+ short __i = i;
+ asm volatile(LOCK_PREFIX "xaddw %0, %1"
+ : "+r" (i), "+m" (v->counter)
+ : : "memory");
+ return i + __i;
}
/*
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index a30836c8ac4d..c63e925fd6b7 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -5,7 +5,7 @@
*
* SGI UV architectural definitions
*
- * Copyright (C) 2007-2013 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved.
*/
#ifndef _ASM_X86_UV_UV_HUB_H
@@ -204,16 +204,6 @@ static inline int is_uvx_hub(void)
return uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE;
}
-static inline int is_uv2_1_hub(void)
-{
- return uv_hub_info->hub_revision == UV2_HUB_REVISION_BASE;
-}
-
-static inline int is_uv2_2_hub(void)
-{
- return uv_hub_info->hub_revision == UV2_HUB_REVISION_BASE + 1;
-}
-
union uvh_apicid {
unsigned long v;
struct uvh_apicid_s {
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index e42249bcf7e1..ddd8db6b6e70 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -5,7 +5,7 @@
*
* SGI UV MMR definitions
*
- * Copyright (C) 2007-2013 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved.
*/
#ifndef _ASM_X86_UV_UV_MMRS_H
@@ -2803,6 +2803,46 @@ union uv1h_lb_target_physical_apic_id_mask_u {
};
/* ========================================================================= */
+/* UV3H_GR0_GAM_GR_CONFIG */
+/* ========================================================================= */
+#define UV3H_GR0_GAM_GR_CONFIG 0xc00028UL
+
+#define UV3H_GR0_GAM_GR_CONFIG_M_SKT_SHFT 0
+#define UV3H_GR0_GAM_GR_CONFIG_SUBSPACE_SHFT 10
+#define UV3H_GR0_GAM_GR_CONFIG_M_SKT_MASK 0x000000000000003fUL
+#define UV3H_GR0_GAM_GR_CONFIG_SUBSPACE_MASK 0x0000000000000400UL
+
+union uv3h_gr0_gam_gr_config_u {
+ unsigned long v;
+ struct uv3h_gr0_gam_gr_config_s {
+ unsigned long m_skt:6; /* RW */
+ unsigned long undef_6_9:4; /* Undefined */
+ unsigned long subspace:1; /* RW */
+ unsigned long reserved:53;
+ } s3;
+};
+
+/* ========================================================================= */
+/* UV3H_GR1_GAM_GR_CONFIG */
+/* ========================================================================= */
+#define UV3H_GR1_GAM_GR_CONFIG 0x1000028UL
+
+#define UV3H_GR1_GAM_GR_CONFIG_M_SKT_SHFT 0
+#define UV3H_GR1_GAM_GR_CONFIG_SUBSPACE_SHFT 10
+#define UV3H_GR1_GAM_GR_CONFIG_M_SKT_MASK 0x000000000000003fUL
+#define UV3H_GR1_GAM_GR_CONFIG_SUBSPACE_MASK 0x0000000000000400UL
+
+union uv3h_gr1_gam_gr_config_u {
+ unsigned long v;
+ struct uv3h_gr1_gam_gr_config_s {
+ unsigned long m_skt:6; /* RW */
+ unsigned long undef_6_9:4; /* Undefined */
+ unsigned long subspace:1; /* RW */
+ unsigned long reserved:53;
+ } s3;
+};
+
+/* ========================================================================= */
/* UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR */
/* ========================================================================= */
#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR 0x1603000UL
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index d1dc55404ff1..8021bd28c0f1 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -3,63 +3,51 @@
#include <asm/page_types.h>
#include <linux/linkage.h>
+#include <linux/init.h>
-#ifdef __ASSEMBLER__
+#ifndef __ASSEMBLER__
-#define DEFINE_VDSO_IMAGE(symname, filename) \
-__PAGE_ALIGNED_DATA ; \
- .globl symname##_start, symname##_end ; \
- .align PAGE_SIZE ; \
- symname##_start: ; \
- .incbin filename ; \
- symname##_end: ; \
- .align PAGE_SIZE /* extra data here leaks to userspace. */ ; \
- \
-.previous ; \
- \
- .globl symname##_pages ; \
- .bss ; \
- .align 8 ; \
- .type symname##_pages, @object ; \
- symname##_pages: ; \
- .zero (symname##_end - symname##_start + PAGE_SIZE - 1) / PAGE_SIZE * (BITS_PER_LONG / 8) ; \
- .size symname##_pages, .-symname##_pages
+#include <linux/mm_types.h>
-#else
+struct vdso_image {
+ void *data;
+ unsigned long size; /* Always a multiple of PAGE_SIZE */
-#define DECLARE_VDSO_IMAGE(symname) \
- extern char symname##_start[], symname##_end[]; \
- extern struct page *symname##_pages[]
+ /* text_mapping.pages is big enough for data/size page pointers */
+ struct vm_special_mapping text_mapping;
-#if defined CONFIG_X86_32 || defined CONFIG_COMPAT
+ unsigned long alt, alt_len;
-#include <asm/vdso32.h>
+ long sym_vvar_start; /* Negative offset to the vvar area */
-DECLARE_VDSO_IMAGE(vdso32_int80);
-#ifdef CONFIG_COMPAT
-DECLARE_VDSO_IMAGE(vdso32_syscall);
+ long sym_vvar_page;
+ long sym_hpet_page;
+ long sym_VDSO32_NOTE_MASK;
+ long sym___kernel_sigreturn;
+ long sym___kernel_rt_sigreturn;
+ long sym___kernel_vsyscall;
+ long sym_VDSO32_SYSENTER_RETURN;
+};
+
+#ifdef CONFIG_X86_64
+extern const struct vdso_image vdso_image_64;
+#endif
+
+#ifdef CONFIG_X86_X32
+extern const struct vdso_image vdso_image_x32;
#endif
-DECLARE_VDSO_IMAGE(vdso32_sysenter);
-/*
- * Given a pointer to the vDSO image, find the pointer to VDSO32_name
- * as that symbol is defined in the vDSO sources or linker script.
- */
-#define VDSO32_SYMBOL(base, name) \
-({ \
- extern const char VDSO32_##name[]; \
- (void __user *)(VDSO32_##name + (unsigned long)(base)); \
-})
+#if defined CONFIG_X86_32 || defined CONFIG_COMPAT
+extern const struct vdso_image vdso_image_32_int80;
+#ifdef CONFIG_COMPAT
+extern const struct vdso_image vdso_image_32_syscall;
#endif
+extern const struct vdso_image vdso_image_32_sysenter;
-/*
- * These symbols are defined with the addresses in the vsyscall page.
- * See vsyscall-sigreturn.S.
- */
-extern void __user __kernel_sigreturn;
-extern void __user __kernel_rt_sigreturn;
+extern const struct vdso_image *selected_vdso32;
+#endif
-void __init patch_vdso32(void *vdso, size_t len);
+extern void __init init_vdso_image(const struct vdso_image *image);
#endif /* __ASSEMBLER__ */
diff --git a/arch/x86/include/asm/vdso32.h b/arch/x86/include/asm/vdso32.h
deleted file mode 100644
index 7efb7018406e..000000000000
--- a/arch/x86/include/asm/vdso32.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef _ASM_X86_VDSO32_H
-#define _ASM_X86_VDSO32_H
-
-#define VDSO_BASE_PAGE 0
-#define VDSO_VVAR_PAGE 1
-#define VDSO_HPET_PAGE 2
-#define VDSO_PAGES 3
-#define VDSO_PREV_PAGES 2
-#define VDSO_OFFSET(x) ((x) * PAGE_SIZE)
-
-#endif
diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h
index 44282fbf7bf9..c4b9dc2f67c5 100644
--- a/arch/x86/include/asm/vga.h
+++ b/arch/x86/include/asm/vga.h
@@ -17,10 +17,4 @@
#define vga_readb(x) (*(x))
#define vga_writeb(x, y) (*(y) = (x))
-#ifdef CONFIG_FB_EFI
-#define __ARCH_HAS_VGA_DEFAULT_DEVICE
-extern struct pci_dev *vga_default_device(void);
-extern void vga_set_default_device(struct pci_dev *pdev);
-#endif
-
#endif /* _ASM_X86_VGA_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 7004d21e6219..bcbfade26d8d 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -51,6 +51,9 @@
#define CPU_BASED_MONITOR_EXITING 0x20000000
#define CPU_BASED_PAUSE_EXITING 0x40000000
#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000
+
+#define CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x0401e172
+
/*
* Definitions of Secondary Processor-Based VM-Execution Controls.
*/
@@ -76,7 +79,7 @@
#define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x00000016
-#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002
+#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000004
#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000
#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
@@ -89,7 +92,7 @@
#define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff
-#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000004
#define VM_ENTRY_IA32E_MODE 0x00000200
#define VM_ENTRY_SMM 0x00000400
#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index 081d909bc495..5d2b9ad2c6d2 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -29,31 +29,13 @@
#else
-#ifdef BUILD_VDSO32
+extern char __vvar_page;
#define DECLARE_VVAR(offset, type, name) \
extern type vvar_ ## name __attribute__((visibility("hidden")));
#define VVAR(name) (vvar_ ## name)
-#else
-
-extern char __vvar_page;
-
-/* Base address of vvars. This is not ABI. */
-#ifdef CONFIG_X86_64
-#define VVAR_ADDRESS (-10*1024*1024 - 4096)
-#else
-#define VVAR_ADDRESS (&__vvar_page)
-#endif
-
-#define DECLARE_VVAR(offset, type, name) \
- static type const * const vvaraddr_ ## name = \
- (void *)(VVAR_ADDRESS + (offset));
-
-#define VVAR(name) (*vvaraddr_ ## name)
-#endif
-
#define DEFINE_VVAR(type, name) \
type name \
__attribute__((section(".vvar_" #name), aligned(16))) __visible
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index e709884d0ef9..ca08a27b90b3 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -343,7 +343,7 @@ HYPERVISOR_memory_op(unsigned int cmd, void *arg)
}
static inline int
-HYPERVISOR_multicall(void *call_list, int nr_calls)
+HYPERVISOR_multicall(void *call_list, uint32_t nr_calls)
{
return _hypercall2(int, multicall, call_list, nr_calls);
}
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index fd9cb7695b5f..3400dbaec3c3 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -54,6 +54,9 @@ typedef unsigned long xen_pfn_t;
#define PRI_xen_pfn "lx"
typedef unsigned long xen_ulong_t;
#define PRI_xen_ulong "lx"
+typedef long xen_long_t;
+#define PRI_xen_long "lx"
+
/* Guest handles for primitive C types. */
__DEFINE_GUEST_HANDLE(uchar, unsigned char);
__DEFINE_GUEST_HANDLE(uint, unsigned int);
diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild
index 09409c44f9a5..3dec769cadf7 100644
--- a/arch/x86/include/uapi/asm/Kbuild
+++ b/arch/x86/include/uapi/asm/Kbuild
@@ -22,6 +22,7 @@ header-y += ipcbuf.h
header-y += ist.h
header-y += kvm.h
header-y += kvm_para.h
+header-y += kvm_perf.h
header-y += ldt.h
header-y += mce.h
header-y += mman.h
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index d3a87780c70b..d7dcef58aefa 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -23,7 +23,10 @@
#define GP_VECTOR 13
#define PF_VECTOR 14
#define MF_VECTOR 16
+#define AC_VECTOR 17
#define MC_VECTOR 18
+#define XM_VECTOR 19
+#define VE_VECTOR 20
/* Select x86 specific features in <linux/kvm.h> */
#define __KVM_HAVE_PIT
diff --git a/arch/x86/include/uapi/asm/kvm_perf.h b/arch/x86/include/uapi/asm/kvm_perf.h
new file mode 100644
index 000000000000..3bb964f88aa1
--- /dev/null
+++ b/arch/x86/include/uapi/asm/kvm_perf.h
@@ -0,0 +1,16 @@
+#ifndef _ASM_X86_KVM_PERF_H
+#define _ASM_X86_KVM_PERF_H
+
+#include <asm/svm.h>
+#include <asm/vmx.h>
+#include <asm/kvm.h>
+
+#define DECODE_STR_LEN 20
+
+#define VCPU_ID "vcpu_id"
+
+#define KVM_ENTRY_TRACE "kvm:kvm_entry"
+#define KVM_EXIT_TRACE "kvm:kvm_exit"
+#define KVM_EXIT_REASON "exit_reason"
+
+#endif /* _ASM_X86_KVM_PERF_H */
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index 5cd1569518ef..eac9e92fe181 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -560,6 +560,7 @@
/* VMX_BASIC bits and bitmasks */
#define VMX_BASIC_VMCS_SIZE_SHIFT 32
+#define VMX_BASIC_TRUE_CTLS (1ULL << 55)
#define VMX_BASIC_64 0x0001000000000000LLU
#define VMX_BASIC_MEM_TYPE_SHIFT 50
#define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU
diff --git a/arch/x86/include/uapi/asm/vsyscall.h b/arch/x86/include/uapi/asm/vsyscall.h
index 85dc1b3825ab..b97dd6e263d2 100644
--- a/arch/x86/include/uapi/asm/vsyscall.h
+++ b/arch/x86/include/uapi/asm/vsyscall.h
@@ -7,11 +7,6 @@ enum vsyscall_num {
__NR_vgetcpu,
};
-#define VSYSCALL_START (-10UL << 20)
-#define VSYSCALL_SIZE 1024
-#define VSYSCALL_END (-2UL << 20)
-#define VSYSCALL_MAPPED_PAGES 1
-#define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr))
-
+#define VSYSCALL_ADDR (-10UL << 20)
#endif /* _UAPI_ASM_X86_VSYSCALL_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index f4d96000d33a..b5ea75c4a4b4 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -26,9 +26,11 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-y += probe_roms.o
obj-$(CONFIG_X86_32) += i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
+obj-$(CONFIG_X86_64) += mcount_64.o
obj-y += syscall_$(BITS).o vsyscall_gtod.o
obj-$(CONFIG_X86_64) += vsyscall_64.o
obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
+obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
obj-$(CONFIG_SYSFS) += ksysfs.o
obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
@@ -104,6 +106,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o
+obj-$(CONFIG_PMC_ATOM) += pmc_atom.o
###
# 64 bit specific files
@@ -115,4 +118,5 @@ ifeq ($(CONFIG_X86_64),y)
obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
obj-y += vsmp_64.o
+ obj-$(CONFIG_KEXEC) += kexec-bzimage64.o
endif
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 163b22581472..3242e591fa82 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,5 +1,6 @@
obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
+obj-$(CONFIG_ACPI_APEI) += apei.o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o
diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c
new file mode 100644
index 000000000000..c280df6b2aa2
--- /dev/null
+++ b/arch/x86/kernel/acpi/apei.c
@@ -0,0 +1,62 @@
+/*
+ * Arch-specific APEI-related functions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <acpi/apei.h>
+
+#include <asm/mce.h>
+#include <asm/tlbflush.h>
+
+int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data)
+{
+#ifdef CONFIG_X86_MCE
+ int i;
+ struct acpi_hest_ia_corrected *cmc;
+ struct acpi_hest_ia_error_bank *mc_bank;
+
+ if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK)
+ return 0;
+
+ cmc = (struct acpi_hest_ia_corrected *)hest_hdr;
+ if (!cmc->enabled)
+ return 0;
+
+ /*
+ * We expect HEST to provide a list of MC banks that report errors
+ * in firmware first mode. Otherwise, return non-zero value to
+ * indicate that we are done parsing HEST.
+ */
+ if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) ||
+ !cmc->num_hardware_banks)
+ return 1;
+
+ pr_info("HEST: Enabling Firmware First mode for corrected errors.\n");
+
+ mc_bank = (struct acpi_hest_ia_error_bank *)(cmc + 1);
+ for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++)
+ mce_disable_bank(mc_bank->bank_number);
+#endif
+ return 1;
+}
+
+void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
+{
+#ifdef CONFIG_X86_MCE
+ apei_mce_report_mem_error(sev, mem_err);
+#endif
+}
+
+void arch_apei_flush_tlb_one(unsigned long addr)
+{
+ __flush_tlb_one(addr);
+}
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 86281ffb96d6..a531f6564ed0 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -74,10 +74,6 @@ int acpi_fix_pin2_polarity __initdata;
static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
#endif
-#ifndef __HAVE_ARCH_CMPXCHG
-#warning ACPI uses CMPXCHG, i486 and later hardware
-#endif
-
/* --------------------------------------------------------------------------
Boot-time Configuration
-------------------------------------------------------------------------- */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index df94598ad05a..703130f469ec 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -5,7 +5,6 @@
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
-#include <linux/kprobes.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
@@ -551,7 +550,7 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
*
* Note: Must be called under text_mutex.
*/
-void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
+void *text_poke(void *addr, const void *opcode, size_t len)
{
unsigned long flags;
char *vaddr;
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index b574b295a2f9..8e3842fc8bea 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -512,7 +512,7 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr,
dma_addr_t dma_addr, struct dma_attrs *attrs)
{
gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL);
- free_pages((unsigned long)vaddr, get_order(size));
+ dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs);
}
static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 9fa8aa051f54..76164e173a24 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -10,6 +10,8 @@
*
* Copyright 2002 Andi Kleen, SuSE Labs.
*/
+#define pr_fmt(fmt) "AGP: " fmt
+
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/init.h>
@@ -75,14 +77,13 @@ static u32 __init allocate_aperture(void)
addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
aper_size, aper_size);
if (!addr) {
- printk(KERN_ERR
- "Cannot allocate aperture memory hole (%lx,%uK)\n",
- addr, aper_size>>10);
+ pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n",
+ addr, addr + aper_size - 1, aper_size >> 10);
return 0;
}
memblock_reserve(addr, aper_size);
- printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
- aper_size >> 10, addr);
+ pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n",
+ addr, addr + aper_size - 1, aper_size >> 10);
register_nosave_region(addr >> PAGE_SHIFT,
(addr+aper_size) >> PAGE_SHIFT);
@@ -126,10 +127,11 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
u64 aper;
u32 old_order;
- printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", bus, slot, func);
+ pr_info("pci 0000:%02x:%02x:%02x: AGP bridge\n", bus, slot, func);
apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14);
if (apsizereg == 0xffffffff) {
- printk(KERN_ERR "APSIZE in AGP bridge unreadable\n");
+ pr_err("pci 0000:%02x:%02x.%d: APSIZE unreadable\n",
+ bus, slot, func);
return 0;
}
@@ -153,16 +155,18 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
* On some sick chips, APSIZE is 0. It means it wants 4G
* so let double check that order, and lets trust AMD NB settings:
*/
- printk(KERN_INFO "Aperture from AGP @ %Lx old size %u MB\n",
- aper, 32 << old_order);
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (old size %uMB)\n",
+ bus, slot, func, aper, aper + (32ULL << (old_order + 20)) - 1,
+ 32 << old_order);
if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) {
- printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n",
- 32 << *order, apsizereg);
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture size %uMB (APSIZE %#x) is not right, using settings from NB\n",
+ bus, slot, func, 32 << *order, apsizereg);
*order = old_order;
}
- printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
- aper, 32 << *order, apsizereg);
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (%uMB, APSIZE %#x)\n",
+ bus, slot, func, aper, aper + (32ULL << (*order + 20)) - 1,
+ 32 << *order, apsizereg);
if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20))
return 0;
@@ -218,7 +222,7 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
}
}
}
- printk(KERN_INFO "No AGP bridge found\n");
+ pr_info("No AGP bridge found\n");
return 0;
}
@@ -310,7 +314,8 @@ void __init early_gart_iommu_check(void)
if (e820_any_mapped(aper_base, aper_base + aper_size,
E820_RAM)) {
/* reserve it, so we can reuse it in second kernel */
- printk(KERN_INFO "update e820 for GART\n");
+ pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n",
+ aper_base, aper_base + aper_size - 1);
e820_add_region(aper_base, aper_size, E820_RESERVED);
update_e820();
}
@@ -354,7 +359,7 @@ int __init gart_iommu_hole_init(void)
!early_pci_allowed())
return -ENODEV;
- printk(KERN_INFO "Checking aperture...\n");
+ pr_info("Checking aperture...\n");
if (!fallback_aper_force)
agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
@@ -395,8 +400,9 @@ int __init gart_iommu_hole_init(void)
aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
aper_base <<= 25;
- printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n",
- node, aper_base, aper_size >> 20);
+ pr_info("Node %d: aperture [bus addr %#010Lx-%#010Lx] (%uMB)\n",
+ node, aper_base, aper_base + aper_size - 1,
+ aper_size >> 20);
node++;
if (!aperture_valid(aper_base, aper_size, 64<<20)) {
@@ -407,9 +413,9 @@ int __init gart_iommu_hole_init(void)
if (!no_iommu &&
max_pfn > MAX_DMA32_PFN &&
!printed_gart_size_msg) {
- printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n");
- printk(KERN_ERR "please increase GART size in your BIOS setup\n");
- printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n");
+ pr_err("you are using iommu with agp, but GART size is less than 64MB\n");
+ pr_err("please increase GART size in your BIOS setup\n");
+ pr_err("if BIOS doesn't have that option, contact your HW vendor!\n");
printed_gart_size_msg = 1;
}
} else {
@@ -446,13 +452,10 @@ out:
force_iommu ||
valid_agp ||
fallback_aper_force) {
- printk(KERN_INFO
- "Your BIOS doesn't leave a aperture memory hole\n");
- printk(KERN_INFO
- "Please enable the IOMMU option in the BIOS setup\n");
- printk(KERN_INFO
- "This costs you %d MB of RAM\n",
- 32 << fallback_aper_order);
+ pr_info("Your BIOS doesn't leave a aperture memory hole\n");
+ pr_info("Please enable the IOMMU option in the BIOS setup\n");
+ pr_info("This costs you %dMB of RAM\n",
+ 32 << fallback_aper_order);
aper_order = fallback_aper_order;
aper_alloc = allocate_aperture();
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index a698d7165c96..6a1e71bde323 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -33,34 +33,44 @@ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
/* "in progress" flag of arch_trigger_all_cpu_backtrace */
static unsigned long backtrace_flag;
-void arch_trigger_all_cpu_backtrace(void)
+void arch_trigger_all_cpu_backtrace(bool include_self)
{
int i;
+ int cpu = get_cpu();
- if (test_and_set_bit(0, &backtrace_flag))
+ if (test_and_set_bit(0, &backtrace_flag)) {
/*
* If there is already a trigger_all_cpu_backtrace() in progress
* (backtrace_flag == 1), don't output double cpu dump infos.
*/
+ put_cpu();
return;
+ }
cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
+ if (!include_self)
+ cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
- printk(KERN_INFO "sending NMI to all CPUs:\n");
- apic->send_IPI_all(NMI_VECTOR);
+ if (!cpumask_empty(to_cpumask(backtrace_mask))) {
+ pr_info("sending NMI to %s CPUs:\n",
+ (include_self ? "all" : "other"));
+ apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR);
+ }
/* Wait for up to 10 seconds for all CPUs to do the backtrace */
for (i = 0; i < 10 * 1000; i++) {
if (cpumask_empty(to_cpumask(backtrace_mask)))
break;
mdelay(1);
+ touch_softlockup_watchdog();
}
clear_bit(0, &backtrace_flag);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
+ put_cpu();
}
-static int __kprobes
+static int
arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
{
int cpu;
@@ -80,6 +90,7 @@ arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
return NMI_DONE;
}
+NOKPROBE_SYMBOL(arch_trigger_all_cpu_backtrace_handler);
static int __init register_trigger_all_cpu_backtrace(void)
{
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 992060e09897..81e08eff05ee 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -206,9 +206,6 @@ int __init arch_early_irq_init(void)
count = ARRAY_SIZE(irq_cfgx);
node = cpu_to_node(0);
- /* Make sure the legacy interrupts are marked in the bitmap */
- irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs);
-
for (i = 0; i < count; i++) {
irq_set_chip_data(i, &cfg[i]);
zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node);
@@ -281,18 +278,6 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
return cfg;
}
-static int alloc_irqs_from(unsigned int from, unsigned int count, int node)
-{
- return irq_alloc_descs_from(from, count, node);
-}
-
-static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
-{
- free_irq_cfg(at, cfg);
- irq_free_desc(at);
-}
-
-
struct io_apic {
unsigned int index;
unsigned int unused[3];
@@ -2312,7 +2297,7 @@ int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
int err;
if (!config_enabled(CONFIG_SMP))
- return -1;
+ return -EPERM;
if (!cpumask_intersects(mask, cpu_online_mask))
return -EINVAL;
@@ -2343,7 +2328,7 @@ int native_ioapic_set_affinity(struct irq_data *data,
int ret;
if (!config_enabled(CONFIG_SMP))
- return -1;
+ return -EPERM;
raw_spin_lock_irqsave(&ioapic_lock, flags);
ret = __ioapic_set_affinity(data, mask, &dest);
@@ -2916,98 +2901,39 @@ static int __init ioapic_init_ops(void)
device_initcall(ioapic_init_ops);
/*
- * Dynamic irq allocate and deallocation
+ * Dynamic irq allocate and deallocation. Should be replaced by irq domains!
*/
-unsigned int __create_irqs(unsigned int from, unsigned int count, int node)
+int arch_setup_hwirq(unsigned int irq, int node)
{
- struct irq_cfg **cfg;
+ struct irq_cfg *cfg;
unsigned long flags;
- int irq, i;
-
- if (from < nr_irqs_gsi)
- from = nr_irqs_gsi;
+ int ret;
- cfg = kzalloc_node(count * sizeof(cfg[0]), GFP_KERNEL, node);
+ cfg = alloc_irq_cfg(irq, node);
if (!cfg)
- return 0;
-
- irq = alloc_irqs_from(from, count, node);
- if (irq < 0)
- goto out_cfgs;
-
- for (i = 0; i < count; i++) {
- cfg[i] = alloc_irq_cfg(irq + i, node);
- if (!cfg[i])
- goto out_irqs;
- }
+ return -ENOMEM;
raw_spin_lock_irqsave(&vector_lock, flags);
- for (i = 0; i < count; i++)
- if (__assign_irq_vector(irq + i, cfg[i], apic->target_cpus()))
- goto out_vecs;
+ ret = __assign_irq_vector(irq, cfg, apic->target_cpus());
raw_spin_unlock_irqrestore(&vector_lock, flags);
- for (i = 0; i < count; i++) {
- irq_set_chip_data(irq + i, cfg[i]);
- irq_clear_status_flags(irq + i, IRQ_NOREQUEST);
- }
-
- kfree(cfg);
- return irq;
-
-out_vecs:
- for (i--; i >= 0; i--)
- __clear_irq_vector(irq + i, cfg[i]);
- raw_spin_unlock_irqrestore(&vector_lock, flags);
-out_irqs:
- for (i = 0; i < count; i++)
- free_irq_at(irq + i, cfg[i]);
-out_cfgs:
- kfree(cfg);
- return 0;
-}
-
-unsigned int create_irq_nr(unsigned int from, int node)
-{
- return __create_irqs(from, 1, node);
-}
-
-int create_irq(void)
-{
- int node = cpu_to_node(0);
- unsigned int irq_want;
- int irq;
-
- irq_want = nr_irqs_gsi;
- irq = create_irq_nr(irq_want, node);
-
- if (irq == 0)
- irq = -1;
-
- return irq;
+ if (!ret)
+ irq_set_chip_data(irq, cfg);
+ else
+ free_irq_cfg(irq, cfg);
+ return ret;
}
-void destroy_irq(unsigned int irq)
+void arch_teardown_hwirq(unsigned int irq)
{
struct irq_cfg *cfg = irq_get_chip_data(irq);
unsigned long flags;
- irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
-
free_remapped_irq(irq);
-
raw_spin_lock_irqsave(&vector_lock, flags);
__clear_irq_vector(irq, cfg);
raw_spin_unlock_irqrestore(&vector_lock, flags);
- free_irq_at(irq, cfg);
-}
-
-void destroy_irqs(unsigned int irq, unsigned int count)
-{
- unsigned int i;
-
- for (i = 0; i < count; i++)
- destroy_irq(irq + i);
+ free_irq_cfg(irq, cfg);
}
/*
@@ -3075,9 +3001,11 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
struct irq_cfg *cfg = data->chip_data;
struct msi_msg msg;
unsigned int dest;
+ int ret;
- if (__ioapic_set_affinity(data, mask, &dest))
- return -1;
+ ret = __ioapic_set_affinity(data, mask, &dest);
+ if (ret)
+ return ret;
__get_cached_msi_msg(data->msi_desc, &msg);
@@ -3136,8 +3064,8 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
- unsigned int irq, irq_want;
struct msi_desc *msidesc;
+ unsigned int irq;
int node, ret;
/* Multiple MSI vectors only supported with interrupt remapping */
@@ -3145,28 +3073,25 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
return 1;
node = dev_to_node(&dev->dev);
- irq_want = nr_irqs_gsi;
+
list_for_each_entry(msidesc, &dev->msi_list, list) {
- irq = create_irq_nr(irq_want, node);
- if (irq == 0)
+ irq = irq_alloc_hwirq(node);
+ if (!irq)
return -ENOSPC;
- irq_want = irq + 1;
-
ret = setup_msi_irq(dev, msidesc, irq, 0);
- if (ret < 0)
- goto error;
+ if (ret < 0) {
+ irq_free_hwirq(irq);
+ return ret;
+ }
+
}
return 0;
-
-error:
- destroy_irq(irq);
- return ret;
}
void native_teardown_msi_irq(unsigned int irq)
{
- destroy_irq(irq);
+ irq_free_hwirq(irq);
}
#ifdef CONFIG_DMAR_TABLE
@@ -3177,9 +3102,11 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
struct irq_cfg *cfg = data->chip_data;
unsigned int dest, irq = data->irq;
struct msi_msg msg;
+ int ret;
- if (__ioapic_set_affinity(data, mask, &dest))
- return -1;
+ ret = __ioapic_set_affinity(data, mask, &dest);
+ if (ret)
+ return ret;
dmar_msi_read(irq, &msg);
@@ -3226,9 +3153,11 @@ static int hpet_msi_set_affinity(struct irq_data *data,
struct irq_cfg *cfg = data->chip_data;
struct msi_msg msg;
unsigned int dest;
+ int ret;
- if (__ioapic_set_affinity(data, mask, &dest))
- return -1;
+ ret = __ioapic_set_affinity(data, mask, &dest);
+ if (ret)
+ return ret;
hpet_msi_read(data->handler_data, &msg);
@@ -3295,9 +3224,11 @@ ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
{
struct irq_cfg *cfg = data->chip_data;
unsigned int dest;
+ int ret;
- if (__ioapic_set_affinity(data, mask, &dest))
- return -1;
+ ret = __ioapic_set_affinity(data, mask, &dest);
+ if (ret)
+ return ret;
target_ht_irq(data->irq, dest, cfg->vector);
return IRQ_SET_MASK_OK_NOCOPY;
@@ -3420,11 +3351,6 @@ static void __init probe_nr_irqs_gsi(void)
printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
}
-int get_nr_irqs_gsi(void)
-{
- return nr_irqs_gsi;
-}
-
unsigned int arch_dynirq_lower_bound(unsigned int from)
{
return from < nr_irqs_gsi ? nr_irqs_gsi : from;
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 7834389ba5be..293b41df54ef 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -5,7 +5,7 @@
*
* SGI UV APIC functions (note: not an Intel compatible APIC)
*
- * Copyright (C) 2007-2013 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved.
*/
#include <linux/cpumask.h>
#include <linux/hardirq.h>
@@ -440,6 +440,20 @@ static __initdata struct redir_addr redir_addrs[] = {
{UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR},
};
+static unsigned char get_n_lshift(int m_val)
+{
+ union uv3h_gr0_gam_gr_config_u m_gr_config;
+
+ if (is_uv1_hub())
+ return m_val;
+
+ if (is_uv2_hub())
+ return m_val == 40 ? 40 : 39;
+
+ m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG);
+ return m_gr_config.s3.m_skt;
+}
+
static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
{
union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias;
@@ -849,6 +863,7 @@ void __init uv_system_init(void)
int gnode_extra, min_pnode = 999999, max_pnode = -1;
unsigned long mmr_base, present, paddr;
unsigned short pnode_mask;
+ unsigned char n_lshift;
char *hub = (is_uv1_hub() ? "UV1" :
(is_uv2_hub() ? "UV2" :
"UV3"));
@@ -860,6 +875,7 @@ void __init uv_system_init(void)
m_val = m_n_config.s.m_skt;
n_val = m_n_config.s.n_skt;
pnode_mask = (1 << n_val) - 1;
+ n_lshift = get_n_lshift(m_val);
mmr_base =
uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
~UV_MMR_ENABLE;
@@ -867,8 +883,9 @@ void __init uv_system_init(void)
node_id.v = uv_read_local_mmr(UVH_NODE_ID);
gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
gnode_upper = ((unsigned long)gnode_extra << m_val);
- pr_info("UV: N:%d M:%d pnode_mask:0x%x gnode_upper/extra:0x%lx/0x%x\n",
- n_val, m_val, pnode_mask, gnode_upper, gnode_extra);
+ pr_info("UV: N:%d M:%d pnode_mask:0x%x gnode_upper/extra:0x%lx/0x%x n_lshift 0x%x\n",
+ n_val, m_val, pnode_mask, gnode_upper, gnode_extra,
+ n_lshift);
pr_info("UV: global MMR base 0x%lx\n", mmr_base);
@@ -935,8 +952,7 @@ void __init uv_system_init(void)
uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision;
uv_cpu_hub_info(cpu)->m_shift = 64 - m_val;
- uv_cpu_hub_info(cpu)->n_lshift = is_uv2_1_hub() ?
- (m_val == 40 ? 40 : 39) : m_val;
+ uv_cpu_hub_info(cpu)->n_lshift = n_lshift;
pnode = uv_apicid_to_pnode(apicid);
blade = boot_pnode_to_blade(pnode);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 3ab03430211d..584874451414 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -841,24 +841,12 @@ static int apm_do_idle(void)
u32 eax;
u8 ret = 0;
int idled = 0;
- int polling;
int err = 0;
- polling = !!(current_thread_info()->status & TS_POLLING);
- if (polling) {
- current_thread_info()->status &= ~TS_POLLING;
- /*
- * TS_POLLING-cleared state must be visible before we
- * test NEED_RESCHED:
- */
- smp_mb();
- }
if (!need_resched()) {
idled = 1;
ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err);
}
- if (polling)
- current_thread_info()->status |= TS_POLLING;
if (!idled)
return 0;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index ce8b8ff0e0ef..60e5497681f5 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -8,6 +8,7 @@
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cpu.h>
+#include <asm/smp.h>
#include <asm/pci-direct.h>
#ifdef CONFIG_X86_64
@@ -50,7 +51,6 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
return wrmsr_safe_regs(gprs);
}
-#ifdef CONFIG_X86_32
/*
* B step AMD K6 before B 9730xxxx have hardware bugs that can cause
* misexecution of code under Linux. Owners of such processors should
@@ -70,6 +70,7 @@ __asm__(".globl vide\n\t.align 4\nvide: ret");
static void init_amd_k5(struct cpuinfo_x86 *c)
{
+#ifdef CONFIG_X86_32
/*
* General Systems BIOSen alias the cpu frequency registers
* of the Elan at 0x000df000. Unfortuantly, one of the Linux
@@ -83,11 +84,12 @@ static void init_amd_k5(struct cpuinfo_x86 *c)
if (inl(CBAR) & CBAR_ENB)
outl(0 | CBAR_KEY, CBAR);
}
+#endif
}
-
static void init_amd_k6(struct cpuinfo_x86 *c)
{
+#ifdef CONFIG_X86_32
u32 l, h;
int mbytes = get_num_physpages() >> (20-PAGE_SHIFT);
@@ -176,10 +178,44 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
/* placeholder for any needed mods */
return;
}
+#endif
}
-static void amd_k7_smp_check(struct cpuinfo_x86 *c)
+static void init_amd_k7(struct cpuinfo_x86 *c)
{
+#ifdef CONFIG_X86_32
+ u32 l, h;
+
+ /*
+ * Bit 15 of Athlon specific MSR 15, needs to be 0
+ * to enable SSE on Palomino/Morgan/Barton CPU's.
+ * If the BIOS didn't enable it already, enable it here.
+ */
+ if (c->x86_model >= 6 && c->x86_model <= 10) {
+ if (!cpu_has(c, X86_FEATURE_XMM)) {
+ printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
+ msr_clear_bit(MSR_K7_HWCR, 15);
+ set_cpu_cap(c, X86_FEATURE_XMM);
+ }
+ }
+
+ /*
+ * It's been determined by AMD that Athlons since model 8 stepping 1
+ * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
+ * As per AMD technical note 27212 0.2
+ */
+ if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
+ rdmsr(MSR_K7_CLK_CTL, l, h);
+ if ((l & 0xfff00000) != 0x20000000) {
+ printk(KERN_INFO
+ "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
+ l, ((l & 0x000fffff)|0x20000000));
+ wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
+ }
+ }
+
+ set_cpu_cap(c, X86_FEATURE_K7);
+
/* calling is from identify_secondary_cpu() ? */
if (!c->cpu_index)
return;
@@ -207,7 +243,7 @@ static void amd_k7_smp_check(struct cpuinfo_x86 *c)
if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
((c->x86_model == 7) && (c->x86_mask >= 1)) ||
(c->x86_model > 7))
- if (cpu_has_mp)
+ if (cpu_has(c, X86_FEATURE_MP))
return;
/* If we get here, not a certified SMP capable AMD system. */
@@ -219,45 +255,8 @@ static void amd_k7_smp_check(struct cpuinfo_x86 *c)
WARN_ONCE(1, "WARNING: This combination of AMD"
" processors is not suitable for SMP.\n");
add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
-}
-
-static void init_amd_k7(struct cpuinfo_x86 *c)
-{
- u32 l, h;
-
- /*
- * Bit 15 of Athlon specific MSR 15, needs to be 0
- * to enable SSE on Palomino/Morgan/Barton CPU's.
- * If the BIOS didn't enable it already, enable it here.
- */
- if (c->x86_model >= 6 && c->x86_model <= 10) {
- if (!cpu_has(c, X86_FEATURE_XMM)) {
- printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
- msr_clear_bit(MSR_K7_HWCR, 15);
- set_cpu_cap(c, X86_FEATURE_XMM);
- }
- }
-
- /*
- * It's been determined by AMD that Athlons since model 8 stepping 1
- * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
- * As per AMD technical note 27212 0.2
- */
- if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
- rdmsr(MSR_K7_CLK_CTL, l, h);
- if ((l & 0xfff00000) != 0x20000000) {
- printk(KERN_INFO
- "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
- l, ((l & 0x000fffff)|0x20000000));
- wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
- }
- }
-
- set_cpu_cap(c, X86_FEATURE_K7);
-
- amd_k7_smp_check(c);
-}
#endif
+}
#ifdef CONFIG_NUMA
/*
@@ -446,6 +445,26 @@ static void early_init_amd_mc(struct cpuinfo_x86 *c)
static void bsp_init_amd(struct cpuinfo_x86 *c)
{
+
+#ifdef CONFIG_X86_64
+ if (c->x86 >= 0xf) {
+ unsigned long long tseg;
+
+ /*
+ * Split up direct mapping around the TSEG SMM area.
+ * Don't do it for gbpages because there seems very little
+ * benefit in doing so.
+ */
+ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
+ unsigned long pfn = tseg >> PAGE_SHIFT;
+
+ printk(KERN_DEBUG "tseg: %010llx\n", tseg);
+ if (pfn_range_is_mapped(pfn, pfn + 1))
+ set_memory_4k((unsigned long)__va(tseg), 1);
+ }
+ }
+#endif
+
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
if (c->x86 > 0x10 ||
@@ -515,101 +534,74 @@ static const int amd_erratum_383[];
static const int amd_erratum_400[];
static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
-static void init_amd(struct cpuinfo_x86 *c)
+static void init_amd_k8(struct cpuinfo_x86 *c)
{
- u32 dummy;
- unsigned long long value;
+ u32 level;
+ u64 value;
-#ifdef CONFIG_SMP
- /*
- * Disable TLB flush filter by setting HWCR.FFDIS on K8
- * bit 6 of msr C001_0015
- *
- * Errata 63 for SH-B3 steppings
- * Errata 122 for all steppings (F+ have it disabled by default)
- */
- if (c->x86 == 0xf)
- msr_set_bit(MSR_K7_HWCR, 6);
-#endif
-
- early_init_amd(c);
+ /* On C+ stepping K8 rep microcode works well for copy/memset */
+ level = cpuid_eax(1);
+ if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
/*
- * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
- * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ * Some BIOSes incorrectly force this feature, but only K8 revision D
+ * (model = 0x14) and later actually support it.
+ * (AMD Erratum #110, docId: 25759).
*/
- clear_cpu_cap(c, 0*32+31);
-
-#ifdef CONFIG_X86_64
- /* On C+ stepping K8 rep microcode works well for copy/memset */
- if (c->x86 == 0xf) {
- u32 level;
-
- level = cpuid_eax(1);
- if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-
- /*
- * Some BIOSes incorrectly force this feature, but only K8
- * revision D (model = 0x14) and later actually support it.
- * (AMD Erratum #110, docId: 25759).
- */
- if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
- clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
- if (!rdmsrl_amd_safe(0xc001100d, &value)) {
- value &= ~(1ULL << 32);
- wrmsrl_amd_safe(0xc001100d, value);
- }
+ if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
+ clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
+ if (!rdmsrl_amd_safe(0xc001100d, &value)) {
+ value &= ~BIT_64(32);
+ wrmsrl_amd_safe(0xc001100d, value);
}
-
}
- if (c->x86 >= 0x10)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- /* get apicid instead of initial apic id from cpuid */
- c->apicid = hard_smp_processor_id();
-#else
+ if (!c->x86_model_id[0])
+ strcpy(c->x86_model_id, "Hammer");
+}
+
+static void init_amd_gh(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_64
+ /* do this for boot cpu */
+ if (c == &boot_cpu_data)
+ check_enable_amd_mmconf_dmi();
+
+ fam10h_check_enable_mmcfg();
+#endif
/*
- * FIXME: We should handle the K5 here. Set up the write
- * range and also turn on MSR 83 bits 4 and 31 (write alloc,
- * no bus pipeline)
+ * Disable GART TLB Walk Errors on Fam10h. We do this here because this
+ * is always needed when GART is enabled, even in a kernel which has no
+ * MCE support built in. BIOS should disable GartTlbWlk Errors already.
+ * If it doesn't, we do it here as suggested by the BKDG.
+ *
+ * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
*/
+ msr_set_bit(MSR_AMD64_MCx_MASK(4), 10);
- switch (c->x86) {
- case 4:
- init_amd_k5(c);
- break;
- case 5:
- init_amd_k6(c);
- break;
- case 6: /* An Athlon/Duron */
- init_amd_k7(c);
- break;
- }
+ /*
+ * On family 10h BIOS may not have properly enabled WC+ support, causing
+ * it to be converted to CD memtype. This may result in performance
+ * degradation for certain nested-paging guests. Prevent this conversion
+ * by clearing bit 24 in MSR_AMD64_BU_CFG2.
+ *
+ * NOTE: we want to use the _safe accessors so as not to #GP kvm
+ * guests on older kvm hosts.
+ */
+ msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
- /* K6s reports MCEs but don't actually have all the MSRs */
- if (c->x86 < 6)
- clear_cpu_cap(c, X86_FEATURE_MCE);
-#endif
+ if (cpu_has_amd_erratum(c, amd_erratum_383))
+ set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
+}
- /* Enable workaround for FXSAVE leak */
- if (c->x86 >= 6)
- set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
-
- if (!c->x86_model_id[0]) {
- switch (c->x86) {
- case 0xf:
- /* Should distinguish Models here, but this is only
- a fallback anyways. */
- strcpy(c->x86_model_id, "Hammer");
- break;
- }
- }
+static void init_amd_bd(struct cpuinfo_x86 *c)
+{
+ u64 value;
/* re-enable TopologyExtensions if switched off by BIOS */
- if ((c->x86 == 0x15) &&
- (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) &&
+ if ((c->x86_model >= 0x10) && (c->x86_model <= 0x1f) &&
!cpu_has(c, X86_FEATURE_TOPOEXT)) {
if (msr_set_bit(0xc0011005, 54) > 0) {
@@ -625,14 +617,60 @@ static void init_amd(struct cpuinfo_x86 *c)
* The way access filter has a performance penalty on some workloads.
* Disable it on the affected CPUs.
*/
- if ((c->x86 == 0x15) &&
- (c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
-
+ if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
if (!rdmsrl_safe(0xc0011021, &value) && !(value & 0x1E)) {
value |= 0x1E;
wrmsrl_safe(0xc0011021, value);
}
}
+}
+
+static void init_amd(struct cpuinfo_x86 *c)
+{
+ u32 dummy;
+
+#ifdef CONFIG_SMP
+ /*
+ * Disable TLB flush filter by setting HWCR.FFDIS on K8
+ * bit 6 of msr C001_0015
+ *
+ * Errata 63 for SH-B3 steppings
+ * Errata 122 for all steppings (F+ have it disabled by default)
+ */
+ if (c->x86 == 0xf)
+ msr_set_bit(MSR_K7_HWCR, 6);
+#endif
+
+ early_init_amd(c);
+
+ /*
+ * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ */
+ clear_cpu_cap(c, 0*32+31);
+
+ if (c->x86 >= 0x10)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /* get apicid instead of initial apic id from cpuid */
+ c->apicid = hard_smp_processor_id();
+
+ /* K6s reports MCEs but don't actually have all the MSRs */
+ if (c->x86 < 6)
+ clear_cpu_cap(c, X86_FEATURE_MCE);
+
+ switch (c->x86) {
+ case 4: init_amd_k5(c); break;
+ case 5: init_amd_k6(c); break;
+ case 6: init_amd_k7(c); break;
+ case 0xf: init_amd_k8(c); break;
+ case 0x10: init_amd_gh(c); break;
+ case 0x15: init_amd_bd(c); break;
+ }
+
+ /* Enable workaround for FXSAVE leak */
+ if (c->x86 >= 6)
+ set_cpu_bug(c, X86_BUG_FXSAVE_LEAK);
cpu_detect_cache_sizes(c);
@@ -656,33 +694,6 @@ static void init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
}
-#ifdef CONFIG_X86_64
- if (c->x86 == 0x10) {
- /* do this for boot cpu */
- if (c == &boot_cpu_data)
- check_enable_amd_mmconf_dmi();
-
- fam10h_check_enable_mmcfg();
- }
-
- if (c == &boot_cpu_data && c->x86 >= 0xf) {
- unsigned long long tseg;
-
- /*
- * Split up direct mapping around the TSEG SMM area.
- * Don't do it for gbpages because there seems very little
- * benefit in doing so.
- */
- if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
- unsigned long pfn = tseg >> PAGE_SHIFT;
-
- printk(KERN_DEBUG "tseg: %010llx\n", tseg);
- if (pfn_range_is_mapped(pfn, pfn + 1))
- set_memory_4k((unsigned long)__va(tseg), 1);
- }
- }
-#endif
-
/*
* Family 0x12 and above processors have APIC timer
* running in deep C states.
@@ -690,34 +701,6 @@ static void init_amd(struct cpuinfo_x86 *c)
if (c->x86 > 0x11)
set_cpu_cap(c, X86_FEATURE_ARAT);
- if (c->x86 == 0x10) {
- /*
- * Disable GART TLB Walk Errors on Fam10h. We do this here
- * because this is always needed when GART is enabled, even in a
- * kernel which has no MCE support built in.
- * BIOS should disable GartTlbWlk Errors already. If
- * it doesn't, do it here as suggested by the BKDG.
- *
- * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
- */
- msr_set_bit(MSR_AMD64_MCx_MASK(4), 10);
-
- /*
- * On family 10h BIOS may not have properly enabled WC+ support,
- * causing it to be converted to CD memtype. This may result in
- * performance degradation for certain nested-paging guests.
- * Prevent this conversion by clearing bit 24 in
- * MSR_AMD64_BU_CFG2.
- *
- * NOTE: we want to use the _safe accessors so as not to #GP kvm
- * guests on older kvm hosts.
- */
- msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
-
- if (cpu_has_amd_erratum(c, amd_erratum_383))
- set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
- }
-
if (cpu_has_amd_erratum(c, amd_erratum_400))
set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
@@ -741,11 +724,6 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
}
#endif
-static void cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c)
-{
- tlb_flushall_shift = 6;
-}
-
static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
{
u32 ebx, eax, ecx, edx;
@@ -793,8 +771,6 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
tlb_lli_2m[ENTRIES] = eax & mask;
tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
-
- cpu_set_tlb_flushall_shift(c);
}
static const struct cpu_dev amd_cpu_dev = {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cdc95852532d..e4ab2b42bd6f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -8,6 +8,7 @@
#include <linux/delay.h>
#include <linux/sched.h>
#include <linux/init.h>
+#include <linux/kprobes.h>
#include <linux/kgdb.h>
#include <linux/smp.h>
#include <linux/io.h>
@@ -20,6 +21,7 @@
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <asm/sections.h>
+#include <asm/vsyscall.h>
#include <linux/topology.h>
#include <linux/cpumask.h>
#include <asm/pgtable.h>
@@ -487,26 +489,17 @@ u16 __read_mostly tlb_lld_2m[NR_INFO];
u16 __read_mostly tlb_lld_4m[NR_INFO];
u16 __read_mostly tlb_lld_1g[NR_INFO];
-/*
- * tlb_flushall_shift shows the balance point in replacing cr3 write
- * with multiple 'invlpg'. It will do this replacement when
- * flush_tlb_lines <= active_lines/2^tlb_flushall_shift.
- * If tlb_flushall_shift is -1, means the replacement will be disabled.
- */
-s8 __read_mostly tlb_flushall_shift = -1;
-
void cpu_detect_tlb(struct cpuinfo_x86 *c)
{
if (this_cpu->c_detect_tlb)
this_cpu->c_detect_tlb(c);
printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n"
- "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n"
- "tlb_flushall_shift: %d\n",
+ "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
- tlb_lld_1g[ENTRIES], tlb_flushall_shift);
+ tlb_lld_1g[ENTRIES]);
}
void detect_ht(struct cpuinfo_x86 *c)
@@ -970,6 +963,38 @@ static void vgetcpu_set_mode(void)
else
vgetcpu_mode = VGETCPU_LSL;
}
+
+/* May not be __init: called during resume */
+static void syscall32_cpu_init(void)
+{
+ /* Load these always in case some future AMD CPU supports
+ SYSENTER from compat mode too. */
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
+
+ wrmsrl(MSR_CSTAR, ia32_cstar_target);
+}
+#endif
+
+#ifdef CONFIG_X86_32
+void enable_sep_cpu(void)
+{
+ int cpu = get_cpu();
+ struct tss_struct *tss = &per_cpu(init_tss, cpu);
+
+ if (!boot_cpu_has(X86_FEATURE_SEP)) {
+ put_cpu();
+ return;
+ }
+
+ tss->x86_tss.ss1 = __KERNEL_CS;
+ tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss;
+ wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
+ wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
+ wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
+ put_cpu();
+}
#endif
void __init identify_boot_cpu(void)
@@ -1177,6 +1202,7 @@ int is_debug_stack(unsigned long addr)
(addr <= __get_cpu_var(debug_stack_addr) &&
addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
}
+NOKPROBE_SYMBOL(is_debug_stack);
DEFINE_PER_CPU(u32, debug_idt_ctr);
@@ -1185,6 +1211,7 @@ void debug_stack_set_zero(void)
this_cpu_inc(debug_idt_ctr);
load_current_idt();
}
+NOKPROBE_SYMBOL(debug_stack_set_zero);
void debug_stack_reset(void)
{
@@ -1193,6 +1220,7 @@ void debug_stack_reset(void)
if (this_cpu_dec_return(debug_idt_ctr) == 0)
load_current_idt();
}
+NOKPROBE_SYMBOL(debug_stack_reset);
#else /* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index a80029035bf2..74e804ddc5c7 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -253,7 +253,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
*/
if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
(c->x86_mask < 0x6 || c->x86_mask == 0xb))
- set_cpu_cap(c, X86_FEATURE_11AP);
+ set_cpu_bug(c, X86_BUG_11AP);
#ifdef CONFIG_X86_INTEL_USERCOPY
@@ -370,6 +370,17 @@ static void init_intel(struct cpuinfo_x86 *c)
*/
detect_extended_topology(c);
+ if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
+ /*
+ * let's use the legacy cpuid vector 0x1 and 0x4 for topology
+ * detection.
+ */
+ c->x86_max_cores = intel_num_cpu_cores(c);
+#ifdef CONFIG_X86_32
+ detect_ht(c);
+#endif
+ }
+
l2 = init_intel_cacheinfo(c);
if (c->cpuid_level > 9) {
unsigned eax = cpuid_eax(10);
@@ -391,7 +402,7 @@ static void init_intel(struct cpuinfo_x86 *c)
if (c->x86 == 6 && cpu_has_clflush &&
(c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47))
- set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR);
+ set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR);
#ifdef CONFIG_X86_64
if (c->x86 == 15)
@@ -438,17 +449,6 @@ static void init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_P3);
#endif
- if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
- /*
- * let's use the legacy cpuid vector 0x1 and 0x4 for topology
- * detection.
- */
- c->x86_max_cores = intel_num_cpu_cores(c);
-#ifdef CONFIG_X86_32
- detect_ht(c);
-#endif
- }
-
/* Work around errata */
srat_detect_node(c);
@@ -634,31 +634,6 @@ static void intel_tlb_lookup(const unsigned char desc)
}
}
-static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c)
-{
- switch ((c->x86 << 8) + c->x86_model) {
- case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
- case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
- case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
- case 0x61d: /* six-core 45 nm xeon "Dunnington" */
- tlb_flushall_shift = -1;
- break;
- case 0x63a: /* Ivybridge */
- tlb_flushall_shift = 2;
- break;
- case 0x61a: /* 45 nm nehalem, "Bloomfield" */
- case 0x61e: /* 45 nm nehalem, "Lynnfield" */
- case 0x625: /* 32 nm nehalem, "Clarkdale" */
- case 0x62c: /* 32 nm nehalem, "Gulftown" */
- case 0x62e: /* 45 nm nehalem-ex, "Beckton" */
- case 0x62f: /* 32 nm Xeon E7 */
- case 0x62a: /* SandyBridge */
- case 0x62d: /* SandyBridge, "Romely-EP" */
- default:
- tlb_flushall_shift = 6;
- }
-}
-
static void intel_detect_tlb(struct cpuinfo_x86 *c)
{
int i, j, n;
@@ -683,7 +658,6 @@ static void intel_detect_tlb(struct cpuinfo_x86 *c)
for (j = 1 ; j < 16 ; j++)
intel_tlb_lookup(desc[j]);
}
- intel_tlb_flushall_shift_set(c);
}
static const struct cpu_dev intel_cpu_dev = {
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index a952e9c85b6f..c7035073dfc1 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -461,7 +461,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
- if (strict_strtoul(buf, 10, &val) < 0)
+ if (kstrtoul(buf, 10, &val) < 0)
return -EINVAL;
err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val);
@@ -511,7 +511,7 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
return -EINVAL;
- if (strict_strtoul(buf, 16, &val) < 0)
+ if (kstrtoul(buf, 16, &val) < 0)
return -EINVAL;
if (amd_set_subcaches(cpu, val))
@@ -730,6 +730,18 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
#endif
}
+#ifdef CONFIG_X86_HT
+ /*
+ * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in
+ * turns means that the only possibility is SMT (as indicated in
+ * cpuid1). Since cpuid2 doesn't specify shared caches, and we know
+ * that SMT shares all caches, we can unconditionally set cpu_llc_id to
+ * c->phys_proc_id.
+ */
+ if (per_cpu(cpu_llc_id, cpu) == BAD_APICID)
+ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
+#endif
+
c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
return l2;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 68317c80de7f..bd9ccda8087f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -60,8 +60,6 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
#define SPINUNIT 100 /* 100ns */
-atomic_t mce_entry;
-
DEFINE_PER_CPU(unsigned, mce_exception_count);
struct mce_bank *mce_banks __read_mostly;
@@ -704,8 +702,7 @@ static int mce_timed_out(u64 *t)
if (!mca_cfg.monarch_timeout)
goto out;
if ((s64)*t < SPINUNIT) {
- /* CHECKME: Make panic default for 1 too? */
- if (mca_cfg.tolerant < 1)
+ if (mca_cfg.tolerant <= 1)
mce_panic("Timeout synchronizing machine check over CPUs",
NULL, NULL);
cpu_missing = 1;
@@ -1041,8 +1038,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
char *msg = "Unknown";
- atomic_inc(&mce_entry);
-
this_cpu_inc(mce_exception_count);
if (!cfg->banks)
@@ -1172,7 +1167,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
mce_report_event(regs);
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
- atomic_dec(&mce_entry);
sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);
@@ -2142,7 +2136,7 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr,
{
u64 new;
- if (strict_strtoull(buf, 0, &new) < 0)
+ if (kstrtou64(buf, 0, &new) < 0)
return -EINVAL;
attr_to_bank(attr)->ctl = new;
@@ -2180,7 +2174,7 @@ static ssize_t set_ignore_ce(struct device *s,
{
u64 new;
- if (strict_strtoull(buf, 0, &new) < 0)
+ if (kstrtou64(buf, 0, &new) < 0)
return -EINVAL;
if (mca_cfg.ignore_ce ^ !!new) {
@@ -2204,7 +2198,7 @@ static ssize_t set_cmci_disabled(struct device *s,
{
u64 new;
- if (strict_strtoull(buf, 0, &new) < 0)
+ if (kstrtou64(buf, 0, &new) < 0)
return -EINVAL;
if (mca_cfg.cmci_disabled ^ !!new) {
@@ -2391,6 +2385,10 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
threshold_cpu_callback(action, cpu);
mce_device_remove(cpu);
mce_intel_hcpu_update(cpu);
+
+ /* intentionally ignoring frozen here */
+ if (!(action & CPU_TASKS_FROZEN))
+ cmci_rediscover();
break;
case CPU_DOWN_PREPARE:
smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
@@ -2402,11 +2400,6 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
}
- if (action == CPU_POST_DEAD) {
- /* intentionally ignoring frozen here */
- cmci_rediscover();
- }
-
return NOTIFY_OK;
}
@@ -2437,32 +2430,67 @@ static __init int mcheck_init_device(void)
int err;
int i = 0;
- if (!mce_available(&boot_cpu_data))
- return -EIO;
+ if (!mce_available(&boot_cpu_data)) {
+ err = -EIO;
+ goto err_out;
+ }
- zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
+ if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
mce_init_banks();
err = subsys_system_register(&mce_subsys, NULL);
if (err)
- return err;
+ goto err_out_mem;
cpu_notifier_register_begin();
for_each_online_cpu(i) {
err = mce_device_create(i);
if (err) {
+ /*
+ * Register notifier anyway (and do not unreg it) so
+ * that we don't leave undeleted timers, see notifier
+ * callback above.
+ */
+ __register_hotcpu_notifier(&mce_cpu_notifier);
cpu_notifier_register_done();
- return err;
+ goto err_device_create;
}
}
- register_syscore_ops(&mce_syscore_ops);
__register_hotcpu_notifier(&mce_cpu_notifier);
cpu_notifier_register_done();
+ register_syscore_ops(&mce_syscore_ops);
+
/* register character device /dev/mcelog */
- misc_register(&mce_chrdev_device);
+ err = misc_register(&mce_chrdev_device);
+ if (err)
+ goto err_register;
+
+ return 0;
+
+err_register:
+ unregister_syscore_ops(&mce_syscore_ops);
+
+err_device_create:
+ /*
+ * We didn't keep track of which devices were created above, but
+ * even if we had, the set of online cpus might have changed.
+ * Play safe and remove for every possible cpu, since
+ * mce_device_remove() will do the right thing.
+ */
+ for_each_possible_cpu(i)
+ mce_device_remove(i);
+
+err_out_mem:
+ free_cpumask_var(mce_device_initialized);
+
+err_out:
+ pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
return err;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 603df4f74640..1e49f8f41276 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -353,7 +353,7 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
if (!b->interrupt_capable)
return -EINVAL;
- if (strict_strtoul(buf, 0, &new) < 0)
+ if (kstrtoul(buf, 0, &new) < 0)
return -EINVAL;
b->interrupt_enable = !!new;
@@ -372,7 +372,7 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
struct thresh_restart tr;
unsigned long new;
- if (strict_strtoul(buf, 0, &new) < 0)
+ if (kstrtoul(buf, 0, &new) < 0)
return -EINVAL;
if (new > THRESHOLD_MAX)
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 9a316b21df8b..3bdb95ae8c43 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -42,7 +42,7 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
* cmci_discover_lock protects against parallel discovery attempts
* which could race against each other.
*/
-static DEFINE_SPINLOCK(cmci_discover_lock);
+static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
#define CMCI_THRESHOLD 1
#define CMCI_POLL_INTERVAL (30 * HZ)
@@ -144,14 +144,14 @@ static void cmci_storm_disable_banks(void)
int bank;
u64 val;
- spin_lock_irqsave(&cmci_discover_lock, flags);
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
owned = __get_cpu_var(mce_banks_owned);
for_each_set_bit(bank, owned, MAX_NR_BANKS) {
rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
val &= ~MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
}
- spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
static bool cmci_storm_detect(void)
@@ -211,7 +211,7 @@ static void cmci_discover(int banks)
int i;
int bios_wrong_thresh = 0;
- spin_lock_irqsave(&cmci_discover_lock, flags);
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++) {
u64 val;
int bios_zero_thresh = 0;
@@ -266,7 +266,7 @@ static void cmci_discover(int banks)
WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
}
}
- spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
pr_info_once(
"bios_cmci_threshold: Some banks do not have valid thresholds set\n");
@@ -316,10 +316,10 @@ void cmci_clear(void)
if (!cmci_supported(&banks))
return;
- spin_lock_irqsave(&cmci_discover_lock, flags);
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++)
__cmci_disable_bank(i);
- spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
static void cmci_rediscover_work_func(void *arg)
@@ -360,9 +360,9 @@ void cmci_disable_bank(int bank)
if (!cmci_supported(&banks))
return;
- spin_lock_irqsave(&cmci_discover_lock, flags);
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
__cmci_disable_bank(bank);
- spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
static void intel_init_cmci(void)
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 15c987698b0f..dd9d6190b08d 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -97,6 +97,9 @@ MODULE_LICENSE("GPL");
static struct microcode_ops *microcode_ops;
+bool dis_ucode_ldr;
+module_param(dis_ucode_ldr, bool, 0);
+
/*
* Synchronization.
*
@@ -546,6 +549,9 @@ static int __init microcode_init(void)
struct cpuinfo_x86 *c = &cpu_data(0);
int error;
+ if (dis_ucode_ldr)
+ return 0;
+
if (c->x86_vendor == X86_VENDOR_INTEL)
microcode_ops = init_intel_microcode();
else if (c->x86_vendor == X86_VENDOR_AMD)
diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c
index be7f8514f577..5f28a64e71ea 100644
--- a/arch/x86/kernel/cpu/microcode/core_early.c
+++ b/arch/x86/kernel/cpu/microcode/core_early.c
@@ -17,9 +17,11 @@
* 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
+#include <asm/microcode.h>
#include <asm/microcode_intel.h>
#include <asm/microcode_amd.h>
#include <asm/processor.h>
+#include <asm/cmdline.h>
#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
@@ -72,10 +74,33 @@ static int x86_family(void)
return x86;
}
+static bool __init check_loader_disabled_bsp(void)
+{
+#ifdef CONFIG_X86_32
+ const char *cmdline = (const char *)__pa_nodebug(boot_command_line);
+ const char *opt = "dis_ucode_ldr";
+ const char *option = (const char *)__pa_nodebug(opt);
+ bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr);
+
+#else /* CONFIG_X86_64 */
+ const char *cmdline = boot_command_line;
+ const char *option = "dis_ucode_ldr";
+ bool *res = &dis_ucode_ldr;
+#endif
+
+ if (cmdline_find_option_bool(cmdline, option))
+ *res = true;
+
+ return *res;
+}
+
void __init load_ucode_bsp(void)
{
int vendor, x86;
+ if (check_loader_disabled_bsp())
+ return;
+
if (!have_cpuid_p())
return;
@@ -96,10 +121,22 @@ void __init load_ucode_bsp(void)
}
}
+static bool check_loader_disabled_ap(void)
+{
+#ifdef CONFIG_X86_32
+ return __pa_nodebug(dis_ucode_ldr);
+#else
+ return dis_ucode_ldr;
+#endif
+}
+
void load_ucode_ap(void)
{
int vendor, x86;
+ if (check_loader_disabled_ap())
+ return;
+
if (!have_cpuid_p())
return;
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index 2bf616505499..e2b22df964cd 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -1,23 +1,25 @@
#!/bin/sh
#
-# Generate the x86_cap_flags[] array from include/asm/cpufeature.h
+# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeature.h
#
IN=$1
OUT=$2
-TABS="$(printf '\t\t\t\t\t')"
-trap 'rm "$OUT"' EXIT
+function dump_array()
+{
+ ARRAY=$1
+ SIZE=$2
+ PFX=$3
+ POSTFIX=$4
-(
- echo "#ifndef _ASM_X86_CPUFEATURE_H"
- echo "#include <asm/cpufeature.h>"
- echo "#endif"
- echo ""
- echo "const char * const x86_cap_flags[NCAPINTS*32] = {"
+ PFX_SZ=$(echo $PFX | wc -c)
+ TABS="$(printf '\t\t\t\t\t')"
+
+ echo "const char * const $ARRAY[$SIZE] = {"
- # Iterate through any input lines starting with #define X86_FEATURE_
- sed -n -e 's/\t/ /g' -e 's/^ *# *define *X86_FEATURE_//p' $IN |
+ # Iterate through any input lines starting with #define $PFX
+ sed -n -e 's/\t/ /g' -e "s/^ *# *define *$PFX//p" $IN |
while read i
do
# Name is everything up to the first whitespace
@@ -31,11 +33,32 @@ trap 'rm "$OUT"' EXIT
# Name is uppercase, VALUE is all lowercase
VALUE="$(echo "$VALUE" | tr A-Z a-z)"
- TABCOUNT=$(( ( 5*8 - 14 - $(echo "$NAME" | wc -c) ) / 8 ))
- printf "\t[%s]%.*s = %s,\n" \
- "X86_FEATURE_$NAME" "$TABCOUNT" "$TABS" "$VALUE"
+ if [ -n "$POSTFIX" ]; then
+ T=$(( $PFX_SZ + $(echo $POSTFIX | wc -c) + 2 ))
+ TABS="$(printf '\t\t\t\t\t\t')"
+ TABCOUNT=$(( ( 6*8 - ($T + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
+ printf "\t[%s - %s]%.*s = %s,\n" "$PFX$NAME" "$POSTFIX" "$TABCOUNT" "$TABS" "$VALUE"
+ else
+ TABCOUNT=$(( ( 5*8 - ($PFX_SZ + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
+ printf "\t[%s]%.*s = %s,\n" "$PFX$NAME" "$TABCOUNT" "$TABS" "$VALUE"
+ fi
done
echo "};"
+}
+
+trap 'rm "$OUT"' EXIT
+
+(
+ echo "#ifndef _ASM_X86_CPUFEATURE_H"
+ echo "#include <asm/cpufeature.h>"
+ echo "#endif"
+ echo ""
+
+ dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" ""
+ echo ""
+
+ dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32"
+
) > $OUT
trap - EXIT
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 76f98fe5b35c..a450373e8e91 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -132,15 +132,6 @@ static void __init ms_hyperv_init_platform(void)
lapic_timer_frequency = hv_lapic_frequency;
printk(KERN_INFO "HyperV: LAPIC Timer Frequency: %#x\n",
lapic_timer_frequency);
-
- /*
- * On Hyper-V, when we are booting off an EFI firmware stack,
- * we do not have many legacy devices including PIC, PIT etc.
- */
- if (efi_enabled(EFI_BOOT)) {
- printk(KERN_INFO "HyperV: Using null_legacy_pic\n");
- legacy_pic = &null_legacy_pic;
- }
}
#endif
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index ae407f7226c8..2879ecdaac43 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -118,6 +118,9 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
continue;
if (event->attr.config1 & ~er->valid_mask)
return -EINVAL;
+ /* Check if the extra msrs can be safely accessed*/
+ if (!er->extra_msr_access)
+ return -ENXIO;
reg->idx = er->idx;
reg->config = event->attr.config1;
@@ -303,15 +306,6 @@ int x86_setup_perfctr(struct perf_event *event)
hwc->sample_period = x86_pmu.max_period;
hwc->last_period = hwc->sample_period;
local64_set(&hwc->period_left, hwc->sample_period);
- } else {
- /*
- * If we have a PMU initialized but no APIC
- * interrupts, we cannot sample hardware
- * events (user-space has to fall back and
- * sample via a hrtimer based software event):
- */
- if (!x86_pmu.apic)
- return -EOPNOTSUPP;
}
if (attr->type == PERF_TYPE_RAW)
@@ -721,6 +715,7 @@ int perf_assign_events(struct perf_event **events, int n,
return sched.state.unassigned;
}
+EXPORT_SYMBOL_GPL(perf_assign_events);
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
@@ -1292,7 +1287,7 @@ void perf_events_lapic_init(void)
apic_write(APIC_LVTPC, APIC_DM_NMI);
}
-static int __kprobes
+static int
perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
u64 start_clock;
@@ -1310,6 +1305,7 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
return ret;
}
+NOKPROBE_SYMBOL(perf_event_nmi_handler);
struct event_constraint emptyconstraint;
struct event_constraint unconstrained;
@@ -1365,6 +1361,15 @@ static void __init pmu_check_apic(void)
x86_pmu.apic = 0;
pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
pr_info("no hardware sampling interrupt available.\n");
+
+ /*
+ * If we have a PMU initialized but no APIC
+ * interrupts, we cannot sample hardware
+ * events (user-space has to fall back and
+ * sample via a hrtimer based software event):
+ */
+ pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
+
}
static struct attribute_group x86_pmu_format_group = {
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 3b2f9bdd974b..8ade93111e03 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -295,14 +295,16 @@ struct extra_reg {
u64 config_mask;
u64 valid_mask;
int idx; /* per_xxx->regs[] reg index */
+ bool extra_msr_access;
};
#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \
- .event = (e), \
- .msr = (ms), \
- .config_mask = (m), \
- .valid_mask = (vm), \
- .idx = EXTRA_REG_##i, \
+ .event = (e), \
+ .msr = (ms), \
+ .config_mask = (m), \
+ .valid_mask = (vm), \
+ .idx = EXTRA_REG_##i, \
+ .extra_msr_access = true, \
}
#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index 4c36bbe3173a..cbb1be3ed9e4 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -593,7 +593,7 @@ out:
return 1;
}
-static int __kprobes
+static int
perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
int handled = 0;
@@ -606,6 +606,7 @@ perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
return handled;
}
+NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
{
diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
index 3bbdf4cd38b9..30790d798e6b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
@@ -294,31 +294,41 @@ static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
cpu_to_node(cpu));
}
-static void amd_uncore_cpu_up_prepare(unsigned int cpu)
+static int amd_uncore_cpu_up_prepare(unsigned int cpu)
{
- struct amd_uncore *uncore;
+ struct amd_uncore *uncore_nb = NULL, *uncore_l2;
if (amd_uncore_nb) {
- uncore = amd_uncore_alloc(cpu);
- uncore->cpu = cpu;
- uncore->num_counters = NUM_COUNTERS_NB;
- uncore->rdpmc_base = RDPMC_BASE_NB;
- uncore->msr_base = MSR_F15H_NB_PERF_CTL;
- uncore->active_mask = &amd_nb_active_mask;
- uncore->pmu = &amd_nb_pmu;
- *per_cpu_ptr(amd_uncore_nb, cpu) = uncore;
+ uncore_nb = amd_uncore_alloc(cpu);
+ if (!uncore_nb)
+ goto fail;
+ uncore_nb->cpu = cpu;
+ uncore_nb->num_counters = NUM_COUNTERS_NB;
+ uncore_nb->rdpmc_base = RDPMC_BASE_NB;
+ uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL;
+ uncore_nb->active_mask = &amd_nb_active_mask;
+ uncore_nb->pmu = &amd_nb_pmu;
+ *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb;
}
if (amd_uncore_l2) {
- uncore = amd_uncore_alloc(cpu);
- uncore->cpu = cpu;
- uncore->num_counters = NUM_COUNTERS_L2;
- uncore->rdpmc_base = RDPMC_BASE_L2;
- uncore->msr_base = MSR_F16H_L2I_PERF_CTL;
- uncore->active_mask = &amd_l2_active_mask;
- uncore->pmu = &amd_l2_pmu;
- *per_cpu_ptr(amd_uncore_l2, cpu) = uncore;
+ uncore_l2 = amd_uncore_alloc(cpu);
+ if (!uncore_l2)
+ goto fail;
+ uncore_l2->cpu = cpu;
+ uncore_l2->num_counters = NUM_COUNTERS_L2;
+ uncore_l2->rdpmc_base = RDPMC_BASE_L2;
+ uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL;
+ uncore_l2->active_mask = &amd_l2_active_mask;
+ uncore_l2->pmu = &amd_l2_pmu;
+ *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2;
}
+
+ return 0;
+
+fail:
+ kfree(uncore_nb);
+ return -ENOMEM;
}
static struct amd_uncore *
@@ -441,7 +451,7 @@ static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores)
if (!--uncore->refcnt)
kfree(uncore);
- *per_cpu_ptr(amd_uncore_nb, cpu) = NULL;
+ *per_cpu_ptr(uncores, cpu) = NULL;
}
static void amd_uncore_cpu_dead(unsigned int cpu)
@@ -461,7 +471,8 @@ amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action,
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- amd_uncore_cpu_up_prepare(cpu);
+ if (amd_uncore_cpu_up_prepare(cpu))
+ return notifier_from_errno(-ENOMEM);
break;
case CPU_STARTING:
@@ -501,20 +512,33 @@ static void __init init_cpu_already_online(void *dummy)
amd_uncore_cpu_online(cpu);
}
+static void cleanup_cpu_online(void *dummy)
+{
+ unsigned int cpu = smp_processor_id();
+
+ amd_uncore_cpu_dead(cpu);
+}
+
static int __init amd_uncore_init(void)
{
- unsigned int cpu;
+ unsigned int cpu, cpu2;
int ret = -ENODEV;
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
- return -ENODEV;
+ goto fail_nodev;
if (!cpu_has_topoext)
- return -ENODEV;
+ goto fail_nodev;
if (cpu_has_perfctr_nb) {
amd_uncore_nb = alloc_percpu(struct amd_uncore *);
- perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1);
+ if (!amd_uncore_nb) {
+ ret = -ENOMEM;
+ goto fail_nb;
+ }
+ ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1);
+ if (ret)
+ goto fail_nb;
printk(KERN_INFO "perf: AMD NB counters detected\n");
ret = 0;
@@ -522,20 +546,28 @@ static int __init amd_uncore_init(void)
if (cpu_has_perfctr_l2) {
amd_uncore_l2 = alloc_percpu(struct amd_uncore *);
- perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1);
+ if (!amd_uncore_l2) {
+ ret = -ENOMEM;
+ goto fail_l2;
+ }
+ ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1);
+ if (ret)
+ goto fail_l2;
printk(KERN_INFO "perf: AMD L2I counters detected\n");
ret = 0;
}
if (ret)
- return -ENODEV;
+ goto fail_nodev;
cpu_notifier_register_begin();
/* init cpus already online before registering for hotplug notifier */
for_each_online_cpu(cpu) {
- amd_uncore_cpu_up_prepare(cpu);
+ ret = amd_uncore_cpu_up_prepare(cpu);
+ if (ret)
+ goto fail_online;
smp_call_function_single(cpu, init_cpu_already_online, NULL, 1);
}
@@ -543,5 +575,30 @@ static int __init amd_uncore_init(void)
cpu_notifier_register_done();
return 0;
+
+
+fail_online:
+ for_each_online_cpu(cpu2) {
+ if (cpu2 == cpu)
+ break;
+ smp_call_function_single(cpu, cleanup_cpu_online, NULL, 1);
+ }
+ cpu_notifier_register_done();
+
+ /* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */
+ amd_uncore_nb = amd_uncore_l2 = NULL;
+ if (cpu_has_perfctr_l2)
+ perf_pmu_unregister(&amd_l2_pmu);
+fail_l2:
+ if (cpu_has_perfctr_nb)
+ perf_pmu_unregister(&amd_nb_pmu);
+ if (amd_uncore_l2)
+ free_percpu(amd_uncore_l2);
+fail_nb:
+ if (amd_uncore_nb)
+ free_percpu(amd_uncore_nb);
+
+fail_nodev:
+ return ret;
}
device_initcall(amd_uncore_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index adb02aa62af5..2502d0d9d246 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1382,6 +1382,15 @@ again:
intel_pmu_lbr_read();
/*
+ * CondChgd bit 63 doesn't mean any overflow status. Ignore
+ * and clear the bit.
+ */
+ if (__test_and_clear_bit(63, (unsigned long *)&status)) {
+ if (!status)
+ goto done;
+ }
+
+ /*
* PEBS overflow sets bit 62 in the global status register
*/
if (__test_and_clear_bit(62, (unsigned long *)&status)) {
@@ -2173,6 +2182,41 @@ static void intel_snb_check_microcode(void)
}
}
+/*
+ * Under certain circumstances, access certain MSR may cause #GP.
+ * The function tests if the input MSR can be safely accessed.
+ */
+static bool check_msr(unsigned long msr, u64 mask)
+{
+ u64 val_old, val_new, val_tmp;
+
+ /*
+ * Read the current value, change it and read it back to see if it
+ * matches, this is needed to detect certain hardware emulators
+ * (qemu/kvm) that don't trap on the MSR access and always return 0s.
+ */
+ if (rdmsrl_safe(msr, &val_old))
+ return false;
+
+ /*
+ * Only change the bits which can be updated by wrmsrl.
+ */
+ val_tmp = val_old ^ mask;
+ if (wrmsrl_safe(msr, val_tmp) ||
+ rdmsrl_safe(msr, &val_new))
+ return false;
+
+ if (val_new != val_tmp)
+ return false;
+
+ /* Here it's sure that the MSR can be safely accessed.
+ * Restore the old value and return.
+ */
+ wrmsrl(msr, val_old);
+
+ return true;
+}
+
static __init void intel_sandybridge_quirk(void)
{
x86_pmu.check_microcode = intel_snb_check_microcode;
@@ -2262,7 +2306,8 @@ __init int intel_pmu_init(void)
union cpuid10_ebx ebx;
struct event_constraint *c;
unsigned int unused;
- int version;
+ struct extra_reg *er;
+ int version, i;
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
switch (boot_cpu_data.x86) {
@@ -2465,6 +2510,9 @@ __init int intel_pmu_init(void)
case 62: /* IvyBridge EP */
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ /* dTLB-load-misses on IVB is different than SNB */
+ hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */
+
memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
sizeof(hw_cache_extra_regs));
@@ -2565,6 +2613,34 @@ __init int intel_pmu_init(void)
}
}
+ /*
+ * Access LBR MSR may cause #GP under certain circumstances.
+ * E.g. KVM doesn't support LBR MSR
+ * Check all LBT MSR here.
+ * Disable LBR access if any LBR MSRs can not be accessed.
+ */
+ if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL))
+ x86_pmu.lbr_nr = 0;
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
+ check_msr(x86_pmu.lbr_to + i, 0xffffUL)))
+ x86_pmu.lbr_nr = 0;
+ }
+
+ /*
+ * Access extra MSR may cause #GP under certain circumstances.
+ * E.g. KVM doesn't support offcore event
+ * Check all extra_regs here.
+ */
+ if (x86_pmu.extra_regs) {
+ for (er = x86_pmu.extra_regs; er->msr; er++) {
+ er->extra_msr_access = check_msr(er->msr, 0x1ffUL);
+ /* Disable LBR select mapping */
+ if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
+ x86_pmu.lbr_sel_map = NULL;
+ }
+ }
+
/* Support full width counters using alternative MSR range */
if (x86_pmu.intel_cap.full_width_write) {
x86_pmu.max_period = x86_pmu.cntval_mask;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index ae96cfa5eddd..696ade311ded 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -108,15 +108,31 @@ static u64 precise_store_data(u64 status)
return val;
}
-static u64 precise_store_data_hsw(u64 status)
+static u64 precise_store_data_hsw(struct perf_event *event, u64 status)
{
union perf_mem_data_src dse;
+ u64 cfg = event->hw.config & INTEL_ARCH_EVENT_MASK;
dse.val = 0;
dse.mem_op = PERF_MEM_OP_STORE;
dse.mem_lvl = PERF_MEM_LVL_NA;
+
+ /*
+ * L1 info only valid for following events:
+ *
+ * MEM_UOPS_RETIRED.STLB_MISS_STORES
+ * MEM_UOPS_RETIRED.LOCK_STORES
+ * MEM_UOPS_RETIRED.SPLIT_STORES
+ * MEM_UOPS_RETIRED.ALL_STORES
+ */
+ if (cfg != 0x12d0 && cfg != 0x22d0 && cfg != 0x42d0 && cfg != 0x82d0)
+ return dse.mem_lvl;
+
if (status & 1)
- dse.mem_lvl = PERF_MEM_LVL_L1;
+ dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
+ else
+ dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
+
/* Nothing else supported. Sorry. */
return dse.val;
}
@@ -295,9 +311,11 @@ static int alloc_bts_buffer(int cpu)
if (!x86_pmu.bts)
return 0;
- buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL, node);
- if (unlikely(!buffer))
+ buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+ if (unlikely(!buffer)) {
+ WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
return -ENOMEM;
+ }
max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
thresh = max / 16;
@@ -887,7 +905,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
data.data_src.val = load_latency_data(pebs->dse);
else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
data.data_src.val =
- precise_store_data_hsw(pebs->dse);
+ precise_store_data_hsw(event, pebs->dse);
else
data.data_src.val = precise_store_data(pebs->dse);
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index d82d155aca8c..9dd2459a4c73 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -384,6 +384,9 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
if (br_type & PERF_SAMPLE_BRANCH_NO_TX)
mask |= X86_BR_NO_TX;
+ if (br_type & PERF_SAMPLE_BRANCH_COND)
+ mask |= X86_BR_JCC;
+
/*
* stash actual user request into reg, it may
* be used by fixup code for some CPU
@@ -678,6 +681,7 @@ static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
* NHM/WSM erratum: must include IND_JMP to capture IND_CALL
*/
[PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
+ [PERF_SAMPLE_BRANCH_COND] = LBR_JCC,
};
static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
@@ -689,6 +693,7 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
[PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL
| LBR_FAR,
[PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL,
+ [PERF_SAMPLE_BRANCH_COND] = LBR_JCC,
};
/* core */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 65bbbea38b9c..cfc6f9dfcd90 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -550,16 +550,16 @@ static struct extra_reg snbep_uncore_cbox_extra_regs[] = {
SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),
SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),
SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),
SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2),
@@ -1222,6 +1222,7 @@ static struct extra_reg ivt_uncore_cbox_extra_regs[] = {
SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
+
SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),
@@ -1245,7 +1246,7 @@ static struct extra_reg ivt_uncore_cbox_extra_regs[] = {
SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
@@ -2946,10 +2947,7 @@ again:
* extra registers. If we failed to take an extra
* register, try the alternative.
*/
- if (idx % 2)
- idx--;
- else
- idx++;
+ idx ^= 1;
if (idx != reg1->idx % 6) {
if (idx == 2)
config1 >>= 8;
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 06fe3ed8b851..5433658e598d 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -97,6 +97,14 @@ static int show_cpuinfo(struct seq_file *m, void *v)
if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
seq_printf(m, " %s", x86_cap_flags[i]);
+ seq_printf(m, "\nbugs\t\t:");
+ for (i = 0; i < 32*NBUGINTS; i++) {
+ unsigned int bug_bit = 32*NCAPINTS + i;
+
+ if (cpu_has_bug(c, bug_bit) && x86_bug_flags[i])
+ seq_printf(m, " %s", x86_bug_flags[i]);
+ }
+
seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
c->loops_per_jiffy/(500000/HZ),
(c->loops_per_jiffy/(5000/HZ)) % 100);
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 507de8066594..0553a34fa0df 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -4,9 +4,14 @@
* Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
*
* Copyright (C) IBM Corporation, 2004. All rights reserved.
+ * Copyright (C) Red Hat Inc., 2014. All rights reserved.
+ * Authors:
+ * Vivek Goyal <vgoyal@redhat.com>
*
*/
+#define pr_fmt(fmt) "kexec: " fmt
+
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/smp.h>
@@ -16,6 +21,7 @@
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <asm/processor.h>
#include <asm/hardirq.h>
@@ -28,6 +34,45 @@
#include <asm/reboot.h>
#include <asm/virtext.h>
+/* Alignment required for elf header segment */
+#define ELF_CORE_HEADER_ALIGN 4096
+
+/* This primarily represents number of split ranges due to exclusion */
+#define CRASH_MAX_RANGES 16
+
+struct crash_mem_range {
+ u64 start, end;
+};
+
+struct crash_mem {
+ unsigned int nr_ranges;
+ struct crash_mem_range ranges[CRASH_MAX_RANGES];
+};
+
+/* Misc data about ram ranges needed to prepare elf headers */
+struct crash_elf_data {
+ struct kimage *image;
+ /*
+ * Total number of ram ranges we have after various adjustments for
+ * GART, crash reserved region etc.
+ */
+ unsigned int max_nr_ranges;
+ unsigned long gart_start, gart_end;
+
+ /* Pointer to elf header */
+ void *ehdr;
+ /* Pointer to next phdr */
+ void *bufp;
+ struct crash_mem mem;
+};
+
+/* Used while preparing memory map entries for second kernel */
+struct crash_memmap_data {
+ struct boot_params *params;
+ /* Type of memory */
+ unsigned int type;
+};
+
int in_crash_kexec;
/*
@@ -39,6 +84,7 @@ int in_crash_kexec;
*/
crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
+unsigned long crash_zero_bytes;
static inline void cpu_crash_vmclear_loaded_vmcss(void)
{
@@ -135,3 +181,520 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
#endif
crash_save_cpu(regs, safe_smp_processor_id());
}
+
+#ifdef CONFIG_X86_64
+
+static int get_nr_ram_ranges_callback(unsigned long start_pfn,
+ unsigned long nr_pfn, void *arg)
+{
+ int *nr_ranges = arg;
+
+ (*nr_ranges)++;
+ return 0;
+}
+
+static int get_gart_ranges_callback(u64 start, u64 end, void *arg)
+{
+ struct crash_elf_data *ced = arg;
+
+ ced->gart_start = start;
+ ced->gart_end = end;
+
+ /* Not expecting more than 1 gart aperture */
+ return 1;
+}
+
+
+/* Gather all the required information to prepare elf headers for ram regions */
+static void fill_up_crash_elf_data(struct crash_elf_data *ced,
+ struct kimage *image)
+{
+ unsigned int nr_ranges = 0;
+
+ ced->image = image;
+
+ walk_system_ram_range(0, -1, &nr_ranges,
+ get_nr_ram_ranges_callback);
+
+ ced->max_nr_ranges = nr_ranges;
+
+ /*
+ * We don't create ELF headers for GART aperture as an attempt
+ * to dump this memory in second kernel leads to hang/crash.
+ * If gart aperture is present, one needs to exclude that region
+ * and that could lead to need of extra phdr.
+ */
+ walk_iomem_res("GART", IORESOURCE_MEM, 0, -1,
+ ced, get_gart_ranges_callback);
+
+ /*
+ * If we have gart region, excluding that could potentially split
+ * a memory range, resulting in extra header. Account for that.
+ */
+ if (ced->gart_end)
+ ced->max_nr_ranges++;
+
+ /* Exclusion of crash region could split memory ranges */
+ ced->max_nr_ranges++;
+
+ /* If crashk_low_res is not 0, another range split possible */
+ if (crashk_low_res.end != 0)
+ ced->max_nr_ranges++;
+}
+
+static int exclude_mem_range(struct crash_mem *mem,
+ unsigned long long mstart, unsigned long long mend)
+{
+ int i, j;
+ unsigned long long start, end;
+ struct crash_mem_range temp_range = {0, 0};
+
+ for (i = 0; i < mem->nr_ranges; i++) {
+ start = mem->ranges[i].start;
+ end = mem->ranges[i].end;
+
+ if (mstart > end || mend < start)
+ continue;
+
+ /* Truncate any area outside of range */
+ if (mstart < start)
+ mstart = start;
+ if (mend > end)
+ mend = end;
+
+ /* Found completely overlapping range */
+ if (mstart == start && mend == end) {
+ mem->ranges[i].start = 0;
+ mem->ranges[i].end = 0;
+ if (i < mem->nr_ranges - 1) {
+ /* Shift rest of the ranges to left */
+ for (j = i; j < mem->nr_ranges - 1; j++) {
+ mem->ranges[j].start =
+ mem->ranges[j+1].start;
+ mem->ranges[j].end =
+ mem->ranges[j+1].end;
+ }
+ }
+ mem->nr_ranges--;
+ return 0;
+ }
+
+ if (mstart > start && mend < end) {
+ /* Split original range */
+ mem->ranges[i].end = mstart - 1;
+ temp_range.start = mend + 1;
+ temp_range.end = end;
+ } else if (mstart != start)
+ mem->ranges[i].end = mstart - 1;
+ else
+ mem->ranges[i].start = mend + 1;
+ break;
+ }
+
+ /* If a split happend, add the split to array */
+ if (!temp_range.end)
+ return 0;
+
+ /* Split happened */
+ if (i == CRASH_MAX_RANGES - 1) {
+ pr_err("Too many crash ranges after split\n");
+ return -ENOMEM;
+ }
+
+ /* Location where new range should go */
+ j = i + 1;
+ if (j < mem->nr_ranges) {
+ /* Move over all ranges one slot towards the end */
+ for (i = mem->nr_ranges - 1; i >= j; i--)
+ mem->ranges[i + 1] = mem->ranges[i];
+ }
+
+ mem->ranges[j].start = temp_range.start;
+ mem->ranges[j].end = temp_range.end;
+ mem->nr_ranges++;
+ return 0;
+}
+
+/*
+ * Look for any unwanted ranges between mstart, mend and remove them. This
+ * might lead to split and split ranges are put in ced->mem.ranges[] array
+ */
+static int elf_header_exclude_ranges(struct crash_elf_data *ced,
+ unsigned long long mstart, unsigned long long mend)
+{
+ struct crash_mem *cmem = &ced->mem;
+ int ret = 0;
+
+ memset(cmem->ranges, 0, sizeof(cmem->ranges));
+
+ cmem->ranges[0].start = mstart;
+ cmem->ranges[0].end = mend;
+ cmem->nr_ranges = 1;
+
+ /* Exclude crashkernel region */
+ ret = exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
+ if (ret)
+ return ret;
+
+ ret = exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end);
+ if (ret)
+ return ret;
+
+ /* Exclude GART region */
+ if (ced->gart_end) {
+ ret = exclude_mem_range(cmem, ced->gart_start, ced->gart_end);
+ if (ret)
+ return ret;
+ }
+
+ return ret;
+}
+
+static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg)
+{
+ struct crash_elf_data *ced = arg;
+ Elf64_Ehdr *ehdr;
+ Elf64_Phdr *phdr;
+ unsigned long mstart, mend;
+ struct kimage *image = ced->image;
+ struct crash_mem *cmem;
+ int ret, i;
+
+ ehdr = ced->ehdr;
+
+ /* Exclude unwanted mem ranges */
+ ret = elf_header_exclude_ranges(ced, start, end);
+ if (ret)
+ return ret;
+
+ /* Go through all the ranges in ced->mem.ranges[] and prepare phdr */
+ cmem = &ced->mem;
+
+ for (i = 0; i < cmem->nr_ranges; i++) {
+ mstart = cmem->ranges[i].start;
+ mend = cmem->ranges[i].end;
+
+ phdr = ced->bufp;
+ ced->bufp += sizeof(Elf64_Phdr);
+
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_offset = mstart;
+
+ /*
+ * If a range matches backup region, adjust offset to backup
+ * segment.
+ */
+ if (mstart == image->arch.backup_src_start &&
+ (mend - mstart + 1) == image->arch.backup_src_sz)
+ phdr->p_offset = image->arch.backup_load_addr;
+
+ phdr->p_paddr = mstart;
+ phdr->p_vaddr = (unsigned long long) __va(mstart);
+ phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
+ phdr->p_align = 0;
+ ehdr->e_phnum++;
+ pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+ phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
+ ehdr->e_phnum, phdr->p_offset);
+ }
+
+ return ret;
+}
+
+static int prepare_elf64_headers(struct crash_elf_data *ced,
+ void **addr, unsigned long *sz)
+{
+ Elf64_Ehdr *ehdr;
+ Elf64_Phdr *phdr;
+ unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
+ unsigned char *buf, *bufp;
+ unsigned int cpu;
+ unsigned long long notes_addr;
+ int ret;
+
+ /* extra phdr for vmcoreinfo elf note */
+ nr_phdr = nr_cpus + 1;
+ nr_phdr += ced->max_nr_ranges;
+
+ /*
+ * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
+ * area on x86_64 (ffffffff80000000 - ffffffffa0000000).
+ * I think this is required by tools like gdb. So same physical
+ * memory will be mapped in two elf headers. One will contain kernel
+ * text virtual addresses and other will have __va(physical) addresses.
+ */
+
+ nr_phdr++;
+ elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
+ elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);
+
+ buf = vzalloc(elf_sz);
+ if (!buf)
+ return -ENOMEM;
+
+ bufp = buf;
+ ehdr = (Elf64_Ehdr *)bufp;
+ bufp += sizeof(Elf64_Ehdr);
+ memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
+ ehdr->e_ident[EI_CLASS] = ELFCLASS64;
+ ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
+ ehdr->e_ident[EI_VERSION] = EV_CURRENT;
+ ehdr->e_ident[EI_OSABI] = ELF_OSABI;
+ memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
+ ehdr->e_type = ET_CORE;
+ ehdr->e_machine = ELF_ARCH;
+ ehdr->e_version = EV_CURRENT;
+ ehdr->e_phoff = sizeof(Elf64_Ehdr);
+ ehdr->e_ehsize = sizeof(Elf64_Ehdr);
+ ehdr->e_phentsize = sizeof(Elf64_Phdr);
+
+ /* Prepare one phdr of type PT_NOTE for each present cpu */
+ for_each_present_cpu(cpu) {
+ phdr = (Elf64_Phdr *)bufp;
+ bufp += sizeof(Elf64_Phdr);
+ phdr->p_type = PT_NOTE;
+ notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
+ phdr->p_offset = phdr->p_paddr = notes_addr;
+ phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
+ (ehdr->e_phnum)++;
+ }
+
+ /* Prepare one PT_NOTE header for vmcoreinfo */
+ phdr = (Elf64_Phdr *)bufp;
+ bufp += sizeof(Elf64_Phdr);
+ phdr->p_type = PT_NOTE;
+ phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
+ phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note);
+ (ehdr->e_phnum)++;
+
+#ifdef CONFIG_X86_64
+ /* Prepare PT_LOAD type program header for kernel text region */
+ phdr = (Elf64_Phdr *)bufp;
+ bufp += sizeof(Elf64_Phdr);
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_vaddr = (Elf64_Addr)_text;
+ phdr->p_filesz = phdr->p_memsz = _end - _text;
+ phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
+ (ehdr->e_phnum)++;
+#endif
+
+ /* Prepare PT_LOAD headers for system ram chunks. */
+ ced->ehdr = ehdr;
+ ced->bufp = bufp;
+ ret = walk_system_ram_res(0, -1, ced,
+ prepare_elf64_ram_headers_callback);
+ if (ret < 0)
+ return ret;
+
+ *addr = buf;
+ *sz = elf_sz;
+ return 0;
+}
+
+/* Prepare elf headers. Return addr and size */
+static int prepare_elf_headers(struct kimage *image, void **addr,
+ unsigned long *sz)
+{
+ struct crash_elf_data *ced;
+ int ret;
+
+ ced = kzalloc(sizeof(*ced), GFP_KERNEL);
+ if (!ced)
+ return -ENOMEM;
+
+ fill_up_crash_elf_data(ced, image);
+
+ /* By default prepare 64bit headers */
+ ret = prepare_elf64_headers(ced, addr, sz);
+ kfree(ced);
+ return ret;
+}
+
+static int add_e820_entry(struct boot_params *params, struct e820entry *entry)
+{
+ unsigned int nr_e820_entries;
+
+ nr_e820_entries = params->e820_entries;
+ if (nr_e820_entries >= E820MAX)
+ return 1;
+
+ memcpy(&params->e820_map[nr_e820_entries], entry,
+ sizeof(struct e820entry));
+ params->e820_entries++;
+ return 0;
+}
+
+static int memmap_entry_callback(u64 start, u64 end, void *arg)
+{
+ struct crash_memmap_data *cmd = arg;
+ struct boot_params *params = cmd->params;
+ struct e820entry ei;
+
+ ei.addr = start;
+ ei.size = end - start + 1;
+ ei.type = cmd->type;
+ add_e820_entry(params, &ei);
+
+ return 0;
+}
+
+static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
+ unsigned long long mstart,
+ unsigned long long mend)
+{
+ unsigned long start, end;
+ int ret = 0;
+
+ cmem->ranges[0].start = mstart;
+ cmem->ranges[0].end = mend;
+ cmem->nr_ranges = 1;
+
+ /* Exclude Backup region */
+ start = image->arch.backup_load_addr;
+ end = start + image->arch.backup_src_sz - 1;
+ ret = exclude_mem_range(cmem, start, end);
+ if (ret)
+ return ret;
+
+ /* Exclude elf header region */
+ start = image->arch.elf_load_addr;
+ end = start + image->arch.elf_headers_sz - 1;
+ return exclude_mem_range(cmem, start, end);
+}
+
+/* Prepare memory map for crash dump kernel */
+int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
+{
+ int i, ret = 0;
+ unsigned long flags;
+ struct e820entry ei;
+ struct crash_memmap_data cmd;
+ struct crash_mem *cmem;
+
+ cmem = vzalloc(sizeof(struct crash_mem));
+ if (!cmem)
+ return -ENOMEM;
+
+ memset(&cmd, 0, sizeof(struct crash_memmap_data));
+ cmd.params = params;
+
+ /* Add first 640K segment */
+ ei.addr = image->arch.backup_src_start;
+ ei.size = image->arch.backup_src_sz;
+ ei.type = E820_RAM;
+ add_e820_entry(params, &ei);
+
+ /* Add ACPI tables */
+ cmd.type = E820_ACPI;
+ flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ walk_iomem_res("ACPI Tables", flags, 0, -1, &cmd,
+ memmap_entry_callback);
+
+ /* Add ACPI Non-volatile Storage */
+ cmd.type = E820_NVS;
+ walk_iomem_res("ACPI Non-volatile Storage", flags, 0, -1, &cmd,
+ memmap_entry_callback);
+
+ /* Add crashk_low_res region */
+ if (crashk_low_res.end) {
+ ei.addr = crashk_low_res.start;
+ ei.size = crashk_low_res.end - crashk_low_res.start + 1;
+ ei.type = E820_RAM;
+ add_e820_entry(params, &ei);
+ }
+
+ /* Exclude some ranges from crashk_res and add rest to memmap */
+ ret = memmap_exclude_ranges(image, cmem, crashk_res.start,
+ crashk_res.end);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < cmem->nr_ranges; i++) {
+ ei.size = cmem->ranges[i].end - cmem->ranges[i].start + 1;
+
+ /* If entry is less than a page, skip it */
+ if (ei.size < PAGE_SIZE)
+ continue;
+ ei.addr = cmem->ranges[i].start;
+ ei.type = E820_RAM;
+ add_e820_entry(params, &ei);
+ }
+
+out:
+ vfree(cmem);
+ return ret;
+}
+
+static int determine_backup_region(u64 start, u64 end, void *arg)
+{
+ struct kimage *image = arg;
+
+ image->arch.backup_src_start = start;
+ image->arch.backup_src_sz = end - start + 1;
+
+ /* Expecting only one range for backup region */
+ return 1;
+}
+
+int crash_load_segments(struct kimage *image)
+{
+ unsigned long src_start, src_sz, elf_sz;
+ void *elf_addr;
+ int ret;
+
+ /*
+ * Determine and load a segment for backup area. First 640K RAM
+ * region is backup source
+ */
+
+ ret = walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END,
+ image, determine_backup_region);
+
+ /* Zero or postive return values are ok */
+ if (ret < 0)
+ return ret;
+
+ src_start = image->arch.backup_src_start;
+ src_sz = image->arch.backup_src_sz;
+
+ /* Add backup segment. */
+ if (src_sz) {
+ /*
+ * Ideally there is no source for backup segment. This is
+ * copied in purgatory after crash. Just add a zero filled
+ * segment for now to make sure checksum logic works fine.
+ */
+ ret = kexec_add_buffer(image, (char *)&crash_zero_bytes,
+ sizeof(crash_zero_bytes), src_sz,
+ PAGE_SIZE, 0, -1, 0,
+ &image->arch.backup_load_addr);
+ if (ret)
+ return ret;
+ pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n",
+ image->arch.backup_load_addr, src_start, src_sz);
+ }
+
+ /* Prepare elf headers and add a segment */
+ ret = prepare_elf_headers(image, &elf_addr, &elf_sz);
+ if (ret)
+ return ret;
+
+ image->arch.elf_headers = elf_addr;
+ image->arch.elf_headers_sz = elf_sz;
+
+ ret = kexec_add_buffer(image, (char *)elf_addr, elf_sz, elf_sz,
+ ELF_CORE_HEADER_ALIGN, 0, -1, 0,
+ &image->arch.elf_load_addr);
+ if (ret) {
+ vfree((void *)image->arch.elf_headers);
+ return ret;
+ }
+ pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ image->arch.elf_load_addr, elf_sz, elf_sz);
+
+ return ret;
+}
+
+#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index d35078ea1446..7db54b5d5f86 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -206,23 +206,21 @@ static void __init dtb_apic_setup(void)
static void __init x86_flattree_get_config(void)
{
u32 size, map_len;
- struct boot_param_header *dt;
+ void *dt;
if (!initial_dtb)
return;
- map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK),
- (u64)sizeof(struct boot_param_header));
+ map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128);
- dt = early_memremap(initial_dtb, map_len);
- size = be32_to_cpu(dt->totalsize);
+ initial_boot_params = dt = early_memremap(initial_dtb, map_len);
+ size = of_get_flat_dt_size();
if (map_len < size) {
early_iounmap(dt, map_len);
- dt = early_memremap(initial_dtb, size);
+ initial_boot_params = dt = early_memremap(initial_dtb, size);
map_len = size;
}
- initial_boot_params = dt;
unflatten_and_copy_device_tree();
early_iounmap(dt, map_len);
}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index d9c12d3022a7..b74ebc7c4402 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -200,7 +200,7 @@ static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
static int die_owner = -1;
static unsigned int die_nest_count;
-unsigned __kprobes long oops_begin(void)
+unsigned long oops_begin(void)
{
int cpu;
unsigned long flags;
@@ -223,8 +223,9 @@ unsigned __kprobes long oops_begin(void)
return flags;
}
EXPORT_SYMBOL_GPL(oops_begin);
+NOKPROBE_SYMBOL(oops_begin);
-void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
{
if (regs && kexec_should_crash(current))
crash_kexec(regs);
@@ -247,8 +248,9 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
panic("Fatal exception");
do_exit(signr);
}
+NOKPROBE_SYMBOL(oops_end);
-int __kprobes __die(const char *str, struct pt_regs *regs, long err)
+int __die(const char *str, struct pt_regs *regs, long err)
{
#ifdef CONFIG_X86_32
unsigned short ss;
@@ -291,6 +293,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
#endif
return 0;
}
+NOKPROBE_SYMBOL(__die);
/*
* This is gone through when something in the kernel has done something bad
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 6cda0baeac9d..2e1a6853e00c 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -419,7 +419,7 @@ static size_t __init gen6_stolen_size(int num, int slot, int func)
return gmch_ctrl << 25; /* 32 MB units */
}
-static size_t gen8_stolen_size(int num, int slot, int func)
+static size_t __init gen8_stolen_size(int num, int slot, int func)
{
u16 gmch_ctrl;
@@ -429,48 +429,73 @@ static size_t gen8_stolen_size(int num, int slot, int func)
return gmch_ctrl << 25; /* 32 MB units */
}
+static size_t __init chv_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+
+ gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+ gmch_ctrl >>= SNB_GMCH_GMS_SHIFT;
+ gmch_ctrl &= SNB_GMCH_GMS_MASK;
+
+ /*
+ * 0x0 to 0x10: 32MB increments starting at 0MB
+ * 0x11 to 0x16: 4MB increments starting at 8MB
+ * 0x17 to 0x1d: 4MB increments start at 36MB
+ */
+ if (gmch_ctrl < 0x11)
+ return gmch_ctrl << 25;
+ else if (gmch_ctrl < 0x17)
+ return (gmch_ctrl - 0x11 + 2) << 22;
+ else
+ return (gmch_ctrl - 0x17 + 9) << 22;
+}
struct intel_stolen_funcs {
size_t (*size)(int num, int slot, int func);
u32 (*base)(int num, int slot, int func, size_t size);
};
-static const struct intel_stolen_funcs i830_stolen_funcs = {
+static const struct intel_stolen_funcs i830_stolen_funcs __initconst = {
.base = i830_stolen_base,
.size = i830_stolen_size,
};
-static const struct intel_stolen_funcs i845_stolen_funcs = {
+static const struct intel_stolen_funcs i845_stolen_funcs __initconst = {
.base = i845_stolen_base,
.size = i830_stolen_size,
};
-static const struct intel_stolen_funcs i85x_stolen_funcs = {
+static const struct intel_stolen_funcs i85x_stolen_funcs __initconst = {
.base = i85x_stolen_base,
.size = gen3_stolen_size,
};
-static const struct intel_stolen_funcs i865_stolen_funcs = {
+static const struct intel_stolen_funcs i865_stolen_funcs __initconst = {
.base = i865_stolen_base,
.size = gen3_stolen_size,
};
-static const struct intel_stolen_funcs gen3_stolen_funcs = {
+static const struct intel_stolen_funcs gen3_stolen_funcs __initconst = {
.base = intel_stolen_base,
.size = gen3_stolen_size,
};
-static const struct intel_stolen_funcs gen6_stolen_funcs = {
+static const struct intel_stolen_funcs gen6_stolen_funcs __initconst = {
.base = intel_stolen_base,
.size = gen6_stolen_size,
};
-static const struct intel_stolen_funcs gen8_stolen_funcs = {
+static const struct intel_stolen_funcs gen8_stolen_funcs __initconst = {
.base = intel_stolen_base,
.size = gen8_stolen_size,
};
-static struct pci_device_id intel_stolen_ids[] __initdata = {
+static const struct intel_stolen_funcs chv_stolen_funcs __initconst = {
+ .base = intel_stolen_base,
+ .size = chv_stolen_size,
+};
+
+static const struct pci_device_id intel_stolen_ids[] __initconst = {
INTEL_I830_IDS(&i830_stolen_funcs),
INTEL_I845G_IDS(&i845_stolen_funcs),
INTEL_I85X_IDS(&i85x_stolen_funcs),
@@ -496,7 +521,8 @@ static struct pci_device_id intel_stolen_ids[] __initdata = {
INTEL_HSW_D_IDS(&gen6_stolen_funcs),
INTEL_HSW_M_IDS(&gen6_stolen_funcs),
INTEL_BDW_M_IDS(&gen8_stolen_funcs),
- INTEL_BDW_D_IDS(&gen8_stolen_funcs)
+ INTEL_BDW_D_IDS(&gen8_stolen_funcs),
+ INTEL_CHV_IDS(&chv_stolen_funcs),
};
static void __init intel_graphics_stolen(int num, int slot, int func)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index a2a4f4697889..47c410d99f5d 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -315,10 +315,6 @@ ENTRY(ret_from_kernel_thread)
ENDPROC(ret_from_kernel_thread)
/*
- * Interrupt exit functions should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
-/*
* Return to user mode is not as complex as all this looks,
* but we want the default path for a system call return to
* go as quickly as possible which is why some of this is
@@ -372,10 +368,6 @@ need_resched:
END(resume_kernel)
#endif
CFI_ENDPROC
-/*
- * End of kprobes section
- */
- .popsection
/* SYSENTER_RETURN points to after the "sysenter" instruction in
the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
@@ -431,8 +423,9 @@ sysenter_past_esp:
jnz sysenter_audit
sysenter_do_call:
cmpl $(NR_syscalls), %eax
- jae syscall_badsys
+ jae sysenter_badsys
call *sys_call_table(,%eax,4)
+sysenter_after_call:
movl %eax,PT_EAX(%esp)
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY)
@@ -495,10 +488,6 @@ sysexit_audit:
PTGS_TO_GS_EX
ENDPROC(ia32_sysenter_target)
-/*
- * syscall stub including irq exit should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
# system call handler stub
ENTRY(system_call)
RING0_INT_FRAME # can't unwind into user space anyway
@@ -513,6 +502,7 @@ ENTRY(system_call)
jae syscall_badsys
syscall_call:
call *sys_call_table(,%eax,4)
+syscall_after_call:
movl %eax,PT_EAX(%esp) # store the return value
syscall_exit:
LOCKDEP_SYS_EXIT
@@ -527,6 +517,7 @@ syscall_exit:
restore_all:
TRACE_IRQS_IRET
restore_all_notrace:
+#ifdef CONFIG_X86_ESPFIX32
movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
# Warning: PT_OLDSS(%esp) contains the wrong/random values if we
# are returning to the kernel.
@@ -537,6 +528,7 @@ restore_all_notrace:
cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
CFI_REMEMBER_STATE
je ldt_ss # returning to user-space with LDT SS
+#endif
restore_nocheck:
RESTORE_REGS 4 # skip orig_eax/error_code
irq_return:
@@ -549,13 +541,9 @@ ENTRY(iret_exc)
.previous
_ASM_EXTABLE(irq_return,iret_exc)
+#ifdef CONFIG_X86_ESPFIX32
CFI_RESTORE_STATE
ldt_ss:
- larl PT_OLDSS(%esp), %eax
- jnz restore_nocheck
- testl $0x00400000, %eax # returning to 32bit stack?
- jnz restore_nocheck # allright, normal return
-
#ifdef CONFIG_PARAVIRT
/*
* The kernel can't run on a non-flat stack if paravirt mode
@@ -597,6 +585,7 @@ ldt_ss:
lss (%esp), %esp /* switch to espfix segment */
CFI_ADJUST_CFA_OFFSET -8
jmp restore_nocheck
+#endif
CFI_ENDPROC
ENDPROC(system_call)
@@ -687,14 +676,15 @@ syscall_fault:
END(syscall_fault)
syscall_badsys:
- movl $-ENOSYS,PT_EAX(%esp)
- jmp resume_userspace
+ movl $-ENOSYS,%eax
+ jmp syscall_after_call
+END(syscall_badsys)
+
+sysenter_badsys:
+ movl $-ENOSYS,%eax
+ jmp sysenter_after_call
END(syscall_badsys)
CFI_ENDPROC
-/*
- * End of kprobes section
- */
- .popsection
.macro FIXUP_ESPFIX_STACK
/*
@@ -704,6 +694,7 @@ END(syscall_badsys)
* the high word of the segment base from the GDT and swiches to the
* normal stack and adjusts ESP with the matching offset.
*/
+#ifdef CONFIG_X86_ESPFIX32
/* fixup the stack */
mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
@@ -713,8 +704,10 @@ END(syscall_badsys)
pushl_cfi %eax
lss (%esp), %esp /* switch to the normal stack segment */
CFI_ADJUST_CFA_OFFSET -8
+#endif
.endm
.macro UNWIND_ESPFIX_STACK
+#ifdef CONFIG_X86_ESPFIX32
movl %ss, %eax
/* see if on espfix stack */
cmpw $__ESPFIX_SS, %ax
@@ -725,6 +718,7 @@ END(syscall_badsys)
/* switch to normal stack */
FIXUP_ESPFIX_STACK
27:
+#endif
.endm
/*
@@ -781,10 +775,6 @@ common_interrupt:
ENDPROC(common_interrupt)
CFI_ENDPROC
-/*
- * Irq entries should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
#define BUILD_INTERRUPT3(name, nr, fn) \
ENTRY(name) \
RING0_INT_FRAME; \
@@ -961,10 +951,6 @@ ENTRY(spurious_interrupt_bug)
jmp error_code
CFI_ENDPROC
END(spurious_interrupt_bug)
-/*
- * End of kprobes section
- */
- .popsection
#ifdef CONFIG_XEN
/* Xen doesn't set %esp to be precisely what the normal sysenter
@@ -1073,9 +1059,6 @@ ENTRY(mcount)
END(mcount)
ENTRY(ftrace_caller)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
pushl %eax
pushl %ecx
pushl %edx
@@ -1107,8 +1090,6 @@ END(ftrace_caller)
ENTRY(ftrace_regs_caller)
pushf /* push flags before compare (in cs location) */
- cmpl $0, function_trace_stop
- jne ftrace_restore_flags
/*
* i386 does not save SS and ESP when coming from kernel.
@@ -1167,7 +1148,6 @@ GLOBAL(ftrace_regs_call)
popf /* Pop flags at end (no addl to corrupt flags) */
jmp ftrace_ret
-ftrace_restore_flags:
popf
jmp ftrace_stub
#else /* ! CONFIG_DYNAMIC_FTRACE */
@@ -1176,9 +1156,6 @@ ENTRY(mcount)
cmpl $__PAGE_OFFSET, %esp
jb ftrace_stub /* Paging not enabled yet? */
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
cmpl $ftrace_stub, ftrace_trace_function
jnz trace
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -1239,11 +1216,6 @@ return_to_handler:
jmp *%ecx
#endif
-/*
- * Some functions should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
-
#ifdef CONFIG_TRACING
ENTRY(trace_page_fault)
RING0_EC_FRAME
@@ -1355,11 +1327,13 @@ END(debug)
ENTRY(nmi)
RING0_INT_FRAME
ASM_CLAC
+#ifdef CONFIG_X86_ESPFIX32
pushl_cfi %eax
movl %ss, %eax
cmpw $__ESPFIX_SS, %ax
popl_cfi %eax
je nmi_espfix_stack
+#endif
cmpl $ia32_sysenter_target,(%esp)
je nmi_stack_fixup
pushl_cfi %eax
@@ -1399,6 +1373,7 @@ nmi_debug_stack_check:
FIX_STACK 24, nmi_stack_correct, 1
jmp nmi_stack_correct
+#ifdef CONFIG_X86_ESPFIX32
nmi_espfix_stack:
/* We have a RING0_INT_FRAME here.
*
@@ -1420,6 +1395,7 @@ nmi_espfix_stack:
lss 12+4(%esp), %esp # back to espfix stack
CFI_ADJUST_CFA_OFFSET -24
jmp irq_return
+#endif
CFI_ENDPROC
END(nmi)
@@ -1453,7 +1429,3 @@ ENTRY(async_page_fault)
END(async_page_fault)
#endif
-/*
- * End of kprobes section
- */
- .popsection
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1e96c3628bf2..2fac1343a90b 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -36,7 +36,7 @@
* - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
* frame that is otherwise undefined after a SYSCALL
* - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
- * - errorentry/paranoidentry/zeroentry - Define exception entry points.
+ * - idtentry - Define exception entry points.
*/
#include <linux/linkage.h>
@@ -53,11 +53,11 @@
#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
-#include <asm/ftrace.h>
#include <asm/percpu.h>
#include <asm/asm.h>
#include <asm/context_tracking.h>
#include <asm/smap.h>
+#include <asm/pgtable_types.h>
#include <linux/err.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
@@ -69,209 +69,6 @@
.code64
.section .entry.text, "ax"
-#ifdef CONFIG_FUNCTION_TRACER
-
-#ifdef CC_USING_FENTRY
-# define function_hook __fentry__
-#else
-# define function_hook mcount
-#endif
-
-#ifdef CONFIG_DYNAMIC_FTRACE
-
-ENTRY(function_hook)
- retq
-END(function_hook)
-
-/* skip is set if stack has been adjusted */
-.macro ftrace_caller_setup skip=0
- MCOUNT_SAVE_FRAME \skip
-
- /* Load the ftrace_ops into the 3rd parameter */
- movq function_trace_op(%rip), %rdx
-
- /* Load ip into the first parameter */
- movq RIP(%rsp), %rdi
- subq $MCOUNT_INSN_SIZE, %rdi
- /* Load the parent_ip into the second parameter */
-#ifdef CC_USING_FENTRY
- movq SS+16(%rsp), %rsi
-#else
- movq 8(%rbp), %rsi
-#endif
-.endm
-
-ENTRY(ftrace_caller)
- /* Check if tracing was disabled (quick check) */
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- ftrace_caller_setup
- /* regs go into 4th parameter (but make it NULL) */
- movq $0, %rcx
-
-GLOBAL(ftrace_call)
- call ftrace_stub
-
- MCOUNT_RESTORE_FRAME
-ftrace_return:
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-GLOBAL(ftrace_graph_call)
- jmp ftrace_stub
-#endif
-
-GLOBAL(ftrace_stub)
- retq
-END(ftrace_caller)
-
-ENTRY(ftrace_regs_caller)
- /* Save the current flags before compare (in SS location)*/
- pushfq
-
- /* Check if tracing was disabled (quick check) */
- cmpl $0, function_trace_stop
- jne ftrace_restore_flags
-
- /* skip=8 to skip flags saved in SS */
- ftrace_caller_setup 8
-
- /* Save the rest of pt_regs */
- movq %r15, R15(%rsp)
- movq %r14, R14(%rsp)
- movq %r13, R13(%rsp)
- movq %r12, R12(%rsp)
- movq %r11, R11(%rsp)
- movq %r10, R10(%rsp)
- movq %rbp, RBP(%rsp)
- movq %rbx, RBX(%rsp)
- /* Copy saved flags */
- movq SS(%rsp), %rcx
- movq %rcx, EFLAGS(%rsp)
- /* Kernel segments */
- movq $__KERNEL_DS, %rcx
- movq %rcx, SS(%rsp)
- movq $__KERNEL_CS, %rcx
- movq %rcx, CS(%rsp)
- /* Stack - skipping return address */
- leaq SS+16(%rsp), %rcx
- movq %rcx, RSP(%rsp)
-
- /* regs go into 4th parameter */
- leaq (%rsp), %rcx
-
-GLOBAL(ftrace_regs_call)
- call ftrace_stub
-
- /* Copy flags back to SS, to restore them */
- movq EFLAGS(%rsp), %rax
- movq %rax, SS(%rsp)
-
- /* Handlers can change the RIP */
- movq RIP(%rsp), %rax
- movq %rax, SS+8(%rsp)
-
- /* restore the rest of pt_regs */
- movq R15(%rsp), %r15
- movq R14(%rsp), %r14
- movq R13(%rsp), %r13
- movq R12(%rsp), %r12
- movq R10(%rsp), %r10
- movq RBP(%rsp), %rbp
- movq RBX(%rsp), %rbx
-
- /* skip=8 to skip flags saved in SS */
- MCOUNT_RESTORE_FRAME 8
-
- /* Restore flags */
- popfq
-
- jmp ftrace_return
-ftrace_restore_flags:
- popfq
- jmp ftrace_stub
-
-END(ftrace_regs_caller)
-
-
-#else /* ! CONFIG_DYNAMIC_FTRACE */
-
-ENTRY(function_hook)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- cmpq $ftrace_stub, ftrace_trace_function
- jnz trace
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- cmpq $ftrace_stub, ftrace_graph_return
- jnz ftrace_graph_caller
-
- cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
- jnz ftrace_graph_caller
-#endif
-
-GLOBAL(ftrace_stub)
- retq
-
-trace:
- MCOUNT_SAVE_FRAME
-
- movq RIP(%rsp), %rdi
-#ifdef CC_USING_FENTRY
- movq SS+16(%rsp), %rsi
-#else
- movq 8(%rbp), %rsi
-#endif
- subq $MCOUNT_INSN_SIZE, %rdi
-
- call *ftrace_trace_function
-
- MCOUNT_RESTORE_FRAME
-
- jmp ftrace_stub
-END(function_hook)
-#endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* CONFIG_FUNCTION_TRACER */
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-ENTRY(ftrace_graph_caller)
- MCOUNT_SAVE_FRAME
-
-#ifdef CC_USING_FENTRY
- leaq SS+16(%rsp), %rdi
- movq $0, %rdx /* No framepointers needed */
-#else
- leaq 8(%rbp), %rdi
- movq (%rbp), %rdx
-#endif
- movq RIP(%rsp), %rsi
- subq $MCOUNT_INSN_SIZE, %rsi
-
- call prepare_ftrace_return
-
- MCOUNT_RESTORE_FRAME
-
- retq
-END(ftrace_graph_caller)
-
-GLOBAL(return_to_handler)
- subq $24, %rsp
-
- /* Save the return values */
- movq %rax, (%rsp)
- movq %rdx, 8(%rsp)
- movq %rbp, %rdi
-
- call ftrace_return_to_handler
-
- movq %rax, %rdi
- movq 8(%rsp), %rdx
- movq (%rsp), %rax
- addq $24, %rsp
- jmp *%rdi
-#endif
-
#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
@@ -410,7 +207,6 @@ ENDPROC(native_usergs_sysret64)
*/
.macro XCPT_FRAME start=1 offset=0
INTR_FRAME \start, RIP+\offset-ORIG_RAX
- /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
.endm
/*
@@ -487,26 +283,24 @@ ENDPROC(native_usergs_sysret64)
TRACE_IRQS_OFF
.endm
-/* save complete stack frame */
- .pushsection .kprobes.text, "ax"
ENTRY(save_paranoid)
XCPT_FRAME 1 RDI+8
cld
- movq_cfi rdi, RDI+8
- movq_cfi rsi, RSI+8
+ movq %rdi, RDI+8(%rsp)
+ movq %rsi, RSI+8(%rsp)
movq_cfi rdx, RDX+8
movq_cfi rcx, RCX+8
movq_cfi rax, RAX+8
- movq_cfi r8, R8+8
- movq_cfi r9, R9+8
- movq_cfi r10, R10+8
- movq_cfi r11, R11+8
+ movq %r8, R8+8(%rsp)
+ movq %r9, R9+8(%rsp)
+ movq %r10, R10+8(%rsp)
+ movq %r11, R11+8(%rsp)
movq_cfi rbx, RBX+8
- movq_cfi rbp, RBP+8
- movq_cfi r12, R12+8
- movq_cfi r13, R13+8
- movq_cfi r14, R14+8
- movq_cfi r15, R15+8
+ movq %rbp, RBP+8(%rsp)
+ movq %r12, R12+8(%rsp)
+ movq %r13, R13+8(%rsp)
+ movq %r14, R14+8(%rsp)
+ movq %r15, R15+8(%rsp)
movl $1,%ebx
movl $MSR_GS_BASE,%ecx
rdmsr
@@ -517,7 +311,6 @@ ENTRY(save_paranoid)
1: ret
CFI_ENDPROC
END(save_paranoid)
- .popsection
/*
* A newly forked process directly context switches into this address.
@@ -975,10 +768,6 @@ END(interrupt)
call \func
.endm
-/*
- * Interrupt entry/exit should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
/*
* The interrupt stubs push (~vector+0x80) onto the stack and
* then jump to common_interrupt.
@@ -1041,12 +830,45 @@ restore_args:
irq_return:
INTERRUPT_RETURN
- _ASM_EXTABLE(irq_return, bad_iret)
-#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
+ /*
+ * Are we returning to a stack segment from the LDT? Note: in
+ * 64-bit mode SS:RSP on the exception stack is always valid.
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ testb $4,(SS-RIP)(%rsp)
+ jnz native_irq_return_ldt
+#endif
+
+native_irq_return_iret:
iretq
- _ASM_EXTABLE(native_iret, bad_iret)
+ _ASM_EXTABLE(native_irq_return_iret, bad_iret)
+
+#ifdef CONFIG_X86_ESPFIX64
+native_irq_return_ldt:
+ pushq_cfi %rax
+ pushq_cfi %rdi
+ SWAPGS
+ movq PER_CPU_VAR(espfix_waddr),%rdi
+ movq %rax,(0*8)(%rdi) /* RAX */
+ movq (2*8)(%rsp),%rax /* RIP */
+ movq %rax,(1*8)(%rdi)
+ movq (3*8)(%rsp),%rax /* CS */
+ movq %rax,(2*8)(%rdi)
+ movq (4*8)(%rsp),%rax /* RFLAGS */
+ movq %rax,(3*8)(%rdi)
+ movq (6*8)(%rsp),%rax /* SS */
+ movq %rax,(5*8)(%rdi)
+ movq (5*8)(%rsp),%rax /* RSP */
+ movq %rax,(4*8)(%rdi)
+ andl $0xffff0000,%eax
+ popq_cfi %rdi
+ orq PER_CPU_VAR(espfix_stack),%rax
+ SWAPGS
+ movq %rax,%rsp
+ popq_cfi %rax
+ jmp native_irq_return_iret
#endif
.section .fixup,"ax"
@@ -1110,13 +932,39 @@ ENTRY(retint_kernel)
call preempt_schedule_irq
jmp exit_intr
#endif
-
CFI_ENDPROC
END(common_interrupt)
-/*
- * End of kprobes section
- */
- .popsection
+
+ /*
+ * If IRET takes a fault on the espfix stack, then we
+ * end up promoting it to a doublefault. In that case,
+ * modify the stack to make it look like we just entered
+ * the #GP handler from user space, similar to bad_iret.
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ ALIGN
+__do_double_fault:
+ XCPT_FRAME 1 RDI+8
+ movq RSP(%rdi),%rax /* Trap on the espfix stack? */
+ sarq $PGDIR_SHIFT,%rax
+ cmpl $ESPFIX_PGD_ENTRY,%eax
+ jne do_double_fault /* No, just deliver the fault */
+ cmpl $__KERNEL_CS,CS(%rdi)
+ jne do_double_fault
+ movq RIP(%rdi),%rax
+ cmpq $native_irq_return_iret,%rax
+ jne do_double_fault /* This shouldn't happen... */
+ movq PER_CPU_VAR(kernel_stack),%rax
+ subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */
+ movq %rax,RSP(%rdi)
+ movq $0,(%rax) /* Missing (lost) #GP error code */
+ movq $general_protection,RIP(%rdi)
+ retq
+ CFI_ENDPROC
+END(__do_double_fault)
+#else
+# define __do_double_fault do_double_fault
+#endif
/*
* APIC interrupts.
@@ -1203,125 +1051,100 @@ apicinterrupt IRQ_WORK_VECTOR \
/*
* Exception entry points.
*/
-.macro zeroentry sym do_sym
-ENTRY(\sym)
- INTR_FRAME
- ASM_CLAC
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
- call error_entry
- DEFAULT_FRAME 0
- movq %rsp,%rdi /* pt_regs pointer */
- xorl %esi,%esi /* no error code */
- call \do_sym
- jmp error_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
+#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
-.macro paranoidzeroentry sym do_sym
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
- INTR_FRAME
- ASM_CLAC
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
- call save_paranoid
- TRACE_IRQS_OFF
- movq %rsp,%rdi /* pt_regs pointer */
- xorl %esi,%esi /* no error code */
- call \do_sym
- jmp paranoid_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
+ /* Sanity check */
+ .if \shift_ist != -1 && \paranoid == 0
+ .error "using shift_ist requires paranoid=1"
+ .endif
-#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
-.macro paranoidzeroentry_ist sym do_sym ist
-ENTRY(\sym)
+ .if \has_error_code
+ XCPT_FRAME
+ .else
INTR_FRAME
- ASM_CLAC
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
- call save_paranoid
- TRACE_IRQS_OFF_DEBUG
- movq %rsp,%rdi /* pt_regs pointer */
- xorl %esi,%esi /* no error code */
- subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
- call \do_sym
- addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
- jmp paranoid_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
+ .endif
-.macro errorentry sym do_sym
-ENTRY(\sym)
- XCPT_FRAME
ASM_CLAC
PARAVIRT_ADJUST_EXCEPTION_FRAME
+
+ .ifeq \has_error_code
+ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
+ .endif
+
subq $ORIG_RAX-R15, %rsp
CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+
+ .if \paranoid
+ call save_paranoid
+ .else
call error_entry
+ .endif
+
DEFAULT_FRAME 0
+
+ .if \paranoid
+ .if \shift_ist != -1
+ TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */
+ .else
+ TRACE_IRQS_OFF
+ .endif
+ .endif
+
movq %rsp,%rdi /* pt_regs pointer */
+
+ .if \has_error_code
movq ORIG_RAX(%rsp),%rsi /* get error code */
movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
+ .else
+ xorl %esi,%esi /* no error code */
+ .endif
+
+ .if \shift_ist != -1
+ subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+ .endif
+
call \do_sym
+
+ .if \shift_ist != -1
+ addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+ .endif
+
+ .if \paranoid
+ jmp paranoid_exit /* %ebx: no swapgs flag */
+ .else
jmp error_exit /* %ebx: no swapgs flag */
+ .endif
+
CFI_ENDPROC
END(\sym)
.endm
#ifdef CONFIG_TRACING
-.macro trace_errorentry sym do_sym
-errorentry trace(\sym) trace(\do_sym)
-errorentry \sym \do_sym
+.macro trace_idtentry sym do_sym has_error_code:req
+idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code
+idtentry \sym \do_sym has_error_code=\has_error_code
.endm
#else
-.macro trace_errorentry sym do_sym
-errorentry \sym \do_sym
+.macro trace_idtentry sym do_sym has_error_code:req
+idtentry \sym \do_sym has_error_code=\has_error_code
.endm
#endif
- /* error code is on the stack already */
-.macro paranoiderrorentry sym do_sym
-ENTRY(\sym)
- XCPT_FRAME
- ASM_CLAC
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
- call save_paranoid
- DEFAULT_FRAME 0
- TRACE_IRQS_OFF
- movq %rsp,%rdi /* pt_regs pointer */
- movq ORIG_RAX(%rsp),%rsi /* get error code */
- movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
- call \do_sym
- jmp paranoid_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
-
-zeroentry divide_error do_divide_error
-zeroentry overflow do_overflow
-zeroentry bounds do_bounds
-zeroentry invalid_op do_invalid_op
-zeroentry device_not_available do_device_not_available
-paranoiderrorentry double_fault do_double_fault
-zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
-errorentry invalid_TSS do_invalid_TSS
-errorentry segment_not_present do_segment_not_present
-zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
-zeroentry coprocessor_error do_coprocessor_error
-errorentry alignment_check do_alignment_check
-zeroentry simd_coprocessor_error do_simd_coprocessor_error
+idtentry divide_error do_divide_error has_error_code=0
+idtentry overflow do_overflow has_error_code=0
+idtentry bounds do_bounds has_error_code=0
+idtentry invalid_op do_invalid_op has_error_code=0
+idtentry device_not_available do_device_not_available has_error_code=0
+idtentry double_fault __do_double_fault has_error_code=1 paranoid=1
+idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
+idtentry invalid_TSS do_invalid_TSS has_error_code=1
+idtentry segment_not_present do_segment_not_present has_error_code=1
+idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
+idtentry coprocessor_error do_coprocessor_error has_error_code=0
+idtentry alignment_check do_alignment_check has_error_code=1
+idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
/* Reload gs selector with exception handling */
@@ -1371,7 +1194,7 @@ ENTRY(do_softirq_own_stack)
END(do_softirq_own_stack)
#ifdef CONFIG_XEN
-zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
+idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
/*
* A note on the "critical region" in our callback handler.
@@ -1477,26 +1300,21 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
hyperv_callback_vector hyperv_vector_handler
#endif /* CONFIG_HYPERV */
-/*
- * Some functions should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
-
-paranoidzeroentry_ist debug do_debug DEBUG_STACK
-paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
-paranoiderrorentry stack_segment do_stack_segment
+idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
+idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
+idtentry stack_segment do_stack_segment has_error_code=1 paranoid=1
#ifdef CONFIG_XEN
-zeroentry xen_debug do_debug
-zeroentry xen_int3 do_int3
-errorentry xen_stack_segment do_stack_segment
+idtentry xen_debug do_debug has_error_code=0
+idtentry xen_int3 do_int3 has_error_code=0
+idtentry xen_stack_segment do_stack_segment has_error_code=1
#endif
-errorentry general_protection do_general_protection
-trace_errorentry page_fault do_page_fault
+idtentry general_protection do_general_protection has_error_code=1
+trace_idtentry page_fault do_page_fault has_error_code=1
#ifdef CONFIG_KVM_GUEST
-errorentry async_page_fault do_async_page_fault
+idtentry async_page_fault do_async_page_fault has_error_code=1
#endif
#ifdef CONFIG_X86_MCE
-paranoidzeroentry machine_check *machine_check_vector(%rip)
+idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
#endif
/*
@@ -1568,21 +1386,21 @@ ENTRY(error_entry)
CFI_ADJUST_CFA_OFFSET 15*8
/* oldrax contains error code */
cld
- movq_cfi rdi, RDI+8
- movq_cfi rsi, RSI+8
- movq_cfi rdx, RDX+8
- movq_cfi rcx, RCX+8
- movq_cfi rax, RAX+8
- movq_cfi r8, R8+8
- movq_cfi r9, R9+8
- movq_cfi r10, R10+8
- movq_cfi r11, R11+8
+ movq %rdi, RDI+8(%rsp)
+ movq %rsi, RSI+8(%rsp)
+ movq %rdx, RDX+8(%rsp)
+ movq %rcx, RCX+8(%rsp)
+ movq %rax, RAX+8(%rsp)
+ movq %r8, R8+8(%rsp)
+ movq %r9, R9+8(%rsp)
+ movq %r10, R10+8(%rsp)
+ movq %r11, R11+8(%rsp)
movq_cfi rbx, RBX+8
- movq_cfi rbp, RBP+8
- movq_cfi r12, R12+8
- movq_cfi r13, R13+8
- movq_cfi r14, R14+8
- movq_cfi r15, R15+8
+ movq %rbp, RBP+8(%rsp)
+ movq %r12, R12+8(%rsp)
+ movq %r13, R13+8(%rsp)
+ movq %r14, R14+8(%rsp)
+ movq %r15, R15+8(%rsp)
xorl %ebx,%ebx
testl $3,CS+8(%rsp)
je error_kernelspace
@@ -1600,8 +1418,9 @@ error_sti:
* compat mode. Check for these here too.
*/
error_kernelspace:
+ CFI_REL_OFFSET rcx, RCX+8
incl %ebx
- leaq irq_return(%rip),%rcx
+ leaq native_irq_return_iret(%rip),%rcx
cmpq %rcx,RIP+8(%rsp)
je error_swapgs
movl %ecx,%eax /* zero extend */
@@ -1898,7 +1717,3 @@ ENTRY(ignore_sysret)
CFI_ENDPROC
END(ignore_sysret)
-/*
- * End of kprobes section
- */
- .popsection
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
new file mode 100644
index 000000000000..94d857fb1033
--- /dev/null
+++ b/arch/x86/kernel/espfix_64.c
@@ -0,0 +1,208 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 2014 Intel Corporation; author: H. Peter Anvin
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * The IRET instruction, when returning to a 16-bit segment, only
+ * restores the bottom 16 bits of the user space stack pointer. This
+ * causes some 16-bit software to break, but it also leaks kernel state
+ * to user space.
+ *
+ * This works around this by creating percpu "ministacks", each of which
+ * is mapped 2^16 times 64K apart. When we detect that the return SS is
+ * on the LDT, we copy the IRET frame to the ministack and use the
+ * relevant alias to return to userspace. The ministacks are mapped
+ * readonly, so if the IRET fault we promote #GP to #DF which is an IST
+ * vector and thus has its own stack; we then do the fixup in the #DF
+ * handler.
+ *
+ * This file sets up the ministacks and the related page tables. The
+ * actual ministack invocation is in entry_64.S.
+ */
+
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <linux/random.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/setup.h>
+#include <asm/espfix.h>
+
+/*
+ * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
+ * it up to a cache line to avoid unnecessary sharing.
+ */
+#define ESPFIX_STACK_SIZE (8*8UL)
+#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE)
+
+/* There is address space for how many espfix pages? */
+#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16))
+
+#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
+#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
+# error "Need more than one PGD for the ESPFIX hack"
+#endif
+
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
+
+/* This contains the *bottom* address of the espfix stack */
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+
+/* Initialization mutex - should this be a spinlock? */
+static DEFINE_MUTEX(espfix_init_mutex);
+
+/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */
+#define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
+static void *espfix_pages[ESPFIX_MAX_PAGES];
+
+static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD]
+ __aligned(PAGE_SIZE);
+
+static unsigned int page_random, slot_random;
+
+/*
+ * This returns the bottom address of the espfix stack for a specific CPU.
+ * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case
+ * we have to account for some amount of padding at the end of each page.
+ */
+static inline unsigned long espfix_base_addr(unsigned int cpu)
+{
+ unsigned long page, slot;
+ unsigned long addr;
+
+ page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random;
+ slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE;
+ addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE);
+ addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
+ addr += ESPFIX_BASE_ADDR;
+ return addr;
+}
+
+#define PTE_STRIDE (65536/PAGE_SIZE)
+#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE)
+#define ESPFIX_PMD_CLONES PTRS_PER_PMD
+#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES))
+
+#define PGTABLE_PROT ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX)
+
+static void init_espfix_random(void)
+{
+ unsigned long rand;
+
+ /*
+ * This is run before the entropy pools are initialized,
+ * but this is hopefully better than nothing.
+ */
+ if (!arch_get_random_long(&rand)) {
+ /* The constant is an arbitrary large prime */
+ rdtscll(rand);
+ rand *= 0xc345c6b72fd16123UL;
+ }
+
+ slot_random = rand % ESPFIX_STACKS_PER_PAGE;
+ page_random = (rand / ESPFIX_STACKS_PER_PAGE)
+ & (ESPFIX_PAGE_SPACE - 1);
+}
+
+void __init init_espfix_bsp(void)
+{
+ pgd_t *pgd_p;
+ pteval_t ptemask;
+
+ ptemask = __supported_pte_mask;
+
+ /* Install the espfix pud into the kernel page directory */
+ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+ pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+
+ /* Randomize the locations */
+ init_espfix_random();
+
+ /* The rest is the same as for any other processor */
+ init_espfix_ap();
+}
+
+void init_espfix_ap(void)
+{
+ unsigned int cpu, page;
+ unsigned long addr;
+ pud_t pud, *pud_p;
+ pmd_t pmd, *pmd_p;
+ pte_t pte, *pte_p;
+ int n;
+ void *stack_page;
+ pteval_t ptemask;
+
+ /* We only have to do this once... */
+ if (likely(this_cpu_read(espfix_stack)))
+ return; /* Already initialized */
+
+ cpu = smp_processor_id();
+ addr = espfix_base_addr(cpu);
+ page = cpu/ESPFIX_STACKS_PER_PAGE;
+
+ /* Did another CPU already set this up? */
+ stack_page = ACCESS_ONCE(espfix_pages[page]);
+ if (likely(stack_page))
+ goto done;
+
+ mutex_lock(&espfix_init_mutex);
+
+ /* Did we race on the lock? */
+ stack_page = ACCESS_ONCE(espfix_pages[page]);
+ if (stack_page)
+ goto unlock_done;
+
+ ptemask = __supported_pte_mask;
+
+ pud_p = &espfix_pud_page[pud_index(addr)];
+ pud = *pud_p;
+ if (!pud_present(pud)) {
+ pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
+ pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
+ paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
+ for (n = 0; n < ESPFIX_PUD_CLONES; n++)
+ set_pud(&pud_p[n], pud);
+ }
+
+ pmd_p = pmd_offset(&pud, addr);
+ pmd = *pmd_p;
+ if (!pmd_present(pmd)) {
+ pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
+ pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
+ paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
+ for (n = 0; n < ESPFIX_PMD_CLONES; n++)
+ set_pmd(&pmd_p[n], pmd);
+ }
+
+ pte_p = pte_offset_kernel(&pmd, addr);
+ stack_page = (void *)__get_free_page(GFP_KERNEL);
+ pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
+ for (n = 0; n < ESPFIX_PTE_CLONES; n++)
+ set_pte(&pte_p[n*PTE_STRIDE], pte);
+
+ /* Job is done for this CPU and any CPU which shares this page */
+ ACCESS_ONCE(espfix_pages[page]) = stack_page;
+
+unlock_done:
+ mutex_unlock(&espfix_init_mutex);
+done:
+ this_cpu_write(espfix_stack, addr);
+ this_cpu_write(espfix_waddr, (unsigned long)stack_page
+ + (addr & ~PAGE_MASK));
+}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 52819e816f87..3386dc9aa333 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -297,16 +297,7 @@ int ftrace_int3_handler(struct pt_regs *regs)
static int ftrace_write(unsigned long ip, const char *val, int size)
{
- /*
- * On x86_64, kernel text mappings are mapped read-only with
- * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
- * of the kernel text mapping to modify the kernel text.
- *
- * For 32bit kernels, these mappings are same and we can use
- * kernel identity mapping to modify code.
- */
- if (within(ip, (unsigned long)_text, (unsigned long)_etext))
- ip = (unsigned long)__va(__pa_symbol(ip));
+ ip = text_ip_addr(ip);
if (probe_kernel_write((void *)ip, val, size))
return -EPERM;
@@ -349,40 +340,14 @@ static int add_brk_on_nop(struct dyn_ftrace *rec)
return add_break(rec->ip, old);
}
-/*
- * If the record has the FTRACE_FL_REGS set, that means that it
- * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS
- * is not not set, then it wants to convert to the normal callback.
- */
-static unsigned long get_ftrace_addr(struct dyn_ftrace *rec)
-{
- if (rec->flags & FTRACE_FL_REGS)
- return (unsigned long)FTRACE_REGS_ADDR;
- else
- return (unsigned long)FTRACE_ADDR;
-}
-
-/*
- * The FTRACE_FL_REGS_EN is set when the record already points to
- * a function that saves all the regs. Basically the '_EN' version
- * represents the current state of the function.
- */
-static unsigned long get_ftrace_old_addr(struct dyn_ftrace *rec)
-{
- if (rec->flags & FTRACE_FL_REGS_EN)
- return (unsigned long)FTRACE_REGS_ADDR;
- else
- return (unsigned long)FTRACE_ADDR;
-}
-
static int add_breakpoints(struct dyn_ftrace *rec, int enable)
{
unsigned long ftrace_addr;
int ret;
- ret = ftrace_test_record(rec, enable);
+ ftrace_addr = ftrace_get_addr_curr(rec);
- ftrace_addr = get_ftrace_addr(rec);
+ ret = ftrace_test_record(rec, enable);
switch (ret) {
case FTRACE_UPDATE_IGNORE:
@@ -392,10 +357,7 @@ static int add_breakpoints(struct dyn_ftrace *rec, int enable)
/* converting nop to call */
return add_brk_on_nop(rec);
- case FTRACE_UPDATE_MODIFY_CALL_REGS:
case FTRACE_UPDATE_MODIFY_CALL:
- ftrace_addr = get_ftrace_old_addr(rec);
- /* fall through */
case FTRACE_UPDATE_MAKE_NOP:
/* converting a call to a nop */
return add_brk_on_call(rec, ftrace_addr);
@@ -440,14 +402,14 @@ static int remove_breakpoint(struct dyn_ftrace *rec)
* If not, don't touch the breakpoint, we make just create
* a disaster.
*/
- ftrace_addr = get_ftrace_addr(rec);
+ ftrace_addr = ftrace_get_addr_new(rec);
nop = ftrace_call_replace(ip, ftrace_addr);
if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0)
goto update;
/* Check both ftrace_addr and ftrace_old_addr */
- ftrace_addr = get_ftrace_old_addr(rec);
+ ftrace_addr = ftrace_get_addr_curr(rec);
nop = ftrace_call_replace(ip, ftrace_addr);
if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
@@ -491,13 +453,12 @@ static int add_update(struct dyn_ftrace *rec, int enable)
ret = ftrace_test_record(rec, enable);
- ftrace_addr = get_ftrace_addr(rec);
+ ftrace_addr = ftrace_get_addr_new(rec);
switch (ret) {
case FTRACE_UPDATE_IGNORE:
return 0;
- case FTRACE_UPDATE_MODIFY_CALL_REGS:
case FTRACE_UPDATE_MODIFY_CALL:
case FTRACE_UPDATE_MAKE_CALL:
/* converting nop to call */
@@ -538,13 +499,12 @@ static int finish_update(struct dyn_ftrace *rec, int enable)
ret = ftrace_update_record(rec, enable);
- ftrace_addr = get_ftrace_addr(rec);
+ ftrace_addr = ftrace_get_addr_new(rec);
switch (ret) {
case FTRACE_UPDATE_IGNORE:
return 0;
- case FTRACE_UPDATE_MODIFY_CALL_REGS:
case FTRACE_UPDATE_MODIFY_CALL:
case FTRACE_UPDATE_MAKE_CALL:
/* converting nop to call */
@@ -621,8 +581,8 @@ void ftrace_replace_code(int enable)
return;
remove_breakpoints:
+ pr_warn("Failed on %s (%d):\n", report, count);
ftrace_bug(ret, rec ? rec->ip : 0);
- printk(KERN_WARNING "Failed on %s (%d):\n", report, count);
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
/*
@@ -743,6 +703,9 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
unsigned long return_hooker = (unsigned long)
&return_to_handler;
+ if (unlikely(ftrace_graph_is_dead()))
+ return;
+
if (unlikely(atomic_read(&current->tracing_graph_pause)))
return;
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 068054f4bf20..eda1a865641e 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -172,7 +172,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
*/
load_ucode_bsp();
- if (console_loglevel == 10)
+ if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG)
early_printk("Kernel alive\n");
clear_page(init_level4_pgt);
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 4177bfbc80b0..319bcb9372fe 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -74,9 +74,6 @@ static inline void hpet_writel(unsigned int d, unsigned int a)
static inline void hpet_set_mapping(void)
{
hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
-#ifdef CONFIG_X86_64
- __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE);
-#endif
}
static inline void hpet_clear_mapping(void)
@@ -479,7 +476,7 @@ static int hpet_msi_next_event(unsigned long delta,
static int hpet_setup_msi_irq(unsigned int irq)
{
if (x86_msi.setup_hpet_msi(irq, hpet_blockid)) {
- destroy_irq(irq);
+ irq_free_hwirq(irq);
return -EINVAL;
}
return 0;
@@ -487,9 +484,8 @@ static int hpet_setup_msi_irq(unsigned int irq)
static int hpet_assign_irq(struct hpet_dev *dev)
{
- unsigned int irq;
+ unsigned int irq = irq_alloc_hwirq(-1);
- irq = create_irq_nr(0, -1);
if (!irq)
return -EINVAL;
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index a67b47c31314..5f9cf20cdb68 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -32,7 +32,6 @@
#include <linux/irqflags.h>
#include <linux/notifier.h>
#include <linux/kallsyms.h>
-#include <linux/kprobes.h>
#include <linux/percpu.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
@@ -424,7 +423,7 @@ EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
* NOTIFY_STOP returned for all other cases
*
*/
-static int __kprobes hw_breakpoint_handler(struct die_args *args)
+static int hw_breakpoint_handler(struct die_args *args)
{
int i, cpu, rc = NOTIFY_STOP;
struct perf_event *bp;
@@ -511,7 +510,7 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
/*
* Handle debug exception notifications.
*/
-int __kprobes hw_breakpoint_exceptions_notify(
+int hw_breakpoint_exceptions_notify(
struct notifier_block *unused, unsigned long val, void *data)
{
if (val != DIE_DEBUG)
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 2e977b5d61dd..8af817105e29 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -299,13 +299,31 @@ static void unmask_8259A(void)
static void init_8259A(int auto_eoi)
{
unsigned long flags;
+ unsigned char probe_val = ~(1 << PIC_CASCADE_IR);
+ unsigned char new_val;
i8259A_auto_eoi = auto_eoi;
raw_spin_lock_irqsave(&i8259A_lock, flags);
- outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
+ /*
+ * Check to see if we have a PIC.
+ * Mask all except the cascade and read
+ * back the value we just wrote. If we don't
+ * have a PIC, we will read 0xff as opposed to the
+ * value we wrote.
+ */
outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
+ outb(probe_val, PIC_MASTER_IMR);
+ new_val = inb(PIC_MASTER_IMR);
+ if (new_val != probe_val) {
+ printk(KERN_INFO "Using NULL legacy PIC\n");
+ legacy_pic = &null_legacy_pic;
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+ return;
+ }
+
+ outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
/*
* outb_pic - this has to work on a wide range of PC hardware.
diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c
index c3aae6672843..d30acdc1229d 100644
--- a/arch/x86/kernel/iosf_mbi.c
+++ b/arch/x86/kernel/iosf_mbi.c
@@ -25,6 +25,9 @@
#include <asm/iosf_mbi.h>
+#define PCI_DEVICE_ID_BAYTRAIL 0x0F00
+#define PCI_DEVICE_ID_QUARK_X1000 0x0958
+
static DEFINE_SPINLOCK(iosf_mbi_lock);
static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset)
@@ -177,6 +180,13 @@ int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask)
}
EXPORT_SYMBOL(iosf_mbi_modify);
+bool iosf_mbi_available(void)
+{
+ /* Mbi isn't hot-pluggable. No remove routine is provided */
+ return mbi_pdev;
+}
+EXPORT_SYMBOL(iosf_mbi_available);
+
static int iosf_mbi_probe(struct pci_dev *pdev,
const struct pci_device_id *unused)
{
@@ -193,7 +203,8 @@ static int iosf_mbi_probe(struct pci_dev *pdev,
}
static DEFINE_PCI_DEVICE_TABLE(iosf_mbi_pci_ids) = {
- { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0F00) },
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BAYTRAIL) },
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_QUARK_X1000) },
{ 0, },
};
MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids);
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 283a76a9cc40..922d28581024 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -17,6 +17,7 @@
#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/hw_irq.h>
+#include <asm/desc.h>
#define CREATE_TRACE_POINTS
#include <asm/trace/irq_vectors.h>
@@ -334,10 +335,17 @@ int check_irq_vectors_for_cpu_disable(void)
for_each_online_cpu(cpu) {
if (cpu == this_cpu)
continue;
- for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
- vector++) {
- if (per_cpu(vector_irq, cpu)[vector] < 0)
- count++;
+ /*
+ * We scan from FIRST_EXTERNAL_VECTOR to first system
+ * vector. If the vector is marked in the used vectors
+ * bitmap or an irq is assigned to it, we don't count
+ * it as available.
+ */
+ for (vector = FIRST_EXTERNAL_VECTOR;
+ vector < first_system_vector; vector++) {
+ if (!test_bit(vector, used_vectors) &&
+ per_cpu(vector_irq, cpu)[vector] < 0)
+ count++;
}
}
@@ -357,6 +365,7 @@ void fixup_irqs(void)
struct irq_desc *desc;
struct irq_data *data;
struct irq_chip *chip;
+ int ret;
for_each_irq_desc(irq, desc) {
int break_affinity = 0;
@@ -395,10 +404,14 @@ void fixup_irqs(void)
if (!irqd_can_move_in_process_context(data) && chip->irq_mask)
chip->irq_mask(data);
- if (chip->irq_set_affinity)
- chip->irq_set_affinity(data, affinity, true);
- else if (!(warned++))
- set_affinity = 0;
+ if (chip->irq_set_affinity) {
+ ret = chip->irq_set_affinity(data, affinity, true);
+ if (ret == -ENOSPC)
+ pr_crit("IRQ %d set affinity failed because there are no available vectors. The device assigned to this IRQ is unstable.\n", irq);
+ } else {
+ if (!(warned++))
+ set_affinity = 0;
+ }
/*
* We unmask if the irq was not marked masked by the
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
new file mode 100644
index 000000000000..9642b9b33655
--- /dev/null
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -0,0 +1,553 @@
+/*
+ * Kexec bzImage loader
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ * Authors:
+ * Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#define pr_fmt(fmt) "kexec-bzImage64: " fmt
+
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/efi.h>
+#include <linux/verify_pefile.h>
+#include <keys/system_keyring.h>
+
+#include <asm/bootparam.h>
+#include <asm/setup.h>
+#include <asm/crash.h>
+#include <asm/efi.h>
+
+#define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */
+
+/*
+ * Defines lowest physical address for various segments. Not sure where
+ * exactly these limits came from. Current bzimage64 loader in kexec-tools
+ * uses these so I am retaining it. It can be changed over time as we gain
+ * more insight.
+ */
+#define MIN_PURGATORY_ADDR 0x3000
+#define MIN_BOOTPARAM_ADDR 0x3000
+#define MIN_KERNEL_LOAD_ADDR 0x100000
+#define MIN_INITRD_LOAD_ADDR 0x1000000
+
+/*
+ * This is a place holder for all boot loader specific data structure which
+ * gets allocated in one call but gets freed much later during cleanup
+ * time. Right now there is only one field but it can grow as need be.
+ */
+struct bzimage64_data {
+ /*
+ * Temporary buffer to hold bootparams buffer. This should be
+ * freed once the bootparam segment has been loaded.
+ */
+ void *bootparams_buf;
+};
+
+static int setup_initrd(struct boot_params *params,
+ unsigned long initrd_load_addr, unsigned long initrd_len)
+{
+ params->hdr.ramdisk_image = initrd_load_addr & 0xffffffffUL;
+ params->hdr.ramdisk_size = initrd_len & 0xffffffffUL;
+
+ params->ext_ramdisk_image = initrd_load_addr >> 32;
+ params->ext_ramdisk_size = initrd_len >> 32;
+
+ return 0;
+}
+
+static int setup_cmdline(struct kimage *image, struct boot_params *params,
+ unsigned long bootparams_load_addr,
+ unsigned long cmdline_offset, char *cmdline,
+ unsigned long cmdline_len)
+{
+ char *cmdline_ptr = ((char *)params) + cmdline_offset;
+ unsigned long cmdline_ptr_phys, len;
+ uint32_t cmdline_low_32, cmdline_ext_32;
+
+ memcpy(cmdline_ptr, cmdline, cmdline_len);
+ if (image->type == KEXEC_TYPE_CRASH) {
+ len = sprintf(cmdline_ptr + cmdline_len - 1,
+ " elfcorehdr=0x%lx", image->arch.elf_load_addr);
+ cmdline_len += len;
+ }
+ cmdline_ptr[cmdline_len - 1] = '\0';
+
+ pr_debug("Final command line is: %s\n", cmdline_ptr);
+ cmdline_ptr_phys = bootparams_load_addr + cmdline_offset;
+ cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL;
+ cmdline_ext_32 = cmdline_ptr_phys >> 32;
+
+ params->hdr.cmd_line_ptr = cmdline_low_32;
+ if (cmdline_ext_32)
+ params->ext_cmd_line_ptr = cmdline_ext_32;
+
+ return 0;
+}
+
+static int setup_e820_entries(struct boot_params *params)
+{
+ unsigned int nr_e820_entries;
+
+ nr_e820_entries = e820_saved.nr_map;
+
+ /* TODO: Pass entries more than E820MAX in bootparams setup data */
+ if (nr_e820_entries > E820MAX)
+ nr_e820_entries = E820MAX;
+
+ params->e820_entries = nr_e820_entries;
+ memcpy(&params->e820_map, &e820_saved.map,
+ nr_e820_entries * sizeof(struct e820entry));
+
+ return 0;
+}
+
+#ifdef CONFIG_EFI
+static int setup_efi_info_memmap(struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_map_offset,
+ unsigned int efi_map_sz)
+{
+ void *efi_map = (void *)params + efi_map_offset;
+ unsigned long efi_map_phys_addr = params_load_addr + efi_map_offset;
+ struct efi_info *ei = &params->efi_info;
+
+ if (!efi_map_sz)
+ return 0;
+
+ efi_runtime_map_copy(efi_map, efi_map_sz);
+
+ ei->efi_memmap = efi_map_phys_addr & 0xffffffff;
+ ei->efi_memmap_hi = efi_map_phys_addr >> 32;
+ ei->efi_memmap_size = efi_map_sz;
+
+ return 0;
+}
+
+static int
+prepare_add_efi_setup_data(struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_setup_data_offset)
+{
+ unsigned long setup_data_phys;
+ struct setup_data *sd = (void *)params + efi_setup_data_offset;
+ struct efi_setup_data *esd = (void *)sd + sizeof(struct setup_data);
+
+ esd->fw_vendor = efi.fw_vendor;
+ esd->runtime = efi.runtime;
+ esd->tables = efi.config_table;
+ esd->smbios = efi.smbios;
+
+ sd->type = SETUP_EFI;
+ sd->len = sizeof(struct efi_setup_data);
+
+ /* Add setup data */
+ setup_data_phys = params_load_addr + efi_setup_data_offset;
+ sd->next = params->hdr.setup_data;
+ params->hdr.setup_data = setup_data_phys;
+
+ return 0;
+}
+
+static int
+setup_efi_state(struct boot_params *params, unsigned long params_load_addr,
+ unsigned int efi_map_offset, unsigned int efi_map_sz,
+ unsigned int efi_setup_data_offset)
+{
+ struct efi_info *current_ei = &boot_params.efi_info;
+ struct efi_info *ei = &params->efi_info;
+
+ if (!current_ei->efi_memmap_size)
+ return 0;
+
+ /*
+ * If 1:1 mapping is not enabled, second kernel can not setup EFI
+ * and use EFI run time services. User space will have to pass
+ * acpi_rsdp=<addr> on kernel command line to make second kernel boot
+ * without efi.
+ */
+ if (efi_enabled(EFI_OLD_MEMMAP))
+ return 0;
+
+ ei->efi_loader_signature = current_ei->efi_loader_signature;
+ ei->efi_systab = current_ei->efi_systab;
+ ei->efi_systab_hi = current_ei->efi_systab_hi;
+
+ ei->efi_memdesc_version = current_ei->efi_memdesc_version;
+ ei->efi_memdesc_size = efi_get_runtime_map_desc_size();
+
+ setup_efi_info_memmap(params, params_load_addr, efi_map_offset,
+ efi_map_sz);
+ prepare_add_efi_setup_data(params, params_load_addr,
+ efi_setup_data_offset);
+ return 0;
+}
+#endif /* CONFIG_EFI */
+
+static int
+setup_boot_parameters(struct kimage *image, struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_map_offset, unsigned int efi_map_sz,
+ unsigned int efi_setup_data_offset)
+{
+ unsigned int nr_e820_entries;
+ unsigned long long mem_k, start, end;
+ int i, ret = 0;
+
+ /* Get subarch from existing bootparams */
+ params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch;
+
+ /* Copying screen_info will do? */
+ memcpy(&params->screen_info, &boot_params.screen_info,
+ sizeof(struct screen_info));
+
+ /* Fill in memsize later */
+ params->screen_info.ext_mem_k = 0;
+ params->alt_mem_k = 0;
+
+ /* Default APM info */
+ memset(&params->apm_bios_info, 0, sizeof(params->apm_bios_info));
+
+ /* Default drive info */
+ memset(&params->hd0_info, 0, sizeof(params->hd0_info));
+ memset(&params->hd1_info, 0, sizeof(params->hd1_info));
+
+ /* Default sysdesc table */
+ params->sys_desc_table.length = 0;
+
+ if (image->type == KEXEC_TYPE_CRASH) {
+ ret = crash_setup_memmap_entries(image, params);
+ if (ret)
+ return ret;
+ } else
+ setup_e820_entries(params);
+
+ nr_e820_entries = params->e820_entries;
+
+ for (i = 0; i < nr_e820_entries; i++) {
+ if (params->e820_map[i].type != E820_RAM)
+ continue;
+ start = params->e820_map[i].addr;
+ end = params->e820_map[i].addr + params->e820_map[i].size - 1;
+
+ if ((start <= 0x100000) && end > 0x100000) {
+ mem_k = (end >> 10) - (0x100000 >> 10);
+ params->screen_info.ext_mem_k = mem_k;
+ params->alt_mem_k = mem_k;
+ if (mem_k > 0xfc00)
+ params->screen_info.ext_mem_k = 0xfc00; /* 64M*/
+ if (mem_k > 0xffffffff)
+ params->alt_mem_k = 0xffffffff;
+ }
+ }
+
+#ifdef CONFIG_EFI
+ /* Setup EFI state */
+ setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz,
+ efi_setup_data_offset);
+#endif
+
+ /* Setup EDD info */
+ memcpy(params->eddbuf, boot_params.eddbuf,
+ EDDMAXNR * sizeof(struct edd_info));
+ params->eddbuf_entries = boot_params.eddbuf_entries;
+
+ memcpy(params->edd_mbr_sig_buffer, boot_params.edd_mbr_sig_buffer,
+ EDD_MBR_SIG_MAX * sizeof(unsigned int));
+
+ return ret;
+}
+
+int bzImage64_probe(const char *buf, unsigned long len)
+{
+ int ret = -ENOEXEC;
+ struct setup_header *header;
+
+ /* kernel should be atleast two sectors long */
+ if (len < 2 * 512) {
+ pr_err("File is too short to be a bzImage\n");
+ return ret;
+ }
+
+ header = (struct setup_header *)(buf + offsetof(struct boot_params, hdr));
+ if (memcmp((char *)&header->header, "HdrS", 4) != 0) {
+ pr_err("Not a bzImage\n");
+ return ret;
+ }
+
+ if (header->boot_flag != 0xAA55) {
+ pr_err("No x86 boot sector present\n");
+ return ret;
+ }
+
+ if (header->version < 0x020C) {
+ pr_err("Must be at least protocol version 2.12\n");
+ return ret;
+ }
+
+ if (!(header->loadflags & LOADED_HIGH)) {
+ pr_err("zImage not a bzImage\n");
+ return ret;
+ }
+
+ if (!(header->xloadflags & XLF_KERNEL_64)) {
+ pr_err("Not a bzImage64. XLF_KERNEL_64 is not set.\n");
+ return ret;
+ }
+
+ if (!(header->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G)) {
+ pr_err("XLF_CAN_BE_LOADED_ABOVE_4G is not set.\n");
+ return ret;
+ }
+
+ /*
+ * Can't handle 32bit EFI as it does not allow loading kernel
+ * above 4G. This should be handled by 32bit bzImage loader
+ */
+ if (efi_enabled(EFI_RUNTIME_SERVICES) && !efi_enabled(EFI_64BIT)) {
+ pr_debug("EFI is 32 bit. Can't load kernel above 4G.\n");
+ return ret;
+ }
+
+ /* I've got a bzImage */
+ pr_debug("It's a relocatable bzImage64\n");
+ ret = 0;
+
+ return ret;
+}
+
+void *bzImage64_load(struct kimage *image, char *kernel,
+ unsigned long kernel_len, char *initrd,
+ unsigned long initrd_len, char *cmdline,
+ unsigned long cmdline_len)
+{
+
+ struct setup_header *header;
+ int setup_sects, kern16_size, ret = 0;
+ unsigned long setup_header_size, params_cmdline_sz, params_misc_sz;
+ struct boot_params *params;
+ unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr;
+ unsigned long purgatory_load_addr;
+ unsigned long kernel_bufsz, kernel_memsz, kernel_align;
+ char *kernel_buf;
+ struct bzimage64_data *ldata;
+ struct kexec_entry64_regs regs64;
+ void *stack;
+ unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr);
+ unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset;
+
+ header = (struct setup_header *)(kernel + setup_hdr_offset);
+ setup_sects = header->setup_sects;
+ if (setup_sects == 0)
+ setup_sects = 4;
+
+ kern16_size = (setup_sects + 1) * 512;
+ if (kernel_len < kern16_size) {
+ pr_err("bzImage truncated\n");
+ return ERR_PTR(-ENOEXEC);
+ }
+
+ if (cmdline_len > header->cmdline_size) {
+ pr_err("Kernel command line too long\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ /*
+ * In case of crash dump, we will append elfcorehdr=<addr> to
+ * command line. Make sure it does not overflow
+ */
+ if (cmdline_len + MAX_ELFCOREHDR_STR_LEN > header->cmdline_size) {
+ pr_debug("Appending elfcorehdr=<addr> to command line exceeds maximum allowed length\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* Allocate and load backup region */
+ if (image->type == KEXEC_TYPE_CRASH) {
+ ret = crash_load_segments(image);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ /*
+ * Load purgatory. For 64bit entry point, purgatory code can be
+ * anywhere.
+ */
+ ret = kexec_load_purgatory(image, MIN_PURGATORY_ADDR, ULONG_MAX, 1,
+ &purgatory_load_addr);
+ if (ret) {
+ pr_err("Loading purgatory failed\n");
+ return ERR_PTR(ret);
+ }
+
+ pr_debug("Loaded purgatory at 0x%lx\n", purgatory_load_addr);
+
+
+ /*
+ * Load Bootparams and cmdline and space for efi stuff.
+ *
+ * Allocate memory together for multiple data structures so
+ * that they all can go in single area/segment and we don't
+ * have to create separate segment for each. Keeps things
+ * little bit simple
+ */
+ efi_map_sz = efi_get_runtime_map_size();
+ efi_map_sz = ALIGN(efi_map_sz, 16);
+ params_cmdline_sz = sizeof(struct boot_params) + cmdline_len +
+ MAX_ELFCOREHDR_STR_LEN;
+ params_cmdline_sz = ALIGN(params_cmdline_sz, 16);
+ params_misc_sz = params_cmdline_sz + efi_map_sz +
+ sizeof(struct setup_data) +
+ sizeof(struct efi_setup_data);
+
+ params = kzalloc(params_misc_sz, GFP_KERNEL);
+ if (!params)
+ return ERR_PTR(-ENOMEM);
+ efi_map_offset = params_cmdline_sz;
+ efi_setup_data_offset = efi_map_offset + efi_map_sz;
+
+ /* Copy setup header onto bootparams. Documentation/x86/boot.txt */
+ setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset;
+
+ /* Is there a limit on setup header size? */
+ memcpy(&params->hdr, (kernel + setup_hdr_offset), setup_header_size);
+
+ ret = kexec_add_buffer(image, (char *)params, params_misc_sz,
+ params_misc_sz, 16, MIN_BOOTPARAM_ADDR,
+ ULONG_MAX, 1, &bootparam_load_addr);
+ if (ret)
+ goto out_free_params;
+ pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ bootparam_load_addr, params_misc_sz, params_misc_sz);
+
+ /* Load kernel */
+ kernel_buf = kernel + kern16_size;
+ kernel_bufsz = kernel_len - kern16_size;
+ kernel_memsz = PAGE_ALIGN(header->init_size);
+ kernel_align = header->kernel_alignment;
+
+ ret = kexec_add_buffer(image, kernel_buf,
+ kernel_bufsz, kernel_memsz, kernel_align,
+ MIN_KERNEL_LOAD_ADDR, ULONG_MAX, 1,
+ &kernel_load_addr);
+ if (ret)
+ goto out_free_params;
+
+ pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ kernel_load_addr, kernel_memsz, kernel_memsz);
+
+ /* Load initrd high */
+ if (initrd) {
+ ret = kexec_add_buffer(image, initrd, initrd_len, initrd_len,
+ PAGE_SIZE, MIN_INITRD_LOAD_ADDR,
+ ULONG_MAX, 1, &initrd_load_addr);
+ if (ret)
+ goto out_free_params;
+
+ pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ initrd_load_addr, initrd_len, initrd_len);
+
+ setup_initrd(params, initrd_load_addr, initrd_len);
+ }
+
+ setup_cmdline(image, params, bootparam_load_addr,
+ sizeof(struct boot_params), cmdline, cmdline_len);
+
+ /* bootloader info. Do we need a separate ID for kexec kernel loader? */
+ params->hdr.type_of_loader = 0x0D << 4;
+ params->hdr.loadflags = 0;
+
+ /* Setup purgatory regs for entry */
+ ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
+ sizeof(regs64), 1);
+ if (ret)
+ goto out_free_params;
+
+ regs64.rbx = 0; /* Bootstrap Processor */
+ regs64.rsi = bootparam_load_addr;
+ regs64.rip = kernel_load_addr + 0x200;
+ stack = kexec_purgatory_get_symbol_addr(image, "stack_end");
+ if (IS_ERR(stack)) {
+ pr_err("Could not find address of symbol stack_end\n");
+ ret = -EINVAL;
+ goto out_free_params;
+ }
+
+ regs64.rsp = (unsigned long)stack;
+ ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
+ sizeof(regs64), 0);
+ if (ret)
+ goto out_free_params;
+
+ ret = setup_boot_parameters(image, params, bootparam_load_addr,
+ efi_map_offset, efi_map_sz,
+ efi_setup_data_offset);
+ if (ret)
+ goto out_free_params;
+
+ /* Allocate loader specific data */
+ ldata = kzalloc(sizeof(struct bzimage64_data), GFP_KERNEL);
+ if (!ldata) {
+ ret = -ENOMEM;
+ goto out_free_params;
+ }
+
+ /*
+ * Store pointer to params so that it could be freed after loading
+ * params segment has been loaded and contents have been copied
+ * somewhere else.
+ */
+ ldata->bootparams_buf = params;
+ return ldata;
+
+out_free_params:
+ kfree(params);
+ return ERR_PTR(ret);
+}
+
+/* This cleanup function is called after various segments have been loaded */
+int bzImage64_cleanup(void *loader_data)
+{
+ struct bzimage64_data *ldata = loader_data;
+
+ if (!ldata)
+ return 0;
+
+ kfree(ldata->bootparams_buf);
+ ldata->bootparams_buf = NULL;
+
+ return 0;
+}
+
+#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG
+int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len)
+{
+ bool trusted;
+ int ret;
+
+ ret = verify_pefile_signature(kernel, kernel_len,
+ system_trusted_keyring, &trusted);
+ if (ret < 0)
+ return ret;
+ if (!trusted)
+ return -EKEYREJECTED;
+ return 0;
+}
+#endif
+
+struct kexec_file_ops kexec_bzImage64_ops = {
+ .probe = bzImage64_probe,
+ .load = bzImage64_load,
+ .cleanup = bzImage64_cleanup,
+#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG
+ .verify_sig = bzImage64_verify_sig,
+#endif
+};
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 61b17dc2c277..67e6d19ef1be 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -112,7 +112,8 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
-static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
+static nokprobe_inline void
+__synthesize_relative_insn(void *from, void *to, u8 op)
{
struct __arch_relative_insn {
u8 op;
@@ -125,21 +126,23 @@ static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
}
/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
-void __kprobes synthesize_reljump(void *from, void *to)
+void synthesize_reljump(void *from, void *to)
{
__synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
}
+NOKPROBE_SYMBOL(synthesize_reljump);
/* Insert a call instruction at address 'from', which calls address 'to'.*/
-void __kprobes synthesize_relcall(void *from, void *to)
+void synthesize_relcall(void *from, void *to)
{
__synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
}
+NOKPROBE_SYMBOL(synthesize_relcall);
/*
* Skip the prefixes of the instruction.
*/
-static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
+static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn)
{
insn_attr_t attr;
@@ -154,12 +157,13 @@ static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
#endif
return insn;
}
+NOKPROBE_SYMBOL(skip_prefixes);
/*
* Returns non-zero if opcode is boostable.
* RIP relative instructions are adjusted at copying time in 64 bits mode
*/
-int __kprobes can_boost(kprobe_opcode_t *opcodes)
+int can_boost(kprobe_opcode_t *opcodes)
{
kprobe_opcode_t opcode;
kprobe_opcode_t *orig_opcodes = opcodes;
@@ -260,7 +264,7 @@ unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long add
}
/* Check if paddr is at an instruction boundary */
-static int __kprobes can_probe(unsigned long paddr)
+static int can_probe(unsigned long paddr)
{
unsigned long addr, __addr, offset = 0;
struct insn insn;
@@ -299,7 +303,7 @@ static int __kprobes can_probe(unsigned long paddr)
/*
* Returns non-zero if opcode modifies the interrupt flag.
*/
-static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
+static int is_IF_modifier(kprobe_opcode_t *insn)
{
/* Skip prefixes */
insn = skip_prefixes(insn);
@@ -322,7 +326,7 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
* If not, return null.
* Only applicable to 64-bit x86.
*/
-int __kprobes __copy_instruction(u8 *dest, u8 *src)
+int __copy_instruction(u8 *dest, u8 *src)
{
struct insn insn;
kprobe_opcode_t buf[MAX_INSN_SIZE];
@@ -365,7 +369,7 @@ int __kprobes __copy_instruction(u8 *dest, u8 *src)
return insn.length;
}
-static int __kprobes arch_copy_kprobe(struct kprobe *p)
+static int arch_copy_kprobe(struct kprobe *p)
{
int ret;
@@ -392,7 +396,7 @@ static int __kprobes arch_copy_kprobe(struct kprobe *p)
return 0;
}
-int __kprobes arch_prepare_kprobe(struct kprobe *p)
+int arch_prepare_kprobe(struct kprobe *p)
{
if (alternatives_text_reserved(p->addr, p->addr))
return -EINVAL;
@@ -407,17 +411,17 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
return arch_copy_kprobe(p);
}
-void __kprobes arch_arm_kprobe(struct kprobe *p)
+void arch_arm_kprobe(struct kprobe *p)
{
text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
}
-void __kprobes arch_disarm_kprobe(struct kprobe *p)
+void arch_disarm_kprobe(struct kprobe *p)
{
text_poke(p->addr, &p->opcode, 1);
}
-void __kprobes arch_remove_kprobe(struct kprobe *p)
+void arch_remove_kprobe(struct kprobe *p)
{
if (p->ainsn.insn) {
free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
@@ -425,7 +429,8 @@ void __kprobes arch_remove_kprobe(struct kprobe *p)
}
}
-static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
+static nokprobe_inline void
+save_previous_kprobe(struct kprobe_ctlblk *kcb)
{
kcb->prev_kprobe.kp = kprobe_running();
kcb->prev_kprobe.status = kcb->kprobe_status;
@@ -433,7 +438,8 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags;
}
-static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+static nokprobe_inline void
+restore_previous_kprobe(struct kprobe_ctlblk *kcb)
{
__this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
kcb->kprobe_status = kcb->prev_kprobe.status;
@@ -441,8 +447,9 @@ static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
}
-static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
+static nokprobe_inline void
+set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
{
__this_cpu_write(current_kprobe, p);
kcb->kprobe_saved_flags = kcb->kprobe_old_flags
@@ -451,7 +458,7 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF;
}
-static void __kprobes clear_btf(void)
+static nokprobe_inline void clear_btf(void)
{
if (test_thread_flag(TIF_BLOCKSTEP)) {
unsigned long debugctl = get_debugctlmsr();
@@ -461,7 +468,7 @@ static void __kprobes clear_btf(void)
}
}
-static void __kprobes restore_btf(void)
+static nokprobe_inline void restore_btf(void)
{
if (test_thread_flag(TIF_BLOCKSTEP)) {
unsigned long debugctl = get_debugctlmsr();
@@ -471,8 +478,7 @@ static void __kprobes restore_btf(void)
}
}
-void __kprobes
-arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
+void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
{
unsigned long *sara = stack_addr(regs);
@@ -481,9 +487,10 @@ arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
/* Replace the return addr with trampoline addr */
*sara = (unsigned long) &kretprobe_trampoline;
}
+NOKPROBE_SYMBOL(arch_prepare_kretprobe);
-static void __kprobes
-setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter)
+static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb, int reenter)
{
if (setup_detour_execution(p, regs, reenter))
return;
@@ -519,22 +526,24 @@ setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *k
else
regs->ip = (unsigned long)p->ainsn.insn;
}
+NOKPROBE_SYMBOL(setup_singlestep);
/*
* We have reentered the kprobe_handler(), since another probe was hit while
* within the handler. We save the original kprobes variables and just single
* step on the instruction of the new probe without calling any user handlers.
*/
-static int __kprobes
-reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb)
+static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
{
switch (kcb->kprobe_status) {
case KPROBE_HIT_SSDONE:
case KPROBE_HIT_ACTIVE:
+ case KPROBE_HIT_SS:
kprobes_inc_nmissed_count(p);
setup_singlestep(p, regs, kcb, 1);
break;
- case KPROBE_HIT_SS:
+ case KPROBE_REENTER:
/* A probe has been hit in the codepath leading up to, or just
* after, single-stepping of a probed instruction. This entire
* codepath should strictly reside in .kprobes.text section.
@@ -553,17 +562,21 @@ reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb
return 1;
}
+NOKPROBE_SYMBOL(reenter_kprobe);
/*
* Interrupts are disabled on entry as trap3 is an interrupt gate and they
* remain disabled throughout this function.
*/
-static int __kprobes kprobe_handler(struct pt_regs *regs)
+int kprobe_int3_handler(struct pt_regs *regs)
{
kprobe_opcode_t *addr;
struct kprobe *p;
struct kprobe_ctlblk *kcb;
+ if (user_mode_vm(regs))
+ return 0;
+
addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
/*
* We don't want to be preempted for the entire
@@ -621,12 +634,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
preempt_enable_no_resched();
return 0;
}
+NOKPROBE_SYMBOL(kprobe_int3_handler);
/*
* When a retprobed function returns, this code saves registers and
* calls trampoline_handler() runs, which calls the kretprobe's handler.
*/
-static void __used __kprobes kretprobe_trampoline_holder(void)
+static void __used kretprobe_trampoline_holder(void)
{
asm volatile (
".global kretprobe_trampoline\n"
@@ -657,11 +671,13 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
#endif
" ret\n");
}
+NOKPROBE_SYMBOL(kretprobe_trampoline_holder);
+NOKPROBE_SYMBOL(kretprobe_trampoline);
/*
* Called from kretprobe_trampoline
*/
-__visible __used __kprobes void *trampoline_handler(struct pt_regs *regs)
+__visible __used void *trampoline_handler(struct pt_regs *regs)
{
struct kretprobe_instance *ri = NULL;
struct hlist_head *head, empty_rp;
@@ -747,6 +763,7 @@ __visible __used __kprobes void *trampoline_handler(struct pt_regs *regs)
}
return (void *)orig_ret_address;
}
+NOKPROBE_SYMBOL(trampoline_handler);
/*
* Called after single-stepping. p->addr is the address of the
@@ -775,8 +792,8 @@ __visible __used __kprobes void *trampoline_handler(struct pt_regs *regs)
* jump instruction after the copied instruction, that jumps to the next
* instruction after the probepoint.
*/
-static void __kprobes
-resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb)
+static void resume_execution(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
{
unsigned long *tos = stack_addr(regs);
unsigned long copy_ip = (unsigned long)p->ainsn.insn;
@@ -851,12 +868,13 @@ resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *k
no_change:
restore_btf();
}
+NOKPROBE_SYMBOL(resume_execution);
/*
* Interrupts are disabled on entry as trap1 is an interrupt gate and they
* remain disabled throughout this function.
*/
-static int __kprobes post_kprobe_handler(struct pt_regs *regs)
+int kprobe_debug_handler(struct pt_regs *regs)
{
struct kprobe *cur = kprobe_running();
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -891,8 +909,9 @@ out:
return 1;
}
+NOKPROBE_SYMBOL(kprobe_debug_handler);
-int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
{
struct kprobe *cur = kprobe_running();
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -949,12 +968,13 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
return 0;
}
+NOKPROBE_SYMBOL(kprobe_fault_handler);
/*
* Wrapper routine for handling exceptions.
*/
-int __kprobes
-kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data)
+int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
+ void *data)
{
struct die_args *args = data;
int ret = NOTIFY_DONE;
@@ -962,22 +982,7 @@ kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *d
if (args->regs && user_mode_vm(args->regs))
return ret;
- switch (val) {
- case DIE_INT3:
- if (kprobe_handler(args->regs))
- ret = NOTIFY_STOP;
- break;
- case DIE_DEBUG:
- if (post_kprobe_handler(args->regs)) {
- /*
- * Reset the BS bit in dr6 (pointed by args->err) to
- * denote completion of processing
- */
- (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
- ret = NOTIFY_STOP;
- }
- break;
- case DIE_GPF:
+ if (val == DIE_GPF) {
/*
* To be potentially processing a kprobe fault and to
* trust the result from kprobe_running(), we have
@@ -986,14 +991,12 @@ kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *d
if (!preemptible() && kprobe_running() &&
kprobe_fault_handler(args->regs, args->trapnr))
ret = NOTIFY_STOP;
- break;
- default:
- break;
}
return ret;
}
+NOKPROBE_SYMBOL(kprobe_exceptions_notify);
-int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
struct jprobe *jp = container_of(p, struct jprobe, kp);
unsigned long addr;
@@ -1017,8 +1020,9 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
regs->ip = (unsigned long)(jp->entry);
return 1;
}
+NOKPROBE_SYMBOL(setjmp_pre_handler);
-void __kprobes jprobe_return(void)
+void jprobe_return(void)
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -1034,8 +1038,10 @@ void __kprobes jprobe_return(void)
" nop \n"::"b"
(kcb->jprobe_saved_sp):"memory");
}
+NOKPROBE_SYMBOL(jprobe_return);
+NOKPROBE_SYMBOL(jprobe_return_end);
-int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
u8 *addr = (u8 *) (regs->ip - 1);
@@ -1063,13 +1069,22 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
}
return 0;
}
+NOKPROBE_SYMBOL(longjmp_break_handler);
+
+bool arch_within_kprobe_blacklist(unsigned long addr)
+{
+ return (addr >= (unsigned long)__kprobes_text_start &&
+ addr < (unsigned long)__kprobes_text_end) ||
+ (addr >= (unsigned long)__entry_text_start &&
+ addr < (unsigned long)__entry_text_end);
+}
int __init arch_init_kprobes(void)
{
return 0;
}
-int __kprobes arch_trampoline_kprobe(struct kprobe *p)
+int arch_trampoline_kprobe(struct kprobe *p)
{
return 0;
}
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 23ef5c556f06..717b02a22e67 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -25,8 +25,9 @@
#include "common.h"
-static int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
+static nokprobe_inline
+int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
{
/*
* Emulate singlestep (and also recover regs->ip)
@@ -41,18 +42,19 @@ static int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
return 1;
}
-int __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
+int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
{
if (kprobe_ftrace(p))
return __skip_singlestep(p, regs, kcb);
else
return 0;
}
+NOKPROBE_SYMBOL(skip_singlestep);
/* Ftrace callback handler for kprobes */
-void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct pt_regs *regs)
+void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct pt_regs *regs)
{
struct kprobe *p;
struct kprobe_ctlblk *kcb;
@@ -84,8 +86,9 @@ void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
end:
local_irq_restore(flags);
}
+NOKPROBE_SYMBOL(kprobe_ftrace_handler);
-int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p)
+int arch_prepare_kprobe_ftrace(struct kprobe *p)
{
p->ainsn.insn = NULL;
p->ainsn.boostable = -1;
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 898160b42e43..f304773285ae 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -77,7 +77,7 @@ found:
}
/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
-static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
+static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
{
#ifdef CONFIG_X86_64
*addr++ = 0x48;
@@ -138,7 +138,8 @@ asm (
#define INT3_SIZE sizeof(kprobe_opcode_t)
/* Optimized kprobe call back function: called from optinsn */
-static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
+static void
+optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
unsigned long flags;
@@ -168,8 +169,9 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_
}
local_irq_restore(flags);
}
+NOKPROBE_SYMBOL(optimized_callback);
-static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
+static int copy_optimized_instructions(u8 *dest, u8 *src)
{
int len = 0, ret;
@@ -189,7 +191,7 @@ static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
}
/* Check whether insn is indirect jump */
-static int __kprobes insn_is_indirect_jump(struct insn *insn)
+static int insn_is_indirect_jump(struct insn *insn)
{
return ((insn->opcode.bytes[0] == 0xff &&
(X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
@@ -224,7 +226,7 @@ static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
}
/* Decode whole function to ensure any instructions don't jump into target */
-static int __kprobes can_optimize(unsigned long paddr)
+static int can_optimize(unsigned long paddr)
{
unsigned long addr, size = 0, offset = 0;
struct insn insn;
@@ -275,7 +277,7 @@ static int __kprobes can_optimize(unsigned long paddr)
}
/* Check optimized_kprobe can actually be optimized. */
-int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
+int arch_check_optimized_kprobe(struct optimized_kprobe *op)
{
int i;
struct kprobe *p;
@@ -290,15 +292,15 @@ int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
}
/* Check the addr is within the optimized instructions. */
-int __kprobes
-arch_within_optimized_kprobe(struct optimized_kprobe *op, unsigned long addr)
+int arch_within_optimized_kprobe(struct optimized_kprobe *op,
+ unsigned long addr)
{
return ((unsigned long)op->kp.addr <= addr &&
(unsigned long)op->kp.addr + op->optinsn.size > addr);
}
/* Free optimized instruction slot */
-static __kprobes
+static
void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
{
if (op->optinsn.insn) {
@@ -308,7 +310,7 @@ void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
}
}
-void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
+void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
{
__arch_remove_optimized_kprobe(op, 1);
}
@@ -318,7 +320,7 @@ void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
* Target instructions MUST be relocatable (checked inside)
* This is called when new aggr(opt)probe is allocated or reused.
*/
-int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
+int arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
{
u8 *buf;
int ret;
@@ -372,7 +374,7 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
* Replace breakpoints (int3) with relative jumps.
* Caller must call with locking kprobe_mutex and text_mutex.
*/
-void __kprobes arch_optimize_kprobes(struct list_head *oplist)
+void arch_optimize_kprobes(struct list_head *oplist)
{
struct optimized_kprobe *op, *tmp;
u8 insn_buf[RELATIVEJUMP_SIZE];
@@ -398,7 +400,7 @@ void __kprobes arch_optimize_kprobes(struct list_head *oplist)
}
/* Replace a relative jump with a breakpoint (int3). */
-void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
+void arch_unoptimize_kprobe(struct optimized_kprobe *op)
{
u8 insn_buf[RELATIVEJUMP_SIZE];
@@ -424,8 +426,7 @@ extern void arch_unoptimize_kprobes(struct list_head *oplist,
}
}
-int __kprobes
-setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
+int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
{
struct optimized_kprobe *op;
@@ -441,3 +442,4 @@ setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
}
return 0;
}
+NOKPROBE_SYMBOL(setup_detour_execution);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 0331cb389d68..3dd8e2c4d74a 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -251,15 +251,16 @@ u32 kvm_read_and_reset_pf_reason(void)
return reason;
}
EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
+NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
-dotraplinkage void __kprobes
+dotraplinkage void
do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
{
enum ctx_state prev_state;
switch (kvm_read_and_reset_pf_reason()) {
default:
- do_page_fault(regs, error_code);
+ trace_do_page_fault(regs, error_code);
break;
case KVM_PV_REASON_PAGE_NOT_PRESENT:
/* page is swapped out by the host. */
@@ -276,6 +277,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
break;
}
}
+NOKPROBE_SYMBOL(do_async_page_fault);
static void __init paravirt_ops_setup(void)
{
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index dcbbaa165bde..c37886d759cc 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -20,8 +20,6 @@
#include <asm/mmu_context.h>
#include <asm/syscalls.h>
-int sysctl_ldt16 = 0;
-
#ifdef CONFIG_SMP
static void flush_ldt(void *current_mm)
{
@@ -231,16 +229,10 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
}
}
- /*
- * On x86-64 we do not support 16-bit segments due to
- * IRET leaking the high bits of the kernel stack address.
- */
-#ifdef CONFIG_X86_64
- if (!ldt_info.seg_32bit && !sysctl_ldt16) {
+ if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
error = -EINVAL;
goto out_unlock;
}
-#endif
fill_ldt(&ldt, &ldt_info);
if (oldmode)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 679cef0791cd..8b04018e5d1f 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -6,6 +6,8 @@
* Version 2. See the file COPYING for more details.
*/
+#define pr_fmt(fmt) "kexec: " fmt
+
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/string.h>
@@ -21,6 +23,11 @@
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/debugreg.h>
+#include <asm/kexec-bzimage64.h>
+
+static struct kexec_file_ops *kexec_file_loaders[] = {
+ &kexec_bzImage64_ops,
+};
static void free_transition_pgtable(struct kimage *image)
{
@@ -171,6 +178,38 @@ static void load_segments(void)
);
}
+/* Update purgatory as needed after various image segments have been prepared */
+static int arch_update_purgatory(struct kimage *image)
+{
+ int ret = 0;
+
+ if (!image->file_mode)
+ return 0;
+
+ /* Setup copying of backup region */
+ if (image->type == KEXEC_TYPE_CRASH) {
+ ret = kexec_purgatory_get_set_symbol(image, "backup_dest",
+ &image->arch.backup_load_addr,
+ sizeof(image->arch.backup_load_addr), 0);
+ if (ret)
+ return ret;
+
+ ret = kexec_purgatory_get_set_symbol(image, "backup_src",
+ &image->arch.backup_src_start,
+ sizeof(image->arch.backup_src_start), 0);
+ if (ret)
+ return ret;
+
+ ret = kexec_purgatory_get_set_symbol(image, "backup_sz",
+ &image->arch.backup_src_sz,
+ sizeof(image->arch.backup_src_sz), 0);
+ if (ret)
+ return ret;
+ }
+
+ return ret;
+}
+
int machine_kexec_prepare(struct kimage *image)
{
unsigned long start_pgtable;
@@ -184,6 +223,11 @@ int machine_kexec_prepare(struct kimage *image)
if (result)
return result;
+ /* update purgatory as needed */
+ result = arch_update_purgatory(image);
+ if (result)
+ return result;
+
return 0;
}
@@ -283,3 +327,198 @@ void arch_crash_save_vmcoreinfo(void)
(unsigned long)&_text - __START_KERNEL);
}
+/* arch-dependent functionality related to kexec file-based syscall */
+
+int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ int i, ret = -ENOEXEC;
+ struct kexec_file_ops *fops;
+
+ for (i = 0; i < ARRAY_SIZE(kexec_file_loaders); i++) {
+ fops = kexec_file_loaders[i];
+ if (!fops || !fops->probe)
+ continue;
+
+ ret = fops->probe(buf, buf_len);
+ if (!ret) {
+ image->fops = fops;
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
+void *arch_kexec_kernel_image_load(struct kimage *image)
+{
+ vfree(image->arch.elf_headers);
+ image->arch.elf_headers = NULL;
+
+ if (!image->fops || !image->fops->load)
+ return ERR_PTR(-ENOEXEC);
+
+ return image->fops->load(image, image->kernel_buf,
+ image->kernel_buf_len, image->initrd_buf,
+ image->initrd_buf_len, image->cmdline_buf,
+ image->cmdline_buf_len);
+}
+
+int arch_kimage_file_post_load_cleanup(struct kimage *image)
+{
+ if (!image->fops || !image->fops->cleanup)
+ return 0;
+
+ return image->fops->cleanup(image->image_loader_data);
+}
+
+int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel,
+ unsigned long kernel_len)
+{
+ if (!image->fops || !image->fops->verify_sig) {
+ pr_debug("kernel loader does not support signature verification.");
+ return -EKEYREJECTED;
+ }
+
+ return image->fops->verify_sig(kernel, kernel_len);
+}
+
+/*
+ * Apply purgatory relocations.
+ *
+ * ehdr: Pointer to elf headers
+ * sechdrs: Pointer to section headers.
+ * relsec: section index of SHT_RELA section.
+ *
+ * TODO: Some of the code belongs to generic code. Move that in kexec.c.
+ */
+int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr,
+ Elf64_Shdr *sechdrs, unsigned int relsec)
+{
+ unsigned int i;
+ Elf64_Rela *rel;
+ Elf64_Sym *sym;
+ void *location;
+ Elf64_Shdr *section, *symtabsec;
+ unsigned long address, sec_base, value;
+ const char *strtab, *name, *shstrtab;
+
+ /*
+ * ->sh_offset has been modified to keep the pointer to section
+ * contents in memory
+ */
+ rel = (void *)sechdrs[relsec].sh_offset;
+
+ /* Section to which relocations apply */
+ section = &sechdrs[sechdrs[relsec].sh_info];
+
+ pr_debug("Applying relocate section %u to %u\n", relsec,
+ sechdrs[relsec].sh_info);
+
+ /* Associated symbol table */
+ symtabsec = &sechdrs[sechdrs[relsec].sh_link];
+
+ /* String table */
+ if (symtabsec->sh_link >= ehdr->e_shnum) {
+ /* Invalid strtab section number */
+ pr_err("Invalid string table section index %d\n",
+ symtabsec->sh_link);
+ return -ENOEXEC;
+ }
+
+ strtab = (char *)sechdrs[symtabsec->sh_link].sh_offset;
+
+ /* section header string table */
+ shstrtab = (char *)sechdrs[ehdr->e_shstrndx].sh_offset;
+
+ for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
+
+ /*
+ * rel[i].r_offset contains byte offset from beginning
+ * of section to the storage unit affected.
+ *
+ * This is location to update (->sh_offset). This is temporary
+ * buffer where section is currently loaded. This will finally
+ * be loaded to a different address later, pointed to by
+ * ->sh_addr. kexec takes care of moving it
+ * (kexec_load_segment()).
+ */
+ location = (void *)(section->sh_offset + rel[i].r_offset);
+
+ /* Final address of the location */
+ address = section->sh_addr + rel[i].r_offset;
+
+ /*
+ * rel[i].r_info contains information about symbol table index
+ * w.r.t which relocation must be made and type of relocation
+ * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get
+ * these respectively.
+ */
+ sym = (Elf64_Sym *)symtabsec->sh_offset +
+ ELF64_R_SYM(rel[i].r_info);
+
+ if (sym->st_name)
+ name = strtab + sym->st_name;
+ else
+ name = shstrtab + sechdrs[sym->st_shndx].sh_name;
+
+ pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n",
+ name, sym->st_info, sym->st_shndx, sym->st_value,
+ sym->st_size);
+
+ if (sym->st_shndx == SHN_UNDEF) {
+ pr_err("Undefined symbol: %s\n", name);
+ return -ENOEXEC;
+ }
+
+ if (sym->st_shndx == SHN_COMMON) {
+ pr_err("symbol '%s' in common section\n", name);
+ return -ENOEXEC;
+ }
+
+ if (sym->st_shndx == SHN_ABS)
+ sec_base = 0;
+ else if (sym->st_shndx >= ehdr->e_shnum) {
+ pr_err("Invalid section %d for symbol %s\n",
+ sym->st_shndx, name);
+ return -ENOEXEC;
+ } else
+ sec_base = sechdrs[sym->st_shndx].sh_addr;
+
+ value = sym->st_value;
+ value += sec_base;
+ value += rel[i].r_addend;
+
+ switch (ELF64_R_TYPE(rel[i].r_info)) {
+ case R_X86_64_NONE:
+ break;
+ case R_X86_64_64:
+ *(u64 *)location = value;
+ break;
+ case R_X86_64_32:
+ *(u32 *)location = value;
+ if (value != *(u32 *)location)
+ goto overflow;
+ break;
+ case R_X86_64_32S:
+ *(s32 *)location = value;
+ if ((s64)value != *(s32 *)location)
+ goto overflow;
+ break;
+ case R_X86_64_PC32:
+ value -= (u64)address;
+ *(u32 *)location = value;
+ break;
+ default:
+ pr_err("Unknown rela relocation: %llu\n",
+ ELF64_R_TYPE(rel[i].r_info));
+ return -ENOEXEC;
+ }
+ }
+ return 0;
+
+overflow:
+ pr_err("Overflow in relocation type %d value 0x%lx\n",
+ (int)ELF64_R_TYPE(rel[i].r_info), value);
+ return -ENOEXEC;
+}
diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S
new file mode 100644
index 000000000000..c73aecf10d34
--- /dev/null
+++ b/arch/x86/kernel/mcount_64.S
@@ -0,0 +1,206 @@
+/*
+ * linux/arch/x86_64/mcount_64.S
+ *
+ * Copyright (C) 2014 Steven Rostedt, Red Hat Inc
+ */
+
+#include <linux/linkage.h>
+#include <asm/ptrace.h>
+#include <asm/ftrace.h>
+
+
+ .code64
+ .section .entry.text, "ax"
+
+
+#ifdef CONFIG_FUNCTION_TRACER
+
+#ifdef CC_USING_FENTRY
+# define function_hook __fentry__
+#else
+# define function_hook mcount
+#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+ENTRY(function_hook)
+ retq
+END(function_hook)
+
+/* skip is set if stack has been adjusted */
+.macro ftrace_caller_setup skip=0
+ MCOUNT_SAVE_FRAME \skip
+
+ /* Load the ftrace_ops into the 3rd parameter */
+ movq function_trace_op(%rip), %rdx
+
+ /* Load ip into the first parameter */
+ movq RIP(%rsp), %rdi
+ subq $MCOUNT_INSN_SIZE, %rdi
+ /* Load the parent_ip into the second parameter */
+#ifdef CC_USING_FENTRY
+ movq SS+16(%rsp), %rsi
+#else
+ movq 8(%rbp), %rsi
+#endif
+.endm
+
+ENTRY(ftrace_caller)
+ ftrace_caller_setup
+ /* regs go into 4th parameter (but make it NULL) */
+ movq $0, %rcx
+
+GLOBAL(ftrace_call)
+ call ftrace_stub
+
+ MCOUNT_RESTORE_FRAME
+ftrace_return:
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+GLOBAL(ftrace_graph_call)
+ jmp ftrace_stub
+#endif
+
+GLOBAL(ftrace_stub)
+ retq
+END(ftrace_caller)
+
+ENTRY(ftrace_regs_caller)
+ /* Save the current flags before compare (in SS location)*/
+ pushfq
+
+ /* skip=8 to skip flags saved in SS */
+ ftrace_caller_setup 8
+
+ /* Save the rest of pt_regs */
+ movq %r15, R15(%rsp)
+ movq %r14, R14(%rsp)
+ movq %r13, R13(%rsp)
+ movq %r12, R12(%rsp)
+ movq %r11, R11(%rsp)
+ movq %r10, R10(%rsp)
+ movq %rbp, RBP(%rsp)
+ movq %rbx, RBX(%rsp)
+ /* Copy saved flags */
+ movq SS(%rsp), %rcx
+ movq %rcx, EFLAGS(%rsp)
+ /* Kernel segments */
+ movq $__KERNEL_DS, %rcx
+ movq %rcx, SS(%rsp)
+ movq $__KERNEL_CS, %rcx
+ movq %rcx, CS(%rsp)
+ /* Stack - skipping return address */
+ leaq SS+16(%rsp), %rcx
+ movq %rcx, RSP(%rsp)
+
+ /* regs go into 4th parameter */
+ leaq (%rsp), %rcx
+
+GLOBAL(ftrace_regs_call)
+ call ftrace_stub
+
+ /* Copy flags back to SS, to restore them */
+ movq EFLAGS(%rsp), %rax
+ movq %rax, SS(%rsp)
+
+ /* Handlers can change the RIP */
+ movq RIP(%rsp), %rax
+ movq %rax, SS+8(%rsp)
+
+ /* restore the rest of pt_regs */
+ movq R15(%rsp), %r15
+ movq R14(%rsp), %r14
+ movq R13(%rsp), %r13
+ movq R12(%rsp), %r12
+ movq R10(%rsp), %r10
+ movq RBP(%rsp), %rbp
+ movq RBX(%rsp), %rbx
+
+ /* skip=8 to skip flags saved in SS */
+ MCOUNT_RESTORE_FRAME 8
+
+ /* Restore flags */
+ popfq
+
+ jmp ftrace_return
+
+ popfq
+ jmp ftrace_stub
+
+END(ftrace_regs_caller)
+
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+ENTRY(function_hook)
+ cmpq $ftrace_stub, ftrace_trace_function
+ jnz trace
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ cmpq $ftrace_stub, ftrace_graph_return
+ jnz ftrace_graph_caller
+
+ cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
+ jnz ftrace_graph_caller
+#endif
+
+GLOBAL(ftrace_stub)
+ retq
+
+trace:
+ MCOUNT_SAVE_FRAME
+
+ movq RIP(%rsp), %rdi
+#ifdef CC_USING_FENTRY
+ movq SS+16(%rsp), %rsi
+#else
+ movq 8(%rbp), %rsi
+#endif
+ subq $MCOUNT_INSN_SIZE, %rdi
+
+ call *ftrace_trace_function
+
+ MCOUNT_RESTORE_FRAME
+
+ jmp ftrace_stub
+END(function_hook)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+ MCOUNT_SAVE_FRAME
+
+#ifdef CC_USING_FENTRY
+ leaq SS+16(%rsp), %rdi
+ movq $0, %rdx /* No framepointers needed */
+#else
+ leaq 8(%rbp), %rdi
+ movq (%rbp), %rdx
+#endif
+ movq RIP(%rsp), %rsi
+ subq $MCOUNT_INSN_SIZE, %rsi
+
+ call prepare_ftrace_return
+
+ MCOUNT_RESTORE_FRAME
+
+ retq
+END(ftrace_graph_caller)
+
+GLOBAL(return_to_handler)
+ subq $24, %rsp
+
+ /* Save the return values */
+ movq %rax, (%rsp)
+ movq %rdx, 8(%rsp)
+ movq %rbp, %rdi
+
+ call ftrace_return_to_handler
+
+ movq %rax, %rdi
+ movq 8(%rsp), %rdx
+ movq (%rsp), %rax
+ addq $24, %rsp
+ jmp *%rdi
+#endif
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index b4872b999a71..c3e985d1751c 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -110,7 +110,7 @@ static void nmi_max_handler(struct irq_work *w)
a->handler, whole_msecs, decimal_msecs);
}
-static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
+static int nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
{
struct nmi_desc *desc = nmi_to_desc(type);
struct nmiaction *a;
@@ -146,6 +146,7 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2
/* return total number of NMI events handled */
return handled;
}
+NOKPROBE_SYMBOL(nmi_handle);
int __register_nmi_handler(unsigned int type, struct nmiaction *action)
{
@@ -208,7 +209,7 @@ void unregister_nmi_handler(unsigned int type, const char *name)
}
EXPORT_SYMBOL_GPL(unregister_nmi_handler);
-static __kprobes void
+static void
pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
/* check to see if anyone registered against these types of errors */
@@ -238,8 +239,9 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
outb(reason, NMI_REASON_PORT);
}
+NOKPROBE_SYMBOL(pci_serr_error);
-static __kprobes void
+static void
io_check_error(unsigned char reason, struct pt_regs *regs)
{
unsigned long i;
@@ -269,8 +271,9 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
reason &= ~NMI_REASON_CLEAR_IOCHK;
outb(reason, NMI_REASON_PORT);
}
+NOKPROBE_SYMBOL(io_check_error);
-static __kprobes void
+static void
unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
{
int handled;
@@ -298,11 +301,12 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
pr_emerg("Dazed and confused, but trying to continue\n");
}
+NOKPROBE_SYMBOL(unknown_nmi_error);
static DEFINE_PER_CPU(bool, swallow_nmi);
static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
-static __kprobes void default_do_nmi(struct pt_regs *regs)
+static void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
int handled;
@@ -401,6 +405,7 @@ static __kprobes void default_do_nmi(struct pt_regs *regs)
else
unknown_nmi_error(reason, regs);
}
+NOKPROBE_SYMBOL(default_do_nmi);
/*
* NMIs can hit breakpoints which will cause it to lose its
@@ -520,7 +525,7 @@ static inline void nmi_nesting_postprocess(void)
}
#endif
-dotraplinkage notrace __kprobes void
+dotraplinkage notrace void
do_nmi(struct pt_regs *regs, long error_code)
{
nmi_nesting_preprocess(regs);
@@ -537,6 +542,7 @@ do_nmi(struct pt_regs *regs, long error_code)
/* On i386, may loop back to preprocess */
nmi_nesting_postprocess();
}
+NOKPROBE_SYMBOL(do_nmi);
void stop_nmi(void)
{
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1b10af835c31..548d25f00c90 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -23,6 +23,7 @@
#include <linux/efi.h>
#include <linux/bcd.h>
#include <linux/highmem.h>
+#include <linux/kprobes.h>
#include <asm/bug.h>
#include <asm/paravirt.h>
@@ -389,6 +390,11 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.end_context_switch = paravirt_nop,
};
+/* At this point, native_get/set_debugreg has real function entries */
+NOKPROBE_SYMBOL(native_get_debugreg);
+NOKPROBE_SYMBOL(native_set_debugreg);
+NOKPROBE_SYMBOL(native_load_idt);
+
struct pv_apic_ops pv_apic_ops = {
#ifdef CONFIG_X86_LOCAL_APIC
.startup_ipi_hook = paravirt_nop,
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 3f08f34f93eb..a1da6737ba5b 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -6,7 +6,6 @@ DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
-DEF_NATIVE(pv_cpu_ops, iret, "iretq");
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
@@ -50,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_irq_ops, save_fl);
PATCH_SITE(pv_irq_ops, irq_enable);
PATCH_SITE(pv_irq_ops, irq_disable);
- PATCH_SITE(pv_cpu_ops, iret);
PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
PATCH_SITE(pv_cpu_ops, usergs_sysret32);
PATCH_SITE(pv_cpu_ops, usergs_sysret64);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index f7d0672481fd..a25e202bb319 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -97,12 +97,17 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size,
dma_mask = dma_alloc_coherent_mask(dev, flag);
- flag |= __GFP_ZERO;
+ flag &= ~__GFP_ZERO;
again:
page = NULL;
/* CMA can be used only in the context which permits sleeping */
- if (flag & __GFP_WAIT)
+ if (flag & __GFP_WAIT) {
page = dma_alloc_from_contiguous(dev, count, get_order(size));
+ if (page && page_to_phys(page) + size > dma_mask) {
+ dma_release_from_contiguous(dev, page, count);
+ page = NULL;
+ }
+ }
/* fallback */
if (!page)
page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
@@ -120,7 +125,7 @@ again:
return NULL;
}
-
+ memset(page_address(page), 0, size);
*dma_addr = addr;
return page_address(page);
}
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 6c483ba98b9c..77dd0ad58be4 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -14,7 +14,7 @@
#include <asm/iommu_table.h>
int swiotlb __read_mostly;
-static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
dma_addr_t *dma_handle, gfp_t flags,
struct dma_attrs *attrs)
{
@@ -28,11 +28,14 @@ static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags);
}
-static void x86_swiotlb_free_coherent(struct device *dev, size_t size,
+void x86_swiotlb_free_coherent(struct device *dev, size_t size,
void *vaddr, dma_addr_t dma_addr,
struct dma_attrs *attrs)
{
- swiotlb_free_coherent(dev, size, vaddr, dma_addr);
+ if (is_swiotlb_buffer(dma_to_phys(dev, dma_addr)))
+ swiotlb_free_coherent(dev, size, vaddr, dma_addr);
+ else
+ dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs);
}
static struct dma_map_ops swiotlb_dma_ops = {
diff --git a/arch/x86/kernel/pmc_atom.c b/arch/x86/kernel/pmc_atom.c
new file mode 100644
index 000000000000..0c424a67985d
--- /dev/null
+++ b/arch/x86/kernel/pmc_atom.c
@@ -0,0 +1,321 @@
+/*
+ * Intel Atom SOC Power Management Controller Driver
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/io.h>
+
+#include <asm/pmc_atom.h>
+
+#define DRIVER_NAME KBUILD_MODNAME
+
+struct pmc_dev {
+ u32 base_addr;
+ void __iomem *regmap;
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *dbgfs_dir;
+#endif /* CONFIG_DEBUG_FS */
+};
+
+static struct pmc_dev pmc_device;
+static u32 acpi_base_addr;
+
+struct pmc_dev_map {
+ const char *name;
+ u32 bit_mask;
+};
+
+static const struct pmc_dev_map dev_map[] = {
+ {"0 - LPSS1_F0_DMA", BIT_LPSS1_F0_DMA},
+ {"1 - LPSS1_F1_PWM1", BIT_LPSS1_F1_PWM1},
+ {"2 - LPSS1_F2_PWM2", BIT_LPSS1_F2_PWM2},
+ {"3 - LPSS1_F3_HSUART1", BIT_LPSS1_F3_HSUART1},
+ {"4 - LPSS1_F4_HSUART2", BIT_LPSS1_F4_HSUART2},
+ {"5 - LPSS1_F5_SPI", BIT_LPSS1_F5_SPI},
+ {"6 - LPSS1_F6_Reserved", BIT_LPSS1_F6_XXX},
+ {"7 - LPSS1_F7_Reserved", BIT_LPSS1_F7_XXX},
+ {"8 - SCC_EMMC", BIT_SCC_EMMC},
+ {"9 - SCC_SDIO", BIT_SCC_SDIO},
+ {"10 - SCC_SDCARD", BIT_SCC_SDCARD},
+ {"11 - SCC_MIPI", BIT_SCC_MIPI},
+ {"12 - HDA", BIT_HDA},
+ {"13 - LPE", BIT_LPE},
+ {"14 - OTG", BIT_OTG},
+ {"15 - USH", BIT_USH},
+ {"16 - GBE", BIT_GBE},
+ {"17 - SATA", BIT_SATA},
+ {"18 - USB_EHCI", BIT_USB_EHCI},
+ {"19 - SEC", BIT_SEC},
+ {"20 - PCIE_PORT0", BIT_PCIE_PORT0},
+ {"21 - PCIE_PORT1", BIT_PCIE_PORT1},
+ {"22 - PCIE_PORT2", BIT_PCIE_PORT2},
+ {"23 - PCIE_PORT3", BIT_PCIE_PORT3},
+ {"24 - LPSS2_F0_DMA", BIT_LPSS2_F0_DMA},
+ {"25 - LPSS2_F1_I2C1", BIT_LPSS2_F1_I2C1},
+ {"26 - LPSS2_F2_I2C2", BIT_LPSS2_F2_I2C2},
+ {"27 - LPSS2_F3_I2C3", BIT_LPSS2_F3_I2C3},
+ {"28 - LPSS2_F3_I2C4", BIT_LPSS2_F4_I2C4},
+ {"29 - LPSS2_F5_I2C5", BIT_LPSS2_F5_I2C5},
+ {"30 - LPSS2_F6_I2C6", BIT_LPSS2_F6_I2C6},
+ {"31 - LPSS2_F7_I2C7", BIT_LPSS2_F7_I2C7},
+ {"32 - SMB", BIT_SMB},
+ {"33 - OTG_SS_PHY", BIT_OTG_SS_PHY},
+ {"34 - USH_SS_PHY", BIT_USH_SS_PHY},
+ {"35 - DFX", BIT_DFX},
+};
+
+static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset)
+{
+ return readl(pmc->regmap + reg_offset);
+}
+
+static inline void pmc_reg_write(struct pmc_dev *pmc, int reg_offset, u32 val)
+{
+ writel(val, pmc->regmap + reg_offset);
+}
+
+static void pmc_power_off(void)
+{
+ u16 pm1_cnt_port;
+ u32 pm1_cnt_value;
+
+ pr_info("Preparing to enter system sleep state S5\n");
+
+ pm1_cnt_port = acpi_base_addr + PM1_CNT;
+
+ pm1_cnt_value = inl(pm1_cnt_port);
+ pm1_cnt_value &= SLEEP_TYPE_MASK;
+ pm1_cnt_value |= SLEEP_TYPE_S5;
+ pm1_cnt_value |= SLEEP_ENABLE;
+
+ outl(pm1_cnt_value, pm1_cnt_port);
+}
+
+static void pmc_hw_reg_setup(struct pmc_dev *pmc)
+{
+ /*
+ * Disable PMC S0IX_WAKE_EN events coming from:
+ * - LPC clock run
+ * - GPIO_SUS ored dedicated IRQs
+ * - GPIO_SCORE ored dedicated IRQs
+ * - GPIO_SUS shared IRQ
+ * - GPIO_SCORE shared IRQ
+ */
+ pmc_reg_write(pmc, PMC_S0IX_WAKE_EN, (u32)PMC_WAKE_EN_SETTING);
+}
+
+#ifdef CONFIG_DEBUG_FS
+static int pmc_dev_state_show(struct seq_file *s, void *unused)
+{
+ struct pmc_dev *pmc = s->private;
+ u32 func_dis, func_dis_2, func_dis_index;
+ u32 d3_sts_0, d3_sts_1, d3_sts_index;
+ int dev_num, dev_index, reg_index;
+
+ func_dis = pmc_reg_read(pmc, PMC_FUNC_DIS);
+ func_dis_2 = pmc_reg_read(pmc, PMC_FUNC_DIS_2);
+ d3_sts_0 = pmc_reg_read(pmc, PMC_D3_STS_0);
+ d3_sts_1 = pmc_reg_read(pmc, PMC_D3_STS_1);
+
+ dev_num = ARRAY_SIZE(dev_map);
+
+ for (dev_index = 0; dev_index < dev_num; dev_index++) {
+ reg_index = dev_index / PMC_REG_BIT_WIDTH;
+ if (reg_index) {
+ func_dis_index = func_dis_2;
+ d3_sts_index = d3_sts_1;
+ } else {
+ func_dis_index = func_dis;
+ d3_sts_index = d3_sts_0;
+ }
+
+ seq_printf(s, "Dev: %-32s\tState: %s [%s]\n",
+ dev_map[dev_index].name,
+ dev_map[dev_index].bit_mask & func_dis_index ?
+ "Disabled" : "Enabled ",
+ dev_map[dev_index].bit_mask & d3_sts_index ?
+ "D3" : "D0");
+ }
+ return 0;
+}
+
+static int pmc_dev_state_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pmc_dev_state_show, inode->i_private);
+}
+
+static const struct file_operations pmc_dev_state_ops = {
+ .open = pmc_dev_state_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int pmc_sleep_tmr_show(struct seq_file *s, void *unused)
+{
+ struct pmc_dev *pmc = s->private;
+ u64 s0ir_tmr, s0i1_tmr, s0i2_tmr, s0i3_tmr, s0_tmr;
+
+ s0ir_tmr = (u64)pmc_reg_read(pmc, PMC_S0IR_TMR) << PMC_TMR_SHIFT;
+ s0i1_tmr = (u64)pmc_reg_read(pmc, PMC_S0I1_TMR) << PMC_TMR_SHIFT;
+ s0i2_tmr = (u64)pmc_reg_read(pmc, PMC_S0I2_TMR) << PMC_TMR_SHIFT;
+ s0i3_tmr = (u64)pmc_reg_read(pmc, PMC_S0I3_TMR) << PMC_TMR_SHIFT;
+ s0_tmr = (u64)pmc_reg_read(pmc, PMC_S0_TMR) << PMC_TMR_SHIFT;
+
+ seq_printf(s, "S0IR Residency:\t%lldus\n", s0ir_tmr);
+ seq_printf(s, "S0I1 Residency:\t%lldus\n", s0i1_tmr);
+ seq_printf(s, "S0I2 Residency:\t%lldus\n", s0i2_tmr);
+ seq_printf(s, "S0I3 Residency:\t%lldus\n", s0i3_tmr);
+ seq_printf(s, "S0 Residency:\t%lldus\n", s0_tmr);
+ return 0;
+}
+
+static int pmc_sleep_tmr_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pmc_sleep_tmr_show, inode->i_private);
+}
+
+static const struct file_operations pmc_sleep_tmr_ops = {
+ .open = pmc_sleep_tmr_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void pmc_dbgfs_unregister(struct pmc_dev *pmc)
+{
+ if (!pmc->dbgfs_dir)
+ return;
+
+ debugfs_remove_recursive(pmc->dbgfs_dir);
+ pmc->dbgfs_dir = NULL;
+}
+
+static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev)
+{
+ struct dentry *dir, *f;
+
+ dir = debugfs_create_dir("pmc_atom", NULL);
+ if (!dir)
+ return -ENOMEM;
+
+ f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO,
+ dir, pmc, &pmc_dev_state_ops);
+ if (!f) {
+ dev_err(&pdev->dev, "dev_states register failed\n");
+ goto err;
+ }
+ f = debugfs_create_file("sleep_state", S_IFREG | S_IRUGO,
+ dir, pmc, &pmc_sleep_tmr_ops);
+ if (!f) {
+ dev_err(&pdev->dev, "sleep_state register failed\n");
+ goto err;
+ }
+ pmc->dbgfs_dir = dir;
+ return 0;
+err:
+ pmc_dbgfs_unregister(pmc);
+ return -ENODEV;
+}
+#endif /* CONFIG_DEBUG_FS */
+
+static int pmc_setup_dev(struct pci_dev *pdev)
+{
+ struct pmc_dev *pmc = &pmc_device;
+ int ret;
+
+ /* Obtain ACPI base address */
+ pci_read_config_dword(pdev, ACPI_BASE_ADDR_OFFSET, &acpi_base_addr);
+ acpi_base_addr &= ACPI_BASE_ADDR_MASK;
+
+ /* Install power off function */
+ if (acpi_base_addr != 0 && pm_power_off == NULL)
+ pm_power_off = pmc_power_off;
+
+ pci_read_config_dword(pdev, PMC_BASE_ADDR_OFFSET, &pmc->base_addr);
+ pmc->base_addr &= PMC_BASE_ADDR_MASK;
+
+ pmc->regmap = ioremap_nocache(pmc->base_addr, PMC_MMIO_REG_LEN);
+ if (!pmc->regmap) {
+ dev_err(&pdev->dev, "error: ioremap failed\n");
+ return -ENOMEM;
+ }
+
+ /* PMC hardware registers setup */
+ pmc_hw_reg_setup(pmc);
+
+#ifdef CONFIG_DEBUG_FS
+ ret = pmc_dbgfs_register(pmc, pdev);
+ if (ret) {
+ iounmap(pmc->regmap);
+ return ret;
+ }
+#endif /* CONFIG_DEBUG_FS */
+ return 0;
+}
+
+/*
+ * Data for PCI driver interface
+ *
+ * This data only exists for exporting the supported
+ * PCI ids via MODULE_DEVICE_TABLE. We do not actually
+ * register a pci_driver, because lpc_ich will register
+ * a driver on the same PCI id.
+ */
+static const struct pci_device_id pmc_pci_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_VLV_PMC) },
+ { 0, },
+};
+
+MODULE_DEVICE_TABLE(pci, pmc_pci_ids);
+
+static int __init pmc_atom_init(void)
+{
+ int err = -ENODEV;
+ struct pci_dev *pdev = NULL;
+ const struct pci_device_id *ent;
+
+ /* We look for our device - PCU PMC
+ * we assume that there is max. one device.
+ *
+ * We can't use plain pci_driver mechanism,
+ * as the device is really a multiple function device,
+ * main driver that binds to the pci_device is lpc_ich
+ * and have to find & bind to the device this way.
+ */
+ for_each_pci_dev(pdev) {
+ ent = pci_match_id(pmc_pci_ids, pdev);
+ if (ent) {
+ err = pmc_setup_dev(pdev);
+ goto out;
+ }
+ }
+ /* Device not found. */
+out:
+ return err;
+}
+
+module_init(pmc_atom_init);
+/* no module_exit, this driver shouldn't be unloaded */
+
+MODULE_AUTHOR("Aubrey Li <aubrey.li@linux.intel.com>");
+MODULE_DESCRIPTION("Intel Atom SOC Power Management Controller Interface");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 898d077617a9..ca5b02d405c3 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -413,12 +413,11 @@ void set_personality_ia32(bool x32)
set_thread_flag(TIF_ADDR32);
/* Mark the associated mm as containing 32-bit tasks. */
- if (current->mm)
- current->mm->context.ia32_compat = 1;
-
if (x32) {
clear_thread_flag(TIF_IA32);
set_thread_flag(TIF_X32);
+ if (current->mm)
+ current->mm->context.ia32_compat = TIF_X32;
current->personality &= ~READ_IMPLIES_EXEC;
/* is_compat_task() uses the presence of the x32
syscall bit flag to determine compat status */
@@ -426,6 +425,8 @@ void set_personality_ia32(bool x32)
} else {
set_thread_flag(TIF_IA32);
clear_thread_flag(TIF_X32);
+ if (current->mm)
+ current->mm->context.ia32_compat = TIF_IA32;
current->personality |= force_personality32;
/* Prepare the first "return" to user space */
current_thread_info()->status |= TS_COMPAT;
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 52b1157c53eb..17962e667a91 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -28,6 +28,7 @@
#include <linux/mc146818rtc.h>
#include <asm/realmode.h>
#include <asm/x86_init.h>
+#include <asm/efi.h>
/*
* Power off function, if any
@@ -401,12 +402,25 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
static int __init reboot_init(void)
{
+ int rv;
+
/*
* Only do the DMI check if reboot_type hasn't been overridden
* on the command line
*/
- if (reboot_default)
- dmi_check_system(reboot_dmi_table);
+ if (!reboot_default)
+ return 0;
+
+ /*
+ * The DMI quirks table takes precedence. If no quirks entry
+ * matches and the ACPI Hardware Reduced bit is set, force EFI
+ * reboot.
+ */
+ rv = dmi_check_system(reboot_dmi_table);
+
+ if (!rv && efi_reboot_required())
+ reboot_type = BOOT_EFI;
+
return 0;
}
core_initcall(reboot_init);
@@ -528,11 +542,7 @@ static void native_machine_emergency_restart(void)
break;
case BOOT_EFI:
- if (efi_enabled(EFI_RUNTIME_SERVICES))
- efi.reset_system(reboot_mode == REBOOT_WARM ?
- EFI_RESET_WARM :
- EFI_RESET_COLD,
- EFI_SUCCESS, 0, NULL);
+ efi_reboot(reboot_mode, NULL);
reboot_type = BOOT_BIOS;
break;
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
index 2a26819bb6a8..80eab01c1a68 100644
--- a/arch/x86/kernel/resource.c
+++ b/arch/x86/kernel/resource.c
@@ -37,10 +37,12 @@ static void remove_e820_regions(struct resource *avail)
void arch_remove_reservations(struct resource *avail)
{
- /* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */
+ /*
+ * Trim out BIOS area (high 2MB) and E820 regions. We do not remove
+ * the low 1MB unconditionally, as this area is needed for some ISA
+ * cards requiring a memory range, e.g. the i82365 PCMCIA controller.
+ */
if (avail->flags & IORESOURCE_MEM) {
- if (avail->start < BIOS_END)
- avail->start = BIOS_END;
resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END);
remove_e820_regions(avail);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 09c76d265550..41ead8d3bc0b 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -924,10 +924,10 @@ void __init setup_arch(char **cmdline_p)
#endif
#ifdef CONFIG_EFI
if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
- "EL32", 4)) {
+ EFI32_LOADER_SIGNATURE, 4)) {
set_bit(EFI_BOOT, &efi.flags);
} else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
- "EL64", 4)) {
+ EFI64_LOADER_SIGNATURE, 4)) {
set_bit(EFI_BOOT, &efi.flags);
set_bit(EFI_64BIT, &efi.flags);
}
@@ -1119,7 +1119,7 @@ void __init setup_arch(char **cmdline_p)
setup_real_mode();
memblock_set_current_limit(get_max_mapped());
- dma_contiguous_reserve(0);
+ dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
/*
* NOTE: On x86-32, only from this point on, fixmaps are ready for use.
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 9e5de6813e1f..2851d63c1202 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -298,7 +298,8 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set,
}
if (current->mm->context.vdso)
- restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn);
+ restorer = current->mm->context.vdso +
+ selected_vdso32->sym___kernel_sigreturn;
else
restorer = &frame->retcode;
if (ksig->ka.sa.sa_flags & SA_RESTORER)
@@ -361,7 +362,8 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
save_altstack_ex(&frame->uc.uc_stack, regs->sp);
/* Set up to return from userspace. */
- restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
+ restorer = current->mm->context.vdso +
+ selected_vdso32->sym___kernel_rt_sigreturn;
if (ksig->ka.sa.sa_flags & SA_RESTORER)
restorer = ksig->ka.sa.sa_restorer;
put_user_ex(restorer, &frame->pretcode);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 34826934d4a7..5492798930ef 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -244,6 +244,13 @@ static void notrace start_secondary(void *unused)
check_tsc_sync_target();
/*
+ * Enable the espfix hack for this CPU
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ init_espfix_ap();
+#endif
+
+ /*
* We need to hold vector_lock so there the set of online cpus
* does not change while we are assigning vectors to cpus. Holding
* this lock ensures we don't half assign or remove an irq from a cpu.
@@ -859,9 +866,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
/* was set by cpu_init() */
cpumask_clear_cpu(cpu, cpu_initialized_mask);
-
- set_cpu_present(cpu, false);
- per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
}
/* mark "stuck" area as not stuck */
@@ -921,7 +925,7 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
err = do_boot_cpu(apicid, cpu, tidle);
if (err) {
- pr_debug("do_boot_cpu failed %d\n", err);
+ pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
return -EIO;
}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index f73b5d435bdc..0d0e922fafc1 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -23,6 +23,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/ptrace.h>
+#include <linux/uprobes.h>
#include <linux/string.h>
#include <linux/delay.h>
#include <linux/errno.h>
@@ -106,7 +107,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
preempt_count_dec();
}
-static int __kprobes
+static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
struct pt_regs *regs, long error_code)
{
@@ -136,7 +137,38 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
return -1;
}
-static void __kprobes
+static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr,
+ siginfo_t *info)
+{
+ unsigned long siaddr;
+ int sicode;
+
+ switch (trapnr) {
+ default:
+ return SEND_SIG_PRIV;
+
+ case X86_TRAP_DE:
+ sicode = FPE_INTDIV;
+ siaddr = uprobe_get_trap_addr(regs);
+ break;
+ case X86_TRAP_UD:
+ sicode = ILL_ILLOPN;
+ siaddr = uprobe_get_trap_addr(regs);
+ break;
+ case X86_TRAP_AC:
+ sicode = BUS_ADRALN;
+ siaddr = 0;
+ break;
+ }
+
+ info->si_signo = signr;
+ info->si_errno = 0;
+ info->si_code = sicode;
+ info->si_addr = (void __user *)siaddr;
+ return info;
+}
+
+static void
do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
long error_code, siginfo_t *info)
{
@@ -168,60 +200,43 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
}
#endif
- if (info)
- force_sig_info(signr, info, tsk);
- else
- force_sig(signr, tsk);
+ force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk);
}
+NOKPROBE_SYMBOL(do_trap);
-#define DO_ERROR(trapnr, signr, str, name) \
-dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
-{ \
- enum ctx_state prev_state; \
- \
- prev_state = exception_enter(); \
- if (notify_die(DIE_TRAP, str, regs, error_code, \
- trapnr, signr) == NOTIFY_STOP) { \
- exception_exit(prev_state); \
- return; \
- } \
- conditional_sti(regs); \
- do_trap(trapnr, signr, str, regs, error_code, NULL); \
- exception_exit(prev_state); \
+static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
+ unsigned long trapnr, int signr)
+{
+ enum ctx_state prev_state = exception_enter();
+ siginfo_t info;
+
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
+ NOTIFY_STOP) {
+ conditional_sti(regs);
+ do_trap(trapnr, signr, str, regs, error_code,
+ fill_trap_info(regs, signr, trapnr, &info));
+ }
+
+ exception_exit(prev_state);
}
-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
+#define DO_ERROR(trapnr, signr, str, name) \
dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
{ \
- siginfo_t info; \
- enum ctx_state prev_state; \
- \
- info.si_signo = signr; \
- info.si_errno = 0; \
- info.si_code = sicode; \
- info.si_addr = (void __user *)siaddr; \
- prev_state = exception_enter(); \
- if (notify_die(DIE_TRAP, str, regs, error_code, \
- trapnr, signr) == NOTIFY_STOP) { \
- exception_exit(prev_state); \
- return; \
- } \
- conditional_sti(regs); \
- do_trap(trapnr, signr, str, regs, error_code, &info); \
- exception_exit(prev_state); \
+ do_error_trap(regs, error_code, str, trapnr, signr); \
}
-DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip )
-DO_ERROR (X86_TRAP_OF, SIGSEGV, "overflow", overflow )
-DO_ERROR (X86_TRAP_BR, SIGSEGV, "bounds", bounds )
-DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip )
-DO_ERROR (X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun )
-DO_ERROR (X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS )
-DO_ERROR (X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present )
+DO_ERROR(X86_TRAP_DE, SIGFPE, "divide error", divide_error)
+DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow)
+DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds)
+DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op)
+DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",coprocessor_segment_overrun)
+DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
+DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
#ifdef CONFIG_X86_32
-DO_ERROR (X86_TRAP_SS, SIGBUS, "stack segment", stack_segment )
+DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
#endif
-DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0 )
+DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check)
#ifdef CONFIG_X86_64
/* Runs on IST stack */
@@ -263,7 +278,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
}
#endif
-dotraplinkage void __kprobes
+dotraplinkage void
do_general_protection(struct pt_regs *regs, long error_code)
{
struct task_struct *tsk;
@@ -305,13 +320,14 @@ do_general_protection(struct pt_regs *regs, long error_code)
pr_cont("\n");
}
- force_sig(SIGSEGV, tsk);
+ force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
exit:
exception_exit(prev_state);
}
+NOKPROBE_SYMBOL(do_general_protection);
/* May run on IST stack. */
-dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code)
+dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
{
enum ctx_state prev_state;
@@ -334,6 +350,11 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co
goto exit;
#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
+#ifdef CONFIG_KPROBES
+ if (kprobe_int3_handler(regs))
+ goto exit;
+#endif
+
if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
goto exit;
@@ -350,6 +371,7 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co
exit:
exception_exit(prev_state);
}
+NOKPROBE_SYMBOL(do_int3);
#ifdef CONFIG_X86_64
/*
@@ -357,7 +379,7 @@ exit:
* for scheduling or signal handling. The actual stack switch is done in
* entry.S
*/
-asmlinkage __visible __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
+asmlinkage __visible struct pt_regs *sync_regs(struct pt_regs *eregs)
{
struct pt_regs *regs = eregs;
/* Did already sync */
@@ -376,6 +398,7 @@ asmlinkage __visible __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
*regs = *eregs;
return regs;
}
+NOKPROBE_SYMBOL(sync_regs);
#endif
/*
@@ -402,7 +425,7 @@ asmlinkage __visible __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
*
* May run on IST stack.
*/
-dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
+dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
{
struct task_struct *tsk = current;
enum ctx_state prev_state;
@@ -440,6 +463,11 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
/* Store the virtualized DR6 value */
tsk->thread.debugreg6 = dr6;
+#ifdef CONFIG_KPROBES
+ if (kprobe_debug_handler(regs))
+ goto exit;
+#endif
+
if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code,
SIGTRAP) == NOTIFY_STOP)
goto exit;
@@ -482,13 +510,14 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
exit:
exception_exit(prev_state);
}
+NOKPROBE_SYMBOL(do_debug);
/*
* Note that we play around with the 'TS' bit in an attempt to get
* the correct behaviour even in the presence of the asynchronous
* IRQ13 behaviour
*/
-void math_error(struct pt_regs *regs, int error_code, int trapnr)
+static void math_error(struct pt_regs *regs, int error_code, int trapnr)
{
struct task_struct *task = current;
siginfo_t info;
@@ -518,7 +547,7 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
task->thread.error_code = error_code;
info.si_signo = SIGFPE;
info.si_errno = 0;
- info.si_addr = (void __user *)regs->ip;
+ info.si_addr = (void __user *)uprobe_get_trap_addr(regs);
if (trapnr == X86_TRAP_MF) {
unsigned short cwd, swd;
/*
@@ -645,7 +674,7 @@ void math_state_restore(void)
*/
if (unlikely(restore_fpu_checking(tsk))) {
drop_init_fpu(tsk);
- force_sig(SIGSEGV, tsk);
+ force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
return;
}
@@ -653,7 +682,7 @@ void math_state_restore(void)
}
EXPORT_SYMBOL_GPL(math_state_restore);
-dotraplinkage void __kprobes
+dotraplinkage void
do_device_not_available(struct pt_regs *regs, long error_code)
{
enum ctx_state prev_state;
@@ -679,6 +708,7 @@ do_device_not_available(struct pt_regs *regs, long error_code)
#endif
exception_exit(prev_state);
}
+NOKPROBE_SYMBOL(do_device_not_available);
#ifdef CONFIG_X86_32
dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 57e5ce126d5a..b6025f9e36c6 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -234,9 +234,6 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
return ns;
}
-/* XXX surely we already have this someplace in the kernel?! */
-#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d))
-
static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
{
unsigned long long tsc_now, ns_now;
@@ -259,7 +256,9 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
* time function is continuous; see the comment near struct
* cyc2ns_data.
*/
- data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz);
+ data->cyc2ns_mul =
+ DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
+ cpu_khz);
data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
data->cyc2ns_offset = ns_now -
mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
@@ -920,9 +919,9 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
if (!(freq->flags & CPUFREQ_CONST_LOOPS))
mark_tsc_unstable("cpufreq changes");
- }
- set_cyc2ns_scale(tsc_khz, freq->cpu);
+ set_cyc2ns_scale(tsc_khz, freq->cpu);
+ }
return 0;
}
@@ -951,7 +950,7 @@ core_initcall(cpufreq_tsc);
static struct clocksource clocksource_tsc;
/*
- * We compare the TSC to the cycle_last value in the clocksource
+ * We used to compare the TSC to the cycle_last value in the clocksource
* structure to avoid a nasty time-warp. This can be observed in a
* very small window right after one CPU updated cycle_last under
* xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
@@ -961,26 +960,23 @@ static struct clocksource clocksource_tsc;
* due to the unsigned delta calculation of the time keeping core
* code, which is necessary to support wrapping clocksources like pm
* timer.
+ *
+ * This sanity check is now done in the core timekeeping code.
+ * checking the result of read_tsc() - cycle_last for being negative.
+ * That works because CLOCKSOURCE_MASK(64) does not mask out any bit.
*/
static cycle_t read_tsc(struct clocksource *cs)
{
- cycle_t ret = (cycle_t)get_cycles();
-
- return ret >= clocksource_tsc.cycle_last ?
- ret : clocksource_tsc.cycle_last;
-}
-
-static void resume_tsc(struct clocksource *cs)
-{
- if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
- clocksource_tsc.cycle_last = 0;
+ return (cycle_t)get_cycles();
}
+/*
+ * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
+ */
static struct clocksource clocksource_tsc = {
.name = "tsc",
.rating = 300,
.read = read_tsc,
- .resume = resume_tsc,
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_MUST_VERIFY,
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 2ed845928b5f..5d1cbfe4ae58 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -32,20 +32,20 @@
/* Post-execution fixups. */
-/* No fixup needed */
-#define UPROBE_FIX_NONE 0x0
-
/* Adjust IP back to vicinity of actual insn */
-#define UPROBE_FIX_IP 0x1
+#define UPROBE_FIX_IP 0x01
/* Adjust the return address of a call insn */
-#define UPROBE_FIX_CALL 0x2
+#define UPROBE_FIX_CALL 0x02
/* Instruction will modify TF, don't change it */
-#define UPROBE_FIX_SETF 0x4
+#define UPROBE_FIX_SETF 0x04
-#define UPROBE_FIX_RIP_AX 0x8000
-#define UPROBE_FIX_RIP_CX 0x4000
+#define UPROBE_FIX_RIP_SI 0x08
+#define UPROBE_FIX_RIP_DI 0x10
+#define UPROBE_FIX_RIP_BX 0x20
+#define UPROBE_FIX_RIP_MASK \
+ (UPROBE_FIX_RIP_SI | UPROBE_FIX_RIP_DI | UPROBE_FIX_RIP_BX)
#define UPROBE_TRAP_NR UINT_MAX
@@ -53,7 +53,7 @@
#define OPCODE1(insn) ((insn)->opcode.bytes[0])
#define OPCODE2(insn) ((insn)->opcode.bytes[1])
#define OPCODE3(insn) ((insn)->opcode.bytes[2])
-#define MODRM_REG(insn) X86_MODRM_REG(insn->modrm.value)
+#define MODRM_REG(insn) X86_MODRM_REG((insn)->modrm.value)
#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
@@ -67,6 +67,7 @@
* to keep gcc from statically optimizing it out, as variable_test_bit makes
* some versions of gcc to think only *(unsigned long*) is used.
*/
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static volatile u32 good_insns_32[256 / 32] = {
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* ---------------------------------------------- */
@@ -89,33 +90,12 @@ static volatile u32 good_insns_32[256 / 32] = {
/* ---------------------------------------------- */
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
};
+#else
+#define good_insns_32 NULL
+#endif
-/* Using this for both 64-bit and 32-bit apps */
-static volatile u32 good_2byte_insns[256 / 32] = {
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* ---------------------------------------------- */
- W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */
- W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
- W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
- W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
- W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
- W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
- W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
- W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
- W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
- W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
- W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
- W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
- W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
- W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
- W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
- W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */
- /* ---------------------------------------------- */
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
-};
-
-#ifdef CONFIG_X86_64
/* Good-instruction tables for 64-bit apps */
+#if defined(CONFIG_X86_64)
static volatile u32 good_insns_64[256 / 32] = {
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* ---------------------------------------------- */
@@ -138,7 +118,33 @@ static volatile u32 good_insns_64[256 / 32] = {
/* ---------------------------------------------- */
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
};
+#else
+#define good_insns_64 NULL
#endif
+
+/* Using this for both 64-bit and 32-bit apps */
+static volatile u32 good_2byte_insns[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */
+ W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
+ W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
+ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
+ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
+ W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+ W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
+ W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */
+ /* ---------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
#undef W
/*
@@ -209,16 +215,25 @@ static bool is_prefix_bad(struct insn *insn)
return false;
}
-static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn)
+static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64)
{
- insn_init(insn, auprobe->insn, false);
+ u32 volatile *good_insns;
+
+ insn_init(insn, auprobe->insn, x86_64);
+ /* has the side-effect of processing the entire instruction */
+ insn_get_length(insn);
+ if (WARN_ON_ONCE(!insn_complete(insn)))
+ return -ENOEXEC;
- /* Skip good instruction prefixes; reject "bad" ones. */
- insn_get_opcode(insn);
if (is_prefix_bad(insn))
return -ENOTSUPP;
- if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_32))
+ if (x86_64)
+ good_insns = good_insns_64;
+ else
+ good_insns = good_insns_32;
+
+ if (test_bit(OPCODE1(insn), (unsigned long *)good_insns))
return 0;
if (insn->opcode.nbytes == 2) {
@@ -229,72 +244,19 @@ static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn)
return -ENOTSUPP;
}
-/*
- * Figure out which fixups arch_uprobe_post_xol() will need to perform, and
- * annotate arch_uprobe->fixups accordingly. To start with,
- * arch_uprobe->fixups is either zero or it reflects rip-related fixups.
- */
-static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn)
+#ifdef CONFIG_X86_64
+static inline bool is_64bit_mm(struct mm_struct *mm)
{
- bool fix_ip = true, fix_call = false; /* defaults */
- int reg;
-
- insn_get_opcode(insn); /* should be a nop */
-
- switch (OPCODE1(insn)) {
- case 0x9d:
- /* popf */
- auprobe->fixups |= UPROBE_FIX_SETF;
- break;
- case 0xc3: /* ret/lret */
- case 0xcb:
- case 0xc2:
- case 0xca:
- /* ip is correct */
- fix_ip = false;
- break;
- case 0xe8: /* call relative - Fix return addr */
- fix_call = true;
- break;
- case 0x9a: /* call absolute - Fix return addr, not ip */
- fix_call = true;
- fix_ip = false;
- break;
- case 0xff:
- insn_get_modrm(insn);
- reg = MODRM_REG(insn);
- if (reg == 2 || reg == 3) {
- /* call or lcall, indirect */
- /* Fix return addr; ip is correct. */
- fix_call = true;
- fix_ip = false;
- } else if (reg == 4 || reg == 5) {
- /* jmp or ljmp, indirect */
- /* ip is correct. */
- fix_ip = false;
- }
- break;
- case 0xea: /* jmp absolute -- ip is correct */
- fix_ip = false;
- break;
- default:
- break;
- }
- if (fix_ip)
- auprobe->fixups |= UPROBE_FIX_IP;
- if (fix_call)
- auprobe->fixups |= UPROBE_FIX_CALL;
+ return !config_enabled(CONFIG_IA32_EMULATION) ||
+ !(mm->context.ia32_compat == TIF_IA32);
}
-
-#ifdef CONFIG_X86_64
/*
* If arch_uprobe->insn doesn't use rip-relative addressing, return
* immediately. Otherwise, rewrite the instruction so that it accesses
* its memory operand indirectly through a scratch register. Set
- * arch_uprobe->fixups and arch_uprobe->rip_rela_target_address
- * accordingly. (The contents of the scratch register will be saved
- * before we single-step the modified instruction, and restored
- * afterward.)
+ * defparam->fixups accordingly. (The contents of the scratch register
+ * will be saved before we single-step the modified instruction,
+ * and restored afterward).
*
* We do this because a rip-relative instruction can access only a
* relatively small area (+/- 2 GB from the instruction), and the XOL
@@ -305,248 +267,513 @@ static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn)
*
* Some useful facts about rip-relative instructions:
*
- * - There's always a modrm byte.
+ * - There's always a modrm byte with bit layout "00 reg 101".
* - There's never a SIB byte.
* - The displacement is always 4 bytes.
+ * - REX.B=1 bit in REX prefix, which normally extends r/m field,
+ * has no effect on rip-relative mode. It doesn't make modrm byte
+ * with r/m=101 refer to register 1101 = R13.
*/
-static void
-handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
+static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
{
u8 *cursor;
u8 reg;
+ u8 reg2;
- if (mm->context.ia32_compat)
- return;
-
- auprobe->rip_rela_target_address = 0x0;
if (!insn_rip_relative(insn))
return;
/*
- * insn_rip_relative() would have decoded rex_prefix, modrm.
+ * insn_rip_relative() would have decoded rex_prefix, vex_prefix, modrm.
* Clear REX.b bit (extension of MODRM.rm field):
- * we want to encode rax/rcx, not r8/r9.
+ * we want to encode low numbered reg, not r8+.
*/
if (insn->rex_prefix.nbytes) {
cursor = auprobe->insn + insn_offset_rex_prefix(insn);
- *cursor &= 0xfe; /* Clearing REX.B bit */
+ /* REX byte has 0100wrxb layout, clearing REX.b bit */
+ *cursor &= 0xfe;
+ }
+ /*
+ * Similar treatment for VEX3 prefix.
+ * TODO: add XOP/EVEX treatment when insn decoder supports them
+ */
+ if (insn->vex_prefix.nbytes == 3) {
+ /*
+ * vex2: c5 rvvvvLpp (has no b bit)
+ * vex3/xop: c4/8f rxbmmmmm wvvvvLpp
+ * evex: 62 rxbR00mm wvvvv1pp zllBVaaa
+ * (evex will need setting of both b and x since
+ * in non-sib encoding evex.x is 4th bit of MODRM.rm)
+ * Setting VEX3.b (setting because it has inverted meaning):
+ */
+ cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1;
+ *cursor |= 0x20;
}
/*
+ * Convert from rip-relative addressing to register-relative addressing
+ * via a scratch register.
+ *
+ * This is tricky since there are insns with modrm byte
+ * which also use registers not encoded in modrm byte:
+ * [i]div/[i]mul: implicitly use dx:ax
+ * shift ops: implicitly use cx
+ * cmpxchg: implicitly uses ax
+ * cmpxchg8/16b: implicitly uses dx:ax and bx:cx
+ * Encoding: 0f c7/1 modrm
+ * The code below thinks that reg=1 (cx), chooses si as scratch.
+ * mulx: implicitly uses dx: mulx r/m,r1,r2 does r1:r2 = dx * r/m.
+ * First appeared in Haswell (BMI2 insn). It is vex-encoded.
+ * Example where none of bx,cx,dx can be used as scratch reg:
+ * c4 e2 63 f6 0d disp32 mulx disp32(%rip),%ebx,%ecx
+ * [v]pcmpistri: implicitly uses cx, xmm0
+ * [v]pcmpistrm: implicitly uses xmm0
+ * [v]pcmpestri: implicitly uses ax, dx, cx, xmm0
+ * [v]pcmpestrm: implicitly uses ax, dx, xmm0
+ * Evil SSE4.2 string comparison ops from hell.
+ * maskmovq/[v]maskmovdqu: implicitly uses (ds:rdi) as destination.
+ * Encoding: 0f f7 modrm, 66 0f f7 modrm, vex-encoded: c5 f9 f7 modrm.
+ * Store op1, byte-masked by op2 msb's in each byte, to (ds:rdi).
+ * AMD says it has no 3-operand form (vex.vvvv must be 1111)
+ * and that it can have only register operands, not mem
+ * (its modrm byte must have mode=11).
+ * If these restrictions will ever be lifted,
+ * we'll need code to prevent selection of di as scratch reg!
+ *
+ * Summary: I don't know any insns with modrm byte which
+ * use SI register implicitly. DI register is used only
+ * by one insn (maskmovq) and BX register is used
+ * only by one too (cmpxchg8b).
+ * BP is stack-segment based (may be a problem?).
+ * AX, DX, CX are off-limits (many implicit users).
+ * SP is unusable (it's stack pointer - think about "pop mem";
+ * also, rsp+disp32 needs sib encoding -> insn length change).
+ */
+
+ reg = MODRM_REG(insn); /* Fetch modrm.reg */
+ reg2 = 0xff; /* Fetch vex.vvvv */
+ if (insn->vex_prefix.nbytes == 2)
+ reg2 = insn->vex_prefix.bytes[1];
+ else if (insn->vex_prefix.nbytes == 3)
+ reg2 = insn->vex_prefix.bytes[2];
+ /*
+ * TODO: add XOP, EXEV vvvv reading.
+ *
+ * vex.vvvv field is in bits 6-3, bits are inverted.
+ * But in 32-bit mode, high-order bit may be ignored.
+ * Therefore, let's consider only 3 low-order bits.
+ */
+ reg2 = ((reg2 >> 3) & 0x7) ^ 0x7;
+ /*
+ * Register numbering is ax,cx,dx,bx, sp,bp,si,di, r8..r15.
+ *
+ * Choose scratch reg. Order is important: must not select bx
+ * if we can use si (cmpxchg8b case!)
+ */
+ if (reg != 6 && reg2 != 6) {
+ reg2 = 6;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_SI;
+ } else if (reg != 7 && reg2 != 7) {
+ reg2 = 7;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_DI;
+ /* TODO (paranoia): force maskmovq to not use di */
+ } else {
+ reg2 = 3;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_BX;
+ }
+ /*
* Point cursor at the modrm byte. The next 4 bytes are the
* displacement. Beyond the displacement, for some instructions,
* is the immediate operand.
*/
cursor = auprobe->insn + insn_offset_modrm(insn);
- insn_get_length(insn);
-
/*
- * Convert from rip-relative addressing to indirect addressing
- * via a scratch register. Change the r/m field from 0x5 (%rip)
- * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field.
+ * Change modrm from "00 reg 101" to "10 reg reg2". Example:
+ * 89 05 disp32 mov %eax,disp32(%rip) becomes
+ * 89 86 disp32 mov %eax,disp32(%rsi)
*/
- reg = MODRM_REG(insn);
- if (reg == 0) {
- /*
- * The register operand (if any) is either the A register
- * (%rax, %eax, etc.) or (if the 0x4 bit is set in the
- * REX prefix) %r8. In any case, we know the C register
- * is NOT the register operand, so we use %rcx (register
- * #1) for the scratch register.
- */
- auprobe->fixups = UPROBE_FIX_RIP_CX;
- /* Change modrm from 00 000 101 to 00 000 001. */
- *cursor = 0x1;
- } else {
- /* Use %rax (register #0) for the scratch register. */
- auprobe->fixups = UPROBE_FIX_RIP_AX;
- /* Change modrm from 00 xxx 101 to 00 xxx 000 */
- *cursor = (reg << 3);
- }
-
- /* Target address = address of next instruction + (signed) offset */
- auprobe->rip_rela_target_address = (long)insn->length + insn->displacement.value;
-
- /* Displacement field is gone; slide immediate field (if any) over. */
- if (insn->immediate.nbytes) {
- cursor++;
- memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes);
- }
- return;
+ *cursor = 0x80 | (reg << 3) | reg2;
}
-static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn)
+static inline unsigned long *
+scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- insn_init(insn, auprobe->insn, true);
-
- /* Skip good instruction prefixes; reject "bad" ones. */
- insn_get_opcode(insn);
- if (is_prefix_bad(insn))
- return -ENOTSUPP;
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_SI)
+ return &regs->si;
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_DI)
+ return &regs->di;
+ return &regs->bx;
+}
- if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_64))
- return 0;
+/*
+ * If we're emulating a rip-relative instruction, save the contents
+ * of the scratch register and store the target address in that register.
+ */
+static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) {
+ struct uprobe_task *utask = current->utask;
+ unsigned long *sr = scratch_reg(auprobe, regs);
- if (insn->opcode.nbytes == 2) {
- if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
- return 0;
+ utask->autask.saved_scratch_register = *sr;
+ *sr = utask->vaddr + auprobe->defparam.ilen;
}
- return -ENOTSUPP;
}
-static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
+static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- if (mm->context.ia32_compat)
- return validate_insn_32bits(auprobe, insn);
- return validate_insn_64bits(auprobe, insn);
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) {
+ struct uprobe_task *utask = current->utask;
+ unsigned long *sr = scratch_reg(auprobe, regs);
+
+ *sr = utask->autask.saved_scratch_register;
+ }
}
#else /* 32-bit: */
-static void handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
+static inline bool is_64bit_mm(struct mm_struct *mm)
{
- /* No RIP-relative addressing on 32-bit */
+ return false;
}
-
-static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
+/*
+ * No RIP-relative addressing on 32-bit
+ */
+static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
+{
+}
+static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+}
+static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- return validate_insn_32bits(auprobe, insn);
}
#endif /* CONFIG_X86_64 */
-/**
- * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
- * @mm: the probed address space.
- * @arch_uprobe: the probepoint information.
- * @addr: virtual address at which to install the probepoint
- * Return 0 on success or a -ve number on error.
- */
-int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
+struct uprobe_xol_ops {
+ bool (*emulate)(struct arch_uprobe *, struct pt_regs *);
+ int (*pre_xol)(struct arch_uprobe *, struct pt_regs *);
+ int (*post_xol)(struct arch_uprobe *, struct pt_regs *);
+ void (*abort)(struct arch_uprobe *, struct pt_regs *);
+};
+
+static inline int sizeof_long(void)
{
- int ret;
- struct insn insn;
+ return is_ia32_task() ? 4 : 8;
+}
- auprobe->fixups = 0;
- ret = validate_insn_bits(auprobe, mm, &insn);
- if (ret != 0)
- return ret;
+static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ riprel_pre_xol(auprobe, regs);
+ return 0;
+}
- handle_riprel_insn(auprobe, mm, &insn);
- prepare_fixups(auprobe, &insn);
+static int push_ret_address(struct pt_regs *regs, unsigned long ip)
+{
+ unsigned long new_sp = regs->sp - sizeof_long();
+ if (copy_to_user((void __user *)new_sp, &ip, sizeof_long()))
+ return -EFAULT;
+
+ regs->sp = new_sp;
return 0;
}
-#ifdef CONFIG_X86_64
/*
- * If we're emulating a rip-relative instruction, save the contents
- * of the scratch register and store the target address in that register.
+ * We have to fix things up as follows:
+ *
+ * Typically, the new ip is relative to the copied instruction. We need
+ * to make it relative to the original instruction (FIX_IP). Exceptions
+ * are return instructions and absolute or indirect jump or call instructions.
+ *
+ * If the single-stepped instruction was a call, the return address that
+ * is atop the stack is the address following the copied instruction. We
+ * need to make it the address following the original instruction (FIX_CALL).
+ *
+ * If the original instruction was a rip-relative instruction such as
+ * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
+ * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rsi)".
+ * We need to restore the contents of the scratch register
+ * (FIX_RIP_reg).
*/
-static void
-pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
- struct arch_uprobe_task *autask)
-{
- if (auprobe->fixups & UPROBE_FIX_RIP_AX) {
- autask->saved_scratch_register = regs->ax;
- regs->ax = current->utask->vaddr;
- regs->ax += auprobe->rip_rela_target_address;
- } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) {
- autask->saved_scratch_register = regs->cx;
- regs->cx = current->utask->vaddr;
- regs->cx += auprobe->rip_rela_target_address;
+static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ riprel_post_xol(auprobe, regs);
+ if (auprobe->defparam.fixups & UPROBE_FIX_IP) {
+ long correction = utask->vaddr - utask->xol_vaddr;
+ regs->ip += correction;
+ } else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
+ regs->sp += sizeof_long(); /* Pop incorrect return address */
+ if (push_ret_address(regs, utask->vaddr + auprobe->defparam.ilen))
+ return -ERESTART;
}
+ /* popf; tell the caller to not touch TF */
+ if (auprobe->defparam.fixups & UPROBE_FIX_SETF)
+ utask->autask.saved_tf = true;
+
+ return 0;
}
-#else
-static void
-pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
- struct arch_uprobe_task *autask)
+
+static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- /* No RIP-relative addressing on 32-bit */
+ riprel_post_xol(auprobe, regs);
}
-#endif
-/*
- * arch_uprobe_pre_xol - prepare to execute out of line.
- * @auprobe: the probepoint information.
- * @regs: reflects the saved user state of current task.
- */
-int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+static struct uprobe_xol_ops default_xol_ops = {
+ .pre_xol = default_pre_xol_op,
+ .post_xol = default_post_xol_op,
+ .abort = default_abort_op,
+};
+
+static bool branch_is_call(struct arch_uprobe *auprobe)
{
- struct arch_uprobe_task *autask;
+ return auprobe->branch.opc1 == 0xe8;
+}
- autask = &current->utask->autask;
- autask->saved_trap_nr = current->thread.trap_nr;
- current->thread.trap_nr = UPROBE_TRAP_NR;
- regs->ip = current->utask->xol_vaddr;
- pre_xol_rip_insn(auprobe, regs, autask);
+#define CASE_COND \
+ COND(70, 71, XF(OF)) \
+ COND(72, 73, XF(CF)) \
+ COND(74, 75, XF(ZF)) \
+ COND(78, 79, XF(SF)) \
+ COND(7a, 7b, XF(PF)) \
+ COND(76, 77, XF(CF) || XF(ZF)) \
+ COND(7c, 7d, XF(SF) != XF(OF)) \
+ COND(7e, 7f, XF(ZF) || XF(SF) != XF(OF))
- autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF);
- regs->flags |= X86_EFLAGS_TF;
- if (test_tsk_thread_flag(current, TIF_BLOCKSTEP))
- set_task_blockstep(current, false);
+#define COND(op_y, op_n, expr) \
+ case 0x ## op_y: DO((expr) != 0) \
+ case 0x ## op_n: DO((expr) == 0)
- return 0;
+#define XF(xf) (!!(flags & X86_EFLAGS_ ## xf))
+
+static bool is_cond_jmp_opcode(u8 opcode)
+{
+ switch (opcode) {
+ #define DO(expr) \
+ return true;
+ CASE_COND
+ #undef DO
+
+ default:
+ return false;
+ }
}
-/*
- * This function is called by arch_uprobe_post_xol() to adjust the return
- * address pushed by a call instruction executed out of line.
- */
-static int adjust_ret_addr(unsigned long sp, long correction)
+static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- int rasize, ncopied;
- long ra = 0;
+ unsigned long flags = regs->flags;
- if (is_ia32_task())
- rasize = 4;
- else
- rasize = 8;
+ switch (auprobe->branch.opc1) {
+ #define DO(expr) \
+ return expr;
+ CASE_COND
+ #undef DO
- ncopied = copy_from_user(&ra, (void __user *)sp, rasize);
- if (unlikely(ncopied))
- return -EFAULT;
+ default: /* not a conditional jmp */
+ return true;
+ }
+}
- ra += correction;
- ncopied = copy_to_user((void __user *)sp, &ra, rasize);
- if (unlikely(ncopied))
- return -EFAULT;
+#undef XF
+#undef COND
+#undef CASE_COND
- return 0;
+static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ unsigned long new_ip = regs->ip += auprobe->branch.ilen;
+ unsigned long offs = (long)auprobe->branch.offs;
+
+ if (branch_is_call(auprobe)) {
+ /*
+ * If it fails we execute this (mangled, see the comment in
+ * branch_clear_offset) insn out-of-line. In the likely case
+ * this should trigger the trap, and the probed application
+ * should die or restart the same insn after it handles the
+ * signal, arch_uprobe_post_xol() won't be even called.
+ *
+ * But there is corner case, see the comment in ->post_xol().
+ */
+ if (push_ret_address(regs, new_ip))
+ return false;
+ } else if (!check_jmp_cond(auprobe, regs)) {
+ offs = 0;
+ }
+
+ regs->ip = new_ip + offs;
+ return true;
}
-#ifdef CONFIG_X86_64
-static bool is_riprel_insn(struct arch_uprobe *auprobe)
+static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- return ((auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) != 0);
+ BUG_ON(!branch_is_call(auprobe));
+ /*
+ * We can only get here if branch_emulate_op() failed to push the ret
+ * address _and_ another thread expanded our stack before the (mangled)
+ * "call" insn was executed out-of-line. Just restore ->sp and restart.
+ * We could also restore ->ip and try to call branch_emulate_op() again.
+ */
+ regs->sp += sizeof_long();
+ return -ERESTART;
+}
+
+static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ /*
+ * Turn this insn into "call 1f; 1:", this is what we will execute
+ * out-of-line if ->emulate() fails. We only need this to generate
+ * a trap, so that the probed task receives the correct signal with
+ * the properly filled siginfo.
+ *
+ * But see the comment in ->post_xol(), in the unlikely case it can
+ * succeed. So we need to ensure that the new ->ip can not fall into
+ * the non-canonical area and trigger #GP.
+ *
+ * We could turn it into (say) "pushf", but then we would need to
+ * divorce ->insn[] and ->ixol[]. We need to preserve the 1st byte
+ * of ->insn[] for set_orig_insn().
+ */
+ memset(auprobe->insn + insn_offset_immediate(insn),
+ 0, insn->immediate.nbytes);
}
-static void
-handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
+static struct uprobe_xol_ops branch_xol_ops = {
+ .emulate = branch_emulate_op,
+ .post_xol = branch_post_xol_op,
+};
+
+/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
+static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
{
- if (is_riprel_insn(auprobe)) {
- struct arch_uprobe_task *autask;
+ u8 opc1 = OPCODE1(insn);
+ int i;
- autask = &current->utask->autask;
- if (auprobe->fixups & UPROBE_FIX_RIP_AX)
- regs->ax = autask->saved_scratch_register;
- else
- regs->cx = autask->saved_scratch_register;
+ switch (opc1) {
+ case 0xeb: /* jmp 8 */
+ case 0xe9: /* jmp 32 */
+ case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */
+ break;
+
+ case 0xe8: /* call relative */
+ branch_clear_offset(auprobe, insn);
+ break;
+ case 0x0f:
+ if (insn->opcode.nbytes != 2)
+ return -ENOSYS;
/*
- * The original instruction includes a displacement, and so
- * is 4 bytes longer than what we've just single-stepped.
- * Fall through to handle stuff like "jmpq *...(%rip)" and
- * "callq *...(%rip)".
+ * If it is a "near" conditional jmp, OPCODE2() - 0x10 matches
+ * OPCODE1() of the "short" jmp which checks the same condition.
*/
- if (correction)
- *correction += 4;
+ opc1 = OPCODE2(insn) - 0x10;
+ default:
+ if (!is_cond_jmp_opcode(opc1))
+ return -ENOSYS;
+ }
+
+ /*
+ * 16-bit overrides such as CALLW (66 e8 nn nn) are not supported.
+ * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix.
+ * No one uses these insns, reject any branch insns with such prefix.
+ */
+ for (i = 0; i < insn->prefixes.nbytes; i++) {
+ if (insn->prefixes.bytes[i] == 0x66)
+ return -ENOTSUPP;
}
+
+ auprobe->branch.opc1 = opc1;
+ auprobe->branch.ilen = insn->length;
+ auprobe->branch.offs = insn->immediate.value;
+
+ auprobe->ops = &branch_xol_ops;
+ return 0;
}
-#else
-static void
-handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
+
+/**
+ * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
+ * @mm: the probed address space.
+ * @arch_uprobe: the probepoint information.
+ * @addr: virtual address at which to install the probepoint
+ * Return 0 on success or a -ve number on error.
+ */
+int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
{
- /* No RIP-relative addressing on 32-bit */
+ struct insn insn;
+ u8 fix_ip_or_call = UPROBE_FIX_IP;
+ int ret;
+
+ ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm));
+ if (ret)
+ return ret;
+
+ ret = branch_setup_xol_ops(auprobe, &insn);
+ if (ret != -ENOSYS)
+ return ret;
+
+ /*
+ * Figure out which fixups default_post_xol_op() will need to perform,
+ * and annotate defparam->fixups accordingly.
+ */
+ switch (OPCODE1(&insn)) {
+ case 0x9d: /* popf */
+ auprobe->defparam.fixups |= UPROBE_FIX_SETF;
+ break;
+ case 0xc3: /* ret or lret -- ip is correct */
+ case 0xcb:
+ case 0xc2:
+ case 0xca:
+ case 0xea: /* jmp absolute -- ip is correct */
+ fix_ip_or_call = 0;
+ break;
+ case 0x9a: /* call absolute - Fix return addr, not ip */
+ fix_ip_or_call = UPROBE_FIX_CALL;
+ break;
+ case 0xff:
+ switch (MODRM_REG(&insn)) {
+ case 2: case 3: /* call or lcall, indirect */
+ fix_ip_or_call = UPROBE_FIX_CALL;
+ break;
+ case 4: case 5: /* jmp or ljmp, indirect */
+ fix_ip_or_call = 0;
+ break;
+ }
+ /* fall through */
+ default:
+ riprel_analyze(auprobe, &insn);
+ }
+
+ auprobe->defparam.ilen = insn.length;
+ auprobe->defparam.fixups |= fix_ip_or_call;
+
+ auprobe->ops = &default_xol_ops;
+ return 0;
+}
+
+/*
+ * arch_uprobe_pre_xol - prepare to execute out of line.
+ * @auprobe: the probepoint information.
+ * @regs: reflects the saved user state of current task.
+ */
+int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (auprobe->ops->pre_xol) {
+ int err = auprobe->ops->pre_xol(auprobe, regs);
+ if (err)
+ return err;
+ }
+
+ regs->ip = utask->xol_vaddr;
+ utask->autask.saved_trap_nr = current->thread.trap_nr;
+ current->thread.trap_nr = UPROBE_TRAP_NR;
+
+ utask->autask.saved_tf = !!(regs->flags & X86_EFLAGS_TF);
+ regs->flags |= X86_EFLAGS_TF;
+ if (test_tsk_thread_flag(current, TIF_BLOCKSTEP))
+ set_task_blockstep(current, false);
+
+ return 0;
}
-#endif
/*
* If xol insn itself traps and generates a signal(Say,
@@ -572,53 +799,42 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t)
* single-step, we single-stepped a copy of the instruction.
*
* This function prepares to resume execution after the single-step.
- * We have to fix things up as follows:
- *
- * Typically, the new ip is relative to the copied instruction. We need
- * to make it relative to the original instruction (FIX_IP). Exceptions
- * are return instructions and absolute or indirect jump or call instructions.
- *
- * If the single-stepped instruction was a call, the return address that
- * is atop the stack is the address following the copied instruction. We
- * need to make it the address following the original instruction (FIX_CALL).
- *
- * If the original instruction was a rip-relative instruction such as
- * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
- * instruction using a scratch register -- e.g., "movl %edx,(%rax)".
- * We need to restore the contents of the scratch register and adjust
- * the ip, keeping in mind that the instruction we executed is 4 bytes
- * shorter than the original instruction (since we squeezed out the offset
- * field). (FIX_RIP_AX or FIX_RIP_CX)
*/
int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- struct uprobe_task *utask;
- long correction;
- int result = 0;
+ struct uprobe_task *utask = current->utask;
+ bool send_sigtrap = utask->autask.saved_tf;
+ int err = 0;
WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
-
- utask = current->utask;
current->thread.trap_nr = utask->autask.saved_trap_nr;
- correction = (long)(utask->vaddr - utask->xol_vaddr);
- handle_riprel_post_xol(auprobe, regs, &correction);
- if (auprobe->fixups & UPROBE_FIX_IP)
- regs->ip += correction;
-
- if (auprobe->fixups & UPROBE_FIX_CALL)
- result = adjust_ret_addr(regs->sp, correction);
+ if (auprobe->ops->post_xol) {
+ err = auprobe->ops->post_xol(auprobe, regs);
+ if (err) {
+ /*
+ * Restore ->ip for restart or post mortem analysis.
+ * ->post_xol() must not return -ERESTART unless this
+ * is really possible.
+ */
+ regs->ip = utask->vaddr;
+ if (err == -ERESTART)
+ err = 0;
+ send_sigtrap = false;
+ }
+ }
/*
* arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP
* so we can get an extra SIGTRAP if we do not clear TF. We need
* to examine the opcode to make it right.
*/
- if (utask->autask.saved_tf)
+ if (send_sigtrap)
send_sig(SIGTRAP, current, 0);
- else if (!(auprobe->fixups & UPROBE_FIX_SETF))
+
+ if (!utask->autask.saved_tf)
regs->flags &= ~X86_EFLAGS_TF;
- return result;
+ return err;
}
/* callback routine for handling exceptions. */
@@ -652,41 +868,27 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
/*
* This function gets called when XOL instruction either gets trapped or
- * the thread has a fatal signal, so reset the instruction pointer to its
- * probed address.
+ * the thread has a fatal signal. Reset the instruction pointer to its
+ * probed address for the potential restart or for post mortem analysis.
*/
void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
struct uprobe_task *utask = current->utask;
- current->thread.trap_nr = utask->autask.saved_trap_nr;
- handle_riprel_post_xol(auprobe, regs, NULL);
- instruction_pointer_set(regs, utask->vaddr);
+ if (auprobe->ops->abort)
+ auprobe->ops->abort(auprobe, regs);
+ current->thread.trap_nr = utask->autask.saved_trap_nr;
+ regs->ip = utask->vaddr;
/* clear TF if it was set by us in arch_uprobe_pre_xol() */
if (!utask->autask.saved_tf)
regs->flags &= ~X86_EFLAGS_TF;
}
-/*
- * Skip these instructions as per the currently known x86 ISA.
- * rep=0x66*; nop=0x90
- */
static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- int i;
-
- for (i = 0; i < MAX_UINSN_BYTES; i++) {
- if (auprobe->insn[i] == 0x66)
- continue;
-
- if (auprobe->insn[i] == 0x90) {
- regs->ip += i + 1;
- return true;
- }
-
- break;
- }
+ if (auprobe->ops->emulate)
+ return auprobe->ops->emulate(auprobe, regs);
return false;
}
@@ -701,23 +903,21 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
unsigned long
arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs)
{
- int rasize, ncopied;
+ int rasize = sizeof_long(), nleft;
unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */
- rasize = is_ia32_task() ? 4 : 8;
- ncopied = copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize);
- if (unlikely(ncopied))
+ if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize))
return -1;
/* check whether address has been already hijacked */
if (orig_ret_vaddr == trampoline_vaddr)
return orig_ret_vaddr;
- ncopied = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize);
- if (likely(!ncopied))
+ nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize);
+ if (likely(!nleft))
return orig_ret_vaddr;
- if (ncopied != rasize) {
+ if (nleft != rasize) {
pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, "
"%%ip=%#lx\n", current->pid, regs->sp, regs->ip);
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8b3b3eb3cead..e1e1e80fc6a6 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -81,17 +81,17 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
if (!show_unhandled_signals)
return;
- pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
- level, current->comm, task_pid_nr(current),
- message, regs->ip, regs->cs,
- regs->sp, regs->ax, regs->si, regs->di);
+ printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+ level, current->comm, task_pid_nr(current),
+ message, regs->ip, regs->cs,
+ regs->sp, regs->ax, regs->si, regs->di);
}
static int addr_to_vsyscall_nr(unsigned long addr)
{
int nr;
- if ((addr & ~0xC00UL) != VSYSCALL_START)
+ if ((addr & ~0xC00UL) != VSYSCALL_ADDR)
return -EINVAL;
nr = (addr & 0xC00UL) >> 10;
@@ -330,24 +330,17 @@ void __init map_vsyscall(void)
{
extern char __vsyscall_page;
unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
- unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
- __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall,
+ __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
vsyscall_mode == NATIVE
? PAGE_KERNEL_VSYSCALL
: PAGE_KERNEL_VVAR);
- BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) !=
- (unsigned long)VSYSCALL_START);
-
- __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
- BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
- (unsigned long)VVAR_ADDRESS);
+ BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
+ (unsigned long)VSYSCALL_ADDR);
}
static int __init vsyscall_init(void)
{
- BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE));
-
cpu_notifier_register_begin();
on_each_cpu(cpu_vsyscall_init, NULL, 1);
diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c
index 9531fbb123ba..c7d791f32b98 100644
--- a/arch/x86/kernel/vsyscall_gtod.c
+++ b/arch/x86/kernel/vsyscall_gtod.c
@@ -31,29 +31,30 @@ void update_vsyscall(struct timekeeper *tk)
gtod_write_begin(vdata);
/* copy vsyscall data */
- vdata->vclock_mode = tk->clock->archdata.vclock_mode;
- vdata->cycle_last = tk->clock->cycle_last;
- vdata->mask = tk->clock->mask;
- vdata->mult = tk->mult;
- vdata->shift = tk->shift;
+ vdata->vclock_mode = tk->tkr.clock->archdata.vclock_mode;
+ vdata->cycle_last = tk->tkr.cycle_last;
+ vdata->mask = tk->tkr.mask;
+ vdata->mult = tk->tkr.mult;
+ vdata->shift = tk->tkr.shift;
vdata->wall_time_sec = tk->xtime_sec;
- vdata->wall_time_snsec = tk->xtime_nsec;
+ vdata->wall_time_snsec = tk->tkr.xtime_nsec;
vdata->monotonic_time_sec = tk->xtime_sec
+ tk->wall_to_monotonic.tv_sec;
- vdata->monotonic_time_snsec = tk->xtime_nsec
+ vdata->monotonic_time_snsec = tk->tkr.xtime_nsec
+ ((u64)tk->wall_to_monotonic.tv_nsec
- << tk->shift);
+ << tk->tkr.shift);
while (vdata->monotonic_time_snsec >=
- (((u64)NSEC_PER_SEC) << tk->shift)) {
+ (((u64)NSEC_PER_SEC) << tk->tkr.shift)) {
vdata->monotonic_time_snsec -=
- ((u64)NSEC_PER_SEC) << tk->shift;
+ ((u64)NSEC_PER_SEC) << tk->tkr.shift;
vdata->monotonic_time_sec++;
}
vdata->wall_time_coarse_sec = tk->xtime_sec;
- vdata->wall_time_coarse_nsec = (long)(tk->xtime_nsec >> tk->shift);
+ vdata->wall_time_coarse_nsec = (long)(tk->tkr.xtime_nsec >>
+ tk->tkr.shift);
vdata->monotonic_time_coarse_sec =
vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 287e4c85fff9..f9d16ff56c6b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -27,6 +27,7 @@ config KVM
select MMU_NOTIFIER
select ANON_INODES
select HAVE_KVM_IRQCHIP
+ select HAVE_KVM_IRQFD
select HAVE_KVM_IRQ_ROUTING
select HAVE_KVM_EVENTFD
select KVM_APIC_ARCHITECTURE
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index f47a104a749c..38a0afe83c6b 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -283,6 +283,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
/* cpuid 1.ecx */
const u32 kvm_supported_word4_x86_features =
+ /* NOTE: MONITOR (and MWAIT) are emulated as NOP,
+ * but *not* advertised to guests via CPUID ! */
F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
0 /* DS-CPL, VMX, SMX, EST */ |
0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
@@ -495,6 +497,13 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->ecx &= kvm_supported_word6_x86_features;
cpuid_mask(&entry->ecx, 6);
break;
+ case 0x80000007: /* Advanced power management */
+ /* invariant TSC is CPUID.80000007H:EDX[8] */
+ entry->edx &= (1 << 8);
+ /* mask against host */
+ entry->edx &= boot_cpu_data.x86_power;
+ entry->eax = entry->ebx = entry->ecx = 0;
+ break;
case 0x80000008: {
unsigned g_phys_as = (entry->eax >> 16) & 0xff;
unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
@@ -525,7 +534,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
case 3: /* Processor serial number */
case 5: /* MONITOR/MWAIT */
case 6: /* Thermal management */
- case 0x80000007: /* Advanced power management */
case 0xC0000002:
case 0xC0000003:
case 0xC0000004:
@@ -726,6 +734,7 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
not_found:
return 36;
}
+EXPORT_SYMBOL_GPL(cpuid_maxphyaddr);
/*
* If no match is found, check whether we exceed the vCPU's limit
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index eeecbed26ac7..a5380590ab0e 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -88,4 +88,19 @@ static inline bool guest_cpuid_has_x2apic(struct kvm_vcpu *vcpu)
return best && (best->ecx & bit(X86_FEATURE_X2APIC));
}
+static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+ return best && (best->edx & bit(X86_FEATURE_GBPAGES));
+}
+
+static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ return best && (best->ebx & bit(X86_FEATURE_RTM));
+}
#endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 205b17eed93c..56657b0bb3bb 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -161,6 +161,11 @@
#define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */
#define NoWrite ((u64)1 << 45) /* No writeback */
#define SrcWrite ((u64)1 << 46) /* Write back src operand */
+#define NoMod ((u64)1 << 47) /* Mod field is ignored */
+#define Intercept ((u64)1 << 48) /* Has valid intercept field */
+#define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */
+#define NoBigReal ((u64)1 << 50) /* No big real mode */
+#define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */
#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
@@ -425,6 +430,7 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
.modrm_reg = ctxt->modrm_reg,
.modrm_rm = ctxt->modrm_rm,
.src_val = ctxt->src.val64,
+ .dst_val = ctxt->dst.val64,
.src_bytes = ctxt->src.bytes,
.dst_bytes = ctxt->dst.bytes,
.ad_bytes = ctxt->ad_bytes,
@@ -510,12 +516,6 @@ static u32 desc_limit_scaled(struct desc_struct *desc)
return desc->g ? (limit << 12) | 0xfff : limit;
}
-static void set_seg_override(struct x86_emulate_ctxt *ctxt, int seg)
-{
- ctxt->has_seg_override = true;
- ctxt->seg_override = seg;
-}
-
static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
{
if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
@@ -524,14 +524,6 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
return ctxt->ops->get_cached_segment_base(ctxt, seg);
}
-static unsigned seg_override(struct x86_emulate_ctxt *ctxt)
-{
- if (!ctxt->has_seg_override)
- return 0;
-
- return ctxt->seg_override;
-}
-
static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
u32 error, bool valid)
{
@@ -650,7 +642,12 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
if (!fetch && (desc.type & 8) && !(desc.type & 2))
goto bad;
lim = desc_limit_scaled(&desc);
- if ((desc.type & 8) || !(desc.type & 4)) {
+ if ((ctxt->mode == X86EMUL_MODE_REAL) && !fetch &&
+ (ctxt->d & NoBigReal)) {
+ /* la is between zero and 0xffff */
+ if (la > 0xffff || (u32)(la + size - 1) > 0xffff)
+ goto bad;
+ } else if ((desc.type & 8) || !(desc.type & 4)) {
/* expand-up segment */
if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
goto bad;
@@ -715,68 +712,71 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
}
/*
- * Fetch the next byte of the instruction being emulated which is pointed to
- * by ctxt->_eip, then increment ctxt->_eip.
- *
- * Also prefetch the remaining bytes of the instruction without crossing page
+ * Prefetch the remaining bytes of the instruction without crossing page
* boundary if they are not in fetch_cache yet.
*/
-static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, u8 *dest)
+static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size)
{
- struct fetch_cache *fc = &ctxt->fetch;
int rc;
- int size, cur_size;
-
- if (ctxt->_eip == fc->end) {
- unsigned long linear;
- struct segmented_address addr = { .seg = VCPU_SREG_CS,
- .ea = ctxt->_eip };
- cur_size = fc->end - fc->start;
- size = min(15UL - cur_size,
- PAGE_SIZE - offset_in_page(ctxt->_eip));
- rc = __linearize(ctxt, addr, size, false, true, &linear);
- if (unlikely(rc != X86EMUL_CONTINUE))
- return rc;
- rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size,
- size, &ctxt->exception);
- if (unlikely(rc != X86EMUL_CONTINUE))
- return rc;
- fc->end += size;
- }
- *dest = fc->data[ctxt->_eip - fc->start];
- ctxt->_eip++;
- return X86EMUL_CONTINUE;
-}
+ unsigned size;
+ unsigned long linear;
+ int cur_size = ctxt->fetch.end - ctxt->fetch.data;
+ struct segmented_address addr = { .seg = VCPU_SREG_CS,
+ .ea = ctxt->eip + cur_size };
+
+ size = 15UL ^ cur_size;
+ rc = __linearize(ctxt, addr, size, false, true, &linear);
+ if (unlikely(rc != X86EMUL_CONTINUE))
+ return rc;
-static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
- void *dest, unsigned size)
-{
- int rc;
+ size = min_t(unsigned, size, PAGE_SIZE - offset_in_page(linear));
- /* x86 instructions are limited to 15 bytes. */
- if (unlikely(ctxt->_eip + size - ctxt->eip > 15))
+ /*
+ * One instruction can only straddle two pages,
+ * and one has been loaded at the beginning of
+ * x86_decode_insn. So, if not enough bytes
+ * still, we must have hit the 15-byte boundary.
+ */
+ if (unlikely(size < op_size))
return X86EMUL_UNHANDLEABLE;
- while (size--) {
- rc = do_insn_fetch_byte(ctxt, dest++);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- }
+ rc = ctxt->ops->fetch(ctxt, linear, ctxt->fetch.end,
+ size, &ctxt->exception);
+ if (unlikely(rc != X86EMUL_CONTINUE))
+ return rc;
+ ctxt->fetch.end += size;
return X86EMUL_CONTINUE;
}
+static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt,
+ unsigned size)
+{
+ if (unlikely(ctxt->fetch.end - ctxt->fetch.ptr < size))
+ return __do_insn_fetch_bytes(ctxt, size);
+ else
+ return X86EMUL_CONTINUE;
+}
+
/* Fetch next part of the instruction being emulated. */
#define insn_fetch(_type, _ctxt) \
-({ unsigned long _x; \
- rc = do_insn_fetch(_ctxt, &_x, sizeof(_type)); \
+({ _type _x; \
+ \
+ rc = do_insn_fetch_bytes(_ctxt, sizeof(_type)); \
if (rc != X86EMUL_CONTINUE) \
goto done; \
- (_type)_x; \
+ ctxt->_eip += sizeof(_type); \
+ _x = *(_type __aligned(1) *) ctxt->fetch.ptr; \
+ ctxt->fetch.ptr += sizeof(_type); \
+ _x; \
})
#define insn_fetch_arr(_arr, _size, _ctxt) \
-({ rc = do_insn_fetch(_ctxt, _arr, (_size)); \
+({ \
+ rc = do_insn_fetch_bytes(_ctxt, _size); \
if (rc != X86EMUL_CONTINUE) \
goto done; \
+ ctxt->_eip += (_size); \
+ memcpy(_arr, ctxt->fetch.ptr, _size); \
+ ctxt->fetch.ptr += (_size); \
})
/*
@@ -1062,22 +1062,20 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
struct operand *op)
{
u8 sib;
- int index_reg = 0, base_reg = 0, scale;
+ int index_reg, base_reg, scale;
int rc = X86EMUL_CONTINUE;
ulong modrm_ea = 0;
- if (ctxt->rex_prefix) {
- ctxt->modrm_reg = (ctxt->rex_prefix & 4) << 1; /* REX.R */
- index_reg = (ctxt->rex_prefix & 2) << 2; /* REX.X */
- ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
- }
+ ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */
+ index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */
+ base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */
- ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
+ ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6;
ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
- ctxt->modrm_rm |= (ctxt->modrm & 0x07);
+ ctxt->modrm_rm = base_reg | (ctxt->modrm & 0x07);
ctxt->modrm_seg = VCPU_SREG_DS;
- if (ctxt->modrm_mod == 3) {
+ if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) {
op->type = OP_REG;
op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
op->addr.reg = decode_register(ctxt, ctxt->modrm_rm,
@@ -1092,7 +1090,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
if (ctxt->d & Mmx) {
op->type = OP_MM;
op->bytes = 8;
- op->addr.xmm = ctxt->modrm_rm & 7;
+ op->addr.mm = ctxt->modrm_rm & 7;
return rc;
}
fetch_register_operand(op);
@@ -1189,6 +1187,9 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
}
}
op->addr.mem.ea = modrm_ea;
+ if (ctxt->ad_bytes != 8)
+ ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea;
+
done:
return rc;
}
@@ -1219,12 +1220,14 @@ static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt)
long sv = 0, mask;
if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) {
- mask = ~(ctxt->dst.bytes * 8 - 1);
+ mask = ~((long)ctxt->dst.bytes * 8 - 1);
if (ctxt->src.bytes == 2)
sv = (s16)ctxt->src.val & (s16)mask;
else if (ctxt->src.bytes == 4)
sv = (s32)ctxt->src.val & (s32)mask;
+ else
+ sv = (s64)ctxt->src.val & (s64)mask;
ctxt->dst.addr.mem.ea += (sv >> 3);
}
@@ -1314,8 +1317,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
in_page = (ctxt->eflags & EFLG_DF) ?
offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) :
PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI));
- n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
- count);
+ n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count);
if (n == 0)
n = 1;
rc->pos = rc->end = 0;
@@ -1324,7 +1326,8 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
rc->end = n * size;
}
- if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) {
+ if (ctxt->rep_prefix && (ctxt->d & String) &&
+ !(ctxt->eflags & EFLG_DF)) {
ctxt->dst.data = rc->data + rc->pos;
ctxt->dst.type = OP_MEM_STR;
ctxt->dst.count = (rc->end - rc->pos) / size;
@@ -1356,17 +1359,19 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
u16 selector, struct desc_ptr *dt)
{
const struct x86_emulate_ops *ops = ctxt->ops;
+ u32 base3 = 0;
if (selector & 1 << 2) {
struct desc_struct desc;
u16 sel;
memset (dt, 0, sizeof *dt);
- if (!ops->get_segment(ctxt, &sel, &desc, NULL, VCPU_SREG_LDTR))
+ if (!ops->get_segment(ctxt, &sel, &desc, &base3,
+ VCPU_SREG_LDTR))
return;
dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
- dt->address = get_desc_base(&desc);
+ dt->address = get_desc_base(&desc) | ((u64)base3 << 32);
} else
ops->get_gdt(ctxt, dt);
}
@@ -1409,17 +1414,18 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
}
/* Does not support long mode */
-static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
- u16 selector, int seg)
+static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, int seg, u8 cpl, bool in_task_switch)
{
struct desc_struct seg_desc, old_desc;
- u8 dpl, rpl, cpl;
+ u8 dpl, rpl;
unsigned err_vec = GP_VECTOR;
u32 err_code = 0;
bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
ulong desc_addr;
int ret;
u16 dummy;
+ u32 base3 = 0;
memset(&seg_desc, 0, sizeof seg_desc);
@@ -1441,7 +1447,6 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
}
rpl = selector & 3;
- cpl = ctxt->ops->cpl(ctxt);
/* NULL selector is not valid for TR, CS and SS (except for long mode) */
if ((seg == VCPU_SREG_CS
@@ -1486,6 +1491,9 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
goto exception;
break;
case VCPU_SREG_CS:
+ if (in_task_switch && rpl != dpl)
+ goto exception;
+
if (!(seg_desc.type & 8))
goto exception;
@@ -1534,15 +1542,27 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
ret = write_segment_descriptor(ctxt, selector, &seg_desc);
if (ret != X86EMUL_CONTINUE)
return ret;
+ } else if (ctxt->mode == X86EMUL_MODE_PROT64) {
+ ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3,
+ sizeof(base3), &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
}
load:
- ctxt->ops->set_segment(ctxt, selector, &seg_desc, 0, seg);
+ ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg);
return X86EMUL_CONTINUE;
exception:
emulate_exception(ctxt, err_vec, err_code, true);
return X86EMUL_PROPAGATE_FAULT;
}
+static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, int seg)
+{
+ u8 cpl = ctxt->ops->cpl(ctxt);
+ return __load_segment_descriptor(ctxt, selector, seg, cpl, false);
+}
+
static void write_register_operand(struct operand *op)
{
/* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
@@ -1564,34 +1584,28 @@ static void write_register_operand(struct operand *op)
static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
{
- int rc;
-
switch (op->type) {
case OP_REG:
write_register_operand(op);
break;
case OP_MEM:
if (ctxt->lock_prefix)
- rc = segmented_cmpxchg(ctxt,
+ return segmented_cmpxchg(ctxt,
+ op->addr.mem,
+ &op->orig_val,
+ &op->val,
+ op->bytes);
+ else
+ return segmented_write(ctxt,
op->addr.mem,
- &op->orig_val,
&op->val,
op->bytes);
- else
- rc = segmented_write(ctxt,
- op->addr.mem,
- &op->val,
- op->bytes);
- if (rc != X86EMUL_CONTINUE)
- return rc;
break;
case OP_MEM_STR:
- rc = segmented_write(ctxt,
- op->addr.mem,
- op->data,
- op->bytes * op->count);
- if (rc != X86EMUL_CONTINUE)
- return rc;
+ return segmented_write(ctxt,
+ op->addr.mem,
+ op->data,
+ op->bytes * op->count);
break;
case OP_XMM:
write_sse_reg(ctxt, &op->vec_val, op->addr.xmm);
@@ -1660,7 +1674,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
return rc;
change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF
- | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID;
+ | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_AC | EFLG_ID;
switch(ctxt->mode) {
case X86EMUL_MODE_PROT64:
@@ -1743,6 +1757,9 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
+ if (ctxt->modrm_reg == VCPU_SREG_SS)
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+
rc = load_segment_descriptor(ctxt, (u16)selector, seg);
return rc;
}
@@ -1980,6 +1997,9 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
{
u64 old = ctxt->dst.orig_val64;
+ if (ctxt->dst.bytes == 16)
+ return X86EMUL_UNHANDLEABLE;
+
if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
*reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
@@ -2006,6 +2026,7 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
{
int rc;
unsigned long cs;
+ int cpl = ctxt->ops->cpl(ctxt);
rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
@@ -2015,6 +2036,9 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
+ /* Outer-privilege level return is not implemented */
+ if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl)
+ return X86EMUL_UNHANDLEABLE;
rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
return rc;
}
@@ -2033,8 +2057,10 @@ static int em_ret_far_imm(struct x86_emulate_ctxt *ctxt)
static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
{
/* Save real source value, then compare EAX against destination. */
+ ctxt->dst.orig_val = ctxt->dst.val;
+ ctxt->dst.val = reg_read(ctxt, VCPU_REGS_RAX);
ctxt->src.orig_val = ctxt->src.val;
- ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX);
+ ctxt->src.val = ctxt->dst.orig_val;
fastop(ctxt, em_cmp);
if (ctxt->eflags & EFLG_ZF) {
@@ -2044,6 +2070,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
/* Failure: write the value we saw to EAX. */
ctxt->dst.type = OP_REG;
ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ ctxt->dst.val = ctxt->dst.orig_val;
}
return X86EMUL_CONTINUE;
}
@@ -2183,7 +2210,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
*reg_write(ctxt, VCPU_REGS_RCX) = ctxt->_eip;
if (efer & EFER_LMA) {
#ifdef CONFIG_X86_64
- *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags & ~EFLG_RF;
+ *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags;
ops->get_msr(ctxt,
ctxt->mode == X86EMUL_MODE_PROT64 ?
@@ -2191,14 +2218,14 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
ctxt->_eip = msr_data;
ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
- ctxt->eflags &= ~(msr_data | EFLG_RF);
+ ctxt->eflags &= ~msr_data;
#endif
} else {
/* legacy mode */
ops->get_msr(ctxt, MSR_STAR, &msr_data);
ctxt->_eip = (u32)msr_data;
- ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
+ ctxt->eflags &= ~(EFLG_VM | EFLG_IF);
}
return X86EMUL_CONTINUE;
@@ -2247,7 +2274,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
break;
}
- ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
+ ctxt->eflags &= ~(EFLG_VM | EFLG_IF);
cs_sel = (u16)msr_data;
cs_sel &= ~SELECTOR_RPL_MASK;
ss_sel = cs_sel + 8;
@@ -2404,6 +2431,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
struct tss_segment_16 *tss)
{
int ret;
+ u8 cpl;
ctxt->_eip = tss->ip;
ctxt->eflags = tss->flag | 2;
@@ -2426,23 +2454,25 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
+ cpl = tss->cs & 3;
+
/*
* Now load segment descriptors. If fault happens at this stage
* it is handled in a context of new task
*/
- ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
+ ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, true);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
+ ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
+ ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
+ ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
+ ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true);
if (ret != X86EMUL_CONTINUE)
return ret;
@@ -2496,7 +2526,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
struct tss_segment_32 *tss)
{
- tss->cr3 = ctxt->ops->get_cr(ctxt, 3);
+ /* CR3 and ldt selector are not saved intentionally */
tss->eip = ctxt->_eip;
tss->eflags = ctxt->eflags;
tss->eax = reg_read(ctxt, VCPU_REGS_RAX);
@@ -2514,13 +2544,13 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS);
tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS);
- tss->ldt_selector = get_segment_selector(ctxt, VCPU_SREG_LDTR);
}
static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
struct tss_segment_32 *tss)
{
int ret;
+ u8 cpl;
if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
return emulate_gp(ctxt, 0);
@@ -2539,7 +2569,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
/*
* SDM says that segment selectors are loaded before segment
- * descriptors
+ * descriptors. This is important because CPL checks will
+ * use CS.RPL.
*/
set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
@@ -2553,43 +2584,38 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
* If we're switching between Protected Mode and VM86, we need to make
* sure to update the mode before loading the segment descriptors so
* that the selectors are interpreted correctly.
- *
- * Need to get rflags to the vcpu struct immediately because it
- * influences the CPL which is checked at least when loading the segment
- * descriptors and when pushing an error code to the new kernel stack.
- *
- * TODO Introduce a separate ctxt->ops->set_cpl callback
*/
- if (ctxt->eflags & X86_EFLAGS_VM)
+ if (ctxt->eflags & X86_EFLAGS_VM) {
ctxt->mode = X86EMUL_MODE_VM86;
- else
+ cpl = 3;
+ } else {
ctxt->mode = X86EMUL_MODE_PROT32;
-
- ctxt->ops->set_rflags(ctxt, ctxt->eflags);
+ cpl = tss->cs & 3;
+ }
/*
* Now load segment descriptors. If fault happenes at this stage
* it is handled in a context of new task
*/
- ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
+ ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, cpl, true);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
+ ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
+ ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
+ ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
+ ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS);
+ ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, true);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS);
+ ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, true);
if (ret != X86EMUL_CONTINUE)
return ret;
@@ -2604,6 +2630,8 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
struct tss_segment_32 tss_seg;
int ret;
u32 new_tss_base = get_desc_base(new_desc);
+ u32 eip_offset = offsetof(struct tss_segment_32, eip);
+ u32 ldt_sel_offset = offsetof(struct tss_segment_32, ldt_selector);
ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
@@ -2613,8 +2641,9 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
save_state_to_tss32(ctxt, &tss_seg);
- ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
- &ctxt->exception);
+ /* Only GP registers and segment selectors are saved */
+ ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip,
+ ldt_sel_offset - eip_offset, &ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
return ret;
@@ -2951,7 +2980,7 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
static int em_mov(struct x86_emulate_ctxt *ctxt)
{
- memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes);
+ memcpy(ctxt->dst.valptr, ctxt->src.valptr, sizeof(ctxt->src.valptr));
return X86EMUL_CONTINUE;
}
@@ -3208,7 +3237,8 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt)
static int em_smsw(struct x86_emulate_ctxt *ctxt)
{
- ctxt->dst.bytes = 2;
+ if (ctxt->dst.type == OP_MEM)
+ ctxt->dst.bytes = 2;
ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0);
return X86EMUL_CONTINUE;
}
@@ -3386,10 +3416,6 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
if (efer & EFER_LMA)
rsvd = CR3_L_MODE_RESERVED_BITS;
- else if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PAE)
- rsvd = CR3_PAE_RESERVED_BITS;
- else if (ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PG)
- rsvd = CR3_NONPAE_RESERVED_BITS;
if (new_val & rsvd)
return emulate_gp(ctxt, 0);
@@ -3487,7 +3513,7 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
u64 rcx = reg_read(ctxt, VCPU_REGS_RCX);
if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
- (rcx > 3))
+ ctxt->ops->check_pmc(ctxt, rcx))
return emulate_gp(ctxt, 0);
return X86EMUL_CONTINUE;
@@ -3512,9 +3538,9 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
}
#define D(_y) { .flags = (_y) }
-#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i }
-#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \
- .check_perm = (_p) }
+#define DI(_y, _i) { .flags = (_y)|Intercept, .intercept = x86_intercept_##_i }
+#define DIP(_y, _i, _p) { .flags = (_y)|Intercept|CheckPerm, \
+ .intercept = x86_intercept_##_i, .check_perm = (_p) }
#define N D(NotImpl)
#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
@@ -3523,10 +3549,10 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
#define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) }
#define II(_f, _e, _i) \
- { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
+ { .flags = (_f)|Intercept, .u.execute = (_e), .intercept = x86_intercept_##_i }
#define IIP(_f, _e, _i, _p) \
- { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i, \
- .check_perm = (_p) }
+ { .flags = (_f)|Intercept|CheckPerm, .u.execute = (_e), \
+ .intercept = x86_intercept_##_i, .check_perm = (_p) }
#define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) }
#define D2bv(_f) D((_f) | ByteOp), D(_f)
@@ -3625,8 +3651,8 @@ static const struct opcode group6[] = {
};
static const struct group_dual group7 = { {
- II(Mov | DstMem | Priv, em_sgdt, sgdt),
- II(Mov | DstMem | Priv, em_sidt, sidt),
+ II(Mov | DstMem, em_sgdt, sgdt),
+ II(Mov | DstMem, em_sidt, sidt),
II(SrcMem | Priv, em_lgdt, lgdt),
II(SrcMem | Priv, em_lidt, lidt),
II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
@@ -3869,10 +3895,12 @@ static const struct opcode twobyte_table[256] = {
N, N, N, N, N, N, N, N,
D(ImplicitOps | ModRM), N, N, N, N, N, N, D(ImplicitOps | ModRM),
/* 0x20 - 0x2F */
- DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read),
- DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read),
- IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write),
- IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write),
+ DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read),
+ DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read),
+ IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_cr_write, cr_write,
+ check_cr_write),
+ IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write,
+ check_dr_write),
N, N, N, N,
GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29),
GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29),
@@ -3888,7 +3916,7 @@ static const struct opcode twobyte_table[256] = {
N, N,
N, N, N, N, N, N, N, N,
/* 0x40 - 0x4F */
- X16(D(DstReg | SrcMem | ModRM | Mov)),
+ X16(D(DstReg | SrcMem | ModRM)),
/* 0x50 - 0x5F */
N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
/* 0x60 - 0x6F */
@@ -4050,12 +4078,12 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
mem_common:
*op = ctxt->memop;
ctxt->memopp = op;
- if ((ctxt->d & BitOp) && op == &ctxt->dst)
+ if (ctxt->d & BitOp)
fetch_bit_operand(ctxt);
op->orig_val = op->val;
break;
case OpMem64:
- ctxt->memop.bytes = 8;
+ ctxt->memop.bytes = (ctxt->op_bytes == 8) ? 16 : 8;
goto mem_common;
case OpAcc:
op->type = OP_REG;
@@ -4139,7 +4167,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
op->addr.mem.ea =
register_address(ctxt, reg_read(ctxt, VCPU_REGS_RSI));
- op->addr.mem.seg = seg_override(ctxt);
+ op->addr.mem.seg = ctxt->seg_override;
op->val = 0;
op->count = 1;
break;
@@ -4150,7 +4178,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
register_address(ctxt,
reg_read(ctxt, VCPU_REGS_RBX) +
(reg_read(ctxt, VCPU_REGS_RAX) & 0xff));
- op->addr.mem.seg = seg_override(ctxt);
+ op->addr.mem.seg = ctxt->seg_override;
op->val = 0;
break;
case OpImmFAddr:
@@ -4197,16 +4225,22 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
int mode = ctxt->mode;
int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
bool op_prefix = false;
+ bool has_seg_override = false;
struct opcode opcode;
ctxt->memop.type = OP_NONE;
ctxt->memopp = NULL;
ctxt->_eip = ctxt->eip;
- ctxt->fetch.start = ctxt->_eip;
- ctxt->fetch.end = ctxt->fetch.start + insn_len;
+ ctxt->fetch.ptr = ctxt->fetch.data;
+ ctxt->fetch.end = ctxt->fetch.data + insn_len;
ctxt->opcode_len = 1;
if (insn_len > 0)
memcpy(ctxt->fetch.data, insn, insn_len);
+ else {
+ rc = __do_insn_fetch_bytes(ctxt, 1);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ }
switch (mode) {
case X86EMUL_MODE_REAL:
@@ -4250,11 +4284,13 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
case 0x2e: /* CS override */
case 0x36: /* SS override */
case 0x3e: /* DS override */
- set_seg_override(ctxt, (ctxt->b >> 3) & 3);
+ has_seg_override = true;
+ ctxt->seg_override = (ctxt->b >> 3) & 3;
break;
case 0x64: /* FS override */
case 0x65: /* GS override */
- set_seg_override(ctxt, ctxt->b & 7);
+ has_seg_override = true;
+ ctxt->seg_override = ctxt->b & 7;
break;
case 0x40 ... 0x4f: /* REX */
if (mode != X86EMUL_MODE_PROT64)
@@ -4303,6 +4339,13 @@ done_prefixes:
if (ctxt->d & ModRM)
ctxt->modrm = insn_fetch(u8, ctxt);
+ /* vex-prefix instructions are not implemented */
+ if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) &&
+ (mode == X86EMUL_MODE_PROT64 ||
+ (mode >= X86EMUL_MODE_PROT16 && (ctxt->modrm & 0x80)))) {
+ ctxt->d = NotImpl;
+ }
+
while (ctxt->d & GroupMask) {
switch (ctxt->d & GroupMask) {
case Group:
@@ -4345,49 +4388,59 @@ done_prefixes:
ctxt->d |= opcode.flags;
}
- ctxt->execute = opcode.u.execute;
- ctxt->check_perm = opcode.check_perm;
- ctxt->intercept = opcode.intercept;
-
/* Unrecognised? */
- if (ctxt->d == 0 || (ctxt->d & NotImpl))
+ if (ctxt->d == 0)
return EMULATION_FAILED;
- if (!(ctxt->d & EmulateOnUD) && ctxt->ud)
- return EMULATION_FAILED;
+ ctxt->execute = opcode.u.execute;
- if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
- ctxt->op_bytes = 8;
+ if (unlikely(ctxt->d &
+ (NotImpl|EmulateOnUD|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm))) {
+ /*
+ * These are copied unconditionally here, and checked unconditionally
+ * in x86_emulate_insn.
+ */
+ ctxt->check_perm = opcode.check_perm;
+ ctxt->intercept = opcode.intercept;
+
+ if (ctxt->d & NotImpl)
+ return EMULATION_FAILED;
+
+ if (!(ctxt->d & EmulateOnUD) && ctxt->ud)
+ return EMULATION_FAILED;
- if (ctxt->d & Op3264) {
- if (mode == X86EMUL_MODE_PROT64)
+ if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
ctxt->op_bytes = 8;
- else
- ctxt->op_bytes = 4;
- }
- if (ctxt->d & Sse)
- ctxt->op_bytes = 16;
- else if (ctxt->d & Mmx)
- ctxt->op_bytes = 8;
+ if (ctxt->d & Op3264) {
+ if (mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
+ else
+ ctxt->op_bytes = 4;
+ }
+
+ if (ctxt->d & Sse)
+ ctxt->op_bytes = 16;
+ else if (ctxt->d & Mmx)
+ ctxt->op_bytes = 8;
+ }
/* ModRM and SIB bytes. */
if (ctxt->d & ModRM) {
rc = decode_modrm(ctxt, &ctxt->memop);
- if (!ctxt->has_seg_override)
- set_seg_override(ctxt, ctxt->modrm_seg);
+ if (!has_seg_override) {
+ has_seg_override = true;
+ ctxt->seg_override = ctxt->modrm_seg;
+ }
} else if (ctxt->d & MemAbs)
rc = decode_abs(ctxt, &ctxt->memop);
if (rc != X86EMUL_CONTINUE)
goto done;
- if (!ctxt->has_seg_override)
- set_seg_override(ctxt, VCPU_SREG_DS);
-
- ctxt->memop.addr.mem.seg = seg_override(ctxt);
+ if (!has_seg_override)
+ ctxt->seg_override = VCPU_SREG_DS;
- if (ctxt->memop.type == OP_MEM && ctxt->ad_bytes != 8)
- ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea;
+ ctxt->memop.addr.mem.seg = ctxt->seg_override;
/*
* Decode and fetch the source operand: register, memory
@@ -4409,7 +4462,7 @@ done_prefixes:
rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask);
done:
- if (ctxt->memopp && ctxt->memopp->type == OP_MEM && ctxt->rip_relative)
+ if (ctxt->rip_relative)
ctxt->memopp->addr.mem.ea += ctxt->_eip;
return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
@@ -4484,6 +4537,16 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
return X86EMUL_CONTINUE;
}
+void init_decode_cache(struct x86_emulate_ctxt *ctxt)
+{
+ memset(&ctxt->rip_relative, 0,
+ (void *)&ctxt->modrm - (void *)&ctxt->rip_relative);
+
+ ctxt->io_read.pos = 0;
+ ctxt->io_read.end = 0;
+ ctxt->mem_read.end = 0;
+}
+
int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
{
const struct x86_emulate_ops *ops = ctxt->ops;
@@ -4492,12 +4555,6 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
ctxt->mem_read.pos = 0;
- if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
- (ctxt->d & Undefined)) {
- rc = emulate_ud(ctxt);
- goto done;
- }
-
/* LOCK prefix is allowed only with some instructions */
if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) {
rc = emulate_ud(ctxt);
@@ -4509,69 +4566,82 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
- if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)))
- || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
- rc = emulate_ud(ctxt);
- goto done;
- }
-
- if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
- rc = emulate_nm(ctxt);
- goto done;
- }
+ if (unlikely(ctxt->d &
+ (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) {
+ if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
+ (ctxt->d & Undefined)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
- if (ctxt->d & Mmx) {
- rc = flush_pending_x87_faults(ctxt);
- if (rc != X86EMUL_CONTINUE)
+ if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)))
+ || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
+ rc = emulate_ud(ctxt);
goto done;
- /*
- * Now that we know the fpu is exception safe, we can fetch
- * operands from it.
- */
- fetch_possible_mmx_operand(ctxt, &ctxt->src);
- fetch_possible_mmx_operand(ctxt, &ctxt->src2);
- if (!(ctxt->d & Mov))
- fetch_possible_mmx_operand(ctxt, &ctxt->dst);
- }
+ }
- if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
- rc = emulator_check_intercept(ctxt, ctxt->intercept,
- X86_ICPT_PRE_EXCEPT);
- if (rc != X86EMUL_CONTINUE)
+ if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
+ rc = emulate_nm(ctxt);
goto done;
- }
+ }
- /* Privileged instruction can be executed only in CPL=0 */
- if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
- rc = emulate_gp(ctxt, 0);
- goto done;
- }
+ if (ctxt->d & Mmx) {
+ rc = flush_pending_x87_faults(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ /*
+ * Now that we know the fpu is exception safe, we can fetch
+ * operands from it.
+ */
+ fetch_possible_mmx_operand(ctxt, &ctxt->src);
+ fetch_possible_mmx_operand(ctxt, &ctxt->src2);
+ if (!(ctxt->d & Mov))
+ fetch_possible_mmx_operand(ctxt, &ctxt->dst);
+ }
- /* Instruction can only be executed in protected mode */
- if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
- rc = emulate_ud(ctxt);
- goto done;
- }
+ if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_PRE_EXCEPT);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
- /* Do instruction specific permission checks */
- if (ctxt->check_perm) {
- rc = ctxt->check_perm(ctxt);
- if (rc != X86EMUL_CONTINUE)
+ /* Privileged instruction can be executed only in CPL=0 */
+ if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
+ if (ctxt->d & PrivUD)
+ rc = emulate_ud(ctxt);
+ else
+ rc = emulate_gp(ctxt, 0);
goto done;
- }
+ }
- if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
- rc = emulator_check_intercept(ctxt, ctxt->intercept,
- X86_ICPT_POST_EXCEPT);
- if (rc != X86EMUL_CONTINUE)
+ /* Instruction can only be executed in protected mode */
+ if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
+ rc = emulate_ud(ctxt);
goto done;
- }
+ }
- if (ctxt->rep_prefix && (ctxt->d & String)) {
- /* All REP prefixes have the same first termination condition */
- if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
- ctxt->eip = ctxt->_eip;
- goto done;
+ /* Do instruction specific permission checks */
+ if (ctxt->d & CheckPerm) {
+ rc = ctxt->check_perm(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_POST_EXCEPT);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (ctxt->rep_prefix && (ctxt->d & String)) {
+ /* All REP prefixes have the same first termination condition */
+ if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
+ ctxt->eip = ctxt->_eip;
+ ctxt->eflags &= ~EFLG_RF;
+ goto done;
+ }
}
}
@@ -4605,13 +4675,18 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
special_insn:
- if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
+ if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_POST_MEMACCESS);
if (rc != X86EMUL_CONTINUE)
goto done;
}
+ if (ctxt->rep_prefix && (ctxt->d & String))
+ ctxt->eflags |= EFLG_RF;
+ else
+ ctxt->eflags &= ~EFLG_RF;
+
if (ctxt->execute) {
if (ctxt->d & Fastop) {
void (*fop)(struct fastop *) = (void *)ctxt->execute;
@@ -4646,8 +4721,9 @@ special_insn:
break;
case 0x90 ... 0x97: /* nop / xchg reg, rax */
if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX))
- break;
- rc = em_xchg(ctxt);
+ ctxt->dst.type = OP_NONE;
+ else
+ rc = em_xchg(ctxt);
break;
case 0x98: /* cbw/cwde/cdqe */
switch (ctxt->op_bytes) {
@@ -4698,17 +4774,17 @@ special_insn:
goto done;
writeback:
- if (!(ctxt->d & NoWrite)) {
- rc = writeback(ctxt, &ctxt->dst);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- }
if (ctxt->d & SrcWrite) {
BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR);
rc = writeback(ctxt, &ctxt->src);
if (rc != X86EMUL_CONTINUE)
goto done;
}
+ if (!(ctxt->d & NoWrite)) {
+ rc = writeback(ctxt, &ctxt->dst);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
/*
* restore dst type in case the decoding will be reused
@@ -4750,6 +4826,7 @@ writeback:
}
goto done; /* skip rip writeback */
}
+ ctxt->eflags &= ~EFLG_RF;
}
ctxt->eip = ctxt->_eip;
@@ -4782,8 +4859,10 @@ twobyte_insn:
ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
break;
case 0x40 ... 0x4f: /* cmov */
- ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val;
- if (!test_cc(ctxt->b, ctxt->eflags))
+ if (test_cc(ctxt->b, ctxt->eflags))
+ ctxt->dst.val = ctxt->src.val;
+ else if (ctxt->mode != X86EMUL_MODE_PROT64 ||
+ ctxt->op_bytes != 4)
ctxt->dst.type = OP_NONE; /* no writeback */
break;
case 0x80 ... 0x8f: /* jnz rel, etc*/
@@ -4807,8 +4886,8 @@ twobyte_insn:
break;
case 0xc3: /* movnti */
ctxt->dst.bytes = ctxt->op_bytes;
- ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val :
- (u64) ctxt->src.val;
+ ctxt->dst.val = (ctxt->op_bytes == 8) ? (u64) ctxt->src.val :
+ (u32) ctxt->src.val;
break;
default:
goto cannot_emulate;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 484bc874688b..a1ec6a50a05a 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -108,11 +108,12 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
vector = kvm_cpu_get_extint(v);
- if (kvm_apic_vid_enabled(v->kvm) || vector != -1)
+ if (vector != -1)
return vector; /* PIC */
return kvm_get_apic_interrupt(v); /* APIC */
}
+EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9736529ade08..08e8a899e005 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -352,31 +352,90 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
{
- apic->irr_pending = false;
+ struct kvm_vcpu *vcpu;
+
+ vcpu = apic->vcpu;
+
apic_clear_vector(vec, apic->regs + APIC_IRR);
- if (apic_search_irr(apic) != -1)
- apic->irr_pending = true;
+ if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+ /* try to update RVI */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ else {
+ vec = apic_search_irr(apic);
+ apic->irr_pending = (vec != -1);
+ }
}
static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
{
- if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
+ struct kvm_vcpu *vcpu;
+
+ if (__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
+ return;
+
+ vcpu = apic->vcpu;
+
+ /*
+ * With APIC virtualization enabled, all caching is disabled
+ * because the processor can modify ISR under the hood. Instead
+ * just set SVI.
+ */
+ if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+ kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec);
+ else {
++apic->isr_count;
- BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
+ BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
+ /*
+ * ISR (in service register) bit is set when injecting an interrupt.
+ * The highest vector is injected. Thus the latest bit set matches
+ * the highest bit in ISR.
+ */
+ apic->highest_isr_cache = vec;
+ }
+}
+
+static inline int apic_find_highest_isr(struct kvm_lapic *apic)
+{
+ int result;
+
/*
- * ISR (in service register) bit is set when injecting an interrupt.
- * The highest vector is injected. Thus the latest bit set matches
- * the highest bit in ISR.
+ * Note that isr_count is always 1, and highest_isr_cache
+ * is always -1, with APIC virtualization enabled.
*/
- apic->highest_isr_cache = vec;
+ if (!apic->isr_count)
+ return -1;
+ if (likely(apic->highest_isr_cache != -1))
+ return apic->highest_isr_cache;
+
+ result = find_highest_vector(apic->regs + APIC_ISR);
+ ASSERT(result == -1 || result >= 16);
+
+ return result;
}
static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
{
- if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
+ struct kvm_vcpu *vcpu;
+ if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
+ return;
+
+ vcpu = apic->vcpu;
+
+ /*
+ * We do get here for APIC virtualization enabled if the guest
+ * uses the Hyper-V APIC enlightenment. In this case we may need
+ * to trigger a new interrupt delivery by writing the SVI field;
+ * on the other hand isr_count and highest_isr_cache are unused
+ * and must be left alone.
+ */
+ if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+ kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
+ apic_find_highest_isr(apic));
+ else {
--apic->isr_count;
- BUG_ON(apic->isr_count < 0);
- apic->highest_isr_cache = -1;
+ BUG_ON(apic->isr_count < 0);
+ apic->highest_isr_cache = -1;
+ }
}
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
@@ -456,22 +515,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
}
-static inline int apic_find_highest_isr(struct kvm_lapic *apic)
-{
- int result;
-
- /* Note that isr_count is always 1 with vid enabled */
- if (!apic->isr_count)
- return -1;
- if (likely(apic->highest_isr_cache != -1))
- return apic->highest_isr_cache;
-
- result = find_highest_vector(apic->regs + APIC_ISR);
- ASSERT(result == -1 || result >= 16);
-
- return result;
-}
-
void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr)
{
struct kvm_lapic *apic = vcpu->arch.apic;
@@ -1429,7 +1472,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
vcpu->arch.apic_arb_prio = 0;
vcpu->arch.apic_attention = 0;
- apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
+ apic_debug("%s: vcpu=%p, id=%d, base_msr="
"0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
vcpu, kvm_apic_id(apic),
vcpu->arch.apic_base, apic->base_address);
@@ -1608,6 +1651,13 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
if (vector == -1)
return -1;
+ /*
+ * We get here even with APIC virtualization enabled, if doing
+ * nested virtualization and L1 runs with the "acknowledge interrupt
+ * on exit" mode. Then we cannot inject the interrupt via RVI,
+ * because the process would deliver it through the IDT.
+ */
+
apic_set_isr(vector, apic);
apic_update_ppr(apic);
apic_clear_irr(vector, apic);
@@ -1871,7 +1921,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
/* evaluate pending_events before reading the vector */
smp_rmb();
sipi_vector = apic->sipi_vector;
- pr_debug("vcpu %d received sipi with vector # %x\n",
+ apic_debug("vcpu %d received sipi with vector # %x\n",
vcpu->vcpu_id, sipi_vector);
kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector);
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 813d31038b93..931467881da7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -22,6 +22,7 @@
#include "mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
+#include "cpuid.h"
#include <linux/kvm_host.h>
#include <linux/types.h>
@@ -595,7 +596,8 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
* we always atomicly update it, see the comments in
* spte_has_volatile_bits().
*/
- if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
+ if (spte_is_locklessly_modifiable(old_spte) &&
+ !is_writable_pte(new_spte))
ret = true;
if (!shadow_accessed_mask)
@@ -1176,8 +1178,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
/*
* Write-protect on the specified @sptep, @pt_protect indicates whether
- * spte writ-protection is caused by protecting shadow page table.
- * @flush indicates whether tlb need be flushed.
+ * spte write-protection is caused by protecting shadow page table.
*
* Note: write protection is difference between drity logging and spte
* protection:
@@ -1186,10 +1187,9 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
* - for spte protection, the spte can be writable only after unsync-ing
* shadow page.
*
- * Return true if the spte is dropped.
+ * Return true if tlb need be flushed.
*/
-static bool
-spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
+static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
{
u64 spte = *sptep;
@@ -1199,17 +1199,11 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
- if (__drop_large_spte(kvm, sptep)) {
- *flush |= true;
- return true;
- }
-
if (pt_protect)
spte &= ~SPTE_MMU_WRITEABLE;
spte = spte & ~PT_WRITABLE_MASK;
- *flush |= mmu_spte_update(sptep, spte);
- return false;
+ return mmu_spte_update(sptep, spte);
}
static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
@@ -1221,11 +1215,8 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
BUG_ON(!(*sptep & PT_PRESENT_MASK));
- if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {
- sptep = rmap_get_first(*rmapp, &iter);
- continue;
- }
+ flush |= spte_write_protect(kvm, sptep, pt_protect);
sptep = rmap_get_next(&iter);
}
@@ -2802,9 +2793,9 @@ static bool page_fault_can_be_fast(u32 error_code)
}
static bool
-fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte)
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ u64 *sptep, u64 spte)
{
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
gfn_t gfn;
WARN_ON(!sp->role.direct);
@@ -2830,6 +2821,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
u32 error_code)
{
struct kvm_shadow_walk_iterator iterator;
+ struct kvm_mmu_page *sp;
bool ret = false;
u64 spte = 0ull;
@@ -2853,7 +2845,8 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
goto exit;
}
- if (!is_last_spte(spte, level))
+ sp = page_header(__pa(iterator.sptep));
+ if (!is_last_spte(spte, sp->role.level))
goto exit;
/*
@@ -2875,11 +2868,24 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
goto exit;
/*
+ * Do not fix write-permission on the large spte since we only dirty
+ * the first page into the dirty-bitmap in fast_pf_fix_direct_spte()
+ * that means other pages are missed if its slot is dirty-logged.
+ *
+ * Instead, we let the slow page fault path create a normal spte to
+ * fix the access.
+ *
+ * See the comments in kvm_arch_commit_memory_region().
+ */
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ goto exit;
+
+ /*
* Currently, fast page fault only works for direct mapping since
* the gfn is not stable for indirect shadow page.
* See Documentation/virtual/kvm/locking.txt to get more detail.
*/
- ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte);
+ ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte);
exit:
trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
spte, ret);
@@ -3511,11 +3517,14 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
{
int maxphyaddr = cpuid_maxphyaddr(vcpu);
u64 exb_bit_rsvd = 0;
+ u64 gbpages_bit_rsvd = 0;
context->bad_mt_xwr = 0;
if (!context->nx)
exb_bit_rsvd = rsvd_bits(63, 63);
+ if (!guest_cpuid_has_gbpages(vcpu))
+ gbpages_bit_rsvd = rsvd_bits(7, 7);
switch (context->root_level) {
case PT32_ROOT_LEVEL:
/* no rsvd bits for 2 level 4K page table entries */
@@ -3538,7 +3547,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
case PT32E_ROOT_LEVEL:
context->rsvd_bits_mask[0][2] =
rsvd_bits(maxphyaddr, 63) |
- rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */
+ rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */
context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 62); /* PDE */
context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
@@ -3550,16 +3559,16 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
break;
case PT64_ROOT_LEVEL:
context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7);
context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+ gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51);
context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51);
context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51);
context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) |
+ gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
rsvd_bits(13, 29);
context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51) |
@@ -4304,15 +4313,32 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
if (*rmapp)
__rmap_write_protect(kvm, rmapp, false);
- if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
- kvm_flush_remote_tlbs(kvm);
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock))
cond_resched_lock(&kvm->mmu_lock);
- }
}
}
- kvm_flush_remote_tlbs(kvm);
spin_unlock(&kvm->mmu_lock);
+
+ /*
+ * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
+ * which do tlb flush out of mmu-lock should be serialized by
+ * kvm->slots_lock otherwise tlb flush would be missed.
+ */
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /*
+ * We can flush all the TLBs out of the mmu lock without TLB
+ * corruption since we just change the spte from writable to
+ * readonly so that we only need to care the case of changing
+ * spte from present to present (changing the spte from present
+ * to nonpresent will flush all the TLBs immediately), in other
+ * words, the only case we care is mmu_spte_update() where we
+ * haved checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
+ * instead of PT_WRITABLE_MASK, that means it does not depend
+ * on PT_WRITABLE_MASK anymore.
+ */
+ kvm_flush_remote_tlbs(kvm);
}
#define BATCH_ZAP_PAGES 10
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 3842e70bdb7c..b982112d2ca5 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -104,6 +104,39 @@ static inline int is_present_gpte(unsigned long pte)
return pte & PT_PRESENT_MASK;
}
+/*
+ * Currently, we have two sorts of write-protection, a) the first one
+ * write-protects guest page to sync the guest modification, b) another one is
+ * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences
+ * between these two sorts are:
+ * 1) the first case clears SPTE_MMU_WRITEABLE bit.
+ * 2) the first case requires flushing tlb immediately avoiding corrupting
+ * shadow page table between all vcpus so it should be in the protection of
+ * mmu-lock. And the another case does not need to flush tlb until returning
+ * the dirty bitmap to userspace since it only write-protects the page
+ * logged in the bitmap, that means the page in the dirty bitmap is not
+ * missed, so it can flush tlb out of mmu-lock.
+ *
+ * So, there is the problem: the first case can meet the corrupted tlb caused
+ * by another case which write-protects pages but without flush tlb
+ * immediately. In order to making the first case be aware this problem we let
+ * it flush tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE bit
+ * is set, it works since another case never touches SPTE_MMU_WRITEABLE bit.
+ *
+ * Anyway, whenever a spte is updated (only permission and status bits are
+ * changed) we need to check whether the spte with SPTE_MMU_WRITEABLE becomes
+ * readonly, if that happens, we need to flush tlb. Fortunately,
+ * mmu_spte_update() has already handled it perfectly.
+ *
+ * The rules to use SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK:
+ * - if we want to see if it has writable tlb entry or if the spte can be
+ * writable on the mmu mapping, check SPTE_MMU_WRITEABLE, this is the most
+ * case, otherwise
+ * - if we fix page fault on the spte or do write-protection by dirty logging,
+ * check PT_WRITABLE_MASK.
+ *
+ * TODO: introduce APIs to split these two cases.
+ */
static inline int is_writable_pte(unsigned long pte)
{
return pte & PT_WRITABLE_MASK;
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 1185fe7a7f47..9ade5cfb5a4c 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -273,7 +273,7 @@ static int mmu_audit_set(const char *val, const struct kernel_param *kp)
int ret;
unsigned long enable;
- ret = strict_strtoul(val, 10, &enable);
+ ret = kstrtoul(val, 10, &enable);
if (ret < 0)
return -EINVAL;
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 9d2e0ffcb190..5aaf35641768 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -22,7 +22,7 @@
__entry->unsync = sp->unsync;
#define KVM_MMU_PAGE_PRINTK() ({ \
- const char *ret = p->buffer + p->len; \
+ const u32 saved_len = p->len; \
static const char *access_str[] = { \
"---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \
}; \
@@ -41,7 +41,7 @@
role.nxe ? "" : "!", \
__entry->root_count, \
__entry->unsync ? "unsync" : "sync", 0); \
- ret; \
+ p->buffer + saved_len; \
})
#define kvm_mmu_trace_pferr_flags \
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 123efd3ec29f..410776528265 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -913,8 +913,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
* and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
* used by guest then tlbs are not flushed, so guest is allowed to access the
* freed pages.
- * We set tlbs_dirty to let the notifier know this change and delay the flush
- * until such a case actually happens.
+ * And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
*/
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
@@ -943,7 +942,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
return -EINVAL;
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
- vcpu->kvm->tlbs_dirty = true;
+ vcpu->kvm->tlbs_dirty++;
continue;
}
@@ -958,7 +957,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
if (gfn != sp->gfns[i]) {
drop_spte(vcpu->kvm, &sp->spt[i]);
- vcpu->kvm->tlbs_dirty = true;
+ vcpu->kvm->tlbs_dirty++;
continue;
}
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 5c4f63151b4d..3dd6accb64ec 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -108,7 +108,10 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
{
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
- __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+ if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) {
+ __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+ kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+ }
}
static void kvm_perf_overflow_intr(struct perf_event *perf_event,
@@ -117,7 +120,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event,
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) {
- kvm_perf_overflow(perf_event, data, regs);
+ __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
/*
* Inject PMI. If vcpu was in a guest mode during NMI PMI
@@ -425,6 +428,15 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
}
+int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc)
+{
+ struct kvm_pmu *pmu = &vcpu->arch.pmu;
+ bool fixed = pmc & (1u << 30);
+ pmc &= ~(3u << 30);
+ return (!fixed && pmc >= pmu->nr_arch_gp_counters) ||
+ (fixed && pmc >= pmu->nr_arch_fixed_counters);
+}
+
int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
{
struct kvm_pmu *pmu = &vcpu->arch.pmu;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7f4f9c2badae..ddf742768ecf 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -486,14 +486,14 @@ static int is_external_interrupt(u32 info)
return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
}
-static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
u32 ret = 0;
if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
- ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
- return ret & mask;
+ ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
+ return ret;
}
static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
@@ -1338,21 +1338,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
}
-static void svm_update_cpl(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- int cpl;
-
- if (!is_protmode(vcpu))
- cpl = 0;
- else if (svm->vmcb->save.rflags & X86_EFLAGS_VM)
- cpl = 3;
- else
- cpl = svm->vmcb->save.cs.selector & 0x3;
-
- svm->vmcb->save.cpl = cpl;
-}
-
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
{
return to_svm(vcpu)->vmcb->save.rflags;
@@ -1360,11 +1345,12 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
- unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags;
-
+ /*
+ * Any change of EFLAGS.VM is accompained by a reload of SS
+ * (caused by either a task switch or an inter-privilege IRET),
+ * so we do not need to update the CPL here.
+ */
to_svm(vcpu)->vmcb->save.rflags = rflags;
- if ((old_rflags ^ rflags) & X86_EFLAGS_VM)
- svm_update_cpl(vcpu);
}
static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
@@ -1429,7 +1415,16 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
- var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
+
+ /*
+ * AMD CPUs circa 2014 track the G bit for all segments except CS.
+ * However, the SVM spec states that the G bit is not observed by the
+ * CPU, and some VMware virtual CPUs drop the G bit for all segments.
+ * So let's synthesize a legal G bit for all segments, this helps
+ * running KVM nested. It also helps cross-vendor migration, because
+ * Intel's vmentry has a check on the 'G' bit.
+ */
+ var->g = s->limit > 0xfffff;
/*
* AMD's VMCB does not have an explicit unusable field, so emulate it
@@ -1438,14 +1433,6 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
var->unusable = !var->present || (var->type == 0);
switch (seg) {
- case VCPU_SREG_CS:
- /*
- * SVM always stores 0 for the 'G' bit in the CS selector in
- * the VMCB on a VMEXIT. This hurts cross-vendor migration:
- * Intel's VMENTRY has a check on the 'G' bit.
- */
- var->g = s->limit > 0xfffff;
- break;
case VCPU_SREG_TR:
/*
* Work around a bug where the busy flag in the tr selector
@@ -1476,6 +1463,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
*/
if (var->unusable)
var->db = 0;
+ var->dpl = to_svm(vcpu)->vmcb->save.cpl;
break;
}
}
@@ -1631,8 +1619,15 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
}
- if (seg == VCPU_SREG_CS)
- svm_update_cpl(vcpu);
+
+ /*
+ * This is always accurate, except if SYSRET returned to a segment
+ * with SS.DPL != 3. Intel does not have this quirk, and always
+ * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
+ * would entail passing the CPL to userspace and back.
+ */
+ if (seg == VCPU_SREG_SS)
+ svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
mark_dirty(svm->vmcb, VMCB_SEG);
}
@@ -2122,22 +2117,27 @@ static void nested_svm_unmap(struct page *page)
static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
{
- unsigned port;
- u8 val, bit;
+ unsigned port, size, iopm_len;
+ u16 val, mask;
+ u8 start_bit;
u64 gpa;
if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
return NESTED_EXIT_HOST;
port = svm->vmcb->control.exit_info_1 >> 16;
+ size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
+ SVM_IOIO_SIZE_SHIFT;
gpa = svm->nested.vmcb_iopm + (port / 8);
- bit = port % 8;
- val = 0;
+ start_bit = port % 8;
+ iopm_len = (start_bit + size > 8) ? 2 : 1;
+ mask = (0xf >> (4 - size)) << start_bit;
+ val = 0;
- if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1))
- val &= (1 << bit);
+ if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, iopm_len))
+ return NESTED_EXIT_DONE;
- return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+ return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}
static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
@@ -2770,12 +2770,6 @@ static int xsetbv_interception(struct vcpu_svm *svm)
return 1;
}
-static int invalid_op_interception(struct vcpu_svm *svm)
-{
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
- return 1;
-}
-
static int task_switch_interception(struct vcpu_svm *svm)
{
u16 tss_selector;
@@ -3287,6 +3281,24 @@ static int pause_interception(struct vcpu_svm *svm)
return 1;
}
+static int nop_interception(struct vcpu_svm *svm)
+{
+ skip_emulated_instruction(&(svm->vcpu));
+ return 1;
+}
+
+static int monitor_interception(struct vcpu_svm *svm)
+{
+ printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
+ return nop_interception(svm);
+}
+
+static int mwait_interception(struct vcpu_svm *svm)
+{
+ printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
+ return nop_interception(svm);
+}
+
static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_READ_CR0] = cr_interception,
[SVM_EXIT_READ_CR3] = cr_interception,
@@ -3344,8 +3356,8 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_CLGI] = clgi_interception,
[SVM_EXIT_SKINIT] = skinit_interception,
[SVM_EXIT_WBINVD] = emulate_on_interception,
- [SVM_EXIT_MONITOR] = invalid_op_interception,
- [SVM_EXIT_MWAIT] = invalid_op_interception,
+ [SVM_EXIT_MONITOR] = monitor_interception,
+ [SVM_EXIT_MWAIT] = mwait_interception,
[SVM_EXIT_XSETBV] = xsetbv_interception,
[SVM_EXIT_NPF] = pf_interception,
};
@@ -4199,7 +4211,8 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
if (info->intercept == x86_intercept_cr_write)
icpt_info.exit_code += info->modrm_reg;
- if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0)
+ if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
+ info->intercept == x86_intercept_clts)
break;
intercept = svm->nested.intercept;
@@ -4244,14 +4257,14 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
u64 exit_info;
u32 bytes;
- exit_info = (vcpu->arch.regs[VCPU_REGS_RDX] & 0xffff) << 16;
-
if (info->intercept == x86_intercept_in ||
info->intercept == x86_intercept_ins) {
- exit_info |= SVM_IOIO_TYPE_MASK;
- bytes = info->src_bytes;
- } else {
+ exit_info = ((info->src_val & 0xffff) << 16) |
+ SVM_IOIO_TYPE_MASK;
bytes = info->dst_bytes;
+ } else {
+ exit_info = (info->dst_val & 0xffff) << 16;
+ bytes = info->src_bytes;
}
if (info->intercept == x86_intercept_outs ||
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 545245d7cc63..e850a7d332be 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -91,16 +91,21 @@ TRACE_EVENT(kvm_hv_hypercall,
/*
* Tracepoint for PIO.
*/
+
+#define KVM_PIO_IN 0
+#define KVM_PIO_OUT 1
+
TRACE_EVENT(kvm_pio,
TP_PROTO(unsigned int rw, unsigned int port, unsigned int size,
- unsigned int count),
- TP_ARGS(rw, port, size, count),
+ unsigned int count, void *data),
+ TP_ARGS(rw, port, size, count, data),
TP_STRUCT__entry(
__field( unsigned int, rw )
__field( unsigned int, port )
__field( unsigned int, size )
__field( unsigned int, count )
+ __field( unsigned int, val )
),
TP_fast_assign(
@@ -108,11 +113,18 @@ TRACE_EVENT(kvm_pio,
__entry->port = port;
__entry->size = size;
__entry->count = count;
+ if (size == 1)
+ __entry->val = *(unsigned char *)data;
+ else if (size == 2)
+ __entry->val = *(unsigned short *)data;
+ else
+ __entry->val = *(unsigned int *)data;
),
- TP_printk("pio_%s at 0x%x size %d count %d",
+ TP_printk("pio_%s at 0x%x size %d count %d val 0x%x %s",
__entry->rw ? "write" : "read",
- __entry->port, __entry->size, __entry->count)
+ __entry->port, __entry->size, __entry->count, __entry->val,
+ __entry->count > 1 ? "(...)" : "")
);
/*
@@ -709,10 +721,10 @@ TRACE_EVENT(kvm_emulate_insn,
),
TP_fast_assign(
- __entry->rip = vcpu->arch.emulate_ctxt.fetch.start;
__entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
- __entry->len = vcpu->arch.emulate_ctxt._eip
- - vcpu->arch.emulate_ctxt.fetch.start;
+ __entry->len = vcpu->arch.emulate_ctxt.fetch.ptr
+ - vcpu->arch.emulate_ctxt.fetch.data;
+ __entry->rip = vcpu->arch.emulate_ctxt._eip - __entry->len;
memcpy(__entry->insn,
vcpu->arch.emulate_ctxt.fetch.data,
15);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 33e8c028842f..bfe11cf124a1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -354,6 +354,7 @@ struct vmcs02_list {
struct nested_vmx {
/* Has the level1 guest done vmxon? */
bool vmxon;
+ gpa_t vmxon_ptr;
/* The guest-physical address of the current VMCS L1 keeps for L2 */
gpa_t current_vmptr;
@@ -382,6 +383,9 @@ struct nested_vmx {
struct hrtimer preemption_timer;
bool preemption_timer_expired;
+
+ /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
+ u64 vmcs01_debugctl;
};
#define POSTED_INTR_ON 0
@@ -413,7 +417,6 @@ struct vcpu_vmx {
struct kvm_vcpu vcpu;
unsigned long host_rsp;
u8 fail;
- u8 cpl;
bool nmi_known_unmasked;
u32 exit_intr_info;
u32 idt_vectoring_info;
@@ -740,7 +743,6 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var);
static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
-static bool vmx_mpx_supported(void);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -820,7 +822,6 @@ static const u32 vmx_msr_index[] = {
#endif
MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};
-#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
static inline bool is_page_fault(u32 intr_info)
{
@@ -1940,7 +1941,7 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
vmcs_writel(GUEST_RFLAGS, rflags);
}
-static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
int ret = 0;
@@ -1950,7 +1951,7 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
if (interruptibility & GUEST_INTR_STATE_MOV_SS)
ret |= KVM_X86_SHADOW_INT_MOV_SS;
- return ret & mask;
+ return ret;
}
static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
@@ -2239,10 +2240,13 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
* or other means.
*/
static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
+static u32 nested_vmx_true_procbased_ctls_low;
static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
+static u32 nested_vmx_true_exit_ctls_low;
static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
+static u32 nested_vmx_true_entry_ctls_low;
static u32 nested_vmx_misc_low, nested_vmx_misc_high;
static u32 nested_vmx_ept_caps;
static __init void nested_vmx_setup_ctls_msrs(void)
@@ -2265,25 +2269,17 @@ static __init void nested_vmx_setup_ctls_msrs(void)
/* pin-based controls */
rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
- /*
- * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is
- * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR.
- */
nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;
- /*
- * Exit controls
- * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and
- * 17 must be 1.
- */
+ /* exit controls */
rdmsr(MSR_IA32_VMX_EXIT_CTLS,
nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
- /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
+
nested_vmx_exit_ctls_high &=
#ifdef CONFIG_X86_64
VM_EXIT_HOST_ADDR_SPACE_SIZE |
@@ -2291,14 +2287,18 @@ static __init void nested_vmx_setup_ctls_msrs(void)
VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
- VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
+
if (vmx_mpx_supported())
nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
+ /* We support free control of debug control saving. */
+ nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low &
+ ~VM_EXIT_SAVE_DEBUG_CONTROLS;
+
/* entry controls */
rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
- /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */
nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
nested_vmx_entry_ctls_high &=
#ifdef CONFIG_X86_64
@@ -2310,10 +2310,14 @@ static __init void nested_vmx_setup_ctls_msrs(void)
if (vmx_mpx_supported())
nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
+ /* We support free control of debug control loading. */
+ nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low &
+ ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
+
/* cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
- nested_vmx_procbased_ctls_low = 0;
+ nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
nested_vmx_procbased_ctls_high &=
CPU_BASED_VIRTUAL_INTR_PENDING |
CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
@@ -2334,7 +2338,12 @@ static __init void nested_vmx_setup_ctls_msrs(void)
* can use it to avoid exits to L1 - even when L0 runs L2
* without MSR bitmaps.
*/
- nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS;
+ nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+ CPU_BASED_USE_MSR_BITMAPS;
+
+ /* We support free control of CR3 access interception. */
+ nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low &
+ ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
/* secondary cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
@@ -2353,12 +2362,11 @@ static __init void nested_vmx_setup_ctls_msrs(void)
VMX_EPT_INVEPT_BIT;
nested_vmx_ept_caps &= vmx_capability.ept;
/*
- * Since invept is completely emulated we support both global
- * and context invalidation independent of what host cpu
- * supports
+ * For nested guests, we don't do anything specific
+ * for single context invalidation. Hence, only advertise
+ * support for global context invalidation.
*/
- nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
- VMX_EPT_EXTENT_CONTEXT_BIT;
+ nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
} else
nested_vmx_ept_caps = 0;
@@ -2394,7 +2402,7 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
* guest, and the VMCS structure we give it - not about the
* VMX support of the underlying hardware.
*/
- *pdata = VMCS12_REVISION |
+ *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS |
((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
break;
@@ -2404,16 +2412,25 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
nested_vmx_pinbased_ctls_high);
break;
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ *pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low,
+ nested_vmx_procbased_ctls_high);
+ break;
case MSR_IA32_VMX_PROCBASED_CTLS:
*pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
nested_vmx_procbased_ctls_high);
break;
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ *pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low,
+ nested_vmx_exit_ctls_high);
+ break;
case MSR_IA32_VMX_EXIT_CTLS:
*pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
nested_vmx_exit_ctls_high);
break;
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ *pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low,
+ nested_vmx_entry_ctls_high);
+ break;
case MSR_IA32_VMX_ENTRY_CTLS:
*pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
nested_vmx_entry_ctls_high);
@@ -2442,7 +2459,7 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
*pdata = -1ULL;
break;
case MSR_IA32_VMX_VMCS_ENUM:
- *pdata = 0x1f;
+ *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
break;
case MSR_IA32_VMX_PROCBASED_CTLS2:
*pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
@@ -3186,10 +3203,6 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
-
- /* CPL is always 0 when CPU enters protected mode */
- __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
- vmx->cpl = 0;
}
static void fix_rmode_seg(int seg, struct kvm_segment *save)
@@ -3591,22 +3604,14 @@ static int vmx_get_cpl(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!is_protmode(vcpu))
+ if (unlikely(vmx->rmode.vm86_active))
return 0;
-
- if (!is_long_mode(vcpu)
- && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */
- return 3;
-
- if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
- __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
- vmx->cpl = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS) & 3;
+ else {
+ int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
+ return AR_DPL(ar);
}
-
- return vmx->cpl;
}
-
static u32 vmx_segment_access_rights(struct kvm_segment *var)
{
u32 ar;
@@ -3634,8 +3639,6 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
vmx_segment_cache_clear(vmx);
- if (seg == VCPU_SREG_CS)
- __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
vmx->rmode.segs[seg] = *var;
@@ -3667,7 +3670,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
out:
- vmx->emulation_required |= emulation_required(vcpu);
+ vmx->emulation_required = emulation_required(vcpu);
}
static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -4436,7 +4439,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmx->vcpu.arch.pat = host_pat;
}
- for (i = 0; i < NR_VMX_MSR; ++i) {
+ for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
u32 index = vmx_msr_index[i];
u32 data_low, data_high;
int j = vmx->nmsrs;
@@ -4564,6 +4567,16 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
PIN_BASED_EXT_INTR_MASK;
}
+/*
+ * In nested virtualization, check if L1 has set
+ * VM_EXIT_ACK_INTR_ON_EXIT
+ */
+static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
+{
+ return get_vmcs12(vcpu)->vm_exit_controls &
+ VM_EXIT_ACK_INTR_ON_EXIT;
+}
+
static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
{
return get_vmcs12(vcpu)->pin_based_vm_exec_control &
@@ -4877,7 +4890,10 @@ static int handle_exception(struct kvm_vcpu *vcpu)
if (!(vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
vcpu->arch.dr6 &= ~15;
- vcpu->arch.dr6 |= dr6;
+ vcpu->arch.dr6 |= dr6 | DR6_RTM;
+ if (!(dr6 & ~DR6_RESERVED)) /* icebp */
+ skip_emulated_instruction(vcpu);
+
kvm_queue_exception(vcpu, DB_VECTOR);
return 1;
}
@@ -5040,7 +5056,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
reg = (exit_qualification >> 8) & 15;
switch ((exit_qualification >> 4) & 3) {
case 0: /* mov to cr */
- val = kvm_register_read(vcpu, reg);
+ val = kvm_register_readl(vcpu, reg);
trace_kvm_cr_write(cr, val);
switch (cr) {
case 0:
@@ -5057,7 +5073,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
return 1;
case 8: {
u8 cr8_prev = kvm_get_cr8(vcpu);
- u8 cr8 = kvm_register_read(vcpu, reg);
+ u8 cr8 = (u8)val;
err = kvm_set_cr8(vcpu, cr8);
kvm_complete_insn_gp(vcpu, err);
if (irqchip_in_kernel(vcpu->kvm))
@@ -5133,7 +5149,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
return 0;
} else {
vcpu->arch.dr7 &= ~DR7_GD;
- vcpu->arch.dr6 |= DR6_BD;
+ vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
kvm_queue_exception(vcpu, DB_VECTOR);
return 1;
@@ -5166,7 +5182,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
return 1;
kvm_register_write(vcpu, reg, val);
} else
- if (kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]))
+ if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
return 1;
skip_emulated_instruction(vcpu);
@@ -5439,7 +5455,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
}
/* clear all local breakpoint enable flags */
- vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
+ vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55);
/*
* TODO: What about debug traps on tss switch?
@@ -5565,6 +5581,10 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
gpa_t gpa;
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+ if (!kvm_io_bus_write(vcpu->kvm, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
+ skip_emulated_instruction(vcpu);
+ return 1;
+ }
ret = handle_mmio_page_fault_common(vcpu, gpa, true);
if (likely(ret == RET_MMIO_PF_EMULATE))
@@ -5618,7 +5638,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
- while (!guest_state_valid(vcpu) && count-- != 0) {
+ while (vmx->emulation_required && count-- != 0) {
if (intr_window_requested && vmx_interrupt_allowed(vcpu))
return handle_interrupt_window(&vmx->vcpu);
@@ -5652,7 +5672,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
schedule();
}
- vmx->emulation_required = emulation_required(vcpu);
out:
return ret;
}
@@ -5669,12 +5688,24 @@ static int handle_pause(struct kvm_vcpu *vcpu)
return 1;
}
-static int handle_invalid_op(struct kvm_vcpu *vcpu)
+static int handle_nop(struct kvm_vcpu *vcpu)
{
- kvm_queue_exception(vcpu, UD_VECTOR);
+ skip_emulated_instruction(vcpu);
return 1;
}
+static int handle_mwait(struct kvm_vcpu *vcpu)
+{
+ printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
+ return handle_nop(vcpu);
+}
+
+static int handle_monitor(struct kvm_vcpu *vcpu)
+{
+ printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
+ return handle_nop(vcpu);
+}
+
/*
* To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
* We could reuse a single VMCS for all the L2 guests, but we also want the
@@ -5739,22 +5770,27 @@ static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
/*
* Free all VMCSs saved for this vcpu, except the one pointed by
- * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one
- * currently used, if running L2), and vmcs01 when running L2.
+ * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
+ * must be &vmx->vmcs01.
*/
static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
{
struct vmcs02_list *item, *n;
+
+ WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
- if (vmx->loaded_vmcs != &item->vmcs02)
- free_loaded_vmcs(&item->vmcs02);
+ /*
+ * Something will leak if the above WARN triggers. Better than
+ * a use-after-free.
+ */
+ if (vmx->loaded_vmcs == &item->vmcs02)
+ continue;
+
+ free_loaded_vmcs(&item->vmcs02);
list_del(&item->list);
kfree(item);
+ vmx->nested.vmcs02_num--;
}
- vmx->nested.vmcs02_num = 0;
-
- if (vmx->loaded_vmcs != &vmx->vmcs01)
- free_loaded_vmcs(&vmx->vmcs01);
}
/*
@@ -5812,6 +5848,154 @@ static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
}
/*
+ * Decode the memory-address operand of a vmx instruction, as recorded on an
+ * exit caused by such an instruction (run by a guest hypervisor).
+ * On success, returns 0. When the operand is invalid, returns 1 and throws
+ * #UD or #GP.
+ */
+static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
+ unsigned long exit_qualification,
+ u32 vmx_instruction_info, gva_t *ret)
+{
+ /*
+ * According to Vol. 3B, "Information for VM Exits Due to Instruction
+ * Execution", on an exit, vmx_instruction_info holds most of the
+ * addressing components of the operand. Only the displacement part
+ * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
+ * For how an actual address is calculated from all these components,
+ * refer to Vol. 1, "Operand Addressing".
+ */
+ int scaling = vmx_instruction_info & 3;
+ int addr_size = (vmx_instruction_info >> 7) & 7;
+ bool is_reg = vmx_instruction_info & (1u << 10);
+ int seg_reg = (vmx_instruction_info >> 15) & 7;
+ int index_reg = (vmx_instruction_info >> 18) & 0xf;
+ bool index_is_valid = !(vmx_instruction_info & (1u << 22));
+ int base_reg = (vmx_instruction_info >> 23) & 0xf;
+ bool base_is_valid = !(vmx_instruction_info & (1u << 27));
+
+ if (is_reg) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ /* Addr = segment_base + offset */
+ /* offset = base + [index * scale] + displacement */
+ *ret = vmx_get_segment_base(vcpu, seg_reg);
+ if (base_is_valid)
+ *ret += kvm_register_read(vcpu, base_reg);
+ if (index_is_valid)
+ *ret += kvm_register_read(vcpu, index_reg)<<scaling;
+ *ret += exit_qualification; /* holds the displacement */
+
+ if (addr_size == 1) /* 32 bit */
+ *ret &= 0xffffffff;
+
+ /*
+ * TODO: throw #GP (and return 1) in various cases that the VM*
+ * instructions require it - e.g., offset beyond segment limit,
+ * unusable or unreadable/unwritable segment, non-canonical 64-bit
+ * address, and so on. Currently these are not checked.
+ */
+ return 0;
+}
+
+/*
+ * This function performs the various checks including
+ * - if it's 4KB aligned
+ * - No bits beyond the physical address width are set
+ * - Returns 0 on success or else 1
+ * (Intel SDM Section 30.3)
+ */
+static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
+ gpa_t *vmpointer)
+{
+ gva_t gva;
+ gpa_t vmptr;
+ struct x86_exception e;
+ struct page *page;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
+ return 1;
+
+ if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
+ sizeof(vmptr), &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+
+ switch (exit_reason) {
+ case EXIT_REASON_VMON:
+ /*
+ * SDM 3: 24.11.5
+ * The first 4 bytes of VMXON region contain the supported
+ * VMCS revision identifier
+ *
+ * Note - IA32_VMX_BASIC[48] will never be 1
+ * for the nested case;
+ * which replaces physical address width with 32
+ *
+ */
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
+ nested_vmx_failInvalid(vcpu);
+ skip_emulated_instruction(vcpu);
+ return 1;
+ }
+
+ page = nested_get_page(vcpu, vmptr);
+ if (page == NULL ||
+ *(u32 *)kmap(page) != VMCS12_REVISION) {
+ nested_vmx_failInvalid(vcpu);
+ kunmap(page);
+ skip_emulated_instruction(vcpu);
+ return 1;
+ }
+ kunmap(page);
+ vmx->nested.vmxon_ptr = vmptr;
+ break;
+ case EXIT_REASON_VMCLEAR:
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_VMCLEAR_INVALID_ADDRESS);
+ skip_emulated_instruction(vcpu);
+ return 1;
+ }
+
+ if (vmptr == vmx->nested.vmxon_ptr) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_VMCLEAR_VMXON_POINTER);
+ skip_emulated_instruction(vcpu);
+ return 1;
+ }
+ break;
+ case EXIT_REASON_VMPTRLD:
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_VMPTRLD_INVALID_ADDRESS);
+ skip_emulated_instruction(vcpu);
+ return 1;
+ }
+
+ if (vmptr == vmx->nested.vmxon_ptr) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_VMCLEAR_VMXON_POINTER);
+ skip_emulated_instruction(vcpu);
+ return 1;
+ }
+ break;
+ default:
+ return 1; /* shouldn't happen */
+ }
+
+ if (vmpointer)
+ *vmpointer = vmptr;
+ return 0;
+}
+
+/*
* Emulate the VMXON instruction.
* Currently, we just remember that VMX is active, and do not save or even
* inspect the argument to VMXON (the so-called "VMXON pointer") because we
@@ -5849,6 +6033,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
kvm_inject_gp(vcpu, 0);
return 1;
}
+
+ if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL))
+ return 1;
+
if (vmx->nested.vmxon) {
nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
skip_emulated_instruction(vcpu);
@@ -5919,20 +6107,27 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
{
u32 exec_control;
+ if (vmx->nested.current_vmptr == -1ull)
+ return;
+
+ /* current_vmptr and current_vmcs12 are always set/reset together */
+ if (WARN_ON(vmx->nested.current_vmcs12 == NULL))
+ return;
+
if (enable_shadow_vmcs) {
- if (vmx->nested.current_vmcs12 != NULL) {
- /* copy to memory all shadowed fields in case
- they were modified */
- copy_shadow_to_vmcs12(vmx);
- vmx->nested.sync_shadow_vmcs = false;
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
- vmcs_write64(VMCS_LINK_POINTER, -1ull);
- }
+ /* copy to memory all shadowed fields in case
+ they were modified */
+ copy_shadow_to_vmcs12(vmx);
+ vmx->nested.sync_shadow_vmcs = false;
+ exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ vmcs_write64(VMCS_LINK_POINTER, -1ull);
}
kunmap(vmx->nested.current_vmcs12_page);
nested_release_page(vmx->nested.current_vmcs12_page);
+ vmx->nested.current_vmptr = -1ull;
+ vmx->nested.current_vmcs12 = NULL;
}
/*
@@ -5943,12 +6138,9 @@ static void free_nested(struct vcpu_vmx *vmx)
{
if (!vmx->nested.vmxon)
return;
+
vmx->nested.vmxon = false;
- if (vmx->nested.current_vmptr != -1ull) {
- nested_release_vmcs12(vmx);
- vmx->nested.current_vmptr = -1ull;
- vmx->nested.current_vmcs12 = NULL;
- }
+ nested_release_vmcs12(vmx);
if (enable_shadow_vmcs)
free_vmcs(vmx->nested.current_shadow_vmcs);
/* Unpin physical memory we referred to in current vmcs02 */
@@ -5971,93 +6163,22 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
return 1;
}
-/*
- * Decode the memory-address operand of a vmx instruction, as recorded on an
- * exit caused by such an instruction (run by a guest hypervisor).
- * On success, returns 0. When the operand is invalid, returns 1 and throws
- * #UD or #GP.
- */
-static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
- unsigned long exit_qualification,
- u32 vmx_instruction_info, gva_t *ret)
-{
- /*
- * According to Vol. 3B, "Information for VM Exits Due to Instruction
- * Execution", on an exit, vmx_instruction_info holds most of the
- * addressing components of the operand. Only the displacement part
- * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
- * For how an actual address is calculated from all these components,
- * refer to Vol. 1, "Operand Addressing".
- */
- int scaling = vmx_instruction_info & 3;
- int addr_size = (vmx_instruction_info >> 7) & 7;
- bool is_reg = vmx_instruction_info & (1u << 10);
- int seg_reg = (vmx_instruction_info >> 15) & 7;
- int index_reg = (vmx_instruction_info >> 18) & 0xf;
- bool index_is_valid = !(vmx_instruction_info & (1u << 22));
- int base_reg = (vmx_instruction_info >> 23) & 0xf;
- bool base_is_valid = !(vmx_instruction_info & (1u << 27));
-
- if (is_reg) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
- }
-
- /* Addr = segment_base + offset */
- /* offset = base + [index * scale] + displacement */
- *ret = vmx_get_segment_base(vcpu, seg_reg);
- if (base_is_valid)
- *ret += kvm_register_read(vcpu, base_reg);
- if (index_is_valid)
- *ret += kvm_register_read(vcpu, index_reg)<<scaling;
- *ret += exit_qualification; /* holds the displacement */
-
- if (addr_size == 1) /* 32 bit */
- *ret &= 0xffffffff;
-
- /*
- * TODO: throw #GP (and return 1) in various cases that the VM*
- * instructions require it - e.g., offset beyond segment limit,
- * unusable or unreadable/unwritable segment, non-canonical 64-bit
- * address, and so on. Currently these are not checked.
- */
- return 0;
-}
-
/* Emulate the VMCLEAR instruction */
static int handle_vmclear(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- gva_t gva;
gpa_t vmptr;
struct vmcs12 *vmcs12;
struct page *page;
- struct x86_exception e;
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
- vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
- return 1;
-
- if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
- sizeof(vmptr), &e)) {
- kvm_inject_page_fault(vcpu, &e);
- return 1;
- }
-
- if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
- nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
- skip_emulated_instruction(vcpu);
+ if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr))
return 1;
- }
- if (vmptr == vmx->nested.current_vmptr) {
+ if (vmptr == vmx->nested.current_vmptr)
nested_release_vmcs12(vmx);
- vmx->nested.current_vmptr = -1ull;
- vmx->nested.current_vmcs12 = NULL;
- }
page = nested_get_page(vcpu, vmptr);
if (page == NULL) {
@@ -6285,7 +6406,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
return 1;
/* Decode instruction info and find the field to read */
- field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
/* Read the field, zero-extended to a u64 field_value */
if (!vmcs12_read_any(vcpu, field, &field_value)) {
nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
@@ -6298,7 +6419,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
* on the guest's mode (32 or 64 bit), not on the given field's length.
*/
if (vmx_instruction_info & (1u << 10)) {
- kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
+ kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
field_value);
} else {
if (get_vmx_mem_address(vcpu, exit_qualification,
@@ -6335,21 +6456,21 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
return 1;
if (vmx_instruction_info & (1u << 10))
- field_value = kvm_register_read(vcpu,
+ field_value = kvm_register_readl(vcpu,
(((vmx_instruction_info) >> 3) & 0xf));
else {
if (get_vmx_mem_address(vcpu, exit_qualification,
vmx_instruction_info, &gva))
return 1;
if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
- &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) {
+ &field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
kvm_inject_page_fault(vcpu, &e);
return 1;
}
}
- field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
if (vmcs_field_readonly(field)) {
nested_vmx_failValid(vcpu,
VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
@@ -6372,29 +6493,14 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
static int handle_vmptrld(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- gva_t gva;
gpa_t vmptr;
- struct x86_exception e;
u32 exec_control;
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
- vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
- return 1;
-
- if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
- sizeof(vmptr), &e)) {
- kvm_inject_page_fault(vcpu, &e);
+ if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr))
return 1;
- }
-
- if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
- nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
- skip_emulated_instruction(vcpu);
- return 1;
- }
if (vmx->nested.current_vmptr != vmptr) {
struct vmcs12 *new_vmcs12;
@@ -6414,9 +6520,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
skip_emulated_instruction(vcpu);
return 1;
}
- if (vmx->nested.current_vmptr != -1ull)
- nested_release_vmcs12(vmx);
+ nested_release_vmcs12(vmx);
vmx->nested.current_vmptr = vmptr;
vmx->nested.current_vmcs12 = new_vmcs12;
vmx->nested.current_vmcs12_page = page;
@@ -6471,7 +6576,6 @@ static int handle_invept(struct kvm_vcpu *vcpu)
struct {
u64 eptp, gpa;
} operand;
- u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK;
if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
!(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
@@ -6488,7 +6592,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
}
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
- type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+ type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
@@ -6511,16 +6615,13 @@ static int handle_invept(struct kvm_vcpu *vcpu)
}
switch (type) {
- case VMX_EPT_EXTENT_CONTEXT:
- if ((operand.eptp & eptp_mask) !=
- (nested_ept_get_cr3(vcpu) & eptp_mask))
- break;
case VMX_EPT_EXTENT_GLOBAL:
kvm_mmu_sync_roots(vcpu);
kvm_mmu_flush_tlb(vcpu);
nested_vmx_succeed(vcpu);
break;
default:
+ /* Trap single context invalidation invept calls */
BUG_ON(1);
break;
}
@@ -6571,8 +6672,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
[EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
[EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
- [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op,
- [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
+ [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait,
+ [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor,
[EXIT_REASON_INVEPT] = handle_invept,
};
@@ -6671,7 +6772,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
int cr = exit_qualification & 15;
int reg = (exit_qualification >> 8) & 15;
- unsigned long val = kvm_register_read(vcpu, reg);
+ unsigned long val = kvm_register_readl(vcpu, reg);
switch ((exit_qualification >> 4) & 3) {
case 0: /* mov to cr */
@@ -7032,7 +7133,26 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
if (max_irr == -1)
return;
- vmx_set_rvi(max_irr);
+ /*
+ * If a vmexit is needed, vmx_check_nested_events handles it.
+ */
+ if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+ return;
+
+ if (!is_guest_mode(vcpu)) {
+ vmx_set_rvi(max_irr);
+ return;
+ }
+
+ /*
+ * Fall back to pre-APICv interrupt injection since L2
+ * is run without virtual interrupt delivery.
+ */
+ if (!kvm_event_needs_reinjection(vcpu) &&
+ vmx_interrupt_allowed(vcpu)) {
+ kvm_queue_interrupt(vcpu, max_irr, false);
+ vmx_inject_irq(vcpu);
+ }
}
static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
@@ -7413,7 +7533,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
| (1 << VCPU_EXREG_RFLAGS)
- | (1 << VCPU_EXREG_CPL)
| (1 << VCPU_EXREG_PDPTR)
| (1 << VCPU_EXREG_SEGMENTS)
| (1 << VCPU_EXREG_CR3));
@@ -7441,13 +7560,31 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_complete_interrupts(vmx);
}
+static void vmx_load_vmcs01(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int cpu;
+
+ if (vmx->loaded_vmcs == &vmx->vmcs01)
+ return;
+
+ cpu = get_cpu();
+ vmx->loaded_vmcs = &vmx->vmcs01;
+ vmx_vcpu_put(vcpu);
+ vmx_vcpu_load(vcpu, cpu);
+ vcpu->cpu = cpu;
+ put_cpu();
+}
+
static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
free_vpid(vmx);
- free_loaded_vmcs(vmx->loaded_vmcs);
+ leave_guest_mode(vcpu);
+ vmx_load_vmcs01(vcpu);
free_nested(vmx);
+ free_loaded_vmcs(vmx->loaded_vmcs);
kfree(vmx->guest_msrs);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -7469,6 +7606,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
goto free_vcpu;
vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
+ > PAGE_SIZE);
+
err = -ENOMEM;
if (!vmx->guest_msrs) {
goto uninit_vcpu;
@@ -7757,7 +7897,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
- vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+ if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
+ kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+ } else {
+ kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
+ }
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
vmcs12->vm_entry_intr_info_field);
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
@@ -7767,7 +7913,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
vmcs12->guest_interruptibility_info);
vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
- kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
vmx_set_rflags(vcpu, vmcs12->guest_rflags);
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
vmcs12->guest_pending_dbg_exceptions);
@@ -7778,7 +7923,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
exec_control = vmcs12->pin_based_vm_exec_control;
exec_control |= vmcs_config.pin_based_exec_ctrl;
- exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER |
+ PIN_BASED_POSTED_INTR);
vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
vmx->nested.preemption_timer_expired = false;
@@ -7815,7 +7961,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
if (!vmx->rdtscp_enabled)
exec_control &= ~SECONDARY_EXEC_RDTSCP;
/* Take the following fields only from vmcs12 */
- exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
+ SECONDARY_EXEC_APIC_REGISTER_VIRT);
if (nested_cpu_has(vmcs12,
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
exec_control |= vmcs12->secondary_vm_exec_control;
@@ -8031,14 +8179,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
}
if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
- !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) {
+ !PAGE_ALIGNED(vmcs12->msr_bitmap)) {
/*TODO: Also verify bits beyond physical address width are 0*/
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
}
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
- !IS_ALIGNED(vmcs12->apic_access_addr, PAGE_SIZE)) {
+ !PAGE_ALIGNED(vmcs12->apic_access_addr)) {
/*TODO: Also verify bits beyond physical address width are 0*/
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
@@ -8054,15 +8202,18 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
}
if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
- nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high) ||
+ nested_vmx_true_procbased_ctls_low,
+ nested_vmx_procbased_ctls_high) ||
!vmx_control_verify(vmcs12->secondary_vm_exec_control,
nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) ||
!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) ||
!vmx_control_verify(vmcs12->vm_exit_controls,
- nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) ||
+ nested_vmx_true_exit_ctls_low,
+ nested_vmx_exit_ctls_high) ||
!vmx_control_verify(vmcs12->vm_entry_controls,
- nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high))
+ nested_vmx_true_entry_ctls_low,
+ nested_vmx_entry_ctls_high))
{
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
@@ -8139,6 +8290,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
+ if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
+ vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+
cpu = get_cpu();
vmx->loaded_vmcs = vmcs02;
vmx_vcpu_put(vcpu);
@@ -8316,7 +8470,6 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
- kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
@@ -8395,9 +8548,13 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
+ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) {
+ kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
+ vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ }
+
/* TODO: These cannot have changed unless we have MSR bitmaps and
* the relevant bit asks not to trap the change */
- vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
@@ -8588,7 +8745,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
unsigned long exit_qualification)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- int cpu;
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
/* trying to cancel vmlaunch/vmresume is a bug */
@@ -8598,6 +8754,16 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
exit_qualification);
+ vmx_load_vmcs01(vcpu);
+
+ if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
+ && nested_exit_intr_ack_set(vcpu)) {
+ int irq = kvm_cpu_get_interrupt(vcpu);
+ WARN_ON(irq < 0);
+ vmcs12->vm_exit_intr_info = irq |
+ INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
+ }
+
trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
vmcs12->exit_qualification,
vmcs12->idt_vectoring_info_field,
@@ -8605,13 +8771,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmcs12->vm_exit_intr_error_code,
KVM_ISA_VMX);
- cpu = get_cpu();
- vmx->loaded_vmcs = &vmx->vmcs01;
- vmx_vcpu_put(vcpu);
- vmx_vcpu_load(vcpu, cpu);
- vcpu->cpu = cpu;
- put_cpu();
-
vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS));
vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS));
vmx_segment_cache_clear(vmx);
@@ -8800,7 +8959,7 @@ static int __init vmx_init(void)
rdmsrl_safe(MSR_EFER, &host_efer);
- for (i = 0; i < NR_VMX_MSR; ++i)
+ for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
kvm_define_shared_msr(i, vmx_msr_index[i]);
vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b6c0bacca9bd..8f1e22d3b286 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -87,6 +87,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
+static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
@@ -106,6 +107,8 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
static u32 tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+static bool backwards_tsc_observed = false;
+
#define KVM_NR_SHARED_MSRS 16
struct kvm_shared_msrs_global {
@@ -209,6 +212,7 @@ static void shared_msr_update(unsigned slot, u32 msr)
void kvm_define_shared_msr(unsigned slot, u32 msr)
{
+ BUG_ON(slot >= KVM_NR_SHARED_MSRS);
if (slot >= shared_msrs_global.nr)
shared_msrs_global.nr = slot + 1;
shared_msrs_global.msrs[slot] = msr;
@@ -308,6 +312,31 @@ static int exception_class(int vector)
return EXCPT_BENIGN;
}
+#define EXCPT_FAULT 0
+#define EXCPT_TRAP 1
+#define EXCPT_ABORT 2
+#define EXCPT_INTERRUPT 3
+
+static int exception_type(int vector)
+{
+ unsigned int mask;
+
+ if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
+ return EXCPT_INTERRUPT;
+
+ mask = 1 << vector;
+
+ /* #DB is trap, as instruction watchpoints are handled elsewhere */
+ if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
+ return EXCPT_TRAP;
+
+ if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
+ return EXCPT_ABORT;
+
+ /* Reserved exceptions will result in fault */
+ return EXCPT_FAULT;
+}
+
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
unsigned nr, bool has_error, u32 error_code,
bool reinject)
@@ -702,25 +731,11 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
}
if (is_long_mode(vcpu)) {
- if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
- if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
- return 1;
- } else
- if (cr3 & CR3_L_MODE_RESERVED_BITS)
- return 1;
- } else {
- if (is_pae(vcpu)) {
- if (cr3 & CR3_PAE_RESERVED_BITS)
- return 1;
- if (is_paging(vcpu) &&
- !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
- return 1;
- }
- /*
- * We don't check reserved bits in nonpae mode, because
- * this isn't enforced, and VMware depends on this.
- */
- }
+ if (cr3 & CR3_L_MODE_RESERVED_BITS)
+ return 1;
+ } else if (is_pae(vcpu) && is_paging(vcpu) &&
+ !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+ return 1;
vcpu->arch.cr3 = cr3;
__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
@@ -770,6 +785,15 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu)
vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
}
+static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
+{
+ u64 fixed = DR6_FIXED_1;
+
+ if (!guest_cpuid_has_rtm(vcpu))
+ fixed |= DR6_RTM;
+ return fixed;
+}
+
static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
switch (dr) {
@@ -785,7 +809,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
case 6:
if (val & 0xffffffff00000000ULL)
return -1; /* #GP */
- vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
+ vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
kvm_update_dr6(vcpu);
break;
case 5:
@@ -996,9 +1020,8 @@ struct pvclock_gtod_data {
u32 shift;
} clock;
- /* open coded 'struct timespec' */
- u64 monotonic_time_snsec;
- time_t monotonic_time_sec;
+ u64 boot_ns;
+ u64 nsec_base;
};
static struct pvclock_gtod_data pvclock_gtod_data;
@@ -1006,27 +1029,21 @@ static struct pvclock_gtod_data pvclock_gtod_data;
static void update_pvclock_gtod(struct timekeeper *tk)
{
struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
+ u64 boot_ns;
+
+ boot_ns = ktime_to_ns(ktime_add(tk->tkr.base_mono, tk->offs_boot));
write_seqcount_begin(&vdata->seq);
/* copy pvclock gtod data */
- vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode;
- vdata->clock.cycle_last = tk->clock->cycle_last;
- vdata->clock.mask = tk->clock->mask;
- vdata->clock.mult = tk->mult;
- vdata->clock.shift = tk->shift;
-
- vdata->monotonic_time_sec = tk->xtime_sec
- + tk->wall_to_monotonic.tv_sec;
- vdata->monotonic_time_snsec = tk->xtime_nsec
- + (tk->wall_to_monotonic.tv_nsec
- << tk->shift);
- while (vdata->monotonic_time_snsec >=
- (((u64)NSEC_PER_SEC) << tk->shift)) {
- vdata->monotonic_time_snsec -=
- ((u64)NSEC_PER_SEC) << tk->shift;
- vdata->monotonic_time_sec++;
- }
+ vdata->clock.vclock_mode = tk->tkr.clock->archdata.vclock_mode;
+ vdata->clock.cycle_last = tk->tkr.cycle_last;
+ vdata->clock.mask = tk->tkr.mask;
+ vdata->clock.mult = tk->tkr.mult;
+ vdata->clock.shift = tk->tkr.shift;
+
+ vdata->boot_ns = boot_ns;
+ vdata->nsec_base = tk->tkr.xtime_nsec;
write_seqcount_end(&vdata->seq);
}
@@ -1121,11 +1138,7 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
static inline u64 get_kernel_ns(void)
{
- struct timespec ts;
-
- ktime_get_ts(&ts);
- monotonic_to_bootbased(&ts);
- return timespec_to_ns(&ts);
+ return ktime_get_boot_ns();
}
#ifdef CONFIG_X86_64
@@ -1227,6 +1240,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
unsigned long flags;
s64 usdiff;
bool matched;
+ bool already_matched;
u64 data = msr->data;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
@@ -1291,6 +1305,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
}
matched = true;
+ already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
} else {
/*
* We split periods of matched TSC writes into generations.
@@ -1306,7 +1321,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
kvm->arch.cur_tsc_write = data;
kvm->arch.cur_tsc_offset = offset;
matched = false;
- pr_debug("kvm: new tsc generation %u, clock %llu\n",
+ pr_debug("kvm: new tsc generation %llu, clock %llu\n",
kvm->arch.cur_tsc_generation, data);
}
@@ -1331,10 +1346,11 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
- if (matched)
- kvm->arch.nr_vcpus_matched_tsc++;
- else
+ if (!matched) {
kvm->arch.nr_vcpus_matched_tsc = 0;
+ } else if (!already_matched) {
+ kvm->arch.nr_vcpus_matched_tsc++;
+ }
kvm_track_tsc_matching(vcpu);
spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
@@ -1387,23 +1403,22 @@ static inline u64 vgettsc(cycle_t *cycle_now)
return v * gtod->clock.mult;
}
-static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
+static int do_monotonic_boot(s64 *t, cycle_t *cycle_now)
{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
unsigned long seq;
- u64 ns;
int mode;
- struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ u64 ns;
- ts->tv_nsec = 0;
do {
seq = read_seqcount_begin(&gtod->seq);
mode = gtod->clock.vclock_mode;
- ts->tv_sec = gtod->monotonic_time_sec;
- ns = gtod->monotonic_time_snsec;
+ ns = gtod->nsec_base;
ns += vgettsc(cycle_now);
ns >>= gtod->clock.shift;
+ ns += gtod->boot_ns;
} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
- timespec_add_ns(ts, ns);
+ *t = ns;
return mode;
}
@@ -1411,19 +1426,11 @@ static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
/* returns true if host is using tsc clocksource */
static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
{
- struct timespec ts;
-
/* checked again under seqlock below */
if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
return false;
- if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC)
- return false;
-
- monotonic_to_bootbased(&ts);
- *kernel_ns = timespec_to_ns(&ts);
-
- return true;
+ return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC;
}
#endif
@@ -1486,7 +1493,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
&ka->master_kernel_ns,
&ka->master_cycle_now);
- ka->use_master_clock = host_tsc_clocksource & vcpus_matched;
+ ka->use_master_clock = host_tsc_clocksource && vcpus_matched
+ && !backwards_tsc_observed;
if (ka->use_master_clock)
atomic_set(&kvm_guest_has_master_clock, 1);
@@ -1909,7 +1917,7 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
break;
gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
- if (kvm_write_guest(kvm, data,
+ if (kvm_write_guest(kvm, gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
&tsc_ref, sizeof(tsc_ref)))
return 1;
mark_page_dirty(kvm, gfn);
@@ -1932,6 +1940,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
vcpu->arch.hv_vapic = data;
+ if (kvm_lapic_enable_pv_eoi(vcpu, 0))
+ return 1;
break;
}
gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT;
@@ -1942,6 +1952,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
return 1;
vcpu->arch.hv_vapic = data;
mark_page_dirty(vcpu->kvm, gfn);
+ if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED))
+ return 1;
break;
}
case HV_X64_MSR_EOI:
@@ -2039,6 +2051,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
data &= ~(u64)0x40; /* ignore flush filter disable */
data &= ~(u64)0x100; /* ignore ignne emulation enable */
data &= ~(u64)0x8; /* ignore TLB cache disable */
+ data &= ~(u64)0x40000; /* ignore Mc status write enable */
if (data != 0) {
vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
data);
@@ -2623,7 +2636,7 @@ out:
return r;
}
-int kvm_dev_ioctl_check_extension(long ext)
+int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
int r;
@@ -2644,6 +2657,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_IRQ_INJECT_STATUS:
case KVM_CAP_IRQFD:
case KVM_CAP_IOEVENTFD:
+ case KVM_CAP_IOEVENTFD_NO_LENGTH:
case KVM_CAP_PIT2:
case KVM_CAP_PIT_STATE2:
case KVM_CAP_SET_IDENTITY_MAP_ADDR:
@@ -2980,9 +2994,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
events->interrupt.nr = vcpu->arch.interrupt.nr;
events->interrupt.soft = 0;
- events->interrupt.shadow =
- kvm_x86_ops->get_interrupt_shadow(vcpu,
- KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
+ events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
events->nmi.injected = vcpu->arch.nmi_injected;
events->nmi.pending = vcpu->arch.nmi_pending != 0;
@@ -3646,11 +3658,19 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
offset = i * BITS_PER_LONG;
kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
}
- if (is_dirty)
- kvm_flush_remote_tlbs(kvm);
spin_unlock(&kvm->mmu_lock);
+ /* See the comments in kvm_mmu_slot_remove_write_access(). */
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /*
+ * All the TLBs can be flushed out of mmu lock, see the comments in
+ * kvm_mmu_slot_remove_write_access().
+ */
+ if (is_dirty)
+ kvm_flush_remote_tlbs(kvm);
+
r = -EFAULT;
if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
goto out;
@@ -4080,7 +4100,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
- ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
+ ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, data,
+ offset, toread);
if (ret < 0) {
r = X86EMUL_IO_NEEDED;
goto out;
@@ -4101,10 +4122,24 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ unsigned offset;
+ int ret;
- return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
- access | PFERR_FETCH_MASK,
- exception);
+ /* Inline kvm_read_guest_virt_helper for speed. */
+ gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK,
+ exception);
+ if (unlikely(gpa == UNMAPPED_GVA))
+ return X86EMUL_PROPAGATE_FAULT;
+
+ offset = addr & (PAGE_SIZE-1);
+ if (WARN_ON(offset + bytes > PAGE_SIZE))
+ bytes = (unsigned)PAGE_SIZE - offset;
+ ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, val,
+ offset, bytes);
+ if (unlikely(ret < 0))
+ return X86EMUL_IO_NEEDED;
+
+ return X86EMUL_CONTINUE;
}
int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
@@ -4486,8 +4521,6 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
unsigned short port, void *val,
unsigned int count, bool in)
{
- trace_kvm_pio(!in, port, size, count);
-
vcpu->arch.pio.port = port;
vcpu->arch.pio.in = in;
vcpu->arch.pio.count = count;
@@ -4522,6 +4555,7 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
if (ret) {
data_avail:
memcpy(val, vcpu->arch.pio_data, size * count);
+ trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
vcpu->arch.pio.count = 0;
return 1;
}
@@ -4536,6 +4570,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
memcpy(vcpu->arch.pio_data, val, size * count);
+ trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
}
@@ -4647,11 +4682,6 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
return res;
}
-static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
-{
- kvm_set_rflags(emul_to_vcpu(ctxt), val);
-}
-
static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
{
return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
@@ -4733,7 +4763,6 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
if (desc->g)
var.limit = (var.limit << 12) | 0xfff;
var.type = desc->type;
- var.present = desc->p;
var.dpl = desc->dpl;
var.db = desc->d;
var.s = desc->s;
@@ -4765,6 +4794,12 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
}
+static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
+ u32 pmc)
+{
+ return kvm_pmu_check_pmc(emul_to_vcpu(ctxt), pmc);
+}
+
static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
u32 pmc, u64 *pdata)
{
@@ -4836,12 +4871,12 @@ static const struct x86_emulate_ops emulate_ops = {
.set_idt = emulator_set_idt,
.get_cr = emulator_get_cr,
.set_cr = emulator_set_cr,
- .set_rflags = emulator_set_rflags,
.cpl = emulator_get_cpl,
.get_dr = emulator_get_dr,
.set_dr = emulator_set_dr,
.set_msr = emulator_set_msr,
.get_msr = emulator_get_msr,
+ .check_pmc = emulator_check_pmc,
.read_pmc = emulator_read_pmc,
.halt = emulator_halt,
.wbinvd = emulator_wbinvd,
@@ -4854,7 +4889,7 @@ static const struct x86_emulate_ops emulate_ops = {
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
{
- u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
+ u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
/*
* an sti; sti; sequence only disable interrupts for the first
* instruction. So, if the last instruction, be it emulated or
@@ -4862,8 +4897,13 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
* means that the last instruction is an sti. We should not
* leave the flag on in this case. The same goes for mov ss
*/
- if (!(int_shadow & mask))
+ if (int_shadow & mask)
+ mask = 0;
+ if (unlikely(int_shadow || mask)) {
kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
+ if (!mask)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
}
static void inject_emulated_exception(struct kvm_vcpu *vcpu)
@@ -4878,19 +4918,6 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
kvm_queue_exception(vcpu, ctxt->exception.vector);
}
-static void init_decode_cache(struct x86_emulate_ctxt *ctxt)
-{
- memset(&ctxt->opcode_len, 0,
- (void *)&ctxt->_regs - (void *)&ctxt->opcode_len);
-
- ctxt->fetch.start = 0;
- ctxt->fetch.end = 0;
- ctxt->io_read.pos = 0;
- ctxt->io_read.end = 0;
- ctxt->mem_read.pos = 0;
- ctxt->mem_read.end = 0;
-}
-
static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
{
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
@@ -4902,7 +4929,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
ctxt->eip = kvm_rip_read(vcpu);
ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
(ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
- cs_l ? X86EMUL_MODE_PROT64 :
+ (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 :
cs_db ? X86EMUL_MODE_PROT32 :
X86EMUL_MODE_PROT16;
ctxt->guest_mode = is_guest_mode(vcpu);
@@ -5089,23 +5116,22 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
return dr6;
}
-static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r)
+static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r)
{
struct kvm_run *kvm_run = vcpu->run;
/*
- * Use the "raw" value to see if TF was passed to the processor.
- * Note that the new value of the flags has not been saved yet.
+ * rflags is the old, "raw" value of the flags. The new value has
+ * not been saved yet.
*
* This is correct even for TF set by the guest, because "the
* processor will not generate this exception after the instruction
* that sets the TF flag".
*/
- unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
-
if (unlikely(rflags & X86_EFLAGS_TF)) {
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
- kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1;
+ kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 |
+ DR6_RTM;
kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
kvm_run->debug.arch.exception = DB_VECTOR;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
@@ -5118,7 +5144,7 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r)
* cleared by the processor".
*/
vcpu->arch.dr6 &= ~15;
- vcpu->arch.dr6 |= DR6_BS;
+ vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
kvm_queue_exception(vcpu, DB_VECTOR);
}
}
@@ -5137,7 +5163,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
vcpu->arch.eff_db);
if (dr6 != 0) {
- kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
+ kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
kvm_run->debug.arch.pc = kvm_rip_read(vcpu) +
get_segment_base(vcpu, VCPU_SREG_CS);
@@ -5148,14 +5174,15 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
}
}
- if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) {
+ if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
+ !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) {
dr6 = kvm_vcpu_check_hw_bp(eip, 0,
vcpu->arch.dr7,
vcpu->arch.db);
if (dr6 != 0) {
vcpu->arch.dr6 &= ~15;
- vcpu->arch.dr6 |= dr6;
+ vcpu->arch.dr6 |= dr6 | DR6_RTM;
kvm_queue_exception(vcpu, DB_VECTOR);
*r = EMULATE_DONE;
return true;
@@ -5219,6 +5246,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
if (emulation_type & EMULTYPE_SKIP) {
kvm_rip_write(vcpu, ctxt->_eip);
+ if (ctxt->eflags & X86_EFLAGS_RF)
+ kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
return EMULATE_DONE;
}
@@ -5269,13 +5298,22 @@ restart:
r = EMULATE_DONE;
if (writeback) {
+ unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
kvm_rip_write(vcpu, ctxt->eip);
if (r == EMULATE_DONE)
- kvm_vcpu_check_singlestep(vcpu, &r);
- kvm_set_rflags(vcpu, ctxt->eflags);
+ kvm_vcpu_check_singlestep(vcpu, rflags, &r);
+ __kvm_set_rflags(vcpu, ctxt->eflags);
+
+ /*
+ * For STI, interrupts are shadowed; so KVM_REQ_EVENT will
+ * do nothing, and it will be requested again as soon as
+ * the shadow expires. But we still need to check here,
+ * because POPF has no interrupt shadow.
+ */
+ if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
} else
vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
@@ -5666,7 +5704,6 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
u64 param, ingpa, outgpa, ret;
uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
bool fast, longmode;
- int cs_db, cs_l;
/*
* hypercall generates UD from non zero cpl and real mode
@@ -5677,8 +5714,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
return 0;
}
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
- longmode = is_long_mode(vcpu) && cs_l == 1;
+ longmode = is_64_bit_mode(vcpu);
if (!longmode) {
param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
@@ -5743,7 +5779,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
- int r = 1;
+ int op_64_bit, r = 1;
if (kvm_hv_hypercall_enabled(vcpu->kvm))
return kvm_hv_hypercall(vcpu);
@@ -5756,7 +5792,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
trace_kvm_hypercall(nr, a0, a1, a2, a3);
- if (!is_long_mode(vcpu)) {
+ op_64_bit = is_64_bit_mode(vcpu);
+ if (!op_64_bit) {
nr &= 0xFFFFFFFF;
a0 &= 0xFFFFFFFF;
a1 &= 0xFFFFFFFF;
@@ -5782,6 +5819,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
break;
}
out:
+ if (!op_64_bit)
+ ret = (u32)ret;
kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
++vcpu->stat.hypercalls;
return r;
@@ -5860,6 +5899,11 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
trace_kvm_inj_exception(vcpu->arch.exception.nr,
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code);
+
+ if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
+ __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
+ X86_EFLAGS_RF);
+
kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code,
@@ -5891,6 +5935,18 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
kvm_x86_ops->set_nmi(vcpu);
}
} else if (kvm_cpu_has_injectable_intr(vcpu)) {
+ /*
+ * Because interrupts can be injected asynchronously, we are
+ * calling check_nested_events again here to avoid a race condition.
+ * See https://lkml.org/lkml/2014/7/2/60 for discussion about this
+ * proposal and current concerns. Perhaps we should be setting
+ * KVM_REQ_EVENT only on certain events and not unconditionally?
+ */
+ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
+ r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+ if (r != 0)
+ return r;
+ }
if (kvm_x86_ops->interrupt_allowed(vcpu)) {
kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
false);
@@ -6839,9 +6895,11 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
atomic_set(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = 0;
vcpu->arch.nmi_injected = false;
+ kvm_clear_interrupt_queue(vcpu);
+ kvm_clear_exception_queue(vcpu);
memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
- vcpu->arch.dr6 = DR6_FIXED_1;
+ vcpu->arch.dr6 = DR6_INIT;
kvm_update_dr6(vcpu);
vcpu->arch.dr7 = DR7_FIXED_1;
kvm_update_dr7(vcpu);
@@ -6945,6 +7003,7 @@ int kvm_arch_hardware_enable(void *garbage)
*/
if (backwards_tsc) {
u64 delta_cyc = max_tsc - local_tsc;
+ backwards_tsc_observed = true;
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
vcpu->arch.tsc_offset_adjustment += delta_cyc;
@@ -7329,8 +7388,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
/*
* Write protect all pages for dirty logging.
- * Existing largepage mappings are destroyed here and new ones will
- * not be created until the end of the logging.
+ *
+ * All the sptes including the large sptes which point to this
+ * slot are set to readonly. We can not create any new large
+ * spte on this slot until the end of the logging.
+ *
+ * See the comments in fast_page_fault().
*/
if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
kvm_mmu_slot_remove_write_access(kvm, mem->slot);
@@ -7392,12 +7455,17 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_get_rflags);
-void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
rflags |= X86_EFLAGS_TF;
kvm_x86_ops->set_rflags(vcpu, rflags);
+}
+
+void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ __kvm_set_rflags(vcpu, rflags);
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_rflags);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 8c97bac9a895..306a1b77581f 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -47,6 +47,16 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu)
#endif
}
+static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
+{
+ int cs_db, cs_l;
+
+ if (!is_long_mode(vcpu))
+ return false;
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ return cs_l;
+}
+
static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
{
return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
@@ -108,6 +118,23 @@ static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
return false;
}
+static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ unsigned long val = kvm_register_read(vcpu, reg);
+
+ return is_64_bit_mode(vcpu) ? val : (u32)val;
+}
+
+static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg,
+ unsigned long val)
+{
+ if (!is_64_bit_mode(vcpu))
+ val = (u32)val;
+ return kvm_register_write(vcpu, reg, val);
+}
+
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index eabcb6e6a900..4d4f96a27638 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -16,7 +16,7 @@ clean-files := inat-tables.c
obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
-lib-y := delay.o misc.o
+lib-y := delay.o misc.o cmdline.o
lib-y += thunk_$(BITS).o
lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
lib-y += memcpy_$(BITS).o
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
new file mode 100644
index 000000000000..422db000d727
--- /dev/null
+++ b/arch/x86/lib/cmdline.c
@@ -0,0 +1,84 @@
+/*
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * Misc librarized functions for cmdline poking.
+ */
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <asm/setup.h>
+
+static inline int myisspace(u8 c)
+{
+ return c <= ' '; /* Close enough approximation */
+}
+
+/**
+ * Find a boolean option (like quiet,noapic,nosmp....)
+ *
+ * @cmdline: the cmdline string
+ * @option: option string to look for
+ *
+ * Returns the position of that @option (starts counting with 1)
+ * or 0 on not found.
+ */
+int cmdline_find_option_bool(const char *cmdline, const char *option)
+{
+ char c;
+ int len, pos = 0, wstart = 0;
+ const char *opptr = NULL;
+ enum {
+ st_wordstart = 0, /* Start of word/after whitespace */
+ st_wordcmp, /* Comparing this word */
+ st_wordskip, /* Miscompare, skip */
+ } state = st_wordstart;
+
+ if (!cmdline)
+ return -1; /* No command line */
+
+ len = min_t(int, strlen(cmdline), COMMAND_LINE_SIZE);
+ if (!len)
+ return 0;
+
+ while (len--) {
+ c = *(char *)cmdline++;
+ pos++;
+
+ switch (state) {
+ case st_wordstart:
+ if (!c)
+ return 0;
+ else if (myisspace(c))
+ break;
+
+ state = st_wordcmp;
+ opptr = option;
+ wstart = pos;
+ /* fall through */
+
+ case st_wordcmp:
+ if (!*opptr)
+ if (!c || myisspace(c))
+ return wstart;
+ else
+ state = st_wordskip;
+ else if (!c)
+ return 0;
+ else if (c != *opptr++)
+ state = st_wordskip;
+ else if (!len) /* last word and is matching */
+ return wstart;
+ break;
+
+ case st_wordskip:
+ if (!c)
+ return 0;
+ else if (myisspace(c))
+ state = st_wordstart;
+ break;
+ }
+ }
+
+ return 0; /* Buffer overrun */
+}
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
index 2930ae05d773..28f85c916712 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/lib/thunk_32.S
@@ -4,8 +4,8 @@
* (inspired by Andi Kleen's thunk_64.S)
* Subject to the GNU public license, v.2. No warranty of any kind.
*/
-
#include <linux/linkage.h>
+ #include <asm/asm.h>
#ifdef CONFIG_TRACE_IRQFLAGS
/* put return address in eax (arg1) */
@@ -22,6 +22,7 @@
popl %ecx
popl %eax
ret
+ _ASM_NOKPROBE(\name)
.endm
thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index a63efd6bb6a5..92d9feaff42b 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -8,6 +8,7 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/calling.h>
+#include <asm/asm.h>
/* rdi: arg1 ... normal C conventions. rax is saved/restored. */
.macro THUNK name, func, put_ret_addr_in_rdi=0
@@ -25,6 +26,7 @@
call \func
jmp restore
CFI_ENDPROC
+ _ASM_NOKPROBE(\name)
.endm
#ifdef CONFIG_TRACE_IRQFLAGS
@@ -43,3 +45,4 @@ restore:
RESTORE_ARGS
ret
CFI_ENDPROC
+ _ASM_NOKPROBE(restore)
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 20621d753d5f..167ffcac16ed 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -30,12 +30,14 @@ struct pg_state {
unsigned long start_address;
unsigned long current_address;
const struct addr_marker *marker;
+ unsigned long lines;
bool to_dmesg;
};
struct addr_marker {
unsigned long start_address;
const char *name;
+ unsigned long max_lines;
};
/* indices for address_markers; keep sync'd w/ address_markers below */
@@ -46,6 +48,7 @@ enum address_markers_idx {
LOW_KERNEL_NR,
VMALLOC_START_NR,
VMEMMAP_START_NR,
+ ESPFIX_START_NR,
HIGH_KERNEL_NR,
MODULES_VADDR_NR,
MODULES_END_NR,
@@ -68,6 +71,7 @@ static struct addr_marker address_markers[] = {
{ PAGE_OFFSET, "Low Kernel Mapping" },
{ VMALLOC_START, "vmalloc() Area" },
{ VMEMMAP_START, "Vmemmap" },
+ { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
{ __START_KERNEL_map, "High Kernel Mapping" },
{ MODULES_VADDR, "Modules" },
{ MODULES_END, "End Modules" },
@@ -182,7 +186,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,
pgprot_t new_prot, int level)
{
pgprotval_t prot, cur;
- static const char units[] = "KMGTPE";
+ static const char units[] = "BKMGTPE";
/*
* If we have a "break" in the series, we need to flush the state that
@@ -197,6 +201,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,
st->current_prot = new_prot;
st->level = level;
st->marker = address_markers;
+ st->lines = 0;
pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
st->marker->name);
} else if (prot != cur || level != st->level ||
@@ -208,17 +213,24 @@ static void note_page(struct seq_file *m, struct pg_state *st,
/*
* Now print the actual finished series
*/
- pt_dump_seq_printf(m, st->to_dmesg, "0x%0*lx-0x%0*lx ",
- width, st->start_address,
- width, st->current_address);
-
- delta = (st->current_address - st->start_address) >> 10;
- while (!(delta & 1023) && unit[1]) {
- delta >>= 10;
- unit++;
+ if (!st->marker->max_lines ||
+ st->lines < st->marker->max_lines) {
+ pt_dump_seq_printf(m, st->to_dmesg,
+ "0x%0*lx-0x%0*lx ",
+ width, st->start_address,
+ width, st->current_address);
+
+ delta = st->current_address - st->start_address;
+ while (!(delta & 1023) && unit[1]) {
+ delta >>= 10;
+ unit++;
+ }
+ pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
+ delta, *unit);
+ printk_prot(m, st->current_prot, st->level,
+ st->to_dmesg);
}
- pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ", delta, *unit);
- printk_prot(m, st->current_prot, st->level, st->to_dmesg);
+ st->lines++;
/*
* We print markers for special areas of address space,
@@ -226,7 +238,17 @@ static void note_page(struct seq_file *m, struct pg_state *st,
* This helps in the interpretation.
*/
if (st->current_address >= st->marker[1].start_address) {
+ if (st->marker->max_lines &&
+ st->lines > st->marker->max_lines) {
+ unsigned long nskip =
+ st->lines - st->marker->max_lines;
+ pt_dump_seq_printf(m, st->to_dmesg,
+ "... %lu entr%s skipped ... \n",
+ nskip,
+ nskip == 1 ? "y" : "ies");
+ }
st->marker++;
+ st->lines = 0;
pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
st->marker->name);
}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8e5722992677..a24194681513 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -8,7 +8,7 @@
#include <linux/kdebug.h> /* oops_begin/end, ... */
#include <linux/module.h> /* search_exception_table */
#include <linux/bootmem.h> /* max_low_pfn */
-#include <linux/kprobes.h> /* __kprobes, ... */
+#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
#include <linux/perf_event.h> /* perf_sw_event */
#include <linux/hugetlb.h> /* hstate_index_to_shift */
@@ -18,7 +18,8 @@
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
-#include <asm/fixmap.h> /* VSYSCALL_START */
+#include <asm/fixmap.h> /* VSYSCALL_ADDR */
+#include <asm/vsyscall.h> /* emulate_vsyscall */
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@@ -45,7 +46,7 @@ enum x86_pf_error_code {
* Returns 0 if mmiotrace is disabled, or if the fault is not
* handled by mmiotrace:
*/
-static inline int __kprobes
+static nokprobe_inline int
kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
if (unlikely(is_kmmio_active()))
@@ -54,7 +55,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)
return 0;
}
-static inline int __kprobes kprobes_fault(struct pt_regs *regs)
+static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
{
int ret = 0;
@@ -261,7 +262,7 @@ void vmalloc_sync_all(void)
*
* Handle a fault on the vmalloc or module mapping area
*/
-static noinline __kprobes int vmalloc_fault(unsigned long address)
+static noinline int vmalloc_fault(unsigned long address)
{
unsigned long pgd_paddr;
pmd_t *pmd_k;
@@ -291,6 +292,7 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
return 0;
}
+NOKPROBE_SYMBOL(vmalloc_fault);
/*
* Did it hit the DOS screen memory VA from vm86 mode?
@@ -358,7 +360,7 @@ void vmalloc_sync_all(void)
*
* This assumes no large pages in there.
*/
-static noinline __kprobes int vmalloc_fault(unsigned long address)
+static noinline int vmalloc_fault(unsigned long address)
{
pgd_t *pgd, *pgd_ref;
pud_t *pud, *pud_ref;
@@ -425,6 +427,7 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
return 0;
}
+NOKPROBE_SYMBOL(vmalloc_fault);
#ifdef CONFIG_CPU_SUP_AMD
static const char errata93_warning[] =
@@ -574,6 +577,8 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
static const char nx_warning[] = KERN_CRIT
"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
+static const char smep_warning[] = KERN_CRIT
+"unable to execute userspace code (SMEP?) (uid: %d)\n";
static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code,
@@ -594,6 +599,10 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
if (pte && pte_present(*pte) && !pte_exec(*pte))
printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
+ if (pte && pte_present(*pte) && pte_exec(*pte) &&
+ (pgd_flags(*pgd) & _PAGE_USER) &&
+ (read_cr4() & X86_CR4_SMEP))
+ printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
}
printk(KERN_ALERT "BUG: unable to handle kernel ");
@@ -771,7 +780,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
* emulation.
*/
if (unlikely((error_code & PF_INSTR) &&
- ((address & ~0xfff) == VSYSCALL_START))) {
+ ((address & ~0xfff) == VSYSCALL_ADDR))) {
if (emulate_vsyscall(regs, address))
return;
}
@@ -927,7 +936,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
* There are no security implications to leaving a stale TLB when
* increasing the permissions on a page.
*/
-static noinline __kprobes int
+static noinline int
spurious_fault(unsigned long error_code, unsigned long address)
{
pgd_t *pgd;
@@ -975,6 +984,7 @@ spurious_fault(unsigned long error_code, unsigned long address)
return ret;
}
+NOKPROBE_SYMBOL(spurious_fault);
int show_unhandled_signals = 1;
@@ -1030,7 +1040,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
* {,trace_}do_page_fault() have notrace on. Having this an actual function
* guarantees there's a function trace entry.
*/
-static void __kprobes noinline
+static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
{
@@ -1208,7 +1218,8 @@ good_area:
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
- * the fault:
+ * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
+ * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
*/
fault = handle_mm_fault(mm, vma, address, flags);
@@ -1253,8 +1264,9 @@ good_area:
up_read(&mm->mmap_sem);
}
+NOKPROBE_SYMBOL(__do_page_fault);
-dotraplinkage void __kprobes notrace
+dotraplinkage void notrace
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
unsigned long address = read_cr2(); /* Get the faulting address */
@@ -1272,10 +1284,12 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
__do_page_fault(regs, error_code, address);
exception_exit(prev_state);
}
+NOKPROBE_SYMBOL(do_page_fault);
#ifdef CONFIG_TRACING
-static void trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
- unsigned long error_code)
+static nokprobe_inline void
+trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
+ unsigned long error_code)
{
if (user_mode(regs))
trace_page_fault_user(address, regs, error_code);
@@ -1283,7 +1297,7 @@ static void trace_page_fault_entries(unsigned long address, struct pt_regs *regs
trace_page_fault_kernel(address, regs, error_code);
}
-dotraplinkage void __kprobes notrace
+dotraplinkage void notrace
trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
/*
@@ -1300,4 +1314,5 @@ trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
__do_page_fault(regs, error_code, address);
exception_exit(prev_state);
}
+NOKPROBE_SYMBOL(trace_do_page_fault);
#endif /* CONFIG_TRACING */
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 8c9f647ff9e1..8b977ebf9388 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -58,11 +58,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
{
return NULL;
}
-
-int pmd_huge_support(void)
-{
- return 0;
-}
#else
struct page *
@@ -80,11 +75,6 @@ int pud_huge(pud_t pud)
{
return !!(pud_val(pud) & _PAGE_PSE);
}
-
-int pmd_huge_support(void)
-{
- return 1;
-}
#endif
#ifdef CONFIG_HUGETLB_PAGE
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index f97130618113..66dba36f2343 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -18,6 +18,13 @@
#include <asm/dma.h> /* for MAX_DMA_PFN */
#include <asm/microcode.h>
+/*
+ * We need to define the tracepoints somewhere, and tlb.c
+ * is only compied when SMP=y.
+ */
+#define CREATE_TRACE_POINTS
+#include <trace/events/tlb.h>
+
#include "mm_internal.h"
static unsigned long __initdata pgt_buf_start;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index e39504878aec..7d05565ba781 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -825,7 +825,8 @@ void __init mem_init(void)
int arch_add_memory(int nid, u64 start, u64 size)
{
struct pglist_data *pgdata = NODE_DATA(nid);
- struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
+ struct zone *zone = pgdata->node_zones +
+ zone_for_memory(nid, start, size, ZONE_HIGHMEM);
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index f35c66c5959a..5621c47d7a1a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -691,7 +691,8 @@ static void update_end_of_memory_vars(u64 start, u64 size)
int arch_add_memory(int nid, u64 start, u64 size)
{
struct pglist_data *pgdat = NODE_DATA(nid);
- struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
+ struct zone *zone = pgdat->node_zones +
+ zone_for_memory(nid, start, size, ZONE_NORMAL);
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
@@ -1055,8 +1056,8 @@ void __init mem_init(void)
after_bootmem = 1;
/* Register memory areas for /proc/kcore */
- kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
- VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);
+ kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR,
+ PAGE_SIZE, KCORE_OTHER);
mem_init_print_info(NULL);
}
@@ -1185,11 +1186,19 @@ int kern_addr_valid(unsigned long addr)
* covers the 64bit vsyscall page now. 32bit has a real VMA now and does
* not need special handling anymore:
*/
+static const char *gate_vma_name(struct vm_area_struct *vma)
+{
+ return "[vsyscall]";
+}
+static struct vm_operations_struct gate_vma_ops = {
+ .name = gate_vma_name,
+};
static struct vm_area_struct gate_vma = {
- .vm_start = VSYSCALL_START,
- .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
+ .vm_start = VSYSCALL_ADDR,
+ .vm_end = VSYSCALL_ADDR + PAGE_SIZE,
.vm_page_prot = PAGE_READONLY_EXEC,
- .vm_flags = VM_READ | VM_EXEC
+ .vm_flags = VM_READ | VM_EXEC,
+ .vm_ops = &gate_vma_ops,
};
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
@@ -1218,29 +1227,46 @@ int in_gate_area(struct mm_struct *mm, unsigned long addr)
*/
int in_gate_area_no_mm(unsigned long addr)
{
- return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
+ return (addr & PAGE_MASK) == VSYSCALL_ADDR;
}
-const char *arch_vma_name(struct vm_area_struct *vma)
+static unsigned long probe_memory_block_size(void)
{
- if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
- return "[vdso]";
- if (vma == &gate_vma)
- return "[vsyscall]";
- return NULL;
-}
+ /* start from 2g */
+ unsigned long bz = 1UL<<31;
#ifdef CONFIG_X86_UV
-unsigned long memory_block_size_bytes(void)
-{
if (is_uv_system()) {
printk(KERN_INFO "UV: memory block size 2GB\n");
return 2UL * 1024 * 1024 * 1024;
}
- return MIN_MEMORY_BLOCK_SIZE;
-}
#endif
+ /* less than 64g installed */
+ if ((max_pfn << PAGE_SHIFT) < (16UL << 32))
+ return MIN_MEMORY_BLOCK_SIZE;
+
+ /* get the tail size */
+ while (bz > MIN_MEMORY_BLOCK_SIZE) {
+ if (!((max_pfn << PAGE_SHIFT) & (bz - 1)))
+ break;
+ bz >>= 1;
+ }
+
+ printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20);
+
+ return bz;
+}
+
+static unsigned long memory_block_size_probed;
+unsigned long memory_block_size_bytes(void)
+{
+ if (!memory_block_size_probed)
+ memory_block_size_probed = probe_memory_block_size();
+
+ return memory_block_size_probed;
+}
+
#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
* Initialise the sparsemem vmemmap using huge-pages at the PMD level.
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 597ac155c91c..baff1da354e0 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -50,6 +50,21 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
return err;
}
+static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
+ void *arg)
+{
+ unsigned long i;
+
+ for (i = 0; i < nr_pages; ++i)
+ if (pfn_valid(start_pfn + i) &&
+ !PageReserved(pfn_to_page(start_pfn + i)))
+ return 1;
+
+ WARN_ONCE(1, "ioremap on RAM pfn 0x%lx\n", start_pfn);
+
+ return 0;
+}
+
/*
* Remap an arbitrary physical address space into the kernel virtual
* address space. Needed when the kernel wants to access high addresses
@@ -93,14 +108,11 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
+ pfn = phys_addr >> PAGE_SHIFT;
last_pfn = last_addr >> PAGE_SHIFT;
- for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) {
- int is_ram = page_is_ram(pfn);
-
- if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
- return NULL;
- WARN_ON_ONCE(is_ram);
- }
+ if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
+ __ioremap_check_ram) == 1)
+ return NULL;
/*
* Mappings have to be page-aligned
@@ -355,6 +367,12 @@ void __init early_ioremap_init(void)
{
pmd_t *pmd;
+#ifdef CONFIG_X86_64
+ BUILD_BUG_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1));
+#else
+ WARN_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1));
+#endif
+
early_ioremap_setup();
pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 1d045f9c390f..a32b706c401a 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -559,7 +559,7 @@ static void __init numa_clear_kernel_node_hotplug(void)
int i, nid;
nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
unsigned long start, end;
- struct memblock_type *type = &memblock.reserved;
+ struct memblock_region *r;
/*
* At this time, all memory regions reserved by memblock are
@@ -573,8 +573,8 @@ static void __init numa_clear_kernel_node_hotplug(void)
}
/* Mark all kernel nodes. */
- for (i = 0; i < type->cnt; i++)
- node_set(type->regions[i].nid, numa_kernel_nodes);
+ for_each_memblock(reserved, r)
+ node_set(r->nid, numa_kernel_nodes);
/* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
for (i = 0; i < numa_meminfo.nr_blks; i++) {
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 461bc8289024..6629f397b467 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -35,7 +35,7 @@ enum {
static int pte_testbit(pte_t pte)
{
- return pte_flags(pte) & _PAGE_UNUSED1;
+ return pte_flags(pte) & _PAGE_SOFTW1;
}
struct split_state {
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index c96314abd144..6fb6927f9e76 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -399,13 +399,20 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
int ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
- int young;
-
- young = ptep_test_and_clear_young(vma, address, ptep);
- if (young)
- flush_tlb_page(vma, address);
-
- return young;
+ /*
+ * On x86 CPUs, clearing the accessed bit without a TLB flush
+ * doesn't cause data corruption. [ It could cause incorrect
+ * page aging and the (mistaken) reclaim of hot pages, but the
+ * chance of that should be relatively low. ]
+ *
+ * So as a performance optimization don't flush the TLB when
+ * clearing the accessed bit, it will eventually be flushed by
+ * a context switch or a VM operation anyway. [ In the rare
+ * event of it not getting flushed for a long time the delay
+ * shouldn't really matter because there's no real memory
+ * pressure for swapout to react to. ]
+ */
+ return ptep_test_and_clear_young(vma, address, ptep);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -449,9 +456,9 @@ void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
BUG_ON(fixmaps_set > 0);
- printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
- (int)-reserve);
- __FIXADDR_TOP = -reserve - PAGE_SIZE;
+ __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
+ printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
+ -reserve, __FIXADDR_TOP + PAGE_SIZE);
#endif
}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index dd8dda167a24..1fe33987de02 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -49,6 +49,7 @@ void leave_mm(int cpu)
if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
load_cr3(swapper_pg_dir);
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
}
}
EXPORT_SYMBOL_GPL(leave_mm);
@@ -102,20 +103,24 @@ static void flush_tlb_func(void *info)
if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
return;
+ if (!f->flush_end)
+ f->flush_end = f->flush_start + PAGE_SIZE;
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
- if (f->flush_end == TLB_FLUSH_ALL)
+ if (f->flush_end == TLB_FLUSH_ALL) {
local_flush_tlb();
- else if (!f->flush_end)
- __flush_tlb_single(f->flush_start);
- else {
+ trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
+ } else {
unsigned long addr;
+ unsigned long nr_pages =
+ f->flush_end - f->flush_start / PAGE_SIZE;
addr = f->flush_start;
while (addr < f->flush_end) {
__flush_tlb_single(addr);
addr += PAGE_SIZE;
}
+ trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
}
} else
leave_mm(smp_processor_id());
@@ -153,46 +158,45 @@ void flush_tlb_current_task(void)
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
+ trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
preempt_enable();
}
+/*
+ * See Documentation/x86/tlb.txt for details. We choose 33
+ * because it is large enough to cover the vast majority (at
+ * least 95%) of allocations, and is small enough that we are
+ * confident it will not cause too much overhead. Each single
+ * flush is about 100 ns, so this caps the maximum overhead at
+ * _about_ 3,000 ns.
+ *
+ * This is in units of pages.
+ */
+unsigned long tlb_single_page_flush_ceiling = 33;
+
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag)
{
unsigned long addr;
- unsigned act_entries, tlb_entries = 0;
- unsigned long nr_base_pages;
+ /* do a global flush by default */
+ unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
preempt_disable();
if (current->active_mm != mm)
- goto flush_all;
+ goto out;
if (!current->mm) {
leave_mm(smp_processor_id());
- goto flush_all;
+ goto out;
}
- if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1
- || vmflag & VM_HUGETLB) {
- local_flush_tlb();
- goto flush_all;
- }
-
- /* In modern CPU, last level tlb used for both data/ins */
- if (vmflag & VM_EXEC)
- tlb_entries = tlb_lli_4k[ENTRIES];
- else
- tlb_entries = tlb_lld_4k[ENTRIES];
+ if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
+ base_pages_to_flush = (end - start) >> PAGE_SHIFT;
- /* Assume all of TLB entries was occupied by this task */
- act_entries = tlb_entries >> tlb_flushall_shift;
- act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
- nr_base_pages = (end - start) >> PAGE_SHIFT;
-
- /* tlb_flushall_shift is on balance point, details in commit log */
- if (nr_base_pages > act_entries) {
+ if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
+ base_pages_to_flush = TLB_FLUSH_ALL;
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
} else {
@@ -201,17 +205,15 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
__flush_tlb_single(addr);
}
-
- if (cpumask_any_but(mm_cpumask(mm),
- smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, start, end);
- preempt_enable();
- return;
}
-
-flush_all:
+ trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush);
+out:
+ if (base_pages_to_flush == TLB_FLUSH_ALL) {
+ start = 0UL;
+ end = TLB_FLUSH_ALL;
+ }
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
+ flush_tlb_others(mm_cpumask(mm), mm, start, end);
preempt_enable();
}
@@ -260,32 +262,26 @@ static void do_kernel_range_flush(void *info)
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
- unsigned act_entries;
- struct flush_tlb_info info;
-
- /* In modern CPU, last level tlb used for both data/ins */
- act_entries = tlb_lld_4k[ENTRIES];
/* Balance as user space task's flush, a bit conservative */
- if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 ||
- (end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
-
+ if (end == TLB_FLUSH_ALL ||
+ (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
on_each_cpu(do_flush_tlb_all, NULL, 1);
- else {
+ } else {
+ struct flush_tlb_info info;
info.flush_start = start;
info.flush_end = end;
on_each_cpu(do_kernel_range_flush, &info, 1);
}
}
-#ifdef CONFIG_DEBUG_TLBFLUSH
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
char buf[32];
unsigned int len;
- len = sprintf(buf, "%hd\n", tlb_flushall_shift);
+ len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}
@@ -294,20 +290,20 @@ static ssize_t tlbflush_write_file(struct file *file,
{
char buf[32];
ssize_t len;
- s8 shift;
+ int ceiling;
len = min(count, sizeof(buf) - 1);
if (copy_from_user(buf, user_buf, len))
return -EFAULT;
buf[len] = '\0';
- if (kstrtos8(buf, 0, &shift))
+ if (kstrtoint(buf, 0, &ceiling))
return -EINVAL;
- if (shift < -1 || shift >= BITS_PER_LONG)
+ if (ceiling < 0)
return -EINVAL;
- tlb_flushall_shift = shift;
+ tlb_single_page_flush_ceiling = ceiling;
return count;
}
@@ -317,11 +313,10 @@ static const struct file_operations fops_tlbflush = {
.llseek = default_llseek,
};
-static int __init create_tlb_flushall_shift(void)
+static int __init create_tlb_single_page_flush_ceiling(void)
{
- debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR,
+ debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
arch_debugfs_dir, NULL, &fops_tlbflush);
return 0;
}
-late_initcall(create_tlb_flushall_shift);
-#endif
+late_initcall(create_tlb_single_page_flush_ceiling);
diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S
index 01495755701b..6440221ced0d 100644
--- a/arch/x86/net/bpf_jit.S
+++ b/arch/x86/net/bpf_jit.S
@@ -12,13 +12,16 @@
/*
* Calling convention :
- * rdi : skb pointer
+ * rbx : skb pointer (callee saved)
* esi : offset of byte(s) to fetch in skb (can be scratched)
- * r8 : copy of skb->data
+ * r10 : copy of skb->data
* r9d : hlen = skb->len - skb->data_len
*/
-#define SKBDATA %r8
+#define SKBDATA %r10
#define SKF_MAX_NEG_OFF $(-0x200000) /* SKF_LL_OFF from filter.h */
+#define MAX_BPF_STACK (512 /* from filter.h */ + \
+ 32 /* space for rbx,r13,r14,r15 */ + \
+ 8 /* space for skb_copy_bits */)
sk_load_word:
.globl sk_load_word
@@ -68,53 +71,31 @@ sk_load_byte_positive_offset:
movzbl (SKBDATA,%rsi),%eax
ret
-/**
- * sk_load_byte_msh - BPF_S_LDX_B_MSH helper
- *
- * Implements BPF_S_LDX_B_MSH : ldxb 4*([offset]&0xf)
- * Must preserve A accumulator (%eax)
- * Inputs : %esi is the offset value
- */
-sk_load_byte_msh:
- .globl sk_load_byte_msh
- test %esi,%esi
- js bpf_slow_path_byte_msh_neg
-
-sk_load_byte_msh_positive_offset:
- .globl sk_load_byte_msh_positive_offset
- cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte_msh */
- jle bpf_slow_path_byte_msh
- movzbl (SKBDATA,%rsi),%ebx
- and $15,%bl
- shl $2,%bl
- ret
-
/* rsi contains offset and can be scratched */
#define bpf_slow_path_common(LEN) \
- push %rdi; /* save skb */ \
+ mov %rbx, %rdi; /* arg1 == skb */ \
push %r9; \
push SKBDATA; \
/* rsi already has offset */ \
mov $LEN,%ecx; /* len */ \
- lea -12(%rbp),%rdx; \
+ lea - MAX_BPF_STACK + 32(%rbp),%rdx; \
call skb_copy_bits; \
test %eax,%eax; \
pop SKBDATA; \
- pop %r9; \
- pop %rdi
+ pop %r9;
bpf_slow_path_word:
bpf_slow_path_common(4)
js bpf_error
- mov -12(%rbp),%eax
+ mov - MAX_BPF_STACK + 32(%rbp),%eax
bswap %eax
ret
bpf_slow_path_half:
bpf_slow_path_common(2)
js bpf_error
- mov -12(%rbp),%ax
+ mov - MAX_BPF_STACK + 32(%rbp),%ax
rol $8,%ax
movzwl %ax,%eax
ret
@@ -122,21 +103,11 @@ bpf_slow_path_half:
bpf_slow_path_byte:
bpf_slow_path_common(1)
js bpf_error
- movzbl -12(%rbp),%eax
- ret
-
-bpf_slow_path_byte_msh:
- xchg %eax,%ebx /* dont lose A , X is about to be scratched */
- bpf_slow_path_common(1)
- js bpf_error
- movzbl -12(%rbp),%eax
- and $15,%al
- shl $2,%al
- xchg %eax,%ebx
+ movzbl - MAX_BPF_STACK + 32(%rbp),%eax
ret
#define sk_negative_common(SIZE) \
- push %rdi; /* save skb */ \
+ mov %rbx, %rdi; /* arg1 == skb */ \
push %r9; \
push SKBDATA; \
/* rsi already has offset */ \
@@ -145,10 +116,8 @@ bpf_slow_path_byte_msh:
test %rax,%rax; \
pop SKBDATA; \
pop %r9; \
- pop %rdi; \
jz bpf_error
-
bpf_slow_path_word_neg:
cmp SKF_MAX_NEG_OFF, %esi /* test range */
jl bpf_error /* offset lower -> error */
@@ -179,22 +148,12 @@ sk_load_byte_negative_offset:
movzbl (%rax), %eax
ret
-bpf_slow_path_byte_msh_neg:
- cmp SKF_MAX_NEG_OFF, %esi
- jl bpf_error
-sk_load_byte_msh_negative_offset:
- .globl sk_load_byte_msh_negative_offset
- xchg %eax,%ebx /* dont lose A , X is about to be scratched */
- sk_negative_common(1)
- movzbl (%rax),%eax
- and $15,%al
- shl $2,%al
- xchg %eax,%ebx
- ret
-
bpf_error:
# force a return 0 from jit handler
- xor %eax,%eax
- mov -8(%rbp),%rbx
+ xor %eax,%eax
+ mov - MAX_BPF_STACK(%rbp),%rbx
+ mov - MAX_BPF_STACK + 8(%rbp),%r13
+ mov - MAX_BPF_STACK + 16(%rbp),%r14
+ mov - MAX_BPF_STACK + 24(%rbp),%r15
leaveq
ret
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 6d5663a599a7..5c8cb8043c5a 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1,6 +1,7 @@
/* bpf_jit_comp.c : BPF JIT compiler
*
* Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com)
+ * Internal BPF Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -14,28 +15,16 @@
#include <linux/if_vlan.h>
#include <linux/random.h>
-/*
- * Conventions :
- * EAX : BPF A accumulator
- * EBX : BPF X accumulator
- * RDI : pointer to skb (first argument given to JIT function)
- * RBP : frame pointer (even if CONFIG_FRAME_POINTER=n)
- * ECX,EDX,ESI : scratch registers
- * r9d : skb->len - skb->data_len (headlen)
- * r8 : skb->data
- * -8(RBP) : saved RBX value
- * -16(RBP)..-80(RBP) : BPF_MEMWORDS values
- */
int bpf_jit_enable __read_mostly;
/*
* assembly code in arch/x86/net/bpf_jit.S
*/
-extern u8 sk_load_word[], sk_load_half[], sk_load_byte[], sk_load_byte_msh[];
+extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
extern u8 sk_load_word_positive_offset[], sk_load_half_positive_offset[];
-extern u8 sk_load_byte_positive_offset[], sk_load_byte_msh_positive_offset[];
+extern u8 sk_load_byte_positive_offset[];
extern u8 sk_load_word_negative_offset[], sk_load_half_negative_offset[];
-extern u8 sk_load_byte_negative_offset[], sk_load_byte_msh_negative_offset[];
+extern u8 sk_load_byte_negative_offset[];
static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
{
@@ -56,30 +45,44 @@ static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
#define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
#define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
-#define EMIT1_off32(b1, off) do { EMIT1(b1); EMIT(off, 4);} while (0)
-
-#define CLEAR_A() EMIT2(0x31, 0xc0) /* xor %eax,%eax */
-#define CLEAR_X() EMIT2(0x31, 0xdb) /* xor %ebx,%ebx */
+#define EMIT1_off32(b1, off) \
+ do {EMIT1(b1); EMIT(off, 4); } while (0)
+#define EMIT2_off32(b1, b2, off) \
+ do {EMIT2(b1, b2); EMIT(off, 4); } while (0)
+#define EMIT3_off32(b1, b2, b3, off) \
+ do {EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
+#define EMIT4_off32(b1, b2, b3, b4, off) \
+ do {EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
static inline bool is_imm8(int value)
{
return value <= 127 && value >= -128;
}
-static inline bool is_near(int offset)
+static inline bool is_simm32(s64 value)
{
- return offset <= 127 && offset >= -128;
+ return value == (s64) (s32) value;
}
-#define EMIT_JMP(offset) \
-do { \
- if (offset) { \
- if (is_near(offset)) \
- EMIT2(0xeb, offset); /* jmp .+off8 */ \
- else \
- EMIT1_off32(0xe9, offset); /* jmp .+off32 */ \
- } \
-} while (0)
+/* mov dst, src */
+#define EMIT_mov(DST, SRC) \
+ do {if (DST != SRC) \
+ EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC)); \
+ } while (0)
+
+static int bpf_size_to_x86_bytes(int bpf_size)
+{
+ if (bpf_size == BPF_W)
+ return 4;
+ else if (bpf_size == BPF_H)
+ return 2;
+ else if (bpf_size == BPF_B)
+ return 1;
+ else if (bpf_size == BPF_DW)
+ return 4; /* imm32 */
+ else
+ return 0;
+}
/* list of x86 cond jumps opcodes (. + s8)
* Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
@@ -90,27 +93,8 @@ do { \
#define X86_JNE 0x75
#define X86_JBE 0x76
#define X86_JA 0x77
-
-#define EMIT_COND_JMP(op, offset) \
-do { \
- if (is_near(offset)) \
- EMIT2(op, offset); /* jxx .+off8 */ \
- else { \
- EMIT2(0x0f, op + 0x10); \
- EMIT(offset, 4); /* jxx .+off32 */ \
- } \
-} while (0)
-
-#define COND_SEL(CODE, TOP, FOP) \
- case CODE: \
- t_op = TOP; \
- f_op = FOP; \
- goto cond_branch
-
-
-#define SEEN_DATAREF 1 /* might call external helpers */
-#define SEEN_XREG 2 /* ebx is used */
-#define SEEN_MEM 4 /* use mem[] for temporary storage */
+#define X86_JGE 0x7D
+#define X86_JG 0x7F
static inline void bpf_flush_icache(void *start, void *end)
{
@@ -125,26 +109,6 @@ static inline void bpf_flush_icache(void *start, void *end)
#define CHOOSE_LOAD_FUNC(K, func) \
((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)
-/* Helper to find the offset of pkt_type in sk_buff
- * We want to make sure its still a 3bit field starting at a byte boundary.
- */
-#define PKT_TYPE_MAX 7
-static int pkt_type_offset(void)
-{
- struct sk_buff skb_probe = {
- .pkt_type = ~0,
- };
- char *ct = (char *)&skb_probe;
- unsigned int off;
-
- for (off = 0; off < sizeof(struct sk_buff); off++) {
- if (ct[off] == PKT_TYPE_MAX)
- return off;
- }
- pr_err_once("Please fix pkt_type_offset(), as pkt_type couldn't be found\n");
- return -1;
-}
-
struct bpf_binary_header {
unsigned int pages;
/* Note : for security reasons, bpf code will follow a randomly
@@ -178,583 +142,771 @@ static struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen,
return header;
}
-void bpf_jit_compile(struct sk_filter *fp)
+/* pick a register outside of BPF range for JIT internal work */
+#define AUX_REG (MAX_BPF_REG + 1)
+
+/* the following table maps BPF registers to x64 registers.
+ * x64 register r12 is unused, since if used as base address register
+ * in load/store instructions, it always needs an extra byte of encoding
+ */
+static const int reg2hex[] = {
+ [BPF_REG_0] = 0, /* rax */
+ [BPF_REG_1] = 7, /* rdi */
+ [BPF_REG_2] = 6, /* rsi */
+ [BPF_REG_3] = 2, /* rdx */
+ [BPF_REG_4] = 1, /* rcx */
+ [BPF_REG_5] = 0, /* r8 */
+ [BPF_REG_6] = 3, /* rbx callee saved */
+ [BPF_REG_7] = 5, /* r13 callee saved */
+ [BPF_REG_8] = 6, /* r14 callee saved */
+ [BPF_REG_9] = 7, /* r15 callee saved */
+ [BPF_REG_FP] = 5, /* rbp readonly */
+ [AUX_REG] = 3, /* r11 temp register */
+};
+
+/* is_ereg() == true if BPF register 'reg' maps to x64 r8..r15
+ * which need extra byte of encoding.
+ * rax,rcx,...,rbp have simpler encoding
+ */
+static inline bool is_ereg(u32 reg)
{
- u8 temp[64];
- u8 *prog;
- unsigned int proglen, oldproglen = 0;
- int ilen, i;
- int t_offset, f_offset;
- u8 t_op, f_op, seen = 0, pass;
- u8 *image = NULL;
- struct bpf_binary_header *header = NULL;
- u8 *func;
- int pc_ret0 = -1; /* bpf index of first RET #0 instruction (if any) */
- unsigned int cleanup_addr; /* epilogue code offset */
- unsigned int *addrs;
- const struct sock_filter *filter = fp->insns;
- int flen = fp->len;
+ if (reg == BPF_REG_5 || reg == AUX_REG ||
+ (reg >= BPF_REG_7 && reg <= BPF_REG_9))
+ return true;
+ else
+ return false;
+}
- if (!bpf_jit_enable)
- return;
+/* add modifiers if 'reg' maps to x64 registers r8..r15 */
+static inline u8 add_1mod(u8 byte, u32 reg)
+{
+ if (is_ereg(reg))
+ byte |= 1;
+ return byte;
+}
- addrs = kmalloc(flen * sizeof(*addrs), GFP_KERNEL);
- if (addrs == NULL)
- return;
+static inline u8 add_2mod(u8 byte, u32 r1, u32 r2)
+{
+ if (is_ereg(r1))
+ byte |= 1;
+ if (is_ereg(r2))
+ byte |= 4;
+ return byte;
+}
- /* Before first pass, make a rough estimation of addrs[]
- * each bpf instruction is translated to less than 64 bytes
+/* encode 'dst_reg' register into x64 opcode 'byte' */
+static inline u8 add_1reg(u8 byte, u32 dst_reg)
+{
+ return byte + reg2hex[dst_reg];
+}
+
+/* encode 'dst_reg' and 'src_reg' registers into x64 opcode 'byte' */
+static inline u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
+{
+ return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3);
+}
+
+struct jit_context {
+ unsigned int cleanup_addr; /* epilogue code offset */
+ bool seen_ld_abs;
+};
+
+static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
+ int oldproglen, struct jit_context *ctx)
+{
+ struct bpf_insn *insn = bpf_prog->insnsi;
+ int insn_cnt = bpf_prog->len;
+ u8 temp[64];
+ int i;
+ int proglen = 0;
+ u8 *prog = temp;
+ int stacksize = MAX_BPF_STACK +
+ 32 /* space for rbx, r13, r14, r15 */ +
+ 8 /* space for skb_copy_bits() buffer */;
+
+ EMIT1(0x55); /* push rbp */
+ EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */
+
+ /* sub rsp, stacksize */
+ EMIT3_off32(0x48, 0x81, 0xEC, stacksize);
+
+ /* all classic BPF filters use R6(rbx) save it */
+
+ /* mov qword ptr [rbp-X],rbx */
+ EMIT3_off32(0x48, 0x89, 0x9D, -stacksize);
+
+ /* bpf_convert_filter() maps classic BPF register X to R7 and uses R8
+ * as temporary, so all tcpdump filters need to spill/fill R7(r13) and
+ * R8(r14). R9(r15) spill could be made conditional, but there is only
+ * one 'bpf_error' return path out of helper functions inside bpf_jit.S
+ * The overhead of extra spill is negligible for any filter other
+ * than synthetic ones. Therefore not worth adding complexity.
*/
- for (proglen = 0, i = 0; i < flen; i++) {
- proglen += 64;
- addrs[i] = proglen;
+
+ /* mov qword ptr [rbp-X],r13 */
+ EMIT3_off32(0x4C, 0x89, 0xAD, -stacksize + 8);
+ /* mov qword ptr [rbp-X],r14 */
+ EMIT3_off32(0x4C, 0x89, 0xB5, -stacksize + 16);
+ /* mov qword ptr [rbp-X],r15 */
+ EMIT3_off32(0x4C, 0x89, 0xBD, -stacksize + 24);
+
+ /* clear A and X registers */
+ EMIT2(0x31, 0xc0); /* xor eax, eax */
+ EMIT3(0x4D, 0x31, 0xED); /* xor r13, r13 */
+
+ if (ctx->seen_ld_abs) {
+ /* r9d : skb->len - skb->data_len (headlen)
+ * r10 : skb->data
+ */
+ if (is_imm8(offsetof(struct sk_buff, len)))
+ /* mov %r9d, off8(%rdi) */
+ EMIT4(0x44, 0x8b, 0x4f,
+ offsetof(struct sk_buff, len));
+ else
+ /* mov %r9d, off32(%rdi) */
+ EMIT3_off32(0x44, 0x8b, 0x8f,
+ offsetof(struct sk_buff, len));
+
+ if (is_imm8(offsetof(struct sk_buff, data_len)))
+ /* sub %r9d, off8(%rdi) */
+ EMIT4(0x44, 0x2b, 0x4f,
+ offsetof(struct sk_buff, data_len));
+ else
+ EMIT3_off32(0x44, 0x2b, 0x8f,
+ offsetof(struct sk_buff, data_len));
+
+ if (is_imm8(offsetof(struct sk_buff, data)))
+ /* mov %r10, off8(%rdi) */
+ EMIT4(0x4c, 0x8b, 0x57,
+ offsetof(struct sk_buff, data));
+ else
+ /* mov %r10, off32(%rdi) */
+ EMIT3_off32(0x4c, 0x8b, 0x97,
+ offsetof(struct sk_buff, data));
}
- cleanup_addr = proglen; /* epilogue address */
- for (pass = 0; pass < 10; pass++) {
- u8 seen_or_pass0 = (pass == 0) ? (SEEN_XREG | SEEN_DATAREF | SEEN_MEM) : seen;
- /* no prologue/epilogue for trivial filters (RET something) */
- proglen = 0;
- prog = temp;
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ const s32 imm32 = insn->imm;
+ u32 dst_reg = insn->dst_reg;
+ u32 src_reg = insn->src_reg;
+ u8 b1 = 0, b2 = 0, b3 = 0;
+ s64 jmp_offset;
+ u8 jmp_cond;
+ int ilen;
+ u8 *func;
+
+ switch (insn->code) {
+ /* ALU */
+ case BPF_ALU | BPF_ADD | BPF_X:
+ case BPF_ALU | BPF_SUB | BPF_X:
+ case BPF_ALU | BPF_AND | BPF_X:
+ case BPF_ALU | BPF_OR | BPF_X:
+ case BPF_ALU | BPF_XOR | BPF_X:
+ case BPF_ALU64 | BPF_ADD | BPF_X:
+ case BPF_ALU64 | BPF_SUB | BPF_X:
+ case BPF_ALU64 | BPF_AND | BPF_X:
+ case BPF_ALU64 | BPF_OR | BPF_X:
+ case BPF_ALU64 | BPF_XOR | BPF_X:
+ switch (BPF_OP(insn->code)) {
+ case BPF_ADD: b2 = 0x01; break;
+ case BPF_SUB: b2 = 0x29; break;
+ case BPF_AND: b2 = 0x21; break;
+ case BPF_OR: b2 = 0x09; break;
+ case BPF_XOR: b2 = 0x31; break;
+ }
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ EMIT1(add_2mod(0x48, dst_reg, src_reg));
+ else if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT1(add_2mod(0x40, dst_reg, src_reg));
+ EMIT2(b2, add_2reg(0xC0, dst_reg, src_reg));
+ break;
- if (seen_or_pass0) {
- EMIT4(0x55, 0x48, 0x89, 0xe5); /* push %rbp; mov %rsp,%rbp */
- EMIT4(0x48, 0x83, 0xec, 96); /* subq $96,%rsp */
- /* note : must save %rbx in case bpf_error is hit */
- if (seen_or_pass0 & (SEEN_XREG | SEEN_DATAREF))
- EMIT4(0x48, 0x89, 0x5d, 0xf8); /* mov %rbx, -8(%rbp) */
- if (seen_or_pass0 & SEEN_XREG)
- CLEAR_X(); /* make sure we dont leek kernel memory */
-
- /*
- * If this filter needs to access skb data,
- * loads r9 and r8 with :
- * r9 = skb->len - skb->data_len
- * r8 = skb->data
+ /* mov dst, src */
+ case BPF_ALU64 | BPF_MOV | BPF_X:
+ EMIT_mov(dst_reg, src_reg);
+ break;
+
+ /* mov32 dst, src */
+ case BPF_ALU | BPF_MOV | BPF_X:
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT1(add_2mod(0x40, dst_reg, src_reg));
+ EMIT2(0x89, add_2reg(0xC0, dst_reg, src_reg));
+ break;
+
+ /* neg dst */
+ case BPF_ALU | BPF_NEG:
+ case BPF_ALU64 | BPF_NEG:
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ EMIT1(add_1mod(0x48, dst_reg));
+ else if (is_ereg(dst_reg))
+ EMIT1(add_1mod(0x40, dst_reg));
+ EMIT2(0xF7, add_1reg(0xD8, dst_reg));
+ break;
+
+ case BPF_ALU | BPF_ADD | BPF_K:
+ case BPF_ALU | BPF_SUB | BPF_K:
+ case BPF_ALU | BPF_AND | BPF_K:
+ case BPF_ALU | BPF_OR | BPF_K:
+ case BPF_ALU | BPF_XOR | BPF_K:
+ case BPF_ALU64 | BPF_ADD | BPF_K:
+ case BPF_ALU64 | BPF_SUB | BPF_K:
+ case BPF_ALU64 | BPF_AND | BPF_K:
+ case BPF_ALU64 | BPF_OR | BPF_K:
+ case BPF_ALU64 | BPF_XOR | BPF_K:
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ EMIT1(add_1mod(0x48, dst_reg));
+ else if (is_ereg(dst_reg))
+ EMIT1(add_1mod(0x40, dst_reg));
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_ADD: b3 = 0xC0; break;
+ case BPF_SUB: b3 = 0xE8; break;
+ case BPF_AND: b3 = 0xE0; break;
+ case BPF_OR: b3 = 0xC8; break;
+ case BPF_XOR: b3 = 0xF0; break;
+ }
+
+ if (is_imm8(imm32))
+ EMIT3(0x83, add_1reg(b3, dst_reg), imm32);
+ else
+ EMIT2_off32(0x81, add_1reg(b3, dst_reg), imm32);
+ break;
+
+ case BPF_ALU64 | BPF_MOV | BPF_K:
+ /* optimization: if imm32 is positive,
+ * use 'mov eax, imm32' (which zero-extends imm32)
+ * to save 2 bytes
*/
- if (seen_or_pass0 & SEEN_DATAREF) {
- if (offsetof(struct sk_buff, len) <= 127)
- /* mov off8(%rdi),%r9d */
- EMIT4(0x44, 0x8b, 0x4f, offsetof(struct sk_buff, len));
- else {
- /* mov off32(%rdi),%r9d */
- EMIT3(0x44, 0x8b, 0x8f);
- EMIT(offsetof(struct sk_buff, len), 4);
- }
- if (is_imm8(offsetof(struct sk_buff, data_len)))
- /* sub off8(%rdi),%r9d */
- EMIT4(0x44, 0x2b, 0x4f, offsetof(struct sk_buff, data_len));
- else {
- EMIT3(0x44, 0x2b, 0x8f);
- EMIT(offsetof(struct sk_buff, data_len), 4);
- }
+ if (imm32 < 0) {
+ /* 'mov rax, imm32' sign extends imm32 */
+ b1 = add_1mod(0x48, dst_reg);
+ b2 = 0xC7;
+ b3 = 0xC0;
+ EMIT3_off32(b1, b2, add_1reg(b3, dst_reg), imm32);
+ break;
+ }
- if (is_imm8(offsetof(struct sk_buff, data)))
- /* mov off8(%rdi),%r8 */
- EMIT4(0x4c, 0x8b, 0x47, offsetof(struct sk_buff, data));
- else {
- /* mov off32(%rdi),%r8 */
- EMIT3(0x4c, 0x8b, 0x87);
- EMIT(offsetof(struct sk_buff, data), 4);
- }
+ case BPF_ALU | BPF_MOV | BPF_K:
+ /* mov %eax, imm32 */
+ if (is_ereg(dst_reg))
+ EMIT1(add_1mod(0x40, dst_reg));
+ EMIT1_off32(add_1reg(0xB8, dst_reg), imm32);
+ break;
+
+ /* dst %= src, dst /= src, dst %= imm32, dst /= imm32 */
+ case BPF_ALU | BPF_MOD | BPF_X:
+ case BPF_ALU | BPF_DIV | BPF_X:
+ case BPF_ALU | BPF_MOD | BPF_K:
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU64 | BPF_MOD | BPF_X:
+ case BPF_ALU64 | BPF_DIV | BPF_X:
+ case BPF_ALU64 | BPF_MOD | BPF_K:
+ case BPF_ALU64 | BPF_DIV | BPF_K:
+ EMIT1(0x50); /* push rax */
+ EMIT1(0x52); /* push rdx */
+
+ if (BPF_SRC(insn->code) == BPF_X)
+ /* mov r11, src_reg */
+ EMIT_mov(AUX_REG, src_reg);
+ else
+ /* mov r11, imm32 */
+ EMIT3_off32(0x49, 0xC7, 0xC3, imm32);
+
+ /* mov rax, dst_reg */
+ EMIT_mov(BPF_REG_0, dst_reg);
+
+ /* xor edx, edx
+ * equivalent to 'xor rdx, rdx', but one byte less
+ */
+ EMIT2(0x31, 0xd2);
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ /* if (src_reg == 0) return 0 */
+
+ /* cmp r11, 0 */
+ EMIT4(0x49, 0x83, 0xFB, 0x00);
+
+ /* jne .+9 (skip over pop, pop, xor and jmp) */
+ EMIT2(X86_JNE, 1 + 1 + 2 + 5);
+ EMIT1(0x5A); /* pop rdx */
+ EMIT1(0x58); /* pop rax */
+ EMIT2(0x31, 0xc0); /* xor eax, eax */
+
+ /* jmp cleanup_addr
+ * addrs[i] - 11, because there are 11 bytes
+ * after this insn: div, mov, pop, pop, mov
+ */
+ jmp_offset = ctx->cleanup_addr - (addrs[i] - 11);
+ EMIT1_off32(0xE9, jmp_offset);
}
- }
- switch (filter[0].code) {
- case BPF_S_RET_K:
- case BPF_S_LD_W_LEN:
- case BPF_S_ANC_PROTOCOL:
- case BPF_S_ANC_IFINDEX:
- case BPF_S_ANC_MARK:
- case BPF_S_ANC_RXHASH:
- case BPF_S_ANC_CPU:
- case BPF_S_ANC_VLAN_TAG:
- case BPF_S_ANC_VLAN_TAG_PRESENT:
- case BPF_S_ANC_QUEUE:
- case BPF_S_ANC_PKTTYPE:
- case BPF_S_LD_W_ABS:
- case BPF_S_LD_H_ABS:
- case BPF_S_LD_B_ABS:
- /* first instruction sets A register (or is RET 'constant') */
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ /* div r11 */
+ EMIT3(0x49, 0xF7, 0xF3);
+ else
+ /* div r11d */
+ EMIT3(0x41, 0xF7, 0xF3);
+
+ if (BPF_OP(insn->code) == BPF_MOD)
+ /* mov r11, rdx */
+ EMIT3(0x49, 0x89, 0xD3);
+ else
+ /* mov r11, rax */
+ EMIT3(0x49, 0x89, 0xC3);
+
+ EMIT1(0x5A); /* pop rdx */
+ EMIT1(0x58); /* pop rax */
+
+ /* mov dst_reg, r11 */
+ EMIT_mov(dst_reg, AUX_REG);
break;
- default:
- /* make sure we dont leak kernel information to user */
- CLEAR_A(); /* A = 0 */
- }
- for (i = 0; i < flen; i++) {
- unsigned int K = filter[i].k;
+ case BPF_ALU | BPF_MUL | BPF_K:
+ case BPF_ALU | BPF_MUL | BPF_X:
+ case BPF_ALU64 | BPF_MUL | BPF_K:
+ case BPF_ALU64 | BPF_MUL | BPF_X:
+ EMIT1(0x50); /* push rax */
+ EMIT1(0x52); /* push rdx */
+
+ /* mov r11, dst_reg */
+ EMIT_mov(AUX_REG, dst_reg);
+
+ if (BPF_SRC(insn->code) == BPF_X)
+ /* mov rax, src_reg */
+ EMIT_mov(BPF_REG_0, src_reg);
+ else
+ /* mov rax, imm32 */
+ EMIT3_off32(0x48, 0xC7, 0xC0, imm32);
+
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ EMIT1(add_1mod(0x48, AUX_REG));
+ else if (is_ereg(AUX_REG))
+ EMIT1(add_1mod(0x40, AUX_REG));
+ /* mul(q) r11 */
+ EMIT2(0xF7, add_1reg(0xE0, AUX_REG));
+
+ /* mov r11, rax */
+ EMIT_mov(AUX_REG, BPF_REG_0);
+
+ EMIT1(0x5A); /* pop rdx */
+ EMIT1(0x58); /* pop rax */
+
+ /* mov dst_reg, r11 */
+ EMIT_mov(dst_reg, AUX_REG);
+ break;
- switch (filter[i].code) {
- case BPF_S_ALU_ADD_X: /* A += X; */
- seen |= SEEN_XREG;
- EMIT2(0x01, 0xd8); /* add %ebx,%eax */
- break;
- case BPF_S_ALU_ADD_K: /* A += K; */
- if (!K)
- break;
- if (is_imm8(K))
- EMIT3(0x83, 0xc0, K); /* add imm8,%eax */
- else
- EMIT1_off32(0x05, K); /* add imm32,%eax */
- break;
- case BPF_S_ALU_SUB_X: /* A -= X; */
- seen |= SEEN_XREG;
- EMIT2(0x29, 0xd8); /* sub %ebx,%eax */
- break;
- case BPF_S_ALU_SUB_K: /* A -= K */
- if (!K)
- break;
- if (is_imm8(K))
- EMIT3(0x83, 0xe8, K); /* sub imm8,%eax */
- else
- EMIT1_off32(0x2d, K); /* sub imm32,%eax */
- break;
- case BPF_S_ALU_MUL_X: /* A *= X; */
- seen |= SEEN_XREG;
- EMIT3(0x0f, 0xaf, 0xc3); /* imul %ebx,%eax */
- break;
- case BPF_S_ALU_MUL_K: /* A *= K */
- if (is_imm8(K))
- EMIT3(0x6b, 0xc0, K); /* imul imm8,%eax,%eax */
- else {
- EMIT2(0x69, 0xc0); /* imul imm32,%eax */
- EMIT(K, 4);
- }
- break;
- case BPF_S_ALU_DIV_X: /* A /= X; */
- seen |= SEEN_XREG;
- EMIT2(0x85, 0xdb); /* test %ebx,%ebx */
- if (pc_ret0 > 0) {
- /* addrs[pc_ret0 - 1] is start address of target
- * (addrs[i] - 4) is the address following this jmp
- * ("xor %edx,%edx; div %ebx" being 4 bytes long)
- */
- EMIT_COND_JMP(X86_JE, addrs[pc_ret0 - 1] -
- (addrs[i] - 4));
- } else {
- EMIT_COND_JMP(X86_JNE, 2 + 5);
- CLEAR_A();
- EMIT1_off32(0xe9, cleanup_addr - (addrs[i] - 4)); /* jmp .+off32 */
- }
- EMIT4(0x31, 0xd2, 0xf7, 0xf3); /* xor %edx,%edx; div %ebx */
- break;
- case BPF_S_ALU_MOD_X: /* A %= X; */
- seen |= SEEN_XREG;
- EMIT2(0x85, 0xdb); /* test %ebx,%ebx */
- if (pc_ret0 > 0) {
- /* addrs[pc_ret0 - 1] is start address of target
- * (addrs[i] - 6) is the address following this jmp
- * ("xor %edx,%edx; div %ebx;mov %edx,%eax" being 6 bytes long)
- */
- EMIT_COND_JMP(X86_JE, addrs[pc_ret0 - 1] -
- (addrs[i] - 6));
- } else {
- EMIT_COND_JMP(X86_JNE, 2 + 5);
- CLEAR_A();
- EMIT1_off32(0xe9, cleanup_addr - (addrs[i] - 6)); /* jmp .+off32 */
- }
- EMIT2(0x31, 0xd2); /* xor %edx,%edx */
- EMIT2(0xf7, 0xf3); /* div %ebx */
- EMIT2(0x89, 0xd0); /* mov %edx,%eax */
- break;
- case BPF_S_ALU_MOD_K: /* A %= K; */
- if (K == 1) {
- CLEAR_A();
- break;
- }
- EMIT2(0x31, 0xd2); /* xor %edx,%edx */
- EMIT1(0xb9);EMIT(K, 4); /* mov imm32,%ecx */
- EMIT2(0xf7, 0xf1); /* div %ecx */
- EMIT2(0x89, 0xd0); /* mov %edx,%eax */
- break;
- case BPF_S_ALU_DIV_K: /* A /= K */
- if (K == 1)
- break;
- EMIT2(0x31, 0xd2); /* xor %edx,%edx */
- EMIT1(0xb9);EMIT(K, 4); /* mov imm32,%ecx */
- EMIT2(0xf7, 0xf1); /* div %ecx */
- break;
- case BPF_S_ALU_AND_X:
- seen |= SEEN_XREG;
- EMIT2(0x21, 0xd8); /* and %ebx,%eax */
- break;
- case BPF_S_ALU_AND_K:
- if (K >= 0xFFFFFF00) {
- EMIT2(0x24, K & 0xFF); /* and imm8,%al */
- } else if (K >= 0xFFFF0000) {
- EMIT2(0x66, 0x25); /* and imm16,%ax */
- EMIT(K, 2);
- } else {
- EMIT1_off32(0x25, K); /* and imm32,%eax */
- }
- break;
- case BPF_S_ALU_OR_X:
- seen |= SEEN_XREG;
- EMIT2(0x09, 0xd8); /* or %ebx,%eax */
- break;
- case BPF_S_ALU_OR_K:
- if (is_imm8(K))
- EMIT3(0x83, 0xc8, K); /* or imm8,%eax */
- else
- EMIT1_off32(0x0d, K); /* or imm32,%eax */
- break;
- case BPF_S_ANC_ALU_XOR_X: /* A ^= X; */
- case BPF_S_ALU_XOR_X:
- seen |= SEEN_XREG;
- EMIT2(0x31, 0xd8); /* xor %ebx,%eax */
- break;
- case BPF_S_ALU_XOR_K: /* A ^= K; */
- if (K == 0)
- break;
- if (is_imm8(K))
- EMIT3(0x83, 0xf0, K); /* xor imm8,%eax */
- else
- EMIT1_off32(0x35, K); /* xor imm32,%eax */
- break;
- case BPF_S_ALU_LSH_X: /* A <<= X; */
- seen |= SEEN_XREG;
- EMIT4(0x89, 0xd9, 0xd3, 0xe0); /* mov %ebx,%ecx; shl %cl,%eax */
- break;
- case BPF_S_ALU_LSH_K:
- if (K == 0)
- break;
- else if (K == 1)
- EMIT2(0xd1, 0xe0); /* shl %eax */
- else
- EMIT3(0xc1, 0xe0, K);
- break;
- case BPF_S_ALU_RSH_X: /* A >>= X; */
- seen |= SEEN_XREG;
- EMIT4(0x89, 0xd9, 0xd3, 0xe8); /* mov %ebx,%ecx; shr %cl,%eax */
- break;
- case BPF_S_ALU_RSH_K: /* A >>= K; */
- if (K == 0)
- break;
- else if (K == 1)
- EMIT2(0xd1, 0xe8); /* shr %eax */
- else
- EMIT3(0xc1, 0xe8, K);
- break;
- case BPF_S_ALU_NEG:
- EMIT2(0xf7, 0xd8); /* neg %eax */
- break;
- case BPF_S_RET_K:
- if (!K) {
- if (pc_ret0 == -1)
- pc_ret0 = i;
- CLEAR_A();
- } else {
- EMIT1_off32(0xb8, K); /* mov $imm32,%eax */
- }
- /* fallinto */
- case BPF_S_RET_A:
- if (seen_or_pass0) {
- if (i != flen - 1) {
- EMIT_JMP(cleanup_addr - addrs[i]);
- break;
- }
- if (seen_or_pass0 & SEEN_XREG)
- EMIT4(0x48, 0x8b, 0x5d, 0xf8); /* mov -8(%rbp),%rbx */
- EMIT1(0xc9); /* leaveq */
- }
- EMIT1(0xc3); /* ret */
- break;
- case BPF_S_MISC_TAX: /* X = A */
- seen |= SEEN_XREG;
- EMIT2(0x89, 0xc3); /* mov %eax,%ebx */
- break;
- case BPF_S_MISC_TXA: /* A = X */
- seen |= SEEN_XREG;
- EMIT2(0x89, 0xd8); /* mov %ebx,%eax */
- break;
- case BPF_S_LD_IMM: /* A = K */
- if (!K)
- CLEAR_A();
- else
- EMIT1_off32(0xb8, K); /* mov $imm32,%eax */
- break;
- case BPF_S_LDX_IMM: /* X = K */
- seen |= SEEN_XREG;
- if (!K)
- CLEAR_X();
+ /* shifts */
+ case BPF_ALU | BPF_LSH | BPF_K:
+ case BPF_ALU | BPF_RSH | BPF_K:
+ case BPF_ALU | BPF_ARSH | BPF_K:
+ case BPF_ALU64 | BPF_LSH | BPF_K:
+ case BPF_ALU64 | BPF_RSH | BPF_K:
+ case BPF_ALU64 | BPF_ARSH | BPF_K:
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ EMIT1(add_1mod(0x48, dst_reg));
+ else if (is_ereg(dst_reg))
+ EMIT1(add_1mod(0x40, dst_reg));
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_LSH: b3 = 0xE0; break;
+ case BPF_RSH: b3 = 0xE8; break;
+ case BPF_ARSH: b3 = 0xF8; break;
+ }
+ EMIT3(0xC1, add_1reg(b3, dst_reg), imm32);
+ break;
+
+ case BPF_ALU | BPF_END | BPF_FROM_BE:
+ switch (imm32) {
+ case 16:
+ /* emit 'ror %ax, 8' to swap lower 2 bytes */
+ EMIT1(0x66);
+ if (is_ereg(dst_reg))
+ EMIT1(0x41);
+ EMIT3(0xC1, add_1reg(0xC8, dst_reg), 8);
+ break;
+ case 32:
+ /* emit 'bswap eax' to swap lower 4 bytes */
+ if (is_ereg(dst_reg))
+ EMIT2(0x41, 0x0F);
else
- EMIT1_off32(0xbb, K); /* mov $imm32,%ebx */
- break;
- case BPF_S_LD_MEM: /* A = mem[K] : mov off8(%rbp),%eax */
- seen |= SEEN_MEM;
- EMIT3(0x8b, 0x45, 0xf0 - K*4);
- break;
- case BPF_S_LDX_MEM: /* X = mem[K] : mov off8(%rbp),%ebx */
- seen |= SEEN_XREG | SEEN_MEM;
- EMIT3(0x8b, 0x5d, 0xf0 - K*4);
- break;
- case BPF_S_ST: /* mem[K] = A : mov %eax,off8(%rbp) */
- seen |= SEEN_MEM;
- EMIT3(0x89, 0x45, 0xf0 - K*4);
- break;
- case BPF_S_STX: /* mem[K] = X : mov %ebx,off8(%rbp) */
- seen |= SEEN_XREG | SEEN_MEM;
- EMIT3(0x89, 0x5d, 0xf0 - K*4);
- break;
- case BPF_S_LD_W_LEN: /* A = skb->len; */
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
- if (is_imm8(offsetof(struct sk_buff, len)))
- /* mov off8(%rdi),%eax */
- EMIT3(0x8b, 0x47, offsetof(struct sk_buff, len));
- else {
- EMIT2(0x8b, 0x87);
- EMIT(offsetof(struct sk_buff, len), 4);
- }
- break;
- case BPF_S_LDX_W_LEN: /* X = skb->len; */
- seen |= SEEN_XREG;
- if (is_imm8(offsetof(struct sk_buff, len)))
- /* mov off8(%rdi),%ebx */
- EMIT3(0x8b, 0x5f, offsetof(struct sk_buff, len));
- else {
- EMIT2(0x8b, 0x9f);
- EMIT(offsetof(struct sk_buff, len), 4);
- }
- break;
- case BPF_S_ANC_PROTOCOL: /* A = ntohs(skb->protocol); */
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
- if (is_imm8(offsetof(struct sk_buff, protocol))) {
- /* movzwl off8(%rdi),%eax */
- EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, protocol));
- } else {
- EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */
- EMIT(offsetof(struct sk_buff, protocol), 4);
- }
- EMIT2(0x86, 0xc4); /* ntohs() : xchg %al,%ah */
- break;
- case BPF_S_ANC_IFINDEX:
- if (is_imm8(offsetof(struct sk_buff, dev))) {
- /* movq off8(%rdi),%rax */
- EMIT4(0x48, 0x8b, 0x47, offsetof(struct sk_buff, dev));
- } else {
- EMIT3(0x48, 0x8b, 0x87); /* movq off32(%rdi),%rax */
- EMIT(offsetof(struct sk_buff, dev), 4);
- }
- EMIT3(0x48, 0x85, 0xc0); /* test %rax,%rax */
- EMIT_COND_JMP(X86_JE, cleanup_addr - (addrs[i] - 6));
- BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
- EMIT2(0x8b, 0x80); /* mov off32(%rax),%eax */
- EMIT(offsetof(struct net_device, ifindex), 4);
- break;
- case BPF_S_ANC_MARK:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
- if (is_imm8(offsetof(struct sk_buff, mark))) {
- /* mov off8(%rdi),%eax */
- EMIT3(0x8b, 0x47, offsetof(struct sk_buff, mark));
- } else {
- EMIT2(0x8b, 0x87);
- EMIT(offsetof(struct sk_buff, mark), 4);
- }
- break;
- case BPF_S_ANC_RXHASH:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
- if (is_imm8(offsetof(struct sk_buff, hash))) {
- /* mov off8(%rdi),%eax */
- EMIT3(0x8b, 0x47, offsetof(struct sk_buff, hash));
- } else {
- EMIT2(0x8b, 0x87);
- EMIT(offsetof(struct sk_buff, hash), 4);
- }
- break;
- case BPF_S_ANC_QUEUE:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
- if (is_imm8(offsetof(struct sk_buff, queue_mapping))) {
- /* movzwl off8(%rdi),%eax */
- EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, queue_mapping));
- } else {
- EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */
- EMIT(offsetof(struct sk_buff, queue_mapping), 4);
- }
- break;
- case BPF_S_ANC_CPU:
-#ifdef CONFIG_SMP
- EMIT4(0x65, 0x8b, 0x04, 0x25); /* mov %gs:off32,%eax */
- EMIT((u32)(unsigned long)&cpu_number, 4); /* A = smp_processor_id(); */
-#else
- CLEAR_A();
-#endif
+ EMIT1(0x0F);
+ EMIT1(add_1reg(0xC8, dst_reg));
break;
- case BPF_S_ANC_VLAN_TAG:
- case BPF_S_ANC_VLAN_TAG_PRESENT:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
- if (is_imm8(offsetof(struct sk_buff, vlan_tci))) {
- /* movzwl off8(%rdi),%eax */
- EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, vlan_tci));
- } else {
- EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */
- EMIT(offsetof(struct sk_buff, vlan_tci), 4);
- }
- BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
- if (filter[i].code == BPF_S_ANC_VLAN_TAG) {
- EMIT3(0x80, 0xe4, 0xef); /* and $0xef,%ah */
- } else {
- EMIT3(0xc1, 0xe8, 0x0c); /* shr $0xc,%eax */
- EMIT3(0x83, 0xe0, 0x01); /* and $0x1,%eax */
- }
- break;
- case BPF_S_ANC_PKTTYPE:
- {
- int off = pkt_type_offset();
-
- if (off < 0)
- goto out;
- if (is_imm8(off)) {
- /* movzbl off8(%rdi),%eax */
- EMIT4(0x0f, 0xb6, 0x47, off);
- } else {
- /* movbl off32(%rdi),%eax */
- EMIT3(0x0f, 0xb6, 0x87);
- EMIT(off, 4);
- }
- EMIT3(0x83, 0xe0, PKT_TYPE_MAX); /* and $0x7,%eax */
+ case 64:
+ /* emit 'bswap rax' to swap 8 bytes */
+ EMIT3(add_1mod(0x48, dst_reg), 0x0F,
+ add_1reg(0xC8, dst_reg));
break;
}
- case BPF_S_LD_W_ABS:
- func = CHOOSE_LOAD_FUNC(K, sk_load_word);
-common_load: seen |= SEEN_DATAREF;
- t_offset = func - (image + addrs[i]);
- EMIT1_off32(0xbe, K); /* mov imm32,%esi */
- EMIT1_off32(0xe8, t_offset); /* call */
- break;
- case BPF_S_LD_H_ABS:
- func = CHOOSE_LOAD_FUNC(K, sk_load_half);
- goto common_load;
- case BPF_S_LD_B_ABS:
- func = CHOOSE_LOAD_FUNC(K, sk_load_byte);
- goto common_load;
- case BPF_S_LDX_B_MSH:
- func = CHOOSE_LOAD_FUNC(K, sk_load_byte_msh);
- seen |= SEEN_DATAREF | SEEN_XREG;
- t_offset = func - (image + addrs[i]);
- EMIT1_off32(0xbe, K); /* mov imm32,%esi */
- EMIT1_off32(0xe8, t_offset); /* call sk_load_byte_msh */
- break;
- case BPF_S_LD_W_IND:
- func = sk_load_word;
-common_load_ind: seen |= SEEN_DATAREF | SEEN_XREG;
- t_offset = func - (image + addrs[i]);
- if (K) {
- if (is_imm8(K)) {
- EMIT3(0x8d, 0x73, K); /* lea imm8(%rbx), %esi */
- } else {
- EMIT2(0x8d, 0xb3); /* lea imm32(%rbx),%esi */
- EMIT(K, 4);
- }
- } else {
- EMIT2(0x89,0xde); /* mov %ebx,%esi */
- }
- EMIT1_off32(0xe8, t_offset); /* call sk_load_xxx_ind */
- break;
- case BPF_S_LD_H_IND:
- func = sk_load_half;
- goto common_load_ind;
- case BPF_S_LD_B_IND:
- func = sk_load_byte;
- goto common_load_ind;
- case BPF_S_JMP_JA:
- t_offset = addrs[i + K] - addrs[i];
- EMIT_JMP(t_offset);
- break;
- COND_SEL(BPF_S_JMP_JGT_K, X86_JA, X86_JBE);
- COND_SEL(BPF_S_JMP_JGE_K, X86_JAE, X86_JB);
- COND_SEL(BPF_S_JMP_JEQ_K, X86_JE, X86_JNE);
- COND_SEL(BPF_S_JMP_JSET_K,X86_JNE, X86_JE);
- COND_SEL(BPF_S_JMP_JGT_X, X86_JA, X86_JBE);
- COND_SEL(BPF_S_JMP_JGE_X, X86_JAE, X86_JB);
- COND_SEL(BPF_S_JMP_JEQ_X, X86_JE, X86_JNE);
- COND_SEL(BPF_S_JMP_JSET_X,X86_JNE, X86_JE);
-
-cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i];
- t_offset = addrs[i + filter[i].jt] - addrs[i];
-
- /* same targets, can avoid doing the test :) */
- if (filter[i].jt == filter[i].jf) {
- EMIT_JMP(t_offset);
- break;
- }
+ break;
+
+ case BPF_ALU | BPF_END | BPF_FROM_LE:
+ break;
+
+ /* ST: *(u8*)(dst_reg + off) = imm */
+ case BPF_ST | BPF_MEM | BPF_B:
+ if (is_ereg(dst_reg))
+ EMIT2(0x41, 0xC6);
+ else
+ EMIT1(0xC6);
+ goto st;
+ case BPF_ST | BPF_MEM | BPF_H:
+ if (is_ereg(dst_reg))
+ EMIT3(0x66, 0x41, 0xC7);
+ else
+ EMIT2(0x66, 0xC7);
+ goto st;
+ case BPF_ST | BPF_MEM | BPF_W:
+ if (is_ereg(dst_reg))
+ EMIT2(0x41, 0xC7);
+ else
+ EMIT1(0xC7);
+ goto st;
+ case BPF_ST | BPF_MEM | BPF_DW:
+ EMIT2(add_1mod(0x48, dst_reg), 0xC7);
+
+st: if (is_imm8(insn->off))
+ EMIT2(add_1reg(0x40, dst_reg), insn->off);
+ else
+ EMIT1_off32(add_1reg(0x80, dst_reg), insn->off);
+
+ EMIT(imm32, bpf_size_to_x86_bytes(BPF_SIZE(insn->code)));
+ break;
+
+ /* STX: *(u8*)(dst_reg + off) = src_reg */
+ case BPF_STX | BPF_MEM | BPF_B:
+ /* emit 'mov byte ptr [rax + off], al' */
+ if (is_ereg(dst_reg) || is_ereg(src_reg) ||
+ /* have to add extra byte for x86 SIL, DIL regs */
+ src_reg == BPF_REG_1 || src_reg == BPF_REG_2)
+ EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88);
+ else
+ EMIT1(0x88);
+ goto stx;
+ case BPF_STX | BPF_MEM | BPF_H:
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT3(0x66, add_2mod(0x40, dst_reg, src_reg), 0x89);
+ else
+ EMIT2(0x66, 0x89);
+ goto stx;
+ case BPF_STX | BPF_MEM | BPF_W:
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x89);
+ else
+ EMIT1(0x89);
+ goto stx;
+ case BPF_STX | BPF_MEM | BPF_DW:
+ EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89);
+stx: if (is_imm8(insn->off))
+ EMIT2(add_2reg(0x40, dst_reg, src_reg), insn->off);
+ else
+ EMIT1_off32(add_2reg(0x80, dst_reg, src_reg),
+ insn->off);
+ break;
+
+ /* LDX: dst_reg = *(u8*)(src_reg + off) */
+ case BPF_LDX | BPF_MEM | BPF_B:
+ /* emit 'movzx rax, byte ptr [rax + off]' */
+ EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6);
+ goto ldx;
+ case BPF_LDX | BPF_MEM | BPF_H:
+ /* emit 'movzx rax, word ptr [rax + off]' */
+ EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7);
+ goto ldx;
+ case BPF_LDX | BPF_MEM | BPF_W:
+ /* emit 'mov eax, dword ptr [rax+0x14]' */
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B);
+ else
+ EMIT1(0x8B);
+ goto ldx;
+ case BPF_LDX | BPF_MEM | BPF_DW:
+ /* emit 'mov rax, qword ptr [rax+0x14]' */
+ EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B);
+ldx: /* if insn->off == 0 we can save one extra byte, but
+ * special case of x86 r13 which always needs an offset
+ * is not worth the hassle
+ */
+ if (is_imm8(insn->off))
+ EMIT2(add_2reg(0x40, src_reg, dst_reg), insn->off);
+ else
+ EMIT1_off32(add_2reg(0x80, src_reg, dst_reg),
+ insn->off);
+ break;
+
+ /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */
+ case BPF_STX | BPF_XADD | BPF_W:
+ /* emit 'lock add dword ptr [rax + off], eax' */
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT3(0xF0, add_2mod(0x40, dst_reg, src_reg), 0x01);
+ else
+ EMIT2(0xF0, 0x01);
+ goto xadd;
+ case BPF_STX | BPF_XADD | BPF_DW:
+ EMIT3(0xF0, add_2mod(0x48, dst_reg, src_reg), 0x01);
+xadd: if (is_imm8(insn->off))
+ EMIT2(add_2reg(0x40, dst_reg, src_reg), insn->off);
+ else
+ EMIT1_off32(add_2reg(0x80, dst_reg, src_reg),
+ insn->off);
+ break;
+
+ /* call */
+ case BPF_JMP | BPF_CALL:
+ func = (u8 *) __bpf_call_base + imm32;
+ jmp_offset = func - (image + addrs[i]);
+ if (ctx->seen_ld_abs) {
+ EMIT2(0x41, 0x52); /* push %r10 */
+ EMIT2(0x41, 0x51); /* push %r9 */
+ /* need to adjust jmp offset, since
+ * pop %r9, pop %r10 take 4 bytes after call insn
+ */
+ jmp_offset += 4;
+ }
+ if (!imm32 || !is_simm32(jmp_offset)) {
+ pr_err("unsupported bpf func %d addr %p image %p\n",
+ imm32, func, image);
+ return -EINVAL;
+ }
+ EMIT1_off32(0xE8, jmp_offset);
+ if (ctx->seen_ld_abs) {
+ EMIT2(0x41, 0x59); /* pop %r9 */
+ EMIT2(0x41, 0x5A); /* pop %r10 */
+ }
+ break;
+
+ /* cond jump */
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JNE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JSGT | BPF_X:
+ case BPF_JMP | BPF_JSGE | BPF_X:
+ /* cmp dst_reg, src_reg */
+ EMIT3(add_2mod(0x48, dst_reg, src_reg), 0x39,
+ add_2reg(0xC0, dst_reg, src_reg));
+ goto emit_cond_jmp;
+
+ case BPF_JMP | BPF_JSET | BPF_X:
+ /* test dst_reg, src_reg */
+ EMIT3(add_2mod(0x48, dst_reg, src_reg), 0x85,
+ add_2reg(0xC0, dst_reg, src_reg));
+ goto emit_cond_jmp;
+
+ case BPF_JMP | BPF_JSET | BPF_K:
+ /* test dst_reg, imm32 */
+ EMIT1(add_1mod(0x48, dst_reg));
+ EMIT2_off32(0xF7, add_1reg(0xC0, dst_reg), imm32);
+ goto emit_cond_jmp;
+
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JNE | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JSGT | BPF_K:
+ case BPF_JMP | BPF_JSGE | BPF_K:
+ /* cmp dst_reg, imm8/32 */
+ EMIT1(add_1mod(0x48, dst_reg));
+
+ if (is_imm8(imm32))
+ EMIT3(0x83, add_1reg(0xF8, dst_reg), imm32);
+ else
+ EMIT2_off32(0x81, add_1reg(0xF8, dst_reg), imm32);
+
+emit_cond_jmp: /* convert BPF opcode to x86 */
+ switch (BPF_OP(insn->code)) {
+ case BPF_JEQ:
+ jmp_cond = X86_JE;
+ break;
+ case BPF_JSET:
+ case BPF_JNE:
+ jmp_cond = X86_JNE;
+ break;
+ case BPF_JGT:
+ /* GT is unsigned '>', JA in x86 */
+ jmp_cond = X86_JA;
+ break;
+ case BPF_JGE:
+ /* GE is unsigned '>=', JAE in x86 */
+ jmp_cond = X86_JAE;
+ break;
+ case BPF_JSGT:
+ /* signed '>', GT in x86 */
+ jmp_cond = X86_JG;
+ break;
+ case BPF_JSGE:
+ /* signed '>=', GE in x86 */
+ jmp_cond = X86_JGE;
+ break;
+ default: /* to silence gcc warning */
+ return -EFAULT;
+ }
+ jmp_offset = addrs[i + insn->off] - addrs[i];
+ if (is_imm8(jmp_offset)) {
+ EMIT2(jmp_cond, jmp_offset);
+ } else if (is_simm32(jmp_offset)) {
+ EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset);
+ } else {
+ pr_err("cond_jmp gen bug %llx\n", jmp_offset);
+ return -EFAULT;
+ }
+
+ break;
- switch (filter[i].code) {
- case BPF_S_JMP_JGT_X:
- case BPF_S_JMP_JGE_X:
- case BPF_S_JMP_JEQ_X:
- seen |= SEEN_XREG;
- EMIT2(0x39, 0xd8); /* cmp %ebx,%eax */
- break;
- case BPF_S_JMP_JSET_X:
- seen |= SEEN_XREG;
- EMIT2(0x85, 0xd8); /* test %ebx,%eax */
- break;
- case BPF_S_JMP_JEQ_K:
- if (K == 0) {
- EMIT2(0x85, 0xc0); /* test %eax,%eax */
- break;
- }
- case BPF_S_JMP_JGT_K:
- case BPF_S_JMP_JGE_K:
- if (K <= 127)
- EMIT3(0x83, 0xf8, K); /* cmp imm8,%eax */
+ case BPF_JMP | BPF_JA:
+ jmp_offset = addrs[i + insn->off] - addrs[i];
+ if (!jmp_offset)
+ /* optimize out nop jumps */
+ break;
+emit_jmp:
+ if (is_imm8(jmp_offset)) {
+ EMIT2(0xEB, jmp_offset);
+ } else if (is_simm32(jmp_offset)) {
+ EMIT1_off32(0xE9, jmp_offset);
+ } else {
+ pr_err("jmp gen bug %llx\n", jmp_offset);
+ return -EFAULT;
+ }
+ break;
+
+ case BPF_LD | BPF_IND | BPF_W:
+ func = sk_load_word;
+ goto common_load;
+ case BPF_LD | BPF_ABS | BPF_W:
+ func = CHOOSE_LOAD_FUNC(imm32, sk_load_word);
+common_load: ctx->seen_ld_abs = true;
+ jmp_offset = func - (image + addrs[i]);
+ if (!func || !is_simm32(jmp_offset)) {
+ pr_err("unsupported bpf func %d addr %p image %p\n",
+ imm32, func, image);
+ return -EINVAL;
+ }
+ if (BPF_MODE(insn->code) == BPF_ABS) {
+ /* mov %esi, imm32 */
+ EMIT1_off32(0xBE, imm32);
+ } else {
+ /* mov %rsi, src_reg */
+ EMIT_mov(BPF_REG_2, src_reg);
+ if (imm32) {
+ if (is_imm8(imm32))
+ /* add %esi, imm8 */
+ EMIT3(0x83, 0xC6, imm32);
else
- EMIT1_off32(0x3d, K); /* cmp imm32,%eax */
- break;
- case BPF_S_JMP_JSET_K:
- if (K <= 0xFF)
- EMIT2(0xa8, K); /* test imm8,%al */
- else if (!(K & 0xFFFF00FF))
- EMIT3(0xf6, 0xc4, K >> 8); /* test imm8,%ah */
- else if (K <= 0xFFFF) {
- EMIT2(0x66, 0xa9); /* test imm16,%ax */
- EMIT(K, 2);
- } else {
- EMIT1_off32(0xa9, K); /* test imm32,%eax */
- }
- break;
+ /* add %esi, imm32 */
+ EMIT2_off32(0x81, 0xC6, imm32);
}
- if (filter[i].jt != 0) {
- if (filter[i].jf && f_offset)
- t_offset += is_near(f_offset) ? 2 : 5;
- EMIT_COND_JMP(t_op, t_offset);
- if (filter[i].jf)
- EMIT_JMP(f_offset);
- break;
- }
- EMIT_COND_JMP(f_op, f_offset);
- break;
- default:
- /* hmm, too complex filter, give up with jit compiler */
- goto out;
}
- ilen = prog - temp;
- if (image) {
- if (unlikely(proglen + ilen > oldproglen)) {
- pr_err("bpb_jit_compile fatal error\n");
- kfree(addrs);
- module_free(NULL, header);
- return;
- }
- memcpy(image + proglen, temp, ilen);
+ /* skb pointer is in R6 (%rbx), it will be copied into
+ * %rdi if skb_copy_bits() call is necessary.
+ * sk_load_* helpers also use %r10 and %r9d.
+ * See bpf_jit.S
+ */
+ EMIT1_off32(0xE8, jmp_offset); /* call */
+ break;
+
+ case BPF_LD | BPF_IND | BPF_H:
+ func = sk_load_half;
+ goto common_load;
+ case BPF_LD | BPF_ABS | BPF_H:
+ func = CHOOSE_LOAD_FUNC(imm32, sk_load_half);
+ goto common_load;
+ case BPF_LD | BPF_IND | BPF_B:
+ func = sk_load_byte;
+ goto common_load;
+ case BPF_LD | BPF_ABS | BPF_B:
+ func = CHOOSE_LOAD_FUNC(imm32, sk_load_byte);
+ goto common_load;
+
+ case BPF_JMP | BPF_EXIT:
+ if (i != insn_cnt - 1) {
+ jmp_offset = ctx->cleanup_addr - addrs[i];
+ goto emit_jmp;
}
- proglen += ilen;
- addrs[i] = proglen;
- prog = temp;
+ /* update cleanup_addr */
+ ctx->cleanup_addr = proglen;
+ /* mov rbx, qword ptr [rbp-X] */
+ EMIT3_off32(0x48, 0x8B, 0x9D, -stacksize);
+ /* mov r13, qword ptr [rbp-X] */
+ EMIT3_off32(0x4C, 0x8B, 0xAD, -stacksize + 8);
+ /* mov r14, qword ptr [rbp-X] */
+ EMIT3_off32(0x4C, 0x8B, 0xB5, -stacksize + 16);
+ /* mov r15, qword ptr [rbp-X] */
+ EMIT3_off32(0x4C, 0x8B, 0xBD, -stacksize + 24);
+
+ EMIT1(0xC9); /* leave */
+ EMIT1(0xC3); /* ret */
+ break;
+
+ default:
+ /* By design x64 JIT should support all BPF instructions
+ * This error will be seen if new instruction was added
+ * to interpreter, but not to JIT
+ * or if there is junk in bpf_prog
+ */
+ pr_err("bpf_jit: unknown opcode %02x\n", insn->code);
+ return -EINVAL;
}
- /* last bpf instruction is always a RET :
- * use it to give the cleanup instruction(s) addr
- */
- cleanup_addr = proglen - 1; /* ret */
- if (seen_or_pass0)
- cleanup_addr -= 1; /* leaveq */
- if (seen_or_pass0 & SEEN_XREG)
- cleanup_addr -= 4; /* mov -8(%rbp),%rbx */
+ ilen = prog - temp;
+ if (image) {
+ if (unlikely(proglen + ilen > oldproglen)) {
+ pr_err("bpf_jit_compile fatal error\n");
+ return -EFAULT;
+ }
+ memcpy(image + proglen, temp, ilen);
+ }
+ proglen += ilen;
+ addrs[i] = proglen;
+ prog = temp;
+ }
+ return proglen;
+}
+
+void bpf_jit_compile(struct bpf_prog *prog)
+{
+}
+
+void bpf_int_jit_compile(struct bpf_prog *prog)
+{
+ struct bpf_binary_header *header = NULL;
+ int proglen, oldproglen = 0;
+ struct jit_context ctx = {};
+ u8 *image = NULL;
+ int *addrs;
+ int pass;
+ int i;
+
+ if (!bpf_jit_enable)
+ return;
+
+ if (!prog || !prog->len)
+ return;
+
+ addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL);
+ if (!addrs)
+ return;
+
+ /* Before first pass, make a rough estimation of addrs[]
+ * each bpf instruction is translated to less than 64 bytes
+ */
+ for (proglen = 0, i = 0; i < prog->len; i++) {
+ proglen += 64;
+ addrs[i] = proglen;
+ }
+ ctx.cleanup_addr = proglen;
+
+ for (pass = 0; pass < 10; pass++) {
+ proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
+ if (proglen <= 0) {
+ image = NULL;
+ if (header)
+ module_free(NULL, header);
+ goto out;
+ }
if (image) {
if (proglen != oldproglen)
- pr_err("bpb_jit_compile proglen=%u != oldproglen=%u\n", proglen, oldproglen);
+ pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
+ proglen, oldproglen);
break;
}
if (proglen == oldproglen) {
@@ -766,22 +918,21 @@ cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i];
}
if (bpf_jit_enable > 1)
- bpf_jit_dump(flen, proglen, pass, image);
+ bpf_jit_dump(prog->len, proglen, 0, image);
if (image) {
bpf_flush_icache(header, image + proglen);
set_memory_ro((unsigned long)header, header->pages);
- fp->bpf_func = (void *)image;
- fp->jited = 1;
+ prog->bpf_func = (void *)image;
+ prog->jited = 1;
}
out:
kfree(addrs);
- return;
}
static void bpf_jit_free_deferred(struct work_struct *work)
{
- struct sk_filter *fp = container_of(work, struct sk_filter, work);
+ struct bpf_prog *fp = container_of(work, struct bpf_prog, work);
unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;
struct bpf_binary_header *header = (void *)addr;
@@ -790,7 +941,7 @@ static void bpf_jit_free_deferred(struct work_struct *work)
kfree(fp);
}
-void bpf_jit_free(struct sk_filter *fp)
+void bpf_jit_free(struct bpf_prog *fp)
{
if (fp->jited) {
INIT_WORK(&fp->work, bpf_jit_free_deferred);
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 01edac6c5e18..5075371ab593 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -489,8 +489,12 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
}
node = acpi_get_node(device->handle);
- if (node == NUMA_NO_NODE)
+ if (node == NUMA_NO_NODE) {
node = x86_pci_root_bus_node(busnum);
+ if (node != 0 && node != NUMA_NO_NODE)
+ dev_info(&device->dev, FW_BUG "no _PXM; falling back to node %d from hardware (may be inconsistent with ACPI node numbers)\n",
+ node);
+ }
if (node != NUMA_NO_NODE && !node_online(node))
node = NUMA_NO_NODE;
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index e88f4c53d7f6..c20d2cc7ef64 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -11,27 +11,33 @@
#include "bus_numa.h"
-/*
- * This discovers the pcibus <-> node mapping on AMD K8.
- * also get peer root bus resource for io,mmio
- */
+#define AMD_NB_F0_NODE_ID 0x60
+#define AMD_NB_F0_UNIT_ID 0x64
+#define AMD_NB_F1_CONFIG_MAP_REG 0xe0
+
+#define RANGE_NUM 16
+#define AMD_NB_F1_CONFIG_MAP_RANGES 4
-struct pci_hostbridge_probe {
+struct amd_hostbridge {
u32 bus;
u32 slot;
- u32 vendor;
u32 device;
};
-static struct pci_hostbridge_probe pci_probes[] __initdata = {
- { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1100 },
- { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 },
- { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 },
- { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 },
+/*
+ * IMPORTANT NOTE:
+ * hb_probes[] and early_root_info_init() is in maintenance mode.
+ * It only supports K8, Fam10h, Fam11h, and Fam15h_00h-0fh .
+ * Future processor will rely on information in ACPI.
+ */
+static struct amd_hostbridge hb_probes[] __initdata = {
+ { 0, 0x18, 0x1100 }, /* K8 */
+ { 0, 0x18, 0x1200 }, /* Family10h */
+ { 0xff, 0, 0x1200 }, /* Family10h */
+ { 0, 0x18, 0x1300 }, /* Family11h */
+ { 0, 0x18, 0x1600 }, /* Family15h */
};
-#define RANGE_NUM 16
-
static struct pci_root_info __init *find_pci_root_info(int node, int link)
{
struct pci_root_info *info;
@@ -45,12 +51,12 @@ static struct pci_root_info __init *find_pci_root_info(int node, int link)
}
/**
- * early_fill_mp_bus_to_node()
+ * early_root_info_init()
* called before pcibios_scan_root and pci_scan_bus
- * fills the mp_bus_to_cpumask array based according to the LDT Bus Number
- * Registers found in the K8 northbridge
+ * fills the mp_bus_to_cpumask array based according
+ * to the LDT Bus Number Registers found in the northbridge.
*/
-static int __init early_fill_mp_bus_info(void)
+static int __init early_root_info_init(void)
{
int i;
unsigned bus;
@@ -75,19 +81,21 @@ static int __init early_fill_mp_bus_info(void)
return -1;
found = false;
- for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
+ for (i = 0; i < ARRAY_SIZE(hb_probes); i++) {
u32 id;
u16 device;
u16 vendor;
- bus = pci_probes[i].bus;
- slot = pci_probes[i].slot;
+ bus = hb_probes[i].bus;
+ slot = hb_probes[i].slot;
id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID);
-
vendor = id & 0xffff;
device = (id>>16) & 0xffff;
- if (pci_probes[i].vendor == vendor &&
- pci_probes[i].device == device) {
+
+ if (vendor != PCI_VENDOR_ID_AMD)
+ continue;
+
+ if (hb_probes[i].device == device) {
found = true;
break;
}
@@ -96,10 +104,16 @@ static int __init early_fill_mp_bus_info(void)
if (!found)
return 0;
- for (i = 0; i < 4; i++) {
+ /*
+ * We should learn topology and routing information from _PXM and
+ * _CRS methods in the ACPI namespace. We extract node numbers
+ * here to work around BIOSes that don't supply _PXM.
+ */
+ for (i = 0; i < AMD_NB_F1_CONFIG_MAP_RANGES; i++) {
int min_bus;
int max_bus;
- reg = read_pci_config(bus, slot, 1, 0xe0 + (i << 2));
+ reg = read_pci_config(bus, slot, 1,
+ AMD_NB_F1_CONFIG_MAP_REG + (i << 2));
/* Check if that register is enabled for bus range */
if ((reg & 7) != 3)
@@ -113,10 +127,21 @@ static int __init early_fill_mp_bus_info(void)
info = alloc_pci_root_info(min_bus, max_bus, node, link);
}
+ /*
+ * The following code extracts routing information for use on old
+ * systems where Linux doesn't automatically use host bridge _CRS
+ * methods (or when the user specifies "pci=nocrs").
+ *
+ * We only do this through Fam11h, because _CRS should be enough on
+ * newer systems.
+ */
+ if (boot_cpu_data.x86 > 0x11)
+ return 0;
+
/* get the default node and link for left over res */
- reg = read_pci_config(bus, slot, 0, 0x60);
+ reg = read_pci_config(bus, slot, 0, AMD_NB_F0_NODE_ID);
def_node = (reg >> 8) & 0x07;
- reg = read_pci_config(bus, slot, 0, 0x64);
+ reg = read_pci_config(bus, slot, 0, AMD_NB_F0_UNIT_ID);
def_link = (reg >> 8) & 0x03;
memset(range, 0, sizeof(range));
@@ -363,7 +388,7 @@ static int __init pci_io_ecs_init(void)
int cpu;
/* assume all cpus from fam10h have IO ECS */
- if (boot_cpu_data.x86 < 0x10)
+ if (boot_cpu_data.x86 < 0x10)
return 0;
/* Try the PCI method first. */
@@ -387,7 +412,7 @@ static int __init amd_postcore_init(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
return 0;
- early_fill_mp_bus_info();
+ early_root_info_init();
pci_io_ecs_init();
return 0;
diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c
index 614392ced7d6..bb461cfd01ab 100644
--- a/arch/x86/pci/broadcom_bus.c
+++ b/arch/x86/pci/broadcom_bus.c
@@ -60,8 +60,8 @@ static void __init cnb20le_res(u8 bus, u8 slot, u8 func)
word1 = read_pci_config_16(bus, slot, func, 0xc4);
word2 = read_pci_config_16(bus, slot, func, 0xc6);
if (word1 != word2) {
- res.start = (word1 << 16) | 0x0000;
- res.end = (word2 << 16) | 0xffff;
+ res.start = ((resource_size_t) word1 << 16) | 0x0000;
+ res.end = ((resource_size_t) word2 << 16) | 0xffff;
res.flags = IORESOURCE_MEM | IORESOURCE_PREFETCH;
update_res(info, res.start, res.end, res.flags, 0);
}
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 94ae9ae9574f..c61ea57d1ba1 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -6,6 +6,7 @@
#include <linux/dmi.h>
#include <linux/pci.h>
#include <linux/vgaarb.h>
+#include <asm/hpet.h>
#include <asm/pci_x86.h>
static void pci_fixup_i450nx(struct pci_dev *d)
@@ -325,6 +326,27 @@ static void pci_fixup_video(struct pci_dev *pdev)
struct pci_bus *bus;
u16 config;
+ if (!vga_default_device()) {
+ resource_size_t start, end;
+ int i;
+
+ /* Does firmware framebuffer belong to us? */
+ for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
+ if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM))
+ continue;
+
+ start = pci_resource_start(pdev, i);
+ end = pci_resource_end(pdev, i);
+
+ if (!start || !end)
+ continue;
+
+ if (screen_info.lfb_base >= start &&
+ (screen_info.lfb_base + screen_info.lfb_size) < end)
+ vga_set_default_device(pdev);
+ }
+ }
+
/* Is VGA routed to us? */
bus = pdev->bus;
while (bus) {
@@ -337,9 +359,7 @@ static void pci_fixup_video(struct pci_dev *pdev)
* type BRIDGE, or CARDBUS. Host to PCI controllers use
* PCI header type NORMAL.
*/
- if (bridge
- && ((bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE)
- || (bridge->hdr_type == PCI_HEADER_TYPE_CARDBUS))) {
+ if (bridge && (pci_is_bridge(bridge))) {
pci_read_config_word(bridge, PCI_BRIDGE_CONTROL,
&config);
if (!(config & PCI_BRIDGE_CTL_VGA))
@@ -526,6 +546,19 @@ static void sb600_disable_hpet_bar(struct pci_dev *dev)
}
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar);
+#ifdef CONFIG_HPET_TIMER
+static void sb600_hpet_quirk(struct pci_dev *dev)
+{
+ struct resource *r = &dev->resource[1];
+
+ if (r->flags & IORESOURCE_MEM && r->start == hpet_address) {
+ r->flags |= IORESOURCE_PCI_FIXED;
+ dev_info(&dev->dev, "reg 0x14 contains HPET; making it immovable\n");
+ }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, 0x4385, sb600_hpet_quirk);
+#endif
+
/*
* Twinhead H12Y needs us to block out a region otherwise we map devices
* there and any access kills the box.
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index db6b1ab43255..2ae525e0d8ba 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -162,6 +162,10 @@ pcibios_align_resource(void *data, const struct resource *res,
return start;
if (start & 0x300)
start = (start + 0x3ff) & ~0x3ff;
+ } else if (res->flags & IORESOURCE_MEM) {
+ /* The low 1MB range is reserved for ISA cards */
+ if (start < BIOS_END)
+ start = BIOS_END;
}
return start;
}
@@ -271,11 +275,16 @@ static void pcibios_allocate_dev_resources(struct pci_dev *dev, int pass)
"BAR %d: reserving %pr (d=%d, p=%d)\n",
idx, r, disabled, pass);
if (pci_claim_resource(dev, idx) < 0) {
- /* We'll assign a new address later */
- pcibios_save_fw_addr(dev,
- idx, r->start);
- r->end -= r->start;
- r->start = 0;
+ if (r->flags & IORESOURCE_PCI_FIXED) {
+ dev_info(&dev->dev, "BAR %d %pR is immovable\n",
+ idx, r);
+ } else {
+ /* We'll assign a new address later */
+ pcibios_save_fw_addr(dev,
+ idx, r->start);
+ r->end -= r->start;
+ r->start = 0;
+ }
}
}
}
@@ -356,6 +365,12 @@ static int __init pcibios_assign_resources(void)
return 0;
}
+/**
+ * called in fs_initcall (one below subsys_initcall),
+ * give a chance for motherboard reserve resources
+ */
+fs_initcall(pcibios_assign_resources);
+
void pcibios_resource_survey_bus(struct pci_bus *bus)
{
dev_printk(KERN_DEBUG, &bus->dev, "Allocating resources\n");
@@ -392,12 +407,6 @@ void __init pcibios_resource_survey(void)
ioapic_insert_resources();
}
-/**
- * called in fs_initcall (one below subsys_initcall),
- * give a chance for motherboard reserve resources
- */
-fs_initcall(pcibios_assign_resources);
-
static const struct vm_operations_struct pci_mmap_ops = {
.access = generic_access_phys,
};
diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
index 9d8a509c9730..5ceda85b8687 100644
--- a/arch/x86/pci/sta2x11-fixup.c
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -173,9 +173,7 @@ static void *sta2x11_swiotlb_alloc_coherent(struct device *dev,
{
void *vaddr;
- vaddr = dma_generic_alloc_coherent(dev, size, dma_handle, flags, attrs);
- if (!vaddr)
- vaddr = swiotlb_alloc_coherent(dev, size, dma_handle, flags);
+ vaddr = x86_swiotlb_alloc_coherent(dev, size, dma_handle, flags, attrs);
*dma_handle = p2a(*dma_handle, to_pci_dev(dev));
return vaddr;
}
@@ -183,7 +181,7 @@ static void *sta2x11_swiotlb_alloc_coherent(struct device *dev,
/* We have our own dma_ops: the same as swiotlb but from alloc (above) */
static struct dma_map_ops sta2x11_dma_ops = {
.alloc = sta2x11_swiotlb_alloc_coherent,
- .free = swiotlb_free_coherent,
+ .free = x86_swiotlb_free_coherent,
.map_page = swiotlb_map_page,
.unmap_page = swiotlb_unmap_page,
.map_sg = swiotlb_map_sg_attrs,
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile
index d51045afcaaf..2846aaab5103 100644
--- a/arch/x86/platform/efi/Makefile
+++ b/arch/x86/platform/efi/Makefile
@@ -1,4 +1,4 @@
-obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
+obj-$(CONFIG_EFI) += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o
obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o
obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o
obj-$(CONFIG_EFI_MIXED) += efi_thunk_$(BITS).o
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 3781dd39e8bd..850da94fef30 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -56,13 +56,6 @@
#define EFI_DEBUG
-#define EFI_MIN_RESERVE 5120
-
-#define EFI_DUMMY_GUID \
- EFI_GUID(0x4424ac57, 0xbe4b, 0x47dd, 0x9e, 0x97, 0xed, 0x50, 0xf0, 0x9f, 0x92, 0xa9)
-
-static efi_char16_t efi_dummy_name[6] = { 'D', 'U', 'M', 'M', 'Y', 0 };
-
struct efi_memory_map memmap;
static struct efi efi_phys __initdata;
@@ -95,141 +88,6 @@ static int __init setup_add_efi_memmap(char *arg)
}
early_param("add_efi_memmap", setup_add_efi_memmap);
-static bool efi_no_storage_paranoia;
-
-static int __init setup_storage_paranoia(char *arg)
-{
- efi_no_storage_paranoia = true;
- return 0;
-}
-early_param("efi_no_storage_paranoia", setup_storage_paranoia);
-
-static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
-{
- unsigned long flags;
- efi_status_t status;
-
- spin_lock_irqsave(&rtc_lock, flags);
- status = efi_call_virt2(get_time, tm, tc);
- spin_unlock_irqrestore(&rtc_lock, flags);
- return status;
-}
-
-static efi_status_t virt_efi_set_time(efi_time_t *tm)
-{
- unsigned long flags;
- efi_status_t status;
-
- spin_lock_irqsave(&rtc_lock, flags);
- status = efi_call_virt1(set_time, tm);
- spin_unlock_irqrestore(&rtc_lock, flags);
- return status;
-}
-
-static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled,
- efi_bool_t *pending,
- efi_time_t *tm)
-{
- unsigned long flags;
- efi_status_t status;
-
- spin_lock_irqsave(&rtc_lock, flags);
- status = efi_call_virt3(get_wakeup_time,
- enabled, pending, tm);
- spin_unlock_irqrestore(&rtc_lock, flags);
- return status;
-}
-
-static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
-{
- unsigned long flags;
- efi_status_t status;
-
- spin_lock_irqsave(&rtc_lock, flags);
- status = efi_call_virt2(set_wakeup_time,
- enabled, tm);
- spin_unlock_irqrestore(&rtc_lock, flags);
- return status;
-}
-
-static efi_status_t virt_efi_get_variable(efi_char16_t *name,
- efi_guid_t *vendor,
- u32 *attr,
- unsigned long *data_size,
- void *data)
-{
- return efi_call_virt5(get_variable,
- name, vendor, attr,
- data_size, data);
-}
-
-static efi_status_t virt_efi_get_next_variable(unsigned long *name_size,
- efi_char16_t *name,
- efi_guid_t *vendor)
-{
- return efi_call_virt3(get_next_variable,
- name_size, name, vendor);
-}
-
-static efi_status_t virt_efi_set_variable(efi_char16_t *name,
- efi_guid_t *vendor,
- u32 attr,
- unsigned long data_size,
- void *data)
-{
- return efi_call_virt5(set_variable,
- name, vendor, attr,
- data_size, data);
-}
-
-static efi_status_t virt_efi_query_variable_info(u32 attr,
- u64 *storage_space,
- u64 *remaining_space,
- u64 *max_variable_size)
-{
- if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
- return EFI_UNSUPPORTED;
-
- return efi_call_virt4(query_variable_info, attr, storage_space,
- remaining_space, max_variable_size);
-}
-
-static efi_status_t virt_efi_get_next_high_mono_count(u32 *count)
-{
- return efi_call_virt1(get_next_high_mono_count, count);
-}
-
-static void virt_efi_reset_system(int reset_type,
- efi_status_t status,
- unsigned long data_size,
- efi_char16_t *data)
-{
- efi_call_virt4(reset_system, reset_type, status,
- data_size, data);
-}
-
-static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules,
- unsigned long count,
- unsigned long sg_list)
-{
- if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
- return EFI_UNSUPPORTED;
-
- return efi_call_virt3(update_capsule, capsules, count, sg_list);
-}
-
-static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules,
- unsigned long count,
- u64 *max_size,
- int *reset_type)
-{
- if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
- return EFI_UNSUPPORTED;
-
- return efi_call_virt4(query_capsule_caps, capsules, count, max_size,
- reset_type);
-}
-
static efi_status_t __init phys_efi_set_virtual_address_map(
unsigned long memory_map_size,
unsigned long descriptor_size,
@@ -239,49 +97,13 @@ static efi_status_t __init phys_efi_set_virtual_address_map(
efi_status_t status;
efi_call_phys_prelog();
- status = efi_call_phys4(efi_phys.set_virtual_address_map,
- memory_map_size, descriptor_size,
- descriptor_version, virtual_map);
+ status = efi_call_phys(efi_phys.set_virtual_address_map,
+ memory_map_size, descriptor_size,
+ descriptor_version, virtual_map);
efi_call_phys_epilog();
return status;
}
-int efi_set_rtc_mmss(const struct timespec *now)
-{
- unsigned long nowtime = now->tv_sec;
- efi_status_t status;
- efi_time_t eft;
- efi_time_cap_t cap;
- struct rtc_time tm;
-
- status = efi.get_time(&eft, &cap);
- if (status != EFI_SUCCESS) {
- pr_err("Oops: efitime: can't read time!\n");
- return -1;
- }
-
- rtc_time_to_tm(nowtime, &tm);
- if (!rtc_valid_tm(&tm)) {
- eft.year = tm.tm_year + 1900;
- eft.month = tm.tm_mon + 1;
- eft.day = tm.tm_mday;
- eft.minute = tm.tm_min;
- eft.second = tm.tm_sec;
- eft.nanosecond = 0;
- } else {
- pr_err("%s: Invalid EFI RTC value: write of %lx to EFI RTC failed\n",
- __func__, nowtime);
- return -1;
- }
-
- status = efi.set_time(&eft);
- if (status != EFI_SUCCESS) {
- pr_err("Oops: efitime: can't write time!\n");
- return -1;
- }
- return 0;
-}
-
void efi_get_time(struct timespec *now)
{
efi_status_t status;
@@ -352,6 +174,9 @@ int __init efi_memblock_x86_reserve_range(void)
struct efi_info *e = &boot_params.efi_info;
unsigned long pmap;
+ if (efi_enabled(EFI_PARAVIRT))
+ return 0;
+
#ifdef CONFIG_X86_32
/* Can't handle data above 4GB at this time */
if (e->efi_memmap_hi) {
@@ -394,69 +219,15 @@ static void __init print_efi_memmap(void)
#endif /* EFI_DEBUG */
}
-void __init efi_reserve_boot_services(void)
-{
- void *p;
-
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- efi_memory_desc_t *md = p;
- u64 start = md->phys_addr;
- u64 size = md->num_pages << EFI_PAGE_SHIFT;
-
- if (md->type != EFI_BOOT_SERVICES_CODE &&
- md->type != EFI_BOOT_SERVICES_DATA)
- continue;
- /* Only reserve where possible:
- * - Not within any already allocated areas
- * - Not over any memory area (really needed, if above?)
- * - Not within any part of the kernel
- * - Not the bios reserved area
- */
- if ((start + size > __pa_symbol(_text)
- && start <= __pa_symbol(_end)) ||
- !e820_all_mapped(start, start+size, E820_RAM) ||
- memblock_is_region_reserved(start, size)) {
- /* Could not reserve, skip it */
- md->num_pages = 0;
- memblock_dbg("Could not reserve boot range [0x%010llx-0x%010llx]\n",
- start, start+size-1);
- } else
- memblock_reserve(start, size);
- }
-}
-
void __init efi_unmap_memmap(void)
{
clear_bit(EFI_MEMMAP, &efi.flags);
if (memmap.map) {
- early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
+ early_memunmap(memmap.map, memmap.nr_map * memmap.desc_size);
memmap.map = NULL;
}
}
-void __init efi_free_boot_services(void)
-{
- void *p;
-
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- efi_memory_desc_t *md = p;
- unsigned long long start = md->phys_addr;
- unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
-
- if (md->type != EFI_BOOT_SERVICES_CODE &&
- md->type != EFI_BOOT_SERVICES_DATA)
- continue;
-
- /* Could not reserve boot area */
- if (!size)
- continue;
-
- free_bootmem_late(start, size);
- }
-
- efi_unmap_memmap();
-}
-
static int __init efi_systab_init(void *phys)
{
if (efi_enabled(EFI_64BIT)) {
@@ -469,12 +240,12 @@ static int __init efi_systab_init(void *phys)
if (!data)
return -ENOMEM;
}
- systab64 = early_ioremap((unsigned long)phys,
+ systab64 = early_memremap((unsigned long)phys,
sizeof(*systab64));
if (systab64 == NULL) {
pr_err("Couldn't map the system table!\n");
if (data)
- early_iounmap(data, sizeof(*data));
+ early_memunmap(data, sizeof(*data));
return -ENOMEM;
}
@@ -506,9 +277,9 @@ static int __init efi_systab_init(void *phys)
systab64->tables;
tmp |= data ? data->tables : systab64->tables;
- early_iounmap(systab64, sizeof(*systab64));
+ early_memunmap(systab64, sizeof(*systab64));
if (data)
- early_iounmap(data, sizeof(*data));
+ early_memunmap(data, sizeof(*data));
#ifdef CONFIG_X86_32
if (tmp >> 32) {
pr_err("EFI data located above 4GB, disabling EFI.\n");
@@ -518,7 +289,7 @@ static int __init efi_systab_init(void *phys)
} else {
efi_system_table_32_t *systab32;
- systab32 = early_ioremap((unsigned long)phys,
+ systab32 = early_memremap((unsigned long)phys,
sizeof(*systab32));
if (systab32 == NULL) {
pr_err("Couldn't map the system table!\n");
@@ -539,7 +310,7 @@ static int __init efi_systab_init(void *phys)
efi_systab.nr_tables = systab32->nr_tables;
efi_systab.tables = systab32->tables;
- early_iounmap(systab32, sizeof(*systab32));
+ early_memunmap(systab32, sizeof(*systab32));
}
efi.systab = &efi_systab;
@@ -565,7 +336,7 @@ static int __init efi_runtime_init32(void)
{
efi_runtime_services_32_t *runtime;
- runtime = early_ioremap((unsigned long)efi.systab->runtime,
+ runtime = early_memremap((unsigned long)efi.systab->runtime,
sizeof(efi_runtime_services_32_t));
if (!runtime) {
pr_err("Could not map the runtime service table!\n");
@@ -580,7 +351,7 @@ static int __init efi_runtime_init32(void)
efi_phys.set_virtual_address_map =
(efi_set_virtual_address_map_t *)
(unsigned long)runtime->set_virtual_address_map;
- early_iounmap(runtime, sizeof(efi_runtime_services_32_t));
+ early_memunmap(runtime, sizeof(efi_runtime_services_32_t));
return 0;
}
@@ -589,7 +360,7 @@ static int __init efi_runtime_init64(void)
{
efi_runtime_services_64_t *runtime;
- runtime = early_ioremap((unsigned long)efi.systab->runtime,
+ runtime = early_memremap((unsigned long)efi.systab->runtime,
sizeof(efi_runtime_services_64_t));
if (!runtime) {
pr_err("Could not map the runtime service table!\n");
@@ -604,7 +375,7 @@ static int __init efi_runtime_init64(void)
efi_phys.set_virtual_address_map =
(efi_set_virtual_address_map_t *)
(unsigned long)runtime->set_virtual_address_map;
- early_iounmap(runtime, sizeof(efi_runtime_services_64_t));
+ early_memunmap(runtime, sizeof(efi_runtime_services_64_t));
return 0;
}
@@ -618,14 +389,24 @@ static int __init efi_runtime_init(void)
* the runtime services table so that we can grab the physical
* address of several of the EFI runtime functions, needed to
* set the firmware into virtual mode.
+ *
+ * When EFI_PARAVIRT is in force then we could not map runtime
+ * service memory region because we do not have direct access to it.
+ * However, runtime services are available through proxy functions
+ * (e.g. in case of Xen dom0 EFI implementation they call special
+ * hypercall which executes relevant EFI functions) and that is why
+ * they are always enabled.
*/
- if (efi_enabled(EFI_64BIT))
- rv = efi_runtime_init64();
- else
- rv = efi_runtime_init32();
- if (rv)
- return rv;
+ if (!efi_enabled(EFI_PARAVIRT)) {
+ if (efi_enabled(EFI_64BIT))
+ rv = efi_runtime_init64();
+ else
+ rv = efi_runtime_init32();
+
+ if (rv)
+ return rv;
+ }
set_bit(EFI_RUNTIME_SERVICES, &efi.flags);
@@ -634,8 +415,11 @@ static int __init efi_runtime_init(void)
static int __init efi_memmap_init(void)
{
+ if (efi_enabled(EFI_PARAVIRT))
+ return 0;
+
/* Map the EFI memory map */
- memmap.map = early_ioremap((unsigned long)memmap.phys_map,
+ memmap.map = early_memremap((unsigned long)memmap.phys_map,
memmap.nr_map * memmap.desc_size);
if (memmap.map == NULL) {
pr_err("Could not map the memory map!\n");
@@ -651,62 +435,6 @@ static int __init efi_memmap_init(void)
return 0;
}
-/*
- * A number of config table entries get remapped to virtual addresses
- * after entering EFI virtual mode. However, the kexec kernel requires
- * their physical addresses therefore we pass them via setup_data and
- * correct those entries to their respective physical addresses here.
- *
- * Currently only handles smbios which is necessary for some firmware
- * implementation.
- */
-static int __init efi_reuse_config(u64 tables, int nr_tables)
-{
- int i, sz, ret = 0;
- void *p, *tablep;
- struct efi_setup_data *data;
-
- if (!efi_setup)
- return 0;
-
- if (!efi_enabled(EFI_64BIT))
- return 0;
-
- data = early_memremap(efi_setup, sizeof(*data));
- if (!data) {
- ret = -ENOMEM;
- goto out;
- }
-
- if (!data->smbios)
- goto out_memremap;
-
- sz = sizeof(efi_config_table_64_t);
-
- p = tablep = early_memremap(tables, nr_tables * sz);
- if (!p) {
- pr_err("Could not map Configuration table!\n");
- ret = -ENOMEM;
- goto out_memremap;
- }
-
- for (i = 0; i < efi.systab->nr_tables; i++) {
- efi_guid_t guid;
-
- guid = ((efi_config_table_64_t *)p)->guid;
-
- if (!efi_guidcmp(guid, SMBIOS_TABLE_GUID))
- ((efi_config_table_64_t *)p)->table = data->smbios;
- p += sz;
- }
- early_iounmap(tablep, nr_tables * sz);
-
-out_memremap:
- early_iounmap(data, sizeof(*data));
-out:
- return ret;
-}
-
void __init efi_init(void)
{
efi_char16_t *c16;
@@ -730,8 +458,6 @@ void __init efi_init(void)
if (efi_systab_init(efi_phys.systab))
return;
- set_bit(EFI_SYSTEM_TABLES, &efi.flags);
-
efi.config_table = (unsigned long)efi.systab->tables;
efi.fw_vendor = (unsigned long)efi.systab->fw_vendor;
efi.runtime = (unsigned long)efi.systab->runtime;
@@ -739,14 +465,14 @@ void __init efi_init(void)
/*
* Show what we know for posterity
*/
- c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2);
+ c16 = tmp = early_memremap(efi.systab->fw_vendor, 2);
if (c16) {
for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i)
vendor[i] = *c16++;
vendor[i] = '\0';
} else
pr_err("Could not map the firmware vendor!\n");
- early_iounmap(tmp, 2);
+ early_memunmap(tmp, 2);
pr_info("EFI v%u.%.02u by %s\n",
efi.systab->hdr.revision >> 16,
@@ -772,8 +498,6 @@ void __init efi_init(void)
if (efi_memmap_init())
return;
- set_bit(EFI_MEMMAP, &efi.flags);
-
print_efi_memmap();
}
@@ -849,22 +573,6 @@ void __init old_map_region(efi_memory_desc_t *md)
(unsigned long long)md->phys_addr);
}
-static void native_runtime_setup(void)
-{
- efi.get_time = virt_efi_get_time;
- efi.set_time = virt_efi_set_time;
- efi.get_wakeup_time = virt_efi_get_wakeup_time;
- efi.set_wakeup_time = virt_efi_set_wakeup_time;
- efi.get_variable = virt_efi_get_variable;
- efi.get_next_variable = virt_efi_get_next_variable;
- efi.set_variable = virt_efi_set_variable;
- efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
- efi.reset_system = virt_efi_reset_system;
- efi.query_variable_info = virt_efi_query_variable_info;
- efi.update_capsule = virt_efi_update_capsule;
- efi.query_capsule_caps = virt_efi_query_capsule_caps;
-}
-
/* Merge contiguous regions of the same type and attribute */
static void __init efi_merge_regions(void)
{
@@ -919,6 +627,9 @@ static void __init save_runtime_map(void)
void *tmp, *p, *q = NULL;
int count = 0;
+ if (efi_enabled(EFI_OLD_MEMMAP))
+ return;
+
for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
md = p;
@@ -1048,7 +759,7 @@ static void __init kexec_enter_virtual_mode(void)
*/
efi.runtime_version = efi_systab.hdr.revision;
- native_runtime_setup();
+ efi_native_runtime_setup();
efi.set_virtual_address_map = NULL;
@@ -1056,11 +767,7 @@ static void __init kexec_enter_virtual_mode(void)
runtime_code_page_mkexec();
/* clean DUMMY object */
- efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
- EFI_VARIABLE_NON_VOLATILE |
- EFI_VARIABLE_BOOTSERVICE_ACCESS |
- EFI_VARIABLE_RUNTIME_ACCESS,
- 0, NULL);
+ efi_delete_dummy_variable();
#endif
}
@@ -1141,7 +848,7 @@ static void __init __efi_enter_virtual_mode(void)
efi.runtime_version = efi_systab.hdr.revision;
if (efi_is_native())
- native_runtime_setup();
+ efi_native_runtime_setup();
else
efi_thunk_runtime_setup();
@@ -1178,15 +885,14 @@ static void __init __efi_enter_virtual_mode(void)
free_pages((unsigned long)new_memmap, pg_shift);
/* clean DUMMY object */
- efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
- EFI_VARIABLE_NON_VOLATILE |
- EFI_VARIABLE_BOOTSERVICE_ACCESS |
- EFI_VARIABLE_RUNTIME_ACCESS,
- 0, NULL);
+ efi_delete_dummy_variable();
}
void __init efi_enter_virtual_mode(void)
{
+ if (efi_enabled(EFI_PARAVIRT))
+ return;
+
if (efi_setup)
kexec_enter_virtual_mode();
else
@@ -1219,6 +925,9 @@ u64 efi_mem_attributes(unsigned long phys_addr)
efi_memory_desc_t *md;
void *p;
+ if (!efi_enabled(EFI_MEMMAP))
+ return 0;
+
for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
md = p;
if ((md->phys_addr <= phys_addr) &&
@@ -1229,86 +938,6 @@ u64 efi_mem_attributes(unsigned long phys_addr)
return 0;
}
-/*
- * Some firmware implementations refuse to boot if there's insufficient space
- * in the variable store. Ensure that we never use more than a safe limit.
- *
- * Return EFI_SUCCESS if it is safe to write 'size' bytes to the variable
- * store.
- */
-efi_status_t efi_query_variable_store(u32 attributes, unsigned long size)
-{
- efi_status_t status;
- u64 storage_size, remaining_size, max_size;
-
- if (!(attributes & EFI_VARIABLE_NON_VOLATILE))
- return 0;
-
- status = efi.query_variable_info(attributes, &storage_size,
- &remaining_size, &max_size);
- if (status != EFI_SUCCESS)
- return status;
-
- /*
- * We account for that by refusing the write if permitting it would
- * reduce the available space to under 5KB. This figure was provided by
- * Samsung, so should be safe.
- */
- if ((remaining_size - size < EFI_MIN_RESERVE) &&
- !efi_no_storage_paranoia) {
-
- /*
- * Triggering garbage collection may require that the firmware
- * generate a real EFI_OUT_OF_RESOURCES error. We can force
- * that by attempting to use more space than is available.
- */
- unsigned long dummy_size = remaining_size + 1024;
- void *dummy = kzalloc(dummy_size, GFP_ATOMIC);
-
- if (!dummy)
- return EFI_OUT_OF_RESOURCES;
-
- status = efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
- EFI_VARIABLE_NON_VOLATILE |
- EFI_VARIABLE_BOOTSERVICE_ACCESS |
- EFI_VARIABLE_RUNTIME_ACCESS,
- dummy_size, dummy);
-
- if (status == EFI_SUCCESS) {
- /*
- * This should have failed, so if it didn't make sure
- * that we delete it...
- */
- efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
- EFI_VARIABLE_NON_VOLATILE |
- EFI_VARIABLE_BOOTSERVICE_ACCESS |
- EFI_VARIABLE_RUNTIME_ACCESS,
- 0, dummy);
- }
-
- kfree(dummy);
-
- /*
- * The runtime code may now have triggered a garbage collection
- * run, so check the variable info again
- */
- status = efi.query_variable_info(attributes, &storage_size,
- &remaining_size, &max_size);
-
- if (status != EFI_SUCCESS)
- return status;
-
- /*
- * There still isn't enough room, so return an error
- */
- if (remaining_size - size < EFI_MIN_RESERVE)
- return EFI_OUT_OF_RESOURCES;
- }
-
- return EFI_SUCCESS;
-}
-EXPORT_SYMBOL_GPL(efi_query_variable_store);
-
static int __init parse_efi_cmdline(char *str)
{
if (*str == '=')
@@ -1320,22 +949,3 @@ static int __init parse_efi_cmdline(char *str)
return 0;
}
early_param("efi", parse_efi_cmdline);
-
-void __init efi_apply_memmap_quirks(void)
-{
- /*
- * Once setup is done earlier, unmap the EFI memory map on mismatched
- * firmware/kernel architectures since there is no support for runtime
- * services.
- */
- if (!efi_runtime_supported()) {
- pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n");
- efi_unmap_memmap();
- }
-
- /*
- * UV doesn't support the new EFI pagetable mapping yet.
- */
- if (is_uv_system())
- set_bit(EFI_OLD_MEMMAP, &efi.flags);
-}
diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S
index e0984ef0374b..5fcda7272550 100644
--- a/arch/x86/platform/efi/efi_stub_64.S
+++ b/arch/x86/platform/efi/efi_stub_64.S
@@ -73,84 +73,7 @@
2:
.endm
-ENTRY(efi_call0)
- SAVE_XMM
- subq $32, %rsp
- SWITCH_PGT
- call *%rdi
- RESTORE_PGT
- addq $32, %rsp
- RESTORE_XMM
- ret
-ENDPROC(efi_call0)
-
-ENTRY(efi_call1)
- SAVE_XMM
- subq $32, %rsp
- mov %rsi, %rcx
- SWITCH_PGT
- call *%rdi
- RESTORE_PGT
- addq $32, %rsp
- RESTORE_XMM
- ret
-ENDPROC(efi_call1)
-
-ENTRY(efi_call2)
- SAVE_XMM
- subq $32, %rsp
- mov %rsi, %rcx
- SWITCH_PGT
- call *%rdi
- RESTORE_PGT
- addq $32, %rsp
- RESTORE_XMM
- ret
-ENDPROC(efi_call2)
-
-ENTRY(efi_call3)
- SAVE_XMM
- subq $32, %rsp
- mov %rcx, %r8
- mov %rsi, %rcx
- SWITCH_PGT
- call *%rdi
- RESTORE_PGT
- addq $32, %rsp
- RESTORE_XMM
- ret
-ENDPROC(efi_call3)
-
-ENTRY(efi_call4)
- SAVE_XMM
- subq $32, %rsp
- mov %r8, %r9
- mov %rcx, %r8
- mov %rsi, %rcx
- SWITCH_PGT
- call *%rdi
- RESTORE_PGT
- addq $32, %rsp
- RESTORE_XMM
- ret
-ENDPROC(efi_call4)
-
-ENTRY(efi_call5)
- SAVE_XMM
- subq $48, %rsp
- mov %r9, 32(%rsp)
- mov %r8, %r9
- mov %rcx, %r8
- mov %rsi, %rcx
- SWITCH_PGT
- call *%rdi
- RESTORE_PGT
- addq $48, %rsp
- RESTORE_XMM
- ret
-ENDPROC(efi_call5)
-
-ENTRY(efi_call6)
+ENTRY(efi_call)
SAVE_XMM
mov (%rsp), %rax
mov 8(%rax), %rax
@@ -166,7 +89,7 @@ ENTRY(efi_call6)
addq $48, %rsp
RESTORE_XMM
ret
-ENDPROC(efi_call6)
+ENDPROC(efi_call)
#ifdef CONFIG_EFI_MIXED
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
new file mode 100644
index 000000000000..1c7380da65ff
--- /dev/null
+++ b/arch/x86/platform/efi/quirks.c
@@ -0,0 +1,290 @@
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/time.h>
+#include <linux/types.h>
+#include <linux/efi.h>
+#include <linux/slab.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h>
+#include <linux/acpi.h>
+#include <asm/efi.h>
+#include <asm/uv/uv.h>
+
+#define EFI_MIN_RESERVE 5120
+
+#define EFI_DUMMY_GUID \
+ EFI_GUID(0x4424ac57, 0xbe4b, 0x47dd, 0x9e, 0x97, 0xed, 0x50, 0xf0, 0x9f, 0x92, 0xa9)
+
+static efi_char16_t efi_dummy_name[6] = { 'D', 'U', 'M', 'M', 'Y', 0 };
+
+static bool efi_no_storage_paranoia;
+
+/*
+ * Some firmware implementations refuse to boot if there's insufficient
+ * space in the variable store. The implementation of garbage collection
+ * in some FW versions causes stale (deleted) variables to take up space
+ * longer than intended and space is only freed once the store becomes
+ * almost completely full.
+ *
+ * Enabling this option disables the space checks in
+ * efi_query_variable_store() and forces garbage collection.
+ *
+ * Only enable this option if deleting EFI variables does not free up
+ * space in your variable store, e.g. if despite deleting variables
+ * you're unable to create new ones.
+ */
+static int __init setup_storage_paranoia(char *arg)
+{
+ efi_no_storage_paranoia = true;
+ return 0;
+}
+early_param("efi_no_storage_paranoia", setup_storage_paranoia);
+
+/*
+ * Deleting the dummy variable which kicks off garbage collection
+*/
+void efi_delete_dummy_variable(void)
+{
+ efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
+ EFI_VARIABLE_NON_VOLATILE |
+ EFI_VARIABLE_BOOTSERVICE_ACCESS |
+ EFI_VARIABLE_RUNTIME_ACCESS,
+ 0, NULL);
+}
+
+/*
+ * Some firmware implementations refuse to boot if there's insufficient space
+ * in the variable store. Ensure that we never use more than a safe limit.
+ *
+ * Return EFI_SUCCESS if it is safe to write 'size' bytes to the variable
+ * store.
+ */
+efi_status_t efi_query_variable_store(u32 attributes, unsigned long size)
+{
+ efi_status_t status;
+ u64 storage_size, remaining_size, max_size;
+
+ if (!(attributes & EFI_VARIABLE_NON_VOLATILE))
+ return 0;
+
+ status = efi.query_variable_info(attributes, &storage_size,
+ &remaining_size, &max_size);
+ if (status != EFI_SUCCESS)
+ return status;
+
+ /*
+ * We account for that by refusing the write if permitting it would
+ * reduce the available space to under 5KB. This figure was provided by
+ * Samsung, so should be safe.
+ */
+ if ((remaining_size - size < EFI_MIN_RESERVE) &&
+ !efi_no_storage_paranoia) {
+
+ /*
+ * Triggering garbage collection may require that the firmware
+ * generate a real EFI_OUT_OF_RESOURCES error. We can force
+ * that by attempting to use more space than is available.
+ */
+ unsigned long dummy_size = remaining_size + 1024;
+ void *dummy = kzalloc(dummy_size, GFP_ATOMIC);
+
+ if (!dummy)
+ return EFI_OUT_OF_RESOURCES;
+
+ status = efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
+ EFI_VARIABLE_NON_VOLATILE |
+ EFI_VARIABLE_BOOTSERVICE_ACCESS |
+ EFI_VARIABLE_RUNTIME_ACCESS,
+ dummy_size, dummy);
+
+ if (status == EFI_SUCCESS) {
+ /*
+ * This should have failed, so if it didn't make sure
+ * that we delete it...
+ */
+ efi_delete_dummy_variable();
+ }
+
+ kfree(dummy);
+
+ /*
+ * The runtime code may now have triggered a garbage collection
+ * run, so check the variable info again
+ */
+ status = efi.query_variable_info(attributes, &storage_size,
+ &remaining_size, &max_size);
+
+ if (status != EFI_SUCCESS)
+ return status;
+
+ /*
+ * There still isn't enough room, so return an error
+ */
+ if (remaining_size - size < EFI_MIN_RESERVE)
+ return EFI_OUT_OF_RESOURCES;
+ }
+
+ return EFI_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(efi_query_variable_store);
+
+/*
+ * The UEFI specification makes it clear that the operating system is free to do
+ * whatever it wants with boot services code after ExitBootServices() has been
+ * called. Ignoring this recommendation a significant bunch of EFI implementations
+ * continue calling into boot services code (SetVirtualAddressMap). In order to
+ * work around such buggy implementations we reserve boot services region during
+ * EFI init and make sure it stays executable. Then, after SetVirtualAddressMap(), it
+* is discarded.
+*/
+void __init efi_reserve_boot_services(void)
+{
+ void *p;
+
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ efi_memory_desc_t *md = p;
+ u64 start = md->phys_addr;
+ u64 size = md->num_pages << EFI_PAGE_SHIFT;
+
+ if (md->type != EFI_BOOT_SERVICES_CODE &&
+ md->type != EFI_BOOT_SERVICES_DATA)
+ continue;
+ /* Only reserve where possible:
+ * - Not within any already allocated areas
+ * - Not over any memory area (really needed, if above?)
+ * - Not within any part of the kernel
+ * - Not the bios reserved area
+ */
+ if ((start + size > __pa_symbol(_text)
+ && start <= __pa_symbol(_end)) ||
+ !e820_all_mapped(start, start+size, E820_RAM) ||
+ memblock_is_region_reserved(start, size)) {
+ /* Could not reserve, skip it */
+ md->num_pages = 0;
+ memblock_dbg("Could not reserve boot range [0x%010llx-0x%010llx]\n",
+ start, start+size-1);
+ } else
+ memblock_reserve(start, size);
+ }
+}
+
+void __init efi_free_boot_services(void)
+{
+ void *p;
+
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ efi_memory_desc_t *md = p;
+ unsigned long long start = md->phys_addr;
+ unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+
+ if (md->type != EFI_BOOT_SERVICES_CODE &&
+ md->type != EFI_BOOT_SERVICES_DATA)
+ continue;
+
+ /* Could not reserve boot area */
+ if (!size)
+ continue;
+
+ free_bootmem_late(start, size);
+ }
+
+ efi_unmap_memmap();
+}
+
+/*
+ * A number of config table entries get remapped to virtual addresses
+ * after entering EFI virtual mode. However, the kexec kernel requires
+ * their physical addresses therefore we pass them via setup_data and
+ * correct those entries to their respective physical addresses here.
+ *
+ * Currently only handles smbios which is necessary for some firmware
+ * implementation.
+ */
+int __init efi_reuse_config(u64 tables, int nr_tables)
+{
+ int i, sz, ret = 0;
+ void *p, *tablep;
+ struct efi_setup_data *data;
+
+ if (!efi_setup)
+ return 0;
+
+ if (!efi_enabled(EFI_64BIT))
+ return 0;
+
+ data = early_memremap(efi_setup, sizeof(*data));
+ if (!data) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (!data->smbios)
+ goto out_memremap;
+
+ sz = sizeof(efi_config_table_64_t);
+
+ p = tablep = early_memremap(tables, nr_tables * sz);
+ if (!p) {
+ pr_err("Could not map Configuration table!\n");
+ ret = -ENOMEM;
+ goto out_memremap;
+ }
+
+ for (i = 0; i < efi.systab->nr_tables; i++) {
+ efi_guid_t guid;
+
+ guid = ((efi_config_table_64_t *)p)->guid;
+
+ if (!efi_guidcmp(guid, SMBIOS_TABLE_GUID))
+ ((efi_config_table_64_t *)p)->table = data->smbios;
+ p += sz;
+ }
+ early_memunmap(tablep, nr_tables * sz);
+
+out_memremap:
+ early_memunmap(data, sizeof(*data));
+out:
+ return ret;
+}
+
+void __init efi_apply_memmap_quirks(void)
+{
+ /*
+ * Once setup is done earlier, unmap the EFI memory map on mismatched
+ * firmware/kernel architectures since there is no support for runtime
+ * services.
+ */
+ if (!efi_runtime_supported()) {
+ pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n");
+ efi_unmap_memmap();
+ }
+
+ /*
+ * UV doesn't support the new EFI pagetable mapping yet.
+ */
+ if (is_uv_system())
+ set_bit(EFI_OLD_MEMMAP, &efi.flags);
+}
+
+/*
+ * For most modern platforms the preferred method of powering off is via
+ * ACPI. However, there are some that are known to require the use of
+ * EFI runtime services and for which ACPI does not work at all.
+ *
+ * Using EFI is a last resort, to be used only if no other option
+ * exists.
+ */
+bool efi_reboot_required(void)
+{
+ if (!acpi_gbl_reduced_hardware)
+ return false;
+
+ efi_reboot_quirk_mode = EFI_RESET_WARM;
+ return true;
+}
+
+bool efi_poweroff_required(void)
+{
+ return !!acpi_gbl_reduced_hardware;
+}
diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile
index 097e7a7940d8..af9307f2cc28 100644
--- a/arch/x86/platform/intel-mid/device_libs/Makefile
+++ b/arch/x86/platform/intel-mid/device_libs/Makefile
@@ -20,3 +20,4 @@ obj-$(subst m,y,$(CONFIG_DRM_MEDFIELD)) += platform_tc35876x.o
obj-$(subst m,y,$(CONFIG_SERIAL_MRST_MAX3110)) += platform_max3111.o
# MISC Devices
obj-$(subst m,y,$(CONFIG_KEYBOARD_GPIO)) += platform_gpio_keys.o
+obj-$(subst m,y,$(CONFIG_INTEL_MID_WATCHDOG)) += platform_wdt.o
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c
new file mode 100644
index 000000000000..973cf3bfa9fd
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c
@@ -0,0 +1,72 @@
+/*
+ * platform_wdt.c: Watchdog platform library file
+ *
+ * (C) Copyright 2014 Intel Corporation
+ * Author: David Cohen <david.a.cohen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/platform_data/intel-mid_wdt.h>
+#include <asm/intel-mid.h>
+#include <asm/io_apic.h>
+
+#define TANGIER_EXT_TIMER0_MSI 15
+
+static struct platform_device wdt_dev = {
+ .name = "intel_mid_wdt",
+ .id = -1,
+};
+
+static int tangier_probe(struct platform_device *pdev)
+{
+ int ioapic;
+ int irq;
+ struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data;
+ struct io_apic_irq_attr irq_attr = { 0 };
+
+ if (!pdata)
+ return -EINVAL;
+
+ irq = pdata->irq;
+ ioapic = mp_find_ioapic(irq);
+ if (ioapic >= 0) {
+ int ret;
+ irq_attr.ioapic = ioapic;
+ irq_attr.ioapic_pin = irq;
+ irq_attr.trigger = 1;
+ /* irq_attr.polarity = 0; -> Active high */
+ ret = io_apic_set_pci_routing(NULL, irq, &irq_attr);
+ if (ret)
+ return ret;
+ } else {
+ dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n",
+ irq);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct intel_mid_wdt_pdata tangier_pdata = {
+ .irq = TANGIER_EXT_TIMER0_MSI,
+ .probe = tangier_probe,
+};
+
+static int __init register_mid_wdt(void)
+{
+ if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER) {
+ wdt_dev.dev.platform_data = &tangier_pdata;
+ return platform_device_register(&wdt_dev);
+ }
+
+ return -ENODEV;
+}
+
+rootfs_initcall(register_mid_wdt);
diff --git a/arch/x86/platform/ts5500/ts5500.c b/arch/x86/platform/ts5500/ts5500.c
index 9471b9456f25..baf16e72e668 100644
--- a/arch/x86/platform/ts5500/ts5500.c
+++ b/arch/x86/platform/ts5500/ts5500.c
@@ -1,7 +1,7 @@
/*
* Technologic Systems TS-5500 Single Board Computer support
*
- * Copyright (C) 2013 Savoir-faire Linux Inc.
+ * Copyright (C) 2013-2014 Savoir-faire Linux Inc.
* Vivien Didelot <vivien.didelot@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify it under
@@ -15,8 +15,8 @@
* state or available options. For further information about sysfs entries, see
* Documentation/ABI/testing/sysfs-platform-ts5500.
*
- * This code actually supports the TS-5500 platform, but it may be extended to
- * support similar Technologic Systems x86-based platforms, such as the TS-5600.
+ * This code may be extended to support similar x86-based platforms.
+ * Actually, the TS-5500 and TS-5400 are supported.
*/
#include <linux/delay.h>
@@ -32,6 +32,7 @@
/* Product code register */
#define TS5500_PRODUCT_CODE_ADDR 0x74
#define TS5500_PRODUCT_CODE 0x60 /* TS-5500 product code */
+#define TS5400_PRODUCT_CODE 0x40 /* TS-5400 product code */
/* SRAM/RS-485/ADC options, and RS-485 RTS/Automatic RS-485 flags register */
#define TS5500_SRAM_RS485_ADC_ADDR 0x75
@@ -66,6 +67,7 @@
/**
* struct ts5500_sbc - TS-5500 board description
+ * @name: Board model name.
* @id: Board product ID.
* @sram: Flag for SRAM option.
* @rs485: Flag for RS-485 option.
@@ -75,6 +77,7 @@
* @jumpers: Bitfield for jumpers' state.
*/
struct ts5500_sbc {
+ const char *name;
int id;
bool sram;
bool rs485;
@@ -122,13 +125,16 @@ static int __init ts5500_detect_config(struct ts5500_sbc *sbc)
if (!request_region(TS5500_PRODUCT_CODE_ADDR, 4, "ts5500"))
return -EBUSY;
- tmp = inb(TS5500_PRODUCT_CODE_ADDR);
- if (tmp != TS5500_PRODUCT_CODE) {
- pr_err("This platform is not a TS-5500 (found ID 0x%x)\n", tmp);
+ sbc->id = inb(TS5500_PRODUCT_CODE_ADDR);
+ if (sbc->id == TS5500_PRODUCT_CODE) {
+ sbc->name = "TS-5500";
+ } else if (sbc->id == TS5400_PRODUCT_CODE) {
+ sbc->name = "TS-5400";
+ } else {
+ pr_err("ts5500: unknown product code 0x%x\n", sbc->id);
ret = -ENODEV;
goto cleanup;
}
- sbc->id = tmp;
tmp = inb(TS5500_SRAM_RS485_ADC_ADDR);
sbc->sram = tmp & TS5500_SRAM;
@@ -147,48 +153,52 @@ cleanup:
return ret;
}
-static ssize_t ts5500_show_id(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t name_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
{
struct ts5500_sbc *sbc = dev_get_drvdata(dev);
- return sprintf(buf, "0x%.2x\n", sbc->id);
+ return sprintf(buf, "%s\n", sbc->name);
}
+static DEVICE_ATTR_RO(name);
-static ssize_t ts5500_show_jumpers(struct device *dev,
- struct device_attribute *attr,
- char *buf)
+static ssize_t id_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
{
struct ts5500_sbc *sbc = dev_get_drvdata(dev);
- return sprintf(buf, "0x%.2x\n", sbc->jumpers >> 1);
+ return sprintf(buf, "0x%.2x\n", sbc->id);
}
+static DEVICE_ATTR_RO(id);
-#define TS5500_SHOW(field) \
- static ssize_t ts5500_show_##field(struct device *dev, \
- struct device_attribute *attr, \
- char *buf) \
- { \
- struct ts5500_sbc *sbc = dev_get_drvdata(dev); \
- return sprintf(buf, "%d\n", sbc->field); \
- }
-
-TS5500_SHOW(sram)
-TS5500_SHOW(rs485)
-TS5500_SHOW(adc)
-TS5500_SHOW(ereset)
-TS5500_SHOW(itr)
+static ssize_t jumpers_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct ts5500_sbc *sbc = dev_get_drvdata(dev);
-static DEVICE_ATTR(id, S_IRUGO, ts5500_show_id, NULL);
-static DEVICE_ATTR(jumpers, S_IRUGO, ts5500_show_jumpers, NULL);
-static DEVICE_ATTR(sram, S_IRUGO, ts5500_show_sram, NULL);
-static DEVICE_ATTR(rs485, S_IRUGO, ts5500_show_rs485, NULL);
-static DEVICE_ATTR(adc, S_IRUGO, ts5500_show_adc, NULL);
-static DEVICE_ATTR(ereset, S_IRUGO, ts5500_show_ereset, NULL);
-static DEVICE_ATTR(itr, S_IRUGO, ts5500_show_itr, NULL);
+ return sprintf(buf, "0x%.2x\n", sbc->jumpers >> 1);
+}
+static DEVICE_ATTR_RO(jumpers);
+
+#define TS5500_ATTR_BOOL(_field) \
+ static ssize_t _field##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
+ { \
+ struct ts5500_sbc *sbc = dev_get_drvdata(dev); \
+ \
+ return sprintf(buf, "%d\n", sbc->_field); \
+ } \
+ static DEVICE_ATTR_RO(_field)
+
+TS5500_ATTR_BOOL(sram);
+TS5500_ATTR_BOOL(rs485);
+TS5500_ATTR_BOOL(adc);
+TS5500_ATTR_BOOL(ereset);
+TS5500_ATTR_BOOL(itr);
static struct attribute *ts5500_attributes[] = {
&dev_attr_id.attr,
+ &dev_attr_name.attr,
&dev_attr_jumpers.attr,
&dev_attr_sram.attr,
&dev_attr_rs485.attr,
@@ -311,12 +321,14 @@ static int __init ts5500_init(void)
if (err)
goto error;
- ts5500_dio1_pdev.dev.parent = &pdev->dev;
- if (platform_device_register(&ts5500_dio1_pdev))
- dev_warn(&pdev->dev, "DIO1 block registration failed\n");
- ts5500_dio2_pdev.dev.parent = &pdev->dev;
- if (platform_device_register(&ts5500_dio2_pdev))
- dev_warn(&pdev->dev, "DIO2 block registration failed\n");
+ if (sbc->id == TS5500_PRODUCT_CODE) {
+ ts5500_dio1_pdev.dev.parent = &pdev->dev;
+ if (platform_device_register(&ts5500_dio1_pdev))
+ dev_warn(&pdev->dev, "DIO1 block registration failed\n");
+ ts5500_dio2_pdev.dev.parent = &pdev->dev;
+ if (platform_device_register(&ts5500_dio2_pdev))
+ dev_warn(&pdev->dev, "DIO2 block registration failed\n");
+ }
if (led_classdev_register(&pdev->dev, &ts5500_led_cdev))
dev_warn(&pdev->dev, "LED registration failed\n");
diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c
index 766612137a62..1584cbed0dce 100644
--- a/arch/x86/platform/uv/bios_uv.c
+++ b/arch/x86/platform/uv/bios_uv.c
@@ -39,7 +39,7 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
*/
return BIOS_STATUS_UNIMPLEMENTED;
- ret = efi_call6((void *)__va(tab->function), (u64)which,
+ ret = efi_call((void *)__va(tab->function), (u64)which,
a1, a2, a3, a4, a5);
return ret;
}
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index dfe605ac1bcd..3968d67d366b 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1,7 +1,7 @@
/*
* SGI UltraViolet TLB flush routines.
*
- * (c) 2008-2012 Cliff Wickman <cpw@sgi.com>, SGI.
+ * (c) 2008-2014 Cliff Wickman <cpw@sgi.com>, SGI.
*
* This code is released under the GNU General Public License version 2 or
* later.
@@ -451,7 +451,7 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
/*
* The reverse of the above; converts a duration in ns to a duration in cycles.
- */
+ */
static inline unsigned long long ns_2_cycles(unsigned long long ns)
{
struct cyc2ns_data *data = cyc2ns_read_begin();
@@ -563,7 +563,7 @@ static int uv1_wait_completion(struct bau_desc *bau_desc,
* UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register.
* But not currently used.
*/
-static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc)
+static unsigned long uv2_3_read_status(unsigned long offset, int rshft, int desc)
{
unsigned long descriptor_status;
@@ -606,7 +606,7 @@ int handle_uv2_busy(struct bau_control *bcp)
return FLUSH_GIVEUP;
}
-static int uv2_wait_completion(struct bau_desc *bau_desc,
+static int uv2_3_wait_completion(struct bau_desc *bau_desc,
unsigned long mmr_offset, int right_shift,
struct bau_control *bcp, long try)
{
@@ -616,7 +616,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
long busy_reps = 0;
struct ptc_stats *stat = bcp->statp;
- descriptor_stat = uv2_read_status(mmr_offset, right_shift, desc);
+ descriptor_stat = uv2_3_read_status(mmr_offset, right_shift, desc);
/* spin on the status MMR, waiting for it to go idle */
while (descriptor_stat != UV2H_DESC_IDLE) {
@@ -658,8 +658,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
/* not to hammer on the clock */
busy_reps = 0;
ttm = get_cycles();
- if ((ttm - bcp->send_message) >
- bcp->timeout_interval)
+ if ((ttm - bcp->send_message) > bcp->timeout_interval)
return handle_uv2_busy(bcp);
}
/*
@@ -667,8 +666,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
*/
cpu_relax();
}
- descriptor_stat = uv2_read_status(mmr_offset, right_shift,
- desc);
+ descriptor_stat = uv2_3_read_status(mmr_offset, right_shift, desc);
}
bcp->conseccompletes++;
return FLUSH_COMPLETE;
@@ -679,8 +677,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
* which register to read and position in that register based on cpu in
* current hub.
*/
-static int wait_completion(struct bau_desc *bau_desc,
- struct bau_control *bcp, long try)
+static int wait_completion(struct bau_desc *bau_desc, struct bau_control *bcp, long try)
{
int right_shift;
unsigned long mmr_offset;
@@ -695,11 +692,9 @@ static int wait_completion(struct bau_desc *bau_desc,
}
if (bcp->uvhub_version == 1)
- return uv1_wait_completion(bau_desc, mmr_offset, right_shift,
- bcp, try);
+ return uv1_wait_completion(bau_desc, mmr_offset, right_shift, bcp, try);
else
- return uv2_wait_completion(bau_desc, mmr_offset, right_shift,
- bcp, try);
+ return uv2_3_wait_completion(bau_desc, mmr_offset, right_shift, bcp, try);
}
/*
@@ -888,7 +883,7 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp,
struct ptc_stats *stat = bcp->statp;
struct bau_control *hmaster = bcp->uvhub_master;
struct uv1_bau_msg_header *uv1_hdr = NULL;
- struct uv2_bau_msg_header *uv2_hdr = NULL;
+ struct uv2_3_bau_msg_header *uv2_3_hdr = NULL;
if (bcp->uvhub_version == 1) {
uv1 = 1;
@@ -902,27 +897,28 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp,
if (uv1)
uv1_hdr = &bau_desc->header.uv1_hdr;
else
- uv2_hdr = &bau_desc->header.uv2_hdr;
+ /* uv2 and uv3 */
+ uv2_3_hdr = &bau_desc->header.uv2_3_hdr;
do {
if (try == 0) {
if (uv1)
uv1_hdr->msg_type = MSG_REGULAR;
else
- uv2_hdr->msg_type = MSG_REGULAR;
+ uv2_3_hdr->msg_type = MSG_REGULAR;
seq_number = bcp->message_number++;
} else {
if (uv1)
uv1_hdr->msg_type = MSG_RETRY;
else
- uv2_hdr->msg_type = MSG_RETRY;
+ uv2_3_hdr->msg_type = MSG_RETRY;
stat->s_retry_messages++;
}
if (uv1)
uv1_hdr->sequence = seq_number;
else
- uv2_hdr->sequence = seq_number;
+ uv2_3_hdr->sequence = seq_number;
index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
bcp->send_message = get_cycles();
@@ -1080,8 +1076,10 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
* done. The returned pointer is valid till preemption is re-enabled.
*/
const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm, unsigned long start,
- unsigned long end, unsigned int cpu)
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end,
+ unsigned int cpu)
{
int locals = 0;
int remotes = 0;
@@ -1268,6 +1266,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
if (bcp->uvhub_version == 2)
process_uv2_message(&msgdesc, bcp);
else
+ /* no error workaround for uv1 or uv3 */
bau_process_message(&msgdesc, bcp, 1);
msg++;
@@ -1325,8 +1324,12 @@ static void __init enable_timeouts(void)
*/
mmr_image |= (1L << SOFTACK_MSHIFT);
if (is_uv2_hub()) {
+ /* do not touch the legacy mode bit */
/* hw bug workaround; do not use extended status */
mmr_image &= ~(1L << UV2_EXT_SHFT);
+ } else if (is_uv3_hub()) {
+ mmr_image &= ~(1L << PREFETCH_HINT_SHFT);
+ mmr_image |= (1L << SB_STATUS_SHFT);
}
write_mmr_misc_control(pnode, mmr_image);
}
@@ -1476,7 +1479,7 @@ static ssize_t ptc_proc_write(struct file *file, const char __user *user,
return count;
}
- if (strict_strtol(optstr, 10, &input_arg) < 0) {
+ if (kstrtol(optstr, 10, &input_arg) < 0) {
printk(KERN_DEBUG "%s is invalid\n", optstr);
return -EINVAL;
}
@@ -1692,7 +1695,7 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
struct bau_desc *bau_desc;
struct bau_desc *bd2;
struct uv1_bau_msg_header *uv1_hdr;
- struct uv2_bau_msg_header *uv2_hdr;
+ struct uv2_3_bau_msg_header *uv2_3_hdr;
struct bau_control *bcp;
/*
@@ -1739,15 +1742,15 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
*/
} else {
/*
- * BIOS uses legacy mode, but UV2 hardware always
+ * BIOS uses legacy mode, but uv2 and uv3 hardware always
* uses native mode for selective broadcasts.
*/
- uv2_hdr = &bd2->header.uv2_hdr;
- uv2_hdr->swack_flag = 1;
- uv2_hdr->base_dest_nasid =
+ uv2_3_hdr = &bd2->header.uv2_3_hdr;
+ uv2_3_hdr->swack_flag = 1;
+ uv2_3_hdr->base_dest_nasid =
UV_PNODE_TO_NASID(base_pnode);
- uv2_hdr->dest_subnodeid = UV_LB_SUBNODEID;
- uv2_hdr->command = UV_NET_ENDPOINT_INTD;
+ uv2_3_hdr->dest_subnodeid = UV_LB_SUBNODEID;
+ uv2_3_hdr->command = UV_NET_ENDPOINT_INTD;
}
}
for_each_present_cpu(cpu) {
@@ -1858,6 +1861,7 @@ static int calculate_destination_timeout(void)
ts_ns *= (mult1 * mult2);
ret = ts_ns / 1000;
} else {
+ /* same destination timeout for uv2 and uv3 */
/* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */
mmr_image = uv_read_local_mmr(UVH_LB_BAU_MISC_CONTROL);
mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT;
@@ -2012,8 +2016,10 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
bcp->uvhub_version = 1;
else if (is_uv2_hub())
bcp->uvhub_version = 2;
+ else if (is_uv3_hub())
+ bcp->uvhub_version = 3;
else {
- printk(KERN_EMERG "uvhub version not 1 or 2\n");
+ printk(KERN_EMERG "uvhub version not 1, 2 or 3\n");
return 1;
}
bcp->uvhub_master = *hmasterp;
@@ -2138,9 +2144,10 @@ static int __init uv_bau_init(void)
}
vector = UV_BAU_MESSAGE;
- for_each_possible_blade(uvhub)
+ for_each_possible_blade(uvhub) {
if (uv_blade_nr_possible_cpus(uvhub))
init_uvhub(uvhub, vector, uv_base_pnode);
+ }
alloc_intr_gate(vector, uv_bau_message_intr1);
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index acf7752da952..b233681af4de 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -238,11 +238,9 @@ uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
unsigned long mmr_offset, int limit)
{
- int irq, ret;
+ int ret, irq = irq_alloc_hwirq(uv_blade_to_memory_nid(mmr_blade));
- irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade));
-
- if (irq <= 0)
+ if (!irq)
return -EBUSY;
ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
@@ -250,7 +248,7 @@ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
if (ret == irq)
uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
else
- destroy_irq(irq);
+ irq_free_hwirq(irq);
return ret;
}
@@ -285,6 +283,6 @@ void uv_teardown_irq(unsigned int irq)
n = n->rb_right;
}
spin_unlock_irqrestore(&uv_irq_lock, irqflags);
- destroy_irq(irq);
+ irq_free_hwirq(irq);
}
EXPORT_SYMBOL_GPL(uv_teardown_irq);
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index be27da60dc8f..c89c93320c12 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -85,7 +85,7 @@ static cpumask_var_t uv_nmi_cpu_mask;
* Default is all stack dumps go to the console and buffer.
* Lower level to send to log buffer only.
*/
-static int uv_nmi_loglevel = 7;
+static int uv_nmi_loglevel = CONSOLE_LOGLEVEL_DEFAULT;
module_param_named(dump_loglevel, uv_nmi_loglevel, int, 0644);
/*
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 424f4c97a44d..6ec7910f59bf 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -165,7 +165,7 @@ static void fix_processor_context(void)
* by __save_processor_state()
* @ctxt - structure to load the registers contents from
*/
-static void __restore_processor_state(struct saved_context *ctxt)
+static void notrace __restore_processor_state(struct saved_context *ctxt)
{
if (ctxt->misc_enable_saved)
wrmsrl(MSR_IA32_MISC_ENABLE, ctxt->misc_enable);
@@ -239,7 +239,7 @@ static void __restore_processor_state(struct saved_context *ctxt)
}
/* Needed by apm.c */
-void restore_processor_state(void)
+void notrace restore_processor_state(void)
{
__restore_processor_state(&saved_context);
}
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
new file mode 100644
index 000000000000..7fde9ee438a4
--- /dev/null
+++ b/arch/x86/purgatory/Makefile
@@ -0,0 +1,30 @@
+purgatory-y := purgatory.o stack.o setup-x86_$(BITS).o sha256.o entry64.o string.o
+
+targets += $(purgatory-y)
+PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y))
+
+LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib
+targets += purgatory.ro
+
+# Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That
+# in turn leaves some undefined symbols like __fentry__ in purgatory and not
+# sure how to relocate those. Like kexec-tools, use custom flags.
+
+KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fno-builtin -ffreestanding -c -MD -Os -mcmodel=large
+
+$(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE
+ $(call if_changed,ld)
+
+targets += kexec-purgatory.c
+
+quiet_cmd_bin2c = BIN2C $@
+ cmd_bin2c = cat $(obj)/purgatory.ro | $(objtree)/scripts/basic/bin2c kexec_purgatory > $(obj)/kexec-purgatory.c
+
+$(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE
+ $(call if_changed,bin2c)
+
+
+# No loaders for 32bits yet.
+ifeq ($(CONFIG_X86_64),y)
+ obj-$(CONFIG_KEXEC) += kexec-purgatory.o
+endif
diff --git a/arch/x86/purgatory/entry64.S b/arch/x86/purgatory/entry64.S
new file mode 100644
index 000000000000..d1a4291d3568
--- /dev/null
+++ b/arch/x86/purgatory/entry64.S
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2003,2004 Eric Biederman (ebiederm@xmission.com)
+ * Copyright (C) 2014 Red Hat Inc.
+
+ * Author(s): Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This code has been taken from kexec-tools.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+ .text
+ .balign 16
+ .code64
+ .globl entry64, entry64_regs
+
+
+entry64:
+ /* Setup a gdt that should be preserved */
+ lgdt gdt(%rip)
+
+ /* load the data segments */
+ movl $0x18, %eax /* data segment */
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %ss
+ movl %eax, %fs
+ movl %eax, %gs
+
+ /* Setup new stack */
+ leaq stack_init(%rip), %rsp
+ pushq $0x10 /* CS */
+ leaq new_cs_exit(%rip), %rax
+ pushq %rax
+ lretq
+new_cs_exit:
+
+ /* Load the registers */
+ movq rax(%rip), %rax
+ movq rbx(%rip), %rbx
+ movq rcx(%rip), %rcx
+ movq rdx(%rip), %rdx
+ movq rsi(%rip), %rsi
+ movq rdi(%rip), %rdi
+ movq rsp(%rip), %rsp
+ movq rbp(%rip), %rbp
+ movq r8(%rip), %r8
+ movq r9(%rip), %r9
+ movq r10(%rip), %r10
+ movq r11(%rip), %r11
+ movq r12(%rip), %r12
+ movq r13(%rip), %r13
+ movq r14(%rip), %r14
+ movq r15(%rip), %r15
+
+ /* Jump to the new code... */
+ jmpq *rip(%rip)
+
+ .section ".rodata"
+ .balign 4
+entry64_regs:
+rax: .quad 0x0
+rcx: .quad 0x0
+rdx: .quad 0x0
+rbx: .quad 0x0
+rsp: .quad 0x0
+rbp: .quad 0x0
+rsi: .quad 0x0
+rdi: .quad 0x0
+r8: .quad 0x0
+r9: .quad 0x0
+r10: .quad 0x0
+r11: .quad 0x0
+r12: .quad 0x0
+r13: .quad 0x0
+r14: .quad 0x0
+r15: .quad 0x0
+rip: .quad 0x0
+ .size entry64_regs, . - entry64_regs
+
+ /* GDT */
+ .section ".rodata"
+ .balign 16
+gdt:
+ /* 0x00 unusable segment
+ * 0x08 unused
+ * so use them as gdt ptr
+ */
+ .word gdt_end - gdt - 1
+ .quad gdt
+ .word 0, 0, 0
+
+ /* 0x10 4GB flat code segment */
+ .word 0xFFFF, 0x0000, 0x9A00, 0x00AF
+
+ /* 0x18 4GB flat data segment */
+ .word 0xFFFF, 0x0000, 0x9200, 0x00CF
+gdt_end:
+stack: .quad 0, 0
+stack_init:
diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c
new file mode 100644
index 000000000000..25e068ba3382
--- /dev/null
+++ b/arch/x86/purgatory/purgatory.c
@@ -0,0 +1,72 @@
+/*
+ * purgatory: Runs between two kernels
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ *
+ * Author:
+ * Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include "sha256.h"
+#include "../boot/string.h"
+
+struct sha_region {
+ unsigned long start;
+ unsigned long len;
+};
+
+unsigned long backup_dest = 0;
+unsigned long backup_src = 0;
+unsigned long backup_sz = 0;
+
+u8 sha256_digest[SHA256_DIGEST_SIZE] = { 0 };
+
+struct sha_region sha_regions[16] = {};
+
+/*
+ * On x86, second kernel requries first 640K of memory to boot. Copy
+ * first 640K to a backup region in reserved memory range so that second
+ * kernel can use first 640K.
+ */
+static int copy_backup_region(void)
+{
+ if (backup_dest)
+ memcpy((void *)backup_dest, (void *)backup_src, backup_sz);
+
+ return 0;
+}
+
+int verify_sha256_digest(void)
+{
+ struct sha_region *ptr, *end;
+ u8 digest[SHA256_DIGEST_SIZE];
+ struct sha256_state sctx;
+
+ sha256_init(&sctx);
+ end = &sha_regions[sizeof(sha_regions)/sizeof(sha_regions[0])];
+ for (ptr = sha_regions; ptr < end; ptr++)
+ sha256_update(&sctx, (uint8_t *)(ptr->start), ptr->len);
+
+ sha256_final(&sctx, digest);
+
+ if (memcmp(digest, sha256_digest, sizeof(digest)))
+ return 1;
+
+ return 0;
+}
+
+void purgatory(void)
+{
+ int ret;
+
+ ret = verify_sha256_digest();
+ if (ret) {
+ /* loop forever */
+ for (;;)
+ ;
+ }
+ copy_backup_region();
+}
diff --git a/arch/x86/purgatory/setup-x86_64.S b/arch/x86/purgatory/setup-x86_64.S
new file mode 100644
index 000000000000..fe3c91ba1bd0
--- /dev/null
+++ b/arch/x86/purgatory/setup-x86_64.S
@@ -0,0 +1,58 @@
+/*
+ * purgatory: setup code
+ *
+ * Copyright (C) 2003,2004 Eric Biederman (ebiederm@xmission.com)
+ * Copyright (C) 2014 Red Hat Inc.
+ *
+ * This code has been taken from kexec-tools.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+ .text
+ .globl purgatory_start
+ .balign 16
+purgatory_start:
+ .code64
+
+ /* Load a gdt so I know what the segment registers are */
+ lgdt gdt(%rip)
+
+ /* load the data segments */
+ movl $0x18, %eax /* data segment */
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %ss
+ movl %eax, %fs
+ movl %eax, %gs
+
+ /* Setup a stack */
+ leaq lstack_end(%rip), %rsp
+
+ /* Call the C code */
+ call purgatory
+ jmp entry64
+
+ .section ".rodata"
+ .balign 16
+gdt: /* 0x00 unusable segment
+ * 0x08 unused
+ * so use them as the gdt ptr
+ */
+ .word gdt_end - gdt - 1
+ .quad gdt
+ .word 0, 0, 0
+
+ /* 0x10 4GB flat code segment */
+ .word 0xFFFF, 0x0000, 0x9A00, 0x00AF
+
+ /* 0x18 4GB flat data segment */
+ .word 0xFFFF, 0x0000, 0x9200, 0x00CF
+gdt_end:
+
+ .bss
+ .balign 4096
+lstack:
+ .skip 4096
+lstack_end:
diff --git a/arch/x86/purgatory/sha256.c b/arch/x86/purgatory/sha256.c
new file mode 100644
index 000000000000..548ca675a14a
--- /dev/null
+++ b/arch/x86/purgatory/sha256.c
@@ -0,0 +1,283 @@
+/*
+ * SHA-256, as specified in
+ * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf
+ *
+ * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>.
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * Copyright (c) 2014 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/bitops.h>
+#include <asm/byteorder.h>
+#include "sha256.h"
+#include "../boot/string.h"
+
+static inline u32 Ch(u32 x, u32 y, u32 z)
+{
+ return z ^ (x & (y ^ z));
+}
+
+static inline u32 Maj(u32 x, u32 y, u32 z)
+{
+ return (x & y) | (z & (x | y));
+}
+
+#define e0(x) (ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22))
+#define e1(x) (ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25))
+#define s0(x) (ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3))
+#define s1(x) (ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10))
+
+static inline void LOAD_OP(int I, u32 *W, const u8 *input)
+{
+ W[I] = __be32_to_cpu(((__be32 *)(input))[I]);
+}
+
+static inline void BLEND_OP(int I, u32 *W)
+{
+ W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16];
+}
+
+static void sha256_transform(u32 *state, const u8 *input)
+{
+ u32 a, b, c, d, e, f, g, h, t1, t2;
+ u32 W[64];
+ int i;
+
+ /* load the input */
+ for (i = 0; i < 16; i++)
+ LOAD_OP(i, W, input);
+
+ /* now blend */
+ for (i = 16; i < 64; i++)
+ BLEND_OP(i, W);
+
+ /* load the state into our registers */
+ a = state[0]; b = state[1]; c = state[2]; d = state[3];
+ e = state[4]; f = state[5]; g = state[6]; h = state[7];
+
+ /* now iterate */
+ t1 = h + e1(e) + Ch(e, f, g) + 0x428a2f98 + W[0];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0x71374491 + W[1];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0xb5c0fbcf + W[2];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0xe9b5dba5 + W[3];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x3956c25b + W[4];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0x59f111f1 + W[5];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x923f82a4 + W[6];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0xab1c5ed5 + W[7];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0xd807aa98 + W[8];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0x12835b01 + W[9];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0x243185be + W[10];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0x550c7dc3 + W[11];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x72be5d74 + W[12];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0x80deb1fe + W[13];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x9bdc06a7 + W[14];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0xc19bf174 + W[15];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0xe49b69c1 + W[16];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0xefbe4786 + W[17];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0x0fc19dc6 + W[18];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0x240ca1cc + W[19];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x2de92c6f + W[20];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0x4a7484aa + W[21];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x5cb0a9dc + W[22];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0x76f988da + W[23];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0x983e5152 + W[24];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0xa831c66d + W[25];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0xb00327c8 + W[26];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0xbf597fc7 + W[27];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0xc6e00bf3 + W[28];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0xd5a79147 + W[29];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x06ca6351 + W[30];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0x14292967 + W[31];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0x27b70a85 + W[32];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0x2e1b2138 + W[33];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0x4d2c6dfc + W[34];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0x53380d13 + W[35];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x650a7354 + W[36];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0x766a0abb + W[37];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x81c2c92e + W[38];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0x92722c85 + W[39];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0xa2bfe8a1 + W[40];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0xa81a664b + W[41];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0xc24b8b70 + W[42];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0xc76c51a3 + W[43];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0xd192e819 + W[44];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0xd6990624 + W[45];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0xf40e3585 + W[46];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0x106aa070 + W[47];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0x19a4c116 + W[48];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0x1e376c08 + W[49];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0x2748774c + W[50];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0x34b0bcb5 + W[51];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x391c0cb3 + W[52];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0x4ed8aa4a + W[53];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x5b9cca4f + W[54];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0x682e6ff3 + W[55];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0x748f82ee + W[56];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0x78a5636f + W[57];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0x84c87814 + W[58];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0x8cc70208 + W[59];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x90befffa + W[60];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0xa4506ceb + W[61];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0xbef9a3f7 + W[62];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0xc67178f2 + W[63];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ state[0] += a; state[1] += b; state[2] += c; state[3] += d;
+ state[4] += e; state[5] += f; state[6] += g; state[7] += h;
+
+ /* clear any sensitive info... */
+ a = b = c = d = e = f = g = h = t1 = t2 = 0;
+ memset(W, 0, 64 * sizeof(u32));
+}
+
+int sha256_init(struct sha256_state *sctx)
+{
+ sctx->state[0] = SHA256_H0;
+ sctx->state[1] = SHA256_H1;
+ sctx->state[2] = SHA256_H2;
+ sctx->state[3] = SHA256_H3;
+ sctx->state[4] = SHA256_H4;
+ sctx->state[5] = SHA256_H5;
+ sctx->state[6] = SHA256_H6;
+ sctx->state[7] = SHA256_H7;
+ sctx->count = 0;
+
+ return 0;
+}
+
+int sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len)
+{
+ unsigned int partial, done;
+ const u8 *src;
+
+ partial = sctx->count & 0x3f;
+ sctx->count += len;
+ done = 0;
+ src = data;
+
+ if ((partial + len) > 63) {
+ if (partial) {
+ done = -partial;
+ memcpy(sctx->buf + partial, data, done + 64);
+ src = sctx->buf;
+ }
+
+ do {
+ sha256_transform(sctx->state, src);
+ done += 64;
+ src = data + done;
+ } while (done + 63 < len);
+
+ partial = 0;
+ }
+ memcpy(sctx->buf + partial, src, len - done);
+
+ return 0;
+}
+
+int sha256_final(struct sha256_state *sctx, u8 *out)
+{
+ __be32 *dst = (__be32 *)out;
+ __be64 bits;
+ unsigned int index, pad_len;
+ int i;
+ static const u8 padding[64] = { 0x80, };
+
+ /* Save number of bits */
+ bits = cpu_to_be64(sctx->count << 3);
+
+ /* Pad out to 56 mod 64. */
+ index = sctx->count & 0x3f;
+ pad_len = (index < 56) ? (56 - index) : ((64+56) - index);
+ sha256_update(sctx, padding, pad_len);
+
+ /* Append length (before padding) */
+ sha256_update(sctx, (const u8 *)&bits, sizeof(bits));
+
+ /* Store state in digest */
+ for (i = 0; i < 8; i++)
+ dst[i] = cpu_to_be32(sctx->state[i]);
+
+ /* Zeroize sensitive information. */
+ memset(sctx, 0, sizeof(*sctx));
+
+ return 0;
+}
diff --git a/arch/x86/purgatory/sha256.h b/arch/x86/purgatory/sha256.h
new file mode 100644
index 000000000000..bd15a4127735
--- /dev/null
+++ b/arch/x86/purgatory/sha256.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 2014 Red Hat Inc.
+ *
+ * Author: Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#ifndef SHA256_H
+#define SHA256_H
+
+
+#include <linux/types.h>
+#include <crypto/sha.h>
+
+extern int sha256_init(struct sha256_state *sctx);
+extern int sha256_update(struct sha256_state *sctx, const u8 *input,
+ unsigned int length);
+extern int sha256_final(struct sha256_state *sctx, u8 *hash);
+
+#endif /* SHA256_H */
diff --git a/arch/x86/purgatory/stack.S b/arch/x86/purgatory/stack.S
new file mode 100644
index 000000000000..3cefba1fefc8
--- /dev/null
+++ b/arch/x86/purgatory/stack.S
@@ -0,0 +1,19 @@
+/*
+ * purgatory: stack
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+ /* A stack for the loaded kernel.
+ * Seperate and in the data section so it can be prepopulated.
+ */
+ .data
+ .balign 4096
+ .globl stack, stack_end
+
+stack:
+ .skip 4096
+stack_end:
diff --git a/arch/x86/purgatory/string.c b/arch/x86/purgatory/string.c
new file mode 100644
index 000000000000..d886b1fa36f0
--- /dev/null
+++ b/arch/x86/purgatory/string.c
@@ -0,0 +1,13 @@
+/*
+ * Simple string functions.
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ *
+ * Author:
+ * Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include "../boot/string.c"
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 3497f14e4dea..7c0d7be176a5 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -52,8 +52,9 @@ $(obj)/realmode.elf: $(obj)/realmode.lds $(REALMODE_OBJS) FORCE
OBJCOPYFLAGS_realmode.bin := -O binary
targets += realmode.bin
-$(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs
+$(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs FORCE
$(call if_changed,objcopy)
+ @:
quiet_cmd_relocs = RELOCS $@
cmd_relocs = arch/x86/tools/relocs --realmode $< > $@
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index d6b867921612..028b78168d85 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -360,3 +360,6 @@
351 i386 sched_setattr sys_sched_setattr
352 i386 sched_getattr sys_sched_getattr
353 i386 renameat2 sys_renameat2
+354 i386 seccomp sys_seccomp
+355 i386 getrandom sys_getrandom
+356 i386 memfd_create sys_memfd_create
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 04376ac3d9ef..35dd922727b9 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -212,10 +212,10 @@
203 common sched_setaffinity sys_sched_setaffinity
204 common sched_getaffinity sys_sched_getaffinity
205 64 set_thread_area
-206 common io_setup sys_io_setup
+206 64 io_setup sys_io_setup
207 common io_destroy sys_io_destroy
208 common io_getevents sys_io_getevents
-209 common io_submit sys_io_submit
+209 64 io_submit sys_io_submit
210 common io_cancel sys_io_cancel
211 64 get_thread_area
212 common lookup_dcookie sys_lookup_dcookie
@@ -323,6 +323,10 @@
314 common sched_setattr sys_sched_setattr
315 common sched_getattr sys_sched_getattr
316 common renameat2 sys_renameat2
+317 common seccomp sys_seccomp
+318 common getrandom sys_getrandom
+319 common memfd_create sys_memfd_create
+320 common kexec_file_load sys_kexec_file_load
#
# x32-specific system call numbers start at 512 to avoid cache impact
@@ -359,3 +363,5 @@
540 x32 process_vm_writev compat_sys_process_vm_writev
541 x32 setsockopt compat_sys_setsockopt
542 x32 getsockopt compat_sys_getsockopt
+543 x32 io_setup compat_sys_io_setup
+544 x32 io_submit compat_sys_io_submit
diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h
index 0feee2fd5077..25a1022dd793 100644
--- a/arch/x86/um/asm/elf.h
+++ b/arch/x86/um/asm/elf.h
@@ -216,6 +216,5 @@ extern long elf_aux_hwcap;
#define ELF_HWCAP (elf_aux_hwcap)
#define SET_PERSONALITY(ex) do ; while(0)
-#define __HAVE_ARCH_GATE_AREA 1
#endif
diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h
index 04f82e020f2b..2a206d2b14ab 100644
--- a/arch/x86/um/asm/processor.h
+++ b/arch/x86/um/asm/processor.h
@@ -25,7 +25,8 @@ static inline void rep_nop(void)
__asm__ __volatile__("rep;nop": : :"memory");
}
-#define cpu_relax() rep_nop()
+#define cpu_relax() rep_nop()
+#define cpu_relax_lowlatency() cpu_relax()
#include <asm/processor-generic.h>
diff --git a/arch/x86/um/mem_64.c b/arch/x86/um/mem_64.c
index c6492e75797b..f8fecaddcc0d 100644
--- a/arch/x86/um/mem_64.c
+++ b/arch/x86/um/mem_64.c
@@ -9,18 +9,3 @@ const char *arch_vma_name(struct vm_area_struct *vma)
return NULL;
}
-
-struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
-{
- return NULL;
-}
-
-int in_gate_area(struct mm_struct *mm, unsigned long addr)
-{
- return 0;
-}
-
-int in_gate_area_no_mm(unsigned long addr)
-{
- return 0;
-}
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 5e04a1c899fa..79d824551c1a 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -370,13 +370,12 @@ struct rt_sigframe
char retcode[8];
};
-int setup_signal_stack_sc(unsigned long stack_top, int sig,
- struct k_sigaction *ka, struct pt_regs *regs,
- sigset_t *mask)
+int setup_signal_stack_sc(unsigned long stack_top, struct ksignal *ksig,
+ struct pt_regs *regs, sigset_t *mask)
{
struct sigframe __user *frame;
void __user *restorer;
- int err = 0;
+ int err = 0, sig = ksig->sig;
/* This is the same calculation as i386 - ((sp + 4) & 15) == 0 */
stack_top = ((stack_top + 4) & -16UL) - 4;
@@ -385,8 +384,8 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig,
return 1;
restorer = frame->retcode;
- if (ka->sa.sa_flags & SA_RESTORER)
- restorer = ka->sa.sa_restorer;
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ restorer = ksig->ka.sa.sa_restorer;
err |= __put_user(restorer, &frame->pretcode);
err |= __put_user(sig, &frame->sig);
@@ -410,20 +409,19 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig,
return err;
PT_REGS_SP(regs) = (unsigned long) frame;
- PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler;
+ PT_REGS_IP(regs) = (unsigned long) ksig->ka.sa.sa_handler;
PT_REGS_AX(regs) = (unsigned long) sig;
PT_REGS_DX(regs) = (unsigned long) 0;
PT_REGS_CX(regs) = (unsigned long) 0;
return 0;
}
-int setup_signal_stack_si(unsigned long stack_top, int sig,
- struct k_sigaction *ka, struct pt_regs *regs,
- siginfo_t *info, sigset_t *mask)
+int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
+ struct pt_regs *regs, sigset_t *mask)
{
struct rt_sigframe __user *frame;
void __user *restorer;
- int err = 0;
+ int err = 0, sig = ksig->sig;
stack_top &= -8UL;
frame = (struct rt_sigframe __user *) stack_top - 1;
@@ -431,14 +429,14 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
return 1;
restorer = frame->retcode;
- if (ka->sa.sa_flags & SA_RESTORER)
- restorer = ka->sa.sa_restorer;
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ restorer = ksig->ka.sa.sa_restorer;
err |= __put_user(restorer, &frame->pretcode);
err |= __put_user(sig, &frame->sig);
err |= __put_user(&frame->info, &frame->pinfo);
err |= __put_user(&frame->uc, &frame->puc);
- err |= copy_siginfo_to_user(&frame->info, info);
+ err |= copy_siginfo_to_user(&frame->info, &ksig->info);
err |= copy_ucontext_to_user(&frame->uc, &frame->fpstate, mask,
PT_REGS_SP(regs));
@@ -457,7 +455,7 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
return err;
PT_REGS_SP(regs) = (unsigned long) frame;
- PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler;
+ PT_REGS_IP(regs) = (unsigned long) ksig->ka.sa.sa_handler;
PT_REGS_AX(regs) = (unsigned long) sig;
PT_REGS_DX(regs) = (unsigned long) &frame->info;
PT_REGS_CX(regs) = (unsigned long) &frame->uc;
@@ -502,12 +500,11 @@ struct rt_sigframe
struct _fpstate fpstate;
};
-int setup_signal_stack_si(unsigned long stack_top, int sig,
- struct k_sigaction *ka, struct pt_regs * regs,
- siginfo_t *info, sigset_t *set)
+int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
+ struct pt_regs *regs, sigset_t *set)
{
struct rt_sigframe __user *frame;
- int err = 0;
+ int err = 0, sig = ksig->sig;
frame = (struct rt_sigframe __user *)
round_down(stack_top - sizeof(struct rt_sigframe), 16);
@@ -517,8 +514,8 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
goto out;
- if (ka->sa.sa_flags & SA_SIGINFO) {
- err |= copy_siginfo_to_user(&frame->info, info);
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
+ err |= copy_siginfo_to_user(&frame->info, &ksig->info);
if (err)
goto out;
}
@@ -543,8 +540,8 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
* already in userspace.
*/
/* x86-64 should always use SA_RESTORER. */
- if (ka->sa.sa_flags & SA_RESTORER)
- err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ err |= __put_user(ksig->ka.sa.sa_restorer, &frame->pretcode);
else
/* could use a vstub here */
return err;
@@ -570,7 +567,7 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
*/
PT_REGS_SI(regs) = (unsigned long) &frame->info;
PT_REGS_DX(regs) = (unsigned long) &frame->uc;
- PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler;
+ PT_REGS_IP(regs) = (unsigned long) ksig->ka.sa.sa_handler;
out:
return err;
}
diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c
index af91901babb8..916cda4cd5b4 100644
--- a/arch/x86/um/vdso/vma.c
+++ b/arch/x86/um/vdso/vma.c
@@ -12,7 +12,7 @@
#include <asm/page.h>
#include <linux/init.h>
-unsigned int __read_mostly vdso_enabled = 1;
+static unsigned int __read_mostly vdso_enabled = 1;
unsigned long um_vdso_addr;
extern unsigned long task_size;
diff --git a/arch/x86/vdso/.gitignore b/arch/x86/vdso/.gitignore
index 3282874bc61d..aae8ffdd5880 100644
--- a/arch/x86/vdso/.gitignore
+++ b/arch/x86/vdso/.gitignore
@@ -1,8 +1,7 @@
vdso.lds
-vdso-syms.lds
vdsox32.lds
-vdsox32-syms.lds
-vdso32-syms.lds
vdso32-syscall-syms.lds
vdso32-sysenter-syms.lds
vdso32-int80-syms.lds
+vdso-image-*.c
+vdso2c
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index c580d1210ffe..5a4affe025e8 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -9,30 +9,36 @@ VDSOX32-$(CONFIG_X86_X32_ABI) := y
VDSO32-$(CONFIG_X86_32) := y
VDSO32-$(CONFIG_COMPAT) := y
-vdso-install-$(VDSO64-y) += vdso.so
-vdso-install-$(VDSOX32-y) += vdsox32.so
-vdso-install-$(VDSO32-y) += $(vdso32-images)
-
-
# files to link into the vdso
vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
-vobjs-$(VDSOX32-y) += $(vobjx32s-compat)
-
-# Filter out x32 objects.
-vobj64s := $(filter-out $(vobjx32s-compat),$(vobjs-y))
-
# files to link into kernel
obj-y += vma.o
-obj-$(VDSO64-y) += vdso.o
-obj-$(VDSOX32-y) += vdsox32.o
-obj-$(VDSO32-y) += vdso32.o vdso32-setup.o
-vobjs := $(foreach F,$(vobj64s),$(obj)/$F)
+# vDSO images to build
+vdso_img-$(VDSO64-y) += 64
+vdso_img-$(VDSOX32-y) += x32
+vdso_img-$(VDSO32-y) += 32-int80
+vdso_img-$(CONFIG_COMPAT) += 32-syscall
+vdso_img-$(VDSO32-y) += 32-sysenter
+
+obj-$(VDSO32-y) += vdso32-setup.o
+
+vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
$(obj)/vdso.o: $(obj)/vdso.so
-targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y)
+targets += vdso.lds $(vobjs-y)
+
+# Build the vDSO image C files and link them in.
+vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o)
+vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c)
+vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg)
+obj-y += $(vdso_img_objs)
+targets += $(vdso_img_cfiles)
+targets += $(vdso_img_sodbg)
+.SECONDARY: $(vdso_img-y:%=$(obj)/vdso-image-%.c) \
+ $(vdso_img-y:%=$(obj)/vdso%.so)
export CPPFLAGS_vdso.lds += -P -C
@@ -41,14 +47,19 @@ VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
-Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 \
$(DISABLE_LTO)
-$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
-
-$(obj)/vdso.so.dbg: $(src)/vdso.lds $(vobjs) FORCE
+$(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE
$(call if_changed,vdso)
-$(obj)/%.so: OBJCOPYFLAGS := -S
-$(obj)/%.so: $(obj)/%.so.dbg FORCE
- $(call if_changed,objcopy)
+HOST_EXTRACFLAGS += -I$(srctree)/tools/include
+hostprogs-y += vdso2c
+
+quiet_cmd_vdso2c = VDSO2C $@
+define cmd_vdso2c
+ $(obj)/vdso2c $< $(<:%.dbg=%) $@
+endef
+
+$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE
+ $(call if_changed,vdso2c)
#
# Don't omit frame pointers for ease of userspace debugging, but do
@@ -56,7 +67,8 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE
#
CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
$(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \
- -fno-omit-frame-pointer -foptimize-sibling-calls
+ -fno-omit-frame-pointer -foptimize-sibling-calls \
+ -DDISABLE_BRANCH_PROFILING
$(vobjs): KBUILD_CFLAGS += $(CFL)
@@ -68,22 +80,6 @@ CFLAGS_REMOVE_vclock_gettime.o = -pg
CFLAGS_REMOVE_vgetcpu.o = -pg
CFLAGS_REMOVE_vvar.o = -pg
-targets += vdso-syms.lds
-obj-$(VDSO64-y) += vdso-syms.lds
-
-#
-# Match symbols in the DSO that look like VDSO*; produce a file of constants.
-#
-sed-vdsosym := -e 's/^00*/0/' \
- -e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p'
-quiet_cmd_vdsosym = VDSOSYM $@
-define cmd_vdsosym
- $(NM) $< | LC_ALL=C sed -n $(sed-vdsosym) | LC_ALL=C sort > $@
-endef
-
-$(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
- $(call if_changed,vdsosym)
-
#
# X32 processes use x32 vDSO to access 64bit kernel data.
#
@@ -94,16 +90,19 @@ $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
# so that it can reach 64bit address space with 64bit pointers.
#
-targets += vdsox32-syms.lds
-obj-$(VDSOX32-y) += vdsox32-syms.lds
-
CPPFLAGS_vdsox32.lds = $(CPPFLAGS_vdso.lds)
VDSO_LDFLAGS_vdsox32.lds = -Wl,-m,elf32_x86_64 \
-Wl,-soname=linux-vdso.so.1 \
-Wl,-z,max-page-size=4096 \
-Wl,-z,common-page-size=4096
-vobjx32s-y := $(vobj64s:.o=-x32.o)
+# 64-bit objects to re-brand as x32
+vobjs64-for-x32 := $(filter-out $(vobjs-nox32),$(vobjs-y))
+
+# x32-rebranded versions
+vobjx32s-y := $(vobjs64-for-x32:.o=-x32.o)
+
+# same thing, but in the output directory
vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F)
# Convert 64bit object file to x32 for x32 vDSO.
@@ -113,9 +112,11 @@ quiet_cmd_x32 = X32 $@
$(obj)/%-x32.o: $(obj)/%.o FORCE
$(call if_changed,x32)
-targets += vdsox32.so vdsox32.so.dbg vdsox32.lds $(vobjx32s-y)
+targets += vdsox32.lds $(vobjx32s-y)
-$(obj)/vdsox32.o: $(src)/vdsox32.S $(obj)/vdsox32.so
+$(obj)/%.so: OBJCOPYFLAGS := -S
+$(obj)/%.so: $(obj)/%.so.dbg
+ $(call if_changed,objcopy)
$(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE
$(call if_changed,vdso)
@@ -123,7 +124,6 @@ $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE
#
# Build multiple 32-bit vDSO images to choose from at boot time.
#
-obj-$(VDSO32-y) += vdso32-syms.lds
vdso32.so-$(VDSO32-y) += int80
vdso32.so-$(CONFIG_COMPAT) += syscall
vdso32.so-$(VDSO32-y) += sysenter
@@ -138,10 +138,8 @@ VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1
override obj-dirs = $(dir $(obj)) $(obj)/vdso32/
targets += vdso32/vdso32.lds
-targets += $(vdso32-images) $(vdso32-images:=.dbg)
targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o)
-
-extra-y += $(vdso32-images)
+targets += vdso32/vclock_gettime.o
$(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%)
@@ -157,6 +155,7 @@ KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic
KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector)
KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
KBUILD_CFLAGS_32 += -fno-omit-frame-pointer
+KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING
$(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
$(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \
@@ -166,27 +165,6 @@ $(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \
$(obj)/vdso32/%.o
$(call if_changed,vdso)
-# Make vdso32-*-syms.lds from each image, and then make sure they match.
-# The only difference should be that some do not define VDSO32_SYSENTER_RETURN.
-
-targets += vdso32-syms.lds $(vdso32.so-y:%=vdso32-%-syms.lds)
-
-quiet_cmd_vdso32sym = VDSOSYM $@
-define cmd_vdso32sym
- if LC_ALL=C sort -u $(filter-out FORCE,$^) > $(@D)/.tmp_$(@F) && \
- $(foreach H,$(filter-out FORCE,$^),\
- if grep -q VDSO32_SYSENTER_RETURN $H; \
- then diff -u $(@D)/.tmp_$(@F) $H; \
- else sed /VDSO32_SYSENTER_RETURN/d $(@D)/.tmp_$(@F) | \
- diff -u - $H; fi &&) : ;\
- then mv -f $(@D)/.tmp_$(@F) $@; \
- else rm -f $(@D)/.tmp_$(@F); exit 1; \
- fi
-endef
-
-$(obj)/vdso32-syms.lds: $(vdso32.so-y:%=$(obj)/vdso32-%-syms.lds) FORCE
- $(call if_changed,vdso32sym)
-
#
# The DSO images are built using a special linker script.
#
@@ -197,19 +175,34 @@ quiet_cmd_vdso = VDSO $@
sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \
- $(LTO_CFLAGS)
+ $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS)
GCOV_PROFILE := n
#
-# Install the unstripped copy of vdso*.so listed in $(vdso-install-y).
+# Install the unstripped copies of vdso*.so. If our toolchain supports
+# build-id, install .build-id links as well.
#
-quiet_cmd_vdso_install = INSTALL $@
- cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@
-$(vdso-install-y): %.so: $(obj)/%.so.dbg FORCE
+quiet_cmd_vdso_install = INSTALL $(@:install_%=%)
+define cmd_vdso_install
+ cp $< "$(MODLIB)/vdso/$(@:install_%=%)"; \
+ if readelf -n $< |grep -q 'Build ID'; then \
+ buildid=`readelf -n $< |grep 'Build ID' |sed -e 's/^.*Build ID: \(.*\)$$/\1/'`; \
+ first=`echo $$buildid | cut -b-2`; \
+ last=`echo $$buildid | cut -b3-`; \
+ mkdir -p "$(MODLIB)/vdso/.build-id/$$first"; \
+ ln -sf "../../$(@:install_%=%)" "$(MODLIB)/vdso/.build-id/$$first/$$last.debug"; \
+ fi
+endef
+
+vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%)
+
+$(MODLIB)/vdso: FORCE
@mkdir -p $(MODLIB)/vdso
+
+$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE
$(call cmd,vdso_install)
-PHONY += vdso_install $(vdso-install-y)
-vdso_install: $(vdso-install-y)
+PHONY += vdso_install $(vdso_img_insttargets)
+vdso_install: $(vdso_img_insttargets) FORCE
clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80*
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 16d686171e9a..9793322751e0 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -11,9 +11,6 @@
* Check with readelf after changing.
*/
-/* Disable profiling for userspace code: */
-#define DISABLE_BRANCH_PROFILING
-
#include <uapi/linux/time.h>
#include <asm/vgtod.h>
#include <asm/hpet.h>
@@ -30,9 +27,12 @@ extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz);
extern time_t __vdso_time(time_t *t);
#ifdef CONFIG_HPET_TIMER
-static inline u32 read_hpet_counter(const volatile void *addr)
+extern u8 hpet_page
+ __attribute__((visibility("hidden")));
+
+static notrace cycle_t vread_hpet(void)
{
- return *(const volatile u32 *) (addr + HPET_COUNTER);
+ return *(const volatile u32 *)(&hpet_page + HPET_COUNTER);
}
#endif
@@ -43,11 +43,6 @@ static inline u32 read_hpet_counter(const volatile void *addr)
#include <asm/fixmap.h>
#include <asm/pvclock.h>
-static notrace cycle_t vread_hpet(void)
-{
- return read_hpet_counter((const void *)fix_to_virt(VSYSCALL_HPET));
-}
-
notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
{
long ret;
@@ -137,16 +132,6 @@ static notrace cycle_t vread_pvclock(int *mode)
#else
-extern u8 hpet_page
- __attribute__((visibility("hidden")));
-
-#ifdef CONFIG_HPET_TIMER
-static notrace cycle_t vread_hpet(void)
-{
- return read_hpet_counter((const void *)(&hpet_page));
-}
-#endif
-
notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
{
long ret;
@@ -154,7 +139,7 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
asm(
"mov %%ebx, %%edx \n"
"mov %2, %%ebx \n"
- "call VDSO32_vsyscall \n"
+ "call __kernel_vsyscall \n"
"mov %%edx, %%ebx \n"
: "=a" (ret)
: "0" (__NR_clock_gettime), "g" (clock), "c" (ts)
@@ -169,7 +154,7 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
asm(
"mov %%ebx, %%edx \n"
"mov %2, %%ebx \n"
- "call VDSO32_vsyscall \n"
+ "call __kernel_vsyscall \n"
"mov %%edx, %%ebx \n"
: "=a" (ret)
: "0" (__NR_gettimeofday), "g" (tv), "c" (tz)
diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S
index 9df017ab2285..de2c921025f5 100644
--- a/arch/x86/vdso/vdso-layout.lds.S
+++ b/arch/x86/vdso/vdso-layout.lds.S
@@ -1,25 +1,42 @@
+#include <asm/vdso.h>
+
/*
* Linker script for vDSO. This is an ELF shared object prelinked to
* its virtual address, and with only one read-only segment.
* This script controls its layout.
*/
+#if defined(BUILD_VDSO64)
+# define SHDR_SIZE 64
+#elif defined(BUILD_VDSO32) || defined(BUILD_VDSOX32)
+# define SHDR_SIZE 40
+#else
+# error unknown VDSO target
+#endif
+
+#define NUM_FAKE_SHDRS 13
+
SECTIONS
{
-#ifdef BUILD_VDSO32
-#include <asm/vdso32.h>
-
- hpet_page = . - VDSO_OFFSET(VDSO_HPET_PAGE);
+ /*
+ * User/kernel shared data is before the vDSO. This may be a little
+ * uglier than putting it after the vDSO, but it avoids issues with
+ * non-allocatable things that dangle past the end of the PT_LOAD
+ * segment.
+ */
- vvar = . - VDSO_OFFSET(VDSO_VVAR_PAGE);
+ vvar_start = . - 2 * PAGE_SIZE;
+ vvar_page = vvar_start;
/* Place all vvars at the offsets in asm/vvar.h. */
-#define EMIT_VVAR(name, offset) vvar_ ## name = vvar + offset;
+#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset;
#define __VVAR_KERNEL_LDS
#include <asm/vvar.h>
#undef __VVAR_KERNEL_LDS
#undef EMIT_VVAR
-#endif
+
+ hpet_page = vvar_start + PAGE_SIZE;
+
. = SIZEOF_HEADERS;
.hash : { *(.hash) } :text
@@ -30,43 +47,56 @@ SECTIONS
.gnu.version_d : { *(.gnu.version_d) }
.gnu.version_r : { *(.gnu.version_r) }
+ .dynamic : { *(.dynamic) } :text :dynamic
+
+ .rodata : {
+ *(.rodata*)
+ *(.data*)
+ *(.sdata*)
+ *(.got.plt) *(.got)
+ *(.gnu.linkonce.d.*)
+ *(.bss*)
+ *(.dynbss*)
+ *(.gnu.linkonce.b.*)
+
+ /*
+ * Ideally this would live in a C file, but that won't
+ * work cleanly for x32 until we start building the x32
+ * C code using an x32 toolchain.
+ */
+ VDSO_FAKE_SECTION_TABLE_START = .;
+ . = . + NUM_FAKE_SHDRS * SHDR_SIZE;
+ VDSO_FAKE_SECTION_TABLE_END = .;
+ } :text
+
+ .fake_shstrtab : { *(.fake_shstrtab) } :text
+
+
.note : { *(.note.*) } :text :note
.eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
.eh_frame : { KEEP (*(.eh_frame)) } :text
- .dynamic : { *(.dynamic) } :text :dynamic
-
- .rodata : { *(.rodata*) } :text
- .data : {
- *(.data*)
- *(.sdata*)
- *(.got.plt) *(.got)
- *(.gnu.linkonce.d.*)
- *(.bss*)
- *(.dynbss*)
- *(.gnu.linkonce.b.*)
- }
-
- .altinstructions : { *(.altinstructions) }
- .altinstr_replacement : { *(.altinstr_replacement) }
/*
- * Align the actual code well away from the non-instruction data.
- * This is the best thing for the I-cache.
+ * Text is well-separated from actual data: there's plenty of
+ * stuff that isn't used at runtime in between.
*/
- . = ALIGN(0x100);
.text : { *(.text*) } :text =0x90909090,
/*
- * The comma above works around a bug in gold:
- * https://sourceware.org/bugzilla/show_bug.cgi?id=16804
+ * At the end so that eu-elflint stays happy when vdso2c strips
+ * these. A better implementation would avoid allocating space
+ * for these.
*/
+ .altinstructions : { *(.altinstructions) } :text
+ .altinstr_replacement : { *(.altinstr_replacement) } :text
/DISCARD/ : {
*(.discard)
*(.discard.*)
+ *(__bug_table)
}
}
diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S
deleted file mode 100644
index be3f23b09af5..000000000000
--- a/arch/x86/vdso/vdso.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#include <asm/vdso.h>
-
-DEFINE_VDSO_IMAGE(vdso, "arch/x86/vdso/vdso.so")
diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S
index b96b2677cad8..6807932643c2 100644
--- a/arch/x86/vdso/vdso.lds.S
+++ b/arch/x86/vdso/vdso.lds.S
@@ -1,14 +1,13 @@
/*
* Linker script for 64-bit vDSO.
* We #include the file to define the layout details.
- * Here we only choose the prelinked virtual address.
*
* This file defines the version script giving the user-exported symbols in
- * the DSO. We can define local symbols here called VDSO* to make their
- * values visible using the asm-x86/vdso.h macros from the kernel proper.
+ * the DSO.
*/
-#define VDSO_PRELINK 0xffffffffff700000
+#define BUILD_VDSO64
+
#include "vdso-layout.lds.S"
/*
@@ -28,5 +27,3 @@ VERSION {
local: *;
};
}
-
-VDSO64_PRELINK = VDSO_PRELINK;
diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/vdso/vdso2c.c
new file mode 100644
index 000000000000..8627db24a7f6
--- /dev/null
+++ b/arch/x86/vdso/vdso2c.c
@@ -0,0 +1,253 @@
+/*
+ * vdso2c - A vdso image preparation tool
+ * Copyright (c) 2014 Andy Lutomirski and others
+ * Licensed under the GPL v2
+ *
+ * vdso2c requires stripped and unstripped input. It would be trivial
+ * to fully strip the input in here, but, for reasons described below,
+ * we need to write a section table. Doing this is more or less
+ * equivalent to dropping all non-allocatable sections, but it's
+ * easier to let objcopy handle that instead of doing it ourselves.
+ * If we ever need to do something fancier than what objcopy provides,
+ * it would be straightforward to add here.
+ *
+ * We're keep a section table for a few reasons:
+ *
+ * The Go runtime had a couple of bugs: it would read the section
+ * table to try to figure out how many dynamic symbols there were (it
+ * shouldn't have looked at the section table at all) and, if there
+ * were no SHT_SYNDYM section table entry, it would use an
+ * uninitialized value for the number of symbols. An empty DYNSYM
+ * table would work, but I see no reason not to write a valid one (and
+ * keep full performance for old Go programs). This hack is only
+ * needed on x86_64.
+ *
+ * The bug was introduced on 2012-08-31 by:
+ * https://code.google.com/p/go/source/detail?r=56ea40aac72b
+ * and was fixed on 2014-06-13 by:
+ * https://code.google.com/p/go/source/detail?r=fc1cd5e12595
+ *
+ * Binutils has issues debugging the vDSO: it reads the section table to
+ * find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which
+ * would break build-id if we removed the section table. Binutils
+ * also requires that shstrndx != 0. See:
+ * https://sourceware.org/bugzilla/show_bug.cgi?id=17064
+ *
+ * elfutils might not look for PT_NOTE if there is a section table at
+ * all. I don't know whether this matters for any practical purpose.
+ *
+ * For simplicity, rather than hacking up a partial section table, we
+ * just write a mostly complete one. We omit non-dynamic symbols,
+ * though, since they're rather large.
+ *
+ * Once binutils gets fixed, we might be able to drop this for all but
+ * the 64-bit vdso, since build-id only works in kernel RPMs, and
+ * systems that update to new enough kernel RPMs will likely update
+ * binutils in sync. build-id has never worked for home-built kernel
+ * RPMs without manual symlinking, and I suspect that no one ever does
+ * that.
+ */
+
+#include <inttypes.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <fcntl.h>
+#include <err.h>
+
+#include <sys/mman.h>
+#include <sys/types.h>
+
+#include <tools/le_byteshift.h>
+
+#include <linux/elf.h>
+#include <linux/types.h>
+
+const char *outfilename;
+
+/* Symbols that we need in vdso2c. */
+enum {
+ sym_vvar_start,
+ sym_vvar_page,
+ sym_hpet_page,
+ sym_VDSO_FAKE_SECTION_TABLE_START,
+ sym_VDSO_FAKE_SECTION_TABLE_END,
+};
+
+const int special_pages[] = {
+ sym_vvar_page,
+ sym_hpet_page,
+};
+
+struct vdso_sym {
+ const char *name;
+ bool export;
+};
+
+struct vdso_sym required_syms[] = {
+ [sym_vvar_start] = {"vvar_start", true},
+ [sym_vvar_page] = {"vvar_page", true},
+ [sym_hpet_page] = {"hpet_page", true},
+ [sym_VDSO_FAKE_SECTION_TABLE_START] = {
+ "VDSO_FAKE_SECTION_TABLE_START", false
+ },
+ [sym_VDSO_FAKE_SECTION_TABLE_END] = {
+ "VDSO_FAKE_SECTION_TABLE_END", false
+ },
+ {"VDSO32_NOTE_MASK", true},
+ {"VDSO32_SYSENTER_RETURN", true},
+ {"__kernel_vsyscall", true},
+ {"__kernel_sigreturn", true},
+ {"__kernel_rt_sigreturn", true},
+};
+
+__attribute__((format(printf, 1, 2))) __attribute__((noreturn))
+static void fail(const char *format, ...)
+{
+ va_list ap;
+ va_start(ap, format);
+ fprintf(stderr, "Error: ");
+ vfprintf(stderr, format, ap);
+ if (outfilename)
+ unlink(outfilename);
+ exit(1);
+ va_end(ap);
+}
+
+/*
+ * Evil macros for little-endian reads and writes
+ */
+#define GLE(x, bits, ifnot) \
+ __builtin_choose_expr( \
+ (sizeof(*(x)) == bits/8), \
+ (__typeof__(*(x)))get_unaligned_le##bits(x), ifnot)
+
+extern void bad_get_le(void);
+#define LAST_GLE(x) \
+ __builtin_choose_expr(sizeof(*(x)) == 1, *(x), bad_get_le())
+
+#define GET_LE(x) \
+ GLE(x, 64, GLE(x, 32, GLE(x, 16, LAST_GLE(x))))
+
+#define PLE(x, val, bits, ifnot) \
+ __builtin_choose_expr( \
+ (sizeof(*(x)) == bits/8), \
+ put_unaligned_le##bits((val), (x)), ifnot)
+
+extern void bad_put_le(void);
+#define LAST_PLE(x, val) \
+ __builtin_choose_expr(sizeof(*(x)) == 1, *(x) = (val), bad_put_le())
+
+#define PUT_LE(x, val) \
+ PLE(x, val, 64, PLE(x, val, 32, PLE(x, val, 16, LAST_PLE(x, val))))
+
+
+#define NSYMS (sizeof(required_syms) / sizeof(required_syms[0]))
+
+#define BITSFUNC3(name, bits, suffix) name##bits##suffix
+#define BITSFUNC2(name, bits, suffix) BITSFUNC3(name, bits, suffix)
+#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS, )
+
+#define INT_BITS BITSFUNC2(int, ELF_BITS, _t)
+
+#define ELF_BITS_XFORM2(bits, x) Elf##bits##_##x
+#define ELF_BITS_XFORM(bits, x) ELF_BITS_XFORM2(bits, x)
+#define ELF(x) ELF_BITS_XFORM(ELF_BITS, x)
+
+#define ELF_BITS 64
+#include "vdso2c.h"
+#undef ELF_BITS
+
+#define ELF_BITS 32
+#include "vdso2c.h"
+#undef ELF_BITS
+
+static void go(void *raw_addr, size_t raw_len,
+ void *stripped_addr, size_t stripped_len,
+ FILE *outfile, const char *name)
+{
+ Elf64_Ehdr *hdr = (Elf64_Ehdr *)raw_addr;
+
+ if (hdr->e_ident[EI_CLASS] == ELFCLASS64) {
+ go64(raw_addr, raw_len, stripped_addr, stripped_len,
+ outfile, name);
+ } else if (hdr->e_ident[EI_CLASS] == ELFCLASS32) {
+ go32(raw_addr, raw_len, stripped_addr, stripped_len,
+ outfile, name);
+ } else {
+ fail("unknown ELF class\n");
+ }
+}
+
+static void map_input(const char *name, void **addr, size_t *len, int prot)
+{
+ off_t tmp_len;
+
+ int fd = open(name, O_RDONLY);
+ if (fd == -1)
+ err(1, "%s", name);
+
+ tmp_len = lseek(fd, 0, SEEK_END);
+ if (tmp_len == (off_t)-1)
+ err(1, "lseek");
+ *len = (size_t)tmp_len;
+
+ *addr = mmap(NULL, tmp_len, prot, MAP_PRIVATE, fd, 0);
+ if (*addr == MAP_FAILED)
+ err(1, "mmap");
+
+ close(fd);
+}
+
+int main(int argc, char **argv)
+{
+ size_t raw_len, stripped_len;
+ void *raw_addr, *stripped_addr;
+ FILE *outfile;
+ char *name, *tmp;
+ int namelen;
+
+ if (argc != 4) {
+ printf("Usage: vdso2c RAW_INPUT STRIPPED_INPUT OUTPUT\n");
+ return 1;
+ }
+
+ /*
+ * Figure out the struct name. If we're writing to a .so file,
+ * generate raw output insted.
+ */
+ name = strdup(argv[3]);
+ namelen = strlen(name);
+ if (namelen >= 3 && !strcmp(name + namelen - 3, ".so")) {
+ name = NULL;
+ } else {
+ tmp = strrchr(name, '/');
+ if (tmp)
+ name = tmp + 1;
+ tmp = strchr(name, '.');
+ if (tmp)
+ *tmp = '\0';
+ for (tmp = name; *tmp; tmp++)
+ if (*tmp == '-')
+ *tmp = '_';
+ }
+
+ map_input(argv[1], &raw_addr, &raw_len, PROT_READ);
+ map_input(argv[2], &stripped_addr, &stripped_len, PROT_READ);
+
+ outfilename = argv[3];
+ outfile = fopen(outfilename, "w");
+ if (!outfile)
+ err(1, "%s", argv[2]);
+
+ go(raw_addr, raw_len, stripped_addr, stripped_len, outfile, name);
+
+ munmap(raw_addr, raw_len);
+ munmap(stripped_addr, stripped_len);
+ fclose(outfile);
+
+ return 0;
+}
diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h
new file mode 100644
index 000000000000..fd57829b30d8
--- /dev/null
+++ b/arch/x86/vdso/vdso2c.h
@@ -0,0 +1,173 @@
+/*
+ * This file is included twice from vdso2c.c. It generates code for 32-bit
+ * and 64-bit vDSOs. We need both for 64-bit builds, since 32-bit vDSOs
+ * are built for 32-bit userspace.
+ */
+
+static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
+ void *stripped_addr, size_t stripped_len,
+ FILE *outfile, const char *name)
+{
+ int found_load = 0;
+ unsigned long load_size = -1; /* Work around bogus warning */
+ unsigned long mapping_size;
+ ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr;
+ int i;
+ unsigned long j;
+ ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr,
+ *alt_sec = NULL;
+ ELF(Dyn) *dyn = 0, *dyn_end = 0;
+ const char *secstrings;
+ INT_BITS syms[NSYMS] = {};
+
+ ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff));
+
+ /* Walk the segment table. */
+ for (i = 0; i < GET_LE(&hdr->e_phnum); i++) {
+ if (GET_LE(&pt[i].p_type) == PT_LOAD) {
+ if (found_load)
+ fail("multiple PT_LOAD segs\n");
+
+ if (GET_LE(&pt[i].p_offset) != 0 ||
+ GET_LE(&pt[i].p_vaddr) != 0)
+ fail("PT_LOAD in wrong place\n");
+
+ if (GET_LE(&pt[i].p_memsz) != GET_LE(&pt[i].p_filesz))
+ fail("cannot handle memsz != filesz\n");
+
+ load_size = GET_LE(&pt[i].p_memsz);
+ found_load = 1;
+ } else if (GET_LE(&pt[i].p_type) == PT_DYNAMIC) {
+ dyn = raw_addr + GET_LE(&pt[i].p_offset);
+ dyn_end = raw_addr + GET_LE(&pt[i].p_offset) +
+ GET_LE(&pt[i].p_memsz);
+ }
+ }
+ if (!found_load)
+ fail("no PT_LOAD seg\n");
+
+ if (stripped_len < load_size)
+ fail("stripped input is too short\n");
+
+ /* Walk the dynamic table */
+ for (i = 0; dyn + i < dyn_end &&
+ GET_LE(&dyn[i].d_tag) != DT_NULL; i++) {
+ typeof(dyn[i].d_tag) tag = GET_LE(&dyn[i].d_tag);
+ if (tag == DT_REL || tag == DT_RELSZ || tag == DT_RELA ||
+ tag == DT_RELENT || tag == DT_TEXTREL)
+ fail("vdso image contains dynamic relocations\n");
+ }
+
+ /* Walk the section table */
+ secstrings_hdr = raw_addr + GET_LE(&hdr->e_shoff) +
+ GET_LE(&hdr->e_shentsize)*GET_LE(&hdr->e_shstrndx);
+ secstrings = raw_addr + GET_LE(&secstrings_hdr->sh_offset);
+ for (i = 0; i < GET_LE(&hdr->e_shnum); i++) {
+ ELF(Shdr) *sh = raw_addr + GET_LE(&hdr->e_shoff) +
+ GET_LE(&hdr->e_shentsize) * i;
+ if (GET_LE(&sh->sh_type) == SHT_SYMTAB)
+ symtab_hdr = sh;
+
+ if (!strcmp(secstrings + GET_LE(&sh->sh_name),
+ ".altinstructions"))
+ alt_sec = sh;
+ }
+
+ if (!symtab_hdr)
+ fail("no symbol table\n");
+
+ strtab_hdr = raw_addr + GET_LE(&hdr->e_shoff) +
+ GET_LE(&hdr->e_shentsize) * GET_LE(&symtab_hdr->sh_link);
+
+ /* Walk the symbol table */
+ for (i = 0;
+ i < GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize);
+ i++) {
+ int k;
+ ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) +
+ GET_LE(&symtab_hdr->sh_entsize) * i;
+ const char *name = raw_addr + GET_LE(&strtab_hdr->sh_offset) +
+ GET_LE(&sym->st_name);
+
+ for (k = 0; k < NSYMS; k++) {
+ if (!strcmp(name, required_syms[k].name)) {
+ if (syms[k]) {
+ fail("duplicate symbol %s\n",
+ required_syms[k].name);
+ }
+
+ /*
+ * Careful: we use negative addresses, but
+ * st_value is unsigned, so we rely
+ * on syms[k] being a signed type of the
+ * correct width.
+ */
+ syms[k] = GET_LE(&sym->st_value);
+ }
+ }
+ }
+
+ /* Validate mapping addresses. */
+ for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) {
+ if (!syms[i])
+ continue; /* The mapping isn't used; ignore it. */
+
+ if (syms[i] % 4096)
+ fail("%s must be a multiple of 4096\n",
+ required_syms[i].name);
+ if (syms[sym_vvar_start] > syms[i] + 4096)
+ fail("%s underruns begin_vvar\n",
+ required_syms[i].name);
+ if (syms[i] + 4096 > 0)
+ fail("%s is on the wrong side of the vdso text\n",
+ required_syms[i].name);
+ }
+ if (syms[sym_vvar_start] % 4096)
+ fail("vvar_begin must be a multiple of 4096\n");
+
+ if (!name) {
+ fwrite(stripped_addr, stripped_len, 1, outfile);
+ return;
+ }
+
+ mapping_size = (stripped_len + 4095) / 4096 * 4096;
+
+ fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n");
+ fprintf(outfile, "#include <linux/linkage.h>\n");
+ fprintf(outfile, "#include <asm/page_types.h>\n");
+ fprintf(outfile, "#include <asm/vdso.h>\n");
+ fprintf(outfile, "\n");
+ fprintf(outfile,
+ "static unsigned char raw_data[%lu] __page_aligned_data = {",
+ mapping_size);
+ for (j = 0; j < stripped_len; j++) {
+ if (j % 10 == 0)
+ fprintf(outfile, "\n\t");
+ fprintf(outfile, "0x%02X, ",
+ (int)((unsigned char *)stripped_addr)[j]);
+ }
+ fprintf(outfile, "\n};\n\n");
+
+ fprintf(outfile, "static struct page *pages[%lu];\n\n",
+ mapping_size / 4096);
+
+ fprintf(outfile, "const struct vdso_image %s = {\n", name);
+ fprintf(outfile, "\t.data = raw_data,\n");
+ fprintf(outfile, "\t.size = %lu,\n", mapping_size);
+ fprintf(outfile, "\t.text_mapping = {\n");
+ fprintf(outfile, "\t\t.name = \"[vdso]\",\n");
+ fprintf(outfile, "\t\t.pages = pages,\n");
+ fprintf(outfile, "\t},\n");
+ if (alt_sec) {
+ fprintf(outfile, "\t.alt = %lu,\n",
+ (unsigned long)GET_LE(&alt_sec->sh_offset));
+ fprintf(outfile, "\t.alt_len = %lu,\n",
+ (unsigned long)GET_LE(&alt_sec->sh_size));
+ }
+ for (i = 0; i < NSYMS; i++) {
+ if (required_syms[i].export && syms[i])
+ fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n",
+ required_syms[i].name, (int64_t)syms[i]);
+ }
+ fprintf(outfile, "};\n");
+}
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index e1f220e3ca68..e904c270573b 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -8,27 +8,12 @@
#include <linux/init.h>
#include <linux/smp.h>
-#include <linux/thread_info.h>
-#include <linux/sched.h>
-#include <linux/gfp.h>
-#include <linux/string.h>
-#include <linux/elf.h>
-#include <linux/mm.h>
-#include <linux/err.h>
-#include <linux/module.h>
-#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/mm_types.h>
#include <asm/cpufeature.h>
-#include <asm/msr.h>
-#include <asm/pgtable.h>
-#include <asm/unistd.h>
-#include <asm/elf.h>
-#include <asm/tlbflush.h>
+#include <asm/processor.h>
#include <asm/vdso.h>
-#include <asm/proto.h>
-#include <asm/fixmap.h>
-#include <asm/hpet.h>
-#include <asm/vvar.h>
#ifdef CONFIG_COMPAT_VDSO
#define VDSO_DEFAULT 0
@@ -36,23 +21,17 @@
#define VDSO_DEFAULT 1
#endif
-#ifdef CONFIG_X86_64
-#define vdso_enabled sysctl_vsyscall32
-#define arch_setup_additional_pages syscall32_setup_pages
-extern int sysctl_ldt16;
-#endif
-
/*
* Should the kernel map a VDSO page into processes and pass its
* address down to glibc upon exec()?
*/
-unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
+unsigned int __read_mostly vdso32_enabled = VDSO_DEFAULT;
-static int __init vdso_setup(char *s)
+static int __init vdso32_setup(char *s)
{
- vdso_enabled = simple_strtoul(s, NULL, 0);
+ vdso32_enabled = simple_strtoul(s, NULL, 0);
- if (vdso_enabled > 1)
+ if (vdso32_enabled > 1)
pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n");
return 1;
@@ -63,177 +42,45 @@ static int __init vdso_setup(char *s)
* behavior on both 64-bit and 32-bit kernels.
* On 32-bit kernels, vdso=[012] means the same thing.
*/
-__setup("vdso32=", vdso_setup);
+__setup("vdso32=", vdso32_setup);
#ifdef CONFIG_X86_32
-__setup_param("vdso=", vdso32_setup, vdso_setup, 0);
-
-EXPORT_SYMBOL_GPL(vdso_enabled);
+__setup_param("vdso=", vdso_setup, vdso32_setup, 0);
#endif
-static struct page **vdso32_pages;
-static unsigned vdso32_size;
-
#ifdef CONFIG_X86_64
#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
-/* May not be __init: called during resume */
-void syscall32_cpu_init(void)
-{
- /* Load these always in case some future AMD CPU supports
- SYSENTER from compat mode too. */
- wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
- wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
-
- wrmsrl(MSR_CSTAR, ia32_cstar_target);
-}
-
#else /* CONFIG_X86_32 */
#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
#define vdso32_syscall() (0)
-void enable_sep_cpu(void)
-{
- int cpu = get_cpu();
- struct tss_struct *tss = &per_cpu(init_tss, cpu);
-
- if (!boot_cpu_has(X86_FEATURE_SEP)) {
- put_cpu();
- return;
- }
-
- tss->x86_tss.ss1 = __KERNEL_CS;
- tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss;
- wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
- wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
- wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
- put_cpu();
-}
-
#endif /* CONFIG_X86_64 */
+#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
+const struct vdso_image *selected_vdso32;
+#endif
+
int __init sysenter_setup(void)
{
- char *vdso32_start, *vdso32_end;
- int npages, i;
-
#ifdef CONFIG_COMPAT
- if (vdso32_syscall()) {
- vdso32_start = vdso32_syscall_start;
- vdso32_end = vdso32_syscall_end;
- vdso32_pages = vdso32_syscall_pages;
- } else
+ if (vdso32_syscall())
+ selected_vdso32 = &vdso_image_32_syscall;
+ else
#endif
- if (vdso32_sysenter()) {
- vdso32_start = vdso32_sysenter_start;
- vdso32_end = vdso32_sysenter_end;
- vdso32_pages = vdso32_sysenter_pages;
- } else {
- vdso32_start = vdso32_int80_start;
- vdso32_end = vdso32_int80_end;
- vdso32_pages = vdso32_int80_pages;
- }
-
- npages = ((vdso32_end - vdso32_start) + PAGE_SIZE - 1) / PAGE_SIZE;
- vdso32_size = npages << PAGE_SHIFT;
- for (i = 0; i < npages; i++)
- vdso32_pages[i] = virt_to_page(vdso32_start + i*PAGE_SIZE);
+ if (vdso32_sysenter())
+ selected_vdso32 = &vdso_image_32_sysenter;
+ else
+ selected_vdso32 = &vdso_image_32_int80;
- patch_vdso32(vdso32_start, vdso32_size);
+ init_vdso_image(selected_vdso32);
return 0;
}
-/* Setup a VMA at program startup for the vsyscall page */
-int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
-{
- struct mm_struct *mm = current->mm;
- unsigned long addr;
- int ret = 0;
- struct vm_area_struct *vma;
-
-#ifdef CONFIG_X86_X32_ABI
- if (test_thread_flag(TIF_X32))
- return x32_setup_additional_pages(bprm, uses_interp);
-#endif
-
- if (vdso_enabled != 1) /* Other values all mean "disabled" */
- return 0;
-
- down_write(&mm->mmap_sem);
-
- addr = get_unmapped_area(NULL, 0, vdso32_size + VDSO_OFFSET(VDSO_PREV_PAGES), 0, 0);
- if (IS_ERR_VALUE(addr)) {
- ret = addr;
- goto up_fail;
- }
-
- addr += VDSO_OFFSET(VDSO_PREV_PAGES);
-
- current->mm->context.vdso = (void *)addr;
-
- /*
- * MAYWRITE to allow gdb to COW and set breakpoints
- */
- ret = install_special_mapping(mm,
- addr,
- vdso32_size,
- VM_READ|VM_EXEC|
- VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
- vdso32_pages);
-
- if (ret)
- goto up_fail;
-
- vma = _install_special_mapping(mm,
- addr - VDSO_OFFSET(VDSO_PREV_PAGES),
- VDSO_OFFSET(VDSO_PREV_PAGES),
- VM_READ,
- NULL);
-
- if (IS_ERR(vma)) {
- ret = PTR_ERR(vma);
- goto up_fail;
- }
-
- ret = remap_pfn_range(vma,
- addr - VDSO_OFFSET(VDSO_VVAR_PAGE),
- __pa_symbol(&__vvar_page) >> PAGE_SHIFT,
- PAGE_SIZE,
- PAGE_READONLY);
-
- if (ret)
- goto up_fail;
-
-#ifdef CONFIG_HPET_TIMER
- if (hpet_address) {
- ret = io_remap_pfn_range(vma,
- addr - VDSO_OFFSET(VDSO_HPET_PAGE),
- hpet_address >> PAGE_SHIFT,
- PAGE_SIZE,
- pgprot_noncached(PAGE_READONLY));
-
- if (ret)
- goto up_fail;
- }
-#endif
-
- current_thread_info()->sysenter_return =
- VDSO32_SYMBOL(addr, SYSENTER_RETURN);
-
- up_fail:
- if (ret)
- current->mm->context.vdso = NULL;
-
- up_write(&mm->mmap_sem);
-
- return ret;
-}
-
#ifdef CONFIG_X86_64
subsys_initcall(sysenter_setup);
@@ -245,14 +92,7 @@ subsys_initcall(sysenter_setup);
static struct ctl_table abi_table2[] = {
{
.procname = "vsyscall32",
- .data = &sysctl_vsyscall32,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "ldt16",
- .data = &sysctl_ldt16,
+ .data = &vdso32_enabled,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
@@ -275,30 +115,6 @@ static __init int ia32_binfmt_init(void)
return 0;
}
__initcall(ia32_binfmt_init);
-#endif
-
-#else /* CONFIG_X86_32 */
-
-const char *arch_vma_name(struct vm_area_struct *vma)
-{
- if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
- return "[vdso]";
- return NULL;
-}
-
-struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
-{
- return NULL;
-}
-
-int in_gate_area(struct mm_struct *mm, unsigned long addr)
-{
- return 0;
-}
-
-int in_gate_area_no_mm(unsigned long addr)
-{
- return 0;
-}
+#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S
deleted file mode 100644
index 018bcd9f97b4..000000000000
--- a/arch/x86/vdso/vdso32.S
+++ /dev/null
@@ -1,9 +0,0 @@
-#include <asm/vdso.h>
-
-DEFINE_VDSO_IMAGE(vdso32_int80, "arch/x86/vdso/vdso32-int80.so")
-
-#ifdef CONFIG_COMPAT
-DEFINE_VDSO_IMAGE(vdso32_syscall, "arch/x86/vdso/vdso32-syscall.so")
-#endif
-
-DEFINE_VDSO_IMAGE(vdso32_sysenter, "arch/x86/vdso/vdso32-sysenter.so")
diff --git a/arch/x86/vdso/vdso32/vdso-fakesections.c b/arch/x86/vdso/vdso32/vdso-fakesections.c
new file mode 100644
index 000000000000..541468e25265
--- /dev/null
+++ b/arch/x86/vdso/vdso32/vdso-fakesections.c
@@ -0,0 +1 @@
+#include "../vdso-fakesections.c"
diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/vdso/vdso32/vdso32.lds.S
index aadb8b9994cd..31056cf294bf 100644
--- a/arch/x86/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/vdso/vdso32/vdso32.lds.S
@@ -1,17 +1,14 @@
/*
* Linker script for 32-bit vDSO.
* We #include the file to define the layout details.
- * Here we only choose the prelinked virtual address.
*
* This file defines the version script giving the user-exported symbols in
- * the DSO. We can define local symbols here called VDSO* to make their
- * values visible using the asm-x86/vdso.h macros from the kernel proper.
+ * the DSO.
*/
#include <asm/page.h>
#define BUILD_VDSO32
-#define VDSO_PRELINK 0
#include "../vdso-layout.lds.S"
@@ -38,13 +35,3 @@ VERSION
local: *;
};
}
-
-/*
- * Symbols we define here called VDSO* get their values into vdso32-syms.h.
- */
-VDSO32_vsyscall = __kernel_vsyscall;
-VDSO32_sigreturn = __kernel_sigreturn;
-VDSO32_rt_sigreturn = __kernel_rt_sigreturn;
-VDSO32_clock_gettime = clock_gettime;
-VDSO32_gettimeofday = gettimeofday;
-VDSO32_time = time;
diff --git a/arch/x86/vdso/vdsox32.S b/arch/x86/vdso/vdsox32.S
deleted file mode 100644
index f4aa34e7f370..000000000000
--- a/arch/x86/vdso/vdsox32.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#include <asm/vdso.h>
-
-DEFINE_VDSO_IMAGE(vdsox32, "arch/x86/vdso/vdsox32.so")
diff --git a/arch/x86/vdso/vdsox32.lds.S b/arch/x86/vdso/vdsox32.lds.S
index 62272aa2ae0a..697c11ece90c 100644
--- a/arch/x86/vdso/vdsox32.lds.S
+++ b/arch/x86/vdso/vdsox32.lds.S
@@ -1,14 +1,13 @@
/*
* Linker script for x32 vDSO.
* We #include the file to define the layout details.
- * Here we only choose the prelinked virtual address.
*
* This file defines the version script giving the user-exported symbols in
- * the DSO. We can define local symbols here called VDSO* to make their
- * values visible using the asm-x86/vdso.h macros from the kernel proper.
+ * the DSO.
*/
-#define VDSO_PRELINK 0
+#define BUILD_VDSOX32
+
#include "vdso-layout.lds.S"
/*
@@ -24,5 +23,3 @@ VERSION {
local: *;
};
}
-
-VDSOX32_PRELINK = VDSO_PRELINK;
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 1ad102613127..970463b566cf 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -15,117 +15,56 @@
#include <asm/proto.h>
#include <asm/vdso.h>
#include <asm/page.h>
+#include <asm/hpet.h>
#if defined(CONFIG_X86_64)
-unsigned int __read_mostly vdso_enabled = 1;
+unsigned int __read_mostly vdso64_enabled = 1;
-DECLARE_VDSO_IMAGE(vdso);
extern unsigned short vdso_sync_cpuid;
-static unsigned vdso_size;
-
-#ifdef CONFIG_X86_X32_ABI
-DECLARE_VDSO_IMAGE(vdsox32);
-static unsigned vdsox32_size;
-#endif
#endif
-#if defined(CONFIG_X86_32) || defined(CONFIG_X86_X32_ABI) || \
- defined(CONFIG_COMPAT)
-void __init patch_vdso32(void *vdso, size_t len)
+void __init init_vdso_image(const struct vdso_image *image)
{
- Elf32_Ehdr *hdr = vdso;
- Elf32_Shdr *sechdrs, *alt_sec = 0;
- char *secstrings;
- void *alt_data;
int i;
+ int npages = (image->size) / PAGE_SIZE;
- BUG_ON(len < sizeof(Elf32_Ehdr));
- BUG_ON(memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0);
-
- sechdrs = (void *)hdr + hdr->e_shoff;
- secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
-
- for (i = 1; i < hdr->e_shnum; i++) {
- Elf32_Shdr *shdr = &sechdrs[i];
- if (!strcmp(secstrings + shdr->sh_name, ".altinstructions")) {
- alt_sec = shdr;
- goto found;
- }
- }
-
- /* If we get here, it's probably a bug. */
- pr_warning("patch_vdso32: .altinstructions not found\n");
- return; /* nothing to patch */
+ BUG_ON(image->size % PAGE_SIZE != 0);
+ for (i = 0; i < npages; i++)
+ image->text_mapping.pages[i] =
+ virt_to_page(image->data + i*PAGE_SIZE);
-found:
- alt_data = (void *)hdr + alt_sec->sh_offset;
- apply_alternatives(alt_data, alt_data + alt_sec->sh_size);
+ apply_alternatives((struct alt_instr *)(image->data + image->alt),
+ (struct alt_instr *)(image->data + image->alt +
+ image->alt_len));
}
-#endif
#if defined(CONFIG_X86_64)
-static void __init patch_vdso64(void *vdso, size_t len)
-{
- Elf64_Ehdr *hdr = vdso;
- Elf64_Shdr *sechdrs, *alt_sec = 0;
- char *secstrings;
- void *alt_data;
- int i;
-
- BUG_ON(len < sizeof(Elf64_Ehdr));
- BUG_ON(memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0);
-
- sechdrs = (void *)hdr + hdr->e_shoff;
- secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
-
- for (i = 1; i < hdr->e_shnum; i++) {
- Elf64_Shdr *shdr = &sechdrs[i];
- if (!strcmp(secstrings + shdr->sh_name, ".altinstructions")) {
- alt_sec = shdr;
- goto found;
- }
- }
-
- /* If we get here, it's probably a bug. */
- pr_warning("patch_vdso64: .altinstructions not found\n");
- return; /* nothing to patch */
-
-found:
- alt_data = (void *)hdr + alt_sec->sh_offset;
- apply_alternatives(alt_data, alt_data + alt_sec->sh_size);
-}
-
static int __init init_vdso(void)
{
- int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE;
- int i;
-
- patch_vdso64(vdso_start, vdso_end - vdso_start);
-
- vdso_size = npages << PAGE_SHIFT;
- for (i = 0; i < npages; i++)
- vdso_pages[i] = virt_to_page(vdso_start + i*PAGE_SIZE);
+ init_vdso_image(&vdso_image_64);
#ifdef CONFIG_X86_X32_ABI
- patch_vdso32(vdsox32_start, vdsox32_end - vdsox32_start);
- npages = (vdsox32_end - vdsox32_start + PAGE_SIZE - 1) / PAGE_SIZE;
- vdsox32_size = npages << PAGE_SHIFT;
- for (i = 0; i < npages; i++)
- vdsox32_pages[i] = virt_to_page(vdsox32_start + i*PAGE_SIZE);
+ init_vdso_image(&vdso_image_x32);
#endif
return 0;
}
subsys_initcall(init_vdso);
+#endif
struct linux_binprm;
/* Put the vdso above the (randomized) stack with another randomized offset.
This way there is no hole in the middle of address space.
To save memory make sure it is still in the same PTE as the stack top.
- This doesn't give that many random bits */
+ This doesn't give that many random bits.
+
+ Only used for the 64-bit and x32 vdsos. */
static unsigned long vdso_addr(unsigned long start, unsigned len)
{
+#ifdef CONFIG_X86_32
+ return 0;
+#else
unsigned long addr, end;
unsigned offset;
end = (start + PMD_SIZE - 1) & PMD_MASK;
@@ -147,63 +86,154 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
addr = align_vdso_addr(addr);
return addr;
+#endif
}
-/* Setup a VMA at program startup for the vsyscall page.
- Not called for compat tasks */
-static int setup_additional_pages(struct linux_binprm *bprm,
- int uses_interp,
- struct page **pages,
- unsigned size)
+static int map_vdso(const struct vdso_image *image, bool calculate_addr)
{
struct mm_struct *mm = current->mm;
- unsigned long addr;
- int ret;
-
- if (!vdso_enabled)
- return 0;
+ struct vm_area_struct *vma;
+ unsigned long addr, text_start;
+ int ret = 0;
+ static struct page *no_pages[] = {NULL};
+ static struct vm_special_mapping vvar_mapping = {
+ .name = "[vvar]",
+ .pages = no_pages,
+ };
+
+ if (calculate_addr) {
+ addr = vdso_addr(current->mm->start_stack,
+ image->size - image->sym_vvar_start);
+ } else {
+ addr = 0;
+ }
down_write(&mm->mmap_sem);
- addr = vdso_addr(mm->start_stack, size);
- addr = get_unmapped_area(NULL, addr, size, 0, 0);
+
+ addr = get_unmapped_area(NULL, addr,
+ image->size - image->sym_vvar_start, 0, 0);
if (IS_ERR_VALUE(addr)) {
ret = addr;
goto up_fail;
}
- current->mm->context.vdso = (void *)addr;
+ text_start = addr - image->sym_vvar_start;
+ current->mm->context.vdso = (void __user *)text_start;
- ret = install_special_mapping(mm, addr, size,
- VM_READ|VM_EXEC|
- VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
- pages);
- if (ret) {
- current->mm->context.vdso = NULL;
+ /*
+ * MAYWRITE to allow gdb to COW and set breakpoints
+ */
+ vma = _install_special_mapping(mm,
+ text_start,
+ image->size,
+ VM_READ|VM_EXEC|
+ VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
+ &image->text_mapping);
+
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ goto up_fail;
+ }
+
+ vma = _install_special_mapping(mm,
+ addr,
+ -image->sym_vvar_start,
+ VM_READ|VM_MAYREAD,
+ &vvar_mapping);
+
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ goto up_fail;
+ }
+
+ if (image->sym_vvar_page)
+ ret = remap_pfn_range(vma,
+ text_start + image->sym_vvar_page,
+ __pa_symbol(&__vvar_page) >> PAGE_SHIFT,
+ PAGE_SIZE,
+ PAGE_READONLY);
+
+ if (ret)
goto up_fail;
+
+#ifdef CONFIG_HPET_TIMER
+ if (hpet_address && image->sym_hpet_page) {
+ ret = io_remap_pfn_range(vma,
+ text_start + image->sym_hpet_page,
+ hpet_address >> PAGE_SHIFT,
+ PAGE_SIZE,
+ pgprot_noncached(PAGE_READONLY));
+
+ if (ret)
+ goto up_fail;
}
+#endif
up_fail:
+ if (ret)
+ current->mm->context.vdso = NULL;
+
up_write(&mm->mmap_sem);
return ret;
}
+#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
+static int load_vdso32(void)
+{
+ int ret;
+
+ if (vdso32_enabled != 1) /* Other values all mean "disabled" */
+ return 0;
+
+ ret = map_vdso(selected_vdso32, false);
+ if (ret)
+ return ret;
+
+ if (selected_vdso32->sym_VDSO32_SYSENTER_RETURN)
+ current_thread_info()->sysenter_return =
+ current->mm->context.vdso +
+ selected_vdso32->sym_VDSO32_SYSENTER_RETURN;
+
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_X86_64
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
- return setup_additional_pages(bprm, uses_interp, vdso_pages,
- vdso_size);
+ if (!vdso64_enabled)
+ return 0;
+
+ return map_vdso(&vdso_image_64, true);
}
+#ifdef CONFIG_COMPAT
+int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
+ int uses_interp)
+{
#ifdef CONFIG_X86_X32_ABI
-int x32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+ if (test_thread_flag(TIF_X32)) {
+ if (!vdso64_enabled)
+ return 0;
+
+ return map_vdso(&vdso_image_x32, true);
+ }
+#endif
+
+ return load_vdso32();
+}
+#endif
+#else
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
- return setup_additional_pages(bprm, uses_interp, vdsox32_pages,
- vdsox32_size);
+ return load_vdso32();
}
#endif
+#ifdef CONFIG_X86_64
static __init int vdso_setup(char *s)
{
- vdso_enabled = simple_strtoul(s, NULL, 0);
+ vdso64_enabled = simple_strtoul(s, NULL, 0);
return 0;
}
__setup("vdso=", vdso_setup);
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 96ab2c09cb68..7322755f337a 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -22,3 +22,4 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
obj-$(CONFIG_XEN_DOM0) += apic.o vga.o
obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
+obj-$(CONFIG_XEN_EFI) += efi.o
diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c
new file mode 100644
index 000000000000..a02e09e18f57
--- /dev/null
+++ b/arch/x86/xen/efi.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2014 Oracle Co., Daniel Kiper
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/efi.h>
+#include <linux/init.h>
+#include <linux/string.h>
+
+#include <xen/xen-ops.h>
+
+#include <asm/setup.h>
+
+void __init xen_efi_init(void)
+{
+ efi_system_table_t *efi_systab_xen;
+
+ efi_systab_xen = xen_efi_probe();
+
+ if (efi_systab_xen == NULL)
+ return;
+
+ strncpy((char *)&boot_params.efi_info.efi_loader_signature, "Xen",
+ sizeof(boot_params.efi_info.efi_loader_signature));
+ boot_params.efi_info.efi_systab = (__u32)__pa(efi_systab_xen);
+ boot_params.efi_info.efi_systab_hi = (__u32)(__pa(efi_systab_xen) >> 32);
+
+ set_bit(EFI_BOOT, &efi.flags);
+ set_bit(EFI_PARAVIRT, &efi.flags);
+ set_bit(EFI_64BIT, &efi.flags);
+}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c34bfc4bbe7f..c0cb11fb5008 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1339,6 +1339,7 @@ xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
static struct notifier_block xen_panic_block = {
.notifier_call= xen_panic_event,
+ .priority = INT_MIN
};
int xen_panic_handler_init(void)
@@ -1536,7 +1537,10 @@ asmlinkage __visible void __init xen_start_kernel(void)
if (!xen_pvh_domain())
pv_cpu_ops = xen_cpu_ops;
- x86_init.resources.memory_setup = xen_memory_setup;
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ x86_init.resources.memory_setup = xen_auto_xlated_memory_setup;
+ else
+ x86_init.resources.memory_setup = xen_memory_setup;
x86_init.oem.arch_setup = xen_arch_setup;
x86_init.oem.banner = xen_banner;
@@ -1714,6 +1718,8 @@ asmlinkage __visible void __init xen_start_kernel(void)
xen_setup_runstate_info(0);
+ xen_efi_init();
+
/* Start the world */
#ifdef CONFIG_X86_32
i386_start_kernel();
@@ -1822,8 +1828,19 @@ static void __init xen_hvm_guest_init(void)
xen_hvm_init_mmu_ops();
}
+static bool xen_nopv = false;
+static __init int xen_parse_nopv(char *arg)
+{
+ xen_nopv = true;
+ return 0;
+}
+early_param("xen_nopv", xen_parse_nopv);
+
static uint32_t __init xen_hvm_platform(void)
{
+ if (xen_nopv)
+ return 0;
+
if (xen_pv_domain())
return 0;
@@ -1832,6 +1849,8 @@ static uint32_t __init xen_hvm_platform(void)
bool xen_hvm_need_lapic(void)
{
+ if (xen_nopv)
+ return false;
if (xen_pv_domain())
return false;
if (!xen_hvm_domain())
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index c98583588580..c0413046483a 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -36,99 +36,83 @@
#include <linux/sched.h>
#include <linux/mm.h>
+#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <xen/interface/xen.h>
#include <xen/page.h>
#include <xen/grant_table.h>
+#include <xen/xen.h>
#include <asm/pgtable.h>
-static int map_pte_fn(pte_t *pte, struct page *pmd_page,
- unsigned long addr, void *data)
+static struct gnttab_vm_area {
+ struct vm_struct *area;
+ pte_t **ptes;
+} gnttab_shared_vm_area;
+
+int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
+ unsigned long max_nr_gframes,
+ void **__shared)
{
- unsigned long **frames = (unsigned long **)data;
+ void *shared = *__shared;
+ unsigned long addr;
+ unsigned long i;
- set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
- (*frames)++;
- return 0;
-}
+ if (shared == NULL)
+ *__shared = shared = gnttab_shared_vm_area.area->addr;
-/*
- * This function is used to map shared frames to store grant status. It is
- * different from map_pte_fn above, the frames type here is uint64_t.
- */
-static int map_pte_fn_status(pte_t *pte, struct page *pmd_page,
- unsigned long addr, void *data)
-{
- uint64_t **frames = (uint64_t **)data;
+ addr = (unsigned long)shared;
+
+ for (i = 0; i < nr_gframes; i++) {
+ set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i],
+ mfn_pte(frames[i], PAGE_KERNEL));
+ addr += PAGE_SIZE;
+ }
- set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
- (*frames)++;
return 0;
}
-static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
- unsigned long addr, void *data)
+void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
{
+ unsigned long addr;
+ unsigned long i;
- set_pte_at(&init_mm, addr, pte, __pte(0));
- return 0;
+ addr = (unsigned long)shared;
+
+ for (i = 0; i < nr_gframes; i++) {
+ set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i],
+ __pte(0));
+ addr += PAGE_SIZE;
+ }
}
-int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
- unsigned long max_nr_gframes,
- void **__shared)
+static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames)
{
- int rc;
- void *shared = *__shared;
+ area->ptes = kmalloc(sizeof(pte_t *) * nr_frames, GFP_KERNEL);
+ if (area->ptes == NULL)
+ return -ENOMEM;
- if (shared == NULL) {
- struct vm_struct *area =
- alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL);
- BUG_ON(area == NULL);
- shared = area->addr;
- *__shared = shared;
+ area->area = alloc_vm_area(PAGE_SIZE * nr_frames, area->ptes);
+ if (area->area == NULL) {
+ kfree(area->ptes);
+ return -ENOMEM;
}
- rc = apply_to_page_range(&init_mm, (unsigned long)shared,
- PAGE_SIZE * nr_gframes,
- map_pte_fn, &frames);
- return rc;
+ return 0;
}
-int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
- unsigned long max_nr_gframes,
- grant_status_t **__shared)
+int arch_gnttab_init(unsigned long nr_shared)
{
- int rc;
- grant_status_t *shared = *__shared;
-
- if (shared == NULL) {
- /* No need to pass in PTE as we are going to do it
- * in apply_to_page_range anyhow. */
- struct vm_struct *area =
- alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL);
- BUG_ON(area == NULL);
- shared = area->addr;
- *__shared = shared;
- }
+ if (!xen_pv_domain())
+ return 0;
- rc = apply_to_page_range(&init_mm, (unsigned long)shared,
- PAGE_SIZE * nr_gframes,
- map_pte_fn_status, &frames);
- return rc;
+ return arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared);
}
-void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
-{
- apply_to_page_range(&init_mm, (unsigned long)shared,
- PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
-}
#ifdef CONFIG_XEN_PVH
#include <xen/balloon.h>
#include <xen/events.h>
-#include <xen/xen.h>
#include <linux/slab.h>
static int __init xlated_setup_gnttab_pages(void)
{
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 86e02eabb640..e8a1201c3293 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1494,7 +1494,7 @@ static int xen_pgd_alloc(struct mm_struct *mm)
page->private = (unsigned long)user_pgd;
if (user_pgd != NULL) {
- user_pgd[pgd_index(VSYSCALL_START)] =
+ user_pgd[pgd_index(VSYSCALL_ADDR)] =
__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
ret = 0;
}
@@ -2062,8 +2062,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
case FIX_KMAP_BEGIN ... FIX_KMAP_END:
# endif
#else
- case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
- case VVAR_PAGE:
+ case VSYSCALL_PAGE:
#endif
case FIX_TEXT_POKE0:
case FIX_TEXT_POKE1:
@@ -2104,8 +2103,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#ifdef CONFIG_X86_64
/* Replicate changes to map the vsyscall page into the user
pagetable vsyscall mapping. */
- if ((idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) ||
- idx == VVAR_PAGE) {
+ if (idx == VSYSCALL_PAGE) {
unsigned long vaddr = __fix_to_virt(idx);
set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
}
@@ -2510,6 +2508,95 @@ void __init xen_hvm_init_mmu_ops(void)
}
#endif
+#ifdef CONFIG_XEN_PVH
+/*
+ * Map foreign gfn (fgfn), to local pfn (lpfn). This for the user
+ * space creating new guest on pvh dom0 and needing to map domU pages.
+ */
+static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn,
+ unsigned int domid)
+{
+ int rc, err = 0;
+ xen_pfn_t gpfn = lpfn;
+ xen_ulong_t idx = fgfn;
+
+ struct xen_add_to_physmap_range xatp = {
+ .domid = DOMID_SELF,
+ .foreign_domid = domid,
+ .size = 1,
+ .space = XENMAPSPACE_gmfn_foreign,
+ };
+ set_xen_guest_handle(xatp.idxs, &idx);
+ set_xen_guest_handle(xatp.gpfns, &gpfn);
+ set_xen_guest_handle(xatp.errs, &err);
+
+ rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
+ if (rc < 0)
+ return rc;
+ return err;
+}
+
+static int xlate_remove_from_p2m(unsigned long spfn, int count)
+{
+ struct xen_remove_from_physmap xrp;
+ int i, rc;
+
+ for (i = 0; i < count; i++) {
+ xrp.domid = DOMID_SELF;
+ xrp.gpfn = spfn+i;
+ rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
+ if (rc)
+ break;
+ }
+ return rc;
+}
+
+struct xlate_remap_data {
+ unsigned long fgfn; /* foreign domain's gfn */
+ pgprot_t prot;
+ domid_t domid;
+ int index;
+ struct page **pages;
+};
+
+static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
+ void *data)
+{
+ int rc;
+ struct xlate_remap_data *remap = data;
+ unsigned long pfn = page_to_pfn(remap->pages[remap->index++]);
+ pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot));
+
+ rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid);
+ if (rc)
+ return rc;
+ native_set_pte(ptep, pteval);
+
+ return 0;
+}
+
+static int xlate_remap_gfn_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long mfn,
+ int nr, pgprot_t prot, unsigned domid,
+ struct page **pages)
+{
+ int err;
+ struct xlate_remap_data pvhdata;
+
+ BUG_ON(!pages);
+
+ pvhdata.fgfn = mfn;
+ pvhdata.prot = prot;
+ pvhdata.domid = domid;
+ pvhdata.index = 0;
+ pvhdata.pages = pages;
+ err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT,
+ xlate_map_pte_fn, &pvhdata);
+ flush_tlb_all();
+ return err;
+}
+#endif
+
#define REMAP_BATCH_SIZE 16
struct remap_data {
@@ -2522,7 +2609,7 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
unsigned long addr, void *data)
{
struct remap_data *rmd = data;
- pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
+ pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot));
rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
rmd->mmu_update->val = pte_val_ma(pte);
@@ -2544,13 +2631,18 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
unsigned long range;
int err = 0;
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return -EINVAL;
-
- prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
-
BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
+#ifdef CONFIG_XEN_PVH
+ /* We need to update the local page tables and the xen HAP */
+ return xlate_remap_gfn_range(vma, addr, mfn, nr, prot,
+ domid, pages);
+#else
+ return -EINVAL;
+#endif
+ }
+
rmd.mfn = mfn;
rmd.prot = prot;
@@ -2588,6 +2680,25 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
return 0;
+#ifdef CONFIG_XEN_PVH
+ while (numpgs--) {
+ /*
+ * The mmu has already cleaned up the process mmu
+ * resources at this point (lookup_address will return
+ * NULL).
+ */
+ unsigned long pfn = page_to_pfn(pages[numpgs]);
+
+ xlate_remove_from_p2m(pfn, 1);
+ }
+ /*
+ * We don't need to flush tlbs because as part of
+ * xlate_remove_from_p2m, the hypervisor will do tlb flushes
+ * after removing the p2m entries from the EPT/NPT
+ */
+ return 0;
+#else
return -EINVAL;
+#endif
}
EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range);
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 85e5d78c9874..3172692381ae 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -36,7 +36,7 @@
* pfn_to_mfn(0xc0000)=0xc0000
*
* The benefit of this is, that we can assume for non-RAM regions (think
- * PCI BARs, or ACPI spaces), we can create mappings easily b/c we
+ * PCI BARs, or ACPI spaces), we can create mappings easily because we
* get the PFN value to match the MFN.
*
* For this to work efficiently we have one new page p2m_identity and
@@ -60,7 +60,7 @@
* There is also a digram of the P2M at the end that can help.
* Imagine your E820 looking as so:
*
- * 1GB 2GB
+ * 1GB 2GB 4GB
* /-------------------+---------\/----\ /----------\ /---+-----\
* | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM |
* \-------------------+---------/\----/ \----------/ \---+-----/
@@ -77,9 +77,8 @@
* of the PFN and the end PFN (263424 and 512256 respectively). The first step
* is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page
* covers 512^2 of page estate (1GB) and in case the start or end PFN is not
- * aligned on 512^2*PAGE_SIZE (1GB) we loop on aligned 1GB PFNs from start pfn
- * to end pfn. We reserve_brk top leaf pages if they are missing (means they
- * point to p2m_mid_missing).
+ * aligned on 512^2*PAGE_SIZE (1GB) we reserve_brk new middle and leaf pages as
+ * required to split any existing p2m_mid_missing middle pages.
*
* With the E820 example above, 263424 is not 1GB aligned so we allocate a
* reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000.
@@ -88,7 +87,7 @@
* Next stage is to determine if we need to do a more granular boundary check
* on the 4MB (or 2MB depending on architecture) off the start and end pfn's.
* We check if the start pfn and end pfn violate that boundary check, and if
- * so reserve_brk a middle (p2m[x][y]) leaf page. This way we have a much finer
+ * so reserve_brk a (p2m[x][y]) leaf page. This way we have a much finer
* granularity of setting which PFNs are missing and which ones are identity.
* In our example 263424 and 512256 both fail the check so we reserve_brk two
* pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing"
@@ -102,9 +101,10 @@
*
* The next step is to walk from the start pfn to the end pfn setting
* the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity.
- * If we find that the middle leaf is pointing to p2m_missing we can swap it
- * over to p2m_identity - this way covering 4MB (or 2MB) PFN space. At this
- * point we do not need to worry about boundary aligment (so no need to
+ * If we find that the middle entry is pointing to p2m_missing we can swap it
+ * over to p2m_identity - this way covering 4MB (or 2MB) PFN space (and
+ * similarly swapping p2m_mid_missing for p2m_mid_identity for larger regions).
+ * At this point we do not need to worry about boundary aligment (so no need to
* reserve_brk a middle page, figure out which PFNs are "missing" and which
* ones are identity), as that has been done earlier. If we find that the
* middle leaf is not occupied by p2m_identity or p2m_missing, we dereference
@@ -118,6 +118,9 @@
* considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511]
* contain the INVALID_P2M_ENTRY value and are considered "missing."
*
+ * Finally, the region beyond the end of of the E820 (4 GB in this example)
+ * is set to be identity (in case there are MMIO regions placed here).
+ *
* This is what the p2m ends up looking (for the E820 above) with this
* fabulous drawing:
*
@@ -129,21 +132,27 @@
* |-----| \ | [p2m_identity]+\\ | .... |
* | 2 |--\ \-------------------->| ... | \\ \----------------/
* |-----| \ \---------------/ \\
- * | 3 |\ \ \\ p2m_identity
- * |-----| \ \-------------------->/---------------\ /-----------------\
- * | .. +->+ | [p2m_identity]+-->| ~0, ~0, ~0, ... |
- * \-----/ / | [p2m_identity]+-->| ..., ~0 |
- * / /---------------\ | .... | \-----------------/
- * / | IDENTITY[@0] | /-+-[x], ~0, ~0.. |
- * / | IDENTITY[@256]|<----/ \---------------/
- * / | ~0, ~0, .... |
- * | \---------------/
- * |
- * p2m_mid_missing p2m_missing
- * /-----------------\ /------------\
- * | [p2m_missing] +---->| ~0, ~0, ~0 |
- * | [p2m_missing] +---->| ..., ~0 |
- * \-----------------/ \------------/
+ * | 3 |-\ \ \\ p2m_identity [1]
+ * |-----| \ \-------------------->/---------------\ /-----------------\
+ * | .. |\ | | [p2m_identity]+-->| ~0, ~0, ~0, ... |
+ * \-----/ | | | [p2m_identity]+-->| ..., ~0 |
+ * | | | .... | \-----------------/
+ * | | +-[x], ~0, ~0.. +\
+ * | | \---------------/ \
+ * | | \-> /---------------\
+ * | V p2m_mid_missing p2m_missing | IDENTITY[@0] |
+ * | /-----------------\ /------------\ | IDENTITY[@256]|
+ * | | [p2m_missing] +---->| ~0, ~0, ...| | ~0, ~0, .... |
+ * | | [p2m_missing] +---->| ..., ~0 | \---------------/
+ * | | ... | \------------/
+ * | \-----------------/
+ * |
+ * | p2m_mid_identity
+ * | /-----------------\
+ * \-->| [p2m_identity] +---->[1]
+ * | [p2m_identity] +---->[1]
+ * | ... |
+ * \-----------------/
*
* where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT)
*/
@@ -187,13 +196,15 @@ static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_identity, P2M_MID_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_identity_mfn, P2M_MID_PER_PAGE);
RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
/* We might hit two boundary violations at the start and end, at max each
* boundary violation will require three middle nodes. */
-RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3);
+RESERVE_BRK(p2m_mid_extra, PAGE_SIZE * 2 * 3);
/* When we populate back during bootup, the amount of pages can vary. The
* max we have is seen is 395979, but that does not mean it can't be more.
@@ -242,20 +253,20 @@ static void p2m_top_mfn_p_init(unsigned long **top)
top[i] = p2m_mid_missing_mfn;
}
-static void p2m_mid_init(unsigned long **mid)
+static void p2m_mid_init(unsigned long **mid, unsigned long *leaf)
{
unsigned i;
for (i = 0; i < P2M_MID_PER_PAGE; i++)
- mid[i] = p2m_missing;
+ mid[i] = leaf;
}
-static void p2m_mid_mfn_init(unsigned long *mid)
+static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf)
{
unsigned i;
for (i = 0; i < P2M_MID_PER_PAGE; i++)
- mid[i] = virt_to_mfn(p2m_missing);
+ mid[i] = virt_to_mfn(leaf);
}
static void p2m_init(unsigned long *p2m)
@@ -286,7 +297,9 @@ void __ref xen_build_mfn_list_list(void)
/* Pre-initialize p2m_top_mfn to be completely missing */
if (p2m_top_mfn == NULL) {
p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_mfn_init(p2m_mid_missing_mfn);
+ p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
+ p2m_mid_identity_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity);
p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_top_mfn_p_init(p2m_top_mfn_p);
@@ -295,7 +308,8 @@ void __ref xen_build_mfn_list_list(void)
p2m_top_mfn_init(p2m_top_mfn);
} else {
/* Reinitialise, mfn's all change after migration */
- p2m_mid_mfn_init(p2m_mid_missing_mfn);
+ p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
+ p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity);
}
for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
@@ -327,7 +341,7 @@ void __ref xen_build_mfn_list_list(void)
* it too late.
*/
mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_mfn_init(mid_mfn_p);
+ p2m_mid_mfn_init(mid_mfn_p, p2m_missing);
p2m_top_mfn_p[topidx] = mid_mfn_p;
}
@@ -365,16 +379,17 @@ void __init xen_build_dynamic_phys_to_machine(void)
p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_init(p2m_missing);
+ p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_init(p2m_identity);
p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_init(p2m_mid_missing);
+ p2m_mid_init(p2m_mid_missing, p2m_missing);
+ p2m_mid_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_init(p2m_mid_identity, p2m_identity);
p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_top_init(p2m_top);
- p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_init(p2m_identity);
-
/*
* The domain builder gives us a pre-constructed p2m array in
* mfn_list for all the pages initially given to us, so we just
@@ -386,7 +401,7 @@ void __init xen_build_dynamic_phys_to_machine(void)
if (p2m_top[topidx] == p2m_mid_missing) {
unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_init(mid);
+ p2m_mid_init(mid, p2m_missing);
p2m_top[topidx] = mid;
}
@@ -492,7 +507,7 @@ unsigned long get_phys_to_machine(unsigned long pfn)
unsigned topidx, mididx, idx;
if (unlikely(pfn >= MAX_P2M_PFN))
- return INVALID_P2M_ENTRY;
+ return IDENTITY_FRAME(pfn);
topidx = p2m_top_index(pfn);
mididx = p2m_mid_index(pfn);
@@ -545,7 +560,7 @@ static bool alloc_p2m(unsigned long pfn)
if (!mid)
return false;
- p2m_mid_init(mid);
+ p2m_mid_init(mid, p2m_missing);
if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
free_p2m_page(mid);
@@ -565,7 +580,7 @@ static bool alloc_p2m(unsigned long pfn)
if (!mid_mfn)
return false;
- p2m_mid_mfn_init(mid_mfn);
+ p2m_mid_mfn_init(mid_mfn, p2m_missing);
missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
mid_mfn_mfn = virt_to_mfn(mid_mfn);
@@ -596,7 +611,7 @@ static bool alloc_p2m(unsigned long pfn)
return true;
}
-static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary)
+static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary)
{
unsigned topidx, mididx, idx;
unsigned long *p2m;
@@ -638,7 +653,7 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary
return true;
}
-static bool __init early_alloc_p2m(unsigned long pfn)
+static bool __init early_alloc_p2m_middle(unsigned long pfn)
{
unsigned topidx = p2m_top_index(pfn);
unsigned long *mid_mfn_p;
@@ -649,7 +664,7 @@ static bool __init early_alloc_p2m(unsigned long pfn)
if (mid == p2m_mid_missing) {
mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_init(mid);
+ p2m_mid_init(mid, p2m_missing);
p2m_top[topidx] = mid;
@@ -658,12 +673,12 @@ static bool __init early_alloc_p2m(unsigned long pfn)
/* And the save/restore P2M tables.. */
if (mid_mfn_p == p2m_mid_missing_mfn) {
mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_mfn_init(mid_mfn_p);
+ p2m_mid_mfn_init(mid_mfn_p, p2m_missing);
p2m_top_mfn_p[topidx] = mid_mfn_p;
p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
/* Note: we don't set mid_mfn_p[midix] here,
- * look in early_alloc_p2m_middle */
+ * look in early_alloc_p2m() */
}
return true;
}
@@ -739,7 +754,7 @@ found:
/* This shouldn't happen */
if (WARN_ON(p2m_top[topidx] == p2m_mid_missing))
- early_alloc_p2m(set_pfn);
+ early_alloc_p2m_middle(set_pfn);
if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing))
return false;
@@ -754,13 +769,13 @@ found:
bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
- if (!early_alloc_p2m(pfn))
+ if (!early_alloc_p2m_middle(pfn))
return false;
if (early_can_reuse_p2m_middle(pfn, mfn))
return __set_phys_to_machine(pfn, mfn);
- if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/))
+ if (!early_alloc_p2m(pfn, false /* boundary crossover OK!*/))
return false;
if (!__set_phys_to_machine(pfn, mfn))
@@ -769,12 +784,30 @@ bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn)
return true;
}
+
+static void __init early_split_p2m(unsigned long pfn)
+{
+ unsigned long mididx, idx;
+
+ mididx = p2m_mid_index(pfn);
+ idx = p2m_index(pfn);
+
+ /*
+ * Allocate new middle and leaf pages if this pfn lies in the
+ * middle of one.
+ */
+ if (mididx || idx)
+ early_alloc_p2m_middle(pfn);
+ if (idx)
+ early_alloc_p2m(pfn, false);
+}
+
unsigned long __init set_phys_range_identity(unsigned long pfn_s,
unsigned long pfn_e)
{
unsigned long pfn;
- if (unlikely(pfn_s >= MAX_P2M_PFN || pfn_e >= MAX_P2M_PFN))
+ if (unlikely(pfn_s >= MAX_P2M_PFN))
return 0;
if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
@@ -783,24 +816,34 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s,
if (pfn_s > pfn_e)
return 0;
- for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1));
- pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE));
- pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE)
- {
- WARN_ON(!early_alloc_p2m(pfn));
- }
+ if (pfn_e > MAX_P2M_PFN)
+ pfn_e = MAX_P2M_PFN;
- early_alloc_p2m_middle(pfn_s, true);
- early_alloc_p2m_middle(pfn_e, true);
+ early_split_p2m(pfn_s);
+ early_split_p2m(pfn_e);
+
+ for (pfn = pfn_s; pfn < pfn_e;) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
- for (pfn = pfn_s; pfn < pfn_e; pfn++)
if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
break;
+ pfn++;
- if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s),
+ /*
+ * If the PFN was set to a middle or leaf identity
+ * page the remainder must also be identity, so skip
+ * ahead to the next middle or leaf entry.
+ */
+ if (p2m_top[topidx] == p2m_mid_identity)
+ pfn = ALIGN(pfn, P2M_MID_PER_PAGE * P2M_PER_PAGE);
+ else if (p2m_top[topidx][mididx] == p2m_identity)
+ pfn = ALIGN(pfn, P2M_PER_PAGE);
+ }
+
+ WARN((pfn - pfn_s) != (pfn_e - pfn_s),
"Identity mapping failed. We are %ld short of 1-1 mappings!\n",
- (pfn_e - pfn_s) - (pfn - pfn_s)))
- printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn);
+ (pfn_e - pfn_s) - (pfn - pfn_s));
return pfn - pfn_s;
}
@@ -825,8 +868,22 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
/* For sparse holes were the p2m leaf has real PFN along with
* PCI holes, stick in the PFN as the MFN value.
+ *
+ * set_phys_range_identity() will have allocated new middle
+ * and leaf pages as required so an existing p2m_mid_missing
+ * or p2m_missing mean that whole range will be identity so
+ * these can be switched to p2m_mid_identity or p2m_identity.
*/
if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) {
+ if (p2m_top[topidx] == p2m_mid_identity)
+ return true;
+
+ if (p2m_top[topidx] == p2m_mid_missing) {
+ WARN_ON(cmpxchg(&p2m_top[topidx], p2m_mid_missing,
+ p2m_mid_identity) != p2m_mid_missing);
+ return true;
+ }
+
if (p2m_top[topidx][mididx] == p2m_identity)
return true;
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 0982233b9b84..2e555163c2fe 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -27,7 +27,6 @@
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
-#include "mmu.h"
#include "xen-ops.h"
#include "vdso.h"
@@ -82,17 +81,14 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
memblock_reserve(start, size);
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return;
-
xen_max_p2m_pfn = PFN_DOWN(start + size);
for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
unsigned long mfn = pfn_to_mfn(pfn);
- if (WARN(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn))
+ if (WARN_ONCE(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn))
continue;
- WARN(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n",
- pfn, mfn);
+ WARN_ONCE(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n",
+ pfn, mfn);
__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
}
@@ -107,7 +103,6 @@ static unsigned long __init xen_do_chunk(unsigned long start,
.domid = DOMID_SELF
};
unsigned long len = 0;
- int xlated_phys = xen_feature(XENFEAT_auto_translated_physmap);
unsigned long pfn;
int ret;
@@ -121,7 +116,7 @@ static unsigned long __init xen_do_chunk(unsigned long start,
continue;
frame = mfn;
} else {
- if (!xlated_phys && mfn != INVALID_P2M_ENTRY)
+ if (mfn != INVALID_P2M_ENTRY)
continue;
frame = pfn;
}
@@ -159,13 +154,6 @@ static unsigned long __init xen_do_chunk(unsigned long start,
static unsigned long __init xen_release_chunk(unsigned long start,
unsigned long end)
{
- /*
- * Xen already ballooned out the E820 non RAM regions for us
- * and set them up properly in EPT.
- */
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return end - start;
-
return xen_do_chunk(start, end, true);
}
@@ -234,13 +222,7 @@ static void __init xen_set_identity_and_release_chunk(
* (except for the ISA region which must be 1:1 mapped) to
* release the refcounts (in Xen) on the original frames.
*/
-
- /*
- * PVH E820 matches the hypervisor's P2M which means we need to
- * account for the proper values of *release and *identity.
- */
- for (pfn = start_pfn; !xen_feature(XENFEAT_auto_translated_physmap) &&
- pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
+ for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
pte_t pte = __pte_ma(0);
if (pfn < PFN_UP(ISA_END_ADDRESS))
@@ -469,6 +451,15 @@ char * __init xen_memory_setup(void)
}
/*
+ * Set the rest as identity mapped, in case PCI BARs are
+ * located here.
+ *
+ * PFNs above MAX_P2M_PFN are considered identity mapped as
+ * well.
+ */
+ set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul);
+
+ /*
* In domU, the ISA region is normal, usable memory, but we
* reserve ISA memory anyway because too many things poke
* about in there.
@@ -509,6 +500,35 @@ char * __init xen_memory_setup(void)
}
/*
+ * Machine specific memory setup for auto-translated guests.
+ */
+char * __init xen_auto_xlated_memory_setup(void)
+{
+ static struct e820entry map[E820MAX] __initdata;
+
+ struct xen_memory_map memmap;
+ int i;
+ int rc;
+
+ memmap.nr_entries = E820MAX;
+ set_xen_guest_handle(memmap.buffer, map);
+
+ rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
+ if (rc < 0)
+ panic("No memory map (%d)\n", rc);
+
+ sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries);
+
+ for (i = 0; i < memmap.nr_entries; i++)
+ e820_add_region(map[i].addr, map[i].size, map[i].type);
+
+ memblock_reserve(__pa(xen_start_info->mfn_list),
+ xen_start_info->pt_base - xen_start_info->mfn_list);
+
+ return "Xen";
+}
+
+/*
* Set the bit indicating "nosegneg" library variants should be used.
* We only need to bother in pure 32-bit mode; compat 32-bit processes
* can have un-truncated segments, so wrapping around is allowed.
@@ -516,10 +536,17 @@ char * __init xen_memory_setup(void)
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
+ /*
+ * This could be called before selected_vdso32 is initialized, so
+ * just fiddle with both possible images. vdso_image_32_syscall
+ * can't be selected, since it only exists on 64-bit systems.
+ */
u32 *mask;
- mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
+ mask = vdso_image_32_int80.data +
+ vdso_image_32_int80.sym_VDSO32_NOTE_MASK;
*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
- mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
+ mask = vdso_image_32_sysenter.data +
+ vdso_image_32_sysenter.sym_VDSO32_NOTE_MASK;
*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}
@@ -574,13 +601,7 @@ void xen_enable_syscall(void)
}
#endif /* CONFIG_X86_64 */
}
-void xen_enable_nmi(void)
-{
-#ifdef CONFIG_X86_64
- if (register_callback(CALLBACKTYPE_nmi, (char *)nmi))
- BUG();
-#endif
-}
+
void __init xen_pvmmu_arch_setup(void)
{
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
@@ -595,7 +616,6 @@ void __init xen_pvmmu_arch_setup(void)
xen_enable_sysenter();
xen_enable_syscall();
- xen_enable_nmi();
}
/* This function is not called for HVM domains */
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 45329c8c226e..c4df9dbd63b7 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -12,8 +12,10 @@
#include "xen-ops.h"
#include "mmu.h"
-void xen_arch_pre_suspend(void)
+static void xen_pv_pre_suspend(void)
{
+ xen_mm_pin_all();
+
xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
xen_start_info->console.domU.mfn =
mfn_to_pfn(xen_start_info->console.domU.mfn);
@@ -26,7 +28,7 @@ void xen_arch_pre_suspend(void)
BUG();
}
-void xen_arch_hvm_post_suspend(int suspend_cancelled)
+static void xen_hvm_post_suspend(int suspend_cancelled)
{
#ifdef CONFIG_XEN_PVHVM
int cpu;
@@ -41,7 +43,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled)
#endif
}
-void xen_arch_post_suspend(int suspend_cancelled)
+static void xen_pv_post_suspend(int suspend_cancelled)
{
xen_build_mfn_list_list();
@@ -60,6 +62,21 @@ void xen_arch_post_suspend(int suspend_cancelled)
xen_vcpu_restore();
}
+ xen_mm_unpin_all();
+}
+
+void xen_arch_pre_suspend(void)
+{
+ if (xen_pv_domain())
+ xen_pv_pre_suspend();
+}
+
+void xen_arch_post_suspend(int cancelled)
+{
+ if (xen_pv_domain())
+ xen_pv_post_suspend(cancelled);
+ else
+ xen_hvm_post_suspend(cancelled);
}
static void xen_vcpu_notify_restore(void *data)
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 1cb6f4c37300..28c7e0be56e4 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -31,9 +31,12 @@ void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
void xen_reserve_top(void);
extern unsigned long xen_max_p2m_pfn;
+void xen_mm_pin_all(void);
+void xen_mm_unpin_all(void);
void xen_set_pat(u64);
char * __init xen_memory_setup(void);
+char * xen_auto_xlated_memory_setup(void);
void __init xen_arch_setup(void);
void xen_enable_sysenter(void);
void xen_enable_syscall(void);
@@ -102,6 +105,14 @@ static inline void __init xen_init_apic(void)
}
#endif
+#ifdef CONFIG_XEN_EFI
+extern void xen_efi_init(void);
+#else
+static inline void __init xen_efi_init(void)
+{
+}
+#endif
+
/* Declare an asm function, along with symbols needed to make it
inlineable */
#define DECL_ASM(ret, name, ...) \