author     David S. Miller <davem@davemloft.net>  2011-01-25 00:17:06 +0300
committer  David S. Miller <davem@davemloft.net>  2011-01-25 00:17:06 +0300
commit     e92427b289d252cfbd4cb5282d92f4ce1a5bb1fb (patch)
tree       6d30e5e7b7f8e9aaa51d43b7128ac56860fa03bb /arch/x86
parent     c506653d35249bb4738bb139c24362e1ae724bc1 (diff)
parent     ec30f343d61391ab23705e50a525da1d55395780 (diff)
download   linux-e92427b289d252cfbd4cb5282d92f4ce1a5bb1fb.tar.xz
Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 43
-rw-r--r--  arch/x86/Kconfig.cpu | 2
-rw-r--r--  arch/x86/Kconfig.debug | 4
-rw-r--r--  arch/x86/boot/compressed/Makefile | 5
-rw-r--r--  arch/x86/boot/compressed/misc.c | 4
-rw-r--r--  arch/x86/boot/compressed/mkpiggy.c | 2
-rw-r--r--  arch/x86/crypto/aesni-intel_asm.S | 1832
-rw-r--r--  arch/x86/crypto/aesni-intel_glue.c | 540
-rw-r--r--  arch/x86/include/asm/acpi.h | 11
-rw-r--r--  arch/x86/include/asm/amd_nb.h | 13
-rw-r--r--  arch/x86/include/asm/boot.h | 6
-rw-r--r--  arch/x86/include/asm/cacheflush.h | 42
-rw-r--r--  arch/x86/include/asm/cpu.h | 1
-rw-r--r--  arch/x86/include/asm/fixmap.h | 4
-rw-r--r--  arch/x86/include/asm/gpio.h | 5
-rw-r--r--  arch/x86/include/asm/hypervisor.h | 12
-rw-r--r--  arch/x86/include/asm/irq.h | 3
-rw-r--r--  arch/x86/include/asm/jump_label.h | 2
-rw-r--r--  arch/x86/include/asm/kdebug.h | 1
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 35
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 100
-rw-r--r--  arch/x86/include/asm/kvm_para.h | 24
-rw-r--r--  arch/x86/include/asm/mach_traps.h | 12
-rw-r--r--  arch/x86/include/asm/nmi.h | 20
-rw-r--r--  arch/x86/include/asm/numa_32.h | 2
-rw-r--r--  arch/x86/include/asm/numa_64.h | 3
-rw-r--r--  arch/x86/include/asm/olpc.h | 10
-rw-r--r--  arch/x86/include/asm/olpc_ofw.h | 9
-rw-r--r--  arch/x86/include/asm/paravirt.h | 25
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 6
-rw-r--r--  arch/x86/include/asm/percpu.h | 8
-rw-r--r--  arch/x86/include/asm/perf_event_p4.h | 3
-rw-r--r--  arch/x86/include/asm/pgalloc.h | 2
-rw-r--r--  arch/x86/include/asm/pgtable-2level.h | 9
-rw-r--r--  arch/x86/include/asm/pgtable-3level.h | 23
-rw-r--r--  arch/x86/include/asm/pgtable.h | 143
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 28
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 3
-rw-r--r--  arch/x86/include/asm/processor.h | 7
-rw-r--r--  arch/x86/include/asm/prom.h | 1
-rw-r--r--  arch/x86/include/asm/svm.h | 57
-rw-r--r--  arch/x86/include/asm/traps.h | 1
-rw-r--r--  arch/x86/include/asm/vmx.h | 15
-rw-r--r--  arch/x86/include/asm/xen/hypervisor.h | 35
-rw-r--r--  arch/x86/include/asm/xen/page.h | 16
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 1
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 4
-rw-r--r--  arch/x86/kernel/amd_nb.c | 7
-rw-r--r--  arch/x86/kernel/apb_timer.c | 14
-rw-r--r--  arch/x86/kernel/aperture_64.c | 44
-rw-r--r--  arch/x86/kernel/apic/apic.c | 7
-rw-r--r--  arch/x86/kernel/apic/hw_nmi.c | 3
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 4
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-inject.c | 5
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 1
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 3
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 28
-rw-r--r--  arch/x86/kernel/dumpstack.c | 7
-rw-r--r--  arch/x86/kernel/e820.c | 1
-rw-r--r--  arch/x86/kernel/entry_32.S | 10
-rw-r--r--  arch/x86/kernel/entry_64.S | 39
-rw-r--r--  arch/x86/kernel/head_32.S | 2
-rw-r--r--  arch/x86/kernel/i387.c | 1
-rw-r--r--  arch/x86/kernel/irq.c | 10
-rw-r--r--  arch/x86/kernel/irq_32.c | 7
-rw-r--r--  arch/x86/kernel/kgdb.c | 7
-rw-r--r--  arch/x86/kernel/kvm.c | 317
-rw-r--r--  arch/x86/kernel/kvmclock.c | 13
-rw-r--r--  arch/x86/kernel/module.c | 17
-rw-r--r--  arch/x86/kernel/paravirt.c | 3
-rw-r--r--  arch/x86/kernel/process.c | 33
-rw-r--r--  arch/x86/kernel/process_32.c | 4
-rw-r--r--  arch/x86/kernel/process_64.c | 6
-rw-r--r--  arch/x86/kernel/reboot.c | 5
-rw-r--r--  arch/x86/kernel/rtc.c | 2
-rw-r--r--  arch/x86/kernel/smpboot.c | 7
-rw-r--r--  arch/x86/kernel/tboot.c | 2
-rw-r--r--  arch/x86/kernel/traps.c | 102
-rw-r--r--  arch/x86/kernel/tsc.c | 6
-rw-r--r--  arch/x86/kernel/vm86_32.c | 1
-rw-r--r--  arch/x86/kvm/Kconfig | 1
-rw-r--r--  arch/x86/kvm/Makefile | 3
-rw-r--r--  arch/x86/kvm/emulate.c | 367
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h | 22
-rw-r--r--  arch/x86/kvm/lapic.c | 3
-rw-r--r--  arch/x86/kvm/mmu.c | 376
-rw-r--r--  arch/x86/kvm/mmu_audit.c | 39
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 156
-rw-r--r--  arch/x86/kvm/svm.c | 865
-rw-r--r--  arch/x86/kvm/trace.h | 17
-rw-r--r--  arch/x86/kvm/vmx.c | 156
-rw-r--r--  arch/x86/kvm/x86.c | 474
-rw-r--r--  arch/x86/lguest/Kconfig | 1
-rw-r--r--  arch/x86/lguest/boot.c | 2
-rw-r--r--  arch/x86/mm/amdtopology_64.c | 86
-rw-r--r--  arch/x86/mm/gup.c | 28
-rw-r--r--  arch/x86/mm/init_32.c | 2
-rw-r--r--  arch/x86/mm/numa.c | 22
-rw-r--r--  arch/x86/mm/numa_64.c | 181
-rw-r--r--  arch/x86/mm/pgtable.c | 66
-rw-r--r--  arch/x86/mm/srat_32.c | 1
-rw-r--r--  arch/x86/mm/srat_64.c | 26
-rw-r--r--  arch/x86/oprofile/nmi_int.c | 3
-rw-r--r--  arch/x86/oprofile/nmi_timer_int.c | 2
-rw-r--r--  arch/x86/pci/amd_bus.c | 33
-rw-r--r--  arch/x86/pci/broadcom_bus.c | 11
-rw-r--r--  arch/x86/pci/common.c | 41
-rw-r--r--  arch/x86/pci/irq.c | 3
-rw-r--r--  arch/x86/platform/mrst/early_printk_mrst.c | 2
-rw-r--r--  arch/x86/platform/olpc/Makefile | 1
-rw-r--r--  arch/x86/platform/olpc/olpc-xo1.c | 101
-rw-r--r--  arch/x86/platform/olpc/olpc_dt.c | 183
-rw-r--r--  arch/x86/platform/olpc/olpc_ofw.c | 5
-rw-r--r--  arch/x86/xen/Makefile | 3
-rw-r--r--  arch/x86/xen/enlighten.c | 42
-rw-r--r--  arch/x86/xen/irq.c | 2
-rw-r--r--  arch/x86/xen/mmu.c | 366
-rw-r--r--  arch/x86/xen/p2m.c | 528
119 files changed, 6276 insertions, 1799 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b6fccb07123e..d5ed94d30aad 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -51,6 +51,7 @@ config X86
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_LZMA
+ select HAVE_KERNEL_XZ
select HAVE_KERNEL_LZO
select HAVE_HW_BREAKPOINT
select HAVE_MIXED_BREAKPOINTS_REGS
@@ -65,6 +66,7 @@ config X86
select HAVE_SPARSE_IRQ
select GENERIC_IRQ_PROBE
select GENERIC_PENDING_IRQ if SMP
+ select USE_GENERIC_SMP_HELPERS if SMP
config INSTRUCTION_DECODER
def_bool (KPROBES || PERF_EVENTS)
@@ -203,10 +205,6 @@ config HAVE_INTEL_TXT
def_bool y
depends on EXPERIMENTAL && DMAR && ACPI
-config USE_GENERIC_SMP_HELPERS
- def_bool y
- depends on SMP
-
config X86_32_SMP
def_bool y
depends on X86_32 && SMP
@@ -629,11 +627,11 @@ config APB_TIMER
as it is off-chip. APB timers are always running regardless of CPU
C states, they are used as per CPU clockevent device when possible.
-# Mark as embedded because too many people got it wrong.
+# Mark as expert because too many people got it wrong.
# The code disables itself when not needed.
config DMI
default y
- bool "Enable DMI scanning" if EMBEDDED
+ bool "Enable DMI scanning" if EXPERT
---help---
Enabled scanning of DMI to identify machine quirks. Say Y
here unless you have verified that your setup is not
@@ -641,7 +639,7 @@ config DMI
BIOS code.
config GART_IOMMU
- bool "GART IOMMU support" if EMBEDDED
+ bool "GART IOMMU support" if EXPERT
default y
select SWIOTLB
depends on X86_64 && PCI && AMD_NB
@@ -891,7 +889,7 @@ config X86_THERMAL_VECTOR
depends on X86_MCE_INTEL
config VM86
- bool "Enable VM86 support" if EMBEDDED
+ bool "Enable VM86 support" if EXPERT
default y
depends on X86_32
---help---
@@ -1075,7 +1073,7 @@ endchoice
choice
depends on EXPERIMENTAL
- prompt "Memory split" if EMBEDDED
+ prompt "Memory split" if EXPERT
default VMSPLIT_3G
depends on X86_32
---help---
@@ -1137,7 +1135,7 @@ config ARCH_DMA_ADDR_T_64BIT
def_bool X86_64 || HIGHMEM64G
config DIRECT_GBPAGES
- bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
+ bool "Enable 1GB pages for kernel pagetables" if EXPERT
default y
depends on X86_64
---help---
@@ -1371,7 +1369,7 @@ config MATH_EMULATION
config MTRR
def_bool y
- prompt "MTRR (Memory Type Range Register) support" if EMBEDDED
+ prompt "MTRR (Memory Type Range Register) support" if EXPERT
---help---
On Intel P6 family processors (Pentium Pro, Pentium II and later)
the Memory Type Range Registers (MTRRs) may be used to control
@@ -1437,7 +1435,7 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
config X86_PAT
def_bool y
- prompt "x86 PAT support" if EMBEDDED
+ prompt "x86 PAT support" if EXPERT
depends on MTRR
---help---
Use PAT attributes to setup page level cache control.
@@ -1541,7 +1539,7 @@ config KEXEC_JUMP
code in physical address mode via KEXEC
config PHYSICAL_START
- hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
+ hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP)
default "0x1000000"
---help---
This gives the physical address where the kernel is loaded.
@@ -1936,13 +1934,19 @@ config PCI_MMCONFIG
depends on X86_64 && PCI && ACPI
config PCI_CNB20LE_QUIRK
- bool "Read CNB20LE Host Bridge Windows"
- depends on PCI
+ bool "Read CNB20LE Host Bridge Windows" if EXPERT
+ default n
+ depends on PCI && EXPERIMENTAL
help
Read the PCI windows out of the CNB20LE host bridge. This allows
PCI hotplug to work on systems with the CNB20LE chipset which do
not have ACPI.
+ There's no public spec for this chipset, and this functionality
+ is known to be incomplete.
+
+ You should say N unless you know you need this.
+
config DMAR
bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
depends on PCI_MSI && ACPI && EXPERIMENTAL
@@ -2064,13 +2068,14 @@ config OLPC
bool "One Laptop Per Child support"
select GPIOLIB
select OLPC_OPENFIRMWARE
+ depends on !X86_64 && !X86_PAE
---help---
Add support for detecting the unique features of the OLPC
XO hardware.
config OLPC_XO1
tristate "OLPC XO-1 support"
- depends on OLPC && PCI
+ depends on OLPC && MFD_CS5535
---help---
Add support for non-essential features of the OLPC XO-1 laptop.
@@ -2078,11 +2083,17 @@ config OLPC_OPENFIRMWARE
bool "Support for OLPC's Open Firmware"
depends on !X86_64 && !X86_PAE
default n
+ select OF
help
This option adds support for the implementation of Open Firmware
that is used on the OLPC XO-1 Children's Machine.
If unsure, say N here.
+config OLPC_OPENFIRMWARE_DT
+ bool
+ default y if OLPC_OPENFIRMWARE && PROC_DEVICETREE
+ select OF_PROMTREE
+
endif # X86_32
config AMD_NB
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 15588a0ef466..283c5a6a03a6 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -424,7 +424,7 @@ config X86_DEBUGCTLMSR
depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) && !UML
menuconfig PROCESSOR_SELECT
- bool "Supported processor vendors" if EMBEDDED
+ bool "Supported processor vendors" if EXPERT
---help---
This lets you choose what x86 vendor support code your kernel
will include.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 45143bbcfe5e..615e18810f48 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -31,7 +31,7 @@ config X86_VERBOSE_BOOTUP
see errors. Disable this if you want silent bootup.
config EARLY_PRINTK
- bool "Early printk" if EMBEDDED
+ bool "Early printk" if EXPERT
default y
---help---
Write kernel log output directly into the VGA buffer or to a serial
@@ -138,7 +138,7 @@ config DEBUG_NX_TEST
config DOUBLEFAULT
default y
- bool "Enable doublefault exception handler" if EMBEDDED
+ bool "Enable doublefault exception handler" if EXPERT
depends on X86_32
---help---
This option allows trapping of rare doublefault exceptions that
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 0c229551eead..09664efb9cee 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,7 +4,7 @@
# create a compressed vmlinux image from the original vmlinux
#
-targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o
+targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.xz vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o
KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
@@ -49,12 +49,15 @@ $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
$(call if_changed,bzip2)
$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
$(call if_changed,lzma)
+$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE
+ $(call if_changed,xzkern)
$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
$(call if_changed,lzo)
suffix-$(CONFIG_KERNEL_GZIP) := gz
suffix-$(CONFIG_KERNEL_BZIP2) := bz2
suffix-$(CONFIG_KERNEL_LZMA) := lzma
+suffix-$(CONFIG_KERNEL_XZ) := xz
suffix-$(CONFIG_KERNEL_LZO) := lzo
quiet_cmd_mkpiggy = MKPIGGY $@
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 325c05294fc4..3a19d04cebeb 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -139,6 +139,10 @@ static int lines, cols;
#include "../../../../lib/decompress_unlzma.c"
#endif
+#ifdef CONFIG_KERNEL_XZ
+#include "../../../../lib/decompress_unxz.c"
+#endif
+
#ifdef CONFIG_KERNEL_LZO
#include "../../../../lib/decompress_unlzo.c"
#endif
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
index 5c228129d175..646aa78ba5fd 100644
--- a/arch/x86/boot/compressed/mkpiggy.c
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -74,7 +74,7 @@ int main(int argc, char *argv[])
offs = (olen > ilen) ? olen - ilen : 0;
offs += olen >> 12; /* Add 8 bytes for each 32K block */
- offs += 32*1024 + 18; /* Add 32K + 18 bytes slack */
+ offs += 64*1024 + 128; /* Add 64K + 128 bytes slack */
offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
printf(".section \".rodata..compressed\",\"a\",@progbits\n");
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index ff16756a51c1..8fe2a4966b7a 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -9,6 +9,20 @@
* Vinodh Gopal <vinodh.gopal@intel.com>
* Kahraman Akdemir
*
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
+ * Aidan O'Mahony (aidan.o.mahony@intel.com)
+ * Adrian Hoban <adrian.hoban@intel.com>
+ * James Guilford (james.guilford@intel.com)
+ * Gabriele Paoloni <gabriele.paoloni@intel.com>
+ * Tadeusz Struk (tadeusz.struk@intel.com)
+ * Wajdi Feghali (wajdi.k.feghali@intel.com)
+ * Copyright (c) 2010, Intel Corporation.
+ *
+ * Ported x86_64 version to x86:
+ * Author: Mathias Krause <minipli@googlemail.com>
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@@ -18,8 +32,62 @@
#include <linux/linkage.h>
#include <asm/inst.h>
+#ifdef __x86_64__
+.data
+POLY: .octa 0xC2000000000000000000000000000001
+TWOONE: .octa 0x00000001000000000000000000000001
+
+# order of these constants should not change.
+# more specifically, ALL_F should follow SHIFT_MASK,
+# and ZERO should follow ALL_F
+
+SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
+MASK1: .octa 0x0000000000000000ffffffffffffffff
+MASK2: .octa 0xffffffffffffffff0000000000000000
+SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
+ALL_F: .octa 0xffffffffffffffffffffffffffffffff
+ZERO: .octa 0x00000000000000000000000000000000
+ONE: .octa 0x00000000000000000000000000000001
+F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
+dec: .octa 0x1
+enc: .octa 0x2
+
+
.text
+
+#define STACK_OFFSET 8*3
+#define HashKey 16*0 // store HashKey <<1 mod poly here
+#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
+#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
+#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
+#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
+ // bits of HashKey <<1 mod poly here
+ //(for Karatsuba purposes)
+#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
+ // bits of HashKey^2 <<1 mod poly here
+ // (for Karatsuba purposes)
+#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
+ // bits of HashKey^3 <<1 mod poly here
+ // (for Karatsuba purposes)
+#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
+ // bits of HashKey^4 <<1 mod poly here
+ // (for Karatsuba purposes)
+#define VARIABLE_OFFSET 16*8
+
+#define arg1 rdi
+#define arg2 rsi
+#define arg3 rdx
+#define arg4 rcx
+#define arg5 r8
+#define arg6 r9
+#define arg7 STACK_OFFSET+8(%r14)
+#define arg8 STACK_OFFSET+16(%r14)
+#define arg9 STACK_OFFSET+24(%r14)
+#define arg10 STACK_OFFSET+32(%r14)
+#endif
+
+
#define STATE1 %xmm0
#define STATE2 %xmm4
#define STATE3 %xmm5
@@ -32,12 +100,16 @@
#define IN IN1
#define KEY %xmm2
#define IV %xmm3
+
#define BSWAP_MASK %xmm10
#define CTR %xmm11
#define INC %xmm12
+#ifdef __x86_64__
+#define AREG %rax
#define KEYP %rdi
#define OUTP %rsi
+#define UKEYP OUTP
#define INP %rdx
#define LEN %rcx
#define IVP %r8
@@ -46,6 +118,1588 @@
#define TKEYP T1
#define T2 %r11
#define TCTR_LOW T2
+#else
+#define AREG %eax
+#define KEYP %edi
+#define OUTP AREG
+#define UKEYP OUTP
+#define INP %edx
+#define LEN %esi
+#define IVP %ebp
+#define KLEN %ebx
+#define T1 %ecx
+#define TKEYP T1
+#endif
+
+
+#ifdef __x86_64__
+/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+*
+*
+* Input: A and B (128-bits each, bit-reflected)
+* Output: C = A*B*x mod poly, (i.e. >>1 )
+* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+*
+*/
+.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
+ movdqa \GH, \TMP1
+ pshufd $78, \GH, \TMP2
+ pshufd $78, \HK, \TMP3
+ pxor \GH, \TMP2 # TMP2 = a1+a0
+ pxor \HK, \TMP3 # TMP3 = b1+b0
+ PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
+ PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
+ PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
+ pxor \GH, \TMP2
+ pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
+ movdqa \TMP2, \TMP3
+ pslldq $8, \TMP3 # left shift TMP3 2 DWs
+ psrldq $8, \TMP2 # right shift TMP2 2 DWs
+ pxor \TMP3, \GH
+ pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK
+
+ # first phase of the reduction
+
+ movdqa \GH, \TMP2
+ movdqa \GH, \TMP3
+ movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
+ # in order to perform
+ # independent shifts
+ pslld $31, \TMP2 # packed right shift <<31
+ pslld $30, \TMP3 # packed right shift <<30
+ pslld $25, \TMP4 # packed right shift <<25
+ pxor \TMP3, \TMP2 # xor the shifted versions
+ pxor \TMP4, \TMP2
+ movdqa \TMP2, \TMP5
+ psrldq $4, \TMP5 # right shift TMP5 1 DW
+ pslldq $12, \TMP2 # left shift TMP2 3 DWs
+ pxor \TMP2, \GH
+
+ # second phase of the reduction
+
+ movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
+ # in order to perform
+ # independent shifts
+ movdqa \GH,\TMP3
+ movdqa \GH,\TMP4
+ psrld $1,\TMP2 # packed left shift >>1
+ psrld $2,\TMP3 # packed left shift >>2
+ psrld $7,\TMP4 # packed left shift >>7
+ pxor \TMP3,\TMP2 # xor the shifted versions
+ pxor \TMP4,\TMP2
+ pxor \TMP5, \TMP2
+ pxor \TMP2, \GH
+ pxor \TMP1, \GH # result is in GH
+.endm
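GHASH_MUL above is the PCLMULQDQ form of the GF(2^128) multiplication GHASH is built on: one Karatsuba carry-less multiply followed by a two-phase fold of the 256-bit product back modulo the polynomial named in the function headers, all on bit-reflected operands with HashKey<<1. For comparison only, here is a bit-at-a-time reference multiply in the byte order of the GCM specification; this is a hedged sketch, gf128_mul is a made-up helper name, and it is not the representation the assembly uses.

#include <stdint.h>
#include <string.h>

/* Reference GF(2^128) multiply per the GCM spec: z = x * y, blocks are 16
 * bytes, bit 0 is the most significant bit of byte 0.  The spec writes the
 * field as x^128 + x^7 + x^2 + x + 1 (R = 0xe1 || 0^120); the comments above
 * write the same field as x^128 + x^127 + x^126 + x^121 + 1 because they use
 * the bit-reflected convention. */
void gf128_mul(const uint8_t x[16], const uint8_t y[16], uint8_t z[16])
{
	uint8_t v[16];
	int i, j, carry;

	memcpy(v, y, 16);
	memset(z, 0, 16);

	for (i = 0; i < 128; i++) {
		/* if bit i of x is set (MSB-first), fold V into Z */
		if (x[i / 8] & (0x80 >> (i % 8)))
			for (j = 0; j < 16; j++)
				z[j] ^= v[j];

		/* V >>= 1; if a bit fell off the low end, reduce by R */
		carry = v[15] & 1;
		for (j = 15; j > 0; j--)
			v[j] = (uint8_t)((v[j] >> 1) | (v[j - 1] << 7));
		v[0] >>= 1;
		if (carry)
			v[0] ^= 0xe1;
	}
}

The "first/second phase of the reduction" in the macro performs the same modular reduction, but on 64-bit lanes of the double-width PCLMULQDQ product instead of one bit per iteration.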
+
+/*
+* if a = number of total plaintext bytes
+* b = floor(a/16)
+* num_initial_blocks = b mod 4
+* encrypt the initial num_initial_blocks blocks and apply ghash on
+* the ciphertext
+* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
+* are clobbered
+* arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
+*/
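As a concrete reading of the block-count arithmetic above (a hedged example with made-up numbers, not code from the patch), the macro peels off b mod 4 whole blocks so that the loop that follows can always run four blocks per iteration:

/* Hypothetical helper restating the comment above: whole 16-byte blocks
 * handled here before the 4-blocks-at-a-time loop takes over. */
unsigned int initial_blocks(unsigned long plaintext_len)
{
	unsigned long b = plaintext_len / 16;	/* b = floor(a/16) */

	return (unsigned int)(b % 4);		/* num_initial_blocks = b mod 4 */
}
/* e.g. plaintext_len = 100: b = 6, num_initial_blocks = 2; two blocks are
 * handled here, four by the parallel loop, and the trailing 100 mod 16 = 4
 * bytes as the final partial block. */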
+
+
+.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
+XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+ mov arg7, %r10 # %r10 = AAD
+ mov arg8, %r12 # %r12 = aadLen
+ mov %r12, %r11
+ pxor %xmm\i, %xmm\i
+_get_AAD_loop\num_initial_blocks\operation:
+ movd (%r10), \TMP1
+ pslldq $12, \TMP1
+ psrldq $4, %xmm\i
+ pxor \TMP1, %xmm\i
+ add $4, %r10
+ sub $4, %r12
+ jne _get_AAD_loop\num_initial_blocks\operation
+ cmp $16, %r11
+ je _get_AAD_loop2_done\num_initial_blocks\operation
+ mov $16, %r12
+_get_AAD_loop2\num_initial_blocks\operation:
+ psrldq $4, %xmm\i
+ sub $4, %r12
+ cmp %r11, %r12
+ jne _get_AAD_loop2\num_initial_blocks\operation
+_get_AAD_loop2_done\num_initial_blocks\operation:
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
+
+ xor %r11, %r11 # initialise the data pointer offset as zero
+
+ # start AES for num_initial_blocks blocks
+
+ mov %arg5, %rax # %rax = *Y0
+ movdqu (%rax), \XMM0 # XMM0 = Y0
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM0
+
+.if (\i == 5) || (\i == 6) || (\i == 7)
+.irpc index, \i_seq
+ paddd ONE(%rip), \XMM0 # INCR Y0
+ movdqa \XMM0, %xmm\index
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
+
+.endr
+.irpc index, \i_seq
+ pxor 16*0(%arg1), %xmm\index
+.endr
+.irpc index, \i_seq
+ movaps 0x10(%rdi), \TMP1
+ AESENC \TMP1, %xmm\index # Round 1
+.endr
+.irpc index, \i_seq
+ movaps 0x20(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 2
+.endr
+.irpc index, \i_seq
+ movaps 0x30(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 3
+.endr
+.irpc index, \i_seq
+ movaps 0x40(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 4
+.endr
+.irpc index, \i_seq
+ movaps 0x50(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 5
+.endr
+.irpc index, \i_seq
+ movaps 0x60(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 6
+.endr
+.irpc index, \i_seq
+ movaps 0x70(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 7
+.endr
+.irpc index, \i_seq
+ movaps 0x80(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 8
+.endr
+.irpc index, \i_seq
+ movaps 0x90(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 9
+.endr
+.irpc index, \i_seq
+ movaps 0xa0(%arg1), \TMP1
+ AESENCLAST \TMP1, %xmm\index # Round 10
+.endr
+.irpc index, \i_seq
+ movdqu (%arg3 , %r11, 1), \TMP1
+ pxor \TMP1, %xmm\index
+ movdqu %xmm\index, (%arg2 , %r11, 1)
+ # write back plaintext/ciphertext for num_initial_blocks
+ add $16, %r11
+
+ movdqa \TMP1, %xmm\index
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, %xmm\index
+
+ # prepare plaintext/ciphertext for GHASH computation
+.endr
+.endif
+ GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ # apply GHASH on num_initial_blocks blocks
+
+.if \i == 5
+ pxor %xmm5, %xmm6
+ GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ pxor %xmm6, %xmm7
+ GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ pxor %xmm7, %xmm8
+ GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 6
+ pxor %xmm6, %xmm7
+ GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ pxor %xmm7, %xmm8
+ GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 7
+ pxor %xmm7, %xmm8
+ GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.endif
+ cmp $64, %r13
+ jl _initial_blocks_done\num_initial_blocks\operation
+ # no need for precomputed values
+/*
+*
+* Precomputations for HashKey parallel with encryption of first 4 blocks.
+* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
+*/
+ paddd ONE(%rip), \XMM0 # INCR Y0
+ movdqa \XMM0, \XMM1
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
+
+ paddd ONE(%rip), \XMM0 # INCR Y0
+ movdqa \XMM0, \XMM2
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
+
+ paddd ONE(%rip), \XMM0 # INCR Y0
+ movdqa \XMM0, \XMM3
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
+
+ paddd ONE(%rip), \XMM0 # INCR Y0
+ movdqa \XMM0, \XMM4
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
+
+ pxor 16*0(%arg1), \XMM1
+ pxor 16*0(%arg1), \XMM2
+ pxor 16*0(%arg1), \XMM3
+ pxor 16*0(%arg1), \XMM4
+ movdqa \TMP3, \TMP5
+ pshufd $78, \TMP3, \TMP1
+ pxor \TMP3, \TMP1
+ movdqa \TMP1, HashKey_k(%rsp)
+ GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^2<<1 (mod poly)
+ movdqa \TMP5, HashKey_2(%rsp)
+# HashKey_2 = HashKey^2<<1 (mod poly)
+ pshufd $78, \TMP5, \TMP1
+ pxor \TMP5, \TMP1
+ movdqa \TMP1, HashKey_2_k(%rsp)
+.irpc index, 1234 # do 4 rounds
+ movaps 0x10*\index(%arg1), \TMP1
+ AESENC \TMP1, \XMM1
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+.endr
+ GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+ movdqa \TMP5, HashKey_3(%rsp)
+ pshufd $78, \TMP5, \TMP1
+ pxor \TMP5, \TMP1
+ movdqa \TMP1, HashKey_3_k(%rsp)
+.irpc index, 56789 # do next 5 rounds
+ movaps 0x10*\index(%arg1), \TMP1
+ AESENC \TMP1, \XMM1
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+.endr
+ GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^4<<1 (mod poly)
+ movdqa \TMP5, HashKey_4(%rsp)
+ pshufd $78, \TMP5, \TMP1
+ pxor \TMP5, \TMP1
+ movdqa \TMP1, HashKey_4_k(%rsp)
+ movaps 0xa0(%arg1), \TMP2
+ AESENCLAST \TMP2, \XMM1
+ AESENCLAST \TMP2, \XMM2
+ AESENCLAST \TMP2, \XMM3
+ AESENCLAST \TMP2, \XMM4
+ movdqu 16*0(%arg3 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM1
+ movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
+ movdqa \TMP1, \XMM1
+ movdqu 16*1(%arg3 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM2
+ movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
+ movdqa \TMP1, \XMM2
+ movdqu 16*2(%arg3 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM3
+ movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
+ movdqa \TMP1, \XMM3
+ movdqu 16*3(%arg3 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM4
+ movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
+ movdqa \TMP1, \XMM4
+ add $64, %r11
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
+ pxor \XMMDst, \XMM1
+# combine GHASHed value with the corresponding ciphertext
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
+
+_initial_blocks_done\num_initial_blocks\operation:
+
+.endm
+
+
+/*
+* if a = number of total plaintext bytes
+* b = floor(a/16)
+* num_initial_blocks = b mod 4
+* encrypt the initial num_initial_blocks blocks and apply ghash on
+* the ciphertext
+* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
+* are clobbered
+* arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
+*/
+
+
+.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
+XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+ mov arg7, %r10 # %r10 = AAD
+ mov arg8, %r12 # %r12 = aadLen
+ mov %r12, %r11
+ pxor %xmm\i, %xmm\i
+_get_AAD_loop\num_initial_blocks\operation:
+ movd (%r10), \TMP1
+ pslldq $12, \TMP1
+ psrldq $4, %xmm\i
+ pxor \TMP1, %xmm\i
+ add $4, %r10
+ sub $4, %r12
+ jne _get_AAD_loop\num_initial_blocks\operation
+ cmp $16, %r11
+ je _get_AAD_loop2_done\num_initial_blocks\operation
+ mov $16, %r12
+_get_AAD_loop2\num_initial_blocks\operation:
+ psrldq $4, %xmm\i
+ sub $4, %r12
+ cmp %r11, %r12
+ jne _get_AAD_loop2\num_initial_blocks\operation
+_get_AAD_loop2_done\num_initial_blocks\operation:
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
+
+ xor %r11, %r11 # initialise the data pointer offset as zero
+
+ # start AES for num_initial_blocks blocks
+
+ mov %arg5, %rax # %rax = *Y0
+ movdqu (%rax), \XMM0 # XMM0 = Y0
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM0
+
+.if (\i == 5) || (\i == 6) || (\i == 7)
+.irpc index, \i_seq
+ paddd ONE(%rip), \XMM0 # INCR Y0
+ movdqa \XMM0, %xmm\index
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
+
+.endr
+.irpc index, \i_seq
+ pxor 16*0(%arg1), %xmm\index
+.endr
+.irpc index, \i_seq
+ movaps 0x10(%rdi), \TMP1
+ AESENC \TMP1, %xmm\index # Round 1
+.endr
+.irpc index, \i_seq
+ movaps 0x20(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 2
+.endr
+.irpc index, \i_seq
+ movaps 0x30(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 3
+.endr
+.irpc index, \i_seq
+ movaps 0x40(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 4
+.endr
+.irpc index, \i_seq
+ movaps 0x50(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 5
+.endr
+.irpc index, \i_seq
+ movaps 0x60(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 6
+.endr
+.irpc index, \i_seq
+ movaps 0x70(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 7
+.endr
+.irpc index, \i_seq
+ movaps 0x80(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 8
+.endr
+.irpc index, \i_seq
+ movaps 0x90(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 9
+.endr
+.irpc index, \i_seq
+ movaps 0xa0(%arg1), \TMP1
+ AESENCLAST \TMP1, %xmm\index # Round 10
+.endr
+.irpc index, \i_seq
+ movdqu (%arg3 , %r11, 1), \TMP1
+ pxor \TMP1, %xmm\index
+ movdqu %xmm\index, (%arg2 , %r11, 1)
+ # write back plaintext/ciphertext for num_initial_blocks
+ add $16, %r11
+
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, %xmm\index
+
+ # prepare plaintext/ciphertext for GHASH computation
+.endr
+.endif
+ GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ # apply GHASH on num_initial_blocks blocks
+
+.if \i == 5
+ pxor %xmm5, %xmm6
+ GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ pxor %xmm6, %xmm7
+ GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ pxor %xmm7, %xmm8
+ GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 6
+ pxor %xmm6, %xmm7
+ GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ pxor %xmm7, %xmm8
+ GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 7
+ pxor %xmm7, %xmm8
+ GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.endif
+ cmp $64, %r13
+ jl _initial_blocks_done\num_initial_blocks\operation
+ # no need for precomputed values
+/*
+*
+* Precomputations for HashKey parallel with encryption of first 4 blocks.
+* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
+*/
+ paddd ONE(%rip), \XMM0 # INCR Y0
+ movdqa \XMM0, \XMM1
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
+
+ paddd ONE(%rip), \XMM0 # INCR Y0
+ movdqa \XMM0, \XMM2
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
+
+ paddd ONE(%rip), \XMM0 # INCR Y0
+ movdqa \XMM0, \XMM3
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
+
+ paddd ONE(%rip), \XMM0 # INCR Y0
+ movdqa \XMM0, \XMM4
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
+
+ pxor 16*0(%arg1), \XMM1
+ pxor 16*0(%arg1), \XMM2
+ pxor 16*0(%arg1), \XMM3
+ pxor 16*0(%arg1), \XMM4
+ movdqa \TMP3, \TMP5
+ pshufd $78, \TMP3, \TMP1
+ pxor \TMP3, \TMP1
+ movdqa \TMP1, HashKey_k(%rsp)
+ GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^2<<1 (mod poly)
+ movdqa \TMP5, HashKey_2(%rsp)
+# HashKey_2 = HashKey^2<<1 (mod poly)
+ pshufd $78, \TMP5, \TMP1
+ pxor \TMP5, \TMP1
+ movdqa \TMP1, HashKey_2_k(%rsp)
+.irpc index, 1234 # do 4 rounds
+ movaps 0x10*\index(%arg1), \TMP1
+ AESENC \TMP1, \XMM1
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+.endr
+ GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+ movdqa \TMP5, HashKey_3(%rsp)
+ pshufd $78, \TMP5, \TMP1
+ pxor \TMP5, \TMP1
+ movdqa \TMP1, HashKey_3_k(%rsp)
+.irpc index, 56789 # do next 5 rounds
+ movaps 0x10*\index(%arg1), \TMP1
+ AESENC \TMP1, \XMM1
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+.endr
+ GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^4<<1 (mod poly)
+ movdqa \TMP5, HashKey_4(%rsp)
+ pshufd $78, \TMP5, \TMP1
+ pxor \TMP5, \TMP1
+ movdqa \TMP1, HashKey_4_k(%rsp)
+ movaps 0xa0(%arg1), \TMP2
+ AESENCLAST \TMP2, \XMM1
+ AESENCLAST \TMP2, \XMM2
+ AESENCLAST \TMP2, \XMM3
+ AESENCLAST \TMP2, \XMM4
+ movdqu 16*0(%arg3 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM1
+ movdqu 16*1(%arg3 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM2
+ movdqu 16*2(%arg3 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM3
+ movdqu 16*3(%arg3 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM4
+ movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
+ movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
+ movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
+ movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
+
+ add $64, %r11
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
+ pxor \XMMDst, \XMM1
+# combine GHASHed value with the corresponding ciphertext
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
+
+_initial_blocks_done\num_initial_blocks\operation:
+
+.endm
+
+/*
+* encrypt 4 blocks at a time
+* ghash the 4 previously encrypted ciphertext blocks
+* arg1, %arg2, %arg3 are used as pointers only, not modified
+* %r11 is the data offset value
+*/
+.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
+TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
+
+ movdqa \XMM1, \XMM5
+ movdqa \XMM2, \XMM6
+ movdqa \XMM3, \XMM7
+ movdqa \XMM4, \XMM8
+
+ movdqa SHUF_MASK(%rip), %xmm15
+ # multiply TMP5 * HashKey using karatsuba
+
+ movdqa \XMM5, \TMP4
+ pshufd $78, \XMM5, \TMP6
+ pxor \XMM5, \TMP6
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa HashKey_4(%rsp), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
+ movdqa \XMM0, \XMM1
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM2
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM3
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM4
+ PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
+ PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
+ PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+
+ pxor (%arg1), \XMM1
+ pxor (%arg1), \XMM2
+ pxor (%arg1), \XMM3
+ pxor (%arg1), \XMM4
+ movdqa HashKey_4_k(%rsp), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
+ movaps 0x10(%arg1), \TMP1
+ AESENC \TMP1, \XMM1 # Round 1
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+ movaps 0x20(%arg1), \TMP1
+ AESENC \TMP1, \XMM1 # Round 2
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+ movdqa \XMM6, \TMP1
+ pshufd $78, \XMM6, \TMP2
+ pxor \XMM6, \TMP2
+ movdqa HashKey_3(%rsp), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
+ movaps 0x30(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 3
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
+ movaps 0x40(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 4
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ movdqa HashKey_3_k(%rsp), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movaps 0x50(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 5
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ pxor \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+ pxor \XMM6, \XMM5
+ pxor \TMP2, \TMP6
+ movdqa \XMM7, \TMP1
+ pshufd $78, \XMM7, \TMP2
+ pxor \XMM7, \TMP2
+ movdqa HashKey_2(%rsp ), \TMP5
+
+ # Multiply TMP5 * HashKey using karatsuba
+
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ movaps 0x60(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 6
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
+ movaps 0x70(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 7
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ movdqa HashKey_2_k(%rsp), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movaps 0x80(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 8
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ pxor \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+ pxor \XMM7, \XMM5
+ pxor \TMP2, \TMP6
+
+ # Multiply XMM8 * HashKey
+ # XMM8 and TMP5 hold the values for the two operands
+
+ movdqa \XMM8, \TMP1
+ pshufd $78, \XMM8, \TMP2
+ pxor \XMM8, \TMP2
+ movdqa HashKey(%rsp), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ movaps 0x90(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 9
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
+ movaps 0xa0(%arg1), \TMP3
+ AESENCLAST \TMP3, \XMM1 # Round 10
+ AESENCLAST \TMP3, \XMM2
+ AESENCLAST \TMP3, \XMM3
+ AESENCLAST \TMP3, \XMM4
+ movdqa HashKey_k(%rsp), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movdqu (%arg3,%r11,1), \TMP3
+ pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
+ movdqu 16(%arg3,%r11,1), \TMP3
+ pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
+ movdqu 32(%arg3,%r11,1), \TMP3
+ pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
+ movdqu 48(%arg3,%r11,1), \TMP3
+ pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
+ movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
+ movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
+ movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
+ movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
+ PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+
+ pxor \TMP4, \TMP1
+ pxor \XMM8, \XMM5
+ pxor \TMP6, \TMP2
+ pxor \TMP1, \TMP2
+ pxor \XMM5, \TMP2
+ movdqa \TMP2, \TMP3
+ pslldq $8, \TMP3 # left shift TMP3 2 DWs
+ psrldq $8, \TMP2 # right shift TMP2 2 DWs
+ pxor \TMP3, \XMM5
+ pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
+
+ # first phase of reduction
+
+ movdqa \XMM5, \TMP2
+ movdqa \XMM5, \TMP3
+ movdqa \XMM5, \TMP4
+# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
+ pslld $31, \TMP2 # packed right shift << 31
+ pslld $30, \TMP3 # packed right shift << 30
+ pslld $25, \TMP4 # packed right shift << 25
+ pxor \TMP3, \TMP2 # xor the shifted versions
+ pxor \TMP4, \TMP2
+ movdqa \TMP2, \TMP5
+ psrldq $4, \TMP5 # right shift T5 1 DW
+ pslldq $12, \TMP2 # left shift T2 3 DWs
+ pxor \TMP2, \XMM5
+
+ # second phase of reduction
+
+ movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
+ movdqa \XMM5,\TMP3
+ movdqa \XMM5,\TMP4
+ psrld $1, \TMP2 # packed left shift >>1
+ psrld $2, \TMP3 # packed left shift >>2
+ psrld $7, \TMP4 # packed left shift >>7
+ pxor \TMP3,\TMP2 # xor the shifted versions
+ pxor \TMP4,\TMP2
+ pxor \TMP5, \TMP2
+ pxor \TMP2, \XMM5
+ pxor \TMP1, \XMM5 # result is in XMM5
+
+ pxor \XMM5, \XMM1
+.endm
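The a1*b1, a0*b0 and (a1+a0)*(b1+b0) products tracked in the comments of this macro (and of its decrypt twin below) are the three halves of a carry-less Karatsuba multiplication. Writing each 128-bit operand as two 64-bit halves, with every addition being XOR, the identity being used is:

  A = a_1 x^{64} \oplus a_0, \qquad B = b_1 x^{64} \oplus b_0
  A \cdot B = a_1 b_1\, x^{128} \;\oplus\; \bigl[(a_1 \oplus a_0)(b_1 \oplus b_0) \oplus a_1 b_1 \oplus a_0 b_0\bigr] x^{64} \;\oplus\; a_0 b_0

so each 128x128 multiply costs three PCLMULQDQs instead of four, and the bracketed term is the "middle part" the code accumulates before the shift-and-fold reduction.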
+
+/*
+* decrypt 4 blocks at a time
+* ghash the 4 previously decrypted ciphertext blocks
+* arg1, %arg2, %arg3 are used as pointers only, not modified
+* %r11 is the data offset value
+*/
+.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
+TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
+
+ movdqa \XMM1, \XMM5
+ movdqa \XMM2, \XMM6
+ movdqa \XMM3, \XMM7
+ movdqa \XMM4, \XMM8
+
+ movdqa SHUF_MASK(%rip), %xmm15
+ # multiply TMP5 * HashKey using karatsuba
+
+ movdqa \XMM5, \TMP4
+ pshufd $78, \XMM5, \TMP6
+ pxor \XMM5, \TMP6
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa HashKey_4(%rsp), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
+ movdqa \XMM0, \XMM1
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM2
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM3
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM4
+ PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
+ PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
+ PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+
+ pxor (%arg1), \XMM1
+ pxor (%arg1), \XMM2
+ pxor (%arg1), \XMM3
+ pxor (%arg1), \XMM4
+ movdqa HashKey_4_k(%rsp), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
+ movaps 0x10(%arg1), \TMP1
+ AESENC \TMP1, \XMM1 # Round 1
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+ movaps 0x20(%arg1), \TMP1
+ AESENC \TMP1, \XMM1 # Round 2
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+ movdqa \XMM6, \TMP1
+ pshufd $78, \XMM6, \TMP2
+ pxor \XMM6, \TMP2
+ movdqa HashKey_3(%rsp), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
+ movaps 0x30(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 3
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
+ movaps 0x40(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 4
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ movdqa HashKey_3_k(%rsp), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movaps 0x50(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 5
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ pxor \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+ pxor \XMM6, \XMM5
+ pxor \TMP2, \TMP6
+ movdqa \XMM7, \TMP1
+ pshufd $78, \XMM7, \TMP2
+ pxor \XMM7, \TMP2
+ movdqa HashKey_2(%rsp ), \TMP5
+
+ # Multiply TMP5 * HashKey using karatsuba
+
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ movaps 0x60(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 6
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
+ movaps 0x70(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 7
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ movdqa HashKey_2_k(%rsp), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movaps 0x80(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 8
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ pxor \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+ pxor \XMM7, \XMM5
+ pxor \TMP2, \TMP6
+
+ # Multiply XMM8 * HashKey
+ # XMM8 and TMP5 hold the values for the two operands
+
+ movdqa \XMM8, \TMP1
+ pshufd $78, \XMM8, \TMP2
+ pxor \XMM8, \TMP2
+ movdqa HashKey(%rsp), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ movaps 0x90(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 9
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
+ movaps 0xa0(%arg1), \TMP3
+ AESENCLAST \TMP3, \XMM1 # Round 10
+ AESENCLAST \TMP3, \XMM2
+ AESENCLAST \TMP3, \XMM3
+ AESENCLAST \TMP3, \XMM4
+ movdqa HashKey_k(%rsp), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movdqu (%arg3,%r11,1), \TMP3
+ pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
+ movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
+ movdqa \TMP3, \XMM1
+ movdqu 16(%arg3,%r11,1), \TMP3
+ pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
+ movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
+ movdqa \TMP3, \XMM2
+ movdqu 32(%arg3,%r11,1), \TMP3
+ pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
+ movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
+ movdqa \TMP3, \XMM3
+ movdqu 48(%arg3,%r11,1), \TMP3
+ pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
+ movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
+ movdqa \TMP3, \XMM4
+ PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+
+ pxor \TMP4, \TMP1
+ pxor \XMM8, \XMM5
+ pxor \TMP6, \TMP2
+ pxor \TMP1, \TMP2
+ pxor \XMM5, \TMP2
+ movdqa \TMP2, \TMP3
+ pslldq $8, \TMP3 # left shift TMP3 2 DWs
+ psrldq $8, \TMP2 # right shift TMP2 2 DWs
+ pxor \TMP3, \XMM5
+ pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
+
+ # first phase of reduction
+
+ movdqa \XMM5, \TMP2
+ movdqa \XMM5, \TMP3
+ movdqa \XMM5, \TMP4
+# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
+ pslld $31, \TMP2 # packed right shift << 31
+ pslld $30, \TMP3 # packed right shift << 30
+ pslld $25, \TMP4 # packed right shift << 25
+ pxor \TMP3, \TMP2 # xor the shifted versions
+ pxor \TMP4, \TMP2
+ movdqa \TMP2, \TMP5
+ psrldq $4, \TMP5 # right shift T5 1 DW
+ pslldq $12, \TMP2 # left shift T2 3 DWs
+ pxor \TMP2, \XMM5
+
+ # second phase of reduction
+
+ movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
+ movdqa \XMM5,\TMP3
+ movdqa \XMM5,\TMP4
+ psrld $1, \TMP2 # packed left shift >>1
+ psrld $2, \TMP3 # packed left shift >>2
+ psrld $7, \TMP4 # packed left shift >>7
+ pxor \TMP3,\TMP2 # xor the shifted versions
+ pxor \TMP4,\TMP2
+ pxor \TMP5, \TMP2
+ pxor \TMP2, \XMM5
+ pxor \TMP1, \XMM5 # result is in XMM5
+
+ pxor \XMM5, \XMM1
+.endm
+
+/* GHASH the last 4 ciphertext blocks. */
+.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
+TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
+
+ # Multiply TMP6 * HashKey (using Karatsuba)
+
+ movdqa \XMM1, \TMP6
+ pshufd $78, \XMM1, \TMP2
+ pxor \XMM1, \TMP2
+ movdqa HashKey_4(%rsp), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
+ PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
+ movdqa HashKey_4_k(%rsp), \TMP4
+ PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movdqa \XMM1, \XMMDst
+ movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
+
+ # Multiply TMP1 * HashKey (using Karatsuba)
+
+ movdqa \XMM2, \TMP1
+ pshufd $78, \XMM2, \TMP2
+ pxor \XMM2, \TMP2
+ movdqa HashKey_3(%rsp), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
+ movdqa HashKey_3_k(%rsp), \TMP4
+ PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pxor \TMP1, \TMP6
+ pxor \XMM2, \XMMDst
+ pxor \TMP2, \XMM1
+# results accumulated in TMP6, XMMDst, XMM1
+
+ # Multiply TMP1 * HashKey (using Karatsuba)
+
+ movdqa \XMM3, \TMP1
+ pshufd $78, \XMM3, \TMP2
+ pxor \XMM3, \TMP2
+ movdqa HashKey_2(%rsp), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
+ movdqa HashKey_2_k(%rsp), \TMP4
+ PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pxor \TMP1, \TMP6
+ pxor \XMM3, \XMMDst
+ pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
+
+ # Multiply TMP1 * HashKey (using Karatsuba)
+ movdqa \XMM4, \TMP1
+ pshufd $78, \XMM4, \TMP2
+ pxor \XMM4, \TMP2
+ movdqa HashKey(%rsp), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
+ movdqa HashKey_k(%rsp), \TMP4
+ PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pxor \TMP1, \TMP6
+ pxor \XMM4, \XMMDst
+ pxor \XMM1, \TMP2
+ pxor \TMP6, \TMP2
+ pxor \XMMDst, \TMP2
+ # middle section of the temp results combined as in karatsuba algorithm
+ movdqa \TMP2, \TMP4
+ pslldq $8, \TMP4 # left shift TMP4 2 DWs
+ psrldq $8, \TMP2 # right shift TMP2 2 DWs
+ pxor \TMP4, \XMMDst
+ pxor \TMP2, \TMP6
+# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
+ # first phase of the reduction
+ movdqa \XMMDst, \TMP2
+ movdqa \XMMDst, \TMP3
+ movdqa \XMMDst, \TMP4
+# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
+ pslld $31, \TMP2 # packed right shifting << 31
+ pslld $30, \TMP3 # packed right shifting << 30
+ pslld $25, \TMP4 # packed right shifting << 25
+ pxor \TMP3, \TMP2 # xor the shifted versions
+ pxor \TMP4, \TMP2
+ movdqa \TMP2, \TMP7
+ psrldq $4, \TMP7 # right shift TMP7 1 DW
+ pslldq $12, \TMP2 # left shift TMP2 3 DWs
+ pxor \TMP2, \XMMDst
+
+ # second phase of the reduction
+ movdqa \XMMDst, \TMP2
+ # make 3 copies of XMMDst for doing 3 shift operations
+ movdqa \XMMDst, \TMP3
+ movdqa \XMMDst, \TMP4
+ psrld $1, \TMP2 # packed left shift >> 1
+ psrld $2, \TMP3 # packed left shift >> 2
+ psrld $7, \TMP4 # packed left shift >> 7
+ pxor \TMP3, \TMP2 # xor the shifted versions
+ pxor \TMP4, \TMP2
+ pxor \TMP7, \TMP2
+ pxor \TMP2, \XMMDst
+ pxor \TMP6, \XMMDst # reduced result is in XMMDst
+.endm
+
+/* Encryption of a single block. */
+.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
+
+ pxor (%arg1), \XMM0
+ movaps 16(%arg1), \TMP1
+ AESENC \TMP1, \XMM0
+ movaps 32(%arg1), \TMP1
+ AESENC \TMP1, \XMM0
+ movaps 48(%arg1), \TMP1
+ AESENC \TMP1, \XMM0
+ movaps 64(%arg1), \TMP1
+ AESENC \TMP1, \XMM0
+ movaps 80(%arg1), \TMP1
+ AESENC \TMP1, \XMM0
+ movaps 96(%arg1), \TMP1
+ AESENC \TMP1, \XMM0
+ movaps 112(%arg1), \TMP1
+ AESENC \TMP1, \XMM0
+ movaps 128(%arg1), \TMP1
+ AESENC \TMP1, \XMM0
+ movaps 144(%arg1), \TMP1
+ AESENC \TMP1, \XMM0
+ movaps 160(%arg1), \TMP1
+ AESENCLAST \TMP1, \XMM0
+.endm
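ENCRYPT_SINGLE_BLOCK is a plain AES-128 encryption of one block, walking the eleven pre-expanded round keys that sit at consecutive 16-byte offsets from %arg1. For orientation only, the same computation expressed with the AES-NI C intrinsics would look roughly like the sketch below; it assumes a key schedule already expanded into eleven aligned __m128i values and a compiler invoked with -maes, and it is illustrative rather than the driver's code.

#include <wmmintrin.h>   /* AES-NI intrinsics */

/* Sketch mirroring ENCRYPT_SINGLE_BLOCK: rk points at 11 pre-expanded,
 * 16-byte-aligned round keys (round 0 .. round 10). */
static __m128i aes128_encrypt_block(const __m128i *rk, __m128i block)
{
	int round;

	block = _mm_xor_si128(block, rk[0]);          /* initial AddRoundKey */
	for (round = 1; round < 10; round++)
		block = _mm_aesenc_si128(block, rk[round]);
	return _mm_aesenclast_si128(block, rk[10]);   /* final round */
}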
+
+
+/*****************************************************************************
+* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
+* u8 *out, // Plaintext output. Encrypt in-place is allowed.
+* const u8 *in, // Ciphertext input
+* u64 plaintext_len, // Length of data in bytes for decryption.
+* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
+* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+* // concatenated with 0x00000001. 16-byte aligned pointer.
+* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
+* const u8 *aad, // Additional Authentication Data (AAD)
+* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
+* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
+* // given authentication tag and only return the plaintext if they match.
+* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
+* // (most likely), 12 or 8.
+*
+* Assumptions:
+*
+* keys:
+* keys are pre-expanded and aligned to 16 bytes. we are using the first
+* set of 11 keys in the data structure void *aes_ctx
+*
+* iv:
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | Salt (From the SA) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | Initialization Vector |
+* | (This is the sequence number from IPSec header) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x1 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+* AAD padded to 128 bits with 0
+* for example, assume AAD is a u32 vector
+*
+* if AAD is 8 bytes:
+* AAD[3] = {A0, A1};
+* padded AAD in xmm register = {A1 A0 0 0}
+*
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | SPI (A1) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 32-bit Sequence Number (A0) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x0 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+* AAD Format with 32-bit Sequence Number
+*
+* if AAD is 12 bytes:
+* AAD[3] = {A0, A1, A2};
+* padded AAD in xmm register = {A2 A1 A0 0}
+*
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | SPI (A2) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 64-bit Extended Sequence Number {A1,A0} |
+* | |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x0 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+* AAD Format with 64-bit Extended Sequence Number
+*
+* aadLen:
+* from the definition of the spec, aadLen can only be 8 or 12 bytes.
+* The code supports 16 too but for other sizes, the code will fail.
+*
+* TLen:
+* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+* For other sizes, the code will fail.
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+*
+*****************************************************************************/
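The prototype above expects the caller to have prepared both the pre-counter block and the hash subkey. A small hedged sketch of the RFC4106 counter-block layout the comment describes follows; the function name and buffer handling are invented for illustration, not taken from the glue code.

#include <stdint.h>
#include <string.h>

/* Hypothetical helper: build the 16-byte pre-counter block j0 described
 * above: 4-byte salt || 8-byte per-packet IV || 0x00000001 (big-endian).
 * The buffer should be 16-byte aligned, as the comment requires. */
static void rfc4106_build_j0(uint8_t j0[16],
			     const uint8_t salt[4], const uint8_t iv[8])
{
	memcpy(j0, salt, 4);        /* salt from the Security Association   */
	memcpy(j0 + 4, iv, 8);      /* IV / sequence number from ESP payload */
	j0[12] = 0;                 /* 32-bit block counter, starts at 1    */
	j0[13] = 0;
	j0[14] = 0;
	j0[15] = 1;
}

The hash subkey itself is, per the GCM definition, the block cipher applied to the all-zero block under the session key, typically computed once when the key is installed.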
+
+ENTRY(aesni_gcm_dec)
+ push %r12
+ push %r13
+ push %r14
+ mov %rsp, %r14
+/*
+* states of %xmm registers %xmm6:%xmm15 not saved
+* all %xmm registers are clobbered
+*/
+ sub $VARIABLE_OFFSET, %rsp
+ and $~63, %rsp # align rsp to 64 bytes
+ mov %arg6, %r12
+ movdqu (%r12), %xmm13 # %xmm13 = HashKey
+ movdqa SHUF_MASK(%rip), %xmm2
+ PSHUFB_XMM %xmm2, %xmm13
+
+
+# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
+
+ movdqa %xmm13, %xmm2
+ psllq $1, %xmm13
+ psrlq $63, %xmm2
+ movdqa %xmm2, %xmm1
+ pslldq $8, %xmm2
+ psrldq $8, %xmm1
+ por %xmm2, %xmm13
+
+ # Reduction
+
+ pshufd $0x24, %xmm1, %xmm2
+ pcmpeqd TWOONE(%rip), %xmm2
+ pand POLY(%rip), %xmm2
+ pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
+
+
+ # Decrypt first few blocks
+
+ movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
+ mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
+ and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
+ mov %r13, %r12
+ and $(3<<4), %r12
+ jz _initial_num_blocks_is_0_decrypt
+ cmp $(2<<4), %r12
+ jb _initial_num_blocks_is_1_decrypt
+ je _initial_num_blocks_is_2_decrypt
+_initial_num_blocks_is_3_decrypt:
+ INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
+ sub $48, %r13
+ jmp _initial_blocks_decrypted
+_initial_num_blocks_is_2_decrypt:
+ INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
+ sub $32, %r13
+ jmp _initial_blocks_decrypted
+_initial_num_blocks_is_1_decrypt:
+ INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
+ sub $16, %r13
+ jmp _initial_blocks_decrypted
+_initial_num_blocks_is_0_decrypt:
+ INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
+_initial_blocks_decrypted:
+ cmp $0, %r13
+ je _zero_cipher_left_decrypt
+ sub $64, %r13
+ je _four_cipher_left_decrypt
+_decrypt_by_4:
+ GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
+%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
+ add $64, %r11
+ sub $64, %r13
+ jne _decrypt_by_4
+_four_cipher_left_decrypt:
+ GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
+%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_decrypt:
+ mov %arg4, %r13
+ and $15, %r13 # %r13 = arg4 (mod 16)
+ je _multiple_of_16_bytes_decrypt
+
+ # Handle the last <16 byte block separately
+
+ paddd ONE(%rip), %xmm0 # increment CNT to get Yn
+ movdqa SHUF_MASK(%rip), %xmm10
+ PSHUFB_XMM %xmm10, %xmm0
+
+ ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
+ sub $16, %r11
+ add %r13, %r11
+ movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
+ lea SHIFT_MASK+16(%rip), %r12
+ sub %r13, %r12
+# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
+# (%r13 is the number of bytes in plaintext mod 16)
+ movdqu (%r12), %xmm2 # get the appropriate shuffle mask
+ PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
+
+ movdqa %xmm1, %xmm2
+ pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
+ movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+ # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
+ pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
+ pand %xmm1, %xmm2
+ movdqa SHUF_MASK(%rip), %xmm10
+ PSHUFB_XMM %xmm10 ,%xmm2
+
+ pxor %xmm2, %xmm8
+ GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+ # GHASH computation for the last <16 byte block
+ sub %r13, %r11
+ add $16, %r11
+
+ # output %r13 bytes
+ MOVQ_R64_XMM %xmm0, %rax
+ cmp $8, %r13
+ jle _less_than_8_bytes_left_decrypt
+ mov %rax, (%arg2 , %r11, 1)
+ add $8, %r11
+ psrldq $8, %xmm0
+ MOVQ_R64_XMM %xmm0, %rax
+ sub $8, %r13
+_less_than_8_bytes_left_decrypt:
+ mov %al, (%arg2, %r11, 1)
+ add $1, %r11
+ shr $8, %rax
+ sub $1, %r13
+ jne _less_than_8_bytes_left_decrypt
+_multiple_of_16_bytes_decrypt:
+ mov arg8, %r12 # %r12 = aadLen (number of bytes)
+ shl $3, %r12 # convert into number of bits
+ movd %r12d, %xmm15 # len(A) in %xmm15
+ shl $3, %arg4 # len(C) in bits (*128)
+ MOVQ_R64_XMM %arg4, %xmm1
+ pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
+ pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
+ pxor %xmm15, %xmm8
+ GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+ # final GHASH computation
+ movdqa SHUF_MASK(%rip), %xmm10
+ PSHUFB_XMM %xmm10, %xmm8
+
+ mov %arg5, %rax # %rax = *Y0
+ movdqu (%rax), %xmm0 # %xmm0 = Y0
+ ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
+ pxor %xmm8, %xmm0
+_return_T_decrypt:
+ mov arg9, %r10 # %r10 = authTag
+ mov arg10, %r11 # %r11 = auth_tag_len
+ cmp $16, %r11
+ je _T_16_decrypt
+ cmp $12, %r11
+ je _T_12_decrypt
+_T_8_decrypt:
+ MOVQ_R64_XMM %xmm0, %rax
+ mov %rax, (%r10)
+ jmp _return_T_done_decrypt
+_T_12_decrypt:
+ MOVQ_R64_XMM %xmm0, %rax
+ mov %rax, (%r10)
+ psrldq $8, %xmm0
+ movd %xmm0, %eax
+ mov %eax, 8(%r10)
+ jmp _return_T_done_decrypt
+_T_16_decrypt:
+ movdqu %xmm0, (%r10)
+_return_T_done_decrypt:
+ mov %r14, %rsp
+ pop %r14
+ pop %r13
+ pop %r12
+ ret
+
+
+/*****************************************************************************
+* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
+* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
+* const u8 *in, // Plaintext input
+* u64 plaintext_len, // Length of data in bytes for encryption.
+* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
+* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+* // concatenated with 0x00000001. 16-byte aligned pointer.
+* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
+* const u8 *aad, // Additional Authentication Data (AAD)
+* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
+* u8 *auth_tag, // Authenticated Tag output.
+* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
+* // 12 or 8.
+*
+* Assumptions:
+*
+* keys:
+* keys are pre-expanded and aligned to 16 bytes. we are using the
+* first set of 11 keys in the data structure void *aes_ctx
+*
+*
+* iv:
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | Salt (From the SA) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | Initialization Vector |
+* | (This is the sequence number from IPSec header) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x1 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+* AAD padded to 128 bits with 0
+* for example, assume AAD is a u32 vector
+*
+* if AAD is 8 bytes:
+* AAD[3] = {A0, A1};
+* padded AAD in xmm register = {A1 A0 0 0}
+*
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | SPI (A1) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 32-bit Sequence Number (A0) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x0 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+* AAD Format with 32-bit Sequence Number
+*
+* if AAD is 12 bytes:
+* AAD[3] = {A0, A1, A2};
+* padded AAD in xmm register = {A2 A1 A0 0}
+*
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | SPI (A2) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 64-bit Extended Sequence Number {A1,A0} |
+* | |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x0 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+* AAD Format with 64-bit Extended Sequence Number
+*
+* aadLen:
+* from the definition of the spec, aadLen can only be 8 or 12 bytes.
+* The code also supports 16 bytes, but it will fail for any other size.
+*
+* TLen:
+* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+* For other sizes, the code will fail.
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+***************************************************************************/
+ENTRY(aesni_gcm_enc)
+ push %r12
+ push %r13
+ push %r14
+ mov %rsp, %r14
+#
+# states of %xmm registers %xmm6:%xmm15 not saved
+# all %xmm registers are clobbered
+#
+ sub $VARIABLE_OFFSET, %rsp
+ and $~63, %rsp
+ mov %arg6, %r12
+ movdqu (%r12), %xmm13
+ movdqa SHUF_MASK(%rip), %xmm2
+ PSHUFB_XMM %xmm2, %xmm13
+
+
+# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
+
+ movdqa %xmm13, %xmm2
+ psllq $1, %xmm13
+ psrlq $63, %xmm2
+ movdqa %xmm2, %xmm1
+ pslldq $8, %xmm2
+ psrldq $8, %xmm1
+ por %xmm2, %xmm13
+
+ # reduce HashKey<<1
+
+ pshufd $0x24, %xmm1, %xmm2
+ pcmpeqd TWOONE(%rip), %xmm2
+ pand POLY(%rip), %xmm2
+ pxor %xmm2, %xmm13
+ movdqa %xmm13, HashKey(%rsp)
+ mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod poly)
+ and $-16, %r13
+ mov %r13, %r12
+
+ # Encrypt first few blocks
+
+ and $(3<<4), %r12
+ jz _initial_num_blocks_is_0_encrypt
+ cmp $(2<<4), %r12
+ jb _initial_num_blocks_is_1_encrypt
+ je _initial_num_blocks_is_2_encrypt
+_initial_num_blocks_is_3_encrypt:
+ INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
+ sub $48, %r13
+ jmp _initial_blocks_encrypted
+_initial_num_blocks_is_2_encrypt:
+ INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
+ sub $32, %r13
+ jmp _initial_blocks_encrypted
+_initial_num_blocks_is_1_encrypt:
+ INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
+ sub $16, %r13
+ jmp _initial_blocks_encrypted
+_initial_num_blocks_is_0_encrypt:
+ INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
+_initial_blocks_encrypted:
+
+ # Main loop - Encrypt remaining blocks
+
+ cmp $0, %r13
+ je _zero_cipher_left_encrypt
+ sub $64, %r13
+ je _four_cipher_left_encrypt
+_encrypt_by_4_encrypt:
+ GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
+%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
+ add $64, %r11
+ sub $64, %r13
+ jne _encrypt_by_4_encrypt
+_four_cipher_left_encrypt:
+ GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
+%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_encrypt:
+ mov %arg4, %r13
+ and $15, %r13 # %r13 = arg4 (mod 16)
+ je _multiple_of_16_bytes_encrypt
+
+ # Handle the last <16 Byte block separately
+ paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
+ movdqa SHUF_MASK(%rip), %xmm10
+ PSHUFB_XMM %xmm10, %xmm0
+
+ ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
+ sub $16, %r11
+ add %r13, %r11
+ movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
+ lea SHIFT_MASK+16(%rip), %r12
+ sub %r13, %r12
+ # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+ # (%r13 is the number of bytes in plaintext mod 16)
+ movdqu (%r12), %xmm2 # get the appropriate shuffle mask
+ PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 bytes
+ pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
+ movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+ # get the appropriate mask to mask out top 16-r13 bytes of xmm0
+ pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
+ movdqa SHUF_MASK(%rip), %xmm10
+ PSHUFB_XMM %xmm10,%xmm0
+
+ pxor %xmm0, %xmm8
+ GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+ # GHASH computation for the last <16 byte block
+ sub %r13, %r11
+ add $16, %r11
+ PSHUFB_XMM %xmm10, %xmm1
+
+ # shuffle xmm0 back to output as ciphertext
+
+ # Output %r13 bytes
+ MOVQ_R64_XMM %xmm0, %rax
+ cmp $8, %r13
+ jle _less_than_8_bytes_left_encrypt
+ mov %rax, (%arg2 , %r11, 1)
+ add $8, %r11
+ psrldq $8, %xmm0
+ MOVQ_R64_XMM %xmm0, %rax
+ sub $8, %r13
+_less_than_8_bytes_left_encrypt:
+ mov %al, (%arg2, %r11, 1)
+ add $1, %r11
+ shr $8, %rax
+ sub $1, %r13
+ jne _less_than_8_bytes_left_encrypt
+_multiple_of_16_bytes_encrypt:
+ mov arg8, %r12 # %r12 = aadLen (number of bytes)
+ shl $3, %r12
+ movd %r12d, %xmm15 # len(A) in %xmm15
+ shl $3, %arg4 # len(C) in bits (*8)
+ MOVQ_R64_XMM %arg4, %xmm1
+ pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
+ pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
+ pxor %xmm15, %xmm8
+ GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+ # final GHASH computation
+ movdqa SHUF_MASK(%rip), %xmm10
+ PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
+
+ mov %arg5, %rax # %rax = *Y0
+ movdqu (%rax), %xmm0 # %xmm0 = Y0
+ ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
+ pxor %xmm8, %xmm0
+_return_T_encrypt:
+ mov arg9, %r10 # %r10 = authTag
+ mov arg10, %r11 # %r11 = auth_tag_len
+ cmp $16, %r11
+ je _T_16_encrypt
+ cmp $12, %r11
+ je _T_12_encrypt
+_T_8_encrypt:
+ MOVQ_R64_XMM %xmm0, %rax
+ mov %rax, (%r10)
+ jmp _return_T_done_encrypt
+_T_12_encrypt:
+ MOVQ_R64_XMM %xmm0, %rax
+ mov %rax, (%r10)
+ psrldq $8, %xmm0
+ movd %xmm0, %eax
+ mov %eax, 8(%r10)
+ jmp _return_T_done_encrypt
+_T_16_encrypt:
+ movdqu %xmm0, (%r10)
+_return_T_done_encrypt:
+ mov %r14, %rsp
+ pop %r14
+ pop %r13
+ pop %r12
+ ret
+
+#endif
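
For reference, the pre-counter block j0 documented above is exactly what the C glue code later in this patch assembles before calling these routines. A minimal sketch of that layout in C (the helper name is illustrative only, not part of the patch):

#include <linux/types.h>
#include <linux/string.h>
#include <asm/byteorder.h>

/*
 * Illustrative sketch: build the 16-byte pre-counter block j0 as laid out
 * above -- 4-byte salt from the Security Association, 8-byte per-packet IV
 * from the ESP payload, then the big-endian constant 0x00000001.
 */
static void rfc4106_build_j0(u8 *j0, const u8 salt[4], const u8 iv[8])
{
	__be32 counter = cpu_to_be32(1);

	memcpy(j0, salt, 4);		/* bytes  0..3:  salt */
	memcpy(j0 + 4, iv, 8);		/* bytes  4..11: IV (sequence number) */
	memcpy(j0 + 12, &counter, 4);	/* bytes 12..15: 0x00000001 */
}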
+
_key_expansion_128:
_key_expansion_256a:
@@ -55,10 +1709,11 @@ _key_expansion_256a:
shufps $0b10001100, %xmm0, %xmm4
pxor %xmm4, %xmm0
pxor %xmm1, %xmm0
- movaps %xmm0, (%rcx)
- add $0x10, %rcx
+ movaps %xmm0, (TKEYP)
+ add $0x10, TKEYP
ret
+.align 4
_key_expansion_192a:
pshufd $0b01010101, %xmm1, %xmm1
shufps $0b00010000, %xmm0, %xmm4
@@ -76,12 +1731,13 @@ _key_expansion_192a:
movaps %xmm0, %xmm1
shufps $0b01000100, %xmm0, %xmm6
- movaps %xmm6, (%rcx)
+ movaps %xmm6, (TKEYP)
shufps $0b01001110, %xmm2, %xmm1
- movaps %xmm1, 16(%rcx)
- add $0x20, %rcx
+ movaps %xmm1, 0x10(TKEYP)
+ add $0x20, TKEYP
ret
+.align 4
_key_expansion_192b:
pshufd $0b01010101, %xmm1, %xmm1
shufps $0b00010000, %xmm0, %xmm4
@@ -96,10 +1752,11 @@ _key_expansion_192b:
pxor %xmm3, %xmm2
pxor %xmm5, %xmm2
- movaps %xmm0, (%rcx)
- add $0x10, %rcx
+ movaps %xmm0, (TKEYP)
+ add $0x10, TKEYP
ret
+.align 4
_key_expansion_256b:
pshufd $0b10101010, %xmm1, %xmm1
shufps $0b00010000, %xmm2, %xmm4
@@ -107,8 +1764,8 @@ _key_expansion_256b:
shufps $0b10001100, %xmm2, %xmm4
pxor %xmm4, %xmm2
pxor %xmm1, %xmm2
- movaps %xmm2, (%rcx)
- add $0x10, %rcx
+ movaps %xmm2, (TKEYP)
+ add $0x10, TKEYP
ret
/*
@@ -116,17 +1773,23 @@ _key_expansion_256b:
* unsigned int key_len)
*/
ENTRY(aesni_set_key)
- movups (%rsi), %xmm0 # user key (first 16 bytes)
- movaps %xmm0, (%rdi)
- lea 0x10(%rdi), %rcx # key addr
- movl %edx, 480(%rdi)
+#ifndef __x86_64__
+ pushl KEYP
+ movl 8(%esp), KEYP # ctx
+ movl 12(%esp), UKEYP # in_key
+ movl 16(%esp), %edx # key_len
+#endif
+ movups (UKEYP), %xmm0 # user key (first 16 bytes)
+ movaps %xmm0, (KEYP)
+ lea 0x10(KEYP), TKEYP # key addr
+ movl %edx, 480(KEYP)
pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
cmp $24, %dl
jb .Lenc_key128
je .Lenc_key192
- movups 0x10(%rsi), %xmm2 # other user key
- movaps %xmm2, (%rcx)
- add $0x10, %rcx
+ movups 0x10(UKEYP), %xmm2 # other user key
+ movaps %xmm2, (TKEYP)
+ add $0x10, TKEYP
AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
call _key_expansion_256a
AESKEYGENASSIST 0x1 %xmm0 %xmm1
@@ -155,7 +1818,7 @@ ENTRY(aesni_set_key)
call _key_expansion_256a
jmp .Ldec_key
.Lenc_key192:
- movq 0x10(%rsi), %xmm2 # other user key
+ movq 0x10(UKEYP), %xmm2 # other user key
AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
call _key_expansion_192a
AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
@@ -195,33 +1858,47 @@ ENTRY(aesni_set_key)
AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
call _key_expansion_128
.Ldec_key:
- sub $0x10, %rcx
- movaps (%rdi), %xmm0
- movaps (%rcx), %xmm1
- movaps %xmm0, 240(%rcx)
- movaps %xmm1, 240(%rdi)
- add $0x10, %rdi
- lea 240-16(%rcx), %rsi
+ sub $0x10, TKEYP
+ movaps (KEYP), %xmm0
+ movaps (TKEYP), %xmm1
+ movaps %xmm0, 240(TKEYP)
+ movaps %xmm1, 240(KEYP)
+ add $0x10, KEYP
+ lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
- movaps (%rdi), %xmm0
+ movaps (KEYP), %xmm0
AESIMC %xmm0 %xmm1
- movaps %xmm1, (%rsi)
- add $0x10, %rdi
- sub $0x10, %rsi
- cmp %rcx, %rdi
+ movaps %xmm1, (UKEYP)
+ add $0x10, KEYP
+ sub $0x10, UKEYP
+ cmp TKEYP, KEYP
jb .Ldec_key_loop
- xor %rax, %rax
+ xor AREG, AREG
+#ifndef __x86_64__
+ popl KEYP
+#endif
ret
/*
* void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
*/
ENTRY(aesni_enc)
+#ifndef __x86_64__
+ pushl KEYP
+ pushl KLEN
+ movl 12(%esp), KEYP
+ movl 16(%esp), OUTP
+ movl 20(%esp), INP
+#endif
movl 480(KEYP), KLEN # key length
movups (INP), STATE # input
call _aesni_enc1
movups STATE, (OUTP) # output
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+#endif
ret
/*
@@ -236,6 +1913,7 @@ ENTRY(aesni_enc)
* KEY
* TKEYP (T1)
*/
+.align 4
_aesni_enc1:
movaps (KEYP), KEY # key
mov KEYP, TKEYP
@@ -298,6 +1976,7 @@ _aesni_enc1:
* KEY
* TKEYP (T1)
*/
+.align 4
_aesni_enc4:
movaps (KEYP), KEY # key
mov KEYP, TKEYP
@@ -391,11 +2070,22 @@ _aesni_enc4:
* void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
*/
ENTRY(aesni_dec)
+#ifndef __x86_64__
+ pushl KEYP
+ pushl KLEN
+ movl 12(%esp), KEYP
+ movl 16(%esp), OUTP
+ movl 20(%esp), INP
+#endif
mov 480(KEYP), KLEN # key length
add $240, KEYP
movups (INP), STATE # input
call _aesni_dec1
movups STATE, (OUTP) #output
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+#endif
ret
/*
@@ -410,6 +2100,7 @@ ENTRY(aesni_dec)
* KEY
* TKEYP (T1)
*/
+.align 4
_aesni_dec1:
movaps (KEYP), KEY # key
mov KEYP, TKEYP
@@ -472,6 +2163,7 @@ _aesni_dec1:
* KEY
* TKEYP (T1)
*/
+.align 4
_aesni_dec4:
movaps (KEYP), KEY # key
mov KEYP, TKEYP
@@ -566,6 +2258,15 @@ _aesni_dec4:
* size_t len)
*/
ENTRY(aesni_ecb_enc)
+#ifndef __x86_64__
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl 16(%esp), KEYP
+ movl 20(%esp), OUTP
+ movl 24(%esp), INP
+ movl 28(%esp), LEN
+#endif
test LEN, LEN # check length
jz .Lecb_enc_ret
mov 480(KEYP), KLEN
@@ -602,6 +2303,11 @@ ENTRY(aesni_ecb_enc)
cmp $16, LEN
jge .Lecb_enc_loop1
.Lecb_enc_ret:
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+#endif
ret
/*
@@ -609,6 +2315,15 @@ ENTRY(aesni_ecb_enc)
* size_t len);
*/
ENTRY(aesni_ecb_dec)
+#ifndef __x86_64__
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl 16(%esp), KEYP
+ movl 20(%esp), OUTP
+ movl 24(%esp), INP
+ movl 28(%esp), LEN
+#endif
test LEN, LEN
jz .Lecb_dec_ret
mov 480(KEYP), KLEN
@@ -646,6 +2361,11 @@ ENTRY(aesni_ecb_dec)
cmp $16, LEN
jge .Lecb_dec_loop1
.Lecb_dec_ret:
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+#endif
ret
/*
@@ -653,6 +2373,17 @@ ENTRY(aesni_ecb_dec)
* size_t len, u8 *iv)
*/
ENTRY(aesni_cbc_enc)
+#ifndef __x86_64__
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl 20(%esp), KEYP
+ movl 24(%esp), OUTP
+ movl 28(%esp), INP
+ movl 32(%esp), LEN
+ movl 36(%esp), IVP
+#endif
cmp $16, LEN
jb .Lcbc_enc_ret
mov 480(KEYP), KLEN
@@ -670,6 +2401,12 @@ ENTRY(aesni_cbc_enc)
jge .Lcbc_enc_loop
movups STATE, (IVP)
.Lcbc_enc_ret:
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+ popl IVP
+#endif
ret
/*
@@ -677,6 +2414,17 @@ ENTRY(aesni_cbc_enc)
* size_t len, u8 *iv)
*/
ENTRY(aesni_cbc_dec)
+#ifndef __x86_64__
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl 20(%esp), KEYP
+ movl 24(%esp), OUTP
+ movl 28(%esp), INP
+ movl 32(%esp), LEN
+ movl 36(%esp), IVP
+#endif
cmp $16, LEN
jb .Lcbc_dec_just_ret
mov 480(KEYP), KLEN
@@ -690,16 +2438,30 @@ ENTRY(aesni_cbc_dec)
movaps IN1, STATE1
movups 0x10(INP), IN2
movaps IN2, STATE2
+#ifdef __x86_64__
movups 0x20(INP), IN3
movaps IN3, STATE3
movups 0x30(INP), IN4
movaps IN4, STATE4
+#else
+ movups 0x20(INP), IN1
+ movaps IN1, STATE3
+ movups 0x30(INP), IN2
+ movaps IN2, STATE4
+#endif
call _aesni_dec4
pxor IV, STATE1
+#ifdef __x86_64__
pxor IN1, STATE2
pxor IN2, STATE3
pxor IN3, STATE4
movaps IN4, IV
+#else
+ pxor (INP), STATE2
+ pxor 0x10(INP), STATE3
+ pxor IN1, STATE4
+ movaps IN2, IV
+#endif
movups STATE1, (OUTP)
movups STATE2, 0x10(OUTP)
movups STATE3, 0x20(OUTP)
@@ -727,8 +2489,15 @@ ENTRY(aesni_cbc_dec)
.Lcbc_dec_ret:
movups IV, (IVP)
.Lcbc_dec_just_ret:
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+ popl IVP
+#endif
ret
+#ifdef __x86_64__
.align 16
.Lbswap_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
@@ -744,6 +2513,7 @@ ENTRY(aesni_cbc_dec)
* INC: == 1, in little endian
* BSWAP_MASK == endian swapping mask
*/
+.align 4
_aesni_inc_init:
movaps .Lbswap_mask, BSWAP_MASK
movaps IV, CTR
@@ -768,6 +2538,7 @@ _aesni_inc_init:
* CTR: == output IV, in little endian
* TCTR_LOW: == lower qword of CTR
*/
+.align 4
_aesni_inc:
paddq INC, CTR
add $1, TCTR_LOW
@@ -839,3 +2610,4 @@ ENTRY(aesni_ctr_enc)
movups IV, (IVP)
.Lctr_enc_just_ret:
ret
+#endif
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2cb3dcc4490a..e1e60c7d5813 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -5,6 +5,14 @@
* Copyright (C) 2008, Intel Corp.
* Author: Huang Ying <ying.huang@intel.com>
*
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ * Authors: Adrian Hoban <adrian.hoban@intel.com>
+ * Gabriele Paoloni <gabriele.paoloni@intel.com>
+ * Tadeusz Struk (tadeusz.struk@intel.com)
+ * Aidan O'Mahony (aidan.o.mahony@intel.com)
+ * Copyright (c) 2010, Intel Corporation.
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@@ -21,6 +29,10 @@
#include <crypto/ctr.h>
#include <asm/i387.h>
#include <asm/aes.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/internal/aead.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
#define HAS_CTR
@@ -42,8 +54,31 @@ struct async_aes_ctx {
struct cryptd_ablkcipher *cryptd_tfm;
};
-#define AESNI_ALIGN 16
+/* This data is stored at the end of the crypto_tfm struct.
+ * It is per "session" data and needs to be 16-byte aligned.
+ */
+struct aesni_rfc4106_gcm_ctx {
+ u8 hash_subkey[16];
+ struct crypto_aes_ctx aes_key_expanded;
+ u8 nonce[4];
+ struct cryptd_aead *cryptd_tfm;
+};
+
+struct aesni_gcm_set_hash_subkey_result {
+ int err;
+ struct completion completion;
+};
+
+struct aesni_hash_subkey_req_data {
+ u8 iv[16];
+ struct aesni_gcm_set_hash_subkey_result result;
+ struct scatterlist sg;
+};
+
+#define AESNI_ALIGN (16)
#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1))
+#define RFC4106_HASH_SUBKEY_SIZE 16
asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
unsigned int key_len);
@@ -59,9 +94,62 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
+#ifdef CONFIG_X86_64
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
+/* asmlinkage void aesni_gcm_enc()
+ * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
+ * u8 *out, Ciphertext output. Encrypt in-place is allowed.
+ * const u8 *in, Plaintext input
+ * unsigned long plaintext_len, Length of data in bytes for encryption.
+ * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
+ * concatenated with 8 byte Initialisation Vector (from IPSec ESP
+ * Payload) concatenated with 0x00000001. 16-byte aligned pointer.
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ * const u8 *aad, Additional Authentication Data (AAD)
+ * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this
+ * is going to be 8 or 12 bytes
+ * u8 *auth_tag, Authenticated Tag output.
+ * unsigned long auth_tag_len), Authenticated Tag Length in bytes.
+ * Valid values are 16 (most likely), 12 or 8.
+ */
+asmlinkage void aesni_gcm_enc(void *ctx, u8 *out,
+ const u8 *in, unsigned long plaintext_len, u8 *iv,
+ u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+/* asmlinkage void aesni_gcm_dec()
+ * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
+ * u8 *out, Plaintext output. Decrypt in-place is allowed.
+ * const u8 *in, Ciphertext input
+ * unsigned long ciphertext_len, Length of data in bytes for decryption.
+ * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
+ * concatenated with 8 byte Initialisation Vector (from IPSec ESP
+ * Payload) concatenated with 0x00000001. 16-byte aligned pointer.
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ * const u8 *aad, Additional Authentication Data (AAD)
+ * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going
+ * to be 8 or 12 bytes
+ * u8 *auth_tag, Authenticated Tag output.
+ * unsigned long auth_tag_len) Authenticated Tag Length in bytes.
+ * Valid values are 16 (most likely), 12 or 8.
+ */
+asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
+ const u8 *in, unsigned long ciphertext_len, u8 *iv,
+ u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+static inline struct
+aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
+{
+ return
+ (struct aesni_rfc4106_gcm_ctx *)
+ PTR_ALIGN((u8 *)
+ crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN);
+}
+#endif
+
static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
{
unsigned long addr = (unsigned long)raw_ctx;
@@ -324,6 +412,7 @@ static struct crypto_alg blk_cbc_alg = {
},
};
+#ifdef CONFIG_X86_64
static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
struct blkcipher_walk *walk)
{
@@ -389,6 +478,7 @@ static struct crypto_alg blk_ctr_alg = {
},
},
};
+#endif
static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
unsigned int key_len)
@@ -536,6 +626,7 @@ static struct crypto_alg ablk_cbc_alg = {
},
};
+#ifdef CONFIG_X86_64
static int ablk_ctr_init(struct crypto_tfm *tfm)
{
struct cryptd_ablkcipher *cryptd_tfm;
@@ -612,6 +703,7 @@ static struct crypto_alg ablk_rfc3686_ctr_alg = {
},
};
#endif
+#endif
#ifdef HAS_LRW
static int ablk_lrw_init(struct crypto_tfm *tfm)
@@ -730,6 +822,424 @@ static struct crypto_alg ablk_xts_alg = {
};
#endif
+#ifdef CONFIG_X86_64
+static int rfc4106_init(struct crypto_tfm *tfm)
+{
+ struct cryptd_aead *cryptd_tfm;
+ struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *)
+ PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
+ cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+ ctx->cryptd_tfm = cryptd_tfm;
+ tfm->crt_aead.reqsize = sizeof(struct aead_request)
+ + crypto_aead_reqsize(&cryptd_tfm->base);
+ return 0;
+}
+
+static void rfc4106_exit(struct crypto_tfm *tfm)
+{
+ struct aesni_rfc4106_gcm_ctx *ctx =
+ (struct aesni_rfc4106_gcm_ctx *)
+ PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
+ if (!IS_ERR(ctx->cryptd_tfm))
+ cryptd_free_aead(ctx->cryptd_tfm);
+ return;
+}
+
+static void
+rfc4106_set_hash_subkey_done(struct crypto_async_request *req, int err)
+{
+ struct aesni_gcm_set_hash_subkey_result *result = req->data;
+
+ if (err == -EINPROGRESS)
+ return;
+ result->err = err;
+ complete(&result->completion);
+}
+
+static int
+rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
+{
+ struct crypto_ablkcipher *ctr_tfm;
+ struct ablkcipher_request *req;
+ int ret = -EINVAL;
+ struct aesni_hash_subkey_req_data *req_data;
+
+ ctr_tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0);
+ if (IS_ERR(ctr_tfm))
+ return PTR_ERR(ctr_tfm);
+
+ crypto_ablkcipher_clear_flags(ctr_tfm, ~0);
+
+ ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len);
+ if (ret) {
+ crypto_free_ablkcipher(ctr_tfm);
+ return ret;
+ }
+
+ req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL);
+ if (!req) {
+ crypto_free_ablkcipher(ctr_tfm);
+ return -EINVAL;
+ }
+
+ req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
+ if (!req_data) {
+ crypto_free_ablkcipher(ctr_tfm);
+ return -ENOMEM;
+ }
+ memset(req_data->iv, 0, sizeof(req_data->iv));
+
+ /* Clear the data in the hash sub key container to zero.*/
+ /* We want to cipher all zeros to create the hash sub key. */
+ memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
+
+ init_completion(&req_data->result.completion);
+ sg_init_one(&req_data->sg, hash_subkey, RFC4106_HASH_SUBKEY_SIZE);
+ ablkcipher_request_set_tfm(req, ctr_tfm);
+ ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
+ CRYPTO_TFM_REQ_MAY_BACKLOG,
+ rfc4106_set_hash_subkey_done,
+ &req_data->result);
+
+ ablkcipher_request_set_crypt(req, &req_data->sg,
+ &req_data->sg, RFC4106_HASH_SUBKEY_SIZE, req_data->iv);
+
+ ret = crypto_ablkcipher_encrypt(req);
+ if (ret == -EINPROGRESS || ret == -EBUSY) {
+ ret = wait_for_completion_interruptible
+ (&req_data->result.completion);
+ if (!ret)
+ ret = req_data->result.err;
+ }
+ ablkcipher_request_free(req);
+ kfree(req_data);
+ crypto_free_ablkcipher(ctr_tfm);
+ return ret;
+}
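
Conceptually, the hash subkey produced here is H = E_K(0^128): running ctr(aes) with an all-zero counter block over an all-zero 16-byte buffer yields exactly one AES encryption of the zero block. A rough equivalent using the synchronous single-block cipher API, shown only as an illustration of what is being computed (the driver itself keeps the asynchronous ctr(aes) path above):

#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/string.h>

/* Sketch: derive H = AES_K(0^128) with a synchronous cipher handle. */
static int example_compute_hash_subkey(u8 *hash_subkey, const u8 *key,
					unsigned int key_len)
{
	struct crypto_cipher *tfm;
	int ret;

	tfm = crypto_alloc_cipher("aes", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	ret = crypto_cipher_setkey(tfm, key, key_len);
	if (!ret) {
		memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
		/* encrypt the all-zero block in place to obtain H */
		crypto_cipher_encrypt_one(tfm, hash_subkey, hash_subkey);
	}
	crypto_free_cipher(tfm);
	return ret;
}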
+
+static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
+ unsigned int key_len)
+{
+ int ret = 0;
+ struct crypto_tfm *tfm = crypto_aead_tfm(parent);
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
+ u8 *new_key_mem = NULL;
+
+ if (key_len < 4) {
+ crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+ /*Account for 4 byte nonce at the end.*/
+ key_len -= 4;
+ if (key_len != AES_KEYSIZE_128) {
+ crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+
+ memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
+ /*This must be on a 16 byte boundary!*/
+ if ((unsigned long)(&(ctx->aes_key_expanded.key_enc[0])) % AESNI_ALIGN)
+ return -EINVAL;
+
+ if ((unsigned long)key % AESNI_ALIGN) {
+ /* key is not aligned: use an auxiliary aligned buffer */
+ new_key_mem = kmalloc(key_len+AESNI_ALIGN, GFP_KERNEL);
+ if (!new_key_mem)
+ return -ENOMEM;
+
+ new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
+ memcpy(new_key_mem, key, key_len);
+ key = new_key_mem;
+ }
+
+ if (!irq_fpu_usable())
+ ret = crypto_aes_expand_key(&(ctx->aes_key_expanded),
+ key, key_len);
+ else {
+ kernel_fpu_begin();
+ ret = aesni_set_key(&(ctx->aes_key_expanded), key, key_len);
+ kernel_fpu_end();
+ }
+ /*This must be on a 16 byte boundary!*/
+ if ((unsigned long)(&(ctx->hash_subkey[0])) % AESNI_ALIGN) {
+ ret = -EINVAL;
+ goto exit;
+ }
+ ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
+exit:
+ kfree(new_key_mem);
+ return ret;
+}
+
+/* This is the Integrity Check Value (aka the authentication tag) length; it
+ * can be 8, 12 or 16 bytes long. */
+static int rfc4106_set_authsize(struct crypto_aead *parent,
+ unsigned int authsize)
+{
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
+ struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+ switch (authsize) {
+ case 8:
+ case 12:
+ case 16:
+ break;
+ default:
+ return -EINVAL;
+ }
+ crypto_aead_crt(parent)->authsize = authsize;
+ crypto_aead_crt(cryptd_child)->authsize = authsize;
+ return 0;
+}
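
A hypothetical caller of the new "rfc4106(gcm(aes))" algorithm hands in a 20-byte key blob -- the 16-byte AES-128 key followed by the 4-byte nonce that rfc4106_set_key() strips off -- and picks one of the tag sizes accepted above. A minimal allocation sketch, with names chosen here purely for illustration:

#include <linux/crypto.h>
#include <linux/err.h>

static struct crypto_aead *example_alloc_rfc4106(const u8 key_and_salt[20])
{
	struct crypto_aead *tfm;
	int err;

	tfm = crypto_alloc_aead("rfc4106(gcm(aes))", 0, 0);
	if (IS_ERR(tfm))
		return tfm;

	/* 16-byte AES key plus 4-byte nonce, i.e. AES_KEYSIZE_128 + 4 */
	err = crypto_aead_setkey(tfm, key_and_salt, 20);
	if (!err)
		err = crypto_aead_setauthsize(tfm, 16);	/* 8, 12 or 16 */
	if (err) {
		crypto_free_aead(tfm);
		return ERR_PTR(err);
	}
	return tfm;
}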
+
+static int rfc4106_encrypt(struct aead_request *req)
+{
+ int ret;
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+ struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+ if (!irq_fpu_usable()) {
+ struct aead_request *cryptd_req =
+ (struct aead_request *) aead_request_ctx(req);
+ memcpy(cryptd_req, req, sizeof(*req));
+ aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+ return crypto_aead_encrypt(cryptd_req);
+ } else {
+ kernel_fpu_begin();
+ ret = cryptd_child->base.crt_aead.encrypt(req);
+ kernel_fpu_end();
+ return ret;
+ }
+}
+
+static int rfc4106_decrypt(struct aead_request *req)
+{
+ int ret;
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+ struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+ if (!irq_fpu_usable()) {
+ struct aead_request *cryptd_req =
+ (struct aead_request *) aead_request_ctx(req);
+ memcpy(cryptd_req, req, sizeof(*req));
+ aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+ return crypto_aead_decrypt(cryptd_req);
+ } else {
+ kernel_fpu_begin();
+ ret = cryptd_child->base.crt_aead.decrypt(req);
+ kernel_fpu_end();
+ return ret;
+ }
+}
+
+static struct crypto_alg rfc4106_alg = {
+ .cra_name = "rfc4106(gcm(aes))",
+ .cra_driver_name = "rfc4106-gcm-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
+ .cra_alignmask = 0,
+ .cra_type = &crypto_nivaead_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(rfc4106_alg.cra_list),
+ .cra_init = rfc4106_init,
+ .cra_exit = rfc4106_exit,
+ .cra_u = {
+ .aead = {
+ .setkey = rfc4106_set_key,
+ .setauthsize = rfc4106_set_authsize,
+ .encrypt = rfc4106_encrypt,
+ .decrypt = rfc4106_decrypt,
+ .geniv = "seqiv",
+ .ivsize = 8,
+ .maxauthsize = 16,
+ },
+ },
+};
+
+static int __driver_rfc4106_encrypt(struct aead_request *req)
+{
+ u8 one_entry_in_sg = 0;
+ u8 *src, *dst, *assoc;
+ __be32 counter = cpu_to_be32(1);
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+ void *aes_ctx = &(ctx->aes_key_expanded);
+ unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+ u8 iv_tab[16+AESNI_ALIGN];
+ u8* iv = (u8 *) PTR_ALIGN((u8 *)iv_tab, AESNI_ALIGN);
+ struct scatter_walk src_sg_walk;
+ struct scatter_walk assoc_sg_walk;
+ struct scatter_walk dst_sg_walk;
+ unsigned int i;
+
+ /* Assuming we are supporting rfc4106 64-bit extended */
+ /* sequence numbers, the AAD length must be */
+ /* either 8 or 12 bytes. */
+ if (unlikely(req->assoclen != 8 && req->assoclen != 12))
+ return -EINVAL;
+ /* Build the IV below */
+ for (i = 0; i < 4; i++)
+ *(iv+i) = ctx->nonce[i];
+ for (i = 0; i < 8; i++)
+ *(iv+4+i) = req->iv[i];
+ *((__be32 *)(iv+12)) = counter;
+
+ if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
+ one_entry_in_sg = 1;
+ scatterwalk_start(&src_sg_walk, req->src);
+ scatterwalk_start(&assoc_sg_walk, req->assoc);
+ src = scatterwalk_map(&src_sg_walk, 0);
+ assoc = scatterwalk_map(&assoc_sg_walk, 0);
+ dst = src;
+ if (unlikely(req->src != req->dst)) {
+ scatterwalk_start(&dst_sg_walk, req->dst);
+ dst = scatterwalk_map(&dst_sg_walk, 0);
+ }
+
+ } else {
+ /* Allocate memory for src, dst, assoc */
+ src = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
+ GFP_ATOMIC);
+ if (unlikely(!src))
+ return -ENOMEM;
+ assoc = (src + req->cryptlen + auth_tag_len);
+ scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
+ scatterwalk_map_and_copy(assoc, req->assoc, 0,
+ req->assoclen, 0);
+ dst = src;
+ }
+
+ aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
+ ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
+ + ((unsigned long)req->cryptlen), auth_tag_len);
+
+ /* The authTag (aka the Integrity Check Value) needs to be written
+ * back to the packet. */
+ if (one_entry_in_sg) {
+ if (unlikely(req->src != req->dst)) {
+ scatterwalk_unmap(dst, 0);
+ scatterwalk_done(&dst_sg_walk, 0, 0);
+ }
+ scatterwalk_unmap(src, 0);
+ scatterwalk_unmap(assoc, 0);
+ scatterwalk_done(&src_sg_walk, 0, 0);
+ scatterwalk_done(&assoc_sg_walk, 0, 0);
+ } else {
+ scatterwalk_map_and_copy(dst, req->dst, 0,
+ req->cryptlen + auth_tag_len, 1);
+ kfree(src);
+ }
+ return 0;
+}
+
+static int __driver_rfc4106_decrypt(struct aead_request *req)
+{
+ u8 one_entry_in_sg = 0;
+ u8 *src, *dst, *assoc;
+ unsigned long tempCipherLen = 0;
+ __be32 counter = cpu_to_be32(1);
+ int retval = 0;
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+ void *aes_ctx = &(ctx->aes_key_expanded);
+ unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+ u8 iv_and_authTag[32+AESNI_ALIGN];
+ u8 *iv = (u8 *) PTR_ALIGN((u8 *)iv_and_authTag, AESNI_ALIGN);
+ u8 *authTag = iv + 16;
+ struct scatter_walk src_sg_walk;
+ struct scatter_walk assoc_sg_walk;
+ struct scatter_walk dst_sg_walk;
+ unsigned int i;
+
+ if (unlikely((req->cryptlen < auth_tag_len) ||
+ (req->assoclen != 8 && req->assoclen != 12)))
+ return -EINVAL;
+ /* Assuming we are supporting rfc4106 64-bit extended */
+ /* sequence numbers, the AAD length must be */
+ /* either 8 or 12 bytes. */
+
+ tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
+ /* Build the IV below */
+ for (i = 0; i < 4; i++)
+ *(iv+i) = ctx->nonce[i];
+ for (i = 0; i < 8; i++)
+ *(iv+4+i) = req->iv[i];
+ *((__be32 *)(iv+12)) = counter;
+
+ if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
+ one_entry_in_sg = 1;
+ scatterwalk_start(&src_sg_walk, req->src);
+ scatterwalk_start(&assoc_sg_walk, req->assoc);
+ src = scatterwalk_map(&src_sg_walk, 0);
+ assoc = scatterwalk_map(&assoc_sg_walk, 0);
+ dst = src;
+ if (unlikely(req->src != req->dst)) {
+ scatterwalk_start(&dst_sg_walk, req->dst);
+ dst = scatterwalk_map(&dst_sg_walk, 0);
+ }
+
+ } else {
+ /* Allocate memory for src, dst, assoc */
+ src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
+ if (!src)
+ return -ENOMEM;
+ /* cryptlen already covers the ciphertext plus the auth tag here */
+ assoc = (src + req->cryptlen);
+ scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
+ scatterwalk_map_and_copy(assoc, req->assoc, 0,
+ req->assoclen, 0);
+ dst = src;
+ }
+
+ aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv,
+ ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
+ authTag, auth_tag_len);
+
+ /* Compare generated tag with passed in tag. */
+ retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ?
+ -EBADMSG : 0;
+
+ if (one_entry_in_sg) {
+ if (unlikely(req->src != req->dst)) {
+ scatterwalk_unmap(dst, 0);
+ scatterwalk_done(&dst_sg_walk, 0, 0);
+ }
+ scatterwalk_unmap(src, 0);
+ scatterwalk_unmap(assoc, 0);
+ scatterwalk_done(&src_sg_walk, 0, 0);
+ scatterwalk_done(&assoc_sg_walk, 0, 0);
+ } else {
+ scatterwalk_map_and_copy(dst, req->dst, 0, req->cryptlen, 1);
+ kfree(src);
+ }
+ return retval;
+}
+
+static struct crypto_alg __rfc4106_alg = {
+ .cra_name = "__gcm-aes-aesni",
+ .cra_driver_name = "__driver-gcm-aes-aesni",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_AEAD,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
+ .cra_alignmask = 0,
+ .cra_type = &crypto_aead_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(__rfc4106_alg.cra_list),
+ .cra_u = {
+ .aead = {
+ .encrypt = __driver_rfc4106_encrypt,
+ .decrypt = __driver_rfc4106_decrypt,
+ },
+ },
+};
+#endif
+
static int __init aesni_init(void)
{
int err;
@@ -738,6 +1248,7 @@ static int __init aesni_init(void)
printk(KERN_INFO "Intel AES-NI instructions are not detected.\n");
return -ENODEV;
}
+
if ((err = crypto_register_alg(&aesni_alg)))
goto aes_err;
if ((err = crypto_register_alg(&__aesni_alg)))
@@ -746,18 +1257,24 @@ static int __init aesni_init(void)
goto blk_ecb_err;
if ((err = crypto_register_alg(&blk_cbc_alg)))
goto blk_cbc_err;
- if ((err = crypto_register_alg(&blk_ctr_alg)))
- goto blk_ctr_err;
if ((err = crypto_register_alg(&ablk_ecb_alg)))
goto ablk_ecb_err;
if ((err = crypto_register_alg(&ablk_cbc_alg)))
goto ablk_cbc_err;
+#ifdef CONFIG_X86_64
+ if ((err = crypto_register_alg(&blk_ctr_alg)))
+ goto blk_ctr_err;
if ((err = crypto_register_alg(&ablk_ctr_alg)))
goto ablk_ctr_err;
+ if ((err = crypto_register_alg(&__rfc4106_alg)))
+ goto __aead_gcm_err;
+ if ((err = crypto_register_alg(&rfc4106_alg)))
+ goto aead_gcm_err;
#ifdef HAS_CTR
if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg)))
goto ablk_rfc3686_ctr_err;
#endif
+#endif
#ifdef HAS_LRW
if ((err = crypto_register_alg(&ablk_lrw_alg)))
goto ablk_lrw_err;
@@ -770,7 +1287,6 @@ static int __init aesni_init(void)
if ((err = crypto_register_alg(&ablk_xts_alg)))
goto ablk_xts_err;
#endif
-
return err;
#ifdef HAS_XTS
@@ -784,18 +1300,24 @@ ablk_pcbc_err:
crypto_unregister_alg(&ablk_lrw_alg);
ablk_lrw_err:
#endif
+#ifdef CONFIG_X86_64
#ifdef HAS_CTR
crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
ablk_rfc3686_ctr_err:
#endif
+ crypto_unregister_alg(&rfc4106_alg);
+aead_gcm_err:
+ crypto_unregister_alg(&__rfc4106_alg);
+__aead_gcm_err:
crypto_unregister_alg(&ablk_ctr_alg);
ablk_ctr_err:
+ crypto_unregister_alg(&blk_ctr_alg);
+blk_ctr_err:
+#endif
crypto_unregister_alg(&ablk_cbc_alg);
ablk_cbc_err:
crypto_unregister_alg(&ablk_ecb_alg);
ablk_ecb_err:
- crypto_unregister_alg(&blk_ctr_alg);
-blk_ctr_err:
crypto_unregister_alg(&blk_cbc_alg);
blk_cbc_err:
crypto_unregister_alg(&blk_ecb_alg);
@@ -818,13 +1340,17 @@ static void __exit aesni_exit(void)
#ifdef HAS_LRW
crypto_unregister_alg(&ablk_lrw_alg);
#endif
+#ifdef CONFIG_X86_64
#ifdef HAS_CTR
crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
#endif
+ crypto_unregister_alg(&rfc4106_alg);
+ crypto_unregister_alg(&__rfc4106_alg);
crypto_unregister_alg(&ablk_ctr_alg);
+ crypto_unregister_alg(&blk_ctr_alg);
+#endif
crypto_unregister_alg(&ablk_cbc_alg);
crypto_unregister_alg(&ablk_ecb_alg);
- crypto_unregister_alg(&blk_ctr_alg);
crypto_unregister_alg(&blk_cbc_alg);
crypto_unregister_alg(&blk_ecb_alg);
crypto_unregister_alg(&__aesni_alg);
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 55d106b5e31b..211ca3f7fd16 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -185,17 +185,16 @@ struct bootnode;
#ifdef CONFIG_ACPI_NUMA
extern int acpi_numa;
-extern int acpi_get_nodes(struct bootnode *physnodes);
+extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
+ unsigned long end);
extern int acpi_scan_nodes(unsigned long start, unsigned long end);
#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
+
+#ifdef CONFIG_NUMA_EMU
extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
int num_nodes);
-#else
-static inline void acpi_fake_nodes(const struct bootnode *fake_nodes,
- int num_nodes)
-{
-}
#endif
+#endif /* CONFIG_ACPI_NUMA */
#define acpi_unlazy_tlb(x) leave_mm(x)
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 6aee50d655d1..64dc82ee19f0 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -3,16 +3,27 @@
#include <linux/pci.h>
+struct amd_nb_bus_dev_range {
+ u8 bus;
+ u8 dev_base;
+ u8 dev_limit;
+};
+
extern struct pci_device_id amd_nb_misc_ids[];
+extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
struct bootnode;
extern int early_is_amd_nb(u32 value);
extern int amd_cache_northbridges(void);
extern void amd_flush_garts(void);
-extern int amd_get_nodes(struct bootnode *nodes);
extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn);
extern int amd_scan_nodes(void);
+#ifdef CONFIG_NUMA_EMU
+extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes);
+extern void amd_get_nodes(struct bootnode *nodes);
+#endif
+
struct amd_northbridge {
struct pci_dev *misc;
};
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 3b62ab56c7a0..5e1a2eef3e7c 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -32,11 +32,7 @@
#define BOOT_HEAP_SIZE 0x400000
#else /* !CONFIG_KERNEL_BZIP2 */
-#ifdef CONFIG_X86_64
-#define BOOT_HEAP_SIZE 0x7000
-#else
-#define BOOT_HEAP_SIZE 0x4000
-#endif
+#define BOOT_HEAP_SIZE 0x8000
#endif /* !CONFIG_KERNEL_BZIP2 */
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 63e35ec9075c..62f084478f7e 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -1,48 +1,8 @@
#ifndef _ASM_X86_CACHEFLUSH_H
#define _ASM_X86_CACHEFLUSH_H
-/* Keep includes the same across arches. */
-#include <linux/mm.h>
-
/* Caches aren't brain-dead on the intel. */
-static inline void flush_cache_all(void) { }
-static inline void flush_cache_mm(struct mm_struct *mm) { }
-static inline void flush_cache_dup_mm(struct mm_struct *mm) { }
-static inline void flush_cache_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end) { }
-static inline void flush_cache_page(struct vm_area_struct *vma,
- unsigned long vmaddr, unsigned long pfn) { }
-#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
-static inline void flush_dcache_page(struct page *page) { }
-static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
-static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
-static inline void flush_icache_range(unsigned long start,
- unsigned long end) { }
-static inline void flush_icache_page(struct vm_area_struct *vma,
- struct page *page) { }
-static inline void flush_icache_user_range(struct vm_area_struct *vma,
- struct page *page,
- unsigned long addr,
- unsigned long len) { }
-static inline void flush_cache_vmap(unsigned long start, unsigned long end) { }
-static inline void flush_cache_vunmap(unsigned long start,
- unsigned long end) { }
-
-static inline void copy_to_user_page(struct vm_area_struct *vma,
- struct page *page, unsigned long vaddr,
- void *dst, const void *src,
- unsigned long len)
-{
- memcpy(dst, src, len);
-}
-
-static inline void copy_from_user_page(struct vm_area_struct *vma,
- struct page *page, unsigned long vaddr,
- void *dst, const void *src,
- unsigned long len)
-{
- memcpy(dst, src, len);
-}
+#include <asm-generic/cacheflush.h>
#ifdef CONFIG_X86_PAT
/*
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index 4fab24de26b1..6e6e7558e702 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -32,5 +32,6 @@ extern void arch_unregister_cpu(int);
DECLARE_PER_CPU(int, cpu_state);
+int __cpuinit mwait_usable(const struct cpuinfo_x86 *);
#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 0141b234406f..4729b2b63117 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -116,11 +116,11 @@ enum fixed_addresses {
#endif
FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
- __end_of_permanent_fixed_addresses,
-
#ifdef CONFIG_X86_MRST
FIX_LNW_VRTC,
#endif
+ __end_of_permanent_fixed_addresses,
+
/*
* 256 temporary boot-time mappings, used by early_ioremap(),
* before ioremap() is functional.
diff --git a/arch/x86/include/asm/gpio.h b/arch/x86/include/asm/gpio.h
index 49dbfdfa50f9..91d915a65259 100644
--- a/arch/x86/include/asm/gpio.h
+++ b/arch/x86/include/asm/gpio.h
@@ -38,12 +38,9 @@ static inline int gpio_cansleep(unsigned int gpio)
return __gpio_cansleep(gpio);
}
-/*
- * Not implemented, yet.
- */
static inline int gpio_to_irq(unsigned int gpio)
{
- return -ENOSYS;
+ return __gpio_to_irq(gpio);
}
static inline int irq_to_gpio(unsigned int irq)
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index ff2546ce7178..7a15153c675d 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -20,6 +20,9 @@
#ifndef _ASM_X86_HYPERVISOR_H
#define _ASM_X86_HYPERVISOR_H
+#include <asm/kvm_para.h>
+#include <asm/xen/hypervisor.h>
+
extern void init_hypervisor(struct cpuinfo_x86 *c);
extern void init_hypervisor_platform(void);
@@ -47,4 +50,13 @@ extern const struct hypervisor_x86 x86_hyper_vmware;
extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
extern const struct hypervisor_x86 x86_hyper_xen_hvm;
+static inline bool hypervisor_x2apic_available(void)
+{
+ if (kvm_para_available())
+ return true;
+ if (xen_x2apic_para_available())
+ return true;
+ return false;
+}
+
#endif
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index ba870bb6dd8e..c704b38c57a2 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -10,6 +10,9 @@
#include <asm/apicdef.h>
#include <asm/irq_vectors.h>
+/* Even though we don't support this, supply it to appease OF */
+static inline void irq_dispose_mapping(unsigned int virq) { }
+
static inline int irq_canonicalize(int irq)
{
return ((irq == 2) ? 9 : irq);
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index f52d42e80585..574dbc22893a 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -14,7 +14,7 @@
do { \
asm goto("1:" \
JUMP_LABEL_INITIAL_NOP \
- ".pushsection __jump_table, \"a\" \n\t"\
+ ".pushsection __jump_table, \"aw\" \n\t"\
_ASM_PTR "1b, %l[" #label "], %c0 \n\t" \
".popsection \n\t" \
: : "i" (key) : : label); \
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index f23eb2528464..ca242d35e873 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -18,7 +18,6 @@ enum die_val {
DIE_TRAP,
DIE_GPF,
DIE_CALL,
- DIE_NMI_IPI,
DIE_PAGE_FAULT,
DIE_NMIUNKNOWN,
};
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b36c6b3fe144..8e37deb1eb38 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -15,6 +15,14 @@
struct x86_emulate_ctxt;
+struct x86_exception {
+ u8 vector;
+ bool error_code_valid;
+ u16 error_code;
+ bool nested_page_fault;
+ u64 address; /* cr2 or nested page fault gpa */
+};
+
/*
* x86_emulate_ops:
*
@@ -64,7 +72,8 @@ struct x86_emulate_ops {
* @bytes: [IN ] Number of bytes to read from memory.
*/
int (*read_std)(unsigned long addr, void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
+ unsigned int bytes, struct kvm_vcpu *vcpu,
+ struct x86_exception *fault);
/*
* write_std: Write bytes of standard (non-emulated/special) memory.
@@ -74,7 +83,8 @@ struct x86_emulate_ops {
* @bytes: [IN ] Number of bytes to write to memory.
*/
int (*write_std)(unsigned long addr, void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
+ unsigned int bytes, struct kvm_vcpu *vcpu,
+ struct x86_exception *fault);
/*
* fetch: Read bytes of standard (non-emulated/special) memory.
* Used for instruction fetch.
@@ -83,7 +93,8 @@ struct x86_emulate_ops {
* @bytes: [IN ] Number of bytes to read from memory.
*/
int (*fetch)(unsigned long addr, void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
+ unsigned int bytes, struct kvm_vcpu *vcpu,
+ struct x86_exception *fault);
/*
* read_emulated: Read bytes from emulated/special memory area.
@@ -94,7 +105,7 @@ struct x86_emulate_ops {
int (*read_emulated)(unsigned long addr,
void *val,
unsigned int bytes,
- unsigned int *error,
+ struct x86_exception *fault,
struct kvm_vcpu *vcpu);
/*
@@ -107,7 +118,7 @@ struct x86_emulate_ops {
int (*write_emulated)(unsigned long addr,
const void *val,
unsigned int bytes,
- unsigned int *error,
+ struct x86_exception *fault,
struct kvm_vcpu *vcpu);
/*
@@ -122,7 +133,7 @@ struct x86_emulate_ops {
const void *old,
const void *new,
unsigned int bytes,
- unsigned int *error,
+ struct x86_exception *fault,
struct kvm_vcpu *vcpu);
int (*pio_in_emulated)(int size, unsigned short port, void *val,
@@ -159,7 +170,10 @@ struct operand {
};
union {
unsigned long *reg;
- unsigned long mem;
+ struct segmented_address {
+ ulong ea;
+ unsigned seg;
+ } mem;
} addr;
union {
unsigned long val;
@@ -226,9 +240,8 @@ struct x86_emulate_ctxt {
bool perm_ok; /* do not check permissions if true */
- int exception; /* exception that happens during emulation or -1 */
- u32 error_code; /* error code for exception */
- bool error_code_valid;
+ bool have_exception;
+ struct x86_exception exception;
/* decode cache */
struct decode_cache decode;
@@ -252,7 +265,7 @@ struct x86_emulate_ctxt {
#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
#endif
-int x86_decode_insn(struct x86_emulate_ctxt *ctxt);
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
#define EMULATION_FAILED -1
#define EMULATION_OK 0
#define EMULATION_RESTART 1
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f702f82aa1eb..ffd7f8d29187 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -83,11 +83,14 @@
#define KVM_NR_FIXED_MTRR_REGION 88
#define KVM_NR_VAR_MTRR 8
+#define ASYNC_PF_PER_VCPU 64
+
extern spinlock_t kvm_lock;
extern struct list_head vm_list;
struct kvm_vcpu;
struct kvm;
+struct kvm_async_pf;
enum kvm_reg {
VCPU_REGS_RAX = 0,
@@ -114,6 +117,7 @@ enum kvm_reg {
enum kvm_reg_ex {
VCPU_EXREG_PDPTR = NR_VCPU_REGS,
+ VCPU_EXREG_CR3,
};
enum {
@@ -238,16 +242,18 @@ struct kvm_mmu {
void (*new_cr3)(struct kvm_vcpu *vcpu);
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
- int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
- void (*inject_page_fault)(struct kvm_vcpu *vcpu);
+ int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err,
+ bool prefault);
+ void (*inject_page_fault)(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault);
void (*free)(struct kvm_vcpu *vcpu);
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
- u32 *error);
+ struct x86_exception *exception);
gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
void (*prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page);
int (*sync_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, bool clear_unsync);
+ struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
hpa_t root_hpa;
int root_level;
@@ -315,16 +321,6 @@ struct kvm_vcpu_arch {
*/
struct kvm_mmu *walk_mmu;
- /*
- * This struct is filled with the necessary information to propagate a
- * page fault into the guest
- */
- struct {
- u64 address;
- unsigned error_code;
- bool nested;
- } fault;
-
/* only needed in kvm_pv_mmu_op() path, but it's hot so
* put it here to avoid allocation */
struct kvm_pv_mmu_op_buffer mmu_op_buffer;
@@ -412,6 +408,15 @@ struct kvm_vcpu_arch {
u64 hv_vapic;
cpumask_var_t wbinvd_dirty_mask;
+
+ struct {
+ bool halted;
+ gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
+ struct gfn_to_hva_cache data;
+ u64 msr_val;
+ u32 id;
+ bool send_user_only;
+ } apf;
};
struct kvm_arch {
@@ -456,6 +461,10 @@ struct kvm_arch {
/* fields used by HYPER-V emulation */
u64 hv_guest_os_id;
u64 hv_hypercall;
+
+ #ifdef CONFIG_KVM_MMU_AUDIT
+ int audit_point;
+ #endif
};
struct kvm_vm_stat {
@@ -529,6 +538,7 @@ struct kvm_x86_ops {
struct kvm_segment *var, int seg);
void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
+ void (*decache_cr3)(struct kvm_vcpu *vcpu);
void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -582,9 +592,17 @@ struct kvm_x86_ops {
void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+ void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
const struct trace_print_flags *exit_reasons_str;
};
+struct kvm_arch_async_pf {
+ u32 token;
+ gfn_t gfn;
+ unsigned long cr3;
+ bool direct_map;
+};
+
extern struct kvm_x86_ops *kvm_x86_ops;
int kvm_mmu_module_init(void);
@@ -594,7 +612,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
int kvm_mmu_setup(struct kvm_vcpu *vcpu);
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
-void kvm_mmu_set_base_ptes(u64 base_pte);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask);
@@ -623,8 +640,15 @@ enum emulation_result {
#define EMULTYPE_NO_DECODE (1 << 0)
#define EMULTYPE_TRAP_UD (1 << 1)
#define EMULTYPE_SKIP (1 << 2)
-int emulate_instruction(struct kvm_vcpu *vcpu,
- unsigned long cr2, u16 error_code, int emulation_type);
+int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
+ int emulation_type, void *insn, int insn_len);
+
+static inline int emulate_instruction(struct kvm_vcpu *vcpu,
+ int emulation_type)
+{
+ return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
+}
+
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
@@ -650,7 +674,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
-void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
+int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
@@ -668,11 +692,11 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gfn_t gfn, void *data, int offset, int len,
u32 access);
-void kvm_propagate_fault(struct kvm_vcpu *vcpu);
+void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
int kvm_pic_set_irq(void *opaque, int irq, int level);
@@ -690,16 +714,21 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
-gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
-gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
-gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
-gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
+ void *insn, int insn_len);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
void kvm_enable_tdp(void);
@@ -766,20 +795,25 @@ enum {
#define HF_VINTR_MASK (1 << 2)
#define HF_NMI_MASK (1 << 3)
#define HF_IRET_MASK (1 << 4)
+#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
/*
* Hardware virtualization extension instructions may fault if a
* reboot turns off virtualization while processes are running.
* Trap the fault and ignore the instruction if that happens.
*/
-asmlinkage void kvm_handle_fault_on_reboot(void);
+asmlinkage void kvm_spurious_fault(void);
+extern bool kvm_rebooting;
#define __kvm_handle_fault_on_reboot(insn) \
"666: " insn "\n\t" \
+ "668: \n\t" \
".pushsection .fixup, \"ax\" \n" \
"667: \n\t" \
+ "cmpb $0, kvm_rebooting \n\t" \
+ "jne 668b \n\t" \
__ASM_SIZE(push) " $666b \n\t" \
- "jmp kvm_handle_fault_on_reboot \n\t" \
+ "call kvm_spurious_fault \n\t" \
".popsection \n\t" \
".pushsection __ex_table, \"a\" \n\t" \
_ASM_PTR " 666b, 667b \n\t" \
@@ -788,6 +822,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
@@ -799,4 +834,15 @@ void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
+void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
+bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
+extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+
+void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 7b562b6184bc..a427bf77a93d 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -20,6 +20,7 @@
* are available. The use of 0x11 and 0x12 is deprecated
*/
#define KVM_FEATURE_CLOCKSOURCE2 3
+#define KVM_FEATURE_ASYNC_PF 4
/* The last 8 bits are used to indicate how to interpret the flags field
* in pvclock structure. If no bits are set, all flags are ignored.
@@ -32,9 +33,13 @@
/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
+#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
#define KVM_MAX_MMU_OP_BATCH 32
+#define KVM_ASYNC_PF_ENABLED (1 << 0)
+#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1)
+
/* Operations for KVM_HC_MMU_OP */
#define KVM_MMU_OP_WRITE_PTE 1
#define KVM_MMU_OP_FLUSH_TLB 2
@@ -61,10 +66,20 @@ struct kvm_mmu_op_release_pt {
__u64 pt_phys;
};
+#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
+#define KVM_PV_REASON_PAGE_READY 2
+
+struct kvm_vcpu_pv_apf_data {
+ __u32 reason;
+ __u8 pad[60];
+ __u32 enabled;
+};
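
The guest enables this mechanism by handing the physical address of a per-CPU kvm_vcpu_pv_apf_data area to the host through MSR_KVM_ASYNC_PF_EN; the real wiring lives in arch/x86/kernel/kvm.c, but a minimal sketch of the handshake looks roughly like this (variable and function names are illustrative only):

#include <linux/percpu.h>
#include <linux/kvm_para.h>
#include <asm/msr.h>
#include <asm/page.h>

static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);

/* Sketch: tell the host where to post async page fault reasons for this CPU. */
static void example_enable_async_pf(void)
{
	u64 pa;

	if (!kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
		return;

	pa = __pa(&__get_cpu_var(apf_reason));
	wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
	__get_cpu_var(apf_reason).enabled = 1;
}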
+
#ifdef __KERNEL__
#include <asm/processor.h>
extern void kvmclock_init(void);
+extern int kvm_register_clock(char *txt);
/* This instruction is vmcall. On non-VT architectures, it will generate a
@@ -160,8 +175,17 @@ static inline unsigned int kvm_arch_para_features(void)
#ifdef CONFIG_KVM_GUEST
void __init kvm_guest_init(void);
+void kvm_async_pf_task_wait(u32 token);
+void kvm_async_pf_task_wake(u32 token);
+u32 kvm_read_and_reset_pf_reason(void);
#else
#define kvm_guest_init() do { } while (0)
+#define kvm_async_pf_task_wait(T) do {} while(0)
+#define kvm_async_pf_task_wake(T) do {} while(0)
+static inline u32 kvm_read_and_reset_pf_reason(void)
+{
+ return 0;
+}
#endif
#endif /* __KERNEL__ */
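Editorial note: MSR_KVM_ASYNC_PF_EN is written with the physical address of a per-cpu struct kvm_vcpu_pv_apf_data OR'd with the enable flags; the 64-byte alignment of that structure is what keeps the low bits free for KVM_ASYNC_PF_ENABLED and KVM_ASYNC_PF_SEND_ALWAYS. A minimal enable sketch, mirroring the arch/x86/kernel/kvm.c hunk later in this patch:

/* Hedged sketch -- per-cpu async PF enablement as done by the guest
 * code added later in this series. */
static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);

static void example_enable_async_pf(void)
{
	u64 pa = __pa(&__get_cpu_var(apf_reason));

	/* KVM_ASYNC_PF_SEND_ALWAYS could additionally be OR'd in to
	 * receive async faults in kernel mode (the real code does so
	 * under CONFIG_PREEMPT). */
	wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
	__get_cpu_var(apf_reason).enabled = 1;
}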
diff --git a/arch/x86/include/asm/mach_traps.h b/arch/x86/include/asm/mach_traps.h
index f7920601e472..72a8b52e7dfd 100644
--- a/arch/x86/include/asm/mach_traps.h
+++ b/arch/x86/include/asm/mach_traps.h
@@ -7,9 +7,19 @@
#include <asm/mc146818rtc.h>
+#define NMI_REASON_PORT 0x61
+
+#define NMI_REASON_SERR 0x80
+#define NMI_REASON_IOCHK 0x40
+#define NMI_REASON_MASK (NMI_REASON_SERR | NMI_REASON_IOCHK)
+
+#define NMI_REASON_CLEAR_SERR 0x04
+#define NMI_REASON_CLEAR_IOCHK 0x08
+#define NMI_REASON_CLEAR_MASK 0x0f
+
static inline unsigned char get_nmi_reason(void)
{
- return inb(0x61);
+ return inb(NMI_REASON_PORT);
}
static inline void reassert_nmi(void)
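Editorial note: the new constants name the bits read from I/O port 0x61, replacing the bare 0x61 used before. A hedged decoding sketch (the pr_emerg() messages are illustrative):

/* Hedged sketch -- decoding an external NMI with the names above. */
static void example_report_external_nmi(void)
{
	unsigned char reason = get_nmi_reason();

	if (reason & NMI_REASON_SERR)
		pr_emerg("NMI: PCI SERR# (system error) asserted\n");
	else if (reason & NMI_REASON_IOCHK)
		pr_emerg("NMI: IOCHK (I/O check) asserted\n");
}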
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c4021b953510..c76f5b92b840 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -23,6 +23,26 @@ void arch_trigger_all_cpu_backtrace(void);
#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
#endif
+/*
+ * Define some priorities for the nmi notifier call chain.
+ *
+ * Create a local nmi bit that has a higher priority than
+ * external nmis, because the local ones are more frequent.
+ *
+ * Also set up some default high/normal/low settings for
+ * subsystems to register with. Using 4 bits to separate
+ * the priorities. This can go a lot higher if need be.
+ */
+
+#define NMI_LOCAL_SHIFT 16 /* randomly picked */
+#define NMI_LOCAL_BIT (1ULL << NMI_LOCAL_SHIFT)
+#define NMI_HIGH_PRIOR (1ULL << 8)
+#define NMI_NORMAL_PRIOR (1ULL << 4)
+#define NMI_LOW_PRIOR (1ULL << 0)
+#define NMI_LOCAL_HIGH_PRIOR (NMI_LOCAL_BIT | NMI_HIGH_PRIOR)
+#define NMI_LOCAL_NORMAL_PRIOR (NMI_LOCAL_BIT | NMI_NORMAL_PRIOR)
+#define NMI_LOCAL_LOW_PRIOR (NMI_LOCAL_BIT | NMI_LOW_PRIOR)
+
void stop_nmi(void);
void restart_nmi(void);
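Editorial note: these priorities order handlers on the die notifier chain so that local (per-CPU) NMI users such as perf run before external-NMI consumers; the hw_nmi.c, perf_event.c, mce-inject.c and kgdb.c hunks later in this patch switch their hard-coded priorities over to them. A hedged registration sketch:

/* Hedged sketch -- a subsystem hooking the NMI die chain with one of
 * the symbolic priorities above. */
static int example_nmi_notify(struct notifier_block *nb,
			      unsigned long cmd, void *data)
{
	if (cmd != DIE_NMI)
		return NOTIFY_DONE;

	/* ... check whether this NMI belongs to us ... */
	return NOTIFY_STOP;
}

static struct notifier_block example_nmi_nb = {
	.notifier_call	= example_nmi_notify,
	.priority	= NMI_LOCAL_LOW_PRIOR,
};

/* registered at init time with register_die_notifier(&example_nmi_nb) */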
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
index a37229011b56..b0ef2b449a9d 100644
--- a/arch/x86/include/asm/numa_32.h
+++ b/arch/x86/include/asm/numa_32.h
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_NUMA_32_H
#define _ASM_X86_NUMA_32_H
+extern int numa_off;
+
extern int pxm_to_nid(int pxm);
extern void numa_remove_cpu(int cpu);
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 823e070e7c26..0493be39607c 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -38,8 +38,9 @@ extern void __cpuinit numa_add_cpu(int cpu);
extern void __cpuinit numa_remove_cpu(int cpu);
#ifdef CONFIG_NUMA_EMU
-#define FAKE_NODE_MIN_SIZE ((u64)64 << 20)
+#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
+void numa_emu_cmdline(char *);
#endif /* CONFIG_NUMA_EMU */
#else
static inline void init_cpu_to_node(void) { }
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index 42a978c0c1b3..f482010350fb 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -107,10 +107,14 @@ extern int olpc_ec_mask_unset(uint8_t bits);
/* GPIO assignments */
#define OLPC_GPIO_MIC_AC 1
-#define OLPC_GPIO_DCON_IRQ geode_gpio(7)
+#define OLPC_GPIO_DCON_STAT0 5
+#define OLPC_GPIO_DCON_STAT1 6
+#define OLPC_GPIO_DCON_IRQ 7
#define OLPC_GPIO_THRM_ALRM geode_gpio(10)
-#define OLPC_GPIO_SMB_CLK geode_gpio(14)
-#define OLPC_GPIO_SMB_DATA geode_gpio(15)
+#define OLPC_GPIO_DCON_LOAD 11
+#define OLPC_GPIO_DCON_BLANK 12
+#define OLPC_GPIO_SMB_CLK 14
+#define OLPC_GPIO_SMB_DATA 15
#define OLPC_GPIO_WORKAUX geode_gpio(24)
#define OLPC_GPIO_LID geode_gpio(26)
#define OLPC_GPIO_ECSCI geode_gpio(27)
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index 2a8478140bb3..641988efe063 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -8,6 +8,8 @@
#ifdef CONFIG_OLPC_OPENFIRMWARE
+extern bool olpc_ofw_is_installed(void);
+
/* run an OFW command by calling into the firmware */
#define olpc_ofw(name, args, res) \
__olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res)
@@ -26,10 +28,17 @@ extern bool olpc_ofw_present(void);
#else /* !CONFIG_OLPC_OPENFIRMWARE */
+static inline bool olpc_ofw_is_installed(void) { return false; }
static inline void olpc_ofw_detect(void) { }
static inline void setup_olpc_ofw_pgd(void) { }
static inline bool olpc_ofw_present(void) { return false; }
#endif /* !CONFIG_OLPC_OPENFIRMWARE */
+#ifdef CONFIG_OLPC_OPENFIRMWARE_DT
+extern void olpc_dt_build_devicetree(void);
+#else
+static inline void olpc_dt_build_devicetree(void) { }
+#endif /* CONFIG_OLPC_OPENFIRMWARE_DT */
+
#endif /* _ASM_X86_OLPC_OFW_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 7709c12431b8..2071a8b2b32f 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -435,6 +435,11 @@ static inline void pte_update(struct mm_struct *mm, unsigned long addr,
{
PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
}
+static inline void pmd_update(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp)
+{
+ PVOP_VCALL3(pv_mmu_ops.pmd_update, mm, addr, pmdp);
+}
static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
@@ -442,6 +447,12 @@ static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
}
+static inline void pmd_update_defer(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp)
+{
+ PVOP_VCALL3(pv_mmu_ops.pmd_update_defer, mm, addr, pmdp);
+}
+
static inline pte_t __pte(pteval_t val)
{
pteval_t ret;
@@ -543,6 +554,20 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmd)
+{
+#if PAGETABLE_LEVELS >= 3
+ if (sizeof(pmdval_t) > sizeof(long))
+ /* 5 arg words */
+ pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd);
+ else
+ PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp, pmd.pmd);
+#endif
+}
+#endif
+
static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
{
pmdval_t val = native_pmd_val(pmd);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b82bac975250..82885099c869 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -265,10 +265,16 @@ struct pv_mmu_ops {
void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval);
void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
+ void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmdval);
void (*pte_update)(struct mm_struct *mm, unsigned long addr,
pte_t *ptep);
void (*pte_update_defer)(struct mm_struct *mm,
unsigned long addr, pte_t *ptep);
+ void (*pmd_update)(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp);
+ void (*pmd_update_defer)(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp);
pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
pte_t *ptep);
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 8ee45167e817..3788f4649db4 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -414,8 +414,6 @@ do { \
#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
-#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
-#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
@@ -432,8 +430,6 @@ do { \
#define irqsafe_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
#define irqsafe_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
#define irqsafe_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
-#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
-#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
#ifndef CONFIG_M386
#define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
@@ -475,11 +471,15 @@ do { \
#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
+#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
+#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
+#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
#endif
/* This is not atomic against other CPUs -- CPU preemption needs to be off */
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 295e2ff18a6a..e2f6a99f14ab 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -20,6 +20,9 @@
#define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
#define ARCH_P4_MAX_CCCR (18)
+#define ARCH_P4_CNTRVAL_BITS (40)
+#define ARCH_P4_CNTRVAL_MASK ((1ULL << ARCH_P4_CNTRVAL_BITS) - 1)
+
#define P4_ESCR_EVENT_MASK 0x7e000000U
#define P4_ESCR_EVENT_SHIFT 25
#define P4_ESCR_EVENTMASK_MASK 0x01fffe00U
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 271de94c3810..b4389a468fb6 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -92,7 +92,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
- unsigned long adddress)
+ unsigned long address)
{
___pmd_free_tlb(tlb, pmd);
}
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 2334982b339e..98391db840c6 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -46,6 +46,15 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
#endif
+#ifdef CONFIG_SMP
+static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
+{
+ return __pmd(xchg((pmdval_t *)xp, 0));
+}
+#else
+#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
+#endif
+
/*
* Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
* split up the 29 bits of offset into this range:
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 177b0165ea01..94b979d1b58d 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -104,6 +104,29 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
#endif
+#ifdef CONFIG_SMP
+union split_pmd {
+ struct {
+ u32 pmd_low;
+ u32 pmd_high;
+ };
+ pmd_t pmd;
+};
+static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
+{
+ union split_pmd res, *orig = (union split_pmd *)pmdp;
+
+ /* xchg acts as a barrier before setting of the high bits */
+ res.pmd_low = xchg(&orig->pmd_low, 0);
+ res.pmd_high = orig->pmd_high;
+ orig->pmd_high = 0;
+
+ return res.pmd;
+}
+#else
+#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
+#endif
+
/*
* Bits 0, 6 and 7 are taken in the low part of the pte,
* put the 32 bits of offset into the high part.
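Editorial note on the ordering comment in native_pmdp_get_and_clear() above: the present bit lives in the low word, so clearing pmd_low with xchg() makes the whole entry look not-present to a concurrent walker before the high word is touched; a torn read can therefore never yield a present-but-half-cleared pmd. A stand-alone illustration of the union aliasing (plain user-space C, not kernel code; little-endian layout assumed, as on x86):

/* Hedged sketch -- user-space illustration of the split_pmd layout. */
#include <stdint.h>
#include <stdio.h>

union split_pmd_demo {
	struct {
		uint32_t pmd_low;	/* carries the present bit (bit 0) */
		uint32_t pmd_high;	/* upper half of the PFN */
	};
	uint64_t pmd;
};

int main(void)
{
	union split_pmd_demo p = { .pmd = 0x0000001234567001ULL };

	p.pmd_low = 0;	/* step 1: the entry is now "not present" */
	printf("after low clear:  %#llx\n", (unsigned long long)p.pmd);
	p.pmd_high = 0;	/* step 2: finish clearing at leisure */
	printf("after high clear: %#llx\n", (unsigned long long)p.pmd);
	return 0;
}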
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index ada823a13c7c..18601c86fab1 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -35,6 +35,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
#else /* !CONFIG_PARAVIRT */
#define set_pte(ptep, pte) native_set_pte(ptep, pte)
#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
+#define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd)
#define set_pte_atomic(ptep, pte) \
native_set_pte_atomic(ptep, pte)
@@ -59,6 +60,8 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
#define pte_update(mm, addr, ptep) do { } while (0)
#define pte_update_defer(mm, addr, ptep) do { } while (0)
+#define pmd_update(mm, addr, ptep) do { } while (0)
+#define pmd_update_defer(mm, addr, ptep) do { } while (0)
#define pgd_val(x) native_pgd_val(x)
#define __pgd(x) native_make_pgd(x)
@@ -94,6 +97,11 @@ static inline int pte_young(pte_t pte)
return pte_flags(pte) & _PAGE_ACCESSED;
}
+static inline int pmd_young(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_ACCESSED;
+}
+
static inline int pte_write(pte_t pte)
{
return pte_flags(pte) & _PAGE_RW;
@@ -142,6 +150,23 @@ static inline int pmd_large(pmd_t pte)
(_PAGE_PSE | _PAGE_PRESENT);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int pmd_trans_splitting(pmd_t pmd)
+{
+ return pmd_val(pmd) & _PAGE_SPLITTING;
+}
+
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+ return pmd_val(pmd) & _PAGE_PSE;
+}
+
+static inline int has_transparent_hugepage(void)
+{
+ return cpu_has_pse;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
{
pteval_t v = native_pte_val(pte);
@@ -216,6 +241,55 @@ static inline pte_t pte_mkspecial(pte_t pte)
return pte_set_flags(pte, _PAGE_SPECIAL);
}
+static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
+{
+ pmdval_t v = native_pmd_val(pmd);
+
+ return __pmd(v | set);
+}
+
+static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
+{
+ pmdval_t v = native_pmd_val(pmd);
+
+ return __pmd(v & ~clear);
+}
+
+static inline pmd_t pmd_mkold(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_ACCESSED);
+}
+
+static inline pmd_t pmd_wrprotect(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_RW);
+}
+
+static inline pmd_t pmd_mkdirty(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_DIRTY);
+}
+
+static inline pmd_t pmd_mkhuge(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_PSE);
+}
+
+static inline pmd_t pmd_mkyoung(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_ACCESSED);
+}
+
+static inline pmd_t pmd_mkwrite(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_RW);
+}
+
+static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_PRESENT);
+}
+
/*
* Mask out unsupported bits in a present pgprot. Non-present pgprots
* can use those bits for other purposes, so leave them be.
@@ -256,6 +330,16 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
return __pte(val);
}
+static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+ pmdval_t val = pmd_val(pmd);
+
+ val &= _HPAGE_CHG_MASK;
+ val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK;
+
+ return __pmd(val);
+}
+
/* mprotect needs to preserve PAT bits when updating vm_page_prot */
#define pgprot_modify pgprot_modify
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
@@ -350,7 +434,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
* Currently stuck as a macro due to indirect forward reference to
* linux/mmzone.h's __section_mem_map_addr() definition:
*/
-#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)
+#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT)
/*
* the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
@@ -524,12 +608,26 @@ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
return res;
}
+static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp)
+{
+ pmd_t res = *pmdp;
+
+ native_pmd_clear(pmdp);
+ return res;
+}
+
static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep , pte_t pte)
{
native_set_pte(ptep, pte);
}
+static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp , pmd_t pmd)
+{
+ native_set_pmd(pmdp, pmd);
+}
+
#ifndef CONFIG_PARAVIRT
/*
* Rules for using pte_update - it must be called after any PTE update which
@@ -607,6 +705,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
#define flush_tlb_fix_spurious_fault(vma, address)
+#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
+
+#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+extern int pmdp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp,
+ pmd_t entry, int dirty);
+
+#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp);
+
+
+#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+extern void pmdp_splitting_flush(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMD_WRITE
+static inline int pmd_write(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_RW;
+}
+
+#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
+static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp)
+{
+ pmd_t pmd = native_pmdp_get_and_clear(pmdp);
+ pmd_update(mm, addr, pmdp);
+ return pmd;
+}
+
+#define __HAVE_ARCH_PMDP_SET_WRPROTECT
+static inline void pmdp_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
+ pmd_update(mm, addr, pmdp);
+}
+
/*
* clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
*
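Editorial note: taken together, mk_pmd(), the pmd_mk*() flag helpers further up and set_pmd_at() give the transparent-hugepage code everything it needs to build and install a huge pmd. A hedged composition sketch (the surrounding fault-path arguments are assumed, not defined by this patch):

/* Hedged sketch -- composing the new helpers the way a THP fault
 * path would; haddr is the 2MB-aligned faulting address. */
static void example_install_huge_pmd(struct mm_struct *mm,
				     struct vm_area_struct *vma,
				     unsigned long haddr, pmd_t *pmdp,
				     struct page *page)
{
	pmd_t entry;

	entry = mk_pmd(page, vma->vm_page_prot);
	entry = pmd_mkhuge(pmd_mkdirty(pmd_mkwrite(entry)));
	set_pmd_at(mm, haddr, pmdp, entry);
}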
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index f86da20347f2..975f709e09ae 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -59,6 +59,16 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
native_set_pte(ptep, pte);
}
+static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ *pmdp = pmd;
+}
+
+static inline void native_pmd_clear(pmd_t *pmd)
+{
+ native_set_pmd(pmd, native_make_pmd(0));
+}
+
static inline pte_t native_ptep_get_and_clear(pte_t *xp)
{
#ifdef CONFIG_SMP
@@ -72,14 +82,17 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
#endif
}
-static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
+static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
{
- *pmdp = pmd;
-}
-
-static inline void native_pmd_clear(pmd_t *pmd)
-{
- native_set_pmd(pmd, native_make_pmd(0));
+#ifdef CONFIG_SMP
+ return native_make_pmd(xchg(&xp->pmd, 0));
+#else
+ /* native_local_pmdp_get_and_clear,
+ but duplicated because of cyclic dependency */
+ pmd_t ret = *xp;
+ native_pmd_clear(xp);
+ return ret;
+#endif
}
static inline void native_set_pud(pud_t *pudp, pud_t pud)
@@ -168,6 +181,7 @@ extern void cleanup_highmap(void);
#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)
#define __HAVE_ARCH_PTE_SAME
+
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index d1f4a760be23..7db7723d1f32 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -22,6 +22,7 @@
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
+#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */
#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
/* If _PAGE_BIT_PRESENT is clear, we use these: */
@@ -45,6 +46,7 @@
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
+#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
#define __HAVE_ARCH_PTE_SPECIAL
#ifdef CONFIG_KMEMCHECK
@@ -70,6 +72,7 @@
/* Set of bits not changed in pte_modify */
#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
_PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
#define _PAGE_CACHE_WB (0)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c6efecf85a6a..45636cefa186 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -761,10 +761,11 @@ extern void select_idle_routine(const struct cpuinfo_x86 *c);
extern void init_c1e_mask(void);
extern unsigned long boot_option_idle_override;
-extern unsigned long idle_halt;
-extern unsigned long idle_nomwait;
extern bool c1e_detected;
+enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
+ IDLE_POLL, IDLE_FORCE_MWAIT};
+
extern void enable_sep_cpu(void);
extern int sysenter_setup(void);
@@ -901,7 +902,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
/*
* The below -8 is to reserve 8 bytes on top of the ring0 stack.
* This is necessary to guarantee that the entire "struct pt_regs"
- * is accessable even if the CPU haven't stored the SS/ESP registers
+ * is accessible even if the CPU hasn't stored the SS/ESP registers
* on the stack (interrupt gate does not save these registers
* when switching to the same priv ring).
* Therefore beware: accessing the ss/esp fields of the
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
new file mode 100644
index 000000000000..b4ec95f07518
--- /dev/null
+++ b/arch/x86/include/asm/prom.h
@@ -0,0 +1 @@
+/* dummy prom.h; here to make linux/of.h's #includes happy */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 0e831059ac5a..f2b83bc7d784 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -47,14 +47,13 @@ enum {
INTERCEPT_MONITOR,
INTERCEPT_MWAIT,
INTERCEPT_MWAIT_COND,
+ INTERCEPT_XSETBV,
};
struct __attribute__ ((__packed__)) vmcb_control_area {
- u16 intercept_cr_read;
- u16 intercept_cr_write;
- u16 intercept_dr_read;
- u16 intercept_dr_write;
+ u32 intercept_cr;
+ u32 intercept_dr;
u32 intercept_exceptions;
u64 intercept;
u8 reserved_1[42];
@@ -81,14 +80,19 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u32 event_inj_err;
u64 nested_cr3;
u64 lbr_ctl;
- u64 reserved_5;
+ u32 clean;
+ u32 reserved_5;
u64 next_rip;
- u8 reserved_6[816];
+ u8 insn_len;
+ u8 insn_bytes[15];
+ u8 reserved_6[800];
};
#define TLB_CONTROL_DO_NOTHING 0
#define TLB_CONTROL_FLUSH_ALL_ASID 1
+#define TLB_CONTROL_FLUSH_ASID 3
+#define TLB_CONTROL_FLUSH_ASID_LOCAL 7
#define V_TPR_MASK 0x0f
@@ -204,19 +208,31 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
#define SVM_SELECTOR_CODE_MASK (1 << 3)
-#define INTERCEPT_CR0_MASK 1
-#define INTERCEPT_CR3_MASK (1 << 3)
-#define INTERCEPT_CR4_MASK (1 << 4)
-#define INTERCEPT_CR8_MASK (1 << 8)
-
-#define INTERCEPT_DR0_MASK 1
-#define INTERCEPT_DR1_MASK (1 << 1)
-#define INTERCEPT_DR2_MASK (1 << 2)
-#define INTERCEPT_DR3_MASK (1 << 3)
-#define INTERCEPT_DR4_MASK (1 << 4)
-#define INTERCEPT_DR5_MASK (1 << 5)
-#define INTERCEPT_DR6_MASK (1 << 6)
-#define INTERCEPT_DR7_MASK (1 << 7)
+#define INTERCEPT_CR0_READ 0
+#define INTERCEPT_CR3_READ 3
+#define INTERCEPT_CR4_READ 4
+#define INTERCEPT_CR8_READ 8
+#define INTERCEPT_CR0_WRITE (16 + 0)
+#define INTERCEPT_CR3_WRITE (16 + 3)
+#define INTERCEPT_CR4_WRITE (16 + 4)
+#define INTERCEPT_CR8_WRITE (16 + 8)
+
+#define INTERCEPT_DR0_READ 0
+#define INTERCEPT_DR1_READ 1
+#define INTERCEPT_DR2_READ 2
+#define INTERCEPT_DR3_READ 3
+#define INTERCEPT_DR4_READ 4
+#define INTERCEPT_DR5_READ 5
+#define INTERCEPT_DR6_READ 6
+#define INTERCEPT_DR7_READ 7
+#define INTERCEPT_DR0_WRITE (16 + 0)
+#define INTERCEPT_DR1_WRITE (16 + 1)
+#define INTERCEPT_DR2_WRITE (16 + 2)
+#define INTERCEPT_DR3_WRITE (16 + 3)
+#define INTERCEPT_DR4_WRITE (16 + 4)
+#define INTERCEPT_DR5_WRITE (16 + 5)
+#define INTERCEPT_DR6_WRITE (16 + 6)
+#define INTERCEPT_DR7_WRITE (16 + 7)
#define SVM_EVTINJ_VEC_MASK 0xff
@@ -246,6 +262,8 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
+#define SVM_EXITINFO_REG_MASK 0x0F
+
#define SVM_EXIT_READ_CR0 0x000
#define SVM_EXIT_READ_CR3 0x003
#define SVM_EXIT_READ_CR4 0x004
@@ -316,6 +334,7 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_EXIT_MONITOR 0x08a
#define SVM_EXIT_MWAIT 0x08b
#define SVM_EXIT_MWAIT_COND 0x08c
+#define SVM_EXIT_XSETBV 0x08d
#define SVM_EXIT_NPF 0x400
#define SVM_EXIT_ERR -1
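Editorial note: with the four 16-bit read/write masks folded into two 32-bit words, a read intercept for register n is bit n and the matching write intercept is bit (16 + n), so one vector covers both directions. A hedged sketch of setting intercepts directly on the VMCB control area:

/* Hedged sketch -- programming intercepts with the new bit layout. */
static inline void example_set_intercepts(struct vmcb_control_area *control)
{
	control->intercept_cr |= (1U << INTERCEPT_CR0_READ)
			       | (1U << INTERCEPT_CR0_WRITE)
			       | (1U << INTERCEPT_CR3_WRITE)
			       | (1U << INTERCEPT_CR8_WRITE);

	control->intercept_dr |= (1U << INTERCEPT_DR7_READ)
			       | (1U << INTERCEPT_DR7_WRITE);
}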
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index f66cda56781d..0310da67307f 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -30,6 +30,7 @@ asmlinkage void segment_not_present(void);
asmlinkage void stack_segment(void);
asmlinkage void general_protection(void);
asmlinkage void page_fault(void);
+asmlinkage void async_page_fault(void);
asmlinkage void spurious_interrupt_bug(void);
asmlinkage void coprocessor_error(void);
asmlinkage void alignment_check(void);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 9f0cbd987d50..84471b810460 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -66,15 +66,23 @@
#define PIN_BASED_NMI_EXITING 0x00000008
#define PIN_BASED_VIRTUAL_NMIS 0x00000020
+#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002
#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
+#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000
#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
#define VM_EXIT_SAVE_IA32_PAT 0x00040000
#define VM_EXIT_LOAD_IA32_PAT 0x00080000
+#define VM_EXIT_SAVE_IA32_EFER 0x00100000
+#define VM_EXIT_LOAD_IA32_EFER 0x00200000
+#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002
#define VM_ENTRY_IA32E_MODE 0x00000200
#define VM_ENTRY_SMM 0x00000400
#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
+#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000
#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
+#define VM_ENTRY_LOAD_IA32_EFER 0x00008000
/* VMCS Encodings */
enum vmcs_field {
@@ -239,6 +247,7 @@ enum vmcs_field {
#define EXIT_REASON_TASK_SWITCH 9
#define EXIT_REASON_CPUID 10
#define EXIT_REASON_HLT 12
+#define EXIT_REASON_INVD 13
#define EXIT_REASON_INVLPG 14
#define EXIT_REASON_RDPMC 15
#define EXIT_REASON_RDTSC 16
@@ -296,6 +305,12 @@ enum vmcs_field {
#define GUEST_INTR_STATE_SMI 0x00000004
#define GUEST_INTR_STATE_NMI 0x00000008
+/* GUEST_ACTIVITY_STATE flags */
+#define GUEST_ACTIVITY_ACTIVE 0
+#define GUEST_ACTIVITY_HLT 1
+#define GUEST_ACTIVITY_SHUTDOWN 2
+#define GUEST_ACTIVITY_WAIT_SIPI 3
+
/*
* Exit Qualifications for MOV for Control Register Access
*/
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 396ff4cc8ed4..66d0fff1ee84 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -37,4 +37,39 @@
extern struct shared_info *HYPERVISOR_shared_info;
extern struct start_info *xen_start_info;
+#include <asm/processor.h>
+
+static inline uint32_t xen_cpuid_base(void)
+{
+ uint32_t base, eax, ebx, ecx, edx;
+ char signature[13];
+
+ for (base = 0x40000000; base < 0x40010000; base += 0x100) {
+ cpuid(base, &eax, &ebx, &ecx, &edx);
+ *(uint32_t *)(signature + 0) = ebx;
+ *(uint32_t *)(signature + 4) = ecx;
+ *(uint32_t *)(signature + 8) = edx;
+ signature[12] = 0;
+
+ if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
+ return base;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_XEN
+extern bool xen_hvm_need_lapic(void);
+
+static inline bool xen_x2apic_para_available(void)
+{
+ return xen_hvm_need_lapic();
+}
+#else
+static inline bool xen_x2apic_para_available(void)
+{
+ return (xen_cpuid_base() != 0);
+}
+#endif
+
#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 8760cc60a21c..f25bdf238a33 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -42,6 +42,11 @@ extern unsigned int machine_to_phys_order;
extern unsigned long get_phys_to_machine(unsigned long pfn);
extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+extern int m2p_add_override(unsigned long mfn, struct page *page);
+extern int m2p_remove_override(struct page *page);
+extern struct page *m2p_find_override(unsigned long mfn);
+extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
+
static inline unsigned long pfn_to_mfn(unsigned long pfn)
{
unsigned long mfn;
@@ -72,9 +77,6 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
if (xen_feature(XENFEAT_auto_translated_physmap))
return mfn;
- if (unlikely((mfn >> machine_to_phys_order) != 0))
- return ~0;
-
pfn = 0;
/*
* The array access can fail (e.g., device space beyond end of RAM).
@@ -83,6 +85,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
*/
__get_user(pfn, &machine_to_phys_mapping[mfn]);
+ /*
+ * If this appears to be a foreign mfn (because the pfn
+ * doesn't map back to the mfn), then check the local override
+ * table to see if there's a better pfn to use.
+ */
+ if (get_phys_to_machine(pfn) != mfn)
+ pfn = m2p_find_override_pfn(mfn, pfn);
+
return pfn;
}
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index ec881c6bfee0..b3a71137983a 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -509,6 +509,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
return 0;
}
+EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
{
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index d2fdb0826df2..57ca77787220 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1086,7 +1086,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
dma_dom->aperture_size += APERTURE_RANGE_SIZE;
- /* Intialize the exclusion range if necessary */
+ /* Initialize the exclusion range if necessary */
for_each_iommu(iommu) {
if (iommu->exclusion_start &&
iommu->exclusion_start >= dma_dom->aperture[index]->offset
@@ -1353,7 +1353,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
/*
* Allocates a new protection domain usable for the dma_ops functions.
- * It also intializes the page table and the address allocator data
+ * It also initializes the page table and the address allocator data
* structures required for the dma_ops interface
*/
static struct dma_ops_domain *dma_ops_domain_alloc(void)
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index affacb5e0065..0a99f7198bc3 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -20,6 +20,13 @@ struct pci_device_id amd_nb_misc_ids[] = {
};
EXPORT_SYMBOL(amd_nb_misc_ids);
+const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
+ { 0x00, 0x18, 0x20 },
+ { 0xff, 0x00, 0x20 },
+ { 0xfe, 0x00, 0x20 },
+ { }
+};
+
struct amd_northbridge_info amd_northbridges;
EXPORT_SYMBOL(amd_northbridges);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 7c9ab59653e8..51ef31a89be9 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -313,14 +313,16 @@ static void apbt_setup_irq(struct apbt_dev *adev)
if (adev->irq == 0)
return;
+ irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
+ irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
+ /* APB timer irqs are set up as mp_irqs, timer is edge type */
+ __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge");
+
if (system_state == SYSTEM_BOOTING) {
- irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
- irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
- /* APB timer irqs are set up as mp_irqs, timer is edge type */
- __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge");
if (request_irq(adev->irq, apbt_interrupt_handler,
- IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
- adev->name, adev)) {
+ IRQF_TIMER | IRQF_DISABLED |
+ IRQF_NOBALANCING,
+ adev->name, adev)) {
printk(KERN_ERR "Failed request IRQ for APBT%d\n",
adev->num);
}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index dcd7c83e1659..5955a7800a96 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -39,18 +39,6 @@ int fallback_aper_force __initdata;
int fix_aperture __initdata = 1;
-struct bus_dev_range {
- int bus;
- int dev_base;
- int dev_limit;
-};
-
-static struct bus_dev_range bus_dev_ranges[] __initdata = {
- { 0x00, 0x18, 0x20},
- { 0xff, 0x00, 0x20},
- { 0xfe, 0x00, 0x20}
-};
-
static struct resource gart_resource = {
.name = "GART",
.flags = IORESOURCE_MEM,
@@ -294,13 +282,13 @@ void __init early_gart_iommu_check(void)
search_agp_bridge(&agp_aper_order, &valid_agp);
fix = 0;
- for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) {
int bus;
int dev_base, dev_limit;
- bus = bus_dev_ranges[i].bus;
- dev_base = bus_dev_ranges[i].dev_base;
- dev_limit = bus_dev_ranges[i].dev_limit;
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
@@ -349,13 +337,13 @@ void __init early_gart_iommu_check(void)
return;
/* disable them all at first */
- for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
int bus;
int dev_base, dev_limit;
- bus = bus_dev_ranges[i].bus;
- dev_base = bus_dev_ranges[i].dev_base;
- dev_limit = bus_dev_ranges[i].dev_limit;
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
@@ -390,14 +378,14 @@ int __init gart_iommu_hole_init(void)
fix = 0;
node = 0;
- for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
int bus;
int dev_base, dev_limit;
u32 ctl;
- bus = bus_dev_ranges[i].bus;
- dev_base = bus_dev_ranges[i].dev_base;
- dev_limit = bus_dev_ranges[i].dev_limit;
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
@@ -505,7 +493,7 @@ out:
}
/* Fix up the north bridges */
- for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
int bus, dev_base, dev_limit;
/*
@@ -514,9 +502,9 @@ out:
*/
u32 ctl = DISTLBWALKPRB | aper_order << 1;
- bus = bus_dev_ranges[i].bus;
- dev_base = bus_dev_ranges[i].dev_base;
- dev_limit = bus_dev_ranges[i].dev_limit;
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 79e6baa8aa0a..06c196d7e59c 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -49,8 +49,8 @@
#include <asm/mtrr.h>
#include <asm/smp.h>
#include <asm/mce.h>
-#include <asm/kvm_para.h>
#include <asm/tsc.h>
+#include <asm/hypervisor.h>
unsigned int num_processors;
@@ -684,7 +684,7 @@ static int __init calibrate_APIC_clock(void)
lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
lapic_clockevent.shift);
lapic_clockevent.max_delta_ns =
- clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
+ clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent);
lapic_clockevent.min_delta_ns =
clockevent_delta2ns(0xF, &lapic_clockevent);
@@ -1476,7 +1476,8 @@ void __init enable_IR_x2apic(void)
/* IR is required if there is APIC ID > 255 even when running
* under KVM
*/
- if (max_physical_apicid > 255 || !kvm_para_available())
+ if (max_physical_apicid > 255 ||
+ !hypervisor_x2apic_available())
goto nox2apic;
/*
* without IR all CPUs can be addressed by IOAPIC/MSI
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 72ec29e1ae06..79fd43ca6f96 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -68,7 +68,6 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
switch (cmd) {
case DIE_NMI:
- case DIE_NMI_IPI:
break;
default:
@@ -96,7 +95,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
static __read_mostly struct notifier_block backtrace_notifier = {
.notifier_call = arch_trigger_all_cpu_backtrace_handler,
.next = NULL,
- .priority = 1
+ .priority = NMI_LOCAL_LOW_PRIOR,
};
static int __init register_trigger_all_cpu_backtrace(void)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index ecca5f41ad2c..bd16b58b8850 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -378,7 +378,7 @@ struct apic __refdata apic_x2apic_uv_x = {
static __cpuinit void set_x2apic_extra_bits(int pnode)
{
- __this_cpu_write(x2apic_extra_bits, (pnode << 6));
+ __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift);
}
/*
@@ -641,7 +641,7 @@ void __cpuinit uv_cpu_init(void)
*/
int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
{
- if (reason != DIE_NMI_IPI)
+ if (reason != DIE_NMIUNKNOWN)
return NOTIFY_OK;
if (in_crash_kexec)
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 7283e98deaae..ec2c19a7b8ef 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -45,6 +45,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
{ 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */
{ 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */
{ 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */
+ { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */
{ 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */
{ 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
{ 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
@@ -66,6 +67,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
{ 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */
{ 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */
{ 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */
+ { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */
{ 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
{ 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */
{ 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
@@ -87,6 +89,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
{ 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
{ 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */
{ 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
+ { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */
{ 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
{ 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
{ 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index e7dbde7bfedb..a77971979564 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -25,6 +25,7 @@
#include <linux/gfp.h>
#include <asm/mce.h>
#include <asm/apic.h>
+#include <asm/nmi.h>
/* Update fake mce registers on current CPU. */
static void inject_mce(struct mce *m)
@@ -83,7 +84,7 @@ static int mce_raise_notify(struct notifier_block *self,
struct die_args *args = (struct die_args *)data;
int cpu = smp_processor_id();
struct mce *m = &__get_cpu_var(injectm);
- if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
+ if (val != DIE_NMI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
return NOTIFY_DONE;
cpumask_clear_cpu(cpu, mce_inject_cpumask);
if (m->inject_flags & MCJ_EXCEPTION)
@@ -95,7 +96,7 @@ static int mce_raise_notify(struct notifier_block *self,
static struct notifier_block mce_raise_nb = {
.notifier_call = mce_raise_notify,
- .priority = 1000,
+ .priority = NMI_LOCAL_NORMAL_PRIOR,
};
/* Inject mce on current CPU */
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index e12246ff5aa6..6f8c5e9da97f 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -59,6 +59,7 @@ struct thermal_state {
/* Callback to handle core threshold interrupts */
int (*platform_thermal_notify)(__u64 msr_val);
+EXPORT_SYMBOL(platform_thermal_notify);
static DEFINE_PER_CPU(struct thermal_state, thermal_state);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 04921017abe0..9d977a2ea693 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1267,7 +1267,6 @@ perf_event_nmi_handler(struct notifier_block *self,
switch (cmd) {
case DIE_NMI:
- case DIE_NMI_IPI:
break;
case DIE_NMIUNKNOWN:
this_nmi = percpu_read(irq_stat.__nmi_count);
@@ -1317,7 +1316,7 @@ perf_event_nmi_handler(struct notifier_block *self,
static __read_mostly struct notifier_block perf_event_nmi_notifier = {
.notifier_call = perf_event_nmi_handler,
.next = NULL,
- .priority = 1
+ .priority = NMI_LOCAL_LOW_PRIOR,
};
static struct event_constraint unconstrained;
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 81400b93e694..e56b9bfbabd1 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -753,19 +753,21 @@ out:
static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
{
- int overflow = 0;
- u32 low, high;
+ u64 v;
- rdmsr(hwc->config_base + hwc->idx, low, high);
-
- /* we need to check high bit for unflagged overflows */
- if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) {
- overflow = 1;
- (void)checking_wrmsrl(hwc->config_base + hwc->idx,
- ((u64)low) & ~P4_CCCR_OVF);
+ /* the official way to detect an overflow */
+ rdmsrl(hwc->config_base + hwc->idx, v);
+ if (v & P4_CCCR_OVF) {
+ wrmsrl(hwc->config_base + hwc->idx, v & ~P4_CCCR_OVF);
+ return 1;
}
- return overflow;
+ /* it might be an unflagged overflow */
+ rdmsrl(hwc->event_base + hwc->idx, v);
+ if (!(v & ARCH_P4_CNTRVAL_MASK))
+ return 1;
+
+ return 0;
}
static void p4_pmu_disable_pebs(void)
@@ -1152,9 +1154,9 @@ static __initconst const struct x86_pmu p4_pmu = {
*/
.num_counters = ARCH_P4_MAX_CCCR,
.apic = 1,
- .cntval_bits = 40,
- .cntval_mask = (1ULL << 40) - 1,
- .max_period = (1ULL << 39) - 1,
+ .cntval_bits = ARCH_P4_CNTRVAL_BITS,
+ .cntval_mask = ARCH_P4_CNTRVAL_MASK,
+ .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
.hw_config = p4_hw_config,
.schedule_events = p4_pmu_schedule_events,
/*
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 8474c998cbd4..df20723a6a1b 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -197,14 +197,8 @@ void show_stack(struct task_struct *task, unsigned long *sp)
*/
void dump_stack(void)
{
- unsigned long bp = 0;
unsigned long stack;
-#ifdef CONFIG_FRAME_POINTER
- if (!bp)
- get_bp(bp);
-#endif
-
printk("Pid: %d, comm: %.20s %s %s %.*s\n",
current->pid, current->comm, print_tainted(),
init_utsname()->release,
@@ -240,6 +234,7 @@ unsigned __kprobes long oops_begin(void)
bust_spinlocks(1);
return flags;
}
+EXPORT_SYMBOL_GPL(oops_begin);
void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
{
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 0c2b7ef7a34d..294f26da0c0c 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -14,6 +14,7 @@
#include <linux/bootmem.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
+#include <linux/acpi.h>
#include <linux/firmware-map.h>
#include <linux/memblock.h>
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 591e60104278..c8b4efad7ebb 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1406,6 +1406,16 @@ ENTRY(general_protection)
CFI_ENDPROC
END(general_protection)
+#ifdef CONFIG_KVM_GUEST
+ENTRY(async_page_fault)
+ RING0_EC_FRAME
+ pushl $do_async_page_fault
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(async_page_fault)
+#endif
+
/*
* End of kprobes section
*/
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e3ba417e8697..aed1ffbeb0c9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -299,17 +299,21 @@ ENDPROC(native_usergs_sysret64)
ENTRY(save_args)
XCPT_FRAME
cld
- movq_cfi rdi, RDI+16-ARGOFFSET
- movq_cfi rsi, RSI+16-ARGOFFSET
- movq_cfi rdx, RDX+16-ARGOFFSET
- movq_cfi rcx, RCX+16-ARGOFFSET
- movq_cfi rax, RAX+16-ARGOFFSET
- movq_cfi r8, R8+16-ARGOFFSET
- movq_cfi r9, R9+16-ARGOFFSET
- movq_cfi r10, R10+16-ARGOFFSET
- movq_cfi r11, R11+16-ARGOFFSET
-
- leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */
+ /*
+ * start from rbp in pt_regs and jump over
+ * return address.
+ */
+ movq_cfi rdi, RDI+8-RBP
+ movq_cfi rsi, RSI+8-RBP
+ movq_cfi rdx, RDX+8-RBP
+ movq_cfi rcx, RCX+8-RBP
+ movq_cfi rax, RAX+8-RBP
+ movq_cfi r8, R8+8-RBP
+ movq_cfi r9, R9+8-RBP
+ movq_cfi r10, R10+8-RBP
+ movq_cfi r11, R11+8-RBP
+
+ leaq -RBP+8(%rsp),%rdi /* arg1 for handler */
movq_cfi rbp, 8 /* push %rbp */
leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
testl $3, CS(%rdi)
@@ -782,8 +786,9 @@ END(interrupt)
/* 0(%rsp): ~(interrupt number) */
.macro interrupt func
- subq $ORIG_RAX-ARGOFFSET+8, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-ARGOFFSET+8
+ /* reserve pt_regs for scratch regs and rbp */
+ subq $ORIG_RAX-RBP, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
call save_args
PARTIAL_FRAME 0
call \func
@@ -808,9 +813,14 @@ ret_from_intr:
TRACE_IRQS_OFF
decl PER_CPU_VAR(irq_count)
leaveq
+
CFI_RESTORE rbp
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET -8
+
+ /* we did not save rbx, restore only from ARGOFFSET */
+ addq $8, %rsp
+ CFI_ADJUST_CFA_OFFSET -8
exit_intr:
GET_THREAD_INFO(%rcx)
testl $3,CS-ARGOFFSET(%rsp)
@@ -1319,6 +1329,9 @@ errorentry xen_stack_segment do_stack_segment
#endif
errorentry general_protection do_general_protection
errorentry page_fault do_page_fault
+#ifdef CONFIG_KVM_GUEST
+errorentry async_page_fault do_async_page_fault
+#endif
#ifdef CONFIG_X86_MCE
paranoidzeroentry machine_check *machine_check_vector(%rip)
#endif
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 9f54b209c378..fc293dc8dc35 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -126,7 +126,7 @@ ENTRY(startup_32)
movsl
movl pa(boot_params) + NEW_CL_POINTER,%esi
andl %esi,%esi
- jz 1f # No comand line
+ jz 1f # No command line
movl $pa(boot_command_line),%edi
movl $(COMMAND_LINE_SIZE/4),%ecx
rep
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 58bb239a2fd7..e60c38cc0eed 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -169,6 +169,7 @@ int init_fpu(struct task_struct *tsk)
set_stopped_child_used_math(tsk);
return 0;
}
+EXPORT_SYMBOL_GPL(init_fpu);
/*
* The xstateregs_active() routine is the same as the fpregs_active() routine,
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3a43caa3beb7..52945da52a94 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -4,6 +4,7 @@
#include <linux/cpu.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
+#include <linux/of.h>
#include <linux/seq_file.h>
#include <linux/smp.h>
#include <linux/ftrace.h>
@@ -275,6 +276,15 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
+#ifdef CONFIG_OF
+unsigned int irq_create_of_mapping(struct device_node *controller,
+ const u32 *intspec, unsigned int intsize)
+{
+ return intspec[0];
+}
+EXPORT_SYMBOL_GPL(irq_create_of_mapping);
+#endif
+
#ifdef CONFIG_HOTPLUG_CPU
/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
void fixup_irqs(void)
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 48ff6dcffa02..9974d21048fd 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -129,8 +129,7 @@ void __cpuinit irq_ctx_init(int cpu)
irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
THREAD_FLAGS,
THREAD_ORDER));
- irqctx->tinfo.task = NULL;
- irqctx->tinfo.exec_domain = NULL;
+ memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
irqctx->tinfo.cpu = cpu;
irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
@@ -140,10 +139,8 @@ void __cpuinit irq_ctx_init(int cpu)
irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
THREAD_FLAGS,
THREAD_ORDER));
- irqctx->tinfo.task = NULL;
- irqctx->tinfo.exec_domain = NULL;
+ memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
irqctx->tinfo.cpu = cpu;
- irqctx->tinfo.preempt_count = 0;
irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
per_cpu(softirq_ctx, cpu) = irqctx;
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index cd21b654dec6..a4130005028a 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -48,6 +48,7 @@
#include <asm/apicdef.h>
#include <asm/system.h>
#include <asm/apic.h>
+#include <asm/nmi.h>
struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
{
@@ -525,10 +526,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
}
return NOTIFY_DONE;
- case DIE_NMI_IPI:
- /* Just ignore, we will handle the roundup on DIE_NMI. */
- return NOTIFY_DONE;
-
case DIE_NMIUNKNOWN:
if (was_in_debug_nmi[raw_smp_processor_id()]) {
was_in_debug_nmi[raw_smp_processor_id()] = 0;
@@ -606,7 +603,7 @@ static struct notifier_block kgdb_notifier = {
/*
* Lowest-prio notifier priority, we want to be notified last:
*/
- .priority = -INT_MAX,
+ .priority = NMI_LOCAL_LOW_PRIOR,
};
/**
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 63b0ec8d3d4a..8dc44662394b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,16 +27,37 @@
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/hash.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/kprobes.h>
#include <asm/timer.h>
+#include <asm/cpu.h>
+#include <asm/traps.h>
+#include <asm/desc.h>
+#include <asm/tlbflush.h>
#define MMU_QUEUE_SIZE 1024
+static int kvmapf = 1;
+
+static int parse_no_kvmapf(char *arg)
+{
+ kvmapf = 0;
+ return 0;
+}
+
+early_param("no-kvmapf", parse_no_kvmapf);
+
struct kvm_para_state {
u8 mmu_queue[MMU_QUEUE_SIZE];
int mmu_queue_len;
};
static DEFINE_PER_CPU(struct kvm_para_state, para_state);
+static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
static struct kvm_para_state *kvm_para_state(void)
{
@@ -50,6 +71,195 @@ static void kvm_io_delay(void)
{
}
+#define KVM_TASK_SLEEP_HASHBITS 8
+#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
+
+struct kvm_task_sleep_node {
+ struct hlist_node link;
+ wait_queue_head_t wq;
+ u32 token;
+ int cpu;
+ bool halted;
+ struct mm_struct *mm;
+};
+
+static struct kvm_task_sleep_head {
+ spinlock_t lock;
+ struct hlist_head list;
+} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
+
+static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
+ u32 token)
+{
+ struct hlist_node *p;
+
+ hlist_for_each(p, &b->list) {
+ struct kvm_task_sleep_node *n =
+ hlist_entry(p, typeof(*n), link);
+ if (n->token == token)
+ return n;
+ }
+
+ return NULL;
+}
+
+void kvm_async_pf_task_wait(u32 token)
+{
+ u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+ struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+ struct kvm_task_sleep_node n, *e;
+ DEFINE_WAIT(wait);
+ int cpu, idle;
+
+ cpu = get_cpu();
+ idle = idle_cpu(cpu);
+ put_cpu();
+
+ spin_lock(&b->lock);
+ e = _find_apf_task(b, token);
+ if (e) {
+ /* dummy entry exists -> wakeup was delivered ahead of PF */
+ hlist_del(&e->link);
+ kfree(e);
+ spin_unlock(&b->lock);
+ return;
+ }
+
+ n.token = token;
+ n.cpu = smp_processor_id();
+ n.mm = current->active_mm;
+ n.halted = idle || preempt_count() > 1;
+ atomic_inc(&n.mm->mm_count);
+ init_waitqueue_head(&n.wq);
+ hlist_add_head(&n.link, &b->list);
+ spin_unlock(&b->lock);
+
+ for (;;) {
+ if (!n.halted)
+ prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+ if (hlist_unhashed(&n.link))
+ break;
+
+ if (!n.halted) {
+ local_irq_enable();
+ schedule();
+ local_irq_disable();
+ } else {
+ /*
+ * We cannot reschedule. So halt.
+ */
+ native_safe_halt();
+ local_irq_disable();
+ }
+ }
+ if (!n.halted)
+ finish_wait(&n.wq, &wait);
+
+ return;
+}
+EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
+
+static void apf_task_wake_one(struct kvm_task_sleep_node *n)
+{
+ hlist_del_init(&n->link);
+ if (!n->mm)
+ return;
+ mmdrop(n->mm);
+ if (n->halted)
+ smp_send_reschedule(n->cpu);
+ else if (waitqueue_active(&n->wq))
+ wake_up(&n->wq);
+}
+
+static void apf_task_wake_all(void)
+{
+ int i;
+
+ for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
+ struct hlist_node *p, *next;
+ struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
+ spin_lock(&b->lock);
+ hlist_for_each_safe(p, next, &b->list) {
+ struct kvm_task_sleep_node *n =
+ hlist_entry(p, typeof(*n), link);
+ if (n->cpu == smp_processor_id())
+ apf_task_wake_one(n);
+ }
+ spin_unlock(&b->lock);
+ }
+}
+
+void kvm_async_pf_task_wake(u32 token)
+{
+ u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+ struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+ struct kvm_task_sleep_node *n;
+
+ if (token == ~0) {
+ apf_task_wake_all();
+ return;
+ }
+
+again:
+ spin_lock(&b->lock);
+ n = _find_apf_task(b, token);
+ if (!n) {
+ /*
+ * async PF was not yet handled.
+ * Add dummy entry for the token.
+ */
+ n = kmalloc(sizeof(*n), GFP_ATOMIC);
+ if (!n) {
+ /*
+ * Allocation failed! Busy-wait while another cpu
+ * handles the async PF.
+ */
+ spin_unlock(&b->lock);
+ cpu_relax();
+ goto again;
+ }
+ n->token = token;
+ n->cpu = smp_processor_id();
+ n->mm = NULL;
+ init_waitqueue_head(&n->wq);
+ hlist_add_head(&n->link, &b->list);
+ } else
+ apf_task_wake_one(n);
+ spin_unlock(&b->lock);
+ return;
+}
+EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
+
+u32 kvm_read_and_reset_pf_reason(void)
+{
+ u32 reason = 0;
+
+ if (__get_cpu_var(apf_reason).enabled) {
+ reason = __get_cpu_var(apf_reason).reason;
+ __get_cpu_var(apf_reason).reason = 0;
+ }
+
+ return reason;
+}
+EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
+
+dotraplinkage void __kprobes
+do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ switch (kvm_read_and_reset_pf_reason()) {
+ default:
+ do_page_fault(regs, error_code);
+ break;
+ case KVM_PV_REASON_PAGE_NOT_PRESENT:
+ /* page is swapped out by the host. */
+ kvm_async_pf_task_wait((u32)read_cr2());
+ break;
+ case KVM_PV_REASON_PAGE_READY:
+ kvm_async_pf_task_wake((u32)read_cr2());
+ break;
+ }
+}
+
static void kvm_mmu_op(void *buffer, unsigned len)
{
int r;
@@ -231,10 +441,117 @@ static void __init paravirt_ops_setup(void)
#endif
}
+void __cpuinit kvm_guest_cpu_init(void)
+{
+ if (!kvm_para_available())
+ return;
+
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
+ u64 pa = __pa(&__get_cpu_var(apf_reason));
+
+#ifdef CONFIG_PREEMPT
+ pa |= KVM_ASYNC_PF_SEND_ALWAYS;
+#endif
+ wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
+ __get_cpu_var(apf_reason).enabled = 1;
+ printk(KERN_INFO"KVM setup async PF for cpu %d\n",
+ smp_processor_id());
+ }
+}
+
+static void kvm_pv_disable_apf(void *unused)
+{
+ if (!__get_cpu_var(apf_reason).enabled)
+ return;
+
+ wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
+ __get_cpu_var(apf_reason).enabled = 0;
+
+ printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
+ smp_processor_id());
+}
+
+static int kvm_pv_reboot_notify(struct notifier_block *nb,
+ unsigned long code, void *unused)
+{
+ if (code == SYS_RESTART)
+ on_each_cpu(kvm_pv_disable_apf, NULL, 1);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block kvm_pv_reboot_nb = {
+ .notifier_call = kvm_pv_reboot_notify,
+};
+
+#ifdef CONFIG_SMP
+static void __init kvm_smp_prepare_boot_cpu(void)
+{
+#ifdef CONFIG_KVM_CLOCK
+ WARN_ON(kvm_register_clock("primary cpu clock"));
+#endif
+ kvm_guest_cpu_init();
+ native_smp_prepare_boot_cpu();
+}
+
+static void kvm_guest_cpu_online(void *dummy)
+{
+ kvm_guest_cpu_init();
+}
+
+static void kvm_guest_cpu_offline(void *dummy)
+{
+ kvm_pv_disable_apf(NULL);
+ apf_task_wake_all();
+}
+
+static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ int cpu = (unsigned long)hcpu;
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ case CPU_ONLINE_FROZEN:
+ smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
+ break;
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
+ .notifier_call = kvm_cpu_notify,
+};
+#endif
+
+static void __init kvm_apf_trap_init(void)
+{
+ set_intr_gate(14, &async_page_fault);
+}
+
void __init kvm_guest_init(void)
{
+ int i;
+
if (!kvm_para_available())
return;
paravirt_ops_setup();
+ register_reboot_notifier(&kvm_pv_reboot_nb);
+ for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
+ spin_lock_init(&async_pf_sleepers[i].lock);
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
+ x86_init.irqs.trap_init = kvm_apf_trap_init;
+
+#ifdef CONFIG_SMP
+ smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
+ register_cpu_notifier(&kvm_cpu_notifier);
+#else
+ kvm_guest_cpu_init();
+#endif
}
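The guest/host async-PF contract that kvm_guest_cpu_init() and do_async_page_fault() above rely on is declared in asm/kvm_para.h (also touched by this series, per the diffstat). A minimal sketch of that interface — the flag and field names are taken from the code above, while the MSR index, bit positions and padding are assumptions shown only for illustration:

	/* Assumed layout; see asm/kvm_para.h in this series for the real one. */
	#define MSR_KVM_ASYNC_PF_EN		0x4b564d02	/* assumed MSR index */
	#define KVM_ASYNC_PF_ENABLED		(1 << 0)	/* assumed bit */
	#define KVM_ASYNC_PF_SEND_ALWAYS	(1 << 1)	/* assumed bit; set under CONFIG_PREEMPT */

	#define KVM_PV_REASON_PAGE_NOT_PRESENT	1	/* host paged the page out */
	#define KVM_PV_REASON_PAGE_READY	2	/* host finished bringing it back */

	/* Per-cpu block whose physical address is written to the MSR;
	 * the host stores the reason here before injecting the async #PF. */
	struct kvm_vcpu_pv_apf_data {
		__u32 reason;		/* read by kvm_read_and_reset_pf_reason() */
		__u8  pad[60];		/* assumed 64-byte total size */
		__u32 enabled;		/* guest-side flag, cf. apf_reason.enabled */
	};

do_async_page_fault() dispatches on the reason code: PAGE_NOT_PRESENT parks the task in kvm_async_pf_task_wait(), PAGE_READY releases it via kvm_async_pf_task_wake(), and zero (no async PF pending) falls through to the ordinary do_page_fault().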
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index ca43ce31a19c..f98d3eafe07a 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -125,7 +125,7 @@ static struct clocksource kvm_clock = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
-static int kvm_register_clock(char *txt)
+int kvm_register_clock(char *txt)
{
int cpu = smp_processor_id();
int low, high, ret;
@@ -152,14 +152,6 @@ static void __cpuinit kvm_setup_secondary_clock(void)
}
#endif
-#ifdef CONFIG_SMP
-static void __init kvm_smp_prepare_boot_cpu(void)
-{
- WARN_ON(kvm_register_clock("primary cpu clock"));
- native_smp_prepare_boot_cpu();
-}
-#endif
-
/*
* After the clock is registered, the host will keep writing to the
* registered memory location. If the guest happens to shutdown, this memory
@@ -206,9 +198,6 @@ void __init kvmclock_init(void)
x86_cpuinit.setup_percpu_clockev =
kvm_setup_secondary_clock;
#endif
-#ifdef CONFIG_SMP
- smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
-#endif
machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC
machine_ops.crash_shutdown = kvm_crash_shutdown;
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 8f2956091735..ab23f1ad4bf1 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -37,20 +37,11 @@
void *module_alloc(unsigned long size)
{
- struct vm_struct *area;
-
- if (!size)
- return NULL;
- size = PAGE_ALIGN(size);
- if (size > MODULES_LEN)
+ if (PAGE_ALIGN(size) > MODULES_LEN)
return NULL;
-
- area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
- if (!area)
- return NULL;
-
- return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM,
- PAGE_KERNEL_EXEC);
+ return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+ GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
+ -1, __builtin_return_address(0));
}
/* Free memory returned from module_alloc */
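The rewritten module_alloc() above leans on __vmalloc_node_range(); its prototype at the time of this series is roughly the following (a sketch — parameter names are illustrative, not quoted from mm/vmalloc.c):

	/* Allocate 'size' bytes of virtually contiguous, 'prot'-mapped memory
	 * from the [start, end) window, on NUMA node 'node' (-1 = any),
	 * recording 'caller' for /proc/vmallocinfo. */
	void *__vmalloc_node_range(unsigned long size, unsigned long align,
				   unsigned long start, unsigned long end,
				   gfp_t gfp_mask, pgprot_t prot, int node,
				   void *caller);

So the new module_alloc() requests an executable mapping anywhere in [MODULES_VADDR, MODULES_END), on any node, attributed to its caller — the same behaviour the removed __get_vm_area()/__vmalloc_area() pair implemented by hand.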
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c5b250011fd4..869e1aeeb71b 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -421,8 +421,11 @@ struct pv_mmu_ops pv_mmu_ops = {
.set_pte = native_set_pte,
.set_pte_at = native_set_pte_at,
.set_pmd = native_set_pmd,
+ .set_pmd_at = native_set_pmd_at,
.pte_update = paravirt_nop,
.pte_update_defer = paravirt_nop,
+ .pmd_update = paravirt_nop,
+ .pmd_update_defer = paravirt_nop,
.ptep_modify_prot_start = __ptep_modify_prot_start,
.ptep_modify_prot_commit = __ptep_modify_prot_commit,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 09c08a1c706f..e764fc05d700 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -14,6 +14,7 @@
#include <linux/utsname.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
+#include <asm/cpu.h>
#include <asm/system.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
@@ -22,11 +23,6 @@
#include <asm/i387.h>
#include <asm/debugreg.h>
-unsigned long idle_halt;
-EXPORT_SYMBOL(idle_halt);
-unsigned long idle_nomwait;
-EXPORT_SYMBOL(idle_nomwait);
-
struct kmem_cache *task_xstate_cachep;
EXPORT_SYMBOL_GPL(task_xstate_cachep);
@@ -327,7 +323,7 @@ long sys_execve(const char __user *name,
/*
* Idle related variables and functions
*/
-unsigned long boot_option_idle_override = 0;
+unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);
/*
@@ -386,6 +382,8 @@ void default_idle(void)
else
local_irq_enable();
current_thread_info()->status |= TS_POLLING;
+ trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
} else {
local_irq_enable();
/* loop is done by the caller */
@@ -443,8 +441,6 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
*/
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
- trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
- trace_cpu_idle((ax>>4)+1, smp_processor_id());
if (!need_resched()) {
if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
@@ -471,6 +467,8 @@ static void mwait_idle(void)
__sti_mwait(0, 0);
else
local_irq_enable();
+ trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
} else
local_irq_enable();
}
@@ -503,17 +501,16 @@ static void poll_idle(void)
*
* idle=mwait overrides this decision and forces the usage of mwait.
*/
-static int __cpuinitdata force_mwait;
#define MWAIT_INFO 0x05
#define MWAIT_ECX_EXTENDED_INFO 0x01
#define MWAIT_EDX_C1 0xf0
-static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
+int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
- if (force_mwait)
+ if (boot_option_idle_override == IDLE_FORCE_MWAIT)
return 1;
if (c->cpuid_level < MWAIT_INFO)
@@ -633,9 +630,10 @@ static int __init idle_setup(char *str)
if (!strcmp(str, "poll")) {
printk("using polling idle threads.\n");
pm_idle = poll_idle;
- } else if (!strcmp(str, "mwait"))
- force_mwait = 1;
- else if (!strcmp(str, "halt")) {
+ boot_option_idle_override = IDLE_POLL;
+ } else if (!strcmp(str, "mwait")) {
+ boot_option_idle_override = IDLE_FORCE_MWAIT;
+ } else if (!strcmp(str, "halt")) {
/*
* When the boot option of idle=halt is added, halt is
* forced to be used for CPU idle. In such case CPU C2/C3
@@ -644,8 +642,7 @@ static int __init idle_setup(char *str)
* the boot_option_idle_override.
*/
pm_idle = default_idle;
- idle_halt = 1;
- return 0;
+ boot_option_idle_override = IDLE_HALT;
} else if (!strcmp(str, "nomwait")) {
/*
* If the boot option of "idle=nomwait" is added,
@@ -653,12 +650,10 @@ static int __init idle_setup(char *str)
* states. In such case it won't touch the variable
* of boot_option_idle_override.
*/
- idle_nomwait = 1;
- return 0;
+ boot_option_idle_override = IDLE_NOMWAIT;
} else
return -1;
- boot_option_idle_override = 1;
return 0;
}
early_param("idle", idle_setup);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4b9befa0e347..8d128783af47 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -57,8 +57,6 @@
#include <asm/syscalls.h>
#include <asm/debugreg.h>
-#include <trace/events/power.h>
-
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
/*
@@ -113,8 +111,6 @@ void cpu_idle(void)
stop_critical_timings();
pm_idle();
start_critical_timings();
- trace_power_end(smp_processor_id());
- trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}
tick_nohz_restart_sched_tick();
preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 4c818a738396..bd387e8f73b4 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -51,8 +51,6 @@
#include <asm/syscalls.h>
#include <asm/debugreg.h>
-#include <trace/events/power.h>
-
asmlinkage extern void ret_from_fork(void);
DEFINE_PER_CPU(unsigned long, old_rsp);
@@ -141,10 +139,6 @@ void cpu_idle(void)
pm_idle();
start_critical_timings();
- trace_power_end(smp_processor_id());
- trace_cpu_idle(PWR_EVENT_EXIT,
- smp_processor_id());
-
/* In many cases the interrupt that ended idle
has already called exit_idle. But some idle
loops can be woken up without interrupt. */
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index c495aa8d4815..fc7aae1e2bc7 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -18,6 +18,7 @@
#include <asm/pci_x86.h>
#include <asm/virtext.h>
#include <asm/cpu.h>
+#include <asm/nmi.h>
#ifdef CONFIG_X86_32
# include <linux/ctype.h>
@@ -747,7 +748,7 @@ static int crash_nmi_callback(struct notifier_block *self,
{
int cpu;
- if (val != DIE_NMI_IPI)
+ if (val != DIE_NMI)
return NOTIFY_OK;
cpu = raw_smp_processor_id();
@@ -778,6 +779,8 @@ static void smp_send_nmi_allbutself(void)
static struct notifier_block crash_nmi_nb = {
.notifier_call = crash_nmi_callback,
+ /* we want to be the first one called */
+ .priority = NMI_LOCAL_HIGH_PRIOR+1,
};
/* Halt all other CPUs, calling the specified function on each of them
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 1cfbbfc3ae26..6f39cab052d5 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -76,7 +76,7 @@ int mach_set_rtc_mmss(unsigned long nowtime)
CMOS_WRITE(real_seconds, RTC_SECONDS);
CMOS_WRITE(real_minutes, RTC_MINUTES);
} else {
- printk(KERN_WARNING
+ printk_once(KERN_NOTICE
"set_rtc_mmss: can't update from %d to %d\n",
cmos_minutes, real_minutes);
retval = -1;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c7149c96d079..0cbe8c0b35ed 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -97,12 +97,12 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
*/
static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
-void cpu_hotplug_driver_lock()
+void cpu_hotplug_driver_lock(void)
{
mutex_lock(&x86_cpu_hotplug_driver_mutex);
}
-void cpu_hotplug_driver_unlock()
+void cpu_hotplug_driver_unlock(void)
{
mutex_unlock(&x86_cpu_hotplug_driver_mutex);
}
@@ -1402,8 +1402,9 @@ static inline void mwait_play_dead(void)
unsigned int highest_subcstate = 0;
int i;
void *mwait_ptr;
+ struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
- if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_MWAIT))
+ if (!(cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)))
return;
if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH))
return;
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index c2f1b26141e2..998e972f3b1a 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -133,7 +133,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
pmd = pmd_alloc(&tboot_mm, pud, vaddr);
if (!pmd)
return -1;
- pte = pte_alloc_map(&tboot_mm, pmd, vaddr);
+ pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr);
if (!pte)
return -1;
set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c76aaca5694d..b9b67166f9de 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -84,6 +84,11 @@ EXPORT_SYMBOL_GPL(used_vectors);
static int ignore_nmis;
int unknown_nmi_panic;
+/*
+ * Prevent NMI reason port (0x61) being accessed simultaneously, can
+ * only be used in NMI handler.
+ */
+static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
static inline void conditional_sti(struct pt_regs *regs)
{
@@ -310,15 +315,15 @@ static int __init setup_unknown_nmi_panic(char *str)
__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
static notrace __kprobes void
-mem_parity_error(unsigned char reason, struct pt_regs *regs)
+pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
- printk(KERN_EMERG
- "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
- reason, smp_processor_id());
-
- printk(KERN_EMERG
- "You have some hardware problem, likely on the PCI bus.\n");
+ pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+ /*
+ * On some machines, PCI SERR line is used to report memory
+ * errors. EDAC makes use of it.
+ */
#if defined(CONFIG_EDAC)
if (edac_handler_set()) {
edac_atomic_assert_error();
@@ -329,11 +334,11 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs)
if (panic_on_unrecovered_nmi)
panic("NMI: Not continuing");
- printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+ pr_emerg("Dazed and confused, but trying to continue\n");
- /* Clear and disable the memory parity error line. */
- reason = (reason & 0xf) | 4;
- outb(reason, 0x61);
+ /* Clear and disable the PCI SERR error line. */
+ reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
+ outb(reason, NMI_REASON_PORT);
}
static notrace __kprobes void
@@ -341,15 +346,17 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
{
unsigned long i;
- printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
+ pr_emerg(
+ "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
show_registers(regs);
if (panic_on_io_nmi)
panic("NMI IOCK error: Not continuing");
/* Re-enable the IOCK line, wait for a few seconds */
- reason = (reason & 0xf) | 8;
- outb(reason, 0x61);
+ reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
+ outb(reason, NMI_REASON_PORT);
i = 20000;
while (--i) {
@@ -357,8 +364,8 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
udelay(100);
}
- reason &= ~8;
- outb(reason, 0x61);
+ reason &= ~NMI_REASON_CLEAR_IOCHK;
+ outb(reason, NMI_REASON_PORT);
}
static notrace __kprobes void
@@ -377,57 +384,50 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
return;
}
#endif
- printk(KERN_EMERG
- "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
- reason, smp_processor_id());
+ pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
- printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
+ pr_emerg("Do you have a strange power saving mode enabled?\n");
if (unknown_nmi_panic || panic_on_unrecovered_nmi)
panic("NMI: Not continuing");
- printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+ pr_emerg("Dazed and confused, but trying to continue\n");
}
static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
- int cpu;
- cpu = smp_processor_id();
-
- /* Only the BSP gets external NMIs from the system. */
- if (!cpu)
- reason = get_nmi_reason();
+ /*
+ * CPU-specific NMI must be processed before non-CPU-specific
+ * NMI, otherwise we may lose it, because the CPU-specific
+ * NMI can not be detected/processed on other CPUs.
+ */
+ if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP)
+ return;
- if (!(reason & 0xc0)) {
- if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP)
- return;
+ /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
+ raw_spin_lock(&nmi_reason_lock);
+ reason = get_nmi_reason();
-#ifdef CONFIG_X86_LOCAL_APIC
- if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP)
- return;
+ if (reason & NMI_REASON_MASK) {
+ if (reason & NMI_REASON_SERR)
+ pci_serr_error(reason, regs);
+ else if (reason & NMI_REASON_IOCHK)
+ io_check_error(reason, regs);
+#ifdef CONFIG_X86_32
+ /*
+ * Reassert NMI in case it became active
+ * meanwhile as it's edge-triggered:
+ */
+ reassert_nmi();
#endif
- unknown_nmi_error(reason, regs);
-
+ raw_spin_unlock(&nmi_reason_lock);
return;
}
- if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
- return;
+ raw_spin_unlock(&nmi_reason_lock);
- /* AK: following checks seem to be broken on modern chipsets. FIXME */
- if (reason & 0x80)
- mem_parity_error(reason, regs);
- if (reason & 0x40)
- io_check_error(reason, regs);
-#ifdef CONFIG_X86_32
- /*
- * Reassert NMI in case it became active meanwhile
- * as it's edge-triggered:
- */
- reassert_nmi();
-#endif
+ unknown_nmi_error(reason, regs);
}
dotraplinkage notrace __kprobes void
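The traps.c hunks above replace raw port-0x61 magic numbers with NMI_REASON_* symbols from asm/mach_traps.h (also modified by this series). Their presumed values follow directly from the constants being removed — a sketch, not the verbatim header:

	/* Reconstructed from the magic numbers the hunks above drop:
	 * port 0x61, SERR bit 0x80, IOCHK bit 0x40, clear bits 4 and 8. */
	#define NMI_REASON_PORT		0x61

	#define NMI_REASON_SERR		0x80	/* PCI SERR# asserted */
	#define NMI_REASON_IOCHK	0x40	/* I/O check error */
	#define NMI_REASON_MASK		(NMI_REASON_SERR | NMI_REASON_IOCHK)

	#define NMI_REASON_CLEAR_SERR	0x04
	#define NMI_REASON_CLEAR_IOCHK	0x08
	#define NMI_REASON_CLEAR_MASK	0x0f

With these, the old '(reason & 0xf) | 4' and '(reason & 0xf) | 8' sequences become the CLEAR_SERR and CLEAR_IOCHK writes in pci_serr_error() and io_check_error(), and default_do_nmi() tests NMI_REASON_MASK instead of 0xc0.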
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 03d2ea82f35a..ffe5755caa8b 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -464,7 +464,7 @@ unsigned long native_calibrate_tsc(void)
tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
/* hpet or pmtimer available ? */
- if (!hpet && !ref1 && !ref2)
+ if (ref1 == ref2)
continue;
/* Check, whether the sampling was disturbed by an SMI */
@@ -935,7 +935,7 @@ static void tsc_refine_calibration_work(struct work_struct *work)
tsc_stop = tsc_read_refs(&ref_stop, hpet);
/* hpet or pmtimer available ? */
- if (!hpet && !ref_start && !ref_stop)
+ if (ref_start == ref_stop)
goto out;
/* Check, whether the sampling was disturbed by an SMI */
@@ -965,7 +965,7 @@ out:
static int __init init_tsc_clocksource(void)
{
- if (!cpu_has_tsc || tsc_disabled > 0)
+ if (!cpu_has_tsc || tsc_disabled > 0 || !tsc_khz)
return 0;
if (tsc_clocksource_reliable)
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 61fb98519622..863f8753ab0a 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -179,6 +179,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
if (pud_none_or_clear_bad(pud))
goto out;
pmd = pmd_offset(pud, 0xA0000);
+ split_huge_page_pmd(mm, pmd);
if (pmd_none_or_clear_bad(pmd))
goto out;
pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ddc131ff438f..50f63648ce1b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
select HAVE_KVM_IRQCHIP
select HAVE_KVM_EVENTFD
select KVM_APIC_ARCHITECTURE
+ select KVM_ASYNC_PF
select USER_RETURN_NOTIFIER
select KVM_MMIO
---help---
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 31a7035c4bd9..f15501f431c8 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,5 +1,5 @@
-EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
+ccflags-y += -Ivirt/kvm -Iarch/x86/kvm
CFLAGS_x86.o := -I.
CFLAGS_svm.o := -I.
@@ -9,6 +9,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
coalesced_mmio.o irq_comm.o eventfd.o \
assigned-dev.o)
kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
+kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
i8254.o timer.o
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 38b6e8dafaff..caf966781d25 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -20,16 +20,8 @@
* From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
*/
-#ifndef __KERNEL__
-#include <stdio.h>
-#include <stdint.h>
-#include <public/xen.h>
-#define DPRINTF(_f, _a ...) printf(_f , ## _a)
-#else
#include <linux/kvm_host.h>
#include "kvm_cache_regs.h"
-#define DPRINTF(x...) do {} while (0)
-#endif
#include <linux/module.h>
#include <asm/kvm_emulate.h>
@@ -418,9 +410,9 @@ address_mask(struct decode_cache *c, unsigned long reg)
}
static inline unsigned long
-register_address(struct decode_cache *c, unsigned long base, unsigned long reg)
+register_address(struct decode_cache *c, unsigned long reg)
{
- return base + address_mask(c, reg);
+ return address_mask(c, reg);
}
static inline void
@@ -452,60 +444,55 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
return ops->get_cached_segment_base(seg, ctxt->vcpu);
}
-static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- struct decode_cache *c)
+static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ struct decode_cache *c)
{
if (!c->has_seg_override)
return 0;
- return seg_base(ctxt, ops, c->seg_override);
+ return c->seg_override;
}
-static unsigned long es_base(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static ulong linear(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr)
{
- return seg_base(ctxt, ops, VCPU_SREG_ES);
-}
-
-static unsigned long ss_base(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
-{
- return seg_base(ctxt, ops, VCPU_SREG_SS);
-}
+ struct decode_cache *c = &ctxt->decode;
+ ulong la;
-static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
- u32 error, bool valid)
-{
- ctxt->exception = vec;
- ctxt->error_code = error;
- ctxt->error_code_valid = valid;
+ la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
+ if (c->ad_bytes != 8)
+ la &= (u32)-1;
+ return la;
}
-static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
+static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
+ u32 error, bool valid)
{
- emulate_exception(ctxt, GP_VECTOR, err, true);
+ ctxt->exception.vector = vec;
+ ctxt->exception.error_code = error;
+ ctxt->exception.error_code_valid = valid;
+ return X86EMUL_PROPAGATE_FAULT;
}
-static void emulate_pf(struct x86_emulate_ctxt *ctxt)
+static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
{
- emulate_exception(ctxt, PF_VECTOR, 0, true);
+ return emulate_exception(ctxt, GP_VECTOR, err, true);
}
-static void emulate_ud(struct x86_emulate_ctxt *ctxt)
+static int emulate_ud(struct x86_emulate_ctxt *ctxt)
{
- emulate_exception(ctxt, UD_VECTOR, 0, false);
+ return emulate_exception(ctxt, UD_VECTOR, 0, false);
}
-static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
+static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
{
- emulate_exception(ctxt, TS_VECTOR, err, true);
+ return emulate_exception(ctxt, TS_VECTOR, err, true);
}
static int emulate_de(struct x86_emulate_ctxt *ctxt)
{
- emulate_exception(ctxt, DE_VECTOR, 0, false);
- return X86EMUL_PROPAGATE_FAULT;
+ return emulate_exception(ctxt, DE_VECTOR, 0, false);
}
static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
@@ -520,7 +507,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
cur_size = fc->end - fc->start;
size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
- size, ctxt->vcpu, NULL);
+ size, ctxt->vcpu, &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
fc->end += size;
@@ -564,7 +551,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
static int read_descriptor(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
- ulong addr,
+ struct segmented_address addr,
u16 *size, unsigned long *address, int op_bytes)
{
int rc;
@@ -572,10 +559,13 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
if (op_bytes == 2)
op_bytes = 3;
*address = 0;
- rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL);
+ rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2,
+ ctxt->vcpu, &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL);
+ addr.ea += 2;
+ rc = ops->read_std(linear(ctxt, addr), address, op_bytes,
+ ctxt->vcpu, &ctxt->exception);
return rc;
}
@@ -768,7 +758,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
break;
}
}
- op->addr.mem = modrm_ea;
+ op->addr.mem.ea = modrm_ea;
done:
return rc;
}
@@ -783,13 +773,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
op->type = OP_MEM;
switch (c->ad_bytes) {
case 2:
- op->addr.mem = insn_fetch(u16, 2, c->eip);
+ op->addr.mem.ea = insn_fetch(u16, 2, c->eip);
break;
case 4:
- op->addr.mem = insn_fetch(u32, 4, c->eip);
+ op->addr.mem.ea = insn_fetch(u32, 4, c->eip);
break;
case 8:
- op->addr.mem = insn_fetch(u64, 8, c->eip);
+ op->addr.mem.ea = insn_fetch(u64, 8, c->eip);
break;
}
done:
@@ -808,7 +798,7 @@ static void fetch_bit_operand(struct decode_cache *c)
else if (c->src.bytes == 4)
sv = (s32)c->src.val & (s32)mask;
- c->dst.addr.mem += (sv >> 3);
+ c->dst.addr.mem.ea += (sv >> 3);
}
/* only subword offset */
@@ -821,7 +811,6 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
{
int rc;
struct read_cache *mc = &ctxt->decode.mem_read;
- u32 err;
while (size) {
int n = min(size, 8u);
@@ -829,10 +818,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
if (mc->pos < mc->end)
goto read_cached;
- rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
- ctxt->vcpu);
- if (rc == X86EMUL_PROPAGATE_FAULT)
- emulate_pf(ctxt);
+ rc = ops->read_emulated(addr, mc->data + mc->end, n,
+ &ctxt->exception, ctxt->vcpu);
if (rc != X86EMUL_CONTINUE)
return rc;
mc->end += n;
@@ -907,19 +894,15 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
struct desc_ptr dt;
u16 index = selector >> 3;
int ret;
- u32 err;
ulong addr;
get_descriptor_table_ptr(ctxt, ops, selector, &dt);
- if (dt.size < index * 8 + 7) {
- emulate_gp(ctxt, selector & 0xfffc);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (dt.size < index * 8 + 7)
+ return emulate_gp(ctxt, selector & 0xfffc);
addr = dt.address + index * 8;
- ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
- if (ret == X86EMUL_PROPAGATE_FAULT)
- emulate_pf(ctxt);
+ ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,
+ &ctxt->exception);
return ret;
}
@@ -931,21 +914,17 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
{
struct desc_ptr dt;
u16 index = selector >> 3;
- u32 err;
ulong addr;
int ret;
get_descriptor_table_ptr(ctxt, ops, selector, &dt);
- if (dt.size < index * 8 + 7) {
- emulate_gp(ctxt, selector & 0xfffc);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (dt.size < index * 8 + 7)
+ return emulate_gp(ctxt, selector & 0xfffc);
addr = dt.address + index * 8;
- ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
- if (ret == X86EMUL_PROPAGATE_FAULT)
- emulate_pf(ctxt);
+ ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu,
+ &ctxt->exception);
return ret;
}
@@ -1092,7 +1071,6 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
{
int rc;
struct decode_cache *c = &ctxt->decode;
- u32 err;
switch (c->dst.type) {
case OP_REG:
@@ -1101,21 +1079,19 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
case OP_MEM:
if (c->lock_prefix)
rc = ops->cmpxchg_emulated(
- c->dst.addr.mem,
+ linear(ctxt, c->dst.addr.mem),
&c->dst.orig_val,
&c->dst.val,
c->dst.bytes,
- &err,
+ &ctxt->exception,
ctxt->vcpu);
else
rc = ops->write_emulated(
- c->dst.addr.mem,
+ linear(ctxt, c->dst.addr.mem),
&c->dst.val,
c->dst.bytes,
- &err,
+ &ctxt->exception,
ctxt->vcpu);
- if (rc == X86EMUL_PROPAGATE_FAULT)
- emulate_pf(ctxt);
if (rc != X86EMUL_CONTINUE)
return rc;
break;
@@ -1137,8 +1113,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
c->dst.bytes = c->op_bytes;
c->dst.val = c->src.val;
register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
- c->dst.addr.mem = register_address(c, ss_base(ctxt, ops),
- c->regs[VCPU_REGS_RSP]);
+ c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+ c->dst.addr.mem.seg = VCPU_SREG_SS;
}
static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1147,10 +1123,11 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
{
struct decode_cache *c = &ctxt->decode;
int rc;
+ struct segmented_address addr;
- rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops),
- c->regs[VCPU_REGS_RSP]),
- dest, len);
+ addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+ addr.seg = VCPU_SREG_SS;
+ rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -1184,10 +1161,8 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
change_mask |= EFLG_IF;
break;
case X86EMUL_MODE_VM86:
- if (iopl < 3) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (iopl < 3)
+ return emulate_gp(ctxt, 0);
change_mask |= EFLG_IF;
break;
default: /* real mode */
@@ -1198,9 +1173,6 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
*(unsigned long *)dest =
(ctxt->eflags & ~change_mask) | (val & change_mask);
- if (rc == X86EMUL_PROPAGATE_FAULT)
- emulate_pf(ctxt);
-
return rc;
}
@@ -1287,7 +1259,6 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
gva_t cs_addr;
gva_t eip_addr;
u16 cs, eip;
- u32 err;
/* TODO: Add limit checks */
c->src.val = ctxt->eflags;
@@ -1317,11 +1288,11 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
eip_addr = dt.address + (irq << 2);
cs_addr = dt.address + (irq << 2) + 2;
- rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err);
+ rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err);
+ rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -1370,10 +1341,8 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
if (rc != X86EMUL_CONTINUE)
return rc;
- if (temp_eip & ~0xffff) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (temp_eip & ~0xffff)
+ return emulate_gp(ctxt, 0);
rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
@@ -1624,10 +1593,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
/* syscall is not available in real mode */
if (ctxt->mode == X86EMUL_MODE_REAL ||
- ctxt->mode == X86EMUL_MODE_VM86) {
- emulate_ud(ctxt);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ ctxt->mode == X86EMUL_MODE_VM86)
+ return emulate_ud(ctxt);
setup_syscalls_segments(ctxt, ops, &cs, &ss);
@@ -1678,34 +1645,26 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
u16 cs_sel, ss_sel;
/* inject #GP if in real mode */
- if (ctxt->mode == X86EMUL_MODE_REAL) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (ctxt->mode == X86EMUL_MODE_REAL)
+ return emulate_gp(ctxt, 0);
/* XXX sysenter/sysexit have not been tested in 64bit mode.
* Therefore, we inject an #UD.
*/
- if (ctxt->mode == X86EMUL_MODE_PROT64) {
- emulate_ud(ctxt);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return emulate_ud(ctxt);
setup_syscalls_segments(ctxt, ops, &cs, &ss);
ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
switch (ctxt->mode) {
case X86EMUL_MODE_PROT32:
- if ((msr_data & 0xfffc) == 0x0) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if ((msr_data & 0xfffc) == 0x0)
+ return emulate_gp(ctxt, 0);
break;
case X86EMUL_MODE_PROT64:
- if (msr_data == 0x0) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (msr_data == 0x0)
+ return emulate_gp(ctxt, 0);
break;
}
@@ -1745,10 +1704,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
/* inject #GP if in real mode or Virtual 8086 mode */
if (ctxt->mode == X86EMUL_MODE_REAL ||
- ctxt->mode == X86EMUL_MODE_VM86) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ ctxt->mode == X86EMUL_MODE_VM86)
+ return emulate_gp(ctxt, 0);
setup_syscalls_segments(ctxt, ops, &cs, &ss);
@@ -1763,18 +1720,14 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
switch (usermode) {
case X86EMUL_MODE_PROT32:
cs_sel = (u16)(msr_data + 16);
- if ((msr_data & 0xfffc) == 0x0) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if ((msr_data & 0xfffc) == 0x0)
+ return emulate_gp(ctxt, 0);
ss_sel = (u16)(msr_data + 24);
break;
case X86EMUL_MODE_PROT64:
cs_sel = (u16)(msr_data + 32);
- if (msr_data == 0x0) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (msr_data == 0x0)
+ return emulate_gp(ctxt, 0);
ss_sel = cs_sel + 8;
cs.d = 0;
cs.l = 1;
@@ -1934,33 +1887,27 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
{
struct tss_segment_16 tss_seg;
int ret;
- u32 err, new_tss_base = get_desc_base(new_desc);
+ u32 new_tss_base = get_desc_base(new_desc);
ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
save_state_to_tss16(ctxt, ops, &tss_seg);
ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
if (old_tss_sel != 0xffff) {
tss_seg.prev_task_link = old_tss_sel;
@@ -1968,12 +1915,10 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
ret = ops->write_std(new_tss_base,
&tss_seg.prev_task_link,
sizeof tss_seg.prev_task_link,
- ctxt->vcpu, &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ ctxt->vcpu, &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
}
return load_state_from_tss16(ctxt, ops, &tss_seg);
@@ -2013,10 +1958,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
struct decode_cache *c = &ctxt->decode;
int ret;
- if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (ops->set_cr(3, tss->cr3, ctxt->vcpu))
+ return emulate_gp(ctxt, 0);
c->eip = tss->eip;
ctxt->eflags = tss->eflags | 2;
c->regs[VCPU_REGS_RAX] = tss->eax;
@@ -2076,33 +2019,27 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
{
struct tss_segment_32 tss_seg;
int ret;
- u32 err, new_tss_base = get_desc_base(new_desc);
+ u32 new_tss_base = get_desc_base(new_desc);
ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
save_state_to_tss32(ctxt, ops, &tss_seg);
ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
if (old_tss_sel != 0xffff) {
tss_seg.prev_task_link = old_tss_sel;
@@ -2110,12 +2047,10 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
ret = ops->write_std(new_tss_base,
&tss_seg.prev_task_link,
sizeof tss_seg.prev_task_link,
- ctxt->vcpu, &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ ctxt->vcpu, &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
}
return load_state_from_tss32(ctxt, ops, &tss_seg);
@@ -2146,10 +2081,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
if (reason != TASK_SWITCH_IRET) {
if ((tss_selector & 3) > next_tss_desc.dpl ||
- ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ ops->cpl(ctxt->vcpu) > next_tss_desc.dpl)
+ return emulate_gp(ctxt, 0);
}
desc_limit = desc_limit_scaled(&next_tss_desc);
@@ -2231,14 +2164,15 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
}
-static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
+static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
int reg, struct operand *op)
{
struct decode_cache *c = &ctxt->decode;
int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
register_address_increment(c, &c->regs[reg], df * op->bytes);
- op->addr.mem = register_address(c, base, c->regs[reg]);
+ op->addr.mem.ea = register_address(c, c->regs[reg]);
+ op->addr.mem.seg = seg;
}
static int em_push(struct x86_emulate_ctxt *ctxt)
@@ -2369,10 +2303,8 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
struct decode_cache *c = &ctxt->decode;
u64 tsc = 0;
- if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD))
+ return emulate_gp(ctxt, 0);
ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
c->regs[VCPU_REGS_RAX] = (u32)tsc;
c->regs[VCPU_REGS_RDX] = tsc >> 32;
@@ -2647,7 +2579,7 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
op->type = OP_IMM;
op->bytes = size;
- op->addr.mem = c->eip;
+ op->addr.mem.ea = c->eip;
/* NB. Immediates are sign-extended as necessary. */
switch (op->bytes) {
case 1:
@@ -2678,7 +2610,7 @@ done:
}
int
-x86_decode_insn(struct x86_emulate_ctxt *ctxt)
+x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
{
struct x86_emulate_ops *ops = ctxt->ops;
struct decode_cache *c = &ctxt->decode;
@@ -2689,7 +2621,10 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt)
struct operand memop = { .type = OP_NONE };
c->eip = ctxt->eip;
- c->fetch.start = c->fetch.end = c->eip;
+ c->fetch.start = c->eip;
+ c->fetch.end = c->fetch.start + insn_len;
+ if (insn_len > 0)
+ memcpy(c->fetch.data, insn, insn_len);
ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
switch (mode) {
@@ -2803,10 +2738,8 @@ done_prefixes:
c->execute = opcode.u.execute;
/* Unrecognised? */
- if (c->d == 0 || (c->d & Undefined)) {
- DPRINTF("Cannot emulate %02x\n", c->b);
+ if (c->d == 0 || (c->d & Undefined))
return -1;
- }
if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
c->op_bytes = 8;
@@ -2831,14 +2764,13 @@ done_prefixes:
if (!c->has_seg_override)
set_seg_override(c, VCPU_SREG_DS);
- if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d))
- memop.addr.mem += seg_override_base(ctxt, ops, c);
+ memop.addr.mem.seg = seg_override(ctxt, ops, c);
if (memop.type == OP_MEM && c->ad_bytes != 8)
- memop.addr.mem = (u32)memop.addr.mem;
+ memop.addr.mem.ea = (u32)memop.addr.mem.ea;
if (memop.type == OP_MEM && c->rip_relative)
- memop.addr.mem += c->eip;
+ memop.addr.mem.ea += c->eip;
/*
* Decode and fetch the source operand: register, memory
@@ -2890,14 +2822,14 @@ done_prefixes:
case SrcSI:
c->src.type = OP_MEM;
c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->src.addr.mem =
- register_address(c, seg_override_base(ctxt, ops, c),
- c->regs[VCPU_REGS_RSI]);
+ c->src.addr.mem.ea =
+ register_address(c, c->regs[VCPU_REGS_RSI]);
+ c->src.addr.mem.seg = seg_override(ctxt, ops, c),
c->src.val = 0;
break;
case SrcImmFAddr:
c->src.type = OP_IMM;
- c->src.addr.mem = c->eip;
+ c->src.addr.mem.ea = c->eip;
c->src.bytes = c->op_bytes + 2;
insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
break;
@@ -2944,7 +2876,7 @@ done_prefixes:
break;
case DstImmUByte:
c->dst.type = OP_IMM;
- c->dst.addr.mem = c->eip;
+ c->dst.addr.mem.ea = c->eip;
c->dst.bytes = 1;
c->dst.val = insn_fetch(u8, 1, c->eip);
break;
@@ -2969,9 +2901,9 @@ done_prefixes:
case DstDI:
c->dst.type = OP_MEM;
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.addr.mem =
- register_address(c, es_base(ctxt, ops),
- c->regs[VCPU_REGS_RDI]);
+ c->dst.addr.mem.ea =
+ register_address(c, c->regs[VCPU_REGS_RDI]);
+ c->dst.addr.mem.seg = VCPU_SREG_ES;
c->dst.val = 0;
break;
case ImplicitOps:
@@ -3020,24 +2952,24 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
ctxt->decode.mem_read.pos = 0;
if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
- emulate_ud(ctxt);
+ rc = emulate_ud(ctxt);
goto done;
}
/* LOCK prefix is allowed only with some instructions */
if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
- emulate_ud(ctxt);
+ rc = emulate_ud(ctxt);
goto done;
}
if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
- emulate_ud(ctxt);
+ rc = emulate_ud(ctxt);
goto done;
}
/* Privileged instruction can be executed only in CPL=0 */
if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
- emulate_gp(ctxt, 0);
+ rc = emulate_gp(ctxt, 0);
goto done;
}
@@ -3050,7 +2982,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
}
if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
- rc = read_emulated(ctxt, ops, c->src.addr.mem,
+ rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem),
c->src.valptr, c->src.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
@@ -3058,7 +2990,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
}
if (c->src2.type == OP_MEM) {
- rc = read_emulated(ctxt, ops, c->src2.addr.mem,
+ rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem),
&c->src2.val, c->src2.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
@@ -3070,7 +3002,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
/* optimisation - avoid slow emulated read if Mov */
- rc = read_emulated(ctxt, ops, c->dst.addr.mem,
+ rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem),
&c->dst.val, c->dst.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
@@ -3215,13 +3147,13 @@ special_insn:
break;
case 0x8c: /* mov r/m, sreg */
if (c->modrm_reg > VCPU_SREG_GS) {
- emulate_ud(ctxt);
+ rc = emulate_ud(ctxt);
goto done;
}
c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
break;
case 0x8d: /* lea r16/r32, m */
- c->dst.val = c->src.addr.mem;
+ c->dst.val = c->src.addr.mem.ea;
break;
case 0x8e: { /* mov seg, r/m16 */
uint16_t sel;
@@ -3230,7 +3162,7 @@ special_insn:
if (c->modrm_reg == VCPU_SREG_CS ||
c->modrm_reg > VCPU_SREG_GS) {
- emulate_ud(ctxt);
+ rc = emulate_ud(ctxt);
goto done;
}
@@ -3268,7 +3200,6 @@ special_insn:
break;
case 0xa6 ... 0xa7: /* cmps */
c->dst.type = OP_NONE; /* Disable writeback. */
- DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem);
goto cmp;
case 0xa8 ... 0xa9: /* test ax, imm */
goto test;
@@ -3363,7 +3294,7 @@ special_insn:
do_io_in:
c->dst.bytes = min(c->dst.bytes, 4u);
if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
- emulate_gp(ctxt, 0);
+ rc = emulate_gp(ctxt, 0);
goto done;
}
if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
@@ -3377,7 +3308,7 @@ special_insn:
c->src.bytes = min(c->src.bytes, 4u);
if (!emulator_io_permited(ctxt, ops, c->dst.val,
c->src.bytes)) {
- emulate_gp(ctxt, 0);
+ rc = emulate_gp(ctxt, 0);
goto done;
}
ops->pio_out_emulated(c->src.bytes, c->dst.val,
@@ -3402,14 +3333,14 @@ special_insn:
break;
case 0xfa: /* cli */
if (emulator_bad_iopl(ctxt, ops)) {
- emulate_gp(ctxt, 0);
+ rc = emulate_gp(ctxt, 0);
goto done;
} else
ctxt->eflags &= ~X86_EFLAGS_IF;
break;
case 0xfb: /* sti */
if (emulator_bad_iopl(ctxt, ops)) {
- emulate_gp(ctxt, 0);
+ rc = emulate_gp(ctxt, 0);
goto done;
} else {
ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
@@ -3449,11 +3380,11 @@ writeback:
c->dst.type = saved_dst_type;
if ((c->d & SrcMask) == SrcSI)
- string_addr_inc(ctxt, seg_override_base(ctxt, ops, c),
+ string_addr_inc(ctxt, seg_override(ctxt, ops, c),
VCPU_REGS_RSI, &c->src);
if ((c->d & DstMask) == DstDI)
- string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI,
+ string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
&c->dst);
if (c->rep_prefix && (c->d & String)) {
@@ -3482,6 +3413,8 @@ writeback:
ctxt->eip = c->eip;
done:
+ if (rc == X86EMUL_PROPAGATE_FAULT)
+ ctxt->have_exception = true;
return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
twobyte_insn:
@@ -3544,9 +3477,11 @@ twobyte_insn:
break;
case 5: /* not defined */
emulate_ud(ctxt);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
case 7: /* invlpg*/
- emulate_invlpg(ctxt->vcpu, c->src.addr.mem);
+ emulate_invlpg(ctxt->vcpu,
+ linear(ctxt, c->src.addr.mem));
/* Disable writeback. */
c->dst.type = OP_NONE;
break;
@@ -3573,6 +3508,7 @@ twobyte_insn:
case 5 ... 7:
case 9 ... 15:
emulate_ud(ctxt);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
@@ -3581,6 +3517,7 @@ twobyte_insn:
if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
(c->modrm_reg == 4 || c->modrm_reg == 5)) {
emulate_ud(ctxt);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
@@ -3588,6 +3525,7 @@ twobyte_insn:
case 0x22: /* mov reg, cr */
if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) {
emulate_gp(ctxt, 0);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
c->dst.type = OP_NONE;
@@ -3596,6 +3534,7 @@ twobyte_insn:
if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
(c->modrm_reg == 4 || c->modrm_reg == 5)) {
emulate_ud(ctxt);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
@@ -3604,6 +3543,7 @@ twobyte_insn:
~0ULL : ~0U), ctxt->vcpu) < 0) {
/* #UD condition is already handled by the code above */
emulate_gp(ctxt, 0);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
@@ -3615,6 +3555,7 @@ twobyte_insn:
| ((u64)c->regs[VCPU_REGS_RDX] << 32);
if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
emulate_gp(ctxt, 0);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
rc = X86EMUL_CONTINUE;
@@ -3623,6 +3564,7 @@ twobyte_insn:
/* rdmsr */
if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
emulate_gp(ctxt, 0);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
} else {
c->regs[VCPU_REGS_RAX] = (u32)msr_data;
@@ -3785,6 +3727,5 @@ twobyte_insn:
goto writeback;
cannot_emulate:
- DPRINTF("Cannot emulate %02x\n", c->b);
return -1;
}
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 975bb45329a1..3377d53fcd36 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -73,6 +73,13 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
return vcpu->arch.cr4 & mask;
}
+static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
+{
+ if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+ kvm_x86_ops->decache_cr3(vcpu);
+ return vcpu->arch.cr3;
+}
+
static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
{
return kvm_read_cr4_bits(vcpu, ~0UL);
@@ -84,4 +91,19 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
| ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
}
+static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.hflags |= HF_GUEST_MASK;
+}
+
+static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.hflags &= ~HF_GUEST_MASK;
+}
+
+static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hflags & HF_GUEST_MASK;
+}
+
#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 413f8973a855..93cf9d0d3653 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -277,7 +277,8 @@ static void apic_update_ppr(struct kvm_lapic *apic)
if (old_ppr != ppr) {
apic_set_reg(apic, APIC_PROCPRI, ppr);
- kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+ if (ppr < old_ppr)
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
}
}
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fbb04aee8301..f02b8edc3d44 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,9 +18,11 @@
*
*/
+#include "irq.h"
#include "mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
+#include "x86.h"
#include <linux/kvm_host.h>
#include <linux/types.h>
@@ -194,7 +196,6 @@ static struct percpu_counter kvm_total_used_mmu_pages;
static u64 __read_mostly shadow_trap_nonpresent_pte;
static u64 __read_mostly shadow_notrap_nonpresent_pte;
-static u64 __read_mostly shadow_base_present_pte;
static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
@@ -213,12 +214,6 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
-void kvm_mmu_set_base_ptes(u64 base_pte)
-{
- shadow_base_present_pte = base_pte;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
-
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask)
{
@@ -482,46 +477,46 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
}
/*
- * Return the pointer to the largepage write count for a given
- * gfn, handling slots that are not large page aligned.
+ * Return the pointer to the large page information for a given gfn,
+ * handling slots that are not large page aligned.
*/
-static int *slot_largepage_idx(gfn_t gfn,
- struct kvm_memory_slot *slot,
- int level)
+static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
+ struct kvm_memory_slot *slot,
+ int level)
{
unsigned long idx;
idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
(slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
- return &slot->lpage_info[level - 2][idx].write_count;
+ return &slot->lpage_info[level - 2][idx];
}
static void account_shadowed(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *slot;
- int *write_count;
+ struct kvm_lpage_info *linfo;
int i;
slot = gfn_to_memslot(kvm, gfn);
for (i = PT_DIRECTORY_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- write_count = slot_largepage_idx(gfn, slot, i);
- *write_count += 1;
+ linfo = lpage_info_slot(gfn, slot, i);
+ linfo->write_count += 1;
}
}
static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *slot;
- int *write_count;
+ struct kvm_lpage_info *linfo;
int i;
slot = gfn_to_memslot(kvm, gfn);
for (i = PT_DIRECTORY_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- write_count = slot_largepage_idx(gfn, slot, i);
- *write_count -= 1;
- WARN_ON(*write_count < 0);
+ linfo = lpage_info_slot(gfn, slot, i);
+ linfo->write_count -= 1;
+ WARN_ON(linfo->write_count < 0);
}
}
@@ -530,12 +525,12 @@ static int has_wrprotected_page(struct kvm *kvm,
int level)
{
struct kvm_memory_slot *slot;
- int *largepage_idx;
+ struct kvm_lpage_info *linfo;
slot = gfn_to_memslot(kvm, gfn);
if (slot) {
- largepage_idx = slot_largepage_idx(gfn, slot, level);
- return *largepage_idx;
+ linfo = lpage_info_slot(gfn, slot, level);
+ return linfo->write_count;
}
return 1;
@@ -559,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
return ret;
}
-static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
struct kvm_memory_slot *slot;
- int host_level, level, max_level;
-
slot = gfn_to_memslot(vcpu->kvm, large_gfn);
if (slot && slot->dirty_bitmap)
- return PT_PAGE_TABLE_LEVEL;
+ return true;
+ return false;
+}
+
+static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+{
+ int host_level, level, max_level;
host_level = host_mapping_level(vcpu->kvm, large_gfn);
@@ -590,16 +589,15 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
{
struct kvm_memory_slot *slot;
- unsigned long idx;
+ struct kvm_lpage_info *linfo;
slot = gfn_to_memslot(kvm, gfn);
if (likely(level == PT_PAGE_TABLE_LEVEL))
return &slot->rmap[gfn - slot->base_gfn];
- idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
- (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
+ linfo = lpage_info_slot(gfn, slot, level);
- return &slot->lpage_info[level - 2][idx].rmap_pde;
+ return &linfo->rmap_pde;
}
/*
@@ -887,19 +885,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
end = start + (memslot->npages << PAGE_SHIFT);
if (hva >= start && hva < end) {
gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+ gfn_t gfn = memslot->base_gfn + gfn_offset;
ret = handler(kvm, &memslot->rmap[gfn_offset], data);
for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
- unsigned long idx;
- int sh;
-
- sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j);
- idx = ((memslot->base_gfn+gfn_offset) >> sh) -
- (memslot->base_gfn >> sh);
- ret |= handler(kvm,
- &memslot->lpage_info[j][idx].rmap_pde,
- data);
+ struct kvm_lpage_info *linfo;
+
+ linfo = lpage_info_slot(gfn, memslot,
+ PT_DIRECTORY_LEVEL + j);
+ ret |= handler(kvm, &linfo->rmap_pde, data);
}
trace_kvm_age_page(hva, memslot, ret);
retval |= ret;
@@ -950,6 +945,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
return young;
}
+static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data)
+{
+ u64 *spte;
+ int young = 0;
+
+ /*
+ * If there's no access bit in the secondary pte set by the
+ * hardware it's up to gup-fast/gup to set the access bit in
+ * the primary pte or in the page structure.
+ */
+ if (!shadow_accessed_mask)
+ goto out;
+
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ u64 _spte = *spte;
+ BUG_ON(!(_spte & PT_PRESENT_MASK));
+ young = _spte & PT_ACCESSED_MASK;
+ if (young) {
+ young = 1;
+ break;
+ }
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+out:
+ return young;
+}
+
#define RMAP_RECYCLE_THRESHOLD 1000
static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -970,6 +994,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva)
return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
}
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
+}
+
#ifdef MMU_DEBUG
static int is_empty_shadow_page(u64 *spt)
{
@@ -1161,7 +1190,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
}
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, bool clear_unsync)
+ struct kvm_mmu_page *sp)
{
return 1;
}
@@ -1291,7 +1320,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (clear_unsync)
kvm_unlink_unsync_page(vcpu->kvm, sp);
- if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
+ if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return 1;
}
@@ -1332,12 +1361,12 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
continue;
WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+ kvm_unlink_unsync_page(vcpu->kvm, s);
if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
- (vcpu->arch.mmu.sync_page(vcpu, s, true))) {
+ (vcpu->arch.mmu.sync_page(vcpu, s))) {
kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
continue;
}
- kvm_unlink_unsync_page(vcpu->kvm, s);
flush = true;
}
@@ -1963,9 +1992,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int user_fault,
int write_fault, int dirty, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
- bool can_unsync, bool reset_host_protection)
+ bool can_unsync, bool host_writable)
{
- u64 spte;
+ u64 spte, entry = *sptep;
int ret = 0;
/*
@@ -1973,7 +2002,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
* whether the guest actually used the pte (in order to detect
* demand paging).
*/
- spte = shadow_base_present_pte;
+ spte = PT_PRESENT_MASK;
if (!speculative)
spte |= shadow_accessed_mask;
if (!dirty)
@@ -1990,8 +2019,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
kvm_is_mmio_pfn(pfn));
- if (reset_host_protection)
+ if (host_writable)
spte |= SPTE_HOST_WRITEABLE;
+ else
+ pte_access &= ~ACC_WRITE_MASK;
spte |= (u64)pfn << PAGE_SHIFT;
@@ -2036,6 +2067,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
set_pte:
update_spte(sptep, spte);
+ /*
+ * If we overwrite a writable spte with a read-only one we
+ * should flush remote TLBs. Otherwise rmap_write_protect
+ * will find a read-only spte, even though the writable spte
+ * might be cached on a CPU's TLB.
+ */
+ if (is_writable_pte(entry) && !is_writable_pte(*sptep))
+ kvm_flush_remote_tlbs(vcpu->kvm);
done:
return ret;
}
@@ -2045,7 +2084,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
int user_fault, int write_fault, int dirty,
int *ptwrite, int level, gfn_t gfn,
pfn_t pfn, bool speculative,
- bool reset_host_protection)
+ bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
@@ -2080,7 +2119,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
dirty, level, gfn, pfn, speculative, true,
- reset_host_protection)) {
+ host_writable)) {
if (write_fault)
*ptwrite = 1;
kvm_mmu_flush_tlb(vcpu);
@@ -2211,7 +2250,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
}
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
- int level, gfn_t gfn, pfn_t pfn)
+ int map_writable, int level, gfn_t gfn, pfn_t pfn,
+ bool prefault)
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
@@ -2220,9 +2260,11 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
if (iterator.level == level) {
- mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
+ unsigned pte_access = ACC_ALL;
+
+ mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
0, write, 1, &pt_write,
- level, gfn, pfn, false, true);
+ level, gfn, pfn, prefault, map_writable);
direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
break;
@@ -2277,27 +2319,81 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
return 1;
}
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
+static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
+ gfn_t *gfnp, pfn_t *pfnp, int *levelp)
+{
+ pfn_t pfn = *pfnp;
+ gfn_t gfn = *gfnp;
+ int level = *levelp;
+
+ /*
+	 * Check if it's a transparent hugepage. If this were a
+	 * hugetlbfs page, level would not be set to
+	 * PT_PAGE_TABLE_LEVEL and no adjustment would be done
+	 * here.
+ */
+ if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+ level == PT_PAGE_TABLE_LEVEL &&
+ PageTransCompound(pfn_to_page(pfn)) &&
+ !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
+ unsigned long mask;
+ /*
+ * mmu_notifier_retry was successful and we hold the
+	 * mmu_lock here, so the pmd can't be split
+ * from under us, and in turn
+ * __split_huge_page_refcount() can't run from under
+ * us and we can safely transfer the refcount from
+	 * PG_tail to PG_head as we switch the pfn from tail to
+ * head.
+ */
+ *levelp = level = PT_DIRECTORY_LEVEL;
+ mask = KVM_PAGES_PER_HPAGE(level) - 1;
+ VM_BUG_ON((gfn & mask) != (pfn & mask));
+ if (pfn & mask) {
+ gfn &= ~mask;
+ *gfnp = gfn;
+ kvm_release_pfn_clean(pfn);
+ pfn &= ~mask;
+ if (!get_page_unless_zero(pfn_to_page(pfn)))
+ BUG();
+ *pfnp = pfn;
+ }
+ }
+}
+
+static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
+ gva_t gva, pfn_t *pfn, bool write, bool *writable);
+
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
+ bool prefault)
{
int r;
int level;
+ int force_pt_level;
pfn_t pfn;
unsigned long mmu_seq;
+ bool map_writable;
- level = mapping_level(vcpu, gfn);
-
- /*
- * This path builds a PAE pagetable - so we can map 2mb pages at
- * maximum. Therefore check if the level is larger than that.
- */
- if (level > PT_DIRECTORY_LEVEL)
- level = PT_DIRECTORY_LEVEL;
+ force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+ if (likely(!force_pt_level)) {
+ level = mapping_level(vcpu, gfn);
+ /*
+ * This path builds a PAE pagetable - so we can map
+ * 2mb pages at maximum. Therefore check if the level
+ * is larger than that.
+ */
+ if (level > PT_DIRECTORY_LEVEL)
+ level = PT_DIRECTORY_LEVEL;
- gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ } else
+ level = PT_PAGE_TABLE_LEVEL;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- pfn = gfn_to_pfn(vcpu->kvm, gfn);
+
+ if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
+ return 0;
/* mmio */
if (is_error_pfn(pfn))
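
transparent_hugepage_adjust() above relies on gfn and pfn sharing the same offset inside the 2MB region before both are aligned down to the huge-page boundary. A standalone sketch of that masking, assuming 4KB base pages and 512 pages per huge page (the x86 PT_DIRECTORY_LEVEL case); the helper name is illustrative:

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define PAGES_PER_HPAGE 512ull	/* 2MB / 4KB on x86 */

	/*
	 * Model of the adjustment: gfn and pfn must already have the same
	 * offset inside the huge page (the VM_BUG_ON above), then both are
	 * aligned down so one large spte can map the whole 2MB region.
	 */
	static void hugepage_adjust(uint64_t *gfn, uint64_t *pfn)
	{
		uint64_t mask = PAGES_PER_HPAGE - 1;

		assert((*gfn & mask) == (*pfn & mask));
		*gfn &= ~mask;
		*pfn &= ~mask;
	}

	int main(void)
	{
		uint64_t gfn = 0x1234, pfn = 0xabc34;	/* both end in 0x034 */

		hugepage_adjust(&gfn, &pfn);
		printf("gfn=%#llx pfn=%#llx\n",
		       (unsigned long long)gfn, (unsigned long long)pfn);	/* 0x1200 / 0xabc00 */
		return 0;
	}
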
@@ -2307,7 +2403,10 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
- r = __direct_map(vcpu, v, write, level, gfn, pfn);
+ if (likely(!force_pt_level))
+ transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
+ r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
+ prefault);
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -2530,6 +2629,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
hpa_t root = vcpu->arch.mmu.root_hpa;
sp = page_header(root);
mmu_sync_children(vcpu, sp);
+ trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
return;
}
for (i = 0; i < 4; ++i) {
@@ -2552,23 +2652,24 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
}
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
- u32 access, u32 *error)
+ u32 access, struct x86_exception *exception)
{
- if (error)
- *error = 0;
+ if (exception)
+ exception->error_code = 0;
return vaddr;
}
static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
- u32 access, u32 *error)
+ u32 access,
+ struct x86_exception *exception)
{
- if (error)
- *error = 0;
+ if (exception)
+ exception->error_code = 0;
return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
}
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
- u32 error_code)
+ u32 error_code, bool prefault)
{
gfn_t gfn;
int r;
@@ -2584,17 +2685,68 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
gfn = gva >> PAGE_SHIFT;
return nonpaging_map(vcpu, gva & PAGE_MASK,
- error_code & PFERR_WRITE_MASK, gfn);
+ error_code & PFERR_WRITE_MASK, gfn, prefault);
+}
+
+static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
+{
+ struct kvm_arch_async_pf arch;
+
+ arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
+ arch.gfn = gfn;
+ arch.direct_map = vcpu->arch.mmu.direct_map;
+ arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
+
+ return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
+}
+
+static bool can_do_async_pf(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
+ kvm_event_needs_reinjection(vcpu)))
+ return false;
+
+ return kvm_x86_ops->interrupt_allowed(vcpu);
}
-static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
- u32 error_code)
+static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
+ gva_t gva, pfn_t *pfn, bool write, bool *writable)
+{
+ bool async;
+
+ *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
+
+ if (!async)
+ return false; /* *pfn has correct page already */
+
+ put_page(pfn_to_page(*pfn));
+
+ if (!prefault && can_do_async_pf(vcpu)) {
+ trace_kvm_try_async_get_page(gva, gfn);
+ if (kvm_find_async_pf_gfn(vcpu, gfn)) {
+ trace_kvm_async_pf_doublefault(gva, gfn);
+ kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+ return true;
+ } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
+ return true;
+ }
+
+ *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
+
+ return false;
+}
+
+static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
+ bool prefault)
{
pfn_t pfn;
int r;
int level;
+ int force_pt_level;
gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
+ int write = error_code & PFERR_WRITE_MASK;
+ bool map_writable;
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
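
kvm_arch_setup_async_pf() above packs a per-vcpu sequence number and the vcpu id into a single token so the completion can later be matched to the faulting vcpu. A small sketch of that packing under the same 12-bit split; treat the layout as illustrative rather than a stable ABI:

	#include <stdint.h>
	#include <stdio.h>

	/* token layout mirrored from above: low 12 bits = vcpu id, rest = counter */
	static uint32_t apf_make_token(uint32_t *next_id, uint32_t vcpu_id)
	{
		return ((*next_id)++ << 12) | (vcpu_id & 0xfff);
	}

	static void apf_parse_token(uint32_t token, uint32_t *id, uint32_t *vcpu_id)
	{
		*id = token >> 12;
		*vcpu_id = token & 0xfff;
	}

	int main(void)
	{
		uint32_t next_id = 7, id, vcpu_id;
		uint32_t token = apf_make_token(&next_id, 3);

		apf_parse_token(token, &id, &vcpu_id);
		printf("token=%#x id=%u vcpu=%u\n",
		       (unsigned)token, (unsigned)id, (unsigned)vcpu_id);	/* id=7 vcpu=3 */
		return 0;
	}
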
@@ -2603,21 +2755,30 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
if (r)
return r;
- level = mapping_level(vcpu, gfn);
-
- gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+ if (likely(!force_pt_level)) {
+ level = mapping_level(vcpu, gfn);
+ gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ } else
+ level = PT_PAGE_TABLE_LEVEL;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- pfn = gfn_to_pfn(vcpu->kvm, gfn);
+
+ if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+ return 0;
+
+ /* mmio */
if (is_error_pfn(pfn))
return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
- r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
- level, gfn, pfn);
+ if (likely(!force_pt_level))
+ transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
+ r = __direct_map(vcpu, gpa, write, map_writable,
+ level, gfn, pfn, prefault);
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
@@ -2659,18 +2820,19 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
- pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
+ pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
mmu_free_roots(vcpu);
}
static unsigned long get_cr3(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.cr3;
+ return kvm_read_cr3(vcpu);
}
-static void inject_page_fault(struct kvm_vcpu *vcpu)
+static void inject_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
{
- vcpu->arch.mmu.inject_page_fault(vcpu);
+ vcpu->arch.mmu.inject_page_fault(vcpu, fault);
}
static void paging_free(struct kvm_vcpu *vcpu)
@@ -2816,6 +2978,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = vcpu->arch.walk_mmu;
+ context->base_role.word = 0;
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = tdp_page_fault;
context->free = nonpaging_free;
@@ -3008,9 +3171,6 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
return;
}
- if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
- return;
-
++vcpu->kvm->stat.mmu_pte_updated;
if (!sp->role.cr4_pae)
paging32_update_pte(vcpu, sp, spte, new);
@@ -3264,12 +3424,13 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
}
}
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
+ void *insn, int insn_len)
{
int r;
enum emulation_result er;
- r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
+ r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
if (r < 0)
goto out;
@@ -3282,7 +3443,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
if (r)
goto out;
- er = emulate_instruction(vcpu, cr2, error_code, 0);
+ er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
switch (er) {
case EMULATE_DONE:
@@ -3377,11 +3538,14 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
if (!test_bit(slot, sp->slot_bitmap))
continue;
+ if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+ continue;
+
pt = sp->spt;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
/* avoid RMW */
if (is_writable_pte(pt[i]))
- pt[i] &= ~PT_WRITABLE_MASK;
+ update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
}
kvm_flush_remote_tlbs(kvm);
}
@@ -3463,13 +3627,6 @@ static void mmu_destroy_caches(void)
kmem_cache_destroy(mmu_page_header_cache);
}
-void kvm_mmu_module_exit(void)
-{
- mmu_destroy_caches();
- percpu_counter_destroy(&kvm_total_used_mmu_pages);
- unregister_shrinker(&mmu_shrinker);
-}
-
int kvm_mmu_module_init(void)
{
pte_chain_cache = kmem_cache_create("kvm_pte_chain",
@@ -3566,7 +3723,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
- (void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
+ (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
return 1;
}
@@ -3662,12 +3819,6 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
}
EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
-#ifdef CONFIG_KVM_MMU_AUDIT
-#include "mmu_audit.c"
-#else
-static void mmu_audit_disable(void) { }
-#endif
-
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
@@ -3675,5 +3826,18 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
destroy_kvm_mmu(vcpu);
free_mmu_pages(vcpu);
mmu_free_memory_caches(vcpu);
+}
+
+#ifdef CONFIG_KVM_MMU_AUDIT
+#include "mmu_audit.c"
+#else
+static void mmu_audit_disable(void) { }
+#endif
+
+void kvm_mmu_module_exit(void)
+{
+ mmu_destroy_caches();
+ percpu_counter_destroy(&kvm_total_used_mmu_pages);
+ unregister_shrinker(&mmu_shrinker);
mmu_audit_disable();
}
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index ba2bcdde6221..5f6223b8bcf7 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -19,11 +19,9 @@
#include <linux/ratelimit.h>
-static int audit_point;
-
-#define audit_printk(fmt, args...) \
+#define audit_printk(kvm, fmt, args...) \
printk(KERN_ERR "audit: (%s) error: " \
- fmt, audit_point_name[audit_point], ##args)
+ fmt, audit_point_name[kvm->arch.audit_point], ##args)
typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
@@ -97,18 +95,21 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
if (sp->unsync) {
if (level != PT_PAGE_TABLE_LEVEL) {
- audit_printk("unsync sp: %p level = %d\n", sp, level);
+ audit_printk(vcpu->kvm, "unsync sp: %p "
+ "level = %d\n", sp, level);
return;
}
if (*sptep == shadow_notrap_nonpresent_pte) {
- audit_printk("notrap spte in unsync sp: %p\n", sp);
+ audit_printk(vcpu->kvm, "notrap spte in unsync "
+ "sp: %p\n", sp);
return;
}
}
if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
- audit_printk("notrap spte in direct sp: %p\n", sp);
+ audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
+ sp);
return;
}
@@ -125,8 +126,9 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
hpa = pfn << PAGE_SHIFT;
if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
- audit_printk("levels %d pfn %llx hpa %llx ent %llxn",
- vcpu->arch.mmu.root_level, pfn, hpa, *sptep);
+ audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
+ "ent %llx\n", vcpu->arch.mmu.root_level, pfn,
+ hpa, *sptep);
}
static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
@@ -142,8 +144,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
if (!gfn_to_memslot(kvm, gfn)) {
if (!printk_ratelimit())
return;
- audit_printk("no memslot for gfn %llx\n", gfn);
- audit_printk("index %ld of sp (gfn=%llx)\n",
+ audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
+ audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
(long int)(sptep - rev_sp->spt), rev_sp->gfn);
dump_stack();
return;
@@ -153,7 +155,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
if (!*rmapp) {
if (!printk_ratelimit())
return;
- audit_printk("no rmap for writable spte %llx\n", *sptep);
+ audit_printk(kvm, "no rmap for writable spte %llx\n",
+ *sptep);
dump_stack();
}
}
@@ -168,8 +171,9 @@ static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
{
struct kvm_mmu_page *sp = page_header(__pa(sptep));
- if (audit_point == AUDIT_POST_SYNC && sp->unsync)
- audit_printk("meet unsync sp(%p) after sync root.\n", sp);
+ if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
+ audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
+ "root.\n", sp);
}
static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -202,8 +206,9 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
spte = rmap_next(kvm, rmapp, NULL);
while (spte) {
if (is_writable_pte(*spte))
- audit_printk("shadow page has writable mappings: gfn "
- "%llx role %x\n", sp->gfn, sp->role.word);
+ audit_printk(kvm, "shadow page has writable "
+ "mappings: gfn %llx role %x\n",
+ sp->gfn, sp->role.word);
spte = rmap_next(kvm, rmapp, spte);
}
}
@@ -238,7 +243,7 @@ static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
if (!__ratelimit(&ratelimit_state))
return;
- audit_point = point;
+ vcpu->kvm->arch.audit_point = point;
audit_all_active_sps(vcpu->kvm);
audit_vcpu_spte(vcpu);
}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index cd7a833a3b52..6bccc24c4181 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -72,7 +72,7 @@ struct guest_walker {
unsigned pt_access;
unsigned pte_access;
gfn_t gfn;
- u32 error_code;
+ struct x86_exception fault;
};
static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
@@ -266,21 +266,23 @@ walk:
return 1;
error:
- walker->error_code = 0;
+ walker->fault.vector = PF_VECTOR;
+ walker->fault.error_code_valid = true;
+ walker->fault.error_code = 0;
if (present)
- walker->error_code |= PFERR_PRESENT_MASK;
+ walker->fault.error_code |= PFERR_PRESENT_MASK;
- walker->error_code |= write_fault | user_fault;
+ walker->fault.error_code |= write_fault | user_fault;
if (fetch_fault && mmu->nx)
- walker->error_code |= PFERR_FETCH_MASK;
+ walker->fault.error_code |= PFERR_FETCH_MASK;
if (rsvd_fault)
- walker->error_code |= PFERR_RSVD_MASK;
+ walker->fault.error_code |= PFERR_RSVD_MASK;
- vcpu->arch.fault.address = addr;
- vcpu->arch.fault.error_code = walker->error_code;
+ walker->fault.address = addr;
+ walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
- trace_kvm_mmu_walker_error(walker->error_code);
+ trace_kvm_mmu_walker_error(walker->fault.error_code);
return 0;
}
@@ -299,25 +301,42 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
addr, access);
}
+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ pt_element_t gpte)
+{
+ u64 nonpresent = shadow_trap_nonpresent_pte;
+
+ if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+ goto no_present;
+
+ if (!is_present_gpte(gpte)) {
+ if (!sp->unsync)
+ nonpresent = shadow_notrap_nonpresent_pte;
+ goto no_present;
+ }
+
+ if (!(gpte & PT_ACCESSED_MASK))
+ goto no_present;
+
+ return false;
+
+no_present:
+ drop_spte(vcpu->kvm, spte, nonpresent);
+ return true;
+}
+
static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 *spte, const void *pte)
{
pt_element_t gpte;
unsigned pte_access;
pfn_t pfn;
- u64 new_spte;
gpte = *(const pt_element_t *)pte;
- if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
- if (!is_present_gpte(gpte)) {
- if (sp->unsync)
- new_spte = shadow_trap_nonpresent_pte;
- else
- new_spte = shadow_notrap_nonpresent_pte;
- __set_spte(spte, new_spte);
- }
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
return;
- }
+
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
@@ -329,7 +348,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return;
kvm_get_pfn(pfn);
/*
- * we call mmu_set_spte() with reset_host_protection = true beacuse that
+ * we call mmu_set_spte() with host_writable = true because
* vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
*/
mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
@@ -364,7 +383,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
u64 *sptep)
{
struct kvm_mmu_page *sp;
- struct kvm_mmu *mmu = &vcpu->arch.mmu;
pt_element_t *gptep = gw->prefetch_ptes;
u64 *spte;
int i;
@@ -395,14 +413,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
gpte = gptep[i];
- if (!is_present_gpte(gpte) ||
- is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) {
- if (!sp->unsync)
- __set_spte(spte, shadow_notrap_nonpresent_pte);
- continue;
- }
-
- if (!(gpte & PT_ACCESSED_MASK))
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
continue;
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
@@ -427,7 +438,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
int user_fault, int write_fault, int hlevel,
- int *ptwrite, pfn_t pfn)
+ int *ptwrite, pfn_t pfn, bool map_writable,
+ bool prefault)
{
unsigned access = gw->pt_access;
struct kvm_mmu_page *sp = NULL;
@@ -501,7 +513,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
user_fault, write_fault, dirty, ptwrite, it.level,
- gw->gfn, pfn, false, true);
+ gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
return it.sptep;
@@ -527,8 +539,8 @@ out_gpte_changed:
* Returns: 1 if we need to emulate the instruction, 0 otherwise, or
* a negative value on error.
*/
-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
- u32 error_code)
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
+ bool prefault)
{
int write_fault = error_code & PFERR_WRITE_MASK;
int user_fault = error_code & PFERR_USER_MASK;
@@ -538,7 +550,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
int r;
pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
+ int force_pt_level;
unsigned long mmu_seq;
+ bool map_writable;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
@@ -556,19 +570,29 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
*/
if (!r) {
pgprintk("%s: guest page fault\n", __func__);
- inject_page_fault(vcpu);
- vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
+ if (!prefault) {
+ inject_page_fault(vcpu, &walker.fault);
+ /* reset fork detector */
+ vcpu->arch.last_pt_write_count = 0;
+ }
return 0;
}
- if (walker.level >= PT_DIRECTORY_LEVEL) {
+ if (walker.level >= PT_DIRECTORY_LEVEL)
+ force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
+ else
+ force_pt_level = 1;
+ if (!force_pt_level) {
level = min(walker.level, mapping_level(vcpu, walker.gfn));
walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
}
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
+
+ if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
+ &map_writable))
+ return 0;
/* mmio */
if (is_error_pfn(pfn))
@@ -580,8 +604,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
kvm_mmu_free_some_pages(vcpu);
+ if (!force_pt_level)
+ transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
- level, &write_pt, pfn);
+ level, &write_pt, pfn, map_writable, prefault);
(void)sptep;
pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
sptep, *sptep, write_pt);
@@ -661,7 +687,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
}
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
- u32 *error)
+ struct x86_exception *exception)
{
struct guest_walker walker;
gpa_t gpa = UNMAPPED_GVA;
@@ -672,14 +698,15 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
if (r) {
gpa = gfn_to_gpa(walker.gfn);
gpa |= vaddr & ~PAGE_MASK;
- } else if (error)
- *error = walker.error_code;
+ } else if (exception)
+ *exception = walker.fault;
return gpa;
}
static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
- u32 access, u32 *error)
+ u32 access,
+ struct x86_exception *exception)
{
struct guest_walker walker;
gpa_t gpa = UNMAPPED_GVA;
@@ -690,8 +717,8 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
if (r) {
gpa = gfn_to_gpa(walker.gfn);
gpa |= vaddr & ~PAGE_MASK;
- } else if (error)
- *error = walker.error_code;
+ } else if (exception)
+ *exception = walker.fault;
return gpa;
}
@@ -730,12 +757,19 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
* Using the cached information from sp->gfns is safe because:
* - The spte has a reference to the struct page, so the pfn for a given gfn
* can't change unless all sptes pointing to it are nuked first.
+ *
+ * Note:
+ * We should flush all TLBs if an spte is dropped even though the guest
+ * is responsible for it. If we don't, kvm_mmu_notifier_invalidate_page
+ * and kvm_mmu_notifier_invalidate_range_start will see that the dropped
+ * mapping is no longer used by the guest, skip the TLB flush, and the
+ * guest could keep accessing the freed pages through stale TLB entries.
+ * We increase kvm->tlbs_dirty to delay the TLB flush in this case.
*/
-static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- bool clear_unsync)
+static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
int i, offset, nr_present;
- bool reset_host_protection;
+ bool host_writable;
gpa_t first_pte_gpa;
offset = nr_present = 0;
@@ -764,31 +798,27 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return -EINVAL;
gfn = gpte_to_gfn(gpte);
- if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)
- || gfn != sp->gfns[i] || !is_present_gpte(gpte)
- || !(gpte & PT_ACCESSED_MASK)) {
- u64 nonpresent;
- if (is_present_gpte(gpte) || !clear_unsync)
- nonpresent = shadow_trap_nonpresent_pte;
- else
- nonpresent = shadow_notrap_nonpresent_pte;
- drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
+ vcpu->kvm->tlbs_dirty++;
+ continue;
+ }
+
+ if (gfn != sp->gfns[i]) {
+ drop_spte(vcpu->kvm, &sp->spt[i],
+ shadow_trap_nonpresent_pte);
+ vcpu->kvm->tlbs_dirty++;
continue;
}
nr_present++;
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
- if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
- pte_access &= ~ACC_WRITE_MASK;
- reset_host_protection = 0;
- } else {
- reset_host_protection = 1;
- }
+ host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
+
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
spte_to_pfn(sp->spt[i]), true, false,
- reset_host_protection);
+ host_writable);
}
return !nr_present;
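
The sync_page() rework above stops flushing immediately when a stale spte is dropped and instead bumps kvm->tlbs_dirty, letting a later flush cover all accumulated drops. A minimal single-threaded model of that deferred-flush pattern (the real counter is manipulated under mmu_lock; names here are made up):

	#include <stdio.h>

	struct kvm_model {
		unsigned long tlbs_dirty;	/* dropped sptes not yet covered by a flush */
		unsigned long flushes;
	};

	/* guest-initiated drop: only record the debt, do not flush right away */
	static void drop_stale_spte(struct kvm_model *kvm)
	{
		kvm->tlbs_dirty++;
	}

	/* whoever needs TLB consistency next pays the debt with a single flush */
	static void flush_if_dirty(struct kvm_model *kvm)
	{
		if (kvm->tlbs_dirty) {
			kvm->flushes++;		/* kvm_flush_remote_tlbs() in the real code */
			kvm->tlbs_dirty = 0;
		}
	}

	int main(void)
	{
		struct kvm_model kvm = { 0, 0 };

		drop_stale_spte(&kvm);
		drop_stale_spte(&kvm);
		flush_if_dirty(&kvm);		/* one flush covers both drops */
		printf("flushes=%lu\n", kvm.flushes);
		return 0;
	}
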
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b81a9b7c2ca4..25bd1bc5aad2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -31,6 +31,7 @@
#include <asm/tlbflush.h>
#include <asm/desc.h>
+#include <asm/kvm_para.h>
#include <asm/virtext.h>
#include "trace.h"
@@ -50,6 +51,10 @@ MODULE_LICENSE("GPL");
#define SVM_FEATURE_LBRV (1 << 1)
#define SVM_FEATURE_SVML (1 << 2)
#define SVM_FEATURE_NRIP (1 << 3)
+#define SVM_FEATURE_TSC_RATE (1 << 4)
+#define SVM_FEATURE_VMCB_CLEAN (1 << 5)
+#define SVM_FEATURE_FLUSH_ASID (1 << 6)
+#define SVM_FEATURE_DECODE_ASSIST (1 << 7)
#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
@@ -97,10 +102,8 @@ struct nested_state {
unsigned long vmexit_rax;
/* cache for intercepts of the guest */
- u16 intercept_cr_read;
- u16 intercept_cr_write;
- u16 intercept_dr_read;
- u16 intercept_dr_write;
+ u32 intercept_cr;
+ u32 intercept_dr;
u32 intercept_exceptions;
u64 intercept;
@@ -123,7 +126,12 @@ struct vcpu_svm {
u64 next_rip;
u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
- u64 host_gs_base;
+ struct {
+ u16 fs;
+ u16 gs;
+ u16 ldt;
+ u64 gs_base;
+ } host;
u32 *msrpm;
@@ -133,6 +141,7 @@ struct vcpu_svm {
unsigned int3_injected;
unsigned long int3_rip;
+ u32 apf_reason;
};
#define MSR_INVALID 0xffffffffU
@@ -180,14 +189,151 @@ static int nested_svm_vmexit(struct vcpu_svm *svm);
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
+enum {
+ VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
+ pause filter count */
+ VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */
+ VMCB_ASID, /* ASID */
+ VMCB_INTR, /* int_ctl, int_vector */
+ VMCB_NPT, /* npt_en, nCR3, gPAT */
+ VMCB_CR, /* CR0, CR3, CR4, EFER */
+ VMCB_DR, /* DR6, DR7 */
+ VMCB_DT, /* GDT, IDT */
+ VMCB_SEG, /* CS, DS, SS, ES, CPL */
+ VMCB_CR2, /* CR2 only */
+ VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
+ VMCB_DIRTY_MAX,
+};
+
+/* TPR and CR2 are always written before VMRUN */
+#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
+
+static inline void mark_all_dirty(struct vmcb *vmcb)
+{
+ vmcb->control.clean = 0;
+}
+
+static inline void mark_all_clean(struct vmcb *vmcb)
+{
+ vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
+ & ~VMCB_ALWAYS_DIRTY_MASK;
+}
+
+static inline void mark_dirty(struct vmcb *vmcb, int bit)
+{
+ vmcb->control.clean &= ~(1 << bit);
+}
+
static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
{
return container_of(vcpu, struct vcpu_svm, vcpu);
}
-static inline bool is_nested(struct vcpu_svm *svm)
+static void recalc_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *c, *h;
+ struct nested_state *g;
+
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ if (!is_guest_mode(&svm->vcpu))
+ return;
+
+ c = &svm->vmcb->control;
+ h = &svm->nested.hsave->control;
+ g = &svm->nested;
+
+ c->intercept_cr = h->intercept_cr | g->intercept_cr;
+ c->intercept_dr = h->intercept_dr | g->intercept_dr;
+ c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
+ c->intercept = h->intercept | g->intercept;
+}
+
+static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
+{
+ if (is_guest_mode(&svm->vcpu))
+ return svm->nested.hsave;
+ else
+ return svm->vmcb;
+}
+
+static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_cr |= (1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_cr &= ~(1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ return vmcb->control.intercept_cr & (1U << bit);
+}
+
+static inline void set_dr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_dr |= (1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_dr &= ~(1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_exceptions |= (1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
{
- return svm->nested.vmcb;
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_exceptions &= ~(1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void set_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept |= (1ULL << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept &= ~(1ULL << bit);
+
+ recalc_intercepts(svm);
}
static inline void enable_gif(struct vcpu_svm *svm)
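
The mark_dirty()/mark_all_clean() helpers above implement the VMCB clean-bits optimization: each bit tells the CPU that the corresponding VMCB area is unchanged since the last VMRUN and may be reloaded from its internal cache, while fields rewritten on every entry stay permanently dirty. A standalone sketch of the bookkeeping with an abbreviated, made-up set of clean bits:

	#include <stdint.h>
	#include <stdio.h>

	enum { CLEAN_INTERCEPTS, CLEAN_ASID, CLEAN_CR, CLEAN_INTR, CLEAN_CR2, CLEAN_MAX };

	/* fields rewritten before every VMRUN can never be advertised as clean */
	#define ALWAYS_DIRTY ((1u << CLEAN_INTR) | (1u << CLEAN_CR2))

	struct vmcb_model {
		uint32_t clean;
	};

	static void mark_all_clean(struct vmcb_model *v)
	{
		v->clean = ((1u << CLEAN_MAX) - 1) & ~ALWAYS_DIRTY;
	}

	static void mark_dirty(struct vmcb_model *v, int bit)
	{
		v->clean &= ~(1u << bit);
	}

	int main(void)
	{
		struct vmcb_model v = { 0 };

		mark_all_clean(&v);		/* done once the fields were consumed by VMRUN */
		mark_dirty(&v, CLEAN_CR);	/* e.g. the guest just wrote CR4 */
		printf("clean=%#x\n", (unsigned)v.clean);
		return 0;
	}
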
@@ -264,11 +410,6 @@ static u32 svm_msrpm_offset(u32 msr)
#define MAX_INST_SIZE 15
-static inline u32 svm_has(u32 feat)
-{
- return svm_features & feat;
-}
-
static inline void clgi(void)
{
asm volatile (__ex(SVM_CLGI));
@@ -284,16 +425,6 @@ static inline void invlpga(unsigned long addr, u32 asid)
asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
}
-static inline void force_new_asid(struct kvm_vcpu *vcpu)
-{
- to_svm(vcpu)->asid_generation--;
-}
-
-static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
-{
- force_new_asid(vcpu);
-}
-
static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
@@ -310,6 +441,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
efer &= ~EFER_LME;
to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
+ mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
}
static int is_external_interrupt(u32 info)
@@ -347,7 +479,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
svm->next_rip = svm->vmcb->control.next_rip;
if (!svm->next_rip) {
- if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
+ if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
EMULATE_DONE)
printk(KERN_DEBUG "%s: NOP\n", __func__);
return;
@@ -374,7 +506,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
nested_svm_check_exception(svm, nr, has_error_code, error_code))
return;
- if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) {
+ if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
/*
@@ -670,7 +802,7 @@ static __init int svm_hardware_setup(void)
svm_features = cpuid_edx(SVM_CPUID_FUNC);
- if (!svm_has(SVM_FEATURE_NPT))
+ if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;
if (npt_enabled && !npt) {
@@ -725,13 +857,15 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
struct vcpu_svm *svm = to_svm(vcpu);
u64 g_tsc_offset = 0;
- if (is_nested(svm)) {
+ if (is_guest_mode(vcpu)) {
g_tsc_offset = svm->vmcb->control.tsc_offset -
svm->nested.hsave->control.tsc_offset;
svm->nested.hsave->control.tsc_offset = offset;
}
svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
+
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
@@ -739,8 +873,9 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.tsc_offset += adjustment;
- if (is_nested(svm))
+ if (is_guest_mode(vcpu))
svm->nested.hsave->control.tsc_offset += adjustment;
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
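
svm_write_tsc_offset() above has to preserve the extra delta that L1 programmed for L2 when the host changes the L1 offset: the guest-visible TSC is the host TSC plus the active offset, so the new L2 offset is the new L1 offset plus the old (L2 - L1) difference, which is what g_tsc_offset holds. A short arithmetic sketch with made-up values:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		int64_t l1_offset = 1000;	/* hsave->control.tsc_offset */
		int64_t l2_offset = 1300;	/* vmcb->control.tsc_offset while nested */
		int64_t new_l1_offset = 5000;	/* offset the host now wants for L1 */

		int64_t g_tsc_offset = l2_offset - l1_offset;	/* L2's extra delta: 300 */

		l1_offset = new_l1_offset;
		l2_offset = new_l1_offset + g_tsc_offset;	/* 5300: delta preserved */

		printf("l1=%lld l2=%lld\n", (long long)l1_offset, (long long)l2_offset);
		return 0;
	}
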
static void init_vmcb(struct vcpu_svm *svm)
@@ -749,62 +884,62 @@ static void init_vmcb(struct vcpu_svm *svm)
struct vmcb_save_area *save = &svm->vmcb->save;
svm->vcpu.fpu_active = 1;
+ svm->vcpu.arch.hflags = 0;
- control->intercept_cr_read = INTERCEPT_CR0_MASK |
- INTERCEPT_CR3_MASK |
- INTERCEPT_CR4_MASK;
-
- control->intercept_cr_write = INTERCEPT_CR0_MASK |
- INTERCEPT_CR3_MASK |
- INTERCEPT_CR4_MASK |
- INTERCEPT_CR8_MASK;
-
- control->intercept_dr_read = INTERCEPT_DR0_MASK |
- INTERCEPT_DR1_MASK |
- INTERCEPT_DR2_MASK |
- INTERCEPT_DR3_MASK |
- INTERCEPT_DR4_MASK |
- INTERCEPT_DR5_MASK |
- INTERCEPT_DR6_MASK |
- INTERCEPT_DR7_MASK;
-
- control->intercept_dr_write = INTERCEPT_DR0_MASK |
- INTERCEPT_DR1_MASK |
- INTERCEPT_DR2_MASK |
- INTERCEPT_DR3_MASK |
- INTERCEPT_DR4_MASK |
- INTERCEPT_DR5_MASK |
- INTERCEPT_DR6_MASK |
- INTERCEPT_DR7_MASK;
-
- control->intercept_exceptions = (1 << PF_VECTOR) |
- (1 << UD_VECTOR) |
- (1 << MC_VECTOR);
-
-
- control->intercept = (1ULL << INTERCEPT_INTR) |
- (1ULL << INTERCEPT_NMI) |
- (1ULL << INTERCEPT_SMI) |
- (1ULL << INTERCEPT_SELECTIVE_CR0) |
- (1ULL << INTERCEPT_CPUID) |
- (1ULL << INTERCEPT_INVD) |
- (1ULL << INTERCEPT_HLT) |
- (1ULL << INTERCEPT_INVLPG) |
- (1ULL << INTERCEPT_INVLPGA) |
- (1ULL << INTERCEPT_IOIO_PROT) |
- (1ULL << INTERCEPT_MSR_PROT) |
- (1ULL << INTERCEPT_TASK_SWITCH) |
- (1ULL << INTERCEPT_SHUTDOWN) |
- (1ULL << INTERCEPT_VMRUN) |
- (1ULL << INTERCEPT_VMMCALL) |
- (1ULL << INTERCEPT_VMLOAD) |
- (1ULL << INTERCEPT_VMSAVE) |
- (1ULL << INTERCEPT_STGI) |
- (1ULL << INTERCEPT_CLGI) |
- (1ULL << INTERCEPT_SKINIT) |
- (1ULL << INTERCEPT_WBINVD) |
- (1ULL << INTERCEPT_MONITOR) |
- (1ULL << INTERCEPT_MWAIT);
+ set_cr_intercept(svm, INTERCEPT_CR0_READ);
+ set_cr_intercept(svm, INTERCEPT_CR3_READ);
+ set_cr_intercept(svm, INTERCEPT_CR4_READ);
+ set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
+ set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
+ set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
+ set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+
+ set_dr_intercept(svm, INTERCEPT_DR0_READ);
+ set_dr_intercept(svm, INTERCEPT_DR1_READ);
+ set_dr_intercept(svm, INTERCEPT_DR2_READ);
+ set_dr_intercept(svm, INTERCEPT_DR3_READ);
+ set_dr_intercept(svm, INTERCEPT_DR4_READ);
+ set_dr_intercept(svm, INTERCEPT_DR5_READ);
+ set_dr_intercept(svm, INTERCEPT_DR6_READ);
+ set_dr_intercept(svm, INTERCEPT_DR7_READ);
+
+ set_dr_intercept(svm, INTERCEPT_DR0_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR1_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR2_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR3_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR4_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR5_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR6_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR7_WRITE);
+
+ set_exception_intercept(svm, PF_VECTOR);
+ set_exception_intercept(svm, UD_VECTOR);
+ set_exception_intercept(svm, MC_VECTOR);
+
+ set_intercept(svm, INTERCEPT_INTR);
+ set_intercept(svm, INTERCEPT_NMI);
+ set_intercept(svm, INTERCEPT_SMI);
+ set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+ set_intercept(svm, INTERCEPT_CPUID);
+ set_intercept(svm, INTERCEPT_INVD);
+ set_intercept(svm, INTERCEPT_HLT);
+ set_intercept(svm, INTERCEPT_INVLPG);
+ set_intercept(svm, INTERCEPT_INVLPGA);
+ set_intercept(svm, INTERCEPT_IOIO_PROT);
+ set_intercept(svm, INTERCEPT_MSR_PROT);
+ set_intercept(svm, INTERCEPT_TASK_SWITCH);
+ set_intercept(svm, INTERCEPT_SHUTDOWN);
+ set_intercept(svm, INTERCEPT_VMRUN);
+ set_intercept(svm, INTERCEPT_VMMCALL);
+ set_intercept(svm, INTERCEPT_VMLOAD);
+ set_intercept(svm, INTERCEPT_VMSAVE);
+ set_intercept(svm, INTERCEPT_STGI);
+ set_intercept(svm, INTERCEPT_CLGI);
+ set_intercept(svm, INTERCEPT_SKINIT);
+ set_intercept(svm, INTERCEPT_WBINVD);
+ set_intercept(svm, INTERCEPT_MONITOR);
+ set_intercept(svm, INTERCEPT_MWAIT);
+ set_intercept(svm, INTERCEPT_XSETBV);
control->iopm_base_pa = iopm_base;
control->msrpm_base_pa = __pa(svm->msrpm);
@@ -855,25 +990,27 @@ static void init_vmcb(struct vcpu_svm *svm)
if (npt_enabled) {
/* Setup VMCB for Nested Paging */
control->nested_ctl = 1;
- control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
- (1ULL << INTERCEPT_INVLPG));
- control->intercept_exceptions &= ~(1 << PF_VECTOR);
- control->intercept_cr_read &= ~INTERCEPT_CR3_MASK;
- control->intercept_cr_write &= ~INTERCEPT_CR3_MASK;
+ clr_intercept(svm, INTERCEPT_TASK_SWITCH);
+ clr_intercept(svm, INTERCEPT_INVLPG);
+ clr_exception_intercept(svm, PF_VECTOR);
+ clr_cr_intercept(svm, INTERCEPT_CR3_READ);
+ clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
save->g_pat = 0x0007040600070406ULL;
save->cr3 = 0;
save->cr4 = 0;
}
- force_new_asid(&svm->vcpu);
+ svm->asid_generation = 0;
svm->nested.vmcb = 0;
svm->vcpu.arch.hflags = 0;
- if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
+ if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
control->pause_filter_count = 3000;
- control->intercept |= (1ULL << INTERCEPT_PAUSE);
+ set_intercept(svm, INTERCEPT_PAUSE);
}
+ mark_all_dirty(svm->vmcb);
+
enable_gif(svm);
}
@@ -990,8 +1127,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (unlikely(cpu != vcpu->cpu)) {
svm->asid_generation = 0;
+ mark_all_dirty(svm->vmcb);
}
+#ifdef CONFIG_X86_64
+ rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
+#endif
+ savesegment(fs, svm->host.fs);
+ savesegment(gs, svm->host.gs);
+ svm->host.ldt = kvm_read_ldt();
+
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
}
@@ -1002,6 +1147,14 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
int i;
++vcpu->stat.host_state_reload;
+ kvm_load_ldt(svm->host.ldt);
+#ifdef CONFIG_X86_64
+ loadsegment(fs, svm->host.fs);
+ load_gs_index(svm->host.gs);
+ wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
+#else
+ loadsegment(gs, svm->host.gs);
+#endif
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
}
@@ -1021,7 +1174,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
switch (reg) {
case VCPU_EXREG_PDPTR:
BUG_ON(!npt_enabled);
- load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
+ load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
break;
default:
BUG();
@@ -1030,12 +1183,12 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
static void svm_set_vintr(struct vcpu_svm *svm)
{
- svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR;
+ set_intercept(svm, INTERCEPT_VINTR);
}
static void svm_clear_vintr(struct vcpu_svm *svm)
{
- svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
+ clr_intercept(svm, INTERCEPT_VINTR);
}
static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
@@ -1150,6 +1303,7 @@ static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
svm->vmcb->save.idtr.limit = dt->size;
svm->vmcb->save.idtr.base = dt->address ;
+ mark_dirty(svm->vmcb, VMCB_DT);
}
static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
@@ -1166,19 +1320,23 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
svm->vmcb->save.gdtr.limit = dt->size;
svm->vmcb->save.gdtr.base = dt->address ;
+ mark_dirty(svm->vmcb, VMCB_DT);
}
static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
{
}
+static void svm_decache_cr3(struct kvm_vcpu *vcpu)
+{
+}
+
static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
}
static void update_cr0_intercept(struct vcpu_svm *svm)
{
- struct vmcb *vmcb = svm->vmcb;
ulong gcr0 = svm->vcpu.arch.cr0;
u64 *hcr0 = &svm->vmcb->save.cr0;
@@ -1188,27 +1346,14 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
*hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
| (gcr0 & SVM_CR0_SELECTIVE_MASK);
+ mark_dirty(svm->vmcb, VMCB_CR);
if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
- vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
- vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
- if (is_nested(svm)) {
- struct vmcb *hsave = svm->nested.hsave;
-
- hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
- hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
- vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read;
- vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
- }
+ clr_cr_intercept(svm, INTERCEPT_CR0_READ);
+ clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
} else {
- svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
- svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
- if (is_nested(svm)) {
- struct vmcb *hsave = svm->nested.hsave;
-
- hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
- hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
- }
+ set_cr_intercept(svm, INTERCEPT_CR0_READ);
+ set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
}
}
@@ -1216,7 +1361,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (is_nested(svm)) {
+ if (is_guest_mode(vcpu)) {
/*
* We are here because we run in nested mode: the host kvm
* intercepts cr0 writes but the l1 hypervisor does not.
@@ -1268,6 +1413,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
*/
cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
svm->vmcb->save.cr0 = cr0;
+ mark_dirty(svm->vmcb, VMCB_CR);
update_cr0_intercept(svm);
}
@@ -1277,13 +1423,14 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
- force_new_asid(vcpu);
+ svm_flush_tlb(vcpu);
vcpu->arch.cr4 = cr4;
if (!npt_enabled)
cr4 |= X86_CR4_PAE;
cr4 |= host_cr4_mce;
to_svm(vcpu)->vmcb->save.cr4 = cr4;
+ mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
}
static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -1312,26 +1459,25 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
= (svm->vmcb->save.cs.attrib
>> SVM_SELECTOR_DPL_SHIFT) & 3;
+ mark_dirty(svm->vmcb, VMCB_SEG);
}
static void update_db_intercept(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->control.intercept_exceptions &=
- ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
+ clr_exception_intercept(svm, DB_VECTOR);
+ clr_exception_intercept(svm, BP_VECTOR);
if (svm->nmi_singlestep)
- svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
+ set_exception_intercept(svm, DB_VECTOR);
if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
if (vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
- svm->vmcb->control.intercept_exceptions |=
- 1 << DB_VECTOR;
+ set_exception_intercept(svm, DB_VECTOR);
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
- svm->vmcb->control.intercept_exceptions |=
- 1 << BP_VECTOR;
+ set_exception_intercept(svm, BP_VECTOR);
} else
vcpu->guest_debug = 0;
}
@@ -1345,21 +1491,9 @@ static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
else
svm->vmcb->save.dr7 = vcpu->arch.dr7;
- update_db_intercept(vcpu);
-}
-
-static void load_host_msrs(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
- wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
-#endif
-}
+ mark_dirty(svm->vmcb, VMCB_DR);
-static void save_host_msrs(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
- rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
-#endif
+ update_db_intercept(vcpu);
}
static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
@@ -1372,6 +1506,8 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
svm->asid_generation = sd->asid_generation;
svm->vmcb->control.asid = sd->next_asid++;
+
+ mark_dirty(svm->vmcb, VMCB_ASID);
}
static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
@@ -1379,20 +1515,40 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->save.dr7 = value;
+ mark_dirty(svm->vmcb, VMCB_DR);
}
static int pf_interception(struct vcpu_svm *svm)
{
- u64 fault_address;
+ u64 fault_address = svm->vmcb->control.exit_info_2;
u32 error_code;
+ int r = 1;
- fault_address = svm->vmcb->control.exit_info_2;
- error_code = svm->vmcb->control.exit_info_1;
-
- trace_kvm_page_fault(fault_address, error_code);
- if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
- kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
- return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
+ switch (svm->apf_reason) {
+ default:
+ error_code = svm->vmcb->control.exit_info_1;
+
+ trace_kvm_page_fault(fault_address, error_code);
+ if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
+ kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
+ r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
+ svm->vmcb->control.insn_bytes,
+ svm->vmcb->control.insn_len);
+ break;
+ case KVM_PV_REASON_PAGE_NOT_PRESENT:
+ svm->apf_reason = 0;
+ local_irq_disable();
+ kvm_async_pf_task_wait(fault_address);
+ local_irq_enable();
+ break;
+ case KVM_PV_REASON_PAGE_READY:
+ svm->apf_reason = 0;
+ local_irq_disable();
+ kvm_async_pf_task_wake(fault_address);
+ local_irq_enable();
+ break;
+ }
+ return r;
}
static int db_interception(struct vcpu_svm *svm)
@@ -1440,7 +1596,7 @@ static int ud_interception(struct vcpu_svm *svm)
{
int er;
- er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD);
+ er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
if (er != EMULATE_DONE)
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
@@ -1449,21 +1605,8 @@ static int ud_interception(struct vcpu_svm *svm)
static void svm_fpu_activate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- u32 excp;
-
- if (is_nested(svm)) {
- u32 h_excp, n_excp;
-
- h_excp = svm->nested.hsave->control.intercept_exceptions;
- n_excp = svm->nested.intercept_exceptions;
- h_excp &= ~(1 << NM_VECTOR);
- excp = h_excp | n_excp;
- } else {
- excp = svm->vmcb->control.intercept_exceptions;
- excp &= ~(1 << NM_VECTOR);
- }
- svm->vmcb->control.intercept_exceptions = excp;
+ clr_exception_intercept(svm, NM_VECTOR);
svm->vcpu.fpu_active = 1;
update_cr0_intercept(svm);
@@ -1570,7 +1713,7 @@ static int io_interception(struct vcpu_svm *svm)
string = (io_info & SVM_IOIO_STR_MASK) != 0;
in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
if (string || in)
- return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+ return emulate_instruction(vcpu, 0) == EMULATE_DONE;
port = io_info >> 16;
size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -1624,17 +1767,19 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.nested_cr3 = root;
- force_new_asid(vcpu);
+ mark_dirty(svm->vmcb, VMCB_NPT);
+ svm_flush_tlb(vcpu);
}
-static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu)
+static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
{
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.exit_code = SVM_EXIT_NPF;
svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = vcpu->arch.fault.error_code;
- svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address;
+ svm->vmcb->control.exit_info_1 = fault->error_code;
+ svm->vmcb->control.exit_info_2 = fault->address;
nested_svm_vmexit(svm);
}
@@ -1680,7 +1825,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
{
int vmexit;
- if (!is_nested(svm))
+ if (!is_guest_mode(&svm->vcpu))
return 0;
svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
@@ -1698,7 +1843,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
/* This function returns true if it is safe to enable the irq window */
static inline bool nested_svm_intr(struct vcpu_svm *svm)
{
- if (!is_nested(svm))
+ if (!is_guest_mode(&svm->vcpu))
return true;
if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
@@ -1737,7 +1882,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
/* This function returns true if it is safe to enable the nmi window */
static inline bool nested_svm_nmi(struct vcpu_svm *svm)
{
- if (!is_nested(svm))
+ if (!is_guest_mode(&svm->vcpu))
return true;
if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
@@ -1836,8 +1981,8 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
return NESTED_EXIT_HOST;
break;
case SVM_EXIT_EXCP_BASE + PF_VECTOR:
- /* When we're shadowing, trap PFs */
- if (!npt_enabled)
+ /* When we're shadowing, trap PFs, but not async PF */
+ if (!npt_enabled && svm->apf_reason == 0)
return NESTED_EXIT_HOST;
break;
case SVM_EXIT_EXCP_BASE + NM_VECTOR:
@@ -1865,27 +2010,15 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
case SVM_EXIT_IOIO:
vmexit = nested_svm_intercept_ioio(svm);
break;
- case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
- u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
- if (svm->nested.intercept_cr_read & cr_bits)
+ case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
+ u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
+ if (svm->nested.intercept_cr & bit)
vmexit = NESTED_EXIT_DONE;
break;
}
- case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: {
- u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0);
- if (svm->nested.intercept_cr_write & cr_bits)
- vmexit = NESTED_EXIT_DONE;
- break;
- }
- case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
- u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
- if (svm->nested.intercept_dr_read & dr_bits)
- vmexit = NESTED_EXIT_DONE;
- break;
- }
- case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
- u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
- if (svm->nested.intercept_dr_write & dr_bits)
+ case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
+ u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
+ if (svm->nested.intercept_dr & bit)
vmexit = NESTED_EXIT_DONE;
break;
}
@@ -1893,6 +2026,10 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
if (svm->nested.intercept_exceptions & excp_bits)
vmexit = NESTED_EXIT_DONE;
+ /* an async page fault always causes a vmexit */
+ else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
+ svm->apf_reason != 0)
+ vmexit = NESTED_EXIT_DONE;
break;
}
case SVM_EXIT_ERR: {
@@ -1926,10 +2063,8 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr
struct vmcb_control_area *dst = &dst_vmcb->control;
struct vmcb_control_area *from = &from_vmcb->control;
- dst->intercept_cr_read = from->intercept_cr_read;
- dst->intercept_cr_write = from->intercept_cr_write;
- dst->intercept_dr_read = from->intercept_dr_read;
- dst->intercept_dr_write = from->intercept_dr_write;
+ dst->intercept_cr = from->intercept_cr;
+ dst->intercept_dr = from->intercept_dr;
dst->intercept_exceptions = from->intercept_exceptions;
dst->intercept = from->intercept;
dst->iopm_base_pa = from->iopm_base_pa;
@@ -1970,7 +2105,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
if (!nested_vmcb)
return 1;
- /* Exit nested SVM mode */
+ /* Exit Guest-Mode */
+ leave_guest_mode(&svm->vcpu);
svm->nested.vmcb = 0;
/* Give the current vmcb to the guest */
@@ -1984,7 +2120,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
nested_vmcb->save.idtr = vmcb->save.idtr;
nested_vmcb->save.efer = svm->vcpu.arch.efer;
nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
- nested_vmcb->save.cr3 = svm->vcpu.arch.cr3;
+ nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
nested_vmcb->save.cr2 = vmcb->save.cr2;
nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
nested_vmcb->save.rflags = vmcb->save.rflags;
@@ -2061,6 +2197,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
svm->vmcb->save.cpl = 0;
svm->vmcb->control.exit_int_info = 0;
+ mark_all_dirty(svm->vmcb);
+
nested_svm_unmap(page);
nested_svm_uninit_mmu_context(&svm->vcpu);
@@ -2148,8 +2286,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
nested_vmcb->control.event_inj,
nested_vmcb->control.nested_ctl);
- trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read,
- nested_vmcb->control.intercept_cr_write,
+ trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
+ nested_vmcb->control.intercept_cr >> 16,
nested_vmcb->control.intercept_exceptions,
nested_vmcb->control.intercept);
@@ -2177,7 +2315,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
if (npt_enabled)
hsave->save.cr3 = vmcb->save.cr3;
else
- hsave->save.cr3 = svm->vcpu.arch.cr3;
+ hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
copy_vmcb_control_area(hsave, vmcb);
@@ -2229,14 +2367,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
/* cache intercepts */
- svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read;
- svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write;
- svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read;
- svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write;
+ svm->nested.intercept_cr = nested_vmcb->control.intercept_cr;
+ svm->nested.intercept_dr = nested_vmcb->control.intercept_dr;
svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
svm->nested.intercept = nested_vmcb->control.intercept;
- force_new_asid(&svm->vcpu);
+ svm_flush_tlb(&svm->vcpu);
svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
svm->vcpu.arch.hflags |= HF_VINTR_MASK;
@@ -2245,29 +2381,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
/* We only want the cr8 intercept bits of the guest */
- svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK;
- svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
+ clr_cr_intercept(svm, INTERCEPT_CR8_READ);
+ clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
}
/* We don't want to see VMMCALLs from a nested guest */
- svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL);
-
- /*
- * We don't want a nested guest to be more powerful than the guest, so
- * all intercepts are ORed
- */
- svm->vmcb->control.intercept_cr_read |=
- nested_vmcb->control.intercept_cr_read;
- svm->vmcb->control.intercept_cr_write |=
- nested_vmcb->control.intercept_cr_write;
- svm->vmcb->control.intercept_dr_read |=
- nested_vmcb->control.intercept_dr_read;
- svm->vmcb->control.intercept_dr_write |=
- nested_vmcb->control.intercept_dr_write;
- svm->vmcb->control.intercept_exceptions |=
- nested_vmcb->control.intercept_exceptions;
-
- svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
+ clr_intercept(svm, INTERCEPT_VMMCALL);
svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
@@ -2278,11 +2397,21 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
nested_svm_unmap(page);
- /* nested_vmcb is our indicator if nested SVM is activated */
+ /* Enter Guest-Mode */
+ enter_guest_mode(&svm->vcpu);
+
+ /*
+ * Merge guest and host intercepts - must be called with vcpu in
+ * guest-mode to take effect here
+ */
+ recalc_intercepts(svm);
+
svm->nested.vmcb = vmcb_gpa;
enable_gif(svm);
+ mark_all_dirty(svm->vmcb);
+
return true;
}
@@ -2400,6 +2529,8 @@ static int clgi_interception(struct vcpu_svm *svm)
svm_clear_vintr(svm);
svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+ mark_dirty(svm->vmcb, VMCB_INTR);
+
return 1;
}
@@ -2426,6 +2557,19 @@ static int skinit_interception(struct vcpu_svm *svm)
return 1;
}
+static int xsetbv_interception(struct vcpu_svm *svm)
+{
+ u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
+ u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
+
+ if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+ }
+
+ return 1;
+}
+
static int invalid_op_interception(struct vcpu_svm *svm)
{
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
@@ -2507,19 +2651,92 @@ static int cpuid_interception(struct vcpu_svm *svm)
static int iret_interception(struct vcpu_svm *svm)
{
++svm->vcpu.stat.nmi_window_exits;
- svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
+ clr_intercept(svm, INTERCEPT_IRET);
svm->vcpu.arch.hflags |= HF_IRET_MASK;
return 1;
}
static int invlpg_interception(struct vcpu_svm *svm)
{
- return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
+ if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+
+ kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
+ skip_emulated_instruction(&svm->vcpu);
+ return 1;
}
static int emulate_on_interception(struct vcpu_svm *svm)
{
- return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
+ return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+}
+
+#define CR_VALID (1ULL << 63)
+
+static int cr_interception(struct vcpu_svm *svm)
+{
+ int reg, cr;
+ unsigned long val;
+ int err;
+
+ if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_on_interception(svm);
+
+ if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
+ return emulate_on_interception(svm);
+
+ reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+ cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
+
+ err = 0;
+ if (cr >= 16) { /* mov to cr */
+ cr -= 16;
+ val = kvm_register_read(&svm->vcpu, reg);
+ switch (cr) {
+ case 0:
+ err = kvm_set_cr0(&svm->vcpu, val);
+ break;
+ case 3:
+ err = kvm_set_cr3(&svm->vcpu, val);
+ break;
+ case 4:
+ err = kvm_set_cr4(&svm->vcpu, val);
+ break;
+ case 8:
+ err = kvm_set_cr8(&svm->vcpu, val);
+ break;
+ default:
+ WARN(1, "unhandled write to CR%d", cr);
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+ }
+ } else { /* mov from cr */
+ switch (cr) {
+ case 0:
+ val = kvm_read_cr0(&svm->vcpu);
+ break;
+ case 2:
+ val = svm->vcpu.arch.cr2;
+ break;
+ case 3:
+ val = kvm_read_cr3(&svm->vcpu);
+ break;
+ case 4:
+ val = kvm_read_cr4(&svm->vcpu);
+ break;
+ case 8:
+ val = kvm_get_cr8(&svm->vcpu);
+ break;
+ default:
+ WARN(1, "unhandled read from CR%d", cr);
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+ }
+ kvm_register_write(&svm->vcpu, reg, val);
+ }
+ kvm_complete_insn_gp(&svm->vcpu, err);
+
+ return 1;
}
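For reference, a rough sketch (not part of this patch) of the decode-assist information cr_interception() relies on, assuming SVM_EXITINFO_REG_MASK selects the GPR operand in exit_info_1 and that the WRITE_CRn exit codes sit 16 above the READ_CRn ones, which is why "cr >= 16" selects the mov-to-CR path. decode_cr_exit() is hypothetical and purely illustrative.

	/* Illustrative only: summarize a CR intercept when decode assists are valid. */
	static void decode_cr_exit(u64 exit_info_1, u32 exit_code,
				   int *gpr, int *cr, bool *is_write)
	{
		*gpr      = exit_info_1 & SVM_EXITINFO_REG_MASK;	/* GPR operand index */
		*cr       = (exit_code - SVM_EXIT_READ_CR0) % 16;	/* CR number */
		*is_write = (exit_code - SVM_EXIT_READ_CR0) >= 16;	/* mov to CRn */
	}
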
static int cr0_write_interception(struct vcpu_svm *svm)
@@ -2527,7 +2744,7 @@ static int cr0_write_interception(struct vcpu_svm *svm)
struct kvm_vcpu *vcpu = &svm->vcpu;
int r;
- r = emulate_instruction(&svm->vcpu, 0, 0, 0);
+ r = cr_interception(svm);
if (svm->nested.vmexit_rip) {
kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
@@ -2536,22 +2753,47 @@ static int cr0_write_interception(struct vcpu_svm *svm)
svm->nested.vmexit_rip = 0;
}
- return r == EMULATE_DONE;
+ return r;
+}
+
+static int dr_interception(struct vcpu_svm *svm)
+{
+ int reg, dr;
+ unsigned long val;
+ int err;
+
+ if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_on_interception(svm);
+
+ reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+ dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
+
+ if (dr >= 16) { /* mov to DRn */
+ val = kvm_register_read(&svm->vcpu, reg);
+ kvm_set_dr(&svm->vcpu, dr - 16, val);
+ } else {
+ err = kvm_get_dr(&svm->vcpu, dr, &val);
+ if (!err)
+ kvm_register_write(&svm->vcpu, reg, val);
+ }
+
+ return 1;
}
static int cr8_write_interception(struct vcpu_svm *svm)
{
struct kvm_run *kvm_run = svm->vcpu.run;
+ int r;
u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
/* instruction emulation calls kvm_set_cr8() */
- emulate_instruction(&svm->vcpu, 0, 0, 0);
+ r = cr_interception(svm);
if (irqchip_in_kernel(svm->vcpu.kvm)) {
- svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
- return 1;
+ clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+ return r;
}
if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
- return 1;
+ return r;
kvm_run->exit_reason = KVM_EXIT_SET_TPR;
return 0;
}
@@ -2562,14 +2804,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
switch (ecx) {
case MSR_IA32_TSC: {
- u64 tsc_offset;
+ struct vmcb *vmcb = get_host_vmcb(svm);
- if (is_nested(svm))
- tsc_offset = svm->nested.hsave->control.tsc_offset;
- else
- tsc_offset = svm->vmcb->control.tsc_offset;
-
- *data = tsc_offset + native_read_tsc();
+ *data = vmcb->control.tsc_offset + native_read_tsc();
break;
}
case MSR_STAR:
@@ -2714,7 +2951,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
svm->vmcb->save.sysenter_esp = data;
break;
case MSR_IA32_DEBUGCTLMSR:
- if (!svm_has(SVM_FEATURE_LBRV)) {
+ if (!boot_cpu_has(X86_FEATURE_LBRV)) {
pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
__func__, data);
break;
@@ -2723,6 +2960,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
return 1;
svm->vmcb->save.dbgctl = data;
+ mark_dirty(svm->vmcb, VMCB_LBR);
if (data & (1ULL<<0))
svm_enable_lbrv(svm);
else
@@ -2775,6 +3013,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
svm_clear_vintr(svm);
svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+ mark_dirty(svm->vmcb, VMCB_INTR);
/*
* If the user space waits to inject interrupts, exit as soon as
* possible
@@ -2797,31 +3036,31 @@ static int pause_interception(struct vcpu_svm *svm)
}
static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
- [SVM_EXIT_READ_CR0] = emulate_on_interception,
- [SVM_EXIT_READ_CR3] = emulate_on_interception,
- [SVM_EXIT_READ_CR4] = emulate_on_interception,
- [SVM_EXIT_READ_CR8] = emulate_on_interception,
+ [SVM_EXIT_READ_CR0] = cr_interception,
+ [SVM_EXIT_READ_CR3] = cr_interception,
+ [SVM_EXIT_READ_CR4] = cr_interception,
+ [SVM_EXIT_READ_CR8] = cr_interception,
[SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
[SVM_EXIT_WRITE_CR0] = cr0_write_interception,
- [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
+ [SVM_EXIT_WRITE_CR3] = cr_interception,
+ [SVM_EXIT_WRITE_CR4] = cr_interception,
[SVM_EXIT_WRITE_CR8] = cr8_write_interception,
- [SVM_EXIT_READ_DR0] = emulate_on_interception,
- [SVM_EXIT_READ_DR1] = emulate_on_interception,
- [SVM_EXIT_READ_DR2] = emulate_on_interception,
- [SVM_EXIT_READ_DR3] = emulate_on_interception,
- [SVM_EXIT_READ_DR4] = emulate_on_interception,
- [SVM_EXIT_READ_DR5] = emulate_on_interception,
- [SVM_EXIT_READ_DR6] = emulate_on_interception,
- [SVM_EXIT_READ_DR7] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR0] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR1] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR2] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR3] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR4] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR6] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
+ [SVM_EXIT_READ_DR0] = dr_interception,
+ [SVM_EXIT_READ_DR1] = dr_interception,
+ [SVM_EXIT_READ_DR2] = dr_interception,
+ [SVM_EXIT_READ_DR3] = dr_interception,
+ [SVM_EXIT_READ_DR4] = dr_interception,
+ [SVM_EXIT_READ_DR5] = dr_interception,
+ [SVM_EXIT_READ_DR6] = dr_interception,
+ [SVM_EXIT_READ_DR7] = dr_interception,
+ [SVM_EXIT_WRITE_DR0] = dr_interception,
+ [SVM_EXIT_WRITE_DR1] = dr_interception,
+ [SVM_EXIT_WRITE_DR2] = dr_interception,
+ [SVM_EXIT_WRITE_DR3] = dr_interception,
+ [SVM_EXIT_WRITE_DR4] = dr_interception,
+ [SVM_EXIT_WRITE_DR5] = dr_interception,
+ [SVM_EXIT_WRITE_DR6] = dr_interception,
+ [SVM_EXIT_WRITE_DR7] = dr_interception,
[SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
[SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
[SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
@@ -2854,6 +3093,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_WBINVD] = emulate_on_interception,
[SVM_EXIT_MONITOR] = invalid_op_interception,
[SVM_EXIT_MWAIT] = invalid_op_interception,
+ [SVM_EXIT_XSETBV] = xsetbv_interception,
[SVM_EXIT_NPF] = pf_interception,
};
@@ -2864,10 +3104,10 @@ void dump_vmcb(struct kvm_vcpu *vcpu)
struct vmcb_save_area *save = &svm->vmcb->save;
pr_err("VMCB Control Area:\n");
- pr_err("cr_read: %04x\n", control->intercept_cr_read);
- pr_err("cr_write: %04x\n", control->intercept_cr_write);
- pr_err("dr_read: %04x\n", control->intercept_dr_read);
- pr_err("dr_write: %04x\n", control->intercept_dr_write);
+ pr_err("cr_read: %04x\n", control->intercept_cr & 0xffff);
+ pr_err("cr_write: %04x\n", control->intercept_cr >> 16);
+ pr_err("dr_read: %04x\n", control->intercept_dr & 0xffff);
+ pr_err("dr_write: %04x\n", control->intercept_dr >> 16);
pr_err("exceptions: %08x\n", control->intercept_exceptions);
pr_err("intercepts: %016llx\n", control->intercept);
pr_err("pause filter count: %d\n", control->pause_filter_count);
@@ -2950,15 +3190,23 @@ void dump_vmcb(struct kvm_vcpu *vcpu)
}
+static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+{
+ struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+
+ *info1 = control->exit_info_1;
+ *info2 = control->exit_info_2;
+}
+
static int handle_exit(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_run *kvm_run = vcpu->run;
u32 exit_code = svm->vmcb->control.exit_code;
- trace_kvm_exit(exit_code, vcpu);
+ trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
- if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
+ if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
vcpu->arch.cr0 = svm->vmcb->save.cr0;
if (npt_enabled)
vcpu->arch.cr3 = svm->vmcb->save.cr3;
@@ -2970,7 +3218,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
return 1;
}
- if (is_nested(svm)) {
+ if (is_guest_mode(vcpu)) {
int vmexit;
trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
@@ -3033,7 +3281,6 @@ static void pre_svm_run(struct vcpu_svm *svm)
struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
- svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
/* FIXME: handle wraparound of asid_generation */
if (svm->asid_generation != sd->asid_generation)
new_asid(svm, sd);
@@ -3045,7 +3292,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
vcpu->arch.hflags |= HF_NMI_MASK;
- svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
+ set_intercept(svm, INTERCEPT_IRET);
++vcpu->stat.nmi_injections;
}
@@ -3058,6 +3305,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
control->int_ctl &= ~V_INTR_PRIO_MASK;
control->int_ctl |= V_IRQ_MASK |
((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
+ mark_dirty(svm->vmcb, VMCB_INTR);
}
static void svm_set_irq(struct kvm_vcpu *vcpu)
@@ -3077,14 +3325,14 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
return;
if (irr == -1)
return;
if (tpr >= irr)
- svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
+ set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
}
static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -3112,10 +3360,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
if (masked) {
svm->vcpu.arch.hflags |= HF_NMI_MASK;
- svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
+ set_intercept(svm, INTERCEPT_IRET);
} else {
svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
- svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
+ clr_intercept(svm, INTERCEPT_IRET);
}
}
@@ -3131,7 +3379,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
- if (is_nested(svm))
+ if (is_guest_mode(vcpu))
return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
return ret;
@@ -3177,7 +3425,12 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
static void svm_flush_tlb(struct kvm_vcpu *vcpu)
{
- force_new_asid(vcpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+ else
+ svm->asid_generation--;
}
static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
@@ -3188,10 +3441,10 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
return;
- if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
+ if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
kvm_set_cr8(vcpu, cr8);
}
@@ -3202,7 +3455,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
u64 cr8;
- if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
return;
cr8 = kvm_get_cr8(vcpu);
@@ -3289,9 +3542,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
static void svm_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- u16 fs_selector;
- u16 gs_selector;
- u16 ldt_selector;
svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
@@ -3308,10 +3558,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
sync_lapic_to_cr8(vcpu);
- save_host_msrs(vcpu);
- savesegment(fs, fs_selector);
- savesegment(gs, gs_selector);
- ldt_selector = kvm_read_ldt();
svm->vmcb->save.cr2 = vcpu->arch.cr2;
clgi();
@@ -3389,19 +3635,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
#endif
);
- vcpu->arch.cr2 = svm->vmcb->save.cr2;
- vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
- vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
- vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
-
- load_host_msrs(vcpu);
- kvm_load_ldt(ldt_selector);
- loadsegment(fs, fs_selector);
#ifdef CONFIG_X86_64
- load_gs_index(gs_selector);
- wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
+ wrmsrl(MSR_GS_BASE, svm->host.gs_base);
#else
- loadsegment(gs, gs_selector);
+ loadsegment(fs, svm->host.fs);
#endif
reload_tss(vcpu);
@@ -3410,10 +3647,21 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
stgi();
+ vcpu->arch.cr2 = svm->vmcb->save.cr2;
+ vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+ vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+ vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
+
sync_cr8_to_lapic(vcpu);
svm->next_rip = 0;
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
+ /* if exit due to PF check for async PF */
+ if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
+ svm->apf_reason = kvm_read_and_reset_pf_reason();
+
if (npt_enabled) {
vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
@@ -3426,6 +3674,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
if (unlikely(svm->vmcb->control.exit_code ==
SVM_EXIT_EXCP_BASE + MC_VECTOR))
svm_handle_mce(svm);
+
+ mark_all_clean(svm->vmcb);
}
#undef R
@@ -3435,7 +3685,8 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->save.cr3 = root;
- force_new_asid(vcpu);
+ mark_dirty(svm->vmcb, VMCB_CR);
+ svm_flush_tlb(vcpu);
}
static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
@@ -3443,11 +3694,13 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.nested_cr3 = root;
+ mark_dirty(svm->vmcb, VMCB_NPT);
/* Also sync guest cr3 here in case we live migrate */
- svm->vmcb->save.cr3 = vcpu->arch.cr3;
+ svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
+ mark_dirty(svm->vmcb, VMCB_CR);
- force_new_asid(vcpu);
+ svm_flush_tlb(vcpu);
}
static int is_disabled(void)
@@ -3494,10 +3747,6 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
{
switch (func) {
- case 0x00000001:
- /* Mask out xsave bit as long as it is not supported by SVM */
- entry->ecx &= ~(bit(X86_FEATURE_XSAVE));
- break;
case 0x80000001:
if (nested)
entry->ecx |= (1 << 2); /* Set SVM bit */
@@ -3511,7 +3760,7 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
additional features */
/* Support next_rip if host supports it */
- if (svm_has(SVM_FEATURE_NRIP))
+ if (boot_cpu_has(X86_FEATURE_NRIPS))
entry->edx |= SVM_FEATURE_NRIP;
/* Support NPT for the guest if enabled */
@@ -3571,6 +3820,7 @@ static const struct trace_print_flags svm_exit_reasons_str[] = {
{ SVM_EXIT_WBINVD, "wbinvd" },
{ SVM_EXIT_MONITOR, "monitor" },
{ SVM_EXIT_MWAIT, "mwait" },
+ { SVM_EXIT_XSETBV, "xsetbv" },
{ SVM_EXIT_NPF, "npf" },
{ -1, NULL }
};
@@ -3594,9 +3844,7 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
- if (is_nested(svm))
- svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
+ set_exception_intercept(svm, NM_VECTOR);
update_cr0_intercept(svm);
}
@@ -3627,6 +3875,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.get_cpl = svm_get_cpl,
.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
.decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
+ .decache_cr3 = svm_decache_cr3,
.decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
.set_cr0 = svm_set_cr0,
.set_cr3 = svm_set_cr3,
@@ -3667,7 +3916,9 @@ static struct kvm_x86_ops svm_x86_ops = {
.get_tdp_level = get_npt_level,
.get_mt_mask = svm_get_mt_mask,
+ .get_exit_info = svm_get_exit_info,
.exit_reasons_str = svm_exit_reasons_str,
+
.get_lpage_level = svm_get_lpage_level,
.cpuid_update = svm_cpuid_update,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index a6544b8e7c0f..1357d7cf4ec8 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -178,27 +178,36 @@ TRACE_EVENT(kvm_apic,
#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val)
#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val)
+#define KVM_ISA_VMX 1
+#define KVM_ISA_SVM 2
+
/*
* Tracepoint for kvm guest exit:
*/
TRACE_EVENT(kvm_exit,
- TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu),
- TP_ARGS(exit_reason, vcpu),
+ TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa),
+ TP_ARGS(exit_reason, vcpu, isa),
TP_STRUCT__entry(
__field( unsigned int, exit_reason )
__field( unsigned long, guest_rip )
+ __field( u32, isa )
+ __field( u64, info1 )
+ __field( u64, info2 )
),
TP_fast_assign(
__entry->exit_reason = exit_reason;
__entry->guest_rip = kvm_rip_read(vcpu);
+ __entry->isa = isa;
+ kvm_x86_ops->get_exit_info(vcpu, &__entry->info1,
+ &__entry->info2);
),
- TP_printk("reason %s rip 0x%lx",
+ TP_printk("reason %s rip 0x%lx info %llx %llx",
ftrace_print_symbols_seq(p, __entry->exit_reason,
kvm_x86_ops->exit_reasons_str),
- __entry->guest_rip)
+ __entry->guest_rip, __entry->info1, __entry->info2)
);
/*
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 81fcbe9515c5..bf89ec2cfb82 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
static int __read_mostly vmm_exclusive = 1;
module_param(vmm_exclusive, bool, S_IRUGO);
+static int __read_mostly yield_on_hlt = 1;
+module_param(yield_on_hlt, bool, S_IRUGO);
+
#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
#define KVM_GUEST_CR0_MASK \
@@ -177,6 +180,7 @@ static int init_rmode(struct kvm *kvm);
static u64 construct_eptp(unsigned long root_hpa);
static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
+static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -188,6 +192,8 @@ static unsigned long *vmx_io_bitmap_b;
static unsigned long *vmx_msr_bitmap_legacy;
static unsigned long *vmx_msr_bitmap_longmode;
+static bool cpu_has_load_ia32_efer;
+
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
@@ -472,7 +478,7 @@ static void vmcs_clear(struct vmcs *vmcs)
u8 error;
asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
- : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+ : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
: "cc", "memory");
if (error)
printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
@@ -485,7 +491,7 @@ static void vmcs_load(struct vmcs *vmcs)
u8 error;
asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
- : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+ : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
: "cc", "memory");
if (error)
printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
@@ -565,10 +571,10 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
static unsigned long vmcs_readl(unsigned long field)
{
- unsigned long value;
+ unsigned long value = 0;
asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
- : "=a"(value) : "d"(field) : "cc");
+ : "+a"(value) : "d"(field) : "cc");
return value;
}
@@ -661,6 +667,12 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
unsigned i;
struct msr_autoload *m = &vmx->msr_autoload;
+ if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
+ vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
+ vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
+ return;
+ }
+
for (i = 0; i < m->nr; ++i)
if (m->guest[i].index == msr)
break;
@@ -680,6 +692,14 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
unsigned i;
struct msr_autoload *m = &vmx->msr_autoload;
+ if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
+ vmcs_write64(GUEST_IA32_EFER, guest_val);
+ vmcs_write64(HOST_IA32_EFER, host_val);
+ vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
+ vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
+ return;
+ }
+
for (i = 0; i < m->nr; ++i)
if (m->guest[i].index == msr)
break;
@@ -1009,6 +1029,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
vmx_set_interrupt_shadow(vcpu, 0);
}
+static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
+{
+ /* Ensure that we clear the HLT state in the VMCS. We don't need to
+ * explicitly skip the instruction because if the HLT state is set, then
+ * the instruction is already executing and RIP has already been
+ * advanced. */
+ if (!yield_on_hlt &&
+ vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
+ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+}
+
static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code,
bool reinject)
@@ -1035,6 +1066,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
intr_info |= INTR_TYPE_HARD_EXCEPTION;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
+ vmx_clear_hlt(vcpu);
}
static bool vmx_rdtscp_supported(void)
@@ -1305,8 +1337,11 @@ static __init int vmx_disabled_by_bios(void)
&& tboot_enabled())
return 1;
if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
- && !tboot_enabled())
+ && !tboot_enabled()) {
+ printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
+ " activate TXT before enabling KVM\n");
return 1;
+ }
}
return 0;
@@ -1400,6 +1435,14 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
return 0;
}
+static __init bool allow_1_setting(u32 msr, u32 ctl)
+{
+ u32 vmx_msr_low, vmx_msr_high;
+
+ rdmsr(msr, vmx_msr_low, vmx_msr_high);
+ return vmx_msr_high & ctl;
+}
+
static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
{
u32 vmx_msr_low, vmx_msr_high;
@@ -1416,7 +1459,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
&_pin_based_exec_control) < 0)
return -EIO;
- min = CPU_BASED_HLT_EXITING |
+ min =
#ifdef CONFIG_X86_64
CPU_BASED_CR8_LOAD_EXITING |
CPU_BASED_CR8_STORE_EXITING |
@@ -1429,6 +1472,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
CPU_BASED_MWAIT_EXITING |
CPU_BASED_MONITOR_EXITING |
CPU_BASED_INVLPG_EXITING;
+
+ if (yield_on_hlt)
+ min |= CPU_BASED_HLT_EXITING;
+
opt = CPU_BASED_TPR_SHADOW |
CPU_BASED_USE_MSR_BITMAPS |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1510,6 +1557,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
vmcs_conf->vmexit_ctrl = _vmexit_control;
vmcs_conf->vmentry_ctrl = _vmentry_control;
+ cpu_has_load_ia32_efer =
+ allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
+ VM_ENTRY_LOAD_IA32_EFER)
+ && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
+ VM_EXIT_LOAD_IA32_EFER);
+
return 0;
}
@@ -1683,9 +1736,13 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
save->limit = vmcs_read32(sf->limit);
save->ar = vmcs_read32(sf->ar_bytes);
vmcs_write16(sf->selector, save->base >> 4);
- vmcs_write32(sf->base, save->base & 0xfffff);
+ vmcs_write32(sf->base, save->base & 0xffff0);
vmcs_write32(sf->limit, 0xffff);
vmcs_write32(sf->ar_bytes, 0xf3);
+ if (save->base & 0xf)
+ printk_once(KERN_WARNING "kvm: segment base is not paragraph"
+ " aligned when entering protected mode (seg=%d)",
+ seg);
}
static void enter_rmode(struct kvm_vcpu *vcpu)
@@ -1814,6 +1871,13 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
}
+static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
+{
+ if (enable_ept && is_paging(vcpu))
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+}
+
static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
@@ -1857,6 +1921,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
unsigned long cr0,
struct kvm_vcpu *vcpu)
{
+ vmx_decache_cr3(vcpu);
if (!(cr0 & X86_CR0_PG)) {
/* From paging/starting to nonpaging */
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1937,7 +2002,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
if (enable_ept) {
eptp = construct_eptp(cr3);
vmcs_write64(EPT_POINTER, eptp);
- guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
+ guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
vcpu->kvm->arch.ept_identity_map_addr;
ept_load_pdptrs(vcpu);
}
@@ -2725,7 +2790,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_IDTR_BASE, 0);
vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
- vmcs_write32(GUEST_ACTIVITY_STATE, 0);
+ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
@@ -2787,6 +2852,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
return;
}
+ if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+ enable_irq_window(vcpu);
+ return;
+ }
cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
@@ -2814,6 +2883,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
} else
intr |= INTR_TYPE_EXT_INTR;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
+ vmx_clear_hlt(vcpu);
}
static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2841,6 +2911,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
}
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
+ vmx_clear_hlt(vcpu);
}
static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -2849,7 +2920,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
return 0;
return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI));
+ (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
+ | GUEST_INTR_STATE_NMI));
}
static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -2910,7 +2982,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
* Cause the #SS fault with 0 error code in VM86 mode.
*/
if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
- if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
+ if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
return 1;
/*
* Forward all other exceptions that are valid in real mode.
@@ -3007,7 +3079,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
}
if (is_invalid_opcode(intr_info)) {
- er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD);
+ er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
if (er != EMULATE_DONE)
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
@@ -3026,7 +3098,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
if (kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, cr2);
- return kvm_mmu_page_fault(vcpu, cr2, error_code);
+ return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
}
if (vmx->rmode.vm86_active &&
@@ -3098,7 +3170,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
++vcpu->stat.io_exits;
if (string || in)
- return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+ return emulate_instruction(vcpu, 0) == EMULATE_DONE;
port = exit_qualification >> 16;
size = (exit_qualification & 7) + 1;
@@ -3118,14 +3190,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[2] = 0xc1;
}
-static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
-{
- if (err)
- kvm_inject_gp(vcpu, 0);
- else
- skip_emulated_instruction(vcpu);
-}
-
static int handle_cr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification, val;
@@ -3143,21 +3207,21 @@ static int handle_cr(struct kvm_vcpu *vcpu)
switch (cr) {
case 0:
err = kvm_set_cr0(vcpu, val);
- complete_insn_gp(vcpu, err);
+ kvm_complete_insn_gp(vcpu, err);
return 1;
case 3:
err = kvm_set_cr3(vcpu, val);
- complete_insn_gp(vcpu, err);
+ kvm_complete_insn_gp(vcpu, err);
return 1;
case 4:
err = kvm_set_cr4(vcpu, val);
- complete_insn_gp(vcpu, err);
+ kvm_complete_insn_gp(vcpu, err);
return 1;
case 8: {
u8 cr8_prev = kvm_get_cr8(vcpu);
u8 cr8 = kvm_register_read(vcpu, reg);
- kvm_set_cr8(vcpu, cr8);
- skip_emulated_instruction(vcpu);
+ err = kvm_set_cr8(vcpu, cr8);
+ kvm_complete_insn_gp(vcpu, err);
if (irqchip_in_kernel(vcpu->kvm))
return 1;
if (cr8_prev <= cr8)
@@ -3176,8 +3240,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
case 1: /*mov from cr*/
switch (cr) {
case 3:
- kvm_register_write(vcpu, reg, vcpu->arch.cr3);
- trace_kvm_cr_read(cr, vcpu->arch.cr3);
+ val = kvm_read_cr3(vcpu);
+ kvm_register_write(vcpu, reg, val);
+ trace_kvm_cr_read(cr, val);
skip_emulated_instruction(vcpu);
return 1;
case 8:
@@ -3349,6 +3414,11 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_invd(struct kvm_vcpu *vcpu)
+{
+ return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+}
+
static int handle_invlpg(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3377,7 +3447,7 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
static int handle_apic_access(struct kvm_vcpu *vcpu)
{
- return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+ return emulate_instruction(vcpu, 0) == EMULATE_DONE;
}
static int handle_task_switch(struct kvm_vcpu *vcpu)
@@ -3476,7 +3546,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
trace_kvm_page_fault(gpa, exit_qualification);
- return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
+ return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
}
static u64 ept_rsvd_mask(u64 spte, int level)
@@ -3592,7 +3662,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
&& (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
return handle_interrupt_window(&vmx->vcpu);
- err = emulate_instruction(vcpu, 0, 0, 0);
+ err = emulate_instruction(vcpu, 0);
if (err == EMULATE_DO_MMIO) {
ret = 0;
@@ -3649,6 +3719,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_MSR_WRITE] = handle_wrmsr,
[EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
[EXIT_REASON_HLT] = handle_halt,
+ [EXIT_REASON_INVD] = handle_invd,
[EXIT_REASON_INVLPG] = handle_invlpg,
[EXIT_REASON_VMCALL] = handle_vmcall,
[EXIT_REASON_VMCLEAR] = handle_vmx_insn,
@@ -3676,6 +3747,12 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
static const int kvm_vmx_max_exit_handlers =
ARRAY_SIZE(kvm_vmx_exit_handlers);
+static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+{
+ *info1 = vmcs_readl(EXIT_QUALIFICATION);
+ *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
+}
+
/*
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
@@ -3686,17 +3763,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
u32 exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
- trace_kvm_exit(exit_reason, vcpu);
+ trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
/* If guest state is invalid, start emulating */
if (vmx->emulation_required && emulate_invalid_guest_state)
return handle_invalid_guest_state(vcpu);
- /* Access CR3 don't cause VMExit in paging mode, so we need
- * to sync with guest real CR3. */
- if (enable_ept && is_paging(vcpu))
- vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
-
if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -4013,7 +4085,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
);
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
- | (1 << VCPU_EXREG_PDPTR));
+ | (1 << VCPU_EXREG_PDPTR)
+ | (1 << VCPU_EXREG_CR3));
vcpu->arch.regs_dirty = 0;
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
@@ -4280,6 +4353,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.get_cpl = vmx_get_cpl,
.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
.decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
+ .decache_cr3 = vmx_decache_cr3,
.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
.set_cr0 = vmx_set_cr0,
.set_cr3 = vmx_set_cr3,
@@ -4320,7 +4394,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
.get_tdp_level = get_ept_level,
.get_mt_mask = vmx_get_mt_mask,
+ .get_exit_info = vmx_get_exit_info,
.exit_reasons_str = vmx_exit_reasons_str,
+
.get_lpage_level = vmx_get_lpage_level,
.cpuid_update = vmx_cpuid_update,
@@ -4396,8 +4472,6 @@ static int __init vmx_init(void)
if (enable_ept) {
bypass_guest_pf = 0;
- kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
- VMX_EPT_WRITABLE_MASK);
kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
VMX_EPT_EXECUTABLE_MASK);
kvm_enable_tdp();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 46a368cb651e..bcc0efce85bf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -43,6 +43,7 @@
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/uaccess.h>
+#include <linux/hash.h>
#include <trace/events/kvm.h>
#define CREATE_TRACE_POINTS
@@ -155,6 +156,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
u64 __read_mostly host_xcr0;
+static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
+{
+ int i;
+ for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
+ vcpu->arch.apf.gfns[i] = ~0;
+}
+
static void kvm_on_user_return(struct user_return_notifier *urn)
{
unsigned slot;
@@ -326,23 +334,28 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception);
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu)
+void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
{
- unsigned error_code = vcpu->arch.fault.error_code;
+ if (err)
+ kvm_inject_gp(vcpu, 0);
+ else
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+{
++vcpu->stat.pf_guest;
- vcpu->arch.cr2 = vcpu->arch.fault.address;
- kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
+ vcpu->arch.cr2 = fault->address;
+ kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
}
-void kvm_propagate_fault(struct kvm_vcpu *vcpu)
+void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
- if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested)
- vcpu->arch.nested_mmu.inject_page_fault(vcpu);
+ if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
+ vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
else
- vcpu->arch.mmu.inject_page_fault(vcpu);
-
- vcpu->arch.fault.nested = false;
+ vcpu->arch.mmu.inject_page_fault(vcpu, fault);
}
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
@@ -460,8 +473,8 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
(unsigned long *)&vcpu->arch.regs_avail))
return true;
- gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT;
- offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1);
+ gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
+ offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
PFERR_USER_MASK | PFERR_WRITE_MASK);
if (r < 0)
@@ -506,12 +519,15 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
} else
#endif
if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
- vcpu->arch.cr3))
+ kvm_read_cr3(vcpu)))
return 1;
}
kvm_x86_ops->set_cr0(vcpu, cr0);
+ if ((cr0 ^ old_cr0) & X86_CR0_PG)
+ kvm_clear_async_pf_completion_queue(vcpu);
+
if ((cr0 ^ old_cr0) & update_bits)
kvm_mmu_reset_context(vcpu);
return 0;
@@ -595,7 +611,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return 1;
} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
&& ((cr4 ^ old_cr4) & pdptr_bits)
- && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))
+ && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
+ kvm_read_cr3(vcpu)))
return 1;
if (cr4 & X86_CR4_VMXE)
@@ -615,7 +632,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
- if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
+ if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
kvm_mmu_sync_roots(vcpu);
kvm_mmu_flush_tlb(vcpu);
return 0;
@@ -650,12 +667,13 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
return 1;
vcpu->arch.cr3 = cr3;
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
vcpu->arch.mmu.new_cr3(vcpu);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);
-int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
if (cr8 & CR8_RESERVED_BITS)
return 1;
@@ -665,12 +683,6 @@ int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
vcpu->arch.cr8 = cr8;
return 0;
}
-
-void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
-{
- if (__kvm_set_cr8(vcpu, cr8))
- kvm_inject_gp(vcpu, 0);
-}
EXPORT_SYMBOL_GPL(kvm_set_cr8);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
@@ -775,12 +787,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
* kvm-specific. Those are put in the beginning of the list.
*/
-#define KVM_SAVE_MSRS_BEGIN 7
+#define KVM_SAVE_MSRS_BEGIN 8
static u32 msrs_to_save[] = {
MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
- HV_X64_MSR_APIC_ASSIST_PAGE,
+ HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_STAR,
#ifdef CONFIG_X86_64
@@ -830,7 +842,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
kvm_x86_ops->set_efer(vcpu, efer);
vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
- kvm_mmu_reset_context(vcpu);
/* Update reserved bits */
if ((efer ^ old_efer) & EFER_NX)
@@ -1418,6 +1429,30 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
return 0;
}
+static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
+{
+ gpa_t gpa = data & ~0x3f;
+
+ /* Bits 2:5 are reserved, should be zero */
+ if (data & 0x3c)
+ return 1;
+
+ vcpu->arch.apf.msr_val = data;
+
+ if (!(data & KVM_ASYNC_PF_ENABLED)) {
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_async_pf_hash_reset(vcpu);
+ return 0;
+ }
+
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
+ return 1;
+
+ vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
+ kvm_async_pf_wakeup_all(vcpu);
+ return 0;
+}
+
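For reference, a minimal sketch (not part of this patch) of the MSR_KVM_ASYNC_PF_EN layout that kvm_pv_enable_async_pf() validates above: bit 0 (KVM_ASYNC_PF_ENABLED) turns the feature on, bit 1 (KVM_ASYNC_PF_SEND_ALWAYS) asks for delivery even while the guest runs in kernel mode, bits 2:5 are reserved, and the remaining bits carry the 64-byte-aligned GPA of the shared word. async_pf_msr_decode() is a hypothetical helper.

	/* Illustrative only: decode MSR_KVM_ASYNC_PF_EN the same way the handler does. */
	static bool async_pf_msr_decode(u64 data, gpa_t *gpa,
					bool *enabled, bool *send_always)
	{
		if (data & 0x3c)			/* bits 2:5 are reserved */
			return false;
		*gpa         = data & ~0x3fULL;		/* 64-byte aligned shared area */
		*enabled     = data & KVM_ASYNC_PF_ENABLED;
		*send_always = data & KVM_ASYNC_PF_SEND_ALWAYS;
		return true;
	}
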
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
switch (msr) {
@@ -1499,6 +1534,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
}
break;
}
+ case MSR_KVM_ASYNC_PF_EN:
+ if (kvm_pv_enable_async_pf(vcpu, data))
+ return 1;
+ break;
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1775,6 +1814,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_KVM_SYSTEM_TIME_NEW:
data = vcpu->arch.time;
break;
+ case MSR_KVM_ASYNC_PF_EN:
+ data = vcpu->arch.apf.msr_val;
+ break;
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
case MSR_IA32_MCG_CAP:
@@ -1904,6 +1946,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_NOP_IO_DELAY:
case KVM_CAP_MP_STATE:
case KVM_CAP_SYNC_MMU:
+ case KVM_CAP_USER_NMI:
case KVM_CAP_REINJECT_CONTROL:
case KVM_CAP_IRQ_INJECT_STATUS:
case KVM_CAP_ASSIGN_DEV_IRQ:
@@ -1922,6 +1965,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
case KVM_CAP_XSAVE:
+ case KVM_CAP_ASYNC_PF:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -2185,6 +2229,11 @@ out:
return r;
}
+static void cpuid_mask(u32 *word, int wordnum)
+{
+ *word &= boot_cpu_data.x86_capability[wordnum];
+}
+
static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
u32 index)
{
@@ -2259,7 +2308,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
break;
case 1:
entry->edx &= kvm_supported_word0_x86_features;
+ cpuid_mask(&entry->edx, 0);
entry->ecx &= kvm_supported_word4_x86_features;
+ cpuid_mask(&entry->ecx, 4);
/* we support x2apic emulation even if host does not support
* it since we emulate x2apic in software */
entry->ecx |= F(X2APIC);
@@ -2350,7 +2401,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
break;
case 0x80000001:
entry->edx &= kvm_supported_word1_x86_features;
+ cpuid_mask(&entry->edx, 1);
entry->ecx &= kvm_supported_word6_x86_features;
+ cpuid_mask(&entry->ecx, 6);
break;
}
@@ -3169,20 +3222,18 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_memslots *slots, *old_slots;
unsigned long *dirty_bitmap;
- r = -ENOMEM;
- dirty_bitmap = vmalloc(n);
- if (!dirty_bitmap)
- goto out;
+ dirty_bitmap = memslot->dirty_bitmap_head;
+ if (memslot->dirty_bitmap == dirty_bitmap)
+ dirty_bitmap += n / sizeof(long);
memset(dirty_bitmap, 0, n);
r = -ENOMEM;
slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
- if (!slots) {
- vfree(dirty_bitmap);
+ if (!slots)
goto out;
- }
memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
+ slots->generation++;
old_slots = kvm->memslots;
rcu_assign_pointer(kvm->memslots, slots);
@@ -3195,11 +3246,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
spin_unlock(&kvm->mmu_lock);
r = -EFAULT;
- if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) {
- vfree(dirty_bitmap);
+ if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
goto out;
- }
- vfree(dirty_bitmap);
} else {
r = -EFAULT;
if (clear_user(log->dirty_bitmap, n))
@@ -3266,8 +3314,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (vpic) {
r = kvm_ioapic_init(kvm);
if (r) {
+ mutex_lock(&kvm->slots_lock);
kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
&vpic->dev);
+ mutex_unlock(&kvm->slots_lock);
kfree(vpic);
goto create_irqchip_unlock;
}
@@ -3278,10 +3328,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
smp_wmb();
r = kvm_setup_default_irq_routing(kvm);
if (r) {
+ mutex_lock(&kvm->slots_lock);
mutex_lock(&kvm->irq_lock);
kvm_ioapic_destroy(kvm);
kvm_destroy_pic(kvm);
mutex_unlock(&kvm->irq_lock);
+ mutex_unlock(&kvm->slots_lock);
}
create_irqchip_unlock:
mutex_unlock(&kvm->lock);
@@ -3557,63 +3609,63 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
{
gpa_t t_gpa;
- u32 error;
+ struct x86_exception exception;
BUG_ON(!mmu_is_nested(vcpu));
/* NPT walks are always user-walks */
access |= PFERR_USER_MASK;
- t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error);
- if (t_gpa == UNMAPPED_GVA)
- vcpu->arch.fault.nested = true;
+ t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
return t_gpa;
}
-gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
- gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+ gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_FETCH_MASK;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
-gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_WRITE_MASK;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
/* uses this to access any guest's mapped memory without checking CPL */
-gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
{
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error);
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
}
static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
struct kvm_vcpu *vcpu, u32 access,
- u32 *error)
+ struct x86_exception *exception)
{
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
- error);
+ exception);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
- if (gpa == UNMAPPED_GVA) {
- r = X86EMUL_PROPAGATE_FAULT;
- goto out;
- }
+ if (gpa == UNMAPPED_GVA)
+ return X86EMUL_PROPAGATE_FAULT;
ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
if (ret < 0) {
r = X86EMUL_IO_NEEDED;
@@ -3630,31 +3682,35 @@ out:
/* used for instruction fetching */
static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu, u32 *error)
+ struct kvm_vcpu *vcpu,
+ struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
- access | PFERR_FETCH_MASK, error);
+ access | PFERR_FETCH_MASK,
+ exception);
}
static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu, u32 *error)
+ struct kvm_vcpu *vcpu,
+ struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
- error);
+ exception);
}
static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu, u32 *error)
+ struct kvm_vcpu *vcpu,
+ struct x86_exception *exception)
{
- return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
}
static int kvm_write_guest_virt_system(gva_t addr, void *val,
unsigned int bytes,
struct kvm_vcpu *vcpu,
- u32 *error)
+ struct x86_exception *exception)
{
void *data = val;
int r = X86EMUL_CONTINUE;
@@ -3662,15 +3718,13 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
while (bytes) {
gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
PFERR_WRITE_MASK,
- error);
+ exception);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
- if (gpa == UNMAPPED_GVA) {
- r = X86EMUL_PROPAGATE_FAULT;
- goto out;
- }
+ if (gpa == UNMAPPED_GVA)
+ return X86EMUL_PROPAGATE_FAULT;
ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
if (ret < 0) {
r = X86EMUL_IO_NEEDED;
@@ -3688,7 +3742,7 @@ out:
static int emulator_read_emulated(unsigned long addr,
void *val,
unsigned int bytes,
- unsigned int *error_code,
+ struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
@@ -3701,7 +3755,7 @@ static int emulator_read_emulated(unsigned long addr,
return X86EMUL_CONTINUE;
}
- gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code);
+ gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception);
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
@@ -3710,8 +3764,8 @@ static int emulator_read_emulated(unsigned long addr,
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
goto mmio;
- if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL)
- == X86EMUL_CONTINUE)
+ if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception)
+ == X86EMUL_CONTINUE)
return X86EMUL_CONTINUE;
mmio:
@@ -3735,7 +3789,7 @@ mmio:
}
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
- const void *val, int bytes)
+ const void *val, int bytes)
{
int ret;
@@ -3749,12 +3803,12 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
static int emulator_write_emulated_onepage(unsigned long addr,
const void *val,
unsigned int bytes,
- unsigned int *error_code,
+ struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code);
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
@@ -3787,7 +3841,7 @@ mmio:
int emulator_write_emulated(unsigned long addr,
const void *val,
unsigned int bytes,
- unsigned int *error_code,
+ struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
/* Crossing a page boundary? */
@@ -3795,7 +3849,7 @@ int emulator_write_emulated(unsigned long addr,
int rc, now;
now = -addr & ~PAGE_MASK;
- rc = emulator_write_emulated_onepage(addr, val, now, error_code,
+ rc = emulator_write_emulated_onepage(addr, val, now, exception,
vcpu);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -3803,7 +3857,7 @@ int emulator_write_emulated(unsigned long addr,
val += now;
bytes -= now;
}
- return emulator_write_emulated_onepage(addr, val, bytes, error_code,
+ return emulator_write_emulated_onepage(addr, val, bytes, exception,
vcpu);
}
@@ -3821,7 +3875,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
const void *old,
const void *new,
unsigned int bytes,
- unsigned int *error_code,
+ struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
@@ -3879,7 +3933,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
emul_write:
printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
- return emulator_write_emulated(addr, new, bytes, error_code, vcpu);
+ return emulator_write_emulated(addr, new, bytes, exception, vcpu);
}
static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3904,7 +3958,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
if (vcpu->arch.pio.count)
goto data_avail;
- trace_kvm_pio(0, port, size, 1);
+ trace_kvm_pio(0, port, size, count);
vcpu->arch.pio.port = port;
vcpu->arch.pio.in = 1;
@@ -3932,7 +3986,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port,
const void *val, unsigned int count,
struct kvm_vcpu *vcpu)
{
- trace_kvm_pio(1, port, size, 1);
+ trace_kvm_pio(1, port, size, count);
vcpu->arch.pio.port = port;
vcpu->arch.pio.in = 0;
@@ -3973,13 +4027,15 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
return X86EMUL_CONTINUE;
if (kvm_x86_ops->has_wbinvd_exit()) {
- preempt_disable();
+ int cpu = get_cpu();
+
+ cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
wbinvd_ipi, NULL, 1);
- preempt_enable();
+ put_cpu();
cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
- }
- wbinvd();
+ } else
+ wbinvd();
return X86EMUL_CONTINUE;
}
EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
@@ -4019,7 +4075,7 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
value = vcpu->arch.cr2;
break;
case 3:
- value = vcpu->arch.cr3;
+ value = kvm_read_cr3(vcpu);
break;
case 4:
value = kvm_read_cr4(vcpu);
@@ -4053,7 +4109,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
break;
case 8:
- res = __kvm_set_cr8(vcpu, val & 0xfUL);
+ res = kvm_set_cr8(vcpu, val);
break;
default:
vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
@@ -4206,12 +4262,13 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
static void inject_emulated_exception(struct kvm_vcpu *vcpu)
{
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
- if (ctxt->exception == PF_VECTOR)
- kvm_propagate_fault(vcpu);
- else if (ctxt->error_code_valid)
- kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
+ if (ctxt->exception.vector == PF_VECTOR)
+ kvm_propagate_fault(vcpu, &ctxt->exception);
+ else if (ctxt->exception.error_code_valid)
+ kvm_queue_exception_e(vcpu, ctxt->exception.vector,
+ ctxt->exception.error_code);
else
- kvm_queue_exception(vcpu, ctxt->exception);
+ kvm_queue_exception(vcpu, ctxt->exception.vector);
}
static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
@@ -4267,13 +4324,19 @@ EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
static int handle_emulation_failure(struct kvm_vcpu *vcpu)
{
+ int r = EMULATE_DONE;
+
++vcpu->stat.insn_emulation_fail;
trace_kvm_emulate_insn_failed(vcpu);
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ if (!is_guest_mode(vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ r = EMULATE_FAIL;
+ }
kvm_queue_exception(vcpu, UD_VECTOR);
- return EMULATE_FAIL;
+
+ return r;
}
static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
@@ -4302,10 +4365,11 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
return false;
}
-int emulate_instruction(struct kvm_vcpu *vcpu,
- unsigned long cr2,
- u16 error_code,
- int emulation_type)
+int x86_emulate_instruction(struct kvm_vcpu *vcpu,
+ unsigned long cr2,
+ int emulation_type,
+ void *insn,
+ int insn_len)
{
int r;
struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
@@ -4323,10 +4387,10 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
init_emulate_ctxt(vcpu);
vcpu->arch.emulate_ctxt.interruptibility = 0;
- vcpu->arch.emulate_ctxt.exception = -1;
+ vcpu->arch.emulate_ctxt.have_exception = false;
vcpu->arch.emulate_ctxt.perm_ok = false;
- r = x86_decode_insn(&vcpu->arch.emulate_ctxt);
+ r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
if (r == X86EMUL_PROPAGATE_FAULT)
goto done;
@@ -4389,7 +4453,7 @@ restart:
}
done:
- if (vcpu->arch.emulate_ctxt.exception >= 0) {
+ if (vcpu->arch.emulate_ctxt.have_exception) {
inject_emulated_exception(vcpu);
r = EMULATE_DONE;
} else if (vcpu->arch.pio.count) {
@@ -4413,7 +4477,7 @@ done:
return r;
}
-EXPORT_SYMBOL_GPL(emulate_instruction);
+EXPORT_SYMBOL_GPL(x86_emulate_instruction);
int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
{
@@ -4653,7 +4717,6 @@ int kvm_arch_init(void *opaque)
kvm_x86_ops = ops;
kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
- kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0);
@@ -5116,6 +5179,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->fpu_active = 0;
kvm_x86_ops->fpu_deactivate(vcpu);
}
+ if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
+ /* Page is swapped out. Do synthetic halt */
+ vcpu->arch.apf.halted = true;
+ r = 1;
+ goto out;
+ }
}
r = kvm_mmu_reload(vcpu);
@@ -5244,7 +5313,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
r = 1;
while (r > 0) {
- if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
+ if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
+ !vcpu->arch.apf.halted)
r = vcpu_enter_guest(vcpu);
else {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
@@ -5257,6 +5327,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.mp_state =
KVM_MP_STATE_RUNNABLE;
case KVM_MP_STATE_RUNNABLE:
+ vcpu->arch.apf.halted = false;
break;
case KVM_MP_STATE_SIPI_RECEIVED:
default:
@@ -5278,6 +5349,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
vcpu->run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.request_irq_exits;
}
+
+ kvm_check_async_pf_completion(vcpu);
+
if (signal_pending(current)) {
r = -EINTR;
vcpu->run->exit_reason = KVM_EXIT_INTR;
@@ -5302,6 +5376,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
int r;
sigset_t sigsaved;
+ if (!tsk_used_math(current) && init_fpu(current))
+ return -ENOMEM;
+
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
@@ -5313,8 +5390,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
/* re-sync apic's tpr */
- if (!irqchip_in_kernel(vcpu->kvm))
- kvm_set_cr8(vcpu, kvm_run->cr8);
+ if (!irqchip_in_kernel(vcpu->kvm)) {
+ if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
+ r = -EINVAL;
+ goto out;
+ }
+ }
if (vcpu->arch.pio.count || vcpu->mmio_needed) {
if (vcpu->mmio_needed) {
@@ -5323,7 +5404,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu->mmio_needed = 0;
}
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
+ r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
if (r != EMULATE_DONE) {
r = 0;
@@ -5436,7 +5517,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
sregs->cr0 = kvm_read_cr0(vcpu);
sregs->cr2 = vcpu->arch.cr2;
- sregs->cr3 = vcpu->arch.cr3;
+ sregs->cr3 = kvm_read_cr3(vcpu);
sregs->cr4 = kvm_read_cr4(vcpu);
sregs->cr8 = kvm_get_cr8(vcpu);
sregs->efer = vcpu->arch.efer;
@@ -5504,8 +5585,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
kvm_x86_ops->set_gdt(vcpu, &dt);
vcpu->arch.cr2 = sregs->cr2;
- mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
+ mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
vcpu->arch.cr3 = sregs->cr3;
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
kvm_set_cr8(vcpu, sregs->cr8);
@@ -5522,7 +5604,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
if (sregs->cr4 & X86_CR4_OSXSAVE)
update_cpuid(vcpu);
if (!is_long_mode(vcpu) && is_pae(vcpu)) {
- load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
+ load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
mmu_reset_needed = 1;
}
@@ -5773,6 +5855,8 @@ free_vcpu:
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
+ vcpu->arch.apf.msr_val = 0;
+
vcpu_load(vcpu);
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);
@@ -5792,6 +5876,11 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
vcpu->arch.dr7 = DR7_FIXED_1;
kvm_make_request(KVM_REQ_EVENT, vcpu);
+ vcpu->arch.apf.msr_val = 0;
+
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_async_pf_hash_reset(vcpu);
+ vcpu->arch.apf.halted = false;
return kvm_x86_ops->vcpu_reset(vcpu);
}
@@ -5881,6 +5970,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
goto fail_free_mce_banks;
+ kvm_async_pf_hash_reset(vcpu);
+
return 0;
fail_free_mce_banks:
kfree(vcpu->arch.mce_banks);
@@ -5906,13 +5997,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
free_page((unsigned long)vcpu->arch.pio_data);
}
-struct kvm *kvm_arch_create_vm(void)
+int kvm_arch_init_vm(struct kvm *kvm)
{
- struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
-
- if (!kvm)
- return ERR_PTR(-ENOMEM);
-
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
@@ -5921,7 +6007,7 @@ struct kvm *kvm_arch_create_vm(void)
spin_lock_init(&kvm->arch.tsc_write_lock);
- return kvm;
+ return 0;
}
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
@@ -5939,8 +6025,10 @@ static void kvm_free_vcpus(struct kvm *kvm)
/*
* Unpin any mmu pages first.
*/
- kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ kvm_clear_async_pf_completion_queue(vcpu);
kvm_unload_vcpu_mmu(vcpu);
+ }
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_arch_vcpu_free(vcpu);
@@ -5964,13 +6052,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
kvm_free_vcpus(kvm);
- kvm_free_physmem(kvm);
if (kvm->arch.apic_access_page)
put_page(kvm->arch.apic_access_page);
if (kvm->arch.ept_identity_pagetable)
put_page(kvm->arch.ept_identity_pagetable);
- cleanup_srcu_struct(&kvm->srcu);
- kfree(kvm);
}
int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -6051,7 +6136,9 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
+ return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
+ !vcpu->arch.apf.halted)
+ || !list_empty_careful(&vcpu->async_pf.done)
|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
|| vcpu->arch.nmi_pending ||
(kvm_arch_interrupt_allowed(vcpu) &&
@@ -6110,6 +6197,147 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
}
EXPORT_SYMBOL_GPL(kvm_set_rflags);
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
+{
+ int r;
+
+ if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
+ is_error_page(work->page))
+ return;
+
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r))
+ return;
+
+ if (!vcpu->arch.mmu.direct_map &&
+ work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
+ return;
+
+ vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
+}
+
+static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
+{
+ return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
+}
+
+static inline u32 kvm_async_pf_next_probe(u32 key)
+{
+ return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
+}
+
+static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ u32 key = kvm_async_pf_hash_fn(gfn);
+
+ while (vcpu->arch.apf.gfns[key] != ~0)
+ key = kvm_async_pf_next_probe(key);
+
+ vcpu->arch.apf.gfns[key] = gfn;
+}
+
+static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ int i;
+ u32 key = kvm_async_pf_hash_fn(gfn);
+
+ for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
+ (vcpu->arch.apf.gfns[key] != gfn &&
+ vcpu->arch.apf.gfns[key] != ~0); i++)
+ key = kvm_async_pf_next_probe(key);
+
+ return key;
+}
+
+bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
+}
+
+static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ u32 i, j, k;
+
+ i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
+ while (true) {
+ vcpu->arch.apf.gfns[i] = ~0;
+ do {
+ j = kvm_async_pf_next_probe(j);
+ if (vcpu->arch.apf.gfns[j] == ~0)
+ return;
+ k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
+ /*
+ * k lies cyclically in ]i,j]
+ * | i.k.j |
+ * |....j i.k.| or |.k..j i...|
+ */
+ } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
+ vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
+ i = j;
+ }
+}
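The three helpers above form a small open-addressed hash table over the vcpu's outstanding async-PF gfns: insertion probes linearly for a free slot (~0), lookup stops at the gfn or at a free slot, and deletion re-packs the probe chain in place instead of leaving tombstones, which is what the "k lies cyclically in ]i,j]" test decides. Below is a minimal user-space sketch of the same scheme, assuming a fixed 64-entry table and a toy multiplicative hash in place of hash_32() over roundup_pow_of_two(ASYNC_PF_PER_VCPU) slots; all names in it are invented for the sketch.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TBL_SIZE 64U          /* stands in for roundup_pow_of_two(ASYNC_PF_PER_VCPU) */
#define EMPTY    (~0ULL)      /* free slot, like the ~0 sentinel above */

static uint64_t tbl[TBL_SIZE];

static unsigned hash_fn(uint64_t gfn)      /* toy stand-in for hash_32() */
{
	return (unsigned)(gfn * 2654435761ULL) & (TBL_SIZE - 1);
}

static unsigned next_probe(unsigned key)
{
	return (key + 1) & (TBL_SIZE - 1);
}

static void add_gfn(uint64_t gfn)
{
	unsigned key = hash_fn(gfn);

	while (tbl[key] != EMPTY)              /* linear probing for a free slot */
		key = next_probe(key);
	tbl[key] = gfn;
}

static unsigned gfn_slot(uint64_t gfn)
{
	unsigned key = hash_fn(gfn);
	unsigned i;

	for (i = 0; i < TBL_SIZE && tbl[key] != gfn && tbl[key] != EMPTY; i++)
		key = next_probe(key);
	return key;                            /* slot of gfn, or of a free slot */
}

static void del_gfn(uint64_t gfn)
{
	unsigned i, j, k;

	i = j = gfn_slot(gfn);
	while (1) {
		tbl[i] = EMPTY;
		do {
			j = next_probe(j);
			if (tbl[j] == EMPTY)
				return;
			k = hash_fn(tbl[j]);
			/* keep scanning while tbl[j]'s home slot k lies cyclically in ]i, j] */
		} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
		tbl[i] = tbl[j];               /* re-pack: pull the displaced entry back */
		i = j;
	}
}

int main(void)
{
	memset(tbl, 0xff, sizeof(tbl));        /* all slots start out EMPTY */

	add_gfn(0x1000);                       /* these three collide with the toy hash */
	add_gfn(0x2000);
	add_gfn(0x3000);
	del_gfn(0x1000);
	printf("0x2000 still found: %d\n", tbl[gfn_slot(0x2000)] == 0x2000);
	printf("0x3000 still found: %d\n", tbl[gfn_slot(0x3000)] == 0x3000);
	return 0;
}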
+
+static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
+{
+
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
+ sizeof(val));
+}
+
+void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+ struct x86_exception fault;
+
+ trace_kvm_async_pf_not_present(work->arch.token, work->gva);
+ kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
+
+ if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
+ (vcpu->arch.apf.send_user_only &&
+ kvm_x86_ops->get_cpl(vcpu) == 0))
+ kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+ else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
+ fault.vector = PF_VECTOR;
+ fault.error_code_valid = true;
+ fault.error_code = 0;
+ fault.nested_page_fault = false;
+ fault.address = work->arch.token;
+ kvm_inject_page_fault(vcpu, &fault);
+ }
+}
+
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+ struct x86_exception fault;
+
+ trace_kvm_async_pf_ready(work->arch.token, work->gva);
+ if (is_error_page(work->page))
+ work->arch.token = ~0; /* broadcast wakeup */
+ else
+ kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
+
+ if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
+ !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
+ fault.vector = PF_VECTOR;
+ fault.error_code_valid = true;
+ fault.error_code = 0;
+ fault.nested_page_fault = false;
+ fault.address = work->arch.token;
+ kvm_inject_page_fault(vcpu, &fault);
+ }
+ vcpu->arch.apf.halted = false;
+}
+
+bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
+ return true;
+ else
+ return !kvm_event_needs_reinjection(vcpu) &&
+ kvm_x86_ops->interrupt_allowed(vcpu);
+}
+
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
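For context, kvm_arch_async_page_not_present() and kvm_arch_async_page_present() above are the host half of the async page fault protocol: the host writes a "page not present" or "page ready" reason into guest memory registered through the async-PF MSR and injects a #PF whose CR2 holds an opaque token; the guest side (not shown in this file) pairs the two events by that token, with ~0 acting as a broadcast wakeup. The following is a rough, non-authoritative single-threaded model of just that pairing; apf_event(), pending[] and TOKEN_BROADCAST are invented names, and the reason macros merely stand in for the KVM_PV_REASON_* constants used above.

#include <stdio.h>
#include <stdint.h>

#define REASON_PAGE_NOT_PRESENT 1   /* stands in for KVM_PV_REASON_PAGE_NOT_PRESENT */
#define REASON_PAGE_READY       2   /* stands in for KVM_PV_REASON_PAGE_READY */
#define TOKEN_BROADCAST         (~0U)
#define MAX_PENDING             16

/* tokens whose pages are still being paged in; 0 means "no entry" in this toy model */
static uint32_t pending[MAX_PENDING];

static void apf_event(uint32_t reason, uint32_t token)
{
	int i;

	switch (reason) {
	case REASON_PAGE_NOT_PRESENT:          /* host: page is swapped out, park this token */
		for (i = 0; i < MAX_PENDING; i++)
			if (!pending[i]) {
				pending[i] = token;
				return;
			}
		break;
	case REASON_PAGE_READY:                /* host: page arrived, release the waiter(s) */
		for (i = 0; i < MAX_PENDING; i++)
			if (pending[i] &&
			    (token == TOKEN_BROADCAST || pending[i] == token))
				pending[i] = 0;
		break;
	}
}

int main(void)
{
	apf_event(REASON_PAGE_NOT_PRESENT, 42);
	apf_event(REASON_PAGE_NOT_PRESENT, 43);
	apf_event(REASON_PAGE_READY, 42);      /* only token 42 is released */
	printf("still pending: %u\n", pending[1]);
	return 0;
}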
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 38718041efc3..6e121a2a49e1 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,6 +2,7 @@ config LGUEST_GUEST
bool "Lguest guest support"
select PARAVIRT
depends on X86_32
+ select VIRTUALIZATION
select VIRTIO
select VIRTIO_RING
select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 4996cf5f73a0..eba687f0cc0c 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -824,7 +824,7 @@ static void __init lguest_init_IRQ(void)
for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
/* Some systems map "vectors" to interrupts weirdly. Not us! */
- __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
+ __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR);
if (i != SYSCALL_VECTOR)
set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
}
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index 08a0069b87a5..f21962c435ed 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -27,6 +27,7 @@
#include <asm/amd_nb.h>
static struct bootnode __initdata nodes[8];
+static unsigned char __initdata nodeids[8];
static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
static __init int find_northbridge(void)
@@ -68,19 +69,6 @@ static __init void early_get_boot_cpu_id(void)
#endif
}
-int __init amd_get_nodes(struct bootnode *physnodes)
-{
- int i;
- int ret = 0;
-
- for_each_node_mask(i, nodes_parsed) {
- physnodes[ret].start = nodes[i].start;
- physnodes[ret].end = nodes[i].end;
- ret++;
- }
- return ret;
-}
-
int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long start = PFN_PHYS(start_pfn);
@@ -113,7 +101,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
base = read_pci_config(0, nb, 1, 0x40 + i*8);
limit = read_pci_config(0, nb, 1, 0x44 + i*8);
- nodeid = limit & 7;
+ nodeids[i] = nodeid = limit & 7;
if ((base & 3) == 0) {
if (i < numnodes)
pr_info("Skipping disabled node %d\n", i);
@@ -193,6 +181,76 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
return 0;
}
+#ifdef CONFIG_NUMA_EMU
+static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
+ [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
+
+void __init amd_get_nodes(struct bootnode *physnodes)
+{
+ int i;
+
+ for_each_node_mask(i, nodes_parsed) {
+ physnodes[i].start = nodes[i].start;
+ physnodes[i].end = nodes[i].end;
+ }
+}
+
+static int __init find_node_by_addr(unsigned long addr)
+{
+ int ret = NUMA_NO_NODE;
+ int i;
+
+ for (i = 0; i < 8; i++)
+ if (addr >= nodes[i].start && addr < nodes[i].end) {
+ ret = i;
+ break;
+ }
+ return ret;
+}
+
+/*
+ * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be
+ * setup to represent the physical topology but reflect the emulated
+ * environment. For each emulated node, the real node which it appears on is
+ * found and a fake pxm to nid mapping is created which mirrors the actual
+ * locality. node_distance() then represents the correct distances between
+ * emulated nodes by using the fake acpi mappings to pxms.
+ */
+void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
+{
+ unsigned int bits;
+ unsigned int cores;
+ unsigned int apicid_base = 0;
+ int i;
+
+ bits = boot_cpu_data.x86_coreid_bits;
+ cores = 1 << bits;
+ early_get_boot_cpu_id();
+ if (boot_cpu_physical_apicid > 0)
+ apicid_base = boot_cpu_physical_apicid;
+
+ for (i = 0; i < nr_nodes; i++) {
+ int index;
+ int nid;
+ int j;
+
+ nid = find_node_by_addr(nodes[i].start);
+ if (nid == NUMA_NO_NODE)
+ continue;
+
+ index = nodeids[nid] << bits;
+ if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE)
+ for (j = apicid_base; j < cores + apicid_base; j++)
+ fake_apicid_to_node[index + j] = i;
+#ifdef CONFIG_ACPI_NUMA
+ __acpi_map_pxm_to_node(nid, i);
+#endif
+ }
+ memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
+}
+#endif /* CONFIG_NUMA_EMU */
+
int __init amd_scan_nodes(void)
{
unsigned int bits;
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 738e6593799d..dbe34b931374 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -8,6 +8,7 @@
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/highmem.h>
+#include <linux/swap.h>
#include <asm/pgtable.h>
@@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
get_page(page);
+ SetPageReferenced(page);
pages[*nr] = page;
(*nr)++;
@@ -103,6 +105,17 @@ static inline void get_head_page_multiple(struct page *page, int nr)
VM_BUG_ON(page != compound_head(page));
VM_BUG_ON(page_count(page) == 0);
atomic_add(nr, &page->_count);
+ SetPageReferenced(page);
+}
+
+static inline void get_huge_page_tail(struct page *page)
+{
+ /*
+ * __split_huge_page_refcount() cannot run
+ * from under us.
+ */
+ VM_BUG_ON(atomic_read(&page->_count) < 0);
+ atomic_inc(&page->_count);
}
static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
@@ -128,6 +141,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
+ if (PageTail(page))
+ get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
@@ -148,7 +163,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pmd_t pmd = *pmdp;
next = pmd_addr_end(addr, end);
- if (pmd_none(pmd))
+ /*
+ * The pmd_trans_splitting() check below explains why
+ * pmdp_splitting_flush has to flush the tlb, to stop
+ * this gup-fast code from running while we set the
+ * splitting bit in the pmd. Returning zero will take
+ * the slow path that will call wait_split_huge_page()
+ * if the pmd is still in splitting state. gup-fast
+ * can't because it has irq disabled and
+ * wait_split_huge_page() would never return as the
+ * tlb flush IPI wouldn't run.
+ */
+ if (pmd_none(pmd) || pmd_trans_splitting(pmd))
return 0;
if (unlikely(pmd_large(pmd))) {
if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index f89b5bb4e93f..c821074b7f0b 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -45,6 +45,7 @@
#include <asm/bugs.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
+#include <asm/olpc_ofw.h>
#include <asm/pgalloc.h>
#include <asm/sections.h>
#include <asm/paravirt.h>
@@ -715,6 +716,7 @@ void __init paging_init(void)
/*
* NOTE: at this point the bootmem allocator is fully available.
*/
+ olpc_dt_build_devicetree();
sparse_init();
zone_sizes_init();
}
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 787c52ca49c3..ebf6d7887a38 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -2,6 +2,28 @@
#include <linux/topology.h>
#include <linux/module.h>
#include <linux/bootmem.h>
+#include <asm/numa.h>
+#include <asm/acpi.h>
+
+int __initdata numa_off;
+
+static __init int numa_setup(char *opt)
+{
+ if (!opt)
+ return -EINVAL;
+ if (!strncmp(opt, "off", 3))
+ numa_off = 1;
+#ifdef CONFIG_NUMA_EMU
+ if (!strncmp(opt, "fake=", 5))
+ numa_emu_cmdline(opt + 5);
+#endif
+#ifdef CONFIG_ACPI_NUMA
+ if (!strncmp(opt, "noacpi", 6))
+ acpi_numa = -1;
+#endif
+ return 0;
+}
+early_param("numa", numa_setup);
/*
* Which logical CPUs are on which nodes
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 7762a517d69d..95ea1551eebc 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -30,7 +30,6 @@ s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
-int numa_off __initdata;
static unsigned long __initdata nodemap_addr;
static unsigned long __initdata nodemap_size;
@@ -260,30 +259,35 @@ void __init numa_init_array(void)
#ifdef CONFIG_NUMA_EMU
/* Numa emulation */
static struct bootnode nodes[MAX_NUMNODES] __initdata;
-static struct bootnode physnodes[MAX_NUMNODES] __initdata;
+static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
static char *cmdline __initdata;
+void __init numa_emu_cmdline(char *str)
+{
+ cmdline = str;
+}
+
static int __init setup_physnodes(unsigned long start, unsigned long end,
int acpi, int amd)
{
- int nr_nodes = 0;
int ret = 0;
int i;
+ memset(physnodes, 0, sizeof(physnodes));
#ifdef CONFIG_ACPI_NUMA
if (acpi)
- nr_nodes = acpi_get_nodes(physnodes);
+ acpi_get_nodes(physnodes, start, end);
#endif
#ifdef CONFIG_AMD_NUMA
if (amd)
- nr_nodes = amd_get_nodes(physnodes);
+ amd_get_nodes(physnodes);
#endif
/*
* Basic sanity checking on the physical node map: there may be errors
* if the SRAT or AMD code incorrectly reported the topology or the mem=
* kernel parameter is used.
*/
- for (i = 0; i < nr_nodes; i++) {
+ for (i = 0; i < MAX_NUMNODES; i++) {
if (physnodes[i].start == physnodes[i].end)
continue;
if (physnodes[i].start > end) {
@@ -298,17 +302,6 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
physnodes[i].start = start;
if (physnodes[i].end > end)
physnodes[i].end = end;
- }
-
- /*
- * Remove all nodes that have no memory or were truncated because of the
- * limited address range.
- */
- for (i = 0; i < nr_nodes; i++) {
- if (physnodes[i].start == physnodes[i].end)
- continue;
- physnodes[ret].start = physnodes[i].start;
- physnodes[ret].end = physnodes[i].end;
ret++;
}
@@ -324,6 +317,24 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
return ret;
}
+static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
+{
+ int i;
+
+ BUG_ON(acpi && amd);
+#ifdef CONFIG_ACPI_NUMA
+ if (acpi)
+ acpi_fake_nodes(nodes, nr_nodes);
+#endif
+#ifdef CONFIG_AMD_NUMA
+ if (amd)
+ amd_fake_nodes(nodes, nr_nodes);
+#endif
+ if (!acpi && !amd)
+ for (i = 0; i < nr_cpu_ids; i++)
+ numa_set_node(i, 0);
+}
+
/*
* Sets up nid to range from addr to addr + size. If the end
* boundary is greater than max_addr, then max_addr is used instead.
@@ -352,8 +363,7 @@ static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
* Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
* to max_addr. The return value is the number of nodes allocated.
*/
-static int __init split_nodes_interleave(u64 addr, u64 max_addr,
- int nr_phys_nodes, int nr_nodes)
+static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
{
nodemask_t physnode_mask = NODE_MASK_NONE;
u64 size;
@@ -384,7 +394,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
return -1;
}
- for (i = 0; i < nr_phys_nodes; i++)
+ for (i = 0; i < MAX_NUMNODES; i++)
if (physnodes[i].start != physnodes[i].end)
node_set(i, physnode_mask);
@@ -553,11 +563,9 @@ static int __init numa_emulation(unsigned long start_pfn,
{
u64 addr = start_pfn << PAGE_SHIFT;
u64 max_addr = last_pfn << PAGE_SHIFT;
- int num_phys_nodes;
int num_nodes;
int i;
- num_phys_nodes = setup_physnodes(addr, max_addr, acpi, amd);
/*
* If the numa=fake command-line contains a 'M' or 'G', it represents
* the fixed node size. Otherwise, if it is just a single number N,
@@ -572,7 +580,7 @@ static int __init numa_emulation(unsigned long start_pfn,
unsigned long n;
n = simple_strtoul(cmdline, NULL, 0);
- num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);
+ num_nodes = split_nodes_interleave(addr, max_addr, n);
}
if (num_nodes < 0)
@@ -595,7 +603,8 @@ static int __init numa_emulation(unsigned long start_pfn,
nodes[i].end >> PAGE_SHIFT);
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
}
- acpi_fake_nodes(nodes, num_nodes);
+ setup_physnodes(addr, max_addr, acpi, amd);
+ fake_physnodes(acpi, amd, num_nodes);
numa_init_array();
return 0;
}
@@ -610,8 +619,12 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
nodes_clear(node_online_map);
#ifdef CONFIG_NUMA_EMU
+ setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
+ acpi, amd);
if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
return;
+ setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
+ acpi, amd);
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
#endif
@@ -661,24 +674,6 @@ unsigned long __init numa_free_all_bootmem(void)
return pages;
}
-static __init int numa_setup(char *opt)
-{
- if (!opt)
- return -EINVAL;
- if (!strncmp(opt, "off", 3))
- numa_off = 1;
-#ifdef CONFIG_NUMA_EMU
- if (!strncmp(opt, "fake=", 5))
- cmdline = opt + 5;
-#endif
-#ifdef CONFIG_ACPI_NUMA
- if (!strncmp(opt, "noacpi", 6))
- acpi_numa = -1;
-#endif
- return 0;
-}
-early_param("numa", numa_setup);
-
#ifdef CONFIG_NUMA
static __init int find_near_online_node(int node)
@@ -767,6 +762,7 @@ void __cpuinit numa_clear_node(int cpu)
#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+#ifndef CONFIG_NUMA_EMU
void __cpuinit numa_add_cpu(int cpu)
{
cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
@@ -776,34 +772,115 @@ void __cpuinit numa_remove_cpu(int cpu)
{
cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
+#else
+void __cpuinit numa_add_cpu(int cpu)
+{
+ unsigned long addr;
+ u16 apicid;
+ int physnid;
+ int nid = NUMA_NO_NODE;
+
+ apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+ if (apicid != BAD_APICID)
+ nid = apicid_to_node[apicid];
+ if (nid == NUMA_NO_NODE)
+ nid = early_cpu_to_node(cpu);
+ BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
+
+ /*
+ * Use the starting address of the emulated node to find which physical
+ * node it is allocated on.
+ */
+ addr = node_start_pfn(nid) << PAGE_SHIFT;
+ for (physnid = 0; physnid < MAX_NUMNODES; physnid++)
+ if (addr >= physnodes[physnid].start &&
+ addr < physnodes[physnid].end)
+ break;
+
+ /*
+ * Map the cpu to each emulated node that is allocated on the physical
+ * node of the cpu's apic id.
+ */
+ for_each_online_node(nid) {
+ addr = node_start_pfn(nid) << PAGE_SHIFT;
+ if (addr >= physnodes[physnid].start &&
+ addr < physnodes[physnid].end)
+ cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
+ }
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+ int i;
+
+ for_each_online_node(i)
+ cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
+}
+#endif /* !CONFIG_NUMA_EMU */
#else /* CONFIG_DEBUG_PER_CPU_MAPS */
+static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable)
+{
+ int node = early_cpu_to_node(cpu);
+ struct cpumask *mask;
+ char buf[64];
+
+ mask = node_to_cpumask_map[node];
+ if (!mask) {
+ pr_err("node_to_cpumask_map[%i] NULL\n", node);
+ dump_stack();
+ return NULL;
+ }
+
+ cpulist_scnprintf(buf, sizeof(buf), mask);
+ printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+ enable ? "numa_add_cpu" : "numa_remove_cpu",
+ cpu, node, buf);
+ return mask;
+}
/*
* --------- debug versions of the numa functions ---------
*/
+#ifndef CONFIG_NUMA_EMU
static void __cpuinit numa_set_cpumask(int cpu, int enable)
{
- int node = early_cpu_to_node(cpu);
struct cpumask *mask;
- char buf[64];
- mask = node_to_cpumask_map[node];
- if (mask == NULL) {
- printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node);
- dump_stack();
+ mask = debug_cpumask_set_cpu(cpu, enable);
+ if (!mask)
return;
- }
if (enable)
cpumask_set_cpu(cpu, mask);
else
cpumask_clear_cpu(cpu, mask);
+}
+#else
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+ int node = early_cpu_to_node(cpu);
+ struct cpumask *mask;
+ int i;
- cpulist_scnprintf(buf, sizeof(buf), mask);
- printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
- enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
+ for_each_online_node(i) {
+ unsigned long addr;
+
+ addr = node_start_pfn(i) << PAGE_SHIFT;
+ if (addr < physnodes[node].start ||
+ addr >= physnodes[node].end)
+ continue;
+ mask = debug_cpumask_set_cpu(cpu, enable);
+ if (!mask)
+ return;
+
+ if (enable)
+ cpumask_set_cpu(cpu, mask);
+ else
+ cpumask_clear_cpu(cpu, mask);
+ }
}
+#endif /* CONFIG_NUMA_EMU */
void __cpuinit numa_add_cpu(int cpu)
{
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8be8c7d7bc89..500242d3c96d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -320,6 +320,25 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
return changed;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp,
+ pmd_t entry, int dirty)
+{
+ int changed = !pmd_same(*pmdp, entry);
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ if (changed && dirty) {
+ *pmdp = entry;
+ pmd_update_defer(vma->vm_mm, address, pmdp);
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ }
+
+ return changed;
+}
+#endif
+
int ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
@@ -335,6 +354,23 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
return ret;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ int ret = 0;
+
+ if (pmd_young(*pmdp))
+ ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+ (unsigned long *)pmdp);
+
+ if (ret)
+ pmd_update(vma->vm_mm, addr, pmdp);
+
+ return ret;
+}
+#endif
+
int ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
@@ -347,6 +383,36 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
return young;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ int young;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ young = pmdp_test_and_clear_young(vma, address, pmdp);
+ if (young)
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+
+ return young;
+}
+
+void pmdp_splitting_flush(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ int set;
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
+ (unsigned long *)pmdp);
+ if (set) {
+ pmd_update(vma->vm_mm, address, pmdp);
+ /* need tlb flush only to serialize against gup-fast */
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ }
+}
+#endif
+
/**
* reserve_top_address - reserves a hole in the top of kernel address space
* @reserve - size of hole to reserve
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index f16434568a51..ae96e7b8051d 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -59,7 +59,6 @@ static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
static int __initdata num_memory_chunks; /* total number of memory chunks */
static u8 __initdata apicid_to_pxm[MAX_APICID];
-int numa_off __initdata;
int acpi_numa __initdata;
static __init void bad_srat(void)
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 171a0aacb99a..603d285d1daa 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -349,18 +349,19 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
void __init acpi_numa_arch_fixup(void) {}
-int __init acpi_get_nodes(struct bootnode *physnodes)
+#ifdef CONFIG_NUMA_EMU
+void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
+ unsigned long end)
{
int i;
- int ret = 0;
for_each_node_mask(i, nodes_parsed) {
- physnodes[ret].start = nodes[i].start;
- physnodes[ret].end = nodes[i].end;
- ret++;
+ cutoff_node(i, start, end);
+ physnodes[i].start = nodes[i].start;
+ physnodes[i].end = nodes[i].end;
}
- return ret;
}
+#endif /* CONFIG_NUMA_EMU */
/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
@@ -505,8 +506,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
{
int i, j;
- printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
- "topology.\n");
for (i = 0; i < num_nodes; i++) {
int nid, pxm;
@@ -526,6 +525,17 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
fake_apicid_to_node[j] == NUMA_NO_NODE)
fake_apicid_to_node[j] = i;
}
+
+ /*
+ * If there are apicid-to-node mappings for physical nodes that do not
+ * have a corresponding emulated node, it should default to a guaranteed
+ * value.
+ */
+ for (i = 0; i < MAX_LOCAL_APIC; i++)
+ if (apicid_to_node[i] != NUMA_NO_NODE &&
+ fake_apicid_to_node[i] == NUMA_NO_NODE)
+ fake_apicid_to_node[i] = 0;
+
for (i = 0; i < num_nodes; i++)
__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index f24a8533bcdf..e2b7b0c06cdf 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -65,7 +65,6 @@ static int profile_exceptions_notify(struct notifier_block *self,
switch (val) {
case DIE_NMI:
- case DIE_NMI_IPI:
if (ctr_running)
model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs));
else if (!nmi_enabled)
@@ -361,7 +360,7 @@ static void nmi_cpu_setup(void *dummy)
static struct notifier_block profile_exceptions_nb = {
.notifier_call = profile_exceptions_notify,
.next = NULL,
- .priority = 2
+ .priority = NMI_LOCAL_LOW_PRIOR,
};
static void nmi_cpu_restore_registers(struct op_msrs *msrs)
diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c
index 0636dd93cef8..720bf5a53c51 100644
--- a/arch/x86/oprofile/nmi_timer_int.c
+++ b/arch/x86/oprofile/nmi_timer_int.c
@@ -38,7 +38,7 @@ static int profile_timer_exceptions_notify(struct notifier_block *self,
static struct notifier_block profile_timer_exceptions_nb = {
.notifier_call = profile_timer_exceptions_notify,
.next = NULL,
- .priority = 0
+ .priority = NMI_LOW_PRIOR,
};
static int timer_start(void)
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index fc1e8fe07e5c..e27dffbbb1a7 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -4,6 +4,7 @@
#include <linux/cpu.h>
#include <linux/range.h>
+#include <asm/amd_nb.h>
#include <asm/pci_x86.h>
#include <asm/pci-direct.h>
@@ -378,6 +379,34 @@ static struct notifier_block __cpuinitdata amd_cpu_notifier = {
.notifier_call = amd_cpu_notify,
};
+static void __init pci_enable_pci_io_ecs(void)
+{
+#ifdef CONFIG_AMD_NB
+ unsigned int i, n;
+
+ for (n = i = 0; !n && amd_nb_bus_dev_ranges[i].dev_limit; ++i) {
+ u8 bus = amd_nb_bus_dev_ranges[i].bus;
+ u8 slot = amd_nb_bus_dev_ranges[i].dev_base;
+ u8 limit = amd_nb_bus_dev_ranges[i].dev_limit;
+
+ for (; slot < limit; ++slot) {
+ u32 val = read_pci_config(bus, slot, 3, 0);
+
+ if (!early_is_amd_nb(val))
+ continue;
+
+ val = read_pci_config(bus, slot, 3, 0x8c);
+ if (!(val & (ENABLE_CF8_EXT_CFG >> 32))) {
+ val |= ENABLE_CF8_EXT_CFG >> 32;
+ write_pci_config(bus, slot, 3, 0x8c, val);
+ }
+ ++n;
+ }
+ }
+ pr_info("Extended Config Space enabled on %u nodes\n", n);
+#endif
+}
+
static int __init pci_io_ecs_init(void)
{
int cpu;
@@ -386,6 +415,10 @@ static int __init pci_io_ecs_init(void)
if (boot_cpu_data.x86 < 0x10)
return 0;
+ /* Try the PCI method first. */
+ if (early_pci_allowed())
+ pci_enable_pci_io_ecs();
+
register_cpu_notifier(&amd_cpu_notifier);
for_each_online_cpu(cpu)
amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c
index 0846a5bbbfbd..ab8269b0da29 100644
--- a/arch/x86/pci/broadcom_bus.c
+++ b/arch/x86/pci/broadcom_bus.c
@@ -9,6 +9,7 @@
* option) any later version.
*/
+#include <linux/acpi.h>
#include <linux/delay.h>
#include <linux/dmi.h>
#include <linux/pci.h>
@@ -25,12 +26,14 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
u8 fbus, lbus;
int i;
+#ifdef CONFIG_ACPI
/*
- * The x86_pci_root_bus_res_quirks() function already refuses to use
- * this information if ACPI _CRS was used. Therefore, we don't bother
- * checking if ACPI is enabled, and just generate the information
- * for both the ACPI _CRS and no ACPI cases.
+ * We should get host bridge information from ACPI unless the BIOS
+ * doesn't support it.
*/
+ if (acpi_os_get_root_pointer())
+ return;
+#endif
info = &pci_root_info[pci_root_num];
pci_root_num++;
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index f7c8a399978c..5fe75026ecc2 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -22,6 +22,7 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
unsigned int pci_early_dump_regs;
static int pci_bf_sort;
+static int smbios_type_b1_flag;
int pci_routeirq;
int noioapicquirk;
#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS
@@ -185,6 +186,39 @@ static int __devinit set_bf_sort(const struct dmi_system_id *d)
return 0;
}
+static void __devinit read_dmi_type_b1(const struct dmi_header *dm,
+ void *private_data)
+{
+ u8 *d = (u8 *)dm + 4;
+
+ if (dm->type != 0xB1)
+ return;
+ switch (((*(u32 *)d) >> 9) & 0x03) {
+ case 0x00:
+ printk(KERN_INFO "dmi type 0xB1 record - unknown flag\n");
+ break;
+ case 0x01: /* set pci=bfsort */
+ smbios_type_b1_flag = 1;
+ break;
+ case 0x02: /* do not set pci=bfsort */
+ smbios_type_b1_flag = 2;
+ break;
+ default:
+ break;
+ }
+}
+
+static int __devinit find_sort_method(const struct dmi_system_id *d)
+{
+ dmi_walk(read_dmi_type_b1, NULL);
+
+ if (smbios_type_b1_flag == 1) {
+ set_bf_sort(d);
+ return 0;
+ }
+ return -1;
+}
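The 0xB1 entry is an OEM-specific SMBIOS record; read_dmi_type_b1() above looks at two bits (bits 9-10 of the 32-bit word that starts four bytes into the record) to learn whether the firmware asks for pci=bfsort. A stand-alone illustration of that extraction follows; the test record and the b1_sort_flag() name are made up for the sketch, and the load is little-endian as on x86.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Pull bits 9-10 out of the dword at offset 4 of a raw SMBIOS record,
 * the same bytes read_dmi_type_b1() reaches via "(u8 *)dm + 4". */
static int b1_sort_flag(const uint8_t *record)
{
	uint32_t word;

	memcpy(&word, record + 4, sizeof(word));   /* avoids an unaligned cast */
	return (word >> 9) & 0x03;
}

int main(void)
{
	/* invented test record: type 0xB1, byte 5 = 0x02 -> bit 9 set */
	uint8_t rec[8] = { 0xB1, 0x08, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00 };

	printf("flag = %d (1 means: set pci=bfsort)\n", b1_sort_flag(rec));
	return 0;
}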
+
/*
* Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus)
*/
@@ -213,6 +247,13 @@ static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = {
},
#endif /* __i386__ */
{
+ .callback = find_sort_method,
+ .ident = "Dell System",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"),
+ },
+ },
+ {
.callback = set_bf_sort,
.ident = "Dell PowerEdge 1950",
.matches = {
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 9f9bfb705cf9..87e6c8323117 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -589,7 +589,8 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
case PCI_DEVICE_ID_INTEL_ICH10_1:
case PCI_DEVICE_ID_INTEL_ICH10_2:
case PCI_DEVICE_ID_INTEL_ICH10_3:
- case PCI_DEVICE_ID_INTEL_PATSBURG_LPC:
+ case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_0:
+ case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_1:
r->name = "PIIX/ICH";
r->get = pirq_piix_get;
r->set = pirq_piix_set;
diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c
index 65df603622b2..25bfdbb5b130 100644
--- a/arch/x86/platform/mrst/early_printk_mrst.c
+++ b/arch/x86/platform/mrst/early_printk_mrst.c
@@ -103,7 +103,7 @@ struct dw_spi_reg {
static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0;
static u32 *pclk_spi0;
-/* Always contains an accessable address, start with 0 */
+/* Always contains an accessible address, start with 0 */
static struct dw_spi_reg *pspi;
static struct kmsg_dumper dw_dumper;
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile
index c31b8fcb5a86..e797428b163b 100644
--- a/arch/x86/platform/olpc/Makefile
+++ b/arch/x86/platform/olpc/Makefile
@@ -1,3 +1,4 @@
obj-$(CONFIG_OLPC) += olpc.o
obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o
obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o
+obj-$(CONFIG_OLPC_OPENFIRMWARE_DT) += olpc_dt.o
diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c
index f5442c03abc3..127775696d6c 100644
--- a/arch/x86/platform/olpc/olpc-xo1.c
+++ b/arch/x86/platform/olpc/olpc-xo1.c
@@ -1,6 +1,7 @@
/*
* Support for features of the OLPC XO-1 laptop
*
+ * Copyright (C) 2010 Andres Salomon <dilinger@queued.net>
* Copyright (C) 2010 One Laptop per Child
* Copyright (C) 2006 Red Hat, Inc.
* Copyright (C) 2006 Advanced Micro Devices, Inc.
@@ -12,8 +13,6 @@
*/
#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/pci_ids.h>
#include <linux/platform_device.h>
#include <linux/pm.h>
@@ -22,9 +21,6 @@
#define DRV_NAME "olpc-xo1"
-#define PMS_BAR 4
-#define ACPI_BAR 5
-
/* PMC registers (PMS block) */
#define PM_SCLK 0x10
#define PM_IN_SLPCTL 0x20
@@ -57,65 +53,67 @@ static void xo1_power_off(void)
outl(0x00002000, acpi_base + PM1_CNT);
}
-/* Read the base addresses from the PCI BAR info */
-static int __devinit setup_bases(struct pci_dev *pdev)
+static int __devinit olpc_xo1_probe(struct platform_device *pdev)
{
- int r;
+ struct resource *res;
- r = pci_enable_device_io(pdev);
- if (r) {
- dev_err(&pdev->dev, "can't enable device IO\n");
- return r;
- }
+ /* don't run on non-XOs */
+ if (!machine_is_olpc())
+ return -ENODEV;
- r = pci_request_region(pdev, ACPI_BAR, DRV_NAME);
- if (r) {
- dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", ACPI_BAR);
- return r;
+ res = platform_get_resource(pdev, IORESOURCE_IO, 0);
+ if (!res) {
+ dev_err(&pdev->dev, "can't fetch device resource info\n");
+ return -EIO;
}
- r = pci_request_region(pdev, PMS_BAR, DRV_NAME);
- if (r) {
- dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", PMS_BAR);
- pci_release_region(pdev, ACPI_BAR);
- return r;
+ if (!request_region(res->start, resource_size(res), DRV_NAME)) {
+ dev_err(&pdev->dev, "can't request region\n");
+ return -EIO;
}
- acpi_base = pci_resource_start(pdev, ACPI_BAR);
- pms_base = pci_resource_start(pdev, PMS_BAR);
+ if (strcmp(pdev->name, "cs5535-pms") == 0)
+ pms_base = res->start;
+ else if (strcmp(pdev->name, "cs5535-acpi") == 0)
+ acpi_base = res->start;
+
+ /* If we have both addresses, we can override the poweroff hook */
+ if (pms_base && acpi_base) {
+ pm_power_off = xo1_power_off;
+ printk(KERN_INFO "OLPC XO-1 support registered\n");
+ }
return 0;
}
-static int __devinit olpc_xo1_probe(struct platform_device *pdev)
+static int __devexit olpc_xo1_remove(struct platform_device *pdev)
{
- struct pci_dev *pcidev;
- int r;
-
- pcidev = pci_get_device(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA,
- NULL);
- if (!pdev)
- return -ENODEV;
-
- r = setup_bases(pcidev);
- if (r)
- return r;
+ struct resource *r;
- pm_power_off = xo1_power_off;
+ r = platform_get_resource(pdev, IORESOURCE_IO, 0);
+ release_region(r->start, resource_size(r));
- printk(KERN_INFO "OLPC XO-1 support registered\n");
- return 0;
-}
+ if (strcmp(pdev->name, "cs5535-pms") == 0)
+ pms_base = 0;
+ else if (strcmp(pdev->name, "cs5535-acpi") == 0)
+ acpi_base = 0;
-static int __devexit olpc_xo1_remove(struct platform_device *pdev)
-{
pm_power_off = NULL;
return 0;
}
-static struct platform_driver olpc_xo1_driver = {
+static struct platform_driver cs5535_pms_drv = {
+ .driver = {
+ .name = "cs5535-pms",
+ .owner = THIS_MODULE,
+ },
+ .probe = olpc_xo1_probe,
+ .remove = __devexit_p(olpc_xo1_remove),
+};
+
+static struct platform_driver cs5535_acpi_drv = {
.driver = {
- .name = DRV_NAME,
+ .name = "cs5535-acpi",
.owner = THIS_MODULE,
},
.probe = olpc_xo1_probe,
@@ -124,12 +122,23 @@ static struct platform_driver olpc_xo1_driver = {
static int __init olpc_xo1_init(void)
{
- return platform_driver_register(&olpc_xo1_driver);
+ int r;
+
+ r = platform_driver_register(&cs5535_pms_drv);
+ if (r)
+ return r;
+
+ r = platform_driver_register(&cs5535_acpi_drv);
+ if (r)
+ platform_driver_unregister(&cs5535_pms_drv);
+
+ return r;
}
static void __exit olpc_xo1_exit(void)
{
- platform_driver_unregister(&olpc_xo1_driver);
+ platform_driver_unregister(&cs5535_acpi_drv);
+ platform_driver_unregister(&cs5535_pms_drv);
}
MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>");
diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c
new file mode 100644
index 000000000000..dab874647530
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc_dt.c
@@ -0,0 +1,183 @@
+/*
+ * OLPC-specific OFW device tree support code.
+ *
+ * Paul Mackerras August 1996.
+ * Copyright (C) 1996-2005 Paul Mackerras.
+ *
+ * Adapted for 64bit PowerPC by Dave Engebretsen and Peter Bergner.
+ * {engebret|bergner}@us.ibm.com
+ *
+ * Adapted for sparc by David S. Miller davem@davemloft.net
+ * Adapted for x86/OLPC by Andres Salomon <dilinger@queued.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bootmem.h>
+#include <linux/of.h>
+#include <linux/of_pdt.h>
+#include <asm/olpc_ofw.h>
+
+static phandle __init olpc_dt_getsibling(phandle node)
+{
+ const void *args[] = { (void *)node };
+ void *res[] = { &node };
+
+ if ((s32)node == -1)
+ return 0;
+
+ if (olpc_ofw("peer", args, res) || (s32)node == -1)
+ return 0;
+
+ return node;
+}
+
+static phandle __init olpc_dt_getchild(phandle node)
+{
+ const void *args[] = { (void *)node };
+ void *res[] = { &node };
+
+ if ((s32)node == -1)
+ return 0;
+
+ if (olpc_ofw("child", args, res) || (s32)node == -1) {
+ pr_err("PROM: %s: fetching child failed!\n", __func__);
+ return 0;
+ }
+
+ return node;
+}
+
+static int __init olpc_dt_getproplen(phandle node, const char *prop)
+{
+ const void *args[] = { (void *)node, prop };
+ int len;
+ void *res[] = { &len };
+
+ if ((s32)node == -1)
+ return -1;
+
+ if (olpc_ofw("getproplen", args, res)) {
+ pr_err("PROM: %s: getproplen failed!\n", __func__);
+ return -1;
+ }
+
+ return len;
+}
+
+static int __init olpc_dt_getproperty(phandle node, const char *prop,
+ char *buf, int bufsize)
+{
+ int plen;
+
+ plen = olpc_dt_getproplen(node, prop);
+ if (plen > bufsize || plen < 1) {
+ return -1;
+ } else {
+ const void *args[] = { (void *)node, prop, buf, (void *)plen };
+ void *res[] = { &plen };
+
+ if (olpc_ofw("getprop", args, res)) {
+ pr_err("PROM: %s: getprop failed!\n", __func__);
+ return -1;
+ }
+ }
+
+ return plen;
+}
+
+static int __init olpc_dt_nextprop(phandle node, char *prev, char *buf)
+{
+ const void *args[] = { (void *)node, prev, buf };
+ int success;
+ void *res[] = { &success };
+
+ buf[0] = '\0';
+
+ if ((s32)node == -1)
+ return -1;
+
+ if (olpc_ofw("nextprop", args, res) || success != 1)
+ return -1;
+
+ return 0;
+}
+
+static int __init olpc_dt_pkg2path(phandle node, char *buf,
+ const int buflen, int *len)
+{
+ const void *args[] = { (void *)node, buf, (void *)buflen };
+ void *res[] = { len };
+
+ if ((s32)node == -1)
+ return -1;
+
+ if (olpc_ofw("package-to-path", args, res) || *len < 1)
+ return -1;
+
+ return 0;
+}
+
+static unsigned int prom_early_allocated __initdata;
+
+void * __init prom_early_alloc(unsigned long size)
+{
+ static u8 *mem;
+ static size_t free_mem;
+ void *res;
+
+ if (free_mem < size) {
+ const size_t chunk_size = max(PAGE_SIZE, size);
+
+ /*
+ * To minimize the number of allocations, grab at least
+ * PAGE_SIZE of memory (that's an arbitrary choice that's
+ * fast enough on the platforms we care about while minimizing
+ * wasted bootmem) and hand off chunks of it to callers.
+ */
+ res = alloc_bootmem(chunk_size);
+ if (!res)
+ return NULL;
+ prom_early_allocated += chunk_size;
+ memset(res, 0, chunk_size);
+ free_mem = chunk_size;
+ mem = res;
+ }
+
+ /* allocate from the local cache */
+ free_mem -= size;
+ res = mem;
+ mem += size;
+ return res;
+}
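prom_early_alloc() above is a chunked bump allocator: it grabs at least a page of bootmem at a time, carves each request out of the current chunk, and simply abandons whatever tail no longer fits when it has to grab a new chunk. Below is a user-space model of the same strategy, with PAGE_SZ, early_alloc() and malloc() standing in for PAGE_SIZE, prom_early_alloc() and alloc_bootmem().

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SZ 4096UL

static unsigned long early_allocated;

static void *early_alloc(unsigned long size)
{
	static unsigned char *mem;
	static size_t free_mem;
	void *res;

	if (free_mem < size) {
		size_t chunk = size > PAGE_SZ ? size : PAGE_SZ;

		/* grab a fresh chunk; the tail of the old one is abandoned */
		res = malloc(chunk);
		if (!res)
			return NULL;
		memset(res, 0, chunk);
		early_allocated += chunk;
		free_mem = chunk;
		mem = res;
	}

	/* carve the request out of the current chunk */
	free_mem -= size;
	res = mem;
	mem += size;
	return res;
}

int main(void)
{
	char *a = early_alloc(100);   /* triggers one 4096-byte chunk */
	char *b = early_alloc(200);   /* served from the same chunk */

	printf("gap between allocations: %td bytes, total grabbed: %lu\n",
	       b - a, early_allocated);
	return 0;
}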
+
+static struct of_pdt_ops prom_olpc_ops __initdata = {
+ .nextprop = olpc_dt_nextprop,
+ .getproplen = olpc_dt_getproplen,
+ .getproperty = olpc_dt_getproperty,
+ .getchild = olpc_dt_getchild,
+ .getsibling = olpc_dt_getsibling,
+ .pkg2path = olpc_dt_pkg2path,
+};
+
+void __init olpc_dt_build_devicetree(void)
+{
+ phandle root;
+
+ if (!olpc_ofw_is_installed())
+ return;
+
+ root = olpc_dt_getsibling(0);
+ if (!root) {
+ pr_err("PROM: unable to get root node from OFW!\n");
+ return;
+ }
+ of_pdt_build_devicetree(root, &prom_olpc_ops);
+
+ pr_info("PROM DT: Built device tree with %u bytes of memory.\n",
+ prom_early_allocated);
+}
diff --git a/arch/x86/platform/olpc/olpc_ofw.c b/arch/x86/platform/olpc/olpc_ofw.c
index 787320464379..e7604f62870d 100644
--- a/arch/x86/platform/olpc/olpc_ofw.c
+++ b/arch/x86/platform/olpc/olpc_ofw.c
@@ -110,3 +110,8 @@ void __init olpc_ofw_detect(void)
(unsigned long)olpc_ofw_cif, (-start) >> 20);
reserve_top_address(-start);
}
+
+bool __init olpc_ofw_is_installed(void)
+{
+ return olpc_ofw_cif != NULL;
+}
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 779385158915..17c565de3d64 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -12,7 +12,8 @@ CFLAGS_mmu.o := $(nostackp)
obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
- grant-table.o suspend.o platform-pci-unplug.o
+ grant-table.o suspend.o platform-pci-unplug.o \
+ p2m.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index aa8c89ae54cf..50542efe45fb 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1174,6 +1174,15 @@ asmlinkage void __init xen_start_kernel(void)
xen_smp_init();
+#ifdef CONFIG_ACPI_NUMA
+ /*
+ * The pages we get from Xen are not related to machine pages, so
+ * any NUMA information the kernel tries to get from ACPI will
+ * be meaningless. Prevent it from trying.
+ */
+ acpi_numa = -1;
+#endif
+
pgd = (pgd_t *)xen_start_info->pt_base;
if (!xen_initial_domain())
@@ -1185,7 +1194,7 @@ asmlinkage void __init xen_start_kernel(void)
per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
local_irq_disable();
- early_boot_irqs_off();
+ early_boot_irqs_disabled = true;
memblock_init();
@@ -1256,25 +1265,6 @@ asmlinkage void __init xen_start_kernel(void)
#endif
}
-static uint32_t xen_cpuid_base(void)
-{
- uint32_t base, eax, ebx, ecx, edx;
- char signature[13];
-
- for (base = 0x40000000; base < 0x40010000; base += 0x100) {
- cpuid(base, &eax, &ebx, &ecx, &edx);
- *(uint32_t *)(signature + 0) = ebx;
- *(uint32_t *)(signature + 4) = ecx;
- *(uint32_t *)(signature + 8) = edx;
- signature[12] = 0;
-
- if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
- return base;
- }
-
- return 0;
-}
-
static int init_hvm_pv_info(int *major, int *minor)
{
uint32_t eax, ebx, ecx, edx, pages, msr, base;
@@ -1384,6 +1374,18 @@ static bool __init xen_hvm_platform(void)
return true;
}
+bool xen_hvm_need_lapic(void)
+{
+ if (xen_pv_domain())
+ return false;
+ if (!xen_hvm_domain())
+ return false;
+ if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback)
+ return false;
+ return true;
+}
+EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);
+
const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = {
.name = "Xen HVM",
.detect = xen_hvm_platform,
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 9d30105a0c4a..6a6fe8939645 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -126,7 +126,7 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {
#endif
};
-void __init xen_init_irq_ops()
+void __init xen_init_irq_ops(void)
{
pv_irq_ops = xen_irq_ops;
x86_init.irqs.intr_init = xen_init_IRQ;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 44924e551fde..5e92b61ad574 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -173,371 +173,6 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
*/
#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
-/*
- * Xen leaves the responsibility for maintaining p2m mappings to the
- * guests themselves, but it must also access and update the p2m array
- * during suspend/resume when all the pages are reallocated.
- *
- * The p2m table is logically a flat array, but we implement it as a
- * three-level tree to allow the address space to be sparse.
- *
- * Xen
- * |
- * p2m_top p2m_top_mfn
- * / \ / \
- * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
- * / \ / \ / /
- * p2m p2m p2m p2m p2m p2m p2m ...
- *
- * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
- *
- * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
- * maximum representable pseudo-physical address space is:
- * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
- *
- * P2M_PER_PAGE depends on the architecture, as a mfn is always
- * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
- * 512 and 1024 entries respectively.
- */
-
-unsigned long xen_max_p2m_pfn __read_mostly;
-
-#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
-#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
-#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
-
-#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
-
-/* Placeholders for holes in the address space */
-static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
-
-static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
-
-RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
-RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
-
-static inline unsigned p2m_top_index(unsigned long pfn)
-{
- BUG_ON(pfn >= MAX_P2M_PFN);
- return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
-}
-
-static inline unsigned p2m_mid_index(unsigned long pfn)
-{
- return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
-}
-
-static inline unsigned p2m_index(unsigned long pfn)
-{
- return pfn % P2M_PER_PAGE;
-}
-
-static void p2m_top_init(unsigned long ***top)
-{
- unsigned i;
-
- for (i = 0; i < P2M_TOP_PER_PAGE; i++)
- top[i] = p2m_mid_missing;
-}
-
-static void p2m_top_mfn_init(unsigned long *top)
-{
- unsigned i;
-
- for (i = 0; i < P2M_TOP_PER_PAGE; i++)
- top[i] = virt_to_mfn(p2m_mid_missing_mfn);
-}
-
-static void p2m_top_mfn_p_init(unsigned long **top)
-{
- unsigned i;
-
- for (i = 0; i < P2M_TOP_PER_PAGE; i++)
- top[i] = p2m_mid_missing_mfn;
-}
-
-static void p2m_mid_init(unsigned long **mid)
-{
- unsigned i;
-
- for (i = 0; i < P2M_MID_PER_PAGE; i++)
- mid[i] = p2m_missing;
-}
-
-static void p2m_mid_mfn_init(unsigned long *mid)
-{
- unsigned i;
-
- for (i = 0; i < P2M_MID_PER_PAGE; i++)
- mid[i] = virt_to_mfn(p2m_missing);
-}
-
-static void p2m_init(unsigned long *p2m)
-{
- unsigned i;
-
- for (i = 0; i < P2M_MID_PER_PAGE; i++)
- p2m[i] = INVALID_P2M_ENTRY;
-}
-
-/*
- * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
- *
- * This is called both at boot time, and after resuming from suspend:
- * - At boot time we're called very early, and must use extend_brk()
- * to allocate memory.
- *
- * - After resume we're called from within stop_machine, but the mfn
- * tree should alreay be completely allocated.
- */
-void xen_build_mfn_list_list(void)
-{
- unsigned long pfn;
-
- /* Pre-initialize p2m_top_mfn to be completely missing */
- if (p2m_top_mfn == NULL) {
- p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_mfn_init(p2m_mid_missing_mfn);
-
- p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_top_mfn_p_init(p2m_top_mfn_p);
-
- p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_top_mfn_init(p2m_top_mfn);
- } else {
- /* Reinitialise, mfn's all change after migration */
- p2m_mid_mfn_init(p2m_mid_missing_mfn);
- }
-
- for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
- unsigned topidx = p2m_top_index(pfn);
- unsigned mididx = p2m_mid_index(pfn);
- unsigned long **mid;
- unsigned long *mid_mfn_p;
-
- mid = p2m_top[topidx];
- mid_mfn_p = p2m_top_mfn_p[topidx];
-
- /* Don't bother allocating any mfn mid levels if
- * they're just missing, just update the stored mfn,
- * since all could have changed over a migrate.
- */
- if (mid == p2m_mid_missing) {
- BUG_ON(mididx);
- BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
- p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
- pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
- continue;
- }
-
- if (mid_mfn_p == p2m_mid_missing_mfn) {
- /*
- * XXX boot-time only! We should never find
- * missing parts of the mfn tree after
- * runtime. extend_brk() will BUG if we call
- * it too late.
- */
- mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_mfn_init(mid_mfn_p);
-
- p2m_top_mfn_p[topidx] = mid_mfn_p;
- }
-
- p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
- mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
- }
-}
-
-void xen_setup_mfn_list_list(void)
-{
- BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
-
- HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
- virt_to_mfn(p2m_top_mfn);
- HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
-}
-
-/* Set up p2m_top to point to the domain-builder provided p2m pages */
-void __init xen_build_dynamic_phys_to_machine(void)
-{
- unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
- unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
- unsigned long pfn;
-
- xen_max_p2m_pfn = max_pfn;
-
- p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_init(p2m_missing);
-
- p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_init(p2m_mid_missing);
-
- p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_top_init(p2m_top);
-
- /*
- * The domain builder gives us a pre-constructed p2m array in
- * mfn_list for all the pages initially given to us, so we just
- * need to graft that into our tree structure.
- */
- for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
- unsigned topidx = p2m_top_index(pfn);
- unsigned mididx = p2m_mid_index(pfn);
-
- if (p2m_top[topidx] == p2m_mid_missing) {
- unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_init(mid);
-
- p2m_top[topidx] = mid;
- }
-
- p2m_top[topidx][mididx] = &mfn_list[pfn];
- }
-}
-
-unsigned long get_phys_to_machine(unsigned long pfn)
-{
- unsigned topidx, mididx, idx;
-
- if (unlikely(pfn >= MAX_P2M_PFN))
- return INVALID_P2M_ENTRY;
-
- topidx = p2m_top_index(pfn);
- mididx = p2m_mid_index(pfn);
- idx = p2m_index(pfn);
-
- return p2m_top[topidx][mididx][idx];
-}
-EXPORT_SYMBOL_GPL(get_phys_to_machine);
-
-static void *alloc_p2m_page(void)
-{
- return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
-}
-
-static void free_p2m_page(void *p)
-{
- free_page((unsigned long)p);
-}
-
-/*
- * Fully allocate the p2m structure for a given pfn. We need to check
- * that both the top and mid levels are allocated, and make sure the
- * parallel mfn tree is kept in sync. We may race with other cpus, so
- * the new pages are installed with cmpxchg; if we lose the race then
- * simply free the page we allocated and use the one that's there.
- */
-static bool alloc_p2m(unsigned long pfn)
-{
- unsigned topidx, mididx;
- unsigned long ***top_p, **mid;
- unsigned long *top_mfn_p, *mid_mfn;
-
- topidx = p2m_top_index(pfn);
- mididx = p2m_mid_index(pfn);
-
- top_p = &p2m_top[topidx];
- mid = *top_p;
-
- if (mid == p2m_mid_missing) {
- /* Mid level is missing, allocate a new one */
- mid = alloc_p2m_page();
- if (!mid)
- return false;
-
- p2m_mid_init(mid);
-
- if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
- free_p2m_page(mid);
- }
-
- top_mfn_p = &p2m_top_mfn[topidx];
- mid_mfn = p2m_top_mfn_p[topidx];
-
- BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
-
- if (mid_mfn == p2m_mid_missing_mfn) {
- /* Separately check the mid mfn level */
- unsigned long missing_mfn;
- unsigned long mid_mfn_mfn;
-
- mid_mfn = alloc_p2m_page();
- if (!mid_mfn)
- return false;
-
- p2m_mid_mfn_init(mid_mfn);
-
- missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
- mid_mfn_mfn = virt_to_mfn(mid_mfn);
- if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
- free_p2m_page(mid_mfn);
- else
- p2m_top_mfn_p[topidx] = mid_mfn;
- }
-
- if (p2m_top[topidx][mididx] == p2m_missing) {
- /* p2m leaf page is missing */
- unsigned long *p2m;
-
- p2m = alloc_p2m_page();
- if (!p2m)
- return false;
-
- p2m_init(p2m);
-
- if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
- free_p2m_page(p2m);
- else
- mid_mfn[mididx] = virt_to_mfn(p2m);
- }
-
- return true;
-}
-
-/* Try to install p2m mapping; fail if intermediate bits missing */
-bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
-{
- unsigned topidx, mididx, idx;
-
- if (unlikely(pfn >= MAX_P2M_PFN)) {
- BUG_ON(mfn != INVALID_P2M_ENTRY);
- return true;
- }
-
- topidx = p2m_top_index(pfn);
- mididx = p2m_mid_index(pfn);
- idx = p2m_index(pfn);
-
- if (p2m_top[topidx][mididx] == p2m_missing)
- return mfn == INVALID_P2M_ENTRY;
-
- p2m_top[topidx][mididx][idx] = mfn;
-
- return true;
-}
-
-bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
-{
- if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
- BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
- return true;
- }
-
- if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
- if (!alloc_p2m(pfn))
- return false;
-
- if (!__set_phys_to_machine(pfn, mfn))
- return false;
- }
-
- return true;
-}
-
unsigned long arbitrary_virt_to_mfn(void *vaddr)
{
xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
@@ -566,6 +201,7 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr)
offset = address & ~PAGE_MASK;
return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
}
+EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
void make_lowmem_page_readonly(void *vaddr)
{
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
new file mode 100644
index 000000000000..ddc81a06edb9
--- /dev/null
+++ b/arch/x86/xen/p2m.c
@@ -0,0 +1,528 @@
+/*
+ * Xen leaves the responsibility for maintaining p2m mappings to the
+ * guests themselves, but it must also access and update the p2m array
+ * during suspend/resume when all the pages are reallocated.
+ *
+ * The p2m table is logically a flat array, but we implement it as a
+ * three-level tree to allow the address space to be sparse.
+ *
+ * Xen
+ * |
+ * p2m_top p2m_top_mfn
+ * / \ / \
+ * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
+ * / \ / \ / /
+ * p2m p2m p2m p2m p2m p2m p2m ...
+ *
+ * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
+ *
+ * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
+ * maximum representable pseudo-physical address space is:
+ * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
+ *
+ * P2M_PER_PAGE depends on the architecture, as a mfn is always
+ * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
+ * 512 and 1024 entries respectively.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/sched.h>
+
+#include <asm/cache.h>
+#include <asm/setup.h>
+
+#include <asm/xen/page.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include "xen-ops.h"
+
+static void __init m2p_override_init(void);
+
+unsigned long xen_max_p2m_pfn __read_mostly;
+
+#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
+#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
+#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
+
+#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
+
+/* Placeholders for holes in the address space */
+static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
+
+static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
+
+RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
+RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
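+
+/*
+ * Sizing sketch (illustrative): each RESERVE_BRK() above sets aside one
+ * page of mid-level pointers per P2M_PER_PAGE * P2M_MID_PER_PAGE pfns.
+ * If, for example, MAX_DOMAIN_PAGES corresponded to a 128GB domain on
+ * 64-bit (33554432 pfns), that would be 33554432 / (512 * 512) = 128
+ * pages, i.e. 512KB of brk space per array.
+ */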
+
+static inline unsigned p2m_top_index(unsigned long pfn)
+{
+ BUG_ON(pfn >= MAX_P2M_PFN);
+ return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
+}
+
+static inline unsigned p2m_mid_index(unsigned long pfn)
+{
+ return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
+}
+
+static inline unsigned p2m_index(unsigned long pfn)
+{
+ return pfn % P2M_PER_PAGE;
+}
+
+static void p2m_top_init(unsigned long ***top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = p2m_mid_missing;
+}
+
+static void p2m_top_mfn_init(unsigned long *top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = virt_to_mfn(p2m_mid_missing_mfn);
+}
+
+static void p2m_top_mfn_p_init(unsigned long **top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = p2m_mid_missing_mfn;
+}
+
+static void p2m_mid_init(unsigned long **mid)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_MID_PER_PAGE; i++)
+ mid[i] = p2m_missing;
+}
+
+static void p2m_mid_mfn_init(unsigned long *mid)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_MID_PER_PAGE; i++)
+ mid[i] = virt_to_mfn(p2m_missing);
+}
+
+static void p2m_init(unsigned long *p2m)
+{
+ unsigned i;
+
+	for (i = 0; i < P2M_PER_PAGE; i++)
+ p2m[i] = INVALID_P2M_ENTRY;
+}
+
+/*
+ * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
+ *
+ * This is called both at boot time, and after resuming from suspend:
+ * - At boot time we're called very early, and must use extend_brk()
+ * to allocate memory.
+ *
+ * - After resume we're called from within stop_machine, but the mfn
+ *   tree should already be completely allocated.
+ */
+void xen_build_mfn_list_list(void)
+{
+ unsigned long pfn;
+
+ /* Pre-initialize p2m_top_mfn to be completely missing */
+ if (p2m_top_mfn == NULL) {
+ p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(p2m_mid_missing_mfn);
+
+ p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_mfn_p_init(p2m_top_mfn_p);
+
+ p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_mfn_init(p2m_top_mfn);
+ } else {
+		/* Reinitialise: the mfns all change after migration */
+ p2m_mid_mfn_init(p2m_mid_missing_mfn);
+ }
+
+ for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
+ unsigned long **mid;
+ unsigned long *mid_mfn_p;
+
+ mid = p2m_top[topidx];
+ mid_mfn_p = p2m_top_mfn_p[topidx];
+
+		/* Don't bother allocating any mfn mid levels if
+		 * they're just missing; just update the stored mfn,
+		 * since all of them could have changed over a migration.
+		 */
+ if (mid == p2m_mid_missing) {
+ BUG_ON(mididx);
+ BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
+ p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
+ pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
+ continue;
+ }
+
+ if (mid_mfn_p == p2m_mid_missing_mfn) {
+			/*
+			 * XXX boot-time only!  We should never find
+			 * missing parts of the mfn tree at runtime.
+			 * extend_brk() will BUG if we call it too
+			 * late.
+			 */
+ mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(mid_mfn_p);
+
+ p2m_top_mfn_p[topidx] = mid_mfn_p;
+ }
+
+ p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
+ mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
+ }
+}
+
+void xen_setup_mfn_list_list(void)
+{
+ BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
+
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+ virt_to_mfn(p2m_top_mfn);
+ HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
+}
+
+/* Set up p2m_top to point to the domain-builder provided p2m pages */
+void __init xen_build_dynamic_phys_to_machine(void)
+{
+ unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
+ unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
+ unsigned long pfn;
+
+ xen_max_p2m_pfn = max_pfn;
+
+ p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_init(p2m_missing);
+
+ p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_init(p2m_mid_missing);
+
+ p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_init(p2m_top);
+
+ /*
+ * The domain builder gives us a pre-constructed p2m array in
+ * mfn_list for all the pages initially given to us, so we just
+ * need to graft that into our tree structure.
+ */
+ for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
+
+ if (p2m_top[topidx] == p2m_mid_missing) {
+ unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_init(mid);
+
+ p2m_top[topidx] = mid;
+ }
+
+		/*
+		 * As long as the mfn_list has enough entries to completely
+		 * fill a p2m page, pointing into the array is fine. But if
+		 * it does not, the entries beyond the last pfn would be
+		 * undefined, and overwriting whatever follows the array
+		 * with invalid markers is not safe. In that case a new
+		 * page is allocated, initialized and filled with the
+		 * valid part.
+		 */
+ if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) {
+ unsigned long p2midx;
+ unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_init(p2m);
+
+			for (p2midx = 0; pfn + p2midx < max_pfn; p2midx++)
+				p2m[p2midx] = mfn_list[pfn + p2midx];
+ p2m_top[topidx][mididx] = p2m;
+ } else
+ p2m_top[topidx][mididx] = &mfn_list[pfn];
+ }
+
+ m2p_override_init();
+}
+
+unsigned long get_phys_to_machine(unsigned long pfn)
+{
+ unsigned topidx, mididx, idx;
+
+ if (unlikely(pfn >= MAX_P2M_PFN))
+ return INVALID_P2M_ENTRY;
+
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+ idx = p2m_index(pfn);
+
+ return p2m_top[topidx][mididx][idx];
+}
+EXPORT_SYMBOL_GPL(get_phys_to_machine);
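+
+/*
+ * Note: callers are normally expected to use the pfn_to_mfn()/
+ * mfn_to_pfn() wrappers in <asm/xen/page.h>, which deal with the
+ * auto-translated-physmap and FOREIGN_FRAME_BIT cases on top of this
+ * raw table lookup.
+ */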
+
+static void *alloc_p2m_page(void)
+{
+ return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
+}
+
+static void free_p2m_page(void *p)
+{
+ free_page((unsigned long)p);
+}
+
+/*
+ * Fully allocate the p2m structure for a given pfn. We need to check
+ * that both the top and mid levels are allocated, and make sure the
+ * parallel mfn tree is kept in sync. We may race with other cpus, so
+ * the new pages are installed with cmpxchg; if we lose the race then
+ * simply free the page we allocated and use the one that's there.
+ */
+static bool alloc_p2m(unsigned long pfn)
+{
+ unsigned topidx, mididx;
+ unsigned long ***top_p, **mid;
+ unsigned long *top_mfn_p, *mid_mfn;
+
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+
+ top_p = &p2m_top[topidx];
+ mid = *top_p;
+
+ if (mid == p2m_mid_missing) {
+ /* Mid level is missing, allocate a new one */
+ mid = alloc_p2m_page();
+ if (!mid)
+ return false;
+
+ p2m_mid_init(mid);
+
+ if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
+ free_p2m_page(mid);
+ }
+
+ top_mfn_p = &p2m_top_mfn[topidx];
+ mid_mfn = p2m_top_mfn_p[topidx];
+
+ BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
+
+ if (mid_mfn == p2m_mid_missing_mfn) {
+ /* Separately check the mid mfn level */
+ unsigned long missing_mfn;
+ unsigned long mid_mfn_mfn;
+
+ mid_mfn = alloc_p2m_page();
+ if (!mid_mfn)
+ return false;
+
+ p2m_mid_mfn_init(mid_mfn);
+
+ missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
+ mid_mfn_mfn = virt_to_mfn(mid_mfn);
+ if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
+ free_p2m_page(mid_mfn);
+ else
+ p2m_top_mfn_p[topidx] = mid_mfn;
+ }
+
+ if (p2m_top[topidx][mididx] == p2m_missing) {
+ /* p2m leaf page is missing */
+ unsigned long *p2m;
+
+ p2m = alloc_p2m_page();
+ if (!p2m)
+ return false;
+
+ p2m_init(p2m);
+
+ if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
+ free_p2m_page(p2m);
+ else
+ mid_mfn[mididx] = virt_to_mfn(p2m);
+ }
+
+ return true;
+}
+
+/* Try to install p2m mapping; fail if intermediate bits missing */
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+ unsigned topidx, mididx, idx;
+
+ if (unlikely(pfn >= MAX_P2M_PFN)) {
+ BUG_ON(mfn != INVALID_P2M_ENTRY);
+ return true;
+ }
+
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+ idx = p2m_index(pfn);
+
+ if (p2m_top[topidx][mididx] == p2m_missing)
+ return mfn == INVALID_P2M_ENTRY;
+
+ p2m_top[topidx][mididx][idx] = mfn;
+
+ return true;
+}
+
+bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+ if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
+ BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+ return true;
+ }
+
+ if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
+ if (!alloc_p2m(pfn))
+ return false;
+
+ if (!__set_phys_to_machine(pfn, mfn))
+ return false;
+ }
+
+ return true;
+}
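+
+/*
+ * Usage sketch (illustrative; the callers live outside this file):
+ * a driver handing a page back to the hypervisor (e.g. a balloon-type
+ * driver) marks it unbacked with
+ *	set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ * and re-establishes the mapping once a frame is available again with
+ *	set_phys_to_machine(pfn, frame);
+ * alloc_p2m() is only triggered by the first store of a real mfn into a
+ * region whose leaf still points at the shared p2m_missing page.
+ */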
+
+#define M2P_OVERRIDE_HASH_SHIFT 10
+#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT)
+
+static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH);
+static DEFINE_SPINLOCK(m2p_override_lock);
+
+static void __init m2p_override_init(void)
+{
+ unsigned i;
+
+ m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
+ sizeof(unsigned long));
+
+ for (i = 0; i < M2P_OVERRIDE_HASH; i++)
+ INIT_LIST_HEAD(&m2p_overrides[i]);
+}
+
+static unsigned long mfn_hash(unsigned long mfn)
+{
+ return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
+}
+
+/* Add an MFN override for a particular page */
+int m2p_add_override(unsigned long mfn, struct page *page)
+{
+ unsigned long flags;
+ unsigned long pfn;
+ unsigned long address;
+ unsigned level;
+ pte_t *ptep = NULL;
+
+ pfn = page_to_pfn(page);
+ if (!PageHighMem(page)) {
+ address = (unsigned long)__va(pfn << PAGE_SHIFT);
+ ptep = lookup_address(address, &level);
+
+ if (WARN(ptep == NULL || level != PG_LEVEL_4K,
+ "m2p_add_override: pfn %lx not mapped", pfn))
+ return -EINVAL;
+ }
+
+ page->private = mfn;
+ page->index = pfn_to_mfn(pfn);
+
+ __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
+ if (!PageHighMem(page))
+ /* Just zap old mapping for now */
+ pte_clear(&init_mm, address, ptep);
+
+ spin_lock_irqsave(&m2p_override_lock, flags);
+ list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
+ spin_unlock_irqrestore(&m2p_override_lock, flags);
+
+ return 0;
+}
+
+int m2p_remove_override(struct page *page)
+{
+ unsigned long flags;
+ unsigned long mfn;
+ unsigned long pfn;
+ unsigned long address;
+ unsigned level;
+ pte_t *ptep = NULL;
+
+ pfn = page_to_pfn(page);
+ mfn = get_phys_to_machine(pfn);
+ if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT))
+ return -EINVAL;
+
+ if (!PageHighMem(page)) {
+ address = (unsigned long)__va(pfn << PAGE_SHIFT);
+ ptep = lookup_address(address, &level);
+
+ if (WARN(ptep == NULL || level != PG_LEVEL_4K,
+ "m2p_remove_override: pfn %lx not mapped", pfn))
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&m2p_override_lock, flags);
+ list_del(&page->lru);
+ spin_unlock_irqrestore(&m2p_override_lock, flags);
+ __set_phys_to_machine(pfn, page->index);
+
+ if (!PageHighMem(page))
+ set_pte_at(&init_mm, address, ptep,
+ pfn_pte(pfn, PAGE_KERNEL));
+ /* No tlb flush necessary because the caller already
+ * left the pte unmapped. */
+
+ return 0;
+}
+
+struct page *m2p_find_override(unsigned long mfn)
+{
+ unsigned long flags;
+ struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)];
+ struct page *p, *ret;
+
+ ret = NULL;
+
+ spin_lock_irqsave(&m2p_override_lock, flags);
+
+ list_for_each_entry(p, bucket, lru) {
+ if (p->private == mfn) {
+ ret = p;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&m2p_override_lock, flags);
+
+ return ret;
+}
+
+unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
+{
+ struct page *p = m2p_find_override(mfn);
+ unsigned long ret = pfn;
+
+ if (p)
+ ret = page_to_pfn(p);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
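+
+/*
+ * Usage sketch (illustrative; the callers live outside this file):
+ * a backend mapping a foreign grant into a local page is expected to call
+ *	m2p_add_override(mfn, page);
+ * so that a later mfn_to_pfn(mfn) can be redirected to the local pfn via
+ * m2p_find_override_pfn(), and
+ *	m2p_remove_override(page);
+ * once the grant is unmapped, which restores the original p2m entry that
+ * was saved in page->index.
+ */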