summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
authorLen Brown <len.brown@intel.com>2010-08-15 09:06:31 +0400
committerLen Brown <len.brown@intel.com>2010-08-15 09:06:31 +0400
commit95ee46aa8698f2000647dfb362400fadbb5807cf (patch)
treee5a05c7297f997e191c73091934e42e3195c0e40 /arch/x86
parentcfa806f059801dbe7e435745eb2e187c8bfe1e7f (diff)
parent92fa5bd9a946b6e7aab6764e7312e4e3d9bed295 (diff)
downloadlinux-95ee46aa8698f2000647dfb362400fadbb5807cf.tar.xz
Merge branch 'linus' into release
Conflicts: drivers/acpi/debug.c Signed-off-by: Len Brown <len.brown@intel.com>
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig15
-rw-r--r--arch/x86/boot/Makefile8
-rw-r--r--arch/x86/boot/boot.h28
-rw-r--r--arch/x86/boot/cmdline.c6
-rw-r--r--arch/x86/boot/compressed/Makefile4
-rw-r--r--arch/x86/boot/compressed/cmdline.c21
-rw-r--r--arch/x86/boot/compressed/early_serial_console.c5
-rw-r--r--arch/x86/boot/compressed/head_32.S13
-rw-r--r--arch/x86/boot/compressed/head_64.S13
-rw-r--r--arch/x86/boot/compressed/misc.c56
-rw-r--r--arch/x86/boot/compressed/misc.h39
-rw-r--r--arch/x86/boot/compressed/string.c2
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S6
-rw-r--r--arch/x86/boot/ctype.h21
-rw-r--r--arch/x86/boot/early_serial_console.c139
-rw-r--r--arch/x86/boot/main.c9
-rw-r--r--arch/x86/boot/printf.c4
-rw-r--r--arch/x86/boot/string.c63
-rw-r--r--arch/x86/boot/tty.c37
-rw-r--r--arch/x86/configs/i386_defconfig1
-rw-r--r--arch/x86/configs/x86_64_defconfig1
-rw-r--r--arch/x86/ia32/ia32entry.S3
-rw-r--r--arch/x86/ia32/sys_ia32.c23
-rw-r--r--arch/x86/include/asm/acpi.h2
-rw-r--r--arch/x86/include/asm/alternative.h7
-rw-r--r--arch/x86/include/asm/apb_timer.h1
-rw-r--r--arch/x86/include/asm/bootparam.h11
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h198
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h83
-rw-r--r--arch/x86/include/asm/cpufeature.h43
-rw-r--r--arch/x86/include/asm/dma-mapping.h8
-rw-r--r--arch/x86/include/asm/highmem.h2
-rw-r--r--arch/x86/include/asm/hw_breakpoint.h2
-rw-r--r--arch/x86/include/asm/hypervisor.h1
-rw-r--r--arch/x86/include/asm/i387.h26
-rw-r--r--arch/x86/include/asm/intel_scu_ipc.h20
-rw-r--r--arch/x86/include/asm/irq_vectors.h3
-rw-r--r--arch/x86/include/asm/kdebug.h6
-rw-r--r--arch/x86/include/asm/kgdb.h20
-rw-r--r--arch/x86/include/asm/kvm.h22
-rw-r--r--arch/x86/include/asm/kvm_emulate.h30
-rw-r--r--arch/x86/include/asm/kvm_host.h70
-rw-r--r--arch/x86/include/asm/local64.h1
-rw-r--r--arch/x86/include/asm/mce.h4
-rw-r--r--arch/x86/include/asm/mrst.h26
-rw-r--r--arch/x86/include/asm/msr-index.h26
-rw-r--r--arch/x86/include/asm/msr.h4
-rw-r--r--arch/x86/include/asm/nmi.h2
-rw-r--r--arch/x86/include/asm/olpc_ofw.h31
-rw-r--r--arch/x86/include/asm/page.h7
-rw-r--r--arch/x86/include/asm/pci_x86.h1
-rw-r--r--arch/x86/include/asm/perf_event.h18
-rw-r--r--arch/x86/include/asm/perf_event_p4.h99
-rw-r--r--arch/x86/include/asm/pgtable_64.h4
-rw-r--r--arch/x86/include/asm/processor.h21
-rw-r--r--arch/x86/include/asm/required-features.h2
-rw-r--r--arch/x86/include/asm/rwsem.h21
-rw-r--r--arch/x86/include/asm/scatterlist.h1
-rw-r--r--arch/x86/include/asm/setup.h2
-rw-r--r--arch/x86/include/asm/stacktrace.h49
-rw-r--r--arch/x86/include/asm/sys_ia32.h15
-rw-r--r--arch/x86/include/asm/syscalls.h2
-rw-r--r--arch/x86/include/asm/system.h7
-rw-r--r--arch/x86/include/asm/unistd_32.h5
-rw-r--r--arch/x86/include/asm/unistd_64.h6
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h151
-rw-r--r--arch/x86/include/asm/vmx.h5
-rw-r--r--arch/x86/include/asm/xen/hypercall.h6
-rw-r--r--arch/x86/include/asm/xen/page.h8
-rw-r--r--arch/x86/include/asm/xen/swiotlb-xen.h14
-rw-r--r--arch/x86/include/asm/xsave.h40
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.S2
-rw-r--r--arch/x86/kernel/acpi/sleep.c2
-rw-r--r--arch/x86/kernel/alternative.c1
-rw-r--r--arch/x86/kernel/amd_iommu.c8
-rw-r--r--arch/x86/kernel/apb_timer.c37
-rw-r--r--arch/x86/kernel/aperture_64.c4
-rw-r--r--arch/x86/kernel/apic/Makefile7
-rw-r--r--arch/x86/kernel/apic/apic.c4
-rw-r--r--arch/x86/kernel/apic/es7000_32.c1
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c107
-rw-r--r--arch/x86/kernel/apic/io_apic.c2
-rw-r--r--arch/x86/kernel/apic/nmi.c7
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c4
-rw-r--r--arch/x86/kernel/apm_32.c2
-rw-r--r--arch/x86/kernel/cpu/Makefile4
-rw-r--r--arch/x86/kernel/cpu/amd.c77
-rw-r--r--arch/x86/kernel/cpu/common.c28
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c9
-rw-r--r--arch/x86/kernel/cpu/cpufreq/gx-suspmod.c11
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.h26
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longrun.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c7
-rw-r--r--arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c8
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c8
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c3
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c108
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c35
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c9
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c206
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c6
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c3
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c56
-rw-r--r--arch/x86/kernel/cpu/perf_event.c62
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c165
-rw-r--r--arch/x86/kernel/cpu/scattered.c63
-rw-r--r--arch/x86/kernel/cpu/topology.c (renamed from arch/x86/kernel/cpu/addon_cpuid_features.c)58
-rw-r--r--arch/x86/kernel/cpu/vmware.c9
-rw-r--r--arch/x86/kernel/crash.c3
-rw-r--r--arch/x86/kernel/dumpstack.c1
-rw-r--r--arch/x86/kernel/dumpstack.h56
-rw-r--r--arch/x86/kernel/dumpstack_32.c2
-rw-r--r--arch/x86/kernel/dumpstack_64.c1
-rw-r--r--arch/x86/kernel/entry_32.S16
-rw-r--r--arch/x86/kernel/entry_64.S13
-rw-r--r--arch/x86/kernel/head32.c2
-rw-r--r--arch/x86/kernel/head_32.S6
-rw-r--r--arch/x86/kernel/head_64.S5
-rw-r--r--arch/x86/kernel/hpet.c15
-rw-r--r--arch/x86/kernel/hw_breakpoint.c51
-rw-r--r--arch/x86/kernel/i387.c42
-rw-r--r--arch/x86/kernel/kgdb.c189
-rw-r--r--arch/x86/kernel/kprobes.c33
-rw-r--r--arch/x86/kernel/mpparse.c16
-rw-r--r--arch/x86/kernel/mrst.c105
-rw-r--r--arch/x86/kernel/olpc.c20
-rw-r--r--arch/x86/kernel/olpc_ofw.c106
-rw-r--r--arch/x86/kernel/pci-dma.c7
-rw-r--r--arch/x86/kernel/process.c56
-rw-r--r--arch/x86/kernel/process_32.c4
-rw-r--r--arch/x86/kernel/process_64.c5
-rw-r--r--arch/x86/kernel/setup.c6
-rw-r--r--arch/x86/kernel/smpboot.c15
-rw-r--r--arch/x86/kernel/stacktrace.c31
-rw-r--r--arch/x86/kernel/syscall_table_32.S3
-rw-r--r--arch/x86/kernel/tlb_uv.c760
-rw-r--r--arch/x86/kernel/traps.c7
-rw-r--r--arch/x86/kernel/tsc.c5
-rw-r--r--arch/x86/kernel/verify_cpu_64.S3
-rw-r--r--arch/x86/kernel/vsyscall_64.c17
-rw-r--r--arch/x86/kernel/xsave.c195
-rw-r--r--arch/x86/kvm/emulate.c749
-rw-r--r--arch/x86/kvm/i8254.c146
-rw-r--r--arch/x86/kvm/i8254.h4
-rw-r--r--arch/x86/kvm/i8259.c48
-rw-r--r--arch/x86/kvm/irq.c2
-rw-r--r--arch/x86/kvm/irq.h4
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h8
-rw-r--r--arch/x86/kvm/lapic.c17
-rw-r--r--arch/x86/kvm/mmu.c807
-rw-r--r--arch/x86/kvm/mmutrace.h2
-rw-r--r--arch/x86/kvm/paging_tmpl.h252
-rw-r--r--arch/x86/kvm/svm.c147
-rw-r--r--arch/x86/kvm/timer.c16
-rw-r--r--arch/x86/kvm/vmx.c261
-rw-r--r--arch/x86/kvm/x86.c1176
-rw-r--r--arch/x86/kvm/x86.h7
-rw-r--r--arch/x86/lib/Makefile1
-rw-r--r--arch/x86/lib/atomic64_386_32.S238
-rw-r--r--arch/x86/lib/clear_page_64.S2
-rw-r--r--arch/x86/lib/cmpxchg.c (renamed from arch/x86/kernel/cpu/cmpxchg.c)18
-rw-r--r--arch/x86/lib/copy_page_64.S2
-rw-r--r--arch/x86/lib/copy_user_64.S2
-rw-r--r--arch/x86/lib/memcpy_64.S2
-rw-r--r--arch/x86/lib/memset_64.S2
-rw-r--r--arch/x86/mm/dump_pagetables.c32
-rw-r--r--arch/x86/mm/fault.c4
-rw-r--r--arch/x86/mm/highmem_32.c4
-rw-r--r--arch/x86/mm/init_64.c2
-rw-r--r--arch/x86/mm/ioremap.c14
-rw-r--r--arch/x86/mm/kmmio.c16
-rw-r--r--arch/x86/mm/pat.c2
-rw-r--r--arch/x86/mm/pf_in.c30
-rw-r--r--arch/x86/mm/testmmiotrace.c22
-rw-r--r--arch/x86/mm/tlb.c4
-rw-r--r--arch/x86/oprofile/nmi_int.c17
-rw-r--r--arch/x86/pci/acpi.c9
-rw-r--r--arch/x86/pci/common.c20
-rw-r--r--arch/x86/pci/irq.c6
-rw-r--r--arch/x86/pci/legacy.c2
-rw-r--r--arch/x86/power/cpu.c2
-rw-r--r--arch/x86/power/hibernate_64.c2
-rw-r--r--arch/x86/vdso/Makefile3
-rwxr-xr-xarch/x86/vdso/checkundef.sh10
-rw-r--r--arch/x86/vdso/vdso32-setup.c2
-rw-r--r--arch/x86/vdso/vma.c3
-rw-r--r--arch/x86/xen/Kconfig5
-rw-r--r--arch/x86/xen/Makefile3
-rw-r--r--arch/x86/xen/enlighten.c201
-rw-r--r--arch/x86/xen/mmu.c328
-rw-r--r--arch/x86/xen/mmu.h1
-rw-r--r--arch/x86/xen/pci-swiotlb-xen.c58
-rw-r--r--arch/x86/xen/platform-pci-unplug.c137
-rw-r--r--arch/x86/xen/setup.c72
-rw-r--r--arch/x86/xen/smp.c2
-rw-r--r--arch/x86/xen/suspend.c12
-rw-r--r--arch/x86/xen/time.c96
-rw-r--r--arch/x86/xen/xen-ops.h13
202 files changed, 6510 insertions, 3148 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index dcb0593b4a66..a84fc34c8f77 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -55,6 +55,7 @@ config X86
select HAVE_HW_BREAKPOINT
select HAVE_MIXED_BREAKPOINTS_REGS
select PERF_EVENTS
+ select HAVE_PERF_EVENTS_NMI
select ANON_INODES
select HAVE_ARCH_KMEMCHECK
select HAVE_USER_RETURN_NOTIFIER
@@ -72,9 +73,6 @@ config ARCH_DEFCONFIG
default "arch/x86/configs/i386_defconfig" if X86_32
default "arch/x86/configs/x86_64_defconfig" if X86_64
-config GENERIC_TIME
- def_bool y
-
config GENERIC_CMOS_UPDATE
def_bool y
@@ -2046,7 +2044,7 @@ config SCx200
config SCx200HR_TIMER
tristate "NatSemi SCx200 27MHz High-Resolution Timer Support"
- depends on SCx200 && GENERIC_TIME
+ depends on SCx200
default y
---help---
This driver provides a clocksource built upon the on-chip
@@ -2062,6 +2060,15 @@ config OLPC
Add support for detecting the unique features of the OLPC
XO hardware.
+config OLPC_OPENFIRMWARE
+ bool "Support for OLPC's Open Firmware"
+ depends on !X86_64 && !X86_PAE
+ default y if OLPC
+ help
+ This option adds support for the implementation of Open Firmware
+ that is used on the OLPC XO-1 Children's Machine.
+ If unsure, say N here.
+
endif # X86_32
config K8_NB
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index ec749c2bfdd7..f7cb086b4add 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -26,10 +26,10 @@ targets := vmlinux.bin setup.bin setup.elf bzImage
targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
subdir- := compressed
-setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o
-setup-y += header.o main.o mca.o memory.o pm.o pmjump.o
-setup-y += printf.o regs.o string.o tty.o video.o video-mode.o
-setup-y += version.o
+setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o
+setup-y += early_serial_console.o edd.o header.o main.o mca.o memory.o
+setup-y += pm.o pmjump.o printf.o regs.o string.o tty.o video.o
+setup-y += video-mode.o version.o
setup-$(CONFIG_X86_APM_BOOT) += apm.o
# The link order of the video-*.o modules can matter. In particular,
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 98239d2658f2..c7093bd9f2d3 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -28,6 +28,7 @@
#include "bitops.h"
#include <asm/cpufeature.h>
#include <asm/processor-flags.h>
+#include "ctype.h"
/* Useful macros */
#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
@@ -37,6 +38,8 @@
extern struct setup_header hdr;
extern struct boot_params boot_params;
+#define cpu_relax() asm volatile("rep; nop")
+
/* Basic port I/O */
static inline void outb(u8 v, u16 port)
{
@@ -198,11 +201,6 @@ static inline int memcmp_gs(const void *s1, addr_t s2, size_t len)
return diff;
}
-static inline int isdigit(int ch)
-{
- return (ch >= '0') && (ch <= '9');
-}
-
/* Heap -- available for dynamic lists. */
extern char _end[];
extern char *HEAP;
@@ -287,8 +285,18 @@ struct biosregs {
void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
/* cmdline.c */
-int cmdline_find_option(const char *option, char *buffer, int bufsize);
-int cmdline_find_option_bool(const char *option);
+int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize);
+int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option);
+static inline int cmdline_find_option(const char *option, char *buffer, int bufsize)
+{
+ return __cmdline_find_option(boot_params.hdr.cmd_line_ptr, option, buffer, bufsize);
+}
+
+static inline int cmdline_find_option_bool(const char *option)
+{
+ return __cmdline_find_option_bool(boot_params.hdr.cmd_line_ptr, option);
+}
+
/* cpu.c, cpucheck.c */
struct cpu_features {
@@ -300,6 +308,10 @@ extern struct cpu_features cpu;
int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
int validate_cpu(void);
+/* early_serial_console.c */
+extern int early_serial_base;
+void console_init(void);
+
/* edd.c */
void query_edd(void);
@@ -329,8 +341,10 @@ void initregs(struct biosregs *regs);
/* string.c */
int strcmp(const char *str1, const char *str2);
+int strncmp(const char *cs, const char *ct, size_t count);
size_t strnlen(const char *s, size_t maxlen);
unsigned int atou(const char *s);
+unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base);
/* tty.c */
void puts(const char *);
diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c
index a1d35634bce0..6b3b6f708c04 100644
--- a/arch/x86/boot/cmdline.c
+++ b/arch/x86/boot/cmdline.c
@@ -27,9 +27,8 @@ static inline int myisspace(u8 c)
* Returns the length of the argument (regardless of if it was
* truncated to fit in the buffer), or -1 on not found.
*/
-int cmdline_find_option(const char *option, char *buffer, int bufsize)
+int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize)
{
- u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr;
addr_t cptr;
char c;
int len = -1;
@@ -100,9 +99,8 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize)
* Returns the position of that option (starts counting with 1)
* or 0 on not found
*/
-int cmdline_find_option_bool(const char *option)
+int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option)
{
- u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr;
addr_t cptr;
char c;
int pos = 0, wstart = 0;
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index fbb47daf2459..0c229551eead 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,7 +4,7 @@
# create a compressed vmlinux image from the original vmlinux
#
-targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o piggy.o
+targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o
KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
@@ -23,7 +23,7 @@ LDFLAGS_vmlinux := -T
hostprogs-y := mkpiggy
-$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
+$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o $(obj)/piggy.o FORCE
$(call if_changed,ld)
@:
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
new file mode 100644
index 000000000000..cb62f786990d
--- /dev/null
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -0,0 +1,21 @@
+#include "misc.h"
+
+static unsigned long fs;
+static inline void set_fs(unsigned long seg)
+{
+ fs = seg << 4; /* shift it back */
+}
+typedef unsigned long addr_t;
+static inline char rdfs8(addr_t addr)
+{
+ return *((char *)(fs + addr));
+}
+#include "../cmdline.c"
+int cmdline_find_option(const char *option, char *buffer, int bufsize)
+{
+ return __cmdline_find_option(real_mode->hdr.cmd_line_ptr, option, buffer, bufsize);
+}
+int cmdline_find_option_bool(const char *option)
+{
+ return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option);
+}
diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c
new file mode 100644
index 000000000000..261e81fb9582
--- /dev/null
+++ b/arch/x86/boot/compressed/early_serial_console.c
@@ -0,0 +1,5 @@
+#include "misc.h"
+
+int early_serial_base;
+
+#include "../early_serial_console.c"
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index f543b70ffae2..67a655a39ce4 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -124,6 +124,19 @@ relocated:
rep stosl
/*
+ * Adjust our own GOT
+ */
+ leal _got(%ebx), %edx
+ leal _egot(%ebx), %ecx
+1:
+ cmpl %ecx, %edx
+ jae 2f
+ addl %ebx, (%edx)
+ addl $4, %edx
+ jmp 1b
+2:
+
+/*
* Do the decompression, and jump to the new kernel..
*/
leal z_extract_offset_negative(%ebx), %ebp
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index faff0dc9c06a..52f85a196fa0 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -280,6 +280,19 @@ relocated:
rep stosq
/*
+ * Adjust our own GOT
+ */
+ leaq _got(%rip), %rdx
+ leaq _egot(%rip), %rcx
+1:
+ cmpq %rcx, %rdx
+ jae 2f
+ addq %rbx, (%rdx)
+ addq $8, %rdx
+ jmp 1b
+2:
+
+/*
* Do the decompression, and jump to the new kernel..
*/
pushq %rsi /* Save the real mode argument */
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 51e240779a44..8f7bef8e9fff 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -9,23 +9,7 @@
* High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
*/
-/*
- * we have to be careful, because no indirections are allowed here, and
- * paravirt_ops is a kind of one. As it will only run in baremetal anyway,
- * we just keep it from happening
- */
-#undef CONFIG_PARAVIRT
-#ifdef CONFIG_X86_32
-#define _ASM_X86_DESC_H 1
-#endif
-
-#include <linux/linkage.h>
-#include <linux/screen_info.h>
-#include <linux/elf.h>
-#include <linux/io.h>
-#include <asm/page.h>
-#include <asm/boot.h>
-#include <asm/bootparam.h>
+#include "misc.h"
/* WARNING!!
* This code is compiled with -fPIC and it is relocated dynamically
@@ -123,15 +107,13 @@ static void error(char *m);
/*
* This is set up by the setup-routine at boot-time
*/
-static struct boot_params *real_mode; /* Pointer to real-mode data */
+struct boot_params *real_mode; /* Pointer to real-mode data */
static int quiet;
+static int debug;
void *memset(void *s, int c, size_t n);
void *memcpy(void *dest, const void *src, size_t n);
-static void __putstr(int, const char *);
-#define putstr(__x) __putstr(0, __x)
-
#ifdef CONFIG_X86_64
#define memptr long
#else
@@ -170,7 +152,21 @@ static void scroll(void)
vidmem[i] = ' ';
}
-static void __putstr(int error, const char *s)
+#define XMTRDY 0x20
+
+#define TXR 0 /* Transmit register (WRITE) */
+#define LSR 5 /* Line Status */
+static void serial_putchar(int ch)
+{
+ unsigned timeout = 0xffff;
+
+ while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
+ cpu_relax();
+
+ outb(ch, early_serial_base + TXR);
+}
+
+void __putstr(int error, const char *s)
{
int x, y, pos;
char c;
@@ -179,6 +175,14 @@ static void __putstr(int error, const char *s)
if (!error)
return;
#endif
+ if (early_serial_base) {
+ const char *str = s;
+ while (*str) {
+ if (*str == '\n')
+ serial_putchar('\r');
+ serial_putchar(*str++);
+ }
+ }
if (real_mode->screen_info.orig_video_mode == 0 &&
lines == 0 && cols == 0)
@@ -305,8 +309,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
{
real_mode = rmode;
- if (real_mode->hdr.loadflags & QUIET_FLAG)
+ if (cmdline_find_option_bool("quiet"))
quiet = 1;
+ if (cmdline_find_option_bool("debug"))
+ debug = 1;
if (real_mode->screen_info.orig_video_mode == 7) {
vidmem = (char *) 0xb0000;
@@ -319,6 +325,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
lines = real_mode->screen_info.orig_video_lines;
cols = real_mode->screen_info.orig_video_cols;
+ console_init();
+ if (debug)
+ putstr("early console in decompress_kernel\n");
+
free_mem_ptr = heap; /* Heap */
free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
new file mode 100644
index 000000000000..3f19c81a6203
--- /dev/null
+++ b/arch/x86/boot/compressed/misc.h
@@ -0,0 +1,39 @@
+#ifndef BOOT_COMPRESSED_MISC_H
+#define BOOT_COMPRESSED_MISC_H
+
+/*
+ * we have to be careful, because no indirections are allowed here, and
+ * paravirt_ops is a kind of one. As it will only run in baremetal anyway,
+ * we just keep it from happening
+ */
+#undef CONFIG_PARAVIRT
+#ifdef CONFIG_X86_32
+#define _ASM_X86_DESC_H 1
+#endif
+
+#include <linux/linkage.h>
+#include <linux/screen_info.h>
+#include <linux/elf.h>
+#include <linux/io.h>
+#include <asm/page.h>
+#include <asm/boot.h>
+#include <asm/bootparam.h>
+
+#define BOOT_BOOT_H
+#include "../ctype.h"
+
+/* misc.c */
+extern struct boot_params *real_mode; /* Pointer to real-mode data */
+void __putstr(int error, const char *s);
+#define putstr(__x) __putstr(0, __x)
+#define puts(__x) __putstr(0, __x)
+
+/* cmdline.c */
+int cmdline_find_option(const char *option, char *buffer, int bufsize);
+int cmdline_find_option_bool(const char *option);
+
+/* early_serial_console.c */
+extern int early_serial_base;
+void console_init(void);
+
+#endif
diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c
new file mode 100644
index 000000000000..19b3e693cd72
--- /dev/null
+++ b/arch/x86/boot/compressed/string.c
@@ -0,0 +1,2 @@
+#include "misc.h"
+#include "../string.c"
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index 5ddabceee124..34d047c98284 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -41,6 +41,12 @@ SECTIONS
*(.rodata.*)
_erodata = . ;
}
+ .got : {
+ _got = .;
+ KEEP(*(.got.plt))
+ KEEP(*(.got))
+ _egot = .;
+ }
.data : {
_data = . ;
*(.data)
diff --git a/arch/x86/boot/ctype.h b/arch/x86/boot/ctype.h
new file mode 100644
index 000000000000..25e13403193c
--- /dev/null
+++ b/arch/x86/boot/ctype.h
@@ -0,0 +1,21 @@
+#ifndef BOOT_ISDIGIT_H
+
+#define BOOT_ISDIGIT_H
+
+static inline int isdigit(int ch)
+{
+ return (ch >= '0') && (ch <= '9');
+}
+
+static inline int isxdigit(int ch)
+{
+ if (isdigit(ch))
+ return true;
+
+ if ((ch >= 'a') && (ch <= 'f'))
+ return true;
+
+ return (ch >= 'A') && (ch <= 'F');
+}
+
+#endif
diff --git a/arch/x86/boot/early_serial_console.c b/arch/x86/boot/early_serial_console.c
new file mode 100644
index 000000000000..030f4b93e255
--- /dev/null
+++ b/arch/x86/boot/early_serial_console.c
@@ -0,0 +1,139 @@
+#include "boot.h"
+
+#define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */
+
+#define XMTRDY 0x20
+
+#define DLAB 0x80
+
+#define TXR 0 /* Transmit register (WRITE) */
+#define RXR 0 /* Receive register (READ) */
+#define IER 1 /* Interrupt Enable */
+#define IIR 2 /* Interrupt ID */
+#define FCR 2 /* FIFO control */
+#define LCR 3 /* Line control */
+#define MCR 4 /* Modem control */
+#define LSR 5 /* Line Status */
+#define MSR 6 /* Modem Status */
+#define DLL 0 /* Divisor Latch Low */
+#define DLH 1 /* Divisor latch High */
+
+#define DEFAULT_BAUD 9600
+
+static void early_serial_init(int port, int baud)
+{
+ unsigned char c;
+ unsigned divisor;
+
+ outb(0x3, port + LCR); /* 8n1 */
+ outb(0, port + IER); /* no interrupt */
+ outb(0, port + FCR); /* no fifo */
+ outb(0x3, port + MCR); /* DTR + RTS */
+
+ divisor = 115200 / baud;
+ c = inb(port + LCR);
+ outb(c | DLAB, port + LCR);
+ outb(divisor & 0xff, port + DLL);
+ outb((divisor >> 8) & 0xff, port + DLH);
+ outb(c & ~DLAB, port + LCR);
+
+ early_serial_base = port;
+}
+
+static void parse_earlyprintk(void)
+{
+ int baud = DEFAULT_BAUD;
+ char arg[32];
+ int pos = 0;
+ int port = 0;
+
+ if (cmdline_find_option("earlyprintk", arg, sizeof arg) > 0) {
+ char *e;
+
+ if (!strncmp(arg, "serial", 6)) {
+ port = DEFAULT_SERIAL_PORT;
+ pos += 6;
+ }
+
+ if (arg[pos] == ',')
+ pos++;
+
+ if (!strncmp(arg, "ttyS", 4)) {
+ static const int bases[] = { 0x3f8, 0x2f8 };
+ int idx = 0;
+
+ if (!strncmp(arg + pos, "ttyS", 4))
+ pos += 4;
+
+ if (arg[pos++] == '1')
+ idx = 1;
+
+ port = bases[idx];
+ }
+
+ if (arg[pos] == ',')
+ pos++;
+
+ baud = simple_strtoull(arg + pos, &e, 0);
+ if (baud == 0 || arg + pos == e)
+ baud = DEFAULT_BAUD;
+ }
+
+ if (port)
+ early_serial_init(port, baud);
+}
+
+#define BASE_BAUD (1843200/16)
+static unsigned int probe_baud(int port)
+{
+ unsigned char lcr, dll, dlh;
+ unsigned int quot;
+
+ lcr = inb(port + LCR);
+ outb(lcr | DLAB, port + LCR);
+ dll = inb(port + DLL);
+ dlh = inb(port + DLH);
+ outb(lcr, port + LCR);
+ quot = (dlh << 8) | dll;
+
+ return BASE_BAUD / quot;
+}
+
+static void parse_console_uart8250(void)
+{
+ char optstr[64], *options;
+ int baud = DEFAULT_BAUD;
+ int port = 0;
+
+ /*
+ * console=uart8250,io,0x3f8,115200n8
+ * need to make sure it is last one console !
+ */
+ if (cmdline_find_option("console", optstr, sizeof optstr) <= 0)
+ return;
+
+ options = optstr;
+
+ if (!strncmp(options, "uart8250,io,", 12))
+ port = simple_strtoull(options + 12, &options, 0);
+ else if (!strncmp(options, "uart,io,", 8))
+ port = simple_strtoull(options + 8, &options, 0);
+ else
+ return;
+
+ if (options && (options[0] == ','))
+ baud = simple_strtoull(options + 1, &options, 0);
+ else
+ baud = probe_baud(port);
+
+ if (port)
+ early_serial_init(port, baud);
+}
+
+void console_init(void)
+{
+ parse_earlyprintk();
+
+ if (!early_serial_base)
+ parse_console_uart8250();
+}
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 140172b895bd..40358c8905be 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -130,6 +130,11 @@ void main(void)
/* First, copy the boot header into the "zeropage" */
copy_boot_params();
+ /* Initialize the early-boot console */
+ console_init();
+ if (cmdline_find_option_bool("debug"))
+ puts("early console in setup code\n");
+
/* End of heap check */
init_heap();
@@ -168,10 +173,6 @@ void main(void)
/* Set the video mode */
set_video();
- /* Parse command line for 'quiet' and pass it to decompressor. */
- if (cmdline_find_option_bool("quiet"))
- boot_params.hdr.loadflags |= QUIET_FLAG;
-
/* Do the last things and invoke protected mode */
go_to_protected_mode();
}
diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c
index 50e47cdbdddd..cdac91ca55d3 100644
--- a/arch/x86/boot/printf.c
+++ b/arch/x86/boot/printf.c
@@ -34,7 +34,7 @@ static int skip_atoi(const char **s)
#define SMALL 32 /* Must be 32 == 0x20 */
#define SPECIAL 64 /* 0x */
-#define do_div(n,base) ({ \
+#define __do_div(n, base) ({ \
int __res; \
__res = ((unsigned long) n) % (unsigned) base; \
n = ((unsigned long) n) / (unsigned) base; \
@@ -83,7 +83,7 @@ static char *number(char *str, long num, int base, int size, int precision,
tmp[i++] = '0';
else
while (num != 0)
- tmp[i++] = (digits[do_div(num, base)] | locase);
+ tmp[i++] = (digits[__do_div(num, base)] | locase);
if (i > precision)
precision = i;
size -= precision;
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index f94b7a0c2abf..3cbc4058dd26 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -30,6 +30,22 @@ int strcmp(const char *str1, const char *str2)
return 0;
}
+int strncmp(const char *cs, const char *ct, size_t count)
+{
+ unsigned char c1, c2;
+
+ while (count) {
+ c1 = *cs++;
+ c2 = *ct++;
+ if (c1 != c2)
+ return c1 < c2 ? -1 : 1;
+ if (!c1)
+ break;
+ count--;
+ }
+ return 0;
+}
+
size_t strnlen(const char *s, size_t maxlen)
{
const char *es = s;
@@ -48,3 +64,50 @@ unsigned int atou(const char *s)
i = i * 10 + (*s++ - '0');
return i;
}
+
+/* Works only for digits and letters, but small and fast */
+#define TOLOWER(x) ((x) | 0x20)
+
+static unsigned int simple_guess_base(const char *cp)
+{
+ if (cp[0] == '0') {
+ if (TOLOWER(cp[1]) == 'x' && isxdigit(cp[2]))
+ return 16;
+ else
+ return 8;
+ } else {
+ return 10;
+ }
+}
+
+/**
+ * simple_strtoull - convert a string to an unsigned long long
+ * @cp: The start of the string
+ * @endp: A pointer to the end of the parsed string will be placed here
+ * @base: The number base to use
+ */
+
+unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base)
+{
+ unsigned long long result = 0;
+
+ if (!base)
+ base = simple_guess_base(cp);
+
+ if (base == 16 && cp[0] == '0' && TOLOWER(cp[1]) == 'x')
+ cp += 2;
+
+ while (isxdigit(*cp)) {
+ unsigned int value;
+
+ value = isdigit(*cp) ? *cp - '0' : TOLOWER(*cp) - 'a' + 10;
+ if (value >= base)
+ break;
+ result = result * base + value;
+ cp++;
+ }
+ if (endp)
+ *endp = (char *)cp;
+
+ return result;
+}
diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c
index 01ec69c901c7..def2451f46ae 100644
--- a/arch/x86/boot/tty.c
+++ b/arch/x86/boot/tty.c
@@ -10,23 +10,36 @@
* ----------------------------------------------------------------------- */
/*
- * Very simple screen I/O
- * XXX: Probably should add very simple serial I/O?
+ * Very simple screen and serial I/O
*/
#include "boot.h"
+int early_serial_base;
+
+#define XMTRDY 0x20
+
+#define TXR 0 /* Transmit register (WRITE) */
+#define LSR 5 /* Line Status */
+
/*
* These functions are in .inittext so they can be used to signal
* error during initialization.
*/
-void __attribute__((section(".inittext"))) putchar(int ch)
+static void __attribute__((section(".inittext"))) serial_putchar(int ch)
{
- struct biosregs ireg;
+ unsigned timeout = 0xffff;
- if (ch == '\n')
- putchar('\r'); /* \n -> \r\n */
+ while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
+ cpu_relax();
+
+ outb(ch, early_serial_base + TXR);
+}
+
+static void __attribute__((section(".inittext"))) bios_putchar(int ch)
+{
+ struct biosregs ireg;
initregs(&ireg);
ireg.bx = 0x0007;
@@ -36,6 +49,17 @@ void __attribute__((section(".inittext"))) putchar(int ch)
intcall(0x10, &ireg, NULL);
}
+void __attribute__((section(".inittext"))) putchar(int ch)
+{
+ if (ch == '\n')
+ putchar('\r'); /* \n -> \r\n */
+
+ bios_putchar(ch);
+
+ if (early_serial_base != 0)
+ serial_putchar(ch);
+}
+
void __attribute__((section(".inittext"))) puts(const char *str)
{
while (*str)
@@ -112,3 +136,4 @@ int getchar_timeout(void)
return 0; /* Timeout! */
}
+
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index d28fad19654a..e3a32431ca1e 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1471,6 +1471,7 @@ CONFIG_HWMON=y
# CONFIG_SENSORS_GL518SM is not set
# CONFIG_SENSORS_GL520SM is not set
# CONFIG_SENSORS_CORETEMP is not set
+# CONFIG_SENSORS_PKGTEMP is not set
# CONFIG_SENSORS_IT87 is not set
# CONFIG_SENSORS_LM63 is not set
# CONFIG_SENSORS_LM75 is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 6c86acd847a4..4251f8372050 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1456,6 +1456,7 @@ CONFIG_HWMON=y
# CONFIG_SENSORS_GL518SM is not set
# CONFIG_SENSORS_GL520SM is not set
# CONFIG_SENSORS_CORETEMP is not set
+# CONFIG_SENSORS_PKGTEMP is not set
# CONFIG_SENSORS_IT87 is not set
# CONFIG_SENSORS_LM63 is not set
# CONFIG_SENSORS_LM75 is not set
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index e790bc1fbfa3..b86feabed69b 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -842,4 +842,7 @@ ia32_sys_call_table:
.quad compat_sys_rt_tgsigqueueinfo /* 335 */
.quad sys_perf_event_open
.quad compat_sys_recvmmsg
+ .quad sys_fanotify_init
+ .quad sys32_fanotify_mark
+ .quad sys_prlimit64 /* 340 */
ia32_syscall_end:
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 626be156d88d..849813f398e7 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -51,7 +51,7 @@
#define AA(__x) ((unsigned long)(__x))
-asmlinkage long sys32_truncate64(char __user *filename,
+asmlinkage long sys32_truncate64(const char __user *filename,
unsigned long offset_low,
unsigned long offset_high)
{
@@ -96,7 +96,7 @@ static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
return 0;
}
-asmlinkage long sys32_stat64(char __user *filename,
+asmlinkage long sys32_stat64(const char __user *filename,
struct stat64 __user *statbuf)
{
struct kstat stat;
@@ -107,7 +107,7 @@ asmlinkage long sys32_stat64(char __user *filename,
return ret;
}
-asmlinkage long sys32_lstat64(char __user *filename,
+asmlinkage long sys32_lstat64(const char __user *filename,
struct stat64 __user *statbuf)
{
struct kstat stat;
@@ -126,7 +126,7 @@ asmlinkage long sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf)
return ret;
}
-asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename,
+asmlinkage long sys32_fstatat(unsigned int dfd, const char __user *filename,
struct stat64 __user *statbuf, int flag)
{
struct kstat stat;
@@ -408,8 +408,8 @@ asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count,
((loff_t)AA(poshi) << 32) | AA(poslo));
}
-asmlinkage long sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count,
- u32 poslo, u32 poshi)
+asmlinkage long sys32_pwrite(unsigned int fd, const char __user *ubuf,
+ u32 count, u32 poslo, u32 poshi)
{
return sys_pwrite64(fd, ubuf, count,
((loff_t)AA(poshi) << 32) | AA(poslo));
@@ -449,7 +449,7 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd,
return ret;
}
-asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
+asmlinkage long sys32_execve(const char __user *name, compat_uptr_t __user *argv,
compat_uptr_t __user *envp, struct pt_regs *regs)
{
long error;
@@ -546,3 +546,12 @@ asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo,
return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo,
((u64)len_hi << 32) | len_lo);
}
+
+asmlinkage long sys32_fanotify_mark(int fanotify_fd, unsigned int flags,
+ u32 mask_lo, u32 mask_hi,
+ int fd, const char __user *pathname)
+{
+ return sys_fanotify_mark(fanotify_fd, flags,
+ ((u64)mask_hi << 32) | mask_lo,
+ fd, pathname);
+}
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index aa2c39d968fc..92091de11113 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -134,7 +134,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
boot_cpu_data.x86_model <= 0x05 &&
boot_cpu_data.x86_mask < 0x0A)
return 1;
- else if (boot_cpu_has(X86_FEATURE_AMDC1E))
+ else if (c1e_detected)
return 1;
else
return max_cstate;
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 03b6bb5394a0..bc6abb7bc7ee 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -45,10 +45,9 @@
struct alt_instr {
u8 *instr; /* original instruction */
u8 *replacement;
- u8 cpuid; /* cpuid bit set for replacement */
+ u16 cpuid; /* cpuid bit set for replacement */
u8 instrlen; /* length of original instruction */
u8 replacementlen; /* length of new instruction, <= instrlen */
- u8 pad1;
#ifdef CONFIG_X86_64
u32 pad2;
#endif
@@ -86,9 +85,11 @@ static inline int alternatives_text_reserved(void *start, void *end)
_ASM_ALIGN "\n" \
_ASM_PTR "661b\n" /* label */ \
_ASM_PTR "663f\n" /* new instruction */ \
- " .byte " __stringify(feature) "\n" /* feature bit */ \
+ " .word " __stringify(feature) "\n" /* feature bit */ \
" .byte 662b-661b\n" /* sourcelen */ \
" .byte 664f-663f\n" /* replacementlen */ \
+ ".previous\n" \
+ ".section .discard,\"aw\",@progbits\n" \
" .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \
".previous\n" \
".section .altinstr_replacement, \"ax\"\n" \
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
index c74a2eebe570..a69b1ac9eaf8 100644
--- a/arch/x86/include/asm/apb_timer.h
+++ b/arch/x86/include/asm/apb_timer.h
@@ -55,7 +55,6 @@ extern unsigned long apbt_quick_calibrate(void);
extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu);
extern void apbt_setup_secondary_clock(void);
extern unsigned int boot_cpu_id;
-extern int disable_apbt_percpu;
extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint);
extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr);
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index 6be33d83c716..8e6218550e77 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -70,6 +70,14 @@ struct sys_desc_table {
__u8 table[14];
};
+/* Gleaned from OFW's set-parameters in cpu/x86/pc/linux.fth */
+struct olpc_ofw_header {
+ __u32 ofw_magic; /* OFW signature */
+ __u32 ofw_version;
+ __u32 cif_handler; /* callback into OFW */
+ __u32 irq_desc_table;
+} __attribute__((packed));
+
struct efi_info {
__u32 efi_loader_signature;
__u32 efi_systab;
@@ -92,7 +100,8 @@ struct boot_params {
__u8 hd0_info[16]; /* obsolete! */ /* 0x080 */
__u8 hd1_info[16]; /* obsolete! */ /* 0x090 */
struct sys_desc_table sys_desc_table; /* 0x0a0 */
- __u8 _pad4[144]; /* 0x0b0 */
+ struct olpc_ofw_header olpc_ofw_header; /* 0x0b0 */
+ __u8 _pad4[128]; /* 0x0c0 */
struct edid_info edid_info; /* 0x140 */
struct efi_info efi_info; /* 0x1c0 */
__u32 alt_mem_k; /* 0x1e0 */
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 8859e12dd3cf..284a6e8f7ce1 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -11,38 +11,42 @@
extern void __xchg_wrong_size(void);
/*
- * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
- * Note 2: xchg has side effect, so that attribute volatile is necessary,
- * but generally the primitive is invalid, *ptr is output argument. --ANK
+ * Note: no "lock" prefix even on SMP: xchg always implies lock anyway.
+ * Since this is generally used to protect other memory information, we
+ * use "asm volatile" and "memory" clobbers to prevent gcc from moving
+ * information around.
*/
-
-struct __xchg_dummy {
- unsigned long a[100];
-};
-#define __xg(x) ((struct __xchg_dummy *)(x))
-
#define __xchg(x, ptr, size) \
({ \
__typeof(*(ptr)) __x = (x); \
switch (size) { \
case 1: \
- asm volatile("xchgb %b0,%1" \
- : "=q" (__x) \
- : "m" (*__xg(ptr)), "0" (__x) \
+ { \
+ volatile u8 *__ptr = (volatile u8 *)(ptr); \
+ asm volatile("xchgb %0,%1" \
+ : "=q" (__x), "+m" (*__ptr) \
+ : "0" (__x) \
: "memory"); \
break; \
+ } \
case 2: \
- asm volatile("xchgw %w0,%1" \
- : "=r" (__x) \
- : "m" (*__xg(ptr)), "0" (__x) \
+ { \
+ volatile u16 *__ptr = (volatile u16 *)(ptr); \
+ asm volatile("xchgw %0,%1" \
+ : "=r" (__x), "+m" (*__ptr) \
+ : "0" (__x) \
: "memory"); \
break; \
+ } \
case 4: \
+ { \
+ volatile u32 *__ptr = (volatile u32 *)(ptr); \
asm volatile("xchgl %0,%1" \
- : "=r" (__x) \
- : "m" (*__xg(ptr)), "0" (__x) \
+ : "=r" (__x), "+m" (*__ptr) \
+ : "0" (__x) \
: "memory"); \
break; \
+ } \
default: \
__xchg_wrong_size(); \
} \
@@ -53,60 +57,33 @@ struct __xchg_dummy {
__xchg((v), (ptr), sizeof(*ptr))
/*
- * The semantics of XCHGCMP8B are a bit strange, this is why
- * there is a loop and the loading of %%eax and %%edx has to
- * be inside. This inlines well in most cases, the cached
- * cost is around ~38 cycles. (in the future we might want
- * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
- * might have an implicit FPU-save as a cost, so it's not
- * clear which path to go.)
+ * CMPXCHG8B only writes to the target if we had the previous
+ * value in registers, otherwise it acts as a read and gives us the
+ * "new previous" value. That is why there is a loop. Preloading
+ * EDX:EAX is a performance optimization: in the common case it means
+ * we need only one locked operation.
*
- * cmpxchg8b must be used with the lock prefix here to allow
- * the instruction to be executed atomically, see page 3-102
- * of the instruction set reference 24319102.pdf. We need
- * the reader side to see the coherent 64bit value.
+ * A SIMD/3DNOW!/MMX/FPU 64-bit store here would require at the very
+ * least an FPU save and/or %cr0.ts manipulation.
+ *
+ * cmpxchg8b must be used with the lock prefix here to allow the
+ * instruction to be executed atomically. We need to have the reader
+ * side to see the coherent 64bit value.
*/
-static inline void __set_64bit(unsigned long long *ptr,
- unsigned int low, unsigned int high)
+static inline void set_64bit(volatile u64 *ptr, u64 value)
{
+ u32 low = value;
+ u32 high = value >> 32;
+ u64 prev = *ptr;
+
asm volatile("\n1:\t"
- "movl (%0), %%eax\n\t"
- "movl 4(%0), %%edx\n\t"
- LOCK_PREFIX "cmpxchg8b (%0)\n\t"
+ LOCK_PREFIX "cmpxchg8b %0\n\t"
"jnz 1b"
- : /* no outputs */
- : "D"(ptr),
- "b"(low),
- "c"(high)
- : "ax", "dx", "memory");
-}
-
-static inline void __set_64bit_constant(unsigned long long *ptr,
- unsigned long long value)
-{
- __set_64bit(ptr, (unsigned int)value, (unsigned int)(value >> 32));
-}
-
-#define ll_low(x) *(((unsigned int *)&(x)) + 0)
-#define ll_high(x) *(((unsigned int *)&(x)) + 1)
-
-static inline void __set_64bit_var(unsigned long long *ptr,
- unsigned long long value)
-{
- __set_64bit(ptr, ll_low(value), ll_high(value));
+ : "=m" (*ptr), "+A" (prev)
+ : "b" (low), "c" (high)
+ : "memory");
}
-#define set_64bit(ptr, value) \
- (__builtin_constant_p((value)) \
- ? __set_64bit_constant((ptr), (value)) \
- : __set_64bit_var((ptr), (value)))
-
-#define _set_64bit(ptr, value) \
- (__builtin_constant_p(value) \
- ? __set_64bit(ptr, (unsigned int)(value), \
- (unsigned int)((value) >> 32)) \
- : __set_64bit(ptr, ll_low((value)), ll_high((value))))
-
extern void __cmpxchg_wrong_size(void);
/*
@@ -121,23 +98,32 @@ extern void __cmpxchg_wrong_size(void);
__typeof__(*(ptr)) __new = (new); \
switch (size) { \
case 1: \
- asm volatile(lock "cmpxchgb %b1,%2" \
- : "=a"(__ret) \
- : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ { \
+ volatile u8 *__ptr = (volatile u8 *)(ptr); \
+ asm volatile(lock "cmpxchgb %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "q" (__new), "0" (__old) \
: "memory"); \
break; \
+ } \
case 2: \
- asm volatile(lock "cmpxchgw %w1,%2" \
- : "=a"(__ret) \
- : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ { \
+ volatile u16 *__ptr = (volatile u16 *)(ptr); \
+ asm volatile(lock "cmpxchgw %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "r" (__new), "0" (__old) \
: "memory"); \
break; \
+ } \
case 4: \
- asm volatile(lock "cmpxchgl %1,%2" \
- : "=a"(__ret) \
- : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ { \
+ volatile u32 *__ptr = (volatile u32 *)(ptr); \
+ asm volatile(lock "cmpxchgl %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "r" (__new), "0" (__old) \
: "memory"); \
break; \
+ } \
default: \
__cmpxchg_wrong_size(); \
} \
@@ -175,32 +161,28 @@ extern void __cmpxchg_wrong_size(void);
(unsigned long long)(n)))
#endif
-static inline unsigned long long __cmpxchg64(volatile void *ptr,
- unsigned long long old,
- unsigned long long new)
+static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
{
- unsigned long long prev;
- asm volatile(LOCK_PREFIX "cmpxchg8b %3"
- : "=A"(prev)
- : "b"((unsigned long)new),
- "c"((unsigned long)(new >> 32)),
- "m"(*__xg(ptr)),
- "0"(old)
+ u64 prev;
+ asm volatile(LOCK_PREFIX "cmpxchg8b %1"
+ : "=A" (prev),
+ "+m" (*ptr)
+ : "b" ((u32)new),
+ "c" ((u32)(new >> 32)),
+ "0" (old)
: "memory");
return prev;
}
-static inline unsigned long long __cmpxchg64_local(volatile void *ptr,
- unsigned long long old,
- unsigned long long new)
+static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
{
- unsigned long long prev;
- asm volatile("cmpxchg8b %3"
- : "=A"(prev)
- : "b"((unsigned long)new),
- "c"((unsigned long)(new >> 32)),
- "m"(*__xg(ptr)),
- "0"(old)
+ u64 prev;
+ asm volatile("cmpxchg8b %1"
+ : "=A" (prev),
+ "+m" (*ptr)
+ : "b" ((u32)new),
+ "c" ((u32)(new >> 32)),
+ "0" (old)
: "memory");
return prev;
}
@@ -264,8 +246,6 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
* to simulate the cmpxchg8b on the 80386 and 80486 CPU.
*/
-extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
-
#define cmpxchg64(ptr, o, n) \
({ \
__typeof__(*(ptr)) __ret; \
@@ -283,20 +263,20 @@ extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
__ret; })
-
-#define cmpxchg64_local(ptr, o, n) \
-({ \
- __typeof__(*(ptr)) __ret; \
- if (likely(boot_cpu_data.x86 > 4)) \
- __ret = (__typeof__(*(ptr)))__cmpxchg64_local((ptr), \
- (unsigned long long)(o), \
- (unsigned long long)(n)); \
- else \
- __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \
- (unsigned long long)(o), \
- (unsigned long long)(n)); \
- __ret; \
-})
+#define cmpxchg64_local(ptr, o, n) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ __typeof__(*(ptr)) __old = (o); \
+ __typeof__(*(ptr)) __new = (n); \
+ alternative_io("call cmpxchg8b_emu", \
+ "cmpxchg8b (%%esi)" , \
+ X86_FEATURE_CX8, \
+ "=A" (__ret), \
+ "S" ((ptr)), "0" (__old), \
+ "b" ((unsigned int)__new), \
+ "c" ((unsigned int)(__new>>32)) \
+ : "memory"); \
+ __ret; })
#endif
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 485ae415faec..423ae58aa020 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -3,51 +3,60 @@
#include <asm/alternative.h> /* Provides LOCK_PREFIX */
-#define __xg(x) ((volatile long *)(x))
-
-static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
+static inline void set_64bit(volatile u64 *ptr, u64 val)
{
*ptr = val;
}
-#define _set_64bit set_64bit
-
extern void __xchg_wrong_size(void);
extern void __cmpxchg_wrong_size(void);
/*
- * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
- * Note 2: xchg has side effect, so that attribute volatile is necessary,
- * but generally the primitive is invalid, *ptr is output argument. --ANK
+ * Note: no "lock" prefix even on SMP: xchg always implies lock anyway.
+ * Since this is generally used to protect other memory information, we
+ * use "asm volatile" and "memory" clobbers to prevent gcc from moving
+ * information around.
*/
#define __xchg(x, ptr, size) \
({ \
__typeof(*(ptr)) __x = (x); \
switch (size) { \
case 1: \
- asm volatile("xchgb %b0,%1" \
- : "=q" (__x) \
- : "m" (*__xg(ptr)), "0" (__x) \
+ { \
+ volatile u8 *__ptr = (volatile u8 *)(ptr); \
+ asm volatile("xchgb %0,%1" \
+ : "=q" (__x), "+m" (*__ptr) \
+ : "0" (__x) \
: "memory"); \
break; \
+ } \
case 2: \
- asm volatile("xchgw %w0,%1" \
- : "=r" (__x) \
- : "m" (*__xg(ptr)), "0" (__x) \
+ { \
+ volatile u16 *__ptr = (volatile u16 *)(ptr); \
+ asm volatile("xchgw %0,%1" \
+ : "=r" (__x), "+m" (*__ptr) \
+ : "0" (__x) \
: "memory"); \
break; \
+ } \
case 4: \
- asm volatile("xchgl %k0,%1" \
- : "=r" (__x) \
- : "m" (*__xg(ptr)), "0" (__x) \
+ { \
+ volatile u32 *__ptr = (volatile u32 *)(ptr); \
+ asm volatile("xchgl %0,%1" \
+ : "=r" (__x), "+m" (*__ptr) \
+ : "0" (__x) \
: "memory"); \
break; \
+ } \
case 8: \
+ { \
+ volatile u64 *__ptr = (volatile u64 *)(ptr); \
asm volatile("xchgq %0,%1" \
- : "=r" (__x) \
- : "m" (*__xg(ptr)), "0" (__x) \
+ : "=r" (__x), "+m" (*__ptr) \
+ : "0" (__x) \
: "memory"); \
break; \
+ } \
default: \
__xchg_wrong_size(); \
} \
@@ -71,29 +80,41 @@ extern void __cmpxchg_wrong_size(void);
__typeof__(*(ptr)) __new = (new); \
switch (size) { \
case 1: \
- asm volatile(lock "cmpxchgb %b1,%2" \
- : "=a"(__ret) \
- : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ { \
+ volatile u8 *__ptr = (volatile u8 *)(ptr); \
+ asm volatile(lock "cmpxchgb %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "q" (__new), "0" (__old) \
: "memory"); \
break; \
+ } \
case 2: \
- asm volatile(lock "cmpxchgw %w1,%2" \
- : "=a"(__ret) \
- : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ { \
+ volatile u16 *__ptr = (volatile u16 *)(ptr); \
+ asm volatile(lock "cmpxchgw %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "r" (__new), "0" (__old) \
: "memory"); \
break; \
+ } \
case 4: \
- asm volatile(lock "cmpxchgl %k1,%2" \
- : "=a"(__ret) \
- : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ { \
+ volatile u32 *__ptr = (volatile u32 *)(ptr); \
+ asm volatile(lock "cmpxchgl %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "r" (__new), "0" (__old) \
: "memory"); \
break; \
+ } \
case 8: \
- asm volatile(lock "cmpxchgq %1,%2" \
- : "=a"(__ret) \
- : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ { \
+ volatile u64 *__ptr = (volatile u64 *)(ptr); \
+ asm volatile(lock "cmpxchgq %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "r" (__new), "0" (__old) \
: "memory"); \
break; \
+ } \
default: \
__cmpxchg_wrong_size(); \
} \
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 468145914389..781a50b29a49 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -6,7 +6,7 @@
#include <asm/required-features.h>
-#define NCAPINTS 9 /* N 32-bit words worth of info */
+#define NCAPINTS 10 /* N 32-bit words worth of info */
/*
* Note: If the comment begins with a quoted string, that string is used
@@ -89,7 +89,7 @@
#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */
#define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */
#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
-#define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */
+ /* 21 available, was AMD_C1E */
#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */
#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
@@ -124,6 +124,8 @@
#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
+#define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */
+#define X86_FEATURE_RDRND (4*32+30) /* The RDRAND instruction */
#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */
/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
@@ -157,22 +159,29 @@
/*
* Auxiliary flags: Linux defined - For features scattered in various
- * CPUID levels like 0x6, 0xA etc
+ * CPUID levels like 0x6, 0xA etc, word 7
*/
#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */
#define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */
#define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */
+#define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+#define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */
+#define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */
+#define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */
-/* Virtualization flags: Linux defined */
+/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */
#define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */
#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */
#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */
#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */
-#define X86_FEATURE_NPT (8*32+5) /* AMD Nested Page Table support */
-#define X86_FEATURE_LBRV (8*32+6) /* AMD LBR Virtualization support */
-#define X86_FEATURE_SVML (8*32+7) /* "svm_lock" AMD SVM locking MSR */
-#define X86_FEATURE_NRIPS (8*32+8) /* "nrip_save" AMD SVM next_rip save */
+#define X86_FEATURE_NPT (8*32+ 5) /* AMD Nested Page Table support */
+#define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */
+#define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */
+#define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */
+
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
+#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
@@ -194,7 +203,9 @@ extern const char * const x86_power_flags[32];
(((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \
(((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \
(((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
- (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ) \
+ (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \
+ (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \
+ (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) \
? 1 : \
test_cpu_cap(c, bit))
@@ -291,7 +302,7 @@ extern const char * const x86_power_flags[32];
* patch the target code for additional performance.
*
*/
-static __always_inline __pure bool __static_cpu_has(u8 bit)
+static __always_inline __pure bool __static_cpu_has(u16 bit)
{
#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
asm goto("1: jmp %l[t_no]\n"
@@ -300,11 +311,11 @@ static __always_inline __pure bool __static_cpu_has(u8 bit)
_ASM_ALIGN "\n"
_ASM_PTR "1b\n"
_ASM_PTR "0\n" /* no replacement */
- " .byte %P0\n" /* feature bit */
+ " .word %P0\n" /* feature bit */
" .byte 2b - 1b\n" /* source len */
" .byte 0\n" /* replacement len */
- " .byte 0xff + 0 - (2b-1b)\n" /* padding */
".previous\n"
+ /* skipping size check since replacement size = 0 */
: : "i" (bit) : : t_no);
return true;
t_no:
@@ -318,10 +329,12 @@ static __always_inline __pure bool __static_cpu_has(u8 bit)
_ASM_ALIGN "\n"
_ASM_PTR "1b\n"
_ASM_PTR "3f\n"
- " .byte %P1\n" /* feature bit */
+ " .word %P1\n" /* feature bit */
" .byte 2b - 1b\n" /* source len */
" .byte 4f - 3f\n" /* replacement len */
- " .byte 0xff + (4f-3f) - (2b-1b)\n" /* padding */
+ ".previous\n"
+ ".section .discard,\"aw\",@progbits\n"
+ " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
".previous\n"
".section .altinstr_replacement,\"ax\"\n"
"3: movb $1,%0\n"
@@ -337,7 +350,7 @@ static __always_inline __pure bool __static_cpu_has(u8 bit)
( \
__builtin_constant_p(boot_cpu_has(bit)) ? \
boot_cpu_has(bit) : \
- (__builtin_constant_p(bit) && !((bit) & ~0xff)) ? \
+ __builtin_constant_p(bit) ? \
__static_cpu_has(bit) : \
boot_cpu_has(bit) \
)
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index ac91eed21061..d4c419f883a0 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -54,7 +54,6 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-#define dma_is_consistent(d, h) (1)
extern int dma_supported(struct device *hwdev, u64 mask);
extern int dma_set_mask(struct device *dev, u64 mask);
@@ -87,13 +86,6 @@ dma_cache_sync(struct device *dev, void *vaddr, size_t size,
flush_write_buffers();
}
-static inline int dma_get_cache_alignment(void)
-{
- /* no easy way to get cache size on all x86, so return the
- * maximum possible, to be safe */
- return boot_cpu_data.x86_clflush_size;
-}
-
static inline unsigned long dma_alloc_coherent_mask(struct device *dev,
gfp_t gfp)
{
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index a726650fc80f..8caac76ac324 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -61,7 +61,7 @@ void *kmap(struct page *page);
void kunmap(struct page *page);
void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
void *kmap_atomic(struct page *page, enum km_type type);
-void kunmap_atomic(void *kvaddr, enum km_type type);
+void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type);
void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
struct page *kmap_atomic_to_page(void *ptr);
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
index 942255310e6a..528a11e8d3e3 100644
--- a/arch/x86/include/asm/hw_breakpoint.h
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -20,10 +20,10 @@ struct arch_hw_breakpoint {
#include <linux/list.h>
/* Available HW breakpoint length encodings */
+#define X86_BREAKPOINT_LEN_X 0x00
#define X86_BREAKPOINT_LEN_1 0x40
#define X86_BREAKPOINT_LEN_2 0x44
#define X86_BREAKPOINT_LEN_4 0x4c
-#define X86_BREAKPOINT_LEN_EXECUTE 0x40
#ifdef CONFIG_X86_64
#define X86_BREAKPOINT_LEN_8 0x48
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 70abda7058c8..ff2546ce7178 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -45,5 +45,6 @@ extern const struct hypervisor_x86 *x86_hyper;
/* Recognized hypervisors */
extern const struct hypervisor_x86 x86_hyper_vmware;
extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
+extern const struct hypervisor_x86 x86_hyper_xen_hvm;
#endif
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index c991b3a7b904..a73a8d5a5e69 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -31,7 +31,6 @@ extern void mxcsr_feature_mask_init(void);
extern int init_fpu(struct task_struct *child);
extern asmlinkage void math_state_restore(void);
extern void __math_state_restore(void);
-extern void init_thread_xstate(void);
extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
extern user_regset_active_fn fpregs_active, xfpregs_active;
@@ -58,11 +57,25 @@ extern int restore_i387_xstate_ia32(void __user *buf);
#define X87_FSW_ES (1 << 7) /* Exception Summary */
+static __always_inline __pure bool use_xsaveopt(void)
+{
+ return static_cpu_has(X86_FEATURE_XSAVEOPT);
+}
+
static __always_inline __pure bool use_xsave(void)
{
return static_cpu_has(X86_FEATURE_XSAVE);
}
+extern void __sanitize_i387_state(struct task_struct *);
+
+static inline void sanitize_i387_state(struct task_struct *tsk)
+{
+ if (!use_xsaveopt())
+ return;
+ __sanitize_i387_state(tsk);
+}
+
#ifdef CONFIG_X86_64
/* Ignore delayed exceptions from user space */
@@ -127,6 +140,15 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
{
int err;
+ /*
+ * Clear the bytes not touched by the fxsave and reserved
+ * for the SW usage.
+ */
+ err = __clear_user(&fx->sw_reserved,
+ sizeof(struct _fpx_sw_bytes));
+ if (unlikely(err))
+ return -EFAULT;
+
asm volatile("1: rex64/fxsave (%[fx])\n\t"
"2:\n"
".section .fixup,\"ax\"\n"
@@ -482,6 +504,8 @@ static inline void fpu_copy(struct fpu *dst, struct fpu *src)
memcpy(dst->state, src->state, xstate_size);
}
+extern void fpu_finit(struct fpu *fpu);
+
#endif /* __ASSEMBLY__ */
#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h
index 4470c9ad4a3e..29f66793cc55 100644
--- a/arch/x86/include/asm/intel_scu_ipc.h
+++ b/arch/x86/include/asm/intel_scu_ipc.h
@@ -1,6 +1,12 @@
#ifndef _ASM_X86_INTEL_SCU_IPC_H_
#define _ASM_X86_INTEL_SCU_IPC_H_
+#define IPCMSG_VRTC 0xFA /* Set vRTC device */
+
+/* Command id associated with message IPCMSG_VRTC */
+#define IPC_CMD_VRTC_SETTIME 1 /* Set time */
+#define IPC_CMD_VRTC_SETALARM 2 /* Set alarm */
+
/* Read single register */
int intel_scu_ipc_ioread8(u16 addr, u8 *data);
@@ -28,20 +34,6 @@ int intel_scu_ipc_writev(u16 *addr, u8 *data, int len);
/* Update single register based on the mask */
int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask);
-/*
- * Indirect register read
- * Can be used when SCCB(System Controller Configuration Block) register
- * HRIM(Honor Restricted IPC Messages) is set (bit 23)
- */
-int intel_scu_ipc_register_read(u32 addr, u32 *data);
-
-/*
- * Indirect register write
- * Can be used when SCCB(System Controller Configuration Block) register
- * HRIM(Honor Restricted IPC Messages) is set (bit 23)
- */
-int intel_scu_ipc_register_write(u32 addr, u32 data);
-
/* Issue commands to the SCU with or without data */
int intel_scu_ipc_simple_command(int cmd, int sub);
int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen,
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 8767d99c4f64..e2ca30092557 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -125,6 +125,9 @@
*/
#define MCE_SELF_VECTOR 0xeb
+/* Xen vector callback to receive events in a HVM domain */
+#define XEN_HVM_EVTCHN_CALLBACK 0xe9
+
#define NR_VECTORS 256
#define FPU_IRQ 13
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index fa7c0b974761..5bdfca86581b 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -33,5 +33,11 @@ extern void __show_regs(struct pt_regs *regs, int all);
extern void show_regs(struct pt_regs *regs);
extern unsigned long oops_begin(void);
extern void oops_end(unsigned long, struct pt_regs *, int signr);
+#ifdef CONFIG_KEXEC
+extern int in_crash_kexec;
+#else
+/* no crash dump is ever in progress if no crash kernel can be kexec'd */
+#define in_crash_kexec 0
+#endif
#endif /* _ASM_X86_KDEBUG_H */
diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h
index 006da3687cdc..396f5b5fc4d7 100644
--- a/arch/x86/include/asm/kgdb.h
+++ b/arch/x86/include/asm/kgdb.h
@@ -39,9 +39,11 @@ enum regnames {
GDB_FS, /* 14 */
GDB_GS, /* 15 */
};
+#define GDB_ORIG_AX 41
+#define DBG_MAX_REG_NUM 16
#define NUMREGBYTES ((GDB_GS+1)*4)
#else /* ! CONFIG_X86_32 */
-enum regnames64 {
+enum regnames {
GDB_AX, /* 0 */
GDB_BX, /* 1 */
GDB_CX, /* 2 */
@@ -59,15 +61,15 @@ enum regnames64 {
GDB_R14, /* 14 */
GDB_R15, /* 15 */
GDB_PC, /* 16 */
+ GDB_PS, /* 17 */
+ GDB_CS, /* 18 */
+ GDB_SS, /* 19 */
};
-
-enum regnames32 {
- GDB_PS = 34,
- GDB_CS,
- GDB_SS,
-};
-#define NUMREGBYTES ((GDB_SS+1)*4)
-#endif /* CONFIG_X86_32 */
+#define GDB_ORIG_AX 57
+#define DBG_MAX_REG_NUM 20
+/* 17 64 bit regs and 3 32 bit regs */
+#define NUMREGBYTES ((17 * 8) + (3 * 4))
+#endif /* ! CONFIG_X86_32 */
static inline void arch_kgdb_breakpoint(void)
{
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index ff90055c7f0b..4d8dcbdfc120 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -22,6 +22,8 @@
#define __KVM_HAVE_XEN_HVM
#define __KVM_HAVE_VCPU_EVENTS
#define __KVM_HAVE_DEBUGREGS
+#define __KVM_HAVE_XSAVE
+#define __KVM_HAVE_XCRS
/* Architectural interrupt line count. */
#define KVM_NR_INTERRUPTS 256
@@ -299,4 +301,24 @@ struct kvm_debugregs {
__u64 reserved[9];
};
+/* for KVM_CAP_XSAVE */
+struct kvm_xsave {
+ __u32 region[1024];
+};
+
+#define KVM_MAX_XCRS 16
+
+struct kvm_xcr {
+ __u32 xcr;
+ __u32 reserved;
+ __u64 value;
+};
+
+struct kvm_xcrs {
+ __u32 nr_xcrs;
+ __u32 flags;
+ struct kvm_xcr xcrs[KVM_MAX_XCRS];
+ __u64 padding[16];
+};
+
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0b2729bf2070..51cfd730ac5d 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -51,8 +51,10 @@ struct x86_emulate_ctxt;
#define X86EMUL_UNHANDLEABLE 1
/* Terminate emulation but return success to the caller. */
#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
-#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */
-#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */
+#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */
+#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */
+#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */
+
struct x86_emulate_ops {
/*
* read_std: Read bytes of standard (non-emulated/special) memory.
@@ -92,6 +94,7 @@ struct x86_emulate_ops {
int (*read_emulated)(unsigned long addr,
void *val,
unsigned int bytes,
+ unsigned int *error,
struct kvm_vcpu *vcpu);
/*
@@ -104,6 +107,7 @@ struct x86_emulate_ops {
int (*write_emulated)(unsigned long addr,
const void *val,
unsigned int bytes,
+ unsigned int *error,
struct kvm_vcpu *vcpu);
/*
@@ -118,6 +122,7 @@ struct x86_emulate_ops {
const void *old,
const void *new,
unsigned int bytes,
+ unsigned int *error,
struct kvm_vcpu *vcpu);
int (*pio_in_emulated)(int size, unsigned short port, void *val,
@@ -132,18 +137,26 @@ struct x86_emulate_ops {
int seg, struct kvm_vcpu *vcpu);
u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
+ unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu);
void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
- void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
+ int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
int (*cpl)(struct kvm_vcpu *vcpu);
- void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+ int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu);
+ int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu);
+ int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+ int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
};
/* Type, address-of, and value of an instruction's operand. */
struct operand {
enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
unsigned int bytes;
- unsigned long val, orig_val, *ptr;
+ unsigned long orig_val, *ptr;
+ union {
+ unsigned long val;
+ char valptr[sizeof(unsigned long) + 2];
+ };
};
struct fetch_cache {
@@ -186,6 +199,7 @@ struct decode_cache {
unsigned long modrm_val;
struct fetch_cache fetch;
struct read_cache io_read;
+ struct read_cache mem_read;
};
struct x86_emulate_ctxt {
@@ -202,6 +216,12 @@ struct x86_emulate_ctxt {
int interruptibility;
bool restart; /* restart string instruction after writeback */
+
+ int exception; /* exception that happens during emulation or -1 */
+ u32 error_code; /* error code for exception */
+ bool error_code_valid;
+ unsigned long cr2; /* faulted address in case of #PF */
+
/* decode cache */
struct decode_cache decode;
};
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 76f5483cffec..502e53f999cf 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -15,6 +15,7 @@
#include <linux/mm.h>
#include <linux/mmu_notifier.h>
#include <linux/tracepoint.h>
+#include <linux/cpumask.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -39,11 +40,14 @@
0xFFFFFF0000000000ULL)
#define INVALID_PAGE (~(hpa_t)0)
+#define VALID_PAGE(x) ((x) != INVALID_PAGE)
+
#define UNMAPPED_GVA (~(gpa_t)0)
/* KVM Hugepage definitions for x86 */
#define KVM_NR_PAGE_SIZES 3
-#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9))
+#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9)
+#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
@@ -69,8 +73,6 @@
#define IOPL_SHIFT 12
-#define KVM_ALIAS_SLOTS 4
-
#define KVM_PERMILLE_MMU_PAGES 20
#define KVM_MIN_ALLOC_MMU_PAGES 64
#define KVM_MMU_HASH_SHIFT 10
@@ -241,7 +243,7 @@ struct kvm_mmu {
void (*prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page);
int (*sync_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp);
+ struct kvm_mmu_page *sp, bool clear_unsync);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
hpa_t root_hpa;
int root_level;
@@ -301,8 +303,8 @@ struct kvm_vcpu_arch {
unsigned long mmu_seq;
} update_pte;
- struct i387_fxsave_struct host_fx_image;
- struct i387_fxsave_struct guest_fx_image;
+ struct fpu guest_fpu;
+ u64 xcr0;
gva_t mmio_fault_cr2;
struct kvm_pio_request pio;
@@ -360,26 +362,11 @@ struct kvm_vcpu_arch {
/* fields used by HYPER-V emulation */
u64 hv_vapic;
-};
-
-struct kvm_mem_alias {
- gfn_t base_gfn;
- unsigned long npages;
- gfn_t target_gfn;
-#define KVM_ALIAS_INVALID 1UL
- unsigned long flags;
-};
-#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION
-
-struct kvm_mem_aliases {
- struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
- int naliases;
+ cpumask_var_t wbinvd_dirty_mask;
};
struct kvm_arch {
- struct kvm_mem_aliases *aliases;
-
unsigned int n_free_mmu_pages;
unsigned int n_requested_mmu_pages;
unsigned int n_alloc_mmu_pages;
@@ -533,6 +520,8 @@ struct kvm_x86_ops {
void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
+ bool (*has_wbinvd_exit)(void);
+
const struct trace_print_flags *exit_reasons_str;
};
@@ -576,7 +565,6 @@ enum emulation_result {
#define EMULTYPE_SKIP (1 << 2)
int emulate_instruction(struct kvm_vcpu *vcpu,
unsigned long cr2, u16 error_code, int emulation_type);
-void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
@@ -591,10 +579,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
int emulate_clts(struct kvm_vcpu *vcpu);
-int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
- unsigned long *dest);
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
- unsigned long value);
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
@@ -602,15 +587,16 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
bool has_error_code, u32 error_code);
-void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
-void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
+int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
@@ -630,12 +616,7 @@ int kvm_pic_set_irq(void *opaque, int irq, int level);
void kvm_inject_nmi(struct kvm_vcpu *vcpu);
-void fx_init(struct kvm_vcpu *vcpu);
-
-int emulator_write_emulated(unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu);
+int fx_init(struct kvm_vcpu *vcpu);
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -664,8 +645,6 @@ void kvm_disable_tdp(void);
int complete_pio(struct kvm_vcpu *vcpu);
bool kvm_check_iopl(struct kvm_vcpu *vcpu);
-struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
-
static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
{
struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
@@ -719,21 +698,6 @@ static inline unsigned long read_msr(unsigned long msr)
}
#endif
-static inline void kvm_fx_save(struct i387_fxsave_struct *image)
-{
- asm("fxsave (%0)":: "r" (image));
-}
-
-static inline void kvm_fx_restore(struct i387_fxsave_struct *image)
-{
- asm("fxrstor (%0)":: "r" (image));
-}
-
-static inline void kvm_fx_finit(void)
-{
- asm("finit");
-}
-
static inline u32 get_rdx_init_val(void)
{
return 0x600; /* P6 family */
diff --git a/arch/x86/include/asm/local64.h b/arch/x86/include/asm/local64.h
new file mode 100644
index 000000000000..36c93b5cc239
--- /dev/null
+++ b/arch/x86/include/asm/local64.h
@@ -0,0 +1 @@
+#include <asm-generic/local64.h>
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index f32a4301c4d4..c62c13cb9788 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -38,6 +38,10 @@
#define MCM_ADDR_MEM 3 /* memory address */
#define MCM_ADDR_GENERIC 7 /* generic */
+/* CTL2 register defines */
+#define MCI_CTL2_CMCI_EN (1ULL << 30)
+#define MCI_CTL2_CMCI_THRESHOLD_MASK 0x7fffULL
+
#define MCJ_CTX_MASK 3
#define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK)
#define MCJ_CTX_RANDOM 0 /* inject context: random */
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
index 451d30e7f62d..16350740edf6 100644
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -13,6 +13,32 @@
extern int pci_mrst_init(void);
int __init sfi_parse_mrtc(struct sfi_table_header *table);
+/*
+ * Medfield is the follow-up of Moorestown, it combines two chip solution into
+ * one. Other than that it also added always-on and constant tsc and lapic
+ * timers. Medfield is the platform name, and the chip name is called Penwell
+ * we treat Medfield/Penwell as a variant of Moorestown. Penwell can be
+ * identified via MSRs.
+ */
+enum mrst_cpu_type {
+ MRST_CPU_CHIP_LINCROFT = 1,
+ MRST_CPU_CHIP_PENWELL,
+};
+
+extern enum mrst_cpu_type __mrst_cpu_chip;
+static enum mrst_cpu_type mrst_identify_cpu(void)
+{
+ return __mrst_cpu_chip;
+}
+
+enum mrst_timer_options {
+ MRST_TIMER_DEFAULT,
+ MRST_TIMER_APBT_ONLY,
+ MRST_TIMER_LAPIC_APBT,
+};
+
+extern enum mrst_timer_options mrst_timer_options;
+
#define SFI_MTMR_MAX_NUM 8
#define SFI_MRTC_MAX 8
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 8c7ae4318629..986f7790fdb2 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -20,6 +20,7 @@
#define _EFER_LMA 10 /* Long mode active (read-only) */
#define _EFER_NX 11 /* No execute enable */
#define _EFER_SVME 12 /* Enable virtualization */
+#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
#define EFER_SCE (1<<_EFER_SCE)
@@ -27,6 +28,7 @@
#define EFER_LMA (1<<_EFER_LMA)
#define EFER_NX (1<<_EFER_NX)
#define EFER_SVME (1<<_EFER_SVME)
+#define EFER_LMSLE (1<<_EFER_LMSLE)
#define EFER_FFXSR (1<<_EFER_FFXSR)
/* Intel MSRs. Some also available on other CPUs */
@@ -94,9 +96,6 @@
#define MSR_IA32_MC0_CTL2 0x00000280
#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x))
-#define CMCI_EN (1ULL << 30)
-#define CMCI_THRESHOLD_MASK 0xffffULL
-
#define MSR_P6_PERFCTR0 0x000000c1
#define MSR_P6_PERFCTR1 0x000000c2
#define MSR_P6_EVNTSEL0 0x00000186
@@ -159,8 +158,6 @@
#define MSR_K7_FID_VID_STATUS 0xc0010042
/* K6 MSRs */
-#define MSR_K6_EFER 0xc0000080
-#define MSR_K6_STAR 0xc0000081
#define MSR_K6_WHCR 0xc0000082
#define MSR_K6_UWCCR 0xc0000085
#define MSR_K6_EPMR 0xc0000086
@@ -224,12 +221,14 @@
#define MSR_IA32_THERM_CONTROL 0x0000019a
#define MSR_IA32_THERM_INTERRUPT 0x0000019b
-#define THERM_INT_LOW_ENABLE (1 << 0)
-#define THERM_INT_HIGH_ENABLE (1 << 1)
+#define THERM_INT_HIGH_ENABLE (1 << 0)
+#define THERM_INT_LOW_ENABLE (1 << 1)
+#define THERM_INT_PLN_ENABLE (1 << 24)
#define MSR_IA32_THERM_STATUS 0x0000019c
#define THERM_STATUS_PROCHOT (1 << 0)
+#define THERM_STATUS_POWER_LIMIT (1 << 10)
#define MSR_THERM2_CTL 0x0000019d
@@ -239,6 +238,19 @@
#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2
+#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0
+
+#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1
+
+#define PACKAGE_THERM_STATUS_PROCHOT (1 << 0)
+#define PACKAGE_THERM_STATUS_POWER_LIMIT (1 << 10)
+
+#define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x000001b2
+
+#define PACKAGE_THERM_INT_HIGH_ENABLE (1 << 0)
+#define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1)
+#define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24)
+
/* MISC_ENABLE bits: architectural */
#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0)
#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1)
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index c5bc4c2d33f5..084ef95274cd 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -148,8 +148,8 @@ static inline unsigned long long native_read_pmc(int counter)
#define rdmsr(msr, val1, val2) \
do { \
u64 __val = native_read_msr((msr)); \
- (val1) = (u32)__val; \
- (val2) = (u32)(__val >> 32); \
+ (void)((val1) = (u32)__val); \
+ (void)((val2) = (u32)(__val >> 32)); \
} while (0)
static inline void wrmsr(unsigned msr, unsigned low, unsigned high)
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 93da9c3f3341..932f0f86b4b7 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -17,7 +17,9 @@ int do_nmi_callback(struct pt_regs *regs, int cpu);
extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
extern int check_nmi_watchdog(void);
+#if !defined(CONFIG_LOCKUP_DETECTOR)
extern int nmi_watchdog_enabled;
+#endif
extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
extern int reserve_perfctr_nmi(unsigned int);
extern void release_perfctr_nmi(unsigned int);
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
new file mode 100644
index 000000000000..08fde475cb3b
--- /dev/null
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -0,0 +1,31 @@
+#ifndef _ASM_X86_OLPC_OFW_H
+#define _ASM_X86_OLPC_OFW_H
+
+/* index into the page table containing the entry OFW occupies */
+#define OLPC_OFW_PDE_NR 1022
+
+#define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */
+
+#ifdef CONFIG_OLPC_OPENFIRMWARE
+
+/* run an OFW command by calling into the firmware */
+#define olpc_ofw(name, args, res) \
+ __olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res)
+
+extern int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
+ void **res);
+
+/* determine whether OFW is available and lives in the proper memory */
+extern void olpc_ofw_detect(void);
+
+/* install OFW's pde permanently into the kernel's pgtable */
+extern void setup_olpc_ofw_pgd(void);
+
+#else /* !CONFIG_OLPC_OPENFIRMWARE */
+
+static inline void olpc_ofw_detect(void) { }
+static inline void setup_olpc_ofw_pgd(void) { }
+
+#endif /* !CONFIG_OLPC_OPENFIRMWARE */
+
+#endif /* _ASM_X86_OLPC_OFW_H */
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 625c3f0e741a..8ca82839288a 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -37,6 +37,13 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
#define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x))
/* __pa_symbol should be used for C visible symbols.
This seems to be the official gcc blessed way to do such arithmetic. */
+/*
+ * We need __phys_reloc_hide() here because gcc may assume that there is no
+ * overflow during __pa() calculation and can optimize it unexpectedly.
+ * Newer versions of gcc provide -fno-strict-overflow switch to handle this
+ * case properly. Once all supported versions of gcc understand it, we can
+ * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated)
+ */
#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x)))
#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index cd2a31dc5fb8..49c7219826f9 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -30,6 +30,7 @@
#define PCI_HAS_IO_ECS 0x40000
#define PCI_NOASSIGN_ROMS 0x80000
#define PCI_ROOT_NO_CRS 0x100000
+#define PCI_NOASSIGN_BARS 0x200000
extern unsigned int pci_probe;
extern unsigned long pirq_table_addr;
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 254883d0c7e0..6e742cc4251b 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -68,8 +68,9 @@ union cpuid10_eax {
union cpuid10_edx {
struct {
- unsigned int num_counters_fixed:4;
- unsigned int reserved:28;
+ unsigned int num_counters_fixed:5;
+ unsigned int bit_width_fixed:8;
+ unsigned int reserved:19;
} split;
unsigned int full;
};
@@ -140,6 +141,19 @@ extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
extern unsigned long perf_misc_flags(struct pt_regs *regs);
#define perf_misc_flags(regs) perf_misc_flags(regs)
+#include <asm/stacktrace.h>
+
+/*
+ * We abuse bit 3 from flags to pass exact information, see perf_misc_flags
+ * and the comment with PERF_EFLAGS_EXACT.
+ */
+#define perf_arch_fetch_caller_regs(regs, __ip) { \
+ (regs)->ip = (__ip); \
+ (regs)->bp = caller_frame_pointer(); \
+ (regs)->cs = __KERNEL_CS; \
+ regs->flags = 0; \
+}
+
#else
static inline void init_hw_perf_events(void) { }
static inline void perf_events_lapic_init(void) { }
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 64a8ebff06fc..def500776b16 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -19,7 +19,6 @@
#define ARCH_P4_RESERVED_ESCR (2) /* IQ_ESCR(0,1) not always present */
#define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
#define ARCH_P4_MAX_CCCR (18)
-#define ARCH_P4_MAX_COUNTER (ARCH_P4_MAX_CCCR / 2)
#define P4_ESCR_EVENT_MASK 0x7e000000U
#define P4_ESCR_EVENT_SHIFT 25
@@ -71,10 +70,6 @@
#define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT)
#define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT)
-/* Custom bits in reerved CCCR area */
-#define P4_CCCR_CACHE_OPS_MASK 0x0000003fU
-
-
/* Non HT mask */
#define P4_CCCR_MASK \
(P4_CCCR_OVF | \
@@ -106,8 +101,7 @@
* ESCR and CCCR but rather an only packed value should
* be unpacked and written to a proper addresses
*
- * the base idea is to pack as much info as
- * possible
+ * the base idea is to pack as much info as possible
*/
#define p4_config_pack_escr(v) (((u64)(v)) << 32)
#define p4_config_pack_cccr(v) (((u64)(v)) & 0xffffffffULL)
@@ -130,8 +124,6 @@
t; \
})
-#define p4_config_unpack_cache_event(v) (((u64)(v)) & P4_CCCR_CACHE_OPS_MASK)
-
#define P4_CONFIG_HT_SHIFT 63
#define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT)
@@ -214,6 +206,12 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
return escr;
}
+/*
+ * This are the events which should be used in "Event Select"
+ * field of ESCR register, they are like unique keys which allow
+ * the kernel to determinate which CCCR and COUNTER should be
+ * used to track an event
+ */
enum P4_EVENTS {
P4_EVENT_TC_DELIVER_MODE,
P4_EVENT_BPU_FETCH_REQUEST,
@@ -561,7 +559,7 @@ enum P4_EVENT_OPCODES {
* a caller should use P4_ESCR_EMASK_NAME helper to
* pick the EventMask needed, for example
*
- * P4_ESCR_EMASK_NAME(P4_EVENT_TC_DELIVER_MODE, DD)
+ * P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD)
*/
enum P4_ESCR_EMASKS {
P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DD, 0),
@@ -753,43 +751,50 @@ enum P4_ESCR_EMASKS {
P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, BOGUS, 1),
};
-/* P4 PEBS: stale for a while */
-#define P4_PEBS_METRIC_MASK 0x00001fffU
-#define P4_PEBS_UOB_TAG 0x01000000U
-#define P4_PEBS_ENABLE 0x02000000U
-
-/* Replay metrics for MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT */
-#define P4_PEBS__1stl_cache_load_miss_retired 0x3000001
-#define P4_PEBS__2ndl_cache_load_miss_retired 0x3000002
-#define P4_PEBS__dtlb_load_miss_retired 0x3000004
-#define P4_PEBS__dtlb_store_miss_retired 0x3000004
-#define P4_PEBS__dtlb_all_miss_retired 0x3000004
-#define P4_PEBS__tagged_mispred_branch 0x3018000
-#define P4_PEBS__mob_load_replay_retired 0x3000200
-#define P4_PEBS__split_load_retired 0x3000400
-#define P4_PEBS__split_store_retired 0x3000400
-
-#define P4_VERT__1stl_cache_load_miss_retired 0x0000001
-#define P4_VERT__2ndl_cache_load_miss_retired 0x0000001
-#define P4_VERT__dtlb_load_miss_retired 0x0000001
-#define P4_VERT__dtlb_store_miss_retired 0x0000002
-#define P4_VERT__dtlb_all_miss_retired 0x0000003
-#define P4_VERT__tagged_mispred_branch 0x0000010
-#define P4_VERT__mob_load_replay_retired 0x0000001
-#define P4_VERT__split_load_retired 0x0000001
-#define P4_VERT__split_store_retired 0x0000002
-
-enum P4_CACHE_EVENTS {
- P4_CACHE__NONE,
-
- P4_CACHE__1stl_cache_load_miss_retired,
- P4_CACHE__2ndl_cache_load_miss_retired,
- P4_CACHE__dtlb_load_miss_retired,
- P4_CACHE__dtlb_store_miss_retired,
- P4_CACHE__itlb_reference_hit,
- P4_CACHE__itlb_reference_miss,
-
- P4_CACHE__MAX
+/*
+ * P4 PEBS specifics (Replay Event only)
+ *
+ * Format (bits):
+ * 0-6: metric from P4_PEBS_METRIC enum
+ * 7 : reserved
+ * 8 : reserved
+ * 9-11 : reserved
+ *
+ * Note we have UOP and PEBS bits reserved for now
+ * just in case if we will need them once
+ */
+#define P4_PEBS_CONFIG_ENABLE (1 << 7)
+#define P4_PEBS_CONFIG_UOP_TAG (1 << 8)
+#define P4_PEBS_CONFIG_METRIC_MASK 0x3f
+#define P4_PEBS_CONFIG_MASK 0xff
+
+/*
+ * mem: Only counters MSR_IQ_COUNTER4 (16) and
+ * MSR_IQ_COUNTER5 (17) are allowed for PEBS sampling
+ */
+#define P4_PEBS_ENABLE 0x02000000U
+#define P4_PEBS_ENABLE_UOP_TAG 0x01000000U
+
+#define p4_config_unpack_metric(v) (((u64)(v)) & P4_PEBS_CONFIG_METRIC_MASK)
+#define p4_config_unpack_pebs(v) (((u64)(v)) & P4_PEBS_CONFIG_MASK)
+
+#define p4_config_pebs_has(v, mask) (p4_config_unpack_pebs(v) & (mask))
+
+enum P4_PEBS_METRIC {
+ P4_PEBS_METRIC__none,
+
+ P4_PEBS_METRIC__1stl_cache_load_miss_retired,
+ P4_PEBS_METRIC__2ndl_cache_load_miss_retired,
+ P4_PEBS_METRIC__dtlb_load_miss_retired,
+ P4_PEBS_METRIC__dtlb_store_miss_retired,
+ P4_PEBS_METRIC__dtlb_all_miss_retired,
+ P4_PEBS_METRIC__tagged_mispred_branch,
+ P4_PEBS_METRIC__mob_load_replay_retired,
+ P4_PEBS_METRIC__split_load_retired,
+ P4_PEBS_METRIC__split_store_retired,
+
+ P4_PEBS_METRIC__max
};
#endif /* PERF_EVENT_P4_H */
+
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 181be528c612..076052cd62be 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -126,8 +126,8 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
/* x86-64 always has all page tables mapped. */
#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
-#define pte_unmap(pte) /* NOP */
-#define pte_unmap_nested(pte) /* NOP */
+#define pte_unmap(pte) ((void)(pte))/* NOP */
+#define pte_unmap_nested(pte) ((void)(pte)) /* NOP */
#define update_mmu_cache(vma, address, ptep) do { } while (0)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7e5c6a60b8ee..325b7bdbebaa 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -762,6 +762,7 @@ extern void init_c1e_mask(void);
extern unsigned long boot_option_idle_override;
extern unsigned long idle_halt;
extern unsigned long idle_nomwait;
+extern bool c1e_detected;
/*
* on systems with caches, caches must be flashed as the absolute
@@ -1025,4 +1026,24 @@ unsigned long calc_aperfmperf_ratio(struct aperfmperf *old,
return ratio;
}
+/*
+ * AMD errata checking
+ */
+#ifdef CONFIG_CPU_SUP_AMD
+extern const int amd_erratum_383[];
+extern const int amd_erratum_400[];
+extern bool cpu_has_amd_erratum(const int *);
+
+#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 }
+#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 }
+#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
+ ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end))
+#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff)
+#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff)
+#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff)
+
+#else
+#define cpu_has_amd_erratum(x) (false)
+#endif /* CONFIG_CPU_SUP_AMD */
+
#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index 64cf2d24fad1..6c7fc25f2c34 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -84,5 +84,7 @@
#define REQUIRED_MASK5 0
#define REQUIRED_MASK6 0
#define REQUIRED_MASK7 0
+#define REQUIRED_MASK8 0
+#define REQUIRED_MASK9 0
#endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index 606ede126972..d1e41b0f9b60 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -118,7 +118,7 @@ static inline void __down_read(struct rw_semaphore *sem)
{
asm volatile("# beginning down_read\n\t"
LOCK_PREFIX _ASM_INC "(%1)\n\t"
- /* adds 0x00000001, returns the old value */
+ /* adds 0x00000001 */
" jns 1f\n"
" call call_rwsem_down_read_failed\n"
"1:\n\t"
@@ -156,11 +156,9 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
{
rwsem_count_t tmp;
-
- tmp = RWSEM_ACTIVE_WRITE_BIAS;
asm volatile("# beginning down_write\n\t"
LOCK_PREFIX " xadd %1,(%2)\n\t"
- /* subtract 0x0000ffff, returns the old value */
+ /* adds 0xffff0001, returns the old value */
" test %1,%1\n\t"
/* was the count 0 before? */
" jz 1f\n"
@@ -168,7 +166,7 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
"1:\n"
"# ending down_write"
: "+m" (sem->count), "=d" (tmp)
- : "a" (sem), "1" (tmp)
+ : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS)
: "memory", "cc");
}
@@ -195,16 +193,16 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
*/
static inline void __up_read(struct rw_semaphore *sem)
{
- rwsem_count_t tmp = -RWSEM_ACTIVE_READ_BIAS;
+ rwsem_count_t tmp;
asm volatile("# beginning __up_read\n\t"
LOCK_PREFIX " xadd %1,(%2)\n\t"
/* subtracts 1, returns the old value */
" jns 1f\n\t"
- " call call_rwsem_wake\n"
+ " call call_rwsem_wake\n" /* expects old value in %edx */
"1:\n"
"# ending __up_read\n"
: "+m" (sem->count), "=d" (tmp)
- : "a" (sem), "1" (tmp)
+ : "a" (sem), "1" (-RWSEM_ACTIVE_READ_BIAS)
: "memory", "cc");
}
@@ -216,10 +214,9 @@ static inline void __up_write(struct rw_semaphore *sem)
rwsem_count_t tmp;
asm volatile("# beginning __up_write\n\t"
LOCK_PREFIX " xadd %1,(%2)\n\t"
- /* tries to transition
- 0xffff0001 -> 0x00000000 */
- " jz 1f\n"
- " call call_rwsem_wake\n"
+ /* subtracts 0xffff0001, returns the old value */
+ " jns 1f\n\t"
+ " call call_rwsem_wake\n" /* expects old value in %edx */
"1:\n\t"
"# ending __up_write\n"
: "+m" (sem->count), "=d" (tmp)
diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h
index fb0b1874396f..4240878b9d76 100644
--- a/arch/x86/include/asm/scatterlist.h
+++ b/arch/x86/include/asm/scatterlist.h
@@ -3,7 +3,6 @@
#include <asm-generic/scatterlist.h>
-#define ISA_DMA_THRESHOLD (0x00ffffff)
#define ARCH_HAS_SG_CHAIN
#endif /* _ASM_X86_SCATTERLIST_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 86b1506f4179..ef292c792d74 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -82,7 +82,7 @@ void *extend_brk(size_t size, size_t align);
* executable.)
*/
#define RESERVE_BRK(name,sz) \
- static void __section(.discard) __used \
+ static void __section(.discard.text) __used \
__brk_reservation_fn_##name##__(void) { \
asm volatile ( \
".pushsection .brk_reservation,\"aw\",@nobits;" \
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 4dab78edbad9..2b16a2ad23dc 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -1,6 +1,13 @@
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+
#ifndef _ASM_X86_STACKTRACE_H
#define _ASM_X86_STACKTRACE_H
+#include <linux/uaccess.h>
+
extern int kstack_depth_to_print;
struct thread_info;
@@ -42,4 +49,46 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
const struct stacktrace_ops *ops, void *data);
+#ifdef CONFIG_X86_32
+#define STACKSLOTS_PER_LINE 8
+#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
+#else
+#define STACKSLOTS_PER_LINE 4
+#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
+#endif
+
+extern void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp, char *log_lvl);
+
+extern void
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *sp, unsigned long bp, char *log_lvl);
+
+extern unsigned int code_bytes;
+
+/* The form of the top of the frame on the stack */
+struct stack_frame {
+ struct stack_frame *next_frame;
+ unsigned long return_address;
+};
+
+struct stack_frame_ia32 {
+ u32 next_frame;
+ u32 return_address;
+};
+
+static inline unsigned long caller_frame_pointer(void)
+{
+ struct stack_frame *frame;
+
+ get_bp(frame);
+
+#ifdef CONFIG_FRAME_POINTER
+ frame = frame->next_frame;
+#endif
+
+ return (unsigned long)frame;
+}
+
#endif /* _ASM_X86_STACKTRACE_H */
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index 3ad421784ae7..cb238526a9f1 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -18,13 +18,13 @@
#include <asm/ia32.h>
/* ia32/sys_ia32.c */
-asmlinkage long sys32_truncate64(char __user *, unsigned long, unsigned long);
+asmlinkage long sys32_truncate64(const char __user *, unsigned long, unsigned long);
asmlinkage long sys32_ftruncate64(unsigned int, unsigned long, unsigned long);
-asmlinkage long sys32_stat64(char __user *, struct stat64 __user *);
-asmlinkage long sys32_lstat64(char __user *, struct stat64 __user *);
+asmlinkage long sys32_stat64(const char __user *, struct stat64 __user *);
+asmlinkage long sys32_lstat64(const char __user *, struct stat64 __user *);
asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *);
-asmlinkage long sys32_fstatat(unsigned int, char __user *,
+asmlinkage long sys32_fstatat(unsigned int, const char __user *,
struct stat64 __user *, int);
struct mmap_arg_struct32;
asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *);
@@ -49,12 +49,12 @@ asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t);
asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *);
asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32);
-asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32);
+asmlinkage long sys32_pwrite(unsigned int, const char __user *, u32, u32, u32);
asmlinkage long sys32_personality(unsigned long);
asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32);
-asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *,
+asmlinkage long sys32_execve(const char __user *, compat_uptr_t __user *,
compat_uptr_t __user *, struct pt_regs *);
asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *);
@@ -80,4 +80,7 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *);
/* ia32/ipc32.c */
asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32);
+
+asmlinkage long sys32_fanotify_mark(int, unsigned int, u32, u32, int,
+ const char __user *);
#endif /* _ASM_X86_SYS_IA32_H */
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 5c044b43e9a7..feb2ff9bfc2d 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -23,7 +23,7 @@ long sys_iopl(unsigned int, struct pt_regs *);
/* kernel/process.c */
int sys_fork(struct pt_regs *);
int sys_vfork(struct pt_regs *);
-long sys_execve(char __user *, char __user * __user *,
+long sys_execve(const char __user *, char __user * __user *,
char __user * __user *, struct pt_regs *);
long sys_clone(unsigned long, unsigned long, void __user *,
void __user *, struct pt_regs *);
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index e7f4d33c55ed..33ecc3ea8782 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -457,4 +457,11 @@ static __always_inline void rdtsc_barrier(void)
alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
}
+/*
+ * We handle most unaligned accesses in hardware. On the other hand
+ * unaligned DMA can be quite expensive on some Nehalem processors.
+ *
+ * Based on this we disable the IP header alignment in network drivers.
+ */
+#define NET_IP_ALIGN 0
#endif /* _ASM_X86_SYSTEM_H */
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index beb9b5f8f8a4..b766a5e8ba0e 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -343,10 +343,13 @@
#define __NR_rt_tgsigqueueinfo 335
#define __NR_perf_event_open 336
#define __NR_recvmmsg 337
+#define __NR_fanotify_init 338
+#define __NR_fanotify_mark 339
+#define __NR_prlimit64 340
#ifdef __KERNEL__
-#define NR_syscalls 338
+#define NR_syscalls 341
#define __ARCH_WANT_IPC_PARSE_VERSION
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index ff4307b0e81e..363e9b8a715b 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -663,6 +663,12 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
#define __NR_recvmmsg 299
__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
+#define __NR_fanotify_init 300
+__SYSCALL(__NR_fanotify_init, sys_fanotify_init)
+#define __NR_fanotify_mark 301
+__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
+#define __NR_prlimit64 302
+__SYSCALL(__NR_prlimit64, sys_prlimit64)
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index aa558ac0306e..42d412fd8b02 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -34,6 +34,7 @@
*/
#define UV_ITEMS_PER_DESCRIPTOR 8
+/* the 'throttle' to prevent the hardware stay-busy bug */
#define MAX_BAU_CONCURRENT 3
#define UV_CPUS_PER_ACT_STATUS 32
#define UV_ACT_STATUS_MASK 0x3
@@ -45,10 +46,26 @@
#define UV_DESC_BASE_PNODE_SHIFT 49
#define UV_PAYLOADQ_PNODE_SHIFT 49
#define UV_PTC_BASENAME "sgi_uv/ptc_statistics"
+#define UV_BAU_BASENAME "sgi_uv/bau_tunables"
+#define UV_BAU_TUNABLES_DIR "sgi_uv"
+#define UV_BAU_TUNABLES_FILE "bau_tunables"
+#define WHITESPACE " \t\n"
#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask))
#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15
#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x0000000009UL
+/* [19:16] SOFT_ACK timeout period 19: 1 is urgency 7 17:16 1 is multiplier */
+#define BAU_MISC_CONTROL_MULT_MASK 3
+
+#define UVH_AGING_PRESCALE_SEL 0x000000b000UL
+/* [30:28] URGENCY_7 an index into a table of times */
+#define BAU_URGENCY_7_SHIFT 28
+#define BAU_URGENCY_7_MASK 7
+
+#define UVH_TRANSACTION_TIMEOUT 0x000000b200UL
+/* [45:40] BAU - BAU transaction timeout select - a multiplier */
+#define BAU_TRANS_SHIFT 40
+#define BAU_TRANS_MASK 0x3f
/*
* bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1
@@ -59,24 +76,21 @@
#define DESC_STATUS_SOURCE_TIMEOUT 3
/*
- * source side threshholds at which message retries print a warning
- */
-#define SOURCE_TIMEOUT_LIMIT 20
-#define DESTINATION_TIMEOUT_LIMIT 20
-
-/*
- * misc. delays, in microseconds
+ * delay for 'plugged' timeout retries, in microseconds
*/
-#define THROTTLE_DELAY 10
-#define TIMEOUT_DELAY 10
-#define BIOS_TO 1000
-/* BIOS is assumed to set the destination timeout to 1003520 nanoseconds */
+#define PLUGGED_DELAY 10
/*
* threshholds at which to use IPI to free resources
*/
+/* after this # consecutive 'plugged' timeouts, use IPI to release resources */
#define PLUGSB4RESET 100
-#define TIMEOUTSB4RESET 100
+/* after this many consecutive timeouts, use IPI to release resources */
+#define TIMEOUTSB4RESET 1
+/* at this number uses of IPI to release resources, giveup the request */
+#define IPI_RESET_LIMIT 1
+/* after this # consecutive successes, bump up the throttle if it was lowered */
+#define COMPLETE_THRESHOLD 5
/*
* number of entries in the destination side payload queue
@@ -96,6 +110,13 @@
#define FLUSH_COMPLETE 4
/*
+ * tuning the action when the numalink network is extremely delayed
+ */
+#define CONGESTED_RESPONSE_US 1000 /* 'long' response time, in microseconds */
+#define CONGESTED_REPS 10 /* long delays averaged over this many broadcasts */
+#define CONGESTED_PERIOD 30 /* time for the bau to be disabled, in seconds */
+
+/*
* Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
* If the 'multilevel' flag in the header portion of the descriptor
* has been set to 0, then endpoint multi-unicast mode is selected.
@@ -300,37 +321,16 @@ struct bau_payload_queue_entry {
/* bytes 24-31 */
};
-/*
- * one per-cpu; to locate the software tables
- */
-struct bau_control {
- struct bau_desc *descriptor_base;
+struct msg_desc {
+ struct bau_payload_queue_entry *msg;
+ int msg_slot;
+ int sw_ack_slot;
struct bau_payload_queue_entry *va_queue_first;
struct bau_payload_queue_entry *va_queue_last;
- struct bau_payload_queue_entry *bau_msg_head;
- struct bau_control *uvhub_master;
- struct bau_control *socket_master;
- unsigned long timeout_interval;
- atomic_t active_descriptor_count;
- int max_concurrent;
- int max_concurrent_constant;
- int retry_message_scans;
- int plugged_tries;
- int timeout_tries;
- int ipi_attempts;
- int conseccompletes;
- short cpu;
- short uvhub_cpu;
- short uvhub;
- short cpus_in_socket;
- short cpus_in_uvhub;
- unsigned short message_number;
- unsigned short uvhub_quiesce;
- short socket_acknowledge_count[DEST_Q_SIZE];
- cycles_t send_message;
- spinlock_t masks_lock;
- spinlock_t uvhub_lock;
- spinlock_t queue_lock;
+};
+
+struct reset_args {
+ int sender;
};
/*
@@ -344,18 +344,25 @@ struct ptc_stats {
unsigned long s_dtimeout; /* destination side timeouts */
unsigned long s_time; /* time spent in sending side */
unsigned long s_retriesok; /* successful retries */
- unsigned long s_ntargcpu; /* number of cpus targeted */
- unsigned long s_ntarguvhub; /* number of uvhubs targeted */
- unsigned long s_ntarguvhub16; /* number of times >= 16 target hubs */
- unsigned long s_ntarguvhub8; /* number of times >= 8 target hubs */
- unsigned long s_ntarguvhub4; /* number of times >= 4 target hubs */
- unsigned long s_ntarguvhub2; /* number of times >= 2 target hubs */
- unsigned long s_ntarguvhub1; /* number of times == 1 target hub */
+ unsigned long s_ntargcpu; /* total number of cpu's targeted */
+ unsigned long s_ntargself; /* times the sending cpu was targeted */
+ unsigned long s_ntarglocals; /* targets of cpus on the local blade */
+ unsigned long s_ntargremotes; /* targets of cpus on remote blades */
+ unsigned long s_ntarglocaluvhub; /* targets of the local hub */
+ unsigned long s_ntargremoteuvhub; /* remotes hubs targeted */
+ unsigned long s_ntarguvhub; /* total number of uvhubs targeted */
+ unsigned long s_ntarguvhub16; /* number of times target hubs >= 16*/
+ unsigned long s_ntarguvhub8; /* number of times target hubs >= 8 */
+ unsigned long s_ntarguvhub4; /* number of times target hubs >= 4 */
+ unsigned long s_ntarguvhub2; /* number of times target hubs >= 2 */
+ unsigned long s_ntarguvhub1; /* number of times target hubs == 1 */
unsigned long s_resets_plug; /* ipi-style resets from plug state */
unsigned long s_resets_timeout; /* ipi-style resets from timeouts */
unsigned long s_busy; /* status stayed busy past s/w timer */
unsigned long s_throttles; /* waits in throttle */
unsigned long s_retry_messages; /* retry broadcasts */
+ unsigned long s_bau_reenabled; /* for bau enable/disable */
+ unsigned long s_bau_disabled; /* for bau enable/disable */
/* destination statistics */
unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */
unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */
@@ -370,6 +377,52 @@ struct ptc_stats {
unsigned long d_rcanceled; /* number of messages canceled by resets */
};
+/*
+ * one per-cpu; to locate the software tables
+ */
+struct bau_control {
+ struct bau_desc *descriptor_base;
+ struct bau_payload_queue_entry *va_queue_first;
+ struct bau_payload_queue_entry *va_queue_last;
+ struct bau_payload_queue_entry *bau_msg_head;
+ struct bau_control *uvhub_master;
+ struct bau_control *socket_master;
+ struct ptc_stats *statp;
+ unsigned long timeout_interval;
+ unsigned long set_bau_on_time;
+ atomic_t active_descriptor_count;
+ int plugged_tries;
+ int timeout_tries;
+ int ipi_attempts;
+ int conseccompletes;
+ int baudisabled;
+ int set_bau_off;
+ short cpu;
+ short uvhub_cpu;
+ short uvhub;
+ short cpus_in_socket;
+ short cpus_in_uvhub;
+ unsigned short message_number;
+ unsigned short uvhub_quiesce;
+ short socket_acknowledge_count[DEST_Q_SIZE];
+ cycles_t send_message;
+ spinlock_t uvhub_lock;
+ spinlock_t queue_lock;
+ /* tunables */
+ int max_bau_concurrent;
+ int max_bau_concurrent_constant;
+ int plugged_delay;
+ int plugsb4reset;
+ int timeoutsb4reset;
+ int ipi_reset_limit;
+ int complete_threshold;
+ int congested_response_us;
+ int congested_reps;
+ int congested_period;
+ cycles_t period_time;
+ long period_requests;
+};
+
static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp)
{
return constant_test_bit(uvhub, &dstp->bits[0]);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 9e6779f7cf2d..9f0cbd987d50 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -257,6 +257,7 @@ enum vmcs_field {
#define EXIT_REASON_IO_INSTRUCTION 30
#define EXIT_REASON_MSR_READ 31
#define EXIT_REASON_MSR_WRITE 32
+#define EXIT_REASON_INVALID_STATE 33
#define EXIT_REASON_MWAIT_INSTRUCTION 36
#define EXIT_REASON_MONITOR_INSTRUCTION 39
#define EXIT_REASON_PAUSE_INSTRUCTION 40
@@ -266,6 +267,7 @@ enum vmcs_field {
#define EXIT_REASON_EPT_VIOLATION 48
#define EXIT_REASON_EPT_MISCONFIG 49
#define EXIT_REASON_WBINVD 54
+#define EXIT_REASON_XSETBV 55
/*
* Interruption-information format
@@ -375,6 +377,9 @@ enum vmcs_field {
#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
+#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */
+#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */
+
#define VMX_EPT_DEFAULT_GAW 3
#define VMX_EPT_MAX_GAW 0x4
#define VMX_EPT_MT_EPTE_SHIFT 3
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 9c371e4a9fa6..7fda040a76cd 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -417,6 +417,12 @@ HYPERVISOR_nmi_op(unsigned long op, unsigned long arg)
return _hypercall2(int, nmi_op, op, arg);
}
+static inline unsigned long __must_check
+HYPERVISOR_hvm_op(int op, void *arg)
+{
+ return _hypercall2(unsigned long, hvm_op, op, arg);
+}
+
static inline void
MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
{
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 018a0a400799..bf5f7d32bd08 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -112,13 +112,9 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)
*/
static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
{
- extern unsigned long max_mapnr;
unsigned long pfn = mfn_to_pfn(mfn);
- if ((pfn < max_mapnr)
- && !xen_feature(XENFEAT_auto_translated_physmap)
- && (get_phys_to_machine(pfn) != mfn))
- return max_mapnr; /* force !pfn_valid() */
- /* XXX fixme; not true with sparsemem */
+ if (get_phys_to_machine(pfn) != mfn)
+ return -1; /* force !pfn_valid() */
return pfn;
}
diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h
new file mode 100644
index 000000000000..1be1ab7d6a41
--- /dev/null
+++ b/arch/x86/include/asm/xen/swiotlb-xen.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_X86_SWIOTLB_XEN_H
+#define _ASM_X86_SWIOTLB_XEN_H
+
+#ifdef CONFIG_SWIOTLB_XEN
+extern int xen_swiotlb;
+extern int __init pci_xen_swiotlb_detect(void);
+extern void __init pci_xen_swiotlb_init(void);
+#else
+#define xen_swiotlb (0)
+static inline int __init pci_xen_swiotlb_detect(void) { return 0; }
+static inline void __init pci_xen_swiotlb_init(void) { }
+#endif
+
+#endif /* _ASM_X86_SWIOTLB_XEN_H */
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 2c4390cae228..c6ce2452f10c 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -3,7 +3,8 @@
#include <linux/types.h>
#include <asm/processor.h>
-#include <asm/i387.h>
+
+#define XSTATE_CPUID 0x0000000d
#define XSTATE_FP 0x1
#define XSTATE_SSE 0x2
@@ -13,6 +14,12 @@
#define FXSAVE_SIZE 512
+#define XSAVE_HDR_SIZE 64
+#define XSAVE_HDR_OFFSET FXSAVE_SIZE
+
+#define XSAVE_YMM_SIZE 256
+#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
+
/*
* These are the features that the OS can handle currently.
*/
@@ -26,10 +33,8 @@
extern unsigned int xstate_size;
extern u64 pcntxt_mask;
-extern struct xsave_struct *init_xstate_buf;
extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
-extern void xsave_cntxt_init(void);
extern void xsave_init(void);
extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
extern int init_fpu(struct task_struct *child);
@@ -59,6 +64,16 @@ static inline int fpu_xrstor_checking(struct fpu *fpu)
static inline int xsave_user(struct xsave_struct __user *buf)
{
int err;
+
+ /*
+ * Clear the xsave header first, so that reserved fields are
+ * initialized to zero.
+ */
+ err = __clear_user(&buf->xsave_hdr,
+ sizeof(struct xsave_hdr_struct));
+ if (unlikely(err))
+ return -EFAULT;
+
__asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x27\n"
"2:\n"
".section .fixup,\"ax\"\n"
@@ -111,12 +126,25 @@ static inline void xrstor_state(struct xsave_struct *fx, u64 mask)
: "memory");
}
+static inline void xsave_state(struct xsave_struct *fx, u64 mask)
+{
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+
+ asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x27\n\t"
+ : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+}
+
static inline void fpu_xsave(struct fpu *fpu)
{
/* This, however, we can work around by forcing the compiler to select
an addressing mode that doesn't require extended registers. */
- __asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27"
- : : "D" (&(fpu->state->xsave)),
- "a" (-1), "d"(-1) : "memory");
+ alternative_input(
+ ".byte " REX_PREFIX "0x0f,0xae,0x27",
+ ".byte " REX_PREFIX "0x0f,0xae,0x37",
+ X86_FEATURE_XSAVEOPT,
+ [fx] "D" (&fpu->state->xsave), "a" (-1), "d" (-1) :
+ "memory");
}
#endif
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index e77b22083721..0925676266bd 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -104,6 +104,7 @@ obj-$(CONFIG_SCx200) += scx200.o
scx200-y += scx200_32.o
obj-$(CONFIG_OLPC) += olpc.o
+obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o
obj-$(CONFIG_X86_MRST) += mrst.o
microcode-y := microcode_core.o
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index 580b4e296010..28595d6df47c 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -104,7 +104,7 @@ _start:
movl %eax, %ecx
orl %edx, %ecx
jz 1f
- movl $0xc0000080, %ecx
+ movl $MSR_EFER, %ecx
wrmsr
1:
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index fcc3c61fdecc..33cec152070d 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -2,7 +2,7 @@
* sleep.c - x86-specific ACPI sleep support.
*
* Copyright (C) 2001-2003 Patrick Mochel
- * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2001-2003 Pavel Machek <pavel@ucw.cz>
*/
#include <linux/acpi.h>
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 70237732a6c7..f65ab8b014c4 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -214,6 +214,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
u8 *instr = a->instr;
BUG_ON(a->replacementlen > a->instrlen);
BUG_ON(a->instrlen > sizeof(insnbuf));
+ BUG_ON(a->cpuid >= NCAPINTS*32);
if (!boot_cpu_has(a->cpuid))
continue;
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 0d20286d78c6..fa044e1e30a2 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2572,6 +2572,11 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
unsigned long cap)
{
+ switch (cap) {
+ case IOMMU_CAP_CACHE_COHERENCY:
+ return 1;
+ }
+
return 0;
}
@@ -2609,8 +2614,7 @@ int __init amd_iommu_init_passthrough(void)
pt_domain->mode |= PAGE_MODE_NONE;
- while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
-
+ for_each_pci_dev(dev) {
if (!check_device(&dev->dev))
continue;
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index a35347501d36..8dd77800ff5d 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -43,10 +43,11 @@
#include <asm/fixmap.h>
#include <asm/apb_timer.h>
+#include <asm/mrst.h>
#define APBT_MASK CLOCKSOURCE_MASK(32)
#define APBT_SHIFT 22
-#define APBT_CLOCKEVENT_RATING 150
+#define APBT_CLOCKEVENT_RATING 110
#define APBT_CLOCKSOURCE_RATING 250
#define APBT_MIN_DELTA_USEC 200
@@ -83,8 +84,6 @@ struct apbt_dev {
char name[10];
};
-int disable_apbt_percpu __cpuinitdata;
-
static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
#ifdef CONFIG_SMP
@@ -195,29 +194,6 @@ static struct clock_event_device apbt_clockevent = {
};
/*
- * if user does not want to use per CPU apb timer, just give it a lower rating
- * than local apic timer and skip the late per cpu timer init.
- */
-static inline int __init setup_x86_mrst_timer(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (strcmp("apbt_only", arg) == 0)
- disable_apbt_percpu = 0;
- else if (strcmp("lapic_and_apbt", arg) == 0)
- disable_apbt_percpu = 1;
- else {
- pr_warning("X86 MRST timer option %s not recognised"
- " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
- arg);
- return -EINVAL;
- }
- return 0;
-}
-__setup("x86_mrst_timer=", setup_x86_mrst_timer);
-
-/*
* start count down from 0xffff_ffff. this is done by toggling the enable bit
* then load initial load count to ~0.
*/
@@ -335,7 +311,7 @@ static int __init apbt_clockevent_register(void)
adev->num = smp_processor_id();
memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
- if (disable_apbt_percpu) {
+ if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100;
global_clock_event = &adev->evt;
printk(KERN_DEBUG "%s clockevent registered as global\n",
@@ -429,7 +405,8 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
static __init int apbt_late_init(void)
{
- if (disable_apbt_percpu || !apb_timer_block_enabled)
+ if (mrst_timer_options == MRST_TIMER_LAPIC_APBT ||
+ !apb_timer_block_enabled)
return 0;
/* This notifier should be called after workqueue is ready */
hotcpu_notifier(apbt_cpuhp_notify, -20);
@@ -450,6 +427,8 @@ static void apbt_set_mode(enum clock_event_mode mode,
int timer_num;
struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
+ BUG_ON(!apbt_virt_address);
+
timer_num = adev->num;
pr_debug("%s CPU %d timer %d mode=%d\n",
__func__, first_cpu(*evt->cpumask), timer_num, mode);
@@ -676,7 +655,7 @@ void __init apbt_time_init(void)
}
#ifdef CONFIG_SMP
/* kernel cmdline disable apb timer, so we will use lapic timers */
- if (disable_apbt_percpu) {
+ if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
printk(KERN_INFO "apbt: disabled per cpu timer\n");
return;
}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index b5d8b0bcf235..a2e0caf26e17 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -280,7 +280,7 @@ void __init early_gart_iommu_check(void)
* or BIOS forget to put that in reserved.
* try to update e820 to make that region as reserved.
*/
- u32 agp_aper_base = 0, agp_aper_order = 0;
+ u32 agp_aper_order = 0;
int i, fix, slot, valid_agp = 0;
u32 ctl;
u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
@@ -291,7 +291,7 @@ void __init early_gart_iommu_check(void)
return;
/* This is mostly duplicate of iommu_hole_init */
- agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
+ search_agp_bridge(&agp_aper_order, &valid_agp);
fix = 0;
for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 565c1bfc507d..910f20b457c4 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,7 +2,12 @@
# Makefile for local APIC drivers and for the IO-APIC code
#
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o
+ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y)
+obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o
+endif
+obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o
+
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_SMP) += ipi.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index a96489ee6cab..e3b534cda49a 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -460,7 +460,7 @@ static void lapic_timer_broadcast(const struct cpumask *mask)
}
/*
- * Setup the local APIC timer for this CPU. Copy the initilized values
+ * Setup the local APIC timer for this CPU. Copy the initialized values
* of the boot CPU and register the clock event in the framework.
*/
static void __cpuinit setup_APIC_timer(void)
@@ -1606,7 +1606,7 @@ void __init init_apic_mappings(void)
* acpi lapic path already maps that address in
* acpi_register_lapic_address()
*/
- if (!acpi_lapic)
+ if (!acpi_lapic && !smp_found_config)
set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 425e53a87feb..8593582d8022 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -129,7 +129,6 @@ int es7000_plat;
* GSI override for ES7000 platforms.
*/
-static unsigned int base;
static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
{
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
new file mode 100644
index 000000000000..cefd6942f0e9
--- /dev/null
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -0,0 +1,107 @@
+/*
+ * HW NMI watchdog support
+ *
+ * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ * Arch specific calls to support NMI watchdog
+ *
+ * Bits copied from original nmi.c file
+ *
+ */
+#include <asm/apic.h>
+
+#include <linux/cpumask.h>
+#include <linux/kdebug.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/nmi.h>
+#include <linux/module.h>
+
+/* For reliability, we're prepared to waste bits here. */
+static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
+
+u64 hw_nmi_get_sample_period(void)
+{
+ return (u64)(cpu_khz) * 1000 * 60;
+}
+
+#ifdef ARCH_HAS_NMI_WATCHDOG
+void arch_trigger_all_cpu_backtrace(void)
+{
+ int i;
+
+ cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
+
+ printk(KERN_INFO "sending NMI to all CPUs:\n");
+ apic->send_IPI_all(NMI_VECTOR);
+
+ /* Wait for up to 10 seconds for all CPUs to do the backtrace */
+ for (i = 0; i < 10 * 1000; i++) {
+ if (cpumask_empty(to_cpumask(backtrace_mask)))
+ break;
+ mdelay(1);
+ }
+}
+
+static int __kprobes
+arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
+ unsigned long cmd, void *__args)
+{
+ struct die_args *args = __args;
+ struct pt_regs *regs;
+ int cpu = smp_processor_id();
+
+ switch (cmd) {
+ case DIE_NMI:
+ case DIE_NMI_IPI:
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ regs = args->regs;
+
+ if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
+ static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
+
+ arch_spin_lock(&lock);
+ printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
+ show_regs(regs);
+ dump_stack();
+ arch_spin_unlock(&lock);
+ cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
+ return NOTIFY_STOP;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static __read_mostly struct notifier_block backtrace_notifier = {
+ .notifier_call = arch_trigger_all_cpu_backtrace_handler,
+ .next = NULL,
+ .priority = 1
+};
+
+static int __init register_trigger_all_cpu_backtrace(void)
+{
+ register_die_notifier(&backtrace_notifier);
+ return 0;
+}
+early_initcall(register_trigger_all_cpu_backtrace);
+#endif
+
+/* STUB calls to mimic old nmi_watchdog behaviour */
+#if defined(CONFIG_X86_LOCAL_APIC)
+unsigned int nmi_watchdog = NMI_NONE;
+EXPORT_SYMBOL(nmi_watchdog);
+void acpi_nmi_enable(void) { return; }
+void acpi_nmi_disable(void) { return; }
+#endif
+atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
+EXPORT_SYMBOL(nmi_active);
+int unknown_nmi_panic;
+void cpu_nmi_set_wd_enabled(void) { return; }
+void stop_apic_nmi_watchdog(void *unused) { return; }
+void setup_apic_nmi_watchdog(void *unused) { return; }
+int __init check_nmi_watchdog(void) { return 0; }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e41ed24ab26d..4dc0084ec1b1 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3397,7 +3397,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
cfg = desc->chip_data;
- read_msi_msg_desc(desc, &msg);
+ get_cached_msi_msg_desc(desc, &msg);
msg.data &= ~MSI_DATA_VECTOR_MASK;
msg.data |= MSI_DATA_VECTOR(cfg->vector);
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 1edaf15c0b8e..a43f71cb30f8 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -401,13 +401,6 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
int cpu = smp_processor_id();
int rc = 0;
- /* check for other users first */
- if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP) {
- rc = 1;
- touched = 1;
- }
-
sum = get_timer_irqs(cpu);
if (__get_cpu_var(nmi_touch)) {
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index e46f98f36e31..7b598b84c902 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -604,6 +604,10 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
{
if (reason != DIE_NMI_IPI)
return NOTIFY_OK;
+
+ if (in_crash_kexec)
+ /* do nothing if entering the crash kernel */
+ return NOTIFY_OK;
/*
* Use a lock so only one cpu prints at a time
* to prevent intermixed output.
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index c4f9182ca3ac..4c9c67bf09b7 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -140,7 +140,7 @@
* is now the way life works).
* Fix thinko in suspend() (wrong return).
* Notify drivers on critical suspend.
- * Make kapmd absorb more idle time (Pavel Machek <pavel@suse.cz>
+ * Make kapmd absorb more idle time (Pavel Machek <pavel@ucw.cz>
* modified by sfr).
* Disable interrupts while we are suspended (Andy Henroid
* <andy_henroid@yahoo.com> fixed by sfr).
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3a785da34b6f..3f0ebe429a01 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -12,11 +12,11 @@ endif
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_common.o := $(nostackp)
-obj-y := intel_cacheinfo.o addon_cpuid_features.o
+obj-y := intel_cacheinfo.o scattered.o topology.o
obj-y += proc.o capflags.o powerflags.o common.o
obj-y += vmware.o hypervisor.o sched.o mshyperv.o
-obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
+obj-$(CONFIG_X86_32) += bugs.o
obj-$(CONFIG_X86_64) += bugs_64.o
obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e485825130d2..60a57b13082d 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -466,7 +466,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
}
}
- if (c->x86 == 0x10 || c->x86 == 0x11)
+ if (c->x86 >= 0x10)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
/* get apicid instead of initial apic id from cpuid */
@@ -529,7 +529,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
num_cache_leaves = 3;
}
- if (c->x86 >= 0xf && c->x86 <= 0x11)
+ if (c->x86 >= 0xf)
set_cpu_cap(c, X86_FEATURE_K8);
if (cpu_has_xmm2) {
@@ -546,7 +546,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
fam10h_check_enable_mmcfg();
}
- if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
+ if (c == &boot_cpu_data && c->x86 >= 0xf) {
unsigned long long tseg;
/*
@@ -609,3 +609,74 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
};
cpu_dev_register(amd_cpu_dev);
+
+/*
+ * AMD errata checking
+ *
+ * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
+ * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
+ * have an OSVW id assigned, which it takes as first argument. Both take a
+ * variable number of family-specific model-stepping ranges created by
+ * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const
+ * int[] in arch/x86/include/asm/processor.h.
+ *
+ * Example:
+ *
+ * const int amd_erratum_319[] =
+ * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
+ * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
+ * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
+ */
+
+const int amd_erratum_400[] =
+ AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
+ AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
+EXPORT_SYMBOL_GPL(amd_erratum_400);
+
+const int amd_erratum_383[] =
+ AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
+EXPORT_SYMBOL_GPL(amd_erratum_383);
+
+bool cpu_has_amd_erratum(const int *erratum)
+{
+ struct cpuinfo_x86 *cpu = &current_cpu_data;
+ int osvw_id = *erratum++;
+ u32 range;
+ u32 ms;
+
+ /*
+ * If called early enough that current_cpu_data hasn't been initialized
+ * yet, fall back to boot_cpu_data.
+ */
+ if (cpu->x86 == 0)
+ cpu = &boot_cpu_data;
+
+ if (cpu->x86_vendor != X86_VENDOR_AMD)
+ return false;
+
+ if (osvw_id >= 0 && osvw_id < 65536 &&
+ cpu_has(cpu, X86_FEATURE_OSVW)) {
+ u64 osvw_len;
+
+ rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
+ if (osvw_id < osvw_len) {
+ u64 osvw_bits;
+
+ rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
+ osvw_bits);
+ return osvw_bits & (1ULL << (osvw_id & 0x3f));
+ }
+ }
+
+ /* OSVW unavailable or ID unknown, match family-model-stepping range */
+ ms = (cpu->x86_model << 8) | cpu->x86_mask;
+ while ((range = *erratum++))
+ if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
+ (ms >= AMD_MODEL_RANGE_START(range)) &&
+ (ms <= AMD_MODEL_RANGE_END(range)))
+ return true;
+
+ return false;
+}
+
+EXPORT_SYMBOL_GPL(cpu_has_amd_erratum);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 68e4a6f2211e..490dac63c2d2 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -140,10 +140,18 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
static int __init x86_xsave_setup(char *s)
{
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
return 1;
}
__setup("noxsave", x86_xsave_setup);
+static int __init x86_xsaveopt_setup(char *s)
+{
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+ return 1;
+}
+__setup("noxsaveopt", x86_xsaveopt_setup);
+
#ifdef CONFIG_X86_32
static int cachesize_override __cpuinitdata = -1;
static int disable_x86_serial_nr __cpuinitdata = 1;
@@ -551,6 +559,16 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[4] = excap;
}
+ /* Additional Intel-defined flags: level 0x00000007 */
+ if (c->cpuid_level >= 0x00000007) {
+ u32 eax, ebx, ecx, edx;
+
+ cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
+
+ if (eax > 0)
+ c->x86_capability[9] = ebx;
+ }
+
/* AMD-defined flags: level 0x80000001 */
xlvl = cpuid_eax(0x80000000);
c->extended_cpuid_level = xlvl;
@@ -576,6 +594,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
if (c->extended_cpuid_level >= 0x80000007)
c->x86_power = cpuid_edx(0x80000007);
+ init_scattered_cpuid_features(c);
}
static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@@ -731,7 +750,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
get_model_name(c); /* Default name */
- init_scattered_cpuid_features(c);
detect_nopl(c);
}
@@ -1192,6 +1210,7 @@ void __cpuinit cpu_init(void)
dbg_restore_debug_regs();
fpu_init();
+ xsave_init();
raw_local_save_flags(kernel_eflags);
@@ -1252,12 +1271,7 @@ void __cpuinit cpu_init(void)
clear_used_math();
mxcsr_feature_mask_init();
- /*
- * Boot processor to setup the FP and extended state context info.
- */
- if (smp_processor_id() == boot_cpu_id)
- init_thread_xstate();
-
+ fpu_init();
xsave_init();
}
#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 1d3cddaa40ee..cd8da247dda1 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -34,7 +34,6 @@
#include <linux/compiler.h>
#include <linux/dmi.h>
#include <linux/slab.h>
-#include <trace/events/power.h>
#include <linux/acpi.h>
#include <linux/io.h>
@@ -73,7 +72,7 @@ struct acpi_cpufreq_data {
static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
/* acpi_perf_data is a pointer to percpu data. */
-static struct acpi_processor_performance *acpi_perf_data;
+static struct acpi_processor_performance __percpu *acpi_perf_data;
static struct cpufreq_driver acpi_cpufreq_driver;
@@ -324,8 +323,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
}
}
- trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency);
-
switch (data->cpu_feature) {
case SYSTEM_INTEL_MSR_CAPABLE:
cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
@@ -351,7 +348,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
freqs.old = perf->states[perf->state].core_frequency * 1000;
freqs.new = data->freq_table[next_state].frequency;
- for_each_cpu(i, cmd.mask) {
+ for_each_cpu(i, policy->cpus) {
freqs.cpu = i;
cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
}
@@ -367,7 +364,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
}
}
- for_each_cpu(i, cmd.mask) {
+ for_each_cpu(i, policy->cpus) {
freqs.cpu = i;
cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
}
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
index 16e3483be9e3..32974cf84232 100644
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
@@ -169,12 +169,9 @@ static int gx_freq_mult[16] = {
* Low Level chipset interface *
****************************************************************/
static struct pci_device_id gx_chipset_tbl[] __initdata = {
- { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY,
- PCI_ANY_ID, PCI_ANY_ID },
- { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520,
- PCI_ANY_ID, PCI_ANY_ID },
- { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510,
- PCI_ANY_ID, PCI_ANY_ID },
+ { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), },
+ { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), },
+ { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), },
{ 0, },
};
@@ -199,7 +196,7 @@ static __init struct pci_dev *gx_detect_chipset(void)
}
/* detect which companion chip is used */
- while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) {
+ for_each_pci_dev(gx_pci) {
if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL)
return gx_pci;
}
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 7e7eea4f8261..03162dac6271 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -426,7 +426,7 @@ static int guess_fsb(int mult)
}
-static int __init longhaul_get_ranges(void)
+static int __cpuinit longhaul_get_ranges(void)
{
unsigned int i, j, k = 0;
unsigned int ratio;
@@ -530,7 +530,7 @@ static int __init longhaul_get_ranges(void)
}
-static void __init longhaul_setup_voltagescaling(void)
+static void __cpuinit longhaul_setup_voltagescaling(void)
{
union msr_longhaul longhaul;
struct mV_pos minvid, maxvid, vid;
@@ -784,7 +784,7 @@ static int longhaul_setup_southbridge(void)
return 0;
}
-static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
+static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy)
{
struct cpuinfo_x86 *c = &cpu_data(0);
char *cpuname = NULL;
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
index e2360a469f79..cbf48fbca881 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.h
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.h
@@ -56,7 +56,7 @@ union msr_longhaul {
/*
* VIA C3 Samuel 1 & Samuel 2 (stepping 0)
*/
-static const int __initdata samuel1_mults[16] = {
+static const int __cpuinitdata samuel1_mults[16] = {
-1, /* 0000 -> RESERVED */
30, /* 0001 -> 3.0x */
40, /* 0010 -> 4.0x */
@@ -75,7 +75,7 @@ static const int __initdata samuel1_mults[16] = {
-1, /* 1111 -> RESERVED */
};
-static const int __initdata samuel1_eblcr[16] = {
+static const int __cpuinitdata samuel1_eblcr[16] = {
50, /* 0000 -> RESERVED */
30, /* 0001 -> 3.0x */
40, /* 0010 -> 4.0x */
@@ -97,7 +97,7 @@ static const int __initdata samuel1_eblcr[16] = {
/*
* VIA C3 Samuel2 Stepping 1->15
*/
-static const int __initdata samuel2_eblcr[16] = {
+static const int __cpuinitdata samuel2_eblcr[16] = {
50, /* 0000 -> 5.0x */
30, /* 0001 -> 3.0x */
40, /* 0010 -> 4.0x */
@@ -119,7 +119,7 @@ static const int __initdata samuel2_eblcr[16] = {
/*
* VIA C3 Ezra
*/
-static const int __initdata ezra_mults[16] = {
+static const int __cpuinitdata ezra_mults[16] = {
100, /* 0000 -> 10.0x */
30, /* 0001 -> 3.0x */
40, /* 0010 -> 4.0x */
@@ -138,7 +138,7 @@ static const int __initdata ezra_mults[16] = {
120, /* 1111 -> 12.0x */
};
-static const int __initdata ezra_eblcr[16] = {
+static const int __cpuinitdata ezra_eblcr[16] = {
50, /* 0000 -> 5.0x */
30, /* 0001 -> 3.0x */
40, /* 0010 -> 4.0x */
@@ -160,7 +160,7 @@ static const int __initdata ezra_eblcr[16] = {
/*
* VIA C3 (Ezra-T) [C5M].
*/
-static const int __initdata ezrat_mults[32] = {
+static const int __cpuinitdata ezrat_mults[32] = {
100, /* 0000 -> 10.0x */
30, /* 0001 -> 3.0x */
40, /* 0010 -> 4.0x */
@@ -196,7 +196,7 @@ static const int __initdata ezrat_mults[32] = {
-1, /* 1111 -> RESERVED (12.0x) */
};
-static const int __initdata ezrat_eblcr[32] = {
+static const int __cpuinitdata ezrat_eblcr[32] = {
50, /* 0000 -> 5.0x */
30, /* 0001 -> 3.0x */
40, /* 0010 -> 4.0x */
@@ -235,7 +235,7 @@ static const int __initdata ezrat_eblcr[32] = {
/*
* VIA C3 Nehemiah */
-static const int __initdata nehemiah_mults[32] = {
+static const int __cpuinitdata nehemiah_mults[32] = {
100, /* 0000 -> 10.0x */
-1, /* 0001 -> 16.0x */
40, /* 0010 -> 4.0x */
@@ -270,7 +270,7 @@ static const int __initdata nehemiah_mults[32] = {
-1, /* 1111 -> 12.0x */
};
-static const int __initdata nehemiah_eblcr[32] = {
+static const int __cpuinitdata nehemiah_eblcr[32] = {
50, /* 0000 -> 5.0x */
160, /* 0001 -> 16.0x */
40, /* 0010 -> 4.0x */
@@ -315,7 +315,7 @@ struct mV_pos {
unsigned short pos;
};
-static const struct mV_pos __initdata vrm85_mV[32] = {
+static const struct mV_pos __cpuinitdata vrm85_mV[32] = {
{1250, 8}, {1200, 6}, {1150, 4}, {1100, 2},
{1050, 0}, {1800, 30}, {1750, 28}, {1700, 26},
{1650, 24}, {1600, 22}, {1550, 20}, {1500, 18},
@@ -326,14 +326,14 @@ static const struct mV_pos __initdata vrm85_mV[32] = {
{1475, 17}, {1425, 15}, {1375, 13}, {1325, 11}
};
-static const unsigned char __initdata mV_vrm85[32] = {
+static const unsigned char __cpuinitdata mV_vrm85[32] = {
0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11,
0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d,
0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19,
0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15
};
-static const struct mV_pos __initdata mobilevrm_mV[32] = {
+static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = {
{1750, 31}, {1700, 30}, {1650, 29}, {1600, 28},
{1550, 27}, {1500, 26}, {1450, 25}, {1400, 24},
{1350, 23}, {1300, 22}, {1250, 21}, {1200, 20},
@@ -344,7 +344,7 @@ static const struct mV_pos __initdata mobilevrm_mV[32] = {
{675, 3}, {650, 2}, {625, 1}, {600, 0}
};
-static const unsigned char __initdata mV_mobilevrm[32] = {
+static const unsigned char __cpuinitdata mV_mobilevrm[32] = {
0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
index e7b559d74c52..fc09f142d94d 100644
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -165,8 +165,8 @@ static unsigned int longrun_get(unsigned int cpu)
* TMTA rules:
* performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
*/
-static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
- unsigned int *high_freq)
+static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq,
+ unsigned int *high_freq)
{
u32 msr_lo, msr_hi;
u32 save_lo, save_hi;
@@ -258,7 +258,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
}
-static int __init longrun_cpu_init(struct cpufreq_policy *policy)
+static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy)
{
int result = 0;
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 7b8a8ba67b07..bd1cac747f67 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -178,13 +178,8 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
}
}
- if (c->x86 != 0xF) {
- if (!cpu_has(c, X86_FEATURE_EST))
- printk(KERN_WARNING PFX "Unknown CPU. "
- "Please send an e-mail to "
- "<cpufreq@vger.kernel.org>\n");
+ if (c->x86 != 0xF)
return 0;
- }
/* on P-4s, the TSC runs with constant frequency independent whether
* throttling is active or not. */
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
index a36de5bbb622..994230d4dc4e 100644
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -110,7 +110,7 @@ struct pcc_cpu {
u32 output_offset;
};
-static struct pcc_cpu *pcc_cpu_info;
+static struct pcc_cpu __percpu *pcc_cpu_info;
static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
{
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 9a97116f89e5..4a45fd6e41ba 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -569,7 +569,7 @@ static int powernow_verify(struct cpufreq_policy *policy)
* We will then get the same kind of behaviour already tested under
* the "well-known" other OS.
*/
-static int __init fixup_sgtc(void)
+static int __cpuinit fixup_sgtc(void)
{
unsigned int sgtc;
unsigned int m;
@@ -603,7 +603,7 @@ static unsigned int powernow_get(unsigned int cpu)
}
-static int __init acer_cpufreq_pst(const struct dmi_system_id *d)
+static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d)
{
printk(KERN_WARNING PFX
"%s laptop with broken PST tables in BIOS detected.\n",
@@ -621,7 +621,7 @@ static int __init acer_cpufreq_pst(const struct dmi_system_id *d)
* A BIOS update is all that can save them.
* Mention this, and disable cpufreq.
*/
-static struct dmi_system_id __initdata powernow_dmi_table[] = {
+static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = {
{
.callback = acer_cpufreq_pst,
.ident = "Acer Aspire",
@@ -633,7 +633,7 @@ static struct dmi_system_id __initdata powernow_dmi_table[] = {
{ }
};
-static int __init powernow_cpu_init(struct cpufreq_policy *policy)
+static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy)
{
union msr_fidvidstatus fidvidstatus;
int result;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 3e90cce3dc8b..491977baf6c0 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -9,7 +9,7 @@
* Based on the powernow-k7.c module written by Dave Jones.
* (C) 2003 Dave Jones on behalf of SuSE Labs
* (C) 2004 Dominik Brodowski <linux@brodo.de>
- * (C) 2004 Pavel Machek <pavel@suse.cz>
+ * (C) 2004 Pavel Machek <pavel@ucw.cz>
* Licensed under the terms of the GNU GPL License version 2.
* Based upon datasheets & sample CPUs kindly provided by AMD.
*
@@ -806,6 +806,8 @@ static int find_psb_table(struct powernow_k8_data *data)
* www.amd.com
*/
printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
+ printk(KERN_ERR PFX "Make sure that your BIOS is up to date"
+ " and Cool'N'Quiet support is enabled in BIOS setup\n");
return -ENODEV;
}
@@ -910,8 +912,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
{
int i;
u32 hi = 0, lo = 0;
- rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo);
- data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
+ rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);
+ data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
for (i = 0; i < data->acpi_data.state_count; i++) {
u32 index;
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index dd531cc56a8f..8095f8611f8a 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -34,6 +34,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
{
&x86_hyper_vmware,
&x86_hyper_ms_hyperv,
+#ifdef CONFIG_XEN_PVHVM
+ &x86_hyper_xen_hvm,
+#endif
};
const struct hypervisor_x86 *x86_hyper;
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 33eae2062cf5..898c2f4eab88 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -347,8 +347,8 @@ static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
return l3;
}
-static void __cpuinit
-amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
+static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
+ int index)
{
int node;
@@ -396,20 +396,39 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
this_leaf->l3 = l3_caches[node];
}
+/*
+ * check whether a slot used for disabling an L3 index is occupied.
+ * @l3: L3 cache descriptor
+ * @slot: slot number (0..1)
+ *
+ * @returns: the disabled index if used or negative value if slot free.
+ */
+int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot)
+{
+ unsigned int reg = 0;
+
+ pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg);
+
+ /* check whether this slot is activated already */
+ if (reg & (3UL << 30))
+ return reg & 0xfff;
+
+ return -1;
+}
+
static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
unsigned int slot)
{
- struct pci_dev *dev = this_leaf->l3->dev;
- unsigned int reg = 0;
+ int index;
if (!this_leaf->l3 || !this_leaf->l3->can_disable)
return -EINVAL;
- if (!dev)
- return -EINVAL;
+ index = amd_get_l3_disable_slot(this_leaf->l3, slot);
+ if (index >= 0)
+ return sprintf(buf, "%d\n", index);
- pci_read_config_dword(dev, 0x1BC + slot * 4, &reg);
- return sprintf(buf, "0x%08x\n", reg);
+ return sprintf(buf, "FREE\n");
}
#define SHOW_CACHE_DISABLE(slot) \
@@ -451,37 +470,74 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
}
}
-
-static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
- const char *buf, size_t count,
- unsigned int slot)
+/*
+ * disable a L3 cache index by using a disable-slot
+ *
+ * @l3: L3 cache descriptor
+ * @cpu: A CPU on the node containing the L3 cache
+ * @slot: slot number (0..1)
+ * @index: index to disable
+ *
+ * @return: 0 on success, error status on failure
+ */
+int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot,
+ unsigned long index)
{
- struct pci_dev *dev = this_leaf->l3->dev;
- int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
- unsigned long val = 0;
+ int ret = 0;
#define SUBCACHE_MASK (3UL << 20)
#define SUBCACHE_INDEX 0xfff
- if (!this_leaf->l3 || !this_leaf->l3->can_disable)
+ /*
+ * check whether this slot is already used or
+ * the index is already disabled
+ */
+ ret = amd_get_l3_disable_slot(l3, slot);
+ if (ret >= 0)
return -EINVAL;
+ /*
+ * check whether the other slot has disabled the
+ * same index already
+ */
+ if (index == amd_get_l3_disable_slot(l3, !slot))
+ return -EINVAL;
+
+ /* do not allow writes outside of allowed bits */
+ if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
+ ((index & SUBCACHE_INDEX) > l3->indices))
+ return -EINVAL;
+
+ amd_l3_disable_index(l3, cpu, slot, index);
+
+ return 0;
+}
+
+static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
+ const char *buf, size_t count,
+ unsigned int slot)
+{
+ unsigned long val = 0;
+ int cpu, err = 0;
+
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!dev)
+ if (!this_leaf->l3 || !this_leaf->l3->can_disable)
return -EINVAL;
- if (strict_strtoul(buf, 10, &val) < 0)
- return -EINVAL;
+ cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
- /* do not allow writes outside of allowed bits */
- if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
- ((val & SUBCACHE_INDEX) > this_leaf->l3->indices))
+ if (strict_strtoul(buf, 10, &val) < 0)
return -EINVAL;
- amd_l3_disable_index(this_leaf->l3, cpu, slot, val);
-
+ err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val);
+ if (err) {
+ if (err == -EEXIST)
+ printk(KERN_WARNING "L3 disable slot %d in use!\n",
+ slot);
+ return err;
+ }
return count;
}
@@ -502,7 +558,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
#else /* CONFIG_CPU_SUP_AMD */
static void __cpuinit
-amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
+amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
{
};
#endif /* CONFIG_CPU_SUP_AMD */
@@ -518,7 +574,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
amd_cpuid4(index, &eax, &ebx, &ecx);
- amd_check_l3_disable(index, this_leaf);
+ amd_check_l3_disable(this_leaf, index);
} else {
cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 18cc42562250..ed41562909fe 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -51,7 +51,7 @@
static DEFINE_MUTEX(mce_read_mutex);
#define rcu_dereference_check_mce(p) \
- rcu_dereference_check((p), \
+ rcu_dereference_index_check((p), \
rcu_read_lock_sched_held() || \
lockdep_is_held(&mce_read_mutex))
@@ -107,8 +107,8 @@ EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
static int default_decode_mce(struct notifier_block *nb, unsigned long val,
void *data)
{
- pr_emerg("No human readable MCE decoding support on this CPU type.\n");
- pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
+ pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
+ pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");
return NOTIFY_STOP;
}
@@ -211,11 +211,11 @@ void mce_log(struct mce *mce)
static void print_mce(struct mce *m)
{
- pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
+ pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
m->extcpu, m->mcgstatus, m->bank, m->status);
if (m->ip) {
- pr_emerg("RIP%s %02x:<%016Lx> ",
+ pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
m->cs, m->ip);
@@ -224,14 +224,14 @@ static void print_mce(struct mce *m)
pr_cont("\n");
}
- pr_emerg("TSC %llx ", m->tsc);
+ pr_emerg(HW_ERR "TSC %llx ", m->tsc);
if (m->addr)
pr_cont("ADDR %llx ", m->addr);
if (m->misc)
pr_cont("MISC %llx ", m->misc);
pr_cont("\n");
- pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
+ pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
/*
@@ -241,16 +241,6 @@ static void print_mce(struct mce *m)
atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
}
-static void print_mce_head(void)
-{
- pr_emerg("\nHARDWARE ERROR\n");
-}
-
-static void print_mce_tail(void)
-{
- pr_emerg("This is not a software problem!\n");
-}
-
#define PANIC_TIMEOUT 5 /* 5 seconds */
static atomic_t mce_paniced;
@@ -291,7 +281,6 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
if (atomic_inc_return(&mce_fake_paniced) > 1)
return;
}
- print_mce_head();
/* First print corrected ones that are still unlogged */
for (i = 0; i < MCE_LOG_LEN; i++) {
struct mce *m = &mcelog.entry[i];
@@ -322,16 +311,15 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
apei_err = apei_write_mce(final);
}
if (cpu_missing)
- printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
- print_mce_tail();
+ pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
if (exp)
- printk(KERN_EMERG "Machine check: %s\n", exp);
+ pr_emerg(HW_ERR "Machine check: %s\n", exp);
if (!fake_panic) {
if (panic_timeout == 0)
panic_timeout = mce_panic_timeout;
panic(msg);
} else
- printk(KERN_EMERG "Fake kernel panic: %s\n", msg);
+ pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
}
/* Support code for software error injection */
@@ -600,6 +588,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
*/
if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
mce_log(&m);
+ atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
add_taint(TAINT_MACHINE_CHECK);
}
@@ -1220,7 +1209,7 @@ int mce_notify_irq(void)
schedule_work(&mce_trigger_work);
if (__ratelimit(&ratelimit))
- printk(KERN_INFO "Machine check events logged\n");
+ pr_info(HW_ERR "Machine check events logged\n");
return 1;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 62b48e40920a..6fcd0936194f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -95,19 +95,20 @@ static void cmci_discover(int banks, int boot)
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
/* Already owned by someone else? */
- if (val & CMCI_EN) {
+ if (val & MCI_CTL2_CMCI_EN) {
if (test_and_clear_bit(i, owned) && !boot)
print_update("SHD", &hdr, i);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
continue;
}
- val |= CMCI_EN | CMCI_THRESHOLD;
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
wrmsrl(MSR_IA32_MCx_CTL2(i), val);
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
/* Did the enable bit stick? -- the bank supports CMCI */
- if (val & CMCI_EN) {
+ if (val & MCI_CTL2_CMCI_EN) {
if (!test_and_set_bit(i, owned) && !boot)
print_update("CMCI", &hdr, i);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
@@ -155,7 +156,7 @@ void cmci_clear(void)
continue;
/* Disable CMCI */
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
- val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
+ val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK);
wrmsrl(MSR_IA32_MCx_CTL2(i), val);
__clear_bit(i, __get_cpu_var(mce_banks_owned));
}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index e1a0a3bf9716..c2a8b26d4fea 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -34,15 +34,25 @@
/* How long to wait between reporting thermal events */
#define CHECK_INTERVAL (300 * HZ)
+#define THERMAL_THROTTLING_EVENT 0
+#define POWER_LIMIT_EVENT 1
+
/*
- * Current thermal throttling state:
+ * Current thermal event state:
*/
-struct thermal_state {
- bool is_throttled;
-
+struct _thermal_state {
+ bool new_event;
+ int event;
u64 next_check;
- unsigned long throttle_count;
- unsigned long last_throttle_count;
+ unsigned long count;
+ unsigned long last_count;
+};
+
+struct thermal_state {
+ struct _thermal_state core_throttle;
+ struct _thermal_state core_power_limit;
+ struct _thermal_state package_throttle;
+ struct _thermal_state package_power_limit;
};
static DEFINE_PER_CPU(struct thermal_state, thermal_state);
@@ -53,11 +63,13 @@ static u32 lvtthmr_init __read_mostly;
#ifdef CONFIG_SYSFS
#define define_therm_throt_sysdev_one_ro(_name) \
- static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
+ static SYSDEV_ATTR(_name, 0444, \
+ therm_throt_sysdev_show_##_name, \
+ NULL) \
-#define define_therm_throt_sysdev_show_func(name) \
+#define define_therm_throt_sysdev_show_func(event, name) \
\
-static ssize_t therm_throt_sysdev_show_##name( \
+static ssize_t therm_throt_sysdev_show_##event##_##name( \
struct sys_device *dev, \
struct sysdev_attribute *attr, \
char *buf) \
@@ -66,30 +78,42 @@ static ssize_t therm_throt_sysdev_show_##name( \
ssize_t ret; \
\
preempt_disable(); /* CPU hotplug */ \
- if (cpu_online(cpu)) \
+ if (cpu_online(cpu)) { \
ret = sprintf(buf, "%lu\n", \
- per_cpu(thermal_state, cpu).name); \
- else \
+ per_cpu(thermal_state, cpu).event.name); \
+ } else \
ret = 0; \
preempt_enable(); \
\
return ret; \
}
-define_therm_throt_sysdev_show_func(throttle_count);
-define_therm_throt_sysdev_one_ro(throttle_count);
+define_therm_throt_sysdev_show_func(core_throttle, count);
+define_therm_throt_sysdev_one_ro(core_throttle_count);
+
+define_therm_throt_sysdev_show_func(core_power_limit, count);
+define_therm_throt_sysdev_one_ro(core_power_limit_count);
+
+define_therm_throt_sysdev_show_func(package_throttle, count);
+define_therm_throt_sysdev_one_ro(package_throttle_count);
+
+define_therm_throt_sysdev_show_func(package_power_limit, count);
+define_therm_throt_sysdev_one_ro(package_power_limit_count);
static struct attribute *thermal_throttle_attrs[] = {
- &attr_throttle_count.attr,
+ &attr_core_throttle_count.attr,
NULL
};
-static struct attribute_group thermal_throttle_attr_group = {
+static struct attribute_group thermal_attr_group = {
.attrs = thermal_throttle_attrs,
.name = "thermal_throttle"
};
#endif /* CONFIG_SYSFS */
+#define CORE_LEVEL 0
+#define PACKAGE_LEVEL 1
+
/***
* therm_throt_process - Process thermal throttling event from interrupt
* @curr: Whether the condition is current or not (boolean), since the
@@ -106,39 +130,70 @@ static struct attribute_group thermal_throttle_attr_group = {
* 1 : Event should be logged further, and a message has been
* printed to the syslog.
*/
-static int therm_throt_process(bool is_throttled)
+static int therm_throt_process(bool new_event, int event, int level)
{
- struct thermal_state *state;
- unsigned int this_cpu;
- bool was_throttled;
+ struct _thermal_state *state;
+ unsigned int this_cpu = smp_processor_id();
+ bool old_event;
u64 now;
+ struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
- this_cpu = smp_processor_id();
now = get_jiffies_64();
- state = &per_cpu(thermal_state, this_cpu);
+ if (level == CORE_LEVEL) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ state = &pstate->core_throttle;
+ else if (event == POWER_LIMIT_EVENT)
+ state = &pstate->core_power_limit;
+ else
+ return 0;
+ } else if (level == PACKAGE_LEVEL) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ state = &pstate->package_throttle;
+ else if (event == POWER_LIMIT_EVENT)
+ state = &pstate->package_power_limit;
+ else
+ return 0;
+ } else
+ return 0;
- was_throttled = state->is_throttled;
- state->is_throttled = is_throttled;
+ old_event = state->new_event;
+ state->new_event = new_event;
- if (is_throttled)
- state->throttle_count++;
+ if (new_event)
+ state->count++;
if (time_before64(now, state->next_check) &&
- state->throttle_count != state->last_throttle_count)
+ state->count != state->last_count)
return 0;
state->next_check = now + CHECK_INTERVAL;
- state->last_throttle_count = state->throttle_count;
+ state->last_count = state->count;
/* if we just entered the thermal event */
- if (is_throttled) {
- printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count);
+ if (new_event) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
+ this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package",
+ state->count);
+ else
+ printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n",
+ this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package",
+ state->count);
add_taint(TAINT_MACHINE_CHECK);
return 1;
}
- if (was_throttled) {
- printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu);
+ if (old_event) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
+ this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package");
+ else
+ printk(KERN_INFO "CPU%d: %s power limit normal\n",
+ this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package");
return 1;
}
@@ -149,13 +204,32 @@ static int therm_throt_process(bool is_throttled)
/* Add/Remove thermal_throttle interface for CPU device: */
static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
{
- return sysfs_create_group(&sys_dev->kobj,
- &thermal_throttle_attr_group);
+ int err;
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
+
+ err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
+ if (err)
+ return err;
+
+ if (cpu_has(c, X86_FEATURE_PLN))
+ err = sysfs_add_file_to_group(&sys_dev->kobj,
+ &attr_core_power_limit_count.attr,
+ thermal_attr_group.name);
+ if (cpu_has(c, X86_FEATURE_PTS))
+ err = sysfs_add_file_to_group(&sys_dev->kobj,
+ &attr_package_throttle_count.attr,
+ thermal_attr_group.name);
+ if (cpu_has(c, X86_FEATURE_PLN))
+ err = sysfs_add_file_to_group(&sys_dev->kobj,
+ &attr_package_power_limit_count.attr,
+ thermal_attr_group.name);
+
+ return err;
}
static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
{
- sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
+ sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group);
}
/* Mutex protecting device creation against CPU hotplug: */
@@ -226,14 +300,50 @@ device_initcall(thermal_throttle_init_device);
#endif /* CONFIG_SYSFS */
+/*
+ * Set up the most two significant bit to notify mce log that this thermal
+ * event type.
+ * This is a temp solution. May be changed in the future with mce log
+ * infrasture.
+ */
+#define CORE_THROTTLED (0)
+#define CORE_POWER_LIMIT ((__u64)1 << 62)
+#define PACKAGE_THROTTLED ((__u64)2 << 62)
+#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
+
/* Thermal transition interrupt handler */
static void intel_thermal_interrupt(void)
{
__u64 msr_val;
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
- if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0))
- mce_log_therm_throt_event(msr_val);
+
+ if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
+ THERMAL_THROTTLING_EVENT,
+ CORE_LEVEL) != 0)
+ mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
+
+ if (cpu_has(c, X86_FEATURE_PLN))
+ if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
+ POWER_LIMIT_EVENT,
+ CORE_LEVEL) != 0)
+ mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
+
+ if (cpu_has(c, X86_FEATURE_PTS)) {
+ rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
+ if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
+ THERMAL_THROTTLING_EVENT,
+ PACKAGE_LEVEL) != 0)
+ mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
+ if (cpu_has(c, X86_FEATURE_PLN))
+ if (therm_throt_process(msr_val &
+ PACKAGE_THERM_STATUS_POWER_LIMIT,
+ POWER_LIMIT_EVENT,
+ PACKAGE_LEVEL) != 0)
+ mce_log_therm_throt_event(PACKAGE_POWER_LIMIT
+ | msr_val);
+ }
}
static void unexpected_thermal_interrupt(void)
@@ -335,8 +445,26 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
apic_write(APIC_LVTTHMR, h);
rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
- wrmsr(MSR_IA32_THERM_INTERRUPT,
- l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
+ if (cpu_has(c, X86_FEATURE_PLN))
+ wrmsr(MSR_IA32_THERM_INTERRUPT,
+ l | (THERM_INT_LOW_ENABLE
+ | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
+ else
+ wrmsr(MSR_IA32_THERM_INTERRUPT,
+ l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
+
+ if (cpu_has(c, X86_FEATURE_PTS)) {
+ rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
+ if (cpu_has(c, X86_FEATURE_PLN))
+ wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+ l | (PACKAGE_THERM_INT_LOW_ENABLE
+ | PACKAGE_THERM_INT_HIGH_ENABLE
+ | PACKAGE_THERM_INT_PLN_ENABLE), h);
+ else
+ wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+ l | (PACKAGE_THERM_INT_LOW_ENABLE
+ | PACKAGE_THERM_INT_HIGH_ENABLE), h);
+ }
smp_thermal_vector = intel_thermal_interrupt;
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 16f41bbe46b6..d944bf6c50e9 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -18,6 +18,7 @@
#include <asm/mshyperv.h>
struct ms_hyperv_info ms_hyperv;
+EXPORT_SYMBOL_GPL(ms_hyperv);
static bool __init ms_hyperv_platform(void)
{
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 06130b52f012..c5f59d071425 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -632,9 +632,9 @@ static void __init mtrr_print_out_one_result(int i)
unsigned long gran_base, chunk_base, lose_base;
char gran_factor, chunk_factor, lose_factor;
- gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
- chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
- lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+ gran_base = to_size_factor(result[i].gran_sizek, &gran_factor);
+ chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor);
+ lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor);
pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t",
result[i].bad ? "*BAD*" : " ",
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index fd31a441c61c..7d28d7d03885 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -433,13 +433,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
{
unsigned int mask_lo, mask_hi, base_lo, base_hi;
unsigned int tmp, hi;
- int cpu;
/*
* get_mtrr doesn't need to update mtrr_state, also it could be called
* from any cpu, so try to print it out directly.
*/
- cpu = get_cpu();
+ get_cpu();
rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 79556bd9b602..01c0f3ee6cc3 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -35,6 +35,7 @@
#include <linux/types.h> /* FIXME: kvm_para.h needs this */
+#include <linux/stop_machine.h>
#include <linux/kvm_para.h>
#include <linux/uaccess.h>
#include <linux/module.h>
@@ -143,22 +144,28 @@ struct set_mtrr_data {
mtrr_type smp_type;
};
+static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work);
+
/**
- * ipi_handler - Synchronisation handler. Executed by "other" CPUs.
+ * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs.
* @info: pointer to mtrr configuration data
*
* Returns nothing.
*/
-static void ipi_handler(void *info)
+static int mtrr_work_handler(void *info)
{
#ifdef CONFIG_SMP
struct set_mtrr_data *data = info;
unsigned long flags;
+ atomic_dec(&data->count);
+ while (!atomic_read(&data->gate))
+ cpu_relax();
+
local_irq_save(flags);
atomic_dec(&data->count);
- while (!atomic_read(&data->gate))
+ while (atomic_read(&data->gate))
cpu_relax();
/* The master has cleared me to execute */
@@ -173,12 +180,13 @@ static void ipi_handler(void *info)
}
atomic_dec(&data->count);
- while (atomic_read(&data->gate))
+ while (!atomic_read(&data->gate))
cpu_relax();
atomic_dec(&data->count);
local_irq_restore(flags);
#endif
+ return 0;
}
static inline int types_compatible(mtrr_type type1, mtrr_type type2)
@@ -198,7 +206,7 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
*
* This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
*
- * 1. Send IPI to do the following:
+ * 1. Queue work to do the following on all processors:
* 2. Disable Interrupts
* 3. Wait for all procs to do so
* 4. Enter no-fill cache mode
@@ -215,14 +223,17 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
* 15. Enable interrupts.
*
* What does that mean for us? Well, first we set data.count to the number
- * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait
- * until it hits 0 and proceed. We set the data.gate flag and reset data.count.
- * Meanwhile, they are waiting for that flag to be set. Once it's set, each
+ * of CPUs. As each CPU announces that it started the rendezvous handler by
+ * decrementing the count, We reset data.count and set the data.gate flag
+ * allowing all the cpu's to proceed with the work. As each cpu disables
+ * interrupts, it'll decrement data.count once. We wait until it hits 0 and
+ * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they
+ * are waiting for that flag to be cleared. Once it's cleared, each
* CPU goes through the transition of updating MTRRs.
* The CPU vendors may each do it differently,
* so we call mtrr_if->set() callback and let them take care of it.
* When they're done, they again decrement data->count and wait for data.gate
- * to be reset.
+ * to be set.
* When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
* Everyone then enables interrupts and we all continue on.
*
@@ -234,6 +245,9 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
{
struct set_mtrr_data data;
unsigned long flags;
+ int cpu;
+
+ preempt_disable();
data.smp_reg = reg;
data.smp_base = base;
@@ -246,10 +260,15 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
atomic_set(&data.gate, 0);
/* Start the ball rolling on other CPUs */
- if (smp_call_function(ipi_handler, &data, 0) != 0)
- panic("mtrr: timed out waiting for other CPUs\n");
+ for_each_online_cpu(cpu) {
+ struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu);
+
+ if (cpu == smp_processor_id())
+ continue;
+
+ stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work);
+ }
- local_irq_save(flags);
while (atomic_read(&data.count))
cpu_relax();
@@ -259,6 +278,16 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
smp_wmb();
atomic_set(&data.gate, 1);
+ local_irq_save(flags);
+
+ while (atomic_read(&data.count))
+ cpu_relax();
+
+ /* Ok, reset count and toggle gate */
+ atomic_set(&data.count, num_booting_cpus() - 1);
+ smp_wmb();
+ atomic_set(&data.gate, 0);
+
/* Do our MTRR business */
/*
@@ -279,7 +308,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
atomic_set(&data.count, num_booting_cpus() - 1);
smp_wmb();
- atomic_set(&data.gate, 0);
+ atomic_set(&data.gate, 1);
/*
* Wait here for everyone to have seen the gate change
@@ -289,6 +318,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
cpu_relax();
local_irq_restore(flags);
+ preempt_enable();
}
/**
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5db5b7d65a18..f2da20fda02d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -220,6 +220,7 @@ struct x86_pmu {
struct perf_event *event);
struct event_constraint *event_constraints;
void (*quirks)(void);
+ int perfctr_second_write;
int (*cpu_prepare)(int cpu);
void (*cpu_starting)(int cpu);
@@ -295,10 +296,10 @@ x86_perf_event_update(struct perf_event *event)
* count to the generic event atomically:
*/
again:
- prev_raw_count = atomic64_read(&hwc->prev_count);
+ prev_raw_count = local64_read(&hwc->prev_count);
rdmsrl(hwc->event_base + idx, new_raw_count);
- if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
new_raw_count) != prev_raw_count)
goto again;
@@ -313,8 +314,8 @@ again:
delta = (new_raw_count << shift) - (prev_raw_count << shift);
delta >>= shift;
- atomic64_add(delta, &event->count);
- atomic64_sub(delta, &hwc->period_left);
+ local64_add(delta, &event->count);
+ local64_sub(delta, &hwc->period_left);
return new_raw_count;
}
@@ -438,7 +439,7 @@ static int x86_setup_perfctr(struct perf_event *event)
if (!hwc->sample_period) {
hwc->sample_period = x86_pmu.max_period;
hwc->last_period = hwc->sample_period;
- atomic64_set(&hwc->period_left, hwc->sample_period);
+ local64_set(&hwc->period_left, hwc->sample_period);
} else {
/*
* If we have a PMU initialized but no APIC
@@ -885,7 +886,7 @@ static int
x86_perf_event_set_period(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- s64 left = atomic64_read(&hwc->period_left);
+ s64 left = local64_read(&hwc->period_left);
s64 period = hwc->sample_period;
int ret = 0, idx = hwc->idx;
@@ -897,14 +898,14 @@ x86_perf_event_set_period(struct perf_event *event)
*/
if (unlikely(left <= -period)) {
left = period;
- atomic64_set(&hwc->period_left, left);
+ local64_set(&hwc->period_left, left);
hwc->last_period = period;
ret = 1;
}
if (unlikely(left <= 0)) {
left += period;
- atomic64_set(&hwc->period_left, left);
+ local64_set(&hwc->period_left, left);
hwc->last_period = period;
ret = 1;
}
@@ -923,10 +924,19 @@ x86_perf_event_set_period(struct perf_event *event)
* The hw event starts counting from this event offset,
* mark it to be able to extra future deltas:
*/
- atomic64_set(&hwc->prev_count, (u64)-left);
+ local64_set(&hwc->prev_count, (u64)-left);
- wrmsrl(hwc->event_base + idx,
+ wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask);
+
+ /*
+ * Due to erratum on certan cpu we need
+ * a second write to be sure the register
+ * is updated properly
+ */
+ if (x86_pmu.perfctr_second_write) {
+ wrmsrl(hwc->event_base + idx,
(u64)(-left) & x86_pmu.cntval_mask);
+ }
perf_event_update_userpage(event);
@@ -969,7 +979,7 @@ static int x86_pmu_enable(struct perf_event *event)
* skip the schedulability test here, it will be peformed
* at commit time(->commit_txn) as a whole
*/
- if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+ if (cpuc->group_flag & PERF_EVENT_TXN)
goto out;
ret = x86_pmu.schedule_events(cpuc, n, assign);
@@ -1096,7 +1106,7 @@ static void x86_pmu_disable(struct perf_event *event)
* The events never got scheduled and ->cancel_txn will truncate
* the event_list.
*/
- if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+ if (cpuc->group_flag & PERF_EVENT_TXN)
return;
x86_pmu_stop(event);
@@ -1388,7 +1398,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
+ cpuc->group_flag |= PERF_EVENT_TXN;
cpuc->n_txn = 0;
}
@@ -1401,7 +1411,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
+ cpuc->group_flag &= ~PERF_EVENT_TXN;
/*
* Truncate the collected events.
*/
@@ -1435,11 +1445,7 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
*/
memcpy(cpuc->assign, assign, n*sizeof(int));
- /*
- * Clear out the txn count so that ->cancel_txn() which gets
- * run after ->commit_txn() doesn't undo things.
- */
- cpuc->n_txn = 0;
+ cpuc->group_flag &= ~PERF_EVENT_TXN;
return 0;
}
@@ -1607,8 +1613,6 @@ static const struct stacktrace_ops backtrace_ops = {
.walk_stack = print_context_stack_bp,
};
-#include "../dumpstack.h"
-
static void
perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
@@ -1730,22 +1734,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
return entry;
}
-void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
-{
- regs->ip = ip;
- /*
- * perf_arch_fetch_caller_regs adds another call, we need to increment
- * the skip level
- */
- regs->bp = rewind_frame_pointer(skip + 1);
- regs->cs = __KERNEL_CS;
- /*
- * We abuse bit 3 to pass exact information, see perf_misc_flags
- * and the comment with PERF_EFLAGS_EXACT.
- */
- regs->flags = 0;
-}
-
unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
unsigned long ip;
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ae85d69644d1..febb12cea795 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -21,22 +21,36 @@ struct p4_event_bind {
char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */
};
-struct p4_cache_event_bind {
+struct p4_pebs_bind {
unsigned int metric_pebs;
unsigned int metric_vert;
};
-#define P4_GEN_CACHE_EVENT_BIND(name) \
- [P4_CACHE__##name] = { \
- .metric_pebs = P4_PEBS__##name, \
- .metric_vert = P4_VERT__##name, \
+/* it sets P4_PEBS_ENABLE_UOP_TAG as well */
+#define P4_GEN_PEBS_BIND(name, pebs, vert) \
+ [P4_PEBS_METRIC__##name] = { \
+ .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \
+ .metric_vert = vert, \
}
-static struct p4_cache_event_bind p4_cache_event_bind_map[] = {
- P4_GEN_CACHE_EVENT_BIND(1stl_cache_load_miss_retired),
- P4_GEN_CACHE_EVENT_BIND(2ndl_cache_load_miss_retired),
- P4_GEN_CACHE_EVENT_BIND(dtlb_load_miss_retired),
- P4_GEN_CACHE_EVENT_BIND(dtlb_store_miss_retired),
+/*
+ * note we have P4_PEBS_ENABLE_UOP_TAG always set here
+ *
+ * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of
+ * event configuration to find out which values are to be
+ * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT
+ * resgisters
+ */
+static struct p4_pebs_bind p4_pebs_bind_map[] = {
+ P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001),
+ P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001),
+ P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001),
+ P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002),
+ P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003),
+ P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010),
+ P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001),
+ P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001),
+ P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002),
};
/*
@@ -281,10 +295,10 @@ static struct p4_event_bind p4_event_bind_map[] = {
},
};
-#define P4_GEN_CACHE_EVENT(event, bit, cache_event) \
+#define P4_GEN_CACHE_EVENT(event, bit, metric) \
p4_config_pack_escr(P4_ESCR_EVENT(event) | \
P4_ESCR_EMASK_BIT(event, bit)) | \
- p4_config_pack_cccr(cache_event | \
+ p4_config_pack_cccr(metric | \
P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
static __initconst const u64 p4_hw_cache_event_ids
@@ -296,34 +310,34 @@ static __initconst const u64 p4_hw_cache_event_ids
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0,
[ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
- P4_CACHE__1stl_cache_load_miss_retired),
+ P4_PEBS_METRIC__1stl_cache_load_miss_retired),
},
},
[ C(LL ) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0,
[ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
- P4_CACHE__2ndl_cache_load_miss_retired),
+ P4_PEBS_METRIC__2ndl_cache_load_miss_retired),
},
},
[ C(DTLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0,
[ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
- P4_CACHE__dtlb_load_miss_retired),
+ P4_PEBS_METRIC__dtlb_load_miss_retired),
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0x0,
[ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
- P4_CACHE__dtlb_store_miss_retired),
+ P4_PEBS_METRIC__dtlb_store_miss_retired),
},
},
[ C(ITLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
- P4_CACHE__itlb_reference_hit),
+ P4_PEBS_METRIC__none),
[ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
- P4_CACHE__itlb_reference_miss),
+ P4_PEBS_METRIC__none),
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
@@ -414,11 +428,37 @@ static u64 p4_pmu_event_map(int hw_event)
return config;
}
+static int p4_validate_raw_event(struct perf_event *event)
+{
+ unsigned int v;
+
+ /* user data may have out-of-bound event index */
+ v = p4_config_unpack_event(event->attr.config);
+ if (v >= ARRAY_SIZE(p4_event_bind_map)) {
+ pr_warning("P4 PMU: Unknown event code: %d\n", v);
+ return -EINVAL;
+ }
+
+ /*
+ * it may have some screwed PEBS bits
+ */
+ if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) {
+ pr_warning("P4 PMU: PEBS are not supported yet\n");
+ return -EINVAL;
+ }
+ v = p4_config_unpack_metric(event->attr.config);
+ if (v >= ARRAY_SIZE(p4_pebs_bind_map)) {
+ pr_warning("P4 PMU: Unknown metric code: %d\n", v);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int p4_hw_config(struct perf_event *event)
{
int cpu = get_cpu();
int rc = 0;
- unsigned int evnt;
u32 escr, cccr;
/*
@@ -438,12 +478,9 @@ static int p4_hw_config(struct perf_event *event)
if (event->attr.type == PERF_TYPE_RAW) {
- /* user data may have out-of-bound event index */
- evnt = p4_config_unpack_event(event->attr.config);
- if (evnt >= ARRAY_SIZE(p4_event_bind_map)) {
- rc = -EINVAL;
+ rc = p4_validate_raw_event(event);
+ if (rc)
goto out;
- }
/*
* We don't control raw events so it's up to the caller
@@ -451,12 +488,15 @@ static int p4_hw_config(struct perf_event *event)
* on HT machine but allow HT-compatible specifics to be
* passed on)
*
+ * Note that for RAW events we allow user to use P4_CCCR_RESERVED
+ * bits since we keep additional info here (for cache events and etc)
+ *
* XXX: HT wide things should check perf_paranoid_cpu() &&
* CAP_SYS_ADMIN
*/
event->hw.config |= event->attr.config &
(p4_config_pack_escr(P4_ESCR_MASK_HT) |
- p4_config_pack_cccr(P4_CCCR_MASK_HT));
+ p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED));
}
rc = x86_setup_perfctr(event);
@@ -482,6 +522,29 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
return overflow;
}
+static void p4_pmu_disable_pebs(void)
+{
+ /*
+ * FIXME
+ *
+ * It's still allowed that two threads setup same cache
+ * events so we can't simply clear metrics until we knew
+ * noone is depending on us, so we need kind of counter
+ * for "ReplayEvent" users.
+ *
+ * What is more complex -- RAW events, if user (for some
+ * reason) will pass some cache event metric with improper
+ * event opcode -- it's fine from hardware point of view
+ * but completely nonsence from "meaning" of such action.
+ *
+ * So at moment let leave metrics turned on forever -- it's
+ * ok for now but need to be revisited!
+ *
+ * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0);
+ * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0);
+ */
+}
+
static inline void p4_pmu_disable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
@@ -507,6 +570,26 @@ static void p4_pmu_disable_all(void)
continue;
p4_pmu_disable_event(event);
}
+
+ p4_pmu_disable_pebs();
+}
+
+/* configuration must be valid */
+static void p4_pmu_enable_pebs(u64 config)
+{
+ struct p4_pebs_bind *bind;
+ unsigned int idx;
+
+ BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK);
+
+ idx = p4_config_unpack_metric(config);
+ if (idx == P4_PEBS_METRIC__none)
+ return;
+
+ bind = &p4_pebs_bind_map[idx];
+
+ (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
+ (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert);
}
static void p4_pmu_enable_event(struct perf_event *event)
@@ -515,9 +598,7 @@ static void p4_pmu_enable_event(struct perf_event *event)
int thread = p4_ht_config_thread(hwc->config);
u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
unsigned int idx = p4_config_unpack_event(hwc->config);
- unsigned int idx_cache = p4_config_unpack_cache_event(hwc->config);
struct p4_event_bind *bind;
- struct p4_cache_event_bind *bind_cache;
u64 escr_addr, cccr;
bind = &p4_event_bind_map[idx];
@@ -537,16 +618,10 @@ static void p4_pmu_enable_event(struct perf_event *event)
cccr = p4_config_unpack_cccr(hwc->config);
/*
- * it could be Cache event so that we need to
- * set metrics into additional MSRs
+ * it could be Cache event so we need to write metrics
+ * into additional MSRs
*/
- BUILD_BUG_ON(P4_CACHE__MAX > P4_CCCR_CACHE_OPS_MASK);
- if (idx_cache > P4_CACHE__NONE &&
- idx_cache < ARRAY_SIZE(p4_cache_event_bind_map)) {
- bind_cache = &p4_cache_event_bind_map[idx_cache];
- (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind_cache->metric_pebs);
- (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind_cache->metric_vert);
- }
+ p4_pmu_enable_pebs(hwc->config);
(void)checking_wrmsrl(escr_addr, escr_conf);
(void)checking_wrmsrl(hwc->config_base + hwc->idx,
@@ -581,6 +656,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
cpuc = &__get_cpu_var(cpu_hw_events);
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ int overflow;
if (!test_bit(idx, cpuc->active_mask))
continue;
@@ -591,12 +667,14 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
WARN_ON_ONCE(hwc->idx != idx);
/* it might be unflagged overflow */
- handled = p4_pmu_clear_cccr_ovf(hwc);
+ overflow = p4_pmu_clear_cccr_ovf(hwc);
val = x86_perf_event_update(event);
- if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
+ if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
continue;
+ handled += overflow;
+
/* event overflow for sure */
data.period = event->hw.last_period;
@@ -612,7 +690,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
inc_irq_stat(apic_perf_irqs);
}
- return handled;
+ return handled > 0;
}
/*
@@ -829,6 +907,15 @@ static __initconst const struct x86_pmu p4_pmu = {
.max_period = (1ULL << 39) - 1,
.hw_config = p4_hw_config,
.schedule_events = p4_pmu_schedule_events,
+ /*
+ * This handles erratum N15 in intel doc 249199-029,
+ * the counter may not be updated correctly on write
+ * so we need a second write operation to do the trick
+ * (the official workaround didn't work)
+ *
+ * the former idea is taken from OProfile code
+ */
+ .perfctr_second_write = 1,
};
static __init int p4_pmu_init(void)
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
new file mode 100644
index 000000000000..34b4dad6f0b8
--- /dev/null
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -0,0 +1,63 @@
+/*
+ * Routines to indentify additional cpu features that are scattered in
+ * cpuid space.
+ */
+#include <linux/cpu.h>
+
+#include <asm/pat.h>
+#include <asm/processor.h>
+
+#include <asm/apic.h>
+
+struct cpuid_bit {
+ u16 feature;
+ u8 reg;
+ u8 bit;
+ u32 level;
+ u32 sub_leaf;
+};
+
+enum cpuid_regs {
+ CR_EAX = 0,
+ CR_ECX,
+ CR_EDX,
+ CR_EBX
+};
+
+void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
+{
+ u32 max_level;
+ u32 regs[4];
+ const struct cpuid_bit *cb;
+
+ static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
+ { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 },
+ { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 },
+ { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
+ { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 },
+ { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
+ { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
+ { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
+ { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
+ { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 },
+ { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
+ { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
+ { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 },
+ { 0, 0, 0, 0, 0 }
+ };
+
+ for (cb = cpuid_bits; cb->feature; cb++) {
+
+ /* Verify that the level is valid */
+ max_level = cpuid_eax(cb->level & 0xffff0000);
+ if (max_level < cb->level ||
+ max_level > (cb->level | 0xffff))
+ continue;
+
+ cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX],
+ &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]);
+
+ if (regs[cb->reg] & (1 << cb->bit))
+ set_cpu_cap(c, cb->feature);
+ }
+}
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/topology.c
index 10fa5684a662..4397e987a1cf 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -1,62 +1,14 @@
/*
- * Routines to indentify additional cpu features that are scattered in
- * cpuid space.
+ * Check for extended topology enumeration cpuid leaf 0xb and if it
+ * exists, use it for populating initial_apicid and cpu topology
+ * detection.
*/
-#include <linux/cpu.h>
+#include <linux/cpu.h>
+#include <asm/apic.h>
#include <asm/pat.h>
#include <asm/processor.h>
-#include <asm/apic.h>
-
-struct cpuid_bit {
- u16 feature;
- u8 reg;
- u8 bit;
- u32 level;
-};
-
-enum cpuid_regs {
- CR_EAX = 0,
- CR_ECX,
- CR_EDX,
- CR_EBX
-};
-
-void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
-{
- u32 max_level;
- u32 regs[4];
- const struct cpuid_bit *cb;
-
- static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
- { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
- { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
- { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 },
- { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 },
- { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
- { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
- { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
- { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
- { 0, 0, 0, 0 }
- };
-
- for (cb = cpuid_bits; cb->feature; cb++) {
-
- /* Verify that the level is valid */
- max_level = cpuid_eax(cb->level & 0xffff0000);
- if (max_level < cb->level ||
- max_level > (cb->level | 0xffff))
- continue;
-
- cpuid(cb->level, &regs[CR_EAX], &regs[CR_EBX],
- &regs[CR_ECX], &regs[CR_EDX]);
-
- if (regs[cb->reg] & (1 << cb->bit))
- set_cpu_cap(c, cb->feature);
- }
-}
-
/* leaf 0xb SMT level */
#define SMT_LEVEL 0
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index b9d1ff588445..227b0448960d 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -51,7 +51,7 @@ static inline int __vmware_platform(void)
static unsigned long vmware_get_tsc_khz(void)
{
- uint64_t tsc_hz;
+ uint64_t tsc_hz, lpj;
uint32_t eax, ebx, ecx, edx;
VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
@@ -62,6 +62,13 @@ static unsigned long vmware_get_tsc_khz(void)
printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
(unsigned long) tsc_hz / 1000,
(unsigned long) tsc_hz % 1000);
+
+ if (!preset_lpj) {
+ lpj = ((u64)tsc_hz * 1000);
+ do_div(lpj, HZ);
+ preset_lpj = lpj;
+ }
+
return tsc_hz;
}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index ebd4c51d096a..764c7c2b1811 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -28,6 +28,8 @@
#include <asm/reboot.h>
#include <asm/virtext.h>
+int in_crash_kexec;
+
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
static void kdump_nmi_callback(int cpu, struct die_args *args)
@@ -61,6 +63,7 @@ static void kdump_nmi_callback(int cpu, struct die_args *args)
static void kdump_nmi_shootdown_cpus(void)
{
+ in_crash_kexec = 1;
nmi_shootdown_cpus(kdump_nmi_callback);
disable_local_APIC();
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index c89a386930b7..6e8752c1bd52 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -18,7 +18,6 @@
#include <asm/stacktrace.h>
-#include "dumpstack.h"
int panic_on_unrecovered_nmi;
int panic_on_io_nmi;
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
deleted file mode 100644
index e1a93be4fd44..000000000000
--- a/arch/x86/kernel/dumpstack.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (C) 1991, 1992 Linus Torvalds
- * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
- */
-
-#ifndef DUMPSTACK_H
-#define DUMPSTACK_H
-
-#ifdef CONFIG_X86_32
-#define STACKSLOTS_PER_LINE 8
-#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
-#else
-#define STACKSLOTS_PER_LINE 4
-#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
-#endif
-
-#include <linux/uaccess.h>
-
-extern void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl);
-
-extern void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl);
-
-extern unsigned int code_bytes;
-
-/* The form of the top of the frame on the stack */
-struct stack_frame {
- struct stack_frame *next_frame;
- unsigned long return_address;
-};
-
-struct stack_frame_ia32 {
- u32 next_frame;
- u32 return_address;
-};
-
-static inline unsigned long rewind_frame_pointer(int n)
-{
- struct stack_frame *frame;
-
- get_bp(frame);
-
-#ifdef CONFIG_FRAME_POINTER
- while (n--) {
- if (probe_kernel_address(&frame->next_frame, frame))
- break;
- }
-#endif
-
- return (unsigned long)frame;
-}
-
-#endif /* DUMPSTACK_H */
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 11540a189d93..0f6376ffa2d9 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -16,8 +16,6 @@
#include <asm/stacktrace.h>
-#include "dumpstack.h"
-
void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 272c9f1f05f3..57a21f11c791 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -16,7 +16,6 @@
#include <asm/stacktrace.h>
-#include "dumpstack.h"
#define N_EXCEPTION_STACKS_END \
(N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index cd49141cf153..227d00920d2f 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -611,14 +611,14 @@ ldt_ss:
* compensating for the offset by changing to the ESPFIX segment with
* a base address that matches for the difference.
*/
+#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
mov %esp, %edx /* load kernel esp */
mov PT_OLDESP(%esp), %eax /* load userspace esp */
mov %dx, %ax /* eax: new kernel esp */
sub %eax, %edx /* offset (low word is 0) */
- PER_CPU(gdt_page, %ebx)
shr $16, %edx
- mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */
- mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */
+ mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
+ mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
pushl $__ESPFIX_SS
CFI_ADJUST_CFA_OFFSET 4
push %eax /* new kernel esp */
@@ -791,9 +791,8 @@ ptregs_clone:
* normal stack and adjusts ESP with the matching offset.
*/
/* fixup the stack */
- PER_CPU(gdt_page, %ebx)
- mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */
- mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */
+ mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
+ mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
shl $16, %eax
addl %esp, %eax /* the adjusted stack pointer */
pushl $__KERNEL_DS
@@ -914,7 +913,7 @@ ENTRY(simd_coprocessor_error)
.balign 4
.long 661b
.long 663f
- .byte X86_FEATURE_XMM
+ .word X86_FEATURE_XMM
.byte 662b-661b
.byte 664f-663f
.previous
@@ -1166,6 +1165,9 @@ ENTRY(xen_failsafe_callback)
.previous
ENDPROC(xen_failsafe_callback)
+BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
+ xen_evtchn_do_upcall)
+
#endif /* CONFIG_XEN */
#ifdef CONFIG_FUNCTION_TRACER
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 4db7c4d12ffa..17be5ec7cbba 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1065,6 +1065,7 @@ ENTRY(\sym)
END(\sym)
.endm
+#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
.macro paranoidzeroentry_ist sym do_sym ist
ENTRY(\sym)
INTR_FRAME
@@ -1076,10 +1077,9 @@ ENTRY(\sym)
TRACE_IRQS_OFF
movq %rsp,%rdi /* pt_regs pointer */
xorl %esi,%esi /* no error code */
- PER_CPU(init_tss, %r12)
- subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
+ subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
call \do_sym
- addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
+ addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
jmp paranoid_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
END(\sym)
@@ -1185,13 +1185,13 @@ END(kernel_thread_helper)
* execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
*
* C extern interface:
- * extern long execve(char *name, char **argv, char **envp)
+ * extern long execve(const char *name, char **argv, char **envp)
*
* asm input arguments:
* rdi: name, rsi: argv, rdx: envp
*
* We want to fallback into:
- * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
+ * extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs)
*
* do_sys_execve asm fallback arguments:
* rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
@@ -1329,6 +1329,9 @@ ENTRY(xen_failsafe_callback)
CFI_ENDPROC
END(xen_failsafe_callback)
+apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
+ xen_hvm_callback_vector xen_evtchn_do_upcall
+
#endif /* CONFIG_XEN */
/*
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index b2e246037392..784360c0625c 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -20,7 +20,7 @@
static void __init i386_default_early_setup(void)
{
- /* Initilize 32bit specific setup functions */
+ /* Initialize 32bit specific setup functions */
x86_init.resources.probe_roms = probe_roms;
x86_init.resources.reserve_resources = i386_reserve_resources;
x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 37c3d4b17d85..ff4c453e13f3 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -131,6 +131,12 @@ ENTRY(startup_32)
movsl
1:
+#ifdef CONFIG_OLPC_OPENFIRMWARE
+ /* save OFW's pgdir table for later use when calling into OFW */
+ movl %cr3, %eax
+ movl %eax, pa(olpc_ofw_pgd)
+#endif
+
#ifdef CONFIG_PARAVIRT
/* This is can only trip for a broken bootloader... */
cmpw $0x207, pa(boot_params + BP_version)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 3d1e6f16b7a6..239046bd447f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -234,9 +234,8 @@ ENTRY(secondary_startup_64)
* init data section till per cpu areas are set up.
*/
movl $MSR_GS_BASE,%ecx
- movq initial_gs(%rip),%rax
- movq %rax,%rdx
- shrq $32,%rdx
+ movl initial_gs(%rip),%eax
+ movl initial_gs+4(%rip),%edx
wrmsr
/* esi is pointer to real mode structure with interesting info.
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ba390d731175..351f9c0fea1f 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -16,7 +16,6 @@
#include <asm/hpet.h>
#define HPET_MASK CLOCKSOURCE_MASK(32)
-#define HPET_SHIFT 22
/* FSEC = 10^-15
NSEC = 10^-9 */
@@ -583,7 +582,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
* scaled math multiplication factor for nanosecond to hpet tick
* conversion.
*/
- hpet_freq = 1000000000000000ULL;
+ hpet_freq = FSEC_PER_SEC;
do_div(hpet_freq, hpet_period);
evt->mult = div_sc((unsigned long) hpet_freq,
NSEC_PER_SEC, evt->shift);
@@ -787,7 +786,6 @@ static struct clocksource clocksource_hpet = {
.rating = 250,
.read = read_hpet,
.mask = HPET_MASK,
- .shift = HPET_SHIFT,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
.resume = hpet_resume_counter,
#ifdef CONFIG_X86_64
@@ -798,6 +796,7 @@ static struct clocksource clocksource_hpet = {
static int hpet_clocksource_register(void)
{
u64 start, now;
+ u64 hpet_freq;
cycle_t t1;
/* Start the counter */
@@ -832,9 +831,15 @@ static int hpet_clocksource_register(void)
* mult = (hpet_period * 2^shift)/10^6
* mult = (hpet_period << shift)/FSEC_PER_NSEC
*/
- clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT);
- clocksource_register(&clocksource_hpet);
+ /* Need to convert hpet_period (fsec/cyc) to cyc/sec:
+ *
+ * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc)
+ * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period
+ */
+ hpet_freq = FSEC_PER_SEC;
+ do_div(hpet_freq, hpet_period);
+ clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
return 0;
}
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index a8f1b803d2fd..a474ec37c32f 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -208,6 +208,9 @@ int arch_bp_generic_fields(int x86_len, int x86_type,
{
/* Len */
switch (x86_len) {
+ case X86_BREAKPOINT_LEN_X:
+ *gen_len = sizeof(long);
+ break;
case X86_BREAKPOINT_LEN_1:
*gen_len = HW_BREAKPOINT_LEN_1;
break;
@@ -251,6 +254,29 @@ static int arch_build_bp_info(struct perf_event *bp)
info->address = bp->attr.bp_addr;
+ /* Type */
+ switch (bp->attr.bp_type) {
+ case HW_BREAKPOINT_W:
+ info->type = X86_BREAKPOINT_WRITE;
+ break;
+ case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+ info->type = X86_BREAKPOINT_RW;
+ break;
+ case HW_BREAKPOINT_X:
+ info->type = X86_BREAKPOINT_EXECUTE;
+ /*
+ * x86 inst breakpoints need to have a specific undefined len.
+ * But we still need to check userspace is not trying to setup
+ * an unsupported length, to get a range breakpoint for example.
+ */
+ if (bp->attr.bp_len == sizeof(long)) {
+ info->len = X86_BREAKPOINT_LEN_X;
+ return 0;
+ }
+ default:
+ return -EINVAL;
+ }
+
/* Len */
switch (bp->attr.bp_len) {
case HW_BREAKPOINT_LEN_1:
@@ -271,21 +297,6 @@ static int arch_build_bp_info(struct perf_event *bp)
return -EINVAL;
}
- /* Type */
- switch (bp->attr.bp_type) {
- case HW_BREAKPOINT_W:
- info->type = X86_BREAKPOINT_WRITE;
- break;
- case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
- info->type = X86_BREAKPOINT_RW;
- break;
- case HW_BREAKPOINT_X:
- info->type = X86_BREAKPOINT_EXECUTE;
- break;
- default:
- return -EINVAL;
- }
-
return 0;
}
/*
@@ -305,6 +316,9 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
ret = -EINVAL;
switch (info->len) {
+ case X86_BREAKPOINT_LEN_X:
+ align = sizeof(long) -1;
+ break;
case X86_BREAKPOINT_LEN_1:
align = 0;
break;
@@ -466,6 +480,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
perf_bp_event(bp, args->regs);
+ /*
+ * Set up resume flag to avoid breakpoint recursion when
+ * returning back to origin.
+ */
+ if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE)
+ args->regs->flags |= X86_EFLAGS_RF;
+
rcu_read_unlock();
}
/*
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 86cef6b32253..1f11f5ce668f 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -59,18 +59,18 @@ void __cpuinit mxcsr_feature_mask_init(void)
stts();
}
-void __cpuinit init_thread_xstate(void)
+static void __cpuinit init_thread_xstate(void)
{
+ /*
+ * Note that xstate_size might be overwriten later during
+ * xsave_init().
+ */
+
if (!HAVE_HWFP) {
xstate_size = sizeof(struct i387_soft_struct);
return;
}
- if (cpu_has_xsave) {
- xsave_cntxt_init();
- return;
- }
-
if (cpu_has_fxsr)
xstate_size = sizeof(struct i387_fxsave_struct);
#ifdef CONFIG_X86_32
@@ -84,6 +84,7 @@ void __cpuinit init_thread_xstate(void)
* Called at bootup to set up the initial FPU state that is later cloned
* into all processes.
*/
+
void __cpuinit fpu_init(void)
{
unsigned long oldcr0 = read_cr0();
@@ -93,21 +94,26 @@ void __cpuinit fpu_init(void)
write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
- /*
- * Boot processor to setup the FP and extended state context info.
- */
if (!smp_processor_id())
init_thread_xstate();
- xsave_init();
mxcsr_feature_mask_init();
/* clean state in init */
current_thread_info()->status = 0;
clear_used_math();
}
-#endif /* CONFIG_X86_64 */
-static void fpu_finit(struct fpu *fpu)
+#else /* CONFIG_X86_64 */
+
+void __cpuinit fpu_init(void)
+{
+ if (!smp_processor_id())
+ init_thread_xstate();
+}
+
+#endif /* CONFIG_X86_32 */
+
+void fpu_finit(struct fpu *fpu)
{
#ifdef CONFIG_X86_32
if (!HAVE_HWFP) {
@@ -132,6 +138,7 @@ static void fpu_finit(struct fpu *fpu)
fp->fos = 0xffff0000u;
}
}
+EXPORT_SYMBOL_GPL(fpu_finit);
/*
* The _current_ task is using the FPU for the first time
@@ -190,6 +197,8 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
if (ret)
return ret;
+ sanitize_i387_state(target);
+
return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
&target->thread.fpu.state->fxsave, 0, -1);
}
@@ -207,6 +216,8 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
if (ret)
return ret;
+ sanitize_i387_state(target);
+
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
&target->thread.fpu.state->fxsave, 0, -1);
@@ -446,6 +457,8 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
-1);
}
+ sanitize_i387_state(target);
+
if (kbuf && pos == 0 && count == sizeof(env)) {
convert_from_fxsr(kbuf, target);
return 0;
@@ -467,6 +480,8 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
if (ret)
return ret;
+ sanitize_i387_state(target);
+
if (!HAVE_HWFP)
return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
@@ -533,6 +548,9 @@ static int save_i387_xsave(void __user *buf)
struct _fpstate_ia32 __user *fx = buf;
int err = 0;
+
+ sanitize_i387_state(tsk);
+
/*
* For legacy compatible, we always set FP/SSE bits in the bit
* vector while saving the state to the user context.
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 01ab17ae2ae7..ef10940e1af0 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -49,55 +49,94 @@
#include <asm/system.h>
#include <asm/apic.h>
-/**
- * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs
- * @gdb_regs: A pointer to hold the registers in the order GDB wants.
- * @regs: The &struct pt_regs of the current process.
- *
- * Convert the pt_regs in @regs into the format for registers that
- * GDB expects, stored in @gdb_regs.
- */
-void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
{
-#ifndef CONFIG_X86_32
- u32 *gdb_regs32 = (u32 *)gdb_regs;
+#ifdef CONFIG_X86_32
+ { "ax", 4, offsetof(struct pt_regs, ax) },
+ { "cx", 4, offsetof(struct pt_regs, cx) },
+ { "dx", 4, offsetof(struct pt_regs, dx) },
+ { "bx", 4, offsetof(struct pt_regs, bx) },
+ { "sp", 4, offsetof(struct pt_regs, sp) },
+ { "bp", 4, offsetof(struct pt_regs, bp) },
+ { "si", 4, offsetof(struct pt_regs, si) },
+ { "di", 4, offsetof(struct pt_regs, di) },
+ { "ip", 4, offsetof(struct pt_regs, ip) },
+ { "flags", 4, offsetof(struct pt_regs, flags) },
+ { "cs", 4, offsetof(struct pt_regs, cs) },
+ { "ss", 4, offsetof(struct pt_regs, ss) },
+ { "ds", 4, offsetof(struct pt_regs, ds) },
+ { "es", 4, offsetof(struct pt_regs, es) },
+ { "fs", 4, -1 },
+ { "gs", 4, -1 },
+#else
+ { "ax", 8, offsetof(struct pt_regs, ax) },
+ { "bx", 8, offsetof(struct pt_regs, bx) },
+ { "cx", 8, offsetof(struct pt_regs, cx) },
+ { "dx", 8, offsetof(struct pt_regs, dx) },
+ { "si", 8, offsetof(struct pt_regs, dx) },
+ { "di", 8, offsetof(struct pt_regs, di) },
+ { "bp", 8, offsetof(struct pt_regs, bp) },
+ { "sp", 8, offsetof(struct pt_regs, sp) },
+ { "r8", 8, offsetof(struct pt_regs, r8) },
+ { "r9", 8, offsetof(struct pt_regs, r9) },
+ { "r10", 8, offsetof(struct pt_regs, r10) },
+ { "r11", 8, offsetof(struct pt_regs, r11) },
+ { "r12", 8, offsetof(struct pt_regs, r12) },
+ { "r13", 8, offsetof(struct pt_regs, r13) },
+ { "r14", 8, offsetof(struct pt_regs, r14) },
+ { "r15", 8, offsetof(struct pt_regs, r15) },
+ { "ip", 8, offsetof(struct pt_regs, ip) },
+ { "flags", 4, offsetof(struct pt_regs, flags) },
+ { "cs", 4, offsetof(struct pt_regs, cs) },
+ { "ss", 4, offsetof(struct pt_regs, ss) },
#endif
- gdb_regs[GDB_AX] = regs->ax;
- gdb_regs[GDB_BX] = regs->bx;
- gdb_regs[GDB_CX] = regs->cx;
- gdb_regs[GDB_DX] = regs->dx;
- gdb_regs[GDB_SI] = regs->si;
- gdb_regs[GDB_DI] = regs->di;
- gdb_regs[GDB_BP] = regs->bp;
- gdb_regs[GDB_PC] = regs->ip;
+};
+
+int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
+{
+ if (
#ifdef CONFIG_X86_32
- gdb_regs[GDB_PS] = regs->flags;
- gdb_regs[GDB_DS] = regs->ds;
- gdb_regs[GDB_ES] = regs->es;
- gdb_regs[GDB_CS] = regs->cs;
- gdb_regs[GDB_FS] = 0xFFFF;
- gdb_regs[GDB_GS] = 0xFFFF;
- if (user_mode_vm(regs)) {
- gdb_regs[GDB_SS] = regs->ss;
- gdb_regs[GDB_SP] = regs->sp;
- } else {
- gdb_regs[GDB_SS] = __KERNEL_DS;
- gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
+ regno == GDB_SS || regno == GDB_FS || regno == GDB_GS ||
+#endif
+ regno == GDB_SP || regno == GDB_ORIG_AX)
+ return 0;
+
+ if (dbg_reg_def[regno].offset != -1)
+ memcpy((void *)regs + dbg_reg_def[regno].offset, mem,
+ dbg_reg_def[regno].size);
+ return 0;
+}
+
+char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
+{
+ if (regno == GDB_ORIG_AX) {
+ memcpy(mem, &regs->orig_ax, sizeof(regs->orig_ax));
+ return "orig_ax";
}
-#else
- gdb_regs[GDB_R8] = regs->r8;
- gdb_regs[GDB_R9] = regs->r9;
- gdb_regs[GDB_R10] = regs->r10;
- gdb_regs[GDB_R11] = regs->r11;
- gdb_regs[GDB_R12] = regs->r12;
- gdb_regs[GDB_R13] = regs->r13;
- gdb_regs[GDB_R14] = regs->r14;
- gdb_regs[GDB_R15] = regs->r15;
- gdb_regs32[GDB_PS] = regs->flags;
- gdb_regs32[GDB_CS] = regs->cs;
- gdb_regs32[GDB_SS] = regs->ss;
- gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
+ if (regno >= DBG_MAX_REG_NUM || regno < 0)
+ return NULL;
+
+ if (dbg_reg_def[regno].offset != -1)
+ memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
+ dbg_reg_def[regno].size);
+
+ switch (regno) {
+#ifdef CONFIG_X86_32
+ case GDB_SS:
+ if (!user_mode_vm(regs))
+ *(unsigned long *)mem = __KERNEL_DS;
+ break;
+ case GDB_SP:
+ if (!user_mode_vm(regs))
+ *(unsigned long *)mem = kernel_stack_pointer(regs);
+ break;
+ case GDB_GS:
+ case GDB_FS:
+ *(unsigned long *)mem = 0xFFFF;
+ break;
#endif
+ }
+ return dbg_reg_def[regno].name;
}
/**
@@ -150,54 +189,13 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
gdb_regs[GDB_SP] = p->thread.sp;
}
-/**
- * gdb_regs_to_pt_regs - Convert GDB regs to ptrace regs.
- * @gdb_regs: A pointer to hold the registers we've received from GDB.
- * @regs: A pointer to a &struct pt_regs to hold these values in.
- *
- * Convert the GDB regs in @gdb_regs into the pt_regs, and store them
- * in @regs.
- */
-void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
-{
-#ifndef CONFIG_X86_32
- u32 *gdb_regs32 = (u32 *)gdb_regs;
-#endif
- regs->ax = gdb_regs[GDB_AX];
- regs->bx = gdb_regs[GDB_BX];
- regs->cx = gdb_regs[GDB_CX];
- regs->dx = gdb_regs[GDB_DX];
- regs->si = gdb_regs[GDB_SI];
- regs->di = gdb_regs[GDB_DI];
- regs->bp = gdb_regs[GDB_BP];
- regs->ip = gdb_regs[GDB_PC];
-#ifdef CONFIG_X86_32
- regs->flags = gdb_regs[GDB_PS];
- regs->ds = gdb_regs[GDB_DS];
- regs->es = gdb_regs[GDB_ES];
- regs->cs = gdb_regs[GDB_CS];
-#else
- regs->r8 = gdb_regs[GDB_R8];
- regs->r9 = gdb_regs[GDB_R9];
- regs->r10 = gdb_regs[GDB_R10];
- regs->r11 = gdb_regs[GDB_R11];
- regs->r12 = gdb_regs[GDB_R12];
- regs->r13 = gdb_regs[GDB_R13];
- regs->r14 = gdb_regs[GDB_R14];
- regs->r15 = gdb_regs[GDB_R15];
- regs->flags = gdb_regs32[GDB_PS];
- regs->cs = gdb_regs32[GDB_CS];
- regs->ss = gdb_regs32[GDB_SS];
-#endif
-}
-
static struct hw_breakpoint {
unsigned enabled;
unsigned long addr;
int len;
int type;
struct perf_event **pev;
-} breakinfo[4];
+} breakinfo[HBP_NUM];
static unsigned long early_dr7;
@@ -205,7 +203,7 @@ static void kgdb_correct_hw_break(void)
{
int breakno;
- for (breakno = 0; breakno < 4; breakno++) {
+ for (breakno = 0; breakno < HBP_NUM; breakno++) {
struct perf_event *bp;
struct arch_hw_breakpoint *info;
int val;
@@ -292,10 +290,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
{
int i;
- for (i = 0; i < 4; i++)
+ for (i = 0; i < HBP_NUM; i++)
if (breakinfo[i].addr == addr && breakinfo[i].enabled)
break;
- if (i == 4)
+ if (i == HBP_NUM)
return -1;
if (hw_break_release_slot(i)) {
@@ -313,7 +311,7 @@ static void kgdb_remove_all_hw_break(void)
int cpu = raw_smp_processor_id();
struct perf_event *bp;
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < HBP_NUM; i++) {
if (!breakinfo[i].enabled)
continue;
bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
@@ -333,10 +331,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
{
int i;
- for (i = 0; i < 4; i++)
+ for (i = 0; i < HBP_NUM; i++)
if (!breakinfo[i].enabled)
break;
- if (i == 4)
+ if (i == HBP_NUM)
return -1;
switch (bptype) {
@@ -397,7 +395,7 @@ void kgdb_disable_hw_debug(struct pt_regs *regs)
/* Disable hardware debugging while we are in kgdb: */
set_debugreg(0UL, 7);
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < HBP_NUM; i++) {
if (!breakinfo[i].enabled)
continue;
if (dbg_is_early) {
@@ -458,7 +456,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
{
unsigned long addr;
char *ptr;
- int newPC;
switch (remcomInBuffer[0]) {
case 'c':
@@ -469,8 +466,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
linux_regs->ip = addr;
case 'D':
case 'k':
- newPC = linux_regs->ip;
-
/* clear the trace bit */
linux_regs->flags &= ~X86_EFLAGS_TF;
atomic_set(&kgdb_cpu_doing_single_step, -1);
@@ -645,7 +640,7 @@ void kgdb_arch_late(void)
attr.bp_len = HW_BREAKPOINT_LEN_1;
attr.bp_type = HW_BREAKPOINT_W;
attr.disabled = 1;
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < HBP_NUM; i++) {
if (breakinfo[i].pev)
continue;
breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 675879b65ce6..1bfb6cf4dd55 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -126,16 +126,22 @@ static void __kprobes synthesize_reljump(void *from, void *to)
}
/*
- * Check for the REX prefix which can only exist on X86_64
- * X86_32 always returns 0
+ * Skip the prefixes of the instruction.
*/
-static int __kprobes is_REX_prefix(kprobe_opcode_t *insn)
+static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
{
+ insn_attr_t attr;
+
+ attr = inat_get_opcode_attribute((insn_byte_t)*insn);
+ while (inat_is_legacy_prefix(attr)) {
+ insn++;
+ attr = inat_get_opcode_attribute((insn_byte_t)*insn);
+ }
#ifdef CONFIG_X86_64
- if ((*insn & 0xf0) == 0x40)
- return 1;
+ if (inat_is_rex_prefix(attr))
+ insn++;
#endif
- return 0;
+ return insn;
}
/*
@@ -272,6 +278,9 @@ static int __kprobes can_probe(unsigned long paddr)
*/
static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
{
+ /* Skip prefixes */
+ insn = skip_prefixes(insn);
+
switch (*insn) {
case 0xfa: /* cli */
case 0xfb: /* sti */
@@ -280,13 +289,6 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
return 1;
}
- /*
- * on X86_64, 0x40-0x4f are REX prefixes so we need to look
- * at the next byte instead.. but of course not recurse infinitely
- */
- if (is_REX_prefix(insn))
- return is_IF_modifier(++insn);
-
return 0;
}
@@ -803,9 +805,8 @@ static void __kprobes resume_execution(struct kprobe *p,
unsigned long orig_ip = (unsigned long)p->addr;
kprobe_opcode_t *insn = p->ainsn.insn;
- /*skip the REX prefix*/
- if (is_REX_prefix(insn))
- insn++;
+ /* Skip prefixes */
+ insn = skip_prefixes(insn);
regs->flags &= ~X86_EFLAGS_TF;
switch (*insn) {
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index d86dbf7e54be..d7b6f7fb4fec 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -274,6 +274,18 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { }
+static void __init smp_register_lapic_address(unsigned long address)
+{
+ mp_lapic_addr = address;
+
+ set_fixmap_nocache(FIX_APIC_BASE, address);
+ if (boot_cpu_physical_apicid == -1U) {
+ boot_cpu_physical_apicid = read_apic_id();
+ apic_version[boot_cpu_physical_apicid] =
+ GET_APIC_VERSION(apic_read(APIC_LVR));
+ }
+}
+
static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
{
char str[16];
@@ -295,6 +307,10 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
if (early)
return 1;
+ /* Initialize the lapic mapping */
+ if (!acpi_lapic)
+ smp_register_lapic_address(mpc->lapic);
+
if (mpc->oemptr)
x86_init.mpparse.smp_read_mpc_oem(mpc);
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 5915e0b33303..79ae68154e87 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -25,8 +25,34 @@
#include <asm/i8259.h>
#include <asm/apb_timer.h>
+/*
+ * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock,
+ * cmdline option x86_mrst_timer can be used to override the configuration
+ * to prefer one or the other.
+ * at runtime, there are basically three timer configurations:
+ * 1. per cpu apbt clock only
+ * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only
+ * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast.
+ *
+ * by default (without cmdline option), platform code first detects cpu type
+ * to see if we are on lincroft or penwell, then set up both lapic or apbt
+ * clocks accordingly.
+ * i.e. by default, medfield uses configuration #2, moorestown uses #1.
+ * config #3 is supported but not recommended on medfield.
+ *
+ * rating and feature summary:
+ * lapic (with C3STOP) --------- 100
+ * apbt (always-on) ------------ 110
+ * lapic (always-on,ARAT) ------ 150
+ */
+
+__cpuinitdata enum mrst_timer_options mrst_timer_options;
+
static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
+enum mrst_cpu_type __mrst_cpu_chip;
+EXPORT_SYMBOL_GPL(__mrst_cpu_chip);
+
int sfi_mtimer_num;
struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
@@ -167,18 +193,6 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
return 0;
}
-/*
- * the secondary clock in Moorestown can be APBT or LAPIC clock, default to
- * APBT but cmdline option can also override it.
- */
-static void __cpuinit mrst_setup_secondary_clock(void)
-{
- /* restore default lapic clock if disabled by cmdline */
- if (disable_apbt_percpu)
- return setup_secondary_APIC_clock();
- apbt_setup_secondary_clock();
-}
-
static unsigned long __init mrst_calibrate_tsc(void)
{
unsigned long flags, fast_calibrate;
@@ -195,6 +209,21 @@ static unsigned long __init mrst_calibrate_tsc(void)
void __init mrst_time_init(void)
{
+ switch (mrst_timer_options) {
+ case MRST_TIMER_APBT_ONLY:
+ break;
+ case MRST_TIMER_LAPIC_APBT:
+ x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
+ x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
+ break;
+ default:
+ if (!boot_cpu_has(X86_FEATURE_ARAT))
+ break;
+ x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
+ x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
+ return;
+ }
+ /* we need at least one APB timer */
sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
pre_init_apic_IRQ0();
apbt_time_init();
@@ -205,16 +234,21 @@ void __init mrst_rtc_init(void)
sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
}
-/*
- * if we use per cpu apb timer, the bootclock already setup. if we use lapic
- * timer and one apbt timer for broadcast, we need to set up lapic boot clock.
- */
-static void __init mrst_setup_boot_clock(void)
+void __cpuinit mrst_arch_setup(void)
{
- pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu);
- if (disable_apbt_percpu)
- setup_boot_APIC_clock();
-};
+ if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
+ __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
+ else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26)
+ __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
+ else {
+ pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n",
+ boot_cpu_data.x86, boot_cpu_data.x86_model);
+ __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
+ }
+ pr_debug("Moorestown CPU %s identified\n",
+ (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ?
+ "Lincroft" : "Penwell");
+}
/* MID systems don't have i8042 controller */
static int mrst_i8042_detect(void)
@@ -232,11 +266,13 @@ void __init x86_mrst_early_setup(void)
x86_init.resources.reserve_resources = x86_init_noop;
x86_init.timers.timer_init = mrst_time_init;
- x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock;
+ x86_init.timers.setup_percpu_clockev = x86_init_noop;
x86_init.irqs.pre_vector_init = x86_init_noop;
- x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock;
+ x86_init.oem.arch_setup = mrst_arch_setup;
+
+ x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock;
x86_platform.calibrate_tsc = mrst_calibrate_tsc;
x86_platform.i8042_detect = mrst_i8042_detect;
@@ -250,3 +286,26 @@ void __init x86_mrst_early_setup(void)
x86_init.mpparse.get_smp_config = x86_init_uint_noop;
}
+
+/*
+ * if user does not want to use per CPU apb timer, just give it a lower rating
+ * than local apic timer and skip the late per cpu timer init.
+ */
+static inline int __init setup_x86_mrst_timer(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (strcmp("apbt_only", arg) == 0)
+ mrst_timer_options = MRST_TIMER_APBT_ONLY;
+ else if (strcmp("lapic_and_apbt", arg) == 0)
+ mrst_timer_options = MRST_TIMER_LAPIC_APBT;
+ else {
+ pr_warning("X86 MRST timer option %s not recognised"
+ " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
+ arg);
+ return -EINVAL;
+ }
+ return 0;
+}
+__setup("x86_mrst_timer=", setup_x86_mrst_timer);
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 8297160c41b3..0e0cdde519be 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -21,10 +21,7 @@
#include <asm/geode.h>
#include <asm/setup.h>
#include <asm/olpc.h>
-
-#ifdef CONFIG_OPEN_FIRMWARE
-#include <asm/ofw.h>
-#endif
+#include <asm/olpc_ofw.h>
struct olpc_platform_t olpc_platform_info;
EXPORT_SYMBOL_GPL(olpc_platform_info);
@@ -145,7 +142,7 @@ restart:
* The OBF flag will sometimes misbehave due to what we believe
* is a hardware quirk..
*/
- printk(KERN_DEBUG "olpc-ec: running cmd 0x%x\n", cmd);
+ pr_devel("olpc-ec: running cmd 0x%x\n", cmd);
outb(cmd, 0x6c);
if (wait_on_ibf(0x6c, 0)) {
@@ -162,8 +159,7 @@ restart:
" EC accept data!\n");
goto err;
}
- printk(KERN_DEBUG "olpc-ec: sending cmd arg 0x%x\n",
- inbuf[i]);
+ pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]);
outb(inbuf[i], 0x68);
}
}
@@ -176,8 +172,7 @@ restart:
goto restart;
}
outbuf[i] = inb(0x68);
- printk(KERN_DEBUG "olpc-ec: received 0x%x\n",
- outbuf[i]);
+ pr_devel("olpc-ec: received 0x%x\n", outbuf[i]);
}
}
@@ -188,14 +183,15 @@ err:
}
EXPORT_SYMBOL_GPL(olpc_ec_cmd);
-#ifdef CONFIG_OPEN_FIRMWARE
+#ifdef CONFIG_OLPC_OPENFIRMWARE
static void __init platform_detect(void)
{
size_t propsize;
__be32 rev;
+ const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 };
+ void *res[] = { &propsize };
- if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4,
- &propsize) || propsize != 4) {
+ if (olpc_ofw("getprop", args, res) || propsize != 4) {
printk(KERN_ERR "ofw: getprop call failed!\n");
rev = cpu_to_be32(0);
}
diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c
new file mode 100644
index 000000000000..3218aa71ab5e
--- /dev/null
+++ b/arch/x86/kernel/olpc_ofw.c
@@ -0,0 +1,106 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <asm/page.h>
+#include <asm/setup.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/olpc_ofw.h>
+
+/* address of OFW callback interface; will be NULL if OFW isn't found */
+static int (*olpc_ofw_cif)(int *);
+
+/* page dir entry containing OFW's pgdir table; filled in by head_32.S */
+u32 olpc_ofw_pgd __initdata;
+
+static DEFINE_SPINLOCK(ofw_lock);
+
+#define MAXARGS 10
+
+void __init setup_olpc_ofw_pgd(void)
+{
+ pgd_t *base, *ofw_pde;
+
+ if (!olpc_ofw_cif)
+ return;
+
+ /* fetch OFW's PDE */
+ base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
+ if (!base) {
+ printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n");
+ olpc_ofw_cif = NULL;
+ return;
+ }
+ ofw_pde = &base[OLPC_OFW_PDE_NR];
+
+ /* install OFW's PDE permanently into the kernel's pgtable */
+ set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde);
+ /* implicit optimization barrier here due to uninline function return */
+
+ early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
+}
+
+int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
+ void **res)
+{
+ int ofw_args[MAXARGS + 3];
+ unsigned long flags;
+ int ret, i, *p;
+
+ BUG_ON(nr_args + nr_res > MAXARGS);
+
+ if (!olpc_ofw_cif)
+ return -EIO;
+
+ ofw_args[0] = (int)name;
+ ofw_args[1] = nr_args;
+ ofw_args[2] = nr_res;
+
+ p = &ofw_args[3];
+ for (i = 0; i < nr_args; i++, p++)
+ *p = (int)args[i];
+
+ /* call into ofw */
+ spin_lock_irqsave(&ofw_lock, flags);
+ ret = olpc_ofw_cif(ofw_args);
+ spin_unlock_irqrestore(&ofw_lock, flags);
+
+ if (!ret) {
+ for (i = 0; i < nr_res; i++, p++)
+ *((int *)res[i]) = *p;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__olpc_ofw);
+
+/* OFW cif _should_ be above this address */
+#define OFW_MIN 0xff000000
+
+/* OFW starts on a 1MB boundary */
+#define OFW_BOUND (1<<20)
+
+void __init olpc_ofw_detect(void)
+{
+ struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header;
+ unsigned long start;
+
+ /* ensure OFW booted us by checking for "OFW " string */
+ if (hdr->ofw_magic != OLPC_OFW_SIG)
+ return;
+
+ olpc_ofw_cif = (int (*)(int *))hdr->cif_handler;
+
+ if ((unsigned long)olpc_ofw_cif < OFW_MIN) {
+ printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n",
+ (unsigned long)olpc_ofw_cif);
+ olpc_ofw_cif = NULL;
+ return;
+ }
+
+ /* determine where OFW starts in memory */
+ start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND);
+ printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n",
+ (unsigned long)olpc_ofw_cif, (-start) >> 20);
+ reserve_top_address(-start);
+}
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 4b7e3d8b01dd..9f07cfcbd3a5 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -13,6 +13,7 @@
#include <asm/calgary.h>
#include <asm/amd_iommu.h>
#include <asm/x86_init.h>
+#include <asm/xen/swiotlb-xen.h>
static int forbid_dac __read_mostly;
@@ -132,7 +133,7 @@ void __init pci_iommu_alloc(void)
/* free the range so iommu could get some range less than 4G */
dma32_free_bootmem();
- if (pci_swiotlb_detect())
+ if (pci_xen_swiotlb_detect() || pci_swiotlb_detect())
goto out;
gart_iommu_hole_init();
@@ -144,6 +145,8 @@ void __init pci_iommu_alloc(void)
/* needs to be called after gart_iommu_hole_init */
amd_iommu_detect();
out:
+ pci_xen_swiotlb_init();
+
pci_swiotlb_init();
}
@@ -296,7 +299,7 @@ static int __init pci_iommu_init(void)
#endif
x86_init.iommu.iommu_init();
- if (swiotlb) {
+ if (swiotlb || xen_swiotlb) {
printk(KERN_INFO "PCI-DMA: "
"Using software bounce buffering for IO (SWIOTLB)\n");
swiotlb_print_info();
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e7e35219b32f..64ecaf0af9af 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@ unsigned long idle_nomwait;
EXPORT_SYMBOL(idle_nomwait);
struct kmem_cache *task_xstate_cachep;
+EXPORT_SYMBOL_GPL(task_xstate_cachep);
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
@@ -300,7 +301,7 @@ EXPORT_SYMBOL(kernel_thread);
/*
* sys_execve() executes a new program.
*/
-long sys_execve(char __user *name, char __user * __user *argv,
+long sys_execve(const char __user *name, char __user * __user *argv,
char __user * __user *envp, struct pt_regs *regs)
{
long error;
@@ -371,7 +372,7 @@ static inline int hlt_use_halt(void)
void default_idle(void)
{
if (hlt_use_halt()) {
- trace_power_start(POWER_CSTATE, 1);
+ trace_power_start(POWER_CSTATE, 1, smp_processor_id());
current_thread_info()->status &= ~TS_POLLING;
/*
* TS_POLLING-cleared state must be visible before we
@@ -441,7 +442,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
*/
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
- trace_power_start(POWER_CSTATE, (ax>>4)+1);
+ trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
if (!need_resched()) {
if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
@@ -457,7 +458,7 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
static void mwait_idle(void)
{
if (!need_resched()) {
- trace_power_start(POWER_CSTATE, 1);
+ trace_power_start(POWER_CSTATE, 1, smp_processor_id());
if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
@@ -478,7 +479,7 @@ static void mwait_idle(void)
*/
static void poll_idle(void)
{
- trace_power_start(POWER_CSTATE, 0);
+ trace_power_start(POWER_CSTATE, 0, smp_processor_id());
local_irq_enable();
while (!need_resched())
cpu_relax();
@@ -525,44 +526,10 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
return (edx & MWAIT_EDX_C1);
}
-/*
- * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e.
- * For more information see
- * - Erratum #400 for NPT family 0xf and family 0x10 CPUs
- * - Erratum #365 for family 0x11 (not affected because C1e not in use)
- */
-static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
-{
- u64 val;
- if (c->x86_vendor != X86_VENDOR_AMD)
- goto no_c1e_idle;
-
- /* Family 0x0f models < rev F do not have C1E */
- if (c->x86 == 0x0F && c->x86_model >= 0x40)
- return 1;
-
- if (c->x86 == 0x10) {
- /*
- * check OSVW bit for CPUs that are not affected
- * by erratum #400
- */
- if (cpu_has(c, X86_FEATURE_OSVW)) {
- rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
- if (val >= 2) {
- rdmsrl(MSR_AMD64_OSVW_STATUS, val);
- if (!(val & BIT(1)))
- goto no_c1e_idle;
- }
- }
- return 1;
- }
-
-no_c1e_idle:
- return 0;
-}
+bool c1e_detected;
+EXPORT_SYMBOL(c1e_detected);
static cpumask_var_t c1e_mask;
-static int c1e_detected;
void c1e_remove_cpu(int cpu)
{
@@ -584,12 +551,12 @@ static void c1e_idle(void)
u32 lo, hi;
rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+
if (lo & K8_INTP_C1E_ACTIVE_MASK) {
- c1e_detected = 1;
+ c1e_detected = true;
if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
mark_tsc_unstable("TSC halt in AMD C1E");
printk(KERN_INFO "System has AMD C1E enabled\n");
- set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
}
}
@@ -638,7 +605,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
*/
printk(KERN_INFO "using mwait in idle threads.\n");
pm_idle = mwait_idle;
- } else if (check_c1e_idle(c)) {
+ } else if (cpu_has_amd_erratum(amd_erratum_400)) {
+ /* E400: APIC timer interrupt does not wake up CPU from C1e */
printk(KERN_INFO "using C1E aware idle routine\n");
pm_idle = c1e_idle;
} else
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8d128783af47..96586c3cbbbf 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -57,6 +57,8 @@
#include <asm/syscalls.h>
#include <asm/debugreg.h>
+#include <trace/events/power.h>
+
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
/*
@@ -111,6 +113,8 @@ void cpu_idle(void)
stop_critical_timings();
pm_idle();
start_critical_timings();
+
+ trace_power_end(smp_processor_id());
}
tick_nohz_restart_sched_tick();
preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3c2422a99f1f..3d9ea531ddd1 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -51,6 +51,8 @@
#include <asm/syscalls.h>
#include <asm/debugreg.h>
+#include <trace/events/power.h>
+
asmlinkage extern void ret_from_fork(void);
DEFINE_PER_CPU(unsigned long, old_rsp);
@@ -138,6 +140,9 @@ void cpu_idle(void)
stop_critical_timings();
pm_idle();
start_critical_timings();
+
+ trace_power_end(smp_processor_id());
+
/* In many cases the interrupt that ended idle
has already called exit_idle. But some idle
loops can be woken up without interrupt. */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b4ae4acbd031..b008e7883207 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -102,6 +102,7 @@
#include <asm/paravirt.h>
#include <asm/hypervisor.h>
+#include <asm/olpc_ofw.h>
#include <asm/percpu.h>
#include <asm/topology.h>
@@ -736,10 +737,15 @@ void __init setup_arch(char **cmdline_p)
/* VMI may relocate the fixmap; do this before touching ioremap area */
vmi_init();
+ /* OFW also may relocate the fixmap */
+ olpc_ofw_detect();
+
early_trap_init();
early_cpu_init();
early_ioremap_init();
+ setup_olpc_ofw_pgd();
+
ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
screen_info = boot_params.screen_info;
edid_info = boot_params.edid_info;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c4f33b2e77d6..a5e928b0cb5f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -735,12 +735,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
goto do_rest;
}
- if (!keventd_up() || current_is_keventd())
- c_idle.work.func(&c_idle.work);
- else {
- schedule_work(&c_idle.work);
- wait_for_completion(&c_idle.done);
- }
+ schedule_work(&c_idle.work);
+ wait_for_completion(&c_idle.done);
if (IS_ERR(c_idle.idle)) {
printk("failed fork for CPU %d\n", cpu);
@@ -816,6 +812,13 @@ do_rest:
if (cpumask_test_cpu(cpu, cpu_callin_mask))
break; /* It has booted */
udelay(100);
+ /*
+ * Allow other tasks to run while we wait for the
+ * AP to come online. This also gives a chance
+ * for the MTRR work(triggered by the AP coming online)
+ * to be completed in the stop machine context.
+ */
+ schedule();
}
if (cpumask_test_cpu(cpu, cpu_callin_mask))
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 922eefbb3f6c..b53c525368a7 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -23,11 +23,16 @@ static int save_stack_stack(void *data, char *name)
return 0;
}
-static void save_stack_address(void *data, unsigned long addr, int reliable)
+static void
+__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched)
{
struct stack_trace *trace = data;
+#ifdef CONFIG_FRAME_POINTER
if (!reliable)
return;
+#endif
+ if (nosched && in_sched_functions(addr))
+ return;
if (trace->skip > 0) {
trace->skip--;
return;
@@ -36,20 +41,15 @@ static void save_stack_address(void *data, unsigned long addr, int reliable)
trace->entries[trace->nr_entries++] = addr;
}
+static void save_stack_address(void *data, unsigned long addr, int reliable)
+{
+ return __save_stack_address(data, addr, reliable, false);
+}
+
static void
save_stack_address_nosched(void *data, unsigned long addr, int reliable)
{
- struct stack_trace *trace = (struct stack_trace *)data;
- if (!reliable)
- return;
- if (in_sched_functions(addr))
- return;
- if (trace->skip > 0) {
- trace->skip--;
- return;
- }
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = addr;
+ return __save_stack_address(data, addr, reliable, true);
}
static const struct stacktrace_ops save_stack_ops = {
@@ -96,12 +96,13 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
-struct stack_frame {
+struct stack_frame_user {
const void __user *next_fp;
unsigned long ret_addr;
};
-static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+static int
+copy_stack_frame(const void __user *fp, struct stack_frame_user *frame)
{
int ret;
@@ -126,7 +127,7 @@ static inline void __save_stack_trace_user(struct stack_trace *trace)
trace->entries[trace->nr_entries++] = regs->ip;
while (trace->nr_entries < trace->max_entries) {
- struct stack_frame frame;
+ struct stack_frame_user frame;
frame.next_fp = NULL;
frame.ret_addr = 0;
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 8b3729341216..b35786dc9b8f 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -337,3 +337,6 @@ ENTRY(sys_call_table)
.long sys_rt_tgsigqueueinfo /* 335 */
.long sys_perf_event_open
.long sys_recvmmsg
+ .long sys_fanotify_init
+ .long sys_fanotify_mark
+ .long sys_prlimit64 /* 340 */
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 7fea555929e2..312ef0292815 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -8,6 +8,7 @@
*/
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
+#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/slab.h>
@@ -22,19 +23,37 @@
#include <asm/irq_vectors.h>
#include <asm/timer.h>
-struct msg_desc {
- struct bau_payload_queue_entry *msg;
- int msg_slot;
- int sw_ack_slot;
- struct bau_payload_queue_entry *va_queue_first;
- struct bau_payload_queue_entry *va_queue_last;
+/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
+static int timeout_base_ns[] = {
+ 20,
+ 160,
+ 1280,
+ 10240,
+ 81920,
+ 655360,
+ 5242880,
+ 167772160
};
-
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
-
-static int uv_bau_max_concurrent __read_mostly;
-
+static int timeout_us;
static int nobau;
+static int baudisabled;
+static spinlock_t disable_lock;
+static cycles_t congested_cycles;
+
+/* tunables: */
+static int max_bau_concurrent = MAX_BAU_CONCURRENT;
+static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT;
+static int plugged_delay = PLUGGED_DELAY;
+static int plugsb4reset = PLUGSB4RESET;
+static int timeoutsb4reset = TIMEOUTSB4RESET;
+static int ipi_reset_limit = IPI_RESET_LIMIT;
+static int complete_threshold = COMPLETE_THRESHOLD;
+static int congested_response_us = CONGESTED_RESPONSE_US;
+static int congested_reps = CONGESTED_REPS;
+static int congested_period = CONGESTED_PERIOD;
+static struct dentry *tunables_dir;
+static struct dentry *tunables_file;
+
static int __init setup_nobau(char *arg)
{
nobau = 1;
@@ -52,10 +71,6 @@ static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
static DEFINE_PER_CPU(struct bau_control, bau_control);
static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
-struct reset_args {
- int sender;
-};
-
/*
* Determine the first node on a uvhub. 'Nodes' are used for kernel
* memory allocation.
@@ -126,7 +141,7 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
struct ptc_stats *stat;
msg = mdp->msg;
- stat = &per_cpu(ptcstats, bcp->cpu);
+ stat = bcp->statp;
stat->d_retries++;
/*
* cancel any message from msg+1 to the retry itself
@@ -146,15 +161,14 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
slot2 = msg2 - mdp->va_queue_first;
mmr = uv_read_local_mmr
(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
- msg_res = ((msg2->sw_ack_vector << 8) |
- msg2->sw_ack_vector);
+ msg_res = msg2->sw_ack_vector;
/*
* This is a message retry; clear the resources held
* by the previous message only if they timed out.
* If it has not timed out we have an unexpected
* situation to report.
*/
- if (mmr & (msg_res << 8)) {
+ if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
/*
* is the resource timed out?
* make everyone ignore the cancelled message.
@@ -164,9 +178,9 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
cancel_count++;
uv_write_local_mmr(
UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
- (msg_res << 8) | msg_res);
- } else
- printk(KERN_INFO "note bau retry: no effect\n");
+ (msg_res << UV_SW_ACK_NPENDING) |
+ msg_res);
+ }
}
}
if (!cancel_count)
@@ -190,7 +204,7 @@ static void uv_bau_process_message(struct msg_desc *mdp,
* This must be a normal message, or retry of a normal message
*/
msg = mdp->msg;
- stat = &per_cpu(ptcstats, bcp->cpu);
+ stat = bcp->statp;
if (msg->address == TLB_FLUSH_ALL) {
local_flush_tlb();
stat->d_alltlb++;
@@ -274,7 +288,7 @@ uv_do_reset(void *ptr)
bcp = &per_cpu(bau_control, smp_processor_id());
rap = (struct reset_args *)ptr;
- stat = &per_cpu(ptcstats, bcp->cpu);
+ stat = bcp->statp;
stat->d_resets++;
/*
@@ -302,13 +316,13 @@ uv_do_reset(void *ptr)
*/
mmr = uv_read_local_mmr
(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
- msg_res = ((msg->sw_ack_vector << 8) |
- msg->sw_ack_vector);
+ msg_res = msg->sw_ack_vector;
if (mmr & msg_res) {
stat->d_rcanceled++;
uv_write_local_mmr(
UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
- msg_res);
+ (msg_res << UV_SW_ACK_NPENDING) |
+ msg_res);
}
}
}
@@ -386,17 +400,12 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
unsigned long mmr_offset, int right_shift, int this_cpu,
struct bau_control *bcp, struct bau_control *smaster, long try)
{
- int relaxes = 0;
unsigned long descriptor_status;
- unsigned long mmr;
- unsigned long mask;
cycles_t ttime;
- cycles_t timeout_time;
- struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu);
+ struct ptc_stats *stat = bcp->statp;
struct bau_control *hmaster;
hmaster = bcp->uvhub_master;
- timeout_time = get_cycles() + bcp->timeout_interval;
/* spin on the status MMR, waiting for it to go idle */
while ((descriptor_status = (((unsigned long)
@@ -423,7 +432,8 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
* pending. In that case hardware returns the
* ERROR that looks like a destination timeout.
*/
- if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) {
+ if (cycles_2_us(ttime - bcp->send_message) <
+ timeout_us) {
bcp->conseccompletes = 0;
return FLUSH_RETRY_PLUGGED;
}
@@ -435,26 +445,6 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
* descriptor_status is still BUSY
*/
cpu_relax();
- relaxes++;
- if (relaxes >= 10000) {
- relaxes = 0;
- if (get_cycles() > timeout_time) {
- quiesce_local_uvhub(hmaster);
-
- /* single-thread the register change */
- spin_lock(&hmaster->masks_lock);
- mmr = uv_read_local_mmr(mmr_offset);
- mask = 0UL;
- mask |= (3UL < right_shift);
- mask = ~mask;
- mmr &= mask;
- uv_write_local_mmr(mmr_offset, mmr);
- spin_unlock(&hmaster->masks_lock);
- end_uvhub_quiesce(hmaster);
- stat->s_busy++;
- return FLUSH_GIVEUP;
- }
- }
}
}
bcp->conseccompletes++;
@@ -494,56 +484,116 @@ static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
return 1;
}
+/*
+ * Our retries are blocked by all destination swack resources being
+ * in use, and a timeout is pending. In that case hardware immediately
+ * returns the ERROR that looks like a destination timeout.
+ */
+static void
+destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp,
+ struct bau_control *hmaster, struct ptc_stats *stat)
+{
+ udelay(bcp->plugged_delay);
+ bcp->plugged_tries++;
+ if (bcp->plugged_tries >= bcp->plugsb4reset) {
+ bcp->plugged_tries = 0;
+ quiesce_local_uvhub(hmaster);
+ spin_lock(&hmaster->queue_lock);
+ uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
+ spin_unlock(&hmaster->queue_lock);
+ end_uvhub_quiesce(hmaster);
+ bcp->ipi_attempts++;
+ stat->s_resets_plug++;
+ }
+}
+
+static void
+destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp,
+ struct bau_control *hmaster, struct ptc_stats *stat)
+{
+ hmaster->max_bau_concurrent = 1;
+ bcp->timeout_tries++;
+ if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
+ bcp->timeout_tries = 0;
+ quiesce_local_uvhub(hmaster);
+ spin_lock(&hmaster->queue_lock);
+ uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
+ spin_unlock(&hmaster->queue_lock);
+ end_uvhub_quiesce(hmaster);
+ bcp->ipi_attempts++;
+ stat->s_resets_timeout++;
+ }
+}
+
+/*
+ * Completions are taking a very long time due to a congested numalink
+ * network.
+ */
+static void
+disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat)
+{
+ int tcpu;
+ struct bau_control *tbcp;
+
+ /* let only one cpu do this disabling */
+ spin_lock(&disable_lock);
+ if (!baudisabled && bcp->period_requests &&
+ ((bcp->period_time / bcp->period_requests) > congested_cycles)) {
+ /* it becomes this cpu's job to turn on the use of the
+ BAU again */
+ baudisabled = 1;
+ bcp->set_bau_off = 1;
+ bcp->set_bau_on_time = get_cycles() +
+ sec_2_cycles(bcp->congested_period);
+ stat->s_bau_disabled++;
+ for_each_present_cpu(tcpu) {
+ tbcp = &per_cpu(bau_control, tcpu);
+ tbcp->baudisabled = 1;
+ }
+ }
+ spin_unlock(&disable_lock);
+}
+
/**
* uv_flush_send_and_wait
*
* Send a broadcast and wait for it to complete.
*
- * The flush_mask contains the cpus the broadcast is to be sent to, plus
+ * The flush_mask contains the cpus the broadcast is to be sent to including
* cpus that are on the local uvhub.
*
- * Returns NULL if all flushing represented in the mask was done. The mask
- * is zeroed.
- * Returns @flush_mask if some remote flushing remains to be done. The
- * mask will have some bits still set, representing any cpus on the local
- * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed.
+ * Returns 0 if all flushing represented in the mask was done.
+ * Returns 1 if it gives up entirely and the original cpu mask is to be
+ * returned to the kernel.
*/
-const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
- struct cpumask *flush_mask,
- struct bau_control *bcp)
+int uv_flush_send_and_wait(struct bau_desc *bau_desc,
+ struct cpumask *flush_mask, struct bau_control *bcp)
{
int right_shift;
- int uvhub;
- int bit;
int completion_status = 0;
int seq_number = 0;
long try = 0;
int cpu = bcp->uvhub_cpu;
int this_cpu = bcp->cpu;
- int this_uvhub = bcp->uvhub;
unsigned long mmr_offset;
unsigned long index;
cycles_t time1;
cycles_t time2;
- struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu);
+ cycles_t elapsed;
+ struct ptc_stats *stat = bcp->statp;
struct bau_control *smaster = bcp->socket_master;
struct bau_control *hmaster = bcp->uvhub_master;
- /*
- * Spin here while there are hmaster->max_concurrent or more active
- * descriptors. This is the per-uvhub 'throttle'.
- */
if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
&hmaster->active_descriptor_count,
- hmaster->max_concurrent)) {
+ hmaster->max_bau_concurrent)) {
stat->s_throttles++;
do {
cpu_relax();
} while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
&hmaster->active_descriptor_count,
- hmaster->max_concurrent));
+ hmaster->max_bau_concurrent));
}
-
while (hmaster->uvhub_quiesce)
cpu_relax();
@@ -557,23 +607,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
}
time1 = get_cycles();
do {
- /*
- * Every message from any given cpu gets a unique message
- * sequence number. But retries use that same number.
- * Our message may have timed out at the destination because
- * all sw-ack resources are in use and there is a timeout
- * pending there. In that case, our last send never got
- * placed into the queue and we need to persist until it
- * does.
- *
- * Make any retry a type MSG_RETRY so that the destination will
- * free any resource held by a previous message from this cpu.
- */
if (try == 0) {
- /* use message type set by the caller the first time */
+ bau_desc->header.msg_type = MSG_REGULAR;
seq_number = bcp->message_number++;
} else {
- /* use RETRY type on all the rest; same sequence */
bau_desc->header.msg_type = MSG_RETRY;
stat->s_retry_messages++;
}
@@ -581,50 +618,17 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
bcp->uvhub_cpu;
bcp->send_message = get_cycles();
-
uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
-
try++;
completion_status = uv_wait_completion(bau_desc, mmr_offset,
right_shift, this_cpu, bcp, smaster, try);
if (completion_status == FLUSH_RETRY_PLUGGED) {
- /*
- * Our retries may be blocked by all destination swack
- * resources being consumed, and a timeout pending. In
- * that case hardware immediately returns the ERROR
- * that looks like a destination timeout.
- */
- udelay(TIMEOUT_DELAY);
- bcp->plugged_tries++;
- if (bcp->plugged_tries >= PLUGSB4RESET) {
- bcp->plugged_tries = 0;
- quiesce_local_uvhub(hmaster);
- spin_lock(&hmaster->queue_lock);
- uv_reset_with_ipi(&bau_desc->distribution,
- this_cpu);
- spin_unlock(&hmaster->queue_lock);
- end_uvhub_quiesce(hmaster);
- bcp->ipi_attempts++;
- stat->s_resets_plug++;
- }
+ destination_plugged(bau_desc, bcp, hmaster, stat);
} else if (completion_status == FLUSH_RETRY_TIMEOUT) {
- hmaster->max_concurrent = 1;
- bcp->timeout_tries++;
- udelay(TIMEOUT_DELAY);
- if (bcp->timeout_tries >= TIMEOUTSB4RESET) {
- bcp->timeout_tries = 0;
- quiesce_local_uvhub(hmaster);
- spin_lock(&hmaster->queue_lock);
- uv_reset_with_ipi(&bau_desc->distribution,
- this_cpu);
- spin_unlock(&hmaster->queue_lock);
- end_uvhub_quiesce(hmaster);
- bcp->ipi_attempts++;
- stat->s_resets_timeout++;
- }
+ destination_timeout(bau_desc, bcp, hmaster, stat);
}
- if (bcp->ipi_attempts >= 3) {
+ if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
bcp->ipi_attempts = 0;
completion_status = FLUSH_GIVEUP;
break;
@@ -633,49 +637,36 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
} while ((completion_status == FLUSH_RETRY_PLUGGED) ||
(completion_status == FLUSH_RETRY_TIMEOUT));
time2 = get_cycles();
-
- if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5)
- && (hmaster->max_concurrent < hmaster->max_concurrent_constant))
- hmaster->max_concurrent++;
-
- /*
- * hold any cpu not timing out here; no other cpu currently held by
- * the 'throttle' should enter the activation code
- */
+ bcp->plugged_tries = 0;
+ bcp->timeout_tries = 0;
+ if ((completion_status == FLUSH_COMPLETE) &&
+ (bcp->conseccompletes > bcp->complete_threshold) &&
+ (hmaster->max_bau_concurrent <
+ hmaster->max_bau_concurrent_constant))
+ hmaster->max_bau_concurrent++;
while (hmaster->uvhub_quiesce)
cpu_relax();
atomic_dec(&hmaster->active_descriptor_count);
-
- /* guard against cycles wrap */
- if (time2 > time1)
- stat->s_time += (time2 - time1);
- else
- stat->s_requestor--; /* don't count this one */
+ if (time2 > time1) {
+ elapsed = time2 - time1;
+ stat->s_time += elapsed;
+ if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
+ bcp->period_requests++;
+ bcp->period_time += elapsed;
+ if ((elapsed > congested_cycles) &&
+ (bcp->period_requests > bcp->congested_reps)) {
+ disable_for_congestion(bcp, stat);
+ }
+ }
+ } else
+ stat->s_requestor--;
if (completion_status == FLUSH_COMPLETE && try > 1)
stat->s_retriesok++;
else if (completion_status == FLUSH_GIVEUP) {
- /*
- * Cause the caller to do an IPI-style TLB shootdown on
- * the target cpu's, all of which are still in the mask.
- */
stat->s_giveup++;
- return flush_mask;
- }
-
- /*
- * Success, so clear the remote cpu's from the mask so we don't
- * use the IPI method of shootdown on them.
- */
- for_each_cpu(bit, flush_mask) {
- uvhub = uv_cpu_to_blade_id(bit);
- if (uvhub == this_uvhub)
- continue;
- cpumask_clear_cpu(bit, flush_mask);
+ return 1;
}
- if (!cpumask_empty(flush_mask))
- return flush_mask;
-
- return NULL;
+ return 0;
}
/**
@@ -707,70 +698,89 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm,
unsigned long va, unsigned int cpu)
{
- int remotes;
int tcpu;
int uvhub;
int locals = 0;
+ int remotes = 0;
+ int hubs = 0;
struct bau_desc *bau_desc;
struct cpumask *flush_mask;
struct ptc_stats *stat;
struct bau_control *bcp;
+ struct bau_control *tbcp;
+ /* kernel was booted 'nobau' */
if (nobau)
return cpumask;
bcp = &per_cpu(bau_control, cpu);
+ stat = bcp->statp;
+
+ /* bau was disabled due to slow response */
+ if (bcp->baudisabled) {
+ /* the cpu that disabled it must re-enable it */
+ if (bcp->set_bau_off) {
+ if (get_cycles() >= bcp->set_bau_on_time) {
+ stat->s_bau_reenabled++;
+ baudisabled = 0;
+ for_each_present_cpu(tcpu) {
+ tbcp = &per_cpu(bau_control, tcpu);
+ tbcp->baudisabled = 0;
+ tbcp->period_requests = 0;
+ tbcp->period_time = 0;
+ }
+ }
+ }
+ return cpumask;
+ }
+
/*
* Each sending cpu has a per-cpu mask which it fills from the caller's
- * cpu mask. Only remote cpus are converted to uvhubs and copied.
+ * cpu mask. All cpus are converted to uvhubs and copied to the
+ * activation descriptor.
*/
flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
- /*
- * copy cpumask to flush_mask, removing current cpu
- * (current cpu should already have been flushed by the caller and
- * should never be returned if we return flush_mask)
- */
+ /* don't actually do a shootdown of the local cpu */
cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
if (cpu_isset(cpu, *cpumask))
- locals++; /* current cpu was targeted */
+ stat->s_ntargself++;
bau_desc = bcp->descriptor_base;
bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
-
bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
- remotes = 0;
+
+ /* cpu statistics */
for_each_cpu(tcpu, flush_mask) {
uvhub = uv_cpu_to_blade_id(tcpu);
- if (uvhub == bcp->uvhub) {
- locals++;
- continue;
- }
bau_uvhub_set(uvhub, &bau_desc->distribution);
- remotes++;
- }
- if (remotes == 0) {
- /*
- * No off_hub flushing; return status for local hub.
- * Return the caller's mask if all were local (the current
- * cpu may be in that mask).
- */
- if (locals)
- return cpumask;
+ if (uvhub == bcp->uvhub)
+ locals++;
else
- return NULL;
+ remotes++;
}
- stat = &per_cpu(ptcstats, cpu);
+ if ((locals + remotes) == 0)
+ return NULL;
stat->s_requestor++;
- stat->s_ntargcpu += remotes;
+ stat->s_ntargcpu += remotes + locals;
+ stat->s_ntargremotes += remotes;
+ stat->s_ntarglocals += locals;
remotes = bau_uvhub_weight(&bau_desc->distribution);
- stat->s_ntarguvhub += remotes;
- if (remotes >= 16)
+
+ /* uvhub statistics */
+ hubs = bau_uvhub_weight(&bau_desc->distribution);
+ if (locals) {
+ stat->s_ntarglocaluvhub++;
+ stat->s_ntargremoteuvhub += (hubs - 1);
+ } else
+ stat->s_ntargremoteuvhub += hubs;
+ stat->s_ntarguvhub += hubs;
+ if (hubs >= 16)
stat->s_ntarguvhub16++;
- else if (remotes >= 8)
+ else if (hubs >= 8)
stat->s_ntarguvhub8++;
- else if (remotes >= 4)
+ else if (hubs >= 4)
stat->s_ntarguvhub4++;
- else if (remotes >= 2)
+ else if (hubs >= 2)
stat->s_ntarguvhub2++;
else
stat->s_ntarguvhub1++;
@@ -779,10 +789,13 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
bau_desc->payload.sending_cpu = cpu;
/*
- * uv_flush_send_and_wait returns null if all cpu's were messaged, or
- * the adjusted flush_mask if any cpu's were not messaged.
+ * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
+ * or 1 if it gave up and the original cpumask should be returned.
*/
- return uv_flush_send_and_wait(bau_desc, flush_mask, bcp);
+ if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp))
+ return NULL;
+ else
+ return cpumask;
}
/*
@@ -810,7 +823,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
time_start = get_cycles();
bcp = &per_cpu(bau_control, smp_processor_id());
- stat = &per_cpu(ptcstats, smp_processor_id());
+ stat = bcp->statp;
msgdesc.va_queue_first = bcp->va_queue_first;
msgdesc.va_queue_last = bcp->va_queue_last;
msg = bcp->bau_msg_head;
@@ -908,12 +921,12 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data)
}
static inline unsigned long long
-millisec_2_cycles(unsigned long millisec)
+microsec_2_cycles(unsigned long microsec)
{
unsigned long ns;
unsigned long long cyc;
- ns = millisec * 1000;
+ ns = microsec * 1000;
cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
return cyc;
}
@@ -931,15 +944,19 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
if (!cpu) {
seq_printf(file,
- "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 ");
+ "# cpu sent stime self locals remotes ncpus localhub ");
+ seq_printf(file,
+ "remotehub numuvhubs numuvhubs16 numuvhubs8 ");
seq_printf(file,
- "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto ");
+ "numuvhubs4 numuvhubs2 numuvhubs1 dto ");
seq_printf(file,
"retries rok resetp resett giveup sto bz throt ");
seq_printf(file,
"sw_ack recv rtime all ");
seq_printf(file,
- "one mult none retry canc nocan reset rcan\n");
+ "one mult none retry canc nocan reset rcan ");
+ seq_printf(file,
+ "disable enable\n");
}
if (cpu < num_possible_cpus() && cpu_online(cpu)) {
stat = &per_cpu(ptcstats, cpu);
@@ -947,18 +964,23 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
seq_printf(file,
"cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
cpu, stat->s_requestor, cycles_2_us(stat->s_time),
- stat->s_ntarguvhub, stat->s_ntarguvhub16,
+ stat->s_ntargself, stat->s_ntarglocals,
+ stat->s_ntargremotes, stat->s_ntargcpu,
+ stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
+ stat->s_ntarguvhub, stat->s_ntarguvhub16);
+ seq_printf(file, "%ld %ld %ld %ld %ld ",
stat->s_ntarguvhub8, stat->s_ntarguvhub4,
stat->s_ntarguvhub2, stat->s_ntarguvhub1,
- stat->s_ntargcpu, stat->s_dtimeout);
+ stat->s_dtimeout);
seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
stat->s_retry_messages, stat->s_retriesok,
stat->s_resets_plug, stat->s_resets_timeout,
stat->s_giveup, stat->s_stimeout,
stat->s_busy, stat->s_throttles);
+
/* destination side statistics */
seq_printf(file,
- "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
+ "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
stat->d_requestee, cycles_2_us(stat->d_time),
@@ -966,15 +988,36 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
stat->d_nomsg, stat->d_retries, stat->d_canceled,
stat->d_nocanceled, stat->d_resets,
stat->d_rcanceled);
+ seq_printf(file, "%ld %ld\n",
+ stat->s_bau_disabled, stat->s_bau_reenabled);
}
return 0;
}
/*
+ * Display the tunables thru debugfs
+ */
+static ssize_t tunables_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ char buf[300];
+ int ret;
+
+ ret = snprintf(buf, 300, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
+ "max_bau_concurrent plugged_delay plugsb4reset",
+ "timeoutsb4reset ipi_reset_limit complete_threshold",
+ "congested_response_us congested_reps congested_period",
+ max_bau_concurrent, plugged_delay, plugsb4reset,
+ timeoutsb4reset, ipi_reset_limit, complete_threshold,
+ congested_response_us, congested_reps, congested_period);
+
+ return simple_read_from_buffer(userbuf, count, ppos, buf, ret);
+}
+
+/*
* -1: resetf the statistics
* 0: display meaning of the statistics
- * >0: maximum concurrent active descriptors per uvhub (throttle)
*/
static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
size_t count, loff_t *data)
@@ -983,7 +1026,6 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
long input_arg;
char optstr[64];
struct ptc_stats *stat;
- struct bau_control *bcp;
if (count == 0 || count > sizeof(optstr))
return -EINVAL;
@@ -1059,29 +1101,158 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
"reset: number of ipi-style reset requests processed\n");
printk(KERN_DEBUG
"rcan: number messages canceled by reset requests\n");
+ printk(KERN_DEBUG
+ "disable: number times use of the BAU was disabled\n");
+ printk(KERN_DEBUG
+ "enable: number times use of the BAU was re-enabled\n");
} else if (input_arg == -1) {
for_each_present_cpu(cpu) {
stat = &per_cpu(ptcstats, cpu);
memset(stat, 0, sizeof(struct ptc_stats));
}
- } else {
- uv_bau_max_concurrent = input_arg;
- bcp = &per_cpu(bau_control, smp_processor_id());
- if (uv_bau_max_concurrent < 1 ||
- uv_bau_max_concurrent > bcp->cpus_in_uvhub) {
- printk(KERN_DEBUG
- "Error: BAU max concurrent %d; %d is invalid\n",
- bcp->max_concurrent, uv_bau_max_concurrent);
- return -EINVAL;
- }
- printk(KERN_DEBUG "Set BAU max concurrent:%d\n",
- uv_bau_max_concurrent);
- for_each_present_cpu(cpu) {
- bcp = &per_cpu(bau_control, cpu);
- bcp->max_concurrent = uv_bau_max_concurrent;
+ }
+
+ return count;
+}
+
+static int local_atoi(const char *name)
+{
+ int val = 0;
+
+ for (;; name++) {
+ switch (*name) {
+ case '0' ... '9':
+ val = 10*val+(*name-'0');
+ break;
+ default:
+ return val;
}
}
+}
+
+/*
+ * set the tunables
+ * 0 values reset them to defaults
+ */
+static ssize_t tunables_write(struct file *file, const char __user *user,
+ size_t count, loff_t *data)
+{
+ int cpu;
+ int cnt = 0;
+ int val;
+ char *p;
+ char *q;
+ char instr[64];
+ struct bau_control *bcp;
+
+ if (count == 0 || count > sizeof(instr)-1)
+ return -EINVAL;
+ if (copy_from_user(instr, user, count))
+ return -EFAULT;
+ instr[count] = '\0';
+ /* count the fields */
+ p = instr + strspn(instr, WHITESPACE);
+ q = p;
+ for (; *p; p = q + strspn(q, WHITESPACE)) {
+ q = p + strcspn(p, WHITESPACE);
+ cnt++;
+ if (q == p)
+ break;
+ }
+ if (cnt != 9) {
+ printk(KERN_INFO "bau tunable error: should be 9 numbers\n");
+ return -EINVAL;
+ }
+
+ p = instr + strspn(instr, WHITESPACE);
+ q = p;
+ for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) {
+ q = p + strcspn(p, WHITESPACE);
+ val = local_atoi(p);
+ switch (cnt) {
+ case 0:
+ if (val == 0) {
+ max_bau_concurrent = MAX_BAU_CONCURRENT;
+ max_bau_concurrent_constant =
+ MAX_BAU_CONCURRENT;
+ continue;
+ }
+ bcp = &per_cpu(bau_control, smp_processor_id());
+ if (val < 1 || val > bcp->cpus_in_uvhub) {
+ printk(KERN_DEBUG
+ "Error: BAU max concurrent %d is invalid\n",
+ val);
+ return -EINVAL;
+ }
+ max_bau_concurrent = val;
+ max_bau_concurrent_constant = val;
+ continue;
+ case 1:
+ if (val == 0)
+ plugged_delay = PLUGGED_DELAY;
+ else
+ plugged_delay = val;
+ continue;
+ case 2:
+ if (val == 0)
+ plugsb4reset = PLUGSB4RESET;
+ else
+ plugsb4reset = val;
+ continue;
+ case 3:
+ if (val == 0)
+ timeoutsb4reset = TIMEOUTSB4RESET;
+ else
+ timeoutsb4reset = val;
+ continue;
+ case 4:
+ if (val == 0)
+ ipi_reset_limit = IPI_RESET_LIMIT;
+ else
+ ipi_reset_limit = val;
+ continue;
+ case 5:
+ if (val == 0)
+ complete_threshold = COMPLETE_THRESHOLD;
+ else
+ complete_threshold = val;
+ continue;
+ case 6:
+ if (val == 0)
+ congested_response_us = CONGESTED_RESPONSE_US;
+ else
+ congested_response_us = val;
+ continue;
+ case 7:
+ if (val == 0)
+ congested_reps = CONGESTED_REPS;
+ else
+ congested_reps = val;
+ continue;
+ case 8:
+ if (val == 0)
+ congested_period = CONGESTED_PERIOD;
+ else
+ congested_period = val;
+ continue;
+ }
+ if (q == p)
+ break;
+ }
+ for_each_present_cpu(cpu) {
+ bcp = &per_cpu(bau_control, cpu);
+ bcp->max_bau_concurrent = max_bau_concurrent;
+ bcp->max_bau_concurrent_constant = max_bau_concurrent;
+ bcp->plugged_delay = plugged_delay;
+ bcp->plugsb4reset = plugsb4reset;
+ bcp->timeoutsb4reset = timeoutsb4reset;
+ bcp->ipi_reset_limit = ipi_reset_limit;
+ bcp->complete_threshold = complete_threshold;
+ bcp->congested_response_us = congested_response_us;
+ bcp->congested_reps = congested_reps;
+ bcp->congested_period = congested_period;
+ }
return count;
}
@@ -1097,6 +1268,11 @@ static int uv_ptc_proc_open(struct inode *inode, struct file *file)
return seq_open(file, &uv_ptc_seq_ops);
}
+static int tunables_open(struct inode *inode, struct file *file)
+{
+ return 0;
+}
+
static const struct file_operations proc_uv_ptc_operations = {
.open = uv_ptc_proc_open,
.read = seq_read,
@@ -1105,6 +1281,12 @@ static const struct file_operations proc_uv_ptc_operations = {
.release = seq_release,
};
+static const struct file_operations tunables_fops = {
+ .open = tunables_open,
+ .read = tunables_read,
+ .write = tunables_write,
+};
+
static int __init uv_ptc_init(void)
{
struct proc_dir_entry *proc_uv_ptc;
@@ -1119,6 +1301,20 @@ static int __init uv_ptc_init(void)
UV_PTC_BASENAME);
return -EINVAL;
}
+
+ tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL);
+ if (!tunables_dir) {
+ printk(KERN_ERR "unable to create debugfs directory %s\n",
+ UV_BAU_TUNABLES_DIR);
+ return -EINVAL;
+ }
+ tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
+ tunables_dir, NULL, &tunables_fops);
+ if (!tunables_file) {
+ printk(KERN_ERR "unable to create debugfs file %s\n",
+ UV_BAU_TUNABLES_FILE);
+ return -EINVAL;
+ }
return 0;
}
@@ -1259,15 +1455,45 @@ static void __init uv_init_uvhub(int uvhub, int vector)
}
/*
+ * We will set BAU_MISC_CONTROL with a timeout period.
+ * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT.
+ * So the destination timeout period has be be calculated from them.
+ */
+static int
+calculate_destination_timeout(void)
+{
+ unsigned long mmr_image;
+ int mult1;
+ int mult2;
+ int index;
+ int base;
+ int ret;
+ unsigned long ts_ns;
+
+ mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
+ mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
+ index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
+ mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
+ mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
+ base = timeout_base_ns[index];
+ ts_ns = base * mult1 * mult2;
+ ret = ts_ns / 1000;
+ return ret;
+}
+
+/*
* initialize the bau_control structure for each cpu
*/
-static void uv_init_per_cpu(int nuvhubs)
+static void __init uv_init_per_cpu(int nuvhubs)
{
- int i, j, k;
+ int i;
int cpu;
int pnode;
int uvhub;
+ int have_hmaster;
short socket = 0;
+ unsigned short socket_mask;
+ unsigned char *uvhub_mask;
struct bau_control *bcp;
struct uvhub_desc *bdp;
struct socket_desc *sdp;
@@ -1278,7 +1504,7 @@ static void uv_init_per_cpu(int nuvhubs)
short cpu_number[16];
};
struct uvhub_desc {
- short num_sockets;
+ unsigned short socket_mask;
short num_cpus;
short uvhub;
short pnode;
@@ -1286,57 +1512,84 @@ static void uv_init_per_cpu(int nuvhubs)
};
struct uvhub_desc *uvhub_descs;
+ timeout_us = calculate_destination_timeout();
+
uvhub_descs = (struct uvhub_desc *)
kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
+ uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
for_each_present_cpu(cpu) {
bcp = &per_cpu(bau_control, cpu);
memset(bcp, 0, sizeof(struct bau_control));
- spin_lock_init(&bcp->masks_lock);
- bcp->max_concurrent = uv_bau_max_concurrent;
pnode = uv_cpu_hub_info(cpu)->pnode;
uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
+ *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
bdp = &uvhub_descs[uvhub];
bdp->num_cpus++;
bdp->uvhub = uvhub;
bdp->pnode = pnode;
- /* time interval to catch a hardware stay-busy bug */
- bcp->timeout_interval = millisec_2_cycles(3);
- /* kludge: assume uv_hub.h is constant */
- socket = (cpu_physical_id(cpu)>>5)&1;
- if (socket >= bdp->num_sockets)
- bdp->num_sockets = socket+1;
+ /* kludge: 'assuming' one node per socket, and assuming that
+ disabling a socket just leaves a gap in node numbers */
+ socket = (cpu_to_node(cpu) & 1);
+ bdp->socket_mask |= (1 << socket);
sdp = &bdp->socket[socket];
sdp->cpu_number[sdp->num_cpus] = cpu;
sdp->num_cpus++;
}
- socket = 0;
- for_each_possible_blade(uvhub) {
+ for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
+ if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
+ continue;
+ have_hmaster = 0;
bdp = &uvhub_descs[uvhub];
- for (i = 0; i < bdp->num_sockets; i++) {
- sdp = &bdp->socket[i];
- for (j = 0; j < sdp->num_cpus; j++) {
- cpu = sdp->cpu_number[j];
+ socket_mask = bdp->socket_mask;
+ socket = 0;
+ while (socket_mask) {
+ if (!(socket_mask & 1))
+ goto nextsocket;
+ sdp = &bdp->socket[socket];
+ for (i = 0; i < sdp->num_cpus; i++) {
+ cpu = sdp->cpu_number[i];
bcp = &per_cpu(bau_control, cpu);
bcp->cpu = cpu;
- if (j == 0) {
+ if (i == 0) {
smaster = bcp;
- if (i == 0)
+ if (!have_hmaster) {
+ have_hmaster++;
hmaster = bcp;
+ }
}
bcp->cpus_in_uvhub = bdp->num_cpus;
bcp->cpus_in_socket = sdp->num_cpus;
bcp->socket_master = smaster;
+ bcp->uvhub = bdp->uvhub;
bcp->uvhub_master = hmaster;
- for (k = 0; k < DEST_Q_SIZE; k++)
- bcp->socket_acknowledge_count[k] = 0;
- bcp->uvhub_cpu =
- uv_cpu_hub_info(cpu)->blade_processor_id;
+ bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
+ blade_processor_id;
}
+nextsocket:
socket++;
+ socket_mask = (socket_mask >> 1);
}
}
kfree(uvhub_descs);
+ kfree(uvhub_mask);
+ for_each_present_cpu(cpu) {
+ bcp = &per_cpu(bau_control, cpu);
+ bcp->baudisabled = 0;
+ bcp->statp = &per_cpu(ptcstats, cpu);
+ /* time interval to catch a hardware stay-busy bug */
+ bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
+ bcp->max_bau_concurrent = max_bau_concurrent;
+ bcp->max_bau_concurrent_constant = max_bau_concurrent;
+ bcp->plugged_delay = plugged_delay;
+ bcp->plugsb4reset = plugsb4reset;
+ bcp->timeoutsb4reset = timeoutsb4reset;
+ bcp->ipi_reset_limit = ipi_reset_limit;
+ bcp->complete_threshold = complete_threshold;
+ bcp->congested_response_us = congested_response_us;
+ bcp->congested_reps = congested_reps;
+ bcp->congested_period = congested_period;
+ }
}
/*
@@ -1361,10 +1614,11 @@ static int __init uv_bau_init(void)
zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
GFP_KERNEL, cpu_to_node(cur_cpu));
- uv_bau_max_concurrent = MAX_BAU_CONCURRENT;
uv_nshift = uv_hub_info->m_val;
uv_mmask = (1UL << uv_hub_info->m_val) - 1;
nuvhubs = uv_num_possible_blades();
+ spin_lock_init(&disable_lock);
+ congested_cycles = microsec_2_cycles(congested_response_us);
uv_init_per_cpu(nuvhubs);
@@ -1383,15 +1637,19 @@ static int __init uv_bau_init(void)
alloc_intr_gate(vector, uv_bau_message_intr1);
for_each_possible_blade(uvhub) {
- pnode = uv_blade_to_pnode(uvhub);
- /* INIT the bau */
- uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL,
- ((unsigned long)1 << 63));
- mmr = 1; /* should be 1 to broadcast to both sockets */
- uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr);
+ if (uv_blade_nr_possible_cpus(uvhub)) {
+ pnode = uv_blade_to_pnode(uvhub);
+ /* INIT the bau */
+ uv_write_global_mmr64(pnode,
+ UVH_LB_BAU_SB_ACTIVATION_CONTROL,
+ ((unsigned long)1 << 63));
+ mmr = 1; /* should be 1 to broadcast to both sockets */
+ uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST,
+ mmr);
+ }
}
return 0;
}
core_initcall(uv_bau_init);
-core_initcall(uv_ptc_init);
+fs_initcall(uv_ptc_init);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 725ef4d17cd5..60788dee0f8a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -392,7 +392,13 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
== NOTIFY_STOP)
return;
+
#ifdef CONFIG_X86_LOCAL_APIC
+ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
+ == NOTIFY_STOP)
+ return;
+
+#ifndef CONFIG_LOCKUP_DETECTOR
/*
* Ok, so this is none of the documented NMI sources,
* so it must be the NMI watchdog.
@@ -400,6 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
if (nmi_watchdog_tick(regs, reason))
return;
if (!do_nmi_callback(regs, cpu))
+#endif /* !CONFIG_LOCKUP_DETECTOR */
unknown_nmi_error(reason, regs);
#else
unknown_nmi_error(reason, regs);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 9faf91ae1841..ce8e50239332 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -751,7 +751,6 @@ static struct clocksource clocksource_tsc = {
.read = read_tsc,
.resume = resume_tsc,
.mask = CLOCKSOURCE_MASK(64),
- .shift = 22,
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_MUST_VERIFY,
#ifdef CONFIG_X86_64
@@ -845,8 +844,6 @@ __cpuinit int unsynchronized_tsc(void)
static void __init init_tsc_clocksource(void)
{
- clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
- clocksource_tsc.shift);
if (tsc_clocksource_reliable)
clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
/* lower the rating if we already know its unstable: */
@@ -854,7 +851,7 @@ static void __init init_tsc_clocksource(void)
clocksource_tsc.rating = 0;
clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
}
- clocksource_register(&clocksource_tsc);
+ clocksource_register_khz(&clocksource_tsc, tsc_khz);
}
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S
index 45b6f8a975a1..56a8c2a867d9 100644
--- a/arch/x86/kernel/verify_cpu_64.S
+++ b/arch/x86/kernel/verify_cpu_64.S
@@ -31,6 +31,7 @@
*/
#include <asm/cpufeature.h>
+#include <asm/msr-index.h>
verify_cpu:
pushfl # Save caller passed flags
@@ -88,7 +89,7 @@ verify_cpu_sse_test:
je verify_cpu_sse_ok
test %di,%di
jz verify_cpu_no_longmode # only try to force SSE on AMD
- movl $0xc0010015,%ecx # HWCR
+ movl $MSR_K7_HWCR,%ecx
rdmsr
btr $15,%eax # enable SSE
wrmsr
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 1c0c6ab9c60f..dcbb28c4b694 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,8 +73,8 @@ void update_vsyscall_tz(void)
write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}
-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
- u32 mult)
+void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
+ struct clocksource *clock, u32 mult)
{
unsigned long flags;
@@ -87,7 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
vsyscall_gtod_data.clock.shift = clock->shift;
vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
- vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
+ vsyscall_gtod_data.wall_to_monotonic = *wtm;
vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}
@@ -169,13 +169,18 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
* unlikely */
time_t __vsyscall(1) vtime(time_t *t)
{
- struct timeval tv;
+ unsigned seq;
time_t result;
if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
return time_syscall(t);
- vgettimeofday(&tv, NULL);
- result = tv.tv_sec;
+ do {
+ seq = read_seqbegin(&__vsyscall_gtod_data.lock);
+
+ result = __vsyscall_gtod_data.wall_time_sec;
+
+ } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
+
if (t)
*t = result;
return result;
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 37e68fc5e24a..9c253bd65e24 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -16,11 +16,88 @@
*/
u64 pcntxt_mask;
+/*
+ * Represents init state for the supported extended state.
+ */
+static struct xsave_struct *init_xstate_buf;
+
struct _fpx_sw_bytes fx_sw_reserved;
#ifdef CONFIG_IA32_EMULATION
struct _fpx_sw_bytes fx_sw_reserved_ia32;
#endif
+static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
+
+/*
+ * If a processor implementation discern that a processor state component is
+ * in its initialized state it may modify the corresponding bit in the
+ * xsave_hdr.xstate_bv as '0', with out modifying the corresponding memory
+ * layout in the case of xsaveopt. While presenting the xstate information to
+ * the user, we always ensure that the memory layout of a feature will be in
+ * the init state if the corresponding header bit is zero. This is to ensure
+ * that the user doesn't see some stale state in the memory layout during
+ * signal handling, debugging etc.
+ */
+void __sanitize_i387_state(struct task_struct *tsk)
+{
+ u64 xstate_bv;
+ int feature_bit = 0x2;
+ struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
+
+ if (!fx)
+ return;
+
+ BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU);
+
+ xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
+
+ /*
+ * None of the feature bits are in init state. So nothing else
+ * to do for us, as the memory layout is upto date.
+ */
+ if ((xstate_bv & pcntxt_mask) == pcntxt_mask)
+ return;
+
+ /*
+ * FP is in init state
+ */
+ if (!(xstate_bv & XSTATE_FP)) {
+ fx->cwd = 0x37f;
+ fx->swd = 0;
+ fx->twd = 0;
+ fx->fop = 0;
+ fx->rip = 0;
+ fx->rdp = 0;
+ memset(&fx->st_space[0], 0, 128);
+ }
+
+ /*
+ * SSE is in init state
+ */
+ if (!(xstate_bv & XSTATE_SSE))
+ memset(&fx->xmm_space[0], 0, 256);
+
+ xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2;
+
+ /*
+ * Update all the other memory layouts for which the corresponding
+ * header bit is in the init state.
+ */
+ while (xstate_bv) {
+ if (xstate_bv & 0x1) {
+ int offset = xstate_offsets[feature_bit];
+ int size = xstate_sizes[feature_bit];
+
+ memcpy(((void *) fx) + offset,
+ ((void *) init_xstate_buf) + offset,
+ size);
+ }
+
+ xstate_bv >>= 1;
+ feature_bit++;
+ }
+}
+
/*
* Check for the presence of extended state information in the
* user fpstate pointer in the sigcontext.
@@ -36,15 +113,14 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf,
err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
sizeof(struct _fpx_sw_bytes));
-
if (err)
- return err;
+ return -EFAULT;
/*
* First Magic check failed.
*/
if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
- return -1;
+ return -EINVAL;
/*
* Check for error scenarios.
@@ -52,19 +128,21 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf,
if (fx_sw_user->xstate_size < min_xstate_size ||
fx_sw_user->xstate_size > xstate_size ||
fx_sw_user->xstate_size > fx_sw_user->extended_size)
- return -1;
+ return -EINVAL;
err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
fx_sw_user->extended_size -
FP_XSTATE_MAGIC2_SIZE));
+ if (err)
+ return err;
/*
* Check for the presence of second magic word at the end of memory
* layout. This detects the case where the user just copied the legacy
* fpstate layout with out copying the extended state information
* in the memory layout.
*/
- if (err || magic2 != FP_XSTATE_MAGIC2)
- return -1;
+ if (magic2 != FP_XSTATE_MAGIC2)
+ return -EFAULT;
return 0;
}
@@ -91,14 +169,6 @@ int save_i387_xstate(void __user *buf)
return 0;
if (task_thread_info(tsk)->status & TS_USEDFPU) {
- /*
- * Start with clearing the user buffer. This will present a
- * clean context for the bytes not touched by the fxsave/xsave.
- */
- err = __clear_user(buf, sig_xstate_size);
- if (err)
- return err;
-
if (use_xsave())
err = xsave_user(buf);
else
@@ -109,6 +179,7 @@ int save_i387_xstate(void __user *buf)
task_thread_info(tsk)->status &= ~TS_USEDFPU;
stts();
} else {
+ sanitize_i387_state(tsk);
if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
xstate_size))
return -1;
@@ -184,8 +255,8 @@ static int restore_user_xstate(void __user *buf)
* init the state skipped by the user.
*/
mask = pcntxt_mask & ~mask;
-
- xrstor_state(init_xstate_buf, mask);
+ if (unlikely(mask))
+ xrstor_state(init_xstate_buf, mask);
return 0;
@@ -274,11 +345,6 @@ static void prepare_fx_sw_frame(void)
#endif
}
-/*
- * Represents init state for the supported extended state.
- */
-struct xsave_struct *init_xstate_buf;
-
#ifdef CONFIG_X86_64
unsigned int sig_xstate_size = sizeof(struct _fpstate);
#endif
@@ -286,37 +352,77 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate);
/*
* Enable the extended processor state save/restore feature
*/
-void __cpuinit xsave_init(void)
+static inline void xstate_enable(void)
{
- if (!cpu_has_xsave)
- return;
-
set_in_cr4(X86_CR4_OSXSAVE);
-
- /*
- * Enable all the features that the HW is capable of
- * and the Linux kernel is aware of.
- */
xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
}
/*
+ * Record the offsets and sizes of different state managed by the xsave
+ * memory layout.
+ */
+static void __init setup_xstate_features(void)
+{
+ int eax, ebx, ecx, edx, leaf = 0x2;
+
+ xstate_features = fls64(pcntxt_mask);
+ xstate_offsets = alloc_bootmem(xstate_features * sizeof(int));
+ xstate_sizes = alloc_bootmem(xstate_features * sizeof(int));
+
+ do {
+ cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx);
+
+ if (eax == 0)
+ break;
+
+ xstate_offsets[leaf] = ebx;
+ xstate_sizes[leaf] = eax;
+
+ leaf++;
+ } while (1);
+}
+
+/*
* setup the xstate image representing the init state
*/
static void __init setup_xstate_init(void)
{
+ setup_xstate_features();
+
+ /*
+ * Setup init_xstate_buf to represent the init state of
+ * all the features managed by the xsave
+ */
init_xstate_buf = alloc_bootmem(xstate_size);
init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
+
+ clts();
+ /*
+ * Init all the features state with header_bv being 0x0
+ */
+ xrstor_state(init_xstate_buf, -1);
+ /*
+ * Dump the init state again. This is to identify the init state
+ * of any feature which is not represented by all zero's.
+ */
+ xsave_state(init_xstate_buf, -1);
+ stts();
}
/*
* Enable and initialize the xsave feature.
*/
-void __ref xsave_cntxt_init(void)
+static void __init xstate_enable_boot_cpu(void)
{
unsigned int eax, ebx, ecx, edx;
- cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
+ if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
+ WARN(1, KERN_ERR "XSTATE_CPUID missing\n");
+ return;
+ }
+
+ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
pcntxt_mask = eax + ((u64)edx << 32);
if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
@@ -329,12 +435,13 @@ void __ref xsave_cntxt_init(void)
* Support only the state known to OS.
*/
pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
- xsave_init();
+
+ xstate_enable();
/*
* Recompute the context size for enabled features
*/
- cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
+ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
xstate_size = ebx;
update_regset_xstate_info(xstate_size, pcntxt_mask);
@@ -346,3 +453,23 @@ void __ref xsave_cntxt_init(void)
"cntxt size 0x%x\n",
pcntxt_mask, xstate_size);
}
+
+/*
+ * For the very first instance, this calls xstate_enable_boot_cpu();
+ * for all subsequent instances, this calls xstate_enable().
+ *
+ * This is somewhat obfuscated due to the lack of powerful enough
+ * overrides for the section checks.
+ */
+void __cpuinit xsave_init(void)
+{
+ static __refdata void (*next_func)(void) = xstate_enable_boot_cpu;
+ void (*this_func)(void);
+
+ if (!cpu_has_xsave)
+ return;
+
+ this_func = next_func;
+ next_func = xstate_enable;
+ this_func();
+}
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5ac0bb465ed6..b38bd8b92aa6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -9,6 +9,7 @@
* privileged instructions:
*
* Copyright (C) 2006 Qumranet
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
*
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
@@ -67,6 +68,9 @@
#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
#define SrcImmU (9<<4) /* Immediate operand, unsigned */
#define SrcSI (0xa<<4) /* Source is in the DS:RSI */
+#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */
+#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */
+#define SrcAcc (0xd<<4) /* Source Accumulator */
#define SrcMask (0xf<<4)
/* Generic ModRM decode. */
#define ModRM (1<<8)
@@ -88,10 +92,6 @@
#define Src2CL (1<<29)
#define Src2ImmByte (2<<29)
#define Src2One (3<<29)
-#define Src2Imm16 (4<<29)
-#define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be
- in memory and second argument is located
- immediately after the first one in memory. */
#define Src2Mask (7<<29)
enum {
@@ -124,15 +124,15 @@ static u32 opcode_table[256] = {
/* 0x20 - 0x27 */
ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
+ ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
/* 0x28 - 0x2F */
ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
+ ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
/* 0x30 - 0x37 */
ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
+ ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
/* 0x38 - 0x3F */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -170,20 +170,20 @@ static u32 opcode_table[256] = {
/* 0x88 - 0x8F */
ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstMem | SrcReg | ModRM | Mov, ModRM | DstReg,
- DstReg | SrcMem | ModRM | Mov, Group | Group1A,
+ DstMem | SrcNone | ModRM | Mov, ModRM | DstReg,
+ ImplicitOps | SrcMem16 | ModRM, Group | Group1A,
/* 0x90 - 0x97 */
DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
/* 0x98 - 0x9F */
- 0, 0, SrcImm | Src2Imm16 | No64, 0,
+ 0, 0, SrcImmFAddr | No64, 0,
ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
/* 0xA0 - 0xA7 */
- ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
- ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
+ ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs,
+ ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs,
ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
/* 0xA8 - 0xAF */
- 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
+ DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
ByteOp | DstDI | String, DstDI | String,
/* 0xB0 - 0xB7 */
@@ -215,7 +215,7 @@ static u32 opcode_table[256] = {
ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
/* 0xE8 - 0xEF */
SrcImm | Stack, SrcImm | ImplicitOps,
- SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
+ SrcImmFAddr | No64, SrcImmByte | ImplicitOps,
SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
/* 0xF0 - 0xF7 */
@@ -337,20 +337,20 @@ static u32 group_table[] = {
[Group1A*8] =
DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
[Group3_Byte*8] =
- ByteOp | SrcImm | DstMem | ModRM, 0,
+ ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM,
ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
0, 0, 0, 0,
[Group3*8] =
- DstMem | SrcImm | ModRM, 0,
+ DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
0, 0, 0, 0,
[Group4*8] =
- ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+ ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock,
0, 0, 0, 0, 0, 0,
[Group5*8] =
- DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+ DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock,
SrcMem | ModRM | Stack, 0,
- SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps,
+ SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps,
SrcMem | ModRM | Stack, 0,
[Group7*8] =
0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
@@ -576,6 +576,13 @@ static u32 group2_table[] = {
(_type)_x; \
})
+#define insn_fetch_arr(_arr, _size, _eip) \
+({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \
+ if (rc != X86EMUL_CONTINUE) \
+ goto done; \
+ (_eip) += (_size); \
+})
+
static inline unsigned long ad_mask(struct decode_cache *c)
{
return (1UL << (c->ad_bytes << 3)) - 1;
@@ -617,31 +624,66 @@ static void set_seg_override(struct decode_cache *c, int seg)
c->seg_override = seg;
}
-static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
+static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int seg)
{
if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
return 0;
- return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg);
+ return ops->get_cached_segment_base(seg, ctxt->vcpu);
}
static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
struct decode_cache *c)
{
if (!c->has_seg_override)
return 0;
- return seg_base(ctxt, c->seg_override);
+ return seg_base(ctxt, ops, c->seg_override);
+}
+
+static unsigned long es_base(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ return seg_base(ctxt, ops, VCPU_SREG_ES);
+}
+
+static unsigned long ss_base(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ return seg_base(ctxt, ops, VCPU_SREG_SS);
+}
+
+static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
+ u32 error, bool valid)
+{
+ ctxt->exception = vec;
+ ctxt->error_code = error;
+ ctxt->error_code_valid = valid;
+ ctxt->restart = false;
+}
+
+static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
+{
+ emulate_exception(ctxt, GP_VECTOR, err, true);
}
-static unsigned long es_base(struct x86_emulate_ctxt *ctxt)
+static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr,
+ int err)
{
- return seg_base(ctxt, VCPU_SREG_ES);
+ ctxt->cr2 = addr;
+ emulate_exception(ctxt, PF_VECTOR, err, true);
}
-static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
+static void emulate_ud(struct x86_emulate_ctxt *ctxt)
{
- return seg_base(ctxt, VCPU_SREG_SS);
+ emulate_exception(ctxt, UD_VECTOR, 0, false);
+}
+
+static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
+{
+ emulate_exception(ctxt, TS_VECTOR, err, true);
}
static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
@@ -932,12 +974,9 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
/* we cannot decode insn before we complete previous rep insn */
WARN_ON(ctxt->restart);
- /* Shadow copy of register state. Committed on successful emulation. */
- memset(c, 0, sizeof(struct decode_cache));
c->eip = ctxt->eip;
c->fetch.start = c->fetch.end = c->eip;
- ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
- memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+ ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
switch (mode) {
case X86EMUL_MODE_REAL:
@@ -1060,7 +1099,7 @@ done_prefixes:
set_seg_override(c, VCPU_SREG_DS);
if (!(!c->twobyte && c->b == 0x8d))
- c->modrm_ea += seg_override_base(ctxt, c);
+ c->modrm_ea += seg_override_base(ctxt, ops, c);
if (c->ad_bytes != 8)
c->modrm_ea = (u32)c->modrm_ea;
@@ -1148,6 +1187,25 @@ done_prefixes:
else
c->src.val = insn_fetch(u8, 1, c->eip);
break;
+ case SrcAcc:
+ c->src.type = OP_REG;
+ c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->src.ptr = &c->regs[VCPU_REGS_RAX];
+ switch (c->src.bytes) {
+ case 1:
+ c->src.val = *(u8 *)c->src.ptr;
+ break;
+ case 2:
+ c->src.val = *(u16 *)c->src.ptr;
+ break;
+ case 4:
+ c->src.val = *(u32 *)c->src.ptr;
+ break;
+ case 8:
+ c->src.val = *(u64 *)c->src.ptr;
+ break;
+ }
+ break;
case SrcOne:
c->src.bytes = 1;
c->src.val = 1;
@@ -1156,10 +1214,21 @@ done_prefixes:
c->src.type = OP_MEM;
c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
c->src.ptr = (unsigned long *)
- register_address(c, seg_override_base(ctxt, c),
+ register_address(c, seg_override_base(ctxt, ops, c),
c->regs[VCPU_REGS_RSI]);
c->src.val = 0;
break;
+ case SrcImmFAddr:
+ c->src.type = OP_IMM;
+ c->src.ptr = (unsigned long *)c->eip;
+ c->src.bytes = c->op_bytes + 2;
+ insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
+ break;
+ case SrcMemFAddr:
+ c->src.type = OP_MEM;
+ c->src.ptr = (unsigned long *)c->modrm_ea;
+ c->src.bytes = c->op_bytes + 2;
+ break;
}
/*
@@ -1179,22 +1248,10 @@ done_prefixes:
c->src2.bytes = 1;
c->src2.val = insn_fetch(u8, 1, c->eip);
break;
- case Src2Imm16:
- c->src2.type = OP_IMM;
- c->src2.ptr = (unsigned long *)c->eip;
- c->src2.bytes = 2;
- c->src2.val = insn_fetch(u16, 2, c->eip);
- break;
case Src2One:
c->src2.bytes = 1;
c->src2.val = 1;
break;
- case Src2Mem16:
- c->src2.type = OP_MEM;
- c->src2.bytes = 2;
- c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes);
- c->src2.val = 0;
- break;
}
/* Decode and fetch the destination operand: register or memory. */
@@ -1253,7 +1310,7 @@ done_prefixes:
c->dst.type = OP_MEM;
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
c->dst.ptr = (unsigned long *)
- register_address(c, es_base(ctxt),
+ register_address(c, es_base(ctxt, ops),
c->regs[VCPU_REGS_RDI]);
c->dst.val = 0;
break;
@@ -1263,6 +1320,37 @@ done:
return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
}
+static int read_emulated(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ unsigned long addr, void *dest, unsigned size)
+{
+ int rc;
+ struct read_cache *mc = &ctxt->decode.mem_read;
+ u32 err;
+
+ while (size) {
+ int n = min(size, 8u);
+ size -= n;
+ if (mc->pos < mc->end)
+ goto read_cached;
+
+ rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
+ ctxt->vcpu);
+ if (rc == X86EMUL_PROPAGATE_FAULT)
+ emulate_pf(ctxt, addr, err);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ mc->end += n;
+
+ read_cached:
+ memcpy(dest, mc->data + mc->pos, n);
+ mc->pos += n;
+ dest += n;
+ addr += n;
+ }
+ return X86EMUL_CONTINUE;
+}
+
static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
unsigned int size, unsigned short port,
@@ -1330,13 +1418,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
get_descriptor_table_ptr(ctxt, ops, selector, &dt);
if (dt.size < index * 8 + 7) {
- kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
+ emulate_gp(ctxt, selector & 0xfffc);
return X86EMUL_PROPAGATE_FAULT;
}
addr = dt.address + index * 8;
ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
if (ret == X86EMUL_PROPAGATE_FAULT)
- kvm_inject_page_fault(ctxt->vcpu, addr, err);
+ emulate_pf(ctxt, addr, err);
return ret;
}
@@ -1355,14 +1443,14 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
get_descriptor_table_ptr(ctxt, ops, selector, &dt);
if (dt.size < index * 8 + 7) {
- kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
+ emulate_gp(ctxt, selector & 0xfffc);
return X86EMUL_PROPAGATE_FAULT;
}
addr = dt.address + index * 8;
ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
if (ret == X86EMUL_PROPAGATE_FAULT)
- kvm_inject_page_fault(ctxt->vcpu, addr, err);
+ emulate_pf(ctxt, addr, err);
return ret;
}
@@ -1481,11 +1569,70 @@ load:
ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
return X86EMUL_CONTINUE;
exception:
- kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code);
+ emulate_exception(ctxt, err_vec, err_code, true);
return X86EMUL_PROPAGATE_FAULT;
}
-static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
+static inline int writeback(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ int rc;
+ struct decode_cache *c = &ctxt->decode;
+ u32 err;
+
+ switch (c->dst.type) {
+ case OP_REG:
+ /* The 4-byte case *is* correct:
+ * in 64-bit mode we zero-extend.
+ */
+ switch (c->dst.bytes) {
+ case 1:
+ *(u8 *)c->dst.ptr = (u8)c->dst.val;
+ break;
+ case 2:
+ *(u16 *)c->dst.ptr = (u16)c->dst.val;
+ break;
+ case 4:
+ *c->dst.ptr = (u32)c->dst.val;
+ break; /* 64b: zero-ext */
+ case 8:
+ *c->dst.ptr = c->dst.val;
+ break;
+ }
+ break;
+ case OP_MEM:
+ if (c->lock_prefix)
+ rc = ops->cmpxchg_emulated(
+ (unsigned long)c->dst.ptr,
+ &c->dst.orig_val,
+ &c->dst.val,
+ c->dst.bytes,
+ &err,
+ ctxt->vcpu);
+ else
+ rc = ops->write_emulated(
+ (unsigned long)c->dst.ptr,
+ &c->dst.val,
+ c->dst.bytes,
+ &err,
+ ctxt->vcpu);
+ if (rc == X86EMUL_PROPAGATE_FAULT)
+ emulate_pf(ctxt,
+ (unsigned long)c->dst.ptr, err);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ break;
+ case OP_NONE:
+ /* no writeback */
+ break;
+ default:
+ break;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
@@ -1493,7 +1640,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
c->dst.bytes = c->op_bytes;
c->dst.val = c->src.val;
register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
- c->dst.ptr = (void *) register_address(c, ss_base(ctxt),
+ c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops),
c->regs[VCPU_REGS_RSP]);
}
@@ -1504,9 +1651,9 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
struct decode_cache *c = &ctxt->decode;
int rc;
- rc = ops->read_emulated(register_address(c, ss_base(ctxt),
- c->regs[VCPU_REGS_RSP]),
- dest, len, ctxt->vcpu);
+ rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops),
+ c->regs[VCPU_REGS_RSP]),
+ dest, len);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -1541,7 +1688,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
break;
case X86EMUL_MODE_VM86:
if (iopl < 3) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
change_mask |= EFLG_IF;
@@ -1557,15 +1704,14 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
return rc;
}
-static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
+static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int seg)
{
struct decode_cache *c = &ctxt->decode;
- struct kvm_segment segment;
- kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg);
+ c->src.val = ops->get_segment_selector(seg, ctxt->vcpu);
- c->src.val = segment.selector;
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
}
static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
@@ -1583,19 +1729,31 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
return rc;
}
-static void emulate_pusha(struct x86_emulate_ctxt *ctxt)
+static int emulate_pusha(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
unsigned long old_esp = c->regs[VCPU_REGS_RSP];
+ int rc = X86EMUL_CONTINUE;
int reg = VCPU_REGS_RAX;
while (reg <= VCPU_REGS_RDI) {
(reg == VCPU_REGS_RSP) ?
(c->src.val = old_esp) : (c->src.val = c->regs[reg]);
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
+
+ rc = writeback(ctxt, ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
++reg;
}
+
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
+
+ return rc;
}
static int emulate_popa(struct x86_emulate_ctxt *ctxt,
@@ -1695,14 +1853,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
old_eip = c->eip;
c->eip = c->src.val;
c->src.val = old_eip;
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
break;
}
case 4: /* jmp abs */
c->eip = c->src.val;
break;
case 6: /* push */
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
break;
}
return X86EMUL_CONTINUE;
@@ -1748,145 +1906,82 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
return rc;
}
-static inline int writeback(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
-{
- int rc;
- struct decode_cache *c = &ctxt->decode;
-
- switch (c->dst.type) {
- case OP_REG:
- /* The 4-byte case *is* correct:
- * in 64-bit mode we zero-extend.
- */
- switch (c->dst.bytes) {
- case 1:
- *(u8 *)c->dst.ptr = (u8)c->dst.val;
- break;
- case 2:
- *(u16 *)c->dst.ptr = (u16)c->dst.val;
- break;
- case 4:
- *c->dst.ptr = (u32)c->dst.val;
- break; /* 64b: zero-ext */
- case 8:
- *c->dst.ptr = c->dst.val;
- break;
- }
- break;
- case OP_MEM:
- if (c->lock_prefix)
- rc = ops->cmpxchg_emulated(
- (unsigned long)c->dst.ptr,
- &c->dst.orig_val,
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu);
- else
- rc = ops->write_emulated(
- (unsigned long)c->dst.ptr,
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- break;
- case OP_NONE:
- /* no writeback */
- break;
- default:
- break;
- }
- return X86EMUL_CONTINUE;
-}
-
-static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
-{
- u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
- /*
- * an sti; sti; sequence only disable interrupts for the first
- * instruction. So, if the last instruction, be it emulated or
- * not, left the system with the INT_STI flag enabled, it
- * means that the last instruction is an sti. We should not
- * leave the flag on in this case. The same goes for mov ss
- */
- if (!(int_shadow & mask))
- ctxt->interruptibility = mask;
-}
-
static inline void
setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
- struct kvm_segment *cs, struct kvm_segment *ss)
+ struct x86_emulate_ops *ops, struct desc_struct *cs,
+ struct desc_struct *ss)
{
- memset(cs, 0, sizeof(struct kvm_segment));
- kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS);
- memset(ss, 0, sizeof(struct kvm_segment));
+ memset(cs, 0, sizeof(struct desc_struct));
+ ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu);
+ memset(ss, 0, sizeof(struct desc_struct));
cs->l = 0; /* will be adjusted later */
- cs->base = 0; /* flat segment */
+ set_desc_base(cs, 0); /* flat segment */
cs->g = 1; /* 4kb granularity */
- cs->limit = 0xffffffff; /* 4GB limit */
+ set_desc_limit(cs, 0xfffff); /* 4GB limit */
cs->type = 0x0b; /* Read, Execute, Accessed */
cs->s = 1;
cs->dpl = 0; /* will be adjusted later */
- cs->present = 1;
- cs->db = 1;
+ cs->p = 1;
+ cs->d = 1;
- ss->unusable = 0;
- ss->base = 0; /* flat segment */
- ss->limit = 0xffffffff; /* 4GB limit */
+ set_desc_base(ss, 0); /* flat segment */
+ set_desc_limit(ss, 0xfffff); /* 4GB limit */
ss->g = 1; /* 4kb granularity */
ss->s = 1;
ss->type = 0x03; /* Read/Write, Accessed */
- ss->db = 1; /* 32bit stack segment */
+ ss->d = 1; /* 32bit stack segment */
ss->dpl = 0;
- ss->present = 1;
+ ss->p = 1;
}
static int
-emulate_syscall(struct x86_emulate_ctxt *ctxt)
+emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- struct kvm_segment cs, ss;
+ struct desc_struct cs, ss;
u64 msr_data;
+ u16 cs_sel, ss_sel;
/* syscall is not available in real mode */
if (ctxt->mode == X86EMUL_MODE_REAL ||
ctxt->mode == X86EMUL_MODE_VM86) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
return X86EMUL_PROPAGATE_FAULT;
}
- setup_syscalls_segments(ctxt, &cs, &ss);
+ setup_syscalls_segments(ctxt, ops, &cs, &ss);
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+ ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
msr_data >>= 32;
- cs.selector = (u16)(msr_data & 0xfffc);
- ss.selector = (u16)(msr_data + 8);
+ cs_sel = (u16)(msr_data & 0xfffc);
+ ss_sel = (u16)(msr_data + 8);
if (is_long_mode(ctxt->vcpu)) {
- cs.db = 0;
+ cs.d = 0;
cs.l = 1;
}
- kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
- kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+ ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
c->regs[VCPU_REGS_RCX] = c->eip;
if (is_long_mode(ctxt->vcpu)) {
#ifdef CONFIG_X86_64
c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
- kvm_x86_ops->get_msr(ctxt->vcpu,
- ctxt->mode == X86EMUL_MODE_PROT64 ?
- MSR_LSTAR : MSR_CSTAR, &msr_data);
+ ops->get_msr(ctxt->vcpu,
+ ctxt->mode == X86EMUL_MODE_PROT64 ?
+ MSR_LSTAR : MSR_CSTAR, &msr_data);
c->eip = msr_data;
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
+ ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
ctxt->eflags &= ~(msr_data | EFLG_RF);
#endif
} else {
/* legacy mode */
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+ ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
c->eip = (u32)msr_data;
ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
@@ -1896,15 +1991,16 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
}
static int
-emulate_sysenter(struct x86_emulate_ctxt *ctxt)
+emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- struct kvm_segment cs, ss;
+ struct desc_struct cs, ss;
u64 msr_data;
+ u16 cs_sel, ss_sel;
/* inject #GP if in real mode */
if (ctxt->mode == X86EMUL_MODE_REAL) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
@@ -1912,67 +2008,70 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
* Therefore, we inject an #UD.
*/
if (ctxt->mode == X86EMUL_MODE_PROT64) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
return X86EMUL_PROPAGATE_FAULT;
}
- setup_syscalls_segments(ctxt, &cs, &ss);
+ setup_syscalls_segments(ctxt, ops, &cs, &ss);
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+ ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
switch (ctxt->mode) {
case X86EMUL_MODE_PROT32:
if ((msr_data & 0xfffc) == 0x0) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
break;
case X86EMUL_MODE_PROT64:
if (msr_data == 0x0) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
break;
}
ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
- cs.selector = (u16)msr_data;
- cs.selector &= ~SELECTOR_RPL_MASK;
- ss.selector = cs.selector + 8;
- ss.selector &= ~SELECTOR_RPL_MASK;
+ cs_sel = (u16)msr_data;
+ cs_sel &= ~SELECTOR_RPL_MASK;
+ ss_sel = cs_sel + 8;
+ ss_sel &= ~SELECTOR_RPL_MASK;
if (ctxt->mode == X86EMUL_MODE_PROT64
|| is_long_mode(ctxt->vcpu)) {
- cs.db = 0;
+ cs.d = 0;
cs.l = 1;
}
- kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
- kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+ ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
+ ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
c->eip = msr_data;
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
+ ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
c->regs[VCPU_REGS_RSP] = msr_data;
return X86EMUL_CONTINUE;
}
static int
-emulate_sysexit(struct x86_emulate_ctxt *ctxt)
+emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- struct kvm_segment cs, ss;
+ struct desc_struct cs, ss;
u64 msr_data;
int usermode;
+ u16 cs_sel, ss_sel;
/* inject #GP if in real mode or Virtual 8086 mode */
if (ctxt->mode == X86EMUL_MODE_REAL ||
ctxt->mode == X86EMUL_MODE_VM86) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
- setup_syscalls_segments(ctxt, &cs, &ss);
+ setup_syscalls_segments(ctxt, ops, &cs, &ss);
if ((c->rex_prefix & 0x8) != 0x0)
usermode = X86EMUL_MODE_PROT64;
@@ -1981,35 +2080,37 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
cs.dpl = 3;
ss.dpl = 3;
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+ ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
switch (usermode) {
case X86EMUL_MODE_PROT32:
- cs.selector = (u16)(msr_data + 16);
+ cs_sel = (u16)(msr_data + 16);
if ((msr_data & 0xfffc) == 0x0) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
- ss.selector = (u16)(msr_data + 24);
+ ss_sel = (u16)(msr_data + 24);
break;
case X86EMUL_MODE_PROT64:
- cs.selector = (u16)(msr_data + 32);
+ cs_sel = (u16)(msr_data + 32);
if (msr_data == 0x0) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
- ss.selector = cs.selector + 8;
- cs.db = 0;
+ ss_sel = cs_sel + 8;
+ cs.d = 0;
cs.l = 1;
break;
}
- cs.selector |= SELECTOR_RPL_MASK;
- ss.selector |= SELECTOR_RPL_MASK;
+ cs_sel |= SELECTOR_RPL_MASK;
+ ss_sel |= SELECTOR_RPL_MASK;
- kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
- kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+ ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
- c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
- c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
+ c->eip = c->regs[VCPU_REGS_RDX];
+ c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];
return X86EMUL_CONTINUE;
}
@@ -2030,25 +2131,25 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
u16 port, u16 len)
{
- struct kvm_segment tr_seg;
+ struct desc_struct tr_seg;
int r;
u16 io_bitmap_ptr;
u8 perm, bit_idx = port & 0x7;
unsigned mask = (1 << len) - 1;
- kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR);
- if (tr_seg.unusable)
+ ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu);
+ if (!tr_seg.p)
return false;
- if (tr_seg.limit < 103)
+ if (desc_limit_scaled(&tr_seg) < 103)
return false;
- r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu,
- NULL);
+ r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2,
+ ctxt->vcpu, NULL);
if (r != X86EMUL_CONTINUE)
return false;
- if (io_bitmap_ptr + port/8 > tr_seg.limit)
+ if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
return false;
- r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1,
- ctxt->vcpu, NULL);
+ r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8,
+ &perm, 1, ctxt->vcpu, NULL);
if (r != X86EMUL_CONTINUE)
return false;
if ((perm >> bit_idx) & mask)
@@ -2066,17 +2167,6 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
return true;
}
-static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- int seg)
-{
- struct desc_struct desc;
- if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu))
- return get_desc_base(&desc);
- else
- return ~0;
-}
-
static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
struct tss_segment_16 *tss)
@@ -2165,7 +2255,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ emulate_pf(ctxt, old_tss_base, err);
return ret;
}
@@ -2175,7 +2265,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ emulate_pf(ctxt, old_tss_base, err);
return ret;
}
@@ -2183,7 +2273,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ emulate_pf(ctxt, new_tss_base, err);
return ret;
}
@@ -2196,7 +2286,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
ctxt->vcpu, &err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ emulate_pf(ctxt, new_tss_base, err);
return ret;
}
}
@@ -2238,7 +2328,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
struct decode_cache *c = &ctxt->decode;
int ret;
- ops->set_cr(3, tss->cr3, ctxt->vcpu);
+ if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) {
+ emulate_gp(ctxt, 0);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
c->eip = tss->eip;
ctxt->eflags = tss->eflags | 2;
c->regs[VCPU_REGS_RAX] = tss->eax;
@@ -2304,7 +2397,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ emulate_pf(ctxt, old_tss_base, err);
return ret;
}
@@ -2314,7 +2407,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ emulate_pf(ctxt, old_tss_base, err);
return ret;
}
@@ -2322,7 +2415,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ emulate_pf(ctxt, new_tss_base, err);
return ret;
}
@@ -2335,7 +2428,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
ctxt->vcpu, &err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ emulate_pf(ctxt, new_tss_base, err);
return ret;
}
}
@@ -2352,7 +2445,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
int ret;
u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
ulong old_tss_base =
- get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR);
+ ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu);
u32 desc_limit;
/* FIXME: old_tss_base == ~0 ? */
@@ -2369,7 +2462,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
if (reason != TASK_SWITCH_IRET) {
if ((tss_selector & 3) > next_tss_desc.dpl ||
ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
}
@@ -2378,8 +2471,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
if (!next_tss_desc.p ||
((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
desc_limit < 0x2b)) {
- kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR,
- tss_selector & 0xfffc);
+ emulate_ts(ctxt, tss_selector & 0xfffc);
return X86EMUL_PROPAGATE_FAULT;
}
@@ -2425,7 +2517,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
c->lock_prefix = 0;
c->src.val = (unsigned long) error_code;
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
}
return ret;
@@ -2439,18 +2531,16 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
struct decode_cache *c = &ctxt->decode;
int rc;
- memset(c, 0, sizeof(struct decode_cache));
c->eip = ctxt->eip;
- memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
c->dst.type = OP_NONE;
rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
has_error_code, error_code);
if (rc == X86EMUL_CONTINUE) {
- memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
- kvm_rip_write(ctxt->vcpu, c->eip);
rc = writeback(ctxt, ops);
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->eip = c->eip;
}
return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
@@ -2474,29 +2564,22 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
int rc = X86EMUL_CONTINUE;
int saved_dst_type = c->dst.type;
- ctxt->interruptibility = 0;
-
- /* Shadow copy of register state. Committed on successful emulation.
- * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
- * modify them.
- */
-
- memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+ ctxt->decode.mem_read.pos = 0;
if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
goto done;
}
/* LOCK prefix is allowed only with some instructions */
if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
goto done;
}
/* Privileged instruction can be executed only in CPL=0 */
if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
goto done;
}
@@ -2506,7 +2589,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
string_done:
ctxt->restart = false;
- kvm_rip_write(ctxt->vcpu, c->eip);
+ ctxt->eip = c->eip;
goto done;
}
/* The second termination condition only applies for REPE
@@ -2529,20 +2612,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
}
if (c->src.type == OP_MEM) {
- rc = ops->read_emulated((unsigned long)c->src.ptr,
- &c->src.val,
- c->src.bytes,
- ctxt->vcpu);
+ rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr,
+ c->src.valptr, c->src.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
c->src.orig_val = c->src.val;
}
if (c->src2.type == OP_MEM) {
- rc = ops->read_emulated((unsigned long)c->src2.ptr,
- &c->src2.val,
- c->src2.bytes,
- ctxt->vcpu);
+ rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr,
+ &c->src2.val, c->src2.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
}
@@ -2553,8 +2632,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
/* optimisation - avoid slow emulated read if Mov */
- rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val,
- c->dst.bytes, ctxt->vcpu);
+ rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr,
+ &c->dst.val, c->dst.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
}
@@ -2571,7 +2650,7 @@ special_insn:
emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
break;
case 0x06: /* push es */
- emulate_push_sreg(ctxt, VCPU_SREG_ES);
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);
break;
case 0x07: /* pop es */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
@@ -2583,14 +2662,14 @@ special_insn:
emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
break;
case 0x0e: /* push cs */
- emulate_push_sreg(ctxt, VCPU_SREG_CS);
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);
break;
case 0x10 ... 0x15:
adc: /* adc */
emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
break;
case 0x16: /* push ss */
- emulate_push_sreg(ctxt, VCPU_SREG_SS);
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);
break;
case 0x17: /* pop ss */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
@@ -2602,7 +2681,7 @@ special_insn:
emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
break;
case 0x1e: /* push ds */
- emulate_push_sreg(ctxt, VCPU_SREG_DS);
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);
break;
case 0x1f: /* pop ds */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
@@ -2632,7 +2711,7 @@ special_insn:
emulate_1op("dec", c->dst, ctxt->eflags);
break;
case 0x50 ... 0x57: /* push reg */
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
break;
case 0x58 ... 0x5f: /* pop reg */
pop_instruction:
@@ -2641,7 +2720,9 @@ special_insn:
goto done;
break;
case 0x60: /* pusha */
- emulate_pusha(ctxt);
+ rc = emulate_pusha(ctxt, ops);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
break;
case 0x61: /* popa */
rc = emulate_popa(ctxt, ops);
@@ -2655,14 +2736,14 @@ special_insn:
break;
case 0x68: /* push imm */
case 0x6a: /* push imm8 */
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
break;
case 0x6c: /* insb */
case 0x6d: /* insw/insd */
c->dst.bytes = min(c->dst.bytes, 4u);
if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
c->dst.bytes)) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
goto done;
}
if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
@@ -2674,7 +2755,7 @@ special_insn:
c->src.bytes = min(c->src.bytes, 4u);
if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
c->src.bytes)) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
goto done;
}
ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
@@ -2707,6 +2788,7 @@ special_insn:
}
break;
case 0x84 ... 0x85:
+ test:
emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
break;
case 0x86 ... 0x87: /* xchg */
@@ -2735,18 +2817,13 @@ special_insn:
break;
case 0x88 ... 0x8b: /* mov */
goto mov;
- case 0x8c: { /* mov r/m, sreg */
- struct kvm_segment segreg;
-
- if (c->modrm_reg <= VCPU_SREG_GS)
- kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
- else {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ case 0x8c: /* mov r/m, sreg */
+ if (c->modrm_reg > VCPU_SREG_GS) {
+ emulate_ud(ctxt);
goto done;
}
- c->dst.val = segreg.selector;
+ c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
break;
- }
case 0x8d: /* lea r16/r32, m */
c->dst.val = c->modrm_ea;
break;
@@ -2757,12 +2834,12 @@ special_insn:
if (c->modrm_reg == VCPU_SREG_CS ||
c->modrm_reg > VCPU_SREG_GS) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
goto done;
}
if (c->modrm_reg == VCPU_SREG_SS)
- toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS);
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
@@ -2775,19 +2852,19 @@ special_insn:
goto done;
break;
case 0x90: /* nop / xchg r8,rax */
- if (!(c->rex_prefix & 1)) { /* nop */
- c->dst.type = OP_NONE;
+ if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) {
+ c->dst.type = OP_NONE; /* nop */
break;
}
case 0x91 ... 0x97: /* xchg reg,rax */
- c->src.type = c->dst.type = OP_REG;
- c->src.bytes = c->dst.bytes = c->op_bytes;
+ c->src.type = OP_REG;
+ c->src.bytes = c->op_bytes;
c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
c->src.val = *(c->src.ptr);
goto xchg;
case 0x9c: /* pushf */
c->src.val = (unsigned long) ctxt->eflags;
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
break;
case 0x9d: /* popf */
c->dst.type = OP_REG;
@@ -2797,19 +2874,15 @@ special_insn:
if (rc != X86EMUL_CONTINUE)
goto done;
break;
- case 0xa0 ... 0xa1: /* mov */
- c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
- c->dst.val = c->src.val;
- break;
- case 0xa2 ... 0xa3: /* mov */
- c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
- break;
+ case 0xa0 ... 0xa3: /* mov */
case 0xa4 ... 0xa5: /* movs */
goto mov;
case 0xa6 ... 0xa7: /* cmps */
c->dst.type = OP_NONE; /* Disable writeback. */
DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
goto cmp;
+ case 0xa8 ... 0xa9: /* test ax, imm */
+ goto test;
case 0xaa ... 0xab: /* stos */
c->dst.val = c->regs[VCPU_REGS_RAX];
break;
@@ -2855,19 +2928,23 @@ special_insn:
long int rel = c->src.val;
c->src.val = (unsigned long) c->eip;
jmp_rel(c, rel);
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
break;
}
case 0xe9: /* jmp rel */
goto jmp;
- case 0xea: /* jmp far */
+ case 0xea: { /* jmp far */
+ unsigned short sel;
jump_far:
- if (load_segment_descriptor(ctxt, ops, c->src2.val,
- VCPU_SREG_CS))
+ memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+
+ if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS))
goto done;
- c->eip = c->src.val;
+ c->eip = 0;
+ memcpy(&c->eip, c->src.valptr, c->op_bytes);
break;
+ }
case 0xeb:
jmp: /* jmp rel short */
jmp_rel(c, c->src.val);
@@ -2879,20 +2956,20 @@ special_insn:
do_io_in:
c->dst.bytes = min(c->dst.bytes, 4u);
if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
goto done;
}
if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
&c->dst.val))
goto done; /* IO is needed */
break;
- case 0xee: /* out al,dx */
- case 0xef: /* out (e/r)ax,dx */
+ case 0xee: /* out dx,al */
+ case 0xef: /* out dx,(e/r)ax */
c->src.val = c->regs[VCPU_REGS_RDX];
do_io_out:
c->dst.bytes = min(c->dst.bytes, 4u);
if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
goto done;
}
ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
@@ -2916,18 +2993,20 @@ special_insn:
c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xfa: /* cli */
- if (emulator_bad_iopl(ctxt, ops))
- kvm_inject_gp(ctxt->vcpu, 0);
- else {
+ if (emulator_bad_iopl(ctxt, ops)) {
+ emulate_gp(ctxt, 0);
+ goto done;
+ } else {
ctxt->eflags &= ~X86_EFLAGS_IF;
c->dst.type = OP_NONE; /* Disable writeback. */
}
break;
case 0xfb: /* sti */
- if (emulator_bad_iopl(ctxt, ops))
- kvm_inject_gp(ctxt->vcpu, 0);
- else {
- toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI);
+ if (emulator_bad_iopl(ctxt, ops)) {
+ emulate_gp(ctxt, 0);
+ goto done;
+ } else {
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
ctxt->eflags |= X86_EFLAGS_IF;
c->dst.type = OP_NONE; /* Disable writeback. */
}
@@ -2964,11 +3043,12 @@ writeback:
c->dst.type = saved_dst_type;
if ((c->d & SrcMask) == SrcSI)
- string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI,
- &c->src);
+ string_addr_inc(ctxt, seg_override_base(ctxt, ops, c),
+ VCPU_REGS_RSI, &c->src);
if ((c->d & DstMask) == DstDI)
- string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst);
+ string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI,
+ &c->dst);
if (c->rep_prefix && (c->d & String)) {
struct read_cache *rc = &ctxt->decode.io_read;
@@ -2981,11 +3061,12 @@ writeback:
(rc->end != 0 && rc->end == rc->pos))
ctxt->restart = false;
}
-
- /* Commit shadow register state. */
- memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
- kvm_rip_write(ctxt->vcpu, c->eip);
- ops->set_rflags(ctxt->vcpu, ctxt->eflags);
+ /*
+ * reset read cache here in case string instruction is restared
+ * without decoding
+ */
+ ctxt->decode.mem_read.end = 0;
+ ctxt->eip = c->eip;
done:
return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
@@ -3051,7 +3132,7 @@ twobyte_insn:
c->dst.type = OP_NONE;
break;
case 5: /* not defined */
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
goto done;
case 7: /* invlpg*/
emulate_invlpg(ctxt->vcpu, c->modrm_ea);
@@ -3063,7 +3144,7 @@ twobyte_insn:
}
break;
case 0x05: /* syscall */
- rc = emulate_syscall(ctxt);
+ rc = emulate_syscall(ctxt, ops);
if (rc != X86EMUL_CONTINUE)
goto done;
else
@@ -3073,8 +3154,11 @@ twobyte_insn:
emulate_clts(ctxt->vcpu);
c->dst.type = OP_NONE;
break;
- case 0x08: /* invd */
case 0x09: /* wbinvd */
+ kvm_emulate_wbinvd(ctxt->vcpu);
+ c->dst.type = OP_NONE;
+ break;
+ case 0x08: /* invd */
case 0x0d: /* GrpP (prefetch) */
case 0x18: /* Grp16 (prefetch/nop) */
c->dst.type = OP_NONE;
@@ -3084,7 +3168,7 @@ twobyte_insn:
case 1:
case 5 ... 7:
case 9 ... 15:
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
goto done;
}
c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
@@ -3093,31 +3177,42 @@ twobyte_insn:
case 0x21: /* mov from dr to reg */
if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
(c->modrm_reg == 4 || c->modrm_reg == 5)) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
goto done;
}
- emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
+ ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu);
c->dst.type = OP_NONE; /* no writeback */
break;
case 0x22: /* mov reg, cr */
- ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu);
+ if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) {
+ emulate_gp(ctxt, 0);
+ goto done;
+ }
c->dst.type = OP_NONE;
break;
case 0x23: /* mov from reg to dr */
if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
(c->modrm_reg == 4 || c->modrm_reg == 5)) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
+ goto done;
+ }
+
+ if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] &
+ ((ctxt->mode == X86EMUL_MODE_PROT64) ?
+ ~0ULL : ~0U), ctxt->vcpu) < 0) {
+ /* #UD condition is already handled by the code above */
+ emulate_gp(ctxt, 0);
goto done;
}
- emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]);
+
c->dst.type = OP_NONE; /* no writeback */
break;
case 0x30:
/* wrmsr */
msr_data = (u32)c->regs[VCPU_REGS_RAX]
| ((u64)c->regs[VCPU_REGS_RDX] << 32);
- if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
+ emulate_gp(ctxt, 0);
goto done;
}
rc = X86EMUL_CONTINUE;
@@ -3125,8 +3220,8 @@ twobyte_insn:
break;
case 0x32:
/* rdmsr */
- if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
+ emulate_gp(ctxt, 0);
goto done;
} else {
c->regs[VCPU_REGS_RAX] = (u32)msr_data;
@@ -3136,14 +3231,14 @@ twobyte_insn:
c->dst.type = OP_NONE;
break;
case 0x34: /* sysenter */
- rc = emulate_sysenter(ctxt);
+ rc = emulate_sysenter(ctxt, ops);
if (rc != X86EMUL_CONTINUE)
goto done;
else
goto writeback;
break;
case 0x35: /* sysexit */
- rc = emulate_sysexit(ctxt);
+ rc = emulate_sysexit(ctxt, ops);
if (rc != X86EMUL_CONTINUE)
goto done;
else
@@ -3160,7 +3255,7 @@ twobyte_insn:
c->dst.type = OP_NONE;
break;
case 0xa0: /* push fs */
- emulate_push_sreg(ctxt, VCPU_SREG_FS);
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
break;
case 0xa1: /* pop fs */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
@@ -3179,7 +3274,7 @@ twobyte_insn:
emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
break;
case 0xa8: /* push gs */
- emulate_push_sreg(ctxt, VCPU_SREG_GS);
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);
break;
case 0xa9: /* pop gs */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 0150affad25d..0fd6378981f4 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -5,6 +5,7 @@
* Copyright (c) 2006 Intel Corporation
* Copyright (c) 2007 Keir Fraser, XenSource Inc
* Copyright (c) 2008 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affilates.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -33,6 +34,7 @@
#include <linux/kvm_host.h>
#include <linux/slab.h>
+#include <linux/workqueue.h>
#include "irq.h"
#include "i8254.h"
@@ -243,11 +245,22 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
{
struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
irq_ack_notifier);
- raw_spin_lock(&ps->inject_lock);
- if (atomic_dec_return(&ps->pit_timer.pending) < 0)
+ int value;
+
+ spin_lock(&ps->inject_lock);
+ value = atomic_dec_return(&ps->pit_timer.pending);
+ if (value < 0)
+ /* spurious acks can be generated if, for example, the
+ * PIC is being reset. Handle it gracefully here
+ */
atomic_inc(&ps->pit_timer.pending);
+ else if (value > 0)
+ /* in this case, we had multiple outstanding pit interrupts
+ * that we needed to inject. Reinject
+ */
+ queue_work(ps->pit->wq, &ps->pit->expired);
ps->irq_ack = 1;
- raw_spin_unlock(&ps->inject_lock);
+ spin_unlock(&ps->inject_lock);
}
void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -263,10 +276,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
-static void destroy_pit_timer(struct kvm_timer *pt)
+static void destroy_pit_timer(struct kvm_pit *pit)
{
- pr_debug("execute del timer!\n");
- hrtimer_cancel(&pt->timer);
+ hrtimer_cancel(&pit->pit_state.pit_timer.timer);
+ cancel_work_sync(&pit->expired);
}
static bool kpit_is_periodic(struct kvm_timer *ktimer)
@@ -280,6 +293,60 @@ static struct kvm_timer_ops kpit_ops = {
.is_periodic = kpit_is_periodic,
};
+static void pit_do_work(struct work_struct *work)
+{
+ struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
+ struct kvm *kvm = pit->kvm;
+ struct kvm_vcpu *vcpu;
+ int i;
+ struct kvm_kpit_state *ps = &pit->pit_state;
+ int inject = 0;
+
+ /* Try to inject pending interrupts when
+ * last one has been acked.
+ */
+ spin_lock(&ps->inject_lock);
+ if (ps->irq_ack) {
+ ps->irq_ack = 0;
+ inject = 1;
+ }
+ spin_unlock(&ps->inject_lock);
+ if (inject) {
+ kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
+ kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
+
+ /*
+ * Provides NMI watchdog support via Virtual Wire mode.
+ * The route is: PIT -> PIC -> LVT0 in NMI mode.
+ *
+ * Note: Our Virtual Wire implementation is simplified, only
+ * propagating PIT interrupts to all VCPUs when they have set
+ * LVT0 to NMI delivery. Other PIC interrupts are just sent to
+ * VCPU0, and only if its LVT0 is in EXTINT mode.
+ */
+ if (kvm->arch.vapics_in_nmi_mode > 0)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_apic_nmi_wd_deliver(vcpu);
+ }
+}
+
+static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
+{
+ struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+ struct kvm_pit *pt = ktimer->kvm->arch.vpit;
+
+ if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
+ atomic_inc(&ktimer->pending);
+ queue_work(pt->wq, &pt->expired);
+ }
+
+ if (ktimer->t_ops->is_periodic(ktimer)) {
+ hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+ return HRTIMER_RESTART;
+ } else
+ return HRTIMER_NORESTART;
+}
+
static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
{
struct kvm_timer *pt = &ps->pit_timer;
@@ -291,13 +358,13 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
/* TODO The new value only affected after the retriggered */
hrtimer_cancel(&pt->timer);
+ cancel_work_sync(&ps->pit->expired);
pt->period = interval;
ps->is_periodic = is_period;
- pt->timer.function = kvm_timer_fn;
+ pt->timer.function = pit_timer_fn;
pt->t_ops = &kpit_ops;
pt->kvm = ps->pit->kvm;
- pt->vcpu = pt->kvm->bsp_vcpu;
atomic_set(&pt->pending, 0);
ps->irq_ack = 1;
@@ -346,7 +413,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
}
break;
default:
- destroy_pit_timer(&ps->pit_timer);
+ destroy_pit_timer(kvm->arch.vpit);
}
}
@@ -625,7 +692,15 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
mutex_init(&pit->pit_state.lock);
mutex_lock(&pit->pit_state.lock);
- raw_spin_lock_init(&pit->pit_state.inject_lock);
+ spin_lock_init(&pit->pit_state.inject_lock);
+
+ pit->wq = create_singlethread_workqueue("kvm-pit-wq");
+ if (!pit->wq) {
+ mutex_unlock(&pit->pit_state.lock);
+ kfree(pit);
+ return NULL;
+ }
+ INIT_WORK(&pit->expired, pit_do_work);
kvm->arch.vpit = pit;
pit->kvm = kvm;
@@ -677,6 +752,9 @@ void kvm_free_pit(struct kvm *kvm)
struct hrtimer *timer;
if (kvm->arch.vpit) {
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+ &kvm->arch.vpit->speaker_dev);
kvm_unregister_irq_mask_notifier(kvm, 0,
&kvm->arch.vpit->mask_notifier);
kvm_unregister_irq_ack_notifier(kvm,
@@ -684,54 +762,10 @@ void kvm_free_pit(struct kvm *kvm)
mutex_lock(&kvm->arch.vpit->pit_state.lock);
timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
hrtimer_cancel(timer);
+ cancel_work_sync(&kvm->arch.vpit->expired);
kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ destroy_workqueue(kvm->arch.vpit->wq);
kfree(kvm->arch.vpit);
}
}
-
-static void __inject_pit_timer_intr(struct kvm *kvm)
-{
- struct kvm_vcpu *vcpu;
- int i;
-
- kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
- kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
-
- /*
- * Provides NMI watchdog support via Virtual Wire mode.
- * The route is: PIT -> PIC -> LVT0 in NMI mode.
- *
- * Note: Our Virtual Wire implementation is simplified, only
- * propagating PIT interrupts to all VCPUs when they have set
- * LVT0 to NMI delivery. Other PIC interrupts are just sent to
- * VCPU0, and only if its LVT0 is in EXTINT mode.
- */
- if (kvm->arch.vapics_in_nmi_mode > 0)
- kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_apic_nmi_wd_deliver(vcpu);
-}
-
-void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
-{
- struct kvm_pit *pit = vcpu->kvm->arch.vpit;
- struct kvm *kvm = vcpu->kvm;
- struct kvm_kpit_state *ps;
-
- if (pit) {
- int inject = 0;
- ps = &pit->pit_state;
-
- /* Try to inject pending interrupts when
- * last one has been acked.
- */
- raw_spin_lock(&ps->inject_lock);
- if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
- ps->irq_ack = 0;
- inject = 1;
- }
- raw_spin_unlock(&ps->inject_lock);
- if (inject)
- __inject_pit_timer_intr(kvm);
- }
-}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 900d6b0ba7c2..46d08ca0b48f 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -27,7 +27,7 @@ struct kvm_kpit_state {
u32 speaker_data_on;
struct mutex lock;
struct kvm_pit *pit;
- raw_spinlock_t inject_lock;
+ spinlock_t inject_lock;
unsigned long irq_ack;
struct kvm_irq_ack_notifier irq_ack_notifier;
};
@@ -40,6 +40,8 @@ struct kvm_pit {
struct kvm_kpit_state pit_state;
int irq_source_id;
struct kvm_irq_mask_notifier mask_notifier;
+ struct workqueue_struct *wq;
+ struct work_struct expired;
};
#define KVM_PIT_BASE_ADDRESS 0x40
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 93825ff3338f..8d10c063d7f2 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -3,6 +3,7 @@
*
* Copyright (c) 2003-2004 Fabrice Bellard
* Copyright (c) 2007 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affilates.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -33,6 +34,8 @@
#include <linux/kvm_host.h>
#include "trace.h"
+static void pic_irq_request(struct kvm *kvm, int level);
+
static void pic_lock(struct kvm_pic *s)
__acquires(&s->lock)
{
@@ -43,16 +46,25 @@ static void pic_unlock(struct kvm_pic *s)
__releases(&s->lock)
{
bool wakeup = s->wakeup_needed;
- struct kvm_vcpu *vcpu;
+ struct kvm_vcpu *vcpu, *found = NULL;
+ int i;
s->wakeup_needed = false;
raw_spin_unlock(&s->lock);
if (wakeup) {
- vcpu = s->kvm->bsp_vcpu;
- if (vcpu)
- kvm_vcpu_kick(vcpu);
+ kvm_for_each_vcpu(i, vcpu, s->kvm) {
+ if (kvm_apic_accept_pic_intr(vcpu)) {
+ found = vcpu;
+ break;
+ }
+ }
+
+ if (!found)
+ found = s->kvm->bsp_vcpu;
+
+ kvm_vcpu_kick(found);
}
}
@@ -173,10 +185,7 @@ static void pic_update_irq(struct kvm_pic *s)
pic_set_irq1(&s->pics[0], 2, 0);
}
irq = pic_get_irq(&s->pics[0]);
- if (irq >= 0)
- s->irq_request(s->irq_request_opaque, 1);
- else
- s->irq_request(s->irq_request_opaque, 0);
+ pic_irq_request(s->kvm, irq >= 0);
}
void kvm_pic_update_irq(struct kvm_pic *s)
@@ -261,8 +270,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
void kvm_pic_reset(struct kvm_kpic_state *s)
{
int irq;
- struct kvm *kvm = s->pics_state->irq_request_opaque;
- struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
+ struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu;
u8 irr = s->irr, isr = s->imr;
s->last_irr = 0;
@@ -301,8 +309,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
/*
* deassert a pending interrupt
*/
- s->pics_state->irq_request(s->pics_state->
- irq_request_opaque, 0);
+ pic_irq_request(s->pics_state->kvm, 0);
s->init_state = 1;
s->init4 = val & 1;
if (val & 0x02)
@@ -356,10 +363,20 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
}
} else
switch (s->init_state) {
- case 0: /* normal mode */
+ case 0: { /* normal mode */
+ u8 imr_diff = s->imr ^ val,
+ off = (s == &s->pics_state->pics[0]) ? 0 : 8;
s->imr = val;
+ for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
+ if (imr_diff & (1 << irq))
+ kvm_fire_mask_notifiers(
+ s->pics_state->kvm,
+ SELECT_PIC(irq + off),
+ irq + off,
+ !!(s->imr & (1 << irq)));
pic_update_irq(s->pics_state);
break;
+ }
case 1:
s->irq_base = val & 0xf8;
s->init_state = 2;
@@ -518,9 +535,8 @@ static int picdev_read(struct kvm_io_device *this,
/*
* callback when PIC0 irq status changed
*/
-static void pic_irq_request(void *opaque, int level)
+static void pic_irq_request(struct kvm *kvm, int level)
{
- struct kvm *kvm = opaque;
struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
struct kvm_pic *s = pic_irqchip(kvm);
int irq = pic_get_irq(&s->pics[0]);
@@ -549,8 +565,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
s->kvm = kvm;
s->pics[0].elcr_mask = 0xf8;
s->pics[1].elcr_mask = 0xde;
- s->irq_request = pic_irq_request;
- s->irq_request_opaque = kvm;
s->pics[0].pics_state = s;
s->pics[1].pics_state = s;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 96dfbb6ad2a9..2095a049835e 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -1,6 +1,7 @@
/*
* irq.c: API for in kernel interrupt controller
* Copyright (c) 2007, Intel Corporation.
+ * Copyright 2009 Red Hat, Inc. and/or its affilates.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
@@ -89,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{
kvm_inject_apic_timer_irqs(vcpu);
- kvm_inject_pit_timer_irqs(vcpu);
/* TODO: PIT, RTC etc. */
}
EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index cd1f362f413d..ffed06871c5c 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -38,8 +38,6 @@
struct kvm;
struct kvm_vcpu;
-typedef void irq_request_func(void *opaque, int level);
-
struct kvm_kpic_state {
u8 last_irr; /* edge detection */
u8 irr; /* interrupt request register */
@@ -67,8 +65,6 @@ struct kvm_pic {
unsigned pending_acks;
struct kvm *kvm;
struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
- irq_request_func *irq_request;
- void *irq_request_opaque;
int output; /* intr from master PIC */
struct kvm_io_device dev;
void (*ack_notifier)(void *opaque, int irq);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index cff851cf5322..6491ac8e755b 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -36,6 +36,8 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
{
+ might_sleep(); /* on svm */
+
if (!test_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_avail))
kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
@@ -69,4 +71,10 @@ static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
return kvm_read_cr4_bits(vcpu, ~0UL);
}
+static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
+{
+ return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u)
+ | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
+}
+
#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 1eb7a4ae0c9c..77d8c0f4817d 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -5,6 +5,7 @@
* Copyright (C) 2006 Qumranet, Inc.
* Copyright (C) 2007 Novell
* Copyright (C) 2007 Intel
+ * Copyright 2009 Red Hat, Inc. and/or its affilates.
*
* Authors:
* Dor Laor <dor.laor@qumranet.com>
@@ -328,7 +329,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
"dest_mode 0x%x, short_hand 0x%x\n",
target, source, dest, dest_mode, short_hand);
- ASSERT(!target);
+ ASSERT(target);
switch (short_hand) {
case APIC_DEST_NOSHORT:
if (dest_mode == 0)
@@ -533,7 +534,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
struct kvm_vcpu *vcpu = apic->vcpu;
struct kvm_run *run = vcpu->run;
- set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
+ kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);
run->tpr_access.rip = kvm_rip_read(vcpu);
run->tpr_access.is_write = write;
}
@@ -1106,13 +1107,11 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
int r = 0;
- if (kvm_vcpu_is_bsp(vcpu)) {
- if (!apic_hw_enabled(vcpu->arch.apic))
- r = 1;
- if ((lvt0 & APIC_LVT_MASKED) == 0 &&
- GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
- r = 1;
- }
+ if (!apic_hw_enabled(vcpu->arch.apic))
+ r = 1;
+ if ((lvt0 & APIC_LVT_MASKED) == 0 &&
+ GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
+ r = 1;
return r;
}
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b1ed0a1a5913..311f6dad8951 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -7,6 +7,7 @@
* MMU support
*
* Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
*
* Authors:
* Yaniv Kamay <yaniv@qumranet.com>
@@ -32,6 +33,7 @@
#include <linux/compiler.h>
#include <linux/srcu.h>
#include <linux/slab.h>
+#include <linux/uaccess.h>
#include <asm/page.h>
#include <asm/cmpxchg.h>
@@ -90,8 +92,6 @@ module_param(oos_shadow, bool, 0644);
#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52
-#define VALID_PAGE(x) ((x) != INVALID_PAGE)
-
#define PT64_LEVEL_BITS 9
#define PT64_LEVEL_SHIFT(level) \
@@ -173,7 +173,7 @@ struct kvm_shadow_walk_iterator {
shadow_walk_okay(&(_walker)); \
shadow_walk_next(&(_walker)))
-typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp);
+typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
@@ -281,13 +281,38 @@ static gfn_t pse36_gfn_delta(u32 gpte)
static void __set_spte(u64 *sptep, u64 spte)
{
+ set_64bit(sptep, spte);
+}
+
+static u64 __xchg_spte(u64 *sptep, u64 new_spte)
+{
#ifdef CONFIG_X86_64
- set_64bit((unsigned long *)sptep, spte);
+ return xchg(sptep, new_spte);
#else
- set_64bit((unsigned long long *)sptep, spte);
+ u64 old_spte;
+
+ do {
+ old_spte = *sptep;
+ } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
+
+ return old_spte;
#endif
}
+static void update_spte(u64 *sptep, u64 new_spte)
+{
+ u64 old_spte;
+
+ if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) ||
+ !is_rmap_spte(*sptep))
+ __set_spte(sptep, new_spte);
+ else {
+ old_spte = __xchg_spte(sptep, new_spte);
+ if (old_spte & shadow_accessed_mask)
+ mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
+ }
+}
+
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
struct kmem_cache *base_cache, int min)
{
@@ -304,10 +329,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
return 0;
}
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
+ struct kmem_cache *cache)
{
while (mc->nobjs)
- kfree(mc->objects[--mc->nobjs]);
+ kmem_cache_free(cache, mc->objects[--mc->nobjs]);
}
static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
@@ -355,10 +381,11 @@ out:
static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
- mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
- mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
- mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
+ mmu_page_header_cache);
}
static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
@@ -379,7 +406,7 @@ static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
{
- kfree(pc);
+ kmem_cache_free(pte_chain_cache, pc);
}
static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
@@ -390,7 +417,23 @@ static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
{
- kfree(rd);
+ kmem_cache_free(rmap_desc_cache, rd);
+}
+
+static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
+{
+ if (!sp->role.direct)
+ return sp->gfns[index];
+
+ return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
+}
+
+static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
+{
+ if (sp->role.direct)
+ BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
+ else
+ sp->gfns[index] = gfn;
}
/*
@@ -403,8 +446,8 @@ static int *slot_largepage_idx(gfn_t gfn,
{
unsigned long idx;
- idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
- (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
+ idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+ (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
return &slot->lpage_info[level - 2][idx].write_count;
}
@@ -414,9 +457,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
int *write_count;
int i;
- gfn = unalias_gfn(kvm, gfn);
-
- slot = gfn_to_memslot_unaliased(kvm, gfn);
+ slot = gfn_to_memslot(kvm, gfn);
for (i = PT_DIRECTORY_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
write_count = slot_largepage_idx(gfn, slot, i);
@@ -430,8 +471,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
int *write_count;
int i;
- gfn = unalias_gfn(kvm, gfn);
- slot = gfn_to_memslot_unaliased(kvm, gfn);
+ slot = gfn_to_memslot(kvm, gfn);
for (i = PT_DIRECTORY_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
write_count = slot_largepage_idx(gfn, slot, i);
@@ -447,8 +487,7 @@ static int has_wrprotected_page(struct kvm *kvm,
struct kvm_memory_slot *slot;
int *largepage_idx;
- gfn = unalias_gfn(kvm, gfn);
- slot = gfn_to_memslot_unaliased(kvm, gfn);
+ slot = gfn_to_memslot(kvm, gfn);
if (slot) {
largepage_idx = slot_largepage_idx(gfn, slot, level);
return *largepage_idx;
@@ -501,7 +540,6 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
/*
* Take gfn and return the reverse mapping to it.
- * Note: gfn must be unaliased before this function get called
*/
static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
@@ -513,8 +551,8 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
if (likely(level == PT_PAGE_TABLE_LEVEL))
return &slot->rmap[gfn - slot->base_gfn];
- idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
- (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
+ idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+ (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
return &slot->lpage_info[level - 2][idx].rmap_pde;
}
@@ -541,9 +579,8 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
if (!is_rmap_spte(*spte))
return count;
- gfn = unalias_gfn(vcpu->kvm, gfn);
sp = page_header(__pa(spte));
- sp->gfns[spte - sp->spt] = gfn;
+ kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
if (!*rmapp) {
rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
@@ -600,19 +637,13 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
struct kvm_rmap_desc *desc;
struct kvm_rmap_desc *prev_desc;
struct kvm_mmu_page *sp;
- pfn_t pfn;
+ gfn_t gfn;
unsigned long *rmapp;
int i;
- if (!is_rmap_spte(*spte))
- return;
sp = page_header(__pa(spte));
- pfn = spte_to_pfn(*spte);
- if (*spte & shadow_accessed_mask)
- kvm_set_pfn_accessed(pfn);
- if (is_writable_pte(*spte))
- kvm_set_pfn_dirty(pfn);
- rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
+ gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
+ rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
if (!*rmapp) {
printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
BUG();
@@ -644,6 +675,32 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
}
}
+static void set_spte_track_bits(u64 *sptep, u64 new_spte)
+{
+ pfn_t pfn;
+ u64 old_spte = *sptep;
+
+ if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) ||
+ old_spte & shadow_accessed_mask) {
+ __set_spte(sptep, new_spte);
+ } else
+ old_spte = __xchg_spte(sptep, new_spte);
+
+ if (!is_rmap_spte(old_spte))
+ return;
+ pfn = spte_to_pfn(old_spte);
+ if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
+ kvm_set_pfn_accessed(pfn);
+ if (is_writable_pte(old_spte))
+ kvm_set_pfn_dirty(pfn);
+}
+
+static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+{
+ set_spte_track_bits(sptep, new_spte);
+ rmap_remove(kvm, sptep);
+}
+
static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
{
struct kvm_rmap_desc *desc;
@@ -676,7 +733,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
u64 *spte;
int i, write_protected = 0;
- gfn = unalias_gfn(kvm, gfn);
rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
spte = rmap_next(kvm, rmapp, NULL);
@@ -685,7 +741,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
if (is_writable_pte(*spte)) {
- __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
+ update_spte(spte, *spte & ~PT_WRITABLE_MASK);
write_protected = 1;
}
spte = rmap_next(kvm, rmapp, spte);
@@ -709,9 +765,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
if (is_writable_pte(*spte)) {
- rmap_remove(kvm, spte);
+ drop_spte(kvm, spte,
+ shadow_trap_nonpresent_pte);
--kvm->stat.lpages;
- __set_spte(spte, shadow_trap_nonpresent_pte);
spte = NULL;
write_protected = 1;
}
@@ -731,8 +787,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
while ((spte = rmap_next(kvm, rmapp, NULL))) {
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
- rmap_remove(kvm, spte);
- __set_spte(spte, shadow_trap_nonpresent_pte);
+ drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
need_tlb_flush = 1;
}
return need_tlb_flush;
@@ -754,8 +809,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
need_flush = 1;
if (pte_write(*ptep)) {
- rmap_remove(kvm, spte);
- __set_spte(spte, shadow_trap_nonpresent_pte);
+ drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
spte = rmap_next(kvm, rmapp, NULL);
} else {
new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -763,9 +817,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
new_spte &= ~PT_WRITABLE_MASK;
new_spte &= ~SPTE_HOST_WRITEABLE;
- if (is_writable_pte(*spte))
- kvm_set_pfn_dirty(spte_to_pfn(*spte));
- __set_spte(spte, new_spte);
+ new_spte &= ~shadow_accessed_mask;
+ set_spte_track_bits(spte, new_spte);
spte = rmap_next(kvm, rmapp, spte);
}
}
@@ -799,8 +852,12 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
ret = handler(kvm, &memslot->rmap[gfn_offset], data);
for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
- int idx = gfn_offset;
- idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
+ unsigned long idx;
+ int sh;
+
+ sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j);
+ idx = ((memslot->base_gfn+gfn_offset) >> sh) -
+ (memslot->base_gfn >> sh);
ret |= handler(kvm,
&memslot->lpage_info[j][idx].rmap_pde,
data);
@@ -863,7 +920,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
sp = page_header(__pa(spte));
- gfn = unalias_gfn(vcpu->kvm, gfn);
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
@@ -894,10 +950,12 @@ static int is_empty_shadow_page(u64 *spt)
static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
ASSERT(is_empty_shadow_page(sp->spt));
+ hlist_del(&sp->hash_link);
list_del(&sp->link);
__free_page(virt_to_page(sp->spt));
- __free_page(virt_to_page(sp->gfns));
- kfree(sp);
+ if (!sp->role.direct)
+ __free_page(virt_to_page(sp->gfns));
+ kmem_cache_free(mmu_page_header_cache, sp);
++kvm->arch.n_free_mmu_pages;
}
@@ -907,13 +965,15 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
}
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
- u64 *parent_pte)
+ u64 *parent_pte, int direct)
{
struct kvm_mmu_page *sp;
sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
- sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+ if (!direct)
+ sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
+ PAGE_SIZE);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
@@ -998,7 +1058,6 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
BUG();
}
-
static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
{
struct kvm_pte_chain *pte_chain;
@@ -1008,63 +1067,37 @@ static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
if (!sp->multimapped && sp->parent_pte) {
parent_sp = page_header(__pa(sp->parent_pte));
- fn(parent_sp);
- mmu_parent_walk(parent_sp, fn);
+ fn(parent_sp, sp->parent_pte);
return;
}
+
hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- if (!pte_chain->parent_ptes[i])
+ u64 *spte = pte_chain->parent_ptes[i];
+
+ if (!spte)
break;
- parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
- fn(parent_sp);
- mmu_parent_walk(parent_sp, fn);
+ parent_sp = page_header(__pa(spte));
+ fn(parent_sp, spte);
}
}
-static void kvm_mmu_update_unsync_bitmap(u64 *spte)
+static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
+static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
{
- unsigned int index;
- struct kvm_mmu_page *sp = page_header(__pa(spte));
-
- index = spte - sp->spt;
- if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
- sp->unsync_children++;
- WARN_ON(!sp->unsync_children);
+ mmu_parent_walk(sp, mark_unsync);
}
-static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
+static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
{
- struct kvm_pte_chain *pte_chain;
- struct hlist_node *node;
- int i;
+ unsigned int index;
- if (!sp->parent_pte)
+ index = spte - sp->spt;
+ if (__test_and_set_bit(index, sp->unsync_child_bitmap))
return;
-
- if (!sp->multimapped) {
- kvm_mmu_update_unsync_bitmap(sp->parent_pte);
+ if (sp->unsync_children++)
return;
- }
-
- hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
- for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- if (!pte_chain->parent_ptes[i])
- break;
- kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
- }
-}
-
-static int unsync_walk_fn(struct kvm_mmu_page *sp)
-{
- kvm_mmu_update_parents_unsync(sp);
- return 1;
-}
-
-static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
-{
- mmu_parent_walk(sp, unsync_walk_fn);
- kvm_mmu_update_parents_unsync(sp);
+ kvm_mmu_mark_parents_unsync(sp);
}
static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
@@ -1077,7 +1110,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
}
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
+ struct kvm_mmu_page *sp, bool clear_unsync)
{
return 1;
}
@@ -1123,35 +1156,40 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
int i, ret, nr_unsync_leaf = 0;
for_each_unsync_children(sp->unsync_child_bitmap, i) {
+ struct kvm_mmu_page *child;
u64 ent = sp->spt[i];
- if (is_shadow_present_pte(ent) && !is_large_pte(ent)) {
- struct kvm_mmu_page *child;
- child = page_header(ent & PT64_BASE_ADDR_MASK);
-
- if (child->unsync_children) {
- if (mmu_pages_add(pvec, child, i))
- return -ENOSPC;
-
- ret = __mmu_unsync_walk(child, pvec);
- if (!ret)
- __clear_bit(i, sp->unsync_child_bitmap);
- else if (ret > 0)
- nr_unsync_leaf += ret;
- else
- return ret;
- }
+ if (!is_shadow_present_pte(ent) || is_large_pte(ent))
+ goto clear_child_bitmap;
+
+ child = page_header(ent & PT64_BASE_ADDR_MASK);
+
+ if (child->unsync_children) {
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+
+ ret = __mmu_unsync_walk(child, pvec);
+ if (!ret)
+ goto clear_child_bitmap;
+ else if (ret > 0)
+ nr_unsync_leaf += ret;
+ else
+ return ret;
+ } else if (child->unsync) {
+ nr_unsync_leaf++;
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+ } else
+ goto clear_child_bitmap;
- if (child->unsync) {
- nr_unsync_leaf++;
- if (mmu_pages_add(pvec, child, i))
- return -ENOSPC;
- }
- }
+ continue;
+
+clear_child_bitmap:
+ __clear_bit(i, sp->unsync_child_bitmap);
+ sp->unsync_children--;
+ WARN_ON((int)sp->unsync_children < 0);
}
- if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
- sp->unsync_children = 0;
return nr_unsync_leaf;
}
@@ -1166,26 +1204,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
return __mmu_unsync_walk(sp, pvec);
}
-static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
-{
- unsigned index;
- struct hlist_head *bucket;
- struct kvm_mmu_page *sp;
- struct hlist_node *node;
-
- pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
- index = kvm_page_table_hashfn(gfn);
- bucket = &kvm->arch.mmu_page_hash[index];
- hlist_for_each_entry(sp, node, bucket, hash_link)
- if (sp->gfn == gfn && !sp->role.direct
- && !sp->role.invalid) {
- pgprintk("%s: found role %x\n",
- __func__, sp->role.word);
- return sp;
- }
- return NULL;
-}
-
static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
WARN_ON(!sp->unsync);
@@ -1194,20 +1212,36 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
--kvm->stat.mmu_unsync;
}
-static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list);
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+ struct list_head *invalid_list);
-static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+#define for_each_gfn_sp(kvm, sp, gfn, pos) \
+ hlist_for_each_entry(sp, pos, \
+ &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
+ if ((sp)->gfn != (gfn)) {} else
+
+#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \
+ hlist_for_each_entry(sp, pos, \
+ &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
+ if ((sp)->gfn != (gfn) || (sp)->role.direct || \
+ (sp)->role.invalid) {} else
+
+/* @sp->gfn should be write-protected at the call site */
+static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list, bool clear_unsync)
{
if (sp->role.cr4_pae != !!is_pae(vcpu)) {
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return 1;
}
- if (rmap_write_protect(vcpu->kvm, sp->gfn))
- kvm_flush_remote_tlbs(vcpu->kvm);
- kvm_unlink_unsync_page(vcpu->kvm, sp);
- if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ if (clear_unsync)
+ kvm_unlink_unsync_page(vcpu->kvm, sp);
+
+ if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return 1;
}
@@ -1215,6 +1249,52 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
return 0;
}
+static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ LIST_HEAD(invalid_list);
+ int ret;
+
+ ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
+ if (ret)
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+
+ return ret;
+}
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+ return __kvm_sync_page(vcpu, sp, invalid_list, true);
+}
+
+/* @gfn should be write-protected at the call site */
+static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ struct kvm_mmu_page *s;
+ struct hlist_node *node;
+ LIST_HEAD(invalid_list);
+ bool flush = false;
+
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+ if (!s->unsync)
+ continue;
+
+ WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+ if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
+ (vcpu->arch.mmu.sync_page(vcpu, s, true))) {
+ kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
+ continue;
+ }
+ kvm_unlink_unsync_page(vcpu->kvm, s);
+ flush = true;
+ }
+
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ if (flush)
+ kvm_mmu_flush_tlb(vcpu);
+}
+
struct mmu_page_path {
struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
unsigned int idx[PT64_ROOT_LEVEL-1];
@@ -1281,6 +1361,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp;
struct mmu_page_path parents;
struct kvm_mmu_pages pages;
+ LIST_HEAD(invalid_list);
kvm_mmu_pages_init(parent, &parents, &pages);
while (mmu_unsync_walk(parent, &pages)) {
@@ -1293,9 +1374,10 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
kvm_flush_remote_tlbs(vcpu->kvm);
for_each_sp(pages, sp, parents, i) {
- kvm_sync_page(vcpu, sp);
+ kvm_sync_page(vcpu, sp, &invalid_list);
mmu_pages_clear_parents(&parents);
}
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
cond_resched_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_pages_init(parent, &parents, &pages);
}
@@ -1310,11 +1392,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
u64 *parent_pte)
{
union kvm_mmu_page_role role;
- unsigned index;
unsigned quadrant;
- struct hlist_head *bucket;
struct kvm_mmu_page *sp;
- struct hlist_node *node, *tmp;
+ struct hlist_node *node;
+ bool need_sync = false;
role = vcpu->arch.mmu.base_role;
role.level = level;
@@ -1322,40 +1403,45 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (role.direct)
role.cr4_pae = 0;
role.access = access;
- if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+ if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
}
- index = kvm_page_table_hashfn(gfn);
- bucket = &vcpu->kvm->arch.mmu_page_hash[index];
- hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
- if (sp->gfn == gfn) {
- if (sp->unsync)
- if (kvm_sync_page(vcpu, sp))
- continue;
+ for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
+ if (!need_sync && sp->unsync)
+ need_sync = true;
- if (sp->role.word != role.word)
- continue;
+ if (sp->role.word != role.word)
+ continue;
- mmu_page_add_parent_pte(vcpu, sp, parent_pte);
- if (sp->unsync_children) {
- set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
- kvm_mmu_mark_parents_unsync(sp);
- }
- trace_kvm_mmu_get_page(sp, false);
- return sp;
- }
+ if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
+ break;
+
+ mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+ if (sp->unsync_children) {
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_mmu_mark_parents_unsync(sp);
+ } else if (sp->unsync)
+ kvm_mmu_mark_parents_unsync(sp);
+
+ trace_kvm_mmu_get_page(sp, false);
+ return sp;
+ }
++vcpu->kvm->stat.mmu_cache_miss;
- sp = kvm_mmu_alloc_page(vcpu, parent_pte);
+ sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
if (!sp)
return sp;
sp->gfn = gfn;
sp->role = role;
- hlist_add_head(&sp->hash_link, bucket);
+ hlist_add_head(&sp->hash_link,
+ &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
if (!direct) {
if (rmap_write_protect(vcpu->kvm, gfn))
kvm_flush_remote_tlbs(vcpu->kvm);
+ if (level > PT_PAGE_TABLE_LEVEL && need_sync)
+ kvm_sync_pages(vcpu, gfn);
+
account_shadowed(vcpu->kvm, gfn);
}
if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
@@ -1402,6 +1488,47 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
--iterator->level;
}
+static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
+{
+ u64 spte;
+
+ spte = __pa(sp->spt)
+ | PT_PRESENT_MASK | PT_ACCESSED_MASK
+ | PT_WRITABLE_MASK | PT_USER_MASK;
+ __set_spte(sptep, spte);
+}
+
+static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ if (is_large_pte(*sptep)) {
+ drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ }
+}
+
+static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+ unsigned direct_access)
+{
+ if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
+ struct kvm_mmu_page *child;
+
+ /*
+ * For the direct sp, if the guest pte's dirty bit
+ * changed form clean to dirty, it will corrupt the
+ * sp's access: allow writable in the read-only sp,
+ * so we should update the spte at this point to get
+ * a new sp with the correct access.
+ */
+ child = page_header(*sptep & PT64_BASE_ADDR_MASK);
+ if (child->role.access == direct_access)
+ return;
+
+ mmu_page_remove_parent_pte(child, sptep);
+ __set_spte(sptep, shadow_trap_nonpresent_pte);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ }
+}
+
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
struct kvm_mmu_page *sp)
{
@@ -1422,7 +1549,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
} else {
if (is_large_pte(ent))
--kvm->stat.lpages;
- rmap_remove(kvm, &pt[i]);
+ drop_spte(kvm, &pt[i],
+ shadow_trap_nonpresent_pte);
}
}
pt[i] = shadow_trap_nonpresent_pte;
@@ -1464,7 +1592,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
}
static int mmu_zap_unsync_children(struct kvm *kvm,
- struct kvm_mmu_page *parent)
+ struct kvm_mmu_page *parent,
+ struct list_head *invalid_list)
{
int i, zapped = 0;
struct mmu_page_path parents;
@@ -1478,7 +1607,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
struct kvm_mmu_page *sp;
for_each_sp(pages, sp, parents, i) {
- kvm_mmu_zap_page(kvm, sp);
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
mmu_pages_clear_parents(&parents);
zapped++;
}
@@ -1488,32 +1617,52 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
return zapped;
}
-static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
{
int ret;
- trace_kvm_mmu_zap_page(sp);
+ trace_kvm_mmu_prepare_zap_page(sp);
++kvm->stat.mmu_shadow_zapped;
- ret = mmu_zap_unsync_children(kvm, sp);
+ ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
kvm_mmu_page_unlink_children(kvm, sp);
kvm_mmu_unlink_parents(kvm, sp);
- kvm_flush_remote_tlbs(kvm);
if (!sp->role.invalid && !sp->role.direct)
unaccount_shadowed(kvm, sp->gfn);
if (sp->unsync)
kvm_unlink_unsync_page(kvm, sp);
if (!sp->root_count) {
- hlist_del(&sp->hash_link);
- kvm_mmu_free_page(kvm, sp);
+ /* Count self */
+ ret++;
+ list_move(&sp->link, invalid_list);
} else {
- sp->role.invalid = 1;
list_move(&sp->link, &kvm->arch.active_mmu_pages);
kvm_reload_remote_mmus(kvm);
}
+
+ sp->role.invalid = 1;
kvm_mmu_reset_last_pte_updated(kvm);
return ret;
}
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+ struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *sp;
+
+ if (list_empty(invalid_list))
+ return;
+
+ kvm_flush_remote_tlbs(kvm);
+
+ do {
+ sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+ WARN_ON(!sp->role.invalid || sp->root_count);
+ kvm_mmu_free_page(kvm, sp);
+ } while (!list_empty(invalid_list));
+
+}
+
/*
* Changing the number of mmu pages allocated to the vm
* Note: if kvm_nr_mmu_pages is too small, you will get dead lock
@@ -1521,6 +1670,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
{
int used_pages;
+ LIST_HEAD(invalid_list);
used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
used_pages = max(0, used_pages);
@@ -1538,9 +1688,10 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
- used_pages -= kvm_mmu_zap_page(kvm, page);
- used_pages--;
+ used_pages -= kvm_mmu_prepare_zap_page(kvm, page,
+ &invalid_list);
}
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
kvm_nr_mmu_pages = used_pages;
kvm->arch.n_free_mmu_pages = 0;
}
@@ -1553,47 +1704,36 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
{
- unsigned index;
- struct hlist_head *bucket;
struct kvm_mmu_page *sp;
- struct hlist_node *node, *n;
+ struct hlist_node *node;
+ LIST_HEAD(invalid_list);
int r;
pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
r = 0;
- index = kvm_page_table_hashfn(gfn);
- bucket = &kvm->arch.mmu_page_hash[index];
-restart:
- hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
- if (sp->gfn == gfn && !sp->role.direct) {
- pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
- sp->role.word);
- r = 1;
- if (kvm_mmu_zap_page(kvm, sp))
- goto restart;
- }
+
+ for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
+ pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
+ sp->role.word);
+ r = 1;
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ }
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
return r;
}
static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
{
- unsigned index;
- struct hlist_head *bucket;
struct kvm_mmu_page *sp;
- struct hlist_node *node, *nn;
+ struct hlist_node *node;
+ LIST_HEAD(invalid_list);
- index = kvm_page_table_hashfn(gfn);
- bucket = &kvm->arch.mmu_page_hash[index];
-restart:
- hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
- if (sp->gfn == gfn && !sp->role.direct
- && !sp->role.invalid) {
- pgprintk("%s: zap %lx %x\n",
- __func__, gfn, sp->role.word);
- if (kvm_mmu_zap_page(kvm, sp))
- goto restart;
- }
+ for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
+ pgprintk("%s: zap %lx %x\n",
+ __func__, gfn, sp->role.word);
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
}
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
}
static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
@@ -1723,47 +1863,51 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
-static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
- unsigned index;
- struct hlist_head *bucket;
- struct kvm_mmu_page *s;
- struct hlist_node *node, *n;
-
- index = kvm_page_table_hashfn(sp->gfn);
- bucket = &vcpu->kvm->arch.mmu_page_hash[index];
- /* don't unsync if pagetable is shadowed with multiple roles */
- hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
- if (s->gfn != sp->gfn || s->role.direct)
- continue;
- if (s->role.word != sp->role.word)
- return 1;
- }
trace_kvm_mmu_unsync_page(sp);
++vcpu->kvm->stat.mmu_unsync;
sp->unsync = 1;
kvm_mmu_mark_parents_unsync(sp);
-
mmu_convert_notrap(sp);
- return 0;
+}
+
+static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ struct kvm_mmu_page *s;
+ struct hlist_node *node;
+
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+ if (s->unsync)
+ continue;
+ WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+ __kvm_unsync_page(vcpu, s);
+ }
}
static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
bool can_unsync)
{
- struct kvm_mmu_page *shadow;
+ struct kvm_mmu_page *s;
+ struct hlist_node *node;
+ bool need_unsync = false;
- shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
- if (shadow) {
- if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+ if (!can_unsync)
return 1;
- if (shadow->unsync)
- return 0;
- if (can_unsync && oos_shadow)
- return kvm_unsync_page(vcpu, shadow);
- return 1;
+
+ if (s->role.level != PT_PAGE_TABLE_LEVEL)
+ return 1;
+
+ if (!need_unsync && !s->unsync) {
+ if (!oos_shadow)
+ return 1;
+ need_unsync = true;
+ }
}
+ if (need_unsync)
+ kvm_unsync_pages(vcpu, gfn);
return 0;
}
@@ -1804,13 +1948,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= (u64)pfn << PAGE_SHIFT;
if ((pte_access & ACC_WRITE_MASK)
- || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
+ || (!tdp_enabled && write_fault && !is_write_protection(vcpu)
+ && !user_fault)) {
if (level > PT_PAGE_TABLE_LEVEL &&
has_wrprotected_page(vcpu->kvm, gfn, level)) {
ret = 1;
- spte = shadow_trap_nonpresent_pte;
- goto set_pte;
+ drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ goto done;
}
spte |= PT_WRITABLE_MASK;
@@ -1841,7 +1986,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
mark_page_dirty(vcpu->kvm, gfn);
set_pte:
- __set_spte(sptep, spte);
+ if (is_writable_pte(*sptep) && !is_writable_pte(spte))
+ kvm_set_pfn_dirty(pfn);
+ update_spte(sptep, spte);
+done:
return ret;
}
@@ -1853,7 +2001,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
bool reset_host_protection)
{
int was_rmapped = 0;
- int was_writable = is_writable_pte(*sptep);
int rmap_count;
pgprintk("%s: spte %llx access %x write_fault %d"
@@ -1878,8 +2025,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
} else if (pfn != spte_to_pfn(*sptep)) {
pgprintk("hfn old %lx new %lx\n",
spte_to_pfn(*sptep), pfn);
- rmap_remove(vcpu->kvm, sptep);
- __set_spte(sptep, shadow_trap_nonpresent_pte);
+ drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
kvm_flush_remote_tlbs(vcpu->kvm);
} else
was_rmapped = 1;
@@ -1890,7 +2036,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
reset_host_protection)) {
if (write_fault)
*ptwrite = 1;
- kvm_x86_ops->tlb_flush(vcpu);
+ kvm_mmu_flush_tlb(vcpu);
}
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
@@ -1904,15 +2050,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
page_header_update_slot(vcpu->kvm, sptep, gfn);
if (!was_rmapped) {
rmap_count = rmap_add(vcpu, sptep, gfn);
- kvm_release_pfn_clean(pfn);
if (rmap_count > RMAP_RECYCLE_THRESHOLD)
rmap_recycle(vcpu, sptep, gfn);
- } else {
- if (was_writable)
- kvm_release_pfn_dirty(pfn);
- else
- kvm_release_pfn_clean(pfn);
}
+ kvm_release_pfn_clean(pfn);
if (speculative) {
vcpu->arch.last_pte_updated = sptep;
vcpu->arch.last_pte_gfn = gfn;
@@ -1941,7 +2082,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
}
if (*iterator.sptep == shadow_trap_nonpresent_pte) {
- pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ u64 base_addr = iterator.addr;
+
+ base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
+ pseudo_gfn = base_addr >> PAGE_SHIFT;
sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
iterator.level - 1,
1, ACC_ALL, iterator.sptep);
@@ -1960,6 +2104,29 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
return pt_write;
}
+static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
+{
+ char buf[1];
+ void __user *hva;
+ int r;
+
+ /* Touch the page, so send SIGBUS */
+ hva = (void __user *)gfn_to_hva(kvm, gfn);
+ r = copy_from_user(buf, hva, 1);
+}
+
+static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
+{
+ kvm_release_pfn_clean(pfn);
+ if (is_hwpoison_pfn(pfn)) {
+ kvm_send_hwpoison_signal(kvm, gfn);
+ return 0;
+ } else if (is_fault_pfn(pfn))
+ return -EFAULT;
+
+ return 1;
+}
+
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
{
int r;
@@ -1983,10 +2150,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
pfn = gfn_to_pfn(vcpu->kvm, gfn);
/* mmio */
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- return 1;
- }
+ if (is_error_pfn(pfn))
+ return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2009,6 +2174,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
int i;
struct kvm_mmu_page *sp;
+ LIST_HEAD(invalid_list);
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
@@ -2018,8 +2184,10 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
sp = page_header(root);
--sp->root_count;
- if (!sp->root_count && sp->role.invalid)
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ if (!sp->root_count && sp->role.invalid) {
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ }
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
spin_unlock(&vcpu->kvm->mmu_lock);
return;
@@ -2032,10 +2200,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
sp = page_header(root);
--sp->root_count;
if (!sp->root_count && sp->role.invalid)
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+ &invalid_list);
}
vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
}
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
}
@@ -2045,7 +2215,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
int ret = 0;
if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
- set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
ret = 1;
}
@@ -2073,6 +2243,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
root_gfn = 0;
}
spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
PT64_ROOT_LEVEL, direct,
ACC_ALL, NULL);
@@ -2103,6 +2274,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
root_gfn = i << 30;
}
spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
PT32_ROOT_LEVEL, direct,
ACC_ALL, NULL);
@@ -2198,10 +2370,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, gfn);
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- return 1;
- }
+ if (is_error_pfn(pfn))
+ return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
@@ -2243,7 +2413,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- kvm_x86_ops->tlb_flush(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
static void paging_new_cr3(struct kvm_vcpu *vcpu)
@@ -2457,10 +2627,9 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
- if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
+ if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ /* mmu.free() should set root_hpa = INVALID_PAGE */
vcpu->arch.mmu.free(vcpu);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- }
}
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -2477,9 +2646,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
r = mmu_topup_memory_caches(vcpu);
if (r)
goto out;
- spin_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_free_some_pages(vcpu);
- spin_unlock(&vcpu->kvm->mmu_lock);
r = mmu_alloc_roots(vcpu);
spin_lock(&vcpu->kvm->mmu_lock);
mmu_sync_roots(vcpu);
@@ -2508,7 +2674,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
pte = *spte;
if (is_shadow_present_pte(pte)) {
if (is_last_spte(pte, sp->role.level))
- rmap_remove(vcpu->kvm, spte);
+ drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
else {
child = page_header(pte & PT64_BASE_ADDR_MASK);
mmu_page_remove_parent_pte(child, spte);
@@ -2529,6 +2695,9 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
return;
}
+ if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
+ return;
+
++vcpu->kvm->stat.mmu_pte_updated;
if (!sp->role.cr4_pae)
paging32_update_pte(vcpu, sp, spte, new);
@@ -2549,11 +2718,15 @@ static bool need_remote_flush(u64 old, u64 new)
return (old & ~new & PT64_PERM_MASK) != 0;
}
-static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
+static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
+ bool remote_flush, bool local_flush)
{
- if (need_remote_flush(old, new))
+ if (zap_page)
+ return;
+
+ if (remote_flush)
kvm_flush_remote_tlbs(vcpu->kvm);
- else
+ else if (local_flush)
kvm_mmu_flush_tlb(vcpu);
}
@@ -2603,10 +2776,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
bool guest_initiated)
{
gfn_t gfn = gpa >> PAGE_SHIFT;
+ union kvm_mmu_page_role mask = { .word = 0 };
struct kvm_mmu_page *sp;
- struct hlist_node *node, *n;
- struct hlist_head *bucket;
- unsigned index;
+ struct hlist_node *node;
+ LIST_HEAD(invalid_list);
u64 entry, gentry;
u64 *spte;
unsigned offset = offset_in_page(gpa);
@@ -2619,6 +2792,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
int npte;
int r;
int invlpg_counter;
+ bool remote_flush, local_flush, zap_page;
+
+ zap_page = remote_flush = local_flush = false;
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
@@ -2674,13 +2850,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
vcpu->arch.last_pte_updated = NULL;
}
}
- index = kvm_page_table_hashfn(gfn);
- bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-restart:
- hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
- if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
- continue;
+ mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
pte_size = sp->role.cr4_pae ? 8 : 4;
misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
misaligned |= bytes < 4;
@@ -2697,8 +2869,8 @@ restart:
*/
pgprintk("misaligned: gpa %llx bytes %d role %x\n",
gpa, bytes, sp->role.word);
- if (kvm_mmu_zap_page(vcpu->kvm, sp))
- goto restart;
+ zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+ &invalid_list);
++vcpu->kvm->stat.mmu_flooded;
continue;
}
@@ -2722,16 +2894,22 @@ restart:
if (quadrant != sp->role.quadrant)
continue;
}
+ local_flush = true;
spte = &sp->spt[page_offset / sizeof(*spte)];
while (npte--) {
entry = *spte;
mmu_pte_write_zap_pte(vcpu, sp, spte);
- if (gentry)
+ if (gentry &&
+ !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
+ & mask.word))
mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
- mmu_pte_write_flush_tlb(vcpu, entry, *spte);
+ if (!remote_flush && need_remote_flush(entry, *spte))
+ remote_flush = true;
++spte;
}
}
+ mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
kvm_mmu_audit(vcpu, "post pte write");
spin_unlock(&vcpu->kvm->mmu_lock);
if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
@@ -2759,15 +2937,21 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
- while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES &&
+ int free_pages;
+ LIST_HEAD(invalid_list);
+
+ free_pages = vcpu->kvm->arch.n_free_mmu_pages;
+ while (free_pages < KVM_REFILL_PAGES &&
!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
struct kvm_mmu_page *sp;
sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+ &invalid_list);
++vcpu->kvm->stat.mmu_recycled;
}
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
}
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
@@ -2795,11 +2979,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
return 1;
case EMULATE_DO_MMIO:
++vcpu->stat.mmio_exits;
- return 0;
+ /* fall through */
case EMULATE_FAIL:
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
return 0;
default:
BUG();
@@ -2896,7 +3077,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
pt = sp->spt;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
/* avoid RMW */
- if (pt[i] & PT_WRITABLE_MASK)
+ if (is_writable_pte(pt[i]))
pt[i] &= ~PT_WRITABLE_MASK;
}
kvm_flush_remote_tlbs(kvm);
@@ -2905,25 +3086,26 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
void kvm_mmu_zap_all(struct kvm *kvm)
{
struct kvm_mmu_page *sp, *node;
+ LIST_HEAD(invalid_list);
spin_lock(&kvm->mmu_lock);
restart:
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
- if (kvm_mmu_zap_page(kvm, sp))
+ if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
goto restart;
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
-
- kvm_flush_remote_tlbs(kvm);
}
-static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm)
+static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
+ struct list_head *invalid_list)
{
struct kvm_mmu_page *page;
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
- return kvm_mmu_zap_page(kvm, page) + 1;
+ return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
}
static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
@@ -2936,6 +3118,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
list_for_each_entry(kvm, &vm_list, vm_list) {
int npages, idx, freed_pages;
+ LIST_HEAD(invalid_list);
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
@@ -2943,12 +3126,14 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
kvm->arch.n_free_mmu_pages;
cache_count += npages;
if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
- freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm);
+ freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
+ &invalid_list);
cache_count -= freed_pages;
kvm_freed = kvm;
}
nr_to_scan--;
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
}
@@ -3074,7 +3259,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
- kvm_set_cr3(vcpu, vcpu->arch.cr3);
+ (void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
return 1;
}
@@ -3331,9 +3516,9 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
struct kvm_mmu_page *rev_sp;
gfn_t gfn;
- if (*sptep & PT_WRITABLE_MASK) {
+ if (is_writable_pte(*sptep)) {
rev_sp = page_header(__pa(sptep));
- gfn = rev_sp->gfns[sptep - rev_sp->spt];
+ gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
if (!gfn_to_memslot(kvm, gfn)) {
if (!printk_ratelimit())
@@ -3347,8 +3532,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
return;
}
- rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
- rev_sp->role.level);
+ rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
if (!*rmapp) {
if (!printk_ratelimit())
return;
@@ -3381,7 +3565,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
if (!(ent & PT_PRESENT_MASK))
continue;
- if (!(ent & PT_WRITABLE_MASK))
+ if (!is_writable_pte(ent))
continue;
inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
}
@@ -3409,13 +3593,12 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
if (sp->unsync)
continue;
- gfn = unalias_gfn(vcpu->kvm, sp->gfn);
- slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
+ slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
rmapp = &slot->rmap[gfn - slot->base_gfn];
spte = rmap_next(vcpu->kvm, rmapp, NULL);
while (spte) {
- if (*spte & PT_WRITABLE_MASK)
+ if (is_writable_pte(*spte))
printk(KERN_ERR "%s: (%s) shadow page has "
"writable mappings: gfn %lx role %x\n",
__func__, audit_msg, sp->gfn,
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 42f07b1bfbc9..3aab0f0930ef 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -190,7 +190,7 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
TP_ARGS(sp)
);
-DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page,
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
TP_PROTO(struct kvm_mmu_page *sp),
TP_ARGS(sp)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 2331bdc2b549..51ef9097960d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -7,6 +7,7 @@
* MMU support
*
* Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
*
* Authors:
* Yaniv Kamay <yaniv@qumranet.com>
@@ -118,21 +119,25 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
{
pt_element_t pte;
gfn_t table_gfn;
- unsigned index, pt_access, pte_access;
+ unsigned index, pt_access, uninitialized_var(pte_access);
gpa_t pte_gpa;
- int rsvd_fault = 0;
+ bool eperm, present, rsvd_fault;
trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
fetch_fault);
walk:
+ present = true;
+ eperm = rsvd_fault = false;
walker->level = vcpu->arch.mmu.root_level;
pte = vcpu->arch.cr3;
#if PTTYPE == 64
if (!is_long_mode(vcpu)) {
pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
trace_kvm_mmu_paging_element(pte, walker->level);
- if (!is_present_gpte(pte))
- goto not_present;
+ if (!is_present_gpte(pte)) {
+ present = false;
+ goto error;
+ }
--walker->level;
}
#endif
@@ -150,37 +155,42 @@ walk:
walker->table_gfn[walker->level - 1] = table_gfn;
walker->pte_gpa[walker->level - 1] = pte_gpa;
- if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)))
- goto not_present;
+ if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) {
+ present = false;
+ break;
+ }
trace_kvm_mmu_paging_element(pte, walker->level);
- if (!is_present_gpte(pte))
- goto not_present;
+ if (!is_present_gpte(pte)) {
+ present = false;
+ break;
+ }
- rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
- if (rsvd_fault)
- goto access_error;
+ if (is_rsvd_bits_set(vcpu, pte, walker->level)) {
+ rsvd_fault = true;
+ break;
+ }
if (write_fault && !is_writable_pte(pte))
if (user_fault || is_write_protection(vcpu))
- goto access_error;
+ eperm = true;
if (user_fault && !(pte & PT_USER_MASK))
- goto access_error;
+ eperm = true;
#if PTTYPE == 64
if (fetch_fault && (pte & PT64_NX_MASK))
- goto access_error;
+ eperm = true;
#endif
- if (!(pte & PT_ACCESSED_MASK)) {
+ if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) {
trace_kvm_mmu_set_accessed_bit(table_gfn, index,
sizeof(pte));
- mark_page_dirty(vcpu->kvm, table_gfn);
if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
index, pte, pte|PT_ACCESSED_MASK))
goto walk;
+ mark_page_dirty(vcpu->kvm, table_gfn);
pte |= PT_ACCESSED_MASK;
}
@@ -213,15 +223,18 @@ walk:
--walker->level;
}
+ if (!present || eperm || rsvd_fault)
+ goto error;
+
if (write_fault && !is_dirty_gpte(pte)) {
bool ret;
trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
- mark_page_dirty(vcpu->kvm, table_gfn);
ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
pte|PT_DIRTY_MASK);
if (ret)
goto walk;
+ mark_page_dirty(vcpu->kvm, table_gfn);
pte |= PT_DIRTY_MASK;
walker->ptes[walker->level - 1] = pte;
}
@@ -229,22 +242,18 @@ walk:
walker->pt_access = pt_access;
walker->pte_access = pte_access;
pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
- __func__, (u64)pte, pt_access, pte_access);
+ __func__, (u64)pte, pte_access, pt_access);
return 1;
-not_present:
+error:
walker->error_code = 0;
- goto err;
-
-access_error:
- walker->error_code = PFERR_PRESENT_MASK;
-
-err:
+ if (present)
+ walker->error_code |= PFERR_PRESENT_MASK;
if (write_fault)
walker->error_code |= PFERR_WRITE_MASK;
if (user_fault)
walker->error_code |= PFERR_USER_MASK;
- if (fetch_fault)
+ if (fetch_fault && is_nx(vcpu))
walker->error_code |= PFERR_FETCH_MASK;
if (rsvd_fault)
walker->error_code |= PFERR_RSVD_MASK;
@@ -252,7 +261,7 @@ err:
return 0;
}
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 *spte, const void *pte)
{
pt_element_t gpte;
@@ -263,7 +272,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
gpte = *(const pt_element_t *)pte;
if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
if (!is_present_gpte(gpte)) {
- if (page->unsync)
+ if (sp->unsync)
new_spte = shadow_trap_nonpresent_pte;
else
new_spte = shadow_notrap_nonpresent_pte;
@@ -272,7 +281,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
return;
}
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
- pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
return;
pfn = vcpu->arch.update_pte.pfn;
@@ -285,11 +294,22 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
* we call mmu_set_spte() with reset_host_protection = true beacuse that
* vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
*/
- mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
- gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL,
+ mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
+ is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL,
gpte_to_gfn(gpte), pfn, true, true);
}
+static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
+ struct guest_walker *gw, int level)
+{
+ int r;
+ pt_element_t curr_pte;
+
+ r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1],
+ &curr_pte, sizeof(curr_pte));
+ return r || curr_pte != gw->ptes[level - 1];
+}
+
/*
* Fetch a shadow pte for a specific level in the paging hierarchy.
*/
@@ -299,75 +319,86 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
int *ptwrite, pfn_t pfn)
{
unsigned access = gw->pt_access;
- struct kvm_mmu_page *shadow_page;
- u64 spte, *sptep = NULL;
- int direct;
- gfn_t table_gfn;
- int r;
- int level;
- pt_element_t curr_pte;
- struct kvm_shadow_walk_iterator iterator;
+ struct kvm_mmu_page *sp = NULL;
+ bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
+ int top_level;
+ unsigned direct_access;
+ struct kvm_shadow_walk_iterator it;
if (!is_present_gpte(gw->ptes[gw->level - 1]))
return NULL;
- for_each_shadow_entry(vcpu, addr, iterator) {
- level = iterator.level;
- sptep = iterator.sptep;
- if (iterator.level == hlevel) {
- mmu_set_spte(vcpu, sptep, access,
- gw->pte_access & access,
- user_fault, write_fault,
- gw->ptes[gw->level-1] & PT_DIRTY_MASK,
- ptwrite, level,
- gw->gfn, pfn, false, true);
- break;
- }
+ direct_access = gw->pt_access & gw->pte_access;
+ if (!dirty)
+ direct_access &= ~ACC_WRITE_MASK;
- if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
- continue;
+ top_level = vcpu->arch.mmu.root_level;
+ if (top_level == PT32E_ROOT_LEVEL)
+ top_level = PT32_ROOT_LEVEL;
+ /*
+ * Verify that the top-level gpte is still there. Since the page
+ * is a root page, it is either write protected (and cannot be
+ * changed from now on) or it is invalid (in which case, we don't
+ * really care if it changes underneath us after this point).
+ */
+ if (FNAME(gpte_changed)(vcpu, gw, top_level))
+ goto out_gpte_changed;
- if (is_large_pte(*sptep)) {
- rmap_remove(vcpu->kvm, sptep);
- __set_spte(sptep, shadow_trap_nonpresent_pte);
- kvm_flush_remote_tlbs(vcpu->kvm);
- }
+ for (shadow_walk_init(&it, vcpu, addr);
+ shadow_walk_okay(&it) && it.level > gw->level;
+ shadow_walk_next(&it)) {
+ gfn_t table_gfn;
- if (level <= gw->level) {
- int delta = level - gw->level + 1;
- direct = 1;
- if (!is_dirty_gpte(gw->ptes[level - delta]))
- access &= ~ACC_WRITE_MASK;
- table_gfn = gpte_to_gfn(gw->ptes[level - delta]);
- /* advance table_gfn when emulating 1gb pages with 4k */
- if (delta == 0)
- table_gfn += PT_INDEX(addr, level);
- access &= gw->pte_access;
- } else {
- direct = 0;
- table_gfn = gw->table_gfn[level - 2];
- }
- shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
- direct, access, sptep);
- if (!direct) {
- r = kvm_read_guest_atomic(vcpu->kvm,
- gw->pte_gpa[level - 2],
- &curr_pte, sizeof(curr_pte));
- if (r || curr_pte != gw->ptes[level - 2]) {
- kvm_mmu_put_page(shadow_page, sptep);
- kvm_release_pfn_clean(pfn);
- sptep = NULL;
- break;
- }
+ drop_large_spte(vcpu, it.sptep);
+
+ sp = NULL;
+ if (!is_shadow_present_pte(*it.sptep)) {
+ table_gfn = gw->table_gfn[it.level - 2];
+ sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
+ false, access, it.sptep);
}
- spte = __pa(shadow_page->spt)
- | PT_PRESENT_MASK | PT_ACCESSED_MASK
- | PT_WRITABLE_MASK | PT_USER_MASK;
- *sptep = spte;
+ /*
+ * Verify that the gpte in the page we've just write
+ * protected is still there.
+ */
+ if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
+ goto out_gpte_changed;
+
+ if (sp)
+ link_shadow_page(it.sptep, sp);
}
- return sptep;
+ for (;
+ shadow_walk_okay(&it) && it.level > hlevel;
+ shadow_walk_next(&it)) {
+ gfn_t direct_gfn;
+
+ validate_direct_spte(vcpu, it.sptep, direct_access);
+
+ drop_large_spte(vcpu, it.sptep);
+
+ if (is_shadow_present_pte(*it.sptep))
+ continue;
+
+ direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+
+ sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
+ true, direct_access, it.sptep);
+ link_shadow_page(it.sptep, sp);
+ }
+
+ mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
+ user_fault, write_fault, dirty, ptwrite, it.level,
+ gw->gfn, pfn, false, true);
+
+ return it.sptep;
+
+out_gpte_changed:
+ if (sp)
+ kvm_mmu_put_page(sp, it.sptep);
+ kvm_release_pfn_clean(pfn);
+ return NULL;
}
/*
@@ -431,11 +462,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
/* mmio */
- if (is_error_pfn(pfn)) {
- pgprintk("gfn %lx is mmio\n", walker.gfn);
- kvm_release_pfn_clean(pfn);
- return 1;
- }
+ if (is_error_pfn(pfn))
+ return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -443,6 +471,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
kvm_mmu_free_some_pages(vcpu);
sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
level, &write_pt, pfn);
+ (void)sptep;
pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
sptep, *sptep, write_pt);
@@ -464,6 +493,7 @@ out_unlock:
static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
{
struct kvm_shadow_walk_iterator iterator;
+ struct kvm_mmu_page *sp;
gpa_t pte_gpa = -1;
int level;
u64 *sptep;
@@ -475,10 +505,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
level = iterator.level;
sptep = iterator.sptep;
+ sp = page_header(__pa(sptep));
if (is_last_spte(*sptep, level)) {
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
int offset, shift;
+ if (!sp->unsync)
+ break;
+
shift = PAGE_SHIFT -
(PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
offset = sp->role.quadrant << shift;
@@ -487,16 +520,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
if (is_shadow_present_pte(*sptep)) {
- rmap_remove(vcpu->kvm, sptep);
if (is_large_pte(*sptep))
--vcpu->kvm->stat.lpages;
+ drop_spte(vcpu->kvm, sptep,
+ shadow_trap_nonpresent_pte);
need_flush = 1;
- }
- __set_spte(sptep, shadow_trap_nonpresent_pte);
+ } else
+ __set_spte(sptep, shadow_trap_nonpresent_pte);
break;
}
- if (!is_shadow_present_pte(*sptep))
+ if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
break;
}
@@ -570,9 +604,9 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
* Using the cached information from sp->gfns is safe because:
* - The spte has a reference to the struct page, so the pfn for a given gfn
* can't change unless all sptes pointing to it are nuked first.
- * - Alias changes zap the entire shadow cache.
*/
-static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ bool clear_unsync)
{
int i, offset, nr_present;
bool reset_host_protection;
@@ -580,6 +614,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
offset = nr_present = 0;
+ /* direct kvm_mmu_page can not be unsync. */
+ BUG_ON(sp->role.direct);
+
if (PTTYPE == 32)
offset = sp->role.quadrant << PT64_LEVEL_BITS;
@@ -589,7 +626,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
unsigned pte_access;
pt_element_t gpte;
gpa_t pte_gpa;
- gfn_t gfn = sp->gfns[i];
+ gfn_t gfn;
if (!is_shadow_present_pte(sp->spt[i]))
continue;
@@ -600,16 +637,17 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
sizeof(pt_element_t)))
return -EINVAL;
- if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) ||
- !(gpte & PT_ACCESSED_MASK)) {
+ gfn = gpte_to_gfn(gpte);
+ if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)
+ || gfn != sp->gfns[i] || !is_present_gpte(gpte)
+ || !(gpte & PT_ACCESSED_MASK)) {
u64 nonpresent;
- rmap_remove(vcpu->kvm, &sp->spt[i]);
- if (is_present_gpte(gpte))
+ if (is_present_gpte(gpte) || !clear_unsync)
nonpresent = shadow_trap_nonpresent_pte;
else
nonpresent = shadow_notrap_nonpresent_pte;
- __set_spte(&sp->spt[i], nonpresent);
+ drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
continue;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ce438e0fdd26..bc5b9b8d4a33 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4,6 +4,7 @@
* AMD SVM support
*
* Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
*
* Authors:
* Yaniv Kamay <yaniv@qumranet.com>
@@ -130,7 +131,7 @@ static struct svm_direct_access_msrs {
u32 index; /* Index of the MSR */
bool always; /* True if intercept is always on */
} direct_access_msrs[] = {
- { .index = MSR_K6_STAR, .always = true },
+ { .index = MSR_STAR, .always = true },
{ .index = MSR_IA32_SYSENTER_CS, .always = true },
#ifdef CONFIG_X86_64
{ .index = MSR_GS_BASE, .always = true },
@@ -285,11 +286,11 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
+ vcpu->arch.efer = efer;
if (!npt_enabled && !(efer & EFER_LMA))
efer &= ~EFER_LME;
to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
- vcpu->arch.efer = efer;
}
static int is_external_interrupt(u32 info)
@@ -383,8 +384,7 @@ static void svm_init_erratum_383(void)
int err;
u64 val;
- /* Only Fam10h is affected */
- if (boot_cpu_data.x86 != 0x10)
+ if (!cpu_has_amd_erratum(amd_erratum_383))
return;
/* Use _safe variants to not break nested virtualization */
@@ -640,7 +640,7 @@ static __init int svm_hardware_setup(void)
if (nested) {
printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
- kvm_enable_efer_bits(EFER_SVME);
+ kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
}
for_each_possible_cpu(cpu) {
@@ -806,7 +806,7 @@ static void init_vmcb(struct vcpu_svm *svm)
* svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
*/
svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
- kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
+ (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
save->cr4 = X86_CR4_PAE;
/* rdx = ?? */
@@ -903,13 +903,18 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
svm->asid_generation = 0;
init_vmcb(svm);
- fx_init(&svm->vcpu);
+ err = fx_init(&svm->vcpu);
+ if (err)
+ goto free_page4;
+
svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
if (kvm_vcpu_is_bsp(&svm->vcpu))
svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
return &svm->vcpu;
+free_page4:
+ __free_page(hsave_page);
free_page3:
__free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
free_page2:
@@ -1488,7 +1493,7 @@ static void svm_handle_mce(struct vcpu_svm *svm)
*/
pr_err("KVM: Guest triggered AMD Erratum 383\n");
- set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests);
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
return;
}
@@ -1535,7 +1540,7 @@ static int io_interception(struct vcpu_svm *svm)
string = (io_info & SVM_IOIO_STR_MASK) != 0;
in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
if (string || in)
- return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
+ return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
port = io_info >> 16;
size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -1957,7 +1962,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
svm->vmcb->save.cr3 = hsave->save.cr3;
svm->vcpu.arch.cr3 = hsave->save.cr3;
} else {
- kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
+ (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
}
kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
@@ -2080,7 +2085,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
} else
- kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
+ (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
/* Guest paging mode is active - reset mmu */
kvm_mmu_reset_context(&svm->vcpu);
@@ -2386,16 +2391,12 @@ static int iret_interception(struct vcpu_svm *svm)
static int invlpg_interception(struct vcpu_svm *svm)
{
- if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
- pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
- return 1;
+ return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
}
static int emulate_on_interception(struct vcpu_svm *svm)
{
- if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
- pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
- return 1;
+ return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
}
static int cr8_write_interception(struct vcpu_svm *svm)
@@ -2431,7 +2432,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
*data = tsc_offset + native_read_tsc();
break;
}
- case MSR_K6_STAR:
+ case MSR_STAR:
*data = svm->vmcb->save.star;
break;
#ifdef CONFIG_X86_64
@@ -2555,7 +2556,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
break;
}
- case MSR_K6_STAR:
+ case MSR_STAR:
svm->vmcb->save.star = data;
break;
#ifdef CONFIG_X86_64
@@ -2726,6 +2727,99 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_NPF] = pf_interception,
};
+void dump_vmcb(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct vmcb_save_area *save = &svm->vmcb->save;
+
+ pr_err("VMCB Control Area:\n");
+ pr_err("cr_read: %04x\n", control->intercept_cr_read);
+ pr_err("cr_write: %04x\n", control->intercept_cr_write);
+ pr_err("dr_read: %04x\n", control->intercept_dr_read);
+ pr_err("dr_write: %04x\n", control->intercept_dr_write);
+ pr_err("exceptions: %08x\n", control->intercept_exceptions);
+ pr_err("intercepts: %016llx\n", control->intercept);
+ pr_err("pause filter count: %d\n", control->pause_filter_count);
+ pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa);
+ pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa);
+ pr_err("tsc_offset: %016llx\n", control->tsc_offset);
+ pr_err("asid: %d\n", control->asid);
+ pr_err("tlb_ctl: %d\n", control->tlb_ctl);
+ pr_err("int_ctl: %08x\n", control->int_ctl);
+ pr_err("int_vector: %08x\n", control->int_vector);
+ pr_err("int_state: %08x\n", control->int_state);
+ pr_err("exit_code: %08x\n", control->exit_code);
+ pr_err("exit_info1: %016llx\n", control->exit_info_1);
+ pr_err("exit_info2: %016llx\n", control->exit_info_2);
+ pr_err("exit_int_info: %08x\n", control->exit_int_info);
+ pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err);
+ pr_err("nested_ctl: %lld\n", control->nested_ctl);
+ pr_err("nested_cr3: %016llx\n", control->nested_cr3);
+ pr_err("event_inj: %08x\n", control->event_inj);
+ pr_err("event_inj_err: %08x\n", control->event_inj_err);
+ pr_err("lbr_ctl: %lld\n", control->lbr_ctl);
+ pr_err("next_rip: %016llx\n", control->next_rip);
+ pr_err("VMCB State Save Area:\n");
+ pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->es.selector, save->es.attrib,
+ save->es.limit, save->es.base);
+ pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->cs.selector, save->cs.attrib,
+ save->cs.limit, save->cs.base);
+ pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->ss.selector, save->ss.attrib,
+ save->ss.limit, save->ss.base);
+ pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->ds.selector, save->ds.attrib,
+ save->ds.limit, save->ds.base);
+ pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->fs.selector, save->fs.attrib,
+ save->fs.limit, save->fs.base);
+ pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->gs.selector, save->gs.attrib,
+ save->gs.limit, save->gs.base);
+ pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->gdtr.selector, save->gdtr.attrib,
+ save->gdtr.limit, save->gdtr.base);
+ pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->ldtr.selector, save->ldtr.attrib,
+ save->ldtr.limit, save->ldtr.base);
+ pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->idtr.selector, save->idtr.attrib,
+ save->idtr.limit, save->idtr.base);
+ pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->tr.selector, save->tr.attrib,
+ save->tr.limit, save->tr.base);
+ pr_err("cpl: %d efer: %016llx\n",
+ save->cpl, save->efer);
+ pr_err("cr0: %016llx cr2: %016llx\n",
+ save->cr0, save->cr2);
+ pr_err("cr3: %016llx cr4: %016llx\n",
+ save->cr3, save->cr4);
+ pr_err("dr6: %016llx dr7: %016llx\n",
+ save->dr6, save->dr7);
+ pr_err("rip: %016llx rflags: %016llx\n",
+ save->rip, save->rflags);
+ pr_err("rsp: %016llx rax: %016llx\n",
+ save->rsp, save->rax);
+ pr_err("star: %016llx lstar: %016llx\n",
+ save->star, save->lstar);
+ pr_err("cstar: %016llx sfmask: %016llx\n",
+ save->cstar, save->sfmask);
+ pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n",
+ save->kernel_gs_base, save->sysenter_cs);
+ pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n",
+ save->sysenter_esp, save->sysenter_eip);
+ pr_err("gpat: %016llx dbgctl: %016llx\n",
+ save->g_pat, save->dbgctl);
+ pr_err("br_from: %016llx br_to: %016llx\n",
+ save->br_from, save->br_to);
+ pr_err("excp_from: %016llx excp_to: %016llx\n",
+ save->last_excp_from, save->last_excp_to);
+
+}
+
static int handle_exit(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -2770,6 +2864,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
kvm_run->fail_entry.hardware_entry_failure_reason
= svm->vmcb->control.exit_code;
+ pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
+ dump_vmcb(vcpu);
return 0;
}
@@ -2826,9 +2922,6 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
{
struct vmcb_control_area *control;
- trace_kvm_inj_virq(irq);
-
- ++svm->vcpu.stat.irq_injections;
control = &svm->vmcb->control;
control->int_vector = irq;
control->int_ctl &= ~V_INTR_PRIO_MASK;
@@ -2842,6 +2935,9 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
BUG_ON(!(gif_set(svm)));
+ trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
+ ++vcpu->stat.irq_injections;
+
svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
}
@@ -3327,6 +3423,11 @@ static bool svm_rdtscp_supported(void)
return false;
}
+static bool svm_has_wbinvd_exit(void)
+{
+ return true;
+}
+
static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3411,6 +3512,8 @@ static struct kvm_x86_ops svm_x86_ops = {
.rdtscp_supported = svm_rdtscp_supported,
.set_supported_cpuid = svm_set_supported_cpuid,
+
+ .has_wbinvd_exit = svm_has_wbinvd_exit,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index 4ddadb1a5ffe..e16a0dbe74d8 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -1,3 +1,17 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * timer support
+ *
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/hrtimer.h>
@@ -18,7 +32,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
atomic_inc(&ktimer->pending);
/* FIXME: this code should not know anything about vcpus */
- set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
}
if (waitqueue_active(q))
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ee03679efe78..49b25eee25ac 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5,6 +5,7 @@
* machines without emulation or binary translation.
*
* Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
*
* Authors:
* Avi Kivity <avi@qumranet.com>
@@ -36,6 +37,8 @@
#include <asm/vmx.h>
#include <asm/virtext.h>
#include <asm/mce.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
#include "trace.h"
@@ -63,6 +66,9 @@ module_param_named(unrestricted_guest,
static int __read_mostly emulate_invalid_guest_state = 0;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);
+static int __read_mostly vmm_exclusive = 1;
+module_param(vmm_exclusive, bool, S_IRUGO);
+
#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
#define KVM_GUEST_CR0_MASK \
@@ -173,10 +179,13 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
static int init_rmode(struct kvm *kvm);
static u64 construct_eptp(unsigned long root_hpa);
+static void kvm_cpu_vmxon(u64 addr);
+static void kvm_cpu_vmxoff(void);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
+static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
static unsigned long *vmx_io_bitmap_a;
static unsigned long *vmx_io_bitmap_b;
@@ -231,14 +240,14 @@ static u64 host_efer;
static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
/*
- * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
+ * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
* away by decrementing the array size.
*/
static const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
- MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR,
+ MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};
#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
@@ -334,6 +343,11 @@ static inline bool cpu_has_vmx_ept_1g_page(void)
return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
}
+static inline bool cpu_has_vmx_ept_4levels(void)
+{
+ return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
+}
+
static inline bool cpu_has_vmx_invept_individual_addr(void)
{
return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
@@ -349,6 +363,16 @@ static inline bool cpu_has_vmx_invept_global(void)
return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
}
+static inline bool cpu_has_vmx_invvpid_single(void)
+{
+ return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
+}
+
+static inline bool cpu_has_vmx_invvpid_global(void)
+{
+ return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
+}
+
static inline bool cpu_has_vmx_ept(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -389,6 +413,12 @@ static inline bool cpu_has_virtual_nmis(void)
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
}
+static inline bool cpu_has_vmx_wbinvd_exit(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_WBINVD_EXITING;
+}
+
static inline bool report_flexpriority(void)
{
return flexpriority_enabled;
@@ -453,6 +483,19 @@ static void vmcs_clear(struct vmcs *vmcs)
vmcs, phys_addr);
}
+static void vmcs_load(struct vmcs *vmcs)
+{
+ u64 phys_addr = __pa(vmcs);
+ u8 error;
+
+ asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
+ : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+ : "cc", "memory");
+ if (error)
+ printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
+ vmcs, phys_addr);
+}
+
static void __vcpu_clear(void *arg)
{
struct vcpu_vmx *vmx = arg;
@@ -475,12 +518,27 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
}
-static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
+static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
{
if (vmx->vpid == 0)
return;
- __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
+ if (cpu_has_vmx_invvpid_single())
+ __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
+}
+
+static inline void vpid_sync_vcpu_global(void)
+{
+ if (cpu_has_vmx_invvpid_global())
+ __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
+}
+
+static inline void vpid_sync_context(struct vcpu_vmx *vmx)
+{
+ if (cpu_has_vmx_invvpid_single())
+ vpid_sync_vcpu_single(vmx);
+ else
+ vpid_sync_vcpu_global();
}
static inline void ept_sync_global(void)
@@ -812,6 +870,9 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
}
#endif
+ if (current_thread_info()->status & TS_USEDFPU)
+ clts();
+ load_gdt(&__get_cpu_var(host_gdt));
}
static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -828,35 +889,30 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u64 phys_addr = __pa(vmx->vmcs);
u64 tsc_this, delta, new_offset;
+ u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
- if (vcpu->cpu != cpu) {
+ if (!vmm_exclusive)
+ kvm_cpu_vmxon(phys_addr);
+ else if (vcpu->cpu != cpu)
vcpu_clear(vmx);
- kvm_migrate_timers(vcpu);
- set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
- local_irq_disable();
- list_add(&vmx->local_vcpus_link,
- &per_cpu(vcpus_on_cpu, cpu));
- local_irq_enable();
- }
if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
- u8 error;
-
per_cpu(current_vmcs, cpu) = vmx->vmcs;
- asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
- : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
- : "cc");
- if (error)
- printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
- vmx->vmcs, phys_addr);
+ vmcs_load(vmx->vmcs);
}
if (vcpu->cpu != cpu) {
struct desc_ptr dt;
unsigned long sysenter_esp;
+ kvm_migrate_timers(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ local_irq_disable();
+ list_add(&vmx->local_vcpus_link,
+ &per_cpu(vcpus_on_cpu, cpu));
+ local_irq_enable();
+
vcpu->cpu = cpu;
/*
* Linux uses per-cpu TSS and GDT, so set these when switching
@@ -884,6 +940,10 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
__vmx_load_host_state(to_vmx(vcpu));
+ if (!vmm_exclusive) {
+ __vcpu_clear(to_vmx(vcpu));
+ kvm_cpu_vmxoff();
+ }
}
static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -1057,10 +1117,10 @@ static void setup_msrs(struct vcpu_vmx *vmx)
if (index >= 0 && vmx->rdtscp_enabled)
move_msr_up(vmx, index, save_nmsrs++);
/*
- * MSR_K6_STAR is only needed on long mode guests, and only
+ * MSR_STAR is only needed on long mode guests, and only
* if efer.sce is enabled.
*/
- index = __find_msr_index(vmx, MSR_K6_STAR);
+ index = __find_msr_index(vmx, MSR_STAR);
if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
move_msr_up(vmx, index, save_nmsrs++);
}
@@ -1286,6 +1346,13 @@ static __init int vmx_disabled_by_bios(void)
/* locked but not enabled */
}
+static void kvm_cpu_vmxon(u64 addr)
+{
+ asm volatile (ASM_VMX_VMXON_RAX
+ : : "a"(&addr), "m"(addr)
+ : "memory", "cc");
+}
+
static int hardware_enable(void *garbage)
{
int cpu = raw_smp_processor_id();
@@ -1308,11 +1375,13 @@ static int hardware_enable(void *garbage)
wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
}
write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
- asm volatile (ASM_VMX_VMXON_RAX
- : : "a"(&phys_addr), "m"(phys_addr)
- : "memory", "cc");
- ept_sync_global();
+ if (vmm_exclusive) {
+ kvm_cpu_vmxon(phys_addr);
+ ept_sync_global();
+ }
+
+ store_gdt(&__get_cpu_var(host_gdt));
return 0;
}
@@ -1334,13 +1403,15 @@ static void vmclear_local_vcpus(void)
static void kvm_cpu_vmxoff(void)
{
asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
- write_cr4(read_cr4() & ~X86_CR4_VMXE);
}
static void hardware_disable(void *garbage)
{
- vmclear_local_vcpus();
- kvm_cpu_vmxoff();
+ if (vmm_exclusive) {
+ vmclear_local_vcpus();
+ kvm_cpu_vmxoff();
+ }
+ write_cr4(read_cr4() & ~X86_CR4_VMXE);
}
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
@@ -1539,7 +1610,8 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_vpid())
enable_vpid = 0;
- if (!cpu_has_vmx_ept()) {
+ if (!cpu_has_vmx_ept() ||
+ !cpu_has_vmx_ept_4levels()) {
enable_ept = 0;
enable_unrestricted_guest = 0;
}
@@ -1628,7 +1700,7 @@ static gva_t rmode_tss_base(struct kvm *kvm)
gfn_t base_gfn;
slots = kvm_memslots(kvm);
- base_gfn = kvm->memslots->memslots[0].base_gfn +
+ base_gfn = slots->memslots[0].base_gfn +
kvm->memslots->memslots[0].npages - 3;
return base_gfn << PAGE_SHIFT;
}
@@ -1759,9 +1831,12 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
{
- vpid_sync_vcpu_all(to_vmx(vcpu));
- if (enable_ept)
+ vpid_sync_context(to_vmx(vcpu));
+ if (enable_ept) {
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
+ }
}
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
@@ -2507,7 +2582,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
- vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
+ vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */
vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
@@ -2599,21 +2674,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
static int init_rmode(struct kvm *kvm)
{
+ int idx, ret = 0;
+
+ idx = srcu_read_lock(&kvm->srcu);
if (!init_rmode_tss(kvm))
- return 0;
+ goto exit;
if (!init_rmode_identity_map(kvm))
- return 0;
- return 1;
+ goto exit;
+
+ ret = 1;
+exit:
+ srcu_read_unlock(&kvm->srcu, idx);
+ return ret;
}
static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 msr;
- int ret, idx;
+ int ret;
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
- idx = srcu_read_lock(&vcpu->kvm->srcu);
if (!init_rmode(vmx->vcpu.kvm)) {
ret = -ENOMEM;
goto out;
@@ -2630,7 +2711,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
msr |= MSR_IA32_APICBASE_BSP;
kvm_set_apic_base(&vmx->vcpu, msr);
- fx_init(&vmx->vcpu);
+ ret = fx_init(&vmx->vcpu);
+ if (ret != 0)
+ goto out;
seg_setup(VCPU_SREG_CS);
/*
@@ -2713,7 +2796,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx_fpu_activate(&vmx->vcpu);
update_exception_bitmap(&vmx->vcpu);
- vpid_sync_vcpu_all(vmx);
+ vpid_sync_context(vmx);
ret = 0;
@@ -2721,7 +2804,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx->emulation_required = 0;
out:
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
return ret;
}
@@ -2826,9 +2908,7 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
{
if (!cpu_has_virtual_nmis())
return to_vmx(vcpu)->soft_vnmi_blocked;
- else
- return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- GUEST_INTR_STATE_NMI);
+ return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
}
static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
@@ -3070,7 +3150,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
++vcpu->stat.io_exits;
if (string || in)
- return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
+ return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
port = exit_qualification >> 16;
size = (exit_qualification & 7) + 1;
@@ -3090,11 +3170,20 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[2] = 0xc1;
}
+static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
+{
+ if (err)
+ kvm_inject_gp(vcpu, 0);
+ else
+ skip_emulated_instruction(vcpu);
+}
+
static int handle_cr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification, val;
int cr;
int reg;
+ int err;
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
cr = exit_qualification & 15;
@@ -3105,16 +3194,16 @@ static int handle_cr(struct kvm_vcpu *vcpu)
trace_kvm_cr_write(cr, val);
switch (cr) {
case 0:
- kvm_set_cr0(vcpu, val);
- skip_emulated_instruction(vcpu);
+ err = kvm_set_cr0(vcpu, val);
+ complete_insn_gp(vcpu, err);
return 1;
case 3:
- kvm_set_cr3(vcpu, val);
- skip_emulated_instruction(vcpu);
+ err = kvm_set_cr3(vcpu, val);
+ complete_insn_gp(vcpu, err);
return 1;
case 4:
- kvm_set_cr4(vcpu, val);
- skip_emulated_instruction(vcpu);
+ err = kvm_set_cr4(vcpu, val);
+ complete_insn_gp(vcpu, err);
return 1;
case 8: {
u8 cr8_prev = kvm_get_cr8(vcpu);
@@ -3321,30 +3410,25 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
static int handle_wbinvd(struct kvm_vcpu *vcpu)
{
skip_emulated_instruction(vcpu);
- /* TODO: Add support for VT-d/pass-through device */
+ kvm_emulate_wbinvd(vcpu);
return 1;
}
-static int handle_apic_access(struct kvm_vcpu *vcpu)
+static int handle_xsetbv(struct kvm_vcpu *vcpu)
{
- unsigned long exit_qualification;
- enum emulation_result er;
- unsigned long offset;
+ u64 new_bv = kvm_read_edx_eax(vcpu);
+ u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- offset = exit_qualification & 0xffful;
-
- er = emulate_instruction(vcpu, 0, 0, 0);
-
- if (er != EMULATE_DONE) {
- printk(KERN_ERR
- "Fail to handle apic access vmexit! Offset is 0x%lx\n",
- offset);
- return -ENOEXEC;
- }
+ if (kvm_set_xcr(vcpu, index, new_bv) == 0)
+ skip_emulated_instruction(vcpu);
return 1;
}
+static int handle_apic_access(struct kvm_vcpu *vcpu)
+{
+ return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+}
+
static int handle_task_switch(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3554,13 +3638,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
goto out;
}
- if (err != EMULATE_DONE) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
- ret = 0;
- goto out;
- }
+ if (err != EMULATE_DONE)
+ return 0;
if (signal_pending(current))
goto out;
@@ -3623,6 +3702,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
[EXIT_REASON_WBINVD] = handle_wbinvd,
+ [EXIT_REASON_XSETBV] = handle_xsetbv,
[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
[EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
@@ -3656,6 +3736,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
if (enable_ept && is_paging(vcpu))
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
+ vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ vcpu->run->fail_entry.hardware_entry_failure_reason
+ = exit_reason;
+ return 0;
+ }
+
if (unlikely(vmx->fail)) {
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -3861,11 +3948,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
- /*
- * Loading guest fpu may have cleared host cr0.ts
- */
- vmcs_writel(HOST_CR0, read_cr0());
-
asm(
/* Store host registers */
"push %%"R"dx; push %%"R"bp;"
@@ -4001,6 +4083,19 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
kmem_cache_free(kvm_vcpu_cache, vmx);
}
+static inline void vmcs_init(struct vmcs *vmcs)
+{
+ u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
+
+ if (!vmm_exclusive)
+ kvm_cpu_vmxon(phys_addr);
+
+ vmcs_clear(vmcs);
+
+ if (!vmm_exclusive)
+ kvm_cpu_vmxoff();
+}
+
static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
{
int err;
@@ -4026,7 +4121,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (!vmx->vmcs)
goto free_msrs;
- vmcs_clear(vmx->vmcs);
+ vmcs_init(vmx->vmcs);
cpu = get_cpu();
vmx_vcpu_load(&vmx->vcpu, cpu);
@@ -4265,6 +4360,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
.rdtscp_supported = vmx_rdtscp_supported,
.set_supported_cpuid = vmx_set_supported_cpuid,
+
+ .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7fa89c39c64f..25f19078b321 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6,6 +6,7 @@
* Copyright (C) 2006 Qumranet, Inc.
* Copyright (C) 2008 Qumranet, Inc.
* Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
*
* Authors:
* Avi Kivity <avi@qumranet.com>
@@ -41,17 +42,19 @@
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
+#include <linux/uaccess.h>
#include <trace/events/kvm.h>
#define CREATE_TRACE_POINTS
#include "trace.h"
#include <asm/debugreg.h>
-#include <asm/uaccess.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mtrr.h>
#include <asm/mce.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS \
@@ -62,6 +65,7 @@
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
| X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
| X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXSAVE \
| X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
@@ -147,6 +151,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ NULL }
};
+u64 __read_mostly host_xcr0;
+
+static inline u32 bit(int bitno)
+{
+ return 1 << (bitno & 31);
+}
+
static void kvm_on_user_return(struct user_return_notifier *urn)
{
unsigned slot;
@@ -285,7 +296,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
prev_nr = vcpu->arch.exception.nr;
if (prev_nr == DF_VECTOR) {
/* triple fault -> shutdown */
- set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
class1 = exception_class(prev_nr);
@@ -414,121 +425,163 @@ out:
return changed;
}
-void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
+ unsigned long old_cr0 = kvm_read_cr0(vcpu);
+ unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
+ X86_CR0_CD | X86_CR0_NW;
+
cr0 |= X86_CR0_ET;
#ifdef CONFIG_X86_64
- if (cr0 & 0xffffffff00000000UL) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if (cr0 & 0xffffffff00000000UL)
+ return 1;
#endif
cr0 &= ~CR0_RESERVED_BITS;
- if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
+ return 1;
- if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
+ return 1;
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
if ((vcpu->arch.efer & EFER_LME)) {
int cs_db, cs_l;
- if (!is_pae(vcpu)) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if (!is_pae(vcpu))
+ return 1;
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
- if (cs_l) {
- kvm_inject_gp(vcpu, 0);
- return;
-
- }
+ if (cs_l)
+ return 1;
} else
#endif
- if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
-
+ if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3))
+ return 1;
}
kvm_x86_ops->set_cr0(vcpu, cr0);
- kvm_mmu_reset_context(vcpu);
- return;
+ if ((cr0 ^ old_cr0) & update_bits)
+ kvm_mmu_reset_context(vcpu);
+ return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
- kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
+ (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);
-void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
- unsigned long old_cr4 = kvm_read_cr4(vcpu);
- unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
+ u64 xcr0;
- if (cr4 & CR4_RESERVED_BITS) {
+ /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
+ if (index != XCR_XFEATURE_ENABLED_MASK)
+ return 1;
+ xcr0 = xcr;
+ if (kvm_x86_ops->get_cpl(vcpu) != 0)
+ return 1;
+ if (!(xcr0 & XSTATE_FP))
+ return 1;
+ if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
+ return 1;
+ if (xcr0 & ~host_xcr0)
+ return 1;
+ vcpu->arch.xcr0 = xcr0;
+ vcpu->guest_xcr0_loaded = 0;
+ return 0;
+}
+
+int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+{
+ if (__kvm_set_xcr(vcpu, index, xcr)) {
kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_xcr);
+
+static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ return best && (best->ecx & bit(X86_FEATURE_XSAVE));
+}
+
+static void update_cpuid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ if (!best)
return;
+
+ /* Update OSXSAVE bit */
+ if (cpu_has_xsave && best->function == 0x1) {
+ best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
+ best->ecx |= bit(X86_FEATURE_OSXSAVE);
}
+}
+
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long old_cr4 = kvm_read_cr4(vcpu);
+ unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
+
+ if (cr4 & CR4_RESERVED_BITS)
+ return 1;
+
+ if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
+ return 1;
if (is_long_mode(vcpu)) {
- if (!(cr4 & X86_CR4_PAE)) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if (!(cr4 & X86_CR4_PAE))
+ return 1;
} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
&& ((cr4 ^ old_cr4) & pdptr_bits)
- && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ && !load_pdptrs(vcpu, vcpu->arch.cr3))
+ return 1;
+
+ if (cr4 & X86_CR4_VMXE)
+ return 1;
- if (cr4 & X86_CR4_VMXE) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
kvm_x86_ops->set_cr4(vcpu, cr4);
- vcpu->arch.cr4 = cr4;
- kvm_mmu_reset_context(vcpu);
+
+ if ((cr4 ^ old_cr4) & pdptr_bits)
+ kvm_mmu_reset_context(vcpu);
+
+ if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
+ update_cpuid(vcpu);
+
+ return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);
-void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
kvm_mmu_sync_roots(vcpu);
kvm_mmu_flush_tlb(vcpu);
- return;
+ return 0;
}
if (is_long_mode(vcpu)) {
- if (cr3 & CR3_L_MODE_RESERVED_BITS) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if (cr3 & CR3_L_MODE_RESERVED_BITS)
+ return 1;
} else {
if (is_pae(vcpu)) {
- if (cr3 & CR3_PAE_RESERVED_BITS) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
- if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if (cr3 & CR3_PAE_RESERVED_BITS)
+ return 1;
+ if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3))
+ return 1;
}
/*
* We don't check reserved bits in nonpae mode, because
@@ -546,24 +599,28 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
* to debug) behavior on the guest side.
*/
if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
- kvm_inject_gp(vcpu, 0);
- else {
- vcpu->arch.cr3 = cr3;
- vcpu->arch.mmu.new_cr3(vcpu);
- }
+ return 1;
+ vcpu->arch.cr3 = cr3;
+ vcpu->arch.mmu.new_cr3(vcpu);
+ return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);
-void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
- if (cr8 & CR8_RESERVED_BITS) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if (cr8 & CR8_RESERVED_BITS)
+ return 1;
if (irqchip_in_kernel(vcpu->kvm))
kvm_lapic_set_tpr(vcpu, cr8);
else
vcpu->arch.cr8 = cr8;
+ return 0;
+}
+
+void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+ if (__kvm_set_cr8(vcpu, cr8))
+ kvm_inject_gp(vcpu, 0);
}
EXPORT_SYMBOL_GPL(kvm_set_cr8);
@@ -576,7 +633,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);
-int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
switch (dr) {
case 0 ... 3:
@@ -585,29 +642,21 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
vcpu->arch.eff_db[dr] = val;
break;
case 4:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
- }
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return 1; /* #UD */
/* fall through */
case 6:
- if (val & 0xffffffff00000000ULL) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
+ if (val & 0xffffffff00000000ULL)
+ return -1; /* #GP */
vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
break;
case 5:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
- }
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return 1; /* #UD */
/* fall through */
default: /* 7 */
- if (val & 0xffffffff00000000ULL) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
+ if (val & 0xffffffff00000000ULL)
+ return -1; /* #GP */
vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
@@ -618,28 +667,37 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
return 0;
}
+
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+{
+ int res;
+
+ res = __kvm_set_dr(vcpu, dr, val);
+ if (res > 0)
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ else if (res < 0)
+ kvm_inject_gp(vcpu, 0);
+
+ return res;
+}
EXPORT_SYMBOL_GPL(kvm_set_dr);
-int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
switch (dr) {
case 0 ... 3:
*val = vcpu->arch.db[dr];
break;
case 4:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
return 1;
- }
/* fall through */
case 6:
*val = vcpu->arch.dr6;
break;
case 5:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
return 1;
- }
/* fall through */
default: /* 7 */
*val = vcpu->arch.dr7;
@@ -648,12 +706,16 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_get_dr);
-static inline u32 bit(int bitno)
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
- return 1 << (bitno & 31);
+ if (_kvm_get_dr(vcpu, dr, val)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ return 0;
}
+EXPORT_SYMBOL_GPL(kvm_get_dr);
/*
* List of msr numbers which we expose to userspace through KVM_GET_MSRS
@@ -671,7 +733,7 @@ static u32 msrs_to_save[] = {
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
HV_X64_MSR_APIC_ASSIST_PAGE,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
- MSR_K6_STAR,
+ MSR_STAR,
#ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
@@ -682,10 +744,14 @@ static unsigned num_msrs_to_save;
static u32 emulated_msrs[] = {
MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MCG_STATUS,
+ MSR_IA32_MCG_CTL,
};
static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
+ u64 old_efer = vcpu->arch.efer;
+
if (efer & efer_reserved_bits)
return 1;
@@ -714,11 +780,13 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
kvm_x86_ops->set_efer(vcpu, efer);
- vcpu->arch.efer = efer;
-
vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
kvm_mmu_reset_context(vcpu);
+ /* Update reserved bits */
+ if ((efer ^ old_efer) & EFER_NX)
+ kvm_mmu_reset_context(vcpu);
+
return 0;
}
@@ -882,7 +950,7 @@ static int kvm_request_guest_time_update(struct kvm_vcpu *v)
if (!vcpu->time_page)
return 0;
- set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
+ kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);
return 1;
}
@@ -1524,16 +1592,12 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
{
int i, idx;
- vcpu_load(vcpu);
-
idx = srcu_read_lock(&vcpu->kvm->srcu);
for (i = 0; i < msrs->nmsrs; ++i)
if (do_msr(vcpu, entries[i].index, &entries[i].data))
break;
srcu_read_unlock(&vcpu->kvm->srcu, idx);
- vcpu_put(vcpu);
-
return i;
}
@@ -1618,6 +1682,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
+ case KVM_CAP_XSAVE:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -1641,6 +1706,9 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_MCE:
r = KVM_MAX_MCE_BANKS;
break;
+ case KVM_CAP_XCRS:
+ r = cpu_has_xsave;
+ break;
default:
r = 0;
break;
@@ -1717,8 +1785,28 @@ out:
return r;
}
+static void wbinvd_ipi(void *garbage)
+{
+ wbinvd();
+}
+
+static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+ return vcpu->kvm->arch.iommu_domain &&
+ !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
+}
+
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
+ /* Address WBINVD may be executed by guest */
+ if (need_emulate_wbinvd(vcpu)) {
+ if (kvm_x86_ops->has_wbinvd_exit())
+ cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
+ else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
+ smp_call_function_single(vcpu->cpu,
+ wbinvd_ipi, NULL, 1);
+ }
+
kvm_x86_ops->vcpu_load(vcpu, cpu);
if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
unsigned long khz = cpufreq_quick_get(cpu);
@@ -1731,8 +1819,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
- kvm_put_guest_fpu(vcpu);
kvm_x86_ops->vcpu_put(vcpu);
+ kvm_put_guest_fpu(vcpu);
}
static int is_efer_nx(void)
@@ -1781,7 +1869,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
if (copy_from_user(cpuid_entries, entries,
cpuid->nent * sizeof(struct kvm_cpuid_entry)))
goto out_free;
- vcpu_load(vcpu);
for (i = 0; i < cpuid->nent; i++) {
vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -1799,7 +1886,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
r = 0;
kvm_apic_set_version(vcpu);
kvm_x86_ops->cpuid_update(vcpu);
- vcpu_put(vcpu);
+ update_cpuid(vcpu);
out_free:
vfree(cpuid_entries);
@@ -1820,11 +1907,10 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
goto out;
- vcpu_load(vcpu);
vcpu->arch.cpuid_nent = cpuid->nent;
kvm_apic_set_version(vcpu);
kvm_x86_ops->cpuid_update(vcpu);
- vcpu_put(vcpu);
+ update_cpuid(vcpu);
return 0;
out:
@@ -1837,7 +1923,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
{
int r;
- vcpu_load(vcpu);
r = -E2BIG;
if (cpuid->nent < vcpu->arch.cpuid_nent)
goto out;
@@ -1849,7 +1934,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
out:
cpuid->nent = vcpu->arch.cpuid_nent;
- vcpu_put(vcpu);
return r;
}
@@ -1901,13 +1985,13 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
/* cpuid 1.ecx */
const u32 kvm_supported_word4_x86_features =
- F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
+ F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
0 /* DS-CPL, VMX, SMX, EST */ |
0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
0 /* Reserved, DCA */ | F(XMM4_1) |
F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
- 0 /* Reserved, XSAVE, OSXSAVE */;
+ 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX);
/* cpuid 0x80000001.ecx */
const u32 kvm_supported_word6_x86_features =
F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
@@ -1922,7 +2006,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
switch (function) {
case 0:
- entry->eax = min(entry->eax, (u32)0xb);
+ entry->eax = min(entry->eax, (u32)0xd);
break;
case 1:
entry->edx &= kvm_supported_word0_x86_features;
@@ -1980,6 +2064,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
}
break;
}
+ case 0xd: {
+ int i;
+
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ for (i = 1; *nent < maxnent; ++i) {
+ if (entry[i - 1].eax == 0 && i != 2)
+ break;
+ do_cpuid_1_ent(&entry[i], function, i);
+ entry[i].flags |=
+ KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ ++*nent;
+ }
+ break;
+ }
case KVM_CPUID_SIGNATURE: {
char signature[12] = "KVMKVMKVM\0\0";
u32 *sigptr = (u32 *)signature;
@@ -2081,9 +2179,7 @@ out:
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- vcpu_load(vcpu);
memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
- vcpu_put(vcpu);
return 0;
}
@@ -2091,11 +2187,9 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- vcpu_load(vcpu);
memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
kvm_apic_post_state_restore(vcpu);
update_cr8_intercept(vcpu);
- vcpu_put(vcpu);
return 0;
}
@@ -2107,20 +2201,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
return -EINVAL;
if (irqchip_in_kernel(vcpu->kvm))
return -ENXIO;
- vcpu_load(vcpu);
kvm_queue_interrupt(vcpu, irq->irq, false);
- vcpu_put(vcpu);
-
return 0;
}
static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
{
- vcpu_load(vcpu);
kvm_inject_nmi(vcpu);
- vcpu_put(vcpu);
return 0;
}
@@ -2140,7 +2229,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
int r;
unsigned bank_num = mcg_cap & 0xff, bank;
- vcpu_load(vcpu);
r = -EINVAL;
if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
goto out;
@@ -2155,7 +2243,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
for (bank = 0; bank < bank_num; bank++)
vcpu->arch.mce_banks[bank*4] = ~(u64)0;
out:
- vcpu_put(vcpu);
return r;
}
@@ -2188,7 +2275,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
printk(KERN_DEBUG "kvm: set_mce: "
"injects mce exception while "
"previous one is in progress!\n");
- set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return 0;
}
if (banks[1] & MCI_STATUS_VAL)
@@ -2213,8 +2300,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
- vcpu_load(vcpu);
-
events->exception.injected =
vcpu->arch.exception.pending &&
!kvm_exception_is_soft(vcpu->arch.exception.nr);
@@ -2239,8 +2324,6 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
| KVM_VCPUEVENT_VALID_SIPI_VECTOR
| KVM_VCPUEVENT_VALID_SHADOW);
-
- vcpu_put(vcpu);
}
static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
@@ -2251,8 +2334,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
| KVM_VCPUEVENT_VALID_SHADOW))
return -EINVAL;
- vcpu_load(vcpu);
-
vcpu->arch.exception.pending = events->exception.injected;
vcpu->arch.exception.nr = events->exception.nr;
vcpu->arch.exception.has_error_code = events->exception.has_error_code;
@@ -2275,22 +2356,16 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
vcpu->arch.sipi_vector = events->sipi_vector;
- vcpu_put(vcpu);
-
return 0;
}
static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
struct kvm_debugregs *dbgregs)
{
- vcpu_load(vcpu);
-
memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
dbgregs->dr6 = vcpu->arch.dr6;
dbgregs->dr7 = vcpu->arch.dr7;
dbgregs->flags = 0;
-
- vcpu_put(vcpu);
}
static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -2299,40 +2374,113 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
if (dbgregs->flags)
return -EINVAL;
- vcpu_load(vcpu);
-
memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
vcpu->arch.dr6 = dbgregs->dr6;
vcpu->arch.dr7 = dbgregs->dr7;
- vcpu_put(vcpu);
+ return 0;
+}
+
+static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave)
+{
+ if (cpu_has_xsave)
+ memcpy(guest_xsave->region,
+ &vcpu->arch.guest_fpu.state->xsave,
+ sizeof(struct xsave_struct));
+ else {
+ memcpy(guest_xsave->region,
+ &vcpu->arch.guest_fpu.state->fxsave,
+ sizeof(struct i387_fxsave_struct));
+ *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
+ XSTATE_FPSSE;
+ }
+}
+
+static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave)
+{
+ u64 xstate_bv =
+ *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
+ if (cpu_has_xsave)
+ memcpy(&vcpu->arch.guest_fpu.state->xsave,
+ guest_xsave->region, sizeof(struct xsave_struct));
+ else {
+ if (xstate_bv & ~XSTATE_FPSSE)
+ return -EINVAL;
+ memcpy(&vcpu->arch.guest_fpu.state->fxsave,
+ guest_xsave->region, sizeof(struct i387_fxsave_struct));
+ }
return 0;
}
+static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
+ struct kvm_xcrs *guest_xcrs)
+{
+ if (!cpu_has_xsave) {
+ guest_xcrs->nr_xcrs = 0;
+ return;
+ }
+
+ guest_xcrs->nr_xcrs = 1;
+ guest_xcrs->flags = 0;
+ guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
+ guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
+}
+
+static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
+ struct kvm_xcrs *guest_xcrs)
+{
+ int i, r = 0;
+
+ if (!cpu_has_xsave)
+ return -EINVAL;
+
+ if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
+ return -EINVAL;
+
+ for (i = 0; i < guest_xcrs->nr_xcrs; i++)
+ /* Only support XCR0 currently */
+ if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
+ r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
+ guest_xcrs->xcrs[0].value);
+ break;
+ }
+ if (r)
+ r = -EINVAL;
+ return r;
+}
+
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
int r;
- struct kvm_lapic_state *lapic = NULL;
+ union {
+ struct kvm_lapic_state *lapic;
+ struct kvm_xsave *xsave;
+ struct kvm_xcrs *xcrs;
+ void *buffer;
+ } u;
+ u.buffer = NULL;
switch (ioctl) {
case KVM_GET_LAPIC: {
r = -EINVAL;
if (!vcpu->arch.apic)
goto out;
- lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+ u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
r = -ENOMEM;
- if (!lapic)
+ if (!u.lapic)
goto out;
- r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
+ r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
if (r)
goto out;
r = -EFAULT;
- if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
+ if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
goto out;
r = 0;
break;
@@ -2341,14 +2489,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EINVAL;
if (!vcpu->arch.apic)
goto out;
- lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+ u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
r = -ENOMEM;
- if (!lapic)
+ if (!u.lapic)
goto out;
r = -EFAULT;
- if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
+ if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))
goto out;
- r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
+ r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
if (r)
goto out;
r = 0;
@@ -2464,9 +2612,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EFAULT;
if (copy_from_user(&mce, argp, sizeof mce))
goto out;
- vcpu_load(vcpu);
r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
- vcpu_put(vcpu);
break;
}
case KVM_GET_VCPU_EVENTS: {
@@ -2513,11 +2659,67 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
break;
}
+ case KVM_GET_XSAVE: {
+ u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.xsave)
+ break;
+
+ kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_XSAVE: {
+ u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.xsave)
+ break;
+
+ r = -EFAULT;
+ if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave)))
+ break;
+
+ r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
+ break;
+ }
+ case KVM_GET_XCRS: {
+ u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.xcrs)
+ break;
+
+ kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, u.xcrs,
+ sizeof(struct kvm_xcrs)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_XCRS: {
+ u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.xcrs)
+ break;
+
+ r = -EFAULT;
+ if (copy_from_user(u.xcrs, argp,
+ sizeof(struct kvm_xcrs)))
+ break;
+
+ r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
+ break;
+ }
default:
r = -EINVAL;
}
out:
- kfree(lapic);
+ kfree(u.buffer);
return r;
}
@@ -2560,115 +2762,6 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
return kvm->arch.n_alloc_mmu_pages;
}
-gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
-{
- int i;
- struct kvm_mem_alias *alias;
- struct kvm_mem_aliases *aliases;
-
- aliases = kvm_aliases(kvm);
-
- for (i = 0; i < aliases->naliases; ++i) {
- alias = &aliases->aliases[i];
- if (alias->flags & KVM_ALIAS_INVALID)
- continue;
- if (gfn >= alias->base_gfn
- && gfn < alias->base_gfn + alias->npages)
- return alias->target_gfn + gfn - alias->base_gfn;
- }
- return gfn;
-}
-
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
- int i;
- struct kvm_mem_alias *alias;
- struct kvm_mem_aliases *aliases;
-
- aliases = kvm_aliases(kvm);
-
- for (i = 0; i < aliases->naliases; ++i) {
- alias = &aliases->aliases[i];
- if (gfn >= alias->base_gfn
- && gfn < alias->base_gfn + alias->npages)
- return alias->target_gfn + gfn - alias->base_gfn;
- }
- return gfn;
-}
-
-/*
- * Set a new alias region. Aliases map a portion of physical memory into
- * another portion. This is useful for memory windows, for example the PC
- * VGA region.
- */
-static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
- struct kvm_memory_alias *alias)
-{
- int r, n;
- struct kvm_mem_alias *p;
- struct kvm_mem_aliases *aliases, *old_aliases;
-
- r = -EINVAL;
- /* General sanity checks */
- if (alias->memory_size & (PAGE_SIZE - 1))
- goto out;
- if (alias->guest_phys_addr & (PAGE_SIZE - 1))
- goto out;
- if (alias->slot >= KVM_ALIAS_SLOTS)
- goto out;
- if (alias->guest_phys_addr + alias->memory_size
- < alias->guest_phys_addr)
- goto out;
- if (alias->target_phys_addr + alias->memory_size
- < alias->target_phys_addr)
- goto out;
-
- r = -ENOMEM;
- aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
- if (!aliases)
- goto out;
-
- mutex_lock(&kvm->slots_lock);
-
- /* invalidate any gfn reference in case of deletion/shrinking */
- memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
- aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID;
- old_aliases = kvm->arch.aliases;
- rcu_assign_pointer(kvm->arch.aliases, aliases);
- synchronize_srcu_expedited(&kvm->srcu);
- kvm_mmu_zap_all(kvm);
- kfree(old_aliases);
-
- r = -ENOMEM;
- aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
- if (!aliases)
- goto out_unlock;
-
- memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
-
- p = &aliases->aliases[alias->slot];
- p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
- p->npages = alias->memory_size >> PAGE_SHIFT;
- p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
- p->flags &= ~(KVM_ALIAS_INVALID);
-
- for (n = KVM_ALIAS_SLOTS; n > 0; --n)
- if (aliases->aliases[n - 1].npages)
- break;
- aliases->naliases = n;
-
- old_aliases = kvm->arch.aliases;
- rcu_assign_pointer(kvm->arch.aliases, aliases);
- synchronize_srcu_expedited(&kvm->srcu);
- kfree(old_aliases);
- r = 0;
-
-out_unlock:
- mutex_unlock(&kvm->slots_lock);
-out:
- return r;
-}
-
static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
{
int r;
@@ -2797,7 +2890,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_memory_slot *memslot;
unsigned long n;
unsigned long is_dirty = 0;
- unsigned long *dirty_bitmap = NULL;
mutex_lock(&kvm->slots_lock);
@@ -2812,27 +2904,30 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
n = kvm_dirty_bitmap_bytes(memslot);
- r = -ENOMEM;
- dirty_bitmap = vmalloc(n);
- if (!dirty_bitmap)
- goto out;
- memset(dirty_bitmap, 0, n);
-
for (i = 0; !is_dirty && i < n/sizeof(long); i++)
is_dirty = memslot->dirty_bitmap[i];
/* If nothing is dirty, don't bother messing with page tables. */
if (is_dirty) {
struct kvm_memslots *slots, *old_slots;
+ unsigned long *dirty_bitmap;
spin_lock(&kvm->mmu_lock);
kvm_mmu_slot_remove_write_access(kvm, log->slot);
spin_unlock(&kvm->mmu_lock);
- slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
- if (!slots)
- goto out_free;
+ r = -ENOMEM;
+ dirty_bitmap = vmalloc(n);
+ if (!dirty_bitmap)
+ goto out;
+ memset(dirty_bitmap, 0, n);
+ r = -ENOMEM;
+ slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+ if (!slots) {
+ vfree(dirty_bitmap);
+ goto out;
+ }
memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
@@ -2841,13 +2936,20 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
synchronize_srcu_expedited(&kvm->srcu);
dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
kfree(old_slots);
+
+ r = -EFAULT;
+ if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) {
+ vfree(dirty_bitmap);
+ goto out;
+ }
+ vfree(dirty_bitmap);
+ } else {
+ r = -EFAULT;
+ if (clear_user(log->dirty_bitmap, n))
+ goto out;
}
r = 0;
- if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
- r = -EFAULT;
-out_free:
- vfree(dirty_bitmap);
out:
mutex_unlock(&kvm->slots_lock);
return r;
@@ -2867,7 +2969,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
union {
struct kvm_pit_state ps;
struct kvm_pit_state2 ps2;
- struct kvm_memory_alias alias;
struct kvm_pit_config pit_config;
} u;
@@ -2888,22 +2989,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;
break;
}
- case KVM_SET_MEMORY_REGION: {
- struct kvm_memory_region kvm_mem;
- struct kvm_userspace_memory_region kvm_userspace_mem;
-
- r = -EFAULT;
- if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
- goto out;
- kvm_userspace_mem.slot = kvm_mem.slot;
- kvm_userspace_mem.flags = kvm_mem.flags;
- kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
- kvm_userspace_mem.memory_size = kvm_mem.memory_size;
- r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
- if (r)
- goto out;
- break;
- }
case KVM_SET_NR_MMU_PAGES:
r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
if (r)
@@ -2912,14 +2997,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
case KVM_GET_NR_MMU_PAGES:
r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
break;
- case KVM_SET_MEMORY_ALIAS:
- r = -EFAULT;
- if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
- goto out;
- r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
- if (r)
- goto out;
- break;
case KVM_CREATE_IRQCHIP: {
struct kvm_pic *vpic;
@@ -3259,7 +3336,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
}
ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
if (ret < 0) {
- r = X86EMUL_UNHANDLEABLE;
+ r = X86EMUL_IO_NEEDED;
goto out;
}
@@ -3315,7 +3392,7 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
}
ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
if (ret < 0) {
- r = X86EMUL_UNHANDLEABLE;
+ r = X86EMUL_IO_NEEDED;
goto out;
}
@@ -3330,10 +3407,10 @@ out:
static int emulator_read_emulated(unsigned long addr,
void *val,
unsigned int bytes,
+ unsigned int *error_code,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
- u32 error_code;
if (vcpu->mmio_read_completed) {
memcpy(val, vcpu->mmio_data, bytes);
@@ -3343,12 +3420,10 @@ static int emulator_read_emulated(unsigned long addr,
return X86EMUL_CONTINUE;
}
- gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
+ gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code);
- if (gpa == UNMAPPED_GVA) {
- kvm_inject_page_fault(vcpu, addr, error_code);
+ if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
- }
/* For APIC access vmexit */
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3370,11 +3445,12 @@ mmio:
trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
vcpu->mmio_needed = 1;
- vcpu->mmio_phys_addr = gpa;
- vcpu->mmio_size = bytes;
- vcpu->mmio_is_write = 0;
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+ vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
+ vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+ vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
- return X86EMUL_UNHANDLEABLE;
+ return X86EMUL_IO_NEEDED;
}
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -3392,17 +3468,15 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
static int emulator_write_emulated_onepage(unsigned long addr,
const void *val,
unsigned int bytes,
+ unsigned int *error_code,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
- u32 error_code;
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code);
- if (gpa == UNMAPPED_GVA) {
- kvm_inject_page_fault(vcpu, addr, error_code);
+ if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
- }
/* For APIC access vmexit */
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3420,10 +3494,11 @@ mmio:
return X86EMUL_CONTINUE;
vcpu->mmio_needed = 1;
- vcpu->mmio_phys_addr = gpa;
- vcpu->mmio_size = bytes;
- vcpu->mmio_is_write = 1;
- memcpy(vcpu->mmio_data, val, bytes);
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+ vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
+ vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+ vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
+ memcpy(vcpu->run->mmio.data, val, bytes);
return X86EMUL_CONTINUE;
}
@@ -3431,6 +3506,7 @@ mmio:
int emulator_write_emulated(unsigned long addr,
const void *val,
unsigned int bytes,
+ unsigned int *error_code,
struct kvm_vcpu *vcpu)
{
/* Crossing a page boundary? */
@@ -3438,16 +3514,17 @@ int emulator_write_emulated(unsigned long addr,
int rc, now;
now = -addr & ~PAGE_MASK;
- rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
+ rc = emulator_write_emulated_onepage(addr, val, now, error_code,
+ vcpu);
if (rc != X86EMUL_CONTINUE)
return rc;
addr += now;
val += now;
bytes -= now;
}
- return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
+ return emulator_write_emulated_onepage(addr, val, bytes, error_code,
+ vcpu);
}
-EXPORT_SYMBOL_GPL(emulator_write_emulated);
#define CMPXCHG_TYPE(t, ptr, old, new) \
(cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
@@ -3463,6 +3540,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
const void *old,
const void *new,
unsigned int bytes,
+ unsigned int *error_code,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
@@ -3484,6 +3562,10 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
goto emul_write;
page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ if (is_error_page(page)) {
+ kvm_release_page_clean(page);
+ goto emul_write;
+ }
kaddr = kmap_atomic(page, KM_USER0);
kaddr += offset_in_page(gpa);
@@ -3516,7 +3598,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
emul_write:
printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
- return emulator_write_emulated(addr, new, bytes, vcpu);
+ return emulator_write_emulated(addr, new, bytes, error_code, vcpu);
}
static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3604,42 +3686,38 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
return X86EMUL_CONTINUE;
}
-int emulate_clts(struct kvm_vcpu *vcpu)
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
{
- kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
- kvm_x86_ops->fpu_activate(vcpu);
+ if (!need_emulate_wbinvd(vcpu))
+ return X86EMUL_CONTINUE;
+
+ if (kvm_x86_ops->has_wbinvd_exit()) {
+ smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
+ wbinvd_ipi, NULL, 1);
+ cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
+ }
+ wbinvd();
return X86EMUL_CONTINUE;
}
+EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
-int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
+int emulate_clts(struct kvm_vcpu *vcpu)
{
- return kvm_get_dr(ctxt->vcpu, dr, dest);
+ kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+ kvm_x86_ops->fpu_activate(vcpu);
+ return X86EMUL_CONTINUE;
}
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
+int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu)
{
- unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
-
- return kvm_set_dr(ctxt->vcpu, dr, value & mask);
+ return _kvm_get_dr(vcpu, dr, dest);
}
-void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
+int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu)
{
- u8 opcodes[4];
- unsigned long rip = kvm_rip_read(vcpu);
- unsigned long rip_linear;
-
- if (!printk_ratelimit())
- return;
- rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
-
- kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
-
- printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
- context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
+ return __kvm_set_dr(vcpu, dr, value);
}
-EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
static u64 mk_cr_64(u64 curr_cr, u32 new_val)
{
@@ -3674,27 +3752,32 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
return value;
}
-static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
+static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
{
+ int res = 0;
+
switch (cr) {
case 0:
- kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
+ res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
break;
case 2:
vcpu->arch.cr2 = val;
break;
case 3:
- kvm_set_cr3(vcpu, val);
+ res = kvm_set_cr3(vcpu, val);
break;
case 4:
- kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+ res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
break;
case 8:
- kvm_set_cr8(vcpu, val & 0xfUL);
+ res = __kvm_set_cr8(vcpu, val & 0xfUL);
break;
default:
vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+ res = -1;
}
+
+ return res;
}
static int emulator_get_cpl(struct kvm_vcpu *vcpu)
@@ -3707,6 +3790,12 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
kvm_x86_ops->get_gdt(vcpu, dt);
}
+static unsigned long emulator_get_cached_segment_base(int seg,
+ struct kvm_vcpu *vcpu)
+{
+ return get_segment_base(vcpu, seg);
+}
+
static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
struct kvm_vcpu *vcpu)
{
@@ -3779,11 +3868,6 @@ static void emulator_set_segment_selector(u16 sel, int seg,
kvm_set_segment(vcpu, &kvm_seg, seg);
}
-static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
-{
- kvm_x86_ops->set_rflags(vcpu, rflags);
-}
-
static struct x86_emulate_ops emulate_ops = {
.read_std = kvm_read_guest_virt_system,
.write_std = kvm_write_guest_virt_system,
@@ -3797,11 +3881,15 @@ static struct x86_emulate_ops emulate_ops = {
.set_cached_descriptor = emulator_set_cached_descriptor,
.get_segment_selector = emulator_get_segment_selector,
.set_segment_selector = emulator_set_segment_selector,
+ .get_cached_segment_base = emulator_get_cached_segment_base,
.get_gdt = emulator_get_gdt,
.get_cr = emulator_get_cr,
.set_cr = emulator_set_cr,
.cpl = emulator_get_cpl,
- .set_rflags = emulator_set_rflags,
+ .get_dr = emulator_get_dr,
+ .set_dr = emulator_set_dr,
+ .set_msr = kvm_set_msr,
+ .get_msr = kvm_get_msr,
};
static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -3812,14 +3900,75 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)
vcpu->arch.regs_dirty = ~0;
}
+static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
+{
+ u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
+ /*
+ * an sti; sti; sequence only disable interrupts for the first
+ * instruction. So, if the last instruction, be it emulated or
+ * not, left the system with the INT_STI flag enabled, it
+ * means that the last instruction is an sti. We should not
+ * leave the flag on in this case. The same goes for mov ss
+ */
+ if (!(int_shadow & mask))
+ kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
+}
+
+static void inject_emulated_exception(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ if (ctxt->exception == PF_VECTOR)
+ kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code);
+ else if (ctxt->error_code_valid)
+ kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
+ else
+ kvm_queue_exception(vcpu, ctxt->exception);
+}
+
+static int handle_emulation_failure(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.insn_emulation_fail;
+ trace_kvm_emulate_insn_failed(vcpu);
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return EMULATE_FAIL;
+}
+
+static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ gpa_t gpa;
+
+ if (tdp_enabled)
+ return false;
+
+ /*
+ * if emulation was due to access to shadowed page table
+ * and it failed try to unshadow page and re-entetr the
+ * guest to let CPU execute the instruction.
+ */
+ if (kvm_mmu_unprotect_page_virt(vcpu, gva))
+ return true;
+
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+
+ if (gpa == UNMAPPED_GVA)
+ return true; /* let cpu generate fault */
+
+ if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
+ return true;
+
+ return false;
+}
+
int emulate_instruction(struct kvm_vcpu *vcpu,
unsigned long cr2,
u16 error_code,
int emulation_type)
{
- int r, shadow_mask;
- struct decode_cache *c;
- struct kvm_run *run = vcpu->run;
+ int r;
+ struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
kvm_clear_exception_queue(vcpu);
vcpu->arch.mmio_fault_cr2 = cr2;
@@ -3831,8 +3980,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
*/
cache_all_regs(vcpu);
- vcpu->mmio_is_write = 0;
-
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
int cs_db, cs_l;
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
@@ -3846,13 +3993,16 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
? X86EMUL_MODE_VM86 : cs_l
? X86EMUL_MODE_PROT64 : cs_db
? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+ memset(c, 0, sizeof(struct decode_cache));
+ memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+ vcpu->arch.emulate_ctxt.interruptibility = 0;
+ vcpu->arch.emulate_ctxt.exception = -1;
r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
trace_kvm_emulate_insn_start(vcpu);
/* Only allow emulation of specific instructions on #UD
* (namely VMMCALL, sysenter, sysexit, syscall)*/
- c = &vcpu->arch.emulate_ctxt.decode;
if (emulation_type & EMULTYPE_TRAP_UD) {
if (!c->twobyte)
return EMULATE_FAIL;
@@ -3880,11 +4030,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
++vcpu->stat.insn_emulation;
if (r) {
- ++vcpu->stat.insn_emulation_fail;
- trace_kvm_emulate_insn_failed(vcpu);
- if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
+ if (reexecute_instruction(vcpu, cr2))
return EMULATE_DONE;
- return EMULATE_FAIL;
+ if (emulation_type & EMULTYPE_SKIP)
+ return EMULATE_FAIL;
+ return handle_emulation_failure(vcpu);
}
}
@@ -3893,48 +4043,42 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
return EMULATE_DONE;
}
+ /* this is needed for vmware backdor interface to work since it
+ changes registers values during IO operation */
+ memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+
restart:
r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
- shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
- if (r == 0)
- kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
+ if (r) { /* emulation failed */
+ if (reexecute_instruction(vcpu, cr2))
+ return EMULATE_DONE;
- if (vcpu->arch.pio.count) {
- if (!vcpu->arch.pio.in)
- vcpu->arch.pio.count = 0;
- return EMULATE_DO_MMIO;
+ return handle_emulation_failure(vcpu);
}
- if (r || vcpu->mmio_is_write) {
- run->exit_reason = KVM_EXIT_MMIO;
- run->mmio.phys_addr = vcpu->mmio_phys_addr;
- memcpy(run->mmio.data, vcpu->mmio_data, 8);
- run->mmio.len = vcpu->mmio_size;
- run->mmio.is_write = vcpu->mmio_is_write;
+ toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
+ kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+ memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+ kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+
+ if (vcpu->arch.emulate_ctxt.exception >= 0) {
+ inject_emulated_exception(vcpu);
+ return EMULATE_DONE;
}
- if (r) {
- if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
- goto done;
- if (!vcpu->mmio_needed) {
- ++vcpu->stat.insn_emulation_fail;
- trace_kvm_emulate_insn_failed(vcpu);
- kvm_report_emulation_failure(vcpu, "mmio");
- return EMULATE_FAIL;
- }
+ if (vcpu->arch.pio.count) {
+ if (!vcpu->arch.pio.in)
+ vcpu->arch.pio.count = 0;
return EMULATE_DO_MMIO;
}
- if (vcpu->mmio_is_write) {
- vcpu->mmio_needed = 0;
+ if (vcpu->mmio_needed) {
+ if (vcpu->mmio_is_write)
+ vcpu->mmio_needed = 0;
return EMULATE_DO_MMIO;
}
-done:
- if (vcpu->arch.exception.pending)
- vcpu->arch.emulate_ctxt.restart = false;
-
if (vcpu->arch.emulate_ctxt.restart)
goto restart;
@@ -4108,6 +4252,9 @@ int kvm_arch_init(void *opaque)
perf_register_guest_info_callbacks(&kvm_guest_cbs);
+ if (cpu_has_xsave)
+ host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+
return 0;
out:
@@ -4270,7 +4417,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
kvm_x86_ops->patch_hypercall(vcpu, instruction);
- return emulator_write_emulated(rip, instruction, 3, vcpu);
+ return emulator_write_emulated(rip, instruction, 3, NULL, vcpu);
}
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
@@ -4506,59 +4653,78 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
}
}
+static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
+ !vcpu->guest_xcr0_loaded) {
+ /* kvm_set_xcr() also depends on this */
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
+ vcpu->guest_xcr0_loaded = 1;
+ }
+}
+
+static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->guest_xcr0_loaded) {
+ if (vcpu->arch.xcr0 != host_xcr0)
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
+ vcpu->guest_xcr0_loaded = 0;
+ }
+}
+
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
vcpu->run->request_interrupt_window;
- if (vcpu->requests)
- if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
- kvm_mmu_unload(vcpu);
-
- r = kvm_mmu_reload(vcpu);
- if (unlikely(r))
- goto out;
-
if (vcpu->requests) {
- if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
+ if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
+ kvm_mmu_unload(vcpu);
+ if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
__kvm_migrate_timers(vcpu);
- if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
+ if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu))
kvm_write_guest_time(vcpu);
- if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
+ if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
kvm_mmu_sync_roots(vcpu);
- if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
kvm_x86_ops->tlb_flush(vcpu);
- if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
- &vcpu->requests)) {
+ if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
r = 0;
goto out;
}
- if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
+ if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
r = 0;
goto out;
}
- if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) {
+ if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
vcpu->fpu_active = 0;
kvm_x86_ops->fpu_deactivate(vcpu);
}
}
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r))
+ goto out;
+
preempt_disable();
kvm_x86_ops->prepare_guest_switch(vcpu);
if (vcpu->fpu_active)
kvm_load_guest_fpu(vcpu);
+ kvm_load_guest_xcr0(vcpu);
- local_irq_disable();
+ atomic_set(&vcpu->guest_mode, 1);
+ smp_wmb();
- clear_bit(KVM_REQ_KICK, &vcpu->requests);
- smp_mb__after_clear_bit();
+ local_irq_disable();
- if (vcpu->requests || need_resched() || signal_pending(current)) {
- set_bit(KVM_REQ_KICK, &vcpu->requests);
+ if (!atomic_read(&vcpu->guest_mode) || vcpu->requests
+ || need_resched() || signal_pending(current)) {
+ atomic_set(&vcpu->guest_mode, 0);
+ smp_wmb();
local_irq_enable();
preempt_enable();
r = 1;
@@ -4603,7 +4769,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (hw_breakpoint_active())
hw_breakpoint_restore();
- set_bit(KVM_REQ_KICK, &vcpu->requests);
+ atomic_set(&vcpu->guest_mode, 0);
+ smp_wmb();
local_irq_enable();
++vcpu->stat.exits;
@@ -4665,7 +4832,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
- if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
+ if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
{
switch(vcpu->arch.mp_state) {
case KVM_MP_STATE_HALTED:
@@ -4717,8 +4884,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
int r;
sigset_t sigsaved;
- vcpu_load(vcpu);
-
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
@@ -4743,7 +4908,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- if (r == EMULATE_DO_MMIO) {
+ if (r != EMULATE_DONE) {
r = 0;
goto out;
}
@@ -4759,14 +4924,11 @@ out:
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &sigsaved, NULL);
- vcpu_put(vcpu);
return r;
}
int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
- vcpu_load(vcpu);
-
regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -4789,15 +4951,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
regs->rip = kvm_rip_read(vcpu);
regs->rflags = kvm_get_rflags(vcpu);
- vcpu_put(vcpu);
-
return 0;
}
int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
- vcpu_load(vcpu);
-
kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
@@ -4822,8 +4980,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
vcpu->arch.exception.pending = false;
- vcpu_put(vcpu);
-
return 0;
}
@@ -4842,8 +4998,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
{
struct desc_ptr dt;
- vcpu_load(vcpu);
-
kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
@@ -4875,32 +5029,27 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
set_bit(vcpu->arch.interrupt.nr,
(unsigned long *)sregs->interrupt_bitmap);
- vcpu_put(vcpu);
-
return 0;
}
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
- vcpu_load(vcpu);
mp_state->mp_state = vcpu->arch.mp_state;
- vcpu_put(vcpu);
return 0;
}
int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
- vcpu_load(vcpu);
vcpu->arch.mp_state = mp_state->mp_state;
- vcpu_put(vcpu);
return 0;
}
int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
bool has_error_code, u32 error_code)
{
+ struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
int cs_db, cs_l, ret;
cache_all_regs(vcpu);
@@ -4915,6 +5064,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
? X86EMUL_MODE_VM86 : cs_l
? X86EMUL_MODE_PROT64 : cs_db
? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+ memset(c, 0, sizeof(struct decode_cache));
+ memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
tss_selector, reason, has_error_code,
@@ -4923,6 +5074,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
if (ret)
return EMULATE_FAIL;
+ memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+ kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
return EMULATE_DONE;
}
@@ -4935,8 +5088,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
int pending_vec, max_bits;
struct desc_ptr dt;
- vcpu_load(vcpu);
-
dt.size = sregs->idt.limit;
dt.address = sregs->idt.base;
kvm_x86_ops->set_idt(vcpu, &dt);
@@ -4996,8 +5147,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
!is_protmode(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- vcpu_put(vcpu);
-
return 0;
}
@@ -5007,12 +5156,10 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
unsigned long rflags;
int i, r;
- vcpu_load(vcpu);
-
if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
r = -EBUSY;
if (vcpu->arch.exception.pending)
- goto unlock_out;
+ goto out;
if (dbg->control & KVM_GUESTDBG_INJECT_DB)
kvm_queue_exception(vcpu, DB_VECTOR);
else
@@ -5054,34 +5201,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
r = 0;
-unlock_out:
- vcpu_put(vcpu);
+out:
return r;
}
/*
- * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
- * we have asm/x86/processor.h
- */
-struct fxsave {
- u16 cwd;
- u16 swd;
- u16 twd;
- u16 fop;
- u64 rip;
- u64 rdp;
- u32 mxcsr;
- u32 mxcsr_mask;
- u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
-#ifdef CONFIG_X86_64
- u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
-#else
- u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
-#endif
-};
-
-/*
* Translate a guest virtual address to a guest physical address.
*/
int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
@@ -5091,7 +5216,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
gpa_t gpa;
int idx;
- vcpu_load(vcpu);
idx = srcu_read_lock(&vcpu->kvm->srcu);
gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -5099,16 +5223,14 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
tr->valid = gpa != UNMAPPED_GVA;
tr->writeable = 1;
tr->usermode = 0;
- vcpu_put(vcpu);
return 0;
}
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
-
- vcpu_load(vcpu);
+ struct i387_fxsave_struct *fxsave =
+ &vcpu->arch.guest_fpu.state->fxsave;
memcpy(fpu->fpr, fxsave->st_space, 128);
fpu->fcw = fxsave->cwd;
@@ -5119,16 +5241,13 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
fpu->last_dp = fxsave->rdp;
memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
- vcpu_put(vcpu);
-
return 0;
}
int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
-
- vcpu_load(vcpu);
+ struct i387_fxsave_struct *fxsave =
+ &vcpu->arch.guest_fpu.state->fxsave;
memcpy(fxsave->st_space, fpu->fpr, 128);
fxsave->cwd = fpu->fcw;
@@ -5139,61 +5258,63 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
fxsave->rdp = fpu->last_dp;
memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
- vcpu_put(vcpu);
-
return 0;
}
-void fx_init(struct kvm_vcpu *vcpu)
+int fx_init(struct kvm_vcpu *vcpu)
{
- unsigned after_mxcsr_mask;
+ int err;
+
+ err = fpu_alloc(&vcpu->arch.guest_fpu);
+ if (err)
+ return err;
+
+ fpu_finit(&vcpu->arch.guest_fpu);
/*
- * Touch the fpu the first time in non atomic context as if
- * this is the first fpu instruction the exception handler
- * will fire before the instruction returns and it'll have to
- * allocate ram with GFP_KERNEL.
+ * Ensure guest xcr0 is valid for loading
*/
- if (!used_math())
- kvm_fx_save(&vcpu->arch.host_fx_image);
-
- /* Initialize guest FPU by resetting ours and saving into guest's */
- preempt_disable();
- kvm_fx_save(&vcpu->arch.host_fx_image);
- kvm_fx_finit();
- kvm_fx_save(&vcpu->arch.guest_fx_image);
- kvm_fx_restore(&vcpu->arch.host_fx_image);
- preempt_enable();
+ vcpu->arch.xcr0 = XSTATE_FP;
vcpu->arch.cr0 |= X86_CR0_ET;
- after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
- vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
- memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
- 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
+
+ return 0;
}
EXPORT_SYMBOL_GPL(fx_init);
+static void fx_free(struct kvm_vcpu *vcpu)
+{
+ fpu_free(&vcpu->arch.guest_fpu);
+}
+
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
if (vcpu->guest_fpu_loaded)
return;
+ /*
+ * Restore all possible states in the guest,
+ * and assume host would use all available bits.
+ * Guest xcr0 would be loaded later.
+ */
+ kvm_put_guest_xcr0(vcpu);
vcpu->guest_fpu_loaded = 1;
- kvm_fx_save(&vcpu->arch.host_fx_image);
- kvm_fx_restore(&vcpu->arch.guest_fx_image);
+ unlazy_fpu(current);
+ fpu_restore_checking(&vcpu->arch.guest_fpu);
trace_kvm_fpu(1);
}
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
+ kvm_put_guest_xcr0(vcpu);
+
if (!vcpu->guest_fpu_loaded)
return;
vcpu->guest_fpu_loaded = 0;
- kvm_fx_save(&vcpu->arch.guest_fx_image);
- kvm_fx_restore(&vcpu->arch.host_fx_image);
+ fpu_save_init(&vcpu->arch.guest_fpu);
++vcpu->stat.fpu_reload;
- set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
+ kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
trace_kvm_fpu(0);
}
@@ -5204,6 +5325,8 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
vcpu->arch.time_page = NULL;
}
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
+ fx_free(vcpu);
kvm_x86_ops->vcpu_free(vcpu);
}
@@ -5217,9 +5340,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
int r;
- /* We do fxsave: this must be aligned. */
- BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
-
vcpu->arch.mtrr_state.have_fixed = 1;
vcpu_load(vcpu);
r = kvm_arch_vcpu_reset(vcpu);
@@ -5241,6 +5361,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);
+ fx_free(vcpu);
kvm_x86_ops->vcpu_free(vcpu);
}
@@ -5334,7 +5455,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
}
vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+ if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
+ goto fail_free_mce_banks;
+
return 0;
+fail_free_mce_banks:
+ kfree(vcpu->arch.mce_banks);
fail_free_lapic:
kvm_free_lapic(vcpu);
fail_mmu_destroy:
@@ -5364,12 +5490,6 @@ struct kvm *kvm_arch_create_vm(void)
if (!kvm)
return ERR_PTR(-ENOMEM);
- kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
- if (!kvm->arch.aliases) {
- kfree(kvm);
- return ERR_PTR(-ENOMEM);
- }
-
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
@@ -5412,12 +5532,12 @@ static void kvm_free_vcpus(struct kvm *kvm)
void kvm_arch_sync_events(struct kvm *kvm)
{
kvm_free_all_assigned_devices(kvm);
+ kvm_free_pit(kvm);
}
void kvm_arch_destroy_vm(struct kvm *kvm)
{
kvm_iommu_unmap_guest(kvm);
- kvm_free_pit(kvm);
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
kvm_free_vcpus(kvm);
@@ -5427,7 +5547,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
if (kvm->arch.ept_identity_pagetable)
put_page(kvm->arch.ept_identity_pagetable);
cleanup_srcu_struct(&kvm->srcu);
- kfree(kvm->arch.aliases);
kfree(kvm);
}
@@ -5438,6 +5557,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
int user_alloc)
{
int npages = memslot->npages;
+ int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
+
+ /* Prevent internal slot pages from being moved by fork()/COW. */
+ if (memslot->id >= KVM_MEMORY_SLOTS)
+ map_flags = MAP_SHARED | MAP_ANONYMOUS;
/*To keep backward compatibility with older userspace,
*x86 needs to hanlde !user_alloc case.
@@ -5450,7 +5574,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
userspace_addr = do_mmap(NULL, 0,
npages * PAGE_SIZE,
PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS,
+ map_flags,
0);
up_write(&current->mm->mmap_sem);
@@ -5523,7 +5647,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
me = get_cpu();
if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
- if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
+ if (atomic_xchg(&vcpu->guest_mode, 0))
smp_send_reschedule(cpu);
put_cpu();
}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f4b54458285b..b7a404722d2b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -65,13 +65,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
}
-static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm)
-{
- return rcu_dereference_check(kvm->arch.aliases,
- srcu_read_lock_held(&kvm->srcu)
- || lockdep_is_held(&kvm->slots_lock));
-}
-
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index f871e04b6965..e10cf070ede0 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -30,6 +30,7 @@ ifeq ($(CONFIG_X86_32),y)
lib-y += checksum_32.o
lib-y += strstr_32.o
lib-y += semaphore_32.o string_32.o
+ lib-y += cmpxchg.o
ifneq ($(CONFIG_X86_CMPXCHG64),y)
lib-y += cmpxchg8b_emu.o atomic64_386_32.o
endif
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index 4a5979aa6883..2cda60a06e65 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -25,150 +25,172 @@
CFI_ADJUST_CFA_OFFSET -4
.endm
-.macro BEGIN func reg
-$v = \reg
-
-ENTRY(atomic64_\func\()_386)
- CFI_STARTPROC
- LOCK $v
-
-.macro RETURN
- UNLOCK $v
+#define BEGIN(op) \
+.macro endp; \
+ CFI_ENDPROC; \
+ENDPROC(atomic64_##op##_386); \
+.purgem endp; \
+.endm; \
+ENTRY(atomic64_##op##_386); \
+ CFI_STARTPROC; \
+ LOCK v;
+
+#define ENDP endp
+
+#define RET \
+ UNLOCK v; \
ret
-.endm
-
-.macro END_
- CFI_ENDPROC
-ENDPROC(atomic64_\func\()_386)
-.purgem RETURN
-.purgem END_
-.purgem END
-.endm
-
-.macro END
-RETURN
-END_
-.endm
-.endm
-BEGIN read %ecx
- movl ($v), %eax
- movl 4($v), %edx
-END
-
-BEGIN set %esi
- movl %ebx, ($v)
- movl %ecx, 4($v)
-END
-
-BEGIN xchg %esi
- movl ($v), %eax
- movl 4($v), %edx
- movl %ebx, ($v)
- movl %ecx, 4($v)
-END
-
-BEGIN add %ecx
- addl %eax, ($v)
- adcl %edx, 4($v)
-END
-
-BEGIN add_return %ecx
- addl ($v), %eax
- adcl 4($v), %edx
- movl %eax, ($v)
- movl %edx, 4($v)
-END
-
-BEGIN sub %ecx
- subl %eax, ($v)
- sbbl %edx, 4($v)
-END
-
-BEGIN sub_return %ecx
+#define RET_ENDP \
+ RET; \
+ ENDP
+
+#define v %ecx
+BEGIN(read)
+ movl (v), %eax
+ movl 4(v), %edx
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(set)
+ movl %ebx, (v)
+ movl %ecx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(xchg)
+ movl (v), %eax
+ movl 4(v), %edx
+ movl %ebx, (v)
+ movl %ecx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %ecx
+BEGIN(add)
+ addl %eax, (v)
+ adcl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %ecx
+BEGIN(add_return)
+ addl (v), %eax
+ adcl 4(v), %edx
+ movl %eax, (v)
+ movl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %ecx
+BEGIN(sub)
+ subl %eax, (v)
+ sbbl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %ecx
+BEGIN(sub_return)
negl %edx
negl %eax
sbbl $0, %edx
- addl ($v), %eax
- adcl 4($v), %edx
- movl %eax, ($v)
- movl %edx, 4($v)
-END
-
-BEGIN inc %esi
- addl $1, ($v)
- adcl $0, 4($v)
-END
-
-BEGIN inc_return %esi
- movl ($v), %eax
- movl 4($v), %edx
+ addl (v), %eax
+ adcl 4(v), %edx
+ movl %eax, (v)
+ movl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(inc)
+ addl $1, (v)
+ adcl $0, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(inc_return)
+ movl (v), %eax
+ movl 4(v), %edx
addl $1, %eax
adcl $0, %edx
- movl %eax, ($v)
- movl %edx, 4($v)
-END
-
-BEGIN dec %esi
- subl $1, ($v)
- sbbl $0, 4($v)
-END
-
-BEGIN dec_return %esi
- movl ($v), %eax
- movl 4($v), %edx
+ movl %eax, (v)
+ movl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(dec)
+ subl $1, (v)
+ sbbl $0, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(dec_return)
+ movl (v), %eax
+ movl 4(v), %edx
subl $1, %eax
sbbl $0, %edx
- movl %eax, ($v)
- movl %edx, 4($v)
-END
+ movl %eax, (v)
+ movl %edx, 4(v)
+RET_ENDP
+#undef v
-BEGIN add_unless %ecx
+#define v %ecx
+BEGIN(add_unless)
addl %eax, %esi
adcl %edx, %edi
- addl ($v), %eax
- adcl 4($v), %edx
+ addl (v), %eax
+ adcl 4(v), %edx
cmpl %eax, %esi
je 3f
1:
- movl %eax, ($v)
- movl %edx, 4($v)
+ movl %eax, (v)
+ movl %edx, 4(v)
movl $1, %eax
2:
-RETURN
+ RET
3:
cmpl %edx, %edi
jne 1b
xorl %eax, %eax
jmp 2b
-END_
+ENDP
+#undef v
-BEGIN inc_not_zero %esi
- movl ($v), %eax
- movl 4($v), %edx
+#define v %esi
+BEGIN(inc_not_zero)
+ movl (v), %eax
+ movl 4(v), %edx
testl %eax, %eax
je 3f
1:
addl $1, %eax
adcl $0, %edx
- movl %eax, ($v)
- movl %edx, 4($v)
+ movl %eax, (v)
+ movl %edx, 4(v)
movl $1, %eax
2:
-RETURN
+ RET
3:
testl %edx, %edx
jne 1b
jmp 2b
-END_
+ENDP
+#undef v
-BEGIN dec_if_positive %esi
- movl ($v), %eax
- movl 4($v), %edx
+#define v %esi
+BEGIN(dec_if_positive)
+ movl (v), %eax
+ movl 4(v), %edx
subl $1, %eax
sbbl $0, %edx
js 1f
- movl %eax, ($v)
- movl %edx, 4($v)
+ movl %eax, (v)
+ movl %edx, 4(v)
1:
-END
+RET_ENDP
+#undef v
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index ebeafcce04a9..aa4326bfb24a 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -52,7 +52,7 @@ ENDPROC(clear_page)
.align 8
.quad clear_page
.quad 1b
- .byte X86_FEATURE_REP_GOOD
+ .word X86_FEATURE_REP_GOOD
.byte .Lclear_page_end - clear_page
.byte 2b - 1b
.previous
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/lib/cmpxchg.c
index 2056ccf572cc..5d619f6df3ee 100644
--- a/arch/x86/kernel/cpu/cmpxchg.c
+++ b/arch/x86/lib/cmpxchg.c
@@ -52,21 +52,3 @@ unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
}
EXPORT_SYMBOL(cmpxchg_386_u32);
#endif
-
-#ifndef CONFIG_X86_CMPXCHG64
-unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
-{
- u64 prev;
- unsigned long flags;
-
- /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
- local_irq_save(flags);
- prev = *(u64 *)ptr;
- if (prev == old)
- *(u64 *)ptr = new;
- local_irq_restore(flags);
- return prev;
-}
-EXPORT_SYMBOL(cmpxchg_486_u64);
-#endif
-
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 727a5d46d2fc..6fec2d1cebe1 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -113,7 +113,7 @@ ENDPROC(copy_page)
.align 8
.quad copy_page
.quad 1b
- .byte X86_FEATURE_REP_GOOD
+ .word X86_FEATURE_REP_GOOD
.byte .Lcopy_page_end - copy_page
.byte 2b - 1b
.previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 71100c98e337..a460158b5ac5 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -29,7 +29,7 @@
.align 8
.quad 0b
.quad 2b
- .byte \feature /* when feature is set */
+ .word \feature /* when feature is set */
.byte 5
.byte 5
.previous
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index f82e884928af..bcbcd1e0f7d5 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -131,7 +131,7 @@ ENDPROC(__memcpy)
.align 8
.quad memcpy
.quad .Lmemcpy_c
- .byte X86_FEATURE_REP_GOOD
+ .word X86_FEATURE_REP_GOOD
/*
* Replace only beginning, memcpy is used to apply alternatives,
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index e88d3b81644a..09d344269652 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -121,7 +121,7 @@ ENDPROC(__memset)
.align 8
.quad memset
.quad .Lmemset_c
- .byte X86_FEATURE_REP_GOOD
+ .word X86_FEATURE_REP_GOOD
.byte .Lfinal - memset
.byte .Lmemset_e - .Lmemset_c
.previous
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index a725b7f760ae..0002a3a33081 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -37,6 +37,28 @@ struct addr_marker {
const char *name;
};
+/* indices for address_markers; keep sync'd w/ address_markers below */
+enum address_markers_idx {
+ USER_SPACE_NR = 0,
+#ifdef CONFIG_X86_64
+ KERNEL_SPACE_NR,
+ LOW_KERNEL_NR,
+ VMALLOC_START_NR,
+ VMEMMAP_START_NR,
+ HIGH_KERNEL_NR,
+ MODULES_VADDR_NR,
+ MODULES_END_NR,
+#else
+ KERNEL_SPACE_NR,
+ VMALLOC_START_NR,
+ VMALLOC_END_NR,
+# ifdef CONFIG_HIGHMEM
+ PKMAP_BASE_NR,
+# endif
+ FIXADDR_START_NR,
+#endif
+};
+
/* Address space markers hints */
static struct addr_marker address_markers[] = {
{ 0, "User Space" },
@@ -331,14 +353,12 @@ static int pt_dump_init(void)
#ifdef CONFIG_X86_32
/* Not a compile-time constant on x86-32 */
- address_markers[2].start_address = VMALLOC_START;
- address_markers[3].start_address = VMALLOC_END;
+ address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
+ address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
# ifdef CONFIG_HIGHMEM
- address_markers[4].start_address = PKMAP_BASE;
- address_markers[5].start_address = FIXADDR_START;
-# else
- address_markers[4].start_address = FIXADDR_START;
+ address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
# endif
+ address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
#endif
pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL,
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f62777940dfb..4c4508e8a204 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -802,8 +802,10 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
up_read(&mm->mmap_sem);
/* Kernel mode? Handle exceptions or die: */
- if (!(error_code & PF_USER))
+ if (!(error_code & PF_USER)) {
no_context(regs, error_code, address);
+ return;
+ }
/* User-space => ok to do another page fault: */
if (is_prefetch(regs, error_code, address))
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 63a6ba66cbe0..5e8fa12ef861 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -53,7 +53,7 @@ void *kmap_atomic(struct page *page, enum km_type type)
return kmap_atomic_prot(page, type, kmap_prot);
}
-void kunmap_atomic(void *kvaddr, enum km_type type)
+void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type)
{
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
@@ -102,7 +102,7 @@ struct page *kmap_atomic_to_page(void *ptr)
EXPORT_SYMBOL(kmap);
EXPORT_SYMBOL(kunmap);
EXPORT_SYMBOL(kmap_atomic);
-EXPORT_SYMBOL(kunmap_atomic);
+EXPORT_SYMBOL(kunmap_atomic_notypecheck);
EXPORT_SYMBOL(kmap_atomic_prot);
EXPORT_SYMBOL(kmap_atomic_to_page);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ee41bba315d1..9a6674689a20 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -2,7 +2,7 @@
* linux/arch/x86_64/mm/init.c
*
* Copyright (C) 1995 Linus Torvalds
- * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2000 Pavel Machek <pavel@ucw.cz>
* Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
*/
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 12e4d2d3c110..3ba6e0608c55 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -62,8 +62,8 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
static void __iomem *__ioremap_caller(resource_size_t phys_addr,
unsigned long size, unsigned long prot_val, void *caller)
{
- unsigned long pfn, offset, vaddr;
- resource_size_t last_addr;
+ unsigned long offset, vaddr;
+ resource_size_t pfn, last_pfn, last_addr;
const resource_size_t unaligned_phys_addr = phys_addr;
const unsigned long unaligned_size = size;
struct vm_struct *area;
@@ -100,10 +100,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
- for (pfn = phys_addr >> PAGE_SHIFT;
- (pfn << PAGE_SHIFT) < (last_addr & PAGE_MASK);
- pfn++) {
-
+ last_pfn = last_addr >> PAGE_SHIFT;
+ for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) {
int is_ram = page_is_ram(pfn);
if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
@@ -115,7 +113,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
* Mappings have to be page-aligned
*/
offset = phys_addr & ~PAGE_MASK;
- phys_addr &= PAGE_MASK;
+ phys_addr &= PHYSICAL_PAGE_MASK;
size = PAGE_ALIGN(last_addr+1) - phys_addr;
retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
@@ -613,7 +611,7 @@ void __init early_iounmap(void __iomem *addr, unsigned long size)
return;
}
offset = virt_addr & ~PAGE_MASK;
- nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
+ nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
while (nrpages > 0) {
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 5d0e67fff1a6..e5d5e2ce9f77 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -45,6 +45,8 @@ struct kmmio_fault_page {
* Protected by kmmio_lock, when linked into kmmio_page_table.
*/
int count;
+
+ bool scheduled_for_release;
};
struct kmmio_delayed_release {
@@ -398,8 +400,11 @@ static void release_kmmio_fault_page(unsigned long page,
BUG_ON(f->count < 0);
if (!f->count) {
disarm_kmmio_fault_page(f);
- f->release_next = *release_list;
- *release_list = f;
+ if (!f->scheduled_for_release) {
+ f->release_next = *release_list;
+ *release_list = f;
+ f->scheduled_for_release = true;
+ }
}
}
@@ -471,8 +476,10 @@ static void remove_kmmio_fault_pages(struct rcu_head *head)
prevp = &f->release_next;
} else {
*prevp = f->release_next;
+ f->release_next = NULL;
+ f->scheduled_for_release = false;
}
- f = f->release_next;
+ f = *prevp;
}
spin_unlock_irqrestore(&kmmio_lock, flags);
@@ -510,6 +517,9 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
kmmio_count--;
spin_unlock_irqrestore(&kmmio_lock, flags);
+ if (!release_list)
+ return;
+
drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
if (!drelease) {
pr_crit("leaking kmmio_fault_page objects.\n");
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 64121a18b8cb..f6ff57b7efa5 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -158,7 +158,7 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
return req_type;
}
-static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
+static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
{
int ram_page = 0, not_rampage = 0;
unsigned long page_nr;
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index 308e32570d84..38e6d174c497 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -40,16 +40,16 @@ static unsigned char prefix_codes[] = {
static unsigned int reg_rop[] = {
0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
};
-static unsigned int reg_wop[] = { 0x88, 0x89 };
+static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB };
static unsigned int imm_wop[] = { 0xC6, 0xC7 };
/* IA32 Manual 3, 3-432*/
-static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 };
+static unsigned int rw8[] = { 0x88, 0x8A, 0xC6, 0xAA };
static unsigned int rw32[] = {
- 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
+ 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB
};
-static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F };
+static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F, 0xAA };
static unsigned int mw16[] = { 0xB70F, 0xBF0F };
-static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 };
+static unsigned int mw32[] = { 0x89, 0x8B, 0xC7, 0xAB };
static unsigned int mw64[] = {};
#else /* not __i386__ */
static unsigned char prefix_codes[] = {
@@ -63,20 +63,20 @@ static unsigned char prefix_codes[] = {
static unsigned int reg_rop[] = {
0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
};
-static unsigned int reg_wop[] = { 0x88, 0x89 };
+static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB };
static unsigned int imm_wop[] = { 0xC6, 0xC7 };
-static unsigned int rw8[] = { 0xC6, 0x88, 0x8A };
+static unsigned int rw8[] = { 0xC6, 0x88, 0x8A, 0xAA };
static unsigned int rw32[] = {
- 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
+ 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB
};
/* 8 bit only */
-static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F };
+static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F, 0xAA };
/* 16 bit only */
static unsigned int mw16[] = { 0xB70F, 0xBF0F };
/* 16 or 32 bit */
static unsigned int mw32[] = { 0xC7 };
/* 16, 32 or 64 bit */
-static unsigned int mw64[] = { 0x89, 0x8B };
+static unsigned int mw64[] = { 0x89, 0x8B, 0xAB };
#endif /* not __i386__ */
struct prefix_bits {
@@ -410,7 +410,6 @@ static unsigned long *get_reg_w32(int no, struct pt_regs *regs)
unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
{
unsigned int opcode;
- unsigned char mod_rm;
int reg;
unsigned char *p;
struct prefix_bits prf;
@@ -437,8 +436,13 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
goto err;
do_work:
- mod_rm = *p;
- reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3);
+ /* for STOS, source register is fixed */
+ if (opcode == 0xAA || opcode == 0xAB) {
+ reg = arg_AX;
+ } else {
+ unsigned char mod_rm = *p;
+ reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3);
+ }
switch (get_ins_reg_width(ins_addr)) {
case 1:
return *get_reg_w8(reg, prf.rex, regs);
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index 8565d944f7cf..38868adf07ea 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -90,6 +90,27 @@ static void do_test(unsigned long size)
iounmap(p);
}
+/*
+ * Tests how mmiotrace behaves in face of multiple ioremap / iounmaps in
+ * a short time. We had a bug in deferred freeing procedure which tried
+ * to free this region multiple times (ioremap can reuse the same address
+ * for many mappings).
+ */
+static void do_test_bulk_ioremapping(void)
+{
+ void __iomem *p;
+ int i;
+
+ for (i = 0; i < 10; ++i) {
+ p = ioremap_nocache(mmio_address, PAGE_SIZE);
+ if (p)
+ iounmap(p);
+ }
+
+ /* Force freeing. If it will crash we will know why. */
+ synchronize_rcu();
+}
+
static int __init init(void)
{
unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
@@ -104,6 +125,7 @@ static int __init init(void)
"and writing 16 kB of rubbish in there.\n",
size >> 10, mmio_address);
do_test(size);
+ do_test_bulk_ioremapping();
pr_info("All done.\n");
return 0;
}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 426f3a1a64d3..c03f14ab6667 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -278,11 +278,9 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
static void do_flush_tlb_all(void *info)
{
- unsigned long cpu = smp_processor_id();
-
__flush_tlb_all();
if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
- leave_mm(cpu);
+ leave_mm(smp_processor_id());
}
void flush_tlb_all(void)
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index b28d2f1253bb..f6b48f6c5951 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -634,6 +634,18 @@ static int __init ppro_init(char **cpu_type)
if (force_arch_perfmon && cpu_has_arch_perfmon)
return 0;
+ /*
+ * Documentation on identifying Intel processors by CPU family
+ * and model can be found in the Intel Software Developer's
+ * Manuals (SDM):
+ *
+ * http://www.intel.com/products/processor/manuals/
+ *
+ * As of May 2010 the documentation for this was in the:
+ * "Intel 64 and IA-32 Architectures Software Developer's
+ * Manual Volume 3B: System Programming Guide", "Table B-1
+ * CPUID Signature Values of DisplayFamily_DisplayModel".
+ */
switch (cpu_model) {
case 0 ... 2:
*cpu_type = "i386/ppro";
@@ -655,12 +667,13 @@ static int __init ppro_init(char **cpu_type)
case 15: case 23:
*cpu_type = "i386/core_2";
break;
+ case 0x1a:
+ case 0x1e:
case 0x2e:
- case 26:
spec = &op_arch_perfmon_spec;
*cpu_type = "i386/core_i7";
break;
- case 28:
+ case 0x1c:
*cpu_type = "i386/atom";
break;
default:
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 2ec04c424a62..15466c096ba5 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -34,6 +34,15 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = {
DMI_MATCH(DMI_PRODUCT_NAME, "x3800"),
},
},
+ /* https://bugzilla.kernel.org/show_bug.cgi?id=16007 */
+ /* 2006 AMD HT/VIA system with two host bridges */
+ {
+ .callback = set_use_crs,
+ .ident = "ASRock ALiveSATA2-GLAN",
+ .matches = {
+ DMI_MATCH(DMI_PRODUCT_NAME, "ALiveSATA2-GLAN"),
+ },
+ },
{}
};
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 215a27ae050d..a0772af64efb 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -125,6 +125,23 @@ void __init dmi_check_skip_isa_align(void)
static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev)
{
struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE];
+ struct resource *bar_r;
+ int bar;
+
+ if (pci_probe & PCI_NOASSIGN_BARS) {
+ /*
+ * If the BIOS did not assign the BAR, zero out the
+ * resource so the kernel doesn't attmept to assign
+ * it later on in pci_assign_unassigned_resources
+ */
+ for (bar = 0; bar <= PCI_STD_RESOURCE_END; bar++) {
+ bar_r = &dev->resource[bar];
+ if (bar_r->start == 0 && bar_r->end != 0) {
+ bar_r->flags = 0;
+ bar_r->end = 0;
+ }
+ }
+ }
if (pci_probe & PCI_NOASSIGN_ROMS) {
if (rom_r->parent)
@@ -509,6 +526,9 @@ char * __devinit pcibios_setup(char *str)
} else if (!strcmp(str, "norom")) {
pci_probe |= PCI_NOASSIGN_ROMS;
return NULL;
+ } else if (!strcmp(str, "nobar")) {
+ pci_probe |= PCI_NOASSIGN_BARS;
+ return NULL;
} else if (!strcmp(str, "assign-busses")) {
pci_probe |= PCI_ASSIGN_ALL_BUSSES;
return NULL;
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 9810a0f76c91..f547ee05f715 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -989,7 +989,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin - 1, irq);
/* Update IRQ for all devices with the same pirq value */
- while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
+ for_each_pci_dev(dev2) {
pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
if (!pin)
continue;
@@ -1028,7 +1028,7 @@ void __init pcibios_fixup_irqs(void)
u8 pin;
DBG(KERN_DEBUG "PCI: IRQ fixup\n");
- while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+ for_each_pci_dev(dev) {
/*
* If the BIOS has set an out of range IRQ number, just
* ignore it. Also keep track of which IRQ's are
@@ -1052,7 +1052,7 @@ void __init pcibios_fixup_irqs(void)
return;
dev = NULL;
- while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+ for_each_pci_dev(dev) {
pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
if (!pin)
continue;
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 8d460eaf524f..c89266be6048 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -36,7 +36,7 @@ int __init pci_legacy_init(void)
return 0;
}
-void pcibios_scan_specific_bus(int busn)
+void __devinit pcibios_scan_specific_bus(int busn)
{
int devfn;
long node;
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 1290ba54b350..e7e8c5f54956 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -4,7 +4,7 @@
* Distribute under GPLv2
*
* Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl>
- * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
+ * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz>
* Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
*/
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index d24f983ba1e5..460f314d13e5 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -4,7 +4,7 @@
* Distribute under GPLv2
*
* Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl>
- * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
+ * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz>
* Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
*/
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 6b4ffedb93c9..4a2afa1bac51 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -120,7 +120,8 @@ $(obj)/vdso32-syms.lds: $(vdso32.so-y:%=$(obj)/vdso32-%-syms.lds) FORCE
quiet_cmd_vdso = VDSO $@
cmd_vdso = $(CC) -nostdlib -o $@ \
$(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
- -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^)
+ -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \
+ sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
GCOV_PROFILE := n
diff --git a/arch/x86/vdso/checkundef.sh b/arch/x86/vdso/checkundef.sh
new file mode 100755
index 000000000000..7ee90a9b549d
--- /dev/null
+++ b/arch/x86/vdso/checkundef.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+nm="$1"
+file="$2"
+$nm "$file" | grep '^ *U' > /dev/null 2>&1
+if [ $? -eq 1 ]; then
+ exit 0
+else
+ echo "$file: undefined symbols found" >&2
+ exit 1
+fi
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 02b442e92007..36df991985b2 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -374,7 +374,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
#ifdef CONFIG_X86_64
-__initcall(sysenter_setup);
+subsys_initcall(sysenter_setup);
#ifdef CONFIG_SYSCTL
/* Register vsyscall32 into the ABI table */
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index ac74869b8140..4b5d26f108bb 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -67,6 +67,7 @@ static int __init init_vdso_vars(void)
*(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x;
#include "vextern.h"
#undef VEXTERN
+ vunmap(vbase);
return 0;
oom:
@@ -74,7 +75,7 @@ static int __init init_vdso_vars(void)
vdso_enabled = 0;
return -ENOMEM;
}
-__initcall(init_vdso_vars);
+subsys_initcall(init_vdso_vars);
struct linux_binprm;
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index b83e119fbeb0..68128a1b401a 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -13,6 +13,11 @@ config XEN
kernel to boot in a paravirtualized environment under the
Xen hypervisor.
+config XEN_PVHVM
+ def_bool y
+ depends on XEN
+ depends on X86_LOCAL_APIC
+
config XEN_MAX_DOMAIN_MEMORY
int "Maximum allowed size of a domain in gigabytes"
default 8 if X86_32
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 3bb4fc21f4f2..779385158915 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -12,9 +12,10 @@ CFLAGS_mmu.o := $(nostackp)
obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
- grant-table.o suspend.o
+ grant-table.o suspend.o platform-pci-unplug.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
+obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 65d8d79b46a8..7d46c8441418 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -11,6 +11,7 @@
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
+#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/smp.h>
@@ -35,8 +36,10 @@
#include <xen/interface/version.h>
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
+#include <xen/interface/memory.h>
#include <xen/features.h>
#include <xen/page.h>
+#include <xen/hvm.h>
#include <xen/hvc-console.h>
#include <asm/paravirt.h>
@@ -55,7 +58,9 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/reboot.h>
+#include <asm/setup.h>
#include <asm/stackprotector.h>
+#include <asm/hypervisor.h>
#include "xen-ops.h"
#include "mmu.h"
@@ -76,6 +81,10 @@ struct shared_info xen_dummy_shared_info;
void *xen_initial_gdt;
+RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);
+__read_mostly int xen_have_vector_callback;
+EXPORT_SYMBOL_GPL(xen_have_vector_callback);
+
/*
* Point at some empty memory to start with. We map the real shared_info
* page as soon as fixmap is up and running.
@@ -97,6 +106,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
*/
static int have_vcpu_info_placement = 1;
+static void clamp_max_cpus(void)
+{
+#ifdef CONFIG_SMP
+ if (setup_max_cpus > MAX_VIRT_CPUS)
+ setup_max_cpus = MAX_VIRT_CPUS;
+#endif
+}
+
static void xen_vcpu_setup(int cpu)
{
struct vcpu_register_vcpu_info info;
@@ -104,13 +121,17 @@ static void xen_vcpu_setup(int cpu)
struct vcpu_info *vcpup;
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
- per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
- if (!have_vcpu_info_placement)
- return; /* already tested, not available */
+ if (cpu < MAX_VIRT_CPUS)
+ per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
- vcpup = &per_cpu(xen_vcpu_info, cpu);
+ if (!have_vcpu_info_placement) {
+ if (cpu >= MAX_VIRT_CPUS)
+ clamp_max_cpus();
+ return;
+ }
+ vcpup = &per_cpu(xen_vcpu_info, cpu);
info.mfn = arbitrary_virt_to_mfn(vcpup);
info.offset = offset_in_page(vcpup);
@@ -125,6 +146,7 @@ static void xen_vcpu_setup(int cpu)
if (err) {
printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
have_vcpu_info_placement = 0;
+ clamp_max_cpus();
} else {
/* This cpu is using the registered vcpu info, even if
later ones fail to. */
@@ -731,7 +753,6 @@ static void set_xen_basic_apic_ops(void)
#endif
-
static void xen_clts(void)
{
struct multicall_space mcs;
@@ -926,10 +947,6 @@ static const struct pv_init_ops xen_init_ops __initdata = {
.patch = xen_patch,
};
-static const struct pv_time_ops xen_time_ops __initdata = {
- .sched_clock = xen_sched_clock,
-};
-
static const struct pv_cpu_ops xen_cpu_ops __initdata = {
.cpuid = xen_cpuid,
@@ -1028,6 +1045,23 @@ static void xen_crash_shutdown(struct pt_regs *regs)
xen_reboot(SHUTDOWN_crash);
}
+static int
+xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ xen_reboot(SHUTDOWN_crash);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block xen_panic_block = {
+ .notifier_call= xen_panic_event,
+};
+
+int xen_panic_handler_init(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
+ return 0;
+}
+
static const struct machine_ops __initdata xen_machine_ops = {
.restart = xen_restart,
.halt = xen_machine_halt,
@@ -1067,7 +1101,6 @@ asmlinkage void __init xen_start_kernel(void)
/* Install Xen paravirt ops */
pv_info = xen_info;
pv_init_ops = xen_init_ops;
- pv_time_ops = xen_time_ops;
pv_cpu_ops = xen_cpu_ops;
pv_apic_ops = xen_apic_ops;
@@ -1075,13 +1108,7 @@ asmlinkage void __init xen_start_kernel(void)
x86_init.oem.arch_setup = xen_arch_setup;
x86_init.oem.banner = xen_banner;
- x86_init.timers.timer_init = xen_time_init;
- x86_init.timers.setup_percpu_clockev = x86_init_noop;
- x86_cpuinit.setup_percpu_clockev = x86_init_noop;
-
- x86_platform.calibrate_tsc = xen_tsc_khz;
- x86_platform.get_wallclock = xen_get_wallclock;
- x86_platform.set_wallclock = xen_set_wallclock;
+ xen_init_time_ops();
/*
* Set up some pagetable state before starting to set any ptes.
@@ -1145,6 +1172,10 @@ asmlinkage void __init xen_start_kernel(void)
pgd = (pgd_t *)xen_start_info->pt_base;
+ if (!xen_initial_domain())
+ __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
+
+ __supported_pte_mask |= _PAGE_IOMAP;
/* Don't do the full vcpu_info placement stuff until we have a
possible map and a non-dummy shared_info. */
per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
@@ -1206,3 +1237,139 @@ asmlinkage void __init xen_start_kernel(void)
x86_64_start_reservations((char *)__pa_symbol(&boot_params));
#endif
}
+
+static uint32_t xen_cpuid_base(void)
+{
+ uint32_t base, eax, ebx, ecx, edx;
+ char signature[13];
+
+ for (base = 0x40000000; base < 0x40010000; base += 0x100) {
+ cpuid(base, &eax, &ebx, &ecx, &edx);
+ *(uint32_t *)(signature + 0) = ebx;
+ *(uint32_t *)(signature + 4) = ecx;
+ *(uint32_t *)(signature + 8) = edx;
+ signature[12] = 0;
+
+ if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
+ return base;
+ }
+
+ return 0;
+}
+
+static int init_hvm_pv_info(int *major, int *minor)
+{
+ uint32_t eax, ebx, ecx, edx, pages, msr, base;
+ u64 pfn;
+
+ base = xen_cpuid_base();
+ cpuid(base + 1, &eax, &ebx, &ecx, &edx);
+
+ *major = eax >> 16;
+ *minor = eax & 0xffff;
+ printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
+
+ cpuid(base + 2, &pages, &msr, &ecx, &edx);
+
+ pfn = __pa(hypercall_page);
+ wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+
+ xen_setup_features();
+
+ pv_info = xen_info;
+ pv_info.kernel_rpl = 0;
+
+ xen_domain_type = XEN_HVM_DOMAIN;
+
+ return 0;
+}
+
+void xen_hvm_init_shared_info(void)
+{
+ int cpu;
+ struct xen_add_to_physmap xatp;
+ static struct shared_info *shared_info_page = 0;
+
+ if (!shared_info_page)
+ shared_info_page = (struct shared_info *)
+ extend_brk(PAGE_SIZE, PAGE_SIZE);
+ xatp.domid = DOMID_SELF;
+ xatp.idx = 0;
+ xatp.space = XENMAPSPACE_shared_info;
+ xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
+ if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
+ BUG();
+
+ HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
+
+ /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
+ * page, we use it in the event channel upcall and in some pvclock
+ * related functions. We don't need the vcpu_info placement
+ * optimizations because we don't use any pv_mmu or pv_irq op on
+ * HVM.
+ * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
+ * online but xen_hvm_init_shared_info is run at resume time too and
+ * in that case multiple vcpus might be online. */
+ for_each_online_cpu(cpu) {
+ per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+ }
+}
+
+#ifdef CONFIG_XEN_PVHVM
+static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ int cpu = (long)hcpu;
+ switch (action) {
+ case CPU_UP_PREPARE:
+ per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = {
+ .notifier_call = xen_hvm_cpu_notify,
+};
+
+static void __init xen_hvm_guest_init(void)
+{
+ int r;
+ int major, minor;
+
+ r = init_hvm_pv_info(&major, &minor);
+ if (r < 0)
+ return;
+
+ xen_hvm_init_shared_info();
+
+ if (xen_feature(XENFEAT_hvm_callback_vector))
+ xen_have_vector_callback = 1;
+ register_cpu_notifier(&xen_hvm_cpu_notifier);
+ xen_unplug_emulated_devices();
+ have_vcpu_info_placement = 0;
+ x86_init.irqs.intr_init = xen_init_IRQ;
+ xen_hvm_init_time_ops();
+ xen_hvm_init_mmu_ops();
+}
+
+static bool __init xen_hvm_platform(void)
+{
+ if (xen_pv_domain())
+ return false;
+
+ if (!xen_cpuid_base())
+ return false;
+
+ return true;
+}
+
+const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = {
+ .name = "Xen HVM",
+ .detect = xen_hvm_platform,
+ .init_platform = xen_hvm_guest_init,
+};
+EXPORT_SYMBOL(x86_hyper_xen_hvm);
+#endif
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 914f04695ce5..42086ac406af 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -42,6 +42,7 @@
#include <linux/highmem.h>
#include <linux/debugfs.h>
#include <linux/bug.h>
+#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/gfp.h>
@@ -51,14 +52,19 @@
#include <asm/mmu_context.h>
#include <asm/setup.h>
#include <asm/paravirt.h>
+#include <asm/e820.h>
#include <asm/linkage.h>
+#include <asm/page.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
+#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/xen.h>
+#include <xen/interface/hvm/hvm_op.h>
#include <xen/interface/version.h>
+#include <xen/interface/memory.h>
#include <xen/hvc-console.h>
#include "multicalls.h"
@@ -67,6 +73,13 @@
#define MMU_UPDATE_HISTO 30
+/*
+ * Protects atomic reservation decrease/increase against concurrent increases.
+ * Also protects non-atomic updates of current_pages and driver_pages, and
+ * balloon lists.
+ */
+DEFINE_SPINLOCK(xen_reservation_lock);
+
#ifdef CONFIG_XEN_DEBUG_FS
static struct {
@@ -377,6 +390,28 @@ static bool xen_page_pinned(void *ptr)
return PagePinned(page);
}
+static bool xen_iomap_pte(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_IOMAP;
+}
+
+static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
+{
+ struct multicall_space mcs;
+ struct mmu_update *u;
+
+ mcs = xen_mc_entry(sizeof(*u));
+ u = mcs.args;
+
+ /* ptep might be kmapped when using 32-bit HIGHPTE */
+ u->ptr = arbitrary_virt_to_machine(ptep).maddr;
+ u->val = pte_val_ma(pteval);
+
+ MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
static void xen_extend_mmu_update(const struct mmu_update *update)
{
struct multicall_space mcs;
@@ -453,6 +488,11 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval)
{
+ if (xen_iomap_pte(pteval)) {
+ xen_set_iomap_pte(ptep, pteval);
+ goto out;
+ }
+
ADD_STATS(set_pte_at, 1);
// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
ADD_STATS(set_pte_at_current, mm == current->mm);
@@ -523,8 +563,25 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
return val;
}
+static pteval_t iomap_pte(pteval_t val)
+{
+ if (val & _PAGE_PRESENT) {
+ unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
+ pteval_t flags = val & PTE_FLAGS_MASK;
+
+ /* We assume the pte frame number is a MFN, so
+ just use it as-is. */
+ val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
+ }
+
+ return val;
+}
+
pteval_t xen_pte_val(pte_t pte)
{
+ if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP))
+ return pte.pte;
+
return pte_mfn_to_pfn(pte.pte);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
@@ -537,7 +594,22 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
pte_t xen_make_pte(pteval_t pte)
{
- pte = pte_pfn_to_mfn(pte);
+ phys_addr_t addr = (pte & PTE_PFN_MASK);
+
+ /*
+ * Unprivileged domains are allowed to do IOMAPpings for
+ * PCI passthrough, but not map ISA space. The ISA
+ * mappings are just dummy local mappings to keep other
+ * parts of the kernel happy.
+ */
+ if (unlikely(pte & _PAGE_IOMAP) &&
+ (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
+ pte = iomap_pte(pte);
+ } else {
+ pte &= ~_PAGE_IOMAP;
+ pte = pte_pfn_to_mfn(pte);
+ }
+
return native_make_pte(pte);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
@@ -593,6 +665,11 @@ void xen_set_pud(pud_t *ptr, pud_t val)
void xen_set_pte(pte_t *ptep, pte_t pte)
{
+ if (xen_iomap_pte(pte)) {
+ xen_set_iomap_pte(ptep, pte);
+ return;
+ }
+
ADD_STATS(pte_update, 1);
// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
@@ -609,6 +686,11 @@ void xen_set_pte(pte_t *ptep, pte_t pte)
#ifdef CONFIG_X86_PAE
void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
+ if (xen_iomap_pte(pte)) {
+ xen_set_iomap_pte(ptep, pte);
+ return;
+ }
+
set_64bit((u64 *)ptep, native_pte_val(pte));
}
@@ -935,8 +1017,6 @@ static int xen_pin_page(struct mm_struct *mm, struct page *page,
read-only, and can be pinned. */
static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
{
- vm_unmap_aliases();
-
xen_mc_batch();
if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
@@ -1500,7 +1580,6 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l
if (PagePinned(virt_to_page(mm->pgd))) {
SetPagePinned(page);
- vm_unmap_aliases();
if (!PageHighMem(page)) {
make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
if (level == PT_PTE && USE_SPLIT_PTLOCKS)
@@ -1811,9 +1890,16 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
pte = pfn_pte(phys, prot);
break;
- default:
+ case FIX_PARAVIRT_BOOTMAP:
+ /* This is an MFN, but it isn't an IO mapping from the
+ IO domain */
pte = mfn_pte(phys, prot);
break;
+
+ default:
+ /* By default, set_fixmap is used for hardware mappings */
+ pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
+ break;
}
__native_set_fixmap(idx, pte);
@@ -1939,8 +2025,240 @@ void __init xen_init_mmu_ops(void)
x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
pv_mmu_ops = xen_mmu_ops;
+
+ vmap_lazy_unmap = false;
+}
+
+/* Protected by xen_reservation_lock. */
+#define MAX_CONTIG_ORDER 9 /* 2MB */
+static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
+
+#define VOID_PTE (mfn_pte(0, __pgprot(0)))
+static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
+ unsigned long *in_frames,
+ unsigned long *out_frames)
+{
+ int i;
+ struct multicall_space mcs;
+
+ xen_mc_batch();
+ for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
+ mcs = __xen_mc_entry(0);
+
+ if (in_frames)
+ in_frames[i] = virt_to_mfn(vaddr);
+
+ MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
+ set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
+
+ if (out_frames)
+ out_frames[i] = virt_to_pfn(vaddr);
+ }
+ xen_mc_issue(0);
+}
+
+/*
+ * Update the pfn-to-mfn mappings for a virtual address range, either to
+ * point to an array of mfns, or contiguously from a single starting
+ * mfn.
+ */
+static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
+ unsigned long *mfns,
+ unsigned long first_mfn)
+{
+ unsigned i, limit;
+ unsigned long mfn;
+
+ xen_mc_batch();
+
+ limit = 1u << order;
+ for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
+ struct multicall_space mcs;
+ unsigned flags;
+
+ mcs = __xen_mc_entry(0);
+ if (mfns)
+ mfn = mfns[i];
+ else
+ mfn = first_mfn + i;
+
+ if (i < (limit - 1))
+ flags = 0;
+ else {
+ if (order == 0)
+ flags = UVMF_INVLPG | UVMF_ALL;
+ else
+ flags = UVMF_TLB_FLUSH | UVMF_ALL;
+ }
+
+ MULTI_update_va_mapping(mcs.mc, vaddr,
+ mfn_pte(mfn, PAGE_KERNEL), flags);
+
+ set_phys_to_machine(virt_to_pfn(vaddr), mfn);
+ }
+
+ xen_mc_issue(0);
+}
+
+/*
+ * Perform the hypercall to exchange a region of our pfns to point to
+ * memory with the required contiguous alignment. Takes the pfns as
+ * input, and populates mfns as output.
+ *
+ * Returns a success code indicating whether the hypervisor was able to
+ * satisfy the request or not.
+ */
+static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
+ unsigned long *pfns_in,
+ unsigned long extents_out,
+ unsigned int order_out,
+ unsigned long *mfns_out,
+ unsigned int address_bits)
+{
+ long rc;
+ int success;
+
+ struct xen_memory_exchange exchange = {
+ .in = {
+ .nr_extents = extents_in,
+ .extent_order = order_in,
+ .extent_start = pfns_in,
+ .domid = DOMID_SELF
+ },
+ .out = {
+ .nr_extents = extents_out,
+ .extent_order = order_out,
+ .extent_start = mfns_out,
+ .address_bits = address_bits,
+ .domid = DOMID_SELF
+ }
+ };
+
+ BUG_ON(extents_in << order_in != extents_out << order_out);
+
+ rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
+ success = (exchange.nr_exchanged == extents_in);
+
+ BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
+ BUG_ON(success && (rc != 0));
+
+ return success;
}
+int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
+ unsigned int address_bits)
+{
+ unsigned long *in_frames = discontig_frames, out_frame;
+ unsigned long flags;
+ int success;
+
+ /*
+ * Currently an auto-translated guest will not perform I/O, nor will
+ * it require PAE page directories below 4GB. Therefore any calls to
+ * this function are redundant and can be ignored.
+ */
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
+ if (unlikely(order > MAX_CONTIG_ORDER))
+ return -ENOMEM;
+
+ memset((void *) vstart, 0, PAGE_SIZE << order);
+
+ spin_lock_irqsave(&xen_reservation_lock, flags);
+
+ /* 1. Zap current PTEs, remembering MFNs. */
+ xen_zap_pfn_range(vstart, order, in_frames, NULL);
+
+ /* 2. Get a new contiguous memory extent. */
+ out_frame = virt_to_pfn(vstart);
+ success = xen_exchange_memory(1UL << order, 0, in_frames,
+ 1, order, &out_frame,
+ address_bits);
+
+ /* 3. Map the new extent in place of old pages. */
+ if (success)
+ xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
+ else
+ xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
+
+ spin_unlock_irqrestore(&xen_reservation_lock, flags);
+
+ return success ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
+
+void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
+{
+ unsigned long *out_frames = discontig_frames, in_frame;
+ unsigned long flags;
+ int success;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ if (unlikely(order > MAX_CONTIG_ORDER))
+ return;
+
+ memset((void *) vstart, 0, PAGE_SIZE << order);
+
+ spin_lock_irqsave(&xen_reservation_lock, flags);
+
+ /* 1. Find start MFN of contiguous extent. */
+ in_frame = virt_to_mfn(vstart);
+
+ /* 2. Zap current PTEs. */
+ xen_zap_pfn_range(vstart, order, NULL, out_frames);
+
+ /* 3. Do the exchange for non-contiguous MFNs. */
+ success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
+ 0, out_frames, 0);
+
+ /* 4. Map new pages in place of old pages. */
+ if (success)
+ xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
+ else
+ xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
+
+ spin_unlock_irqrestore(&xen_reservation_lock, flags);
+}
+EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
+
+#ifdef CONFIG_XEN_PVHVM
+static void xen_hvm_exit_mmap(struct mm_struct *mm)
+{
+ struct xen_hvm_pagetable_dying a;
+ int rc;
+
+ a.domid = DOMID_SELF;
+ a.gpa = __pa(mm->pgd);
+ rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
+ WARN_ON_ONCE(rc < 0);
+}
+
+static int is_pagetable_dying_supported(void)
+{
+ struct xen_hvm_pagetable_dying a;
+ int rc = 0;
+
+ a.domid = DOMID_SELF;
+ a.gpa = 0x00;
+ rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
+ if (rc < 0) {
+ printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
+ return 0;
+ }
+ return 1;
+}
+
+void __init xen_hvm_init_mmu_ops(void)
+{
+ if (is_pagetable_dying_supported())
+ pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
+}
+#endif
+
#ifdef CONFIG_XEN_DEBUG_FS
static struct dentry *d_mmu_debug;
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 5fe6bc7f5ecf..fa938c4aa2f7 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -60,4 +60,5 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
unsigned long xen_read_cr2_direct(void);
extern void xen_init_mmu_ops(void);
+extern void xen_hvm_init_mmu_ops(void);
#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
new file mode 100644
index 000000000000..a013ec9d0c54
--- /dev/null
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -0,0 +1,58 @@
+/* Glue code to lib/swiotlb-xen.c */
+
+#include <linux/dma-mapping.h>
+#include <xen/swiotlb-xen.h>
+
+#include <asm/xen/hypervisor.h>
+#include <xen/xen.h>
+
+int xen_swiotlb __read_mostly;
+
+static struct dma_map_ops xen_swiotlb_dma_ops = {
+ .mapping_error = xen_swiotlb_dma_mapping_error,
+ .alloc_coherent = xen_swiotlb_alloc_coherent,
+ .free_coherent = xen_swiotlb_free_coherent,
+ .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
+ .sync_single_for_device = xen_swiotlb_sync_single_for_device,
+ .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu,
+ .sync_sg_for_device = xen_swiotlb_sync_sg_for_device,
+ .map_sg = xen_swiotlb_map_sg_attrs,
+ .unmap_sg = xen_swiotlb_unmap_sg_attrs,
+ .map_page = xen_swiotlb_map_page,
+ .unmap_page = xen_swiotlb_unmap_page,
+ .dma_supported = xen_swiotlb_dma_supported,
+};
+
+/*
+ * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary
+ *
+ * This returns non-zero if we are forced to use xen_swiotlb (by the boot
+ * option).
+ */
+int __init pci_xen_swiotlb_detect(void)
+{
+
+ /* If running as PV guest, either iommu=soft, or swiotlb=force will
+ * activate this IOMMU. If running as PV privileged, activate it
+ * irregardlesss.
+ */
+ if ((xen_initial_domain() || swiotlb || swiotlb_force) &&
+ (xen_pv_domain()))
+ xen_swiotlb = 1;
+
+ /* If we are running under Xen, we MUST disable the native SWIOTLB.
+ * Don't worry about swiotlb_force flag activating the native, as
+ * the 'swiotlb' flag is the only one turning it on. */
+ if (xen_pv_domain())
+ swiotlb = 0;
+
+ return xen_swiotlb;
+}
+
+void __init pci_xen_swiotlb_init(void)
+{
+ if (xen_swiotlb) {
+ xen_swiotlb_init(1);
+ dma_ops = &xen_swiotlb_dma_ops;
+ }
+}
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
new file mode 100644
index 000000000000..554c002a1e1a
--- /dev/null
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -0,0 +1,137 @@
+/******************************************************************************
+ * platform-pci-unplug.c
+ *
+ * Xen platform PCI device driver
+ * Copyright (c) 2010, Citrix
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+
+#include <xen/platform_pci.h>
+
+#define XEN_PLATFORM_ERR_MAGIC -1
+#define XEN_PLATFORM_ERR_PROTOCOL -2
+#define XEN_PLATFORM_ERR_BLACKLIST -3
+
+/* store the value of xen_emul_unplug after the unplug is done */
+int xen_platform_pci_unplug;
+EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
+#ifdef CONFIG_XEN_PVHVM
+static int xen_emul_unplug;
+
+static int __init check_platform_magic(void)
+{
+ short magic;
+ char protocol;
+
+ magic = inw(XEN_IOPORT_MAGIC);
+ if (magic != XEN_IOPORT_MAGIC_VAL) {
+ printk(KERN_ERR "Xen Platform PCI: unrecognised magic value\n");
+ return XEN_PLATFORM_ERR_MAGIC;
+ }
+
+ protocol = inb(XEN_IOPORT_PROTOVER);
+
+ printk(KERN_DEBUG "Xen Platform PCI: I/O protocol version %d\n",
+ protocol);
+
+ switch (protocol) {
+ case 1:
+ outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM);
+ outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER);
+ if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) {
+ printk(KERN_ERR "Xen Platform: blacklisted by host\n");
+ return XEN_PLATFORM_ERR_BLACKLIST;
+ }
+ break;
+ default:
+ printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version");
+ return XEN_PLATFORM_ERR_PROTOCOL;
+ }
+
+ return 0;
+}
+
+void __init xen_unplug_emulated_devices(void)
+{
+ int r;
+
+ /* check the version of the xen platform PCI device */
+ r = check_platform_magic();
+ /* If the version matches enable the Xen platform PCI driver.
+ * Also enable the Xen platform PCI driver if the version is really old
+ * and the user told us to ignore it. */
+ if (r && !(r == XEN_PLATFORM_ERR_MAGIC &&
+ (xen_emul_unplug & XEN_UNPLUG_IGNORE)))
+ return;
+ /* Set the default value of xen_emul_unplug depending on whether or
+ * not the Xen PV frontends and the Xen platform PCI driver have
+ * been compiled for this kernel (modules or built-in are both OK). */
+ if (!xen_emul_unplug) {
+ if (xen_must_unplug_nics()) {
+ printk(KERN_INFO "Netfront and the Xen platform PCI driver have "
+ "been compiled for this kernel: unplug emulated NICs.\n");
+ xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
+ }
+ if (xen_must_unplug_disks()) {
+ printk(KERN_INFO "Blkfront and the Xen platform PCI driver have "
+ "been compiled for this kernel: unplug emulated disks.\n"
+ "You might have to change the root device\n"
+ "from /dev/hd[a-d] to /dev/xvd[a-d]\n"
+ "in your root= kernel command line option\n");
+ xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
+ }
+ }
+ /* Now unplug the emulated devices */
+ if (!(xen_emul_unplug & XEN_UNPLUG_IGNORE))
+ outw(xen_emul_unplug, XEN_IOPORT_UNPLUG);
+ xen_platform_pci_unplug = xen_emul_unplug;
+}
+
+static int __init parse_xen_emul_unplug(char *arg)
+{
+ char *p, *q;
+ int l;
+
+ for (p = arg; p; p = q) {
+ q = strchr(p, ',');
+ if (q) {
+ l = q - p;
+ q++;
+ } else {
+ l = strlen(p);
+ }
+ if (!strncmp(p, "all", l))
+ xen_emul_unplug |= XEN_UNPLUG_ALL;
+ else if (!strncmp(p, "ide-disks", l))
+ xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
+ else if (!strncmp(p, "aux-ide-disks", l))
+ xen_emul_unplug |= XEN_UNPLUG_AUX_IDE_DISKS;
+ else if (!strncmp(p, "nics", l))
+ xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
+ else if (!strncmp(p, "ignore", l))
+ xen_emul_unplug |= XEN_UNPLUG_IGNORE;
+ else
+ printk(KERN_WARNING "unrecognised option '%s' "
+ "in parameter 'xen_emul_unplug'\n", p);
+ }
+ return 0;
+}
+early_param("xen_emul_unplug", parse_xen_emul_unplug);
+#endif
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index ad0047f47cd4..328b00305426 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -20,6 +20,7 @@
#include <xen/page.h>
#include <xen/interface/callback.h>
#include <xen/interface/physdev.h>
+#include <xen/interface/memory.h>
#include <xen/features.h>
#include "xen-ops.h"
@@ -32,6 +33,73 @@ extern void xen_sysenter_target(void);
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);
+static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
+ phys_addr_t end_addr)
+{
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+ unsigned long start, end;
+ unsigned long len = 0;
+ unsigned long pfn;
+ int ret;
+
+ start = PFN_UP(start_addr);
+ end = PFN_DOWN(end_addr);
+
+ if (end <= start)
+ return 0;
+
+ printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ",
+ start, end);
+ for(pfn = start; pfn < end; pfn++) {
+ unsigned long mfn = pfn_to_mfn(pfn);
+
+ /* Make sure pfn exists to start with */
+ if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
+ continue;
+
+ set_xen_guest_handle(reservation.extent_start, &mfn);
+ reservation.nr_extents = 1;
+
+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+ &reservation);
+ WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n",
+ start, end, ret);
+ if (ret == 1) {
+ set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ len++;
+ }
+ }
+ printk(KERN_CONT "%ld pages freed\n", len);
+
+ return len;
+}
+
+static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
+ const struct e820map *e820)
+{
+ phys_addr_t max_addr = PFN_PHYS(max_pfn);
+ phys_addr_t last_end = 0;
+ unsigned long released = 0;
+ int i;
+
+ for (i = 0; i < e820->nr_map && last_end < max_addr; i++) {
+ phys_addr_t end = e820->map[i].addr;
+ end = min(max_addr, end);
+
+ released += xen_release_chunk(last_end, end);
+ last_end = e820->map[i].addr + e820->map[i].size;
+ }
+
+ if (last_end < max_addr)
+ released += xen_release_chunk(last_end, max_addr);
+
+ printk(KERN_INFO "released %ld pages of unused memory\n", released);
+ return released;
+}
/**
* machine_specific_memory_setup - Hook for machine specific memory setup.
@@ -67,6 +135,8 @@ char * __init xen_memory_setup(void)
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ xen_return_unused_memory(xen_start_info->nr_pages, &e820);
+
return "Xen";
}
@@ -156,6 +226,8 @@ void __init xen_arch_setup(void)
struct physdev_set_iopl set_iopl;
int rc;
+ xen_panic_handler_init();
+
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index a29693fd3138..25f232b18a82 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -394,6 +394,8 @@ static void stop_self(void *v)
load_cr3(swapper_pg_dir);
/* should set up a minimal gdt */
+ set_cpu_online(cpu, false);
+
HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
BUG();
}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index a9c661108034..1d789d56877c 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -26,6 +26,18 @@ void xen_pre_suspend(void)
BUG();
}
+void xen_hvm_post_suspend(int suspend_cancelled)
+{
+ int cpu;
+ xen_hvm_init_shared_info();
+ xen_callback_vector();
+ if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
+ for_each_online_cpu(cpu) {
+ xen_setup_runstate_info(cpu);
+ }
+ }
+}
+
void xen_post_suspend(int suspend_cancelled)
{
xen_build_mfn_list_list();
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index b3c6c59ed302..1a5353a753fc 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -20,6 +20,7 @@
#include <asm/xen/hypercall.h>
#include <xen/events.h>
+#include <xen/features.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
@@ -155,47 +156,8 @@ static void do_stolen_accounting(void)
account_idle_ticks(ticks);
}
-/*
- * Xen sched_clock implementation. Returns the number of unstolen
- * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
- * states.
- */
-unsigned long long xen_sched_clock(void)
-{
- struct vcpu_runstate_info state;
- cycle_t now;
- u64 ret;
- s64 offset;
-
- /*
- * Ideally sched_clock should be called on a per-cpu basis
- * anyway, so preempt should already be disabled, but that's
- * not current practice at the moment.
- */
- preempt_disable();
-
- now = xen_clocksource_read();
-
- get_runstate_snapshot(&state);
-
- WARN_ON(state.state != RUNSTATE_running);
-
- offset = now - state.state_entry_time;
- if (offset < 0)
- offset = 0;
-
- ret = state.time[RUNSTATE_blocked] +
- state.time[RUNSTATE_running] +
- offset;
-
- preempt_enable();
-
- return ret;
-}
-
-
/* Get the TSC speed from Xen */
-unsigned long xen_tsc_khz(void)
+static unsigned long xen_tsc_khz(void)
{
struct pvclock_vcpu_time_info *info =
&HYPERVISOR_shared_info->vcpu_info[0].time;
@@ -230,7 +192,7 @@ static void xen_read_wallclock(struct timespec *ts)
put_cpu_var(xen_vcpu);
}
-unsigned long xen_get_wallclock(void)
+static unsigned long xen_get_wallclock(void)
{
struct timespec ts;
@@ -238,7 +200,7 @@ unsigned long xen_get_wallclock(void)
return ts.tv_sec;
}
-int xen_set_wallclock(unsigned long now)
+static int xen_set_wallclock(unsigned long now)
{
/* do nothing for domU */
return -1;
@@ -473,7 +435,11 @@ void xen_timer_resume(void)
}
}
-__init void xen_time_init(void)
+static const struct pv_time_ops xen_time_ops __initdata = {
+ .sched_clock = xen_clocksource_read,
+};
+
+static __init void xen_time_init(void)
{
int cpu = smp_processor_id();
struct timespec tp;
@@ -497,3 +463,47 @@ __init void xen_time_init(void)
xen_setup_timer(cpu);
xen_setup_cpu_clockevents();
}
+
+__init void xen_init_time_ops(void)
+{
+ pv_time_ops = xen_time_ops;
+
+ x86_init.timers.timer_init = xen_time_init;
+ x86_init.timers.setup_percpu_clockev = x86_init_noop;
+ x86_cpuinit.setup_percpu_clockev = x86_init_noop;
+
+ x86_platform.calibrate_tsc = xen_tsc_khz;
+ x86_platform.get_wallclock = xen_get_wallclock;
+ x86_platform.set_wallclock = xen_set_wallclock;
+}
+
+#ifdef CONFIG_XEN_PVHVM
+static void xen_hvm_setup_cpu_clockevents(void)
+{
+ int cpu = smp_processor_id();
+ xen_setup_runstate_info(cpu);
+ xen_setup_timer(cpu);
+ xen_setup_cpu_clockevents();
+}
+
+__init void xen_hvm_init_time_ops(void)
+{
+ /* vector callback is needed otherwise we cannot receive interrupts
+ * on cpu > 0 */
+ if (!xen_have_vector_callback && num_present_cpus() > 1)
+ return;
+ if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
+ printk(KERN_INFO "Xen doesn't support pvclock on HVM,"
+ "disable pv timer\n");
+ return;
+ }
+
+ pv_time_ops = xen_time_ops;
+ x86_init.timers.setup_percpu_clockev = xen_time_init;
+ x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
+
+ x86_platform.calibrate_tsc = xen_tsc_khz;
+ x86_platform.get_wallclock = xen_get_wallclock;
+ x86_platform.set_wallclock = xen_set_wallclock;
+}
+#endif
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index f9153a300bce..7c8ab86163e9 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -38,6 +38,10 @@ void xen_enable_sysenter(void);
void xen_enable_syscall(void);
void xen_vcpu_restore(void);
+void xen_callback_vector(void);
+void xen_hvm_init_shared_info(void);
+void __init xen_unplug_emulated_devices(void);
+
void __init xen_build_dynamic_phys_to_machine(void);
void xen_init_irq_ops(void);
@@ -46,11 +50,8 @@ void xen_setup_runstate_info(int cpu);
void xen_teardown_timer(int cpu);
cycle_t xen_clocksource_read(void);
void xen_setup_cpu_clockevents(void);
-unsigned long xen_tsc_khz(void);
-void __init xen_time_init(void);
-unsigned long xen_get_wallclock(void);
-int xen_set_wallclock(unsigned long time);
-unsigned long long xen_sched_clock(void);
+void __init xen_init_time_ops(void);
+void __init xen_hvm_init_time_ops(void);
irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
@@ -101,4 +102,6 @@ void xen_sysret32(void);
void xen_sysret64(void);
void xen_adjust_exception_frame(void);
+extern int xen_panic_handler_init(void);
+
#endif /* XEN_OPS_H */