Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 11
-rw-r--r--  arch/x86/Makefile | 5
-rw-r--r--  arch/x86/boot/compressed/Makefile | 2
-rw-r--r--  arch/x86/boot/compressed/cmdline.c | 2
-rw-r--r--  arch/x86/boot/compressed/eboot.c | 73
-rw-r--r--  arch/x86/boot/compressed/error.h | 4
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 86
-rw-r--r--  arch/x86/boot/compressed/kaslr.c | 194
-rw-r--r--  arch/x86/boot/compressed/misc.c | 6
-rw-r--r--  arch/x86/boot/compressed/misc.h | 2
-rw-r--r--  arch/x86/boot/compressed/pagetable.c | 20
-rw-r--r--  arch/x86/boot/copy.S | 20
-rw-r--r--  arch/x86/boot/string.c | 8
-rw-r--r--  arch/x86/boot/string.h | 1
-rw-r--r--  arch/x86/crypto/Makefile | 2
-rw-r--r--  arch/x86/crypto/sha1-mb/Makefile | 2
-rw-r--r--  arch/x86/crypto/sha256-mb/Makefile | 2
-rw-r--r--  arch/x86/entry/entry_32.S | 30
-rw-r--r--  arch/x86/entry/entry_64.S | 14
-rw-r--r--  arch/x86/events/core.c | 27
-rw-r--r--  arch/x86/events/intel/core.c | 67
-rw-r--r--  arch/x86/events/intel/lbr.c | 4
-rw-r--r--  arch/x86/events/intel/rapl.c | 2
-rw-r--r--  arch/x86/events/intel/uncore.c | 2
-rw-r--r--  arch/x86/events/perf_event.h | 3
-rw-r--r--  arch/x86/include/asm/asm.h | 1
-rw-r--r--  arch/x86/include/asm/atomic.h | 13
-rw-r--r--  arch/x86/include/asm/efi.h | 2
-rw-r--r--  arch/x86/include/asm/extable.h | 1
-rw-r--r--  arch/x86/include/asm/hardirq.h | 2
-rw-r--r--  arch/x86/include/asm/init.h | 3
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 1
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 4
-rw-r--r--  arch/x86/include/asm/mce.h | 1
-rw-r--r--  arch/x86/include/asm/mmu.h | 6
-rw-r--r--  arch/x86/include/asm/mmu_context.h | 63
-rw-r--r--  arch/x86/include/asm/mshyperv.h | 3
-rw-r--r--  arch/x86/include/asm/msr-index.h | 2
-rw-r--r--  arch/x86/include/asm/paravirt.h | 10
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 5
-rw-r--r--  arch/x86/include/asm/pgtable-3level.h | 47
-rw-r--r--  arch/x86/include/asm/pgtable.h | 55
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 22
-rw-r--r--  arch/x86/include/asm/pmem.h | 2
-rw-r--r--  arch/x86/include/asm/processor-flags.h | 36
-rw-r--r--  arch/x86/include/asm/processor.h | 10
-rw-r--r--  arch/x86/include/asm/special_insns.h | 10
-rw-r--r--  arch/x86/include/asm/timer.h | 8
-rw-r--r--  arch/x86/include/asm/tlbbatch.h | 14
-rw-r--r--  arch/x86/include/asm/tlbflush.h | 114
-rw-r--r--  arch/x86/include/asm/uaccess.h | 11
-rw-r--r--  arch/x86/include/asm/uv/uv.h | 11
-rw-r--r--  arch/x86/include/uapi/asm/Kbuild | 59
-rw-r--r--  arch/x86/include/uapi/asm/hyperv.h | 15
-rw-r--r--  arch/x86/include/uapi/asm/processor-flags.h | 2
-rw-r--r--  arch/x86/kernel/Makefile | 2
-rw-r--r--  arch/x86/kernel/acpi/Makefile | 2
-rw-r--r--  arch/x86/kernel/alternative.c | 9
-rw-r--r--  arch/x86/kernel/apic/htirq.c | 2
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 22
-rw-r--r--  arch/x86/kernel/apic/msi.c | 2
-rw-r--r--  arch/x86/kernel/apic/vector.c | 4
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 5
-rw-r--r--  arch/x86/kernel/cpu/cyrix.c | 1
-rw-r--r--  arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 13
-rw-r--r--  arch/x86/kernel/cpu/microcode/amd.c | 22
-rw-r--r--  arch/x86/kernel/cpu/microcode/core.c | 13
-rw-r--r--  arch/x86/kernel/cpu/microcode/intel.c | 26
-rw-r--r--  arch/x86/kernel/cpu/mshyperv.c | 18
-rw-r--r--  arch/x86/kernel/espfix_64.c | 2
-rw-r--r--  arch/x86/kernel/fpu/init.c | 1
-rw-r--r--  arch/x86/kernel/ftrace.c | 20
-rw-r--r--  arch/x86/kernel/head64.c | 145
-rw-r--r--  arch/x86/kernel/head_64.S | 131
-rw-r--r--  arch/x86/kernel/i8259.c | 1
-rw-r--r--  arch/x86/kernel/kprobes/core.c | 9
-rw-r--r--  arch/x86/kernel/kprobes/opt.c | 9
-rw-r--r--  arch/x86/kernel/kvm.c | 2
-rw-r--r--  arch/x86/kernel/ldt.c | 56
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 8
-rw-r--r--  arch/x86/kernel/nmi_selftest.c | 2
-rw-r--r--  arch/x86/kernel/paravirt.c | 2
-rw-r--r--  arch/x86/kernel/process.c | 11
-rw-r--r--  arch/x86/kernel/process_32.c | 4
-rw-r--r--  arch/x86/kernel/process_64.c | 4
-rw-r--r--  arch/x86/kernel/reboot.c | 2
-rw-r--r--  arch/x86/kernel/setup.c | 21
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 10
-rw-r--r--  arch/x86/kernel/smpboot.c | 3
-rw-r--r--  arch/x86/kernel/step.c | 2
-rw-r--r--  arch/x86/kernel/sys_x86_64.c | 4
-rw-r--r--  arch/x86/kernel/tboot.c | 2
-rw-r--r--  arch/x86/kernel/traps.c | 2
-rw-r--r--  arch/x86/kernel/tsc.c | 206
-rw-r--r--  arch/x86/kernel/unwind_frame.c | 49
-rw-r--r--  arch/x86/kvm/cpuid.c | 20
-rw-r--r--  arch/x86/kvm/emulate.c | 3
-rw-r--r--  arch/x86/kvm/lapic.c | 5
-rw-r--r--  arch/x86/kvm/mmu.c | 22
-rw-r--r--  arch/x86/kvm/mmu.h | 2
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 39
-rw-r--r--  arch/x86/kvm/pmu_intel.c | 2
-rw-r--r--  arch/x86/kvm/svm.c | 31
-rw-r--r--  arch/x86/kvm/vmx.c | 280
-rw-r--r--  arch/x86/kvm/x86.c | 117
-rw-r--r--  arch/x86/lib/copy_user_64.S | 7
-rw-r--r--  arch/x86/lib/csum-copy_64.S | 12
-rw-r--r--  arch/x86/lib/kaslr.c | 3
-rw-r--r--  arch/x86/lib/msr-reg.S | 8
-rw-r--r--  arch/x86/lib/x86-opcode-map.txt | 2
-rw-r--r--  arch/x86/math-emu/fpu_system.h | 2
-rw-r--r--  arch/x86/mm/Makefile | 2
-rw-r--r--  arch/x86/mm/dump_pagetables.c | 2
-rw-r--r--  arch/x86/mm/extable.c | 3
-rw-r--r--  arch/x86/mm/fault.c | 10
-rw-r--r--  arch/x86/mm/gup.c | 496
-rw-r--r--  arch/x86/mm/hugetlbpage.c | 2
-rw-r--r--  arch/x86/mm/ident_map.c | 14
-rw-r--r--  arch/x86/mm/init.c | 10
-rw-r--r--  arch/x86/mm/init_64.c | 128
-rw-r--r--  arch/x86/mm/ioremap.c | 2
-rw-r--r--  arch/x86/mm/kasan_init_64.c | 12
-rw-r--r--  arch/x86/mm/kaslr.c | 81
-rw-r--r--  arch/x86/mm/mmap.c | 3
-rw-r--r--  arch/x86/mm/numa_32.c | 1
-rw-r--r--  arch/x86/mm/pageattr.c | 2
-rw-r--r--  arch/x86/mm/testmmiotrace.c | 2
-rw-r--r--  arch/x86/mm/tlb.c | 458
-rw-r--r--  arch/x86/net/Makefile | 2
-rw-r--r--  arch/x86/platform/efi/Makefile | 1
-rw-r--r--  arch/x86/platform/efi/efi.c | 9
-rw-r--r--  arch/x86/platform/efi/efi_32.c | 9
-rw-r--r--  arch/x86/platform/efi/efi_64.c | 88
-rw-r--r--  arch/x86/platform/efi/quirks.c | 140
-rw-r--r--  arch/x86/platform/olpc/olpc-xo1-pm.c | 2
-rw-r--r--  arch/x86/platform/uv/tlb_uv.c | 24
-rw-r--r--  arch/x86/power/Makefile | 2
-rw-r--r--  arch/x86/power/cpu.c | 2
-rw-r--r--  arch/x86/power/hibernate_64.c | 5
-rw-r--r--  arch/x86/realmode/init.c | 2
-rw-r--r--  arch/x86/um/ptrace_64.c | 2
-rw-r--r--  arch/x86/um/shared/sysdep/kernel-offsets.h | 9
-rw-r--r--  arch/x86/xen/Makefile | 3
-rw-r--r--  arch/x86/xen/efi.c | 45
-rw-r--r--  arch/x86/xen/enlighten_pv.c | 57
-rw-r--r--  arch/x86/xen/mmu.c | 2
-rw-r--r--  arch/x86/xen/mmu_pv.c | 192
-rw-r--r--  arch/x86/xen/time.c | 2
-rw-r--r--  arch/x86/xen/xen-pvh.S | 2
150 files changed, 2312 insertions, 2046 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cd18994a9555..737212c0333e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -69,7 +69,7 @@ config X86
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
- select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
+ select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select BUILDTIME_EXTABLE_SORT
@@ -360,7 +360,7 @@ config SMP
Management" code will be disabled if you say Y here.
See also <file:Documentation/x86/i386/IO-APIC.txt>,
- <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at
+ <file:Documentation/lockup-watchdogs.txt> and the SMP-HOWTO available at
<http://www.tldp.org/docs.html#howto>.
If you don't know what to do here, say N.
@@ -2776,10 +2776,6 @@ config COMPAT_FOR_U64_ALIGNMENT
config SYSVIPC_COMPAT
def_bool y
depends on SYSVIPC
-
-config KEYS_COMPAT
- def_bool y
- depends on KEYS
endif
endmenu
@@ -2797,6 +2793,9 @@ config X86_DMA_REMAP
bool
depends on STA2X11
+config HAVE_GENERIC_GUP
+ def_bool y
+
source "net/Kconfig"
source "drivers/Kconfig"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 4430dd489620..bf240b920473 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -159,7 +159,7 @@ ifdef CONFIG_FUNCTION_GRAPH_TRACER
# If '-Os' is enabled, disable it and print a warning.
ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
undefine CONFIG_CC_OPTIMIZE_FOR_SIZE
- $(warning Disabling CONFIG_CC_OPTIMIZE_FOR_SIZE. Your compiler does not have -mfentry so you cannot optimize for size with CONFIG_FUNCTION_GRAPH_TRACER.)
+ $(warning Disabling CONFIG_CC_OPTIMIZE_FOR_SIZE. Your compiler does not have -mfentry so you cannot optimize for size with CONFIG_FUNCTION_GRAPH_TRACER.)
endif
endif
@@ -179,7 +179,8 @@ ifdef CONFIG_JUMP_LABEL
endif
ifeq ($(ACCUMULATE_OUTGOING_ARGS), 1)
- KBUILD_CFLAGS += -maccumulate-outgoing-args
+ # This compiler flag is not supported by Clang:
+ KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args,)
endif
# Stackpointer is addressed different for 32 bit and 64 bit x86
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 44163e8c3868..2c860ad4fe06 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -94,7 +94,7 @@ vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o
quiet_cmd_check_data_rel = DATAREL $@
define cmd_check_data_rel
for obj in $(filter %.o,$^); do \
- readelf -S $$obj | grep -qF .rel.local && { \
+ ${CROSS_COMPILE}readelf -S $$obj | grep -qF .rel.local && { \
echo "error: $$obj has data relocations!" >&2; \
exit 1; \
} || true; \
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index 73ccf63b0f48..9dc1ce6ba3c0 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -13,7 +13,7 @@ static inline char rdfs8(addr_t addr)
return *((char *)(fs + addr));
}
#include "../cmdline.c"
-static unsigned long get_cmd_line_ptr(void)
+unsigned long get_cmd_line_ptr(void)
{
unsigned long cmd_line_ptr = boot_params->hdr.cmd_line_ptr;
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index cbf4b87f55b9..c3e869eaef0c 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -1046,9 +1046,31 @@ struct boot_params *efi_main(struct efi_config *c,
memset((char *)gdt->address, 0x0, gdt->size);
desc = (struct desc_struct *)gdt->address;
- /* The first GDT is a dummy and the second is unused. */
- desc += 2;
+ /* The first GDT is a dummy. */
+ desc++;
+
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /* __KERNEL32_CS */
+ desc->limit0 = 0xffff;
+ desc->base0 = 0x0000;
+ desc->base1 = 0x0000;
+ desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ;
+ desc->s = DESC_TYPE_CODE_DATA;
+ desc->dpl = 0;
+ desc->p = 1;
+ desc->limit = 0xf;
+ desc->avl = 0;
+ desc->l = 0;
+ desc->d = SEG_OP_SIZE_32BIT;
+ desc->g = SEG_GRANULARITY_4KB;
+ desc->base2 = 0x00;
+ desc++;
+ } else {
+ /* Second entry is unused on 32-bit */
+ desc++;
+ }
+ /* __KERNEL_CS */
desc->limit0 = 0xffff;
desc->base0 = 0x0000;
desc->base1 = 0x0000;
@@ -1058,12 +1080,18 @@ struct boot_params *efi_main(struct efi_config *c,
desc->p = 1;
desc->limit = 0xf;
desc->avl = 0;
- desc->l = 0;
- desc->d = SEG_OP_SIZE_32BIT;
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ desc->l = 1;
+ desc->d = 0;
+ } else {
+ desc->l = 0;
+ desc->d = SEG_OP_SIZE_32BIT;
+ }
desc->g = SEG_GRANULARITY_4KB;
desc->base2 = 0x00;
-
desc++;
+
+ /* __KERNEL_DS */
desc->limit0 = 0xffff;
desc->base0 = 0x0000;
desc->base1 = 0x0000;
@@ -1077,24 +1105,25 @@ struct boot_params *efi_main(struct efi_config *c,
desc->d = SEG_OP_SIZE_32BIT;
desc->g = SEG_GRANULARITY_4KB;
desc->base2 = 0x00;
-
-#ifdef CONFIG_X86_64
- /* Task segment value */
desc++;
- desc->limit0 = 0x0000;
- desc->base0 = 0x0000;
- desc->base1 = 0x0000;
- desc->type = SEG_TYPE_TSS;
- desc->s = 0;
- desc->dpl = 0;
- desc->p = 1;
- desc->limit = 0x0;
- desc->avl = 0;
- desc->l = 0;
- desc->d = 0;
- desc->g = SEG_GRANULARITY_4KB;
- desc->base2 = 0x00;
-#endif /* CONFIG_X86_64 */
+
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /* Task segment value */
+ desc->limit0 = 0x0000;
+ desc->base0 = 0x0000;
+ desc->base1 = 0x0000;
+ desc->type = SEG_TYPE_TSS;
+ desc->s = 0;
+ desc->dpl = 0;
+ desc->p = 1;
+ desc->limit = 0x0;
+ desc->avl = 0;
+ desc->l = 0;
+ desc->d = 0;
+ desc->g = SEG_GRANULARITY_4KB;
+ desc->base2 = 0x00;
+ desc++;
+ }
asm volatile("cli");
asm volatile ("lgdt %0" : : "m" (*gdt));
diff --git a/arch/x86/boot/compressed/error.h b/arch/x86/boot/compressed/error.h
index 2e59dac07f9e..d732e608e3af 100644
--- a/arch/x86/boot/compressed/error.h
+++ b/arch/x86/boot/compressed/error.h
@@ -1,7 +1,9 @@
#ifndef BOOT_COMPRESSED_ERROR_H
#define BOOT_COMPRESSED_ERROR_H
+#include <linux/compiler.h>
+
void warn(char *m);
-void error(char *m);
+void error(char *m) __noreturn;
#endif /* BOOT_COMPRESSED_ERROR_H */
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index d2ae1f821e0c..fbf4c32d0b62 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -346,6 +346,48 @@ preferred_addr:
/* Set up the stack */
leaq boot_stack_end(%rbx), %rsp
+#ifdef CONFIG_X86_5LEVEL
+ /* Check if 5-level paging has already been enabled */
+ movq %cr4, %rax
+ testl $X86_CR4_LA57, %eax
+ jnz lvl5
+
+ /*
+ * At this point we are in long mode with 4-level paging enabled,
+ * but we want to enable 5-level paging.
+ *
+ * The problem is that we cannot do it directly. Setting LA57 in
+ * long mode would trigger #GP. So we need to switch off long mode
+ * first.
+ *
+ * NOTE: This is not going to work if the bootloader put us above the
+ * 4G limit.
+ *
+ * The first step is to go into compatibility mode.
+ */
+
+ /* Clear additional page table */
+ leaq lvl5_pgtable(%rbx), %rdi
+ xorq %rax, %rax
+ movq $(PAGE_SIZE/8), %rcx
+ rep stosq
+
+ /*
+ * Setup current CR3 as the first and only entry in a new top level
+ * page table.
+ */
+ movq %cr3, %rdi
+ leaq 0x7 (%rdi), %rax
+ movq %rax, lvl5_pgtable(%rbx)
+
+ /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
+ pushq $__KERNEL32_CS
+ leaq compatible_mode(%rip), %rax
+ pushq %rax
+ lretq
+lvl5:
+#endif
+
/* Zero EFLAGS */
pushq $0
popfq
@@ -429,6 +471,44 @@ relocated:
jmp *%rax
.code32
+#ifdef CONFIG_X86_5LEVEL
+compatible_mode:
+ /* Setup data and stack segments */
+ movl $__KERNEL_DS, %eax
+ movl %eax, %ds
+ movl %eax, %ss
+
+ /* Disable paging */
+ movl %cr0, %eax
+ btrl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ /* Point CR3 to 5-level paging */
+ leal lvl5_pgtable(%ebx), %eax
+ movl %eax, %cr3
+
+ /* Enable PAE and LA57 mode */
+ movl %cr4, %eax
+ orl $(X86_CR4_PAE | X86_CR4_LA57), %eax
+ movl %eax, %cr4
+
+ /* Calculate address we are running at */
+ call 1f
+1: popl %edi
+ subl $1b, %edi
+
+ /* Prepare stack for far return to Long Mode */
+ pushl $__KERNEL_CS
+ leal lvl5(%edi), %eax
+ push %eax
+
+ /* Enable paging back */
+ movl $(X86_CR0_PG | X86_CR0_PE), %eax
+ movl %eax, %cr0
+
+ lret
+#endif
+
no_longmode:
/* This isn't an x86-64 CPU so hang */
1:
@@ -442,7 +522,7 @@ gdt:
.word gdt_end - gdt
.long gdt
.word 0
- .quad 0x0000000000000000 /* NULL descriptor */
+ .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
.quad 0x00af9a000000ffff /* __KERNEL_CS */
.quad 0x00cf92000000ffff /* __KERNEL_DS */
.quad 0x0080890000000000 /* TS descriptor */
@@ -486,3 +566,7 @@ boot_stack_end:
.balign 4096
pgtable:
.fill BOOT_PGT_SIZE, 1, 0
+#ifdef CONFIG_X86_5LEVEL
+lvl5_pgtable:
+ .fill PAGE_SIZE, 1, 0
+#endif
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 54c24f0a43d3..91f27ab970ef 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -9,16 +9,42 @@
* contain the entire properly aligned running kernel image.
*
*/
+
+/*
+ * isspace() in linux/ctype.h is expected by next_args() to filter
+ * out "space/lf/tab". While boot/ctype.h conflicts with linux/ctype.h,
+ * since isdigit() is implemented in both of them. Hence disable it
+ * here.
+ */
+#define BOOT_CTYPE_H
+
+/*
+ * _ctype[] in lib/ctype.c is needed by isspace() of linux/ctype.h.
+ * While both lib/ctype.c and lib/cmdline.c will bring EXPORT_SYMBOL
+ * which is meaningless and will cause compiling error in some cases.
+ * So do not include linux/export.h and define EXPORT_SYMBOL(sym)
+ * as empty.
+ */
+#define _LINUX_EXPORT_H
+#define EXPORT_SYMBOL(sym)
+
#include "misc.h"
#include "error.h"
-#include "../boot.h"
+#include "../string.h"
#include <generated/compile.h>
#include <linux/module.h>
#include <linux/uts.h>
#include <linux/utsname.h>
+#include <linux/ctype.h>
#include <generated/utsrelease.h>
+/* Macros used by the included decompressor code below. */
+#define STATIC
+#include <linux/decompress/mm.h>
+
+extern unsigned long get_cmd_line_ptr(void);
+
/* Simplified build-specific string for starting entropy. */
static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
@@ -62,6 +88,11 @@ struct mem_vector {
static bool memmap_too_large;
+
+/* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */
+unsigned long long mem_limit = ULLONG_MAX;
+
+
enum mem_avoid_index {
MEM_AVOID_ZO_RANGE = 0,
MEM_AVOID_INITRD,
@@ -85,49 +116,14 @@ static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
return true;
}
-/**
- * _memparse - Parse a string with mem suffixes into a number
- * @ptr: Where parse begins
- * @retptr: (output) Optional pointer to next char after parse completes
- *
- * Parses a string into a number. The number stored at @ptr is
- * potentially suffixed with K, M, G, T, P, E.
- */
-static unsigned long long _memparse(const char *ptr, char **retptr)
+char *skip_spaces(const char *str)
{
- char *endptr; /* Local pointer to end of parsed string */
-
- unsigned long long ret = simple_strtoull(ptr, &endptr, 0);
-
- switch (*endptr) {
- case 'E':
- case 'e':
- ret <<= 10;
- case 'P':
- case 'p':
- ret <<= 10;
- case 'T':
- case 't':
- ret <<= 10;
- case 'G':
- case 'g':
- ret <<= 10;
- case 'M':
- case 'm':
- ret <<= 10;
- case 'K':
- case 'k':
- ret <<= 10;
- endptr++;
- default:
- break;
- }
-
- if (retptr)
- *retptr = endptr;
-
- return ret;
+ while (isspace(*str))
+ ++str;
+ return (char *)str;
}
+#include "../../../../lib/ctype.c"
+#include "../../../../lib/cmdline.c"
static int
parse_memmap(char *p, unsigned long long *start, unsigned long long *size)
@@ -142,40 +138,41 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size)
return -EINVAL;
oldp = p;
- *size = _memparse(p, &p);
+ *size = memparse(p, &p);
if (p == oldp)
return -EINVAL;
switch (*p) {
- case '@':
- /* Skip this region, usable */
- *start = 0;
- *size = 0;
- return 0;
case '#':
case '$':
case '!':
- *start = _memparse(p + 1, &p);
+ *start = memparse(p + 1, &p);
+ return 0;
+ case '@':
+ /* memmap=nn@ss specifies usable region, should be skipped */
+ *size = 0;
+ /* Fall through */
+ default:
+ /*
+ * If w/o offset, only size specified, memmap=nn[KMG] has the
+ * same behaviour as mem=nn[KMG]. It limits the max address
+ * system can use. Region above the limit should be avoided.
+ */
+ *start = 0;
return 0;
}
return -EINVAL;
}
-static void mem_avoid_memmap(void)
+static void mem_avoid_memmap(char *str)
{
- char arg[128];
+ static int i;
int rc;
- int i;
- char *str;
- /* See if we have any memmap areas */
- rc = cmdline_find_option("memmap", arg, sizeof(arg));
- if (rc <= 0)
+ if (i >= MAX_MEMMAP_REGIONS)
return;
- i = 0;
- str = arg;
while (str && (i < MAX_MEMMAP_REGIONS)) {
int rc;
unsigned long long start, size;
@@ -188,9 +185,14 @@ static void mem_avoid_memmap(void)
if (rc < 0)
break;
str = k;
- /* A usable region that should not be skipped */
- if (size == 0)
+
+ if (start == 0) {
+ /* Store the specified memory limit if size > 0 */
+ if (size > 0)
+ mem_limit = size;
+
continue;
+ }
mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].start = start;
mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].size = size;
@@ -202,6 +204,57 @@ static void mem_avoid_memmap(void)
memmap_too_large = true;
}
+static int handle_mem_memmap(void)
+{
+ char *args = (char *)get_cmd_line_ptr();
+ size_t len = strlen((char *)args);
+ char *tmp_cmdline;
+ char *param, *val;
+ u64 mem_size;
+
+ if (!strstr(args, "memmap=") && !strstr(args, "mem="))
+ return 0;
+
+ tmp_cmdline = malloc(len + 1);
+ if (!tmp_cmdline )
+ error("Failed to allocate space for tmp_cmdline");
+
+ memcpy(tmp_cmdline, args, len);
+ tmp_cmdline[len] = 0;
+ args = tmp_cmdline;
+
+ /* Chew leading spaces */
+ args = skip_spaces(args);
+
+ while (*args) {
+ args = next_arg(args, &param, &val);
+ /* Stop at -- */
+ if (!val && strcmp(param, "--") == 0) {
+ warn("Only '--' specified in cmdline");
+ free(tmp_cmdline);
+ return -1;
+ }
+
+ if (!strcmp(param, "memmap")) {
+ mem_avoid_memmap(val);
+ } else if (!strcmp(param, "mem")) {
+ char *p = val;
+
+ if (!strcmp(p, "nopentium"))
+ continue;
+ mem_size = memparse(p, &p);
+ if (mem_size == 0) {
+ free(tmp_cmdline);
+ return -EINVAL;
+ }
+ mem_limit = mem_size;
+ }
+ }
+
+ free(tmp_cmdline);
+ return 0;
+}
+
/*
* In theory, KASLR can put the kernel anywhere in the range of [16M, 64T).
* The mem_avoid array is used to store the ranges that need to be avoided
@@ -323,7 +376,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
/* We don't need to set a mapping for setup_data. */
/* Mark the memmap regions we need to avoid */
- mem_avoid_memmap();
+ handle_mem_memmap();
#ifdef CONFIG_X86_VERBOSE_BOOTUP
/* Make sure video RAM can be used. */
@@ -432,7 +485,8 @@ static void process_e820_entry(struct boot_e820_entry *entry,
{
struct mem_vector region, overlap;
struct slot_area slot_area;
- unsigned long start_orig;
+ unsigned long start_orig, end;
+ struct boot_e820_entry cur_entry;
/* Skip non-RAM entries. */
if (entry->type != E820_TYPE_RAM)
@@ -446,8 +500,15 @@ static void process_e820_entry(struct boot_e820_entry *entry,
if (entry->addr + entry->size < minimum)
return;
- region.start = entry->addr;
- region.size = entry->size;
+ /* Ignore entries above memory limit */
+ end = min(entry->size + entry->addr, mem_limit);
+ if (entry->addr >= end)
+ return;
+ cur_entry.addr = entry->addr;
+ cur_entry.size = end - entry->addr;
+
+ region.start = cur_entry.addr;
+ region.size = cur_entry.size;
/* Give up if slot area array is full. */
while (slot_area_index < MAX_SLOT_AREA) {
@@ -461,7 +522,7 @@ static void process_e820_entry(struct boot_e820_entry *entry,
region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
/* Did we raise the address above this e820 region? */
- if (region.start > entry->addr + entry->size)
+ if (region.start > cur_entry.addr + cur_entry.size)
return;
/* Reduce size by any delta from the original address. */
@@ -564,9 +625,6 @@ void choose_random_location(unsigned long input,
{
unsigned long random_addr, min_addr;
- /* By default, keep output position unchanged. */
- *virt_addr = *output;
-
if (cmdline_find_option_bool("nokaslr")) {
warn("KASLR disabled: 'nokaslr' on cmdline.");
return;
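The memparse() now pulled in from lib/cmdline.c behaves like the _memparse() helper this patch deletes above. A minimal sketch of that suffix handling, assuming simple_strtoull() from boot/string.c (memparse_sketch() is an illustrative name, not part of the patch):

/* "512M" -> 512 << 20, "2G" -> 2 << 30, plain numbers pass through unchanged. */
static unsigned long long memparse_sketch(const char *ptr, char **retptr)
{
	char *endptr;
	unsigned long long ret = simple_strtoull(ptr, &endptr, 0);

	switch (*endptr) {
	case 'E': case 'e':
		ret <<= 10;	/* fall through */
	case 'P': case 'p':
		ret <<= 10;	/* fall through */
	case 'T': case 't':
		ret <<= 10;	/* fall through */
	case 'G': case 'g':
		ret <<= 10;	/* fall through */
	case 'M': case 'm':
		ret <<= 10;	/* fall through */
	case 'K': case 'k':
		ret <<= 10;
		endptr++;
	default:
		break;
	}

	if (retptr)
		*retptr = endptr;
	return ret;
}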
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index b3c5a5f030ce..00241c815524 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -338,7 +338,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
unsigned long output_len)
{
const unsigned long kernel_total_size = VO__end - VO__text;
- unsigned long virt_addr = (unsigned long)output;
+ unsigned long virt_addr = LOAD_PHYSICAL_ADDR;
/* Retain x86 boot parameters pointer passed from startup_32/64. */
boot_params = rmode;
@@ -390,6 +390,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
#ifdef CONFIG_X86_64
if (heap > 0x3fffffffffffUL)
error("Destination address too large");
+ if (virt_addr + max(output_len, kernel_total_size) > KERNEL_IMAGE_SIZE)
+ error("Destination virtual address is beyond the kernel mapping area");
#else
if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff))
error("Destination address too large");
@@ -397,7 +399,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
#ifndef CONFIG_RELOCATABLE
if ((unsigned long)output != LOAD_PHYSICAL_ADDR)
error("Destination address does not match LOAD_PHYSICAL_ADDR");
- if ((unsigned long)output != virt_addr)
+ if (virt_addr != LOAD_PHYSICAL_ADDR)
error("Destination virtual address changed when not relocatable");
#endif
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 1c8355eadbd1..766a5211f827 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -81,8 +81,6 @@ static inline void choose_random_location(unsigned long input,
unsigned long output_size,
unsigned long *virt_addr)
{
- /* No change from existing output location. */
- *virt_addr = *output;
}
#endif
diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index 56589d0a804b..28029be47fbb 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -63,14 +63,14 @@ static void *alloc_pgt_page(void *context)
static struct alloc_pgt_data pgt_data;
/* The top level page table entry pointer. */
-static unsigned long level4p;
+static unsigned long top_level_pgt;
/*
* Mapping information structure passed to kernel_ident_mapping_init().
* Due to relocation, pointers must be assigned at run time not build time.
*/
static struct x86_mapping_info mapping_info = {
- .pmd_flag = __PAGE_KERNEL_LARGE_EXEC,
+ .page_flag = __PAGE_KERNEL_LARGE_EXEC,
};
/* Locates and clears a region for a new top level page table. */
@@ -91,9 +91,15 @@ void initialize_identity_maps(void)
* If we came here via startup_32(), cr3 will be _pgtable already
* and we must append to the existing area instead of entirely
* overwriting it.
+ *
+ * With 5-level paging, we use '_pgtable' to allocate the p4d page table,
+ * the top-level page table is allocated separately.
+ *
+ * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level
+ * cases. On 4-level paging it's equal to 'top_level_pgt'.
*/
- level4p = read_cr3();
- if (level4p == (unsigned long)_pgtable) {
+ top_level_pgt = read_cr3_pa();
+ if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
debug_putstr("booted via startup_32()\n");
pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
@@ -103,7 +109,7 @@ void initialize_identity_maps(void)
pgt_data.pgt_buf = _pgtable;
pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
- level4p = (unsigned long)alloc_pgt_page(&pgt_data);
+ top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
}
}
@@ -123,7 +129,7 @@ void add_identity_map(unsigned long start, unsigned long size)
return;
/* Build the mapping. */
- kernel_ident_mapping_init(&mapping_info, (pgd_t *)level4p,
+ kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt,
start, end);
}
@@ -134,5 +140,5 @@ void add_identity_map(unsigned long start, unsigned long size)
*/
void finalize_identity_maps(void)
{
- write_cr3(level4p);
+ write_cr3(top_level_pgt);
}
diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S
index 1eb7d298b47d..15d9f74b0008 100644
--- a/arch/x86/boot/copy.S
+++ b/arch/x86/boot/copy.S
@@ -65,23 +65,3 @@ GLOBAL(copy_to_fs)
popw %es
retl
ENDPROC(copy_to_fs)
-
-#if 0 /* Not currently used, but can be enabled as needed */
-GLOBAL(copy_from_gs)
- pushw %ds
- pushw %gs
- popw %ds
- calll memcpy
- popw %ds
- retl
-ENDPROC(copy_from_gs)
-
-GLOBAL(copy_to_gs)
- pushw %es
- pushw %gs
- popw %es
- calll memcpy
- popw %es
- retl
-ENDPROC(copy_to_gs)
-#endif
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 5457b02fc050..630e3664906b 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -122,6 +122,14 @@ unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int bas
return result;
}
+long simple_strtol(const char *cp, char **endp, unsigned int base)
+{
+ if (*cp == '-')
+ return -simple_strtoull(cp + 1, endp, base);
+
+ return simple_strtoull(cp, endp, base);
+}
+
/**
* strlen - Find the length of a string
* @s: The string to be sized
diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h
index 113588ddb43f..f274a50db5fa 100644
--- a/arch/x86/boot/string.h
+++ b/arch/x86/boot/string.h
@@ -22,6 +22,7 @@ extern int strcmp(const char *str1, const char *str2);
extern int strncmp(const char *cs, const char *ct, size_t count);
extern size_t strlen(const char *s);
extern char *strstr(const char *s1, const char *s2);
+extern char *strchr(const char *s, int c);
extern size_t strnlen(const char *s, size_t maxlen);
extern unsigned int atou(const char *s);
extern unsigned long long simple_strtoull(const char *cp, char **endp,
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 34b3fa2889d1..9e32d40d71bd 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,6 +2,8 @@
# Arch-specific CryptoAPI modules.
#
+OBJECT_FILES_NON_STANDARD := y
+
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
diff --git a/arch/x86/crypto/sha1-mb/Makefile b/arch/x86/crypto/sha1-mb/Makefile
index 2f8756375df5..2e14acc3da25 100644
--- a/arch/x86/crypto/sha1-mb/Makefile
+++ b/arch/x86/crypto/sha1-mb/Makefile
@@ -2,6 +2,8 @@
# Arch-specific CryptoAPI modules.
#
+OBJECT_FILES_NON_STANDARD := y
+
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
ifeq ($(avx2_supported),yes)
diff --git a/arch/x86/crypto/sha256-mb/Makefile b/arch/x86/crypto/sha256-mb/Makefile
index 41089e7c400c..45b4fca6c4a8 100644
--- a/arch/x86/crypto/sha256-mb/Makefile
+++ b/arch/x86/crypto/sha256-mb/Makefile
@@ -2,6 +2,8 @@
# Arch-specific CryptoAPI modules.
#
+OBJECT_FILES_NON_STANDARD := y
+
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
ifeq ($(avx2_supported),yes)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 50bc26949e9e..48ef7bb32c42 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -252,6 +252,23 @@ ENTRY(__switch_to_asm)
END(__switch_to_asm)
/*
+ * The unwinder expects the last frame on the stack to always be at the same
+ * offset from the end of the page, which allows it to validate the stack.
* Calling schedule_tail() directly would break that convention because it's an
+ * asmlinkage function so its argument has to be pushed on the stack. This
+ * wrapper creates a proper "end of stack" frame header before the call.
+ */
+ENTRY(schedule_tail_wrapper)
+ FRAME_BEGIN
+
+ pushl %eax
+ call schedule_tail
+ popl %eax
+
+ FRAME_END
+ ret
+ENDPROC(schedule_tail_wrapper)
+/*
* A newly forked process directly context switches into this address.
*
* eax: prev task we switched from
@@ -259,24 +276,15 @@ END(__switch_to_asm)
* edi: kernel thread arg
*/
ENTRY(ret_from_fork)
- FRAME_BEGIN /* help unwinder find end of stack */
-
- /*
- * schedule_tail() is asmlinkage so we have to put its 'prev' argument
- * on the stack.
- */
- pushl %eax
- call schedule_tail
- popl %eax
+ call schedule_tail_wrapper
testl %ebx, %ebx
jnz 1f /* kernel threads are uncommon */
2:
/* When we fork, we trace the syscall return in the child, too. */
- leal FRAME_OFFSET(%esp), %eax
+ movl %esp, %eax
call syscall_return_slowpath
- FRAME_END
jmp restore_all
/* kernel thread */
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 607d72c4a485..a9a8027a6c0e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -36,7 +36,6 @@
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <asm/export.h>
-#include <asm/frame.h>
#include <linux/err.h>
.code64
@@ -266,7 +265,8 @@ return_from_SYSCALL_64:
* If width of "canonical tail" ever becomes variable, this will need
* to be updated to remain correct on both old and new CPUs.
*
- * Change top 16 bits to be the sign-extension of 47th bit
+ * Change top bits to match most significant bit (47th or 56th bit
+ * depending on paging mode) in the address.
*/
shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
@@ -406,19 +406,17 @@ END(__switch_to_asm)
* r12: kernel thread arg
*/
ENTRY(ret_from_fork)
- FRAME_BEGIN /* help unwinder find end of stack */
movq %rax, %rdi
- call schedule_tail /* rdi: 'prev' task parameter */
+ call schedule_tail /* rdi: 'prev' task parameter */
- testq %rbx, %rbx /* from kernel_thread? */
- jnz 1f /* kernel threads are uncommon */
+ testq %rbx, %rbx /* from kernel_thread? */
+ jnz 1f /* kernel threads are uncommon */
2:
- leaq FRAME_OFFSET(%rsp),%rdi /* pt_regs pointer */
+ movq %rsp, %rdi
call syscall_return_slowpath /* returns with IRQs disabled */
TRACE_IRQS_ON /* user mode is traced as IRQS on */
SWAPGS
- FRAME_END
jmp restore_regs_and_iret
1:
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 580b60f5ac83..2de0dd73830a 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1750,6 +1750,8 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
return ret;
}
+static struct attribute_group x86_pmu_attr_group;
+
static int __init init_hw_perf_events(void)
{
struct x86_pmu_quirk *quirk;
@@ -1813,6 +1815,14 @@ static int __init init_hw_perf_events(void)
x86_pmu_events_group.attrs = tmp;
}
+ if (x86_pmu.attrs) {
+ struct attribute **tmp;
+
+ tmp = merge_attr(x86_pmu_attr_group.attrs, x86_pmu.attrs);
+ if (!WARN_ON(!tmp))
+ x86_pmu_attr_group.attrs = tmp;
+ }
+
pr_info("... version: %d\n", x86_pmu.version);
pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
pr_info("... generic registers: %d\n", x86_pmu.num_counters);
@@ -2101,8 +2111,7 @@ static int x86_pmu_event_init(struct perf_event *event)
static void refresh_pce(void *ignored)
{
- if (current->active_mm)
- load_mm_cr4(current->active_mm);
+ load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm));
}
static void x86_pmu_event_mapped(struct perf_event *event)
@@ -2255,7 +2264,7 @@ static struct pmu pmu = {
void arch_perf_update_userpage(struct perf_event *event,
struct perf_event_mmap_page *userpg, u64 now)
{
- struct cyc2ns_data *data;
+ struct cyc2ns_data data;
u64 offset;
userpg->cap_user_time = 0;
@@ -2267,17 +2276,17 @@ void arch_perf_update_userpage(struct perf_event *event,
if (!using_native_sched_clock() || !sched_clock_stable())
return;
- data = cyc2ns_read_begin();
+ cyc2ns_read_begin(&data);
- offset = data->cyc2ns_offset + __sched_clock_offset;
+ offset = data.cyc2ns_offset + __sched_clock_offset;
/*
* Internal timekeeping for enabled/running/stopped times
* is always in the local_clock domain.
*/
userpg->cap_user_time = 1;
- userpg->time_mult = data->cyc2ns_mul;
- userpg->time_shift = data->cyc2ns_shift;
+ userpg->time_mult = data.cyc2ns_mul;
+ userpg->time_shift = data.cyc2ns_shift;
userpg->time_offset = offset - now;
/*
@@ -2289,7 +2298,7 @@ void arch_perf_update_userpage(struct perf_event *event,
userpg->time_zero = offset;
}
- cyc2ns_read_end(data);
+ cyc2ns_read_end();
}
void
@@ -2334,7 +2343,7 @@ static unsigned long get_segment_base(unsigned int segment)
/* IRQs are off, so this synchronizes with smp_store_release */
ldt = lockless_dereference(current->active_mm->context.ldt);
- if (!ldt || idx > ldt->size)
+ if (!ldt || idx > ldt->nr_entries)
return 0;
desc = &ldt->entries[idx];
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index a6d91d4e37a1..31acf2a98394 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -431,11 +431,11 @@ static __initconst const u64 skl_hw_cache_event_ids
[ C(DTLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */
- [ C(RESULT_MISS) ] = 0x608, /* DTLB_LOAD_MISSES.WALK_COMPLETED */
+ [ C(RESULT_MISS) ] = 0xe08, /* DTLB_LOAD_MISSES.WALK_COMPLETED */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */
- [ C(RESULT_MISS) ] = 0x649, /* DTLB_STORE_MISSES.WALK_COMPLETED */
+ [ C(RESULT_MISS) ] = 0xe49, /* DTLB_STORE_MISSES.WALK_COMPLETED */
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0x0,
@@ -3160,6 +3160,19 @@ err:
return -ENOMEM;
}
+static void flip_smm_bit(void *data)
+{
+ unsigned long set = *(unsigned long *)data;
+
+ if (set > 0) {
+ msr_set_bit(MSR_IA32_DEBUGCTLMSR,
+ DEBUGCTLMSR_FREEZE_IN_SMM_BIT);
+ } else {
+ msr_clear_bit(MSR_IA32_DEBUGCTLMSR,
+ DEBUGCTLMSR_FREEZE_IN_SMM_BIT);
+ }
+}
+
static void intel_pmu_cpu_starting(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
@@ -3174,6 +3187,8 @@ static void intel_pmu_cpu_starting(int cpu)
cpuc->lbr_sel = NULL;
+ flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
+
if (!cpuc->shared_regs)
return;
@@ -3595,6 +3610,52 @@ static struct attribute *hsw_events_attrs[] = {
NULL
};
+static ssize_t freeze_on_smi_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%lu\n", x86_pmu.attr_freeze_on_smi);
+}
+
+static DEFINE_MUTEX(freeze_on_smi_mutex);
+
+static ssize_t freeze_on_smi_store(struct device *cdev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long val;
+ ssize_t ret;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val > 1)
+ return -EINVAL;
+
+ mutex_lock(&freeze_on_smi_mutex);
+
+ if (x86_pmu.attr_freeze_on_smi == val)
+ goto done;
+
+ x86_pmu.attr_freeze_on_smi = val;
+
+ get_online_cpus();
+ on_each_cpu(flip_smm_bit, &val, 1);
+ put_online_cpus();
+done:
+ mutex_unlock(&freeze_on_smi_mutex);
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(freeze_on_smi);
+
+static struct attribute *intel_pmu_attrs[] = {
+ &dev_attr_freeze_on_smi.attr,
+ NULL,
+};
+
__init int intel_pmu_init(void)
{
union cpuid10_edx edx;
@@ -3641,6 +3702,8 @@ __init int intel_pmu_init(void)
x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
+
+ x86_pmu.attrs = intel_pmu_attrs;
/*
* Quirk: v2 perfmon does not report fixed-purpose events, so
* assume at least 3 events, when not running in a hypervisor:
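The new freeze_on_smi attribute becomes a writable sysfs file on the x86 PMU device. A minimal user-space sketch, assuming the attribute lands at /sys/devices/cpu/freeze_on_smi (path and helper name are illustrative, not taken from the patch):

#include <fcntl.h>
#include <unistd.h>

/* Writing "1" sets DEBUGCTLMSR_FREEZE_IN_SMM on every CPU via flip_smm_bit();
 * writing "0" clears it again. */
static int set_freeze_on_smi(int enable)
{
	int fd = open("/sys/devices/cpu/freeze_on_smi", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, enable ? "1" : "0", 1) != 1) {
		close(fd);
		return -1;
	}
	return close(fd);
}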
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index f924629836a8..eb261656a320 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -18,7 +18,7 @@ enum {
LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME,
};
-static enum {
+static const enum {
LBR_EIP_FLAGS = 1,
LBR_TSX = 2,
} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = {
@@ -287,7 +287,7 @@ inline u64 lbr_from_signext_quirk_wr(u64 val)
/*
* If quirk is needed, ensure sign extension is 61 bits:
*/
-u64 lbr_from_signext_quirk_rd(u64 val)
+static u64 lbr_from_signext_quirk_rd(u64 val)
{
if (static_branch_unlikely(&lbr_from_quirk_key)) {
/*
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index 9d05c7e67f60..a45e2114a846 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -761,7 +761,7 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = {
X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, hsw_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, hsw_rapl_init),
- X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, hsw_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, hsx_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, hsw_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_rapl_init),
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 758c1aa5009d..44ec523287f6 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1170,7 +1170,7 @@ static int uncore_event_cpu_online(unsigned int cpu)
pmu = type->pmus;
for (i = 0; i < type->num_boxes; i++, pmu++) {
box = pmu->boxes[pkg];
- if (!box && atomic_inc_return(&box->refcnt) == 1)
+ if (box && atomic_inc_return(&box->refcnt) == 1)
uncore_box_init(box);
}
}
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index be3d36254040..53728eea1bed 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -562,6 +562,9 @@ struct x86_pmu {
ssize_t (*events_sysfs_show)(char *page, u64 config);
struct attribute **cpu_events;
+ unsigned long attr_freeze_on_smi;
+ struct attribute **attrs;
+
/*
* CPU Hotplug hooks
*/
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 7acb51c49fec..7a9df3beb89b 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -32,6 +32,7 @@
#define _ASM_ADD __ASM_SIZE(add)
#define _ASM_SUB __ASM_SIZE(sub)
#define _ASM_XADD __ASM_SIZE(xadd)
+#define _ASM_MUL __ASM_SIZE(mul)
#define _ASM_AX __ASM_REG(ax)
#define _ASM_BX __ASM_REG(bx)
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index caa5798c92f4..33380b871463 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -246,19 +246,6 @@ static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
return c;
}
-/**
- * atomic_inc_short - increment of a short integer
- * @v: pointer to type int
- *
- * Atomically adds 1 to @v
- * Returns the new value of @u
- */
-static __always_inline short int atomic_inc_short(short int *v)
-{
- asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v));
- return *v;
-}
-
#ifdef CONFIG_X86_32
# include <asm/atomic64_32.h>
#else
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 2f77bcefe6b4..d2ff779f347e 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -74,7 +74,7 @@ struct efi_scratch {
__kernel_fpu_begin(); \
\
if (efi_scratch.use_pgd) { \
- efi_scratch.prev_cr3 = read_cr3(); \
+ efi_scratch.prev_cr3 = __read_cr3(); \
write_cr3((unsigned long)efi_scratch.efi_pgt); \
__flush_tlb_all(); \
} \
diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h
index b8ad261d11dc..c66d19e3c23e 100644
--- a/arch/x86/include/asm/extable.h
+++ b/arch/x86/include/asm/extable.h
@@ -29,6 +29,7 @@ struct pt_regs;
} while (0)
extern int fixup_exception(struct pt_regs *regs, int trapnr);
+extern int fixup_bug(struct pt_regs *regs, int trapnr);
extern bool ex_has_fault_handler(unsigned long ip);
extern void early_fixup_exception(struct pt_regs *regs, int trapnr);
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 59405a248fc2..9b76cd331990 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -22,8 +22,8 @@ typedef struct {
#ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
- unsigned int irq_tlb_count;
#endif
+ unsigned int irq_tlb_count;
#ifdef CONFIG_X86_THERMAL_VECTOR
unsigned int irq_thermal_count;
#endif
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 737da62bfeb0..474eb8c66fee 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -4,8 +4,9 @@
struct x86_mapping_info {
void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
void *context; /* context for alloc_pgt_page */
- unsigned long pmd_flag; /* page flag for PMD entry */
+ unsigned long page_flag; /* page flag for PMD or PUD entry */
unsigned long offset; /* ident mapping offset */
+ bool direct_gbpages; /* PUD level 1GB page support */
};
int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 055962615779..722d0e568863 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -296,6 +296,7 @@ struct x86_emulate_ctxt {
bool perm_ok; /* do not check permissions if true */
bool ud; /* inject an #UD if host doesn't support insn */
+ bool tf; /* TF value before instruction (after for syscall/sysret) */
bool have_exception;
struct x86_exception exception;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f5bddf92faba..695605eb1dfb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -43,7 +43,7 @@
#define KVM_PRIVATE_MEM_SLOTS 3
#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
-#define KVM_HALT_POLL_NS_DEFAULT 400000
+#define KVM_HALT_POLL_NS_DEFAULT 200000
#define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS
@@ -1020,6 +1020,8 @@ struct kvm_x86_ops {
void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t offset, unsigned long mask);
+ int (*write_log_dirty)(struct kvm_vcpu *vcpu);
+
/* pmu operations of sub-arch */
const struct kvm_pmu_ops *pmu_ops;
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 4fd5195deed0..3f9a3d2a5209 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -266,6 +266,7 @@ static inline int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *s
#endif
int mce_available(struct cpuinfo_x86 *c);
+bool mce_is_memory_error(struct mce *m);
DECLARE_PER_CPU(unsigned, mce_exception_count);
DECLARE_PER_CPU(unsigned, mce_poll_count);
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index f9813b6d8b80..79b647a7ebd0 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -37,12 +37,6 @@ typedef struct {
#endif
} mm_context_t;
-#ifdef CONFIG_SMP
void leave_mm(int cpu);
-#else
-static inline void leave_mm(int cpu)
-{
-}
-#endif
#endif /* _ASM_X86_MMU_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 68b329d77b3a..ecfcb6643c9b 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -47,7 +47,7 @@ struct ldt_struct {
* allocations, but it's not worth trying to optimize.
*/
struct desc_struct *entries;
- unsigned int size;
+ unsigned int nr_entries;
};
/*
@@ -87,22 +87,46 @@ static inline void load_mm_ldt(struct mm_struct *mm)
*/
if (unlikely(ldt))
- set_ldt(ldt->entries, ldt->size);
+ set_ldt(ldt->entries, ldt->nr_entries);
else
clear_LDT();
#else
clear_LDT();
#endif
+}
+
+static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
+{
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ /*
+ * Load the LDT if either the old or new mm had an LDT.
+ *
+ * An mm will never go from having an LDT to not having an LDT. Two
+ * mms never share an LDT, so we don't gain anything by checking to
+ * see whether the LDT changed. There's also no guarantee that
+ * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL,
+ * then prev->context.ldt will also be non-NULL.
+ *
+ * If we really cared, we could optimize the case where prev == next
+ * and we're exiting lazy mode. Most of the time, if this happens,
+ * we don't actually need to reload LDTR, but modify_ldt() is mostly
+ * used by legacy code and emulators where we don't need this level of
+ * performance.
+ *
+ * This uses | instead of || because it generates better code.
+ */
+ if (unlikely((unsigned long)prev->context.ldt |
+ (unsigned long)next->context.ldt))
+ load_mm_ldt(next);
+#endif
DEBUG_LOCKS_WARN_ON(preemptible());
}
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
-#ifdef CONFIG_SMP
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
-#endif
}
static inline int init_new_context(struct task_struct *tsk,
@@ -220,18 +244,6 @@ static inline int vma_pkey(struct vm_area_struct *vma)
}
#endif
-static inline bool __pkru_allows_pkey(u16 pkey, bool write)
-{
- u32 pkru = read_pkru();
-
- if (!__pkru_allows_read(pkru, pkey))
- return false;
- if (write && !__pkru_allows_write(pkru, pkey))
- return false;
-
- return true;
-}
-
/*
* We only want to enforce protection keys on the current process
* because we effectively have no access to PKRU for other
@@ -268,4 +280,23 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
return __pkru_allows_pkey(vma_pkey(vma), write);
}
+
+/*
+ * This can be used from process context to figure out what the value of
+ * CR3 is without needing to do a (slow) __read_cr3().
+ *
+ * It's intended to be used for code like KVM that sneakily changes CR3
+ * and needs to restore it. It needs to be used very carefully.
+ */
+static inline unsigned long __get_current_cr3_fast(void)
+{
+ unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
+
+ /* For now, be very restrictive about when this can be called. */
+ VM_WARN_ON(in_nmi() || !in_atomic());
+
+ VM_BUG_ON(cr3 != __read_cr3());
+ return cr3;
+}
+
#endif /* _ASM_X86_MMU_CONTEXT_H */
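__get_current_cr3_fast() is aimed at callers such as KVM that temporarily switch CR3 and need to restore it without paying for a __read_cr3(). A minimal sketch of that save/restore pattern, under the restrictions the comment spells out (atomic context, not NMI); the function and parameter names are illustrative only:

static void run_on_private_pgtable_sketch(pgd_t *private_pgd)
{
	unsigned long saved_cr3 = __get_current_cr3_fast();

	write_cr3(__pa(private_pgd));
	/* ... work that needs the private page table ... */
	write_cr3(saved_cr3);
}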
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index fba100713924..d5acc27ed1cc 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -2,8 +2,7 @@
#define _ASM_X86_MSHYPER_H
#include <linux/types.h>
-#include <linux/interrupt.h>
-#include <linux/clocksource.h>
+#include <linux/atomic.h>
#include <asm/hyperv.h>
/*
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 673f9ac50f6d..18b162322eff 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -137,6 +137,8 @@
#define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9)
#define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10)
#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11)
+#define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14
+#define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
#define MSR_PEBS_FRONTEND 0x000003f7
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 55fa56fe4e45..9ccac1926587 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -61,7 +61,7 @@ static inline void write_cr2(unsigned long x)
PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
}
-static inline unsigned long read_cr3(void)
+static inline unsigned long __read_cr3(void)
{
return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
}
@@ -118,7 +118,7 @@ static inline u64 paravirt_read_msr(unsigned msr)
static inline void paravirt_write_msr(unsigned msr,
unsigned low, unsigned high)
{
- return PVOP_VCALL3(pv_cpu_ops.write_msr, msr, low, high);
+ PVOP_VCALL3(pv_cpu_ops.write_msr, msr, low, high);
}
static inline u64 paravirt_read_msr_safe(unsigned msr, int *err)
@@ -312,11 +312,9 @@ static inline void __flush_tlb_single(unsigned long addr)
}
static inline void flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end)
+ const struct flush_tlb_info *info)
{
- PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end);
+ PVOP_VCALL2(pv_mmu_ops.flush_tlb_others, cpumask, info);
}
static inline int paravirt_pgd_alloc(struct mm_struct *mm)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 7465d6fe336f..cb976bab6299 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -51,6 +51,7 @@ struct mm_struct;
struct desc_struct;
struct task_struct;
struct cpumask;
+struct flush_tlb_info;
/*
* Wrapper type for pointers to code which uses the non-standard
@@ -223,9 +224,7 @@ struct pv_mmu_ops {
void (*flush_tlb_kernel)(void);
void (*flush_tlb_single)(unsigned long addr);
void (*flush_tlb_others)(const struct cpumask *cpus,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end);
+ const struct flush_tlb_info *info);
/* Hooks for allocating and freeing a pagetable top-level */
int (*pgd_alloc)(struct mm_struct *mm);
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 50d35e3185f5..c8821bab938f 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -212,4 +212,51 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
+#define gup_get_pte gup_get_pte
+/*
+ * WARNING: only to be used in the get_user_pages_fast() implementation.
+ *
+ * With get_user_pages_fast(), we walk down the pagetables without taking
+ * any locks. For this we would like to load the pointers atomically,
+ * but that is not possible (without expensive cmpxchg8b) on PAE. What
+ * we do have is the guarantee that a PTE will only either go from not
+ * present to present, or present to not present or both -- it will not
+ * switch to a completely different present page without a TLB flush in
+ * between; something that we are blocking by holding interrupts off.
+ *
+ * Setting ptes from not present to present goes:
+ *
+ * ptep->pte_high = h;
+ * smp_wmb();
+ * ptep->pte_low = l;
+ *
+ * And present to not present goes:
+ *
+ * ptep->pte_low = 0;
+ * smp_wmb();
+ * ptep->pte_high = 0;
+ *
+ * We must ensure here that the load of pte_low sees 'l' iff pte_high
+ * sees 'h'. We load pte_high *after* loading pte_low, which ensures we
+ * don't see an older value of pte_high. *Then* we recheck pte_low,
+ * which ensures that we haven't picked up a changed pte high. We might
+ * have gotten rubbish values from pte_low and pte_high, but we are
+ * guaranteed that pte_low will not have the present bit set *unless*
+ * it is 'l'. Because get_user_pages_fast() only operates on present ptes
+ * we're safe.
+ */
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+ pte_t pte;
+
+ do {
+ pte.pte_low = ptep->pte_low;
+ smp_rmb();
+ pte.pte_high = ptep->pte_high;
+ smp_rmb();
+ } while (unlikely(pte.pte_low != ptep->pte_low));
+
+ return pte;
+}
+
#endif /* _ASM_X86_PGTABLE_3LEVEL_H */
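The load protocol in gup_get_pte() above pairs with the store side described in its comment; a minimal sketch of the not-present-to-present store order (mirroring native_set_pte() in this file; the _sketch name is illustrative):

static inline void set_pte_present_sketch(pte_t *ptep, pte_t pte)
{
	/* Publish the high word first ... */
	ptep->pte_high = pte.pte_high;
	smp_wmb();
	/* ... so a reader never sees the present bit in pte_low paired with a
	 * stale pte_high. */
	ptep->pte_low = pte.pte_low;
}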
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index f5af95a0c6b8..77037b6f1caa 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -244,6 +244,11 @@ static inline int pud_devmap(pud_t pud)
return 0;
}
#endif
+
+static inline int pgd_devmap(pgd_t pgd)
+{
+ return 0;
+}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -917,7 +922,7 @@ extern pgd_t trampoline_pgd_entry;
static inline void __meminit init_trampoline_default(void)
{
/* Default trampoline pgd value */
- trampoline_pgd_entry = init_level4_pgt[pgd_index(__PAGE_OFFSET)];
+ trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
}
# ifdef CONFIG_RANDOMIZE_MEMORY
void __meminit init_trampoline(void);
@@ -1185,6 +1190,54 @@ static inline u16 pte_flags_pkey(unsigned long pte_flags)
#endif
}
+static inline bool __pkru_allows_pkey(u16 pkey, bool write)
+{
+ u32 pkru = read_pkru();
+
+ if (!__pkru_allows_read(pkru, pkey))
+ return false;
+ if (write && !__pkru_allows_write(pkru, pkey))
+ return false;
+
+ return true;
+}
+
+/*
+ * 'pteval' can come from a PTE, PMD or PUD. We only check
+ * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
+ * same value on all 3 types.
+ */
+static inline bool __pte_access_permitted(unsigned long pteval, bool write)
+{
+ unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
+
+ if (write)
+ need_pte_bits |= _PAGE_RW;
+
+ if ((pteval & need_pte_bits) != need_pte_bits)
+ return 0;
+
+ return __pkru_allows_pkey(pte_flags_pkey(pteval), write);
+}
+
+#define pte_access_permitted pte_access_permitted
+static inline bool pte_access_permitted(pte_t pte, bool write)
+{
+ return __pte_access_permitted(pte_val(pte), write);
+}
+
+#define pmd_access_permitted pmd_access_permitted
+static inline bool pmd_access_permitted(pmd_t pmd, bool write)
+{
+ return __pte_access_permitted(pmd_val(pmd), write);
+}
+
+#define pud_access_permitted pud_access_permitted
+static inline bool pud_access_permitted(pud_t pud, bool write)
+{
+ return __pte_access_permitted(pud_val(pud), write);
+}
+
#include <asm-generic/pgtable.h>
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 9991224f6238..2160c1fee920 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -14,15 +14,17 @@
#include <linux/bitops.h>
#include <linux/threads.h>
+extern p4d_t level4_kernel_pgt[512];
+extern p4d_t level4_ident_pgt[512];
extern pud_t level3_kernel_pgt[512];
extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
extern pmd_t level2_fixmap_pgt[512];
extern pmd_t level2_ident_pgt[512];
extern pte_t level1_fixmap_pgt[512];
-extern pgd_t init_level4_pgt[];
+extern pgd_t init_top_pgt[];
-#define swapper_pg_dir init_level4_pgt
+#define swapper_pg_dir init_top_pgt
extern void paging_init(void);
@@ -227,6 +229,20 @@ extern void cleanup_highmap(void);
extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
-#endif /* !__ASSEMBLY__ */
+#define gup_fast_permitted gup_fast_permitted
+static inline bool gup_fast_permitted(unsigned long start, int nr_pages,
+ int write)
+{
+ unsigned long len, end;
+
+ len = (unsigned long)nr_pages << PAGE_SHIFT;
+ end = start + len;
+ if (end < start)
+ return false;
+ if (end >> __VIRTUAL_MASK_SHIFT)
+ return false;
+ return true;
+}
+#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
index d5a22bac9988..0ff8fe71b255 100644
--- a/arch/x86/include/asm/pmem.h
+++ b/arch/x86/include/asm/pmem.h
@@ -98,7 +98,7 @@ static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes,
if (bytes < 8) {
if (!IS_ALIGNED(dest, 4) || (bytes != 4))
- arch_wb_cache_pmem(addr, 1);
+ arch_wb_cache_pmem(addr, bytes);
} else {
if (!IS_ALIGNED(dest, 8)) {
dest = ALIGN(dest, boot_cpu_data.x86_clflush_size);
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 39fb618e2211..79aa2f98398d 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -8,4 +8,40 @@
#else
#define X86_VM_MASK 0 /* No VM86 support */
#endif
+
+/*
+ * CR3's layout varies depending on several things.
+ *
+ * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
+ * If PAE is enabled, then CR3[11:5] is part of the PDPT address
+ * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
+ * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
+ * CR3[2:0] and CR3[11:5] are ignored.
+ *
+ * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
+ *
+ * CR3[63] is always read as zero. If CR4.PCIDE is set, then CR3[63] may be
+ * written as 1 to prevent the write to CR3 from flushing the TLB.
+ *
+ * On systems with SME, one bit (in a variable position!) is stolen to indicate
+ * that the top-level paging structure is encrypted.
+ *
+ * All of the remaining bits indicate the physical address of the top-level
+ * paging structure.
+ *
+ * CR3_ADDR_MASK is the mask used by read_cr3_pa().
+ */
+#ifdef CONFIG_X86_64
+/* Mask off the address space ID bits. */
+#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
+#define CR3_PCID_MASK 0xFFFull
+#else
+/*
+ * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
+ * a tiny bit of code size by setting all the bits.
+ */
+#define CR3_ADDR_MASK 0xFFFFFFFFull
+#define CR3_PCID_MASK 0ull
+#endif
+
#endif /* _ASM_X86_PROCESSOR_FLAGS_H */
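A short sketch of how the two masks split a CR3 image on a 64-bit, PCID-capable system; the CR3 value is made up purely for illustration:

static void decode_cr3_example(void)
{
	unsigned long cr3  = 0x000000012345a00bUL;	/* example value only */
	unsigned long pa   = cr3 & CR3_ADDR_MASK;	/* 0x12345a000 */
	unsigned long pcid = cr3 & CR3_PCID_MASK;	/* 0xb */

	pr_info("CR3 %#lx -> top-level table @ %#lx, PCID %#lx\n",
		cr3, pa, pcid);
}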
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 3cada998a402..2e1696294af5 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -231,6 +231,14 @@ native_cpuid_reg(ebx)
native_cpuid_reg(ecx)
native_cpuid_reg(edx)
+/*
+ * Friendlier CR3 helpers.
+ */
+static inline unsigned long read_cr3_pa(void)
+{
+ return __read_cr3() & CR3_ADDR_MASK;
+}
+
static inline void load_cr3(pgd_t *pgdir)
{
write_cr3(__pa(pgdir));
@@ -860,8 +868,6 @@ extern unsigned long KSTK_ESP(struct task_struct *task);
#endif /* CONFIG_X86_64 */
-extern unsigned long thread_saved_pc(struct task_struct *tsk);
-
extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
unsigned long new_sp);
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 12af3e35edfa..9efaabf5b54b 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -39,7 +39,7 @@ static inline void native_write_cr2(unsigned long val)
asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
}
-static inline unsigned long native_read_cr3(void)
+static inline unsigned long __native_read_cr3(void)
{
unsigned long val;
asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
@@ -159,9 +159,13 @@ static inline void write_cr2(unsigned long x)
native_write_cr2(x);
}
-static inline unsigned long read_cr3(void)
+/*
+ * Careful! CR3 contains more than just an address. You probably want
+ * read_cr3_pa() instead.
+ */
+static inline unsigned long __read_cr3(void)
{
- return native_read_cr3();
+ return __native_read_cr3();
}
static inline void write_cr3(unsigned long x)
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 27e9f9d769b8..2016962103df 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -29,11 +29,9 @@ struct cyc2ns_data {
u32 cyc2ns_mul;
u32 cyc2ns_shift;
u64 cyc2ns_offset;
- u32 __count;
- /* u32 hole */
-}; /* 24 bytes -- do not grow */
+}; /* 16 bytes */
-extern struct cyc2ns_data *cyc2ns_read_begin(void);
-extern void cyc2ns_read_end(struct cyc2ns_data *);
+extern void cyc2ns_read_begin(struct cyc2ns_data *);
+extern void cyc2ns_read_end(void);
#endif /* _ASM_X86_TIMER_H */
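With the new prototype, callers supply the cyc2ns_data snapshot themselves rather than borrowing a pointer into the per-CPU copy. A sketch of the updated calling convention (cycles_to_ns_example() is an invented name; the real conversion lives in tsc.c):

static u64 cycles_to_ns_example(u64 cycles)
{
	struct cyc2ns_data data;
	u64 ns;

	cyc2ns_read_begin(&data);	/* fills the caller-provided copy */
	ns  = mul_u64_u32_shr(cycles, data.cyc2ns_mul, data.cyc2ns_shift);
	ns += data.cyc2ns_offset;
	cyc2ns_read_end();		/* no argument any more */

	return ns;
}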
diff --git a/arch/x86/include/asm/tlbbatch.h b/arch/x86/include/asm/tlbbatch.h
new file mode 100644
index 000000000000..f4a6ff352a0e
--- /dev/null
+++ b/arch/x86/include/asm/tlbbatch.h
@@ -0,0 +1,14 @@
+#ifndef _ARCH_X86_TLBBATCH_H
+#define _ARCH_X86_TLBBATCH_H
+
+#include <linux/cpumask.h>
+
+struct arch_tlbflush_unmap_batch {
+ /*
+ * Each bit set is a CPU that potentially has a TLB entry for one of
+ * the PFNs being flushed.
+ */
+ struct cpumask cpumask;
+};
+
+#endif /* _ARCH_X86_TLBBATCH_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6ed9ea469b48..50ea3482e1d1 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -7,6 +7,7 @@
#include <asm/processor.h>
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
+#include <asm/smp.h>
static inline void __invpcid(unsigned long pcid, unsigned long addr,
unsigned long type)
@@ -65,10 +66,14 @@ static inline void invpcid_flush_all_nonglobals(void)
#endif
struct tlb_state {
-#ifdef CONFIG_SMP
- struct mm_struct *active_mm;
+ /*
+ * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
+ * are on. This means that it may not match current->active_mm,
+ * which will contain the previous user mm when we're in lazy TLB
+ * mode even if we've already switched back to swapper_pg_dir.
+ */
+ struct mm_struct *loaded_mm;
int state;
-#endif
/*
* Access to this CR4 shadow and to H/W CR4 is protected by
@@ -151,7 +156,7 @@ static inline void __native_flush_tlb(void)
* back:
*/
preempt_disable();
- native_write_cr3(native_read_cr3());
+ native_write_cr3(__native_read_cr3());
preempt_enable();
}
@@ -220,84 +225,16 @@ static inline void __flush_tlb_one(unsigned long addr)
* - flush_tlb_page(vma, vmaddr) flushes one page
* - flush_tlb_range(vma, start, end) flushes a range of pages
* - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
- * - flush_tlb_others(cpumask, mm, start, end) flushes TLBs on other cpus
+ * - flush_tlb_others(cpumask, info) flushes TLBs on other cpus
*
* ..but the i386 has somewhat limited tlb flushing capabilities,
* and page-granular flushes are available only on i486 and up.
*/
-
-#ifndef CONFIG_SMP
-
-/* "_up" is for UniProcessor.
- *
- * This is a helper for other header functions. *Not* intended to be called
- * directly. All global TLB flushes need to either call this, or to bump the
- * vm statistics themselves.
- */
-static inline void __flush_tlb_up(void)
-{
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- __flush_tlb();
-}
-
-static inline void flush_tlb_all(void)
-{
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- __flush_tlb_all();
-}
-
-static inline void local_flush_tlb(void)
-{
- __flush_tlb_up();
-}
-
-static inline void flush_tlb_mm(struct mm_struct *mm)
-{
- if (mm == current->active_mm)
- __flush_tlb_up();
-}
-
-static inline void flush_tlb_page(struct vm_area_struct *vma,
- unsigned long addr)
-{
- if (vma->vm_mm == current->active_mm)
- __flush_tlb_one(addr);
-}
-
-static inline void flush_tlb_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
-{
- if (vma->vm_mm == current->active_mm)
- __flush_tlb_up();
-}
-
-static inline void flush_tlb_mm_range(struct mm_struct *mm,
- unsigned long start, unsigned long end, unsigned long vmflag)
-{
- if (mm == current->active_mm)
- __flush_tlb_up();
-}
-
-static inline void native_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end)
-{
-}
-
-static inline void reset_lazy_tlbstate(void)
-{
-}
-
-static inline void flush_tlb_kernel_range(unsigned long start,
- unsigned long end)
-{
- flush_tlb_all();
-}
-
-#else /* SMP */
-
-#include <asm/smp.h>
+struct flush_tlb_info {
+ struct mm_struct *mm;
+ unsigned long start;
+ unsigned long end;
+};
#define local_flush_tlb() __flush_tlb()
@@ -307,29 +244,32 @@ static inline void flush_tlb_kernel_range(unsigned long start,
flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
extern void flush_tlb_all(void);
-extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag);
extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
+static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
+{
+ flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
+}
+
void native_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long start, unsigned long end);
+ const struct flush_tlb_info *info);
#define TLBSTATE_OK 1
#define TLBSTATE_LAZY 2
-static inline void reset_lazy_tlbstate(void)
+static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm)
{
- this_cpu_write(cpu_tlbstate.state, 0);
- this_cpu_write(cpu_tlbstate.active_mm, &init_mm);
+ cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
}
-#endif /* SMP */
+extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
#ifndef CONFIG_PARAVIRT
-#define flush_tlb_others(mask, mm, start, end) \
- native_flush_tlb_others(mask, mm, start, end)
+#define flush_tlb_others(mask, info) \
+ native_flush_tlb_others(mask, info)
#endif
#endif /* _ASM_X86_TLBFLUSH_H */
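The interface change bundles the (mm, start, end) triple into one struct flush_tlb_info, so a remote-flush caller builds the descriptor once and hands the same object to flush_tlb_others(). A sketch of the calling convention, not the real flush_tlb_mm_range() body; it assumes preemption is already disabled:

static void example_flush_range(struct mm_struct *mm,
				unsigned long start, unsigned long end)
{
	struct flush_tlb_info info = {
		.mm    = mm,
		.start = start,
		.end   = end,
	};

	/* Only bother if some other CPU might be caching this mm. */
	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), &info);
}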
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 68766b276d9e..a059aac9e937 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -319,10 +319,10 @@ do { \
#define __get_user_asm_u64(x, ptr, retval, errret) \
({ \
__typeof__(ptr) __ptr = (ptr); \
- asm volatile(ASM_STAC "\n" \
+ asm volatile("\n" \
"1: movl %2,%%eax\n" \
"2: movl %3,%%edx\n" \
- "3: " ASM_CLAC "\n" \
+ "3:\n" \
".section .fixup,\"ax\"\n" \
"4: mov %4,%0\n" \
" xorl %%eax,%%eax\n" \
@@ -331,7 +331,7 @@ do { \
".previous\n" \
_ASM_EXTABLE(1b, 4b) \
_ASM_EXTABLE(2b, 4b) \
- : "=r" (retval), "=A"(x) \
+ : "=r" (retval), "=&A"(x) \
: "m" (__m(__ptr)), "m" __m(((u32 *)(__ptr)) + 1), \
"i" (errret), "0" (retval)); \
})
@@ -703,14 +703,15 @@ extern struct movsl_mask {
#define unsafe_put_user(x, ptr, err_label) \
do { \
int __pu_err; \
- __put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \
+ __typeof__(*(ptr)) __pu_val = (x); \
+ __put_user_size(__pu_val, (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \
if (unlikely(__pu_err)) goto err_label; \
} while (0)
#define unsafe_get_user(x, ptr, err_label) \
do { \
int __gu_err; \
- unsigned long __gu_val; \
+ __inttype(*(ptr)) __gu_val; \
__get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
if (unlikely(__gu_err)) goto err_label; \
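The widened __gu_val type matters mostly for 64-bit reads done inside a user_access_begin()/user_access_end() section; the call sites themselves keep the same shape. A sketch, assuming the caller has already validated the pointer with access_ok() and using the no-argument user_access_begin() this tree provides:

static int read_one_u64(u64 __user *uptr, u64 *val)
{
	user_access_begin();			/* STAC */
	unsafe_get_user(*val, uptr, efault);
	user_access_end();			/* CLAC */
	return 0;

efault:
	user_access_end();
	return -EFAULT;
}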
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index 6686820feae9..b5a32231abd8 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_UV_UV_H
#define _ASM_X86_UV_UV_H
+#include <asm/tlbflush.h>
+
enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
struct cpumask;
@@ -15,10 +17,7 @@ extern void uv_cpu_init(void);
extern void uv_nmi_init(void);
extern void uv_system_init(void);
extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end,
- unsigned int cpu);
+ const struct flush_tlb_info *info);
#else /* X86_UV */
@@ -28,8 +27,8 @@ static inline int is_uv_hubless(void) { return 0; }
static inline void uv_cpu_init(void) { }
static inline void uv_system_init(void) { }
static inline const struct cpumask *
-uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
- unsigned long start, unsigned long end, unsigned int cpu)
+uv_flush_tlb_others(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
{ return cpumask; }
#endif /* X86_UV */
diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild
index 3dec769cadf7..83b6e9a0dce4 100644
--- a/arch/x86/include/uapi/asm/Kbuild
+++ b/arch/x86/include/uapi/asm/Kbuild
@@ -4,62 +4,3 @@ include include/uapi/asm-generic/Kbuild.asm
genhdr-y += unistd_32.h
genhdr-y += unistd_64.h
genhdr-y += unistd_x32.h
-header-y += a.out.h
-header-y += auxvec.h
-header-y += bitsperlong.h
-header-y += boot.h
-header-y += bootparam.h
-header-y += byteorder.h
-header-y += debugreg.h
-header-y += e820.h
-header-y += errno.h
-header-y += fcntl.h
-header-y += hw_breakpoint.h
-header-y += hyperv.h
-header-y += ioctl.h
-header-y += ioctls.h
-header-y += ipcbuf.h
-header-y += ist.h
-header-y += kvm.h
-header-y += kvm_para.h
-header-y += kvm_perf.h
-header-y += ldt.h
-header-y += mce.h
-header-y += mman.h
-header-y += msgbuf.h
-header-y += msr-index.h
-header-y += msr.h
-header-y += mtrr.h
-header-y += param.h
-header-y += perf_regs.h
-header-y += poll.h
-header-y += posix_types.h
-header-y += posix_types_32.h
-header-y += posix_types_64.h
-header-y += posix_types_x32.h
-header-y += prctl.h
-header-y += processor-flags.h
-header-y += ptrace-abi.h
-header-y += ptrace.h
-header-y += resource.h
-header-y += sembuf.h
-header-y += setup.h
-header-y += shmbuf.h
-header-y += sigcontext.h
-header-y += sigcontext32.h
-header-y += siginfo.h
-header-y += signal.h
-header-y += socket.h
-header-y += sockios.h
-header-y += stat.h
-header-y += statfs.h
-header-y += svm.h
-header-y += swab.h
-header-y += termbits.h
-header-y += termios.h
-header-y += types.h
-header-y += ucontext.h
-header-y += unistd.h
-header-y += vm86.h
-header-y += vmx.h
-header-y += vsyscall.h
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index 432df4b1baec..f4fef5a24ebd 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -34,16 +34,10 @@
#define HV_X64_MSR_REFERENCE_TSC 0x40000021
/*
- * There is a single feature flag that signifies the presence of the MSR
- * that can be used to retrieve both the local APIC Timer frequency as
- * well as the TSC frequency.
+ * There is a single feature flag that signifies if the partition has access
+ * to MSRs with local APIC and TSC frequencies.
*/
-
-/* Local APIC timer frequency MSR (HV_X64_MSR_APIC_FREQUENCY) is available */
-#define HV_X64_MSR_APIC_FREQUENCY_AVAILABLE (1 << 11)
-
-/* TSC frequency MSR (HV_X64_MSR_TSC_FREQUENCY) is available */
-#define HV_X64_MSR_TSC_FREQUENCY_AVAILABLE (1 << 11)
+#define HV_X64_ACCESS_FREQUENCY_MSRS (1 << 11)
/*
* Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
@@ -73,6 +67,9 @@
*/
#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8)
+/* Frequency MSRs available */
+#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE (1 << 8)
+
/* Crash MSR available */
#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE (1 << 10)
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 567de50a4c2a..185f3d10c194 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -104,6 +104,8 @@
#define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT)
#define X86_CR4_OSXMMEXCPT_BIT 10 /* enable unmasked SSE exceptions */
#define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT)
+#define X86_CR4_LA57_BIT 12 /* enable 5-level page tables */
+#define X86_CR4_LA57 _BITUL(X86_CR4_LA57_BIT)
#define X86_CR4_VMXE_BIT 13 /* enable VMX virtualization */
#define X86_CR4_VMXE _BITUL(X86_CR4_VMXE_BIT)
#define X86_CR4_SMXE_BIT 14 /* enable safer mode (TXT) */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4b994232cb57..a01892bdd61a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -18,6 +18,7 @@ CFLAGS_REMOVE_pvclock.o = -pg
CFLAGS_REMOVE_kvmclock.o = -pg
CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
+CFLAGS_REMOVE_head64.o = -pg
endif
KASAN_SANITIZE_head$(BITS).o := n
@@ -29,6 +30,7 @@ OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_test_nx.o := y
+OBJECT_FILES_NON_STANDARD_paravirt_patch_$(BITS).o := y
# If instrumentation of this dir is enabled, boot hangs during first second.
# Probably could be more selective here, but note that files related to irqs,
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 26b78d86f25a..85a9e17e0dbc 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,3 +1,5 @@
+OBJECT_FILES_NON_STANDARD_wakeup_$(BITS).o := y
+
obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
obj-$(CONFIG_ACPI_APEI) += apei.o
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index c5b8f760473c..32e14d137416 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -409,8 +409,13 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
memcpy(insnbuf, replacement, a->replacementlen);
insnbuf_sz = a->replacementlen;
- /* 0xe8 is a relative jump; fix the offset. */
- if (*insnbuf == 0xe8 && a->replacementlen == 5) {
+ /*
+ * 0xe8 is a relative jump; fix the offset.
+ *
+ * Instruction length is checked before the opcode to avoid
+ * accessing uninitialized bytes for zero-length replacements.
+ */
+ if (a->replacementlen == 5 && *insnbuf == 0xe8) {
*(s32 *)(insnbuf + 1) += replacement - instr;
DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
*(s32 *)(insnbuf + 1),
diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c
index ae50d3454d78..81ff48905623 100644
--- a/arch/x86/kernel/apic/htirq.c
+++ b/arch/x86/kernel/apic/htirq.c
@@ -150,7 +150,7 @@ static const struct irq_domain_ops htirq_domain_ops = {
.deactivate = htirq_domain_deactivate,
};
-void arch_init_htirq_domain(struct irq_domain *parent)
+void __init arch_init_htirq_domain(struct irq_domain *parent)
{
if (disable_apic)
return;
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 347bb9f65737..247880fc29f9 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1200,28 +1200,6 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
static struct irq_chip ioapic_chip, ioapic_ir_chip;
-#ifdef CONFIG_X86_32
-static inline int IO_APIC_irq_trigger(int irq)
-{
- int apic, idx, pin;
-
- for_each_ioapic_pin(apic, pin) {
- idx = find_irq_entry(apic, pin, mp_INT);
- if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin, 0)))
- return irq_trigger(idx);
- }
- /*
- * nonexistent IRQs are edge default
- */
- return 0;
-}
-#else
-static inline int IO_APIC_irq_trigger(int irq)
-{
- return 1;
-}
-#endif
-
static void __init setup_IO_APIC_irqs(void)
{
unsigned int ioapic, pin;
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index c61aec7e65f4..4c5d1882bec5 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -136,7 +136,7 @@ static struct msi_domain_info pci_msi_domain_info = {
.handler_name = "edge",
};
-void arch_init_msi_domain(struct irq_domain *parent)
+void __init arch_init_msi_domain(struct irq_domain *parent)
{
if (disable_apic)
return;
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index f3557a1eb562..e66d8e48e456 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -405,7 +405,7 @@ int __init arch_probe_nr_irqs(void)
}
#ifdef CONFIG_X86_IO_APIC
-static void init_legacy_irqs(void)
+static void __init init_legacy_irqs(void)
{
int i, node = cpu_to_node(0);
struct apic_chip_data *data;
@@ -424,7 +424,7 @@ static void init_legacy_irqs(void)
}
}
#else
-static void init_legacy_irqs(void) { }
+static inline void init_legacy_irqs(void) { }
#endif
int __init arch_early_irq_init(void)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index ee8f11800295..bb5abe8f5fd4 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -799,8 +799,9 @@ static void init_amd(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
- /* AMD CPUs don't reset SS attributes on SYSRET */
- set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+ /* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */
+ if (!cpu_has(c, X86_FEATURE_XENPV))
+ set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index a70fd61095f8..6f077445647a 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -255,6 +255,7 @@ static void init_cyrix(struct cpuinfo_x86 *c)
break;
case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */
+ case 11: /* GX1 with inverted Device ID */
#ifdef CONFIG_PCI
{
u32 vendor, device;
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index f5af0cc7eb0d..9257bd9dc664 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -856,11 +856,13 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
dentry = kernfs_mount(fs_type, flags, rdt_root,
RDTGROUP_SUPER_MAGIC, NULL);
if (IS_ERR(dentry))
- goto out_cdp;
+ goto out_destroy;
static_branch_enable(&rdt_enable_key);
goto out;
+out_destroy:
+ kernfs_remove(kn_info);
out_cdp:
cdp_disable();
out:
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5abd4bf73d6e..5cfbaeb6529a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -499,16 +499,14 @@ static int mce_usable_address(struct mce *m)
return 1;
}
-static bool memory_error(struct mce *m)
+bool mce_is_memory_error(struct mce *m)
{
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
- if (c->x86_vendor == X86_VENDOR_AMD) {
+ if (m->cpuvendor == X86_VENDOR_AMD) {
/* ErrCodeExt[20:16] */
u8 xec = (m->status >> 16) & 0x1f;
return (xec == 0x0 || xec == 0x8);
- } else if (c->x86_vendor == X86_VENDOR_INTEL) {
+ } else if (m->cpuvendor == X86_VENDOR_INTEL) {
/*
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
*
@@ -529,6 +527,7 @@ static bool memory_error(struct mce *m)
return false;
}
+EXPORT_SYMBOL_GPL(mce_is_memory_error);
static bool cec_add_mce(struct mce *m)
{
@@ -536,7 +535,7 @@ static bool cec_add_mce(struct mce *m)
return false;
/* We eat only correctable DRAM errors with usable addresses. */
- if (memory_error(m) &&
+ if (mce_is_memory_error(m) &&
!(m->status & MCI_STATUS_UC) &&
mce_usable_address(m))
if (!cec_add_elem(m->addr >> PAGE_SHIFT))
@@ -713,7 +712,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
- if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m))
+ if (severity == MCE_DEFERRED_SEVERITY && mce_is_memory_error(&m))
if (m.status & MCI_STATUS_ADDRV)
m.severity = severity;
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 7889ae492af0..21b185793c80 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -10,7 +10,7 @@
* Author: Peter Oruba <peter.oruba@amd.com>
*
* Based on work by:
- * Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ * Tigran Aivazian <aivazian.tigran@gmail.com>
*
* early loader:
* Copyright (C) 2013 Advanced Micro Devices, Inc.
@@ -251,7 +251,7 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
#endif
}
-void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret)
+static void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret)
{
struct ucode_cpu_info *uci;
struct cpio_data cp;
@@ -320,7 +320,7 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax)
}
static enum ucode_state
-load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size);
+load_microcode_amd(bool save, u8 family, const u8 *data, size_t size);
int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
{
@@ -338,8 +338,7 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
if (!desc.mc)
return -EINVAL;
- ret = load_microcode_amd(smp_processor_id(), x86_family(cpuid_1_eax),
- desc.data, desc.size);
+ ret = load_microcode_amd(true, x86_family(cpuid_1_eax), desc.data, desc.size);
if (ret != UCODE_OK)
return -EINVAL;
@@ -352,8 +351,6 @@ void reload_ucode_amd(void)
u32 rev, dummy;
mc = (struct microcode_amd *)amd_ucode_patch;
- if (!mc)
- return;
rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
@@ -677,7 +674,7 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
}
static enum ucode_state
-load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
+load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
{
enum ucode_state ret;
@@ -691,8 +688,8 @@ load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
#ifdef CONFIG_X86_32
/* save BSP's matching patch for early load */
- if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) {
- struct ucode_patch *p = find_patch(cpu);
+ if (save) {
+ struct ucode_patch *p = find_patch(0);
if (p) {
memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data),
@@ -724,11 +721,12 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
{
char fw_name[36] = "amd-ucode/microcode_amd.bin";
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ bool bsp = c->cpu_index == boot_cpu_data.cpu_index;
enum ucode_state ret = UCODE_NFOUND;
const struct firmware *fw;
/* reload ucode container only on the boot cpu */
- if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index)
+ if (!refresh_fw || !bsp)
return UCODE_OK;
if (c->x86 >= 0x15)
@@ -745,7 +743,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
goto fw_release;
}
- ret = load_microcode_amd(cpu, c->x86, fw->data, fw->size);
+ ret = load_microcode_amd(bsp, c->x86, fw->data, fw->size);
fw_release:
release_firmware(fw);
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index b4a4cd39b358..9cb98ee103db 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -1,7 +1,7 @@
/*
* CPU Microcode Update Driver for Linux
*
- * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ * Copyright (C) 2000-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
* 2006 Shaohua Li <shaohua.li@intel.com>
* 2013-2016 Borislav Petkov <bp@alien8.de>
*
@@ -290,6 +290,17 @@ struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa)
return (struct cpio_data){ NULL, 0, "" };
if (initrd_start)
start = initrd_start;
+ } else {
+ /*
+ * The picture with physical addresses is a bit different: we
+ * need to get the *physical* address to which the ramdisk was
+ * relocated, i.e., relocated_ramdisk (not initrd_start) and
+ * since we're running from physical addresses, we need to access
+ * relocated_ramdisk through its *physical* address too.
+ */
+ u64 *rr = (u64 *)__pa_nodebug(&relocated_ramdisk);
+ if (*rr)
+ start = *rr;
}
return find_cpio_data(path, (void *)start, size, NULL);
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 8325d8a09ab0..59edbe9d4ccb 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -1,7 +1,7 @@
/*
* Intel CPU Microcode Update Driver for Linux
*
- * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ * Copyright (C) 2000-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
* 2006 Shaohua Li <shaohua.li@intel.com>
*
* Intel CPU microcode early update for Linux
@@ -42,7 +42,7 @@
static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
/* Current microcode patch used in early patching on the APs. */
-struct microcode_intel *intel_ucode_patch;
+static struct microcode_intel *intel_ucode_patch;
static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
unsigned int s2, unsigned int p2)
@@ -166,7 +166,7 @@ static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size)
static void save_microcode_patch(void *data, unsigned int size)
{
struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
- struct ucode_patch *iter, *tmp, *p;
+ struct ucode_patch *iter, *tmp, *p = NULL;
bool prev_found = false;
unsigned int sig, pf;
@@ -202,6 +202,18 @@ static void save_microcode_patch(void *data, unsigned int size)
else
list_add_tail(&p->plist, &microcode_cache);
}
+
+ /*
+ * Save for early loading. On 32-bit, that needs to be a physical
+ * address as the APs are running from physical addresses, before
+ * paging has been enabled.
+ */
+ if (p) {
+ if (IS_ENABLED(CONFIG_X86_32))
+ intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data);
+ else
+ intel_ucode_patch = p->data;
+ }
}
static int microcode_sanity_check(void *mc, int print_err)
@@ -607,6 +619,14 @@ int __init save_microcode_in_initrd_intel(void)
struct ucode_cpu_info uci;
struct cpio_data cp;
+ /*
+ * initrd is going away, clear patch ptr. We will scan the microcode one
+ * last time before jettisoning and save a patch, if found. Then we will
+ * update that pointer too, with a stable patch address to use when
+ * resuming the cores.
+ */
+ intel_ucode_patch = NULL;
+
if (!load_builtin_intel_microcode(&cp))
cp = find_microcode_in_initrd(ucode_path, false);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 04cb8d34ccb8..70e717fccdd6 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -161,6 +161,15 @@ static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs)
}
#endif
+static unsigned long hv_get_tsc_khz(void)
+{
+ unsigned long freq;
+
+ rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq);
+
+ return freq / 1000;
+}
+
static void __init ms_hyperv_init_platform(void)
{
int hv_host_info_eax;
@@ -193,8 +202,15 @@ static void __init ms_hyperv_init_platform(void)
hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF);
}
+ if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
+ ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
+ x86_platform.calibrate_tsc = hv_get_tsc_khz;
+ x86_platform.calibrate_cpu = hv_get_tsc_khz;
+ }
+
#ifdef CONFIG_X86_LOCAL_APIC
- if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) {
+ if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
+ ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
/*
* Get the APIC frequency.
*/
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 8e598a1ad986..6b91e2eb8d3f 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -125,7 +125,7 @@ void __init init_espfix_bsp(void)
p4d_t *p4d;
/* Install the espfix pud into the kernel page directory */
- pgd = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+ pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)];
p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
p4d_populate(&init_mm, p4d, espfix_pud_page);
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index c2f8dde3255c..d5d44c452624 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -90,6 +90,7 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
* Boot time FPU feature detection code:
*/
unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
+EXPORT_SYMBOL_GPL(mxcsr_feature_mask);
static void __init fpu__init_system_mxcsr(void)
{
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 0651e974dcb3..9bef1bbeba63 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -689,8 +689,12 @@ static inline void *alloc_tramp(unsigned long size)
{
return module_alloc(size);
}
-static inline void tramp_free(void *tramp)
+static inline void tramp_free(void *tramp, int size)
{
+ int npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ set_memory_nx((unsigned long)tramp, npages);
+ set_memory_rw((unsigned long)tramp, npages);
module_memfree(tramp);
}
#else
@@ -699,7 +703,7 @@ static inline void *alloc_tramp(unsigned long size)
{
return NULL;
}
-static inline void tramp_free(void *tramp) { }
+static inline void tramp_free(void *tramp, int size) { }
#endif
/* Defined as markers to the end of the ftrace default trampolines */
@@ -771,7 +775,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
/* Copy ftrace_caller onto the trampoline memory */
ret = probe_kernel_read(trampoline, (void *)start_offset, size);
if (WARN_ON(ret < 0)) {
- tramp_free(trampoline);
+ tramp_free(trampoline, *tramp_size);
return 0;
}
@@ -797,7 +801,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
/* Are we pointing to the reference? */
if (WARN_ON(memcmp(op_ptr.op, op_ref, 3) != 0)) {
- tramp_free(trampoline);
+ tramp_free(trampoline, *tramp_size);
return 0;
}
@@ -839,7 +843,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
unsigned long offset;
unsigned long ip;
unsigned int size;
- int ret;
+ int ret, npages;
if (ops->trampoline) {
/*
@@ -848,11 +852,14 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
*/
if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
return;
+ npages = PAGE_ALIGN(ops->trampoline_size) >> PAGE_SHIFT;
+ set_memory_rw(ops->trampoline, npages);
} else {
ops->trampoline = create_trampoline(ops, &size);
if (!ops->trampoline)
return;
ops->trampoline_size = size;
+ npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
}
offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
@@ -863,6 +870,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
/* Do a safe modify in case the trampoline is executing */
new = ftrace_call_replace(ip, (unsigned long)func);
ret = update_ftrace_func(ip, new);
+ set_memory_ro(ops->trampoline, npages);
/* The update should never fail */
WARN_ON(ret);
@@ -939,7 +947,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
return;
- tramp_free((void *)ops->trampoline);
+ tramp_free((void *)ops->trampoline, ops->trampoline_size);
ops->trampoline = 0;
}
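The ftrace changes bracket every trampoline modification with permission flips: writable for the update, read-only afterwards, and RW+NX again in tramp_free() before module_memfree(). Reduced to its essentials (a sketch; update_text() stands in for the actual instruction patching):

static void example_patch_trampoline(unsigned long tramp, int size)
{
	int npages = PAGE_ALIGN(size) >> PAGE_SHIFT;

	set_memory_rw(tramp, npages);	/* allow the text write */
	update_text(tramp);		/* hypothetical patch step */
	set_memory_ro(tramp, npages);	/* lock it down again */
}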
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 43b7002f44fb..46c3c73e7f43 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -33,17 +33,120 @@
/*
* Manage page tables very early on.
*/
-extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern pgd_t early_top_pgt[PTRS_PER_PGD];
extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
-static unsigned int __initdata next_early_pgt = 2;
+static unsigned int __initdata next_early_pgt;
pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
+#define __head __section(.head.text)
+
+static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
+{
+ return ptr - (void *)_text + (void *)physaddr;
+}
+
+void __head __startup_64(unsigned long physaddr)
+{
+ unsigned long load_delta, *p;
+ pgdval_t *pgd;
+ p4dval_t *p4d;
+ pudval_t *pud;
+ pmdval_t *pmd, pmd_entry;
+ int i;
+
+ /* Is the address too large? */
+ if (physaddr >> MAX_PHYSMEM_BITS)
+ for (;;);
+
+ /*
+ * Compute the delta between the address I am compiled to run at
+ * and the address I am actually running at.
+ */
+ load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
+
+ /* Is the address not 2M aligned? */
+ if (load_delta & ~PMD_PAGE_MASK)
+ for (;;);
+
+ /* Fixup the physical addresses in the page table */
+
+ pgd = fixup_pointer(&early_top_pgt, physaddr);
+ pgd[pgd_index(__START_KERNEL_map)] += load_delta;
+
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
+ p4d[511] += load_delta;
+ }
+
+ pud = fixup_pointer(&level3_kernel_pgt, physaddr);
+ pud[510] += load_delta;
+ pud[511] += load_delta;
+
+ pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
+ pmd[506] += load_delta;
+
+ /*
+ * Set up the identity mapping for the switchover. These
+ * entries should *NOT* have the global bit set! This also
+ * creates a bunch of nonsense entries but that is fine --
+ * it avoids problems around wraparound.
+ */
+
+ pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
+ pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
+
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
+
+ i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+ pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE;
+ pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE;
+
+ i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;
+ p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
+ p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
+ } else {
+ i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+ pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
+ pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
+ }
+
+ i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;
+ pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE;
+ pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE;
+
+ pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
+ pmd_entry += physaddr;
+
+ for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
+ int idx = i + (physaddr >> PMD_SHIFT) % PTRS_PER_PMD;
+ pmd[idx] = pmd_entry + i * PMD_SIZE;
+ }
+
+ /*
+ * Fixup the kernel text+data virtual addresses. Note that
+ * we might write invalid pmds, when the kernel is relocated
+ * cleanup_highmap() fixes this up along with the mappings
+ * beyond _end.
+ */
+
+ pmd = fixup_pointer(level2_kernel_pgt, physaddr);
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (pmd[i] & _PAGE_PRESENT)
+ pmd[i] += load_delta;
+ }
+
+ /* Fixup phys_base */
+ p = fixup_pointer(&phys_base, physaddr);
+ *p += load_delta;
+}
+
/* Wipe all early page tables except for the kernel symbol map */
static void __init reset_early_page_tables(void)
{
- memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
+ memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
next_early_pgt = 0;
- write_cr3(__pa_nodebug(early_level4_pgt));
+ write_cr3(__pa_nodebug(early_top_pgt));
}
/* Create a new PMD entry */
@@ -51,15 +154,16 @@ int __init early_make_pgtable(unsigned long address)
{
unsigned long physaddr = address - __PAGE_OFFSET;
pgdval_t pgd, *pgd_p;
+ p4dval_t p4d, *p4d_p;
pudval_t pud, *pud_p;
pmdval_t pmd, *pmd_p;
/* Invalid address or early pgt is done ? */
- if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
+ if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
return -1;
again:
- pgd_p = &early_level4_pgt[pgd_index(address)].pgd;
+ pgd_p = &early_top_pgt[pgd_index(address)].pgd;
pgd = *pgd_p;
/*
@@ -67,8 +171,25 @@ again:
* critical -- __PAGE_OFFSET would point us back into the dynamic
* range and we might end up looping forever...
*/
- if (pgd)
- pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ p4d_p = pgd_p;
+ else if (pgd)
+ p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ else {
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+ reset_early_page_tables();
+ goto again;
+ }
+
+ p4d_p = (p4dval_t *)early_dynamic_pgts[next_early_pgt++];
+ memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
+ *pgd_p = (pgdval_t)p4d_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ }
+ p4d_p += p4d_index(address);
+ p4d = *p4d_p;
+
+ if (p4d)
+ pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
else {
if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
reset_early_page_tables();
@@ -77,7 +198,7 @@ again:
pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
- *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ *p4d_p = (p4dval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
}
pud_p += pud_index(address);
pud = *pud_p;
@@ -156,7 +277,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
clear_bss();
- clear_page(init_level4_pgt);
+ clear_page(init_top_pgt);
kasan_early_init();
@@ -171,8 +292,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
*/
load_ucode_bsp();
- /* set init_level4_pgt kernel high mapping*/
- init_level4_pgt[511] = early_level4_pgt[511];
+ /* set init_top_pgt kernel high mapping */
+ init_top_pgt[511] = early_top_pgt[511];
x86_64_start_reservations(real_mode_data);
}
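__startup_64() runs from the identity mapping before the kernel's own virtual mapping is usable, so every global it touches must be translated from its link-time virtual address to the physical address the kernel actually loaded at; that is all fixup_pointer() does. Worked through with made-up numbers:

/*
 * Illustration only (addresses invented):
 *   _text (link-time)       = 0xffffffff81000000
 *   physaddr (load address) = 0x0000000004000000
 *   &phys_base (link-time)  = 0xffffffff81c00000
 *
 * fixup_pointer(&phys_base, physaddr)
 *   = &phys_base - _text + physaddr
 *   = 0xffffffff81c00000 - 0xffffffff81000000 + 0x0000000004000000
 *   = 0x0000000004c00000, i.e. where phys_base really sits in RAM.
 */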
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index ac9d327d2e42..6225550883df 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -37,10 +37,11 @@
*
*/
+#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1))
#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
-L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
-L4_START_KERNEL = pgd_index(__START_KERNEL_map)
+PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
+PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
L3_START_KERNEL = pud_index(__START_KERNEL_map)
.text
@@ -72,101 +73,12 @@ startup_64:
/* Sanitize CPU configuration */
call verify_cpu
- /*
- * Compute the delta between the address I am compiled to run at and the
- * address I am actually running at.
- */
- leaq _text(%rip), %rbp
- subq $_text - __START_KERNEL_map, %rbp
-
- /* Is the address not 2M aligned? */
- testl $~PMD_PAGE_MASK, %ebp
- jnz bad_address
-
- /*
- * Is the address too large?
- */
- leaq _text(%rip), %rax
- shrq $MAX_PHYSMEM_BITS, %rax
- jnz bad_address
-
- /*
- * Fixup the physical addresses in the page table
- */
- addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
-
- addq %rbp, level3_kernel_pgt + (510*8)(%rip)
- addq %rbp, level3_kernel_pgt + (511*8)(%rip)
-
- addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
-
- /*
- * Set up the identity mapping for the switchover. These
- * entries should *NOT* have the global bit set! This also
- * creates a bunch of nonsense entries but that is fine --
- * it avoids problems around wraparound.
- */
leaq _text(%rip), %rdi
- leaq early_level4_pgt(%rip), %rbx
-
- movq %rdi, %rax
- shrq $PGDIR_SHIFT, %rax
-
- leaq (PAGE_SIZE + _KERNPG_TABLE)(%rbx), %rdx
- movq %rdx, 0(%rbx,%rax,8)
- movq %rdx, 8(%rbx,%rax,8)
-
- addq $PAGE_SIZE, %rdx
- movq %rdi, %rax
- shrq $PUD_SHIFT, %rax
- andl $(PTRS_PER_PUD-1), %eax
- movq %rdx, PAGE_SIZE(%rbx,%rax,8)
- incl %eax
- andl $(PTRS_PER_PUD-1), %eax
- movq %rdx, PAGE_SIZE(%rbx,%rax,8)
-
- addq $PAGE_SIZE * 2, %rbx
- movq %rdi, %rax
- shrq $PMD_SHIFT, %rdi
- addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
- leaq (_end - 1)(%rip), %rcx
- shrq $PMD_SHIFT, %rcx
- subq %rdi, %rcx
- incl %ecx
-
-1:
- andq $(PTRS_PER_PMD - 1), %rdi
- movq %rax, (%rbx,%rdi,8)
- incq %rdi
- addq $PMD_SIZE, %rax
- decl %ecx
- jnz 1b
-
- test %rbp, %rbp
- jz .Lskip_fixup
+ pushq %rsi
+ call __startup_64
+ popq %rsi
- /*
- * Fixup the kernel text+data virtual addresses. Note that
- * we might write invalid pmds, when the kernel is relocated
- * cleanup_highmap() fixes this up along with the mappings
- * beyond _end.
- */
- leaq level2_kernel_pgt(%rip), %rdi
- leaq PAGE_SIZE(%rdi), %r8
- /* See if it is a valid page table entry */
-1: testb $_PAGE_PRESENT, 0(%rdi)
- jz 2f
- addq %rbp, 0(%rdi)
- /* Go to the next page */
-2: addq $8, %rdi
- cmp %r8, %rdi
- jne 1b
-
- /* Fixup phys_base */
- addq %rbp, phys_base(%rip)
-
-.Lskip_fixup:
- movq $(early_level4_pgt - __START_KERNEL_map), %rax
+ movq $(early_top_pgt - __START_KERNEL_map), %rax
jmp 1f
ENTRY(secondary_startup_64)
/*
@@ -186,14 +98,17 @@ ENTRY(secondary_startup_64)
/* Sanitize CPU configuration */
call verify_cpu
- movq $(init_level4_pgt - __START_KERNEL_map), %rax
+ movq $(init_top_pgt - __START_KERNEL_map), %rax
1:
- /* Enable PAE mode and PGE */
+ /* Enable PAE mode, PGE and LA57 */
movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+#ifdef CONFIG_X86_5LEVEL
+ orl $X86_CR4_LA57, %ecx
+#endif
movq %rcx, %cr4
- /* Setup early boot stage 4 level pagetables. */
+ /* Setup early boot stage 4-/5-level pagetables. */
addq phys_base(%rip), %rax
movq %rax, %cr3
@@ -417,9 +332,13 @@ GLOBAL(name)
.endr
__INITDATA
-NEXT_PAGE(early_level4_pgt)
+NEXT_PAGE(early_top_pgt)
.fill 511,8,0
+#ifdef CONFIG_X86_5LEVEL
+ .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+#else
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+#endif
NEXT_PAGE(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -427,14 +346,14 @@ NEXT_PAGE(early_dynamic_pgts)
.data
#ifndef CONFIG_XEN
-NEXT_PAGE(init_level4_pgt)
+NEXT_PAGE(init_top_pgt)
.fill 512,8,0
#else
-NEXT_PAGE(init_level4_pgt)
+NEXT_PAGE(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
+ .org init_top_pgt + PGD_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_START_KERNEL*8, 0
+ .org init_top_pgt + PGD_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
@@ -448,6 +367,12 @@ NEXT_PAGE(level2_ident_pgt)
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
#endif
+#ifdef CONFIG_X86_5LEVEL
+NEXT_PAGE(level4_kernel_pgt)
+ .fill 511,8,0
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+#endif
+
NEXT_PAGE(level3_kernel_pgt)
.fill L3_START_KERNEL,8,0
/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index be22f5a2192e..4e3b8a587c88 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -418,6 +418,7 @@ struct legacy_pic default_legacy_pic = {
};
struct legacy_pic *legacy_pic = &default_legacy_pic;
+EXPORT_SYMBOL(legacy_pic);
static int __init i8259A_init_ops(void)
{
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 5b2bbfbb3712..6b877807598b 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -52,6 +52,7 @@
#include <linux/ftrace.h>
#include <linux/frame.h>
#include <linux/kasan.h>
+#include <linux/moduleloader.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
@@ -417,6 +418,14 @@ static void prepare_boost(struct kprobe *p, struct insn *insn)
}
}
+/* Recover page to RW mode before releasing it */
+void free_insn_page(void *page)
+{
+ set_memory_nx((unsigned long)page & PAGE_MASK, 1);
+ set_memory_rw((unsigned long)page & PAGE_MASK, 1);
+ module_memfree(page);
+}
+
static int arch_copy_kprobe(struct kprobe *p)
{
struct insn insn;
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 901c640d152f..69ea0bc1cfa3 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -28,6 +28,7 @@
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
#include <linux/ftrace.h>
+#include <linux/frame.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
@@ -94,6 +95,7 @@ static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
}
asm (
+ "optprobe_template_func:\n"
".global optprobe_template_entry\n"
"optprobe_template_entry:\n"
#ifdef CONFIG_X86_64
@@ -131,7 +133,12 @@ asm (
" popf\n"
#endif
".global optprobe_template_end\n"
- "optprobe_template_end:\n");
+ "optprobe_template_end:\n"
+ ".type optprobe_template_func, @function\n"
+ ".size optprobe_template_func, .-optprobe_template_func\n");
+
+void optprobe_template_func(void);
+STACK_FRAME_NON_STANDARD(optprobe_template_func);
#define TMPL_MOVE_IDX \
((long)&optprobe_template_val - (long)&optprobe_template_entry)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index da5c09789984..43e10d6fdbed 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -161,8 +161,8 @@ void kvm_async_pf_task_wait(u32 token)
*/
rcu_irq_exit();
native_safe_halt();
- rcu_irq_enter();
local_irq_disable();
+ rcu_irq_enter();
}
}
if (!n.halted)
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index d4a15831ac58..a870910c8565 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -22,24 +22,25 @@
#include <asm/syscalls.h>
/* context.lock is held for us, so we don't need any locking. */
-static void flush_ldt(void *current_mm)
+static void flush_ldt(void *__mm)
{
+ struct mm_struct *mm = __mm;
mm_context_t *pc;
- if (current->active_mm != current_mm)
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
return;
- pc = &current->active_mm->context;
- set_ldt(pc->ldt->entries, pc->ldt->size);
+ pc = &mm->context;
+ set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
}
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
-static struct ldt_struct *alloc_ldt_struct(unsigned int size)
+static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
{
struct ldt_struct *new_ldt;
unsigned int alloc_size;
- if (size > LDT_ENTRIES)
+ if (num_entries > LDT_ENTRIES)
return NULL;
new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL);
@@ -47,7 +48,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size)
return NULL;
BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
- alloc_size = size * LDT_ENTRY_SIZE;
+ alloc_size = num_entries * LDT_ENTRY_SIZE;
/*
* Xen is very picky: it requires a page-aligned LDT that has no
@@ -65,14 +66,14 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size)
return NULL;
}
- new_ldt->size = size;
+ new_ldt->nr_entries = num_entries;
return new_ldt;
}
/* After calling this, the LDT is immutable. */
static void finalize_ldt_struct(struct ldt_struct *ldt)
{
- paravirt_alloc_ldt(ldt->entries, ldt->size);
+ paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
}
/* context.lock is held */
@@ -91,8 +92,8 @@ static void free_ldt_struct(struct ldt_struct *ldt)
if (likely(!ldt))
return;
- paravirt_free_ldt(ldt->entries, ldt->size);
- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+ paravirt_free_ldt(ldt->entries, ldt->nr_entries);
+ if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE)
vfree_atomic(ldt->entries);
else
free_page((unsigned long)ldt->entries);
@@ -122,14 +123,14 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
goto out_unlock;
}
- new_ldt = alloc_ldt_struct(old_mm->context.ldt->size);
+ new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
if (!new_ldt) {
retval = -ENOMEM;
goto out_unlock;
}
memcpy(new_ldt->entries, old_mm->context.ldt->entries,
- new_ldt->size * LDT_ENTRY_SIZE);
+ new_ldt->nr_entries * LDT_ENTRY_SIZE);
finalize_ldt_struct(new_ldt);
mm->context.ldt = new_ldt;
@@ -152,9 +153,9 @@ void destroy_context_ldt(struct mm_struct *mm)
static int read_ldt(void __user *ptr, unsigned long bytecount)
{
- int retval;
- unsigned long size;
struct mm_struct *mm = current->mm;
+ unsigned long entries_size;
+ int retval;
mutex_lock(&mm->context.lock);
@@ -166,18 +167,18 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
- size = mm->context.ldt->size * LDT_ENTRY_SIZE;
- if (size > bytecount)
- size = bytecount;
+ entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE;
+ if (entries_size > bytecount)
+ entries_size = bytecount;
- if (copy_to_user(ptr, mm->context.ldt->entries, size)) {
+ if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) {
retval = -EFAULT;
goto out_unlock;
}
- if (size != bytecount) {
+ if (entries_size != bytecount) {
/* Zero-fill the rest and pretend we read bytecount bytes. */
- if (clear_user(ptr + size, bytecount - size)) {
+ if (clear_user(ptr + entries_size, bytecount - entries_size)) {
retval = -EFAULT;
goto out_unlock;
}
@@ -208,7 +209,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
{
struct mm_struct *mm = current->mm;
struct ldt_struct *new_ldt, *old_ldt;
- unsigned int oldsize, newsize;
+ unsigned int old_nr_entries, new_nr_entries;
struct user_desc ldt_info;
struct desc_struct ldt;
int error;
@@ -247,17 +248,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
mutex_lock(&mm->context.lock);
- old_ldt = mm->context.ldt;
- oldsize = old_ldt ? old_ldt->size : 0;
- newsize = max(ldt_info.entry_number + 1, oldsize);
+ old_ldt = mm->context.ldt;
+ old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
+ new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries);
error = -ENOMEM;
- new_ldt = alloc_ldt_struct(newsize);
+ new_ldt = alloc_ldt_struct(new_nr_entries);
if (!new_ldt)
goto out_unlock;
if (old_ldt)
- memcpy(new_ldt->entries, old_ldt->entries, oldsize * LDT_ENTRY_SIZE);
+ memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE);
+
new_ldt->entries[ldt_info.entry_number] = ldt;
finalize_ldt_struct(new_ldt);
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index ce640428d6fe..cb0a30473c23 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -114,7 +114,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
struct x86_mapping_info info = {
.alloc_pgt_page = alloc_pgt_page,
.context = image,
- .pmd_flag = __PAGE_KERNEL_LARGE_EXEC,
+ .page_flag = __PAGE_KERNEL_LARGE_EXEC,
};
unsigned long mstart, mend;
pgd_t *level4p;
@@ -123,6 +123,10 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
level4p = (pgd_t *)__va(start_pgtable);
clear_page(level4p);
+
+ if (direct_gbpages)
+ info.direct_gbpages = true;
+
for (i = 0; i < nr_pfn_mapped; i++) {
mstart = pfn_mapped[i].start << PAGE_SHIFT;
mend = pfn_mapped[i].end << PAGE_SHIFT;
@@ -343,7 +347,7 @@ void machine_kexec(struct kimage *image)
void arch_crash_save_vmcoreinfo(void)
{
VMCOREINFO_NUMBER(phys_base);
- VMCOREINFO_SYMBOL(init_level4_pgt);
+ VMCOREINFO_SYMBOL(init_top_pgt);
#ifdef CONFIG_NUMA
VMCOREINFO_SYMBOL(node_data);
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index 6d9582ec0324..d27f8d84c4ff 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -78,7 +78,7 @@ static void __init test_nmi_ipi(struct cpumask *mask)
/* Don't wait longer than a second */
timeout = USEC_PER_SEC;
- while (!cpumask_empty(mask) && timeout--)
+ while (!cpumask_empty(mask) && --timeout)
udelay(1);
/* What happens if we timeout, do we still unregister?? */
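The prefix decrement leaves the counter at zero, rather than wrapped around, when the wait expires, which is what any later "did we time out?" test of the remaining value needs. In miniature (sketch only):

/*
 * unsigned long t = USEC_PER_SEC;
 *
 * while (!done && t--)  udelay(1);   // exits with t == ULONG_MAX on expiry
 * while (!done && --t)  udelay(1);   // exits with t == 0 on expiry
 *
 * Only the second form lets a subsequent "if (!t)" reliably report
 * that the loop gave up rather than completed.
 */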
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 3586996fc50d..bc0a849589bb 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.read_cr2 = native_read_cr2,
.write_cr2 = native_write_cr2,
- .read_cr3 = native_read_cr3,
+ .read_cr3 = __native_read_cr3,
.write_cr3 = native_write_cr3,
.flush_tlb_user = native_flush_tlb,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 0bb88428cbf2..3ca198080ea9 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -545,17 +545,6 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
}
/*
- * Return saved PC of a blocked thread.
- * What is this good for? it will be always the scheduler or ret_from_fork.
- */
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
- struct inactive_task_frame *frame =
- (struct inactive_task_frame *) READ_ONCE(tsk->thread.sp);
- return READ_ONCE_NOCHECK(frame->ret_addr);
-}
-
-/*
* Called from fs/proc with a reference on @p to find the function
* which called into schedule(). This needs to be done carefully
* because the task might wake up and we might look at a stack
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ff40e74c9181..c6d6dc5f8bb2 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -78,7 +78,7 @@ void __show_regs(struct pt_regs *regs, int all)
printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip);
printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags,
- smp_processor_id());
+ raw_smp_processor_id());
printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
regs->ax, regs->bx, regs->cx, regs->dx);
@@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0();
cr2 = read_cr2();
- cr3 = read_cr3();
+ cr3 = __read_cr3();
cr4 = __read_cr4();
printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
cr0, cr2, cr3, cr4);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b6840bf3940b..c3169be4c596 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -104,7 +104,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0();
cr2 = read_cr2();
- cr3 = read_cr3();
+ cr3 = __read_cr3();
cr4 = __read_cr4();
printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
@@ -142,7 +142,7 @@ void release_thread(struct task_struct *dead_task)
pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
dead_task->comm,
dead_task->mm->context.ldt->entries,
- dead_task->mm->context.ldt->size);
+ dead_task->mm->context.ldt->nr_entries);
BUG();
}
#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 2544700a2a87..67393fc88353 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -9,6 +9,7 @@
#include <linux/sched.h>
#include <linux/tboot.h>
#include <linux/delay.h>
+#include <linux/frame.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
@@ -123,6 +124,7 @@ void __noreturn machine_real_restart(unsigned int type)
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(machine_real_restart);
#endif
+STACK_FRAME_NON_STANDARD(machine_real_restart);
/*
* Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 603a1669a2ec..65622f07e633 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -503,7 +503,7 @@ static int __init reserve_crashkernel_low(void)
return 0;
}
- low_base = memblock_find_in_range(low_size, 1ULL << 32, low_size, CRASH_ALIGN);
+ low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN);
if (!low_base) {
pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n",
(unsigned long)(low_size >> 20));
@@ -980,8 +980,6 @@ void __init setup_arch(char **cmdline_p)
*/
x86_configure_nx();
- simple_udelay_calibration();
-
parse_early_param();
#ifdef CONFIG_MEMORY_HOTPLUG
@@ -1041,6 +1039,8 @@ void __init setup_arch(char **cmdline_p)
*/
init_hypervisor_platform();
+ simple_udelay_calibration();
+
x86_init.resources.probe_roms();
/* after parse_early_param, so could debug it */
@@ -1225,6 +1225,21 @@ void __init setup_arch(char **cmdline_p)
kasan_init();
+#ifdef CONFIG_X86_32
+ /* sync back kernel address range */
+ clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
+ swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ KERNEL_PGD_PTRS);
+
+ /*
+ * sync back low identity map too. It is used for example
+ * in the 32-bit EFI stub.
+ */
+ clone_pgd_range(initial_page_table,
+ swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
+#endif
+
tboot_probe();
map_vsyscall();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index bb1e8cc0bc84..10edd1e69a68 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -291,11 +291,11 @@ void __init setup_per_cpu_areas(void)
#ifdef CONFIG_X86_32
/*
- * Sync back kernel address range. We want to make sure that
- * all kernel mappings, including percpu mappings, are available
- * in the smpboot asm. We can't reliably pick up percpu
- * mappings using vmalloc_fault(), because exception dispatch
- * needs percpu data.
+ * Sync back kernel address range again. We already did this in
+ * setup_arch(), but percpu data also needs to be available in
+ * the smpboot asm. We can't reliably pick up percpu mappings
+ * using vmalloc_fault(), because exception dispatch needs
+ * percpu data.
*/
clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f04479a8f74f..b474c8de7fba 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -863,7 +863,7 @@ static void announce_cpu(int cpu, int apicid)
if (cpu == 1)
printk(KERN_INFO "x86: Booting SMP configuration:\n");
- if (system_state == SYSTEM_BOOTING) {
+ if (system_state < SYSTEM_RUNNING) {
if (node != current_node) {
if (current_node > (-1))
pr_cont("\n");
@@ -1589,7 +1589,6 @@ void native_cpu_die(unsigned int cpu)
void play_dead_common(void)
{
idle_task_exit();
- reset_lazy_tlbstate();
/* Ack it */
(void)cpu_report_death();
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index f07f83b3611b..5f25cfbd952e 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -34,7 +34,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
mutex_lock(&child->mm->context.lock);
if (unlikely(!child->mm->context.ldt ||
- seg >= child->mm->context.ldt->size))
+ seg >= child->mm->context.ldt->nr_entries))
addr = -1L; /* bogus selector, access would fault */
else {
desc = &child->mm->context.ldt->entries[seg];
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 207b8f2582c7..213ddf3e937d 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -144,7 +144,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (end - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
@@ -187,7 +187,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 4b1724059909..a4eb27918ceb 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -514,7 +514,7 @@ int tboot_force_iommu(void)
if (!tboot_enabled())
return 0;
- if (!intel_iommu_tboot_noforce)
+ if (intel_iommu_tboot_noforce)
return 1;
if (no_iommu || swiotlb || dmar_disabled)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 3995d3a777d4..bf54309b85da 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -182,7 +182,7 @@ int is_valid_bugaddr(unsigned long addr)
return ud == INSN_UD0 || ud == INSN_UD2;
}
-static int fixup_bug(struct pt_regs *regs, int trapnr)
+int fixup_bug(struct pt_regs *regs, int trapnr)
{
if (trapnr != X86_TRAP_UD)
return 0;
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 714dfba6a1e7..5270fc0c2df6 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -51,115 +51,34 @@ static u32 art_to_tsc_denominator;
static u64 art_to_tsc_offset;
struct clocksource *art_related_clocksource;
-/*
- * Use a ring-buffer like data structure, where a writer advances the head by
- * writing a new data entry and a reader advances the tail when it observes a
- * new entry.
- *
- * Writers are made to wait on readers until there's space to write a new
- * entry.
- *
- * This means that we can always use an {offset, mul} pair to compute a ns
- * value that is 'roughly' in the right direction, even if we're writing a new
- * {offset, mul} pair during the clock read.
- *
- * The down-side is that we can no longer guarantee strict monotonicity anymore
- * (assuming the TSC was that to begin with), because while we compute the
- * intersection point of the two clock slopes and make sure the time is
- * continuous at the point of switching; we can no longer guarantee a reader is
- * strictly before or after the switch point.
- *
- * It does mean a reader no longer needs to disable IRQs in order to avoid
- * CPU-Freq updates messing with his times, and similarly an NMI reader will
- * no longer run the risk of hitting half-written state.
- */
-
struct cyc2ns {
- struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */
- struct cyc2ns_data *head; /* 48 + 8 = 56 */
- struct cyc2ns_data *tail; /* 56 + 8 = 64 */
-}; /* exactly fits one cacheline */
-
-static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
-
-struct cyc2ns_data *cyc2ns_read_begin(void)
-{
- struct cyc2ns_data *head;
-
- preempt_disable();
-
- head = this_cpu_read(cyc2ns.head);
- /*
- * Ensure we observe the entry when we observe the pointer to it.
- * matches the wmb from cyc2ns_write_end().
- */
- smp_read_barrier_depends();
- head->__count++;
- barrier();
+ struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */
+ seqcount_t seq; /* 32 + 4 = 36 */
- return head;
-}
+}; /* fits one cacheline */
-void cyc2ns_read_end(struct cyc2ns_data *head)
-{
- barrier();
- /*
- * If we're the outer most nested read; update the tail pointer
- * when we're done. This notifies possible pending writers
- * that we've observed the head pointer and that the other
- * entry is now free.
- */
- if (!--head->__count) {
- /*
- * x86-TSO does not reorder writes with older reads;
- * therefore once this write becomes visible to another
- * cpu, we must be finished reading the cyc2ns_data.
- *
- * matches with cyc2ns_write_begin().
- */
- this_cpu_write(cyc2ns.tail, head);
- }
- preempt_enable();
-}
+static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
-/*
- * Begin writing a new @data entry for @cpu.
- *
- * Assumes some sort of write side lock; currently 'provided' by the assumption
- * that cpufreq will call its notifiers sequentially.
- */
-static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
+void cyc2ns_read_begin(struct cyc2ns_data *data)
{
- struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
- struct cyc2ns_data *data = c2n->data;
+ int seq, idx;
- if (data == c2n->head)
- data++;
+ preempt_disable_notrace();
- /* XXX send an IPI to @cpu in order to guarantee a read? */
+ do {
+ seq = this_cpu_read(cyc2ns.seq.sequence);
+ idx = seq & 1;
- /*
- * When we observe the tail write from cyc2ns_read_end(),
- * the cpu must be done with that entry and its safe
- * to start writing to it.
- */
- while (c2n->tail == data)
- cpu_relax();
+ data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
+ data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
+ data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
- return data;
+ } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
}
-static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
+void cyc2ns_read_end(void)
{
- struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
-
- /*
- * Ensure the @data writes are visible before we publish the
- * entry. Matches the data-depencency in cyc2ns_read_begin().
- */
- smp_wmb();
-
- ACCESS_ONCE(c2n->head) = data;
+ preempt_enable_notrace();
}
/*
@@ -191,7 +110,6 @@ static void cyc2ns_data_init(struct cyc2ns_data *data)
data->cyc2ns_mul = 0;
data->cyc2ns_shift = 0;
data->cyc2ns_offset = 0;
- data->__count = 0;
}
static void cyc2ns_init(int cpu)
@@ -201,51 +119,29 @@ static void cyc2ns_init(int cpu)
cyc2ns_data_init(&c2n->data[0]);
cyc2ns_data_init(&c2n->data[1]);
- c2n->head = c2n->data;
- c2n->tail = c2n->data;
+ seqcount_init(&c2n->seq);
}
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
- struct cyc2ns_data *data, *tail;
+ struct cyc2ns_data data;
unsigned long long ns;
- /*
- * See cyc2ns_read_*() for details; replicated in order to avoid
- * an extra few instructions that came with the abstraction.
- * Notable, it allows us to only do the __count and tail update
- * dance when its actually needed.
- */
-
- preempt_disable_notrace();
- data = this_cpu_read(cyc2ns.head);
- tail = this_cpu_read(cyc2ns.tail);
-
- if (likely(data == tail)) {
- ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
- } else {
- data->__count++;
-
- barrier();
-
- ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
+ cyc2ns_read_begin(&data);
- barrier();
+ ns = data.cyc2ns_offset;
+ ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
- if (!--data->__count)
- this_cpu_write(cyc2ns.tail, data);
- }
- preempt_enable_notrace();
+ cyc2ns_read_end();
return ns;
}
-static void set_cyc2ns_scale(unsigned long khz, int cpu)
+static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
{
- unsigned long long tsc_now, ns_now;
- struct cyc2ns_data *data;
+ unsigned long long ns_now;
+ struct cyc2ns_data data;
+ struct cyc2ns *c2n;
unsigned long flags;
local_irq_save(flags);
@@ -254,9 +150,6 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
if (!khz)
goto done;
- data = cyc2ns_write_begin(cpu);
-
- tsc_now = rdtsc();
ns_now = cycles_2_ns(tsc_now);
/*
@@ -264,7 +157,7 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
* time function is continuous; see the comment near struct
* cyc2ns_data.
*/
- clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, khz,
+ clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz,
NSEC_PER_MSEC, 0);
/*
@@ -273,20 +166,26 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
* conversion algorithm shifting a 32-bit value (now specifies a 64-bit
* value) - refer perf_event_mmap_page documentation in perf_event.h.
*/
- if (data->cyc2ns_shift == 32) {
- data->cyc2ns_shift = 31;
- data->cyc2ns_mul >>= 1;
+ if (data.cyc2ns_shift == 32) {
+ data.cyc2ns_shift = 31;
+ data.cyc2ns_mul >>= 1;
}
- data->cyc2ns_offset = ns_now -
- mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
+ data.cyc2ns_offset = ns_now -
+ mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift);
+
+ c2n = per_cpu_ptr(&cyc2ns, cpu);
- cyc2ns_write_end(cpu, data);
+ raw_write_seqcount_latch(&c2n->seq);
+ c2n->data[0] = data;
+ raw_write_seqcount_latch(&c2n->seq);
+ c2n->data[1] = data;
done:
- sched_clock_idle_wakeup_event(0);
+ sched_clock_idle_wakeup_event();
local_irq_restore(flags);
}
+
/*
* Scheduler clock - returns current time in nanosec units.
*/
@@ -374,6 +273,8 @@ static int __init tsc_setup(char *str)
tsc_clocksource_reliable = 1;
if (!strncmp(str, "noirqtime", 9))
no_sched_irq_time = 1;
+ if (!strcmp(str, "unstable"))
+ mark_tsc_unstable("boot parameter");
return 1;
}
@@ -986,7 +887,6 @@ void tsc_restore_sched_clock_state(void)
}
#ifdef CONFIG_CPU_FREQ
-
/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
* changes.
*
@@ -1027,7 +927,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
if (!(freq->flags & CPUFREQ_CONST_LOOPS))
mark_tsc_unstable("cpufreq changes");
- set_cyc2ns_scale(tsc_khz, freq->cpu);
+ set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
}
return 0;
@@ -1127,6 +1027,15 @@ static void tsc_cs_mark_unstable(struct clocksource *cs)
pr_info("Marking TSC unstable due to clocksource watchdog\n");
}
+static void tsc_cs_tick_stable(struct clocksource *cs)
+{
+ if (tsc_unstable)
+ return;
+
+ if (using_native_sched_clock())
+ sched_clock_tick_stable();
+}
+
/*
* .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
*/
@@ -1140,6 +1049,7 @@ static struct clocksource clocksource_tsc = {
.archdata = { .vclock_mode = VCLOCK_TSC },
.resume = tsc_resume,
.mark_unstable = tsc_cs_mark_unstable,
+ .tick_stable = tsc_cs_tick_stable,
};
void mark_tsc_unstable(char *reason)
@@ -1255,6 +1165,7 @@ static void tsc_refine_calibration_work(struct work_struct *work)
static int hpet;
u64 tsc_stop, ref_stop, delta;
unsigned long freq;
+ int cpu;
/* Don't bother refining TSC on unstable systems */
if (check_tsc_unstable())
@@ -1305,6 +1216,10 @@ static void tsc_refine_calibration_work(struct work_struct *work)
/* Inform the TSC deadline clockevent devices about the recalibration */
lapic_update_tsc_freq();
+ /* Update the sched_clock() rate to match the clocksource one */
+ for_each_possible_cpu(cpu)
+ set_cyc2ns_scale(tsc_khz, cpu, tsc_stop);
+
out:
if (boot_cpu_has(X86_FEATURE_ART))
art_related_clocksource = &clocksource_tsc;
@@ -1350,7 +1265,7 @@ device_initcall(init_tsc_clocksource);
void __init tsc_init(void)
{
- u64 lpj;
+ u64 lpj, cyc;
int cpu;
if (!boot_cpu_has(X86_FEATURE_TSC)) {
@@ -1390,9 +1305,10 @@ void __init tsc_init(void)
* speed as the bootup CPU. (cpufreq notifiers will fix this
* up if their speed diverges)
*/
+ cyc = rdtsc();
for_each_possible_cpu(cpu) {
cyc2ns_init(cpu);
- set_cyc2ns_scale(tsc_khz, cpu);
+ set_cyc2ns_scale(tsc_khz, cpu, cyc);
}
if (tsc_disabled > 0)
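
For reference, the tsc.c rework above replaces the hand-rolled head/tail ring with a seqcount latch: the writer keeps two copies of the {mul, shift, offset} triple and bumps a sequence counter around each copy, while readers pick data[seq & 1] and retry if the counter moved underneath them. A minimal user-space sketch of that latch pattern, assuming C11 atomics and illustrative names in place of the kernel's seqcount_t, per-CPU accessors and barriers:

#include <stdatomic.h>
#include <stdint.h>

/* One scale triple, mirroring {mul, shift, offset} in cyc2ns_data. */
struct scale { uint32_t mul; uint32_t shift; uint64_t offset; };

struct latch {
	struct scale data[2];
	atomic_uint seq;		/* odd while data[0] is being rewritten */
};

/* Writer: publish a new scale; readers always have one stable copy. */
static void latch_write(struct latch *l, struct scale s)
{
	atomic_fetch_add(&l->seq, 1);	/* seq odd: readers use data[1] */
	l->data[0] = s;
	atomic_fetch_add(&l->seq, 1);	/* seq even: readers use data[0] */
	l->data[1] = s;
}

/* Reader: lock-free snapshot; retry if the writer raced with us. */
static struct scale latch_read(struct latch *l)
{
	struct scale s;
	unsigned int seq;

	do {
		seq = atomic_load(&l->seq);
		s = l->data[seq & 1];	/* the kernel uses this_cpu_read() here */
	} while (seq != atomic_load(&l->seq));

	return s;
}
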
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 82c6d7f1fd73..b9389d72b2f7 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -104,6 +104,11 @@ static inline unsigned long *last_frame(struct unwind_state *state)
return (unsigned long *)task_pt_regs(state->task) - 2;
}
+static bool is_last_frame(struct unwind_state *state)
+{
+ return state->bp == last_frame(state);
+}
+
#ifdef CONFIG_X86_32
#define GCC_REALIGN_WORDS 3
#else
@@ -115,16 +120,15 @@ static inline unsigned long *last_aligned_frame(struct unwind_state *state)
return last_frame(state) - GCC_REALIGN_WORDS;
}
-static bool is_last_task_frame(struct unwind_state *state)
+static bool is_last_aligned_frame(struct unwind_state *state)
{
unsigned long *last_bp = last_frame(state);
unsigned long *aligned_bp = last_aligned_frame(state);
/*
- * We have to check for the last task frame at two different locations
- * because gcc can occasionally decide to realign the stack pointer and
- * change the offset of the stack frame in the prologue of a function
- * called by head/entry code. Examples:
+ * GCC can occasionally decide to realign the stack pointer and change
+ * the offset of the stack frame in the prologue of a function called
+ * by head/entry code. Examples:
*
* <start_secondary>:
* push %edi
@@ -141,11 +145,38 @@ static bool is_last_task_frame(struct unwind_state *state)
* push %rbp
* mov %rsp,%rbp
*
- * Note that after aligning the stack, it pushes a duplicate copy of
- * the return address before pushing the frame pointer.
+ * After aligning the stack, it pushes a duplicate copy of the return
+ * address before pushing the frame pointer.
+ */
+ return (state->bp == aligned_bp && *(aligned_bp + 1) == *(last_bp + 1));
+}
+
+static bool is_last_ftrace_frame(struct unwind_state *state)
+{
+ unsigned long *last_bp = last_frame(state);
+ unsigned long *last_ftrace_bp = last_bp - 3;
+
+ /*
+ * When unwinding from an ftrace handler of a function called by entry
+ * code, the stack layout of the last frame is:
+ *
+ * bp
+ * parent ret addr
+ * bp
+ * function ret addr
+ * parent ret addr
+ * pt_regs
+ * -----------------
*/
- return (state->bp == last_bp ||
- (state->bp == aligned_bp && *(aligned_bp+1) == *(last_bp+1)));
+ return (state->bp == last_ftrace_bp &&
+ *state->bp == *(state->bp + 2) &&
+ *(state->bp + 1) == *(state->bp + 4));
+}
+
+static bool is_last_task_frame(struct unwind_state *state)
+{
+ return is_last_frame(state) || is_last_aligned_frame(state) ||
+ is_last_ftrace_frame(state);
}
/*
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index a181ae76c71c..59ca2eea522c 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -780,18 +780,20 @@ out:
static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
{
struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
- int j, nent = vcpu->arch.cpuid_nent;
+ struct kvm_cpuid_entry2 *ej;
+ int j = i;
+ int nent = vcpu->arch.cpuid_nent;
e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
/* when no next entry is found, the current entry[i] is reselected */
- for (j = i + 1; ; j = (j + 1) % nent) {
- struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
- if (ej->function == e->function) {
- ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
- return j;
- }
- }
- return 0; /* silence gcc, even though control never reaches here */
+ do {
+ j = (j + 1) % nent;
+ ej = &vcpu->arch.cpuid_entries[j];
+ } while (ej->function != e->function);
+
+ ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+
+ return j;
}
/* find an entry with matching function, matching index (if needed), and that
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c25cfaf584e7..80890dee66ce 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2742,6 +2742,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
}
+ ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
return X86EMUL_CONTINUE;
}
@@ -4173,7 +4174,7 @@ static int check_dr_write(struct x86_emulate_ctxt *ctxt)
static int check_svme(struct x86_emulate_ctxt *ctxt)
{
- u64 efer;
+ u64 efer = 0;
ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c329d2894905..d24c8742d9b0 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1495,8 +1495,10 @@ EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
static void cancel_hv_timer(struct kvm_lapic *apic)
{
+ preempt_disable();
kvm_x86_ops->cancel_hv_timer(apic->vcpu);
apic->lapic_timer.hv_timer_in_use = false;
+ preempt_enable();
}
static bool start_hv_timer(struct kvm_lapic *apic)
@@ -1934,7 +1936,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
for (i = 0; i < KVM_APIC_LVT_NUM; i++)
kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
apic_update_lvtt(apic);
- if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
+ if (kvm_vcpu_is_reset_bsp(vcpu) &&
+ kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
kvm_lapic_set_reg(apic, APIC_LVT0,
SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 558676538fca..cb8225969255 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1498,6 +1498,21 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
+/**
+ * kvm_arch_write_log_dirty - emulate dirty page logging
+ * @vcpu: Guest mode vcpu
+ *
+ * Emulate arch specific page modification logging for the
+ * nested hypervisor
+ */
+int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
+{
+ if (kvm_x86_ops->write_log_dirty)
+ return kvm_x86_ops->write_log_dirty(vcpu);
+
+ return 0;
+}
+
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn)
{
@@ -3683,12 +3698,15 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
}
-static bool can_do_async_pf(struct kvm_vcpu *vcpu)
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
{
if (unlikely(!lapic_in_kernel(vcpu) ||
kvm_event_needs_reinjection(vcpu)))
return false;
+ if (is_guest_mode(vcpu))
+ return false;
+
return kvm_x86_ops->interrupt_allowed(vcpu);
}
@@ -3704,7 +3722,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
if (!async)
return false; /* *pfn has correct page already */
- if (!prefault && can_do_async_pf(vcpu)) {
+ if (!prefault && kvm_can_do_async_pf(vcpu)) {
trace_kvm_try_async_get_page(gva, gfn);
if (kvm_find_async_pf_gfn(vcpu, gfn)) {
trace_kvm_async_pf_doublefault(gva, gfn);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index d8ccb32f7308..330bf3a811fb 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -76,6 +76,7 @@ int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct);
void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
bool accessed_dirty);
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
{
@@ -202,4 +203,5 @@ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn);
+int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
#endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 314d2071b337..b0454c7e4cff 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -226,6 +226,10 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
if (level == walker->level && write_fault &&
!(pte & PT_GUEST_DIRTY_MASK)) {
trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
+#if PTTYPE == PTTYPE_EPT
+ if (kvm_arch_write_log_dirty(vcpu))
+ return -EINVAL;
+#endif
pte |= PT_GUEST_DIRTY_MASK;
}
if (pte == orig_pte)
@@ -279,11 +283,13 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
pt_element_t pte;
pt_element_t __user *uninitialized_var(ptep_user);
gfn_t table_gfn;
- unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey;
+ u64 pt_access, pte_access;
+ unsigned index, accessed_dirty, pte_pkey;
unsigned nested_access;
gpa_t pte_gpa;
bool have_ad;
int offset;
+ u64 walk_nx_mask = 0;
const int write_fault = access & PFERR_WRITE_MASK;
const int user_fault = access & PFERR_USER_MASK;
const int fetch_fault = access & PFERR_FETCH_MASK;
@@ -298,6 +304,7 @@ retry_walk:
have_ad = PT_HAVE_ACCESSED_DIRTY(mmu);
#if PTTYPE == 64
+ walk_nx_mask = 1ULL << PT64_NX_SHIFT;
if (walker->level == PT32E_ROOT_LEVEL) {
pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
trace_kvm_mmu_paging_element(pte, walker->level);
@@ -309,8 +316,6 @@ retry_walk:
walker->max_level = walker->level;
ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu)));
- accessed_dirty = have_ad ? PT_GUEST_ACCESSED_MASK : 0;
-
/*
* FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
* by the MOV to CR instruction are treated as reads and do not cause the
@@ -318,14 +323,14 @@ retry_walk:
*/
nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;
- pt_access = pte_access = ACC_ALL;
+ pte_access = ~0;
++walker->level;
do {
gfn_t real_gfn;
unsigned long host_addr;
- pt_access &= pte_access;
+ pt_access = pte_access;
--walker->level;
index = PT_INDEX(addr, walker->level);
@@ -367,6 +372,12 @@ retry_walk:
trace_kvm_mmu_paging_element(pte, walker->level);
+ /*
+	 * Inverting the NX bit lets us AND it like other
+ * permission bits.
+ */
+ pte_access = pt_access & (pte ^ walk_nx_mask);
+
if (unlikely(!FNAME(is_present_gpte)(pte)))
goto error;
@@ -375,14 +386,16 @@ retry_walk:
goto error;
}
- accessed_dirty &= pte;
- pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
-
walker->ptes[walker->level - 1] = pte;
} while (!is_last_gpte(mmu, walker->level, pte));
pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
- errcode = permission_fault(vcpu, mmu, pte_access, pte_pkey, access);
+ accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
+
+ /* Convert to ACC_*_MASK flags for struct guest_walker. */
+ walker->pt_access = FNAME(gpte_access)(vcpu, pt_access ^ walk_nx_mask);
+ walker->pte_access = FNAME(gpte_access)(vcpu, pte_access ^ walk_nx_mask);
+ errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
if (unlikely(errcode))
goto error;
@@ -399,7 +412,7 @@ retry_walk:
walker->gfn = real_gpa >> PAGE_SHIFT;
if (!write_fault)
- FNAME(protect_clean_gpte)(mmu, &pte_access, pte);
+ FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte);
else
/*
* On a write fault, fold the dirty bit into accessed_dirty.
@@ -417,10 +430,8 @@ retry_walk:
goto retry_walk;
}
- walker->pt_access = pt_access;
- walker->pte_access = pte_access;
pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
- __func__, (u64)pte, pte_access, pt_access);
+ __func__, (u64)pte, walker->pte_access, walker->pt_access);
return 1;
error:
@@ -448,7 +459,7 @@ error:
*/
if (!(errcode & PFERR_RSVD_MASK)) {
vcpu->arch.exit_qualification &= 0x187;
- vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3;
+ vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3;
}
#endif
walker->fault.address = addr;
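
The walk_addr_generic() change above accumulates guest permissions by ANDing raw PTE bits across levels; NX has inverted polarity, so each entry is XORed with walk_nx_mask first, which lets all permission bits be combined the same way, and the result is XORed back before conversion to ACC_*_MASK flags. A standalone illustration of the trick, with simplified bit names standing in for the kernel's definitions:

#include <stdint.h>
#include <stdio.h>

#define PTE_W	(1ull << 1)	/* 1 = writable */
#define PTE_U	(1ull << 2)	/* 1 = user-accessible */
#define PTE_NX	(1ull << 63)	/* 1 = NOT executable (inverted polarity) */

/*
 * Fold permissions over a walk by ANDing entries, after flipping NX
 * so that 1 consistently means "allowed" for every bit.
 */
static uint64_t effective_access(const uint64_t *ptes, int levels)
{
	uint64_t access = ~0ull;		/* start from "everything allowed" */
	int i;

	for (i = 0; i < levels; i++)
		access &= ptes[i] ^ PTE_NX;	/* NX=0 becomes "exec allowed"=1 */

	return access ^ PTE_NX;			/* flip NX back to hardware polarity */
}

int main(void)
{
	/* The upper level allows execution (NX=0), the leaf forbids it (NX=1). */
	uint64_t walk[2] = { PTE_W | PTE_U, PTE_W | PTE_U | PTE_NX };
	uint64_t acc = effective_access(walk, 2);

	printf("W=%d U=%d NX=%d\n",
	       !!(acc & PTE_W), !!(acc & PTE_U), !!(acc & PTE_NX));
	return 0;
}
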
diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c
index 9d4a8504a95a..5ab4a364348e 100644
--- a/arch/x86/kvm/pmu_intel.c
+++ b/arch/x86/kvm/pmu_intel.c
@@ -294,7 +294,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
((u64)1 << edx.split.bit_width_fixed) - 1;
}
- pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
+ pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) |
(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
pmu->global_ctrl_mask = ~pmu->global_ctrl;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c27ac6923a18..33460fcdeef9 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -36,6 +36,7 @@
#include <linux/slab.h>
#include <linux/amd-iommu.h>
#include <linux/hashtable.h>
+#include <linux/frame.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
@@ -1272,7 +1273,8 @@ static void init_vmcb(struct vcpu_svm *svm)
}
-static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, int index)
+static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
+ unsigned int index)
{
u64 *avic_physical_id_table;
struct kvm_arch *vm_data = &vcpu->kvm->arch;
@@ -1806,7 +1808,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
* AMD's VMCB does not have an explicit unusable field, so emulate it
* for cross vendor migration purposes by "not present"
*/
- var->unusable = !var->present || (var->type == 0);
+ var->unusable = !var->present;
switch (seg) {
case VCPU_SREG_TR:
@@ -1839,6 +1841,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
*/
if (var->unusable)
var->db = 0;
+ /* This is symmetric with svm_set_segment() */
var->dpl = to_svm(vcpu)->vmcb->save.cpl;
break;
}
@@ -1979,18 +1982,14 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
s->base = var->base;
s->limit = var->limit;
s->selector = var->selector;
- if (var->unusable)
- s->attrib = 0;
- else {
- s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
- s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
- s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
- s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
- s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
- s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
- s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
- s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
- }
+ s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
+ s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
+ s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
+ s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
+ s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
+ s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
+ s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
+ s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
/*
* This is always accurate, except if SYSRET returned to a segment
@@ -1999,7 +1998,8 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
* would entail passing the CPL to userspace and back.
*/
if (seg == VCPU_SREG_SS)
- svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
+ /* This is symmetric with svm_get_segment() */
+ svm->vmcb->save.cpl = (var->dpl & 3);
mark_dirty(svm->vmcb, VMCB_SEG);
}
@@ -4907,6 +4907,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
mark_all_clean(svm->vmcb);
}
+STACK_FRAME_NON_STANDARD(svm_vcpu_run);
static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
{
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c5fd459c4043..6dcc4873e435 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -33,6 +33,7 @@
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/hrtimer.h>
+#include <linux/frame.h>
#include "kvm_cache_regs.h"
#include "x86.h"
@@ -48,6 +49,7 @@
#include <asm/kexec.h>
#include <asm/apic.h>
#include <asm/irq_remapping.h>
+#include <asm/mmu_context.h>
#include "trace.h"
#include "pmu.h"
@@ -248,6 +250,7 @@ struct __packed vmcs12 {
u64 xss_exit_bitmap;
u64 guest_physical_address;
u64 vmcs_link_pointer;
+ u64 pml_address;
u64 guest_ia32_debugctl;
u64 guest_ia32_pat;
u64 guest_ia32_efer;
@@ -369,6 +372,7 @@ struct __packed vmcs12 {
u16 guest_ldtr_selector;
u16 guest_tr_selector;
u16 guest_intr_status;
+ u16 guest_pml_index;
u16 host_es_selector;
u16 host_cs_selector;
u16 host_ss_selector;
@@ -407,6 +411,7 @@ struct nested_vmx {
/* Has the level1 guest done vmxon? */
bool vmxon;
gpa_t vmxon_ptr;
+ bool pml_full;
/* The guest-physical address of the current VMCS L1 keeps for L2 */
gpa_t current_vmptr;
@@ -593,6 +598,7 @@ struct vcpu_vmx {
int gs_ldt_reload_needed;
int fs_reload_needed;
u64 msr_host_bndcfgs;
+ unsigned long vmcs_host_cr3; /* May not match real cr3 */
unsigned long vmcs_host_cr4; /* May not match real cr4 */
} host_state;
struct {
@@ -742,6 +748,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
FIELD(GUEST_INTR_STATUS, guest_intr_status),
+ FIELD(GUEST_PML_INDEX, guest_pml_index),
FIELD(HOST_ES_SELECTOR, host_es_selector),
FIELD(HOST_CS_SELECTOR, host_cs_selector),
FIELD(HOST_SS_SELECTOR, host_ss_selector),
@@ -767,6 +774,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
+ FIELD64(PML_ADDRESS, pml_address),
FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
@@ -1314,6 +1322,11 @@ static inline bool report_flexpriority(void)
return flexpriority_enabled;
}
+static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
+{
+ return vmx_misc_cr3_count(to_vmx(vcpu)->nested.nested_vmx_misc_low);
+}
+
static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
{
return vmcs12->cpu_based_vm_exec_control & bit;
@@ -1348,6 +1361,11 @@ static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
vmx_xsaves_supported();
}
+static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML);
+}
+
static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
{
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
@@ -2410,7 +2428,7 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr)
if (!(vmcs12->exception_bitmap & (1u << nr)))
return 0;
- nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
vmcs_read32(VM_EXIT_INTR_INFO),
vmcs_readl(EXIT_QUALIFICATION));
return 1;
@@ -2751,8 +2769,11 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
VMX_EPT_1GB_PAGE_BIT;
- if (enable_ept_ad_bits)
+ if (enable_ept_ad_bits) {
+ vmx->nested.nested_vmx_secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_PML;
vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
+ }
} else
vmx->nested.nested_vmx_ept_caps = 0;
@@ -4994,12 +5015,19 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
u32 low32, high32;
unsigned long tmpl;
struct desc_ptr dt;
- unsigned long cr0, cr4;
+ unsigned long cr0, cr3, cr4;
cr0 = read_cr0();
WARN_ON(cr0 & X86_CR0_TS);
vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
- vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
+
+ /*
+ * Save the most likely value for this task's CR3 in the VMCS.
+ * We can't use __get_current_cr3_fast() because we're not atomic.
+ */
+ cr3 = __read_cr3();
+ vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
+ vmx->host_state.vmcs_host_cr3 = cr3;
/* Save the most likely value for this task's CR4 in the VMCS. */
cr4 = cr4_read_shadow();
@@ -6486,7 +6514,7 @@ static __init int hardware_setup(void)
enable_ept_ad_bits = 0;
}
- if (!cpu_has_vmx_ept_ad_bits())
+ if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
enable_ept_ad_bits = 0;
if (!cpu_has_vmx_unrestricted_guest())
@@ -6896,97 +6924,21 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
return 0;
}
-/*
- * This function performs the various checks including
- * - if it's 4KB aligned
- * - No bits beyond the physical address width are set
- * - Returns 0 on success or else 1
- * (Intel SDM Section 30.3)
- */
-static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
- gpa_t *vmpointer)
+static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
{
gva_t gva;
- gpa_t vmptr;
struct x86_exception e;
- struct page *page;
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- int maxphyaddr = cpuid_maxphyaddr(vcpu);
if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
return 1;
- if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
- sizeof(vmptr), &e)) {
+ if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, vmpointer,
+ sizeof(*vmpointer), &e)) {
kvm_inject_page_fault(vcpu, &e);
return 1;
}
- switch (exit_reason) {
- case EXIT_REASON_VMON:
- /*
- * SDM 3: 24.11.5
- * The first 4 bytes of VMXON region contain the supported
- * VMCS revision identifier
- *
- * Note - IA32_VMX_BASIC[48] will never be 1
- * for the nested case;
- * which replaces physical address width with 32
- *
- */
- if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
- nested_vmx_failInvalid(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
- }
-
- page = nested_get_page(vcpu, vmptr);
- if (page == NULL) {
- nested_vmx_failInvalid(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
- }
- if (*(u32 *)kmap(page) != VMCS12_REVISION) {
- kunmap(page);
- nested_release_page_clean(page);
- nested_vmx_failInvalid(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
- }
- kunmap(page);
- nested_release_page_clean(page);
- vmx->nested.vmxon_ptr = vmptr;
- break;
- case EXIT_REASON_VMCLEAR:
- if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
- nested_vmx_failValid(vcpu,
- VMXERR_VMCLEAR_INVALID_ADDRESS);
- return kvm_skip_emulated_instruction(vcpu);
- }
-
- if (vmptr == vmx->nested.vmxon_ptr) {
- nested_vmx_failValid(vcpu,
- VMXERR_VMCLEAR_VMXON_POINTER);
- return kvm_skip_emulated_instruction(vcpu);
- }
- break;
- case EXIT_REASON_VMPTRLD:
- if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
- nested_vmx_failValid(vcpu,
- VMXERR_VMPTRLD_INVALID_ADDRESS);
- return kvm_skip_emulated_instruction(vcpu);
- }
-
- if (vmptr == vmx->nested.vmxon_ptr) {
- nested_vmx_failValid(vcpu,
- VMXERR_VMPTRLD_VMXON_POINTER);
- return kvm_skip_emulated_instruction(vcpu);
- }
- break;
- default:
- return 1; /* shouldn't happen */
- }
-
- if (vmpointer)
- *vmpointer = vmptr;
return 0;
}
@@ -7048,6 +7000,8 @@ out_msr_bitmap:
static int handle_vmon(struct kvm_vcpu *vcpu)
{
int ret;
+ gpa_t vmptr;
+ struct page *page;
struct vcpu_vmx *vmx = to_vmx(vcpu);
const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
@@ -7077,9 +7031,37 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
return 1;
}
- if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL))
+ if (nested_vmx_get_vmptr(vcpu, &vmptr))
return 1;
-
+
+ /*
+ * SDM 3: 24.11.5
+ * The first 4 bytes of VMXON region contain the supported
+ * VMCS revision identifier
+ *
+	 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
+	 * setting it would limit these physical addresses to 32 bits.
+ */
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
+ nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ page = nested_get_page(vcpu, vmptr);
+ if (page == NULL) {
+ nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ if (*(u32 *)kmap(page) != VMCS12_REVISION) {
+ kunmap(page);
+ nested_release_page_clean(page);
+ nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ kunmap(page);
+ nested_release_page_clean(page);
+
+ vmx->nested.vmxon_ptr = vmptr;
ret = enter_vmx_operation(vcpu);
if (ret)
return ret;
@@ -7195,9 +7177,19 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr))
+ if (nested_vmx_get_vmptr(vcpu, &vmptr))
return 1;
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
+ nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ if (vmptr == vmx->nested.vmxon_ptr) {
+ nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
if (vmptr == vmx->nested.current_vmptr)
nested_release_vmcs12(vmx);
@@ -7527,9 +7519,19 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr))
+ if (nested_vmx_get_vmptr(vcpu, &vmptr))
return 1;
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
+ nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ if (vmptr == vmx->nested.vmxon_ptr) {
+ nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
if (vmx->nested.current_vmptr != vmptr) {
struct vmcs12 *new_vmcs12;
struct page *page;
@@ -7895,11 +7897,13 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
{
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
int cr = exit_qualification & 15;
- int reg = (exit_qualification >> 8) & 15;
- unsigned long val = kvm_register_readl(vcpu, reg);
+ int reg;
+ unsigned long val;
switch ((exit_qualification >> 4) & 3) {
case 0: /* mov to cr */
+ reg = (exit_qualification >> 8) & 15;
+ val = kvm_register_readl(vcpu, reg);
switch (cr) {
case 0:
if (vmcs12->cr0_guest_host_mask &
@@ -7954,6 +7958,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
* lmsw can change bits 1..3 of cr0, and only set bit 0 of
* cr0. Other attempted changes are ignored, with no exit.
*/
+ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
if (vmcs12->cr0_guest_host_mask & 0xe &
(val ^ vmcs12->cr0_read_shadow))
return true;
@@ -8114,7 +8119,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
case EXIT_REASON_PREEMPTION_TIMER:
return false;
case EXIT_REASON_PML_FULL:
- /* We don't expose PML support to L1. */
+		/* We emulate PML support for L1. */
return false;
default:
return true;
@@ -8657,6 +8662,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
);
}
}
+STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
static bool vmx_has_high_real_mode_segbase(void)
{
@@ -8825,7 +8831,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long debugctlmsr, cr4;
+ unsigned long debugctlmsr, cr3, cr4;
/* Don't enter VMX if guest state is invalid, let the exit handler
start emulation until we arrive back to a valid state */
@@ -8847,6 +8853,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+ cr3 = __get_current_cr3_fast();
+ if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) {
+ vmcs_writel(HOST_CR3, cr3);
+ vmx->host_state.vmcs_host_cr3 = cr3;
+ }
+
cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
vmcs_writel(HOST_CR4, cr4);
@@ -9033,6 +9045,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
}
+STACK_FRAME_NON_STANDARD(vmx_vcpu_run);
static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
@@ -9364,13 +9377,20 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 exit_reason;
+ unsigned long exit_qualification = vcpu->arch.exit_qualification;
- if (fault->error_code & PFERR_RSVD_MASK)
+ if (vmx->nested.pml_full) {
+ exit_reason = EXIT_REASON_PML_FULL;
+ vmx->nested.pml_full = false;
+ exit_qualification &= INTR_INFO_UNBLOCK_NMI;
+ } else if (fault->error_code & PFERR_RSVD_MASK)
exit_reason = EXIT_REASON_EPT_MISCONFIG;
else
exit_reason = EXIT_REASON_EPT_VIOLATION;
- nested_vmx_vmexit(vcpu, exit_reason, 0, vcpu->arch.exit_qualification);
+
+ nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
vmcs12->guest_physical_address = fault->address;
}
@@ -9713,6 +9733,22 @@ static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
return 0;
}
+static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ u64 address = vmcs12->pml_address;
+ int maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+ if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) {
+ if (!nested_cpu_has_ept(vmcs12) ||
+ !IS_ALIGNED(address, 4096) ||
+ address >> maxphyaddr)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
struct vmx_msr_entry *e)
{
@@ -9886,7 +9922,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
bool from_vmentry, u32 *entry_failure_code)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 exec_control;
+ u32 exec_control, vmcs12_exec_ctrl;
vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
@@ -10017,8 +10053,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_APIC_REGISTER_VIRT);
if (nested_cpu_has(vmcs12,
- CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
- exec_control |= vmcs12->secondary_vm_exec_control;
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
+ vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
+ ~SECONDARY_EXEC_ENABLE_PML;
+ exec_control |= vmcs12_exec_ctrl;
+ }
if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
vmcs_write64(EOI_EXIT_BITMAP0,
@@ -10248,6 +10287,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+ if (nested_vmx_check_pml_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
vmx->nested.nested_vmx_procbased_ctls_low,
vmx->nested.nested_vmx_procbased_ctls_high) ||
@@ -10266,6 +10308,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmx->nested.nested_vmx_entry_ctls_high))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+ if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
!nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
!nested_cr3_valid(vcpu, vmcs12->host_cr3))
@@ -11143,6 +11188,46 @@ static void vmx_flush_log_dirty(struct kvm *kvm)
kvm_flush_pml_buffers(kvm);
}
+static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ gpa_t gpa;
+ struct page *page = NULL;
+ u64 *pml_address;
+
+ if (is_guest_mode(vcpu)) {
+ WARN_ON_ONCE(vmx->nested.pml_full);
+
+ /*
+ * Check if PML is enabled for the nested guest.
+ * Whether eptp bit 6 is set is already checked
+ * as part of A/D emulation.
+ */
+ vmcs12 = get_vmcs12(vcpu);
+ if (!nested_cpu_has_pml(vmcs12))
+ return 0;
+
+ if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
+ vmx->nested.pml_full = true;
+ return 1;
+ }
+
+ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
+
+ page = nested_get_page(vcpu, vmcs12->pml_address);
+ if (!page)
+ return 0;
+
+ pml_address = kmap(page);
+ pml_address[vmcs12->guest_pml_index--] = gpa;
+ kunmap(page);
+ nested_release_page_clean(page);
+ }
+
+ return 0;
+}
+
static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *memslot,
gfn_t offset, unsigned long mask)
@@ -11502,6 +11587,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
.flush_log_dirty = vmx_flush_log_dirty,
.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
+ .write_log_dirty = vmx_write_pml_buffer,
.pre_block = vmx_pre_block,
.post_block = vmx_post_block,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 464da936c53d..0e846f0cb83b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1763,6 +1763,7 @@ u64 get_kvmclock_ns(struct kvm *kvm)
{
struct kvm_arch *ka = &kvm->arch;
struct pvclock_vcpu_time_info hv_clock;
+ u64 ret;
spin_lock(&ka->pvclock_gtod_sync_lock);
if (!ka->use_master_clock) {
@@ -1774,10 +1775,17 @@ u64 get_kvmclock_ns(struct kvm *kvm)
hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
spin_unlock(&ka->pvclock_gtod_sync_lock);
+ /* both __this_cpu_read() and rdtsc() should be on the same cpu */
+ get_cpu();
+
kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
&hv_clock.tsc_shift,
&hv_clock.tsc_to_system_mul);
- return __pvclock_read_cycles(&hv_clock, rdtsc());
+ ret = __pvclock_read_cycles(&hv_clock, rdtsc());
+
+ put_cpu();
+
+ return ret;
}
static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
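
The get_kvmclock_ns() hunk above pins the caller to one CPU so that the per-CPU tsc_khz value and the rdtsc() result describe the same processor; a preemption between the two reads could otherwise pair one CPU's calibration with another CPU's counter. A minimal sketch of the pattern, using a made-up per-CPU variable and assuming the usual kernel percpu/TSC helpers:

/*
 * Sketch only: my_tsc_khz and read_tsc_pair() are illustrative names.
 * get_cpu() disables preemption and pins us to the current CPU, so the
 * per-CPU read and the TSC read are guaranteed to be coherent.
 */
static DEFINE_PER_CPU(unsigned long, my_tsc_khz);

static void read_tsc_pair(unsigned long *khz, u64 *tsc)
{
	get_cpu();			/* disable preemption, pin to this CPU */
	*khz = __this_cpu_read(my_tsc_khz);
	*tsc = rdtsc();
	put_cpu();			/* re-enable preemption */
}
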
@@ -3288,11 +3296,14 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
}
}
+#define XSAVE_MXCSR_OFFSET 24
+
static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
u64 xstate_bv =
*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
+ u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
/*
@@ -3300,11 +3311,13 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
* CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
* with old userspace.
*/
- if (xstate_bv & ~kvm_supported_xcr0())
+ if (xstate_bv & ~kvm_supported_xcr0() ||
+ mxcsr & ~mxcsr_feature_mask)
return -EINVAL;
load_xsave(vcpu, (u8 *)guest_xsave->region);
} else {
- if (xstate_bv & ~XFEATURE_MASK_FPSSE)
+ if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
+ mxcsr & ~mxcsr_feature_mask)
return -EINVAL;
memcpy(&vcpu->arch.guest_fpu.state.fxsave,
guest_xsave->region, sizeof(struct fxregs_state));
@@ -4818,16 +4831,20 @@ emul_write:
static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
{
- /* TODO: String I/O for in kernel device */
- int r;
+ int r = 0, i;
- if (vcpu->arch.pio.in)
- r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
- vcpu->arch.pio.size, pd);
- else
- r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
- vcpu->arch.pio.port, vcpu->arch.pio.size,
- pd);
+ for (i = 0; i < vcpu->arch.pio.count; i++) {
+ if (vcpu->arch.pio.in)
+ r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
+ vcpu->arch.pio.size, pd);
+ else
+ r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
+ vcpu->arch.pio.port, vcpu->arch.pio.size,
+ pd);
+ if (r)
+ break;
+ pd += vcpu->arch.pio.size;
+ }
return r;
}
@@ -4865,6 +4882,8 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
if (vcpu->arch.pio.count)
goto data_avail;
+ memset(vcpu->arch.pio_data, 0, size * count);
+
ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
if (ret) {
data_avail:
@@ -5048,6 +5067,8 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
if (var.unusable) {
memset(desc, 0, sizeof(*desc));
+ if (base3)
+ *base3 = 0;
return false;
}
@@ -5292,6 +5313,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
ctxt->eflags = kvm_get_rflags(vcpu);
+ ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
+
ctxt->eip = kvm_rip_read(vcpu);
ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
(ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
@@ -5507,36 +5530,25 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
return dr6;
}
-static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r)
+static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
{
struct kvm_run *kvm_run = vcpu->run;
- /*
- * rflags is the old, "raw" value of the flags. The new value has
- * not been saved yet.
- *
- * This is correct even for TF set by the guest, because "the
- * processor will not generate this exception after the instruction
- * that sets the TF flag".
- */
- if (unlikely(rflags & X86_EFLAGS_TF)) {
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
- kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 |
- DR6_RTM;
- kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
- kvm_run->debug.arch.exception = DB_VECTOR;
- kvm_run->exit_reason = KVM_EXIT_DEBUG;
- *r = EMULATE_USER_EXIT;
- } else {
- /*
- * "Certain debug exceptions may clear bit 0-3. The
- * remaining contents of the DR6 register are never
- * cleared by the processor".
- */
- vcpu->arch.dr6 &= ~15;
- vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
- kvm_queue_exception(vcpu, DB_VECTOR);
- }
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+ kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
+ kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ *r = EMULATE_USER_EXIT;
+ } else {
+ /*
+ * "Certain debug exceptions may clear bit 0-3. The
+ * remaining contents of the DR6 register are never
+ * cleared by the processor".
+ */
+ vcpu->arch.dr6 &= ~15;
+ vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
+ kvm_queue_exception(vcpu, DB_VECTOR);
}
}
@@ -5546,7 +5558,17 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
int r = EMULATE_DONE;
kvm_x86_ops->skip_emulated_instruction(vcpu);
- kvm_vcpu_check_singlestep(vcpu, rflags, &r);
+
+ /*
+ * rflags is the old, "raw" value of the flags. The new value has
+ * not been saved yet.
+ *
+ * This is correct even for TF set by the guest, because "the
+ * processor will not generate this exception after the instruction
+ * that sets the TF flag".
+ */
+ if (unlikely(rflags & X86_EFLAGS_TF))
+ kvm_vcpu_do_singlestep(vcpu, &r);
return r == EMULATE_DONE;
}
EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
@@ -5705,8 +5727,9 @@ restart:
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
kvm_rip_write(vcpu, ctxt->eip);
- if (r == EMULATE_DONE)
- kvm_vcpu_check_singlestep(vcpu, rflags, &r);
+ if (r == EMULATE_DONE &&
+ (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
+ kvm_vcpu_do_singlestep(vcpu, &r);
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP)
__kvm_set_rflags(vcpu, ctxt->eflags);
@@ -8373,10 +8396,13 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
if (vcpu->arch.pv.pv_unhalted)
return true;
- if (atomic_read(&vcpu->arch.nmi_queued))
+ if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+ (vcpu->arch.nmi_pending &&
+ kvm_x86_ops->nmi_allowed(vcpu)))
return true;
- if (kvm_test_request(KVM_REQ_SMI, vcpu))
+ if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
+ (vcpu->arch.smi_pending && !is_smm(vcpu)))
return true;
if (kvm_arch_interrupt_allowed(vcpu) &&
@@ -8583,8 +8609,7 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
return true;
else
- return !kvm_event_needs_reinjection(vcpu) &&
- kvm_x86_ops->interrupt_allowed(vcpu);
+ return kvm_can_do_async_pf(vcpu);
}
void kvm_arch_start_assignment(struct kvm *kvm)
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index c5959576c315..020f75cc8cf6 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -37,7 +37,7 @@ ENTRY(copy_user_generic_unrolled)
movl %edx,%ecx
andl $63,%edx
shrl $6,%ecx
- jz 17f
+ jz .L_copy_short_string
1: movq (%rsi),%r8
2: movq 1*8(%rsi),%r9
3: movq 2*8(%rsi),%r10
@@ -58,7 +58,8 @@ ENTRY(copy_user_generic_unrolled)
leaq 64(%rdi),%rdi
decl %ecx
jnz 1b
-17: movl %edx,%ecx
+.L_copy_short_string:
+ movl %edx,%ecx
andl $7,%edx
shrl $3,%ecx
jz 20f
@@ -174,6 +175,8 @@ EXPORT_SYMBOL(copy_user_generic_string)
*/
ENTRY(copy_user_enhanced_fast_string)
ASM_STAC
+ cmpl $64,%edx
+	jb .L_copy_short_string	/* less than 64 bytes, avoid the costly 'rep' */
movl %edx,%ecx
1: rep
movsb
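
The copy_user_enhanced_fast_string change above steers copies shorter than 64 bytes away from 'rep movsb', whose startup latency dominates at small sizes, and into the existing unrolled short-string path. Roughly, in C, with the threshold and both branch bodies as plain-C stand-ins for the assembly paths:

#include <stddef.h>
#include <string.h>

#define REP_MOVSB_CUTOFF 64	/* below this, 'rep movsb' startup cost dominates */

/* Dispatch equivalent to what the patched assembly now does. */
static void copy_dispatch(void *dst, const void *src, size_t len)
{
	if (len < REP_MOVSB_CUTOFF) {
		unsigned char *d = dst;
		const unsigned char *s = src;

		while (len--)
			*d++ = *s++;	/* stands in for .L_copy_short_string */
	} else {
		memcpy(dst, src, len);	/* stands in for the 'rep movsb' path */
	}
}
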
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index 7e48807b2fa1..45a53dfe1859 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -55,7 +55,7 @@ ENTRY(csum_partial_copy_generic)
movq %r12, 3*8(%rsp)
movq %r14, 4*8(%rsp)
movq %r13, 5*8(%rsp)
- movq %rbp, 6*8(%rsp)
+ movq %r15, 6*8(%rsp)
movq %r8, (%rsp)
movq %r9, 1*8(%rsp)
@@ -74,7 +74,7 @@ ENTRY(csum_partial_copy_generic)
/* main loop. clear in 64 byte blocks */
/* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
/* r11: temp3, rdx: temp4, r12 loopcnt */
- /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */
+ /* r10: temp5, r15: temp6, r14 temp7, r13 temp8 */
.p2align 4
.Lloop:
source
@@ -89,7 +89,7 @@ ENTRY(csum_partial_copy_generic)
source
movq 32(%rdi), %r10
source
- movq 40(%rdi), %rbp
+ movq 40(%rdi), %r15
source
movq 48(%rdi), %r14
source
@@ -103,7 +103,7 @@ ENTRY(csum_partial_copy_generic)
adcq %r11, %rax
adcq %rdx, %rax
adcq %r10, %rax
- adcq %rbp, %rax
+ adcq %r15, %rax
adcq %r14, %rax
adcq %r13, %rax
@@ -121,7 +121,7 @@ ENTRY(csum_partial_copy_generic)
dest
movq %r10, 32(%rsi)
dest
- movq %rbp, 40(%rsi)
+ movq %r15, 40(%rsi)
dest
movq %r14, 48(%rsi)
dest
@@ -203,7 +203,7 @@ ENTRY(csum_partial_copy_generic)
movq 3*8(%rsp), %r12
movq 4*8(%rsp), %r14
movq 5*8(%rsp), %r13
- movq 6*8(%rsp), %rbp
+ movq 6*8(%rsp), %r15
addq $7*8, %rsp
ret
diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c
index 5761a4f19455..ab2d1d73e9e7 100644
--- a/arch/x86/lib/kaslr.c
+++ b/arch/x86/lib/kaslr.c
@@ -5,6 +5,7 @@
* kernel starts. This file is included in the compressed kernel and
* normally linked in the regular.
*/
+#include <asm/asm.h>
#include <asm/kaslr.h>
#include <asm/msr.h>
#include <asm/archrandom.h>
@@ -79,7 +80,7 @@ unsigned long kaslr_get_random_long(const char *purpose)
}
/* Circular multiply for better bit diffusion */
- asm("mul %3"
+ asm(_ASM_MUL "%3"
: "=a" (random), "=d" (raw)
: "a" (random), "rm" (mix_const));
random += raw;
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index c81556409bbb..10ffa7e8519f 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -13,14 +13,14 @@
.macro op_safe_regs op
ENTRY(\op\()_safe_regs)
pushq %rbx
- pushq %rbp
+ pushq %r12
movq %rdi, %r10 /* Save pointer */
xorl %r11d, %r11d /* Return value */
movl (%rdi), %eax
movl 4(%rdi), %ecx
movl 8(%rdi), %edx
movl 12(%rdi), %ebx
- movl 20(%rdi), %ebp
+ movl 20(%rdi), %r12d
movl 24(%rdi), %esi
movl 28(%rdi), %edi
1: \op
@@ -29,10 +29,10 @@ ENTRY(\op\()_safe_regs)
movl %ecx, 4(%r10)
movl %edx, 8(%r10)
movl %ebx, 12(%r10)
- movl %ebp, 20(%r10)
+ movl %r12d, 20(%r10)
movl %esi, 24(%r10)
movl %edi, 28(%r10)
- popq %rbp
+ popq %r12
popq %rbx
ret
3:
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 767be7c76034..12e377184ee4 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -1009,7 +1009,7 @@ GrpTable: Grp15
1: fxstor | RDGSBASE Ry (F3),(11B)
2: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B)
3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B)
-4: XSAVE
+4: XSAVE | ptwrite Ey (F3),(11B)
5: XRSTOR | lfence (11B)
6: XSAVEOPT | clwb (66) | mfence (11B)
7: clflush | clflushopt (66) | sfence (11B)
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h
index 5e044d506b7a..a179254a5122 100644
--- a/arch/x86/math-emu/fpu_system.h
+++ b/arch/x86/math-emu/fpu_system.h
@@ -27,7 +27,7 @@ static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg)
#ifdef CONFIG_MODIFY_LDT_SYSCALL
seg >>= 3;
mutex_lock(&current->mm->context.lock);
- if (current->mm->context.ldt && seg < current->mm->context.ldt->size)
+ if (current->mm->context.ldt && seg < current->mm->context.ldt->nr_entries)
ret = current->mm->context.ldt->entries[seg];
mutex_unlock(&current->mm->context.lock);
#endif
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 96d2b847e09e..0fbdcb64f9f8 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -2,7 +2,7 @@
KCOV_INSTRUMENT_tlb.o := n
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
- pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o
+ pat.o pgtable.o physaddr.o setup_nx.o tlb.o
# Make sure __phys_addr has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index bce6990b1d81..0470826d2bdc 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -431,7 +431,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
bool checkwx)
{
#ifdef CONFIG_X86_64
- pgd_t *start = (pgd_t *) &init_level4_pgt;
+ pgd_t *start = (pgd_t *) &init_top_pgt;
#else
pgd_t *start = swapper_pg_dir;
#endif
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 35ea061010a1..0ea8afcb929c 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -162,6 +162,9 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
if (fixup_exception(regs, trapnr))
return;
+ if (fixup_bug(regs, trapnr))
+ return;
+
fail:
early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n",
(unsigned)trapnr, (unsigned long)regs->cs, regs->ip,
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8ad91a01cbc8..2a1fa10c6a98 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -346,7 +346,7 @@ static noinline int vmalloc_fault(unsigned long address)
* Do _not_ use "current" here. We might be inside
* an interrupt in the middle of a task switch..
*/
- pgd_paddr = read_cr3();
+ pgd_paddr = read_cr3_pa();
pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
if (!pmd_k)
return -1;
@@ -388,7 +388,7 @@ static bool low_pfn(unsigned long pfn)
static void dump_pagetable(unsigned long address)
{
- pgd_t *base = __va(read_cr3());
+ pgd_t *base = __va(read_cr3_pa());
pgd_t *pgd = &base[pgd_index(address)];
p4d_t *p4d;
pud_t *pud;
@@ -451,7 +451,7 @@ static noinline int vmalloc_fault(unsigned long address)
* happen within a race in page table update. In the later
* case just flush:
*/
- pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address);
+ pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
pgd_ref = pgd_offset_k(address);
if (pgd_none(*pgd_ref))
return -1;
@@ -555,7 +555,7 @@ static int bad_address(void *p)
static void dump_pagetable(unsigned long address)
{
- pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+ pgd_t *base = __va(read_cr3_pa());
pgd_t *pgd = base + pgd_index(address);
p4d_t *p4d;
pud_t *pud;
@@ -700,7 +700,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
pgd_t *pgd;
pte_t *pte;
- pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+ pgd = __va(read_cr3_pa());
pgd += pgd_index(address);
pte = lookup_address_in_pgd(pgd, address, &level);
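
These call sites switch from the raw CR3 value to a helper that returns only the page-table physical address, since CR3 also carries control bits (PWT/PCD, or a PCID when that feature is enabled). A simplified model of the idea follows, with the mask reduced to the low twelve bits for clarity; the real helper is more careful about the exact register layout.

#include <stdint.h>

/*
 * CR3 keeps control bits in its low 12 bits (PWT/PCD, or a PCID); the
 * physical address of the top-level page table sits above them.
 */
static inline uint64_t cr3_to_pa(uint64_t cr3)
{
	return cr3 & ~0xfffull;		/* keep only the table address */
}

For example, a raw CR3 value of 0x1234018 would yield a table base of 0x1234000.
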
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
deleted file mode 100644
index 456dfdfd2249..000000000000
--- a/arch/x86/mm/gup.c
+++ /dev/null
@@ -1,496 +0,0 @@
-/*
- * Lockless get_user_pages_fast for x86
- *
- * Copyright (C) 2008 Nick Piggin
- * Copyright (C) 2008 Novell Inc.
- */
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/vmstat.h>
-#include <linux/highmem.h>
-#include <linux/swap.h>
-#include <linux/memremap.h>
-
-#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
-
-static inline pte_t gup_get_pte(pte_t *ptep)
-{
-#ifndef CONFIG_X86_PAE
- return READ_ONCE(*ptep);
-#else
- /*
- * With get_user_pages_fast, we walk down the pagetables without taking
- * any locks. For this we would like to load the pointers atomically,
- * but that is not possible (without expensive cmpxchg8b) on PAE. What
- * we do have is the guarantee that a pte will only either go from not
- * present to present, or present to not present or both -- it will not
- * switch to a completely different present page without a TLB flush in
- * between; something that we are blocking by holding interrupts off.
- *
- * Setting ptes from not present to present goes:
- * ptep->pte_high = h;
- * smp_wmb();
- * ptep->pte_low = l;
- *
- * And present to not present goes:
- * ptep->pte_low = 0;
- * smp_wmb();
- * ptep->pte_high = 0;
- *
- * We must ensure here that the load of pte_low sees l iff pte_high
- * sees h. We load pte_high *after* loading pte_low, which ensures we
- * don't see an older value of pte_high. *Then* we recheck pte_low,
- * which ensures that we haven't picked up a changed pte high. We might
- * have got rubbish values from pte_low and pte_high, but we are
- * guaranteed that pte_low will not have the present bit set *unless*
- * it is 'l'. And get_user_pages_fast only operates on present ptes, so
- * we're safe.
- *
- * gup_get_pte should not be used or copied outside gup.c without being
- * very careful -- it does not atomically load the pte or anything that
- * is likely to be useful for you.
- */
- pte_t pte;
-
-retry:
- pte.pte_low = ptep->pte_low;
- smp_rmb();
- pte.pte_high = ptep->pte_high;
- smp_rmb();
- if (unlikely(pte.pte_low != ptep->pte_low))
- goto retry;
-
- return pte;
-#endif
-}
-
-static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
-{
- while ((*nr) - nr_start) {
- struct page *page = pages[--(*nr)];
-
- ClearPageReferenced(page);
- put_page(page);
- }
-}
-
-/*
- * 'pteval' can come from a pte, pmd, pud or p4d. We only check
- * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
- * same value on all 4 types.
- */
-static inline int pte_allows_gup(unsigned long pteval, int write)
-{
- unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
-
- if (write)
- need_pte_bits |= _PAGE_RW;
-
- if ((pteval & need_pte_bits) != need_pte_bits)
- return 0;
-
- /* Check memory protection keys permissions. */
- if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write))
- return 0;
-
- return 1;
-}
-
-/*
- * The performance critical leaf functions are made noinline otherwise gcc
- * inlines everything into a single function which results in too much
- * register pressure.
- */
-static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- struct dev_pagemap *pgmap = NULL;
- int nr_start = *nr, ret = 0;
- pte_t *ptep, *ptem;
-
- /*
- * Keep the original mapped PTE value (ptem) around since we
- * might increment ptep off the end of the page when finishing
- * our loop iteration.
- */
- ptem = ptep = pte_offset_map(&pmd, addr);
- do {
- pte_t pte = gup_get_pte(ptep);
- struct page *page;
-
- /* Similar to the PMD case, NUMA hinting must take slow path */
- if (pte_protnone(pte))
- break;
-
- if (!pte_allows_gup(pte_val(pte), write))
- break;
-
- if (pte_devmap(pte)) {
- pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
- if (unlikely(!pgmap)) {
- undo_dev_pagemap(nr, nr_start, pages);
- break;
- }
- } else if (pte_special(pte))
- break;
-
- VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
- page = pte_page(pte);
- get_page(page);
- put_dev_pagemap(pgmap);
- SetPageReferenced(page);
- pages[*nr] = page;
- (*nr)++;
-
- } while (ptep++, addr += PAGE_SIZE, addr != end);
- if (addr == end)
- ret = 1;
- pte_unmap(ptem);
-
- return ret;
-}
-
-static inline void get_head_page_multiple(struct page *page, int nr)
-{
- VM_BUG_ON_PAGE(page != compound_head(page), page);
- VM_BUG_ON_PAGE(page_count(page) == 0, page);
- page_ref_add(page, nr);
- SetPageReferenced(page);
-}
-
-static int __gup_device_huge(unsigned long pfn, unsigned long addr,
- unsigned long end, struct page **pages, int *nr)
-{
- int nr_start = *nr;
- struct dev_pagemap *pgmap = NULL;
-
- do {
- struct page *page = pfn_to_page(pfn);
-
- pgmap = get_dev_pagemap(pfn, pgmap);
- if (unlikely(!pgmap)) {
- undo_dev_pagemap(nr, nr_start, pages);
- return 0;
- }
- SetPageReferenced(page);
- pages[*nr] = page;
- get_page(page);
- put_dev_pagemap(pgmap);
- (*nr)++;
- pfn++;
- } while (addr += PAGE_SIZE, addr != end);
- return 1;
-}
-
-static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
- unsigned long end, struct page **pages, int *nr)
-{
- unsigned long fault_pfn;
-
- fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- return __gup_device_huge(fault_pfn, addr, end, pages, nr);
-}
-
-static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
- unsigned long end, struct page **pages, int *nr)
-{
- unsigned long fault_pfn;
-
- fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- return __gup_device_huge(fault_pfn, addr, end, pages, nr);
-}
-
-static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- struct page *head, *page;
- int refs;
-
- if (!pte_allows_gup(pmd_val(pmd), write))
- return 0;
-
- VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
- if (pmd_devmap(pmd))
- return __gup_device_huge_pmd(pmd, addr, end, pages, nr);
-
- /* hugepages are never "special" */
- VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
-
- refs = 0;
- head = pmd_page(pmd);
- page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- do {
- VM_BUG_ON_PAGE(compound_head(page) != head, page);
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
- get_head_page_multiple(head, refs);
-
- return 1;
-}
-
-static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
-{
- unsigned long next;
- pmd_t *pmdp;
-
- pmdp = pmd_offset(&pud, addr);
- do {
- pmd_t pmd = *pmdp;
-
- next = pmd_addr_end(addr, end);
- if (pmd_none(pmd))
- return 0;
- if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
- /*
- * NUMA hinting faults need to be handled in the GUP
- * slowpath for accounting purposes and so that they
- * can be serialised against THP migration.
- */
- if (pmd_protnone(pmd))
- return 0;
- if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
- return 0;
- } else {
- if (!gup_pte_range(pmd, addr, next, write, pages, nr))
- return 0;
- }
- } while (pmdp++, addr = next, addr != end);
-
- return 1;
-}
-
-static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- struct page *head, *page;
- int refs;
-
- if (!pte_allows_gup(pud_val(pud), write))
- return 0;
-
- VM_BUG_ON(!pfn_valid(pud_pfn(pud)));
- if (pud_devmap(pud))
- return __gup_device_huge_pud(pud, addr, end, pages, nr);
-
- /* hugepages are never "special" */
- VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
-
- refs = 0;
- head = pud_page(pud);
- page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- do {
- VM_BUG_ON_PAGE(compound_head(page) != head, page);
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
- get_head_page_multiple(head, refs);
-
- return 1;
-}
-
-static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
-{
- unsigned long next;
- pud_t *pudp;
-
- pudp = pud_offset(&p4d, addr);
- do {
- pud_t pud = *pudp;
-
- next = pud_addr_end(addr, end);
- if (pud_none(pud))
- return 0;
- if (unlikely(pud_large(pud))) {
- if (!gup_huge_pud(pud, addr, next, write, pages, nr))
- return 0;
- } else {
- if (!gup_pmd_range(pud, addr, next, write, pages, nr))
- return 0;
- }
- } while (pudp++, addr = next, addr != end);
-
- return 1;
-}
-
-static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
-{
- unsigned long next;
- p4d_t *p4dp;
-
- p4dp = p4d_offset(&pgd, addr);
- do {
- p4d_t p4d = *p4dp;
-
- next = p4d_addr_end(addr, end);
- if (p4d_none(p4d))
- return 0;
- BUILD_BUG_ON(p4d_large(p4d));
- if (!gup_pud_range(p4d, addr, next, write, pages, nr))
- return 0;
- } while (p4dp++, addr = next, addr != end);
-
- return 1;
-}
-
-/*
- * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
- * back to the regular GUP.
- */
-int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
-{
- struct mm_struct *mm = current->mm;
- unsigned long addr, len, end;
- unsigned long next;
- unsigned long flags;
- pgd_t *pgdp;
- int nr = 0;
-
- start &= PAGE_MASK;
- addr = start;
- len = (unsigned long) nr_pages << PAGE_SHIFT;
- end = start + len;
- if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
- (void __user *)start, len)))
- return 0;
-
- /*
- * XXX: batch / limit 'nr', to avoid large irq off latency
- * needs some instrumenting to determine the common sizes used by
- * important workloads (eg. DB2), and whether limiting the batch size
- * will decrease performance.
- *
- * It seems like we're in the clear for the moment. Direct-IO is
- * the main guy that batches up lots of get_user_pages, and even
- * they are limited to 64-at-a-time which is not so many.
- */
- /*
- * This doesn't prevent pagetable teardown, but does prevent
- * the pagetables and pages from being freed on x86.
- *
- * So long as we atomically load page table pointers versus teardown
- * (which we do on x86, with the above PAE exception), we can follow the
- * address down to the the page and take a ref on it.
- */
- local_irq_save(flags);
- pgdp = pgd_offset(mm, addr);
- do {
- pgd_t pgd = *pgdp;
-
- next = pgd_addr_end(addr, end);
- if (pgd_none(pgd))
- break;
- if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
- break;
- } while (pgdp++, addr = next, addr != end);
- local_irq_restore(flags);
-
- return nr;
-}
-
-/**
- * get_user_pages_fast() - pin user pages in memory
- * @start: starting user address
- * @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long.
- *
- * Attempt to pin user pages in memory without taking mm->mmap_sem.
- * If not successful, it will fall back to taking the lock and
- * calling get_user_pages().
- *
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno.
- */
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
-{
- struct mm_struct *mm = current->mm;
- unsigned long addr, len, end;
- unsigned long next;
- pgd_t *pgdp;
- int nr = 0;
-
- start &= PAGE_MASK;
- addr = start;
- len = (unsigned long) nr_pages << PAGE_SHIFT;
-
- end = start + len;
- if (end < start)
- goto slow_irqon;
-
-#ifdef CONFIG_X86_64
- if (end >> __VIRTUAL_MASK_SHIFT)
- goto slow_irqon;
-#endif
-
- /*
- * XXX: batch / limit 'nr', to avoid large irq off latency
- * needs some instrumenting to determine the common sizes used by
- * important workloads (eg. DB2), and whether limiting the batch size
- * will decrease performance.
- *
- * It seems like we're in the clear for the moment. Direct-IO is
- * the main guy that batches up lots of get_user_pages, and even
- * they are limited to 64-at-a-time which is not so many.
- */
- /*
- * This doesn't prevent pagetable teardown, but does prevent
- * the pagetables and pages from being freed on x86.
- *
- * So long as we atomically load page table pointers versus teardown
- * (which we do on x86, with the above PAE exception), we can follow the
- * address down to the the page and take a ref on it.
- */
- local_irq_disable();
- pgdp = pgd_offset(mm, addr);
- do {
- pgd_t pgd = *pgdp;
-
- next = pgd_addr_end(addr, end);
- if (pgd_none(pgd))
- goto slow;
- if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
- goto slow;
- } while (pgdp++, addr = next, addr != end);
- local_irq_enable();
-
- VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
- return nr;
-
- {
- int ret;
-
-slow:
- local_irq_enable();
-slow_irqon:
- /* Try to get the remaining pages with get_user_pages */
- start += nr << PAGE_SHIFT;
- pages += nr;
-
- ret = get_user_pages_unlocked(start,
- (end - start) >> PAGE_SHIFT,
- pages, write ? FOLL_WRITE : 0);
-
- /* Have to be a bit careful with return values */
- if (nr > 0) {
- if (ret < 0)
- ret = nr;
- else
- ret += nr;
- }
-
- return ret;
- }
-}
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 302f43fd9c28..adad702b39cd 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -148,7 +148,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
addr = ALIGN(addr, huge_page_size(h));
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
if (mm->get_unmapped_area == arch_get_unmapped_area)
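
The one-line change above stops comparing against vma->vm_start directly and instead leaves room for the stack guard gap below a downward-growing VMA. A self-contained sketch of that intent follows, with a simplified VMA structure and an illustrative gap size rather than the kernel's tunable.

#include <stdbool.h>

#define VM_GROWSDOWN_SKETCH	0x0100ul	/* illustrative flag bit */
#define STACK_GUARD_GAP_SKETCH	(256ul << 12)	/* illustrative: 256 pages */

struct vma_sketch {
	unsigned long vm_start;
	unsigned long vm_flags;
};

/* Lowest address the VMA effectively claims, including its guard gap. */
static unsigned long vm_start_gap_sketch(const struct vma_sketch *vma)
{
	unsigned long start = vma->vm_start;

	if (vma->vm_flags & VM_GROWSDOWN_SKETCH) {
		if (start > STACK_GUARD_GAP_SKETCH)
			start -= STACK_GUARD_GAP_SKETCH;
		else
			start = 0;
	}
	return start;
}

/* A candidate hole [addr, addr + len) is usable only if it ends below that. */
static bool hole_fits(unsigned long addr, unsigned long len,
		      const struct vma_sketch *next)
{
	return !next || addr + len <= vm_start_gap_sketch(next);
}
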
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index 04210a29dd60..adab1595f4bd 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -13,7 +13,7 @@ static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page,
if (pmd_present(*pmd))
continue;
- set_pmd(pmd, __pmd((addr - info->offset) | info->pmd_flag));
+ set_pmd(pmd, __pmd((addr - info->offset) | info->page_flag));
}
}
@@ -30,6 +30,18 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
if (next > end)
next = end;
+ if (info->direct_gbpages) {
+ pud_t pudval;
+
+ if (pud_present(*pud))
+ continue;
+
+ addr &= PUD_MASK;
+ pudval = __pud((addr - info->offset) | info->page_flag);
+ set_pud(pud, pudval);
+ continue;
+ }
+
if (pud_present(*pud)) {
pmd = pmd_offset(pud, 0);
ident_pmd_init(info, pmd, addr, next);
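
With direct_gbpages set, the identity-mapping code above installs a 1GiB leaf entry at the PUD level instead of descending to PMDs. Here is a minimal sketch of how such an entry is formed; the flag values are illustrative stand-ins for a present, writable, large-page leaf.

#include <stdint.h>

#define PUD_SHIFT	30
#define PUD_SIZE	(1ull << PUD_SHIFT)
#define PUD_MASK	(~(PUD_SIZE - 1))

/* Illustrative bits for a present, writable, large-page leaf entry. */
#define GBPAGE_FLAGS	(0x1ull /* PRESENT */ | 0x2ull /* RW */ | 0x80ull /* PSE */)

/* One 1GiB identity-map entry: target physical = aligned address - offset. */
static uint64_t ident_gbpage_entry(uint64_t addr, uint64_t offset)
{
	addr &= PUD_MASK;			/* gbpage entries are 1GiB aligned */
	return (addr - offset) | GBPAGE_FLAGS;
}
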
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index cbc87ea98751..673541eb3b3f 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -161,16 +161,16 @@ static int page_size_mask;
static void __init probe_page_size_mask(void)
{
-#if !defined(CONFIG_KMEMCHECK)
/*
* For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will
* use small pages.
* This will simplify cpa(), which otherwise needs to support splitting
* large pages into small in interrupt context, etc.
*/
- if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled())
+ if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled() && !IS_ENABLED(CONFIG_KMEMCHECK))
page_size_mask |= 1 << PG_LEVEL_2M;
-#endif
+ else
+ direct_gbpages = 0;
/* Enable PSE if available */
if (boot_cpu_has(X86_FEATURE_PSE))
@@ -811,10 +811,8 @@ void __init zone_sizes_init(void)
}
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
-#ifdef CONFIG_SMP
- .active_mm = &init_mm,
+ .loaded_mm = &init_mm,
.state = 0,
-#endif
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
};
EXPORT_SYMBOL_GPL(cpu_tlbstate);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 41270b96403d..dae6a5e5ad4a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -92,12 +92,50 @@ __setup("noexec32=", nonx32_setup);
* When memory was added make sure all the processes MM have
* suitable PGD entries in the local PGD level page.
*/
+#ifdef CONFIG_X86_5LEVEL
void sync_global_pgds(unsigned long start, unsigned long end)
{
- unsigned long address;
+ unsigned long addr;
+
+ for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
+ const pgd_t *pgd_ref = pgd_offset_k(addr);
+ struct page *page;
+
+ /* Check for overflow */
+ if (addr < start)
+ break;
+
+ if (pgd_none(*pgd_ref))
+ continue;
+
+ spin_lock(&pgd_lock);
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd;
+ spinlock_t *pgt_lock;
+
+ pgd = (pgd_t *)page_address(page) + pgd_index(addr);
+ /* the pgt_lock only for Xen */
+ pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+ spin_lock(pgt_lock);
+
+ if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
- for (address = start; address <= end; address += PGDIR_SIZE) {
- pgd_t *pgd_ref = pgd_offset_k(address);
+ spin_unlock(pgt_lock);
+ }
+ spin_unlock(&pgd_lock);
+ }
+}
+#else
+void sync_global_pgds(unsigned long start, unsigned long end)
+{
+ unsigned long addr;
+
+ for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
+ pgd_t *pgd_ref = pgd_offset_k(addr);
const p4d_t *p4d_ref;
struct page *page;
@@ -106,7 +144,7 @@ void sync_global_pgds(unsigned long start, unsigned long end)
* handle synchonization on p4d level.
*/
BUILD_BUG_ON(pgd_none(*pgd_ref));
- p4d_ref = p4d_offset(pgd_ref, address);
+ p4d_ref = p4d_offset(pgd_ref, addr);
if (p4d_none(*p4d_ref))
continue;
@@ -117,8 +155,8 @@ void sync_global_pgds(unsigned long start, unsigned long end)
p4d_t *p4d;
spinlock_t *pgt_lock;
- pgd = (pgd_t *)page_address(page) + pgd_index(address);
- p4d = p4d_offset(pgd, address);
+ pgd = (pgd_t *)page_address(page) + pgd_index(addr);
+ p4d = p4d_offset(pgd, addr);
/* the pgt_lock only for Xen */
pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
spin_lock(pgt_lock);
@@ -135,6 +173,7 @@ void sync_global_pgds(unsigned long start, unsigned long end)
spin_unlock(&pgd_lock);
}
}
+#endif
/*
* NOTE: This function is marked __ref because it calls __init function
@@ -585,6 +624,57 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
return paddr_last;
}
+static unsigned long __meminit
+phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
+ unsigned long page_size_mask)
+{
+ unsigned long paddr_next, paddr_last = paddr_end;
+ unsigned long vaddr = (unsigned long)__va(paddr);
+ int i = p4d_index(vaddr);
+
+ if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);
+
+ for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
+ p4d_t *p4d;
+ pud_t *pud;
+
+ vaddr = (unsigned long)__va(paddr);
+ p4d = p4d_page + p4d_index(vaddr);
+ paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
+
+ if (paddr >= paddr_end) {
+ if (!after_bootmem &&
+ !e820__mapped_any(paddr & P4D_MASK, paddr_next,
+ E820_TYPE_RAM) &&
+ !e820__mapped_any(paddr & P4D_MASK, paddr_next,
+ E820_TYPE_RESERVED_KERN))
+ set_p4d(p4d, __p4d(0));
+ continue;
+ }
+
+ if (!p4d_none(*p4d)) {
+ pud = pud_offset(p4d, 0);
+ paddr_last = phys_pud_init(pud, paddr,
+ paddr_end,
+ page_size_mask);
+ __flush_tlb_all();
+ continue;
+ }
+
+ pud = alloc_low_page();
+ paddr_last = phys_pud_init(pud, paddr, paddr_end,
+ page_size_mask);
+
+ spin_lock(&init_mm.page_table_lock);
+ p4d_populate(&init_mm, p4d, pud);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ __flush_tlb_all();
+
+ return paddr_last;
+}
+
/*
* Create page table mapping for the physical memory for specific physical
* addresses. The virtual and physical addresses have to be aligned on PMD level
@@ -606,26 +696,26 @@ kernel_physical_mapping_init(unsigned long paddr_start,
for (; vaddr < vaddr_end; vaddr = vaddr_next) {
pgd_t *pgd = pgd_offset_k(vaddr);
p4d_t *p4d;
- pud_t *pud;
vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;
- BUILD_BUG_ON(pgd_none(*pgd));
- p4d = p4d_offset(pgd, vaddr);
- if (p4d_val(*p4d)) {
- pud = (pud_t *)p4d_page_vaddr(*p4d);
- paddr_last = phys_pud_init(pud, __pa(vaddr),
+ if (pgd_val(*pgd)) {
+ p4d = (p4d_t *)pgd_page_vaddr(*pgd);
+ paddr_last = phys_p4d_init(p4d, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
continue;
}
- pud = alloc_low_page();
- paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
+ p4d = alloc_low_page();
+ paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
page_size_mask);
spin_lock(&init_mm.page_table_lock);
- p4d_populate(&init_mm, p4d, pud);
+ if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ pgd_populate(&init_mm, pgd, p4d);
+ else
+ p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
spin_unlock(&init_mm.page_table_lock);
pgd_changed = true;
}
@@ -990,7 +1080,13 @@ remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
pud_base = pud_offset(p4d, 0);
remove_pud_table(pud_base, addr, next, direct);
- free_pud_table(pud_base, p4d);
+ /*
+ * For 4-level page tables we do not want to free PUDs, but in the
+ * 5-level case we should free them. This code will have to change
+ * to adapt for boot-time switching between 4 and 5 level page tables.
+ */
+ if (CONFIG_PGTABLE_LEVELS == 5)
+ free_pud_table(pud_base, p4d);
}
if (direct)
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index bbc558b88a88..4c1b5fd0c7ad 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -424,7 +424,7 @@ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
{
/* Don't assume we're using swapper_pg_dir at this point */
- pgd_t *base = __va(read_cr3());
+ pgd_t *base = __va(read_cr3_pa());
pgd_t *pgd = &base[pgd_index(addr)];
p4d_t *p4d = p4d_offset(pgd, addr);
pud_t *pud = pud_offset(p4d, addr);
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 0c7d8129bed6..88215ac16b24 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -12,7 +12,7 @@
#include <asm/tlbflush.h>
#include <asm/sections.h>
-extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern pgd_t early_top_pgt[PTRS_PER_PGD];
extern struct range pfn_mapped[E820_MAX_ENTRIES];
static int __init map_range(struct range *range)
@@ -109,8 +109,8 @@ void __init kasan_early_init(void)
for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++)
kasan_zero_p4d[i] = __p4d(p4d_val);
- kasan_map_early_shadow(early_level4_pgt);
- kasan_map_early_shadow(init_level4_pgt);
+ kasan_map_early_shadow(early_top_pgt);
+ kasan_map_early_shadow(init_top_pgt);
}
void __init kasan_init(void)
@@ -121,8 +121,8 @@ void __init kasan_init(void)
register_die_notifier(&kasan_die_notifier);
#endif
- memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt));
- load_cr3(early_level4_pgt);
+ memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
+ load_cr3(early_top_pgt);
__flush_tlb_all();
clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
@@ -148,7 +148,7 @@ void __init kasan_init(void)
kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
(void *)KASAN_SHADOW_END);
- load_cr3(init_level4_pgt);
+ load_cr3(init_top_pgt);
__flush_tlb_all();
/*
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index aed206475aa7..af599167fe3c 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -6,12 +6,12 @@
*
* Entropy is generated using the KASLR early boot functions now shared in
* the lib directory (originally written by Kees Cook). Randomization is
- * done on PGD & PUD page table levels to increase possible addresses. The
- * physical memory mapping code was adapted to support PUD level virtual
- * addresses. This implementation on the best configuration provides 30,000
- * possible virtual addresses in average for each memory region. An additional
- * low memory page is used to ensure each CPU can start with a PGD aligned
- * virtual address (for realmode).
+ * done on PGD & P4D/PUD page table levels to increase possible addresses.
+ * The physical memory mapping code was adapted to support P4D/PUD level
+ * virtual addresses. This implementation on the best configuration provides
+ * 30,000 possible virtual addresses in average for each memory region.
+ * An additional low memory page is used to ensure each CPU can start with
+ * a PGD aligned virtual address (for realmode).
*
* The order of each memory region is not changed. The feature looks at
* the available space for the regions based on different configuration
@@ -70,7 +70,7 @@ static __initdata struct kaslr_memory_region {
unsigned long *base;
unsigned long size_tb;
} kaslr_regions[] = {
- { &page_offset_base, 64/* Maximum */ },
+ { &page_offset_base, 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) /* Maximum */ },
{ &vmalloc_base, VMALLOC_SIZE_TB },
{ &vmemmap_base, 1 },
};
@@ -142,7 +142,10 @@ void __init kernel_randomize_memory(void)
*/
entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
prandom_bytes_state(&rand_state, &rand, sizeof(rand));
- entropy = (rand % (entropy + 1)) & PUD_MASK;
+ if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ entropy = (rand % (entropy + 1)) & P4D_MASK;
+ else
+ entropy = (rand % (entropy + 1)) & PUD_MASK;
vaddr += entropy;
*kaslr_regions[i].base = vaddr;
@@ -151,27 +154,21 @@ void __init kernel_randomize_memory(void)
* randomization alignment.
*/
vaddr += get_padding(&kaslr_regions[i]);
- vaddr = round_up(vaddr + 1, PUD_SIZE);
+ if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ vaddr = round_up(vaddr + 1, P4D_SIZE);
+ else
+ vaddr = round_up(vaddr + 1, PUD_SIZE);
remain_entropy -= entropy;
}
}
-/*
- * Create PGD aligned trampoline table to allow real mode initialization
- * of additional CPUs. Consume only 1 low memory page.
- */
-void __meminit init_trampoline(void)
+static void __meminit init_trampoline_pud(void)
{
unsigned long paddr, paddr_next;
pgd_t *pgd;
pud_t *pud_page, *pud_page_tramp;
int i;
- if (!kaslr_memory_enabled()) {
- init_trampoline_default();
- return;
- }
-
pud_page_tramp = alloc_low_page();
paddr = 0;
@@ -192,3 +189,49 @@ void __meminit init_trampoline(void)
set_pgd(&trampoline_pgd_entry,
__pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
}
+
+static void __meminit init_trampoline_p4d(void)
+{
+ unsigned long paddr, paddr_next;
+ pgd_t *pgd;
+ p4d_t *p4d_page, *p4d_page_tramp;
+ int i;
+
+ p4d_page_tramp = alloc_low_page();
+
+ paddr = 0;
+ pgd = pgd_offset_k((unsigned long)__va(paddr));
+ p4d_page = (p4d_t *) pgd_page_vaddr(*pgd);
+
+ for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) {
+ p4d_t *p4d, *p4d_tramp;
+ unsigned long vaddr = (unsigned long)__va(paddr);
+
+ p4d_tramp = p4d_page_tramp + p4d_index(paddr);
+ p4d = p4d_page + p4d_index(vaddr);
+ paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
+
+ *p4d_tramp = *p4d;
+ }
+
+ set_pgd(&trampoline_pgd_entry,
+ __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)));
+}
+
+/*
+ * Create PGD aligned trampoline table to allow real mode initialization
+ * of additional CPUs. Consume only 1 low memory page.
+ */
+void __meminit init_trampoline(void)
+{
+
+ if (!kaslr_memory_enabled()) {
+ init_trampoline_default();
+ return;
+ }
+
+ if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ init_trampoline_p4d();
+ else
+ init_trampoline_pud();
+}
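
To make the randomization loop earlier in this file easier to follow: each remaining region receives an equal share of the leftover entropy, the chosen offset is truncated to a PUD-sized (or P4D-sized, with 5-level paging) step, and the next region starts after the current one rounded up to the same unit. A condensed user-space model of that policy follows; the region sizes, base address, entropy budget and use of rand() are all stand-ins for the kernel's values and its seeded PRNG.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TB		(1ull << 40)
#define STEP		(1ull << 30)	/* stand-in for PUD_SIZE (P4D_SIZE on 5-level) */
#define STEP_MASK	(~(STEP - 1))

struct region {
	const char *name;
	uint64_t size;			/* space the region itself needs */
};

int main(void)
{
	struct region regions[] = {
		{ "direct map", 64 * TB },
		{ "vmalloc",    32 * TB },
		{ "vmemmap",     1 * TB },
	};
	int i, n = sizeof(regions) / sizeof(regions[0]);
	uint64_t vaddr  = 0xffff880000000000ull;	/* illustrative base */
	uint64_t remain = 16 * TB;			/* illustrative slack to spread */

	for (i = 0; i < n; i++) {
		/* Equal share of whatever entropy is still unspent... */
		uint64_t entropy = remain / (n - i);

		/* ...turned into a random, step-aligned offset for this region. */
		entropy = ((uint64_t)rand() % (entropy + 1)) & STEP_MASK;
		vaddr += entropy;
		remain -= entropy;
		printf("%-10s starts at 0x%016llx\n",
		       regions[i].name, (unsigned long long)vaddr);

		/* Next region begins past this one, rounded up to the step. */
		vaddr += regions[i].size;
		vaddr = (vaddr + STEP) & STEP_MASK;
	}
	return 0;
}
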
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 19ad095b41df..797295e792b2 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -74,9 +74,6 @@ static int mmap_is_legacy(void)
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
- if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
- return 1;
-
return sysctl_legacy_va_layout;
}
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 6b7ce6279133..aca6295350f3 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -100,5 +100,6 @@ void __init initmem_init(void)
printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
(ulong) pfn_to_kaddr(highstart_pfn));
+ __vmalloc_start_set = true;
setup_bootmem_allocator();
}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 1dcd2be4cce4..c8520b2c62d2 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -186,7 +186,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
unsigned int i, level;
unsigned long addr;
- BUG_ON(irqs_disabled());
+ BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
WARN_ON(PAGE_ALIGN(start) != start);
on_each_cpu(__cpa_flush_range, NULL, 1);
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index 38868adf07ea..f6ae6830b341 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -9,7 +9,7 @@
#include <linux/mmiotrace.h>
static unsigned long mmio_address;
-module_param(mmio_address, ulong, 0);
+module_param_hw(mmio_address, ulong, iomem, 0);
MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB "
"(or 8 MB if read_far is non-zero).");
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6e7bedf69af7..014d07a80053 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -15,7 +15,7 @@
#include <linux/debugfs.h>
/*
- * Smarter SMP flushing macros.
+ * TLB flushing, formerly SMP-only
* c/o Linus Torvalds.
*
* These mean you can really definitely utterly forget about
@@ -28,39 +28,28 @@
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/
-#ifdef CONFIG_SMP
-
-struct flush_tlb_info {
- struct mm_struct *flush_mm;
- unsigned long flush_start;
- unsigned long flush_end;
-};
-
-/*
- * We cannot call mmdrop() because we are in interrupt context,
- * instead update mm->cpu_vm_mask.
- */
void leave_mm(int cpu)
{
- struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm);
+ struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+
+ /*
+ * It's plausible that we're in lazy TLB mode while our mm is init_mm.
+ * If so, our callers still expect us to flush the TLB, but there
+ * aren't any user TLB entries in init_mm to worry about.
+ *
+ * This needs to happen before any other sanity checks due to
+ * intel_idle's shenanigans.
+ */
+ if (loaded_mm == &init_mm)
+ return;
+
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
BUG();
- if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
- cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
- load_cr3(swapper_pg_dir);
- /*
- * This gets called in the idle path where RCU
- * functions differently. Tracing normally
- * uses RCU, so we have to call the tracepoint
- * specially here.
- */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
- }
+
+ switch_mm(NULL, &init_mm, NULL);
}
EXPORT_SYMBOL_GPL(leave_mm);
-#endif /* CONFIG_SMP */
-
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
@@ -75,216 +64,167 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
unsigned cpu = smp_processor_id();
+ struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
- if (likely(prev != next)) {
- if (IS_ENABLED(CONFIG_VMAP_STACK)) {
- /*
- * If our current stack is in vmalloc space and isn't
- * mapped in the new pgd, we'll double-fault. Forcibly
- * map it.
- */
- unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
-
- pgd_t *pgd = next->pgd + stack_pgd_index;
-
- if (unlikely(pgd_none(*pgd)))
- set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
- }
+ /*
+ * NB: The scheduler will call us with prev == next when
+ * switching from lazy TLB mode to normal mode if active_mm
+ * isn't changing. When this happens, there is no guarantee
+ * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next.
+ *
+ * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
+ */
-#ifdef CONFIG_SMP
- this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- this_cpu_write(cpu_tlbstate.active_mm, next);
-#endif
+ this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- cpumask_set_cpu(cpu, mm_cpumask(next));
+ if (real_prev == next) {
+ /*
+ * There's nothing to do: we always keep the per-mm control
+ * regs in sync with cpu_tlbstate.loaded_mm. Just
+ * sanity-check mm_cpumask.
+ */
+ if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next))))
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+ return;
+ }
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
/*
- * Re-load page tables.
- *
- * This logic has an ordering constraint:
- *
- * CPU 0: Write to a PTE for 'next'
- * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
- * CPU 1: set bit 1 in next's mm_cpumask
- * CPU 1: load from the PTE that CPU 0 writes (implicit)
- *
- * We need to prevent an outcome in which CPU 1 observes
- * the new PTE value and CPU 0 observes bit 1 clear in
- * mm_cpumask. (If that occurs, then the IPI will never
- * be sent, and CPU 0's TLB will contain a stale entry.)
- *
- * The bad outcome can occur if either CPU's load is
- * reordered before that CPU's store, so both CPUs must
- * execute full barriers to prevent this from happening.
- *
- * Thus, switch_mm needs a full barrier between the
- * store to mm_cpumask and any operation that could load
- * from next->pgd. TLB fills are special and can happen
- * due to instruction fetches or for no reason at all,
- * and neither LOCK nor MFENCE orders them.
- * Fortunately, load_cr3() is serializing and gives the
- * ordering guarantee we need.
- *
+ * If our current stack is in vmalloc space and isn't
+ * mapped in the new pgd, we'll double-fault. Forcibly
+ * map it.
*/
- load_cr3(next->pgd);
+ unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ pgd_t *pgd = next->pgd + stack_pgd_index;
- /* Stop flush ipis for the previous mm */
- cpumask_clear_cpu(cpu, mm_cpumask(prev));
+ if (unlikely(pgd_none(*pgd)))
+ set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
+ }
- /* Load per-mm CR4 state */
- load_mm_cr4(next);
+ this_cpu_write(cpu_tlbstate.loaded_mm, next);
-#ifdef CONFIG_MODIFY_LDT_SYSCALL
- /*
- * Load the LDT, if the LDT is different.
- *
- * It's possible that prev->context.ldt doesn't match
- * the LDT register. This can happen if leave_mm(prev)
- * was called and then modify_ldt changed
- * prev->context.ldt but suppressed an IPI to this CPU.
- * In this case, prev->context.ldt != NULL, because we
- * never set context.ldt to NULL while the mm still
- * exists. That means that next->context.ldt !=
- * prev->context.ldt, because mms never share an LDT.
- */
- if (unlikely(prev->context.ldt != next->context.ldt))
- load_mm_ldt(next);
-#endif
+ WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+
+ /*
+ * Re-load page tables.
+ *
+ * This logic has an ordering constraint:
+ *
+ * CPU 0: Write to a PTE for 'next'
+ * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
+ * CPU 1: set bit 1 in next's mm_cpumask
+ * CPU 1: load from the PTE that CPU 0 writes (implicit)
+ *
+ * We need to prevent an outcome in which CPU 1 observes
+ * the new PTE value and CPU 0 observes bit 1 clear in
+ * mm_cpumask. (If that occurs, then the IPI will never
+ * be sent, and CPU 0's TLB will contain a stale entry.)
+ *
+ * The bad outcome can occur if either CPU's load is
+ * reordered before that CPU's store, so both CPUs must
+ * execute full barriers to prevent this from happening.
+ *
+ * Thus, switch_mm needs a full barrier between the
+ * store to mm_cpumask and any operation that could load
+ * from next->pgd. TLB fills are special and can happen
+ * due to instruction fetches or for no reason at all,
+ * and neither LOCK nor MFENCE orders them.
+ * Fortunately, load_cr3() is serializing and gives the
+ * ordering guarantee we need.
+ */
+ load_cr3(next->pgd);
+
+ /*
+ * This gets called via leave_mm() in the idle path where RCU
+ * functions differently. Tracing normally uses RCU, so we have to
+ * call the tracepoint specially here.
+ */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+
+ /* Stop flush ipis for the previous mm */
+ WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
+ real_prev != &init_mm);
+ cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+
+ /* Load per-mm CR4 and LDTR state */
+ load_mm_cr4(next);
+ switch_ldt(real_prev, next);
+}
+
+static void flush_tlb_func_common(const struct flush_tlb_info *f,
+ bool local, enum tlb_flush_reason reason)
+{
+ /* This code cannot presently handle being reentered. */
+ VM_WARN_ON(!irqs_disabled());
+
+ if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
+ leave_mm(smp_processor_id());
+ return;
}
-#ifdef CONFIG_SMP
- else {
- this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
-
- if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
- /*
- * On established mms, the mm_cpumask is only changed
- * from irq context, from ptep_clear_flush() while in
- * lazy tlb mode, and here. Irqs are blocked during
- * schedule, protecting us from simultaneous changes.
- */
- cpumask_set_cpu(cpu, mm_cpumask(next));
- /*
- * We were in lazy tlb mode and leave_mm disabled
- * tlb flush IPI delivery. We must reload CR3
- * to make sure to use no freed page tables.
- *
- * As above, load_cr3() is serializing and orders TLB
- * fills with respect to the mm_cpumask write.
- */
- load_cr3(next->pgd);
- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
- load_mm_cr4(next);
- load_mm_ldt(next);
+ if (f->end == TLB_FLUSH_ALL) {
+ local_flush_tlb();
+ if (local)
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ trace_tlb_flush(reason, TLB_FLUSH_ALL);
+ } else {
+ unsigned long addr;
+ unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
+ addr = f->start;
+ while (addr < f->end) {
+ __flush_tlb_single(addr);
+ addr += PAGE_SIZE;
}
+ if (local)
+ count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
+ trace_tlb_flush(reason, nr_pages);
}
-#endif
}
-#ifdef CONFIG_SMP
+static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
+{
+ const struct flush_tlb_info *f = info;
-/*
- * The flush IPI assumes that a thread switch happens in this order:
- * [cpu0: the cpu that switches]
- * 1) switch_mm() either 1a) or 1b)
- * 1a) thread switch to a different mm
- * 1a1) set cpu_tlbstate to TLBSTATE_OK
- * Now the tlb flush NMI handler flush_tlb_func won't call leave_mm
- * if cpu0 was in lazy tlb mode.
- * 1a2) update cpu active_mm
- * Now cpu0 accepts tlb flushes for the new mm.
- * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);
- * Now the other cpus will send tlb flush ipis.
- * 1a4) change cr3.
- * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask);
- * Stop ipi delivery for the old mm. This is not synchronized with
- * the other cpus, but flush_tlb_func ignore flush ipis for the wrong
- * mm, and in the worst case we perform a superfluous tlb flush.
- * 1b) thread switch without mm change
- * cpu active_mm is correct, cpu0 already handles flush ipis.
- * 1b1) set cpu_tlbstate to TLBSTATE_OK
- * 1b2) test_and_set the cpu bit in cpu_vm_mask.
- * Atomically set the bit [other cpus will start sending flush ipis],
- * and test the bit.
- * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
- * 2) switch %%esp, ie current
- *
- * The interrupt must handle 2 special cases:
- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
- * - the cpu performs speculative tlb reads, i.e. even if the cpu only
- * runs in kernel space, the cpu could load tlb entries for user space
- * pages.
- *
- * The good news is that cpu_tlbstate is local to each cpu, no
- * write/read ordering problems.
- */
+ flush_tlb_func_common(f, true, reason);
+}
-/*
- * TLB flush funcation:
- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
- * 2) Leave the mm if we are in the lazy tlb mode.
- */
-static void flush_tlb_func(void *info)
+static void flush_tlb_func_remote(void *info)
{
- struct flush_tlb_info *f = info;
+ const struct flush_tlb_info *f = info;
inc_irq_stat(irq_tlb_count);
- if (f->flush_mm && f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
+ if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
return;
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
- if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
- if (f->flush_end == TLB_FLUSH_ALL) {
- local_flush_tlb();
- trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
- } else {
- unsigned long addr;
- unsigned long nr_pages =
- (f->flush_end - f->flush_start) / PAGE_SIZE;
- addr = f->flush_start;
- while (addr < f->flush_end) {
- __flush_tlb_single(addr);
- addr += PAGE_SIZE;
- }
- trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
- }
- } else
- leave_mm(smp_processor_id());
-
+ flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
}
void native_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm, unsigned long start,
- unsigned long end)
+ const struct flush_tlb_info *info)
{
- struct flush_tlb_info info;
-
- info.flush_mm = mm;
- info.flush_start = start;
- info.flush_end = end;
-
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
- if (end == TLB_FLUSH_ALL)
+ if (info->end == TLB_FLUSH_ALL)
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
else
trace_tlb_flush(TLB_REMOTE_SEND_IPI,
- (end - start) >> PAGE_SHIFT);
+ (info->end - info->start) >> PAGE_SHIFT);
if (is_uv_system()) {
unsigned int cpu;
cpu = smp_processor_id();
- cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
+ cpumask = uv_flush_tlb_others(cpumask, info);
if (cpumask)
- smp_call_function_many(cpumask, flush_tlb_func,
- &info, 1);
+ smp_call_function_many(cpumask, flush_tlb_func_remote,
+ (void *)info, 1);
return;
}
- smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
+ smp_call_function_many(cpumask, flush_tlb_func_remote,
+ (void *)info, 1);
}
/*
@@ -302,85 +242,41 @@ static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag)
{
- unsigned long addr;
- /* do a global flush by default */
- unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
-
- preempt_disable();
+ int cpu;
- if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
- base_pages_to_flush = (end - start) >> PAGE_SHIFT;
- if (base_pages_to_flush > tlb_single_page_flush_ceiling)
- base_pages_to_flush = TLB_FLUSH_ALL;
+ struct flush_tlb_info info = {
+ .mm = mm,
+ };
- if (current->active_mm != mm) {
- /* Synchronize with switch_mm. */
- smp_mb();
+ cpu = get_cpu();
- goto out;
- }
-
- if (!current->mm) {
- leave_mm(smp_processor_id());
+ /* Synchronize with switch_mm. */
+ smp_mb();
- /* Synchronize with switch_mm. */
- smp_mb();
-
- goto out;
- }
-
- /*
- * Both branches below are implicit full barriers (MOV to CR or
- * INVLPG) that synchronize with switch_mm.
- */
- if (base_pages_to_flush == TLB_FLUSH_ALL) {
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- local_flush_tlb();
+ /* Should we flush just the requested range? */
+ if ((end != TLB_FLUSH_ALL) &&
+ !(vmflag & VM_HUGETLB) &&
+ ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
+ info.start = start;
+ info.end = end;
} else {
- /* flush range by one by one 'invlpg' */
- for (addr = start; addr < end; addr += PAGE_SIZE) {
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
- __flush_tlb_single(addr);
- }
- }
- trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush);
-out:
- if (base_pages_to_flush == TLB_FLUSH_ALL) {
- start = 0UL;
- end = TLB_FLUSH_ALL;
+ info.start = 0UL;
+ info.end = TLB_FLUSH_ALL;
}
- if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, start, end);
- preempt_enable();
-}
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
-{
- struct mm_struct *mm = vma->vm_mm;
-
- preempt_disable();
-
- if (current->active_mm == mm) {
- if (current->mm) {
- /*
- * Implicit full barrier (INVLPG) that synchronizes
- * with switch_mm.
- */
- __flush_tlb_one(start);
- } else {
- leave_mm(smp_processor_id());
-
- /* Synchronize with switch_mm. */
- smp_mb();
- }
+ if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
+ VM_WARN_ON(irqs_disabled());
+ local_irq_disable();
+ flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
+ local_irq_enable();
}
- if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE);
-
- preempt_enable();
+ if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
+ flush_tlb_others(mm_cpumask(mm), &info);
+ put_cpu();
}
+
static void do_flush_tlb_all(void *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
@@ -401,7 +297,7 @@ static void do_kernel_range_flush(void *info)
unsigned long addr;
/* flush range by one by one 'invlpg' */
- for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE)
+ for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
__flush_tlb_single(addr);
}
@@ -410,16 +306,40 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
/* Balance as user space task's flush, a bit conservative */
if (end == TLB_FLUSH_ALL ||
- (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
+ (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
on_each_cpu(do_flush_tlb_all, NULL, 1);
} else {
struct flush_tlb_info info;
- info.flush_start = start;
- info.flush_end = end;
+ info.start = start;
+ info.end = end;
on_each_cpu(do_kernel_range_flush, &info, 1);
}
}
+void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
+{
+ struct flush_tlb_info info = {
+ .mm = NULL,
+ .start = 0UL,
+ .end = TLB_FLUSH_ALL,
+ };
+
+ int cpu = get_cpu();
+
+ if (cpumask_test_cpu(cpu, &batch->cpumask)) {
+ VM_WARN_ON(irqs_disabled());
+ local_irq_disable();
+ flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
+ local_irq_enable();
+ }
+
+ if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
+ flush_tlb_others(&batch->cpumask, &info);
+ cpumask_clear(&batch->cpumask);
+
+ put_cpu();
+}
+
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
@@ -465,5 +385,3 @@ static int __init create_tlb_single_page_flush_ceiling(void)
return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);
-
-#endif /* CONFIG_SMP */
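
The reworked flush_tlb_mm_range() above packages the request into a flush_tlb_info and decides once whether a ranged flush is worth it: small, non-hugepage ranges under the single-page ceiling are flushed page by page, everything else becomes a full flush. A compact model of just that decision follows; the structure and names are local to the sketch, and the ceiling value is copied from the default above.

#include <stdbool.h>

#define PAGE_SHIFT	12
#define TLB_FLUSH_ALL	(~0ul)

/* Mirrors the shape of the flush_tlb_info the series introduces. */
struct flush_info {
	unsigned long start;
	unsigned long end;
};

/* Page-by-page INVLPG stops paying off beyond this many pages. */
static unsigned long single_page_flush_ceiling = 33;

static struct flush_info make_flush_info(unsigned long start, unsigned long end,
					 bool is_hugetlb)
{
	struct flush_info info;

	if (end != TLB_FLUSH_ALL && !is_hugetlb &&
	    ((end - start) >> PAGE_SHIFT) <= single_page_flush_ceiling) {
		info.start = start;		/* small range: flush each page */
		info.end   = end;
	} else {
		info.start = 0ul;		/* large or unbounded: flush it all */
		info.end   = TLB_FLUSH_ALL;
	}
	return info;
}
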
diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile
index 90568c33ddb0..fefb4b619598 100644
--- a/arch/x86/net/Makefile
+++ b/arch/x86/net/Makefile
@@ -1,4 +1,6 @@
#
# Arch-specific network modules
#
+OBJECT_FILES_NON_STANDARD_bpf_jit.o += y
+
obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile
index f1d83b34c329..2f56e1ed61c3 100644
--- a/arch/x86/platform/efi/Makefile
+++ b/arch/x86/platform/efi/Makefile
@@ -1,4 +1,5 @@
OBJECT_FILES_NON_STANDARD_efi_thunk_$(BITS).o := y
+OBJECT_FILES_NON_STANDARD_efi_stub_$(BITS).o := y
obj-$(CONFIG_EFI) += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o
obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 7e76a4d8304b..f084d8718ac4 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -828,9 +828,11 @@ static void __init kexec_enter_virtual_mode(void)
/*
* We don't do virtual mode, since we don't do runtime services, on
- * non-native EFI
+ * non-native EFI. With efi=old_map, we don't do runtime services in
+ * the kexec kernel because in the initial boot something else might
+ * have been mapped at these virtual addresses.
*/
- if (!efi_is_native()) {
+ if (!efi_is_native() || efi_enabled(EFI_OLD_MEMMAP)) {
efi_memmap_unmap();
clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
return;
@@ -1012,7 +1014,6 @@ static void __init __efi_enter_virtual_mode(void)
* necessary relocation fixups for the new virtual addresses.
*/
efi_runtime_update_mappings();
- efi_dump_pagetable();
/* clean DUMMY object */
efi_delete_dummy_variable();
@@ -1027,6 +1028,8 @@ void __init efi_enter_virtual_mode(void)
kexec_enter_virtual_mode();
else
__efi_enter_virtual_mode();
+
+ efi_dump_pagetable();
}
/*
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index 3481268da3d0..52f7faa1538f 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -44,7 +44,14 @@ int __init efi_alloc_page_tables(void)
}
void efi_sync_low_kernel_mappings(void) {}
-void __init efi_dump_pagetable(void) {}
+
+void __init efi_dump_pagetable(void)
+{
+#ifdef CONFIG_EFI_PGT_DUMP
+ ptdump_walk_pgd_level(NULL, swapper_pg_dir);
+#endif
+}
+
int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
{
return 0;
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index c488625c9712..9bf72f5bfedb 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -71,14 +71,16 @@ static void __init early_code_mapping_set_exec(int executable)
pgd_t * __init efi_call_phys_prolog(void)
{
- unsigned long vaddress;
- pgd_t *save_pgd;
+ unsigned long vaddr, addr_pgd, addr_p4d, addr_pud;
+ pgd_t *save_pgd, *pgd_k, *pgd_efi;
+ p4d_t *p4d, *p4d_k, *p4d_efi;
+ pud_t *pud;
int pgd;
- int n_pgds;
+ int n_pgds, i, j;
if (!efi_enabled(EFI_OLD_MEMMAP)) {
- save_pgd = (pgd_t *)read_cr3();
+ save_pgd = (pgd_t *)__read_cr3();
write_cr3((unsigned long)efi_scratch.efi_pgt);
goto out;
}
@@ -88,10 +90,49 @@ pgd_t * __init efi_call_phys_prolog(void)
n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE);
save_pgd = kmalloc_array(n_pgds, sizeof(*save_pgd), GFP_KERNEL);
+ /*
+ * Build 1:1 identity mapping for efi=old_map usage. Note that
+ * PAGE_OFFSET is PGDIR_SIZE aligned when KASLR is disabled, while
+ * it is PUD_SIZE ALIGNED with KASLR enabled. So for a given physical
+ * address X, the pud_index(X) != pud_index(__va(X)), we can only copy
+ * PUD entry of __va(X) to fill in pud entry of X to build 1:1 mapping.
+ * This means here we can only reuse the PMD tables of the direct mapping.
+ */
for (pgd = 0; pgd < n_pgds; pgd++) {
- save_pgd[pgd] = *pgd_offset_k(pgd * PGDIR_SIZE);
- vaddress = (unsigned long)__va(pgd * PGDIR_SIZE);
- set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress));
+ addr_pgd = (unsigned long)(pgd * PGDIR_SIZE);
+ vaddr = (unsigned long)__va(pgd * PGDIR_SIZE);
+ pgd_efi = pgd_offset_k(addr_pgd);
+ save_pgd[pgd] = *pgd_efi;
+
+ p4d = p4d_alloc(&init_mm, pgd_efi, addr_pgd);
+ if (!p4d) {
+ pr_err("Failed to allocate p4d table!\n");
+ goto out;
+ }
+
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ addr_p4d = addr_pgd + i * P4D_SIZE;
+ p4d_efi = p4d + p4d_index(addr_p4d);
+
+ pud = pud_alloc(&init_mm, p4d_efi, addr_p4d);
+ if (!pud) {
+ pr_err("Failed to allocate pud table!\n");
+ goto out;
+ }
+
+ for (j = 0; j < PTRS_PER_PUD; j++) {
+ addr_pud = addr_p4d + j * PUD_SIZE;
+
+ if (addr_pud > (max_pfn << PAGE_SHIFT))
+ break;
+
+ vaddr = (unsigned long)__va(addr_pud);
+
+ pgd_k = pgd_offset_k(vaddr);
+ p4d_k = p4d_offset(pgd_k, vaddr);
+ pud[j] = *pud_offset(p4d_k, vaddr);
+ }
+ }
}
out:
__flush_tlb_all();
@@ -104,8 +145,11 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
/*
* After the lock is released, the original page table is restored.
*/
- int pgd_idx;
+ int pgd_idx, i;
int nr_pgds;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
if (!efi_enabled(EFI_OLD_MEMMAP)) {
write_cr3((unsigned long)save_pgd);
@@ -115,9 +159,28 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
nr_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
- for (pgd_idx = 0; pgd_idx < nr_pgds; pgd_idx++)
+ for (pgd_idx = 0; pgd_idx < nr_pgds; pgd_idx++) {
+ pgd = pgd_offset_k(pgd_idx * PGDIR_SIZE);
set_pgd(pgd_offset_k(pgd_idx * PGDIR_SIZE), save_pgd[pgd_idx]);
+ if (!(pgd_val(*pgd) & _PAGE_PRESENT))
+ continue;
+
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ p4d = p4d_offset(pgd,
+ pgd_idx * PGDIR_SIZE + i * P4D_SIZE);
+
+ if (!(p4d_val(*p4d) & _PAGE_PRESENT))
+ continue;
+
+ pud = (pud_t *)p4d_page_vaddr(*p4d);
+ pud_free(&init_mm, pud);
+ }
+
+ p4d = (p4d_t *)pgd_page_vaddr(*pgd);
+ p4d_free(&init_mm, p4d);
+ }
+
kfree(save_pgd);
__flush_tlb_all();
@@ -526,7 +589,10 @@ void __init efi_runtime_update_mappings(void)
void __init efi_dump_pagetable(void)
{
#ifdef CONFIG_EFI_PGT_DUMP
- ptdump_walk_pgd_level(NULL, efi_pgd);
+ if (efi_enabled(EFI_OLD_MEMMAP))
+ ptdump_walk_pgd_level(NULL, swapper_pg_dir);
+ else
+ ptdump_walk_pgd_level(NULL, efi_pgd);
#endif
}
@@ -583,7 +649,7 @@ efi_status_t efi_thunk_set_virtual_address_map(
efi_sync_low_kernel_mappings();
local_irq_save(flags);
- efi_scratch.prev_cr3 = read_cr3();
+ efi_scratch.prev_cr3 = __read_cr3();
write_cr3((unsigned long)efi_scratch.efi_pgt);
__flush_tlb_all();
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 26615991d69c..8a99a2e96537 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -15,12 +15,66 @@
#include <asm/e820/api.h>
#include <asm/efi.h>
#include <asm/uv/uv.h>
+#include <asm/cpu_device_id.h>
#define EFI_MIN_RESERVE 5120
#define EFI_DUMMY_GUID \
EFI_GUID(0x4424ac57, 0xbe4b, 0x47dd, 0x9e, 0x97, 0xed, 0x50, 0xf0, 0x9f, 0x92, 0xa9)
+#define QUARK_CSH_SIGNATURE 0x5f435348 /* _CSH */
+#define QUARK_SECURITY_HEADER_SIZE 0x400
+
+/*
+ * Header prepended to the standard EFI capsule on Quark systems that are
+ * based on the Intel firmware BSP.
+ * @csh_signature: Unique identifier to sanity check signed module
+ * presence ("_CSH").
+ * @version: Current version of CSH used. Should be one for Quark A0.
+ * @modulesize: Size of the entire module including the module header
+ * and payload.
+ * @security_version_number_index: Index of SVN to use for validation of signed
+ * module.
+ * @security_version_number: Used to prevent rollback of modules.
+ * @rsvd_module_id: Currently unused for Clanton (Quark).
+ * @rsvd_module_vendor: Vendor Identifier. For Intel products value is
+ * 0x00008086.
+ * @rsvd_date: BCD representation of build date as yyyymmdd, where
+ * yyyy=4 digit year, mm=1-12, dd=1-31.
+ * @headersize: Total length of the header including any
+ * padding optionally added by the signing tool.
+ * @hash_algo: Hash algorithm used in the module signing.
+ * @cryp_algo: Crypto algorithm used in the module signing.
+ * @keysize: Total length of the key data including any
+ * padding optionally added by the signing tool.
+ * @signaturesize: Total length of the signature including any
+ * padding optionally added by the signing tool.
+ * @rsvd_next_header: 32-bit pointer to the next Secure Boot Module in the
+ * chain, if there is a next header.
+ * @rsvd: Reserved, padding structure to required size.
+ *
+ * See also QuartSecurityHeader_t in
+ * Quark_EDKII_v1.2.1.1/QuarkPlatformPkg/Include/QuarkBootRom.h
+ * from https://downloadcenter.intel.com/download/23197/Intel-Quark-SoC-X1000-Board-Support-Package-BSP
+ */
+struct quark_security_header {
+ u32 csh_signature;
+ u32 version;
+ u32 modulesize;
+ u32 security_version_number_index;
+ u32 security_version_number;
+ u32 rsvd_module_id;
+ u32 rsvd_module_vendor;
+ u32 rsvd_date;
+ u32 headersize;
+ u32 hash_algo;
+ u32 cryp_algo;
+ u32 keysize;
+ u32 signaturesize;
+ u32 rsvd_next_header;
+ u32 rsvd[2];
+};
+
static efi_char16_t efi_dummy_name[6] = { 'D', 'U', 'M', 'M', 'Y', 0 };
static bool efi_no_storage_paranoia;
@@ -360,6 +414,9 @@ void __init efi_free_boot_services(void)
free_bootmem_late(start, size);
}
+ if (!num_entries)
+ return;
+
new_size = efi.memmap.desc_size * num_entries;
new_phys = efi_memmap_alloc(num_entries);
if (!new_phys) {
@@ -501,3 +558,86 @@ bool efi_poweroff_required(void)
{
return acpi_gbl_reduced_hardware || acpi_no_s5;
}
+
+#ifdef CONFIG_EFI_CAPSULE_QUIRK_QUARK_CSH
+
+static int qrk_capsule_setup_info(struct capsule_info *cap_info, void **pkbuff,
+ size_t hdr_bytes)
+{
+ struct quark_security_header *csh = *pkbuff;
+
+ /* Only process data block that is larger than the security header */
+ if (hdr_bytes < sizeof(struct quark_security_header))
+ return 0;
+
+ if (csh->csh_signature != QUARK_CSH_SIGNATURE ||
+ csh->headersize != QUARK_SECURITY_HEADER_SIZE)
+ return 1;
+
+ /* Only process data block if EFI header is included */
+ if (hdr_bytes < QUARK_SECURITY_HEADER_SIZE +
+ sizeof(efi_capsule_header_t))
+ return 0;
+
+ pr_debug("Quark security header detected\n");
+
+ if (csh->rsvd_next_header != 0) {
+ pr_err("multiple Quark security headers not supported\n");
+ return -EINVAL;
+ }
+
+ *pkbuff += csh->headersize;
+ cap_info->total_size = csh->headersize;
+
+ /*
+ * Update the first page pointer to skip over the CSH header.
+ */
+ cap_info->pages[0] += csh->headersize;
+
+ return 1;
+}
+
+#define ICPU(family, model, quirk_handler) \
+ { X86_VENDOR_INTEL, family, model, X86_FEATURE_ANY, \
+ (unsigned long)&quirk_handler }
+
+static const struct x86_cpu_id efi_capsule_quirk_ids[] = {
+ ICPU(5, 9, qrk_capsule_setup_info), /* Intel Quark X1000 */
+ { }
+};
+
+int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff,
+ size_t hdr_bytes)
+{
+ int (*quirk_handler)(struct capsule_info *, void **, size_t);
+ const struct x86_cpu_id *id;
+ int ret;
+
+ if (hdr_bytes < sizeof(efi_capsule_header_t))
+ return 0;
+
+ cap_info->total_size = 0;
+
+ id = x86_match_cpu(efi_capsule_quirk_ids);
+ if (id) {
+ /*
+ * The quirk handler is supposed to return
+ * - a value > 0 if the setup should continue, after advancing
+ * kbuff as needed
+ * - 0 if not enough hdr_bytes are available yet
+ * - a negative error code otherwise
+ */
+ quirk_handler = (typeof(quirk_handler))id->driver_data;
+ ret = quirk_handler(cap_info, &kbuff, hdr_bytes);
+ if (ret <= 0)
+ return ret;
+ }
+
+ memcpy(&cap_info->header, kbuff, sizeof(cap_info->header));
+
+ cap_info->total_size += cap_info->header.imagesize;
+
+ return __efi_capsule_setup_info(cap_info);
+}
+
+#endif
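As a usage illustration of the return-value contract documented in efi_capsule_setup_info() above, a hypothetical quirk handler (the name and header size are made up, not from this patch) could look like this:

    /*
     * Hypothetical handler: skip a fixed-size vendor header that precedes
     * the standard EFI capsule header.
     */
    static int example_capsule_setup_info(struct capsule_info *cap_info,
                                          void **pkbuff, size_t hdr_bytes)
    {
            const size_t vendor_hdr_size = 0x100;   /* made-up size */

            /* Not enough data buffered yet: ask the caller to call again. */
            if (hdr_bytes < vendor_hdr_size + sizeof(efi_capsule_header_t))
                    return 0;

            /* Advance past the vendor header and account for its size. */
            *pkbuff += vendor_hdr_size;
            cap_info->total_size = vendor_hdr_size;

            return 1;       /* continue with the generic capsule setup */
    }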
diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c
index c5350fd27d70..0668aaff8bfe 100644
--- a/arch/x86/platform/olpc/olpc-xo1-pm.c
+++ b/arch/x86/platform/olpc/olpc-xo1-pm.c
@@ -77,7 +77,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state)
asmlinkage __visible int xo1_do_sleep(u8 sleep_state)
{
- void *pgd_addr = __va(read_cr3());
+ void *pgd_addr = __va(read_cr3_pa());
/* Program wakeup mask (using dword access to CS5536_PM1_EN) */
outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS);
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 42e65fee5673..2983faab5b18 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -456,12 +456,13 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
*/
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
- struct cyc2ns_data *data = cyc2ns_read_begin();
+ struct cyc2ns_data data;
unsigned long long ns;
- ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
+ cyc2ns_read_begin(&data);
+ ns = mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
+ cyc2ns_read_end();
- cyc2ns_read_end(data);
return ns;
}
@@ -470,12 +471,13 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
*/
static inline unsigned long long ns_2_cycles(unsigned long long ns)
{
- struct cyc2ns_data *data = cyc2ns_read_begin();
+ struct cyc2ns_data data;
unsigned long long cyc;
- cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul;
+ cyc2ns_read_begin(&data);
+ cyc = (ns << data.cyc2ns_shift) / data.cyc2ns_mul;
+ cyc2ns_read_end();
- cyc2ns_read_end(data);
return cyc;
}
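Both helpers above rely on the same fixed-point relation; a worked example with illustrative numbers (not taken from this patch):

    /*
     * ns = (cyc * cyc2ns_mul) >> cyc2ns_shift
     *
     * For a 2000 MHz TSC with cyc2ns_shift == 10, cyc2ns_mul is roughly
     * (NSEC_PER_MSEC << 10) / tsc_khz = (1000000 << 10) / 2000000 = 512,
     * so 2000 cycles convert to (2000 * 512) >> 10 = 1000 ns, and
     * ns_2_cycles() simply inverts the same relation.
     */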
@@ -1121,11 +1123,9 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
* done. The returned pointer is valid till preemption is re-enabled.
*/
const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end,
- unsigned int cpu)
+ const struct flush_tlb_info *info)
{
+ unsigned int cpu = smp_processor_id();
int locals = 0, remotes = 0, hubs = 0;
struct bau_desc *bau_desc;
struct cpumask *flush_mask;
@@ -1179,8 +1179,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
record_send_statistics(stat, locals, hubs, remotes, bau_desc);
- if (!end || (end - start) <= PAGE_SIZE)
- address = start;
+ if (!info->end || (info->end - info->start) <= PAGE_SIZE)
+ address = info->start;
else
address = TLB_FLUSH_ALL;
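The new parameter bundles what the old prototype passed as separate arguments. A sketch of the fields consumed here (an assumed layout for illustration; the authoritative struct flush_tlb_info definition lives in asm/tlbflush.h and may carry additional state):

    struct flush_tlb_info_sketch {
            struct mm_struct        *mm;    /* address space being flushed */
            unsigned long           start;  /* first virtual address of the range */
            unsigned long           end;    /* end of the range, or TLB_FLUSH_ALL */
    };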
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
index a6a198c33623..05041871ac90 100644
--- a/arch/x86/power/Makefile
+++ b/arch/x86/power/Makefile
@@ -1,3 +1,5 @@
+OBJECT_FILES_NON_STANDARD_hibernate_asm_$(BITS).o := y
+
# __restore_processor_state() restores %gs after S3 resume and so should not
# itself be stack-protected
nostackp := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 6b05a9219ea2..78459a6d455a 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -129,7 +129,7 @@ static void __save_processor_state(struct saved_context *ctxt)
*/
ctxt->cr0 = read_cr0();
ctxt->cr2 = read_cr2();
- ctxt->cr3 = read_cr3();
+ ctxt->cr3 = __read_cr3();
ctxt->cr4 = __read_cr4();
#ifdef CONFIG_X86_64
ctxt->cr8 = read_cr8();
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 6a61194ffd58..e3e62c8a8e70 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -104,7 +104,7 @@ static int set_up_temporary_mappings(void)
{
struct x86_mapping_info info = {
.alloc_pgt_page = alloc_pgt_page,
- .pmd_flag = __PAGE_KERNEL_LARGE_EXEC,
+ .page_flag = __PAGE_KERNEL_LARGE_EXEC,
.offset = __PAGE_OFFSET,
};
unsigned long mstart, mend;
@@ -150,7 +150,8 @@ static int relocate_restore_code(void)
memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE);
/* Make the page containing the relocated code executable */
- pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code);
+ pgd = (pgd_t *)__va(read_cr3_pa()) +
+ pgd_index(relocated_restore_code);
p4d = p4d_offset(pgd, relocated_restore_code);
if (p4d_large(*p4d)) {
set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index a163a90af4aa..cd4be19c36dc 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -102,7 +102,7 @@ static void __init setup_real_mode(void)
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
trampoline_pgd[0] = trampoline_pgd_entry.pgd;
- trampoline_pgd[511] = init_level4_pgt[511].pgd;
+ trampoline_pgd[511] = init_top_pgt[511].pgd;
#endif
}
diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c
index a5c9910d234f..09a085bde0d4 100644
--- a/arch/x86/um/ptrace_64.c
+++ b/arch/x86/um/ptrace_64.c
@@ -125,7 +125,7 @@ int poke_user(struct task_struct *child, long addr, long data)
else if ((addr >= offsetof(struct user, u_debugreg[0])) &&
(addr <= offsetof(struct user, u_debugreg[7]))) {
addr -= offsetof(struct user, u_debugreg[0]);
- addr = addr >> 2;
+ addr = addr >> 3;
if ((addr == 4) || (addr == 5))
return -EIO;
child->thread.arch.debugregs[addr] = data;
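The shift change matches the element size of u_debugreg[] on 64-bit; a short worked example (assuming 8-byte entries):

    /*
     * addr holds the byte offset into u_debugreg[].  With 8-byte entries,
     * u_debugreg[7] sits 56 bytes into the array and 56 >> 3 == 7 recovers
     * the register index, whereas the old ">> 2" would have produced 14
     * and indexed past the end of the array.
     */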
diff --git a/arch/x86/um/shared/sysdep/kernel-offsets.h b/arch/x86/um/shared/sysdep/kernel-offsets.h
index 46a9df99f3c5..7e1d35b6ad5c 100644
--- a/arch/x86/um/shared/sysdep/kernel-offsets.h
+++ b/arch/x86/um/shared/sysdep/kernel-offsets.h
@@ -2,16 +2,9 @@
#include <linux/sched.h>
#include <linux/elf.h>
#include <linux/crypto.h>
+#include <linux/kbuild.h>
#include <asm/mman.h>
-#define DEFINE(sym, val) \
- asm volatile("\n->" #sym " %0 " #val : : "i" (val))
-
-#define BLANK() asm volatile("\n->" : : )
-
-#define OFFSET(sym, str, mem) \
- DEFINE(sym, offsetof(struct str, mem));
-
void foo(void)
{
#include <common-offsets.h>
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index fffb0a16f9e3..bced7a369a11 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,3 +1,6 @@
+OBJECT_FILES_NON_STANDARD_xen-asm_$(BITS).o := y
+OBJECT_FILES_NON_STANDARD_xen-pvh.o := y
+
ifdef CONFIG_FUNCTION_TRACER
# Do not profile debug and lowlevel utilities
CFLAGS_REMOVE_spinlock.o = -pg
diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c
index 30bb2e80cfe7..a18703be9ead 100644
--- a/arch/x86/xen/efi.c
+++ b/arch/x86/xen/efi.c
@@ -54,38 +54,6 @@ static efi_system_table_t efi_systab_xen __initdata = {
.tables = EFI_INVALID_TABLE_ADDR /* Initialized later. */
};
-static const struct efi efi_xen __initconst = {
- .systab = NULL, /* Initialized later. */
- .runtime_version = 0, /* Initialized later. */
- .mps = EFI_INVALID_TABLE_ADDR,
- .acpi = EFI_INVALID_TABLE_ADDR,
- .acpi20 = EFI_INVALID_TABLE_ADDR,
- .smbios = EFI_INVALID_TABLE_ADDR,
- .smbios3 = EFI_INVALID_TABLE_ADDR,
- .sal_systab = EFI_INVALID_TABLE_ADDR,
- .boot_info = EFI_INVALID_TABLE_ADDR,
- .hcdp = EFI_INVALID_TABLE_ADDR,
- .uga = EFI_INVALID_TABLE_ADDR,
- .uv_systab = EFI_INVALID_TABLE_ADDR,
- .fw_vendor = EFI_INVALID_TABLE_ADDR,
- .runtime = EFI_INVALID_TABLE_ADDR,
- .config_table = EFI_INVALID_TABLE_ADDR,
- .get_time = xen_efi_get_time,
- .set_time = xen_efi_set_time,
- .get_wakeup_time = xen_efi_get_wakeup_time,
- .set_wakeup_time = xen_efi_set_wakeup_time,
- .get_variable = xen_efi_get_variable,
- .get_next_variable = xen_efi_get_next_variable,
- .set_variable = xen_efi_set_variable,
- .query_variable_info = xen_efi_query_variable_info,
- .update_capsule = xen_efi_update_capsule,
- .query_capsule_caps = xen_efi_query_capsule_caps,
- .get_next_high_mono_count = xen_efi_get_next_high_mono_count,
- .reset_system = xen_efi_reset_system,
- .set_virtual_address_map = NULL, /* Not used under Xen. */
- .flags = 0 /* Initialized later. */
-};
-
static efi_system_table_t __init *xen_efi_probe(void)
{
struct xen_platform_op op = {
@@ -102,7 +70,18 @@ static efi_system_table_t __init *xen_efi_probe(void)
/* Here we know that Xen runs on EFI platform. */
- efi = efi_xen;
+ efi.get_time = xen_efi_get_time;
+ efi.set_time = xen_efi_set_time;
+ efi.get_wakeup_time = xen_efi_get_wakeup_time;
+ efi.set_wakeup_time = xen_efi_set_wakeup_time;
+ efi.get_variable = xen_efi_get_variable;
+ efi.get_next_variable = xen_efi_get_next_variable;
+ efi.set_variable = xen_efi_set_variable;
+ efi.query_variable_info = xen_efi_query_variable_info;
+ efi.update_capsule = xen_efi_update_capsule;
+ efi.query_capsule_caps = xen_efi_query_capsule_caps;
+ efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count;
+ efi.reset_system = xen_efi_reset_system;
efi_systab_xen.tables = info->cfg.addr;
efi_systab_xen.nr_tables = info->cfg.nent;
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index a732bc2b9dfc..f33eef4ebd12 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -142,9 +142,7 @@ static void __init xen_banner(void)
struct xen_extraversion extra;
HYPERVISOR_xen_version(XENVER_extraversion, &extra);
- pr_info("Booting paravirtualized kernel %son %s\n",
- xen_feature(XENFEAT_auto_translated_physmap) ?
- "with PVH extensions " : "", pv_info.name);
+ pr_info("Booting paravirtualized kernel on %s\n", pv_info.name);
printk(KERN_INFO "Xen version: %d.%d%s%s\n",
version >> 16, version & 0xffff, extra.extraversion,
xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
@@ -277,31 +275,19 @@ static bool __init xen_check_mwait(void)
static bool __init xen_check_xsave(void)
{
- unsigned int err, eax, edx;
+ unsigned int cx, xsave_mask;
- /*
- * Xen 4.0 and older accidentally leaked the host XSAVE flag into guest
- * view, despite not being able to support guests using the
- * functionality. Probe for the actual availability of XSAVE by seeing
- * whether xgetbv executes successfully or raises #UD.
- */
- asm volatile("1: .byte 0x0f,0x01,0xd0\n\t" /* xgetbv */
- "xor %[err], %[err]\n"
- "2:\n\t"
- ".pushsection .fixup,\"ax\"\n\t"
- "3: movl $1,%[err]\n\t"
- "jmp 2b\n\t"
- ".popsection\n\t"
- _ASM_EXTABLE(1b, 3b)
- : [err] "=r" (err), "=a" (eax), "=d" (edx)
- : "c" (0));
-
- return err == 0;
+ cx = cpuid_ecx(1);
+
+ xsave_mask = (1 << (X86_FEATURE_XSAVE % 32)) |
+ (1 << (X86_FEATURE_OSXSAVE % 32));
+
+ /* Xen will set CR4.OSXSAVE if supported and not disabled by force */
+ return (cx & xsave_mask) == xsave_mask;
}
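For reference, a worked expansion of the mask built above (not part of the patch):

    /*
     * X86_FEATURE_XSAVE and X86_FEATURE_OSXSAVE correspond to bits 26 and
     * 27 of CPUID leaf 1 ECX, so
     *
     *    xsave_mask == (1 << 26) | (1 << 27) == 0x0c000000
     *
     * and the check succeeds only if Xen reports XSAVE support and has
     * already set CR4.OSXSAVE for the guest.
     */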
static void __init xen_init_capabilities(void)
{
- setup_clear_cpu_cap(X86_BUG_SYSRET_SS_ATTRS);
setup_force_cpu_cap(X86_FEATURE_XENPV);
setup_clear_cpu_cap(X86_FEATURE_DCA);
setup_clear_cpu_cap(X86_FEATURE_APERFMPERF);
@@ -317,10 +303,7 @@ static void __init xen_init_capabilities(void)
else
setup_clear_cpu_cap(X86_FEATURE_MWAIT);
- if (xen_check_xsave()) {
- setup_force_cpu_cap(X86_FEATURE_XSAVE);
- setup_force_cpu_cap(X86_FEATURE_OSXSAVE);
- } else {
+ if (!xen_check_xsave()) {
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
setup_clear_cpu_cap(X86_FEATURE_OSXSAVE);
}
@@ -972,15 +955,10 @@ static void xen_write_msr(unsigned int msr, unsigned low, unsigned high)
void xen_setup_shared_info(void)
{
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
- set_fixmap(FIX_PARAVIRT_BOOTMAP,
- xen_start_info->shared_info);
+ set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info);
- HYPERVISOR_shared_info =
- (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
- } else
- HYPERVISOR_shared_info =
- (struct shared_info *)__va(xen_start_info->shared_info);
+ HYPERVISOR_shared_info =
+ (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
#ifndef CONFIG_SMP
/* In UP this is as good a place as any to set up shared info */
@@ -988,6 +966,13 @@ void xen_setup_shared_info(void)
#endif
xen_setup_mfn_list_list();
+
+ /*
+ * Now that shared info is set up we can start using routines that
+ * point to pvclock area.
+ */
+ if (system_state == SYSTEM_BOOTING)
+ xen_init_time_ops();
}
/* This is called once we have the cpu_possible_mask */
@@ -1286,8 +1271,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
x86_init.oem.arch_setup = xen_arch_setup;
x86_init.oem.banner = xen_banner;
- xen_init_time_ops();
-
/*
* Set up some pagetable state before starting to set any ptes.
*/
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 5e375a5e815f..3be06f3caf3c 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -42,7 +42,7 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr)
}
EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
-void xen_flush_tlb_all(void)
+static void xen_flush_tlb_all(void)
{
struct mmuext_op *op;
struct multicall_space mcs;
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 9d9ae6650aa1..1d7a7213a310 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -355,10 +355,8 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
pteval_t flags = val & PTE_FLAGS_MASK;
unsigned long mfn;
- if (!xen_feature(XENFEAT_auto_translated_physmap))
- mfn = __pfn_to_mfn(pfn);
- else
- mfn = pfn;
+ mfn = __pfn_to_mfn(pfn);
+
/*
* If there's no mfn for the pfn, then just create an
* empty non-present pte. Unfortunately this loses
@@ -647,9 +645,6 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
limit--;
BUG_ON(limit >= FIXADDR_TOP);
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return 0;
-
/*
* 64-bit has a great big hole in the middle of the address
* space, which contains the Xen mappings. On 32-bit these
@@ -980,37 +975,32 @@ static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
spin_unlock(&mm->page_table_lock);
}
-
-#ifdef CONFIG_SMP
-/* Another cpu may still have their %cr3 pointing at the pagetable, so
- we need to repoint it somewhere else before we can unpin it. */
-static void drop_other_mm_ref(void *info)
+static void drop_mm_ref_this_cpu(void *info)
{
struct mm_struct *mm = info;
- struct mm_struct *active_mm;
-
- active_mm = this_cpu_read(cpu_tlbstate.active_mm);
- if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm)
leave_mm(smp_processor_id());
- /* If this cpu still has a stale cr3 reference, then make sure
- it has been flushed. */
+ /*
+ * If this cpu still has a stale cr3 reference, then make sure
+ * it has been flushed.
+ */
if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
- load_cr3(swapper_pg_dir);
+ xen_mc_flush();
}
+#ifdef CONFIG_SMP
+/*
+ * Another cpu may still have its %cr3 pointing at the pagetable, so
+ * we need to repoint it somewhere else before we can unpin it.
+ */
static void xen_drop_mm_ref(struct mm_struct *mm)
{
cpumask_var_t mask;
unsigned cpu;
- if (current->active_mm == mm) {
- if (current->mm == mm)
- load_cr3(swapper_pg_dir);
- else
- leave_mm(smp_processor_id());
- }
+ drop_mm_ref_this_cpu(mm);
/* Get the "official" set of cpus referring to our pagetable. */
if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
@@ -1018,31 +1008,31 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
&& per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
continue;
- smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
+ smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1);
}
return;
}
cpumask_copy(mask, mm_cpumask(mm));
- /* It's possible that a vcpu may have a stale reference to our
- cr3, because its in lazy mode, and it hasn't yet flushed
- its set of pending hypercalls yet. In this case, we can
- look at its actual current cr3 value, and force it to flush
- if needed. */
+	/*
+	 * It's possible that a vcpu may have a stale reference to our
+	 * cr3, because it's in lazy mode and hasn't yet flushed its
+	 * set of pending hypercalls. In this case, we can look at its
+	 * actual current cr3 value and force it to flush if needed.
+	 */
for_each_online_cpu(cpu) {
if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
cpumask_set_cpu(cpu, mask);
}
- if (!cpumask_empty(mask))
- smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
+ smp_call_function_many(mask, drop_mm_ref_this_cpu, mm, 1);
free_cpumask_var(mask);
}
#else
static void xen_drop_mm_ref(struct mm_struct *mm)
{
- if (current->active_mm == mm)
- load_cr3(swapper_pg_dir);
+ drop_mm_ref_this_cpu(mm);
}
#endif
@@ -1289,9 +1279,6 @@ static void __init xen_pagetable_cleanhighmap(void)
static void __init xen_pagetable_p2m_setup(void)
{
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return;
-
xen_vmalloc_p2m_tree();
#ifdef CONFIG_X86_64
@@ -1314,8 +1301,7 @@ static void __init xen_pagetable_init(void)
xen_build_mfn_list_list();
/* Remap memory freed due to conflicts with E820 map */
- if (!xen_feature(XENFEAT_auto_translated_physmap))
- xen_remap_memory();
+ xen_remap_memory();
xen_setup_shared_info();
}
@@ -1375,8 +1361,7 @@ static void xen_flush_tlb_single(unsigned long addr)
}
static void xen_flush_tlb_others(const struct cpumask *cpus,
- struct mm_struct *mm, unsigned long start,
- unsigned long end)
+ const struct flush_tlb_info *info)
{
struct {
struct mmuext_op op;
@@ -1388,7 +1373,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
} *args;
struct multicall_space mcs;
- trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
+ trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end);
if (cpumask_empty(cpus))
return; /* nothing to do */
@@ -1402,9 +1387,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
- if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
+ if (info->end != TLB_FLUSH_ALL &&
+ (info->end - info->start) <= PAGE_SIZE) {
args->op.cmd = MMUEXT_INVLPG_MULTI;
- args->op.arg1.linear_addr = start;
+ args->op.arg1.linear_addr = info->start;
}
MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
@@ -1479,8 +1465,8 @@ static void xen_write_cr3(unsigned long cr3)
* At the start of the day - when Xen launches a guest, it has already
* built pagetables for the guest. We diligently look over them
* in xen_setup_kernel_pagetable and graft as appropriate them in the
- * init_level4_pgt and its friends. Then when we are happy we load
- * the new init_level4_pgt - and continue on.
+ * init_top_pgt and its friends. Then when we are happy we load
+ * the new init_top_pgt - and continue on.
*
* The generic code starts (start_kernel) and 'init_mem_mapping' sets
* up the rest of the pagetables. When it has completed it loads the cr3.
@@ -1923,23 +1909,22 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
pt_end = pt_base + xen_start_info->nr_pt_frames;
/* Zap identity mapping */
- init_level4_pgt[0] = __pgd(0);
-
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
- /* Pre-constructed entries are in pfn, so convert to mfn */
- /* L4[272] -> level3_ident_pgt
- * L4[511] -> level3_kernel_pgt */
- convert_pfn_mfn(init_level4_pgt);
-
- /* L3_i[0] -> level2_ident_pgt */
- convert_pfn_mfn(level3_ident_pgt);
- /* L3_k[510] -> level2_kernel_pgt
- * L3_k[511] -> level2_fixmap_pgt */
- convert_pfn_mfn(level3_kernel_pgt);
-
- /* L3_k[511][506] -> level1_fixmap_pgt */
- convert_pfn_mfn(level2_fixmap_pgt);
- }
+ init_top_pgt[0] = __pgd(0);
+
+ /* Pre-constructed entries are in pfn, so convert to mfn */
+ /* L4[272] -> level3_ident_pgt */
+ /* L4[511] -> level3_kernel_pgt */
+ convert_pfn_mfn(init_top_pgt);
+
+ /* L3_i[0] -> level2_ident_pgt */
+ convert_pfn_mfn(level3_ident_pgt);
+ /* L3_k[510] -> level2_kernel_pgt */
+ /* L3_k[511] -> level2_fixmap_pgt */
+ convert_pfn_mfn(level3_kernel_pgt);
+
+ /* L3_k[511][506] -> level1_fixmap_pgt */
+ convert_pfn_mfn(level2_fixmap_pgt);
+
/* We get [511][511] and have Xen's version of level2_kernel_pgt */
l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
@@ -1960,36 +1945,32 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
/* Copy the initial P->M table mappings if necessary. */
i = pgd_index(xen_start_info->mfn_list);
if (i && i < pgd_index(__START_KERNEL_map))
- init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
-
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
- /* Make pagetable pieces RO */
- set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
- set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
- set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
- set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
- set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
- set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
- set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
- set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO);
-
- /* Pin down new L4 */
- pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
- PFN_DOWN(__pa_symbol(init_level4_pgt)));
-
- /* Unpin Xen-provided one */
- pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+ init_top_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
+
+ /* Make pagetable pieces RO */
+ set_page_prot(init_top_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
+ set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO);
+
+ /* Pin down new L4 */
+ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
+ PFN_DOWN(__pa_symbol(init_top_pgt)));
+
+ /* Unpin Xen-provided one */
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
- /*
- * At this stage there can be no user pgd, and no page
- * structure to attach it to, so make sure we just set kernel
- * pgd.
- */
- xen_mc_batch();
- __xen_write_cr3(true, __pa(init_level4_pgt));
- xen_mc_issue(PARAVIRT_LAZY_CPU);
- } else
- native_write_cr3(__pa(init_level4_pgt));
+ /*
+ * At this stage there can be no user pgd, and no page structure to
+ * attach it to, so make sure we just set kernel pgd.
+ */
+ xen_mc_batch();
+ __xen_write_cr3(true, __pa(init_top_pgt));
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
/* We can't that easily rip out L3 and L2, as the Xen pagetables are
* set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for
@@ -2025,7 +2006,8 @@ static unsigned long __init xen_read_phys_ulong(phys_addr_t addr)
/*
* Translate a virtual address to a physical one without relying on mapped
- * page tables.
+ * page tables. Don't rely on big pages being aligned in (guest) physical
+ * space!
*/
static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
{
@@ -2035,7 +2017,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
pmd_t pmd;
pte_t pte;
- pa = read_cr3();
+ pa = read_cr3_pa();
pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
sizeof(pgd)));
if (!pgd_present(pgd))
@@ -2046,7 +2028,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
sizeof(pud)));
if (!pud_present(pud))
return 0;
- pa = pud_pfn(pud) << PAGE_SHIFT;
+ pa = pud_val(pud) & PTE_PFN_MASK;
if (pud_large(pud))
return pa + (vaddr & ~PUD_MASK);
@@ -2054,7 +2036,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
sizeof(pmd)));
if (!pmd_present(pmd))
return 0;
- pa = pmd_pfn(pmd) << PAGE_SHIFT;
+ pa = pmd_val(pmd) & PTE_PFN_MASK;
if (pmd_large(pmd))
return pa + (vaddr & ~PMD_MASK);
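The switch from pud_pfn()/pmd_pfn() to masking with PTE_PFN_MASK matters when a large mapping starts at a guest-physical address that is not aligned to the mapping size; an illustrative case (the address is made up):

    /*
     * Suppose a 1 GiB PUD mapping points at guest-physical 0x4020000000.
     * For a PSE entry, pud_pfn() masks down to 1 GiB granularity, so
     * pud_pfn(pud) << PAGE_SHIFT would yield 0x4000000000 and lose the
     * 512 MiB offset, while pud_val(pud) & PTE_PFN_MASK keeps the full
     * 0x4020000000 and the "+ (vaddr & ~PUD_MASK)" arithmetic above stays
     * correct.  The PMD case is analogous at 2 MiB granularity.
     */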
@@ -2115,7 +2097,7 @@ void __init xen_relocate_p2m(void)
pt_phys = pmd_phys + PFN_PHYS(n_pmd);
p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
- pgd = __va(read_cr3());
+ pgd = __va(read_cr3_pa());
new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
idx_p4d = 0;
save_pud = n_pud;
@@ -2222,7 +2204,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
{
unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
- BUG_ON(read_cr3() != __pa(initial_page_table));
+ BUG_ON(read_cr3_pa() != __pa(initial_page_table));
BUG_ON(cr3 != __pa(swapper_pg_dir));
/*
@@ -2402,9 +2384,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
static void __init xen_post_allocator_init(void)
{
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return;
-
pv_mmu_ops.set_pte = xen_set_pte;
pv_mmu_ops.set_pmd = xen_set_pmd;
pv_mmu_ops.set_pud = xen_set_pud;
@@ -2510,9 +2489,6 @@ void __init xen_init_mmu_ops(void)
{
x86_init.paging.pagetable_init = xen_pagetable_init;
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return;
-
pv_mmu_ops = xen_mmu_ops;
memset(dummy_mapping, 0xff, PAGE_SIZE);
@@ -2649,9 +2625,6 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
* this function are redundant and can be ignored.
*/
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return 0;
-
if (unlikely(order > MAX_CONTIG_ORDER))
return -ENOMEM;
@@ -2688,9 +2661,6 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
int success;
unsigned long vstart;
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return;
-
if (unlikely(order > MAX_CONTIG_ORDER))
return;
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 090c7eb4dca9..a1895a8e85c1 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -406,7 +406,7 @@ static void __init xen_time_init(void)
pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
}
-void __init xen_init_time_ops(void)
+void __ref xen_init_time_ops(void)
{
pv_time_ops = xen_time_ops;
diff --git a/arch/x86/xen/xen-pvh.S b/arch/x86/xen/xen-pvh.S
index 5e246716d58f..e1a5fbeae08d 100644
--- a/arch/x86/xen/xen-pvh.S
+++ b/arch/x86/xen/xen-pvh.S
@@ -87,7 +87,7 @@ ENTRY(pvh_start_xen)
wrmsr
/* Enable pre-constructed page tables. */
- mov $_pa(init_level4_pgt), %eax
+ mov $_pa(init_top_pgt), %eax
mov %eax, %cr3
mov $(X86_CR0_PG | X86_CR0_PE), %eax
mov %eax, %cr0
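For reference, the value loaded into %cr0 here expands as follows (a simple constant expansion, not an addition to the patch):

    /*
     * X86_CR0_PE is bit 0 and X86_CR0_PG is bit 31, so
     * X86_CR0_PG | X86_CR0_PE == 0x80000001: a single write enables
     * protected mode and paging, with the pre-constructed page tables
     * already loaded into %cr3.
     */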