summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig14
-rw-r--r--arch/x86/Makefile2
-rw-r--r--arch/x86/boot/compressed/cmdline.c2
-rw-r--r--arch/x86/boot/compressed/eboot.c73
-rw-r--r--arch/x86/boot/compressed/head_64.S86
-rw-r--r--arch/x86/boot/compressed/kaslr.c194
-rw-r--r--arch/x86/boot/compressed/misc.c6
-rw-r--r--arch/x86/boot/compressed/misc.h2
-rw-r--r--arch/x86/boot/compressed/pagetable.c18
-rw-r--r--arch/x86/boot/copy.S20
-rw-r--r--arch/x86/boot/string.c8
-rw-r--r--arch/x86/boot/string.h1
-rw-r--r--arch/x86/crypto/Makefile2
-rw-r--r--arch/x86/crypto/sha1-mb/Makefile2
-rw-r--r--arch/x86/crypto/sha256-mb/Makefile2
-rw-r--r--arch/x86/entry/entry_64.S3
-rw-r--r--arch/x86/events/core.c28
-rw-r--r--arch/x86/events/intel/core.c78
-rw-r--r--arch/x86/events/intel/cqm.c16
-rw-r--r--arch/x86/events/intel/lbr.c4
-rw-r--r--arch/x86/events/intel/uncore.c2
-rw-r--r--arch/x86/events/perf_event.h3
-rw-r--r--arch/x86/include/asm/amd_nb.h3
-rw-r--r--arch/x86/include/asm/apic.h36
-rw-r--r--arch/x86/include/asm/atomic.h13
-rw-r--r--arch/x86/include/asm/efi.h2
-rw-r--r--arch/x86/include/asm/extable.h1
-rw-r--r--arch/x86/include/asm/hardirq.h2
-rw-r--r--arch/x86/include/asm/irq.h1
-rw-r--r--arch/x86/include/asm/irq_remapping.h3
-rw-r--r--arch/x86/include/asm/kvm_emulate.h1
-rw-r--r--arch/x86/include/asm/mce.h4
-rw-r--r--arch/x86/include/asm/mmu.h6
-rw-r--r--arch/x86/include/asm/mmu_context.h63
-rw-r--r--arch/x86/include/asm/mshyperv.h4
-rw-r--r--arch/x86/include/asm/msr-index.h20
-rw-r--r--arch/x86/include/asm/paravirt.h10
-rw-r--r--arch/x86/include/asm/paravirt_types.h5
-rw-r--r--arch/x86/include/asm/pci.h8
-rw-r--r--arch/x86/include/asm/pgtable-3level.h47
-rw-r--r--arch/x86/include/asm/pgtable.h55
-rw-r--r--arch/x86/include/asm/pgtable_64.h22
-rw-r--r--arch/x86/include/asm/processor-flags.h36
-rw-r--r--arch/x86/include/asm/processor.h15
-rw-r--r--arch/x86/include/asm/setup.h1
-rw-r--r--arch/x86/include/asm/special_insns.h10
-rw-r--r--arch/x86/include/asm/suspend_64.h5
-rw-r--r--arch/x86/include/asm/timer.h8
-rw-r--r--arch/x86/include/asm/tlbbatch.h14
-rw-r--r--arch/x86/include/asm/tlbflush.h114
-rw-r--r--arch/x86/include/asm/uv/uv.h11
-rw-r--r--arch/x86/include/uapi/asm/hyperv.h15
-rw-r--r--arch/x86/include/uapi/asm/processor-flags.h2
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/acpi/Makefile2
-rw-r--r--arch/x86/kernel/acpi/cstate.c3
-rw-r--r--arch/x86/kernel/apic/apic.c115
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c4
-rw-r--r--arch/x86/kernel/apic/apic_noop.c2
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c4
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c2
-rw-r--r--arch/x86/kernel/apic/htirq.c21
-rw-r--r--arch/x86/kernel/apic/io_apic.c44
-rw-r--r--arch/x86/kernel/apic/msi.c55
-rw-r--r--arch/x86/kernel/apic/probe_32.c2
-rw-r--r--arch/x86/kernel/apic/vector.c53
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c36
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c26
-rw-r--r--arch/x86/kernel/cpu/Makefile1
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c79
-rw-r--r--arch/x86/kernel/cpu/cyrix.c1
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_rdtgroup.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/dev-mcelog.c55
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c569
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h6
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c17
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c273
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c18
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c11
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c24
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c18
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c2
-rw-r--r--arch/x86/kernel/cpu/proc.c10
-rw-r--r--arch/x86/kernel/espfix_64.c2
-rw-r--r--arch/x86/kernel/head64.c145
-rw-r--r--arch/x86/kernel/head_64.S131
-rw-r--r--arch/x86/kernel/hpet.c2
-rw-r--r--arch/x86/kernel/irq.c78
-rw-r--r--arch/x86/kernel/jump_label.c2
-rw-r--r--arch/x86/kernel/kprobes/opt.c9
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/ldt.c56
-rw-r--r--arch/x86/kernel/machine_kexec_64.c2
-rw-r--r--arch/x86/kernel/nmi_selftest.c2
-rw-r--r--arch/x86/kernel/paravirt.c2
-rw-r--r--arch/x86/kernel/process.c11
-rw-r--r--arch/x86/kernel/process_32.c4
-rw-r--r--arch/x86/kernel/process_64.c4
-rw-r--r--arch/x86/kernel/reboot.c2
-rw-r--r--arch/x86/kernel/setup.c2
-rw-r--r--arch/x86/kernel/smpboot.c3
-rw-r--r--arch/x86/kernel/step.c2
-rw-r--r--arch/x86/kernel/sys_x86_64.c4
-rw-r--r--arch/x86/kernel/tboot.c2
-rw-r--r--arch/x86/kernel/time.c2
-rw-r--r--arch/x86/kernel/traps.c2
-rw-r--r--arch/x86/kernel/tsc.c210
-rw-r--r--arch/x86/kernel/tsc_sync.c21
-rw-r--r--arch/x86/kvm/cpuid.c20
-rw-r--r--arch/x86/kvm/emulate.c1
-rw-r--r--arch/x86/kvm/lapic.c5
-rw-r--r--arch/x86/kvm/mmu.c7
-rw-r--r--arch/x86/kvm/mmu.h1
-rw-r--r--arch/x86/kvm/svm.c28
-rw-r--r--arch/x86/kvm/vmx.c173
-rw-r--r--arch/x86/kvm/x86.c72
-rw-r--r--arch/x86/lib/copy_user_64.S7
-rw-r--r--arch/x86/lib/msr-reg.S8
-rw-r--r--arch/x86/lib/x86-opcode-map.txt2
-rw-r--r--arch/x86/math-emu/fpu_system.h2
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/dump_pagetables.c2
-rw-r--r--arch/x86/mm/extable.c3
-rw-r--r--arch/x86/mm/fault.c10
-rw-r--r--arch/x86/mm/gup.c496
-rw-r--r--arch/x86/mm/hugetlbpage.c2
-rw-r--r--arch/x86/mm/init.c10
-rw-r--r--arch/x86/mm/init_64.c116
-rw-r--r--arch/x86/mm/ioremap.c2
-rw-r--r--arch/x86/mm/kasan_init_64.c12
-rw-r--r--arch/x86/mm/kaslr.c81
-rw-r--r--arch/x86/mm/mmap.c3
-rw-r--r--arch/x86/mm/pat.c9
-rw-r--r--arch/x86/mm/tlb.c458
-rw-r--r--arch/x86/net/Makefile2
-rw-r--r--arch/x86/pci/ce4100.c87
-rw-r--r--arch/x86/pci/common.c4
-rw-r--r--arch/x86/pci/legacy.c18
-rw-r--r--arch/x86/platform/efi/Makefile1
-rw-r--r--arch/x86/platform/efi/efi.c9
-rw-r--r--arch/x86/platform/efi/efi_32.c9
-rw-r--r--arch/x86/platform/efi/efi_64.c88
-rw-r--r--arch/x86/platform/efi/quirks.c140
-rw-r--r--arch/x86/platform/olpc/olpc-xo1-pm.c2
-rw-r--r--arch/x86/platform/uv/tlb_uv.c24
-rw-r--r--arch/x86/platform/uv/uv_irq.c18
-rw-r--r--arch/x86/power/Makefile2
-rw-r--r--arch/x86/power/cpu.c2
-rw-r--r--arch/x86/power/hibernate_64.c9
-rw-r--r--arch/x86/ras/Kconfig11
-rw-r--r--arch/x86/ras/Makefile2
-rw-r--r--arch/x86/ras/mce_amd_inj.c492
-rw-r--r--arch/x86/realmode/init.c2
-rw-r--r--arch/x86/xen/Makefile3
-rw-r--r--arch/x86/xen/apic.c2
-rw-r--r--arch/x86/xen/efi.c45
-rw-r--r--arch/x86/xen/mmu_pv.c83
-rw-r--r--arch/x86/xen/xen-pvh.S2
159 files changed, 3069 insertions, 2786 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4ccfacc7232a..e767ed24aeb4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -69,7 +69,7 @@ config X86
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
- select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
+ select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select BUILDTIME_EXTABLE_SORT
@@ -87,6 +87,8 @@ config X86
select GENERIC_EARLY_IOREMAP
select GENERIC_FIND_FIRST_BIT
select GENERIC_IOMAP
+ select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP
+ select GENERIC_IRQ_MIGRATION if SMP
select GENERIC_IRQ_PROBE
select GENERIC_IRQ_SHOW
select GENERIC_PENDING_IRQ if SMP
@@ -166,6 +168,7 @@ config X86
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_USER_RETURN_NOTIFIER
select IRQ_FORCED_THREADING
+ select PCI_LOCKLESS_CONFIG
select PERF_EVENTS
select RTC_LIB
select RTC_MC146818_LIB
@@ -1082,7 +1085,7 @@ config X86_MCE_THRESHOLD
def_bool y
config X86_MCE_INJECT
- depends on X86_MCE && X86_LOCAL_APIC && X86_MCELOG_LEGACY
+ depends on X86_MCE && X86_LOCAL_APIC && DEBUG_FS
tristate "Machine check injector support"
---help---
Provide support for injecting machine checks for testing purposes.
@@ -2776,10 +2779,6 @@ config COMPAT_FOR_U64_ALIGNMENT
config SYSVIPC_COMPAT
def_bool y
depends on SYSVIPC
-
-config KEYS_COMPAT
- def_bool y
- depends on KEYS
endif
endmenu
@@ -2797,6 +2796,9 @@ config X86_DMA_REMAP
bool
depends on STA2X11
+config HAVE_GENERIC_GUP
+ def_bool y
+
source "net/Kconfig"
source "drivers/Kconfig"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index bf240b920473..ad2db82e9953 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -257,8 +257,6 @@ drivers-$(CONFIG_PM) += arch/x86/power/
drivers-$(CONFIG_FB) += arch/x86/video/
-drivers-$(CONFIG_RAS) += arch/x86/ras/
-
####
# boot loader support. Several targets are kept for legacy purposes
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index 73ccf63b0f48..9dc1ce6ba3c0 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -13,7 +13,7 @@ static inline char rdfs8(addr_t addr)
return *((char *)(fs + addr));
}
#include "../cmdline.c"
-static unsigned long get_cmd_line_ptr(void)
+unsigned long get_cmd_line_ptr(void)
{
unsigned long cmd_line_ptr = boot_params->hdr.cmd_line_ptr;
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index cbf4b87f55b9..c3e869eaef0c 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -1046,9 +1046,31 @@ struct boot_params *efi_main(struct efi_config *c,
memset((char *)gdt->address, 0x0, gdt->size);
desc = (struct desc_struct *)gdt->address;
- /* The first GDT is a dummy and the second is unused. */
- desc += 2;
+ /* The first GDT is a dummy. */
+ desc++;
+
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /* __KERNEL32_CS */
+ desc->limit0 = 0xffff;
+ desc->base0 = 0x0000;
+ desc->base1 = 0x0000;
+ desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ;
+ desc->s = DESC_TYPE_CODE_DATA;
+ desc->dpl = 0;
+ desc->p = 1;
+ desc->limit = 0xf;
+ desc->avl = 0;
+ desc->l = 0;
+ desc->d = SEG_OP_SIZE_32BIT;
+ desc->g = SEG_GRANULARITY_4KB;
+ desc->base2 = 0x00;
+ desc++;
+ } else {
+ /* Second entry is unused on 32-bit */
+ desc++;
+ }
+ /* __KERNEL_CS */
desc->limit0 = 0xffff;
desc->base0 = 0x0000;
desc->base1 = 0x0000;
@@ -1058,12 +1080,18 @@ struct boot_params *efi_main(struct efi_config *c,
desc->p = 1;
desc->limit = 0xf;
desc->avl = 0;
- desc->l = 0;
- desc->d = SEG_OP_SIZE_32BIT;
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ desc->l = 1;
+ desc->d = 0;
+ } else {
+ desc->l = 0;
+ desc->d = SEG_OP_SIZE_32BIT;
+ }
desc->g = SEG_GRANULARITY_4KB;
desc->base2 = 0x00;
-
desc++;
+
+ /* __KERNEL_DS */
desc->limit0 = 0xffff;
desc->base0 = 0x0000;
desc->base1 = 0x0000;
@@ -1077,24 +1105,25 @@ struct boot_params *efi_main(struct efi_config *c,
desc->d = SEG_OP_SIZE_32BIT;
desc->g = SEG_GRANULARITY_4KB;
desc->base2 = 0x00;
-
-#ifdef CONFIG_X86_64
- /* Task segment value */
desc++;
- desc->limit0 = 0x0000;
- desc->base0 = 0x0000;
- desc->base1 = 0x0000;
- desc->type = SEG_TYPE_TSS;
- desc->s = 0;
- desc->dpl = 0;
- desc->p = 1;
- desc->limit = 0x0;
- desc->avl = 0;
- desc->l = 0;
- desc->d = 0;
- desc->g = SEG_GRANULARITY_4KB;
- desc->base2 = 0x00;
-#endif /* CONFIG_X86_64 */
+
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /* Task segment value */
+ desc->limit0 = 0x0000;
+ desc->base0 = 0x0000;
+ desc->base1 = 0x0000;
+ desc->type = SEG_TYPE_TSS;
+ desc->s = 0;
+ desc->dpl = 0;
+ desc->p = 1;
+ desc->limit = 0x0;
+ desc->avl = 0;
+ desc->l = 0;
+ desc->d = 0;
+ desc->g = SEG_GRANULARITY_4KB;
+ desc->base2 = 0x00;
+ desc++;
+ }
asm volatile("cli");
asm volatile ("lgdt %0" : : "m" (*gdt));
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index d2ae1f821e0c..fbf4c32d0b62 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -346,6 +346,48 @@ preferred_addr:
/* Set up the stack */
leaq boot_stack_end(%rbx), %rsp
+#ifdef CONFIG_X86_5LEVEL
+ /* Check if 5-level paging has already enabled */
+ movq %cr4, %rax
+ testl $X86_CR4_LA57, %eax
+ jnz lvl5
+
+ /*
+ * At this point we are in long mode with 4-level paging enabled,
+ * but we want to enable 5-level paging.
+ *
+ * The problem is that we cannot do it directly. Setting LA57 in
+ * long mode would trigger #GP. So we need to switch off long mode
+ * first.
+ *
+ * NOTE: This is not going to work if bootloader put us above 4G
+ * limit.
+ *
+ * The first step is go into compatibility mode.
+ */
+
+ /* Clear additional page table */
+ leaq lvl5_pgtable(%rbx), %rdi
+ xorq %rax, %rax
+ movq $(PAGE_SIZE/8), %rcx
+ rep stosq
+
+ /*
+ * Setup current CR3 as the first and only entry in a new top level
+ * page table.
+ */
+ movq %cr3, %rdi
+ leaq 0x7 (%rdi), %rax
+ movq %rax, lvl5_pgtable(%rbx)
+
+ /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
+ pushq $__KERNEL32_CS
+ leaq compatible_mode(%rip), %rax
+ pushq %rax
+ lretq
+lvl5:
+#endif
+
/* Zero EFLAGS */
pushq $0
popfq
@@ -429,6 +471,44 @@ relocated:
jmp *%rax
.code32
+#ifdef CONFIG_X86_5LEVEL
+compatible_mode:
+ /* Setup data and stack segments */
+ movl $__KERNEL_DS, %eax
+ movl %eax, %ds
+ movl %eax, %ss
+
+ /* Disable paging */
+ movl %cr0, %eax
+ btrl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ /* Point CR3 to 5-level paging */
+ leal lvl5_pgtable(%ebx), %eax
+ movl %eax, %cr3
+
+ /* Enable PAE and LA57 mode */
+ movl %cr4, %eax
+ orl $(X86_CR4_PAE | X86_CR4_LA57), %eax
+ movl %eax, %cr4
+
+ /* Calculate address we are running at */
+ call 1f
+1: popl %edi
+ subl $1b, %edi
+
+ /* Prepare stack for far return to Long Mode */
+ pushl $__KERNEL_CS
+ leal lvl5(%edi), %eax
+ push %eax
+
+ /* Enable paging back */
+ movl $(X86_CR0_PG | X86_CR0_PE), %eax
+ movl %eax, %cr0
+
+ lret
+#endif
+
no_longmode:
/* This isn't an x86-64 CPU so hang */
1:
@@ -442,7 +522,7 @@ gdt:
.word gdt_end - gdt
.long gdt
.word 0
- .quad 0x0000000000000000 /* NULL descriptor */
+ .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
.quad 0x00af9a000000ffff /* __KERNEL_CS */
.quad 0x00cf92000000ffff /* __KERNEL_DS */
.quad 0x0080890000000000 /* TS descriptor */
@@ -486,3 +566,7 @@ boot_stack_end:
.balign 4096
pgtable:
.fill BOOT_PGT_SIZE, 1, 0
+#ifdef CONFIG_X86_5LEVEL
+lvl5_pgtable:
+ .fill PAGE_SIZE, 1, 0
+#endif
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 54c24f0a43d3..91f27ab970ef 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -9,16 +9,42 @@
* contain the entire properly aligned running kernel image.
*
*/
+
+/*
+ * isspace() in linux/ctype.h is expected by next_args() to filter
+ * out "space/lf/tab". While boot/ctype.h conflicts with linux/ctype.h,
+ * since isdigit() is implemented in both of them. Hence disable it
+ * here.
+ */
+#define BOOT_CTYPE_H
+
+/*
+ * _ctype[] in lib/ctype.c is needed by isspace() of linux/ctype.h.
+ * While both lib/ctype.c and lib/cmdline.c will bring EXPORT_SYMBOL
+ * which is meaningless and will cause compiling error in some cases.
+ * So do not include linux/export.h and define EXPORT_SYMBOL(sym)
+ * as empty.
+ */
+#define _LINUX_EXPORT_H
+#define EXPORT_SYMBOL(sym)
+
#include "misc.h"
#include "error.h"
-#include "../boot.h"
+#include "../string.h"
#include <generated/compile.h>
#include <linux/module.h>
#include <linux/uts.h>
#include <linux/utsname.h>
+#include <linux/ctype.h>
#include <generated/utsrelease.h>
+/* Macros used by the included decompressor code below. */
+#define STATIC
+#include <linux/decompress/mm.h>
+
+extern unsigned long get_cmd_line_ptr(void);
+
/* Simplified build-specific string for starting entropy. */
static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
@@ -62,6 +88,11 @@ struct mem_vector {
static bool memmap_too_large;
+
+/* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */
+unsigned long long mem_limit = ULLONG_MAX;
+
+
enum mem_avoid_index {
MEM_AVOID_ZO_RANGE = 0,
MEM_AVOID_INITRD,
@@ -85,49 +116,14 @@ static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
return true;
}
-/**
- * _memparse - Parse a string with mem suffixes into a number
- * @ptr: Where parse begins
- * @retptr: (output) Optional pointer to next char after parse completes
- *
- * Parses a string into a number. The number stored at @ptr is
- * potentially suffixed with K, M, G, T, P, E.
- */
-static unsigned long long _memparse(const char *ptr, char **retptr)
+char *skip_spaces(const char *str)
{
- char *endptr; /* Local pointer to end of parsed string */
-
- unsigned long long ret = simple_strtoull(ptr, &endptr, 0);
-
- switch (*endptr) {
- case 'E':
- case 'e':
- ret <<= 10;
- case 'P':
- case 'p':
- ret <<= 10;
- case 'T':
- case 't':
- ret <<= 10;
- case 'G':
- case 'g':
- ret <<= 10;
- case 'M':
- case 'm':
- ret <<= 10;
- case 'K':
- case 'k':
- ret <<= 10;
- endptr++;
- default:
- break;
- }
-
- if (retptr)
- *retptr = endptr;
-
- return ret;
+ while (isspace(*str))
+ ++str;
+ return (char *)str;
}
+#include "../../../../lib/ctype.c"
+#include "../../../../lib/cmdline.c"
static int
parse_memmap(char *p, unsigned long long *start, unsigned long long *size)
@@ -142,40 +138,41 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size)
return -EINVAL;
oldp = p;
- *size = _memparse(p, &p);
+ *size = memparse(p, &p);
if (p == oldp)
return -EINVAL;
switch (*p) {
- case '@':
- /* Skip this region, usable */
- *start = 0;
- *size = 0;
- return 0;
case '#':
case '$':
case '!':
- *start = _memparse(p + 1, &p);
+ *start = memparse(p + 1, &p);
+ return 0;
+ case '@':
+ /* memmap=nn@ss specifies usable region, should be skipped */
+ *size = 0;
+ /* Fall through */
+ default:
+ /*
+ * If w/o offset, only size specified, memmap=nn[KMG] has the
+ * same behaviour as mem=nn[KMG]. It limits the max address
+ * system can use. Region above the limit should be avoided.
+ */
+ *start = 0;
return 0;
}
return -EINVAL;
}
-static void mem_avoid_memmap(void)
+static void mem_avoid_memmap(char *str)
{
- char arg[128];
+ static int i;
int rc;
- int i;
- char *str;
- /* See if we have any memmap areas */
- rc = cmdline_find_option("memmap", arg, sizeof(arg));
- if (rc <= 0)
+ if (i >= MAX_MEMMAP_REGIONS)
return;
- i = 0;
- str = arg;
while (str && (i < MAX_MEMMAP_REGIONS)) {
int rc;
unsigned long long start, size;
@@ -188,9 +185,14 @@ static void mem_avoid_memmap(void)
if (rc < 0)
break;
str = k;
- /* A usable region that should not be skipped */
- if (size == 0)
+
+ if (start == 0) {
+ /* Store the specified memory limit if size > 0 */
+ if (size > 0)
+ mem_limit = size;
+
continue;
+ }
mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].start = start;
mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].size = size;
@@ -202,6 +204,57 @@ static void mem_avoid_memmap(void)
memmap_too_large = true;
}
+static int handle_mem_memmap(void)
+{
+ char *args = (char *)get_cmd_line_ptr();
+ size_t len = strlen((char *)args);
+ char *tmp_cmdline;
+ char *param, *val;
+ u64 mem_size;
+
+ if (!strstr(args, "memmap=") && !strstr(args, "mem="))
+ return 0;
+
+ tmp_cmdline = malloc(len + 1);
+ if (!tmp_cmdline )
+ error("Failed to allocate space for tmp_cmdline");
+
+ memcpy(tmp_cmdline, args, len);
+ tmp_cmdline[len] = 0;
+ args = tmp_cmdline;
+
+ /* Chew leading spaces */
+ args = skip_spaces(args);
+
+ while (*args) {
+ args = next_arg(args, &param, &val);
+ /* Stop at -- */
+ if (!val && strcmp(param, "--") == 0) {
+ warn("Only '--' specified in cmdline");
+ free(tmp_cmdline);
+ return -1;
+ }
+
+ if (!strcmp(param, "memmap")) {
+ mem_avoid_memmap(val);
+ } else if (!strcmp(param, "mem")) {
+ char *p = val;
+
+ if (!strcmp(p, "nopentium"))
+ continue;
+ mem_size = memparse(p, &p);
+ if (mem_size == 0) {
+ free(tmp_cmdline);
+ return -EINVAL;
+ }
+ mem_limit = mem_size;
+ }
+ }
+
+ free(tmp_cmdline);
+ return 0;
+}
+
/*
* In theory, KASLR can put the kernel anywhere in the range of [16M, 64T).
* The mem_avoid array is used to store the ranges that need to be avoided
@@ -323,7 +376,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
/* We don't need to set a mapping for setup_data. */
/* Mark the memmap regions we need to avoid */
- mem_avoid_memmap();
+ handle_mem_memmap();
#ifdef CONFIG_X86_VERBOSE_BOOTUP
/* Make sure video RAM can be used. */
@@ -432,7 +485,8 @@ static void process_e820_entry(struct boot_e820_entry *entry,
{
struct mem_vector region, overlap;
struct slot_area slot_area;
- unsigned long start_orig;
+ unsigned long start_orig, end;
+ struct boot_e820_entry cur_entry;
/* Skip non-RAM entries. */
if (entry->type != E820_TYPE_RAM)
@@ -446,8 +500,15 @@ static void process_e820_entry(struct boot_e820_entry *entry,
if (entry->addr + entry->size < minimum)
return;
- region.start = entry->addr;
- region.size = entry->size;
+ /* Ignore entries above memory limit */
+ end = min(entry->size + entry->addr, mem_limit);
+ if (entry->addr >= end)
+ return;
+ cur_entry.addr = entry->addr;
+ cur_entry.size = end - entry->addr;
+
+ region.start = cur_entry.addr;
+ region.size = cur_entry.size;
/* Give up if slot area array is full. */
while (slot_area_index < MAX_SLOT_AREA) {
@@ -461,7 +522,7 @@ static void process_e820_entry(struct boot_e820_entry *entry,
region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
/* Did we raise the address above this e820 region? */
- if (region.start > entry->addr + entry->size)
+ if (region.start > cur_entry.addr + cur_entry.size)
return;
/* Reduce size by any delta from the original address. */
@@ -564,9 +625,6 @@ void choose_random_location(unsigned long input,
{
unsigned long random_addr, min_addr;
- /* By default, keep output position unchanged. */
- *virt_addr = *output;
-
if (cmdline_find_option_bool("nokaslr")) {
warn("KASLR disabled: 'nokaslr' on cmdline.");
return;
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index b3c5a5f030ce..00241c815524 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -338,7 +338,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
unsigned long output_len)
{
const unsigned long kernel_total_size = VO__end - VO__text;
- unsigned long virt_addr = (unsigned long)output;
+ unsigned long virt_addr = LOAD_PHYSICAL_ADDR;
/* Retain x86 boot parameters pointer passed from startup_32/64. */
boot_params = rmode;
@@ -390,6 +390,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
#ifdef CONFIG_X86_64
if (heap > 0x3fffffffffffUL)
error("Destination address too large");
+ if (virt_addr + max(output_len, kernel_total_size) > KERNEL_IMAGE_SIZE)
+ error("Destination virtual address is beyond the kernel mapping area");
#else
if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff))
error("Destination address too large");
@@ -397,7 +399,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
#ifndef CONFIG_RELOCATABLE
if ((unsigned long)output != LOAD_PHYSICAL_ADDR)
error("Destination address does not match LOAD_PHYSICAL_ADDR");
- if ((unsigned long)output != virt_addr)
+ if (virt_addr != LOAD_PHYSICAL_ADDR)
error("Destination virtual address changed when not relocatable");
#endif
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 1c8355eadbd1..766a5211f827 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -81,8 +81,6 @@ static inline void choose_random_location(unsigned long input,
unsigned long output_size,
unsigned long *virt_addr)
{
- /* No change from existing output location. */
- *virt_addr = *output;
}
#endif
diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index 1d78f1739087..28029be47fbb 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -63,7 +63,7 @@ static void *alloc_pgt_page(void *context)
static struct alloc_pgt_data pgt_data;
/* The top level page table entry pointer. */
-static unsigned long level4p;
+static unsigned long top_level_pgt;
/*
* Mapping information structure passed to kernel_ident_mapping_init().
@@ -91,9 +91,15 @@ void initialize_identity_maps(void)
* If we came here via startup_32(), cr3 will be _pgtable already
* and we must append to the existing area instead of entirely
* overwriting it.
+ *
+ * With 5-level paging, we use '_pgtable' to allocate the p4d page table,
+ * the top-level page table is allocated separately.
+ *
+ * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level
+ * cases. On 4-level paging it's equal to 'top_level_pgt'.
*/
- level4p = read_cr3();
- if (level4p == (unsigned long)_pgtable) {
+ top_level_pgt = read_cr3_pa();
+ if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
debug_putstr("booted via startup_32()\n");
pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
@@ -103,7 +109,7 @@ void initialize_identity_maps(void)
pgt_data.pgt_buf = _pgtable;
pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
- level4p = (unsigned long)alloc_pgt_page(&pgt_data);
+ top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
}
}
@@ -123,7 +129,7 @@ void add_identity_map(unsigned long start, unsigned long size)
return;
/* Build the mapping. */
- kernel_ident_mapping_init(&mapping_info, (pgd_t *)level4p,
+ kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt,
start, end);
}
@@ -134,5 +140,5 @@ void add_identity_map(unsigned long start, unsigned long size)
*/
void finalize_identity_maps(void)
{
- write_cr3(level4p);
+ write_cr3(top_level_pgt);
}
diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S
index 1eb7d298b47d..15d9f74b0008 100644
--- a/arch/x86/boot/copy.S
+++ b/arch/x86/boot/copy.S
@@ -65,23 +65,3 @@ GLOBAL(copy_to_fs)
popw %es
retl
ENDPROC(copy_to_fs)
-
-#if 0 /* Not currently used, but can be enabled as needed */
-GLOBAL(copy_from_gs)
- pushw %ds
- pushw %gs
- popw %ds
- calll memcpy
- popw %ds
- retl
-ENDPROC(copy_from_gs)
-
-GLOBAL(copy_to_gs)
- pushw %es
- pushw %gs
- popw %es
- calll memcpy
- popw %es
- retl
-ENDPROC(copy_to_gs)
-#endif
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 5457b02fc050..630e3664906b 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -122,6 +122,14 @@ unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int bas
return result;
}
+long simple_strtol(const char *cp, char **endp, unsigned int base)
+{
+ if (*cp == '-')
+ return -simple_strtoull(cp + 1, endp, base);
+
+ return simple_strtoull(cp, endp, base);
+}
+
/**
* strlen - Find the length of a string
* @s: The string to be sized
diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h
index 113588ddb43f..f274a50db5fa 100644
--- a/arch/x86/boot/string.h
+++ b/arch/x86/boot/string.h
@@ -22,6 +22,7 @@ extern int strcmp(const char *str1, const char *str2);
extern int strncmp(const char *cs, const char *ct, size_t count);
extern size_t strlen(const char *s);
extern char *strstr(const char *s1, const char *s2);
+extern char *strchr(const char *s, int c);
extern size_t strnlen(const char *s, size_t maxlen);
extern unsigned int atou(const char *s);
extern unsigned long long simple_strtoull(const char *cp, char **endp,
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 34b3fa2889d1..9e32d40d71bd 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,6 +2,8 @@
# Arch-specific CryptoAPI modules.
#
+OBJECT_FILES_NON_STANDARD := y
+
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
diff --git a/arch/x86/crypto/sha1-mb/Makefile b/arch/x86/crypto/sha1-mb/Makefile
index 2f8756375df5..2e14acc3da25 100644
--- a/arch/x86/crypto/sha1-mb/Makefile
+++ b/arch/x86/crypto/sha1-mb/Makefile
@@ -2,6 +2,8 @@
# Arch-specific CryptoAPI modules.
#
+OBJECT_FILES_NON_STANDARD := y
+
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
ifeq ($(avx2_supported),yes)
diff --git a/arch/x86/crypto/sha256-mb/Makefile b/arch/x86/crypto/sha256-mb/Makefile
index 41089e7c400c..45b4fca6c4a8 100644
--- a/arch/x86/crypto/sha256-mb/Makefile
+++ b/arch/x86/crypto/sha256-mb/Makefile
@@ -2,6 +2,8 @@
# Arch-specific CryptoAPI modules.
#
+OBJECT_FILES_NON_STANDARD := y
+
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
ifeq ($(avx2_supported),yes)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 4a4c0834f965..a9a8027a6c0e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -265,7 +265,8 @@ return_from_SYSCALL_64:
* If width of "canonical tail" ever becomes variable, this will need
* to be updated to remain correct on both old and new CPUs.
*
- * Change top 16 bits to be the sign-extension of 47th bit
+ * Change top bits to match most significant bit (47th or 56th bit
+ * depending on paging mode) in the address.
*/
shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 580b60f5ac83..ff1ea2fb9705 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1750,6 +1750,8 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
return ret;
}
+static struct attribute_group x86_pmu_attr_group;
+
static int __init init_hw_perf_events(void)
{
struct x86_pmu_quirk *quirk;
@@ -1813,6 +1815,14 @@ static int __init init_hw_perf_events(void)
x86_pmu_events_group.attrs = tmp;
}
+ if (x86_pmu.attrs) {
+ struct attribute **tmp;
+
+ tmp = merge_attr(x86_pmu_attr_group.attrs, x86_pmu.attrs);
+ if (!WARN_ON(!tmp))
+ x86_pmu_attr_group.attrs = tmp;
+ }
+
pr_info("... version: %d\n", x86_pmu.version);
pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
pr_info("... generic registers: %d\n", x86_pmu.num_counters);
@@ -2101,8 +2111,7 @@ static int x86_pmu_event_init(struct perf_event *event)
static void refresh_pce(void *ignored)
{
- if (current->active_mm)
- load_mm_cr4(current->active_mm);
+ load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm));
}
static void x86_pmu_event_mapped(struct perf_event *event)
@@ -2224,7 +2233,6 @@ void perf_check_microcode(void)
if (x86_pmu.check_microcode)
x86_pmu.check_microcode();
}
-EXPORT_SYMBOL_GPL(perf_check_microcode);
static struct pmu pmu = {
.pmu_enable = x86_pmu_enable,
@@ -2255,7 +2263,7 @@ static struct pmu pmu = {
void arch_perf_update_userpage(struct perf_event *event,
struct perf_event_mmap_page *userpg, u64 now)
{
- struct cyc2ns_data *data;
+ struct cyc2ns_data data;
u64 offset;
userpg->cap_user_time = 0;
@@ -2267,17 +2275,17 @@ void arch_perf_update_userpage(struct perf_event *event,
if (!using_native_sched_clock() || !sched_clock_stable())
return;
- data = cyc2ns_read_begin();
+ cyc2ns_read_begin(&data);
- offset = data->cyc2ns_offset + __sched_clock_offset;
+ offset = data.cyc2ns_offset + __sched_clock_offset;
/*
* Internal timekeeping for enabled/running/stopped times
* is always in the local_clock domain.
*/
userpg->cap_user_time = 1;
- userpg->time_mult = data->cyc2ns_mul;
- userpg->time_shift = data->cyc2ns_shift;
+ userpg->time_mult = data.cyc2ns_mul;
+ userpg->time_shift = data.cyc2ns_shift;
userpg->time_offset = offset - now;
/*
@@ -2289,7 +2297,7 @@ void arch_perf_update_userpage(struct perf_event *event,
userpg->time_zero = offset;
}
- cyc2ns_read_end(data);
+ cyc2ns_read_end();
}
void
@@ -2334,7 +2342,7 @@ static unsigned long get_segment_base(unsigned int segment)
/* IRQs are off, so this synchronizes with smp_store_release */
ldt = lockless_dereference(current->active_mm->context.ldt);
- if (!ldt || idx > ldt->size)
+ if (!ldt || idx > ldt->nr_entries)
return 0;
desc = &ldt->entries[idx];
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index a6d91d4e37a1..aa62437d1aa1 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -431,11 +431,11 @@ static __initconst const u64 skl_hw_cache_event_ids
[ C(DTLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */
- [ C(RESULT_MISS) ] = 0x608, /* DTLB_LOAD_MISSES.WALK_COMPLETED */
+ [ C(RESULT_MISS) ] = 0xe08, /* DTLB_LOAD_MISSES.WALK_COMPLETED */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */
- [ C(RESULT_MISS) ] = 0x649, /* DTLB_STORE_MISSES.WALK_COMPLETED */
+ [ C(RESULT_MISS) ] = 0xe49, /* DTLB_STORE_MISSES.WALK_COMPLETED */
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0x0,
@@ -3160,6 +3160,19 @@ err:
return -ENOMEM;
}
+static void flip_smm_bit(void *data)
+{
+ unsigned long set = *(unsigned long *)data;
+
+ if (set > 0) {
+ msr_set_bit(MSR_IA32_DEBUGCTLMSR,
+ DEBUGCTLMSR_FREEZE_IN_SMM_BIT);
+ } else {
+ msr_clear_bit(MSR_IA32_DEBUGCTLMSR,
+ DEBUGCTLMSR_FREEZE_IN_SMM_BIT);
+ }
+}
+
static void intel_pmu_cpu_starting(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
@@ -3174,6 +3187,8 @@ static void intel_pmu_cpu_starting(int cpu)
cpuc->lbr_sel = NULL;
+ flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
+
if (!cpuc->shared_regs)
return;
@@ -3410,12 +3425,10 @@ static void intel_snb_check_microcode(void)
int pebs_broken = 0;
int cpu;
- get_online_cpus();
for_each_online_cpu(cpu) {
if ((pebs_broken = intel_snb_pebs_broken(cpu)))
break;
}
- put_online_cpus();
if (pebs_broken == x86_pmu.pebs_broken)
return;
@@ -3488,7 +3501,9 @@ static bool check_msr(unsigned long msr, u64 mask)
static __init void intel_sandybridge_quirk(void)
{
x86_pmu.check_microcode = intel_snb_check_microcode;
+ cpus_read_lock();
intel_snb_check_microcode();
+ cpus_read_unlock();
}
static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
@@ -3595,6 +3610,52 @@ static struct attribute *hsw_events_attrs[] = {
NULL
};
+static ssize_t freeze_on_smi_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%lu\n", x86_pmu.attr_freeze_on_smi);
+}
+
+static DEFINE_MUTEX(freeze_on_smi_mutex);
+
+static ssize_t freeze_on_smi_store(struct device *cdev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long val;
+ ssize_t ret;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val > 1)
+ return -EINVAL;
+
+ mutex_lock(&freeze_on_smi_mutex);
+
+ if (x86_pmu.attr_freeze_on_smi == val)
+ goto done;
+
+ x86_pmu.attr_freeze_on_smi = val;
+
+ get_online_cpus();
+ on_each_cpu(flip_smm_bit, &val, 1);
+ put_online_cpus();
+done:
+ mutex_unlock(&freeze_on_smi_mutex);
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(freeze_on_smi);
+
+static struct attribute *intel_pmu_attrs[] = {
+ &dev_attr_freeze_on_smi.attr,
+ NULL,
+};
+
__init int intel_pmu_init(void)
{
union cpuid10_edx edx;
@@ -3641,6 +3702,8 @@ __init int intel_pmu_init(void)
x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
+
+ x86_pmu.attrs = intel_pmu_attrs;
/*
* Quirk: v2 perfmon does not report fixed-purpose events, so
* assume at least 3 events, when not running in a hypervisor:
@@ -4112,13 +4175,12 @@ static __init int fixup_ht_bug(void)
lockup_detector_resume();
- get_online_cpus();
+ cpus_read_lock();
- for_each_online_cpu(c) {
+ for_each_online_cpu(c)
free_excl_cntrs(c);
- }
- put_online_cpus();
+ cpus_read_unlock();
pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
return 0;
}
diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
index 8c00dc09a5d2..2521f771f2f5 100644
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -1682,7 +1682,7 @@ static int __init intel_cqm_init(void)
*
* Also, check that the scales match on all cpus.
*/
- get_online_cpus();
+ cpus_read_lock();
for_each_online_cpu(cpu) {
struct cpuinfo_x86 *c = &cpu_data(cpu);
@@ -1746,14 +1746,14 @@ static int __init intel_cqm_init(void)
* Setup the hot cpu notifier once we are sure cqm
* is enabled to avoid notifier leak.
*/
- cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_STARTING,
- "perf/x86/cqm:starting",
- intel_cqm_cpu_starting, NULL);
- cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_ONLINE, "perf/x86/cqm:online",
- NULL, intel_cqm_cpu_exit);
-
+ cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_STARTING,
+ "perf/x86/cqm:starting",
+ intel_cqm_cpu_starting, NULL);
+ cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_ONLINE,
+ "perf/x86/cqm:online",
+ NULL, intel_cqm_cpu_exit);
out:
- put_online_cpus();
+ cpus_read_unlock();
if (ret) {
kfree(str);
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index f924629836a8..eb261656a320 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -18,7 +18,7 @@ enum {
LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME,
};
-static enum {
+static const enum {
LBR_EIP_FLAGS = 1,
LBR_TSX = 2,
} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = {
@@ -287,7 +287,7 @@ inline u64 lbr_from_signext_quirk_wr(u64 val)
/*
* If quirk is needed, ensure sign extension is 61 bits:
*/
-u64 lbr_from_signext_quirk_rd(u64 val)
+static u64 lbr_from_signext_quirk_rd(u64 val)
{
if (static_branch_unlikely(&lbr_from_quirk_key)) {
/*
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 758c1aa5009d..44ec523287f6 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1170,7 +1170,7 @@ static int uncore_event_cpu_online(unsigned int cpu)
pmu = type->pmus;
for (i = 0; i < type->num_boxes; i++, pmu++) {
box = pmu->boxes[pkg];
- if (!box && atomic_inc_return(&box->refcnt) == 1)
+ if (box && atomic_inc_return(&box->refcnt) == 1)
uncore_box_init(box);
}
}
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index be3d36254040..53728eea1bed 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -562,6 +562,9 @@ struct x86_pmu {
ssize_t (*events_sysfs_show)(char *page, u64 config);
struct attribute **cpu_events;
+ unsigned long attr_freeze_on_smi;
+ struct attribute **attrs;
+
/*
* CPU Hotplug hooks
*/
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 00c88a01301d..da181ad1d5f8 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -3,6 +3,7 @@
#include <linux/ioport.h>
#include <linux/pci.h>
+#include <linux/refcount.h>
struct amd_nb_bus_dev_range {
u8 bus;
@@ -55,7 +56,7 @@ struct threshold_bank {
struct threshold_block *blocks;
/* initialized to the number of CPUs on the node sharing this bank */
- atomic_t cpus;
+ refcount_t cpus;
};
struct amd_northbridge {
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index bdffcd9eab2b..5f01671c68f2 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -252,6 +252,8 @@ static inline int x2apic_enabled(void) { return 0; }
#define x2apic_supported() (0)
#endif /* !CONFIG_X86_X2APIC */
+struct irq_data;
+
/*
* Copyright 2004 James Cleverdon, IBM.
* Subject to the GNU Public License, v.2
@@ -296,9 +298,9 @@ struct apic {
/* Can't be NULL on 64-bit */
unsigned long (*set_apic_id)(unsigned int id);
- int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
- const struct cpumask *andmask,
- unsigned int *apicid);
+ int (*cpu_mask_to_apicid)(const struct cpumask *cpumask,
+ struct irq_data *irqdata,
+ unsigned int *apicid);
/* ipi */
void (*send_IPI)(int cpu, int vector);
@@ -540,28 +542,12 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
#endif
-static inline int
-flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask,
- unsigned int *apicid)
-{
- unsigned long cpu_mask = cpumask_bits(cpumask)[0] &
- cpumask_bits(andmask)[0] &
- cpumask_bits(cpu_online_mask)[0] &
- APIC_ALL_CPUS;
-
- if (likely(cpu_mask)) {
- *apicid = (unsigned int)cpu_mask;
- return 0;
- } else {
- return -EINVAL;
- }
-}
-
-extern int
-default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask,
- unsigned int *apicid);
+extern int flat_cpu_mask_to_apicid(const struct cpumask *cpumask,
+ struct irq_data *irqdata,
+ unsigned int *apicid);
+extern int default_cpu_mask_to_apicid(const struct cpumask *cpumask,
+ struct irq_data *irqdata,
+ unsigned int *apicid);
static inline void
flat_vector_allocation_domain(int cpu, struct cpumask *retmask,
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index caa5798c92f4..33380b871463 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -246,19 +246,6 @@ static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
return c;
}
-/**
- * atomic_inc_short - increment of a short integer
- * @v: pointer to type int
- *
- * Atomically adds 1 to @v
- * Returns the new value of @u
- */
-static __always_inline short int atomic_inc_short(short int *v)
-{
- asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v));
- return *v;
-}
-
#ifdef CONFIG_X86_32
# include <asm/atomic64_32.h>
#else
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 2f77bcefe6b4..d2ff779f347e 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -74,7 +74,7 @@ struct efi_scratch {
__kernel_fpu_begin(); \
\
if (efi_scratch.use_pgd) { \
- efi_scratch.prev_cr3 = read_cr3(); \
+ efi_scratch.prev_cr3 = __read_cr3(); \
write_cr3((unsigned long)efi_scratch.efi_pgt); \
__flush_tlb_all(); \
} \
diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h
index b8ad261d11dc..c66d19e3c23e 100644
--- a/arch/x86/include/asm/extable.h
+++ b/arch/x86/include/asm/extable.h
@@ -29,6 +29,7 @@ struct pt_regs;
} while (0)
extern int fixup_exception(struct pt_regs *regs, int trapnr);
+extern int fixup_bug(struct pt_regs *regs, int trapnr);
extern bool ex_has_fault_handler(unsigned long ip);
extern void early_fixup_exception(struct pt_regs *regs, int trapnr);
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 59405a248fc2..9b76cd331990 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -22,8 +22,8 @@ typedef struct {
#ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
- unsigned int irq_tlb_count;
#endif
+ unsigned int irq_tlb_count;
#ifdef CONFIG_X86_THERMAL_VECTOR
unsigned int irq_thermal_count;
#endif
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 16d3fa211962..668cca540025 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -29,7 +29,6 @@ struct irq_desc;
#include <linux/cpumask.h>
extern int check_irq_vectors_for_cpu_disable(void);
extern void fixup_irqs(void);
-extern void irq_force_complete_move(struct irq_desc *desc);
#endif
#ifdef CONFIG_HAVE_KVM
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index a210eba2727c..023b4a9fc846 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -55,7 +55,8 @@ extern struct irq_domain *
irq_remapping_get_irq_domain(struct irq_alloc_info *info);
/* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */
-extern struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent);
+extern struct irq_domain *
+arch_create_remap_msi_irq_domain(struct irq_domain *par, const char *n, int id);
/* Get parent irqdomain for interrupt remapping irqdomain */
static inline struct irq_domain *arch_get_ir_parent_domain(void)
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 055962615779..722d0e568863 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -296,6 +296,7 @@ struct x86_emulate_ctxt {
bool perm_ok; /* do not check permissions if true */
bool ud; /* inject an #UD if host doesn't support insn */
+ bool tf; /* TF value before instruction (after for syscall/sysret) */
bool have_exception;
struct x86_exception exception;
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 3f9a3d2a5209..181264989db5 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -285,10 +285,6 @@ int mce_notify_irq(void);
DECLARE_PER_CPU(struct mce, injectm);
-extern void register_mce_write_callback(ssize_t (*)(struct file *filp,
- const char __user *ubuf,
- size_t usize, loff_t *off));
-
/* Disable CMCI/polling for MCA bank claimed by firmware */
extern void mce_disable_bank(int bank);
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index f9813b6d8b80..79b647a7ebd0 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -37,12 +37,6 @@ typedef struct {
#endif
} mm_context_t;
-#ifdef CONFIG_SMP
void leave_mm(int cpu);
-#else
-static inline void leave_mm(int cpu)
-{
-}
-#endif
#endif /* _ASM_X86_MMU_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 68b329d77b3a..ecfcb6643c9b 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -47,7 +47,7 @@ struct ldt_struct {
* allocations, but it's not worth trying to optimize.
*/
struct desc_struct *entries;
- unsigned int size;
+ unsigned int nr_entries;
};
/*
@@ -87,22 +87,46 @@ static inline void load_mm_ldt(struct mm_struct *mm)
*/
if (unlikely(ldt))
- set_ldt(ldt->entries, ldt->size);
+ set_ldt(ldt->entries, ldt->nr_entries);
else
clear_LDT();
#else
clear_LDT();
#endif
+}
+
+static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
+{
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ /*
+ * Load the LDT if either the old or new mm had an LDT.
+ *
+ * An mm will never go from having an LDT to not having an LDT. Two
+ * mms never share an LDT, so we don't gain anything by checking to
+ * see whether the LDT changed. There's also no guarantee that
+ * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL,
+ * then prev->context.ldt will also be non-NULL.
+ *
+ * If we really cared, we could optimize the case where prev == next
+ * and we're exiting lazy mode. Most of the time, if this happens,
+ * we don't actually need to reload LDTR, but modify_ldt() is mostly
+ * used by legacy code and emulators where we don't need this level of
+ * performance.
+ *
+ * This uses | instead of || because it generates better code.
+ */
+ if (unlikely((unsigned long)prev->context.ldt |
+ (unsigned long)next->context.ldt))
+ load_mm_ldt(next);
+#endif
DEBUG_LOCKS_WARN_ON(preemptible());
}
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
-#ifdef CONFIG_SMP
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
-#endif
}
static inline int init_new_context(struct task_struct *tsk,
@@ -220,18 +244,6 @@ static inline int vma_pkey(struct vm_area_struct *vma)
}
#endif
-static inline bool __pkru_allows_pkey(u16 pkey, bool write)
-{
- u32 pkru = read_pkru();
-
- if (!__pkru_allows_read(pkru, pkey))
- return false;
- if (write && !__pkru_allows_write(pkru, pkey))
- return false;
-
- return true;
-}
-
/*
* We only want to enforce protection keys on the current process
* because we effectively have no access to PKRU for other
@@ -268,4 +280,23 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
return __pkru_allows_pkey(vma_pkey(vma), write);
}
+
+/*
+ * This can be used from process context to figure out what the value of
+ * CR3 is without needing to do a (slow) __read_cr3().
+ *
+ * It's intended to be used for code like KVM that sneakily changes CR3
+ * and needs to restore it. It needs to be used very carefully.
+ */
+static inline unsigned long __get_current_cr3_fast(void)
+{
+ unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
+
+ /* For now, be very restrictive about when this can be called. */
+ VM_WARN_ON(in_nmi() || !in_atomic());
+
+ VM_BUG_ON(cr3 != __read_cr3());
+ return cr3;
+}
+
#endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index fba100713924..2b58c8c1eeaa 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -2,8 +2,7 @@
#define _ASM_X86_MSHYPER_H
#include <linux/types.h>
-#include <linux/interrupt.h>
-#include <linux/clocksource.h>
+#include <linux/atomic.h>
#include <asm/hyperv.h>
/*
@@ -137,7 +136,6 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type)
}
}
-#define hv_get_current_tick(tick) rdmsrl(HV_X64_MSR_TIME_REF_COUNT, tick)
#define hv_init_timer(timer, tick) wrmsrl(timer, tick)
#define hv_init_timer_config(config, val) wrmsrl(config, val)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 673f9ac50f6d..d406894cd9a2 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -137,6 +137,8 @@
#define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9)
#define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10)
#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11)
+#define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14
+#define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
#define MSR_PEBS_FRONTEND 0x000003f7
@@ -249,9 +251,13 @@
#define HWP_MIN_PERF(x) (x & 0xff)
#define HWP_MAX_PERF(x) ((x & 0xff) << 8)
#define HWP_DESIRED_PERF(x) ((x & 0xff) << 16)
-#define HWP_ENERGY_PERF_PREFERENCE(x) ((x & 0xff) << 24)
-#define HWP_ACTIVITY_WINDOW(x) ((x & 0xff3) << 32)
-#define HWP_PACKAGE_CONTROL(x) ((x & 0x1) << 42)
+#define HWP_ENERGY_PERF_PREFERENCE(x) (((unsigned long long) x & 0xff) << 24)
+#define HWP_EPP_PERFORMANCE 0x00
+#define HWP_EPP_BALANCE_PERFORMANCE 0x80
+#define HWP_EPP_BALANCE_POWERSAVE 0xC0
+#define HWP_EPP_POWERSAVE 0xFF
+#define HWP_ACTIVITY_WINDOW(x) ((unsigned long long)(x & 0xff3) << 32)
+#define HWP_PACKAGE_CONTROL(x) ((unsigned long long)(x & 0x1) << 42)
/* IA32_HWP_STATUS */
#define HWP_GUARANTEED_CHANGE(x) (x & 0x1)
@@ -474,9 +480,11 @@
#define MSR_MISC_PWR_MGMT 0x000001aa
#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0
-#define ENERGY_PERF_BIAS_PERFORMANCE 0
-#define ENERGY_PERF_BIAS_NORMAL 6
-#define ENERGY_PERF_BIAS_POWERSAVE 15
+#define ENERGY_PERF_BIAS_PERFORMANCE 0
+#define ENERGY_PERF_BIAS_BALANCE_PERFORMANCE 4
+#define ENERGY_PERF_BIAS_NORMAL 6
+#define ENERGY_PERF_BIAS_BALANCE_POWERSAVE 8
+#define ENERGY_PERF_BIAS_POWERSAVE 15
#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 55fa56fe4e45..9ccac1926587 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -61,7 +61,7 @@ static inline void write_cr2(unsigned long x)
PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
}
-static inline unsigned long read_cr3(void)
+static inline unsigned long __read_cr3(void)
{
return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
}
@@ -118,7 +118,7 @@ static inline u64 paravirt_read_msr(unsigned msr)
static inline void paravirt_write_msr(unsigned msr,
unsigned low, unsigned high)
{
- return PVOP_VCALL3(pv_cpu_ops.write_msr, msr, low, high);
+ PVOP_VCALL3(pv_cpu_ops.write_msr, msr, low, high);
}
static inline u64 paravirt_read_msr_safe(unsigned msr, int *err)
@@ -312,11 +312,9 @@ static inline void __flush_tlb_single(unsigned long addr)
}
static inline void flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end)
+ const struct flush_tlb_info *info)
{
- PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end);
+ PVOP_VCALL2(pv_mmu_ops.flush_tlb_others, cpumask, info);
}
static inline int paravirt_pgd_alloc(struct mm_struct *mm)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 7465d6fe336f..cb976bab6299 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -51,6 +51,7 @@ struct mm_struct;
struct desc_struct;
struct task_struct;
struct cpumask;
+struct flush_tlb_info;
/*
* Wrapper type for pointers to code which uses the non-standard
@@ -223,9 +224,7 @@ struct pv_mmu_ops {
void (*flush_tlb_kernel)(void);
void (*flush_tlb_single)(unsigned long addr);
void (*flush_tlb_others)(const struct cpumask *cpus,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end);
+ const struct flush_tlb_info *info);
/* Hooks for allocating and freeing a pagetable top-level */
int (*pgd_alloc)(struct mm_struct *mm);
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index f513cc231151..473a7295ab10 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -77,14 +77,8 @@ static inline bool is_vmd(struct pci_bus *bus)
extern unsigned int pcibios_assign_all_busses(void);
extern int pci_legacy_init(void);
-# ifdef CONFIG_ACPI
-# define x86_default_pci_init pci_acpi_init
-# else
-# define x86_default_pci_init pci_legacy_init
-# endif
#else
-# define pcibios_assign_all_busses() 0
-# define x86_default_pci_init NULL
+static inline int pcibios_assign_all_busses(void) { return 0; }
#endif
extern unsigned long pci_mem_start;
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 50d35e3185f5..c8821bab938f 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -212,4 +212,51 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
+#define gup_get_pte gup_get_pte
+/*
+ * WARNING: only to be used in the get_user_pages_fast() implementation.
+ *
+ * With get_user_pages_fast(), we walk down the pagetables without taking
+ * any locks. For this we would like to load the pointers atomically,
+ * but that is not possible (without expensive cmpxchg8b) on PAE. What
+ * we do have is the guarantee that a PTE will only either go from not
+ * present to present, or present to not present or both -- it will not
+ * switch to a completely different present page without a TLB flush in
+ * between; something that we are blocking by holding interrupts off.
+ *
+ * Setting ptes from not present to present goes:
+ *
+ * ptep->pte_high = h;
+ * smp_wmb();
+ * ptep->pte_low = l;
+ *
+ * And present to not present goes:
+ *
+ * ptep->pte_low = 0;
+ * smp_wmb();
+ * ptep->pte_high = 0;
+ *
+ * We must ensure here that the load of pte_low sees 'l' iff pte_high
+ * sees 'h'. We load pte_high *after* loading pte_low, which ensures we
+ * don't see an older value of pte_high. *Then* we recheck pte_low,
+ * which ensures that we haven't picked up a changed pte high. We might
+ * have gotten rubbish values from pte_low and pte_high, but we are
+ * guaranteed that pte_low will not have the present bit set *unless*
+ * it is 'l'. Because get_user_pages_fast() only operates on present ptes
+ * we're safe.
+ */
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+ pte_t pte;
+
+ do {
+ pte.pte_low = ptep->pte_low;
+ smp_rmb();
+ pte.pte_high = ptep->pte_high;
+ smp_rmb();
+ } while (unlikely(pte.pte_low != ptep->pte_low));
+
+ return pte;
+}
+
#endif /* _ASM_X86_PGTABLE_3LEVEL_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index f5af95a0c6b8..77037b6f1caa 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -244,6 +244,11 @@ static inline int pud_devmap(pud_t pud)
return 0;
}
#endif
+
+static inline int pgd_devmap(pgd_t pgd)
+{
+ return 0;
+}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -917,7 +922,7 @@ extern pgd_t trampoline_pgd_entry;
static inline void __meminit init_trampoline_default(void)
{
/* Default trampoline pgd value */
- trampoline_pgd_entry = init_level4_pgt[pgd_index(__PAGE_OFFSET)];
+ trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
}
# ifdef CONFIG_RANDOMIZE_MEMORY
void __meminit init_trampoline(void);
@@ -1185,6 +1190,54 @@ static inline u16 pte_flags_pkey(unsigned long pte_flags)
#endif
}
+static inline bool __pkru_allows_pkey(u16 pkey, bool write)
+{
+ u32 pkru = read_pkru();
+
+ if (!__pkru_allows_read(pkru, pkey))
+ return false;
+ if (write && !__pkru_allows_write(pkru, pkey))
+ return false;
+
+ return true;
+}
+
+/*
+ * 'pteval' can come from a PTE, PMD or PUD. We only check
+ * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
+ * same value on all 3 types.
+ */
+static inline bool __pte_access_permitted(unsigned long pteval, bool write)
+{
+ unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
+
+ if (write)
+ need_pte_bits |= _PAGE_RW;
+
+ if ((pteval & need_pte_bits) != need_pte_bits)
+ return 0;
+
+ return __pkru_allows_pkey(pte_flags_pkey(pteval), write);
+}
+
+#define pte_access_permitted pte_access_permitted
+static inline bool pte_access_permitted(pte_t pte, bool write)
+{
+ return __pte_access_permitted(pte_val(pte), write);
+}
+
+#define pmd_access_permitted pmd_access_permitted
+static inline bool pmd_access_permitted(pmd_t pmd, bool write)
+{
+ return __pte_access_permitted(pmd_val(pmd), write);
+}
+
+#define pud_access_permitted pud_access_permitted
+static inline bool pud_access_permitted(pud_t pud, bool write)
+{
+ return __pte_access_permitted(pud_val(pud), write);
+}
+
#include <asm-generic/pgtable.h>
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 9991224f6238..2160c1fee920 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -14,15 +14,17 @@
#include <linux/bitops.h>
#include <linux/threads.h>
+extern p4d_t level4_kernel_pgt[512];
+extern p4d_t level4_ident_pgt[512];
extern pud_t level3_kernel_pgt[512];
extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
extern pmd_t level2_fixmap_pgt[512];
extern pmd_t level2_ident_pgt[512];
extern pte_t level1_fixmap_pgt[512];
-extern pgd_t init_level4_pgt[];
+extern pgd_t init_top_pgt[];
-#define swapper_pg_dir init_level4_pgt
+#define swapper_pg_dir init_top_pgt
extern void paging_init(void);
@@ -227,6 +229,20 @@ extern void cleanup_highmap(void);
extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
-#endif /* !__ASSEMBLY__ */
+#define gup_fast_permitted gup_fast_permitted
+static inline bool gup_fast_permitted(unsigned long start, int nr_pages,
+ int write)
+{
+ unsigned long len, end;
+
+ len = (unsigned long)nr_pages << PAGE_SHIFT;
+ end = start + len;
+ if (end < start)
+ return false;
+ if (end >> __VIRTUAL_MASK_SHIFT)
+ return false;
+ return true;
+}
+#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 39fb618e2211..79aa2f98398d 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -8,4 +8,40 @@
#else
#define X86_VM_MASK 0 /* No VM86 support */
#endif
+
+/*
+ * CR3's layout varies depending on several things.
+ *
+ * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
+ * If PAE is enabled, then CR3[11:5] is part of the PDPT address
+ * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
+ * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
+ * CR3[2:0] and CR3[11:5] are ignored.
+ *
+ * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
+ *
+ * CR3[63] is always read as zero. If CR4.PCIDE is set, then CR3[63] may be
+ * written as 1 to prevent the write to CR3 from flushing the TLB.
+ *
+ * On systems with SME, one bit (in a variable position!) is stolen to indicate
+ * that the top-level paging structure is encrypted.
+ *
+ * All of the remaining bits indicate the physical address of the top-level
+ * paging structure.
+ *
+ * CR3_ADDR_MASK is the mask used by read_cr3_pa().
+ */
+#ifdef CONFIG_X86_64
+/* Mask off the address space ID bits. */
+#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
+#define CR3_PCID_MASK 0xFFFull
+#else
+/*
+ * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
+ * a tiny bit of code size by setting all the bits.
+ */
+#define CR3_ADDR_MASK 0xFFFFFFFFull
+#define CR3_PCID_MASK 0ull
+#endif
+
#endif /* _ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 3cada998a402..6a79547e8ee0 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -231,6 +231,14 @@ native_cpuid_reg(ebx)
native_cpuid_reg(ecx)
native_cpuid_reg(edx)
+/*
+ * Friendlier CR3 helpers.
+ */
+static inline unsigned long read_cr3_pa(void)
+{
+ return __read_cr3() & CR3_ADDR_MASK;
+}
+
static inline void load_cr3(pgd_t *pgdir)
{
write_cr3(__pa(pgdir));
@@ -860,8 +868,6 @@ extern unsigned long KSTK_ESP(struct task_struct *task);
#endif /* CONFIG_X86_64 */
-extern unsigned long thread_saved_pc(struct task_struct *tsk);
-
extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
unsigned long new_sp);
@@ -901,8 +907,13 @@ static inline int mpx_disable_management(void)
}
#endif /* CONFIG_X86_INTEL_MPX */
+#ifdef CONFIG_CPU_SUP_AMD
extern u16 amd_get_nb_id(int cpu);
extern u32 amd_get_nodes_per_socket(void);
+#else
+static inline u16 amd_get_nb_id(int cpu) { return 0; }
+static inline u32 amd_get_nodes_per_socket(void) { return 0; }
+#endif
static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
{
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index ac1d5da14734..e4585a393965 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -44,7 +44,6 @@ extern unsigned long saved_video_mode;
extern void reserve_standard_io_resources(void);
extern void i386_reserve_resources(void);
-extern void setup_default_timer_irq(void);
#ifdef CONFIG_X86_INTEL_MID
extern void x86_intel_mid_early_setup(void);
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 12af3e35edfa..9efaabf5b54b 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -39,7 +39,7 @@ static inline void native_write_cr2(unsigned long val)
asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
}
-static inline unsigned long native_read_cr3(void)
+static inline unsigned long __native_read_cr3(void)
{
unsigned long val;
asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
@@ -159,9 +159,13 @@ static inline void write_cr2(unsigned long x)
native_write_cr2(x);
}
-static inline unsigned long read_cr3(void)
+/*
+ * Careful! CR3 contains more than just an address. You probably want
+ * read_cr3_pa() instead.
+ */
+static inline unsigned long __read_cr3(void)
{
- return native_read_cr3();
+ return __native_read_cr3();
}
static inline void write_cr3(unsigned long x)
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
index 6136a18152af..2bd96b4df140 100644
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -42,8 +42,7 @@ struct saved_context {
set_debugreg((thread)->debugreg##register, register)
/* routines for saving/restoring kernel state */
-extern int acpi_save_state_mem(void);
-extern char core_restore_code;
-extern char restore_registers;
+extern char core_restore_code[];
+extern char restore_registers[];
#endif /* _ASM_X86_SUSPEND_64_H */
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 27e9f9d769b8..2016962103df 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -29,11 +29,9 @@ struct cyc2ns_data {
u32 cyc2ns_mul;
u32 cyc2ns_shift;
u64 cyc2ns_offset;
- u32 __count;
- /* u32 hole */
-}; /* 24 bytes -- do not grow */
+}; /* 16 bytes */
-extern struct cyc2ns_data *cyc2ns_read_begin(void);
-extern void cyc2ns_read_end(struct cyc2ns_data *);
+extern void cyc2ns_read_begin(struct cyc2ns_data *);
+extern void cyc2ns_read_end(void);
#endif /* _ASM_X86_TIMER_H */
diff --git a/arch/x86/include/asm/tlbbatch.h b/arch/x86/include/asm/tlbbatch.h
new file mode 100644
index 000000000000..f4a6ff352a0e
--- /dev/null
+++ b/arch/x86/include/asm/tlbbatch.h
@@ -0,0 +1,14 @@
+#ifndef _ARCH_X86_TLBBATCH_H
+#define _ARCH_X86_TLBBATCH_H
+
+#include <linux/cpumask.h>
+
+struct arch_tlbflush_unmap_batch {
+ /*
+ * Each bit set is a CPU that potentially has a TLB entry for one of
+ * the PFNs being flushed..
+ */
+ struct cpumask cpumask;
+};
+
+#endif /* _ARCH_X86_TLBBATCH_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6ed9ea469b48..50ea3482e1d1 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -7,6 +7,7 @@
#include <asm/processor.h>
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
+#include <asm/smp.h>
static inline void __invpcid(unsigned long pcid, unsigned long addr,
unsigned long type)
@@ -65,10 +66,14 @@ static inline void invpcid_flush_all_nonglobals(void)
#endif
struct tlb_state {
-#ifdef CONFIG_SMP
- struct mm_struct *active_mm;
+ /*
+ * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
+ * are on. This means that it may not match current->active_mm,
+ * which will contain the previous user mm when we're in lazy TLB
+ * mode even if we've already switched back to swapper_pg_dir.
+ */
+ struct mm_struct *loaded_mm;
int state;
-#endif
/*
* Access to this CR4 shadow and to H/W CR4 is protected by
@@ -151,7 +156,7 @@ static inline void __native_flush_tlb(void)
* back:
*/
preempt_disable();
- native_write_cr3(native_read_cr3());
+ native_write_cr3(__native_read_cr3());
preempt_enable();
}
@@ -220,84 +225,16 @@ static inline void __flush_tlb_one(unsigned long addr)
* - flush_tlb_page(vma, vmaddr) flushes one page
* - flush_tlb_range(vma, start, end) flushes a range of pages
* - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
- * - flush_tlb_others(cpumask, mm, start, end) flushes TLBs on other cpus
+ * - flush_tlb_others(cpumask, info) flushes TLBs on other cpus
*
* ..but the i386 has somewhat limited tlb flushing capabilities,
* and page-granular flushes are available only on i486 and up.
*/
-
-#ifndef CONFIG_SMP
-
-/* "_up" is for UniProcessor.
- *
- * This is a helper for other header functions. *Not* intended to be called
- * directly. All global TLB flushes need to either call this, or to bump the
- * vm statistics themselves.
- */
-static inline void __flush_tlb_up(void)
-{
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- __flush_tlb();
-}
-
-static inline void flush_tlb_all(void)
-{
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- __flush_tlb_all();
-}
-
-static inline void local_flush_tlb(void)
-{
- __flush_tlb_up();
-}
-
-static inline void flush_tlb_mm(struct mm_struct *mm)
-{
- if (mm == current->active_mm)
- __flush_tlb_up();
-}
-
-static inline void flush_tlb_page(struct vm_area_struct *vma,
- unsigned long addr)
-{
- if (vma->vm_mm == current->active_mm)
- __flush_tlb_one(addr);
-}
-
-static inline void flush_tlb_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
-{
- if (vma->vm_mm == current->active_mm)
- __flush_tlb_up();
-}
-
-static inline void flush_tlb_mm_range(struct mm_struct *mm,
- unsigned long start, unsigned long end, unsigned long vmflag)
-{
- if (mm == current->active_mm)
- __flush_tlb_up();
-}
-
-static inline void native_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end)
-{
-}
-
-static inline void reset_lazy_tlbstate(void)
-{
-}
-
-static inline void flush_tlb_kernel_range(unsigned long start,
- unsigned long end)
-{
- flush_tlb_all();
-}
-
-#else /* SMP */
-
-#include <asm/smp.h>
+struct flush_tlb_info {
+ struct mm_struct *mm;
+ unsigned long start;
+ unsigned long end;
+};
#define local_flush_tlb() __flush_tlb()
@@ -307,29 +244,32 @@ static inline void flush_tlb_kernel_range(unsigned long start,
flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
extern void flush_tlb_all(void);
-extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag);
extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
+static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
+{
+ flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
+}
+
void native_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long start, unsigned long end);
+ const struct flush_tlb_info *info);
#define TLBSTATE_OK 1
#define TLBSTATE_LAZY 2
-static inline void reset_lazy_tlbstate(void)
+static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm)
{
- this_cpu_write(cpu_tlbstate.state, 0);
- this_cpu_write(cpu_tlbstate.active_mm, &init_mm);
+ cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
}
-#endif /* SMP */
+extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
#ifndef CONFIG_PARAVIRT
-#define flush_tlb_others(mask, mm, start, end) \
- native_flush_tlb_others(mask, mm, start, end)
+#define flush_tlb_others(mask, info) \
+ native_flush_tlb_others(mask, info)
#endif
#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index 6686820feae9..b5a32231abd8 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_UV_UV_H
#define _ASM_X86_UV_UV_H
+#include <asm/tlbflush.h>
+
enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
struct cpumask;
@@ -15,10 +17,7 @@ extern void uv_cpu_init(void);
extern void uv_nmi_init(void);
extern void uv_system_init(void);
extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end,
- unsigned int cpu);
+ const struct flush_tlb_info *info);
#else /* X86_UV */
@@ -28,8 +27,8 @@ static inline int is_uv_hubless(void) { return 0; }
static inline void uv_cpu_init(void) { }
static inline void uv_system_init(void) { }
static inline const struct cpumask *
-uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
- unsigned long start, unsigned long end, unsigned int cpu)
+uv_flush_tlb_others(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
{ return cpumask; }
#endif /* X86_UV */
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index 432df4b1baec..f4fef5a24ebd 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -34,16 +34,10 @@
#define HV_X64_MSR_REFERENCE_TSC 0x40000021
/*
- * There is a single feature flag that signifies the presence of the MSR
- * that can be used to retrieve both the local APIC Timer frequency as
- * well as the TSC frequency.
+ * There is a single feature flag that signifies if the partition has access
+ * to MSRs with local APIC and TSC frequencies.
*/
-
-/* Local APIC timer frequency MSR (HV_X64_MSR_APIC_FREQUENCY) is available */
-#define HV_X64_MSR_APIC_FREQUENCY_AVAILABLE (1 << 11)
-
-/* TSC frequency MSR (HV_X64_MSR_TSC_FREQUENCY) is available */
-#define HV_X64_MSR_TSC_FREQUENCY_AVAILABLE (1 << 11)
+#define HV_X64_ACCESS_FREQUENCY_MSRS (1 << 11)
/*
* Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
@@ -73,6 +67,9 @@
*/
#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8)
+/* Frequency MSRs available */
+#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE (1 << 8)
+
/* Crash MSR available */
#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE (1 << 10)
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 567de50a4c2a..185f3d10c194 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -104,6 +104,8 @@
#define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT)
#define X86_CR4_OSXMMEXCPT_BIT 10 /* enable unmasked SSE exceptions */
#define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT)
+#define X86_CR4_LA57_BIT 12 /* enable 5-level page tables */
+#define X86_CR4_LA57 _BITUL(X86_CR4_LA57_BIT)
#define X86_CR4_VMXE_BIT 13 /* enable VMX virtualization */
#define X86_CR4_VMXE _BITUL(X86_CR4_VMXE_BIT)
#define X86_CR4_SMXE_BIT 14 /* enable safer mode (TXT) */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4b994232cb57..a01892bdd61a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -18,6 +18,7 @@ CFLAGS_REMOVE_pvclock.o = -pg
CFLAGS_REMOVE_kvmclock.o = -pg
CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
+CFLAGS_REMOVE_head64.o = -pg
endif
KASAN_SANITIZE_head$(BITS).o := n
@@ -29,6 +30,7 @@ OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_test_nx.o := y
+OBJECT_FILES_NON_STANDARD_paravirt_patch_$(BITS).o := y
# If instrumentation of this dir is enabled, boot hangs during first second.
# Probably could be more selective here, but note that files related to irqs,
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 26b78d86f25a..85a9e17e0dbc 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,3 +1,5 @@
+OBJECT_FILES_NON_STANDARD_wakeup_$(BITS).o := y
+
obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
obj-$(CONFIG_ACPI_APEI) += apei.o
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 8233a630280f..dde437f5d14f 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -167,7 +167,8 @@ static int __init ffh_cstate_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
- if (c->x86_vendor != X86_VENDOR_INTEL)
+ if (c->x86_vendor != X86_VENDOR_INTEL &&
+ c->x86_vendor != X86_VENDOR_AMD)
return -1;
cpu_cstate_entry = alloc_percpu(struct cstate_entry);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 2d75faf743f2..98b3dd8cf2bf 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -54,6 +54,8 @@
#include <asm/mce.h>
#include <asm/tsc.h>
#include <asm/hypervisor.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
unsigned int num_processors;
@@ -545,6 +547,81 @@ static struct clock_event_device lapic_clockevent = {
};
static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
+#define DEADLINE_MODEL_MATCH_FUNC(model, func) \
+ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&func }
+
+#define DEADLINE_MODEL_MATCH_REV(model, rev) \
+ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)rev }
+
+static u32 hsx_deadline_rev(void)
+{
+ switch (boot_cpu_data.x86_mask) {
+ case 0x02: return 0x3a; /* EP */
+ case 0x04: return 0x0f; /* EX */
+ }
+
+ return ~0U;
+}
+
+static u32 bdx_deadline_rev(void)
+{
+ switch (boot_cpu_data.x86_mask) {
+ case 0x02: return 0x00000011;
+ case 0x03: return 0x0700000e;
+ case 0x04: return 0x0f00000c;
+ case 0x05: return 0x0e000003;
+ }
+
+ return ~0U;
+}
+
+static const struct x86_cpu_id deadline_match[] = {
+ DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020),
+ DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_X, 0x02000014),
+
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE, 0x22),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT, 0x20),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_GT3E, 0x17),
+
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_CORE, 0x25),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_GT3E, 0x17),
+
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_MOBILE, 0xb2),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_DESKTOP, 0xb2),
+
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_MOBILE, 0x52),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_DESKTOP, 0x52),
+
+ {},
+};
+
+static void apic_check_deadline_errata(void)
+{
+ const struct x86_cpu_id *m = x86_match_cpu(deadline_match);
+ u32 rev;
+
+ if (!m)
+ return;
+
+ /*
+ * Function pointers will have the MSB set due to address layout,
+ * immediate revisions will not.
+ */
+ if ((long)m->driver_data < 0)
+ rev = ((u32 (*)(void))(m->driver_data))();
+ else
+ rev = (u32)m->driver_data;
+
+ if (boot_cpu_data.microcode >= rev)
+ return;
+
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
+ pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; "
+ "please update microcode to version: 0x%x (or later)\n", rev);
+}
+
/*
* Setup the local APIC timer for this CPU. Copy the initialized values
* of the boot CPU and register the clock event in the framework.
@@ -563,6 +640,7 @@ static void setup_APIC_timer(void)
levt->cpumask = cpumask_of(smp_processor_id());
if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) {
+ levt->name = "lapic-deadline";
levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC |
CLOCK_EVT_FEAT_DUMMY);
levt->set_next_event = lapic_next_deadline;
@@ -1779,6 +1857,8 @@ void __init init_apic_mappings(void)
{
unsigned int new_apicid;
+ apic_check_deadline_errata();
+
if (x2apic_mode) {
boot_cpu_physical_apicid = read_apic_id();
return;
@@ -2201,23 +2281,32 @@ void default_init_apic_ldr(void)
apic_write(APIC_LDR, val);
}
-int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask,
- unsigned int *apicid)
+int default_cpu_mask_to_apicid(const struct cpumask *mask,
+ struct irq_data *irqdata,
+ unsigned int *apicid)
{
- unsigned int cpu;
+ unsigned int cpu = cpumask_first(mask);
- for_each_cpu_and(cpu, cpumask, andmask) {
- if (cpumask_test_cpu(cpu, cpu_online_mask))
- break;
- }
+ if (cpu >= nr_cpu_ids)
+ return -EINVAL;
+ *apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ irq_data_update_effective_affinity(irqdata, cpumask_of(cpu));
+ return 0;
+}
- if (likely(cpu < nr_cpu_ids)) {
- *apicid = per_cpu(x86_cpu_to_apicid, cpu);
- return 0;
- }
+int flat_cpu_mask_to_apicid(const struct cpumask *mask,
+ struct irq_data *irqdata,
+ unsigned int *apicid)
- return -EINVAL;
+{
+ struct cpumask *effmsk = irq_data_get_effective_affinity_mask(irqdata);
+ unsigned long cpu_mask = cpumask_bits(mask)[0] & APIC_ALL_CPUS;
+
+ if (!cpu_mask)
+ return -EINVAL;
+ *apicid = (unsigned int)cpu_mask;
+ cpumask_bits(effmsk)[0] = cpu_mask;
+ return 0;
}
/*
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index a4d7ff20ed22..dedd5a41ba48 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -172,7 +172,7 @@ static struct apic apic_flat __ro_after_init = {
.get_apic_id = flat_get_apic_id,
.set_apic_id = set_apic_id,
- .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
.send_IPI = default_send_IPI_single,
.send_IPI_mask = flat_send_IPI_mask,
@@ -268,7 +268,7 @@ static struct apic apic_physflat __ro_after_init = {
.get_apic_id = flat_get_apic_id,
.set_apic_id = set_apic_id,
- .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
.send_IPI = default_send_IPI_single_phys,
.send_IPI_mask = default_send_IPI_mask_sequence_phys,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 2262eb6df796..6599f437b4ab 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -141,7 +141,7 @@ struct apic apic_noop __ro_after_init = {
.get_apic_id = noop_get_apic_id,
.set_apic_id = NULL,
- .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
.send_IPI = noop_send_IPI,
.send_IPI_mask = noop_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index e08fe2c8dd8c..2fda912219a6 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -267,7 +267,7 @@ static const struct apic apic_numachip1 __refconst = {
.get_apic_id = numachip1_get_apic_id,
.set_apic_id = numachip1_set_apic_id,
- .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
.send_IPI = numachip_send_IPI_one,
.send_IPI_mask = numachip_send_IPI_mask,
@@ -318,7 +318,7 @@ static const struct apic apic_numachip2 __refconst = {
.get_apic_id = numachip2_get_apic_id,
.set_apic_id = numachip2_set_apic_id,
- .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
.send_IPI = numachip_send_IPI_one,
.send_IPI_mask = numachip_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 56012010332c..456e45e8bf84 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -172,7 +172,7 @@ static struct apic apic_bigsmp __ro_after_init = {
.get_apic_id = bigsmp_get_apic_id,
.set_apic_id = NULL,
- .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
.send_IPI = default_send_IPI_single_phys,
.send_IPI_mask = default_send_IPI_mask_sequence_phys,
diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c
index ae50d3454d78..56ccf9346b08 100644
--- a/arch/x86/kernel/apic/htirq.c
+++ b/arch/x86/kernel/apic/htirq.c
@@ -150,16 +150,27 @@ static const struct irq_domain_ops htirq_domain_ops = {
.deactivate = htirq_domain_deactivate,
};
-void arch_init_htirq_domain(struct irq_domain *parent)
+void __init arch_init_htirq_domain(struct irq_domain *parent)
{
+ struct fwnode_handle *fn;
+
if (disable_apic)
return;
- htirq_domain = irq_domain_add_tree(NULL, &htirq_domain_ops, NULL);
+ fn = irq_domain_alloc_named_fwnode("PCI-HT");
+ if (!fn)
+ goto warn;
+
+ htirq_domain = irq_domain_create_tree(fn, &htirq_domain_ops, NULL);
+ irq_domain_free_fwnode(fn);
if (!htirq_domain)
- pr_warn("failed to initialize irqdomain for HTIRQ.\n");
- else
- htirq_domain->parent = parent;
+ goto warn;
+
+ htirq_domain->parent = parent;
+ return;
+
+warn:
+ pr_warn("Failed to initialize irqdomain for HTIRQ.\n");
}
int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 347bb9f65737..b4f5f73febdb 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1200,28 +1200,6 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
static struct irq_chip ioapic_chip, ioapic_ir_chip;
-#ifdef CONFIG_X86_32
-static inline int IO_APIC_irq_trigger(int irq)
-{
- int apic, idx, pin;
-
- for_each_ioapic_pin(apic, pin) {
- idx = find_irq_entry(apic, pin, mp_INT);
- if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin, 0)))
- return irq_trigger(idx);
- }
- /*
- * nonexistent IRQs are edge default
- */
- return 0;
-}
-#else
-static inline int IO_APIC_irq_trigger(int irq)
-{
- return 1;
-}
-#endif
-
static void __init setup_IO_APIC_irqs(void)
{
unsigned int ioapic, pin;
@@ -2223,6 +2201,8 @@ static int mp_irqdomain_create(int ioapic)
struct ioapic *ip = &ioapics[ioapic];
struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg;
struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+ struct fwnode_handle *fn;
+ char *name = "IO-APIC";
if (cfg->type == IOAPIC_DOMAIN_INVALID)
return 0;
@@ -2233,9 +2213,25 @@ static int mp_irqdomain_create(int ioapic)
parent = irq_remapping_get_ir_irq_domain(&info);
if (!parent)
parent = x86_vector_domain;
+ else
+ name = "IO-APIC-IR";
+
+ /* Handle device tree enumerated APICs proper */
+ if (cfg->dev) {
+ fn = of_node_to_fwnode(cfg->dev);
+ } else {
+ fn = irq_domain_alloc_named_id_fwnode(name, ioapic);
+ if (!fn)
+ return -ENOMEM;
+ }
+
+ ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops,
+ (void *)(long)ioapic);
+
+ /* Release fw handle if it was allocated above */
+ if (!cfg->dev)
+ irq_domain_free_fwnode(fn);
- ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops,
- (void *)(long)ioapic);
if (!ip->irqdomain)
return -ENOMEM;
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index c61aec7e65f4..9b18be764422 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -136,13 +136,20 @@ static struct msi_domain_info pci_msi_domain_info = {
.handler_name = "edge",
};
-void arch_init_msi_domain(struct irq_domain *parent)
+void __init arch_init_msi_domain(struct irq_domain *parent)
{
+ struct fwnode_handle *fn;
+
if (disable_apic)
return;
- msi_default_domain = pci_msi_create_irq_domain(NULL,
- &pci_msi_domain_info, parent);
+ fn = irq_domain_alloc_named_fwnode("PCI-MSI");
+ if (fn) {
+ msi_default_domain =
+ pci_msi_create_irq_domain(fn, &pci_msi_domain_info,
+ parent);
+ irq_domain_free_fwnode(fn);
+ }
if (!msi_default_domain)
pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
}
@@ -167,9 +174,18 @@ static struct msi_domain_info pci_msi_ir_domain_info = {
.handler_name = "edge",
};
-struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent)
+struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent,
+ const char *name, int id)
{
- return pci_msi_create_irq_domain(NULL, &pci_msi_ir_domain_info, parent);
+ struct fwnode_handle *fn;
+ struct irq_domain *d;
+
+ fn = irq_domain_alloc_named_id_fwnode(name, id);
+ if (!fn)
+ return NULL;
+ d = pci_msi_create_irq_domain(fn, &pci_msi_ir_domain_info, parent);
+ irq_domain_free_fwnode(fn);
+ return d;
}
#endif
@@ -221,13 +237,20 @@ static struct irq_domain *dmar_get_irq_domain(void)
{
static struct irq_domain *dmar_domain;
static DEFINE_MUTEX(dmar_lock);
+ struct fwnode_handle *fn;
mutex_lock(&dmar_lock);
- if (dmar_domain == NULL)
- dmar_domain = msi_create_irq_domain(NULL, &dmar_msi_domain_info,
+ if (dmar_domain)
+ goto out;
+
+ fn = irq_domain_alloc_named_fwnode("DMAR-MSI");
+ if (fn) {
+ dmar_domain = msi_create_irq_domain(fn, &dmar_msi_domain_info,
x86_vector_domain);
+ irq_domain_free_fwnode(fn);
+ }
+out:
mutex_unlock(&dmar_lock);
-
return dmar_domain;
}
@@ -317,9 +340,10 @@ static struct msi_domain_info hpet_msi_domain_info = {
struct irq_domain *hpet_create_irq_domain(int hpet_id)
{
- struct irq_domain *parent;
- struct irq_alloc_info info;
struct msi_domain_info *domain_info;
+ struct irq_domain *parent, *d;
+ struct irq_alloc_info info;
+ struct fwnode_handle *fn;
if (x86_vector_domain == NULL)
return NULL;
@@ -340,7 +364,16 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id)
else
hpet_msi_controller.name = "IR-HPET-MSI";
- return msi_create_irq_domain(NULL, domain_info, parent);
+ fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name,
+ hpet_id);
+ if (!fn) {
+ kfree(domain_info);
+ return NULL;
+ }
+
+ d = msi_create_irq_domain(fn, domain_info, parent);
+ irq_domain_free_fwnode(fn);
+ return d;
}
int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 2e8f7f048f4f..63287659adb6 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -102,7 +102,7 @@ static struct apic apic_default __ro_after_init = {
.get_apic_id = default_get_apic_id,
.set_apic_id = NULL,
- .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
.send_IPI = default_send_IPI_single,
.send_IPI_mask = default_send_IPI_mask_logical,
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index f3557a1eb562..b3af457ed667 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -103,7 +103,8 @@ static void free_apic_chip_data(struct apic_chip_data *data)
}
static int __assign_irq_vector(int irq, struct apic_chip_data *d,
- const struct cpumask *mask)
+ const struct cpumask *mask,
+ struct irq_data *irqdata)
{
/*
* NOTE! The local APIC isn't very good at handling
@@ -141,7 +142,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
/*
* Clear the offline cpus from @vector_cpumask for searching
* and verify whether the result overlaps with @mask. If true,
- * then the call to apic->cpu_mask_to_apicid_and() will
+ * then the call to apic->cpu_mask_to_apicid() will
* succeed as well. If not, no point in trying to find a
* vector in this mask.
*/
@@ -221,34 +222,40 @@ success:
* Cache destination APIC IDs into cfg->dest_apicid. This cannot fail
* as we already established, that mask & d->domain & cpu_online_mask
* is not empty.
+ *
+ * vector_searchmask is a subset of d->domain and has the offline
+ * cpus masked out.
*/
- BUG_ON(apic->cpu_mask_to_apicid_and(mask, d->domain,
- &d->cfg.dest_apicid));
+ cpumask_and(vector_searchmask, vector_searchmask, mask);
+ BUG_ON(apic->cpu_mask_to_apicid(vector_searchmask, irqdata,
+ &d->cfg.dest_apicid));
return 0;
}
static int assign_irq_vector(int irq, struct apic_chip_data *data,
- const struct cpumask *mask)
+ const struct cpumask *mask,
+ struct irq_data *irqdata)
{
int err;
unsigned long flags;
raw_spin_lock_irqsave(&vector_lock, flags);
- err = __assign_irq_vector(irq, data, mask);
+ err = __assign_irq_vector(irq, data, mask, irqdata);
raw_spin_unlock_irqrestore(&vector_lock, flags);
return err;
}
static int assign_irq_vector_policy(int irq, int node,
struct apic_chip_data *data,
- struct irq_alloc_info *info)
+ struct irq_alloc_info *info,
+ struct irq_data *irqdata)
{
if (info && info->mask)
- return assign_irq_vector(irq, data, info->mask);
+ return assign_irq_vector(irq, data, info->mask, irqdata);
if (node != NUMA_NO_NODE &&
- assign_irq_vector(irq, data, cpumask_of_node(node)) == 0)
+ assign_irq_vector(irq, data, cpumask_of_node(node), irqdata) == 0)
return 0;
- return assign_irq_vector(irq, data, apic->target_cpus());
+ return assign_irq_vector(irq, data, apic->target_cpus(), irqdata);
}
static void clear_irq_vector(int irq, struct apic_chip_data *data)
@@ -360,9 +367,17 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
irq_data->chip = &lapic_controller;
irq_data->chip_data = data;
irq_data->hwirq = virq + i;
- err = assign_irq_vector_policy(virq + i, node, data, info);
+ err = assign_irq_vector_policy(virq + i, node, data, info,
+ irq_data);
if (err)
goto error;
+ /*
+ * If the apic destination mode is physical, then the
+ * effective affinity is restricted to a single target
+ * CPU. Mark the interrupt accordingly.
+ */
+ if (!apic->irq_dest_mode)
+ irqd_set_single_target(irq_data);
}
return 0;
@@ -405,7 +420,7 @@ int __init arch_probe_nr_irqs(void)
}
#ifdef CONFIG_X86_IO_APIC
-static void init_legacy_irqs(void)
+static void __init init_legacy_irqs(void)
{
int i, node = cpu_to_node(0);
struct apic_chip_data *data;
@@ -424,16 +439,21 @@ static void init_legacy_irqs(void)
}
}
#else
-static void init_legacy_irqs(void) { }
+static inline void init_legacy_irqs(void) { }
#endif
int __init arch_early_irq_init(void)
{
+ struct fwnode_handle *fn;
+
init_legacy_irqs();
- x86_vector_domain = irq_domain_add_tree(NULL, &x86_vector_domain_ops,
- NULL);
+ fn = irq_domain_alloc_named_fwnode("VECTOR");
+ BUG_ON(!fn);
+ x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops,
+ NULL);
BUG_ON(x86_vector_domain == NULL);
+ irq_domain_free_fwnode(fn);
irq_set_default_host(x86_vector_domain);
arch_init_msi_domain(x86_vector_domain);
@@ -529,11 +549,12 @@ static int apic_set_affinity(struct irq_data *irq_data,
if (!cpumask_intersects(dest, cpu_online_mask))
return -EINVAL;
- err = assign_irq_vector(irq, data, dest);
+ err = assign_irq_vector(irq, data, dest, irq_data);
return err ? err : IRQ_SET_MASK_OK;
}
static struct irq_chip lapic_controller = {
+ .name = "APIC",
.irq_ack = apic_ack_edge,
.irq_set_affinity = apic_set_affinity,
.irq_retrigger = apic_retrigger_irq,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 5a35f208ed95..481237cb1544 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -4,6 +4,7 @@
#include <linux/kernel.h>
#include <linux/ctype.h>
#include <linux/dmar.h>
+#include <linux/irq.h>
#include <linux/cpu.h>
#include <asm/smp.h>
@@ -104,35 +105,30 @@ static void x2apic_send_IPI_all(int vector)
}
static int
-x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask,
- unsigned int *apicid)
+x2apic_cpu_mask_to_apicid(const struct cpumask *mask, struct irq_data *irqdata,
+ unsigned int *apicid)
{
+ struct cpumask *effmsk = irq_data_get_effective_affinity_mask(irqdata);
+ unsigned int cpu;
u32 dest = 0;
u16 cluster;
- int i;
-
- for_each_cpu_and(i, cpumask, andmask) {
- if (!cpumask_test_cpu(i, cpu_online_mask))
- continue;
- dest = per_cpu(x86_cpu_to_logical_apicid, i);
- cluster = x2apic_cluster(i);
- break;
- }
- if (!dest)
+ cpu = cpumask_first(mask);
+ if (cpu >= nr_cpu_ids)
return -EINVAL;
- for_each_cpu_and(i, cpumask, andmask) {
- if (!cpumask_test_cpu(i, cpu_online_mask))
- continue;
- if (cluster != x2apic_cluster(i))
+ dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
+ cluster = x2apic_cluster(cpu);
+
+ cpumask_clear(effmsk);
+ for_each_cpu(cpu, mask) {
+ if (cluster != x2apic_cluster(cpu))
continue;
- dest |= per_cpu(x86_cpu_to_logical_apicid, i);
+ dest |= per_cpu(x86_cpu_to_logical_apicid, cpu);
+ cpumask_set_cpu(cpu, effmsk);
}
*apicid = dest;
-
return 0;
}
@@ -256,7 +252,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = x2apic_set_apic_id,
- .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
.send_IPI = x2apic_send_IPI,
.send_IPI_mask = x2apic_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index ff111f05a314..3baf0c3dc875 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -127,7 +127,7 @@ static struct apic apic_x2apic_phys __ro_after_init = {
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = x2apic_set_apic_id,
- .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
.send_IPI = x2apic_send_IPI,
.send_IPI_mask = x2apic_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index b487b3a01615..0d57bb9079c9 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -526,27 +526,15 @@ static void uv_init_apic_ldr(void)
}
static int
-uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask,
- unsigned int *apicid)
+uv_cpu_mask_to_apicid(const struct cpumask *mask, struct irq_data *irqdata,
+ unsigned int *apicid)
{
- int unsigned cpu;
+ int ret = default_cpu_mask_to_apicid(mask, irqdata, apicid);
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- for_each_cpu_and(cpu, cpumask, andmask) {
- if (cpumask_test_cpu(cpu, cpu_online_mask))
- break;
- }
-
- if (likely(cpu < nr_cpu_ids)) {
- *apicid = per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
- return 0;
- }
+ if (!ret)
+ *apicid |= uv_apicid_hibits;
- return -EINVAL;
+ return ret;
}
static unsigned int x2apic_get_apic_id(unsigned long x)
@@ -614,7 +602,7 @@ static struct apic apic_x2apic_uv_x __ro_after_init = {
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = set_apic_id,
- .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
.send_IPI = uv_send_IPI_one,
.send_IPI_mask = uv_send_IPI_mask,
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 52000010c62e..cdf82492b770 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -21,6 +21,7 @@ obj-y += common.o
obj-y += rdrand.o
obj-y += match.o
obj-y += bugs.o
+obj-$(CONFIG_CPU_FREQ) += aperfmperf.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
new file mode 100644
index 000000000000..d869c8671e36
--- /dev/null
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -0,0 +1,79 @@
+/*
+ * x86 APERF/MPERF KHz calculation for
+ * /sys/.../cpufreq/scaling_cur_freq
+ *
+ * Copyright (C) 2017 Intel Corp.
+ * Author: Len Brown <len.brown@intel.com>
+ *
+ * This file is licensed under GPLv2.
+ */
+
+#include <linux/jiffies.h>
+#include <linux/math64.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
+
+struct aperfmperf_sample {
+ unsigned int khz;
+ unsigned long jiffies;
+ u64 aperf;
+ u64 mperf;
+};
+
+static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
+
+/*
+ * aperfmperf_snapshot_khz()
+ * On the current CPU, snapshot APERF, MPERF, and jiffies
+ * unless we already did it within 10ms
+ * calculate kHz, save snapshot
+ */
+static void aperfmperf_snapshot_khz(void *dummy)
+{
+ u64 aperf, aperf_delta;
+ u64 mperf, mperf_delta;
+ struct aperfmperf_sample *s = this_cpu_ptr(&samples);
+
+ /* Don't bother re-computing within 10 ms */
+ if (time_before(jiffies, s->jiffies + HZ/100))
+ return;
+
+ rdmsrl(MSR_IA32_APERF, aperf);
+ rdmsrl(MSR_IA32_MPERF, mperf);
+
+ aperf_delta = aperf - s->aperf;
+ mperf_delta = mperf - s->mperf;
+
+ /*
+ * There is no architectural guarantee that MPERF
+ * increments faster than we can read it.
+ */
+ if (mperf_delta == 0)
+ return;
+
+ /*
+ * if (cpu_khz * aperf_delta) fits into ULLONG_MAX, then
+ * khz = (cpu_khz * aperf_delta) / mperf_delta
+ */
+ if (div64_u64(ULLONG_MAX, cpu_khz) > aperf_delta)
+ s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
+ else /* khz = aperf_delta / (mperf_delta / cpu_khz) */
+ s->khz = div64_u64(aperf_delta,
+ div64_u64(mperf_delta, cpu_khz));
+ s->jiffies = jiffies;
+ s->aperf = aperf;
+ s->mperf = mperf;
+}
+
+unsigned int arch_freq_get_on_cpu(int cpu)
+{
+ if (!cpu_khz)
+ return 0;
+
+ if (!static_cpu_has(X86_FEATURE_APERFMPERF))
+ return 0;
+
+ smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
+
+ return per_cpu(samples.khz, cpu);
+}
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index a70fd61095f8..6f077445647a 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -255,6 +255,7 @@ static void init_cyrix(struct cpuinfo_x86 *c)
break;
case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */
+ case 11: /* GX1 with inverted Device ID */
#ifdef CONFIG_PCI
{
u32 vendor, device;
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index f5af0cc7eb0d..9257bd9dc664 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -856,11 +856,13 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
dentry = kernfs_mount(fs_type, flags, rdt_root,
RDTGROUP_SUPER_MAGIC, NULL);
if (IS_ERR(dentry))
- goto out_cdp;
+ goto out_destroy;
static_branch_enable(&rdt_enable_key);
goto out;
+out_destroy:
+ kernfs_remove(kn_info);
out_cdp:
cdp_disable();
out:
diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
index 9c632cb88546..10cec43aac38 100644
--- a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
+++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
@@ -17,6 +17,8 @@
#include "mce-internal.h"
+static BLOCKING_NOTIFIER_HEAD(mce_injector_chain);
+
static DEFINE_MUTEX(mce_chrdev_read_mutex);
static char mce_helper[128];
@@ -345,24 +347,49 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
}
}
-static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
- size_t usize, loff_t *off);
+void mce_register_injector_chain(struct notifier_block *nb)
+{
+ blocking_notifier_chain_register(&mce_injector_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_register_injector_chain);
-void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
- const char __user *ubuf,
- size_t usize, loff_t *off))
+void mce_unregister_injector_chain(struct notifier_block *nb)
{
- mce_write = fn;
+ blocking_notifier_chain_unregister(&mce_injector_chain, nb);
}
-EXPORT_SYMBOL_GPL(register_mce_write_callback);
+EXPORT_SYMBOL_GPL(mce_unregister_injector_chain);
static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
size_t usize, loff_t *off)
{
- if (mce_write)
- return mce_write(filp, ubuf, usize, off);
- else
+ struct mce m;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ /*
+ * There are some cases where real MSR reads could slip
+ * through.
+ */
+ if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
+ return -EIO;
+
+ if ((unsigned long)usize > sizeof(struct mce))
+ usize = sizeof(struct mce);
+ if (copy_from_user(&m, ubuf, usize))
+ return -EFAULT;
+
+ if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
return -EINVAL;
+
+ /*
+ * Need to give user space some time to set everything up,
+ * so do it a jiffie or two later everywhere.
+ */
+ schedule_timeout(2);
+
+ blocking_notifier_call_chain(&mce_injector_chain, 0, &m);
+
+ return usize;
}
static const struct file_operations mce_chrdev_ops = {
@@ -388,9 +415,15 @@ static __init int dev_mcelog_init_device(void)
/* register character device /dev/mcelog */
err = misc_register(&mce_chrdev_device);
if (err) {
- pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+ if (err == -EBUSY)
+ /* Xen dom0 might have registered the device already. */
+ pr_info("Unable to init device /dev/mcelog, already registered");
+ else
+ pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+
return err;
}
+
mce_register_decode_chain(&dev_mcelog_nb);
return 0;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 99165b206df3..231ad23b24a9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -10,23 +10,105 @@
* Authors:
* Andi Kleen
* Ying Huang
+ *
+ * The AMD part (from mce_amd_inj.c): a simple MCE injection facility
+ * for testing different aspects of the RAS code. This driver should be
+ * built as module so that it can be loaded on production kernels for
+ * testing purposes.
+ *
+ * This file may be distributed under the terms of the GNU General Public
+ * License version 2.
+ *
+ * Copyright (c) 2010-17: Borislav Petkov <bp@alien8.de>
+ * Advanced Micro Devices Inc.
*/
-#include <linux/uaccess.h>
-#include <linux/module.h>
-#include <linux/timer.h>
+
+#include <linux/cpu.h>
+#include <linux/debugfs.h>
#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/preempt.h>
-#include <linux/smp.h>
+#include <linux/module.h>
#include <linux/notifier.h>
-#include <linux/kdebug.h>
-#include <linux/cpu.h>
-#include <linux/sched.h>
-#include <linux/gfp.h>
-#include <asm/mce.h>
+#include <linux/pci.h>
+#include <linux/uaccess.h>
+
+#include <asm/amd_nb.h>
#include <asm/apic.h>
+#include <asm/irq_vectors.h>
+#include <asm/mce.h>
#include <asm/nmi.h>
+#include <asm/smp.h>
+
+#include "mce-internal.h"
+
+/*
+ * Collect all the MCi_XXX settings
+ */
+static struct mce i_mce;
+static struct dentry *dfs_inj;
+
+static u8 n_banks;
+
+#define MAX_FLAG_OPT_SIZE 3
+#define NBCFG 0x44
+
+enum injection_type {
+ SW_INJ = 0, /* SW injection, simply decode the error */
+ HW_INJ, /* Trigger a #MC */
+ DFR_INT_INJ, /* Trigger Deferred error interrupt */
+ THR_INT_INJ, /* Trigger threshold interrupt */
+ N_INJ_TYPES,
+};
+
+static const char * const flags_options[] = {
+ [SW_INJ] = "sw",
+ [HW_INJ] = "hw",
+ [DFR_INT_INJ] = "df",
+ [THR_INT_INJ] = "th",
+ NULL
+};
+
+/* Set default injection to SW_INJ */
+static enum injection_type inj_type = SW_INJ;
+
+#define MCE_INJECT_SET(reg) \
+static int inj_##reg##_set(void *data, u64 val) \
+{ \
+ struct mce *m = (struct mce *)data; \
+ \
+ m->reg = val; \
+ return 0; \
+}
+
+MCE_INJECT_SET(status);
+MCE_INJECT_SET(misc);
+MCE_INJECT_SET(addr);
+MCE_INJECT_SET(synd);
+
+#define MCE_INJECT_GET(reg) \
+static int inj_##reg##_get(void *data, u64 *val) \
+{ \
+ struct mce *m = (struct mce *)data; \
+ \
+ *val = m->reg; \
+ return 0; \
+}
+
+MCE_INJECT_GET(status);
+MCE_INJECT_GET(misc);
+MCE_INJECT_GET(addr);
+MCE_INJECT_GET(synd);
+
+DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
+
+static void setup_inj_struct(struct mce *m)
+{
+ memset(m, 0, sizeof(struct mce));
+
+ m->cpuvendor = boot_cpu_data.x86_vendor;
+}
/* Update fake mce registers on current CPU. */
static void inject_mce(struct mce *m)
@@ -143,7 +225,7 @@ static int raise_local(void)
return ret;
}
-static void raise_mce(struct mce *m)
+static void __maybe_unused raise_mce(struct mce *m)
{
int context = MCJ_CTX(m->inject_flags);
@@ -198,55 +280,454 @@ static void raise_mce(struct mce *m)
}
}
-/* Error injection interface */
-static ssize_t mce_write(struct file *filp, const char __user *ubuf,
- size_t usize, loff_t *off)
+static int mce_inject_raise(struct notifier_block *nb, unsigned long val,
+ void *data)
{
- struct mce m;
+ struct mce *m = (struct mce *)data;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- /*
- * There are some cases where real MSR reads could slip
- * through.
- */
- if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
- return -EIO;
+ if (!m)
+ return NOTIFY_DONE;
+
+ mutex_lock(&mce_inject_mutex);
+ raise_mce(m);
+ mutex_unlock(&mce_inject_mutex);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block inject_nb = {
+ .notifier_call = mce_inject_raise,
+};
+
+/*
+ * Caller needs to be make sure this cpu doesn't disappear
+ * from under us, i.e.: get_cpu/put_cpu.
+ */
+static int toggle_hw_mce_inject(unsigned int cpu, bool enable)
+{
+ u32 l, h;
+ int err;
+
+ err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h);
+ if (err) {
+ pr_err("%s: error reading HWCR\n", __func__);
+ return err;
+ }
+
+ enable ? (l |= BIT(18)) : (l &= ~BIT(18));
+
+ err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h);
+ if (err)
+ pr_err("%s: error writing HWCR\n", __func__);
- if ((unsigned long)usize > sizeof(struct mce))
- usize = sizeof(struct mce);
- if (copy_from_user(&m, ubuf, usize))
+ return err;
+}
+
+static int __set_inj(const char *buf)
+{
+ int i;
+
+ for (i = 0; i < N_INJ_TYPES; i++) {
+ if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) {
+ inj_type = i;
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+static ssize_t flags_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[MAX_FLAG_OPT_SIZE];
+ int n;
+
+ n = sprintf(buf, "%s\n", flags_options[inj_type]);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
+}
+
+static ssize_t flags_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[MAX_FLAG_OPT_SIZE], *__buf;
+ int err;
+
+ if (cnt > MAX_FLAG_OPT_SIZE)
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
- if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
+ buf[cnt - 1] = 0;
+
+ /* strip whitespace */
+ __buf = strstrip(buf);
+
+ err = __set_inj(__buf);
+ if (err) {
+ pr_err("%s: Invalid flags value: %s\n", __func__, __buf);
+ return err;
+ }
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static const struct file_operations flags_fops = {
+ .read = flags_read,
+ .write = flags_write,
+ .llseek = generic_file_llseek,
+};
+
+/*
+ * On which CPU to inject?
+ */
+MCE_INJECT_GET(extcpu);
+
+static int inj_extcpu_set(void *data, u64 val)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (val >= nr_cpu_ids || !cpu_online(val)) {
+ pr_err("%s: Invalid CPU: %llu\n", __func__, val);
return -EINVAL;
+ }
+ m->extcpu = val;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n");
+
+static void trigger_mce(void *info)
+{
+ asm volatile("int $18");
+}
+
+static void trigger_dfr_int(void *info)
+{
+ asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR));
+}
+
+static void trigger_thr_int(void *info)
+{
+ asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR));
+}
+
+static u32 get_nbc_for_node(int node_id)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ u32 cores_per_node;
+
+ cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket();
+
+ return cores_per_node * node_id;
+}
+
+static void toggle_nb_mca_mst_cpu(u16 nid)
+{
+ struct amd_northbridge *nb;
+ struct pci_dev *F3;
+ u32 val;
+ int err;
+
+ nb = node_to_amd_nb(nid);
+ if (!nb)
+ return;
+
+ F3 = nb->misc;
+ if (!F3)
+ return;
+
+ err = pci_read_config_dword(F3, NBCFG, &val);
+ if (err) {
+ pr_err("%s: Error reading F%dx%03x.\n",
+ __func__, PCI_FUNC(F3->devfn), NBCFG);
+ return;
+ }
+
+ if (val & BIT(27))
+ return;
+
+ pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n",
+ __func__);
+
+ val |= BIT(27);
+ err = pci_write_config_dword(F3, NBCFG, val);
+ if (err)
+ pr_err("%s: Error writing F%dx%03x.\n",
+ __func__, PCI_FUNC(F3->devfn), NBCFG);
+}
+
+static void prepare_msrs(void *info)
+{
+ struct mce m = *(struct mce *)info;
+ u8 b = m.bank;
+
+ wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+
+ if (boot_cpu_has(X86_FEATURE_SMCA)) {
+ if (m.inject_flags == DFR_INT_INJ) {
+ wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status);
+ wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr);
+ } else {
+ wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status);
+ wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
+ }
+
+ wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
+ wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
+ } else {
+ wrmsrl(MSR_IA32_MCx_STATUS(b), m.status);
+ wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr);
+ wrmsrl(MSR_IA32_MCx_MISC(b), m.misc);
+ }
+}
+
+static void do_inject(void)
+{
+ u64 mcg_status = 0;
+ unsigned int cpu = i_mce.extcpu;
+ u8 b = i_mce.bank;
+
+ rdtscll(i_mce.tsc);
+
+ if (i_mce.misc)
+ i_mce.status |= MCI_STATUS_MISCV;
+
+ if (i_mce.synd)
+ i_mce.status |= MCI_STATUS_SYNDV;
+
+ if (inj_type == SW_INJ) {
+ mce_inject_log(&i_mce);
+ return;
+ }
+
+ /* prep MCE global settings for the injection */
+ mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
+
+ if (!(i_mce.status & MCI_STATUS_PCC))
+ mcg_status |= MCG_STATUS_RIPV;
/*
- * Need to give user space some time to set everything up,
- * so do it a jiffie or two later everywhere.
+ * Ensure necessary status bits for deferred errors:
+ * - MCx_STATUS[Deferred]: make sure it is a deferred error
+ * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC
*/
- schedule_timeout(2);
+ if (inj_type == DFR_INT_INJ) {
+ i_mce.status |= MCI_STATUS_DEFERRED;
+ i_mce.status |= (i_mce.status & ~MCI_STATUS_UC);
+ }
- mutex_lock(&mce_inject_mutex);
- raise_mce(&m);
- mutex_unlock(&mce_inject_mutex);
- return usize;
+ /*
+ * For multi node CPUs, logging and reporting of bank 4 errors happens
+ * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for
+ * Fam10h and later BKDGs.
+ */
+ if (static_cpu_has(X86_FEATURE_AMD_DCM) &&
+ b == 4 &&
+ boot_cpu_data.x86 < 0x17) {
+ toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu));
+ cpu = get_nbc_for_node(amd_get_nb_id(cpu));
+ }
+
+ get_online_cpus();
+ if (!cpu_online(cpu))
+ goto err;
+
+ toggle_hw_mce_inject(cpu, true);
+
+ i_mce.mcgstatus = mcg_status;
+ i_mce.inject_flags = inj_type;
+ smp_call_function_single(cpu, prepare_msrs, &i_mce, 0);
+
+ toggle_hw_mce_inject(cpu, false);
+
+ switch (inj_type) {
+ case DFR_INT_INJ:
+ smp_call_function_single(cpu, trigger_dfr_int, NULL, 0);
+ break;
+ case THR_INT_INJ:
+ smp_call_function_single(cpu, trigger_thr_int, NULL, 0);
+ break;
+ default:
+ smp_call_function_single(cpu, trigger_mce, NULL, 0);
+ }
+
+err:
+ put_online_cpus();
+
+}
+
+/*
+ * This denotes into which bank we're injecting and triggers
+ * the injection, at the same time.
+ */
+static int inj_bank_set(void *data, u64 val)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (val >= n_banks) {
+ pr_err("Non-existent MCE bank: %llu\n", val);
+ return -EINVAL;
+ }
+
+ m->bank = val;
+ do_inject();
+
+ return 0;
+}
+
+MCE_INJECT_GET(bank);
+
+DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n");
+
+static const char readme_msg[] =
+"Description of the files and their usages:\n"
+"\n"
+"Note1: i refers to the bank number below.\n"
+"Note2: See respective BKDGs for the exact bit definitions of the files below\n"
+"as they mirror the hardware registers.\n"
+"\n"
+"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n"
+"\t attributes of the error which caused the MCE.\n"
+"\n"
+"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n"
+"\t used for error thresholding purposes and its validity is indicated by\n"
+"\t MCi_STATUS[MiscV].\n"
+"\n"
+"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n"
+"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n"
+"\n"
+"addr:\t Error address value to be written to MCi_ADDR. Log address information\n"
+"\t associated with the error.\n"
+"\n"
+"cpu:\t The CPU to inject the error on.\n"
+"\n"
+"bank:\t Specify the bank you want to inject the error into: the number of\n"
+"\t banks in a processor varies and is family/model-specific, therefore, the\n"
+"\t supplied value is sanity-checked. Setting the bank value also triggers the\n"
+"\t injection.\n"
+"\n"
+"flags:\t Injection type to be performed. Writing to this file will trigger a\n"
+"\t real machine check, an APIC interrupt or invoke the error decoder routines\n"
+"\t for AMD processors.\n"
+"\n"
+"\t Allowed error injection types:\n"
+"\t - \"sw\": Software error injection. Decode error to a human-readable \n"
+"\t format only. Safe to use.\n"
+"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n"
+"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n"
+"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n"
+"\t before injecting.\n"
+"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n"
+"\t error APIC interrupt handler to handle the error if the feature is \n"
+"\t is present in hardware. \n"
+"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n"
+"\t APIC interrupt handler to handle the error. \n"
+"\n";
+
+static ssize_t
+inj_readme_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return simple_read_from_buffer(ubuf, cnt, ppos,
+ readme_msg, strlen(readme_msg));
+}
+
+static const struct file_operations readme_fops = {
+ .read = inj_readme_read,
+};
+
+static struct dfs_node {
+ char *name;
+ struct dentry *d;
+ const struct file_operations *fops;
+ umode_t perm;
+} dfs_fls[] = {
+ { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH },
+};
+
+static int __init debugfs_init(void)
+{
+ unsigned int i;
+ u64 cap;
+
+ rdmsrl(MSR_IA32_MCG_CAP, cap);
+ n_banks = cap & MCG_BANKCNT_MASK;
+
+ dfs_inj = debugfs_create_dir("mce-inject", NULL);
+ if (!dfs_inj)
+ return -EINVAL;
+
+ for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) {
+ dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name,
+ dfs_fls[i].perm,
+ dfs_inj,
+ &i_mce,
+ dfs_fls[i].fops);
+
+ if (!dfs_fls[i].d)
+ goto err_dfs_add;
+ }
+
+ return 0;
+
+err_dfs_add:
+ while (i-- > 0)
+ debugfs_remove(dfs_fls[i].d);
+
+ debugfs_remove(dfs_inj);
+ dfs_inj = NULL;
+
+ return -ENODEV;
}
-static int inject_init(void)
+static int __init inject_init(void)
{
+ int err;
+
if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
return -ENOMEM;
+
+ err = debugfs_init();
+ if (err) {
+ free_cpumask_var(mce_inject_cpumask);
+ return err;
+ }
+
+ register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify");
+ mce_register_injector_chain(&inject_nb);
+
+ setup_inj_struct(&i_mce);
+
pr_info("Machine check injector initialized\n");
- register_mce_write_callback(mce_write);
- register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0,
- "mce_notify");
+
return 0;
}
+static void __exit inject_exit(void)
+{
+
+ mce_unregister_injector_chain(&inject_nb);
+ unregister_nmi_handler(NMI_LOCAL, "mce_notify");
+
+ debugfs_remove_recursive(dfs_inj);
+ dfs_inj = NULL;
+
+ memset(&dfs_fls, 0, sizeof(dfs_fls));
+
+ free_cpumask_var(mce_inject_cpumask);
+}
+
module_init(inject_init);
-/*
- * Cannot tolerate unloading currently because we cannot
- * guarantee all openers of mce_chrdev will get a reference to us.
- */
+module_exit(inject_exit);
MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 654ad0668d72..098530a93bb7 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -100,7 +100,11 @@ static inline bool mce_cmp(struct mce *m1, struct mce *m2)
extern struct device_attribute dev_attr_trigger;
#ifdef CONFIG_X86_MCELOG_LEGACY
-extern void mce_work_trigger(void);
+void mce_work_trigger(void);
+void mce_register_injector_chain(struct notifier_block *nb);
+void mce_unregister_injector_chain(struct notifier_block *nb);
#else
static inline void mce_work_trigger(void) { }
+static inline void mce_register_injector_chain(struct notifier_block *nb) { }
+static inline void mce_unregister_injector_chain(struct notifier_block *nb) { }
#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5cfbaeb6529a..6dde0497efc7 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -673,7 +673,6 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
bool error_seen = false;
struct mce m;
- int severity;
int i;
this_cpu_inc(mce_poll_count);
@@ -710,11 +709,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
mce_read_aux(&m, i);
- severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
-
- if (severity == MCE_DEFERRED_SEVERITY && mce_is_memory_error(&m))
- if (m.status & MCI_STATUS_ADDRV)
- m.severity = severity;
+ m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
/*
* Don't get the IP here because it's unlikely to
@@ -1550,7 +1545,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
*/
clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
}
- if (c->x86 < 17 && cfg->bootlog < 0) {
+ if (c->x86 < 0x11 && cfg->bootlog < 0) {
/*
* Lots of broken BIOS around that don't clear them
* by default and leave crap in there. Don't log:
@@ -1832,7 +1827,8 @@ void mce_disable_bank(int bank)
* mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
* monarchtimeout is how long to wait for other CPUs on machine
* check, or 0 to not wait
- * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
+ * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
+ and older.
* mce=nobootlog Don't log MCEs from before booting.
* mce=bios_cmci_threshold Don't program the CMCI threshold
* mce=recovery force enable memcpy_mcsafe()
@@ -1912,12 +1908,13 @@ static void mce_disable_error_reporting(void)
static void vendor_disable_error_reporting(void)
{
/*
- * Don't clear on Intel CPUs. Some of these MSRs are socket-wide.
+ * Don't clear on Intel or AMD CPUs. Some of these MSRs are socket-wide.
* Disabling them for just a single offlined CPU is bad, since it will
* inhibit reporting for all shared resources on the socket like the
* last level cache (LLC), the integrated memory controller (iMC), etc.
*/
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
return;
mce_disable_error_reporting();
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 6e4a047e4b68..9e314bcf67cc 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -164,17 +164,48 @@ static void default_deferred_error_interrupt(void)
}
void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
-static void get_smca_bank_info(unsigned int bank)
+static void smca_configure(unsigned int bank, unsigned int cpu)
{
- unsigned int i, hwid_mcatype, cpu = smp_processor_id();
+ unsigned int i, hwid_mcatype;
struct smca_hwid *s_hwid;
- u32 high, instance_id;
+ u32 high, low;
+ u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank);
+
+ /* Set appropriate bits in MCA_CONFIG */
+ if (!rdmsr_safe(smca_config, &low, &high)) {
+ /*
+ * OS is required to set the MCAX bit to acknowledge that it is
+ * now using the new MSR ranges and new registers under each
+ * bank. It also means that the OS will configure deferred
+ * errors in the new MCx_CONFIG register. If the bit is not set,
+ * uncorrectable errors will cause a system panic.
+ *
+ * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.)
+ */
+ high |= BIT(0);
+
+ /*
+ * SMCA sets the Deferred Error Interrupt type per bank.
+ *
+ * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
+ * if the DeferredIntType bit field is available.
+ *
+ * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
+ * high portion of the MSR). OS should set this to 0x1 to enable
+ * APIC based interrupt. First, check that no interrupt has been
+ * set.
+ */
+ if ((low & BIT(5)) && !((high >> 5) & 0x3))
+ high |= BIT(5);
+
+ wrmsr(smca_config, low, high);
+ }
/* Collect bank_info using CPU 0 for now. */
if (cpu)
return;
- if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instance_id, &high)) {
+ if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
return;
}
@@ -191,7 +222,7 @@ static void get_smca_bank_info(unsigned int bank)
smca_get_name(s_hwid->bank_type));
smca_banks[bank].hwid = s_hwid;
- smca_banks[bank].id = instance_id;
+ smca_banks[bank].id = low;
smca_banks[bank].sysfs_id = s_hwid->count++;
break;
}
@@ -433,7 +464,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
int offset, u32 misc_high)
{
unsigned int cpu = smp_processor_id();
- u32 smca_low, smca_high, smca_addr;
+ u32 smca_low, smca_high;
struct threshold_block b;
int new;
@@ -457,51 +488,6 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
goto set_offset;
}
- smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank);
-
- if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) {
- /*
- * OS is required to set the MCAX bit to acknowledge that it is
- * now using the new MSR ranges and new registers under each
- * bank. It also means that the OS will configure deferred
- * errors in the new MCx_CONFIG register. If the bit is not set,
- * uncorrectable errors will cause a system panic.
- *
- * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.)
- */
- smca_high |= BIT(0);
-
- /*
- * SMCA logs Deferred Error information in MCA_DE{STAT,ADDR}
- * registers with the option of additionally logging to
- * MCA_{STATUS,ADDR} if MCA_CONFIG[LogDeferredInMcaStat] is set.
- *
- * This bit is usually set by BIOS to retain the old behavior
- * for OSes that don't use the new registers. Linux supports the
- * new registers so let's disable that additional logging here.
- *
- * MCA_CONFIG[LogDeferredInMcaStat] is bit 34 (bit 2 in the high
- * portion of the MSR).
- */
- smca_high &= ~BIT(2);
-
- /*
- * SMCA sets the Deferred Error Interrupt type per bank.
- *
- * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
- * if the DeferredIntType bit field is available.
- *
- * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
- * high portion of the MSR). OS should set this to 0x1 to enable
- * APIC based interrupt. First, check that no interrupt has been
- * set.
- */
- if ((smca_low & BIT(5)) && !((smca_high >> 5) & 0x3))
- smca_high |= BIT(5);
-
- wrmsr(smca_addr, smca_low, smca_high);
- }
-
/* Gather LVT offset for thresholding: */
if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high))
goto out;
@@ -530,7 +516,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (mce_flags.smca)
- get_smca_bank_info(bank);
+ smca_configure(bank, cpu);
for (block = 0; block < NR_BLOCKS; ++block) {
address = get_block_address(cpu, address, low, high, bank, block);
@@ -755,37 +741,19 @@ out_err:
}
EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
-static void
-__log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
+static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
{
- u32 msr_status = msr_ops.status(bank);
- u32 msr_addr = msr_ops.addr(bank);
struct mce m;
- u64 status;
-
- WARN_ON_ONCE(deferred_err && threshold_err);
-
- if (deferred_err && mce_flags.smca) {
- msr_status = MSR_AMD64_SMCA_MCx_DESTAT(bank);
- msr_addr = MSR_AMD64_SMCA_MCx_DEADDR(bank);
- }
-
- rdmsrl(msr_status, status);
-
- if (!(status & MCI_STATUS_VAL))
- return;
mce_setup(&m);
m.status = status;
+ m.misc = misc;
m.bank = bank;
m.tsc = rdtsc();
- if (threshold_err)
- m.misc = misc;
-
if (m.status & MCI_STATUS_ADDRV) {
- rdmsrl(msr_addr, m.addr);
+ m.addr = addr;
/*
* Extract [55:<lsb>] where lsb is the least significant
@@ -806,8 +774,6 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
}
mce_log(&m);
-
- wrmsrl(msr_status, 0);
}
static inline void __smp_deferred_error_interrupt(void)
@@ -832,87 +798,126 @@ asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void)
exiting_ack_irq();
}
-/* APIC interrupt handler for deferred errors */
-static void amd_deferred_error_interrupt(void)
+/*
+ * Returns true if the logged error is deferred. False, otherwise.
+ */
+static inline bool
+_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
{
- unsigned int bank;
- u32 msr_status;
- u64 status;
+ u64 status, addr = 0;
- for (bank = 0; bank < mca_cfg.banks; ++bank) {
- msr_status = (mce_flags.smca) ? MSR_AMD64_SMCA_MCx_DESTAT(bank)
- : msr_ops.status(bank);
+ rdmsrl(msr_stat, status);
+ if (!(status & MCI_STATUS_VAL))
+ return false;
- rdmsrl(msr_status, status);
+ if (status & MCI_STATUS_ADDRV)
+ rdmsrl(msr_addr, addr);
- if (!(status & MCI_STATUS_VAL) ||
- !(status & MCI_STATUS_DEFERRED))
- continue;
+ __log_error(bank, status, addr, misc);
- __log_error(bank, true, false, 0);
- break;
- }
+ wrmsrl(msr_stat, 0);
+
+ return status & MCI_STATUS_DEFERRED;
}
/*
- * APIC Interrupt Handler
+ * We have three scenarios for checking for Deferred errors:
+ *
+ * 1) Non-SMCA systems check MCA_STATUS and log error if found.
+ * 2) SMCA systems check MCA_STATUS. If error is found then log it and also
+ * clear MCA_DESTAT.
+ * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and
+ * log it.
*/
+static void log_error_deferred(unsigned int bank)
+{
+ bool defrd;
-/*
- * threshold interrupt handler will service THRESHOLD_APIC_VECTOR.
- * the interrupt goes off when error_count reaches threshold_limit.
- * the handler will simply log mcelog w/ software defined bank number.
- */
+ defrd = _log_error_bank(bank, msr_ops.status(bank),
+ msr_ops.addr(bank), 0);
-static void amd_threshold_interrupt(void)
+ if (!mce_flags.smca)
+ return;
+
+ /* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */
+ if (defrd) {
+ wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
+ return;
+ }
+
+ /*
+ * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check
+ * for a valid error.
+ */
+ _log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank),
+ MSR_AMD64_SMCA_MCx_DEADDR(bank), 0);
+}
+
+/* APIC interrupt handler for deferred errors */
+static void amd_deferred_error_interrupt(void)
{
- u32 low = 0, high = 0, address = 0;
- unsigned int bank, block, cpu = smp_processor_id();
- struct thresh_restart tr;
+ unsigned int bank;
- /* assume first bank caused it */
- for (bank = 0; bank < mca_cfg.banks; ++bank) {
- if (!(per_cpu(bank_map, cpu) & (1 << bank)))
- continue;
- for (block = 0; block < NR_BLOCKS; ++block) {
- address = get_block_address(cpu, address, low, high, bank, block);
- if (!address)
- break;
+ for (bank = 0; bank < mca_cfg.banks; ++bank)
+ log_error_deferred(bank);
+}
- if (rdmsr_safe(address, &low, &high))
- break;
+static void log_error_thresholding(unsigned int bank, u64 misc)
+{
+ _log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc);
+}
- if (!(high & MASK_VALID_HI)) {
- if (block)
- continue;
- else
- break;
- }
+static void log_and_reset_block(struct threshold_block *block)
+{
+ struct thresh_restart tr;
+ u32 low = 0, high = 0;
- if (!(high & MASK_CNTP_HI) ||
- (high & MASK_LOCKED_HI))
- continue;
+ if (!block)
+ return;
- /*
- * Log the machine check that caused the threshold
- * event.
- */
- if (high & MASK_OVERFLOW_HI)
- goto log;
- }
- }
- return;
+ if (rdmsr_safe(block->address, &low, &high))
+ return;
+
+ if (!(high & MASK_OVERFLOW_HI))
+ return;
-log:
- __log_error(bank, false, true, ((u64)high << 32) | low);
+ /* Log the MCE which caused the threshold event. */
+ log_error_thresholding(block->bank, ((u64)high << 32) | low);
/* Reset threshold block after logging error. */
memset(&tr, 0, sizeof(tr));
- tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block];
+ tr.b = block;
threshold_restart_bank(&tr);
}
/*
+ * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt
+ * goes off when error_count reaches threshold_limit.
+ */
+static void amd_threshold_interrupt(void)
+{
+ struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL;
+ unsigned int bank, cpu = smp_processor_id();
+
+ for (bank = 0; bank < mca_cfg.banks; ++bank) {
+ if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+ continue;
+
+ first_block = per_cpu(threshold_banks, cpu)[bank]->blocks;
+ if (!first_block)
+ continue;
+
+ /*
+ * The first block is also the head of the list. Check it first
+ * before iterating over the rest.
+ */
+ log_and_reset_block(first_block);
+ list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj)
+ log_and_reset_block(block);
+ }
+}
+
+/*
* Sysfs Interface
*/
@@ -1202,7 +1207,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
goto out;
per_cpu(threshold_banks, cpu)[bank] = b;
- atomic_inc(&b->cpus);
+ refcount_inc(&b->cpus);
err = __threshold_add_blocks(b);
@@ -1225,7 +1230,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
per_cpu(threshold_banks, cpu)[bank] = b;
if (is_shared_bank(bank)) {
- atomic_set(&b->cpus, 1);
+ refcount_set(&b->cpus, 1);
/* nb is already initialized, see above */
if (nb) {
@@ -1289,7 +1294,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
goto free_out;
if (is_shared_bank(bank)) {
- if (!atomic_dec_and_test(&b->cpus)) {
+ if (!refcount_dec_and_test(&b->cpus)) {
__threshold_remove_blocks(b);
per_cpu(threshold_banks, cpu)[bank] = NULL;
return;
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 45db4d2ebd01..21b185793c80 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -251,7 +251,7 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
#endif
}
-void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret)
+static void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret)
{
struct ucode_cpu_info *uci;
struct cpio_data cp;
@@ -320,7 +320,7 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax)
}
static enum ucode_state
-load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size);
+load_microcode_amd(bool save, u8 family, const u8 *data, size_t size);
int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
{
@@ -338,8 +338,7 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
if (!desc.mc)
return -EINVAL;
- ret = load_microcode_amd(smp_processor_id(), x86_family(cpuid_1_eax),
- desc.data, desc.size);
+ ret = load_microcode_amd(true, x86_family(cpuid_1_eax), desc.data, desc.size);
if (ret != UCODE_OK)
return -EINVAL;
@@ -675,7 +674,7 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
}
static enum ucode_state
-load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
+load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
{
enum ucode_state ret;
@@ -689,8 +688,8 @@ load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
#ifdef CONFIG_X86_32
/* save BSP's matching patch for early load */
- if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) {
- struct ucode_patch *p = find_patch(cpu);
+ if (save) {
+ struct ucode_patch *p = find_patch(0);
if (p) {
memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data),
@@ -722,11 +721,12 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
{
char fw_name[36] = "amd-ucode/microcode_amd.bin";
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ bool bsp = c->cpu_index == boot_cpu_data.cpu_index;
enum ucode_state ret = UCODE_NFOUND;
const struct firmware *fw;
/* reload ucode container only on the boot cpu */
- if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index)
+ if (!refresh_fw || !bsp)
return UCODE_OK;
if (c->x86 >= 0x15)
@@ -743,7 +743,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
goto fw_release;
}
- ret = load_microcode_amd(cpu, c->x86, fw->data, fw->size);
+ ret = load_microcode_amd(bsp, c->x86, fw->data, fw->size);
fw_release:
release_firmware(fw);
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index e53d3c909840..9cb98ee103db 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -290,6 +290,17 @@ struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa)
return (struct cpio_data){ NULL, 0, "" };
if (initrd_start)
start = initrd_start;
+ } else {
+ /*
+ * The picture with physical addresses is a bit different: we
+ * need to get the *physical* address to which the ramdisk was
+ * relocated, i.e., relocated_ramdisk (not initrd_start) and
+ * since we're running from physical addresses, we need to access
+ * relocated_ramdisk through its *physical* address too.
+ */
+ u64 *rr = (u64 *)__pa_nodebug(&relocated_ramdisk);
+ if (*rr)
+ start = *rr;
}
return find_cpio_data(path, (void *)start, size, NULL);
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index afdfd237b59f..59edbe9d4ccb 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -42,7 +42,7 @@
static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
/* Current microcode patch used in early patching on the APs. */
-struct microcode_intel *intel_ucode_patch;
+static struct microcode_intel *intel_ucode_patch;
static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
unsigned int s2, unsigned int p2)
@@ -166,7 +166,7 @@ static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size)
static void save_microcode_patch(void *data, unsigned int size)
{
struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
- struct ucode_patch *iter, *tmp, *p;
+ struct ucode_patch *iter, *tmp, *p = NULL;
bool prev_found = false;
unsigned int sig, pf;
@@ -202,6 +202,18 @@ static void save_microcode_patch(void *data, unsigned int size)
else
list_add_tail(&p->plist, &microcode_cache);
}
+
+ /*
+ * Save for early loading. On 32-bit, that needs to be a physical
+ * address as the APs are running from physical addresses, before
+ * paging has been enabled.
+ */
+ if (p) {
+ if (IS_ENABLED(CONFIG_X86_32))
+ intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data);
+ else
+ intel_ucode_patch = p->data;
+ }
}
static int microcode_sanity_check(void *mc, int print_err)
@@ -607,6 +619,14 @@ int __init save_microcode_in_initrd_intel(void)
struct ucode_cpu_info uci;
struct cpio_data cp;
+ /*
+ * initrd is going away, clear patch ptr. We will scan the microcode one
+ * last time before jettisoning and save a patch, if found. Then we will
+ * update that pointer too, with a stable patch address to use when
+ * resuming the cores.
+ */
+ intel_ucode_patch = NULL;
+
if (!load_builtin_intel_microcode(&cp))
cp = find_microcode_in_initrd(ucode_path, false);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 04cb8d34ccb8..70e717fccdd6 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -161,6 +161,15 @@ static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs)
}
#endif
+static unsigned long hv_get_tsc_khz(void)
+{
+ unsigned long freq;
+
+ rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq);
+
+ return freq / 1000;
+}
+
static void __init ms_hyperv_init_platform(void)
{
int hv_host_info_eax;
@@ -193,8 +202,15 @@ static void __init ms_hyperv_init_platform(void)
hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF);
}
+ if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
+ ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
+ x86_platform.calibrate_tsc = hv_get_tsc_khz;
+ x86_platform.calibrate_cpu = hv_get_tsc_khz;
+ }
+
#ifdef CONFIG_X86_LOCAL_APIC
- if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) {
+ if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
+ ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
/*
* Get the APIC frequency.
*/
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 2bce84d91c2b..c5bb63be4ba1 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -807,10 +807,8 @@ void mtrr_save_state(void)
if (!mtrr_enabled())
return;
- get_online_cpus();
first_cpu = cpumask_first(cpu_online_mask);
smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1);
- put_online_cpus();
}
void set_mtrr_aps_delayed_init(void)
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 6df621ae62a7..218f79825b3c 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -2,7 +2,6 @@
#include <linux/timex.h>
#include <linux/string.h>
#include <linux/seq_file.h>
-#include <linux/cpufreq.h>
/*
* Get CPU information for use by the procfs.
@@ -76,14 +75,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
if (c->microcode)
seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
- if (cpu_has(c, X86_FEATURE_TSC)) {
- unsigned int freq = cpufreq_quick_get(cpu);
-
- if (!freq)
- freq = cpu_khz;
+ if (cpu_has(c, X86_FEATURE_TSC))
seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
- freq / 1000, (freq % 1000));
- }
+ cpu_khz / 1000, (cpu_khz % 1000));
/* Cache size */
if (c->x86_cache_size >= 0)
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 8e598a1ad986..6b91e2eb8d3f 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -125,7 +125,7 @@ void __init init_espfix_bsp(void)
p4d_t *p4d;
/* Install the espfix pud into the kernel page directory */
- pgd = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+ pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)];
p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
p4d_populate(&init_mm, p4d, espfix_pud_page);
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 43b7002f44fb..46c3c73e7f43 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -33,17 +33,120 @@
/*
* Manage page tables very early on.
*/
-extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern pgd_t early_top_pgt[PTRS_PER_PGD];
extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
-static unsigned int __initdata next_early_pgt = 2;
+static unsigned int __initdata next_early_pgt;
pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
+#define __head __section(.head.text)
+
+static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
+{
+ return ptr - (void *)_text + (void *)physaddr;
+}
+
+void __head __startup_64(unsigned long physaddr)
+{
+ unsigned long load_delta, *p;
+ pgdval_t *pgd;
+ p4dval_t *p4d;
+ pudval_t *pud;
+ pmdval_t *pmd, pmd_entry;
+ int i;
+
+ /* Is the address too large? */
+ if (physaddr >> MAX_PHYSMEM_BITS)
+ for (;;);
+
+ /*
+ * Compute the delta between the address I am compiled to run at
+ * and the address I am actually running at.
+ */
+ load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
+
+ /* Is the address not 2M aligned? */
+ if (load_delta & ~PMD_PAGE_MASK)
+ for (;;);
+
+ /* Fixup the physical addresses in the page table */
+
+ pgd = fixup_pointer(&early_top_pgt, physaddr);
+ pgd[pgd_index(__START_KERNEL_map)] += load_delta;
+
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
+ p4d[511] += load_delta;
+ }
+
+ pud = fixup_pointer(&level3_kernel_pgt, physaddr);
+ pud[510] += load_delta;
+ pud[511] += load_delta;
+
+ pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
+ pmd[506] += load_delta;
+
+ /*
+ * Set up the identity mapping for the switchover. These
+ * entries should *NOT* have the global bit set! This also
+ * creates a bunch of nonsense entries but that is fine --
+ * it avoids problems around wraparound.
+ */
+
+ pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
+ pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
+
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
+
+ i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+ pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE;
+ pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE;
+
+ i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;
+ p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
+ p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
+ } else {
+ i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+ pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
+ pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
+ }
+
+ i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;
+ pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE;
+ pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE;
+
+ pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
+ pmd_entry += physaddr;
+
+ for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
+ int idx = i + (physaddr >> PMD_SHIFT) % PTRS_PER_PMD;
+ pmd[idx] = pmd_entry + i * PMD_SIZE;
+ }
+
+ /*
+ * Fixup the kernel text+data virtual addresses. Note that
+ * we might write invalid pmds, when the kernel is relocated
+ * cleanup_highmap() fixes this up along with the mappings
+ * beyond _end.
+ */
+
+ pmd = fixup_pointer(level2_kernel_pgt, physaddr);
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (pmd[i] & _PAGE_PRESENT)
+ pmd[i] += load_delta;
+ }
+
+ /* Fixup phys_base */
+ p = fixup_pointer(&phys_base, physaddr);
+ *p += load_delta;
+}
+
/* Wipe all early page tables except for the kernel symbol map */
static void __init reset_early_page_tables(void)
{
- memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
+ memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
next_early_pgt = 0;
- write_cr3(__pa_nodebug(early_level4_pgt));
+ write_cr3(__pa_nodebug(early_top_pgt));
}
/* Create a new PMD entry */
@@ -51,15 +154,16 @@ int __init early_make_pgtable(unsigned long address)
{
unsigned long physaddr = address - __PAGE_OFFSET;
pgdval_t pgd, *pgd_p;
+ p4dval_t p4d, *p4d_p;
pudval_t pud, *pud_p;
pmdval_t pmd, *pmd_p;
/* Invalid address or early pgt is done ? */
- if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
+ if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
return -1;
again:
- pgd_p = &early_level4_pgt[pgd_index(address)].pgd;
+ pgd_p = &early_top_pgt[pgd_index(address)].pgd;
pgd = *pgd_p;
/*
@@ -67,8 +171,25 @@ again:
* critical -- __PAGE_OFFSET would point us back into the dynamic
* range and we might end up looping forever...
*/
- if (pgd)
- pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ p4d_p = pgd_p;
+ else if (pgd)
+ p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ else {
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+ reset_early_page_tables();
+ goto again;
+ }
+
+ p4d_p = (p4dval_t *)early_dynamic_pgts[next_early_pgt++];
+ memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
+ *pgd_p = (pgdval_t)p4d_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ }
+ p4d_p += p4d_index(address);
+ p4d = *p4d_p;
+
+ if (p4d)
+ pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
else {
if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
reset_early_page_tables();
@@ -77,7 +198,7 @@ again:
pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
- *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ *p4d_p = (p4dval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
}
pud_p += pud_index(address);
pud = *pud_p;
@@ -156,7 +277,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
clear_bss();
- clear_page(init_level4_pgt);
+ clear_page(init_top_pgt);
kasan_early_init();
@@ -171,8 +292,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
*/
load_ucode_bsp();
- /* set init_level4_pgt kernel high mapping*/
- init_level4_pgt[511] = early_level4_pgt[511];
+ /* set init_top_pgt kernel high mapping*/
+ init_top_pgt[511] = early_top_pgt[511];
x86_64_start_reservations(real_mode_data);
}
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index ac9d327d2e42..6225550883df 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -37,10 +37,11 @@
*
*/
+#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1))
#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
-L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
-L4_START_KERNEL = pgd_index(__START_KERNEL_map)
+PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
+PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
L3_START_KERNEL = pud_index(__START_KERNEL_map)
.text
@@ -72,101 +73,12 @@ startup_64:
/* Sanitize CPU configuration */
call verify_cpu
- /*
- * Compute the delta between the address I am compiled to run at and the
- * address I am actually running at.
- */
- leaq _text(%rip), %rbp
- subq $_text - __START_KERNEL_map, %rbp
-
- /* Is the address not 2M aligned? */
- testl $~PMD_PAGE_MASK, %ebp
- jnz bad_address
-
- /*
- * Is the address too large?
- */
- leaq _text(%rip), %rax
- shrq $MAX_PHYSMEM_BITS, %rax
- jnz bad_address
-
- /*
- * Fixup the physical addresses in the page table
- */
- addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
-
- addq %rbp, level3_kernel_pgt + (510*8)(%rip)
- addq %rbp, level3_kernel_pgt + (511*8)(%rip)
-
- addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
-
- /*
- * Set up the identity mapping for the switchover. These
- * entries should *NOT* have the global bit set! This also
- * creates a bunch of nonsense entries but that is fine --
- * it avoids problems around wraparound.
- */
leaq _text(%rip), %rdi
- leaq early_level4_pgt(%rip), %rbx
-
- movq %rdi, %rax
- shrq $PGDIR_SHIFT, %rax
-
- leaq (PAGE_SIZE + _KERNPG_TABLE)(%rbx), %rdx
- movq %rdx, 0(%rbx,%rax,8)
- movq %rdx, 8(%rbx,%rax,8)
-
- addq $PAGE_SIZE, %rdx
- movq %rdi, %rax
- shrq $PUD_SHIFT, %rax
- andl $(PTRS_PER_PUD-1), %eax
- movq %rdx, PAGE_SIZE(%rbx,%rax,8)
- incl %eax
- andl $(PTRS_PER_PUD-1), %eax
- movq %rdx, PAGE_SIZE(%rbx,%rax,8)
-
- addq $PAGE_SIZE * 2, %rbx
- movq %rdi, %rax
- shrq $PMD_SHIFT, %rdi
- addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
- leaq (_end - 1)(%rip), %rcx
- shrq $PMD_SHIFT, %rcx
- subq %rdi, %rcx
- incl %ecx
-
-1:
- andq $(PTRS_PER_PMD - 1), %rdi
- movq %rax, (%rbx,%rdi,8)
- incq %rdi
- addq $PMD_SIZE, %rax
- decl %ecx
- jnz 1b
-
- test %rbp, %rbp
- jz .Lskip_fixup
+ pushq %rsi
+ call __startup_64
+ popq %rsi
- /*
- * Fixup the kernel text+data virtual addresses. Note that
- * we might write invalid pmds, when the kernel is relocated
- * cleanup_highmap() fixes this up along with the mappings
- * beyond _end.
- */
- leaq level2_kernel_pgt(%rip), %rdi
- leaq PAGE_SIZE(%rdi), %r8
- /* See if it is a valid page table entry */
-1: testb $_PAGE_PRESENT, 0(%rdi)
- jz 2f
- addq %rbp, 0(%rdi)
- /* Go to the next page */
-2: addq $8, %rdi
- cmp %r8, %rdi
- jne 1b
-
- /* Fixup phys_base */
- addq %rbp, phys_base(%rip)
-
-.Lskip_fixup:
- movq $(early_level4_pgt - __START_KERNEL_map), %rax
+ movq $(early_top_pgt - __START_KERNEL_map), %rax
jmp 1f
ENTRY(secondary_startup_64)
/*
@@ -186,14 +98,17 @@ ENTRY(secondary_startup_64)
/* Sanitize CPU configuration */
call verify_cpu
- movq $(init_level4_pgt - __START_KERNEL_map), %rax
+ movq $(init_top_pgt - __START_KERNEL_map), %rax
1:
- /* Enable PAE mode and PGE */
+ /* Enable PAE mode, PGE and LA57 */
movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+#ifdef CONFIG_X86_5LEVEL
+ orl $X86_CR4_LA57, %ecx
+#endif
movq %rcx, %cr4
- /* Setup early boot stage 4 level pagetables. */
+ /* Setup early boot stage 4-/5-level pagetables. */
addq phys_base(%rip), %rax
movq %rax, %cr3
@@ -417,9 +332,13 @@ GLOBAL(name)
.endr
__INITDATA
-NEXT_PAGE(early_level4_pgt)
+NEXT_PAGE(early_top_pgt)
.fill 511,8,0
+#ifdef CONFIG_X86_5LEVEL
+ .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+#else
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+#endif
NEXT_PAGE(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -427,14 +346,14 @@ NEXT_PAGE(early_dynamic_pgts)
.data
#ifndef CONFIG_XEN
-NEXT_PAGE(init_level4_pgt)
+NEXT_PAGE(init_top_pgt)
.fill 512,8,0
#else
-NEXT_PAGE(init_level4_pgt)
+NEXT_PAGE(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
+ .org init_top_pgt + PGD_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_START_KERNEL*8, 0
+ .org init_top_pgt + PGD_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
@@ -448,6 +367,12 @@ NEXT_PAGE(level2_ident_pgt)
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
#endif
+#ifdef CONFIG_X86_5LEVEL
+NEXT_PAGE(level4_kernel_pgt)
+ .fill 511,8,0
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+#endif
+
NEXT_PAGE(level3_kernel_pgt)
.fill L3_START_KERNEL,8,0
/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 89ff7af2de50..16f82a3aaec7 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -285,7 +285,7 @@ static void hpet_legacy_clockevent_register(void)
* Start hpet with the boot cpu mask and make it
* global after the IO_APIC has been initialized.
*/
- hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
+ hpet_clockevent.cpumask = cpumask_of(boot_cpu_data.cpu_index);
clockevents_config_and_register(&hpet_clockevent, hpet_freq,
HPET_MIN_PROG_DELTA, 0x7FFFFFFF);
global_clock_event = &hpet_clockevent;
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index f34fe7444836..4aa03c5a14c9 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -432,84 +432,12 @@ int check_irq_vectors_for_cpu_disable(void)
/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
void fixup_irqs(void)
{
- unsigned int irq, vector;
- static int warned;
+ unsigned int irr, vector;
struct irq_desc *desc;
struct irq_data *data;
struct irq_chip *chip;
- int ret;
- for_each_irq_desc(irq, desc) {
- int break_affinity = 0;
- int set_affinity = 1;
- const struct cpumask *affinity;
-
- if (!desc)
- continue;
- if (irq == 2)
- continue;
-
- /* interrupt's are disabled at this point */
- raw_spin_lock(&desc->lock);
-
- data = irq_desc_get_irq_data(desc);
- affinity = irq_data_get_affinity_mask(data);
- if (!irq_has_action(irq) || irqd_is_per_cpu(data) ||
- cpumask_subset(affinity, cpu_online_mask)) {
- raw_spin_unlock(&desc->lock);
- continue;
- }
-
- /*
- * Complete the irq move. This cpu is going down and for
- * non intr-remapping case, we can't wait till this interrupt
- * arrives at this cpu before completing the irq move.
- */
- irq_force_complete_move(desc);
-
- if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
- break_affinity = 1;
- affinity = cpu_online_mask;
- }
-
- chip = irq_data_get_irq_chip(data);
- /*
- * The interrupt descriptor might have been cleaned up
- * already, but it is not yet removed from the radix tree
- */
- if (!chip) {
- raw_spin_unlock(&desc->lock);
- continue;
- }
-
- if (!irqd_can_move_in_process_context(data) && chip->irq_mask)
- chip->irq_mask(data);
-
- if (chip->irq_set_affinity) {
- ret = chip->irq_set_affinity(data, affinity, true);
- if (ret == -ENOSPC)
- pr_crit("IRQ %d set affinity failed because there are no available vectors. The device assigned to this IRQ is unstable.\n", irq);
- } else {
- if (!(warned++))
- set_affinity = 0;
- }
-
- /*
- * We unmask if the irq was not marked masked by the
- * core code. That respects the lazy irq disable
- * behaviour.
- */
- if (!irqd_can_move_in_process_context(data) &&
- !irqd_irq_masked(data) && chip->irq_unmask)
- chip->irq_unmask(data);
-
- raw_spin_unlock(&desc->lock);
-
- if (break_affinity && set_affinity)
- pr_notice("Broke affinity for irq %i\n", irq);
- else if (!set_affinity)
- pr_notice("Cannot set affinity for irq %i\n", irq);
- }
+ irq_migrate_all_off_this_cpu();
/*
* We can remove mdelay() and then send spuriuous interrupts to
@@ -528,8 +456,6 @@ void fixup_irqs(void)
* nothing else will touch it.
*/
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
- unsigned int irr;
-
if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector])))
continue;
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index c37bd0f39c70..ab4f491da2a9 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -105,11 +105,9 @@ static void __jump_label_transform(struct jump_entry *entry,
void arch_jump_label_transform(struct jump_entry *entry,
enum jump_label_type type)
{
- get_online_cpus();
mutex_lock(&text_mutex);
__jump_label_transform(entry, type, NULL, 0);
mutex_unlock(&text_mutex);
- put_online_cpus();
}
static enum {
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 901c640d152f..69ea0bc1cfa3 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -28,6 +28,7 @@
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
#include <linux/ftrace.h>
+#include <linux/frame.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
@@ -94,6 +95,7 @@ static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
}
asm (
+ "optprobe_template_func:\n"
".global optprobe_template_entry\n"
"optprobe_template_entry:\n"
#ifdef CONFIG_X86_64
@@ -131,7 +133,12 @@ asm (
" popf\n"
#endif
".global optprobe_template_end\n"
- "optprobe_template_end:\n");
+ "optprobe_template_end:\n"
+ ".type optprobe_template_func, @function\n"
+ ".size optprobe_template_func, .-optprobe_template_func\n");
+
+void optprobe_template_func(void);
+STACK_FRAME_NON_STANDARD(optprobe_template_func);
#define TMPL_MOVE_IDX \
((long)&optprobe_template_val - (long)&optprobe_template_entry)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index da5c09789984..43e10d6fdbed 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -161,8 +161,8 @@ void kvm_async_pf_task_wait(u32 token)
*/
rcu_irq_exit();
native_safe_halt();
- rcu_irq_enter();
local_irq_disable();
+ rcu_irq_enter();
}
}
if (!n.halted)
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index d4a15831ac58..a870910c8565 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -22,24 +22,25 @@
#include <asm/syscalls.h>
/* context.lock is held for us, so we don't need any locking. */
-static void flush_ldt(void *current_mm)
+static void flush_ldt(void *__mm)
{
+ struct mm_struct *mm = __mm;
mm_context_t *pc;
- if (current->active_mm != current_mm)
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
return;
- pc = &current->active_mm->context;
- set_ldt(pc->ldt->entries, pc->ldt->size);
+ pc = &mm->context;
+ set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
}
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
-static struct ldt_struct *alloc_ldt_struct(unsigned int size)
+static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
{
struct ldt_struct *new_ldt;
unsigned int alloc_size;
- if (size > LDT_ENTRIES)
+ if (num_entries > LDT_ENTRIES)
return NULL;
new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL);
@@ -47,7 +48,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size)
return NULL;
BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
- alloc_size = size * LDT_ENTRY_SIZE;
+ alloc_size = num_entries * LDT_ENTRY_SIZE;
/*
* Xen is very picky: it requires a page-aligned LDT that has no
@@ -65,14 +66,14 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size)
return NULL;
}
- new_ldt->size = size;
+ new_ldt->nr_entries = num_entries;
return new_ldt;
}
/* After calling this, the LDT is immutable. */
static void finalize_ldt_struct(struct ldt_struct *ldt)
{
- paravirt_alloc_ldt(ldt->entries, ldt->size);
+ paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
}
/* context.lock is held */
@@ -91,8 +92,8 @@ static void free_ldt_struct(struct ldt_struct *ldt)
if (likely(!ldt))
return;
- paravirt_free_ldt(ldt->entries, ldt->size);
- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+ paravirt_free_ldt(ldt->entries, ldt->nr_entries);
+ if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE)
vfree_atomic(ldt->entries);
else
free_page((unsigned long)ldt->entries);
@@ -122,14 +123,14 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
goto out_unlock;
}
- new_ldt = alloc_ldt_struct(old_mm->context.ldt->size);
+ new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
if (!new_ldt) {
retval = -ENOMEM;
goto out_unlock;
}
memcpy(new_ldt->entries, old_mm->context.ldt->entries,
- new_ldt->size * LDT_ENTRY_SIZE);
+ new_ldt->nr_entries * LDT_ENTRY_SIZE);
finalize_ldt_struct(new_ldt);
mm->context.ldt = new_ldt;
@@ -152,9 +153,9 @@ void destroy_context_ldt(struct mm_struct *mm)
static int read_ldt(void __user *ptr, unsigned long bytecount)
{
- int retval;
- unsigned long size;
struct mm_struct *mm = current->mm;
+ unsigned long entries_size;
+ int retval;
mutex_lock(&mm->context.lock);
@@ -166,18 +167,18 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
- size = mm->context.ldt->size * LDT_ENTRY_SIZE;
- if (size > bytecount)
- size = bytecount;
+ entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE;
+ if (entries_size > bytecount)
+ entries_size = bytecount;
- if (copy_to_user(ptr, mm->context.ldt->entries, size)) {
+ if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) {
retval = -EFAULT;
goto out_unlock;
}
- if (size != bytecount) {
+ if (entries_size != bytecount) {
/* Zero-fill the rest and pretend we read bytecount bytes. */
- if (clear_user(ptr + size, bytecount - size)) {
+ if (clear_user(ptr + entries_size, bytecount - entries_size)) {
retval = -EFAULT;
goto out_unlock;
}
@@ -208,7 +209,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
{
struct mm_struct *mm = current->mm;
struct ldt_struct *new_ldt, *old_ldt;
- unsigned int oldsize, newsize;
+ unsigned int old_nr_entries, new_nr_entries;
struct user_desc ldt_info;
struct desc_struct ldt;
int error;
@@ -247,17 +248,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
mutex_lock(&mm->context.lock);
- old_ldt = mm->context.ldt;
- oldsize = old_ldt ? old_ldt->size : 0;
- newsize = max(ldt_info.entry_number + 1, oldsize);
+ old_ldt = mm->context.ldt;
+ old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
+ new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries);
error = -ENOMEM;
- new_ldt = alloc_ldt_struct(newsize);
+ new_ldt = alloc_ldt_struct(new_nr_entries);
if (!new_ldt)
goto out_unlock;
if (old_ldt)
- memcpy(new_ldt->entries, old_ldt->entries, oldsize * LDT_ENTRY_SIZE);
+ memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE);
+
new_ldt->entries[ldt_info.entry_number] = ldt;
finalize_ldt_struct(new_ldt);
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 6f5ca4ebe6e5..cb0a30473c23 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -347,7 +347,7 @@ void machine_kexec(struct kimage *image)
void arch_crash_save_vmcoreinfo(void)
{
VMCOREINFO_NUMBER(phys_base);
- VMCOREINFO_SYMBOL(init_level4_pgt);
+ VMCOREINFO_SYMBOL(init_top_pgt);
#ifdef CONFIG_NUMA
VMCOREINFO_SYMBOL(node_data);
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index 6d9582ec0324..d27f8d84c4ff 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -78,7 +78,7 @@ static void __init test_nmi_ipi(struct cpumask *mask)
/* Don't wait longer than a second */
timeout = USEC_PER_SEC;
- while (!cpumask_empty(mask) && timeout--)
+ while (!cpumask_empty(mask) && --timeout)
udelay(1);
/* What happens if we timeout, do we still unregister?? */
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 3586996fc50d..bc0a849589bb 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.read_cr2 = native_read_cr2,
.write_cr2 = native_write_cr2,
- .read_cr3 = native_read_cr3,
+ .read_cr3 = __native_read_cr3,
.write_cr3 = native_write_cr3,
.flush_tlb_user = native_flush_tlb,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 0bb88428cbf2..3ca198080ea9 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -545,17 +545,6 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
}
/*
- * Return saved PC of a blocked thread.
- * What is this good for? it will be always the scheduler or ret_from_fork.
- */
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
- struct inactive_task_frame *frame =
- (struct inactive_task_frame *) READ_ONCE(tsk->thread.sp);
- return READ_ONCE_NOCHECK(frame->ret_addr);
-}
-
-/*
* Called from fs/proc with a reference on @p to find the function
* which called into schedule(). This needs to be done carefully
* because the task might wake up and we might look at a stack
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ff40e74c9181..c6d6dc5f8bb2 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -78,7 +78,7 @@ void __show_regs(struct pt_regs *regs, int all)
printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip);
printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags,
- smp_processor_id());
+ raw_smp_processor_id());
printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
regs->ax, regs->bx, regs->cx, regs->dx);
@@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0();
cr2 = read_cr2();
- cr3 = read_cr3();
+ cr3 = __read_cr3();
cr4 = __read_cr4();
printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
cr0, cr2, cr3, cr4);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b6840bf3940b..c3169be4c596 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -104,7 +104,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0();
cr2 = read_cr2();
- cr3 = read_cr3();
+ cr3 = __read_cr3();
cr4 = __read_cr4();
printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
@@ -142,7 +142,7 @@ void release_thread(struct task_struct *dead_task)
pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
dead_task->comm,
dead_task->mm->context.ldt->entries,
- dead_task->mm->context.ldt->size);
+ dead_task->mm->context.ldt->nr_entries);
BUG();
}
#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 2544700a2a87..67393fc88353 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -9,6 +9,7 @@
#include <linux/sched.h>
#include <linux/tboot.h>
#include <linux/delay.h>
+#include <linux/frame.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
@@ -123,6 +124,7 @@ void __noreturn machine_real_restart(unsigned int type)
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(machine_real_restart);
#endif
+STACK_FRAME_NON_STANDARD(machine_real_restart);
/*
* Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f81823695014..65622f07e633 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -503,7 +503,7 @@ static int __init reserve_crashkernel_low(void)
return 0;
}
- low_base = memblock_find_in_range(low_size, 1ULL << 32, low_size, CRASH_ALIGN);
+ low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN);
if (!low_base) {
pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n",
(unsigned long)(low_size >> 20));
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f04479a8f74f..b474c8de7fba 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -863,7 +863,7 @@ static void announce_cpu(int cpu, int apicid)
if (cpu == 1)
printk(KERN_INFO "x86: Booting SMP configuration:\n");
- if (system_state == SYSTEM_BOOTING) {
+ if (system_state < SYSTEM_RUNNING) {
if (node != current_node) {
if (current_node > (-1))
pr_cont("\n");
@@ -1589,7 +1589,6 @@ void native_cpu_die(unsigned int cpu)
void play_dead_common(void)
{
idle_task_exit();
- reset_lazy_tlbstate();
/* Ack it */
(void)cpu_report_death();
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index f07f83b3611b..5f25cfbd952e 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -34,7 +34,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
mutex_lock(&child->mm->context.lock);
if (unlikely(!child->mm->context.ldt ||
- seg >= child->mm->context.ldt->size))
+ seg >= child->mm->context.ldt->nr_entries))
addr = -1L; /* bogus selector, access would fault */
else {
desc = &child->mm->context.ldt->entries[seg];
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 207b8f2582c7..213ddf3e937d 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -144,7 +144,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (end - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
@@ -187,7 +187,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 4b1724059909..a4eb27918ceb 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -514,7 +514,7 @@ int tboot_force_iommu(void)
if (!tboot_enabled())
return 0;
- if (!intel_iommu_tboot_noforce)
+ if (intel_iommu_tboot_noforce)
return 1;
if (no_iommu || swiotlb || dmar_disabled)
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index d39c09119db6..e0754cdbad37 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -66,7 +66,7 @@ static struct irqaction irq0 = {
.name = "timer"
};
-void __init setup_default_timer_irq(void)
+static void __init setup_default_timer_irq(void)
{
if (!nr_legacy_irqs())
return;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 3995d3a777d4..bf54309b85da 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -182,7 +182,7 @@ int is_valid_bugaddr(unsigned long addr)
return ud == INSN_UD0 || ud == INSN_UD2;
}
-static int fixup_bug(struct pt_regs *regs, int trapnr)
+int fixup_bug(struct pt_regs *regs, int trapnr)
{
if (trapnr != X86_TRAP_UD)
return 0;
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 714dfba6a1e7..796d96bb0821 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -51,115 +51,34 @@ static u32 art_to_tsc_denominator;
static u64 art_to_tsc_offset;
struct clocksource *art_related_clocksource;
-/*
- * Use a ring-buffer like data structure, where a writer advances the head by
- * writing a new data entry and a reader advances the tail when it observes a
- * new entry.
- *
- * Writers are made to wait on readers until there's space to write a new
- * entry.
- *
- * This means that we can always use an {offset, mul} pair to compute a ns
- * value that is 'roughly' in the right direction, even if we're writing a new
- * {offset, mul} pair during the clock read.
- *
- * The down-side is that we can no longer guarantee strict monotonicity anymore
- * (assuming the TSC was that to begin with), because while we compute the
- * intersection point of the two clock slopes and make sure the time is
- * continuous at the point of switching; we can no longer guarantee a reader is
- * strictly before or after the switch point.
- *
- * It does mean a reader no longer needs to disable IRQs in order to avoid
- * CPU-Freq updates messing with his times, and similarly an NMI reader will
- * no longer run the risk of hitting half-written state.
- */
-
struct cyc2ns {
- struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */
- struct cyc2ns_data *head; /* 48 + 8 = 56 */
- struct cyc2ns_data *tail; /* 56 + 8 = 64 */
-}; /* exactly fits one cacheline */
-
-static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
-
-struct cyc2ns_data *cyc2ns_read_begin(void)
-{
- struct cyc2ns_data *head;
-
- preempt_disable();
+ struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */
+ seqcount_t seq; /* 32 + 4 = 36 */
- head = this_cpu_read(cyc2ns.head);
- /*
- * Ensure we observe the entry when we observe the pointer to it.
- * matches the wmb from cyc2ns_write_end().
- */
- smp_read_barrier_depends();
- head->__count++;
- barrier();
-
- return head;
-}
+}; /* fits one cacheline */
-void cyc2ns_read_end(struct cyc2ns_data *head)
-{
- barrier();
- /*
- * If we're the outer most nested read; update the tail pointer
- * when we're done. This notifies possible pending writers
- * that we've observed the head pointer and that the other
- * entry is now free.
- */
- if (!--head->__count) {
- /*
- * x86-TSO does not reorder writes with older reads;
- * therefore once this write becomes visible to another
- * cpu, we must be finished reading the cyc2ns_data.
- *
- * matches with cyc2ns_write_begin().
- */
- this_cpu_write(cyc2ns.tail, head);
- }
- preempt_enable();
-}
+static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
-/*
- * Begin writing a new @data entry for @cpu.
- *
- * Assumes some sort of write side lock; currently 'provided' by the assumption
- * that cpufreq will call its notifiers sequentially.
- */
-static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
+void cyc2ns_read_begin(struct cyc2ns_data *data)
{
- struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
- struct cyc2ns_data *data = c2n->data;
+ int seq, idx;
- if (data == c2n->head)
- data++;
+ preempt_disable_notrace();
- /* XXX send an IPI to @cpu in order to guarantee a read? */
+ do {
+ seq = this_cpu_read(cyc2ns.seq.sequence);
+ idx = seq & 1;
- /*
- * When we observe the tail write from cyc2ns_read_end(),
- * the cpu must be done with that entry and its safe
- * to start writing to it.
- */
- while (c2n->tail == data)
- cpu_relax();
+ data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
+ data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
+ data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
- return data;
+ } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
}
-static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
+void cyc2ns_read_end(void)
{
- struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
-
- /*
- * Ensure the @data writes are visible before we publish the
- * entry. Matches the data-depencency in cyc2ns_read_begin().
- */
- smp_wmb();
-
- ACCESS_ONCE(c2n->head) = data;
+ preempt_enable_notrace();
}
/*
@@ -191,7 +110,6 @@ static void cyc2ns_data_init(struct cyc2ns_data *data)
data->cyc2ns_mul = 0;
data->cyc2ns_shift = 0;
data->cyc2ns_offset = 0;
- data->__count = 0;
}
static void cyc2ns_init(int cpu)
@@ -201,51 +119,29 @@ static void cyc2ns_init(int cpu)
cyc2ns_data_init(&c2n->data[0]);
cyc2ns_data_init(&c2n->data[1]);
- c2n->head = c2n->data;
- c2n->tail = c2n->data;
+ seqcount_init(&c2n->seq);
}
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
- struct cyc2ns_data *data, *tail;
+ struct cyc2ns_data data;
unsigned long long ns;
- /*
- * See cyc2ns_read_*() for details; replicated in order to avoid
- * an extra few instructions that came with the abstraction.
- * Notable, it allows us to only do the __count and tail update
- * dance when its actually needed.
- */
-
- preempt_disable_notrace();
- data = this_cpu_read(cyc2ns.head);
- tail = this_cpu_read(cyc2ns.tail);
-
- if (likely(data == tail)) {
- ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
- } else {
- data->__count++;
-
- barrier();
+ cyc2ns_read_begin(&data);
- ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
+ ns = data.cyc2ns_offset;
+ ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
- barrier();
-
- if (!--data->__count)
- this_cpu_write(cyc2ns.tail, data);
- }
- preempt_enable_notrace();
+ cyc2ns_read_end();
return ns;
}
-static void set_cyc2ns_scale(unsigned long khz, int cpu)
+static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
{
- unsigned long long tsc_now, ns_now;
- struct cyc2ns_data *data;
+ unsigned long long ns_now;
+ struct cyc2ns_data data;
+ struct cyc2ns *c2n;
unsigned long flags;
local_irq_save(flags);
@@ -254,9 +150,6 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
if (!khz)
goto done;
- data = cyc2ns_write_begin(cpu);
-
- tsc_now = rdtsc();
ns_now = cycles_2_ns(tsc_now);
/*
@@ -264,7 +157,7 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
* time function is continuous; see the comment near struct
* cyc2ns_data.
*/
- clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, khz,
+ clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz,
NSEC_PER_MSEC, 0);
/*
@@ -273,20 +166,26 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
* conversion algorithm shifting a 32-bit value (now specifies a 64-bit
* value) - refer perf_event_mmap_page documentation in perf_event.h.
*/
- if (data->cyc2ns_shift == 32) {
- data->cyc2ns_shift = 31;
- data->cyc2ns_mul >>= 1;
+ if (data.cyc2ns_shift == 32) {
+ data.cyc2ns_shift = 31;
+ data.cyc2ns_mul >>= 1;
}
- data->cyc2ns_offset = ns_now -
- mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
+ data.cyc2ns_offset = ns_now -
+ mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift);
- cyc2ns_write_end(cpu, data);
+ c2n = per_cpu_ptr(&cyc2ns, cpu);
+
+ raw_write_seqcount_latch(&c2n->seq);
+ c2n->data[0] = data;
+ raw_write_seqcount_latch(&c2n->seq);
+ c2n->data[1] = data;
done:
- sched_clock_idle_wakeup_event(0);
+ sched_clock_idle_wakeup_event();
local_irq_restore(flags);
}
+
/*
* Scheduler clock - returns current time in nanosec units.
*/
@@ -374,6 +273,8 @@ static int __init tsc_setup(char *str)
tsc_clocksource_reliable = 1;
if (!strncmp(str, "noirqtime", 9))
no_sched_irq_time = 1;
+ if (!strcmp(str, "unstable"))
+ mark_tsc_unstable("boot parameter");
return 1;
}
@@ -986,7 +887,6 @@ void tsc_restore_sched_clock_state(void)
}
#ifdef CONFIG_CPU_FREQ
-
/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
* changes.
*
@@ -1027,7 +927,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
if (!(freq->flags & CPUFREQ_CONST_LOOPS))
mark_tsc_unstable("cpufreq changes");
- set_cyc2ns_scale(tsc_khz, freq->cpu);
+ set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
}
return 0;
@@ -1127,6 +1027,15 @@ static void tsc_cs_mark_unstable(struct clocksource *cs)
pr_info("Marking TSC unstable due to clocksource watchdog\n");
}
+static void tsc_cs_tick_stable(struct clocksource *cs)
+{
+ if (tsc_unstable)
+ return;
+
+ if (using_native_sched_clock())
+ sched_clock_tick_stable();
+}
+
/*
* .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
*/
@@ -1140,6 +1049,7 @@ static struct clocksource clocksource_tsc = {
.archdata = { .vclock_mode = VCLOCK_TSC },
.resume = tsc_resume,
.mark_unstable = tsc_cs_mark_unstable,
+ .tick_stable = tsc_cs_tick_stable,
};
void mark_tsc_unstable(char *reason)
@@ -1255,6 +1165,7 @@ static void tsc_refine_calibration_work(struct work_struct *work)
static int hpet;
u64 tsc_stop, ref_stop, delta;
unsigned long freq;
+ int cpu;
/* Don't bother refining TSC on unstable systems */
if (check_tsc_unstable())
@@ -1305,6 +1216,10 @@ static void tsc_refine_calibration_work(struct work_struct *work)
/* Inform the TSC deadline clockevent devices about the recalibration */
lapic_update_tsc_freq();
+ /* Update the sched_clock() rate to match the clocksource one */
+ for_each_possible_cpu(cpu)
+ set_cyc2ns_scale(tsc_khz, cpu, tsc_stop);
+
out:
if (boot_cpu_has(X86_FEATURE_ART))
art_related_clocksource = &clocksource_tsc;
@@ -1350,7 +1265,7 @@ device_initcall(init_tsc_clocksource);
void __init tsc_init(void)
{
- u64 lpj;
+ u64 lpj, cyc;
int cpu;
if (!boot_cpu_has(X86_FEATURE_TSC)) {
@@ -1390,9 +1305,10 @@ void __init tsc_init(void)
* speed as the bootup CPU. (cpufreq notifiers will fix this
* up if their speed diverges)
*/
+ cyc = rdtsc();
for_each_possible_cpu(cpu) {
cyc2ns_init(cpu);
- set_cyc2ns_scale(tsc_khz, cpu);
+ set_cyc2ns_scale(tsc_khz, cpu, cyc);
}
if (tsc_disabled > 0)
@@ -1412,11 +1328,11 @@ void __init tsc_init(void)
use_tsc_delay();
+ check_system_tsc_reliable();
+
if (unsynchronized_tsc())
mark_tsc_unstable("TSCs unsynchronized");
- check_system_tsc_reliable();
-
detect_art();
}
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 728f75378475..7842371bc9e4 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -71,13 +71,8 @@ static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
* non zero. We don't do that on non boot cpus because physical
* hotplug should have set the ADJUST register to a value > 0 so
* the TSC is in sync with the already running cpus.
- *
- * But we always force positive ADJUST values. Otherwise the TSC
- * deadline timer creates an interrupt storm. We also have to
- * prevent values > 0x7FFFFFFF as those wreckage the timer as well.
*/
- if ((bootcpu && bootval != 0) || (!bootcpu && bootval < 0) ||
- (bootval > 0x7FFFFFFF)) {
+ if (bootcpu && bootval != 0) {
pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", cpu,
bootval);
wrmsrl(MSR_IA32_TSC_ADJUST, 0);
@@ -451,20 +446,6 @@ retry:
*/
cur->adjusted += cur_max_warp;
- /*
- * TSC deadline timer stops working or creates an interrupt storm
- * with adjust values < 0 and > x07ffffff.
- *
- * To allow adjust values > 0x7FFFFFFF we need to disable the
- * deadline timer and use the local APIC timer, but that requires
- * more intrusive changes and we do not have any useful information
- * from Intel about the underlying HW wreckage yet.
- */
- if (cur->adjusted < 0)
- cur->adjusted = 0;
- if (cur->adjusted > 0x7FFFFFFF)
- cur->adjusted = 0x7FFFFFFF;
-
pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n",
cpu, cur_max_warp, cur->adjusted);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index a181ae76c71c..59ca2eea522c 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -780,18 +780,20 @@ out:
static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
{
struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
- int j, nent = vcpu->arch.cpuid_nent;
+ struct kvm_cpuid_entry2 *ej;
+ int j = i;
+ int nent = vcpu->arch.cpuid_nent;
e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
/* when no next entry is found, the current entry[i] is reselected */
- for (j = i + 1; ; j = (j + 1) % nent) {
- struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
- if (ej->function == e->function) {
- ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
- return j;
- }
- }
- return 0; /* silence gcc, even though control never reaches here */
+ do {
+ j = (j + 1) % nent;
+ ej = &vcpu->arch.cpuid_entries[j];
+ } while (ej->function != e->function);
+
+ ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+
+ return j;
}
/* find an entry with matching function, matching index (if needed), and that
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0816ab2e8adc..80890dee66ce 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2742,6 +2742,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
}
+ ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
return X86EMUL_CONTINUE;
}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c329d2894905..d24c8742d9b0 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1495,8 +1495,10 @@ EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
static void cancel_hv_timer(struct kvm_lapic *apic)
{
+ preempt_disable();
kvm_x86_ops->cancel_hv_timer(apic->vcpu);
apic->lapic_timer.hv_timer_in_use = false;
+ preempt_enable();
}
static bool start_hv_timer(struct kvm_lapic *apic)
@@ -1934,7 +1936,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
for (i = 0; i < KVM_APIC_LVT_NUM; i++)
kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
apic_update_lvtt(apic);
- if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
+ if (kvm_vcpu_is_reset_bsp(vcpu) &&
+ kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
kvm_lapic_set_reg(apic, APIC_LVT0,
SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5d3376f67794..cb8225969255 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3698,12 +3698,15 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
}
-static bool can_do_async_pf(struct kvm_vcpu *vcpu)
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
{
if (unlikely(!lapic_in_kernel(vcpu) ||
kvm_event_needs_reinjection(vcpu)))
return false;
+ if (is_guest_mode(vcpu))
+ return false;
+
return kvm_x86_ops->interrupt_allowed(vcpu);
}
@@ -3719,7 +3722,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
if (!async)
return false; /* *pfn has correct page already */
- if (!prefault && can_do_async_pf(vcpu)) {
+ if (!prefault && kvm_can_do_async_pf(vcpu)) {
trace_kvm_try_async_get_page(gva, gfn);
if (kvm_find_async_pf_gfn(vcpu, gfn)) {
trace_kvm_async_pf_doublefault(gva, gfn);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 27975807cc64..330bf3a811fb 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -76,6 +76,7 @@ int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct);
void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
bool accessed_dirty);
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
{
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 183ddb235fb4..33460fcdeef9 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -36,6 +36,7 @@
#include <linux/slab.h>
#include <linux/amd-iommu.h>
#include <linux/hashtable.h>
+#include <linux/frame.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
@@ -1807,7 +1808,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
* AMD's VMCB does not have an explicit unusable field, so emulate it
* for cross vendor migration purposes by "not present"
*/
- var->unusable = !var->present || (var->type == 0);
+ var->unusable = !var->present;
switch (seg) {
case VCPU_SREG_TR:
@@ -1840,6 +1841,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
*/
if (var->unusable)
var->db = 0;
+ /* This is symmetric with svm_set_segment() */
var->dpl = to_svm(vcpu)->vmcb->save.cpl;
break;
}
@@ -1980,18 +1982,14 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
s->base = var->base;
s->limit = var->limit;
s->selector = var->selector;
- if (var->unusable)
- s->attrib = 0;
- else {
- s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
- s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
- s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
- s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
- s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
- s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
- s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
- s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
- }
+ s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
+ s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
+ s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
+ s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
+ s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
+ s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
+ s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
+ s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
/*
* This is always accurate, except if SYSRET returned to a segment
@@ -2000,7 +1998,8 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
* would entail passing the CPL to userspace and back.
*/
if (seg == VCPU_SREG_SS)
- svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
+ /* This is symmetric with svm_get_segment() */
+ svm->vmcb->save.cpl = (var->dpl & 3);
mark_dirty(svm->vmcb, VMCB_SEG);
}
@@ -4908,6 +4907,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
mark_all_clean(svm->vmcb);
}
+STACK_FRAME_NON_STANDARD(svm_vcpu_run);
static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
{
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 72f78396bc09..6dcc4873e435 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -33,6 +33,7 @@
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/hrtimer.h>
+#include <linux/frame.h>
#include "kvm_cache_regs.h"
#include "x86.h"
@@ -48,6 +49,7 @@
#include <asm/kexec.h>
#include <asm/apic.h>
#include <asm/irq_remapping.h>
+#include <asm/mmu_context.h>
#include "trace.h"
#include "pmu.h"
@@ -596,6 +598,7 @@ struct vcpu_vmx {
int gs_ldt_reload_needed;
int fs_reload_needed;
u64 msr_host_bndcfgs;
+ unsigned long vmcs_host_cr3; /* May not match real cr3 */
unsigned long vmcs_host_cr4; /* May not match real cr4 */
} host_state;
struct {
@@ -2425,7 +2428,7 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr)
if (!(vmcs12->exception_bitmap & (1u << nr)))
return 0;
- nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
vmcs_read32(VM_EXIT_INTR_INFO),
vmcs_readl(EXIT_QUALIFICATION));
return 1;
@@ -5012,12 +5015,19 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
u32 low32, high32;
unsigned long tmpl;
struct desc_ptr dt;
- unsigned long cr0, cr4;
+ unsigned long cr0, cr3, cr4;
cr0 = read_cr0();
WARN_ON(cr0 & X86_CR0_TS);
vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
- vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
+
+ /*
+ * Save the most likely value for this task's CR3 in the VMCS.
+ * We can't use __get_current_cr3_fast() because we're not atomic.
+ */
+ cr3 = __read_cr3();
+ vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
+ vmx->host_state.vmcs_host_cr3 = cr3;
/* Save the most likely value for this task's CR4 in the VMCS. */
cr4 = cr4_read_shadow();
@@ -6914,97 +6924,21 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
return 0;
}
-/*
- * This function performs the various checks including
- * - if it's 4KB aligned
- * - No bits beyond the physical address width are set
- * - Returns 0 on success or else 1
- * (Intel SDM Section 30.3)
- */
-static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
- gpa_t *vmpointer)
+static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
{
gva_t gva;
- gpa_t vmptr;
struct x86_exception e;
- struct page *page;
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- int maxphyaddr = cpuid_maxphyaddr(vcpu);
if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
return 1;
- if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
- sizeof(vmptr), &e)) {
+ if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, vmpointer,
+ sizeof(*vmpointer), &e)) {
kvm_inject_page_fault(vcpu, &e);
return 1;
}
- switch (exit_reason) {
- case EXIT_REASON_VMON:
- /*
- * SDM 3: 24.11.5
- * The first 4 bytes of VMXON region contain the supported
- * VMCS revision identifier
- *
- * Note - IA32_VMX_BASIC[48] will never be 1
- * for the nested case;
- * which replaces physical address width with 32
- *
- */
- if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
- nested_vmx_failInvalid(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
- }
-
- page = nested_get_page(vcpu, vmptr);
- if (page == NULL) {
- nested_vmx_failInvalid(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
- }
- if (*(u32 *)kmap(page) != VMCS12_REVISION) {
- kunmap(page);
- nested_release_page_clean(page);
- nested_vmx_failInvalid(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
- }
- kunmap(page);
- nested_release_page_clean(page);
- vmx->nested.vmxon_ptr = vmptr;
- break;
- case EXIT_REASON_VMCLEAR:
- if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
- nested_vmx_failValid(vcpu,
- VMXERR_VMCLEAR_INVALID_ADDRESS);
- return kvm_skip_emulated_instruction(vcpu);
- }
-
- if (vmptr == vmx->nested.vmxon_ptr) {
- nested_vmx_failValid(vcpu,
- VMXERR_VMCLEAR_VMXON_POINTER);
- return kvm_skip_emulated_instruction(vcpu);
- }
- break;
- case EXIT_REASON_VMPTRLD:
- if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
- nested_vmx_failValid(vcpu,
- VMXERR_VMPTRLD_INVALID_ADDRESS);
- return kvm_skip_emulated_instruction(vcpu);
- }
-
- if (vmptr == vmx->nested.vmxon_ptr) {
- nested_vmx_failValid(vcpu,
- VMXERR_VMPTRLD_VMXON_POINTER);
- return kvm_skip_emulated_instruction(vcpu);
- }
- break;
- default:
- return 1; /* shouldn't happen */
- }
-
- if (vmpointer)
- *vmpointer = vmptr;
return 0;
}
@@ -7066,6 +7000,8 @@ out_msr_bitmap:
static int handle_vmon(struct kvm_vcpu *vcpu)
{
int ret;
+ gpa_t vmptr;
+ struct page *page;
struct vcpu_vmx *vmx = to_vmx(vcpu);
const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
@@ -7095,9 +7031,37 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
return 1;
}
- if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL))
+ if (nested_vmx_get_vmptr(vcpu, &vmptr))
return 1;
-
+
+ /*
+ * SDM 3: 24.11.5
+ * The first 4 bytes of VMXON region contain the supported
+ * VMCS revision identifier
+ *
+ * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
+ * which replaces physical address width with 32
+ */
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
+ nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ page = nested_get_page(vcpu, vmptr);
+ if (page == NULL) {
+ nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ if (*(u32 *)kmap(page) != VMCS12_REVISION) {
+ kunmap(page);
+ nested_release_page_clean(page);
+ nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ kunmap(page);
+ nested_release_page_clean(page);
+
+ vmx->nested.vmxon_ptr = vmptr;
ret = enter_vmx_operation(vcpu);
if (ret)
return ret;
@@ -7213,9 +7177,19 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr))
+ if (nested_vmx_get_vmptr(vcpu, &vmptr))
return 1;
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
+ nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ if (vmptr == vmx->nested.vmxon_ptr) {
+ nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
if (vmptr == vmx->nested.current_vmptr)
nested_release_vmcs12(vmx);
@@ -7545,9 +7519,19 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr))
+ if (nested_vmx_get_vmptr(vcpu, &vmptr))
return 1;
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
+ nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ if (vmptr == vmx->nested.vmxon_ptr) {
+ nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
if (vmx->nested.current_vmptr != vmptr) {
struct vmcs12 *new_vmcs12;
struct page *page;
@@ -7913,11 +7897,13 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
{
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
int cr = exit_qualification & 15;
- int reg = (exit_qualification >> 8) & 15;
- unsigned long val = kvm_register_readl(vcpu, reg);
+ int reg;
+ unsigned long val;
switch ((exit_qualification >> 4) & 3) {
case 0: /* mov to cr */
+ reg = (exit_qualification >> 8) & 15;
+ val = kvm_register_readl(vcpu, reg);
switch (cr) {
case 0:
if (vmcs12->cr0_guest_host_mask &
@@ -7972,6 +7958,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
* lmsw can change bits 1..3 of cr0, and only set bit 0 of
* cr0. Other attempted changes are ignored, with no exit.
*/
+ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
if (vmcs12->cr0_guest_host_mask & 0xe &
(val ^ vmcs12->cr0_read_shadow))
return true;
@@ -8675,6 +8662,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
);
}
}
+STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
static bool vmx_has_high_real_mode_segbase(void)
{
@@ -8843,7 +8831,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long debugctlmsr, cr4;
+ unsigned long debugctlmsr, cr3, cr4;
/* Don't enter VMX if guest state is invalid, let the exit handler
start emulation until we arrive back to a valid state */
@@ -8865,6 +8853,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+ cr3 = __get_current_cr3_fast();
+ if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) {
+ vmcs_writel(HOST_CR3, cr3);
+ vmx->host_state.vmcs_host_cr3 = cr3;
+ }
+
cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
vmcs_writel(HOST_CR4, cr4);
@@ -9051,6 +9045,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
}
+STACK_FRAME_NON_STANDARD(vmx_vcpu_run);
static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 02363e37d4a6..0e846f0cb83b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5313,6 +5313,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
ctxt->eflags = kvm_get_rflags(vcpu);
+ ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
+
ctxt->eip = kvm_rip_read(vcpu);
ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
(ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
@@ -5528,36 +5530,25 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
return dr6;
}
-static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r)
+static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
{
struct kvm_run *kvm_run = vcpu->run;
- /*
- * rflags is the old, "raw" value of the flags. The new value has
- * not been saved yet.
- *
- * This is correct even for TF set by the guest, because "the
- * processor will not generate this exception after the instruction
- * that sets the TF flag".
- */
- if (unlikely(rflags & X86_EFLAGS_TF)) {
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
- kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 |
- DR6_RTM;
- kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
- kvm_run->debug.arch.exception = DB_VECTOR;
- kvm_run->exit_reason = KVM_EXIT_DEBUG;
- *r = EMULATE_USER_EXIT;
- } else {
- /*
- * "Certain debug exceptions may clear bit 0-3. The
- * remaining contents of the DR6 register are never
- * cleared by the processor".
- */
- vcpu->arch.dr6 &= ~15;
- vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
- kvm_queue_exception(vcpu, DB_VECTOR);
- }
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+ kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
+ kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ *r = EMULATE_USER_EXIT;
+ } else {
+ /*
+ * "Certain debug exceptions may clear bit 0-3. The
+ * remaining contents of the DR6 register are never
+ * cleared by the processor".
+ */
+ vcpu->arch.dr6 &= ~15;
+ vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
+ kvm_queue_exception(vcpu, DB_VECTOR);
}
}
@@ -5567,7 +5558,17 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
int r = EMULATE_DONE;
kvm_x86_ops->skip_emulated_instruction(vcpu);
- kvm_vcpu_check_singlestep(vcpu, rflags, &r);
+
+ /*
+ * rflags is the old, "raw" value of the flags. The new value has
+ * not been saved yet.
+ *
+ * This is correct even for TF set by the guest, because "the
+ * processor will not generate this exception after the instruction
+ * that sets the TF flag".
+ */
+ if (unlikely(rflags & X86_EFLAGS_TF))
+ kvm_vcpu_do_singlestep(vcpu, &r);
return r == EMULATE_DONE;
}
EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
@@ -5726,8 +5727,9 @@ restart:
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
kvm_rip_write(vcpu, ctxt->eip);
- if (r == EMULATE_DONE)
- kvm_vcpu_check_singlestep(vcpu, rflags, &r);
+ if (r == EMULATE_DONE &&
+ (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
+ kvm_vcpu_do_singlestep(vcpu, &r);
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP)
__kvm_set_rflags(vcpu, ctxt->eflags);
@@ -8394,10 +8396,13 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
if (vcpu->arch.pv.pv_unhalted)
return true;
- if (atomic_read(&vcpu->arch.nmi_queued))
+ if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+ (vcpu->arch.nmi_pending &&
+ kvm_x86_ops->nmi_allowed(vcpu)))
return true;
- if (kvm_test_request(KVM_REQ_SMI, vcpu))
+ if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
+ (vcpu->arch.smi_pending && !is_smm(vcpu)))
return true;
if (kvm_arch_interrupt_allowed(vcpu) &&
@@ -8604,8 +8609,7 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
return true;
else
- return !kvm_event_needs_reinjection(vcpu) &&
- kvm_x86_ops->interrupt_allowed(vcpu);
+ return kvm_can_do_async_pf(vcpu);
}
void kvm_arch_start_assignment(struct kvm *kvm)
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index c5959576c315..020f75cc8cf6 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -37,7 +37,7 @@ ENTRY(copy_user_generic_unrolled)
movl %edx,%ecx
andl $63,%edx
shrl $6,%ecx
- jz 17f
+ jz .L_copy_short_string
1: movq (%rsi),%r8
2: movq 1*8(%rsi),%r9
3: movq 2*8(%rsi),%r10
@@ -58,7 +58,8 @@ ENTRY(copy_user_generic_unrolled)
leaq 64(%rdi),%rdi
decl %ecx
jnz 1b
-17: movl %edx,%ecx
+.L_copy_short_string:
+ movl %edx,%ecx
andl $7,%edx
shrl $3,%ecx
jz 20f
@@ -174,6 +175,8 @@ EXPORT_SYMBOL(copy_user_generic_string)
*/
ENTRY(copy_user_enhanced_fast_string)
ASM_STAC
+ cmpl $64,%edx
+ jb .L_copy_short_string /* less then 64 bytes, avoid the costly 'rep' */
movl %edx,%ecx
1: rep
movsb
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index c81556409bbb..10ffa7e8519f 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -13,14 +13,14 @@
.macro op_safe_regs op
ENTRY(\op\()_safe_regs)
pushq %rbx
- pushq %rbp
+ pushq %r12
movq %rdi, %r10 /* Save pointer */
xorl %r11d, %r11d /* Return value */
movl (%rdi), %eax
movl 4(%rdi), %ecx
movl 8(%rdi), %edx
movl 12(%rdi), %ebx
- movl 20(%rdi), %ebp
+ movl 20(%rdi), %r12d
movl 24(%rdi), %esi
movl 28(%rdi), %edi
1: \op
@@ -29,10 +29,10 @@ ENTRY(\op\()_safe_regs)
movl %ecx, 4(%r10)
movl %edx, 8(%r10)
movl %ebx, 12(%r10)
- movl %ebp, 20(%r10)
+ movl %r12d, 20(%r10)
movl %esi, 24(%r10)
movl %edi, 28(%r10)
- popq %rbp
+ popq %r12
popq %rbx
ret
3:
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 767be7c76034..12e377184ee4 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -1009,7 +1009,7 @@ GrpTable: Grp15
1: fxstor | RDGSBASE Ry (F3),(11B)
2: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B)
3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B)
-4: XSAVE
+4: XSAVE | ptwrite Ey (F3),(11B)
5: XRSTOR | lfence (11B)
6: XSAVEOPT | clwb (66) | mfence (11B)
7: clflush | clflushopt (66) | sfence (11B)
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h
index 5e044d506b7a..a179254a5122 100644
--- a/arch/x86/math-emu/fpu_system.h
+++ b/arch/x86/math-emu/fpu_system.h
@@ -27,7 +27,7 @@ static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg)
#ifdef CONFIG_MODIFY_LDT_SYSCALL
seg >>= 3;
mutex_lock(&current->mm->context.lock);
- if (current->mm->context.ldt && seg < current->mm->context.ldt->size)
+ if (current->mm->context.ldt && seg < current->mm->context.ldt->nr_entries)
ret = current->mm->context.ldt->entries[seg];
mutex_unlock(&current->mm->context.lock);
#endif
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 96d2b847e09e..0fbdcb64f9f8 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -2,7 +2,7 @@
KCOV_INSTRUMENT_tlb.o := n
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
- pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o
+ pat.o pgtable.o physaddr.o setup_nx.o tlb.o
# Make sure __phys_addr has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index bce6990b1d81..0470826d2bdc 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -431,7 +431,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
bool checkwx)
{
#ifdef CONFIG_X86_64
- pgd_t *start = (pgd_t *) &init_level4_pgt;
+ pgd_t *start = (pgd_t *) &init_top_pgt;
#else
pgd_t *start = swapper_pg_dir;
#endif
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 35ea061010a1..0ea8afcb929c 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -162,6 +162,9 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
if (fixup_exception(regs, trapnr))
return;
+ if (fixup_bug(regs, trapnr))
+ return;
+
fail:
early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n",
(unsigned)trapnr, (unsigned long)regs->cs, regs->ip,
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8ad91a01cbc8..2a1fa10c6a98 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -346,7 +346,7 @@ static noinline int vmalloc_fault(unsigned long address)
* Do _not_ use "current" here. We might be inside
* an interrupt in the middle of a task switch..
*/
- pgd_paddr = read_cr3();
+ pgd_paddr = read_cr3_pa();
pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
if (!pmd_k)
return -1;
@@ -388,7 +388,7 @@ static bool low_pfn(unsigned long pfn)
static void dump_pagetable(unsigned long address)
{
- pgd_t *base = __va(read_cr3());
+ pgd_t *base = __va(read_cr3_pa());
pgd_t *pgd = &base[pgd_index(address)];
p4d_t *p4d;
pud_t *pud;
@@ -451,7 +451,7 @@ static noinline int vmalloc_fault(unsigned long address)
* happen within a race in page table update. In the later
* case just flush:
*/
- pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address);
+ pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
pgd_ref = pgd_offset_k(address);
if (pgd_none(*pgd_ref))
return -1;
@@ -555,7 +555,7 @@ static int bad_address(void *p)
static void dump_pagetable(unsigned long address)
{
- pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+ pgd_t *base = __va(read_cr3_pa());
pgd_t *pgd = base + pgd_index(address);
p4d_t *p4d;
pud_t *pud;
@@ -700,7 +700,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
pgd_t *pgd;
pte_t *pte;
- pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+ pgd = __va(read_cr3_pa());
pgd += pgd_index(address);
pte = lookup_address_in_pgd(pgd, address, &level);
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
deleted file mode 100644
index 456dfdfd2249..000000000000
--- a/arch/x86/mm/gup.c
+++ /dev/null
@@ -1,496 +0,0 @@
-/*
- * Lockless get_user_pages_fast for x86
- *
- * Copyright (C) 2008 Nick Piggin
- * Copyright (C) 2008 Novell Inc.
- */
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/vmstat.h>
-#include <linux/highmem.h>
-#include <linux/swap.h>
-#include <linux/memremap.h>
-
-#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
-
-static inline pte_t gup_get_pte(pte_t *ptep)
-{
-#ifndef CONFIG_X86_PAE
- return READ_ONCE(*ptep);
-#else
- /*
- * With get_user_pages_fast, we walk down the pagetables without taking
- * any locks. For this we would like to load the pointers atomically,
- * but that is not possible (without expensive cmpxchg8b) on PAE. What
- * we do have is the guarantee that a pte will only either go from not
- * present to present, or present to not present or both -- it will not
- * switch to a completely different present page without a TLB flush in
- * between; something that we are blocking by holding interrupts off.
- *
- * Setting ptes from not present to present goes:
- * ptep->pte_high = h;
- * smp_wmb();
- * ptep->pte_low = l;
- *
- * And present to not present goes:
- * ptep->pte_low = 0;
- * smp_wmb();
- * ptep->pte_high = 0;
- *
- * We must ensure here that the load of pte_low sees l iff pte_high
- * sees h. We load pte_high *after* loading pte_low, which ensures we
- * don't see an older value of pte_high. *Then* we recheck pte_low,
- * which ensures that we haven't picked up a changed pte high. We might
- * have got rubbish values from pte_low and pte_high, but we are
- * guaranteed that pte_low will not have the present bit set *unless*
- * it is 'l'. And get_user_pages_fast only operates on present ptes, so
- * we're safe.
- *
- * gup_get_pte should not be used or copied outside gup.c without being
- * very careful -- it does not atomically load the pte or anything that
- * is likely to be useful for you.
- */
- pte_t pte;
-
-retry:
- pte.pte_low = ptep->pte_low;
- smp_rmb();
- pte.pte_high = ptep->pte_high;
- smp_rmb();
- if (unlikely(pte.pte_low != ptep->pte_low))
- goto retry;
-
- return pte;
-#endif
-}
-
-static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
-{
- while ((*nr) - nr_start) {
- struct page *page = pages[--(*nr)];
-
- ClearPageReferenced(page);
- put_page(page);
- }
-}
-
-/*
- * 'pteval' can come from a pte, pmd, pud or p4d. We only check
- * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
- * same value on all 4 types.
- */
-static inline int pte_allows_gup(unsigned long pteval, int write)
-{
- unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
-
- if (write)
- need_pte_bits |= _PAGE_RW;
-
- if ((pteval & need_pte_bits) != need_pte_bits)
- return 0;
-
- /* Check memory protection keys permissions. */
- if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write))
- return 0;
-
- return 1;
-}
-
-/*
- * The performance critical leaf functions are made noinline otherwise gcc
- * inlines everything into a single function which results in too much
- * register pressure.
- */
-static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- struct dev_pagemap *pgmap = NULL;
- int nr_start = *nr, ret = 0;
- pte_t *ptep, *ptem;
-
- /*
- * Keep the original mapped PTE value (ptem) around since we
- * might increment ptep off the end of the page when finishing
- * our loop iteration.
- */
- ptem = ptep = pte_offset_map(&pmd, addr);
- do {
- pte_t pte = gup_get_pte(ptep);
- struct page *page;
-
- /* Similar to the PMD case, NUMA hinting must take slow path */
- if (pte_protnone(pte))
- break;
-
- if (!pte_allows_gup(pte_val(pte), write))
- break;
-
- if (pte_devmap(pte)) {
- pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
- if (unlikely(!pgmap)) {
- undo_dev_pagemap(nr, nr_start, pages);
- break;
- }
- } else if (pte_special(pte))
- break;
-
- VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
- page = pte_page(pte);
- get_page(page);
- put_dev_pagemap(pgmap);
- SetPageReferenced(page);
- pages[*nr] = page;
- (*nr)++;
-
- } while (ptep++, addr += PAGE_SIZE, addr != end);
- if (addr == end)
- ret = 1;
- pte_unmap(ptem);
-
- return ret;
-}
-
-static inline void get_head_page_multiple(struct page *page, int nr)
-{
- VM_BUG_ON_PAGE(page != compound_head(page), page);
- VM_BUG_ON_PAGE(page_count(page) == 0, page);
- page_ref_add(page, nr);
- SetPageReferenced(page);
-}
-
-static int __gup_device_huge(unsigned long pfn, unsigned long addr,
- unsigned long end, struct page **pages, int *nr)
-{
- int nr_start = *nr;
- struct dev_pagemap *pgmap = NULL;
-
- do {
- struct page *page = pfn_to_page(pfn);
-
- pgmap = get_dev_pagemap(pfn, pgmap);
- if (unlikely(!pgmap)) {
- undo_dev_pagemap(nr, nr_start, pages);
- return 0;
- }
- SetPageReferenced(page);
- pages[*nr] = page;
- get_page(page);
- put_dev_pagemap(pgmap);
- (*nr)++;
- pfn++;
- } while (addr += PAGE_SIZE, addr != end);
- return 1;
-}
-
-static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
- unsigned long end, struct page **pages, int *nr)
-{
- unsigned long fault_pfn;
-
- fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- return __gup_device_huge(fault_pfn, addr, end, pages, nr);
-}
-
-static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
- unsigned long end, struct page **pages, int *nr)
-{
- unsigned long fault_pfn;
-
- fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- return __gup_device_huge(fault_pfn, addr, end, pages, nr);
-}
-
-static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- struct page *head, *page;
- int refs;
-
- if (!pte_allows_gup(pmd_val(pmd), write))
- return 0;
-
- VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
- if (pmd_devmap(pmd))
- return __gup_device_huge_pmd(pmd, addr, end, pages, nr);
-
- /* hugepages are never "special" */
- VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
-
- refs = 0;
- head = pmd_page(pmd);
- page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- do {
- VM_BUG_ON_PAGE(compound_head(page) != head, page);
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
- get_head_page_multiple(head, refs);
-
- return 1;
-}
-
-static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
-{
- unsigned long next;
- pmd_t *pmdp;
-
- pmdp = pmd_offset(&pud, addr);
- do {
- pmd_t pmd = *pmdp;
-
- next = pmd_addr_end(addr, end);
- if (pmd_none(pmd))
- return 0;
- if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
- /*
- * NUMA hinting faults need to be handled in the GUP
- * slowpath for accounting purposes and so that they
- * can be serialised against THP migration.
- */
- if (pmd_protnone(pmd))
- return 0;
- if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
- return 0;
- } else {
- if (!gup_pte_range(pmd, addr, next, write, pages, nr))
- return 0;
- }
- } while (pmdp++, addr = next, addr != end);
-
- return 1;
-}
-
-static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- struct page *head, *page;
- int refs;
-
- if (!pte_allows_gup(pud_val(pud), write))
- return 0;
-
- VM_BUG_ON(!pfn_valid(pud_pfn(pud)));
- if (pud_devmap(pud))
- return __gup_device_huge_pud(pud, addr, end, pages, nr);
-
- /* hugepages are never "special" */
- VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
-
- refs = 0;
- head = pud_page(pud);
- page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- do {
- VM_BUG_ON_PAGE(compound_head(page) != head, page);
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
- get_head_page_multiple(head, refs);
-
- return 1;
-}
-
-static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
-{
- unsigned long next;
- pud_t *pudp;
-
- pudp = pud_offset(&p4d, addr);
- do {
- pud_t pud = *pudp;
-
- next = pud_addr_end(addr, end);
- if (pud_none(pud))
- return 0;
- if (unlikely(pud_large(pud))) {
- if (!gup_huge_pud(pud, addr, next, write, pages, nr))
- return 0;
- } else {
- if (!gup_pmd_range(pud, addr, next, write, pages, nr))
- return 0;
- }
- } while (pudp++, addr = next, addr != end);
-
- return 1;
-}
-
-static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
-{
- unsigned long next;
- p4d_t *p4dp;
-
- p4dp = p4d_offset(&pgd, addr);
- do {
- p4d_t p4d = *p4dp;
-
- next = p4d_addr_end(addr, end);
- if (p4d_none(p4d))
- return 0;
- BUILD_BUG_ON(p4d_large(p4d));
- if (!gup_pud_range(p4d, addr, next, write, pages, nr))
- return 0;
- } while (p4dp++, addr = next, addr != end);
-
- return 1;
-}
-
-/*
- * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
- * back to the regular GUP.
- */
-int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
-{
- struct mm_struct *mm = current->mm;
- unsigned long addr, len, end;
- unsigned long next;
- unsigned long flags;
- pgd_t *pgdp;
- int nr = 0;
-
- start &= PAGE_MASK;
- addr = start;
- len = (unsigned long) nr_pages << PAGE_SHIFT;
- end = start + len;
- if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
- (void __user *)start, len)))
- return 0;
-
- /*
- * XXX: batch / limit 'nr', to avoid large irq off latency
- * needs some instrumenting to determine the common sizes used by
- * important workloads (eg. DB2), and whether limiting the batch size
- * will decrease performance.
- *
- * It seems like we're in the clear for the moment. Direct-IO is
- * the main guy that batches up lots of get_user_pages, and even
- * they are limited to 64-at-a-time which is not so many.
- */
- /*
- * This doesn't prevent pagetable teardown, but does prevent
- * the pagetables and pages from being freed on x86.
- *
- * So long as we atomically load page table pointers versus teardown
- * (which we do on x86, with the above PAE exception), we can follow the
- * address down to the the page and take a ref on it.
- */
- local_irq_save(flags);
- pgdp = pgd_offset(mm, addr);
- do {
- pgd_t pgd = *pgdp;
-
- next = pgd_addr_end(addr, end);
- if (pgd_none(pgd))
- break;
- if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
- break;
- } while (pgdp++, addr = next, addr != end);
- local_irq_restore(flags);
-
- return nr;
-}
-
-/**
- * get_user_pages_fast() - pin user pages in memory
- * @start: starting user address
- * @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long.
- *
- * Attempt to pin user pages in memory without taking mm->mmap_sem.
- * If not successful, it will fall back to taking the lock and
- * calling get_user_pages().
- *
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno.
- */
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
-{
- struct mm_struct *mm = current->mm;
- unsigned long addr, len, end;
- unsigned long next;
- pgd_t *pgdp;
- int nr = 0;
-
- start &= PAGE_MASK;
- addr = start;
- len = (unsigned long) nr_pages << PAGE_SHIFT;
-
- end = start + len;
- if (end < start)
- goto slow_irqon;
-
-#ifdef CONFIG_X86_64
- if (end >> __VIRTUAL_MASK_SHIFT)
- goto slow_irqon;
-#endif
-
- /*
- * XXX: batch / limit 'nr', to avoid large irq off latency
- * needs some instrumenting to determine the common sizes used by
- * important workloads (eg. DB2), and whether limiting the batch size
- * will decrease performance.
- *
- * It seems like we're in the clear for the moment. Direct-IO is
- * the main guy that batches up lots of get_user_pages, and even
- * they are limited to 64-at-a-time which is not so many.
- */
- /*
- * This doesn't prevent pagetable teardown, but does prevent
- * the pagetables and pages from being freed on x86.
- *
- * So long as we atomically load page table pointers versus teardown
- * (which we do on x86, with the above PAE exception), we can follow the
- * address down to the the page and take a ref on it.
- */
- local_irq_disable();
- pgdp = pgd_offset(mm, addr);
- do {
- pgd_t pgd = *pgdp;
-
- next = pgd_addr_end(addr, end);
- if (pgd_none(pgd))
- goto slow;
- if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
- goto slow;
- } while (pgdp++, addr = next, addr != end);
- local_irq_enable();
-
- VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
- return nr;
-
- {
- int ret;
-
-slow:
- local_irq_enable();
-slow_irqon:
- /* Try to get the remaining pages with get_user_pages */
- start += nr << PAGE_SHIFT;
- pages += nr;
-
- ret = get_user_pages_unlocked(start,
- (end - start) >> PAGE_SHIFT,
- pages, write ? FOLL_WRITE : 0);
-
- /* Have to be a bit careful with return values */
- if (nr > 0) {
- if (ret < 0)
- ret = nr;
- else
- ret += nr;
- }
-
- return ret;
- }
-}
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 302f43fd9c28..adad702b39cd 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -148,7 +148,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
addr = ALIGN(addr, huge_page_size(h));
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
if (mm->get_unmapped_area == arch_get_unmapped_area)
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index cbc87ea98751..673541eb3b3f 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -161,16 +161,16 @@ static int page_size_mask;
static void __init probe_page_size_mask(void)
{
-#if !defined(CONFIG_KMEMCHECK)
/*
* For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will
* use small pages.
* This will simplify cpa(), which otherwise needs to support splitting
* large pages into small in interrupt context, etc.
*/
- if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled())
+ if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled() && !IS_ENABLED(CONFIG_KMEMCHECK))
page_size_mask |= 1 << PG_LEVEL_2M;
-#endif
+ else
+ direct_gbpages = 0;
/* Enable PSE if available */
if (boot_cpu_has(X86_FEATURE_PSE))
@@ -811,10 +811,8 @@ void __init zone_sizes_init(void)
}
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
-#ifdef CONFIG_SMP
- .active_mm = &init_mm,
+ .loaded_mm = &init_mm,
.state = 0,
-#endif
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
};
EXPORT_SYMBOL_GPL(cpu_tlbstate);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 95651dc58e09..dae6a5e5ad4a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -92,6 +92,44 @@ __setup("noexec32=", nonx32_setup);
* When memory was added make sure all the processes MM have
* suitable PGD entries in the local PGD level page.
*/
+#ifdef CONFIG_X86_5LEVEL
+void sync_global_pgds(unsigned long start, unsigned long end)
+{
+ unsigned long addr;
+
+ for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
+ const pgd_t *pgd_ref = pgd_offset_k(addr);
+ struct page *page;
+
+ /* Check for overflow */
+ if (addr < start)
+ break;
+
+ if (pgd_none(*pgd_ref))
+ continue;
+
+ spin_lock(&pgd_lock);
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd;
+ spinlock_t *pgt_lock;
+
+ pgd = (pgd_t *)page_address(page) + pgd_index(addr);
+ /* the pgt_lock only for Xen */
+ pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+ spin_lock(pgt_lock);
+
+ if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
+
+ spin_unlock(pgt_lock);
+ }
+ spin_unlock(&pgd_lock);
+ }
+}
+#else
void sync_global_pgds(unsigned long start, unsigned long end)
{
unsigned long addr;
@@ -135,6 +173,7 @@ void sync_global_pgds(unsigned long start, unsigned long end)
spin_unlock(&pgd_lock);
}
}
+#endif
/*
* NOTE: This function is marked __ref because it calls __init function
@@ -585,6 +624,57 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
return paddr_last;
}
+static unsigned long __meminit
+phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
+ unsigned long page_size_mask)
+{
+ unsigned long paddr_next, paddr_last = paddr_end;
+ unsigned long vaddr = (unsigned long)__va(paddr);
+ int i = p4d_index(vaddr);
+
+ if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);
+
+ for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
+ p4d_t *p4d;
+ pud_t *pud;
+
+ vaddr = (unsigned long)__va(paddr);
+ p4d = p4d_page + p4d_index(vaddr);
+ paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
+
+ if (paddr >= paddr_end) {
+ if (!after_bootmem &&
+ !e820__mapped_any(paddr & P4D_MASK, paddr_next,
+ E820_TYPE_RAM) &&
+ !e820__mapped_any(paddr & P4D_MASK, paddr_next,
+ E820_TYPE_RESERVED_KERN))
+ set_p4d(p4d, __p4d(0));
+ continue;
+ }
+
+ if (!p4d_none(*p4d)) {
+ pud = pud_offset(p4d, 0);
+ paddr_last = phys_pud_init(pud, paddr,
+ paddr_end,
+ page_size_mask);
+ __flush_tlb_all();
+ continue;
+ }
+
+ pud = alloc_low_page();
+ paddr_last = phys_pud_init(pud, paddr, paddr_end,
+ page_size_mask);
+
+ spin_lock(&init_mm.page_table_lock);
+ p4d_populate(&init_mm, p4d, pud);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ __flush_tlb_all();
+
+ return paddr_last;
+}
+
/*
* Create page table mapping for the physical memory for specific physical
* addresses. The virtual and physical addresses have to be aligned on PMD level
@@ -606,26 +696,26 @@ kernel_physical_mapping_init(unsigned long paddr_start,
for (; vaddr < vaddr_end; vaddr = vaddr_next) {
pgd_t *pgd = pgd_offset_k(vaddr);
p4d_t *p4d;
- pud_t *pud;
vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;
- BUILD_BUG_ON(pgd_none(*pgd));
- p4d = p4d_offset(pgd, vaddr);
- if (p4d_val(*p4d)) {
- pud = (pud_t *)p4d_page_vaddr(*p4d);
- paddr_last = phys_pud_init(pud, __pa(vaddr),
+ if (pgd_val(*pgd)) {
+ p4d = (p4d_t *)pgd_page_vaddr(*pgd);
+ paddr_last = phys_p4d_init(p4d, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
continue;
}
- pud = alloc_low_page();
- paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
+ p4d = alloc_low_page();
+ paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
page_size_mask);
spin_lock(&init_mm.page_table_lock);
- p4d_populate(&init_mm, p4d, pud);
+ if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ pgd_populate(&init_mm, pgd, p4d);
+ else
+ p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
spin_unlock(&init_mm.page_table_lock);
pgd_changed = true;
}
@@ -990,7 +1080,13 @@ remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
pud_base = pud_offset(p4d, 0);
remove_pud_table(pud_base, addr, next, direct);
- free_pud_table(pud_base, p4d);
+ /*
+ * For 4-level page tables we do not want to free PUDs, but in the
+ * 5-level case we should free them. This code will have to change
+ * to adapt for boot-time switching between 4 and 5 level page tables.
+ */
+ if (CONFIG_PGTABLE_LEVELS == 5)
+ free_pud_table(pud_base, p4d);
}
if (direct)
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index bbc558b88a88..4c1b5fd0c7ad 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -424,7 +424,7 @@ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
{
/* Don't assume we're using swapper_pg_dir at this point */
- pgd_t *base = __va(read_cr3());
+ pgd_t *base = __va(read_cr3_pa());
pgd_t *pgd = &base[pgd_index(addr)];
p4d_t *p4d = p4d_offset(pgd, addr);
pud_t *pud = pud_offset(p4d, addr);
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 0c7d8129bed6..88215ac16b24 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -12,7 +12,7 @@
#include <asm/tlbflush.h>
#include <asm/sections.h>
-extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern pgd_t early_top_pgt[PTRS_PER_PGD];
extern struct range pfn_mapped[E820_MAX_ENTRIES];
static int __init map_range(struct range *range)
@@ -109,8 +109,8 @@ void __init kasan_early_init(void)
for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++)
kasan_zero_p4d[i] = __p4d(p4d_val);
- kasan_map_early_shadow(early_level4_pgt);
- kasan_map_early_shadow(init_level4_pgt);
+ kasan_map_early_shadow(early_top_pgt);
+ kasan_map_early_shadow(init_top_pgt);
}
void __init kasan_init(void)
@@ -121,8 +121,8 @@ void __init kasan_init(void)
register_die_notifier(&kasan_die_notifier);
#endif
- memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt));
- load_cr3(early_level4_pgt);
+ memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
+ load_cr3(early_top_pgt);
__flush_tlb_all();
clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
@@ -148,7 +148,7 @@ void __init kasan_init(void)
kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
(void *)KASAN_SHADOW_END);
- load_cr3(init_level4_pgt);
+ load_cr3(init_top_pgt);
__flush_tlb_all();
/*
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index aed206475aa7..af599167fe3c 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -6,12 +6,12 @@
*
* Entropy is generated using the KASLR early boot functions now shared in
* the lib directory (originally written by Kees Cook). Randomization is
- * done on PGD & PUD page table levels to increase possible addresses. The
- * physical memory mapping code was adapted to support PUD level virtual
- * addresses. This implementation on the best configuration provides 30,000
- * possible virtual addresses in average for each memory region. An additional
- * low memory page is used to ensure each CPU can start with a PGD aligned
- * virtual address (for realmode).
+ * done on PGD & P4D/PUD page table levels to increase possible addresses.
+ * The physical memory mapping code was adapted to support P4D/PUD level
+ * virtual addresses. This implementation on the best configuration provides
+ * 30,000 possible virtual addresses in average for each memory region.
+ * An additional low memory page is used to ensure each CPU can start with
+ * a PGD aligned virtual address (for realmode).
*
* The order of each memory region is not changed. The feature looks at
* the available space for the regions based on different configuration
@@ -70,7 +70,7 @@ static __initdata struct kaslr_memory_region {
unsigned long *base;
unsigned long size_tb;
} kaslr_regions[] = {
- { &page_offset_base, 64/* Maximum */ },
+ { &page_offset_base, 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) /* Maximum */ },
{ &vmalloc_base, VMALLOC_SIZE_TB },
{ &vmemmap_base, 1 },
};
@@ -142,7 +142,10 @@ void __init kernel_randomize_memory(void)
*/
entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
prandom_bytes_state(&rand_state, &rand, sizeof(rand));
- entropy = (rand % (entropy + 1)) & PUD_MASK;
+ if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ entropy = (rand % (entropy + 1)) & P4D_MASK;
+ else
+ entropy = (rand % (entropy + 1)) & PUD_MASK;
vaddr += entropy;
*kaslr_regions[i].base = vaddr;
@@ -151,27 +154,21 @@ void __init kernel_randomize_memory(void)
* randomization alignment.
*/
vaddr += get_padding(&kaslr_regions[i]);
- vaddr = round_up(vaddr + 1, PUD_SIZE);
+ if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ vaddr = round_up(vaddr + 1, P4D_SIZE);
+ else
+ vaddr = round_up(vaddr + 1, PUD_SIZE);
remain_entropy -= entropy;
}
}
-/*
- * Create PGD aligned trampoline table to allow real mode initialization
- * of additional CPUs. Consume only 1 low memory page.
- */
-void __meminit init_trampoline(void)
+static void __meminit init_trampoline_pud(void)
{
unsigned long paddr, paddr_next;
pgd_t *pgd;
pud_t *pud_page, *pud_page_tramp;
int i;
- if (!kaslr_memory_enabled()) {
- init_trampoline_default();
- return;
- }
-
pud_page_tramp = alloc_low_page();
paddr = 0;
@@ -192,3 +189,49 @@ void __meminit init_trampoline(void)
set_pgd(&trampoline_pgd_entry,
__pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
}
+
+static void __meminit init_trampoline_p4d(void)
+{
+ unsigned long paddr, paddr_next;
+ pgd_t *pgd;
+ p4d_t *p4d_page, *p4d_page_tramp;
+ int i;
+
+ p4d_page_tramp = alloc_low_page();
+
+ paddr = 0;
+ pgd = pgd_offset_k((unsigned long)__va(paddr));
+ p4d_page = (p4d_t *) pgd_page_vaddr(*pgd);
+
+ for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) {
+ p4d_t *p4d, *p4d_tramp;
+ unsigned long vaddr = (unsigned long)__va(paddr);
+
+ p4d_tramp = p4d_page_tramp + p4d_index(paddr);
+ p4d = p4d_page + p4d_index(vaddr);
+ paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
+
+ *p4d_tramp = *p4d;
+ }
+
+ set_pgd(&trampoline_pgd_entry,
+ __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)));
+}
+
+/*
+ * Create PGD aligned trampoline table to allow real mode initialization
+ * of additional CPUs. Consume only 1 low memory page.
+ */
+void __meminit init_trampoline(void)
+{
+
+ if (!kaslr_memory_enabled()) {
+ init_trampoline_default();
+ return;
+ }
+
+ if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ init_trampoline_p4d();
+ else
+ init_trampoline_pud();
+}
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 19ad095b41df..797295e792b2 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -74,9 +74,6 @@ static int mmap_is_legacy(void)
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
- if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
- return 1;
-
return sysctl_legacy_va_layout;
}
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 83a59a67757a..9b78685b66e6 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -65,11 +65,9 @@ static int __init nopat(char *str)
}
early_param("nopat", nopat);
-static bool __read_mostly __pat_initialized = false;
-
bool pat_enabled(void)
{
- return __pat_initialized;
+ return !!__pat_enabled;
}
EXPORT_SYMBOL_GPL(pat_enabled);
@@ -227,14 +225,13 @@ static void pat_bsp_init(u64 pat)
}
wrmsrl(MSR_IA32_CR_PAT, pat);
- __pat_initialized = true;
__init_cache_modes(pat);
}
static void pat_ap_init(u64 pat)
{
- if (!this_cpu_has(X86_FEATURE_PAT)) {
+ if (!boot_cpu_has(X86_FEATURE_PAT)) {
/*
* If this happens we are on a secondary CPU, but switched to
* PAT on the boot CPU. We have no way to undo PAT.
@@ -309,7 +306,7 @@ void pat_init(void)
u64 pat;
struct cpuinfo_x86 *c = &boot_cpu_data;
- if (!__pat_enabled) {
+ if (!pat_enabled()) {
init_cache_modes();
return;
}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6e7bedf69af7..014d07a80053 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -15,7 +15,7 @@
#include <linux/debugfs.h>
/*
- * Smarter SMP flushing macros.
+ * TLB flushing, formerly SMP-only
* c/o Linus Torvalds.
*
* These mean you can really definitely utterly forget about
@@ -28,39 +28,28 @@
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/
-#ifdef CONFIG_SMP
-
-struct flush_tlb_info {
- struct mm_struct *flush_mm;
- unsigned long flush_start;
- unsigned long flush_end;
-};
-
-/*
- * We cannot call mmdrop() because we are in interrupt context,
- * instead update mm->cpu_vm_mask.
- */
void leave_mm(int cpu)
{
- struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm);
+ struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+
+ /*
+ * It's plausible that we're in lazy TLB mode while our mm is init_mm.
+ * If so, our callers still expect us to flush the TLB, but there
+ * aren't any user TLB entries in init_mm to worry about.
+ *
+ * This needs to happen before any other sanity checks due to
+ * intel_idle's shenanigans.
+ */
+ if (loaded_mm == &init_mm)
+ return;
+
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
BUG();
- if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
- cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
- load_cr3(swapper_pg_dir);
- /*
- * This gets called in the idle path where RCU
- * functions differently. Tracing normally
- * uses RCU, so we have to call the tracepoint
- * specially here.
- */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
- }
+
+ switch_mm(NULL, &init_mm, NULL);
}
EXPORT_SYMBOL_GPL(leave_mm);
-#endif /* CONFIG_SMP */
-
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
@@ -75,216 +64,167 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
unsigned cpu = smp_processor_id();
+ struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
- if (likely(prev != next)) {
- if (IS_ENABLED(CONFIG_VMAP_STACK)) {
- /*
- * If our current stack is in vmalloc space and isn't
- * mapped in the new pgd, we'll double-fault. Forcibly
- * map it.
- */
- unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
-
- pgd_t *pgd = next->pgd + stack_pgd_index;
-
- if (unlikely(pgd_none(*pgd)))
- set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
- }
+ /*
+ * NB: The scheduler will call us with prev == next when
+ * switching from lazy TLB mode to normal mode if active_mm
+ * isn't changing. When this happens, there is no guarantee
+ * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next.
+ *
+ * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
+ */
-#ifdef CONFIG_SMP
- this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- this_cpu_write(cpu_tlbstate.active_mm, next);
-#endif
+ this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- cpumask_set_cpu(cpu, mm_cpumask(next));
+ if (real_prev == next) {
+ /*
+ * There's nothing to do: we always keep the per-mm control
+ * regs in sync with cpu_tlbstate.loaded_mm. Just
+ * sanity-check mm_cpumask.
+ */
+ if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next))))
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+ return;
+ }
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
/*
- * Re-load page tables.
- *
- * This logic has an ordering constraint:
- *
- * CPU 0: Write to a PTE for 'next'
- * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
- * CPU 1: set bit 1 in next's mm_cpumask
- * CPU 1: load from the PTE that CPU 0 writes (implicit)
- *
- * We need to prevent an outcome in which CPU 1 observes
- * the new PTE value and CPU 0 observes bit 1 clear in
- * mm_cpumask. (If that occurs, then the IPI will never
- * be sent, and CPU 0's TLB will contain a stale entry.)
- *
- * The bad outcome can occur if either CPU's load is
- * reordered before that CPU's store, so both CPUs must
- * execute full barriers to prevent this from happening.
- *
- * Thus, switch_mm needs a full barrier between the
- * store to mm_cpumask and any operation that could load
- * from next->pgd. TLB fills are special and can happen
- * due to instruction fetches or for no reason at all,
- * and neither LOCK nor MFENCE orders them.
- * Fortunately, load_cr3() is serializing and gives the
- * ordering guarantee we need.
- *
+ * If our current stack is in vmalloc space and isn't
+ * mapped in the new pgd, we'll double-fault. Forcibly
+ * map it.
*/
- load_cr3(next->pgd);
+ unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ pgd_t *pgd = next->pgd + stack_pgd_index;
- /* Stop flush ipis for the previous mm */
- cpumask_clear_cpu(cpu, mm_cpumask(prev));
+ if (unlikely(pgd_none(*pgd)))
+ set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
+ }
- /* Load per-mm CR4 state */
- load_mm_cr4(next);
+ this_cpu_write(cpu_tlbstate.loaded_mm, next);
-#ifdef CONFIG_MODIFY_LDT_SYSCALL
- /*
- * Load the LDT, if the LDT is different.
- *
- * It's possible that prev->context.ldt doesn't match
- * the LDT register. This can happen if leave_mm(prev)
- * was called and then modify_ldt changed
- * prev->context.ldt but suppressed an IPI to this CPU.
- * In this case, prev->context.ldt != NULL, because we
- * never set context.ldt to NULL while the mm still
- * exists. That means that next->context.ldt !=
- * prev->context.ldt, because mms never share an LDT.
- */
- if (unlikely(prev->context.ldt != next->context.ldt))
- load_mm_ldt(next);
-#endif
+ WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+
+ /*
+ * Re-load page tables.
+ *
+ * This logic has an ordering constraint:
+ *
+ * CPU 0: Write to a PTE for 'next'
+ * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
+ * CPU 1: set bit 1 in next's mm_cpumask
+ * CPU 1: load from the PTE that CPU 0 writes (implicit)
+ *
+ * We need to prevent an outcome in which CPU 1 observes
+ * the new PTE value and CPU 0 observes bit 1 clear in
+ * mm_cpumask. (If that occurs, then the IPI will never
+ * be sent, and CPU 0's TLB will contain a stale entry.)
+ *
+ * The bad outcome can occur if either CPU's load is
+ * reordered before that CPU's store, so both CPUs must
+ * execute full barriers to prevent this from happening.
+ *
+ * Thus, switch_mm needs a full barrier between the
+ * store to mm_cpumask and any operation that could load
+ * from next->pgd. TLB fills are special and can happen
+ * due to instruction fetches or for no reason at all,
+ * and neither LOCK nor MFENCE orders them.
+ * Fortunately, load_cr3() is serializing and gives the
+ * ordering guarantee we need.
+ */
+ load_cr3(next->pgd);
+
+ /*
+ * This gets called via leave_mm() in the idle path where RCU
+ * functions differently. Tracing normally uses RCU, so we have to
+ * call the tracepoint specially here.
+ */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+
+ /* Stop flush ipis for the previous mm */
+ WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
+ real_prev != &init_mm);
+ cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+
+ /* Load per-mm CR4 and LDTR state */
+ load_mm_cr4(next);
+ switch_ldt(real_prev, next);
+}
+
+static void flush_tlb_func_common(const struct flush_tlb_info *f,
+ bool local, enum tlb_flush_reason reason)
+{
+ /* This code cannot presently handle being reentered. */
+ VM_WARN_ON(!irqs_disabled());
+
+ if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
+ leave_mm(smp_processor_id());
+ return;
}
-#ifdef CONFIG_SMP
- else {
- this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
-
- if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
- /*
- * On established mms, the mm_cpumask is only changed
- * from irq context, from ptep_clear_flush() while in
- * lazy tlb mode, and here. Irqs are blocked during
- * schedule, protecting us from simultaneous changes.
- */
- cpumask_set_cpu(cpu, mm_cpumask(next));
- /*
- * We were in lazy tlb mode and leave_mm disabled
- * tlb flush IPI delivery. We must reload CR3
- * to make sure to use no freed page tables.
- *
- * As above, load_cr3() is serializing and orders TLB
- * fills with respect to the mm_cpumask write.
- */
- load_cr3(next->pgd);
- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
- load_mm_cr4(next);
- load_mm_ldt(next);
+ if (f->end == TLB_FLUSH_ALL) {
+ local_flush_tlb();
+ if (local)
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ trace_tlb_flush(reason, TLB_FLUSH_ALL);
+ } else {
+ unsigned long addr;
+ unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
+ addr = f->start;
+ while (addr < f->end) {
+ __flush_tlb_single(addr);
+ addr += PAGE_SIZE;
}
+ if (local)
+ count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
+ trace_tlb_flush(reason, nr_pages);
}
-#endif
}
-#ifdef CONFIG_SMP
+static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
+{
+ const struct flush_tlb_info *f = info;
-/*
- * The flush IPI assumes that a thread switch happens in this order:
- * [cpu0: the cpu that switches]
- * 1) switch_mm() either 1a) or 1b)
- * 1a) thread switch to a different mm
- * 1a1) set cpu_tlbstate to TLBSTATE_OK
- * Now the tlb flush NMI handler flush_tlb_func won't call leave_mm
- * if cpu0 was in lazy tlb mode.
- * 1a2) update cpu active_mm
- * Now cpu0 accepts tlb flushes for the new mm.
- * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);
- * Now the other cpus will send tlb flush ipis.
- * 1a4) change cr3.
- * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask);
- * Stop ipi delivery for the old mm. This is not synchronized with
- * the other cpus, but flush_tlb_func ignore flush ipis for the wrong
- * mm, and in the worst case we perform a superfluous tlb flush.
- * 1b) thread switch without mm change
- * cpu active_mm is correct, cpu0 already handles flush ipis.
- * 1b1) set cpu_tlbstate to TLBSTATE_OK
- * 1b2) test_and_set the cpu bit in cpu_vm_mask.
- * Atomically set the bit [other cpus will start sending flush ipis],
- * and test the bit.
- * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
- * 2) switch %%esp, ie current
- *
- * The interrupt must handle 2 special cases:
- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
- * - the cpu performs speculative tlb reads, i.e. even if the cpu only
- * runs in kernel space, the cpu could load tlb entries for user space
- * pages.
- *
- * The good news is that cpu_tlbstate is local to each cpu, no
- * write/read ordering problems.
- */
+ flush_tlb_func_common(f, true, reason);
+}
-/*
- * TLB flush funcation:
- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
- * 2) Leave the mm if we are in the lazy tlb mode.
- */
-static void flush_tlb_func(void *info)
+static void flush_tlb_func_remote(void *info)
{
- struct flush_tlb_info *f = info;
+ const struct flush_tlb_info *f = info;
inc_irq_stat(irq_tlb_count);
- if (f->flush_mm && f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
+ if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
return;
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
- if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
- if (f->flush_end == TLB_FLUSH_ALL) {
- local_flush_tlb();
- trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
- } else {
- unsigned long addr;
- unsigned long nr_pages =
- (f->flush_end - f->flush_start) / PAGE_SIZE;
- addr = f->flush_start;
- while (addr < f->flush_end) {
- __flush_tlb_single(addr);
- addr += PAGE_SIZE;
- }
- trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
- }
- } else
- leave_mm(smp_processor_id());
-
+ flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
}
void native_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm, unsigned long start,
- unsigned long end)
+ const struct flush_tlb_info *info)
{
- struct flush_tlb_info info;
-
- info.flush_mm = mm;
- info.flush_start = start;
- info.flush_end = end;
-
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
- if (end == TLB_FLUSH_ALL)
+ if (info->end == TLB_FLUSH_ALL)
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
else
trace_tlb_flush(TLB_REMOTE_SEND_IPI,
- (end - start) >> PAGE_SHIFT);
+ (info->end - info->start) >> PAGE_SHIFT);
if (is_uv_system()) {
unsigned int cpu;
cpu = smp_processor_id();
- cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
+ cpumask = uv_flush_tlb_others(cpumask, info);
if (cpumask)
- smp_call_function_many(cpumask, flush_tlb_func,
- &info, 1);
+ smp_call_function_many(cpumask, flush_tlb_func_remote,
+ (void *)info, 1);
return;
}
- smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
+ smp_call_function_many(cpumask, flush_tlb_func_remote,
+ (void *)info, 1);
}
/*
@@ -302,85 +242,41 @@ static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag)
{
- unsigned long addr;
- /* do a global flush by default */
- unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
-
- preempt_disable();
+ int cpu;
- if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
- base_pages_to_flush = (end - start) >> PAGE_SHIFT;
- if (base_pages_to_flush > tlb_single_page_flush_ceiling)
- base_pages_to_flush = TLB_FLUSH_ALL;
+ struct flush_tlb_info info = {
+ .mm = mm,
+ };
- if (current->active_mm != mm) {
- /* Synchronize with switch_mm. */
- smp_mb();
+ cpu = get_cpu();
- goto out;
- }
-
- if (!current->mm) {
- leave_mm(smp_processor_id());
+ /* Synchronize with switch_mm. */
+ smp_mb();
- /* Synchronize with switch_mm. */
- smp_mb();
-
- goto out;
- }
-
- /*
- * Both branches below are implicit full barriers (MOV to CR or
- * INVLPG) that synchronize with switch_mm.
- */
- if (base_pages_to_flush == TLB_FLUSH_ALL) {
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- local_flush_tlb();
+ /* Should we flush just the requested range? */
+ if ((end != TLB_FLUSH_ALL) &&
+ !(vmflag & VM_HUGETLB) &&
+ ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
+ info.start = start;
+ info.end = end;
} else {
- /* flush range by one by one 'invlpg' */
- for (addr = start; addr < end; addr += PAGE_SIZE) {
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
- __flush_tlb_single(addr);
- }
- }
- trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush);
-out:
- if (base_pages_to_flush == TLB_FLUSH_ALL) {
- start = 0UL;
- end = TLB_FLUSH_ALL;
+ info.start = 0UL;
+ info.end = TLB_FLUSH_ALL;
}
- if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, start, end);
- preempt_enable();
-}
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
-{
- struct mm_struct *mm = vma->vm_mm;
-
- preempt_disable();
-
- if (current->active_mm == mm) {
- if (current->mm) {
- /*
- * Implicit full barrier (INVLPG) that synchronizes
- * with switch_mm.
- */
- __flush_tlb_one(start);
- } else {
- leave_mm(smp_processor_id());
-
- /* Synchronize with switch_mm. */
- smp_mb();
- }
+ if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
+ VM_WARN_ON(irqs_disabled());
+ local_irq_disable();
+ flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
+ local_irq_enable();
}
- if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE);
-
- preempt_enable();
+ if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
+ flush_tlb_others(mm_cpumask(mm), &info);
+ put_cpu();
}
+
static void do_flush_tlb_all(void *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
@@ -401,7 +297,7 @@ static void do_kernel_range_flush(void *info)
unsigned long addr;
/* flush range by one by one 'invlpg' */
- for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE)
+ for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
__flush_tlb_single(addr);
}
@@ -410,16 +306,40 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
/* Balance as user space task's flush, a bit conservative */
if (end == TLB_FLUSH_ALL ||
- (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
+ (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
on_each_cpu(do_flush_tlb_all, NULL, 1);
} else {
struct flush_tlb_info info;
- info.flush_start = start;
- info.flush_end = end;
+ info.start = start;
+ info.end = end;
on_each_cpu(do_kernel_range_flush, &info, 1);
}
}
+void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
+{
+ struct flush_tlb_info info = {
+ .mm = NULL,
+ .start = 0UL,
+ .end = TLB_FLUSH_ALL,
+ };
+
+ int cpu = get_cpu();
+
+ if (cpumask_test_cpu(cpu, &batch->cpumask)) {
+ VM_WARN_ON(irqs_disabled());
+ local_irq_disable();
+ flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
+ local_irq_enable();
+ }
+
+ if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
+ flush_tlb_others(&batch->cpumask, &info);
+ cpumask_clear(&batch->cpumask);
+
+ put_cpu();
+}
+
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
@@ -465,5 +385,3 @@ static int __init create_tlb_single_page_flush_ceiling(void)
return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);
-
-#endif /* CONFIG_SMP */
diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile
index 90568c33ddb0..fefb4b619598 100644
--- a/arch/x86/net/Makefile
+++ b/arch/x86/net/Makefile
@@ -1,4 +1,6 @@
#
# Arch-specific network modules
#
+OBJECT_FILES_NON_STANDARD_bpf_jit.o += y
+
obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c
index b914e20b5a00..3353b76dcff0 100644
--- a/arch/x86/pci/ce4100.c
+++ b/arch/x86/pci/ce4100.c
@@ -65,6 +65,9 @@ struct sim_reg_op {
{ PCI_DEVFN(device, func), offset, init_op, read_op, write_op,\
{0, SIZE_TO_MASK(size)} },
+/*
+ * All read/write functions are called with pci_config_lock held.
+ */
static void reg_init(struct sim_dev_reg *reg)
{
pci_direct_conf1.read(0, 1, reg->dev_func, reg->reg, 4,
@@ -73,21 +76,13 @@ static void reg_init(struct sim_dev_reg *reg)
static void reg_read(struct sim_dev_reg *reg, u32 *value)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&pci_config_lock, flags);
*value = reg->sim_reg.value;
- raw_spin_unlock_irqrestore(&pci_config_lock, flags);
}
static void reg_write(struct sim_dev_reg *reg, u32 value)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&pci_config_lock, flags);
reg->sim_reg.value = (value & reg->sim_reg.mask) |
(reg->sim_reg.value & ~reg->sim_reg.mask);
- raw_spin_unlock_irqrestore(&pci_config_lock, flags);
}
static void sata_reg_init(struct sim_dev_reg *reg)
@@ -117,12 +112,8 @@ static void sata_revid_read(struct sim_dev_reg *reg, u32 *value)
static void reg_noirq_read(struct sim_dev_reg *reg, u32 *value)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&pci_config_lock, flags);
/* force interrupt pin value to 0 */
*value = reg->sim_reg.value & 0xfff00ff;
- raw_spin_unlock_irqrestore(&pci_config_lock, flags);
}
static struct sim_dev_reg bus1_fixups[] = {
@@ -265,24 +256,33 @@ int bridge_read(unsigned int devfn, int reg, int len, u32 *value)
return retval;
}
-static int ce4100_conf_read(unsigned int seg, unsigned int bus,
- unsigned int devfn, int reg, int len, u32 *value)
+static int ce4100_bus1_read(unsigned int devfn, int reg, int len, u32 *value)
{
+ unsigned long flags;
int i;
- WARN_ON(seg);
- if (bus == 1) {
- for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
- if (bus1_fixups[i].dev_func == devfn &&
- bus1_fixups[i].reg == (reg & ~3) &&
- bus1_fixups[i].read) {
- bus1_fixups[i].read(&(bus1_fixups[i]),
- value);
- extract_bytes(value, reg, len);
- return 0;
- }
+ for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
+ if (bus1_fixups[i].dev_func == devfn &&
+ bus1_fixups[i].reg == (reg & ~3) &&
+ bus1_fixups[i].read) {
+
+ raw_spin_lock_irqsave(&pci_config_lock, flags);
+ bus1_fixups[i].read(&(bus1_fixups[i]), value);
+ raw_spin_unlock_irqrestore(&pci_config_lock, flags);
+ extract_bytes(value, reg, len);
+ return 0;
}
}
+ return -1;
+}
+
+static int ce4100_conf_read(unsigned int seg, unsigned int bus,
+ unsigned int devfn, int reg, int len, u32 *value)
+{
+ WARN_ON(seg);
+
+ if (bus == 1 && !ce4100_bus1_read(devfn, reg, len, value))
+ return 0;
if (bus == 0 && (PCI_DEVFN(1, 0) == devfn) &&
!bridge_read(devfn, reg, len, value))
@@ -291,23 +291,32 @@ static int ce4100_conf_read(unsigned int seg, unsigned int bus,
return pci_direct_conf1.read(seg, bus, devfn, reg, len, value);
}
-static int ce4100_conf_write(unsigned int seg, unsigned int bus,
- unsigned int devfn, int reg, int len, u32 value)
+static int ce4100_bus1_write(unsigned int devfn, int reg, int len, u32 value)
{
+ unsigned long flags;
int i;
- WARN_ON(seg);
- if (bus == 1) {
- for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
- if (bus1_fixups[i].dev_func == devfn &&
- bus1_fixups[i].reg == (reg & ~3) &&
- bus1_fixups[i].write) {
- bus1_fixups[i].write(&(bus1_fixups[i]),
- value);
- return 0;
- }
+ for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
+ if (bus1_fixups[i].dev_func == devfn &&
+ bus1_fixups[i].reg == (reg & ~3) &&
+ bus1_fixups[i].write) {
+
+ raw_spin_lock_irqsave(&pci_config_lock, flags);
+ bus1_fixups[i].write(&(bus1_fixups[i]), value);
+ raw_spin_unlock_irqrestore(&pci_config_lock, flags);
+ return 0;
}
}
+ return -1;
+}
+
+static int ce4100_conf_write(unsigned int seg, unsigned int bus,
+ unsigned int devfn, int reg, int len, u32 value)
+{
+ WARN_ON(seg);
+
+ if (bus == 1 && !ce4100_bus1_write(devfn, reg, len, value))
+ return 0;
/* Discard writes to A/V bridge BAR. */
if (bus == 0 && PCI_DEVFN(1, 0) == devfn &&
@@ -318,8 +327,8 @@ static int ce4100_conf_write(unsigned int seg, unsigned int bus,
}
static const struct pci_raw_ops ce4100_pci_conf = {
- .read = ce4100_conf_read,
- .write = ce4100_conf_write,
+ .read = ce4100_conf_read,
+ .write = ce4100_conf_write,
};
int __init ce4100_pci_init(void)
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 190e718694b1..cfd1a89fd04e 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -75,8 +75,8 @@ struct pci_ops pci_root_ops = {
};
/*
- * This interrupt-safe spinlock protects all accesses to PCI
- * configuration space.
+ * This interrupt-safe spinlock protects all accesses to PCI configuration
+ * space, except for the mmconfig (ECAM) based operations.
*/
DEFINE_RAW_SPINLOCK(pci_config_lock);
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index ea6f3802c17b..1cb01abcb1be 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -24,12 +24,10 @@ static void pcibios_fixup_peer_bridges(void)
int __init pci_legacy_init(void)
{
- if (!raw_pci_ops) {
- printk("PCI: System does not support PCI\n");
- return 0;
- }
+ if (!raw_pci_ops)
+ return 1;
- printk("PCI: Probing PCI hardware\n");
+ pr_info("PCI: Probing PCI hardware\n");
pcibios_scan_root(0);
return 0;
}
@@ -46,7 +44,7 @@ void pcibios_scan_specific_bus(int busn)
if (!raw_pci_read(0, busn, devfn, PCI_VENDOR_ID, 2, &l) &&
l != 0x0000 && l != 0xffff) {
DBG("Found device at %02x:%02x [%04x]\n", busn, devfn, l);
- printk(KERN_INFO "PCI: Discovered peer bus %02x\n", busn);
+ pr_info("PCI: Discovered peer bus %02x\n", busn);
pcibios_scan_root(busn);
return;
}
@@ -60,8 +58,12 @@ static int __init pci_subsys_init(void)
* The init function returns an non zero value when
* pci_legacy_init should be invoked.
*/
- if (x86_init.pci.init())
- pci_legacy_init();
+ if (x86_init.pci.init()) {
+ if (pci_legacy_init()) {
+ pr_info("PCI: System does not support PCI\n");
+ return -ENODEV;
+ }
+ }
pcibios_fixup_peer_bridges();
x86_init.pci.init_irq();
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile
index f1d83b34c329..2f56e1ed61c3 100644
--- a/arch/x86/platform/efi/Makefile
+++ b/arch/x86/platform/efi/Makefile
@@ -1,4 +1,5 @@
OBJECT_FILES_NON_STANDARD_efi_thunk_$(BITS).o := y
+OBJECT_FILES_NON_STANDARD_efi_stub_$(BITS).o := y
obj-$(CONFIG_EFI) += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o
obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 7e76a4d8304b..f084d8718ac4 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -828,9 +828,11 @@ static void __init kexec_enter_virtual_mode(void)
/*
* We don't do virtual mode, since we don't do runtime services, on
- * non-native EFI
+ * non-native EFI. With efi=old_map, we don't do runtime services in
+ * kexec kernel because in the initial boot something else might
+ * have been mapped at these virtual addresses.
*/
- if (!efi_is_native()) {
+ if (!efi_is_native() || efi_enabled(EFI_OLD_MEMMAP)) {
efi_memmap_unmap();
clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
return;
@@ -1012,7 +1014,6 @@ static void __init __efi_enter_virtual_mode(void)
* necessary relocation fixups for the new virtual addresses.
*/
efi_runtime_update_mappings();
- efi_dump_pagetable();
/* clean DUMMY object */
efi_delete_dummy_variable();
@@ -1027,6 +1028,8 @@ void __init efi_enter_virtual_mode(void)
kexec_enter_virtual_mode();
else
__efi_enter_virtual_mode();
+
+ efi_dump_pagetable();
}
/*
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index 3481268da3d0..52f7faa1538f 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -44,7 +44,14 @@ int __init efi_alloc_page_tables(void)
}
void efi_sync_low_kernel_mappings(void) {}
-void __init efi_dump_pagetable(void) {}
+
+void __init efi_dump_pagetable(void)
+{
+#ifdef CONFIG_EFI_PGT_DUMP
+ ptdump_walk_pgd_level(NULL, swapper_pg_dir);
+#endif
+}
+
int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
{
return 0;
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index c488625c9712..9bf72f5bfedb 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -71,14 +71,16 @@ static void __init early_code_mapping_set_exec(int executable)
pgd_t * __init efi_call_phys_prolog(void)
{
- unsigned long vaddress;
- pgd_t *save_pgd;
+ unsigned long vaddr, addr_pgd, addr_p4d, addr_pud;
+ pgd_t *save_pgd, *pgd_k, *pgd_efi;
+ p4d_t *p4d, *p4d_k, *p4d_efi;
+ pud_t *pud;
int pgd;
- int n_pgds;
+ int n_pgds, i, j;
if (!efi_enabled(EFI_OLD_MEMMAP)) {
- save_pgd = (pgd_t *)read_cr3();
+ save_pgd = (pgd_t *)__read_cr3();
write_cr3((unsigned long)efi_scratch.efi_pgt);
goto out;
}
@@ -88,10 +90,49 @@ pgd_t * __init efi_call_phys_prolog(void)
n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE);
save_pgd = kmalloc_array(n_pgds, sizeof(*save_pgd), GFP_KERNEL);
+ /*
+ * Build 1:1 identity mapping for efi=old_map usage. Note that
+ * PAGE_OFFSET is PGDIR_SIZE aligned when KASLR is disabled, while
+ * it is PUD_SIZE ALIGNED with KASLR enabled. So for a given physical
+ * address X, the pud_index(X) != pud_index(__va(X)), we can only copy
+ * PUD entry of __va(X) to fill in pud entry of X to build 1:1 mapping.
+ * This means here we can only reuse the PMD tables of the direct mapping.
+ */
for (pgd = 0; pgd < n_pgds; pgd++) {
- save_pgd[pgd] = *pgd_offset_k(pgd * PGDIR_SIZE);
- vaddress = (unsigned long)__va(pgd * PGDIR_SIZE);
- set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress));
+ addr_pgd = (unsigned long)(pgd * PGDIR_SIZE);
+ vaddr = (unsigned long)__va(pgd * PGDIR_SIZE);
+ pgd_efi = pgd_offset_k(addr_pgd);
+ save_pgd[pgd] = *pgd_efi;
+
+ p4d = p4d_alloc(&init_mm, pgd_efi, addr_pgd);
+ if (!p4d) {
+ pr_err("Failed to allocate p4d table!\n");
+ goto out;
+ }
+
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ addr_p4d = addr_pgd + i * P4D_SIZE;
+ p4d_efi = p4d + p4d_index(addr_p4d);
+
+ pud = pud_alloc(&init_mm, p4d_efi, addr_p4d);
+ if (!pud) {
+ pr_err("Failed to allocate pud table!\n");
+ goto out;
+ }
+
+ for (j = 0; j < PTRS_PER_PUD; j++) {
+ addr_pud = addr_p4d + j * PUD_SIZE;
+
+ if (addr_pud > (max_pfn << PAGE_SHIFT))
+ break;
+
+ vaddr = (unsigned long)__va(addr_pud);
+
+ pgd_k = pgd_offset_k(vaddr);
+ p4d_k = p4d_offset(pgd_k, vaddr);
+ pud[j] = *pud_offset(p4d_k, vaddr);
+ }
+ }
}
out:
__flush_tlb_all();
@@ -104,8 +145,11 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
/*
* After the lock is released, the original page table is restored.
*/
- int pgd_idx;
+ int pgd_idx, i;
int nr_pgds;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
if (!efi_enabled(EFI_OLD_MEMMAP)) {
write_cr3((unsigned long)save_pgd);
@@ -115,9 +159,28 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
nr_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
- for (pgd_idx = 0; pgd_idx < nr_pgds; pgd_idx++)
+ for (pgd_idx = 0; pgd_idx < nr_pgds; pgd_idx++) {
+ pgd = pgd_offset_k(pgd_idx * PGDIR_SIZE);
set_pgd(pgd_offset_k(pgd_idx * PGDIR_SIZE), save_pgd[pgd_idx]);
+ if (!(pgd_val(*pgd) & _PAGE_PRESENT))
+ continue;
+
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ p4d = p4d_offset(pgd,
+ pgd_idx * PGDIR_SIZE + i * P4D_SIZE);
+
+ if (!(p4d_val(*p4d) & _PAGE_PRESENT))
+ continue;
+
+ pud = (pud_t *)p4d_page_vaddr(*p4d);
+ pud_free(&init_mm, pud);
+ }
+
+ p4d = (p4d_t *)pgd_page_vaddr(*pgd);
+ p4d_free(&init_mm, p4d);
+ }
+
kfree(save_pgd);
__flush_tlb_all();
@@ -526,7 +589,10 @@ void __init efi_runtime_update_mappings(void)
void __init efi_dump_pagetable(void)
{
#ifdef CONFIG_EFI_PGT_DUMP
- ptdump_walk_pgd_level(NULL, efi_pgd);
+ if (efi_enabled(EFI_OLD_MEMMAP))
+ ptdump_walk_pgd_level(NULL, swapper_pg_dir);
+ else
+ ptdump_walk_pgd_level(NULL, efi_pgd);
#endif
}
@@ -583,7 +649,7 @@ efi_status_t efi_thunk_set_virtual_address_map(
efi_sync_low_kernel_mappings();
local_irq_save(flags);
- efi_scratch.prev_cr3 = read_cr3();
+ efi_scratch.prev_cr3 = __read_cr3();
write_cr3((unsigned long)efi_scratch.efi_pgt);
__flush_tlb_all();
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 26615991d69c..8a99a2e96537 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -15,12 +15,66 @@
#include <asm/e820/api.h>
#include <asm/efi.h>
#include <asm/uv/uv.h>
+#include <asm/cpu_device_id.h>
#define EFI_MIN_RESERVE 5120
#define EFI_DUMMY_GUID \
EFI_GUID(0x4424ac57, 0xbe4b, 0x47dd, 0x9e, 0x97, 0xed, 0x50, 0xf0, 0x9f, 0x92, 0xa9)
+#define QUARK_CSH_SIGNATURE 0x5f435348 /* _CSH */
+#define QUARK_SECURITY_HEADER_SIZE 0x400
+
+/*
+ * Header prepended to the standard EFI capsule on Quark systems the are based
+ * on Intel firmware BSP.
+ * @csh_signature: Unique identifier to sanity check signed module
+ * presence ("_CSH").
+ * @version: Current version of CSH used. Should be one for Quark A0.
+ * @modulesize: Size of the entire module including the module header
+ * and payload.
+ * @security_version_number_index: Index of SVN to use for validation of signed
+ * module.
+ * @security_version_number: Used to prevent against roll back of modules.
+ * @rsvd_module_id: Currently unused for Clanton (Quark).
+ * @rsvd_module_vendor: Vendor Identifier. For Intel products value is
+ * 0x00008086.
+ * @rsvd_date: BCD representation of build date as yyyymmdd, where
+ * yyyy=4 digit year, mm=1-12, dd=1-31.
+ * @headersize: Total length of the header including including any
+ * padding optionally added by the signing tool.
+ * @hash_algo: What Hash is used in the module signing.
+ * @cryp_algo: What Crypto is used in the module signing.
+ * @keysize: Total length of the key data including including any
+ * padding optionally added by the signing tool.
+ * @signaturesize: Total length of the signature including including any
+ * padding optionally added by the signing tool.
+ * @rsvd_next_header: 32-bit pointer to the next Secure Boot Module in the
+ * chain, if there is a next header.
+ * @rsvd: Reserved, padding structure to required size.
+ *
+ * See also QuartSecurityHeader_t in
+ * Quark_EDKII_v1.2.1.1/QuarkPlatformPkg/Include/QuarkBootRom.h
+ * from https://downloadcenter.intel.com/download/23197/Intel-Quark-SoC-X1000-Board-Support-Package-BSP
+ */
+struct quark_security_header {
+ u32 csh_signature;
+ u32 version;
+ u32 modulesize;
+ u32 security_version_number_index;
+ u32 security_version_number;
+ u32 rsvd_module_id;
+ u32 rsvd_module_vendor;
+ u32 rsvd_date;
+ u32 headersize;
+ u32 hash_algo;
+ u32 cryp_algo;
+ u32 keysize;
+ u32 signaturesize;
+ u32 rsvd_next_header;
+ u32 rsvd[2];
+};
+
static efi_char16_t efi_dummy_name[6] = { 'D', 'U', 'M', 'M', 'Y', 0 };
static bool efi_no_storage_paranoia;
@@ -360,6 +414,9 @@ void __init efi_free_boot_services(void)
free_bootmem_late(start, size);
}
+ if (!num_entries)
+ return;
+
new_size = efi.memmap.desc_size * num_entries;
new_phys = efi_memmap_alloc(num_entries);
if (!new_phys) {
@@ -501,3 +558,86 @@ bool efi_poweroff_required(void)
{
return acpi_gbl_reduced_hardware || acpi_no_s5;
}
+
+#ifdef CONFIG_EFI_CAPSULE_QUIRK_QUARK_CSH
+
+static int qrk_capsule_setup_info(struct capsule_info *cap_info, void **pkbuff,
+ size_t hdr_bytes)
+{
+ struct quark_security_header *csh = *pkbuff;
+
+ /* Only process data block that is larger than the security header */
+ if (hdr_bytes < sizeof(struct quark_security_header))
+ return 0;
+
+ if (csh->csh_signature != QUARK_CSH_SIGNATURE ||
+ csh->headersize != QUARK_SECURITY_HEADER_SIZE)
+ return 1;
+
+ /* Only process data block if EFI header is included */
+ if (hdr_bytes < QUARK_SECURITY_HEADER_SIZE +
+ sizeof(efi_capsule_header_t))
+ return 0;
+
+ pr_debug("Quark security header detected\n");
+
+ if (csh->rsvd_next_header != 0) {
+ pr_err("multiple Quark security headers not supported\n");
+ return -EINVAL;
+ }
+
+ *pkbuff += csh->headersize;
+ cap_info->total_size = csh->headersize;
+
+ /*
+ * Update the first page pointer to skip over the CSH header.
+ */
+ cap_info->pages[0] += csh->headersize;
+
+ return 1;
+}
+
+#define ICPU(family, model, quirk_handler) \
+ { X86_VENDOR_INTEL, family, model, X86_FEATURE_ANY, \
+ (unsigned long)&quirk_handler }
+
+static const struct x86_cpu_id efi_capsule_quirk_ids[] = {
+ ICPU(5, 9, qrk_capsule_setup_info), /* Intel Quark X1000 */
+ { }
+};
+
+int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff,
+ size_t hdr_bytes)
+{
+ int (*quirk_handler)(struct capsule_info *, void **, size_t);
+ const struct x86_cpu_id *id;
+ int ret;
+
+ if (hdr_bytes < sizeof(efi_capsule_header_t))
+ return 0;
+
+ cap_info->total_size = 0;
+
+ id = x86_match_cpu(efi_capsule_quirk_ids);
+ if (id) {
+ /*
+ * The quirk handler is supposed to return
+ * - a value > 0 if the setup should continue, after advancing
+ * kbuff as needed
+ * - 0 if not enough hdr_bytes are available yet
+ * - a negative error code otherwise
+ */
+ quirk_handler = (typeof(quirk_handler))id->driver_data;
+ ret = quirk_handler(cap_info, &kbuff, hdr_bytes);
+ if (ret <= 0)
+ return ret;
+ }
+
+ memcpy(&cap_info->header, kbuff, sizeof(cap_info->header));
+
+ cap_info->total_size += cap_info->header.imagesize;
+
+ return __efi_capsule_setup_info(cap_info);
+}
+
+#endif
diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c
index c5350fd27d70..0668aaff8bfe 100644
--- a/arch/x86/platform/olpc/olpc-xo1-pm.c
+++ b/arch/x86/platform/olpc/olpc-xo1-pm.c
@@ -77,7 +77,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state)
asmlinkage __visible int xo1_do_sleep(u8 sleep_state)
{
- void *pgd_addr = __va(read_cr3());
+ void *pgd_addr = __va(read_cr3_pa());
/* Program wakeup mask (using dword access to CS5536_PM1_EN) */
outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS);
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 42e65fee5673..2983faab5b18 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -456,12 +456,13 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
*/
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
- struct cyc2ns_data *data = cyc2ns_read_begin();
+ struct cyc2ns_data data;
unsigned long long ns;
- ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
+ cyc2ns_read_begin(&data);
+ ns = mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
+ cyc2ns_read_end();
- cyc2ns_read_end(data);
return ns;
}
@@ -470,12 +471,13 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
*/
static inline unsigned long long ns_2_cycles(unsigned long long ns)
{
- struct cyc2ns_data *data = cyc2ns_read_begin();
+ struct cyc2ns_data data;
unsigned long long cyc;
- cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul;
+ cyc2ns_read_begin(&data);
+ cyc = (ns << data.cyc2ns_shift) / data.cyc2ns_mul;
+ cyc2ns_read_end();
- cyc2ns_read_end(data);
return cyc;
}
@@ -1121,11 +1123,9 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
* done. The returned pointer is valid till preemption is re-enabled.
*/
const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end,
- unsigned int cpu)
+ const struct flush_tlb_info *info)
{
+ unsigned int cpu = smp_processor_id();
int locals = 0, remotes = 0, hubs = 0;
struct bau_desc *bau_desc;
struct cpumask *flush_mask;
@@ -1179,8 +1179,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
record_send_statistics(stat, locals, hubs, remotes, bau_desc);
- if (!end || (end - start) <= PAGE_SIZE)
- address = start;
+ if (!info->end || (info->end - info->start) <= PAGE_SIZE)
+ address = info->start;
else
address = TLB_FLUSH_ALL;
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 776c6592136c..03fc397335b7 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -160,13 +160,21 @@ static struct irq_domain *uv_get_irq_domain(void)
{
static struct irq_domain *uv_domain;
static DEFINE_MUTEX(uv_lock);
+ struct fwnode_handle *fn;
mutex_lock(&uv_lock);
- if (uv_domain == NULL) {
- uv_domain = irq_domain_add_tree(NULL, &uv_domain_ops, NULL);
- if (uv_domain)
- uv_domain->parent = x86_vector_domain;
- }
+ if (uv_domain)
+ goto out;
+
+ fn = irq_domain_alloc_named_fwnode("UV-CORE");
+ if (!fn)
+ goto out;
+
+ uv_domain = irq_domain_create_tree(fn, &uv_domain_ops, NULL);
+ irq_domain_free_fwnode(fn);
+ if (uv_domain)
+ uv_domain->parent = x86_vector_domain;
+out:
mutex_unlock(&uv_lock);
return uv_domain;
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
index a6a198c33623..05041871ac90 100644
--- a/arch/x86/power/Makefile
+++ b/arch/x86/power/Makefile
@@ -1,3 +1,5 @@
+OBJECT_FILES_NON_STANDARD_hibernate_asm_$(BITS).o := y
+
# __restore_processor_state() restores %gs after S3 resume and so should not
# itself be stack-protected
nostackp := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 6b05a9219ea2..78459a6d455a 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -129,7 +129,7 @@ static void __save_processor_state(struct saved_context *ctxt)
*/
ctxt->cr0 = read_cr0();
ctxt->cr2 = read_cr2();
- ctxt->cr3 = read_cr3();
+ ctxt->cr3 = __read_cr3();
ctxt->cr4 = __read_cr4();
#ifdef CONFIG_X86_64
ctxt->cr8 = read_cr8();
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index a6e21fee22ea..f2598d81cd55 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -147,10 +147,11 @@ static int relocate_restore_code(void)
if (!relocated_restore_code)
return -ENOMEM;
- memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE);
+ memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE);
/* Make the page containing the relocated code executable */
- pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code);
+ pgd = (pgd_t *)__va(read_cr3_pa()) +
+ pgd_index(relocated_restore_code);
p4d = p4d_offset(pgd, relocated_restore_code);
if (p4d_large(*p4d)) {
set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
@@ -292,8 +293,8 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size)
if (max_size < sizeof(struct restore_data_record))
return -EOVERFLOW;
- rdr->jump_address = (unsigned long)&restore_registers;
- rdr->jump_address_phys = __pa_symbol(&restore_registers);
+ rdr->jump_address = (unsigned long)restore_registers;
+ rdr->jump_address_phys = __pa_symbol(restore_registers);
rdr->cr3 = restore_cr3;
rdr->magic = RESTORE_MAGIC;
diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig
index 2a2d89d39af6..bb026699ad19 100644
--- a/arch/x86/ras/Kconfig
+++ b/arch/x86/ras/Kconfig
@@ -1,13 +1,3 @@
-config MCE_AMD_INJ
- tristate "Simple MCE injection interface for AMD processors"
- depends on RAS && X86_MCE && DEBUG_FS && AMD_NB
- default n
- help
- This is a simple debugfs interface to inject MCEs and test different
- aspects of the MCE handling code.
-
- WARNING: Do not even assume this interface is staying stable!
-
config RAS_CEC
bool "Correctable Errors Collector"
depends on X86_MCE && MEMORY_FAILURE && DEBUG_FS
@@ -20,4 +10,3 @@ config RAS_CEC
Bear in mind that this is absolutely useless if your platform doesn't
have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS.
-
diff --git a/arch/x86/ras/Makefile b/arch/x86/ras/Makefile
deleted file mode 100644
index 5f94546db280..000000000000
--- a/arch/x86/ras/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-obj-$(CONFIG_MCE_AMD_INJ) += mce_amd_inj.o
-
diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c
deleted file mode 100644
index 8730c2882fff..000000000000
--- a/arch/x86/ras/mce_amd_inj.c
+++ /dev/null
@@ -1,492 +0,0 @@
-/*
- * A simple MCE injection facility for testing different aspects of the RAS
- * code. This driver should be built as module so that it can be loaded
- * on production kernels for testing purposes.
- *
- * This file may be distributed under the terms of the GNU General Public
- * License version 2.
- *
- * Copyright (c) 2010-15: Borislav Petkov <bp@alien8.de>
- * Advanced Micro Devices Inc.
- */
-
-#include <linux/kobject.h>
-#include <linux/debugfs.h>
-#include <linux/device.h>
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/string.h>
-#include <linux/uaccess.h>
-#include <linux/pci.h>
-
-#include <asm/mce.h>
-#include <asm/smp.h>
-#include <asm/amd_nb.h>
-#include <asm/irq_vectors.h>
-
-#include "../kernel/cpu/mcheck/mce-internal.h"
-
-/*
- * Collect all the MCi_XXX settings
- */
-static struct mce i_mce;
-static struct dentry *dfs_inj;
-
-static u8 n_banks;
-
-#define MAX_FLAG_OPT_SIZE 3
-#define NBCFG 0x44
-
-enum injection_type {
- SW_INJ = 0, /* SW injection, simply decode the error */
- HW_INJ, /* Trigger a #MC */
- DFR_INT_INJ, /* Trigger Deferred error interrupt */
- THR_INT_INJ, /* Trigger threshold interrupt */
- N_INJ_TYPES,
-};
-
-static const char * const flags_options[] = {
- [SW_INJ] = "sw",
- [HW_INJ] = "hw",
- [DFR_INT_INJ] = "df",
- [THR_INT_INJ] = "th",
- NULL
-};
-
-/* Set default injection to SW_INJ */
-static enum injection_type inj_type = SW_INJ;
-
-#define MCE_INJECT_SET(reg) \
-static int inj_##reg##_set(void *data, u64 val) \
-{ \
- struct mce *m = (struct mce *)data; \
- \
- m->reg = val; \
- return 0; \
-}
-
-MCE_INJECT_SET(status);
-MCE_INJECT_SET(misc);
-MCE_INJECT_SET(addr);
-MCE_INJECT_SET(synd);
-
-#define MCE_INJECT_GET(reg) \
-static int inj_##reg##_get(void *data, u64 *val) \
-{ \
- struct mce *m = (struct mce *)data; \
- \
- *val = m->reg; \
- return 0; \
-}
-
-MCE_INJECT_GET(status);
-MCE_INJECT_GET(misc);
-MCE_INJECT_GET(addr);
-MCE_INJECT_GET(synd);
-
-DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
-
-/*
- * Caller needs to be make sure this cpu doesn't disappear
- * from under us, i.e.: get_cpu/put_cpu.
- */
-static int toggle_hw_mce_inject(unsigned int cpu, bool enable)
-{
- u32 l, h;
- int err;
-
- err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h);
- if (err) {
- pr_err("%s: error reading HWCR\n", __func__);
- return err;
- }
-
- enable ? (l |= BIT(18)) : (l &= ~BIT(18));
-
- err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h);
- if (err)
- pr_err("%s: error writing HWCR\n", __func__);
-
- return err;
-}
-
-static int __set_inj(const char *buf)
-{
- int i;
-
- for (i = 0; i < N_INJ_TYPES; i++) {
- if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) {
- inj_type = i;
- return 0;
- }
- }
- return -EINVAL;
-}
-
-static ssize_t flags_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- char buf[MAX_FLAG_OPT_SIZE];
- int n;
-
- n = sprintf(buf, "%s\n", flags_options[inj_type]);
-
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
-}
-
-static ssize_t flags_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- char buf[MAX_FLAG_OPT_SIZE], *__buf;
- int err;
-
- if (cnt > MAX_FLAG_OPT_SIZE)
- return -EINVAL;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt - 1] = 0;
-
- /* strip whitespace */
- __buf = strstrip(buf);
-
- err = __set_inj(__buf);
- if (err) {
- pr_err("%s: Invalid flags value: %s\n", __func__, __buf);
- return err;
- }
-
- *ppos += cnt;
-
- return cnt;
-}
-
-static const struct file_operations flags_fops = {
- .read = flags_read,
- .write = flags_write,
- .llseek = generic_file_llseek,
-};
-
-/*
- * On which CPU to inject?
- */
-MCE_INJECT_GET(extcpu);
-
-static int inj_extcpu_set(void *data, u64 val)
-{
- struct mce *m = (struct mce *)data;
-
- if (val >= nr_cpu_ids || !cpu_online(val)) {
- pr_err("%s: Invalid CPU: %llu\n", __func__, val);
- return -EINVAL;
- }
- m->extcpu = val;
- return 0;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n");
-
-static void trigger_mce(void *info)
-{
- asm volatile("int $18");
-}
-
-static void trigger_dfr_int(void *info)
-{
- asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR));
-}
-
-static void trigger_thr_int(void *info)
-{
- asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR));
-}
-
-static u32 get_nbc_for_node(int node_id)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
- u32 cores_per_node;
-
- cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket();
-
- return cores_per_node * node_id;
-}
-
-static void toggle_nb_mca_mst_cpu(u16 nid)
-{
- struct pci_dev *F3 = node_to_amd_nb(nid)->misc;
- u32 val;
- int err;
-
- if (!F3)
- return;
-
- err = pci_read_config_dword(F3, NBCFG, &val);
- if (err) {
- pr_err("%s: Error reading F%dx%03x.\n",
- __func__, PCI_FUNC(F3->devfn), NBCFG);
- return;
- }
-
- if (val & BIT(27))
- return;
-
- pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n",
- __func__);
-
- val |= BIT(27);
- err = pci_write_config_dword(F3, NBCFG, val);
- if (err)
- pr_err("%s: Error writing F%dx%03x.\n",
- __func__, PCI_FUNC(F3->devfn), NBCFG);
-}
-
-static void prepare_msrs(void *info)
-{
- struct mce m = *(struct mce *)info;
- u8 b = m.bank;
-
- wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
-
- if (boot_cpu_has(X86_FEATURE_SMCA)) {
- if (m.inject_flags == DFR_INT_INJ) {
- wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status);
- wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr);
- } else {
- wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status);
- wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
- }
-
- wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
- wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
- } else {
- wrmsrl(MSR_IA32_MCx_STATUS(b), m.status);
- wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr);
- wrmsrl(MSR_IA32_MCx_MISC(b), m.misc);
- }
-}
-
-static void do_inject(void)
-{
- u64 mcg_status = 0;
- unsigned int cpu = i_mce.extcpu;
- u8 b = i_mce.bank;
-
- rdtscll(i_mce.tsc);
-
- if (i_mce.misc)
- i_mce.status |= MCI_STATUS_MISCV;
-
- if (i_mce.synd)
- i_mce.status |= MCI_STATUS_SYNDV;
-
- if (inj_type == SW_INJ) {
- mce_inject_log(&i_mce);
- return;
- }
-
- /* prep MCE global settings for the injection */
- mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
-
- if (!(i_mce.status & MCI_STATUS_PCC))
- mcg_status |= MCG_STATUS_RIPV;
-
- /*
- * Ensure necessary status bits for deferred errors:
- * - MCx_STATUS[Deferred]: make sure it is a deferred error
- * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC
- */
- if (inj_type == DFR_INT_INJ) {
- i_mce.status |= MCI_STATUS_DEFERRED;
- i_mce.status |= (i_mce.status & ~MCI_STATUS_UC);
- }
-
- /*
- * For multi node CPUs, logging and reporting of bank 4 errors happens
- * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for
- * Fam10h and later BKDGs.
- */
- if (static_cpu_has(X86_FEATURE_AMD_DCM) &&
- b == 4 &&
- boot_cpu_data.x86 < 0x17) {
- toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu));
- cpu = get_nbc_for_node(amd_get_nb_id(cpu));
- }
-
- get_online_cpus();
- if (!cpu_online(cpu))
- goto err;
-
- toggle_hw_mce_inject(cpu, true);
-
- i_mce.mcgstatus = mcg_status;
- i_mce.inject_flags = inj_type;
- smp_call_function_single(cpu, prepare_msrs, &i_mce, 0);
-
- toggle_hw_mce_inject(cpu, false);
-
- switch (inj_type) {
- case DFR_INT_INJ:
- smp_call_function_single(cpu, trigger_dfr_int, NULL, 0);
- break;
- case THR_INT_INJ:
- smp_call_function_single(cpu, trigger_thr_int, NULL, 0);
- break;
- default:
- smp_call_function_single(cpu, trigger_mce, NULL, 0);
- }
-
-err:
- put_online_cpus();
-
-}
-
-/*
- * This denotes into which bank we're injecting and triggers
- * the injection, at the same time.
- */
-static int inj_bank_set(void *data, u64 val)
-{
- struct mce *m = (struct mce *)data;
-
- if (val >= n_banks) {
- pr_err("Non-existent MCE bank: %llu\n", val);
- return -EINVAL;
- }
-
- m->bank = val;
- do_inject();
-
- return 0;
-}
-
-MCE_INJECT_GET(bank);
-
-DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n");
-
-static const char readme_msg[] =
-"Description of the files and their usages:\n"
-"\n"
-"Note1: i refers to the bank number below.\n"
-"Note2: See respective BKDGs for the exact bit definitions of the files below\n"
-"as they mirror the hardware registers.\n"
-"\n"
-"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n"
-"\t attributes of the error which caused the MCE.\n"
-"\n"
-"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n"
-"\t used for error thresholding purposes and its validity is indicated by\n"
-"\t MCi_STATUS[MiscV].\n"
-"\n"
-"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n"
-"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n"
-"\n"
-"addr:\t Error address value to be written to MCi_ADDR. Log address information\n"
-"\t associated with the error.\n"
-"\n"
-"cpu:\t The CPU to inject the error on.\n"
-"\n"
-"bank:\t Specify the bank you want to inject the error into: the number of\n"
-"\t banks in a processor varies and is family/model-specific, therefore, the\n"
-"\t supplied value is sanity-checked. Setting the bank value also triggers the\n"
-"\t injection.\n"
-"\n"
-"flags:\t Injection type to be performed. Writing to this file will trigger a\n"
-"\t real machine check, an APIC interrupt or invoke the error decoder routines\n"
-"\t for AMD processors.\n"
-"\n"
-"\t Allowed error injection types:\n"
-"\t - \"sw\": Software error injection. Decode error to a human-readable \n"
-"\t format only. Safe to use.\n"
-"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n"
-"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n"
-"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n"
-"\t before injecting.\n"
-"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n"
-"\t error APIC interrupt handler to handle the error if the feature is \n"
-"\t is present in hardware. \n"
-"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n"
-"\t APIC interrupt handler to handle the error. \n"
-"\n";
-
-static ssize_t
-inj_readme_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- return simple_read_from_buffer(ubuf, cnt, ppos,
- readme_msg, strlen(readme_msg));
-}
-
-static const struct file_operations readme_fops = {
- .read = inj_readme_read,
-};
-
-static struct dfs_node {
- char *name;
- struct dentry *d;
- const struct file_operations *fops;
- umode_t perm;
-} dfs_fls[] = {
- { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR },
- { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR },
- { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR },
- { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR },
- { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR },
- { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR },
- { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR },
- { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH },
-};
-
-static int __init init_mce_inject(void)
-{
- unsigned int i;
- u64 cap;
-
- rdmsrl(MSR_IA32_MCG_CAP, cap);
- n_banks = cap & MCG_BANKCNT_MASK;
-
- dfs_inj = debugfs_create_dir("mce-inject", NULL);
- if (!dfs_inj)
- return -EINVAL;
-
- for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) {
- dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name,
- dfs_fls[i].perm,
- dfs_inj,
- &i_mce,
- dfs_fls[i].fops);
-
- if (!dfs_fls[i].d)
- goto err_dfs_add;
- }
-
- return 0;
-
-err_dfs_add:
- while (i-- > 0)
- debugfs_remove(dfs_fls[i].d);
-
- debugfs_remove(dfs_inj);
- dfs_inj = NULL;
-
- return -ENODEV;
-}
-
-static void __exit exit_mce_inject(void)
-{
-
- debugfs_remove_recursive(dfs_inj);
- dfs_inj = NULL;
-
- memset(&dfs_fls, 0, sizeof(dfs_fls));
-}
-module_init(init_mce_inject);
-module_exit(exit_mce_inject);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Borislav Petkov <bp@alien8.de>");
-MODULE_AUTHOR("AMD Inc.");
-MODULE_DESCRIPTION("MCE injection facility for RAS testing");
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index a163a90af4aa..cd4be19c36dc 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -102,7 +102,7 @@ static void __init setup_real_mode(void)
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
trampoline_pgd[0] = trampoline_pgd_entry.pgd;
- trampoline_pgd[511] = init_level4_pgt[511].pgd;
+ trampoline_pgd[511] = init_top_pgt[511].pgd;
#endif
}
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index fffb0a16f9e3..bced7a369a11 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,3 +1,6 @@
+OBJECT_FILES_NON_STANDARD_xen-asm_$(BITS).o := y
+OBJECT_FILES_NON_STANDARD_xen-pvh.o := y
+
ifdef CONFIG_FUNCTION_TRACER
# Do not profile debug and lowlevel utilities
CFLAGS_REMOVE_spinlock.o = -pg
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index bcea81f36fc5..b5e48da7fbff 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -178,7 +178,7 @@ static struct apic xen_pv_apic = {
.get_apic_id = xen_get_apic_id,
.set_apic_id = xen_set_apic_id, /* Can be NULL on 32-bit. */
- .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
#ifdef CONFIG_SMP
.send_IPI_mask = xen_send_IPI_mask,
diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c
index 30bb2e80cfe7..a18703be9ead 100644
--- a/arch/x86/xen/efi.c
+++ b/arch/x86/xen/efi.c
@@ -54,38 +54,6 @@ static efi_system_table_t efi_systab_xen __initdata = {
.tables = EFI_INVALID_TABLE_ADDR /* Initialized later. */
};
-static const struct efi efi_xen __initconst = {
- .systab = NULL, /* Initialized later. */
- .runtime_version = 0, /* Initialized later. */
- .mps = EFI_INVALID_TABLE_ADDR,
- .acpi = EFI_INVALID_TABLE_ADDR,
- .acpi20 = EFI_INVALID_TABLE_ADDR,
- .smbios = EFI_INVALID_TABLE_ADDR,
- .smbios3 = EFI_INVALID_TABLE_ADDR,
- .sal_systab = EFI_INVALID_TABLE_ADDR,
- .boot_info = EFI_INVALID_TABLE_ADDR,
- .hcdp = EFI_INVALID_TABLE_ADDR,
- .uga = EFI_INVALID_TABLE_ADDR,
- .uv_systab = EFI_INVALID_TABLE_ADDR,
- .fw_vendor = EFI_INVALID_TABLE_ADDR,
- .runtime = EFI_INVALID_TABLE_ADDR,
- .config_table = EFI_INVALID_TABLE_ADDR,
- .get_time = xen_efi_get_time,
- .set_time = xen_efi_set_time,
- .get_wakeup_time = xen_efi_get_wakeup_time,
- .set_wakeup_time = xen_efi_set_wakeup_time,
- .get_variable = xen_efi_get_variable,
- .get_next_variable = xen_efi_get_next_variable,
- .set_variable = xen_efi_set_variable,
- .query_variable_info = xen_efi_query_variable_info,
- .update_capsule = xen_efi_update_capsule,
- .query_capsule_caps = xen_efi_query_capsule_caps,
- .get_next_high_mono_count = xen_efi_get_next_high_mono_count,
- .reset_system = xen_efi_reset_system,
- .set_virtual_address_map = NULL, /* Not used under Xen. */
- .flags = 0 /* Initialized later. */
-};
-
static efi_system_table_t __init *xen_efi_probe(void)
{
struct xen_platform_op op = {
@@ -102,7 +70,18 @@ static efi_system_table_t __init *xen_efi_probe(void)
/* Here we know that Xen runs on EFI platform. */
- efi = efi_xen;
+ efi.get_time = xen_efi_get_time;
+ efi.set_time = xen_efi_set_time;
+ efi.get_wakeup_time = xen_efi_get_wakeup_time;
+ efi.set_wakeup_time = xen_efi_set_wakeup_time;
+ efi.get_variable = xen_efi_get_variable;
+ efi.get_next_variable = xen_efi_get_next_variable;
+ efi.set_variable = xen_efi_set_variable;
+ efi.query_variable_info = xen_efi_query_variable_info;
+ efi.update_capsule = xen_efi_update_capsule;
+ efi.query_capsule_caps = xen_efi_query_capsule_caps;
+ efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count;
+ efi.reset_system = xen_efi_reset_system;
efi_systab_xen.tables = info->cfg.addr;
efi_systab_xen.nr_tables = info->cfg.nent;
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 1f386d7fdf70..1d7a7213a310 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -975,37 +975,32 @@ static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
spin_unlock(&mm->page_table_lock);
}
-
-#ifdef CONFIG_SMP
-/* Another cpu may still have their %cr3 pointing at the pagetable, so
- we need to repoint it somewhere else before we can unpin it. */
-static void drop_other_mm_ref(void *info)
+static void drop_mm_ref_this_cpu(void *info)
{
struct mm_struct *mm = info;
- struct mm_struct *active_mm;
-
- active_mm = this_cpu_read(cpu_tlbstate.active_mm);
- if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm)
leave_mm(smp_processor_id());
- /* If this cpu still has a stale cr3 reference, then make sure
- it has been flushed. */
+ /*
+ * If this cpu still has a stale cr3 reference, then make sure
+ * it has been flushed.
+ */
if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
- load_cr3(swapper_pg_dir);
+ xen_mc_flush();
}
+#ifdef CONFIG_SMP
+/*
+ * Another cpu may still have their %cr3 pointing at the pagetable, so
+ * we need to repoint it somewhere else before we can unpin it.
+ */
static void xen_drop_mm_ref(struct mm_struct *mm)
{
cpumask_var_t mask;
unsigned cpu;
- if (current->active_mm == mm) {
- if (current->mm == mm)
- load_cr3(swapper_pg_dir);
- else
- leave_mm(smp_processor_id());
- }
+ drop_mm_ref_this_cpu(mm);
/* Get the "official" set of cpus referring to our pagetable. */
if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
@@ -1013,31 +1008,31 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
&& per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
continue;
- smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
+ smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1);
}
return;
}
cpumask_copy(mask, mm_cpumask(mm));
- /* It's possible that a vcpu may have a stale reference to our
- cr3, because its in lazy mode, and it hasn't yet flushed
- its set of pending hypercalls yet. In this case, we can
- look at its actual current cr3 value, and force it to flush
- if needed. */
+ /*
+ * It's possible that a vcpu may have a stale reference to our
+ * cr3, because its in lazy mode, and it hasn't yet flushed
+ * its set of pending hypercalls yet. In this case, we can
+ * look at its actual current cr3 value, and force it to flush
+ * if needed.
+ */
for_each_online_cpu(cpu) {
if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
cpumask_set_cpu(cpu, mask);
}
- if (!cpumask_empty(mask))
- smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
+ smp_call_function_many(mask, drop_mm_ref_this_cpu, mm, 1);
free_cpumask_var(mask);
}
#else
static void xen_drop_mm_ref(struct mm_struct *mm)
{
- if (current->active_mm == mm)
- load_cr3(swapper_pg_dir);
+ drop_mm_ref_this_cpu(mm);
}
#endif
@@ -1366,8 +1361,7 @@ static void xen_flush_tlb_single(unsigned long addr)
}
static void xen_flush_tlb_others(const struct cpumask *cpus,
- struct mm_struct *mm, unsigned long start,
- unsigned long end)
+ const struct flush_tlb_info *info)
{
struct {
struct mmuext_op op;
@@ -1379,7 +1373,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
} *args;
struct multicall_space mcs;
- trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
+ trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end);
if (cpumask_empty(cpus))
return; /* nothing to do */
@@ -1393,9 +1387,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
- if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
+ if (info->end != TLB_FLUSH_ALL &&
+ (info->end - info->start) <= PAGE_SIZE) {
args->op.cmd = MMUEXT_INVLPG_MULTI;
- args->op.arg1.linear_addr = start;
+ args->op.arg1.linear_addr = info->start;
}
MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
@@ -1470,8 +1465,8 @@ static void xen_write_cr3(unsigned long cr3)
* At the start of the day - when Xen launches a guest, it has already
* built pagetables for the guest. We diligently look over them
* in xen_setup_kernel_pagetable and graft as appropriate them in the
- * init_level4_pgt and its friends. Then when we are happy we load
- * the new init_level4_pgt - and continue on.
+ * init_top_pgt and its friends. Then when we are happy we load
+ * the new init_top_pgt - and continue on.
*
* The generic code starts (start_kernel) and 'init_mem_mapping' sets
* up the rest of the pagetables. When it has completed it loads the cr3.
@@ -1914,12 +1909,12 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
pt_end = pt_base + xen_start_info->nr_pt_frames;
/* Zap identity mapping */
- init_level4_pgt[0] = __pgd(0);
+ init_top_pgt[0] = __pgd(0);
/* Pre-constructed entries are in pfn, so convert to mfn */
/* L4[272] -> level3_ident_pgt */
/* L4[511] -> level3_kernel_pgt */
- convert_pfn_mfn(init_level4_pgt);
+ convert_pfn_mfn(init_top_pgt);
/* L3_i[0] -> level2_ident_pgt */
convert_pfn_mfn(level3_ident_pgt);
@@ -1950,10 +1945,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
/* Copy the initial P->M table mappings if necessary. */
i = pgd_index(xen_start_info->mfn_list);
if (i && i < pgd_index(__START_KERNEL_map))
- init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
+ init_top_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
/* Make pagetable pieces RO */
- set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
+ set_page_prot(init_top_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
@@ -1964,7 +1959,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
/* Pin down new L4 */
pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
- PFN_DOWN(__pa_symbol(init_level4_pgt)));
+ PFN_DOWN(__pa_symbol(init_top_pgt)));
/* Unpin Xen-provided one */
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
@@ -1974,7 +1969,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
* attach it to, so make sure we just set kernel pgd.
*/
xen_mc_batch();
- __xen_write_cr3(true, __pa(init_level4_pgt));
+ __xen_write_cr3(true, __pa(init_top_pgt));
xen_mc_issue(PARAVIRT_LAZY_CPU);
/* We can't that easily rip out L3 and L2, as the Xen pagetables are
@@ -2022,7 +2017,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
pmd_t pmd;
pte_t pte;
- pa = read_cr3();
+ pa = read_cr3_pa();
pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
sizeof(pgd)));
if (!pgd_present(pgd))
@@ -2102,7 +2097,7 @@ void __init xen_relocate_p2m(void)
pt_phys = pmd_phys + PFN_PHYS(n_pmd);
p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
- pgd = __va(read_cr3());
+ pgd = __va(read_cr3_pa());
new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
idx_p4d = 0;
save_pud = n_pud;
@@ -2209,7 +2204,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
{
unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
- BUG_ON(read_cr3() != __pa(initial_page_table));
+ BUG_ON(read_cr3_pa() != __pa(initial_page_table));
BUG_ON(cr3 != __pa(swapper_pg_dir));
/*
diff --git a/arch/x86/xen/xen-pvh.S b/arch/x86/xen/xen-pvh.S
index 5e246716d58f..e1a5fbeae08d 100644
--- a/arch/x86/xen/xen-pvh.S
+++ b/arch/x86/xen/xen-pvh.S
@@ -87,7 +87,7 @@ ENTRY(pvh_start_xen)
wrmsr
/* Enable pre-constructed page tables. */
- mov $_pa(init_level4_pgt), %eax
+ mov $_pa(init_top_pgt), %eax
mov %eax, %cr3
mov $(X86_CR0_PG | X86_CR0_PE), %eax
mov %eax, %cr0