Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig  8
-rw-r--r--  arch/x86/Kconfig.debug  12
-rw-r--r--  arch/x86/entry/entry_64.S  299
-rw-r--r--  arch/x86/entry/entry_64_compat.S  14
-rw-r--r--  arch/x86/include/asm/Kbuild  1
-rw-r--r--  arch/x86/include/asm/espfix.h  2
-rw-r--r--  arch/x86/include/asm/fpu/types.h  72
-rw-r--r--  arch/x86/include/asm/intel_pmc_ipc.h  27
-rw-r--r--  arch/x86/include/asm/kasan.h  8
-rw-r--r--  arch/x86/include/asm/kvm_host.h  2
-rw-r--r--  arch/x86/include/asm/mm-arch-hooks.h  15
-rw-r--r--  arch/x86/include/asm/mmu_context.h  2
-rw-r--r--  arch/x86/include/asm/processor.h  10
-rw-r--r--  arch/x86/include/uapi/asm/hyperv.h  2
-rw-r--r--  arch/x86/include/uapi/asm/kvm.h  4
-rw-r--r--  arch/x86/kernel/apic/vector.c  10
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_cqm.c  8
-rw-r--r--  arch/x86/kernel/early_printk.c  4
-rw-r--r--  arch/x86/kernel/espfix_64.c  28
-rw-r--r--  arch/x86/kernel/fpu/init.c  46
-rw-r--r--  arch/x86/kernel/head64.c  10
-rw-r--r--  arch/x86/kernel/head_64.S  29
-rw-r--r--  arch/x86/kernel/irq.c  20
-rw-r--r--  arch/x86/kernel/nmi.c  123
-rw-r--r--  arch/x86/kernel/process.c  2
-rw-r--r--  arch/x86/kernel/smpboot.c  38
-rw-r--r--  arch/x86/kernel/tsc.c  11
-rw-r--r--  arch/x86/kvm/cpuid.c  2
-rw-r--r--  arch/x86/kvm/iommu.c  2
-rw-r--r--  arch/x86/kvm/lapic.c  2
-rw-r--r--  arch/x86/kvm/mmu.c  10
-rw-r--r--  arch/x86/kvm/mtrr.c  40
-rw-r--r--  arch/x86/kvm/svm.c  110
-rw-r--r--  arch/x86/kvm/vmx.c  16
-rw-r--r--  arch/x86/kvm/x86.c  26
-rw-r--r--  arch/x86/kvm/x86.h  5
-rw-r--r--  arch/x86/mm/ioremap.c  23
-rw-r--r--  arch/x86/mm/kasan_init_64.c  47
-rw-r--r--  arch/x86/mm/mmap.c  7
-rw-r--r--  arch/x86/mm/mpx.c  24
-rw-r--r--  arch/x86/mm/tlb.c  2
41 files changed, 704 insertions, 419 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 55bced17dc95..b3a1a5d77d92 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -41,6 +41,7 @@ config X86
select ARCH_USE_CMPXCHG_LOCKREF if X86_64
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
+ select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANT_IPC_PARSE_VERSION if X86_32
select ARCH_WANT_OPTIONAL_GPIOLIB
@@ -254,6 +255,11 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING
config ARCH_SUPPORTS_DEBUG_PAGEALLOC
def_bool y
+config KASAN_SHADOW_OFFSET
+ hex
+ depends on KASAN
+ default 0xdffffc0000000000
+
config HAVE_INTEL_TXT
def_bool y
depends on INTEL_IOMMU && ACPI
@@ -2015,7 +2021,7 @@ config CMDLINE_BOOL
To compile command line arguments into the kernel,
set this option to 'Y', then fill in the
- the boot arguments in CONFIG_CMDLINE.
+ boot arguments in CONFIG_CMDLINE.
Systems with fully functional boot loaders (i.e. non-embedded)
should leave this option set to 'N'.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index a15893d17c55..d8c0d3266173 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -297,6 +297,18 @@ config OPTIMIZE_INLINING
If unsure, say N.
+config DEBUG_ENTRY
+ bool "Debug low-level entry code"
+ depends on DEBUG_KERNEL
+ ---help---
+ This option enables sanity checks in x86's low-level entry code.
+ Some of these sanity checks may slow down kernel entries and
+ exits or otherwise impact performance.
+
+ This is currently used to help test NMI code.
+
+ If unsure, say N.
+
config DEBUG_NMI_SELFTEST
bool "NMI Selftest"
depends on DEBUG_KERNEL && X86_LOCAL_APIC
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 3bb2c4302df1..8cb3e438f21e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1237,11 +1237,12 @@ ENTRY(nmi)
* If the variable is not set and the stack is not the NMI
* stack then:
* o Set the special variable on the stack
- * o Copy the interrupt frame into a "saved" location on the stack
- * o Copy the interrupt frame into a "copy" location on the stack
+ * o Copy the interrupt frame into an "outermost" location on the
+ * stack
+ * o Copy the interrupt frame into an "iret" location on the stack
* o Continue processing the NMI
* If the variable is set or the previous stack is the NMI stack:
- * o Modify the "copy" location to jump to the repeate_nmi
+ * o Modify the "iret" location to jump to the repeat_nmi
* o return back to the first NMI
*
* Now on exit of the first NMI, we first clear the stack variable
@@ -1250,31 +1251,151 @@ ENTRY(nmi)
* a nested NMI that updated the copy interrupt stack frame, a
* jump will be made to the repeat_nmi code that will handle the second
* NMI.
+ *
+ * However, espfix prevents us from directly returning to userspace
+ * with a single IRET instruction. Similarly, IRET to user mode
+ * can fault. We therefore handle NMIs from user space like
+ * other IST entries.
*/
/* Use %rdx as our temp variable throughout */
pushq %rdx
+ testb $3, CS-RIP+8(%rsp)
+ jz .Lnmi_from_kernel
+
+ /*
+ * NMI from user mode. We need to run on the thread stack, but we
+ * can't go through the normal entry paths: NMIs are masked, and
+ * we don't want to enable interrupts, because then we'll end
+ * up in an awkward situation in which IRQs are on but NMIs
+ * are off.
+ */
+
+ SWAPGS
+ cld
+ movq %rsp, %rdx
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ pushq 5*8(%rdx) /* pt_regs->ss */
+ pushq 4*8(%rdx) /* pt_regs->rsp */
+ pushq 3*8(%rdx) /* pt_regs->flags */
+ pushq 2*8(%rdx) /* pt_regs->cs */
+ pushq 1*8(%rdx) /* pt_regs->rip */
+ pushq $-1 /* pt_regs->orig_ax */
+ pushq %rdi /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ pushq (%rdx) /* pt_regs->dx */
+ pushq %rcx /* pt_regs->cx */
+ pushq %rax /* pt_regs->ax */
+ pushq %r8 /* pt_regs->r8 */
+ pushq %r9 /* pt_regs->r9 */
+ pushq %r10 /* pt_regs->r10 */
+ pushq %r11 /* pt_regs->r11 */
+ pushq %rbx /* pt_regs->rbx */
+ pushq %rbp /* pt_regs->rbp */
+ pushq %r12 /* pt_regs->r12 */
+ pushq %r13 /* pt_regs->r13 */
+ pushq %r14 /* pt_regs->r14 */
+ pushq %r15 /* pt_regs->r15 */
+
+ /*
+ * At this point we no longer need to worry about stack damage
+ * due to nesting -- we're on the normal thread stack and we're
+ * done with the NMI stack.
+ */
+
+ movq %rsp, %rdi
+ movq $-1, %rsi
+ call do_nmi
+
+ /*
+ * Return back to user mode. We must *not* do the normal exit
+ * work, because we don't want to enable interrupts. Fortunately,
+ * do_nmi doesn't modify pt_regs.
+ */
+ SWAPGS
+ jmp restore_c_regs_and_iret
+
+.Lnmi_from_kernel:
+ /*
+ * Here's what our stack frame will look like:
+ * +---------------------------------------------------------+
+ * | original SS |
+ * | original Return RSP |
+ * | original RFLAGS |
+ * | original CS |
+ * | original RIP |
+ * +---------------------------------------------------------+
+ * | temp storage for rdx |
+ * +---------------------------------------------------------+
+ * | "NMI executing" variable |
+ * +---------------------------------------------------------+
+ * | iret SS } Copied from "outermost" frame |
+ * | iret Return RSP } on each loop iteration; overwritten |
+ * | iret RFLAGS } by a nested NMI to force another |
+ * | iret CS } iteration if needed. |
+ * | iret RIP } |
+ * +---------------------------------------------------------+
+ * | outermost SS } initialized in first_nmi; |
+ * | outermost Return RSP } will not be changed before |
+ * | outermost RFLAGS } NMI processing is done. |
+ * | outermost CS } Copied to "iret" frame on each |
+ * | outermost RIP } iteration. |
+ * +---------------------------------------------------------+
+ * | pt_regs |
+ * +---------------------------------------------------------+
+ *
+ * The "original" frame is used by hardware. Before re-enabling
+ * NMIs, we need to be done with it, and we need to leave enough
+ * space for the asm code here.
+ *
+ * We return by executing IRET while RSP points to the "iret" frame.
+ * That will either return for real or it will loop back into NMI
+ * processing.
+ *
+ * The "outermost" frame is copied to the "iret" frame on each
+ * iteration of the loop, so each iteration starts with the "iret"
+ * frame pointing to the final return target.
+ */
+
/*
- * If %cs was not the kernel segment, then the NMI triggered in user
- * space, which means it is definitely not nested.
+ * Determine whether we're a nested NMI.
+ *
+ * If we interrupted kernel code between repeat_nmi and
+ * end_repeat_nmi, then we are a nested NMI. We must not
+ * modify the "iret" frame because it's being written by
+ * the outer NMI. That's okay; the outer NMI handler is
+ * about to call do_nmi anyway, so we can just
+ * resume the outer NMI.
*/
- cmpl $__KERNEL_CS, 16(%rsp)
- jne first_nmi
+
+ movq $repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja 1f
+ movq $end_repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja nested_nmi_out
+1:
/*
- * Check the special variable on the stack to see if NMIs are
- * executing.
+ * Now check "NMI executing". If it's set, then we're nested.
+ * This will not detect if we interrupted an outer NMI just
+ * before IRET.
*/
cmpl $1, -8(%rsp)
je nested_nmi
/*
- * Now test if the previous stack was an NMI stack.
- * We need the double check. We check the NMI stack to satisfy the
- * race when the first NMI clears the variable before returning.
- * We check the variable because the first NMI could be in a
- * breakpoint routine using a breakpoint stack.
+ * Now test if the previous stack was an NMI stack. This covers
+ * the case where we interrupt an outer NMI after it clears
+ * "NMI executing" but before IRET. We need to be careful, though:
+ * there is one case in which RSP could point to the NMI stack
+ * despite there being no NMI active: naughty userspace controls
+ * RSP at the very beginning of the SYSCALL targets. We can
+ * pull a fast one on naughty userspace, though: we program
+ * SYSCALL to mask DF, so userspace cannot cause DF to be set
+ * if it controls the kernel's RSP. We set DF before we clear
+ * "NMI executing".
*/
lea 6*8(%rsp), %rdx
/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
@@ -1286,25 +1407,20 @@ ENTRY(nmi)
cmpq %rdx, 4*8(%rsp)
/* If it is below the NMI stack, it is a normal NMI */
jb first_nmi
- /* Ah, it is within the NMI stack, treat it as nested */
+
+ /* Ah, it is within the NMI stack. */
+
+ testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
+ jz first_nmi /* RSP was user controlled. */
+
+ /* This is a nested NMI. */
nested_nmi:
/*
- * Do nothing if we interrupted the fixup in repeat_nmi.
- * It's about to repeat the NMI handler, so we are fine
- * with ignoring this one.
+ * Modify the "iret" frame to point to repeat_nmi, forcing another
+ * iteration of NMI handling.
*/
- movq $repeat_nmi, %rdx
- cmpq 8(%rsp), %rdx
- ja 1f
- movq $end_repeat_nmi, %rdx
- cmpq 8(%rsp), %rdx
- ja nested_nmi_out
-
-1:
- /* Set up the interrupted NMIs stack to jump to repeat_nmi */
- leaq -1*8(%rsp), %rdx
- movq %rdx, %rsp
+ subq $8, %rsp
leaq -10*8(%rsp), %rdx
pushq $__KERNEL_DS
pushq %rdx
@@ -1318,61 +1434,42 @@ nested_nmi:
nested_nmi_out:
popq %rdx
- /* No need to check faults here */
+ /* We are returning to kernel mode, so this cannot result in a fault. */
INTERRUPT_RETURN
first_nmi:
- /*
- * Because nested NMIs will use the pushed location that we
- * stored in rdx, we must keep that space available.
- * Here's what our stack frame will look like:
- * +-------------------------+
- * | original SS |
- * | original Return RSP |
- * | original RFLAGS |
- * | original CS |
- * | original RIP |
- * +-------------------------+
- * | temp storage for rdx |
- * +-------------------------+
- * | NMI executing variable |
- * +-------------------------+
- * | copied SS |
- * | copied Return RSP |
- * | copied RFLAGS |
- * | copied CS |
- * | copied RIP |
- * +-------------------------+
- * | Saved SS |
- * | Saved Return RSP |
- * | Saved RFLAGS |
- * | Saved CS |
- * | Saved RIP |
- * +-------------------------+
- * | pt_regs |
- * +-------------------------+
- *
- * The saved stack frame is used to fix up the copied stack frame
- * that a nested NMI may change to make the interrupted NMI iret jump
- * to the repeat_nmi. The original stack frame and the temp storage
- * is also used by nested NMIs and can not be trusted on exit.
- */
- /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
+ /* Restore rdx. */
movq (%rsp), %rdx
- /* Set the NMI executing variable on the stack. */
- pushq $1
+ /* Make room for "NMI executing". */
+ pushq $0
- /* Leave room for the "copied" frame */
+ /* Leave room for the "iret" frame */
subq $(5*8), %rsp
- /* Copy the stack frame to the Saved frame */
+ /* Copy the "original" frame to the "outermost" frame */
.rept 5
pushq 11*8(%rsp)
.endr
/* Everything up to here is safe from nested NMIs */
+#ifdef CONFIG_DEBUG_ENTRY
+ /*
+ * For ease of testing, unmask NMIs right away. Disabled by
+ * default because IRET is very expensive.
+ */
+ pushq $0 /* SS */
+ pushq %rsp /* RSP (minus 8 because of the previous push) */
+ addq $8, (%rsp) /* Fix up RSP */
+ pushfq /* RFLAGS */
+ pushq $__KERNEL_CS /* CS */
+ pushq $1f /* RIP */
+ INTERRUPT_RETURN /* continues at repeat_nmi below */
+1:
+#endif
+
+repeat_nmi:
/*
* If there was a nested NMI, the first NMI's iret will return
* here. But NMIs are still enabled and we can take another
@@ -1381,16 +1478,20 @@ first_nmi:
* it will just return, as we are about to repeat an NMI anyway.
* This makes it safe to copy to the stack frame that a nested
* NMI will update.
+ *
+ * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
+ * we're repeating an NMI, gsbase has the same value that it had on
+ * the first iteration. paranoid_entry will load the kernel
+ * gsbase if needed before we call do_nmi. "NMI executing"
+ * is zero.
*/
-repeat_nmi:
+ movq $1, 10*8(%rsp) /* Set "NMI executing". */
+
/*
- * Update the stack variable to say we are still in NMI (the update
- * is benign for the non-repeat case, where 1 was pushed just above
- * to this very stack slot).
+ * Copy the "outermost" frame to the "iret" frame. NMIs that nest
+ * here must not modify the "iret" frame while we're writing to
+ * it or it will end up containing garbage.
*/
- movq $1, 10*8(%rsp)
-
- /* Make another copy, this one may be modified by nested NMIs */
addq $(10*8), %rsp
.rept 5
pushq -6*8(%rsp)
@@ -1399,9 +1500,9 @@ repeat_nmi:
end_repeat_nmi:
/*
- * Everything below this point can be preempted by a nested
- * NMI if the first NMI took an exception and reset our iret stack
- * so that we repeat another NMI.
+ * Everything below this point can be preempted by a nested NMI.
+ * If this happens, then the inner NMI will change the "iret"
+ * frame to point back to repeat_nmi.
*/
pushq $-1 /* ORIG_RAX: no syscall to restart */
ALLOC_PT_GPREGS_ON_STACK
@@ -1415,28 +1516,11 @@ end_repeat_nmi:
*/
call paranoid_entry
- /*
- * Save off the CR2 register. If we take a page fault in the NMI then
- * it could corrupt the CR2 value. If the NMI preempts a page fault
- * handler before it was able to read the CR2 register, and then the
- * NMI itself takes a page fault, the page fault that was preempted
- * will read the information from the NMI page fault and not the
- * origin fault. Save it off and restore it if it changes.
- * Use the r12 callee-saved register.
- */
- movq %cr2, %r12
-
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp, %rdi
movq $-1, %rsi
call do_nmi
- /* Did the NMI take a page fault? Restore cr2 if it did */
- movq %cr2, %rcx
- cmpq %rcx, %r12
- je 1f
- movq %r12, %cr2
-1:
testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:
@@ -1444,11 +1528,26 @@ nmi_swapgs:
nmi_restore:
RESTORE_EXTRA_REGS
RESTORE_C_REGS
- /* Pop the extra iret frame at once */
+
+ /* Point RSP at the "iret" frame. */
REMOVE_PT_GPREGS_FROM_STACK 6*8
- /* Clear the NMI executing stack variable */
- movq $0, 5*8(%rsp)
+ /*
+ * Clear "NMI executing". Set DF first so that we can easily
+ * distinguish the remaining code between here and IRET from
+ * the SYSCALL entry and exit paths. On a native kernel, we
+ * could just inspect RIP, but, on paravirt kernels,
+ * INTERRUPT_RETURN can translate into a jump into a
+ * hypercall page.
+ */
+ std
+ movq $0, 5*8(%rsp) /* clear "NMI executing" */
+
+ /*
+ * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
+ * stack in a single instruction. We are returning to kernel
+ * mode, so this cannot result in a fault.
+ */
INTERRUPT_RETURN
END(nmi)
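
The frame diagram and copy loop above are easier to follow with a standalone model. The following plain C sketch is illustrative only (every name is invented; none of this is kernel code): the "outermost" frame is written once, the "iret" frame is refreshed from it on every pass, and a nested NMI that rewrites the "iret" frame simply buys one more pass through the handler.

#include <stdio.h>

#define REPEAT_NMI	0x1000UL	/* stand-in for the repeat_nmi label  */
#define ORIG_RIP	0x2000UL	/* where the first NMI interrupted us */

struct iret_frame { unsigned long rip; };	/* only RIP matters here */

static struct iret_frame iret, outermost;
static int pending_nested = 1;		/* one NMI nests while we handle  */
static int iterations;

/* Stand-in for do_nmi(): a nested NMI may rewrite only the "iret" frame. */
static void do_nmi_model(void)
{
	iterations++;
	if (pending_nested) {
		pending_nested--;
		iret.rip = REPEAT_NMI;
	}
}

int main(void)
{
	outermost.rip = ORIG_RIP;	/* first_nmi: saved once, never touched */

	do {				/* repeat_nmi:                          */
		iret = outermost;	/*   refresh the "iret" copy            */
		do_nmi_model();
	} while (iret.rip == REPEAT_NMI);	/* nested NMI forced another pass */

	printf("handler ran %d times, returning to %#lx\n", iterations, iret.rip);
	return 0;			/* prints: 2 times, 0x2000 */
}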
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index bb187a6a877c..5a1844765a7a 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -205,7 +205,6 @@ sysexit_from_sys_call:
movl RDX(%rsp), %edx /* arg3 */
movl RSI(%rsp), %ecx /* arg4 */
movl RDI(%rsp), %r8d /* arg5 */
- movl %ebp, %r9d /* arg6 */
.endm
.macro auditsys_exit exit
@@ -236,6 +235,7 @@ sysexit_from_sys_call:
sysenter_auditsys:
auditsys_entry_common
+ movl %ebp, %r9d /* reload 6th syscall arg */
jmp sysenter_dispatch
sysexit_audit:
@@ -336,7 +336,7 @@ ENTRY(entry_SYSCALL_compat)
* 32-bit zero extended:
*/
ASM_STAC
-1: movl (%r8), %ebp
+1: movl (%r8), %r9d
_ASM_EXTABLE(1b, ia32_badarg)
ASM_CLAC
orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
@@ -346,7 +346,7 @@ ENTRY(entry_SYSCALL_compat)
cstar_do_call:
/* 32-bit syscall -> 64-bit C ABI argument conversion */
movl %edi, %r8d /* arg5 */
- movl %ebp, %r9d /* arg6 */
+ /* r9 already loaded */ /* arg6 */
xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */
movl %ebx, %edi /* arg1 */
movl %edx, %edx /* arg3 (zero extension) */
@@ -358,7 +358,6 @@ cstar_dispatch:
call *ia32_sys_call_table(, %rax, 8)
movq %rax, RAX(%rsp)
1:
- movl RCX(%rsp), %ebp
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
@@ -392,7 +391,9 @@ sysretl_from_sys_call:
#ifdef CONFIG_AUDITSYSCALL
cstar_auditsys:
+ movl %r9d, R9(%rsp) /* register to be clobbered by call */
auditsys_entry_common
+ movl R9(%rsp), %r9d /* reload 6th syscall arg */
jmp cstar_dispatch
sysretl_audit:
@@ -404,14 +405,16 @@ cstar_tracesys:
testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jz cstar_auditsys
#endif
+ xchgl %r9d, %ebp
SAVE_EXTRA_REGS
xorl %eax, %eax /* Do not leak kernel information */
movq %rax, R11(%rsp)
movq %rax, R10(%rsp)
- movq %rax, R9(%rsp)
+ movq %r9, R9(%rsp)
movq %rax, R8(%rsp)
movq %rsp, %rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
+ movl R9(%rsp), %r9d
/* Reload arg registers from stack. (see sysenter_tracesys) */
movl RCX(%rsp), %ecx
@@ -421,6 +424,7 @@ cstar_tracesys:
movl %eax, %eax /* zero extension */
RESTORE_EXTRA_REGS
+ xchgl %ebp, %r9d
jmp cstar_do_call
END(entry_SYSCALL_compat)
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 4dd1f2d770af..aeac434c9feb 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -9,3 +9,4 @@ generic-y += cputime.h
generic-y += dma-contiguous.h
generic-y += early_ioremap.h
generic-y += mcs_spinlock.h
+generic-y += mm-arch-hooks.h
diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h
index 99efebb2f69d..ca3ce9ab9385 100644
--- a/arch/x86/include/asm/espfix.h
+++ b/arch/x86/include/asm/espfix.h
@@ -9,7 +9,7 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
extern void init_espfix_bsp(void);
-extern void init_espfix_ap(void);
+extern void init_espfix_ap(int cpu);
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 0637826292de..c49c5173158e 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -189,6 +189,7 @@ union fpregs_state {
struct fxregs_state fxsave;
struct swregs_state soft;
struct xregs_state xsave;
+ u8 __padding[PAGE_SIZE];
};
/*
@@ -198,40 +199,6 @@ union fpregs_state {
*/
struct fpu {
/*
- * @state:
- *
- * In-memory copy of all FPU registers that we save/restore
- * over context switches. If the task is using the FPU then
- * the registers in the FPU are more recent than this state
- * copy. If the task context-switches away then they get
- * saved here and represent the FPU state.
- *
- * After context switches there may be a (short) time period
- * during which the in-FPU hardware registers are unchanged
- * and still perfectly match this state, if the tasks
- * scheduled afterwards are not using the FPU.
- *
- * This is the 'lazy restore' window of optimization, which
- * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
- *
- * We detect whether a subsequent task uses the FPU via setting
- * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
- *
- * During this window, if the task gets scheduled again, we
- * might be able to skip having to do a restore from this
- * memory buffer to the hardware registers - at the cost of
- * incurring the overhead of #NM fault traps.
- *
- * Note that on modern CPUs that support the XSAVEOPT (or other
- * optimized XSAVE instructions), we don't use #NM traps anymore,
- * as the hardware can track whether FPU registers need saving
- * or not. On such CPUs we activate the non-lazy ('eagerfpu')
- * logic, which unconditionally saves/restores all FPU state
- * across context switches. (if FPU state exists.)
- */
- union fpregs_state state;
-
- /*
* @last_cpu:
*
* Records the last CPU on which this context was loaded into
@@ -288,6 +255,43 @@ struct fpu {
* deal with bursty apps that only use the FPU for a short time:
*/
unsigned char counter;
+ /*
+ * @state:
+ *
+ * In-memory copy of all FPU registers that we save/restore
+ * over context switches. If the task is using the FPU then
+ * the registers in the FPU are more recent than this state
+ * copy. If the task context-switches away then they get
+ * saved here and represent the FPU state.
+ *
+ * After context switches there may be a (short) time period
+ * during which the in-FPU hardware registers are unchanged
+ * and still perfectly match this state, if the tasks
+ * scheduled afterwards are not using the FPU.
+ *
+ * This is the 'lazy restore' window of optimization, which
+ * we track through 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
+ *
+ * We detect whether a subsequent task uses the FPU via setting
+ * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
+ *
+ * During this window, if the task gets scheduled again, we
+ * might be able to skip having to do a restore from this
+ * memory buffer to the hardware registers - at the cost of
+ * incurring the overhead of #NM fault traps.
+ *
+ * Note that on modern CPUs that support the XSAVEOPT (or other
+ * optimized XSAVE instructions), we don't use #NM traps anymore,
+ * as the hardware can track whether FPU registers need saving
+ * or not. On such CPUs we activate the non-lazy ('eagerfpu')
+ * logic, which unconditionally saves/restores all FPU state
+ * across context switches. (if FPU state exists.)
+ */
+ union fpregs_state state;
+ /*
+ * WARNING: 'state' is dynamically-sized. Do not put
+ * anything after it here.
+ */
};
#endif /* _ASM_X86_FPU_H */
diff --git a/arch/x86/include/asm/intel_pmc_ipc.h b/arch/x86/include/asm/intel_pmc_ipc.h
index 200ec2e7821d..cd0310e186f4 100644
--- a/arch/x86/include/asm/intel_pmc_ipc.h
+++ b/arch/x86/include/asm/intel_pmc_ipc.h
@@ -25,36 +25,9 @@
#if IS_ENABLED(CONFIG_INTEL_PMC_IPC)
-/*
- * intel_pmc_ipc_simple_command
- * @cmd: command
- * @sub: sub type
- */
int intel_pmc_ipc_simple_command(int cmd, int sub);
-
-/*
- * intel_pmc_ipc_raw_cmd
- * @cmd: command
- * @sub: sub type
- * @in: input data
- * @inlen: input length in bytes
- * @out: output data
- * @outlen: output length in dwords
- * @sptr: data writing to SPTR register
- * @dptr: data writing to DPTR register
- */
int intel_pmc_ipc_raw_cmd(u32 cmd, u32 sub, u8 *in, u32 inlen,
u32 *out, u32 outlen, u32 dptr, u32 sptr);
-
-/*
- * intel_pmc_ipc_command
- * @cmd: command
- * @sub: sub type
- * @in: input data
- * @inlen: input length in bytes
- * @out: output data
- * @outlen: output length in dwords
- */
int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen,
u32 *out, u32 outlen);
diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
index 8b22422fbad8..74a2a8dc9908 100644
--- a/arch/x86/include/asm/kasan.h
+++ b/arch/x86/include/asm/kasan.h
@@ -14,15 +14,11 @@
#ifndef __ASSEMBLY__
-extern pte_t kasan_zero_pte[];
-extern pte_t kasan_zero_pmd[];
-extern pte_t kasan_zero_pud[];
-
#ifdef CONFIG_KASAN
-void __init kasan_map_early_shadow(pgd_t *pgd);
+void __init kasan_early_init(void);
void __init kasan_init(void);
#else
-static inline void kasan_map_early_shadow(pgd_t *pgd) { }
+static inline void kasan_early_init(void) { }
static inline void kasan_init(void) { }
#endif
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2a7f5d782c33..49ec9038ec14 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -604,6 +604,8 @@ struct kvm_arch {
bool iommu_noncoherent;
#define __KVM_HAVE_ARCH_NONCOHERENT_DMA
atomic_t noncoherent_dma_count;
+#define __KVM_HAVE_ARCH_ASSIGNED_DEVICE
+ atomic_t assigned_device_count;
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
struct kvm_pit *vpit;
diff --git a/arch/x86/include/asm/mm-arch-hooks.h b/arch/x86/include/asm/mm-arch-hooks.h
deleted file mode 100644
index 4e881a342236..000000000000
--- a/arch/x86/include/asm/mm-arch-hooks.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * Architecture specific mm hooks
- *
- * Copyright (C) 2015, IBM Corporation
- * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#ifndef _ASM_X86_MM_ARCH_HOOKS_H
-#define _ASM_X86_MM_ARCH_HOOKS_H
-
-#endif /* _ASM_X86_MM_ARCH_HOOKS_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 5e8daee7c5c9..804a3a6030ca 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -23,7 +23,7 @@ extern struct static_key rdpmc_always_available;
static inline void load_mm_cr4(struct mm_struct *mm)
{
- if (static_key_true(&rdpmc_always_available) ||
+ if (static_key_false(&rdpmc_always_available) ||
atomic_read(&mm->context.perf_rdpmc_allowed))
cr4_set_bits(X86_CR4_PCE);
else
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 43e6519df0d5..944f1785ed0d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -390,9 +390,6 @@ struct thread_struct {
#endif
unsigned long gs;
- /* Floating point and extended processor state */
- struct fpu fpu;
-
/* Save middle states of ptrace breakpoints */
struct perf_event *ptrace_bps[HBP_NUM];
/* Debug status used for traps, single steps, etc... */
@@ -418,6 +415,13 @@ struct thread_struct {
unsigned long iopl;
/* Max allowed port in the bitmap, in bytes: */
unsigned io_bitmap_max;
+
+ /* Floating point and extended processor state */
+ struct fpu fpu;
+ /*
+ * WARNING: 'fpu' is dynamically-sized. It *MUST* be at
+ * the end.
+ */
};
/*
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index 8fba544e9cc4..f36d56bd7632 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -108,6 +108,8 @@
#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4)
/* Support for a virtual guest idle state is available */
#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5)
+/* Guest crash data handler available */
+#define HV_X64_GUEST_CRASH_MSR_AVAILABLE (1 << 10)
/*
* Implementation recommendations. Indicates which behaviors the hypervisor
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index a4ae82eb82aa..cd54147cb365 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -354,7 +354,7 @@ struct kvm_xcrs {
struct kvm_sync_regs {
};
-#define KVM_QUIRK_LINT0_REENABLED (1 << 0)
-#define KVM_QUIRK_CD_NW_CLEARED (1 << 1)
+#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0)
+#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1)
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 28eba2d38b15..f813261d9740 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -409,12 +409,6 @@ static void __setup_vector_irq(int cpu)
int irq, vector;
struct apic_chip_data *data;
- /*
- * vector_lock will make sure that we don't run into irq vector
- * assignments that might be happening on another cpu in parallel,
- * while we setup our initial vector to irq mappings.
- */
- raw_spin_lock(&vector_lock);
/* Mark the inuse vectors */
for_each_active_irq(irq) {
data = apic_chip_data(irq_get_irq_data(irq));
@@ -436,16 +430,16 @@ static void __setup_vector_irq(int cpu)
if (!cpumask_test_cpu(cpu, data->domain))
per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
}
- raw_spin_unlock(&vector_lock);
}
/*
- * Setup the vector to irq mappings.
+ * Setup the vector to irq mappings. Must be called with vector_lock held.
*/
void setup_vector_irq(int cpu)
{
int irq;
+ lockdep_assert_held(&vector_lock);
/*
* On most of the platforms, legacy PIC delivers the interrupts on the
* boot cpu. But there are certain platforms where PIC interrupts are
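
A compressed sketch of the locking change in this hunk, with invented names (not the real vector.c code): the raw_spin_lock()/unlock() pair moves out of the helper into its caller, and the helper keeps an explicit lockdep_assert_held() so the requirement stays machine-checked.

#include <linux/spinlock.h>
#include <linux/lockdep.h>

static DEFINE_RAW_SPINLOCK(demo_vector_lock);	/* stands in for vector_lock */

/* The callee no longer takes the lock; it only asserts the caller did. */
static void demo_setup_vectors(int cpu)
{
	lockdep_assert_held(&demo_vector_lock);
	/* ... walk irq descriptors and fill this cpu's vector_irq[] ... */
}

/* The caller holds the lock across vector setup and marking the cpu online. */
static void demo_bring_cpu_online(int cpu)
{
	raw_spin_lock(&demo_vector_lock);
	demo_setup_vectors(cpu);
	/* set_cpu_online(cpu, true); */
	raw_spin_unlock(&demo_vector_lock);
}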
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 188076161c1b..63eb68b73589 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -952,6 +952,14 @@ static u64 intel_cqm_event_count(struct perf_event *event)
return 0;
/*
+ * Getting up-to-date values requires an SMP IPI which is not
+ * possible if we're being called in interrupt context. Return
+ * the cached values instead.
+ */
+ if (unlikely(in_interrupt()))
+ goto out;
+
+ /*
* Notice that we don't perform the reading of an RMID
* atomically, because we can't hold a spin lock across the
* IPIs.
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 89427d8d4fc5..eec40f595ab9 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -175,7 +175,9 @@ static __init void early_serial_init(char *s)
}
if (*s) {
- if (kstrtoul(s, 0, &baud) < 0 || baud == 0)
+ baud = simple_strtoull(s, &e, 0);
+
+ if (baud == 0 || s == e)
baud = DEFAULT_BAUD;
}
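
The reason for switching parsers is that the baud number can be followed by more option characters: kstrtoul() has to consume the entire string, so a console-style "115200n8" made it fail and fall back to DEFAULT_BAUD, while simple_strtoull() stops at the first non-digit. A tiny userspace illustration with standard strtoul() (not the kernel helpers) of the prefix parsing the hunk above relies on:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *s = "115200n8";	/* baud followed by framing options */
	char *e;
	unsigned long baud = strtoul(s, &e, 0);

	if (baud == 0 || e == s)	/* same fallback test as the hunk above */
		baud = 9600;		/* stand-in for DEFAULT_BAUD            */

	printf("baud=%lu rest=\"%s\"\n", baud, e);	/* baud=115200 rest="n8" */
	return 0;
}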
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index f5d0730e7b08..ce95676abd60 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -131,25 +131,24 @@ void __init init_espfix_bsp(void)
init_espfix_random();
/* The rest is the same as for any other processor */
- init_espfix_ap();
+ init_espfix_ap(0);
}
-void init_espfix_ap(void)
+void init_espfix_ap(int cpu)
{
- unsigned int cpu, page;
+ unsigned int page;
unsigned long addr;
pud_t pud, *pud_p;
pmd_t pmd, *pmd_p;
pte_t pte, *pte_p;
- int n;
+ int n, node;
void *stack_page;
pteval_t ptemask;
/* We only have to do this once... */
- if (likely(this_cpu_read(espfix_stack)))
+ if (likely(per_cpu(espfix_stack, cpu)))
return; /* Already initialized */
- cpu = smp_processor_id();
addr = espfix_base_addr(cpu);
page = cpu/ESPFIX_STACKS_PER_PAGE;
@@ -165,12 +164,15 @@ void init_espfix_ap(void)
if (stack_page)
goto unlock_done;
+ node = cpu_to_node(cpu);
ptemask = __supported_pte_mask;
pud_p = &espfix_pud_page[pud_index(addr)];
pud = *pud_p;
if (!pud_present(pud)) {
- pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
+ struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);
+
+ pmd_p = (pmd_t *)page_address(page);
pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
for (n = 0; n < ESPFIX_PUD_CLONES; n++)
@@ -180,7 +182,9 @@ void init_espfix_ap(void)
pmd_p = pmd_offset(&pud, addr);
pmd = *pmd_p;
if (!pmd_present(pmd)) {
- pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
+ struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);
+
+ pte_p = (pte_t *)page_address(page);
pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
for (n = 0; n < ESPFIX_PMD_CLONES; n++)
@@ -188,7 +192,7 @@ void init_espfix_ap(void)
}
pte_p = pte_offset_kernel(&pmd, addr);
- stack_page = (void *)__get_free_page(GFP_KERNEL);
+ stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0));
pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
for (n = 0; n < ESPFIX_PTE_CLONES; n++)
set_pte(&pte_p[n*PTE_STRIDE], pte);
@@ -199,7 +203,7 @@ void init_espfix_ap(void)
unlock_done:
mutex_unlock(&espfix_init_mutex);
done:
- this_cpu_write(espfix_stack, addr);
- this_cpu_write(espfix_waddr, (unsigned long)stack_page
- + (addr & ~PAGE_MASK));
+ per_cpu(espfix_stack, cpu) = addr;
+ per_cpu(espfix_waddr, cpu) = (unsigned long)stack_page
+ + (addr & ~PAGE_MASK);
}
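
The allocation changes above follow one pattern: __get_free_page() always allocates on the node of the CPU doing the bringup, while alloc_pages_node() lets the boot CPU place the espfix page tables and stack page on the node of the CPU that will actually use them. A minimal sketch of that pattern (invented helper name, kernel context assumed):

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/topology.h>

/* Allocate one page on the node that owns 'cpu', not on the local node. */
static void *demo_alloc_page_for_cpu(int cpu, gfp_t gfp)
{
	struct page *page = alloc_pages_node(cpu_to_node(cpu), gfp, 0);

	return page ? page_address(page) : NULL;
}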
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 32826791e675..1e173f6285c7 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -4,6 +4,8 @@
#include <asm/fpu/internal.h>
#include <asm/tlbflush.h>
+#include <linux/sched.h>
+
/*
* Initialize the TS bit in CR0 according to the style of context-switches
* we are using:
@@ -136,6 +138,43 @@ static void __init fpu__init_system_generic(void)
unsigned int xstate_size;
EXPORT_SYMBOL_GPL(xstate_size);
+/* Enforce that 'MEMBER' is the last field of 'TYPE': */
+#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \
+ BUILD_BUG_ON(sizeof(TYPE) != offsetofend(TYPE, MEMBER))
+
+/*
+ * We append the 'struct fpu' to the task_struct:
+ */
+static void __init fpu__init_task_struct_size(void)
+{
+ int task_size = sizeof(struct task_struct);
+
+ /*
+ * Subtract off the static size of the register state.
+ * It potentially has a bunch of padding.
+ */
+ task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state);
+
+ /*
+ * Add back the dynamically-calculated register state
+ * size.
+ */
+ task_size += xstate_size;
+
+ /*
+ * We dynamically size 'struct fpu', so we require that
+ * it be at the end of 'thread_struct' and that
+ * 'thread_struct' be at the end of 'task_struct'. If
+ * you hit a compile error here, check the structure to
+ * see if something got added to the end.
+ */
+ CHECK_MEMBER_AT_END_OF(struct fpu, state);
+ CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu);
+ CHECK_MEMBER_AT_END_OF(struct task_struct, thread);
+
+ arch_task_struct_size = task_size;
+}
+
/*
* Set up the xstate_size based on the legacy FPU context size.
*
@@ -287,6 +326,7 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
fpu__init_system_generic();
fpu__init_system_xstate_size_legacy();
fpu__init_system_xstate();
+ fpu__init_task_struct_size();
fpu__init_system_ctx_switch();
}
@@ -311,9 +351,15 @@ static int __init x86_noxsave_setup(char *s)
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEC);
setup_clear_cpu_cap(X86_FEATURE_XSAVES);
setup_clear_cpu_cap(X86_FEATURE_AVX);
setup_clear_cpu_cap(X86_FEATURE_AVX2);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512F);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
+ setup_clear_cpu_cap(X86_FEATURE_MPX);
return 1;
}
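
The CHECK_MEMBER_AT_END_OF() trick above turns a layout assumption into a compile-time error. A self-contained sketch of the same idea with a toy struct, using C11 _Static_assert in place of BUILD_BUG_ON and re-defining offsetofend() locally so the snippet stands alone:

#include <stddef.h>

#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

struct toy {
	int id;
	char buf[16];	/* "dynamically sized" in spirit: must stay last */
};

/* Breaks the build if a field (or tail padding) ever lands after 'buf'. */
_Static_assert(sizeof(struct toy) == offsetofend(struct toy, buf),
	       "'buf' must be the last member of struct toy");

Applied to fpu.state, thread_struct.fpu and task_struct.thread as in the hunk above, the same check is what makes it safe to size task_struct with only xstate_size bytes of register state.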
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 5a4668136e98..f129a9af6357 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -161,11 +161,12 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
/* Kill off the identity-map trampoline */
reset_early_page_tables();
- kasan_map_early_shadow(early_level4_pgt);
-
- /* clear bss before set_intr_gate with early_idt_handler */
clear_bss();
+ clear_page(init_level4_pgt);
+
+ kasan_early_init();
+
for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
set_intr_gate(i, early_idt_handler_array[i]);
load_idt((const struct desc_ptr *)&idt_descr);
@@ -177,12 +178,9 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
*/
load_ucode_bsp();
- clear_page(init_level4_pgt);
/* set init_level4_pgt kernel high mapping*/
init_level4_pgt[511] = early_level4_pgt[511];
- kasan_map_early_shadow(init_level4_pgt);
-
x86_64_start_reservations(real_mode_data);
}
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index e5c27f729a38..1d40ca8a73f2 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -516,38 +516,9 @@ ENTRY(phys_base)
/* This must match the first entry in level2_kernel_pgt */
.quad 0x0000000000000000
-#ifdef CONFIG_KASAN
-#define FILL(VAL, COUNT) \
- .rept (COUNT) ; \
- .quad (VAL) ; \
- .endr
-
-NEXT_PAGE(kasan_zero_pte)
- FILL(kasan_zero_page - __START_KERNEL_map + _KERNPG_TABLE, 512)
-NEXT_PAGE(kasan_zero_pmd)
- FILL(kasan_zero_pte - __START_KERNEL_map + _KERNPG_TABLE, 512)
-NEXT_PAGE(kasan_zero_pud)
- FILL(kasan_zero_pmd - __START_KERNEL_map + _KERNPG_TABLE, 512)
-
-#undef FILL
-#endif
-
-
#include "../../x86/xen/xen-head.S"
__PAGE_ALIGNED_BSS
NEXT_PAGE(empty_zero_page)
.skip PAGE_SIZE
-#ifdef CONFIG_KASAN
-/*
- * This page used as early shadow. We don't use empty_zero_page
- * at early stages, stack instrumentation could write some garbage
- * to this page.
- * Latter we reuse it as zero shadow for large ranges of memory
- * that allowed to access, but not instrumented by kasan
- * (vmalloc/vmemmap ...).
- */
-NEXT_PAGE(kasan_zero_page)
- .skip PAGE_SIZE
-#endif
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 88b366487b0e..c7dfe1be784e 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -347,14 +347,22 @@ int check_irq_vectors_for_cpu_disable(void)
if (!desc)
continue;
+ /*
+ * Protect against concurrent action removal,
+ * affinity changes etc.
+ */
+ raw_spin_lock(&desc->lock);
data = irq_desc_get_irq_data(desc);
cpumask_copy(&affinity_new, data->affinity);
cpumask_clear_cpu(this_cpu, &affinity_new);
/* Do not count inactive or per-cpu irqs. */
- if (!irq_has_action(irq) || irqd_is_per_cpu(data))
+ if (!irq_has_action(irq) || irqd_is_per_cpu(data)) {
+ raw_spin_unlock(&desc->lock);
continue;
+ }
+ raw_spin_unlock(&desc->lock);
/*
* A single irq may be mapped to multiple
* cpu's vector_irq[] (for example IOAPIC cluster
@@ -385,6 +393,9 @@ int check_irq_vectors_for_cpu_disable(void)
* vector. If the vector is marked in the used vectors
* bitmap or an irq is assigned to it, we don't count
* it as available.
+ *
+ * As this is an inaccurate snapshot anyway, we can do
+ * this w/o holding vector_lock.
*/
for (vector = FIRST_EXTERNAL_VECTOR;
vector < first_system_vector; vector++) {
@@ -486,6 +497,11 @@ void fixup_irqs(void)
*/
mdelay(1);
+ /*
+ * We can walk the vector array of this cpu without holding
+ * vector_lock because the cpu is already marked !online, so
+ * nothing else will touch it.
+ */
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
unsigned int irr;
@@ -497,9 +513,9 @@ void fixup_irqs(void)
irq = __this_cpu_read(vector_irq[vector]);
desc = irq_to_desc(irq);
+ raw_spin_lock(&desc->lock);
data = irq_desc_get_irq_data(desc);
chip = irq_data_get_irq_chip(data);
- raw_spin_lock(&desc->lock);
if (chip->irq_retrigger) {
chip->irq_retrigger(data);
__this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED);
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index c3e985d1751c..d05bd2e2ee91 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -408,15 +408,15 @@ static void default_do_nmi(struct pt_regs *regs)
NOKPROBE_SYMBOL(default_do_nmi);
/*
- * NMIs can hit breakpoints which will cause it to lose its
- * NMI context with the CPU when the breakpoint does an iret.
- */
-#ifdef CONFIG_X86_32
-/*
- * For i386, NMIs use the same stack as the kernel, and we can
- * add a workaround to the iret problem in C (preventing nested
- * NMIs if an NMI takes a trap). Simply have 3 states the NMI
- * can be in:
+ * NMIs can page fault or hit breakpoints which will cause it to lose
+ * its NMI context with the CPU when the breakpoint or page fault does an IRET.
+ *
+ * As a result, NMIs can nest if NMIs get unmasked due to an IRET during
+ * NMI processing. On x86_64, the asm glue protects us from nested NMIs
+ * if the outer NMI came from kernel mode, but we can still nest if the
+ * outer NMI came from user mode.
+ *
+ * To handle these nested NMIs, we have three states:
*
* 1) not running
* 2) executing
@@ -430,15 +430,14 @@ NOKPROBE_SYMBOL(default_do_nmi);
* (Note, the latch is binary, thus multiple NMIs triggering,
* when one is running, are ignored. Only one NMI is restarted.)
*
- * If an NMI hits a breakpoint that executes an iret, another
- * NMI can preempt it. We do not want to allow this new NMI
- * to run, but we want to execute it when the first one finishes.
- * We set the state to "latched", and the exit of the first NMI will
- * perform a dec_return, if the result is zero (NOT_RUNNING), then
- * it will simply exit the NMI handler. If not, the dec_return
- * would have set the state to NMI_EXECUTING (what we want it to
- * be when we are running). In this case, we simply jump back
- * to rerun the NMI handler again, and restart the 'latched' NMI.
+ * If an NMI executes an iret, another NMI can preempt it. We do not
+ * want to allow this new NMI to run, but we want to execute it when the
+ * first one finishes. We set the state to "latched", and the exit of
+ * the first NMI will perform a dec_return, if the result is zero
+ * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the
+ * dec_return would have set the state to NMI_EXECUTING (what we want it
+ * to be when we are running). In this case, we simply jump back to
+ * rerun the NMI handler again, and restart the 'latched' NMI.
*
* No trap (breakpoint or page fault) should be hit before nmi_restart,
* thus there is no race between the first check of state for NOT_RUNNING
@@ -461,49 +460,36 @@ enum nmi_states {
static DEFINE_PER_CPU(enum nmi_states, nmi_state);
static DEFINE_PER_CPU(unsigned long, nmi_cr2);
-#define nmi_nesting_preprocess(regs) \
- do { \
- if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \
- this_cpu_write(nmi_state, NMI_LATCHED); \
- return; \
- } \
- this_cpu_write(nmi_state, NMI_EXECUTING); \
- this_cpu_write(nmi_cr2, read_cr2()); \
- } while (0); \
- nmi_restart:
-
-#define nmi_nesting_postprocess() \
- do { \
- if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \
- write_cr2(this_cpu_read(nmi_cr2)); \
- if (this_cpu_dec_return(nmi_state)) \
- goto nmi_restart; \
- } while (0)
-#else /* x86_64 */
+#ifdef CONFIG_X86_64
/*
- * In x86_64 things are a bit more difficult. This has the same problem
- * where an NMI hitting a breakpoint that calls iret will remove the
- * NMI context, allowing a nested NMI to enter. What makes this more
- * difficult is that both NMIs and breakpoints have their own stack.
- * When a new NMI or breakpoint is executed, the stack is set to a fixed
- * point. If an NMI is nested, it will have its stack set at that same
- * fixed address that the first NMI had, and will start corrupting the
- * stack. This is handled in entry_64.S, but the same problem exists with
- * the breakpoint stack.
+ * In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without
+ * some care, the inner breakpoint will clobber the outer breakpoint's
+ * stack.
*
- * If a breakpoint is being processed, and the debug stack is being used,
- * if an NMI comes in and also hits a breakpoint, the stack pointer
- * will be set to the same fixed address as the breakpoint that was
- * interrupted, causing that stack to be corrupted. To handle this case,
- * check if the stack that was interrupted is the debug stack, and if
- * so, change the IDT so that new breakpoints will use the current stack
- * and not switch to the fixed address. On return of the NMI, switch back
- * to the original IDT.
+ * If a breakpoint is being processed, and the debug stack is being
+ * used, if an NMI comes in and also hits a breakpoint, the stack
+ * pointer will be set to the same fixed address as the breakpoint that
+ * was interrupted, causing that stack to be corrupted. To handle this
+ * case, check if the stack that was interrupted is the debug stack, and
+ * if so, change the IDT so that new breakpoints will use the current
+ * stack and not switch to the fixed address. On return of the NMI,
+ * switch back to the original IDT.
*/
static DEFINE_PER_CPU(int, update_debug_stack);
+#endif
-static inline void nmi_nesting_preprocess(struct pt_regs *regs)
+dotraplinkage notrace void
+do_nmi(struct pt_regs *regs, long error_code)
{
+ if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
+ this_cpu_write(nmi_state, NMI_LATCHED);
+ return;
+ }
+ this_cpu_write(nmi_state, NMI_EXECUTING);
+ this_cpu_write(nmi_cr2, read_cr2());
+nmi_restart:
+
+#ifdef CONFIG_X86_64
/*
* If we interrupted a breakpoint, it is possible that
* the nmi handler will have breakpoints too. We need to
@@ -514,22 +500,8 @@ static inline void nmi_nesting_preprocess(struct pt_regs *regs)
debug_stack_set_zero();
this_cpu_write(update_debug_stack, 1);
}
-}
-
-static inline void nmi_nesting_postprocess(void)
-{
- if (unlikely(this_cpu_read(update_debug_stack))) {
- debug_stack_reset();
- this_cpu_write(update_debug_stack, 0);
- }
-}
#endif
-dotraplinkage notrace void
-do_nmi(struct pt_regs *regs, long error_code)
-{
- nmi_nesting_preprocess(regs);
-
nmi_enter();
inc_irq_stat(__nmi_count);
@@ -539,8 +511,17 @@ do_nmi(struct pt_regs *regs, long error_code)
nmi_exit();
- /* On i386, may loop back to preprocess */
- nmi_nesting_postprocess();
+#ifdef CONFIG_X86_64
+ if (unlikely(this_cpu_read(update_debug_stack))) {
+ debug_stack_reset();
+ this_cpu_write(update_debug_stack, 0);
+ }
+#endif
+
+ if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
+ write_cr2(this_cpu_read(nmi_cr2));
+ if (this_cpu_dec_return(nmi_state))
+ goto nmi_restart;
}
NOKPROBE_SYMBOL(do_nmi);
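
The three-state machine that do_nmi() now open-codes has one key property: any number of NMIs that arrive while the handler runs collapse into exactly one replay. A toy userspace model of that latch (not kernel code; the plain decrement mirrors this_cpu_dec_return()):

#include <stdio.h>

enum nmi_states { NMI_NOT_RUNNING = 0, NMI_EXECUTING, NMI_LATCHED };

static enum nmi_states state;
static int handled;
static int nested_budget = 3;		/* pretend 3 NMIs nest mid-handler */

static void nmi_entry(void);

static void handler(void)
{
	handled++;
	while (nested_budget > 0) {	/* nested NMIs arrive while running */
		nested_budget--;
		nmi_entry();
	}
}

static void nmi_entry(void)
{
	if (state != NMI_NOT_RUNNING) {	/* already running: just latch      */
		state = NMI_LATCHED;
		return;
	}
	state = NMI_EXECUTING;
restart:
	handler();
	if (--state)			/* LATCHED -> EXECUTING: run again   */
		goto restart;		/* EXECUTING -> NOT_RUNNING: done    */
}

int main(void)
{
	nmi_entry();
	printf("1 NMI + 3 nested NMIs -> handler ran %d times\n", handled);
	return 0;			/* prints: handler ran 2 times */
}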
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9cad694ed7c4..397688beed4b 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -81,7 +81,7 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
*/
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
- *dst = *src;
+ memcpy(dst, src, arch_task_struct_size);
return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8add66b22f33..b1f3ed9c7a9e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -171,11 +171,6 @@ static void smp_callin(void)
apic_ap_setup();
/*
- * Need to setup vector mappings before we enable interrupts.
- */
- setup_vector_irq(smp_processor_id());
-
- /*
* Save our processor parameters. Note: this information
* is needed for clock calibration.
*/
@@ -239,18 +234,13 @@ static void notrace start_secondary(void *unused)
check_tsc_sync_target();
/*
- * Enable the espfix hack for this CPU
- */
-#ifdef CONFIG_X86_ESPFIX64
- init_espfix_ap();
-#endif
-
- /*
- * We need to hold vector_lock so there the set of online cpus
- * does not change while we are assigning vectors to cpus. Holding
- * this lock ensures we don't half assign or remove an irq from a cpu.
+ * Lock vector_lock and initialize the vectors on this cpu
+ * before setting the cpu online. We must set it online with
+ * vector_lock held to prevent a concurrent setup/teardown
+ * from seeing a half valid vector space.
*/
lock_vector_lock();
+ setup_vector_irq(smp_processor_id());
set_cpu_online(smp_processor_id(), true);
unlock_vector_lock();
cpu_set_state_online(smp_processor_id());
@@ -854,6 +844,13 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
initial_code = (unsigned long)start_secondary;
stack_start = idle->thread.sp;
+ /*
+ * Enable the espfix hack for this CPU
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ init_espfix_ap(cpu);
+#endif
+
/* So we see what's up */
announce_cpu(cpu, apicid);
@@ -995,8 +992,17 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
common_cpu_up(cpu, tidle);
+ /*
+ * We have to walk the irq descriptors to setup the vector
+ * space for the cpu which comes online. Prevent irq
+ * alloc/free across the bringup.
+ */
+ irq_lock_sparse();
+
err = do_boot_cpu(apicid, cpu, tidle);
+
if (err) {
+ irq_unlock_sparse();
pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
return -EIO;
}
@@ -1014,6 +1020,8 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
touch_nmi_watchdog();
}
+ irq_unlock_sparse();
+
return 0;
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 505449700e0c..7437b41f6a47 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -598,10 +598,19 @@ static unsigned long quick_pit_calibrate(void)
if (!pit_expect_msb(0xff-i, &delta, &d2))
break;
+ delta -= tsc;
+
+ /*
+ * Extrapolate the error and fail fast if the error will
+ * never be below 500 ppm.
+ */
+ if (i == 1 &&
+ d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11)
+ return 0;
+
/*
* Iterate until the error is less than 500 ppm
*/
- delta -= tsc;
if (d1+d2 >= delta >> 11)
continue;
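
The early exit above is a pure extrapolation: delta grows roughly linearly with the iteration count while the measurement jitter d1 + d2 stays about the same, so if even MAX_QUICK_PIT_ITERATIONS worth of delta cannot push the error under the ~500 ppm budget (delta >> 11), calibration can never succeed and it is cheaper to bail out after the first iteration. A small numeric sketch of that test (the iteration constant here is a made-up placeholder, not the value defined in tsc.c):

#include <stdio.h>

#define FAKE_MAX_ITERATIONS 32	/* placeholder for MAX_QUICK_PIT_ITERATIONS */

/* Could the ~500 ppm target ever be reached, given one iteration's numbers? */
static int can_ever_converge(unsigned long delta_one_iter,
			     unsigned long d1, unsigned long d2)
{
	return (d1 + d2) < ((delta_one_iter * FAKE_MAX_ITERATIONS) >> 11);
}

int main(void)
{
	/* ~2.3M TSC ticks per PIT period; compare 5k vs 50k ticks of jitter. */
	printf("jitter  5000: %s\n", can_ever_converge(2300000,  3000,  2000)
					? "keep calibrating" : "fail fast");
	printf("jitter 50000: %s\n", can_ever_converge(2300000, 30000, 20000)
					? "keep calibrating" : "fail fast");
	return 0;	/* prints: keep calibrating, then fail fast */
}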
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 64dd46793099..2fbea2544f24 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -98,6 +98,8 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
vcpu->arch.eager_fpu = use_eager_fpu() || guest_cpuid_has_mpx(vcpu);
+ if (vcpu->arch.eager_fpu)
+ kvm_x86_ops->fpu_activate(vcpu);
/*
* The existing code assumes virtual address is 48-bit in the canonical
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
index 7dbced309ddb..5c520ebf6343 100644
--- a/arch/x86/kvm/iommu.c
+++ b/arch/x86/kvm/iommu.c
@@ -200,6 +200,7 @@ int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev)
goto out_unmap;
}
+ kvm_arch_start_assignment(kvm);
pci_set_dev_assigned(pdev);
dev_info(&pdev->dev, "kvm assign device\n");
@@ -224,6 +225,7 @@ int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev)
iommu_detach_device(domain, &pdev->dev);
pci_clear_dev_assigned(pdev);
+ kvm_arch_end_assignment(kvm);
dev_info(&pdev->dev, "kvm deassign device\n");
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 954e98a8c2e3..2a5ca97c263b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1595,7 +1595,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
for (i = 0; i < APIC_LVT_NUM; i++)
apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
apic_update_lvtt(apic);
- if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_LINT0_REENABLED))
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
apic_set_reg(apic, APIC_LVT0,
SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
apic_manage_nmi_watchdog(apic, kvm_apic_get_reg(apic, APIC_LVT0));
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f807496b62c2..44171462bd2a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2479,6 +2479,14 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
return 0;
}
+static bool kvm_is_mmio_pfn(pfn_t pfn)
+{
+ if (pfn_valid(pfn))
+ return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
+
+ return true;
+}
+
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
@@ -2506,7 +2514,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= PT_PAGE_SIZE_MASK;
if (tdp_enabled)
spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
- kvm_is_reserved_pfn(pfn));
+ kvm_is_mmio_pfn(pfn));
if (host_writable)
spte |= SPTE_HOST_WRITEABLE;
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index de1d2d8062e2..dc0a84a6f309 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -120,6 +120,16 @@ static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state)
return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK;
}
+static u8 mtrr_disabled_type(void)
+{
+ /*
+ * Intel SDM 11.11.2.2: all MTRRs are disabled when
+ * IA32_MTRR_DEF_TYPE.E bit is cleared, and the UC
+ * memory type is applied to all of physical memory.
+ */
+ return MTRR_TYPE_UNCACHABLE;
+}
+
/*
* Three terms are used in the following code:
* - segment, it indicates the address segments covered by fixed MTRRs.
@@ -434,6 +444,8 @@ struct mtrr_iter {
/* output fields. */
int mem_type;
+ /* mtrr is completely disabled? */
+ bool mtrr_disabled;
/* [start, end) is not fully covered in MTRRs? */
bool partial_map;
@@ -549,7 +561,7 @@ static void mtrr_lookup_var_next(struct mtrr_iter *iter)
static void mtrr_lookup_start(struct mtrr_iter *iter)
{
if (!mtrr_is_enabled(iter->mtrr_state)) {
- iter->partial_map = true;
+ iter->mtrr_disabled = true;
return;
}
@@ -563,6 +575,7 @@ static void mtrr_lookup_init(struct mtrr_iter *iter,
iter->mtrr_state = mtrr_state;
iter->start = start;
iter->end = end;
+ iter->mtrr_disabled = false;
iter->partial_map = false;
iter->fixed = false;
iter->range = NULL;
@@ -656,15 +669,19 @@ u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
return MTRR_TYPE_WRBACK;
}
- /* It is not covered by MTRRs. */
- if (iter.partial_map) {
- /*
- * We just check one page, partially covered by MTRRs is
- * impossible.
- */
- WARN_ON(type != -1);
- type = mtrr_default_type(mtrr_state);
- }
+ if (iter.mtrr_disabled)
+ return mtrr_disabled_type();
+
+ /*
+ * We just check one page, partially covered by MTRRs is
+ * impossible.
+ */
+ WARN_ON(iter.partial_map);
+
+ /* not contained in any MTRRs. */
+ if (type == -1)
+ return mtrr_default_type(mtrr_state);
+
return type;
}
EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type);
@@ -689,6 +706,9 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
return false;
}
+ if (iter.mtrr_disabled)
+ return true;
+
if (!iter.partial_map)
return true;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 602b974a60a6..8e0c0844c6b9 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -865,6 +865,64 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
}
+#define MTRR_TYPE_UC_MINUS 7
+#define MTRR2PROTVAL_INVALID 0xff
+
+static u8 mtrr2protval[8];
+
+static u8 fallback_mtrr_type(int mtrr)
+{
+ /*
+ * WT and WP aren't always available in the host PAT. Treat
+ * them as UC and UC- respectively. Everything else should be
+ * there.
+ */
+ switch (mtrr)
+ {
+ case MTRR_TYPE_WRTHROUGH:
+ return MTRR_TYPE_UNCACHABLE;
+ case MTRR_TYPE_WRPROT:
+ return MTRR_TYPE_UC_MINUS;
+ default:
+ BUG();
+ }
+}
+
+static void build_mtrr2protval(void)
+{
+ int i;
+ u64 pat;
+
+ for (i = 0; i < 8; i++)
+ mtrr2protval[i] = MTRR2PROTVAL_INVALID;
+
+ /* Ignore the invalid MTRR types. */
+ mtrr2protval[2] = 0;
+ mtrr2protval[3] = 0;
+
+ /*
+ * Use host PAT value to figure out the mapping from guest MTRR
+ * values to nested page table PAT/PCD/PWT values. We do not
+ * want to change the host PAT value every time we enter the
+ * guest.
+ */
+ rdmsrl(MSR_IA32_CR_PAT, pat);
+ for (i = 0; i < 8; i++) {
+ u8 mtrr = pat >> (8 * i);
+
+ if (mtrr2protval[mtrr] == MTRR2PROTVAL_INVALID)
+ mtrr2protval[mtrr] = __cm_idx2pte(i);
+ }
+
+ for (i = 0; i < 8; i++) {
+ if (mtrr2protval[i] == MTRR2PROTVAL_INVALID) {
+ u8 fallback = fallback_mtrr_type(i);
+ mtrr2protval[i] = mtrr2protval[fallback];
+ BUG_ON(mtrr2protval[i] == MTRR2PROTVAL_INVALID);
+ }
+ }
+}
+
static __init int svm_hardware_setup(void)
{
int cpu;
@@ -931,6 +989,7 @@ static __init int svm_hardware_setup(void)
} else
kvm_disable_tdp();
+ build_mtrr2protval();
return 0;
err:
@@ -1085,6 +1144,39 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
return target_tsc - tsc;
}
+static void svm_set_guest_pat(struct vcpu_svm *svm, u64 *g_pat)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ /* Unlike Intel, AMD takes the guest's CR0.CD into account.
+ *
+ * AMD does not have IPAT. To emulate it for the case of guests
+ * with no assigned devices, just set everything to WB. If guests
+ * have assigned devices, however, we cannot force WB for RAM
+ * pages only, so use the guest PAT directly.
+ */
+ if (!kvm_arch_has_assigned_device(vcpu->kvm))
+ *g_pat = 0x0606060606060606;
+ else
+ *g_pat = vcpu->arch.pat;
+}
+
+static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+{
+ u8 mtrr;
+
+ /*
+ * 1. MMIO: trust guest MTRR, so same as item 3.
+ * 2. No passthrough: always map as WB, and force guest PAT to WB as well
+ * 3. Passthrough: can't guarantee the result, try to trust guest.
+ */
+ if (!is_mmio && !kvm_arch_has_assigned_device(vcpu->kvm))
+ return 0;
+
+ mtrr = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
+ return mtrr2protval[mtrr];
+}
+
static void init_vmcb(struct vcpu_svm *svm, bool init_event)
{
struct vmcb_control_area *control = &svm->vmcb->control;
@@ -1180,6 +1272,7 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event)
clr_cr_intercept(svm, INTERCEPT_CR3_READ);
clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
save->g_pat = svm->vcpu.arch.pat;
+ svm_set_guest_pat(svm, &save->g_pat);
save->cr3 = 0;
save->cr4 = 0;
}
@@ -1579,7 +1672,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
* does not do it - this results in some delay at
* reboot
*/
- if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_CD_NW_CLEARED))
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
svm->vmcb->save.cr0 = cr0;
mark_dirty(svm->vmcb, VMCB_CR);
@@ -3254,6 +3347,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
case MSR_VM_IGNNE:
vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
break;
+ case MSR_IA32_CR_PAT:
+ if (npt_enabled) {
+ if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
+ return 1;
+ vcpu->arch.pat = data;
+ svm_set_guest_pat(svm, &svm->vmcb->save.g_pat);
+ mark_dirty(svm->vmcb, VMCB_NPT);
+ break;
+ }
+ /* fall through */
default:
return kvm_set_msr_common(vcpu, msr);
}
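
The kvm_mtrr_valid() check applied to the PAT write above amounts to requiring that each of the eight one-byte entries encodes a defined memory type (0, 1, 4, 5, 6 or 7). A stand-alone version written from the architectural encoding, not copied from mtrr.c:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static bool pat_value_valid(uint64_t pat)
{
	int i;

	for (i = 0; i < 8; i++) {
		uint8_t type = pat >> (8 * i);

		/* 0xf3 = bit mask of types 0, 1, 4, 5, 6, 7 */
		if (type >= 8 || !((1 << type) & 0xf3))
			return false;
	}
	return true;
}

int main(void)
{
	printf("%d\n", pat_value_valid(0x0007040600070406ULL));	/* power-on default: valid */
	printf("%d\n", pat_value_valid(0x0000000000000002ULL));	/* type 2 is reserved: invalid */
	return 0;
}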
@@ -4088,11 +4191,6 @@ static bool svm_has_high_real_mode_segbase(void)
return true;
}
-static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
-{
- return 0;
-}
-
static void svm_cpuid_update(struct kvm_vcpu *vcpu)
{
}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e856dd566f4c..83b7b5cd75d5 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8632,22 +8632,17 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
u64 ipat = 0;
/* For VT-d and EPT combination
- * 1. MMIO: always map as UC
+ * 1. MMIO: guest may want to apply WC, trust it.
* 2. EPT with VT-d:
* a. VT-d without snooping control feature: can't guarantee the
- * result, try to trust guest.
+ * result, try to trust guest. So the same as item 1.
* b. VT-d with snooping control feature: snooping control feature of
* VT-d engine can guarantee the cache correctness. Just set it
* to WB to keep consistent with host. So the same as item 3.
* 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
* consistent with host MTRR
*/
- if (is_mmio) {
- cache = MTRR_TYPE_UNCACHABLE;
- goto exit;
- }
-
- if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
+ if (!is_mmio && !kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
ipat = VMX_EPT_IPAT_BIT;
cache = MTRR_TYPE_WRBACK;
goto exit;
@@ -8655,7 +8650,10 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
ipat = VMX_EPT_IPAT_BIT;
- cache = MTRR_TYPE_UNCACHABLE;
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
+ cache = MTRR_TYPE_WRBACK;
+ else
+ cache = MTRR_TYPE_UNCACHABLE;
goto exit;
}
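
A condensed sketch of the decision described by the rewritten vmx_get_mt_mask() comment, with the EPT bit positions (memory type in bits 5:3, IPAT in bit 6) written out as local constants rather than pulled from the kernel headers; the final branch stands in for the guest-MTRR lookup that follows this hunk:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define EPT_MT_SHIFT	3		/* EPT leaf memory type, bits 5:3 */
#define EPT_IPAT	(1ULL << 6)	/* "ignore guest PAT" bit */
#define MT_UC		0
#define MT_WB		6

static uint64_t ept_mt_bits(bool is_mmio, bool noncoherent_dma,
			    bool cr0_cd, bool cd_quirk_enabled,
			    uint8_t guest_mtrr_type)
{
	/* no MMIO and no non-coherent DMA: force WB and ignore guest PAT */
	if (!is_mmio && !noncoherent_dma)
		return ((uint64_t)MT_WB << EPT_MT_SHIFT) | EPT_IPAT;

	/* guest has caching disabled: WB if the CD/NW quirk is active, else UC */
	if (cr0_cd)
		return ((uint64_t)(cd_quirk_enabled ? MT_WB : MT_UC) << EPT_MT_SHIFT) | EPT_IPAT;

	/* otherwise trust the type derived from the guest MTRRs */
	return (uint64_t)guest_mtrr_type << EPT_MT_SHIFT;
}

int main(void)
{
	printf("RAM, no VT-d:     %#llx\n",
	       (unsigned long long)ept_mt_bits(false, false, false, true, MT_WB));
	printf("MMIO, CR0.CD set: %#llx\n",
	       (unsigned long long)ept_mt_bits(true, true, true, false, MT_UC));
	return 0;
}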
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bbaf44e8f0d3..5ef2560075bf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3157,8 +3157,7 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
cpuid_count(XSTATE_CPUID, index,
&size, &offset, &ecx, &edx);
memcpy(dest, src + offset, size);
- } else
- WARN_ON_ONCE(1);
+ }
valid -= feature;
}
@@ -7315,11 +7314,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
vcpu = kvm_x86_ops->vcpu_create(kvm, id);
- /*
- * Activate fpu unconditionally in case the guest needs eager FPU. It will be
- * deactivated soon if it doesn't.
- */
- kvm_x86_ops->fpu_activate(vcpu);
return vcpu;
}
@@ -8218,6 +8212,24 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
kvm_x86_ops->interrupt_allowed(vcpu);
}
+void kvm_arch_start_assignment(struct kvm *kvm)
+{
+ atomic_inc(&kvm->arch.assigned_device_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
+
+void kvm_arch_end_assignment(struct kvm *kvm)
+{
+ atomic_dec(&kvm->arch.assigned_device_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
+
+bool kvm_arch_has_assigned_device(struct kvm *kvm)
+{
+ return atomic_read(&kvm->arch.assigned_device_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
+
void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
{
atomic_inc(&kvm->arch.noncoherent_dma_count);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index edc8cdcd786b..0ca2f3e4803c 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -147,6 +147,11 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
return kvm_register_write(vcpu, reg, val);
}
+static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
+{
+ return !(kvm->arch.disabled_quirks & quirk);
+}
+
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index cc5ccc415cc0..b9c78f3bcd67 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -63,8 +63,6 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
!PageReserved(pfn_to_page(start_pfn + i)))
return 1;
- WARN_ONCE(1, "ioremap on RAM pfn 0x%lx\n", start_pfn);
-
return 0;
}
@@ -94,7 +92,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
pgprot_t prot;
int retval;
void __iomem *ret_addr;
- int ram_region;
/* Don't allow wraparound or zero size */
last_addr = phys_addr + size - 1;
@@ -117,23 +114,15 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
- /* First check if whole region can be identified as RAM or not */
- ram_region = region_is_ram(phys_addr, size);
- if (ram_region > 0) {
- WARN_ONCE(1, "ioremap on RAM at 0x%lx - 0x%lx\n",
- (unsigned long int)phys_addr,
- (unsigned long int)last_addr);
+ pfn = phys_addr >> PAGE_SHIFT;
+ last_pfn = last_addr >> PAGE_SHIFT;
+ if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
+ __ioremap_check_ram) == 1) {
+ WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n",
+ &phys_addr, &last_addr);
return NULL;
}
- /* If could not be identified(-1), check page by page */
- if (ram_region < 0) {
- pfn = phys_addr >> PAGE_SHIFT;
- last_pfn = last_addr >> PAGE_SHIFT;
- if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
- __ioremap_check_ram) == 1)
- return NULL;
- }
/*
* Mappings have to be page-aligned
*/
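
The reworked RAM check above relies on walk_system_ram_range() invoking the callback for each chunk of System RAM that overlaps the pfn range and stopping at the first non-zero return. A user-space mock of that control flow, with the resource iteration reduced to a single hard-coded region for illustration:

#include <stdio.h>

/* mock: pretend pfns [100, 200) are System RAM */
static int mock_walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
				      void *arg,
				      int (*func)(unsigned long, unsigned long, void *))
{
	unsigned long ram_start = 100, ram_end = 200;
	unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long lo = start_pfn > ram_start ? start_pfn : ram_start;
	unsigned long hi = end_pfn < ram_end ? end_pfn : ram_end;

	if (lo < hi)
		return func(lo, hi - lo, arg);	/* first non-zero return ends the walk */
	return 0;
}

/* same contract as __ioremap_check_ram(): non-zero means "usable RAM, refuse" */
static int check_ram(unsigned long start_pfn, unsigned long nr_pages, void *arg)
{
	(void)arg;
	printf("callback: pfn %lu, %lu pages\n", start_pfn, nr_pages);
	return 1;
}

int main(void)
{
	if (mock_walk_system_ram_range(90, 50, NULL, check_ram) == 1)
		printf("would refuse ioremap: range overlaps RAM\n");
	return 0;
}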
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 4860906c6b9f..e1840f3db5b5 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -1,3 +1,4 @@
+#define pr_fmt(fmt) "kasan: " fmt
#include <linux/bootmem.h>
#include <linux/kasan.h>
#include <linux/kdebug.h>
@@ -11,7 +12,19 @@
extern pgd_t early_level4_pgt[PTRS_PER_PGD];
extern struct range pfn_mapped[E820_X_MAX];
-extern unsigned char kasan_zero_page[PAGE_SIZE];
+static pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss;
+static pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss;
+static pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss;
+
+/*
+ * This page is used as the early shadow. We don't use empty_zero_page
+ * at the early stages because stack instrumentation could write some
+ * garbage to this page.
+ * Later we reuse it as the zero shadow for large ranges of memory
+ * that are allowed to be accessed but are not instrumented by kasan
+ * (vmalloc/vmemmap ...).
+ */
+static unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss;
static int __init map_range(struct range *range)
{
@@ -36,7 +49,7 @@ static void __init clear_pgds(unsigned long start,
pgd_clear(pgd_offset_k(start));
}
-void __init kasan_map_early_shadow(pgd_t *pgd)
+static void __init kasan_map_early_shadow(pgd_t *pgd)
{
int i;
unsigned long start = KASAN_SHADOW_START;
@@ -73,7 +86,7 @@ static int __init zero_pmd_populate(pud_t *pud, unsigned long addr,
while (IS_ALIGNED(addr, PMD_SIZE) && addr + PMD_SIZE <= end) {
WARN_ON(!pmd_none(*pmd));
set_pmd(pmd, __pmd(__pa_nodebug(kasan_zero_pte)
- | __PAGE_KERNEL_RO));
+ | _KERNPG_TABLE));
addr += PMD_SIZE;
pmd = pmd_offset(pud, addr);
}
@@ -99,7 +112,7 @@ static int __init zero_pud_populate(pgd_t *pgd, unsigned long addr,
while (IS_ALIGNED(addr, PUD_SIZE) && addr + PUD_SIZE <= end) {
WARN_ON(!pud_none(*pud));
set_pud(pud, __pud(__pa_nodebug(kasan_zero_pmd)
- | __PAGE_KERNEL_RO));
+ | _KERNPG_TABLE));
addr += PUD_SIZE;
pud = pud_offset(pgd, addr);
}
@@ -124,7 +137,7 @@ static int __init zero_pgd_populate(unsigned long addr, unsigned long end)
while (IS_ALIGNED(addr, PGDIR_SIZE) && addr + PGDIR_SIZE <= end) {
WARN_ON(!pgd_none(*pgd));
set_pgd(pgd, __pgd(__pa_nodebug(kasan_zero_pud)
- | __PAGE_KERNEL_RO));
+ | _KERNPG_TABLE));
addr += PGDIR_SIZE;
pgd = pgd_offset_k(addr);
}
@@ -166,6 +179,26 @@ static struct notifier_block kasan_die_notifier = {
};
#endif
+void __init kasan_early_init(void)
+{
+ int i;
+ pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL;
+ pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE;
+ pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE;
+
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ kasan_zero_pte[i] = __pte(pte_val);
+
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ kasan_zero_pmd[i] = __pmd(pmd_val);
+
+ for (i = 0; i < PTRS_PER_PUD; i++)
+ kasan_zero_pud[i] = __pud(pud_val);
+
+ kasan_map_early_shadow(early_level4_pgt);
+ kasan_map_early_shadow(init_level4_pgt);
+}
+
void __init kasan_init(void)
{
int i;
@@ -176,6 +209,7 @@ void __init kasan_init(void)
memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt));
load_cr3(early_level4_pgt);
+ __flush_tlb_all();
clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
@@ -202,5 +236,8 @@ void __init kasan_init(void)
memset(kasan_zero_page, 0, PAGE_SIZE);
load_cr3(init_level4_pgt);
+ __flush_tlb_all();
init_task.kasan_depth = 0;
+
+ pr_info("Kernel address sanitizer initialized\n");
}
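
For context on why one zero-filled page suffices as the early shadow: on x86_64, KASAN maps every 8 bytes of address space to a single shadow byte at (addr >> 3) + 0xdffffc0000000000, so the zero pud/pmd/pte tables built in kasan_early_init() let the whole shadow region alias one page until real shadow memory is allocated. A simplified, user-space version of the kernel's kasan_mem_to_shadow() arithmetic:

#include <stdio.h>
#include <stdint.h>

#define KASAN_SHADOW_SCALE_SHIFT	3			/* one shadow byte per 8 bytes */
#define KASAN_SHADOW_OFFSET		0xdffffc0000000000ULL	/* x86_64 value */

static uint64_t shadow_of(uint64_t addr)
{
	return (addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET;
}

int main(void)
{
	/* start of the x86_64 direct mapping and of the kernel text mapping */
	uint64_t addrs[] = { 0xffff880000000000ULL, 0xffffffff80000000ULL };
	int i;

	for (i = 0; i < 2; i++)
		printf("shadow(%#llx) = %#llx\n",
		       (unsigned long long)addrs[i],
		       (unsigned long long)shadow_of(addrs[i]));
	return 0;
}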
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 9d518d693b4b..844b06d67df4 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -126,3 +126,10 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_MPX)
+ return "[mpx]";
+ return NULL;
+}
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index 7a657f58bbea..db1b0bc5017c 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -20,20 +20,6 @@
#define CREATE_TRACE_POINTS
#include <asm/trace/mpx.h>
-static const char *mpx_mapping_name(struct vm_area_struct *vma)
-{
- return "[mpx]";
-}
-
-static struct vm_operations_struct mpx_vma_ops = {
- .name = mpx_mapping_name,
-};
-
-static int is_mpx_vma(struct vm_area_struct *vma)
-{
- return (vma->vm_ops == &mpx_vma_ops);
-}
-
static inline unsigned long mpx_bd_size_bytes(struct mm_struct *mm)
{
if (is_64bit_mm(mm))
@@ -53,9 +39,6 @@ static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm)
/*
* This is really a simplified "vm_mmap". it only handles MPX
* bounds tables (the bounds directory is user-allocated).
- *
- * Later on, we use the vma->vm_ops to uniquely identify these
- * VMAs.
*/
static unsigned long mpx_mmap(unsigned long len)
{
@@ -101,7 +84,6 @@ static unsigned long mpx_mmap(unsigned long len)
ret = -ENOMEM;
goto out;
}
- vma->vm_ops = &mpx_vma_ops;
if (vm_flags & VM_LOCKED) {
up_write(&mm->mmap_sem);
@@ -812,7 +794,7 @@ static noinline int zap_bt_entries_mapping(struct mm_struct *mm,
* so stop immediately and return an error. This
* probably results in a SIGSEGV.
*/
- if (!is_mpx_vma(vma))
+ if (!(vma->vm_flags & VM_MPX))
return -EINVAL;
len = min(vma->vm_end, end) - addr;
@@ -945,9 +927,9 @@ static int try_unmap_single_bt(struct mm_struct *mm,
* lots of tables even though we have no actual table
* entries in use.
*/
- while (next && is_mpx_vma(next))
+ while (next && (next->vm_flags & VM_MPX))
next = next->vm_next;
- while (prev && is_mpx_vma(prev))
+ while (prev && (prev->vm_flags & VM_MPX))
prev = prev->vm_prev;
/*
* We know 'start' and 'end' lie within an area controlled
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 3250f2371aea..90b924acd982 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -117,7 +117,7 @@ static void flush_tlb_func(void *info)
} else {
unsigned long addr;
unsigned long nr_pages =
- f->flush_end - f->flush_start / PAGE_SIZE;
+ (f->flush_end - f->flush_start) / PAGE_SIZE;
addr = f->flush_start;
while (addr < f->flush_end) {
__flush_tlb_single(addr);