From 2406e3b166eee42777a6b0b38f52f924454474d7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 12 Sep 2017 21:36:56 +0200
Subject: perf/x86/intel, watchdog/core: Sanitize PMU HT bug workaround

The lockup_detector_suspend/resume() interface is broken in several ways
especially as it results in recursive locking of the CPU hotplug lock.

Use the new stop/restart interface in the perf NMI watchdog to temporarily
disable and reenable the already active watchdog events. That's enough to
handle it.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194146.247141871@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/events/intel/core.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 829e89cfcee2..9fb9a1f1e47b 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4409,10 +4409,9 @@ static __init int fixup_ht_bug(void)
 		return 0;
 	}
 
-	if (lockup_detector_suspend() != 0) {
-		pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
-		return 0;
-	}
+	cpus_read_lock();
+
+	hardlockup_detector_perf_stop();
 
 	x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
 
@@ -4420,9 +4419,7 @@ static __init int fixup_ht_bug(void)
 	x86_pmu.commit_scheduling = NULL;
 	x86_pmu.stop_scheduling = NULL;
 
-	lockup_detector_resume();
-
-	cpus_read_lock();
+	hardlockup_detector_perf_restart();
 
 	for_each_online_cpu(c)
 		free_excl_cntrs(c);
-- 
cgit v1.2.3


From f7f3dc00f61261cdc9ccd8b886f21bc4dffd6fd9 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Thu, 7 Sep 2017 19:08:21 +0200
Subject: x86/cpu/AMD: Fix erratum 1076 (CPB bit)

CPUID Fn8000_0007_EDX[CPB] is wrongly 0 on models up to B1. But they do
support CPB (AMD's Core Performance Boosting cpufreq CPU feature), so fix that.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sherry Hurwitz <sherry.hurwitz@amd.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170907170821.16021-1-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/amd.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 9862e2cd6d93..d58184b7cd44 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -763,6 +763,16 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
 	}
 }
 
+static void init_amd_zn(struct cpuinfo_x86 *c)
+{
+	/*
+	 * Fix erratum 1076: CPB feature bit not being set in CPUID. It affects
+	 * all up to and including B1.
+	 */
+	if (c->x86_model <= 1 && c->x86_mask <= 1)
+		set_cpu_cap(c, X86_FEATURE_CPB);
+}
+
 static void init_amd(struct cpuinfo_x86 *c)
 {
 	early_init_amd(c);
@@ -791,6 +801,7 @@ static void init_amd(struct cpuinfo_x86 *c)
 	case 0x10: init_amd_gh(c); break;
 	case 0x12: init_amd_ln(c); break;
 	case 0x15: init_amd_bd(c); break;
+	case 0x17: init_amd_zn(c); break;
 	}
 
 	/* Enable workaround for FXSAVE leak */
-- 
cgit v1.2.3


From 51ae253870f55e14ba2854ce9577ac2920efef0c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 15 Sep 2017 21:29:13 +0200
Subject: xen: x86: mark xen_find_pt_base as __init

gcc-4.6 causes a harmless link-time warning:

WARNING: vmlinux.o(.text.unlikely+0x48e): Section mismatch in reference from the function xen_find_pt_base() to the function .init.text:m2p()
The function xen_find_pt_base() references
the function __init m2p().
This is often because xen_find_pt_base lacks a __init
annotation or the annotation of m2p is wrong.

Newer compilers inline this function, so it never shows up, but marking
it __init is the right way to avoid the warning.

Fixes: 70e61199559a ("xen: move p2m list if conflicting with e820 map")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/xen/mmu_pv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 0422ee7e70b3..ddfeebc2b125 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2221,7 +2221,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
  * not the first page table in the page table pool.
  * Iterate through the initial page tables to find the real page table base.
  */
-static phys_addr_t xen_find_pt_base(pmd_t *pmd)
+static phys_addr_t __init xen_find_pt_base(pmd_t *pmd)
 {
 	phys_addr_t pt_base, paddr;
 	unsigned pmdidx;
-- 
cgit v1.2.3


From 47061a24e2ee5bd8a40d473d47a5bd823fa0081f Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Sun, 17 Sep 2017 09:03:48 -0700
Subject: x86/mm: Factor out CR3-building code

Current, the code that assembles a value to load into CR3 is
open-coded everywhere.  Factor it out into helpers build_cr3() and
build_cr3_noflush().

This makes one semantic change: __get_current_cr3_fast() was wrong
on SME systems.  No one noticed because the only caller is in the
VMX code, and there are no CPUs with both SME and VMX.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tom Lendacky <Thomas.Lendacky@amd.com>
Link: http://lkml.kernel.org/r/ce350cf11e93e2842d14d0b95b0199c7d881f527.1505663533.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mmu_context.h | 15 +++++++++++----
 arch/x86/mm/tlb.c                  | 11 +++++------
 2 files changed, 16 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 7ae318c340d9..a999ba6b721f 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -286,6 +286,15 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 	return __pkru_allows_pkey(vma_pkey(vma), write);
 }
 
+static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid)
+{
+	return __sme_pa(mm->pgd) | asid;
+}
+
+static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)
+{
+	return __sme_pa(mm->pgd) | asid | CR3_NOFLUSH;
+}
 
 /*
  * This can be used from process context to figure out what the value of
@@ -296,10 +305,8 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
  */
 static inline unsigned long __get_current_cr3_fast(void)
 {
-	unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
-
-	if (static_cpu_has(X86_FEATURE_PCID))
-		cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+	unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm),
+		this_cpu_read(cpu_tlbstate.loaded_mm_asid));
 
 	/* For now, be very restrictive about when this can be called. */
 	VM_WARN_ON(in_nmi() || preemptible());
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 1ab3821f9e26..93fe97cce581 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -126,8 +126,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	 * isn't free.
 	 */
 #ifdef CONFIG_DEBUG_VM
-	if (WARN_ON_ONCE(__read_cr3() !=
-			 (__sme_pa(real_prev->pgd) | prev_asid))) {
+	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) {
 		/*
 		 * If we were to BUG here, we'd be very likely to kill
 		 * the system so hard that we don't see the call trace.
@@ -172,7 +171,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			 */
 			this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
 				       next_tlb_gen);
-			write_cr3(__sme_pa(next->pgd) | prev_asid);
+			write_cr3(build_cr3(next, prev_asid));
 			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
 					TLB_FLUSH_ALL);
 		}
@@ -216,12 +215,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		if (need_flush) {
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-			write_cr3(__sme_pa(next->pgd) | new_asid);
+			write_cr3(build_cr3(next, new_asid));
 			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
 					TLB_FLUSH_ALL);
 		} else {
 			/* The new ASID is already up to date. */
-			write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH);
+			write_cr3(build_cr3_noflush(next, new_asid));
 			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
 		}
 
@@ -265,7 +264,7 @@ void initialize_tlbstate_and_flush(void)
 		!(cr4_read_shadow() & X86_CR4_PCIDE));
 
 	/* Force ASID 0 and force a TLB flush. */
-	write_cr3(cr3 & ~CR3_PCID_MASK);
+	write_cr3(build_cr3(mm, 0));
 
 	/* Reinitialize tlbstate. */
 	this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
-- 
cgit v1.2.3


From 52a2af400c1075219b3f0ce5c96fc961da44018a Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Sun, 17 Sep 2017 09:03:49 -0700
Subject: x86/mm/64: Stop using CR3.PCID == 0 in ASID-aware code

Putting the logical ASID into CR3's PCID bits directly means that we
have two cases to consider separately: ASID == 0 and ASID != 0.
This means that bugs that only hit in one of these cases trigger
nondeterministically.

There were some bugs like this in the past, and I think there's
still one in current kernels.  In particular, we have a number of
ASID-unware code paths that save CR3, write some special value, and
then restore CR3.  This includes suspend/resume, hibernate, kexec,
EFI, and maybe other things I've missed.  This is currently
dangerous: if ASID != 0, then this code sequence will leave garbage
in the TLB tagged for ASID 0.  We could potentially see corruption
when switching back to ASID 0.  In principle, an
initialize_tlbstate_and_flush() call after these sequences would
solve the problem, but EFI, at least, does not call this.  (And it
probably shouldn't -- initialize_tlbstate_and_flush() is rather
expensive.)

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/cdc14bbe5d3c3ef2a562be09a6368ffe9bd947a6.1505663533.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mmu_context.h | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index a999ba6b721f..c120b5db178a 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -286,14 +286,31 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 	return __pkru_allows_pkey(vma_pkey(vma), write);
 }
 
+/*
+ * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID
+ * bits.  This serves two purposes.  It prevents a nasty situation in
+ * which PCID-unaware code saves CR3, loads some other value (with PCID
+ * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if
+ * the saved ASID was nonzero.  It also means that any bugs involving
+ * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger
+ * deterministically.
+ */
+
 static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid)
 {
-	return __sme_pa(mm->pgd) | asid;
+	if (static_cpu_has(X86_FEATURE_PCID)) {
+		VM_WARN_ON_ONCE(asid > 4094);
+		return __sme_pa(mm->pgd) | (asid + 1);
+	} else {
+		VM_WARN_ON_ONCE(asid != 0);
+		return __sme_pa(mm->pgd);
+	}
 }
 
 static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)
 {
-	return __sme_pa(mm->pgd) | asid | CR3_NOFLUSH;
+	VM_WARN_ON_ONCE(asid > 4094);
+	return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH;
 }
 
 /*
-- 
cgit v1.2.3


From b8b7abaed7a49b350f8ba659ddc264b04931d581 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Sun, 17 Sep 2017 09:03:50 -0700
Subject: x86/mm/32: Move setup_clear_cpu_cap(X86_FEATURE_PCID) earlier

Otherwise we might have the PCID feature bit set during cpu_init().

This is just for robustness.  I haven't seen any actual bugs here.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: cba4671af755 ("x86/mm: Disable PCID on 32-bit kernels")
Link: http://lkml.kernel.org/r/b16dae9d6b0db5d9801ddbebbfd83384097c61f3.1505663533.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/bugs.c   | 8 --------
 arch/x86/kernel/cpu/common.c | 8 ++++++++
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index db684880d74a..0af86d9242da 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -21,14 +21,6 @@
 
 void __init check_bugs(void)
 {
-#ifdef CONFIG_X86_32
-	/*
-	 * Regardless of whether PCID is enumerated, the SDM says
-	 * that it can't be enabled in 32-bit mode.
-	 */
-	setup_clear_cpu_cap(X86_FEATURE_PCID);
-#endif
-
 	identify_boot_cpu();
 
 	if (!IS_ENABLED(CONFIG_SMP)) {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 775f10100d7f..c9176bae7fd8 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -904,6 +904,14 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 
 	setup_force_cpu_cap(X86_FEATURE_ALWAYS);
 	fpu__init_system(c);
+
+#ifdef CONFIG_X86_32
+	/*
+	 * Regardless of whether PCID is enumerated, the SDM says
+	 * that it can't be enabled in 32-bit mode.
+	 */
+	setup_clear_cpu_cap(X86_FEATURE_PCID);
+#endif
 }
 
 void __init early_cpu_init(void)
-- 
cgit v1.2.3


From 4ba55e65f471d011d3ba2ac2022180ea0877d68e Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Sun, 17 Sep 2017 09:03:51 -0700
Subject: x86/mm/32: Load a sane CR3 before cpu_init() on secondary CPUs

For unknown historical reasons (i.e. Borislav doesn't recall),
32-bit kernels invoke cpu_init() on secondary CPUs with
initial_page_table loaded into CR3.  Then they set
current->active_mm to &init_mm and call enter_lazy_tlb() before
fixing CR3.  This means that the x86 TLB code gets invoked while CR3
is inconsistent, and, with the improved PCID sanity checks I added,
we warn.

Fix it by loading swapper_pg_dir (i.e. init_mm.pgd) earlier.

Reported-by: Paul Menzel <pmenzel@molgen.mpg.de>
Reported-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 72c0098d92ce ("x86/mm: Reinitialize TLB state on hotplug and resume")
Link: http://lkml.kernel.org/r/30cdfea504682ba3b9012e77717800a91c22097f.1505663533.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/smpboot.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 0854ff169274..ad59edd84de7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -232,12 +232,6 @@ static void notrace start_secondary(void *unused)
 	 */
 	if (boot_cpu_has(X86_FEATURE_PCID))
 		__write_cr4(__read_cr4() | X86_CR4_PCIDE);
-	cpu_init();
-	x86_cpuinit.early_percpu_clock_init();
-	preempt_disable();
-	smp_callin();
-
-	enable_start_cpu0 = 0;
 
 #ifdef CONFIG_X86_32
 	/* switch away from the initial page table */
@@ -245,6 +239,13 @@ static void notrace start_secondary(void *unused)
 	__flush_tlb_all();
 #endif
 
+	cpu_init();
+	x86_cpuinit.early_percpu_clock_init();
+	preempt_disable();
+	smp_callin();
+
+	enable_start_cpu0 = 0;
+
 	/* otherwise gcc will move up smp_processor_id before the cpu_init */
 	barrier();
 	/*
-- 
cgit v1.2.3


From d6500149bc4fddc5a91cd1a0c31b38fa36bff3ee Mon Sep 17 00:00:00 2001
From: Yu Zhang <yu.c.zhang@linux.intel.com>
Date: Mon, 18 Sep 2017 18:45:01 +0800
Subject: KVM: x86: Fix the NULL pointer parameter in check_cr_write()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Routine check_cr_write() will trigger emulator_get_cpuid()->
kvm_cpuid() to get maxphyaddr, and NULL is passed as values
for ebx/ecx/edx. This is problematic because kvm_cpuid() will
dereference these pointers.

Fixes: d1cd3ce90044 ("KVM: MMU: check guest CR3 reserved bits based on its physical address width.")
Reported-by: Jim Mattson <jmattson@google.com>
Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Jim Mattson <jmattson@google.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/emulate.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 16bf6655aa85..15f527b44aa7 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -4102,10 +4102,12 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
 		ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
 		if (efer & EFER_LMA) {
 			u64 maxphyaddr;
-			u32 eax = 0x80000008;
+			u32 eax, ebx, ecx, edx;
 
-			if (ctxt->ops->get_cpuid(ctxt, &eax, NULL, NULL,
-						 NULL, false))
+			eax = 0x80000008;
+			ecx = 0;
+			if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx,
+						 &edx, false))
 				maxphyaddr = eax & 0xff;
 			else
 				maxphyaddr = 36;
-- 
cgit v1.2.3


From dc91f2eb1a4021eb6705c15e474942f84ab9b211 Mon Sep 17 00:00:00 2001
From: Haozhong Zhang <haozhong.zhang@intel.com>
Date: Mon, 18 Sep 2017 09:56:49 +0800
Subject: KVM: VMX: do not change SN bit in vmx_update_pi_irte()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In kvm_vcpu_trigger_posted_interrupt() and pi_pre_block(), KVM
assumes that PI notification events should not be suppressed when the
target vCPU is not blocked.

vmx_update_pi_irte() sets the SN field before changing an interrupt
from posting to remapping, but it does not check the vCPU mode.
Therefore, the change of SN field may break above the assumption.
Besides, I don't see reasons to suppress notification events here, so
remove the changes of SN field to avoid race condition.

Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
Reported-by: "Ramamurthy, Venkatesh" <venkatesh.ramamurthy@intel.com>
Reported-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Fixes: 28b835d60fcc ("KVM: Update Posted-Interrupts Descriptor when vCPU is preempted")
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 06c0c6d0541e..7328c8c0ea3b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11911,12 +11911,8 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
 
 		if (set)
 			ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
-		else {
-			/* suppress notification event before unposting */
-			pi_set_sn(vcpu_to_pi_desc(vcpu));
+		else
 			ret = irq_set_vcpu_affinity(host_irq, NULL);
-			pi_clear_sn(vcpu_to_pi_desc(vcpu));
-		}
 
 		if (ret < 0) {
 			printk(KERN_INFO "%s: failed to update PI IRTE\n",
-- 
cgit v1.2.3


From 5753743fa5108b8f98bd61e40dc63f641b26c768 Mon Sep 17 00:00:00 2001
From: Haozhong Zhang <haozhong.zhang@intel.com>
Date: Mon, 18 Sep 2017 09:56:50 +0800
Subject: KVM: VMX: remove WARN_ON_ONCE in kvm_vcpu_trigger_posted_interrupt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)) in kvm_vcpu_trigger_posted_interrupt()
intends to detect the violation of invariant that VT-d PI notification
event is not suppressed when vcpu is in the guest mode. Because the
two checks for the target vcpu mode and the target suppress field
cannot be performed atomically, the target vcpu mode may change in
between. If that does happen, WARN_ON_ONCE() here may raise false
alarms.

As the previous patch fixed the real invariant breaker, remove this
WARN_ON_ONCE() to avoid false alarms, and document the allowed cases
instead.

Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
Reported-by: "Ramamurthy, Venkatesh" <venkatesh.ramamurthy@intel.com>
Reported-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Fixes: 28b835d60fcc ("KVM: Update Posted-Interrupts Descriptor when vCPU is preempted")
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7328c8c0ea3b..0726ca7a1b02 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5077,21 +5077,30 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
 	int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
 
 	if (vcpu->mode == IN_GUEST_MODE) {
-		struct vcpu_vmx *vmx = to_vmx(vcpu);
-
 		/*
-		 * Currently, we don't support urgent interrupt,
-		 * all interrupts are recognized as non-urgent
-		 * interrupt, so we cannot post interrupts when
-		 * 'SN' is set.
+		 * The vector of interrupt to be delivered to vcpu had
+		 * been set in PIR before this function.
+		 *
+		 * Following cases will be reached in this block, and
+		 * we always send a notification event in all cases as
+		 * explained below.
+		 *
+		 * Case 1: vcpu keeps in non-root mode. Sending a
+		 * notification event posts the interrupt to vcpu.
+		 *
+		 * Case 2: vcpu exits to root mode and is still
+		 * runnable. PIR will be synced to vIRR before the
+		 * next vcpu entry. Sending a notification event in
+		 * this case has no effect, as vcpu is not in root
+		 * mode.
 		 *
-		 * If the vcpu is in guest mode, it means it is
-		 * running instead of being scheduled out and
-		 * waiting in the run queue, and that's the only
-		 * case when 'SN' is set currently, warning if
-		 * 'SN' is set.
+		 * Case 3: vcpu exits to root mode and is blocked.
+		 * vcpu_block() has already synced PIR to vIRR and
+		 * never blocks vcpu if vIRR is not cleared. Therefore,
+		 * a blocked vcpu here does not wait for any requested
+		 * interrupts in PIR, and sending a notification event
+		 * which has no effect is safe here.
 		 */
-		WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc));
 
 		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
 		return true;
-- 
cgit v1.2.3


From 569f11c9f788959b640116b5bbd6d8a1f07326da Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 18 Sep 2017 14:42:00 -0500
Subject: crypto: x86/blowfish - Fix RBP usage

Using RBP as a temporary register breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.

Use R12 instead of RBP.  R12 can't be used as the RT0 register because
of x86 instruction encoding limitations.  So use R12 for CTX and RDI for
CTX.  This means that CTX is no longer an implicit function argument.
Instead it needs to be explicitly copied from RDI.

Reported-by: Eric Biggers <ebiggers@google.com>
Reported-by: Peter Zijlstra <peterz@infradead.org>
Tested-by: Eric Biggers <ebiggers@google.com>
Acked-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/blowfish-x86_64-asm_64.S | 48 +++++++++++++++++---------------
 1 file changed, 26 insertions(+), 22 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S
index 246c67006ed0..8c1fcb6bad21 100644
--- a/arch/x86/crypto/blowfish-x86_64-asm_64.S
+++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S
@@ -33,7 +33,7 @@
 #define s3	((16 + 2 + (3 * 256)) * 4)
 
 /* register macros */
-#define CTX %rdi
+#define CTX %r12
 #define RIO %rsi
 
 #define RX0 %rax
@@ -56,12 +56,12 @@
 #define RX2bh %ch
 #define RX3bh %dh
 
-#define RT0 %rbp
+#define RT0 %rdi
 #define RT1 %rsi
 #define RT2 %r8
 #define RT3 %r9
 
-#define RT0d %ebp
+#define RT0d %edi
 #define RT1d %esi
 #define RT2d %r8d
 #define RT3d %r9d
@@ -120,13 +120,14 @@
 
 ENTRY(__blowfish_enc_blk)
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	%rsi: dst
 	 *	%rdx: src
 	 *	%rcx: bool, if true: xor output
 	 */
-	movq %rbp, %r11;
+	movq %r12, %r11;
 
+	movq %rdi, CTX;
 	movq %rsi, %r10;
 	movq %rdx, RIO;
 
@@ -142,7 +143,7 @@ ENTRY(__blowfish_enc_blk)
 	round_enc(14);
 	add_roundkey_enc(16);
 
-	movq %r11, %rbp;
+	movq %r11, %r12;
 
 	movq %r10, RIO;
 	test %cl, %cl;
@@ -157,12 +158,13 @@ ENDPROC(__blowfish_enc_blk)
 
 ENTRY(blowfish_dec_blk)
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
-	movq %rbp, %r11;
+	movq %r12, %r11;
 
+	movq %rdi, CTX;
 	movq %rsi, %r10;
 	movq %rdx, RIO;
 
@@ -181,7 +183,7 @@ ENTRY(blowfish_dec_blk)
 	movq %r10, RIO;
 	write_block();
 
-	movq %r11, %rbp;
+	movq %r11, %r12;
 
 	ret;
 ENDPROC(blowfish_dec_blk)
@@ -298,20 +300,21 @@ ENDPROC(blowfish_dec_blk)
 
 ENTRY(__blowfish_enc_blk_4way)
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	%rsi: dst
 	 *	%rdx: src
 	 *	%rcx: bool, if true: xor output
 	 */
-	pushq %rbp;
+	pushq %r12;
 	pushq %rbx;
 	pushq %rcx;
 
-	preload_roundkey_enc(0);
-
+	movq %rdi, CTX
 	movq %rsi, %r11;
 	movq %rdx, RIO;
 
+	preload_roundkey_enc(0);
+
 	read_block4();
 
 	round_enc4(0);
@@ -324,39 +327,40 @@ ENTRY(__blowfish_enc_blk_4way)
 	round_enc4(14);
 	add_preloaded_roundkey4();
 
-	popq %rbp;
+	popq %r12;
 	movq %r11, RIO;
 
-	test %bpl, %bpl;
+	test %r12b, %r12b;
 	jnz .L__enc_xor4;
 
 	write_block4();
 
 	popq %rbx;
-	popq %rbp;
+	popq %r12;
 	ret;
 
 .L__enc_xor4:
 	xor_block4();
 
 	popq %rbx;
-	popq %rbp;
+	popq %r12;
 	ret;
 ENDPROC(__blowfish_enc_blk_4way)
 
 ENTRY(blowfish_dec_blk_4way)
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
-	pushq %rbp;
+	pushq %r12;
 	pushq %rbx;
-	preload_roundkey_dec(17);
 
-	movq %rsi, %r11;
+	movq %rdi, CTX;
+	movq %rsi, %r11
 	movq %rdx, RIO;
 
+	preload_roundkey_dec(17);
 	read_block4();
 
 	round_dec4(17);
@@ -373,7 +377,7 @@ ENTRY(blowfish_dec_blk_4way)
 	write_block4();
 
 	popq %rbx;
-	popq %rbp;
+	popq %r12;
 
 	ret;
 ENDPROC(blowfish_dec_blk_4way)
-- 
cgit v1.2.3


From b46c9d717645529417ca9045cfdbf59f84922573 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 18 Sep 2017 14:42:01 -0500
Subject: crypto: x86/camellia - Fix RBP usage

Using RBP as a temporary register breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.

Use R12 instead of RBP.  Both are callee-saved registers, so the
substitution is straightforward.

Reported-by: Eric Biggers <ebiggers@google.com>
Reported-by: Peter Zijlstra <peterz@infradead.org>
Tested-by: Eric Biggers <ebiggers@google.com>
Acked-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/camellia-x86_64-asm_64.S | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S
index 310319c601ed..95ba6956a7f6 100644
--- a/arch/x86/crypto/camellia-x86_64-asm_64.S
+++ b/arch/x86/crypto/camellia-x86_64-asm_64.S
@@ -75,17 +75,17 @@
 #define RCD1bh %dh
 
 #define RT0 %rsi
-#define RT1 %rbp
+#define RT1 %r12
 #define RT2 %r8
 
 #define RT0d %esi
-#define RT1d %ebp
+#define RT1d %r12d
 #define RT2d %r8d
 
 #define RT2bl %r8b
 
 #define RXOR %r9
-#define RRBP %r10
+#define RR12 %r10
 #define RDST %r11
 
 #define RXORd %r9d
@@ -197,7 +197,7 @@ ENTRY(__camellia_enc_blk)
 	 *	%rdx: src
 	 *	%rcx: bool xor
 	 */
-	movq %rbp, RRBP;
+	movq %r12, RR12;
 
 	movq %rcx, RXOR;
 	movq %rsi, RDST;
@@ -227,13 +227,13 @@ ENTRY(__camellia_enc_blk)
 
 	enc_outunpack(mov, RT1);
 
-	movq RRBP, %rbp;
+	movq RR12, %r12;
 	ret;
 
 .L__enc_xor:
 	enc_outunpack(xor, RT1);
 
-	movq RRBP, %rbp;
+	movq RR12, %r12;
 	ret;
 ENDPROC(__camellia_enc_blk)
 
@@ -248,7 +248,7 @@ ENTRY(camellia_dec_blk)
 	movl $24, RXORd;
 	cmovel RXORd, RT2d; /* max */
 
-	movq %rbp, RRBP;
+	movq %r12, RR12;
 	movq %rsi, RDST;
 	movq %rdx, RIO;
 
@@ -271,7 +271,7 @@ ENTRY(camellia_dec_blk)
 
 	dec_outunpack();
 
-	movq RRBP, %rbp;
+	movq RR12, %r12;
 	ret;
 ENDPROC(camellia_dec_blk)
 
@@ -433,7 +433,7 @@ ENTRY(__camellia_enc_blk_2way)
 	 */
 	pushq %rbx;
 
-	movq %rbp, RRBP;
+	movq %r12, RR12;
 	movq %rcx, RXOR;
 	movq %rsi, RDST;
 	movq %rdx, RIO;
@@ -461,14 +461,14 @@ ENTRY(__camellia_enc_blk_2way)
 
 	enc_outunpack2(mov, RT2);
 
-	movq RRBP, %rbp;
+	movq RR12, %r12;
 	popq %rbx;
 	ret;
 
 .L__enc2_xor:
 	enc_outunpack2(xor, RT2);
 
-	movq RRBP, %rbp;
+	movq RR12, %r12;
 	popq %rbx;
 	ret;
 ENDPROC(__camellia_enc_blk_2way)
@@ -485,7 +485,7 @@ ENTRY(camellia_dec_blk_2way)
 	cmovel RXORd, RT2d; /* max */
 
 	movq %rbx, RXOR;
-	movq %rbp, RRBP;
+	movq %r12, RR12;
 	movq %rsi, RDST;
 	movq %rdx, RIO;
 
@@ -508,7 +508,7 @@ ENTRY(camellia_dec_blk_2way)
 
 	dec_outunpack2();
 
-	movq RRBP, %rbp;
+	movq RR12, %r12;
 	movq RXOR, %rbx;
 	ret;
 ENDPROC(camellia_dec_blk_2way)
-- 
cgit v1.2.3


From 4b15606664a2f8d7c4f0092fb0305fe1c7c65b7b Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 18 Sep 2017 14:42:02 -0500
Subject: crypto: x86/cast5 - Fix RBP usage

Using RBP as a temporary register breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.

Use R15 instead of RBP.  R15 can't be used as the RID1 register because
of x86 instruction encoding limitations.  So use R15 for CTX and RDI for
CTX.  This means that CTX is no longer an implicit function argument.
Instead it needs to be explicitly copied from RDI.

Reported-by: Eric Biggers <ebiggers@google.com>
Reported-by: Peter Zijlstra <peterz@infradead.org>
Tested-by: Eric Biggers <ebiggers@google.com>
Acked-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/cast5-avx-x86_64-asm_64.S | 47 ++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index b4a8806234ea..86107c961bb4 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -47,7 +47,7 @@
 /**********************************************************************
   16-way AVX cast5
  **********************************************************************/
-#define CTX %rdi
+#define CTX %r15
 
 #define RL1 %xmm0
 #define RR1 %xmm1
@@ -70,8 +70,8 @@
 
 #define RTMP %xmm15
 
-#define RID1  %rbp
-#define RID1d %ebp
+#define RID1  %rdi
+#define RID1d %edi
 #define RID2  %rsi
 #define RID2d %esi
 
@@ -226,7 +226,7 @@
 .align 16
 __cast5_enc_blk16:
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	RL1: blocks 1 and 2
 	 *	RR1: blocks 3 and 4
 	 *	RL2: blocks 5 and 6
@@ -246,9 +246,11 @@ __cast5_enc_blk16:
 	 *	RR4: encrypted blocks 15 and 16
 	 */
 
-	pushq %rbp;
+	pushq %r15;
 	pushq %rbx;
 
+	movq %rdi, CTX;
+
 	vmovdqa .Lbswap_mask, RKM;
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;
@@ -283,7 +285,7 @@ __cast5_enc_blk16:
 
 .L__skip_enc:
 	popq %rbx;
-	popq %rbp;
+	popq %r15;
 
 	vmovdqa .Lbswap_mask, RKM;
 
@@ -298,7 +300,7 @@ ENDPROC(__cast5_enc_blk16)
 .align 16
 __cast5_dec_blk16:
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	RL1: encrypted blocks 1 and 2
 	 *	RR1: encrypted blocks 3 and 4
 	 *	RL2: encrypted blocks 5 and 6
@@ -318,9 +320,11 @@ __cast5_dec_blk16:
 	 *	RR4: decrypted blocks 15 and 16
 	 */
 
-	pushq %rbp;
+	pushq %r15;
 	pushq %rbx;
 
+	movq %rdi, CTX;
+
 	vmovdqa .Lbswap_mask, RKM;
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;
@@ -356,7 +360,7 @@ __cast5_dec_blk16:
 
 	vmovdqa .Lbswap_mask, RKM;
 	popq %rbx;
-	popq %rbp;
+	popq %r15;
 
 	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
 	outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
@@ -372,12 +376,14 @@ ENDPROC(__cast5_dec_blk16)
 
 ENTRY(cast5_ecb_enc_16way)
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
 	FRAME_BEGIN
+	pushq %r15;
 
+	movq %rdi, CTX;
 	movq %rsi, %r11;
 
 	vmovdqu (0*4*4)(%rdx), RL1;
@@ -400,18 +406,22 @@ ENTRY(cast5_ecb_enc_16way)
 	vmovdqu RR4, (6*4*4)(%r11);
 	vmovdqu RL4, (7*4*4)(%r11);
 
+	popq %r15;
 	FRAME_END
 	ret;
 ENDPROC(cast5_ecb_enc_16way)
 
 ENTRY(cast5_ecb_dec_16way)
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
 
 	FRAME_BEGIN
+	pushq %r15;
+
+	movq %rdi, CTX;
 	movq %rsi, %r11;
 
 	vmovdqu (0*4*4)(%rdx), RL1;
@@ -434,20 +444,22 @@ ENTRY(cast5_ecb_dec_16way)
 	vmovdqu RR4, (6*4*4)(%r11);
 	vmovdqu RL4, (7*4*4)(%r11);
 
+	popq %r15;
 	FRAME_END
 	ret;
 ENDPROC(cast5_ecb_dec_16way)
 
 ENTRY(cast5_cbc_dec_16way)
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
 	FRAME_BEGIN
-
 	pushq %r12;
+	pushq %r15;
 
+	movq %rdi, CTX;
 	movq %rsi, %r11;
 	movq %rdx, %r12;
 
@@ -483,23 +495,24 @@ ENTRY(cast5_cbc_dec_16way)
 	vmovdqu RR4, (6*16)(%r11);
 	vmovdqu RL4, (7*16)(%r11);
 
+	popq %r15;
 	popq %r12;
-
 	FRAME_END
 	ret;
 ENDPROC(cast5_cbc_dec_16way)
 
 ENTRY(cast5_ctr_16way)
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	%rsi: dst
 	 *	%rdx: src
 	 *	%rcx: iv (big endian, 64bit)
 	 */
 	FRAME_BEGIN
-
 	pushq %r12;
+	pushq %r15;
 
+	movq %rdi, CTX;
 	movq %rsi, %r11;
 	movq %rdx, %r12;
 
@@ -558,8 +571,8 @@ ENTRY(cast5_ctr_16way)
 	vmovdqu RR4, (6*16)(%r11);
 	vmovdqu RL4, (7*16)(%r11);
 
+	popq %r15;
 	popq %r12;
-
 	FRAME_END
 	ret;
 ENDPROC(cast5_ctr_16way)
-- 
cgit v1.2.3


From c66cc3be2951fad4d7d7f799baf57c8c5cc8d655 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 18 Sep 2017 14:42:03 -0500
Subject: crypto: x86/cast6 - Fix RBP usage

Using RBP as a temporary register breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.

Use R15 instead of RBP.  R15 can't be used as the RID1 register because
of x86 instruction encoding limitations.  So use R15 for CTX and RDI for
CTX.  This means that CTX is no longer an implicit function argument.
Instead it needs to be explicitly copied from RDI.

Reported-by: Eric Biggers <ebiggers@google.com>
Reported-by: Peter Zijlstra <peterz@infradead.org>
Tested-by: Eric Biggers <ebiggers@google.com>
Acked-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/cast6-avx-x86_64-asm_64.S | 50 +++++++++++++++++++++----------
 1 file changed, 34 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index 952d3156a933..7f30b6f0d72c 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -47,7 +47,7 @@
 /**********************************************************************
   8-way AVX cast6
  **********************************************************************/
-#define CTX %rdi
+#define CTX %r15
 
 #define RA1 %xmm0
 #define RB1 %xmm1
@@ -70,8 +70,8 @@
 
 #define RTMP %xmm15
 
-#define RID1  %rbp
-#define RID1d %ebp
+#define RID1  %rdi
+#define RID1d %edi
 #define RID2  %rsi
 #define RID2d %esi
 
@@ -264,15 +264,17 @@
 .align 8
 __cast6_enc_blk8:
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
 	 * output:
 	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
 	 */
 
-	pushq %rbp;
+	pushq %r15;
 	pushq %rbx;
 
+	movq %rdi, CTX;
+
 	vmovdqa .Lbswap_mask, RKM;
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;
@@ -297,7 +299,7 @@ __cast6_enc_blk8:
 	QBAR(11);
 
 	popq %rbx;
-	popq %rbp;
+	popq %r15;
 
 	vmovdqa .Lbswap_mask, RKM;
 
@@ -310,15 +312,17 @@ ENDPROC(__cast6_enc_blk8)
 .align 8
 __cast6_dec_blk8:
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
 	 * output:
 	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
 	 */
 
-	pushq %rbp;
+	pushq %r15;
 	pushq %rbx;
 
+	movq %rdi, CTX;
+
 	vmovdqa .Lbswap_mask, RKM;
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;
@@ -343,7 +347,7 @@ __cast6_dec_blk8:
 	QBAR(0);
 
 	popq %rbx;
-	popq %rbp;
+	popq %r15;
 
 	vmovdqa .Lbswap_mask, RKM;
 	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
@@ -354,12 +358,14 @@ ENDPROC(__cast6_dec_blk8)
 
 ENTRY(cast6_ecb_enc_8way)
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
 	FRAME_BEGIN
+	pushq %r15;
 
+	movq %rdi, CTX;
 	movq %rsi, %r11;
 
 	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
@@ -368,18 +374,21 @@ ENTRY(cast6_ecb_enc_8way)
 
 	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 
+	popq %r15;
 	FRAME_END
 	ret;
 ENDPROC(cast6_ecb_enc_8way)
 
 ENTRY(cast6_ecb_dec_8way)
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
 	FRAME_BEGIN
+	pushq %r15;
 
+	movq %rdi, CTX;
 	movq %rsi, %r11;
 
 	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
@@ -388,20 +397,22 @@ ENTRY(cast6_ecb_dec_8way)
 
 	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 
+	popq %r15;
 	FRAME_END
 	ret;
 ENDPROC(cast6_ecb_dec_8way)
 
 ENTRY(cast6_cbc_dec_8way)
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
 	FRAME_BEGIN
-
 	pushq %r12;
+	pushq %r15;
 
+	movq %rdi, CTX;
 	movq %rsi, %r11;
 	movq %rdx, %r12;
 
@@ -411,8 +422,8 @@ ENTRY(cast6_cbc_dec_8way)
 
 	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 
+	popq %r15;
 	popq %r12;
-
 	FRAME_END
 	ret;
 ENDPROC(cast6_cbc_dec_8way)
@@ -425,9 +436,10 @@ ENTRY(cast6_ctr_8way)
 	 *	%rcx: iv (little endian, 128bit)
 	 */
 	FRAME_BEGIN
-
 	pushq %r12;
+	pushq %r15
 
+	movq %rdi, CTX;
 	movq %rsi, %r11;
 	movq %rdx, %r12;
 
@@ -438,8 +450,8 @@ ENTRY(cast6_ctr_8way)
 
 	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 
+	popq %r15;
 	popq %r12;
-
 	FRAME_END
 	ret;
 ENDPROC(cast6_ctr_8way)
@@ -452,7 +464,9 @@ ENTRY(cast6_xts_enc_8way)
 	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
 	 */
 	FRAME_BEGIN
+	pushq %r15;
 
+	movq %rdi, CTX
 	movq %rsi, %r11;
 
 	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
@@ -464,6 +478,7 @@ ENTRY(cast6_xts_enc_8way)
 	/* dst <= regs xor IVs(in dst) */
 	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 
+	popq %r15;
 	FRAME_END
 	ret;
 ENDPROC(cast6_xts_enc_8way)
@@ -476,7 +491,9 @@ ENTRY(cast6_xts_dec_8way)
 	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
 	 */
 	FRAME_BEGIN
+	pushq %r15;
 
+	movq %rdi, CTX
 	movq %rsi, %r11;
 
 	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
@@ -488,6 +505,7 @@ ENTRY(cast6_xts_dec_8way)
 	/* dst <= regs xor IVs(in dst) */
 	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 
+	popq %r15;
 	FRAME_END
 	ret;
 ENDPROC(cast6_xts_dec_8way)
-- 
cgit v1.2.3


From 3ed7b4d67c6745300c9b5c6baa55da1161b57f60 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 18 Sep 2017 14:42:04 -0500
Subject: crypto: x86/des3_ede - Fix RBP usage

Using RBP as a temporary register breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.

Use RSI instead of RBP for RT1.  Since RSI is also used as a the 'dst'
function argument, it needs to be saved on the stack until the argument
is needed.

Reported-by: Eric Biggers <ebiggers@google.com>
Reported-by: Peter Zijlstra <peterz@infradead.org>
Tested-by: Eric Biggers <ebiggers@google.com>
Acked-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/des3_ede-asm_64.S | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
index f3e91647ca27..8e49ce117494 100644
--- a/arch/x86/crypto/des3_ede-asm_64.S
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -64,12 +64,12 @@
 #define RW2bh %ch
 
 #define RT0 %r15
-#define RT1 %rbp
+#define RT1 %rsi
 #define RT2 %r14
 #define RT3 %rdx
 
 #define RT0d %r15d
-#define RT1d %ebp
+#define RT1d %esi
 #define RT2d %r14d
 #define RT3d %edx
 
@@ -177,13 +177,14 @@ ENTRY(des3_ede_x86_64_crypt_blk)
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
-	pushq %rbp;
 	pushq %rbx;
 	pushq %r12;
 	pushq %r13;
 	pushq %r14;
 	pushq %r15;
 
+	pushq %rsi; /* dst */
+
 	read_block(%rdx, RL0, RR0);
 	initial_permutation(RL0, RR0);
 
@@ -241,6 +242,8 @@ ENTRY(des3_ede_x86_64_crypt_blk)
 	round1(32+15, RL0, RR0, dummy2);
 
 	final_permutation(RR0, RL0);
+
+	popq %rsi /* dst */
 	write_block(%rsi, RR0, RL0);
 
 	popq %r15;
@@ -248,7 +251,6 @@ ENTRY(des3_ede_x86_64_crypt_blk)
 	popq %r13;
 	popq %r12;
 	popq %rbx;
-	popq %rbp;
 
 	ret;
 ENDPROC(des3_ede_x86_64_crypt_blk)
@@ -432,13 +434,14 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
 	 *	%rdx: src (3 blocks)
 	 */
 
-	pushq %rbp;
 	pushq %rbx;
 	pushq %r12;
 	pushq %r13;
 	pushq %r14;
 	pushq %r15;
 
+	pushq %rsi /* dst */
+
 	/* load input */
 	movl 0 * 4(%rdx), RL0d;
 	movl 1 * 4(%rdx), RR0d;
@@ -520,6 +523,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
 	bswapl RR2d;
 	bswapl RL2d;
 
+	popq %rsi /* dst */
 	movl RR0d, 0 * 4(%rsi);
 	movl RL0d, 1 * 4(%rsi);
 	movl RR1d, 2 * 4(%rsi);
@@ -532,7 +536,6 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
 	popq %r13;
 	popq %r12;
 	popq %rbx;
-	popq %rbp;
 
 	ret;
 ENDPROC(des3_ede_x86_64_crypt_blk_3way)
-- 
cgit v1.2.3


From d7b1722c72aa915283ada27709c6feeb392f6038 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 18 Sep 2017 14:42:05 -0500
Subject: crypto: x86/sha1-avx2 - Fix RBP usage

Using RBP as a temporary register breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.

Use R11 instead of RBP.  Since R11 isn't a callee-saved register, it
doesn't need to be saved and restored on the stack.

Reported-by: Eric Biggers <ebiggers@google.com>
Reported-by: Peter Zijlstra <peterz@infradead.org>
Tested-by: Eric Biggers <ebiggers@google.com>
Acked-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha1_avx2_x86_64_asm.S | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
index 1eab79c9ac48..9f712a7dfd79 100644
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -89,7 +89,7 @@
 #define	REG_RE	%rdx
 #define	REG_RTA	%r12
 #define	REG_RTB	%rbx
-#define	REG_T1	%ebp
+#define	REG_T1	%r11d
 #define	xmm_mov	vmovups
 #define	avx2_zeroupper	vzeroupper
 #define	RND_F1	1
@@ -637,7 +637,6 @@ _loop3:
 	ENTRY(\name)
 
 	push	%rbx
-	push	%rbp
 	push	%r12
 	push	%r13
 	push	%r14
@@ -673,7 +672,6 @@ _loop3:
 	pop	%r14
 	pop	%r13
 	pop	%r12
-	pop	%rbp
 	pop	%rbx
 
 	ret
-- 
cgit v1.2.3


From 6488bce756861b94810e54f83416d5e74c0f18bf Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 18 Sep 2017 14:42:06 -0500
Subject: crypto: x86/sha1-ssse3 - Fix RBP usage

Using RBP as a temporary register breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.

Swap the usages of R12 and RBP.  Use R12 for the REG_D register, and use
RBP to store the pre-aligned stack pointer.

Reported-by: Eric Biggers <ebiggers@google.com>
Reported-by: Peter Zijlstra <peterz@infradead.org>
Tested-by: Eric Biggers <ebiggers@google.com>
Acked-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha1_ssse3_asm.S | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index a4109506a5e8..6204bd53528c 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -37,7 +37,7 @@
 #define REG_A	%ecx
 #define REG_B	%esi
 #define REG_C	%edi
-#define REG_D	%ebp
+#define REG_D	%r12d
 #define REG_E	%edx
 
 #define REG_T1	%eax
@@ -74,10 +74,10 @@
 	ENTRY(\name)
 
 	push	%rbx
-	push	%rbp
 	push	%r12
+	push	%rbp
+	mov	%rsp, %rbp
 
-	mov	%rsp, %r12
 	sub	$64, %rsp		# allocate workspace
 	and	$~15, %rsp		# align stack
 
@@ -99,10 +99,9 @@
 	xor	%rax, %rax
 	rep stosq
 
-	mov	%r12, %rsp		# deallocate workspace
-
-	pop	%r12
+	mov	%rbp, %rsp		# deallocate workspace
 	pop	%rbp
+	pop	%r12
 	pop	%rbx
 	ret
 
-- 
cgit v1.2.3


From 673ac6fbc74f835e2125df9ee39e8a2a423832e2 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 18 Sep 2017 14:42:07 -0500
Subject: crypto: x86/sha256-avx - Fix RBP usage

Using RBP as a temporary register breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.

Swap the usages of R12 and RBP.  Use R12 for the TBL register, and use
RBP to store the pre-aligned stack pointer.

Reported-by: Eric Biggers <ebiggers@google.com>
Reported-by: Peter Zijlstra <peterz@infradead.org>
Tested-by: Eric Biggers <ebiggers@google.com>
Acked-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha256-avx-asm.S | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
index e08888a1a5f2..001bbcf93c79 100644
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ b/arch/x86/crypto/sha256-avx-asm.S
@@ -103,7 +103,7 @@ SRND = %rsi       # clobbers INP
 c = %ecx
 d = %r8d
 e = %edx
-TBL = %rbp
+TBL = %r12
 a = %eax
 b = %ebx
 
@@ -350,13 +350,13 @@ a = TMP_
 ENTRY(sha256_transform_avx)
 .align 32
 	pushq   %rbx
-	pushq   %rbp
+	pushq   %r12
 	pushq   %r13
 	pushq   %r14
 	pushq   %r15
-	pushq   %r12
+	pushq	%rbp
+	movq	%rsp, %rbp
 
-	mov	%rsp, %r12
 	subq    $STACK_SIZE, %rsp	# allocate stack space
 	and	$~15, %rsp		# align stack pointer
 
@@ -452,13 +452,12 @@ loop2:
 
 done_hash:
 
-	mov	%r12, %rsp
-
-	popq	%r12
+	mov	%rbp, %rsp
+	popq	%rbp
 	popq    %r15
 	popq    %r14
 	popq    %r13
-	popq    %rbp
+	popq	%r12
 	popq    %rbx
 	ret
 ENDPROC(sha256_transform_avx)
-- 
cgit v1.2.3


From d3dfbfe2e6e7ecd620531d5201314ad14c4ed5b3 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 18 Sep 2017 14:42:08 -0500
Subject: crypto: x86/sha256-avx2 - Fix RBP usage

Using RBP as a temporary register breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.

There's no need to use RBP as a temporary register for the TBL value,
because it always stores the same value: the address of the K256 table.
Instead just reference the address of K256 directly.

Reported-by: Eric Biggers <ebiggers@google.com>
Reported-by: Peter Zijlstra <peterz@infradead.org>
Tested-by: Eric Biggers <ebiggers@google.com>
Acked-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha256-avx2-asm.S | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 89c8f09787d2..1420db15dcdd 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -98,8 +98,6 @@ d	= %r8d
 e       = %edx	# clobbers NUM_BLKS
 y3	= %esi	# clobbers INP
 
-
-TBL	= %rbp
 SRND	= CTX	# SRND is same register as CTX
 
 a = %eax
@@ -531,7 +529,6 @@ STACK_SIZE	= _RSP      + _RSP_SIZE
 ENTRY(sha256_transform_rorx)
 .align 32
 	pushq	%rbx
-	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
@@ -568,8 +565,6 @@ ENTRY(sha256_transform_rorx)
 	mov	CTX, _CTX(%rsp)
 
 loop0:
-	lea     K256(%rip), TBL
-
 	## Load first 16 dwords from two blocks
 	VMOVDQ	0*32(INP),XTMP0
 	VMOVDQ	1*32(INP),XTMP1
@@ -597,19 +592,19 @@ last_block_enter:
 
 .align 16
 loop1:
-	vpaddd	0*32(TBL, SRND), X0, XFER
+	vpaddd	K256+0*32(SRND), X0, XFER
 	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
 	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32
 
-	vpaddd	1*32(TBL, SRND), X0, XFER
+	vpaddd	K256+1*32(SRND), X0, XFER
 	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
 	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32
 
-	vpaddd	2*32(TBL, SRND), X0, XFER
+	vpaddd	K256+2*32(SRND), X0, XFER
 	vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
 	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32
 
-	vpaddd	3*32(TBL, SRND), X0, XFER
+	vpaddd	K256+3*32(SRND), X0, XFER
 	vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
 	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32
 
@@ -619,10 +614,11 @@ loop1:
 
 loop2:
 	## Do last 16 rounds with no scheduling
-	vpaddd	0*32(TBL, SRND), X0, XFER
+	vpaddd	K256+0*32(SRND), X0, XFER
 	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
 	DO_4ROUNDS	_XFER + 0*32
-	vpaddd	1*32(TBL, SRND), X1, XFER
+
+	vpaddd	K256+1*32(SRND), X1, XFER
 	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
 	DO_4ROUNDS	_XFER + 1*32
 	add	$2*32, SRND
@@ -676,9 +672,6 @@ loop3:
 	ja	done_hash
 
 do_last_block:
-	#### do last block
-	lea	K256(%rip), TBL
-
 	VMOVDQ	0*16(INP),XWORD0
 	VMOVDQ	1*16(INP),XWORD1
 	VMOVDQ	2*16(INP),XWORD2
@@ -718,7 +711,6 @@ done_hash:
 	popq	%r14
 	popq	%r13
 	popq	%r12
-	popq	%rbp
 	popq	%rbx
 	ret
 ENDPROC(sha256_transform_rorx)
-- 
cgit v1.2.3


From 539012dcbdd1ff028268764385ed1f6d600812a7 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 18 Sep 2017 14:42:09 -0500
Subject: crypto: x86/sha256-ssse3 - Fix RBP usage

Using RBP as a temporary register breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.

Swap the usages of R12 and RBP.  Use R12 for the TBL register, and use
RBP to store the pre-aligned stack pointer.

Reported-by: Eric Biggers <ebiggers@google.com>
Reported-by: Peter Zijlstra <peterz@infradead.org>
Tested-by: Eric Biggers <ebiggers@google.com>
Acked-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha256-ssse3-asm.S | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S
index 39b83c93e7fd..c6c05ed2c16a 100644
--- a/arch/x86/crypto/sha256-ssse3-asm.S
+++ b/arch/x86/crypto/sha256-ssse3-asm.S
@@ -95,7 +95,7 @@ SRND = %rsi       # clobbers INP
 c = %ecx
 d = %r8d
 e = %edx
-TBL = %rbp
+TBL = %r12
 a = %eax
 b = %ebx
 
@@ -356,13 +356,13 @@ a = TMP_
 ENTRY(sha256_transform_ssse3)
 .align 32
 	pushq   %rbx
-	pushq   %rbp
+	pushq   %r12
 	pushq   %r13
 	pushq   %r14
 	pushq   %r15
-	pushq   %r12
+	pushq   %rbp
+	mov	%rsp, %rbp
 
-	mov	%rsp, %r12
 	subq    $STACK_SIZE, %rsp
 	and	$~15, %rsp
 
@@ -462,13 +462,12 @@ loop2:
 
 done_hash:
 
-	mov	%r12, %rsp
-
-	popq    %r12
+	mov	%rbp, %rsp
+	popq	%rbp
 	popq    %r15
 	popq    %r14
 	popq    %r13
-	popq    %rbp
+	popq    %r12
 	popq    %rbx
 
 	ret
-- 
cgit v1.2.3


From ca04c823763e5b82c237cabe0c17f547ecdc6271 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 18 Sep 2017 14:42:10 -0500
Subject: crypto: sha512-avx2 - Fix RBP usage

Using RBP as a temporary register breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.

Mix things up a little bit to get rid of the RBP usage, without hurting
performance too much.  Use RDI instead of RBP for the TBL pointer.  That
will clobber CTX, so spill CTX onto the stack and use R12 to read it in
the outer loop.  R12 is used as a non-persistent temporary variable
elsewhere, so it's safe to use.

Also remove the unused y4 variable.

Reported-by: Eric Biggers <ebiggers3@gmail.com>
Reported-by: Peter Zijlstra <peterz@infradead.org>
Tested-by: Eric Biggers <ebiggers@google.com>
Acked-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha512-avx2-asm.S | 75 ++++++++++++++++++++-------------------
 1 file changed, 39 insertions(+), 36 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index 7f5f6c6ec72e..b16d56005162 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -69,8 +69,9 @@ XFER  = YTMP0
 
 BYTE_FLIP_MASK  = %ymm9
 
-# 1st arg
-CTX         = %rdi
+# 1st arg is %rdi, which is saved to the stack and accessed later via %r12
+CTX1        = %rdi
+CTX2        = %r12
 # 2nd arg
 INP         = %rsi
 # 3rd arg
@@ -81,7 +82,7 @@ d           = %r8
 e           = %rdx
 y3          = %rsi
 
-TBL   = %rbp
+TBL   = %rdi # clobbers CTX1
 
 a     = %rax
 b     = %rbx
@@ -91,26 +92,26 @@ g     = %r10
 h     = %r11
 old_h = %r11
 
-T1    = %r12
+T1    = %r12 # clobbers CTX2
 y0    = %r13
 y1    = %r14
 y2    = %r15
 
-y4    = %r12
-
 # Local variables (stack frame)
 XFER_SIZE = 4*8
 SRND_SIZE = 1*8
 INP_SIZE = 1*8
 INPEND_SIZE = 1*8
+CTX_SIZE = 1*8
 RSPSAVE_SIZE = 1*8
-GPRSAVE_SIZE = 6*8
+GPRSAVE_SIZE = 5*8
 
 frame_XFER = 0
 frame_SRND = frame_XFER + XFER_SIZE
 frame_INP = frame_SRND + SRND_SIZE
 frame_INPEND = frame_INP + INP_SIZE
-frame_RSPSAVE = frame_INPEND + INPEND_SIZE
+frame_CTX = frame_INPEND + INPEND_SIZE
+frame_RSPSAVE = frame_CTX + CTX_SIZE
 frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
 frame_size = frame_GPRSAVE + GPRSAVE_SIZE
 
@@ -576,12 +577,11 @@ ENTRY(sha512_transform_rorx)
 	mov	%rax, frame_RSPSAVE(%rsp)
 
 	# Save GPRs
-	mov	%rbp, frame_GPRSAVE(%rsp)
-	mov	%rbx, 8*1+frame_GPRSAVE(%rsp)
-	mov	%r12, 8*2+frame_GPRSAVE(%rsp)
-	mov	%r13, 8*3+frame_GPRSAVE(%rsp)
-	mov	%r14, 8*4+frame_GPRSAVE(%rsp)
-	mov	%r15, 8*5+frame_GPRSAVE(%rsp)
+	mov	%rbx, 8*0+frame_GPRSAVE(%rsp)
+	mov	%r12, 8*1+frame_GPRSAVE(%rsp)
+	mov	%r13, 8*2+frame_GPRSAVE(%rsp)
+	mov	%r14, 8*3+frame_GPRSAVE(%rsp)
+	mov	%r15, 8*4+frame_GPRSAVE(%rsp)
 
 	shl	$7, NUM_BLKS	# convert to bytes
 	jz	done_hash
@@ -589,14 +589,17 @@ ENTRY(sha512_transform_rorx)
 	mov	NUM_BLKS, frame_INPEND(%rsp)
 
 	## load initial digest
-	mov	8*0(CTX),a
-	mov	8*1(CTX),b
-	mov	8*2(CTX),c
-	mov	8*3(CTX),d
-	mov	8*4(CTX),e
-	mov	8*5(CTX),f
-	mov	8*6(CTX),g
-	mov	8*7(CTX),h
+	mov	8*0(CTX1), a
+	mov	8*1(CTX1), b
+	mov	8*2(CTX1), c
+	mov	8*3(CTX1), d
+	mov	8*4(CTX1), e
+	mov	8*5(CTX1), f
+	mov	8*6(CTX1), g
+	mov	8*7(CTX1), h
+
+	# save %rdi (CTX) before it gets clobbered
+	mov	%rdi, frame_CTX(%rsp)
 
 	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
 
@@ -652,14 +655,15 @@ loop2:
 	subq	$1, frame_SRND(%rsp)
 	jne	loop2
 
-	addm	8*0(CTX),a
-	addm	8*1(CTX),b
-	addm	8*2(CTX),c
-	addm	8*3(CTX),d
-	addm	8*4(CTX),e
-	addm	8*5(CTX),f
-	addm	8*6(CTX),g
-	addm	8*7(CTX),h
+	mov	frame_CTX(%rsp), CTX2
+	addm	8*0(CTX2), a
+	addm	8*1(CTX2), b
+	addm	8*2(CTX2), c
+	addm	8*3(CTX2), d
+	addm	8*4(CTX2), e
+	addm	8*5(CTX2), f
+	addm	8*6(CTX2), g
+	addm	8*7(CTX2), h
 
 	mov	frame_INP(%rsp), INP
 	add	$128, INP
@@ -669,12 +673,11 @@ loop2:
 done_hash:
 
 # Restore GPRs
-	mov	frame_GPRSAVE(%rsp)     ,%rbp
-	mov	8*1+frame_GPRSAVE(%rsp) ,%rbx
-	mov	8*2+frame_GPRSAVE(%rsp) ,%r12
-	mov	8*3+frame_GPRSAVE(%rsp) ,%r13
-	mov	8*4+frame_GPRSAVE(%rsp) ,%r14
-	mov	8*5+frame_GPRSAVE(%rsp) ,%r15
+	mov	8*0+frame_GPRSAVE(%rsp), %rbx
+	mov	8*1+frame_GPRSAVE(%rsp), %r12
+	mov	8*2+frame_GPRSAVE(%rsp), %r13
+	mov	8*3+frame_GPRSAVE(%rsp), %r14
+	mov	8*4+frame_GPRSAVE(%rsp), %r15
 
 	# Restore Stack Pointer
 	mov	frame_RSPSAVE(%rsp), %rsp
-- 
cgit v1.2.3


From 8f182f845d0fa26ecc18d3bdcc3d1077a0ea3a31 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 18 Sep 2017 14:42:11 -0500
Subject: crypto: x86/twofish - Fix RBP usage

Using RBP as a temporary register breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.

Use R13 instead of RBP.  Both are callee-saved registers, so the
substitution is straightforward.

Reported-by: Eric Biggers <ebiggers@google.com>
Reported-by: Peter Zijlstra <peterz@infradead.org>
Tested-by: Eric Biggers <ebiggers@google.com>
Acked-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index b3f49d286348..73b471da3622 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -76,8 +76,8 @@
 #define RT %xmm14
 #define RR %xmm15
 
-#define RID1  %rbp
-#define RID1d %ebp
+#define RID1  %r13
+#define RID1d %r13d
 #define RID2  %rsi
 #define RID2d %esi
 
@@ -259,7 +259,7 @@ __twofish_enc_blk8:
 
 	vmovdqu w(CTX), RK1;
 
-	pushq %rbp;
+	pushq %r13;
 	pushq %rbx;
 	pushq %rcx;
 
@@ -282,7 +282,7 @@ __twofish_enc_blk8:
 
 	popq %rcx;
 	popq %rbx;
-	popq %rbp;
+	popq %r13;
 
 	outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
 	outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
@@ -301,7 +301,7 @@ __twofish_dec_blk8:
 
 	vmovdqu (w+4*4)(CTX), RK1;
 
-	pushq %rbp;
+	pushq %r13;
 	pushq %rbx;
 
 	inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
@@ -322,7 +322,7 @@ __twofish_dec_blk8:
 	vmovdqu (w)(CTX), RK1;
 
 	popq %rbx;
-	popq %rbp;
+	popq %r13;
 
 	outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
 	outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
-- 
cgit v1.2.3


From 51a9a8284e43642fc3e85810fd54f4c245d23a14 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Mon, 18 Sep 2017 10:03:12 +0100
Subject: x86/xen: clean up clang build warning

In the case where sizeof(maddr) != sizeof(long) p is initialized and
never read and clang throws a warning on this.  Move declaration of
p to clean up the clang build warning:

warning: Value stored to 'p' during its initialization is never read

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/include/asm/xen/hypercall.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 9606688caa4b..e089c1675a7c 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -552,13 +552,13 @@ static inline void
 MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
 			struct desc_struct desc)
 {
-	u32 *p = (u32 *) &desc;
-
 	mcl->op = __HYPERVISOR_update_descriptor;
 	if (sizeof(maddr) == sizeof(long)) {
 		mcl->args[0] = maddr;
 		mcl->args[1] = *(unsigned long *)&desc;
 	} else {
+		u32 *p = (u32 *)&desc;
+
 		mcl->args[0] = maddr;
 		mcl->args[1] = maddr >> 32;
 		mcl->args[2] = *p++;
-- 
cgit v1.2.3


From 44889942b6eb356eab27ce25fe10701adfec7776 Mon Sep 17 00:00:00 2001
From: Ladi Prosek <lprosek@redhat.com>
Date: Fri, 22 Sep 2017 07:53:15 +0200
Subject: KVM: nVMX: fix HOST_CR3/HOST_CR4 cache

For nested virt we maintain multiple VMCS that can run on a vCPU. So it is
incorrect to keep vmcs_host_cr3 and vmcs_host_cr4, whose purpose is caching
the value of the rarely changing HOST_CR3 and HOST_CR4 VMCS fields, in
vCPU-wide data structures.

Hyper-V nested on KVM runs into this consistently for me with PCID enabled.
CR3 is updated with a new value, unlikely(cr3 != vmx->host_state.vmcs_host_cr3)
fires, and the currently loaded VMCS is updated. Then we switch from L2 to
L1 and the next exit reverts CR3 to its old value.

Fixes: d6e41f1151fe ("x86/mm, KVM: Teach KVM's VMX code that CR3 isn't a constant")
Signed-off-by: Ladi Prosek <lprosek@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0726ca7a1b02..c83d28b0ab05 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -200,6 +200,8 @@ struct loaded_vmcs {
 	int cpu;
 	bool launched;
 	bool nmi_known_unmasked;
+	unsigned long vmcs_host_cr3;	/* May not match real cr3 */
+	unsigned long vmcs_host_cr4;	/* May not match real cr4 */
 	struct list_head loaded_vmcss_on_cpu_link;
 };
 
@@ -600,8 +602,6 @@ struct vcpu_vmx {
 		int           gs_ldt_reload_needed;
 		int           fs_reload_needed;
 		u64           msr_host_bndcfgs;
-		unsigned long vmcs_host_cr3;	/* May not match real cr3 */
-		unsigned long vmcs_host_cr4;	/* May not match real cr4 */
 	} host_state;
 	struct {
 		int vm86_active;
@@ -5178,12 +5178,12 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
 	 */
 	cr3 = __read_cr3();
 	vmcs_writel(HOST_CR3, cr3);		/* 22.2.3  FIXME: shadow tables */
-	vmx->host_state.vmcs_host_cr3 = cr3;
+	vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
 
 	/* Save the most likely value for this task's CR4 in the VMCS. */
 	cr4 = cr4_read_shadow();
 	vmcs_writel(HOST_CR4, cr4);			/* 22.2.3, 22.2.5 */
-	vmx->host_state.vmcs_host_cr4 = cr4;
+	vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
 
 	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
 #ifdef CONFIG_X86_64
@@ -9274,15 +9274,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
 
 	cr3 = __get_current_cr3_fast();
-	if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) {
+	if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) {
 		vmcs_writel(HOST_CR3, cr3);
-		vmx->host_state.vmcs_host_cr3 = cr3;
+		vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
 	}
 
 	cr4 = cr4_read_shadow();
-	if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
+	if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) {
 		vmcs_writel(HOST_CR4, cr4);
-		vmx->host_state.vmcs_host_cr4 = cr4;
+		vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
 	}
 
 	/* When single-stepping over STI and MOV SS, we must clear the
-- 
cgit v1.2.3


From f5caf621ee357279e759c0911daf6d55c7d36f03 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Wed, 20 Sep 2017 16:24:33 -0500
Subject: x86/asm: Fix inline asm call constraints for Clang

For inline asm statements which have a CALL instruction, we list the
stack pointer as a constraint to convince GCC to ensure the frame
pointer is set up first:

  static inline void foo()
  {
	register void *__sp asm(_ASM_SP);
	asm("call bar" : "+r" (__sp))
  }

Unfortunately, that pattern causes Clang to corrupt the stack pointer.

The fix is easy: convert the stack pointer register variable to a global
variable.

It should be noted that the end result is different based on the GCC
version.  With GCC 6.4, this patch has exactly the same result as
before:

	defconfig	defconfig-nofp	distro		distro-nofp
 before	9820389		9491555		8816046		8516940
 after	9820389		9491555		8816046		8516940

With GCC 7.2, however, GCC's behavior has changed.  It now changes its
behavior based on the conversion of the register variable to a global.
That somehow convinces it to *always* set up the frame pointer before
inserting *any* inline asm.  (Therefore, listing the variable as an
output constraint is a no-op and is no longer necessary.)  It's a bit
overkill, but the performance impact should be negligible.  And in fact,
there's a nice improvement with frame pointers disabled:

	defconfig	defconfig-nofp	distro		distro-nofp
 before	9796316		9468236		9076191		8790305
 after	9796957		9464267		9076381		8785949

So in summary, while listing the stack pointer as an output constraint
is no longer necessary for newer versions of GCC, it's still needed for
older versions.

Suggested-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Reported-by: Matthias Kaehlcke <mka@chromium.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/3db862e970c432ae823cf515c52b54fec8270e0e.1505942196.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/alternative.h               |  3 +--
 arch/x86/include/asm/asm.h                       | 11 +++++++++++
 arch/x86/include/asm/mshyperv.h                  | 10 ++++------
 arch/x86/include/asm/paravirt_types.h            | 14 +++++++-------
 arch/x86/include/asm/preempt.h                   | 15 +++++----------
 arch/x86/include/asm/processor.h                 |  6 ++----
 arch/x86/include/asm/rwsem.h                     |  4 ++--
 arch/x86/include/asm/uaccess.h                   |  4 ++--
 arch/x86/include/asm/xen/hypercall.h             |  5 ++---
 arch/x86/kvm/emulate.c                           |  3 +--
 arch/x86/kvm/vmx.c                               |  3 +--
 arch/x86/mm/fault.c                              |  3 +--
 tools/objtool/Documentation/stack-validation.txt |  6 +++---
 13 files changed, 42 insertions(+), 45 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 1b020381ab38..c096624137ae 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -218,10 +218,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
 #define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2,   \
 			   output, input...)				      \
 {									      \
-	register void *__sp asm(_ASM_SP);				      \
 	asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\
 		"call %P[new2]", feature2)				      \
-		: output, "+r" (__sp)					      \
+		: output, ASM_CALL_CONSTRAINT				      \
 		: [old] "i" (oldfunc), [new1] "i" (newfunc1),		      \
 		  [new2] "i" (newfunc2), ## input);			      \
 }
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 676ee5807d86..c1eadbaf1115 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -132,4 +132,15 @@
 /* For C file, we already have NOKPROBE_SYMBOL macro */
 #endif
 
+#ifndef __ASSEMBLY__
+/*
+ * This output constraint should be used for any inline asm which has a "call"
+ * instruction.  Otherwise the asm may be inserted before the frame pointer
+ * gets set up by the containing function.  If you forget to do this, objtool
+ * may print a "call without frame pointer save/setup" warning.
+ */
+register unsigned int __asm_call_sp asm("esp");
+#define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp)
+#endif
+
 #endif /* _ASM_X86_ASM_H */
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 63cc96f064dc..738503e1f80c 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -179,7 +179,6 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
 	u64 input_address = input ? virt_to_phys(input) : 0;
 	u64 output_address = output ? virt_to_phys(output) : 0;
 	u64 hv_status;
-	register void *__sp asm(_ASM_SP);
 
 #ifdef CONFIG_X86_64
 	if (!hv_hypercall_pg)
@@ -187,7 +186,7 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
 
 	__asm__ __volatile__("mov %4, %%r8\n"
 			     "call *%5"
-			     : "=a" (hv_status), "+r" (__sp),
+			     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
 			       "+c" (control), "+d" (input_address)
 			     :  "r" (output_address), "m" (hv_hypercall_pg)
 			     : "cc", "memory", "r8", "r9", "r10", "r11");
@@ -202,7 +201,7 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
 
 	__asm__ __volatile__("call *%7"
 			     : "=A" (hv_status),
-			       "+c" (input_address_lo), "+r" (__sp)
+			       "+c" (input_address_lo), ASM_CALL_CONSTRAINT
 			     : "A" (control),
 			       "b" (input_address_hi),
 			       "D"(output_address_hi), "S"(output_address_lo),
@@ -224,12 +223,11 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
 static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
 {
 	u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT;
-	register void *__sp asm(_ASM_SP);
 
 #ifdef CONFIG_X86_64
 	{
 		__asm__ __volatile__("call *%4"
-				     : "=a" (hv_status), "+r" (__sp),
+				     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
 				       "+c" (control), "+d" (input1)
 				     : "m" (hv_hypercall_pg)
 				     : "cc", "r8", "r9", "r10", "r11");
@@ -242,7 +240,7 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
 		__asm__ __volatile__ ("call *%5"
 				      : "=A"(hv_status),
 					"+c"(input1_lo),
-					"+r"(__sp)
+					ASM_CALL_CONSTRAINT
 				      :	"A" (control),
 					"b" (input1_hi),
 					"m" (hv_hypercall_pg)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 42873edd9f9d..280d94c36dad 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -459,8 +459,8 @@ int paravirt_disable_iospace(void);
  */
 #ifdef CONFIG_X86_32
 #define PVOP_VCALL_ARGS							\
-	unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx;	\
-	register void *__sp asm("esp")
+	unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx;
+
 #define PVOP_CALL_ARGS			PVOP_VCALL_ARGS
 
 #define PVOP_CALL_ARG1(x)		"a" ((unsigned long)(x))
@@ -480,8 +480,8 @@ int paravirt_disable_iospace(void);
 /* [re]ax isn't an arg, but the return val */
 #define PVOP_VCALL_ARGS						\
 	unsigned long __edi = __edi, __esi = __esi,		\
-		__edx = __edx, __ecx = __ecx, __eax = __eax;	\
-	register void *__sp asm("rsp")
+		__edx = __edx, __ecx = __ecx, __eax = __eax;
+
 #define PVOP_CALL_ARGS		PVOP_VCALL_ARGS
 
 #define PVOP_CALL_ARG1(x)		"D" ((unsigned long)(x))
@@ -532,7 +532,7 @@ int paravirt_disable_iospace(void);
 			asm volatile(pre				\
 				     paravirt_alt(PARAVIRT_CALL)	\
 				     post				\
-				     : call_clbr, "+r" (__sp)		\
+				     : call_clbr, ASM_CALL_CONSTRAINT	\
 				     : paravirt_type(op),		\
 				       paravirt_clobber(clbr),		\
 				       ##__VA_ARGS__			\
@@ -542,7 +542,7 @@ int paravirt_disable_iospace(void);
 			asm volatile(pre				\
 				     paravirt_alt(PARAVIRT_CALL)	\
 				     post				\
-				     : call_clbr, "+r" (__sp)		\
+				     : call_clbr, ASM_CALL_CONSTRAINT	\
 				     : paravirt_type(op),		\
 				       paravirt_clobber(clbr),		\
 				       ##__VA_ARGS__			\
@@ -569,7 +569,7 @@ int paravirt_disable_iospace(void);
 		asm volatile(pre					\
 			     paravirt_alt(PARAVIRT_CALL)		\
 			     post					\
-			     : call_clbr, "+r" (__sp)			\
+			     : call_clbr, ASM_CALL_CONSTRAINT		\
 			     : paravirt_type(op),			\
 			       paravirt_clobber(clbr),			\
 			       ##__VA_ARGS__				\
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index ec1f3c651150..4f44505dbf87 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -100,19 +100,14 @@ static __always_inline bool should_resched(int preempt_offset)
 
 #ifdef CONFIG_PREEMPT
   extern asmlinkage void ___preempt_schedule(void);
-# define __preempt_schedule()					\
-({								\
-	register void *__sp asm(_ASM_SP);			\
-	asm volatile ("call ___preempt_schedule" : "+r"(__sp));	\
-})
+# define __preempt_schedule() \
+	asm volatile ("call ___preempt_schedule" : ASM_CALL_CONSTRAINT)
 
   extern asmlinkage void preempt_schedule(void);
   extern asmlinkage void ___preempt_schedule_notrace(void);
-# define __preempt_schedule_notrace()					\
-({									\
-	register void *__sp asm(_ASM_SP);				\
-	asm volatile ("call ___preempt_schedule_notrace" : "+r"(__sp));	\
-})
+# define __preempt_schedule_notrace() \
+	asm volatile ("call ___preempt_schedule_notrace" : ASM_CALL_CONSTRAINT)
+
   extern asmlinkage void preempt_schedule_notrace(void);
 #endif
 
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 3fa26a61eabc..b390ff76e58f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -677,8 +677,6 @@ static inline void sync_core(void)
 	 * Like all of Linux's memory ordering operations, this is a
 	 * compiler barrier as well.
 	 */
-	register void *__sp asm(_ASM_SP);
-
 #ifdef CONFIG_X86_32
 	asm volatile (
 		"pushfl\n\t"
@@ -686,7 +684,7 @@ static inline void sync_core(void)
 		"pushl $1f\n\t"
 		"iret\n\t"
 		"1:"
-		: "+r" (__sp) : : "memory");
+		: ASM_CALL_CONSTRAINT : : "memory");
 #else
 	unsigned int tmp;
 
@@ -703,7 +701,7 @@ static inline void sync_core(void)
 		"iretq\n\t"
 		UNWIND_HINT_RESTORE
 		"1:"
-		: "=&r" (tmp), "+r" (__sp) : : "cc", "memory");
+		: "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory");
 #endif
 }
 
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index a34e0d4b957d..7116b7931c7b 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -103,7 +103,6 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem)
 ({							\
 	long tmp;					\
 	struct rw_semaphore* ret;			\
-	register void *__sp asm(_ASM_SP);		\
 							\
 	asm volatile("# beginning down_write\n\t"	\
 		     LOCK_PREFIX "  xadd      %1,(%4)\n\t"	\
@@ -114,7 +113,8 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem)
 		     "  call " slow_path "\n"		\
 		     "1:\n"				\
 		     "# ending down_write"		\
-		     : "+m" (sem->count), "=d" (tmp), "=a" (ret), "+r" (__sp) \
+		     : "+m" (sem->count), "=d" (tmp),	\
+		       "=a" (ret), ASM_CALL_CONSTRAINT	\
 		     : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) \
 		     : "memory", "cc");			\
 	ret;						\
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 184eb9894dae..78e8fcc87d4c 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -166,11 +166,11 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
 ({									\
 	int __ret_gu;							\
 	register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX);		\
-	register void *__sp asm(_ASM_SP);				\
 	__chk_user_ptr(ptr);						\
 	might_fault();							\
 	asm volatile("call __get_user_%P4"				\
-		     : "=a" (__ret_gu), "=r" (__val_gu), "+r" (__sp)	\
+		     : "=a" (__ret_gu), "=r" (__val_gu),		\
+			ASM_CALL_CONSTRAINT				\
 		     : "0" (ptr), "i" (sizeof(*(ptr))));		\
 	(x) = (__force __typeof__(*(ptr))) __val_gu;			\
 	__builtin_expect(__ret_gu, 0);					\
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 9606688caa4b..128a1a0b1450 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -113,10 +113,9 @@ extern struct { char _entry[32]; } hypercall_page[];
 	register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \
 	register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \
 	register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \
-	register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5; \
-	register void *__sp asm(_ASM_SP);
+	register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5;
 
-#define __HYPERCALL_0PARAM	"=r" (__res), "+r" (__sp)
+#define __HYPERCALL_0PARAM	"=r" (__res), ASM_CALL_CONSTRAINT
 #define __HYPERCALL_1PARAM	__HYPERCALL_0PARAM, "+r" (__arg1)
 #define __HYPERCALL_2PARAM	__HYPERCALL_1PARAM, "+r" (__arg2)
 #define __HYPERCALL_3PARAM	__HYPERCALL_2PARAM, "+r" (__arg3)
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 16bf6655aa85..f23f13403f33 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -5296,7 +5296,6 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
 
 static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
 {
-	register void *__sp asm(_ASM_SP);
 	ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
 
 	if (!(ctxt->d & ByteOp))
@@ -5304,7 +5303,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
 
 	asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n"
 	    : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
-	      [fastop]"+S"(fop), "+r"(__sp)
+	      [fastop]"+S"(fop), ASM_CALL_CONSTRAINT
 	    : "c"(ctxt->src2.val));
 
 	ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 06c0c6d0541e..6ee237f509dc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9036,7 +9036,6 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
 static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 {
 	u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-	register void *__sp asm(_ASM_SP);
 
 	if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
 			== (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
@@ -9065,7 +9064,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 #ifdef CONFIG_X86_64
 			[sp]"=&r"(tmp),
 #endif
-			"+r"(__sp)
+			ASM_CALL_CONSTRAINT
 			:
 			[entry]"r"(entry),
 			[ss]"i"(__KERNEL_DS),
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b836a7274e12..39567b5c33da 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -806,7 +806,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 	if (is_vmalloc_addr((void *)address) &&
 	    (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
 	     address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
-		register void *__sp asm("rsp");
 		unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *);
 		/*
 		 * We're likely to be running with very little stack space
@@ -821,7 +820,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 		asm volatile ("movq %[stack], %%rsp\n\t"
 			      "call handle_stack_overflow\n\t"
 			      "1: jmp 1b"
-			      : "+r" (__sp)
+			      : ASM_CALL_CONSTRAINT
 			      : "D" ("kernel stack overflow (page fault)"),
 				"S" (regs), "d" (address),
 				[stack] "rm" (stack));
diff --git a/tools/objtool/Documentation/stack-validation.txt b/tools/objtool/Documentation/stack-validation.txt
index 6a1af43862df..3995735a878f 100644
--- a/tools/objtool/Documentation/stack-validation.txt
+++ b/tools/objtool/Documentation/stack-validation.txt
@@ -194,10 +194,10 @@ they mean, and suggestions for how to fix them.
    If it's a GCC-compiled .c file, the error may be because the function
    uses an inline asm() statement which has a "call" instruction.  An
    asm() statement with a call instruction must declare the use of the
-   stack pointer in its output operand.  For example, on x86_64:
+   stack pointer in its output operand.  On x86_64, this means adding
+   the ASM_CALL_CONSTRAINT as an output constraint:
 
-     register void *__sp asm("rsp");
-     asm volatile("call func" : "+r" (__sp));
+     asm volatile("call func" : ASM_CALL_CONSTRAINT);
 
    Otherwise the stack frame may not get created before the call.
 
-- 
cgit v1.2.3


From 656f083116a4799d8c0194976b8a2d66bf306538 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 14:59:44 +0200
Subject: x86/fpu: Rename copyin_to_xsaves()/copyout_from_xsaves() to
 copy_user_to_xstate()/copy_xstate_to_user()

The 'copyin/copyout' nomenclature needlessly departs from what the modern FPU code
uses, which is:

 copy_fpregs_to_fpstate()
 copy_fpstate_to_sigframe()
 copy_fregs_to_user()
 copy_fxregs_to_kernel()
 copy_fxregs_to_user()
 copy_kernel_to_fpregs()
 copy_kernel_to_fregs()
 copy_kernel_to_fxregs()
 copy_kernel_to_xregs()
 copy_user_to_fregs()
 copy_user_to_fxregs()
 copy_user_to_xregs()
 copy_xregs_to_kernel()
 copy_xregs_to_user()

I.e. according to this pattern, the following rename should be done:

  copyin_to_xsaves()    -> copy_user_to_xstate()
  copyout_from_xsaves() -> copy_xstate_to_user()

or, if we want to be pedantic, denote that that the user-space format is ptrace:

  copyin_to_xsaves()    -> copy_user_ptrace_to_xstate()
  copyout_from_xsaves() -> copy_xstate_to_user_ptrace()

But I'd suggest the shorter, non-pedantic name.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-2-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/xstate.h | 4 ++--
 arch/x86/kernel/fpu/regset.c      | 4 ++--
 arch/x86/kernel/fpu/signal.c      | 2 +-
 arch/x86/kernel/fpu/xstate.c      | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 1b2799e0699a..a1baa17e9748 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -48,8 +48,8 @@ void fpu__xstate_clear_all_cpu_caps(void);
 void *get_xsave_addr(struct xregs_state *xsave, int xstate);
 const void *get_xsave_field_ptr(int xstate_field);
 int using_compacted_format(void);
-int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf,
+int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf,
 			void __user *ubuf, struct xregs_state *xsave);
-int copyin_to_xsaves(const void *kbuf, const void __user *ubuf,
+int copy_user_to_xstate(const void *kbuf, const void __user *ubuf,
 		     struct xregs_state *xsave);
 #endif
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index b188b16841e3..165d0545c924 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -92,7 +92,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
 	fpu__activate_fpstate_read(fpu);
 
 	if (using_compacted_format()) {
-		ret = copyout_from_xsaves(pos, count, kbuf, ubuf, xsave);
+		ret = copy_xstate_to_user(pos, count, kbuf, ubuf, xsave);
 	} else {
 		fpstate_sanitize_xstate(fpu);
 		/*
@@ -132,7 +132,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
 	fpu__activate_fpstate_write(fpu);
 
 	if (boot_cpu_has(X86_FEATURE_XSAVES))
-		ret = copyin_to_xsaves(kbuf, ubuf, xsave);
+		ret = copy_user_to_xstate(kbuf, ubuf, xsave);
 	else
 		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
 
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 83c23c230b4c..b1fe9a1fc4e0 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -324,7 +324,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 		fpu__drop(fpu);
 
 		if (using_compacted_format()) {
-			err = copyin_to_xsaves(NULL, buf_fx,
+			err = copy_user_to_xstate(NULL, buf_fx,
 					       &fpu->state.xsave);
 		} else {
 			err = __copy_from_user(&fpu->state.xsave,
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index c24ac1efb12d..e7bb41723eaa 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -951,7 +951,7 @@ static inline int xstate_copyout(unsigned int pos, unsigned int count,
  * zero. This is called from xstateregs_get() and there we check the CPU
  * has XSAVES.
  */
-int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf,
+int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf,
 			void __user *ubuf, struct xregs_state *xsave)
 {
 	unsigned int offset, size;
@@ -1023,7 +1023,7 @@ int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf,
  * there we check the CPU has XSAVES and a whole standard-sized buffer
  * exists.
  */
-int copyin_to_xsaves(const void *kbuf, const void __user *ubuf,
+int copy_user_to_xstate(const void *kbuf, const void __user *ubuf,
 		     struct xregs_state *xsave)
 {
 	unsigned int offset, size;
-- 
cgit v1.2.3


From f0d4f30a7fd299587840a028655285a87f334904 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 14:59:45 +0200
Subject: x86/fpu: Split copy_xstate_to_user() into copy_xstate_to_kernel() &
 copy_xstate_to_user()

copy_xstate_to_user() is a weird API - in part due to a bad API inherited
from the regset APIs.

But don't propagate that bad API choice into the FPU code - so as a first
step split the API into kernel and user buffer handling routines.

(Also split the xstate_copyout() internal helper.)

The split API is a dumb duplication that should be obviously correct, the
real splitting will be done in the next patch.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-3-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/xstate.h |   4 +-
 arch/x86/kernel/fpu/regset.c      |   5 +-
 arch/x86/kernel/fpu/xstate.c      | 110 +++++++++++++++++++++++++++++++++++---
 3 files changed, 109 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index a1baa17e9748..92dc8ca14124 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -48,8 +48,8 @@ void fpu__xstate_clear_all_cpu_caps(void);
 void *get_xsave_addr(struct xregs_state *xsave, int xstate);
 const void *get_xsave_field_ptr(int xstate_field);
 int using_compacted_format(void);
-int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf,
-			void __user *ubuf, struct xregs_state *xsave);
+int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave);
+int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave);
 int copy_user_to_xstate(const void *kbuf, const void __user *ubuf,
 		     struct xregs_state *xsave);
 #endif
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 165d0545c924..b6d12d66d04b 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -92,7 +92,10 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
 	fpu__activate_fpstate_read(fpu);
 
 	if (using_compacted_format()) {
-		ret = copy_xstate_to_user(pos, count, kbuf, ubuf, xsave);
+		if (kbuf)
+			ret = copy_xstate_to_kernel(pos, count, kbuf, ubuf, xsave);
+		else
+			ret = copy_xstate_to_user(pos, count, kbuf, ubuf, xsave);
 	} else {
 		fpstate_sanitize_xstate(fpu);
 		/*
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index e7bb41723eaa..38561539cb99 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -924,10 +924,106 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
  * This is similar to user_regset_copyout(), but will not add offset to
  * the source data pointer or increment pos, count, kbuf, and ubuf.
  */
-static inline int xstate_copyout(unsigned int pos, unsigned int count,
-				 void *kbuf, void __user *ubuf,
-				 const void *data, const int start_pos,
-				 const int end_pos)
+static inline int
+__copy_xstate_to_kernel(unsigned int pos, unsigned int count,
+			void *kbuf, void __user *ubuf,
+			const void *data, const int start_pos,
+			const int end_pos)
+{
+	if ((count == 0) || (pos < start_pos))
+		return 0;
+
+	if (end_pos < 0 || pos < end_pos) {
+		unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos));
+
+		if (kbuf) {
+			memcpy(kbuf + pos, data, copy);
+		} else {
+			if (__copy_to_user(ubuf + pos, data, copy))
+				return -EFAULT;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Convert from kernel XSAVES compacted format to standard format and copy
+ * to a kernel-space ptrace buffer.
+ *
+ * It supports partial copy but pos always starts from zero. This is called
+ * from xstateregs_get() and there we check the CPU has XSAVES.
+ */
+int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf,
+			void __user *ubuf, struct xregs_state *xsave)
+{
+	unsigned int offset, size;
+	int ret, i;
+	struct xstate_header header;
+
+	/*
+	 * Currently copy_regset_to_user() starts from pos 0:
+	 */
+	if (unlikely(pos != 0))
+		return -EFAULT;
+
+	/*
+	 * The destination is a ptrace buffer; we put in only user xstates:
+	 */
+	memset(&header, 0, sizeof(header));
+	header.xfeatures = xsave->header.xfeatures;
+	header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR;
+
+	/*
+	 * Copy xregs_state->header:
+	 */
+	offset = offsetof(struct xregs_state, header);
+	size = sizeof(header);
+
+	ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, &header, 0, count);
+
+	if (ret)
+		return ret;
+
+	for (i = 0; i < XFEATURE_MAX; i++) {
+		/*
+		 * Copy only in-use xstates:
+		 */
+		if ((header.xfeatures >> i) & 1) {
+			void *src = __raw_xsave_addr(xsave, 1 << i);
+
+			offset = xstate_offsets[i];
+			size = xstate_sizes[i];
+
+			ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, src, 0, count);
+
+			if (ret)
+				return ret;
+
+			if (offset + size >= count)
+				break;
+		}
+
+	}
+
+	/*
+	 * Fill xsave->i387.sw_reserved value for ptrace frame:
+	 */
+	offset = offsetof(struct fxregs_state, sw_reserved);
+	size = sizeof(xstate_fx_sw_bytes);
+
+	ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count);
+
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static inline int
+__copy_xstate_to_user(unsigned int pos, unsigned int count,
+		      void *kbuf, void __user *ubuf,
+		      const void *data, const int start_pos,
+		      const int end_pos)
 {
 	if ((count == 0) || (pos < start_pos))
 		return 0;
@@ -977,7 +1073,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf,
 	offset = offsetof(struct xregs_state, header);
 	size = sizeof(header);
 
-	ret = xstate_copyout(offset, size, kbuf, ubuf, &header, 0, count);
+	ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, &header, 0, count);
 
 	if (ret)
 		return ret;
@@ -992,7 +1088,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf,
 			offset = xstate_offsets[i];
 			size = xstate_sizes[i];
 
-			ret = xstate_copyout(offset, size, kbuf, ubuf, src, 0, count);
+			ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, src, 0, count);
 
 			if (ret)
 				return ret;
@@ -1009,7 +1105,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf,
 	offset = offsetof(struct fxregs_state, sw_reserved);
 	size = sizeof(xstate_fx_sw_bytes);
 
-	ret = xstate_copyout(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count);
+	ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count);
 
 	if (ret)
 		return ret;
-- 
cgit v1.2.3


From 4d981cf2d96f29cdfa7d4972c8b377fe7baa9c4c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 14:59:46 +0200
Subject: x86/fpu: Remove 'ubuf' parameter from the copy_xstate_to_kernel()
 APIs

The 'ubuf' parameter is unused in the _kernel() side of the API, remove it.

This simplifies the code and makes it easier to think about.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-4-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/xstate.h |  2 +-
 arch/x86/kernel/fpu/regset.c      |  2 +-
 arch/x86/kernel/fpu/xstate.c      | 21 ++++++---------------
 3 files changed, 8 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 92dc8ca14124..c762574a245f 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -48,7 +48,7 @@ void fpu__xstate_clear_all_cpu_caps(void);
 void *get_xsave_addr(struct xregs_state *xsave, int xstate);
 const void *get_xsave_field_ptr(int xstate_field);
 int using_compacted_format(void);
-int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave);
+int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave);
 int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave);
 int copy_user_to_xstate(const void *kbuf, const void __user *ubuf,
 		     struct xregs_state *xsave);
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index b6d12d66d04b..34e74adf9d5d 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -93,7 +93,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
 
 	if (using_compacted_format()) {
 		if (kbuf)
-			ret = copy_xstate_to_kernel(pos, count, kbuf, ubuf, xsave);
+			ret = copy_xstate_to_kernel(pos, count, kbuf, xsave);
 		else
 			ret = copy_xstate_to_user(pos, count, kbuf, ubuf, xsave);
 	} else {
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 38561539cb99..71d3bda2b898 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -926,7 +926,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
  */
 static inline int
 __copy_xstate_to_kernel(unsigned int pos, unsigned int count,
-			void *kbuf, void __user *ubuf,
+			void *kbuf,
 			const void *data, const int start_pos,
 			const int end_pos)
 {
@@ -936,12 +936,7 @@ __copy_xstate_to_kernel(unsigned int pos, unsigned int count,
 	if (end_pos < 0 || pos < end_pos) {
 		unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos));
 
-		if (kbuf) {
-			memcpy(kbuf + pos, data, copy);
-		} else {
-			if (__copy_to_user(ubuf + pos, data, copy))
-				return -EFAULT;
-		}
+		memcpy(kbuf + pos, data, copy);
 	}
 	return 0;
 }
@@ -953,8 +948,7 @@ __copy_xstate_to_kernel(unsigned int pos, unsigned int count,
  * It supports partial copy but pos always starts from zero. This is called
  * from xstateregs_get() and there we check the CPU has XSAVES.
  */
-int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf,
-			void __user *ubuf, struct xregs_state *xsave)
+int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave)
 {
 	unsigned int offset, size;
 	int ret, i;
@@ -979,8 +973,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf,
 	offset = offsetof(struct xregs_state, header);
 	size = sizeof(header);
 
-	ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, &header, 0, count);
-
+	ret = __copy_xstate_to_kernel(offset, size, kbuf, &header, 0, count);
 	if (ret)
 		return ret;
 
@@ -994,8 +987,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf,
 			offset = xstate_offsets[i];
 			size = xstate_sizes[i];
 
-			ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, src, 0, count);
-
+			ret = __copy_xstate_to_kernel(offset, size, kbuf, src, 0, count);
 			if (ret)
 				return ret;
 
@@ -1011,8 +1003,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf,
 	offset = offsetof(struct fxregs_state, sw_reserved);
 	size = sizeof(xstate_fx_sw_bytes);
 
-	ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count);
-
+	ret = __copy_xstate_to_kernel(offset, size, kbuf, xstate_fx_sw_bytes, 0, count);
 	if (ret)
 		return ret;
 
-- 
cgit v1.2.3


From a69c158fb3e7a91220f55029bf222a4e678d16e9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 14:59:47 +0200
Subject: x86/fpu: Remove 'kbuf' parameter from the copy_xstate_to_user() APIs

The 'kbuf' parameter is unused in the _user() side of the API, remove it.

This simplifies the code and makes it easier to think about.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-5-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/xstate.h |  2 +-
 arch/x86/kernel/fpu/regset.c      |  2 +-
 arch/x86/kernel/fpu/xstate.c      | 25 +++++++------------------
 3 files changed, 9 insertions(+), 20 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index c762574a245f..65bd68c30cd0 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -49,7 +49,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate);
 const void *get_xsave_field_ptr(int xstate_field);
 int using_compacted_format(void);
 int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave);
-int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave);
+int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, struct xregs_state *xsave);
 int copy_user_to_xstate(const void *kbuf, const void __user *ubuf,
 		     struct xregs_state *xsave);
 #endif
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 34e74adf9d5d..fd6dbdd8fde6 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -95,7 +95,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
 		if (kbuf)
 			ret = copy_xstate_to_kernel(pos, count, kbuf, xsave);
 		else
-			ret = copy_xstate_to_user(pos, count, kbuf, ubuf, xsave);
+			ret = copy_xstate_to_user(pos, count, ubuf, xsave);
 	} else {
 		fpstate_sanitize_xstate(fpu);
 		/*
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 71d3bda2b898..2d8f3344875d 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1011,10 +1011,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, stru
 }
 
 static inline int
-__copy_xstate_to_user(unsigned int pos, unsigned int count,
-		      void *kbuf, void __user *ubuf,
-		      const void *data, const int start_pos,
-		      const int end_pos)
+__copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, const void *data, const int start_pos, const int end_pos)
 {
 	if ((count == 0) || (pos < start_pos))
 		return 0;
@@ -1022,12 +1019,8 @@ __copy_xstate_to_user(unsigned int pos, unsigned int count,
 	if (end_pos < 0 || pos < end_pos) {
 		unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos));
 
-		if (kbuf) {
-			memcpy(kbuf + pos, data, copy);
-		} else {
-			if (__copy_to_user(ubuf + pos, data, copy))
-				return -EFAULT;
-		}
+		if (__copy_to_user(ubuf + pos, data, copy))
+			return -EFAULT;
 	}
 	return 0;
 }
@@ -1038,8 +1031,7 @@ __copy_xstate_to_user(unsigned int pos, unsigned int count,
  * zero. This is called from xstateregs_get() and there we check the CPU
  * has XSAVES.
  */
-int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf,
-			void __user *ubuf, struct xregs_state *xsave)
+int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, struct xregs_state *xsave)
 {
 	unsigned int offset, size;
 	int ret, i;
@@ -1064,8 +1056,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf,
 	offset = offsetof(struct xregs_state, header);
 	size = sizeof(header);
 
-	ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, &header, 0, count);
-
+	ret = __copy_xstate_to_user(offset, size, ubuf, &header, 0, count);
 	if (ret)
 		return ret;
 
@@ -1079,8 +1070,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf,
 			offset = xstate_offsets[i];
 			size = xstate_sizes[i];
 
-			ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, src, 0, count);
-
+			ret = __copy_xstate_to_user(offset, size, ubuf, src, 0, count);
 			if (ret)
 				return ret;
 
@@ -1096,8 +1086,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf,
 	offset = offsetof(struct fxregs_state, sw_reserved);
 	size = sizeof(xstate_fx_sw_bytes);
 
-	ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count);
-
+	ret = __copy_xstate_to_user(offset, size, ubuf, xstate_fx_sw_bytes, 0, count);
 	if (ret)
 		return ret;
 
-- 
cgit v1.2.3


From d7eda6c99cc75f1c41d67abf988f37a10045a370 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 14:59:48 +0200
Subject: x86/fpu: Clean up parameter order in the copy_xstate_to_*() APIs

Parameter ordering is weird:

  int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave);
  int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, struct xregs_state *xsave);

'pos' and 'count', which are attributes of the destination buffer, are listed before the destination
buffer itself ...

List them after the primary arguments instead.

This makes the code more similar to regular memcpy() variant APIs.

No change in functionality.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-6-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/xstate.h |  4 ++--
 arch/x86/kernel/fpu/regset.c      |  4 ++--
 arch/x86/kernel/fpu/xstate.c      | 25 ++++++++++++-------------
 3 files changed, 16 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 65bd68c30cd0..e4430b84939d 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -48,8 +48,8 @@ void fpu__xstate_clear_all_cpu_caps(void);
 void *get_xsave_addr(struct xregs_state *xsave, int xstate);
 const void *get_xsave_field_ptr(int xstate_field);
 int using_compacted_format(void);
-int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave);
-int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, struct xregs_state *xsave);
+int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int pos, unsigned int count);
+int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int pos, unsigned int count);
 int copy_user_to_xstate(const void *kbuf, const void __user *ubuf,
 		     struct xregs_state *xsave);
 #endif
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index fd6dbdd8fde6..ec1404194b65 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -93,9 +93,9 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
 
 	if (using_compacted_format()) {
 		if (kbuf)
-			ret = copy_xstate_to_kernel(pos, count, kbuf, xsave);
+			ret = copy_xstate_to_kernel(kbuf, xsave, pos, count);
 		else
-			ret = copy_xstate_to_user(pos, count, ubuf, xsave);
+			ret = copy_xstate_to_user(ubuf, xsave, pos, count);
 	} else {
 		fpstate_sanitize_xstate(fpu);
 		/*
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 2d8f3344875d..0a299468510f 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -925,10 +925,9 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
  * the source data pointer or increment pos, count, kbuf, and ubuf.
  */
 static inline int
-__copy_xstate_to_kernel(unsigned int pos, unsigned int count,
-			void *kbuf,
-			const void *data, const int start_pos,
-			const int end_pos)
+__copy_xstate_to_kernel(void *kbuf,
+			const void *data,
+			unsigned int pos, unsigned int count, const int start_pos, const int end_pos)
 {
 	if ((count == 0) || (pos < start_pos))
 		return 0;
@@ -948,7 +947,7 @@ __copy_xstate_to_kernel(unsigned int pos, unsigned int count,
  * It supports partial copy but pos always starts from zero. This is called
  * from xstateregs_get() and there we check the CPU has XSAVES.
  */
-int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave)
+int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int pos, unsigned int count)
 {
 	unsigned int offset, size;
 	int ret, i;
@@ -973,7 +972,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, stru
 	offset = offsetof(struct xregs_state, header);
 	size = sizeof(header);
 
-	ret = __copy_xstate_to_kernel(offset, size, kbuf, &header, 0, count);
+	ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, 0, count);
 	if (ret)
 		return ret;
 
@@ -987,7 +986,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, stru
 			offset = xstate_offsets[i];
 			size = xstate_sizes[i];
 
-			ret = __copy_xstate_to_kernel(offset, size, kbuf, src, 0, count);
+			ret = __copy_xstate_to_kernel(kbuf, src, offset, size, 0, count);
 			if (ret)
 				return ret;
 
@@ -1003,7 +1002,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, stru
 	offset = offsetof(struct fxregs_state, sw_reserved);
 	size = sizeof(xstate_fx_sw_bytes);
 
-	ret = __copy_xstate_to_kernel(offset, size, kbuf, xstate_fx_sw_bytes, 0, count);
+	ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, 0, count);
 	if (ret)
 		return ret;
 
@@ -1011,7 +1010,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, stru
 }
 
 static inline int
-__copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, const void *data, const int start_pos, const int end_pos)
+__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, const int start_pos, const int end_pos)
 {
 	if ((count == 0) || (pos < start_pos))
 		return 0;
@@ -1031,7 +1030,7 @@ __copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, c
  * zero. This is called from xstateregs_get() and there we check the CPU
  * has XSAVES.
  */
-int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, struct xregs_state *xsave)
+int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int pos, unsigned int count)
 {
 	unsigned int offset, size;
 	int ret, i;
@@ -1056,7 +1055,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf,
 	offset = offsetof(struct xregs_state, header);
 	size = sizeof(header);
 
-	ret = __copy_xstate_to_user(offset, size, ubuf, &header, 0, count);
+	ret = __copy_xstate_to_user(ubuf, &header, offset, size, 0, count);
 	if (ret)
 		return ret;
 
@@ -1070,7 +1069,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf,
 			offset = xstate_offsets[i];
 			size = xstate_sizes[i];
 
-			ret = __copy_xstate_to_user(offset, size, ubuf, src, 0, count);
+			ret = __copy_xstate_to_user(ubuf, src, offset, size, 0, count);
 			if (ret)
 				return ret;
 
@@ -1086,7 +1085,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf,
 	offset = offsetof(struct fxregs_state, sw_reserved);
 	size = sizeof(xstate_fx_sw_bytes);
 
-	ret = __copy_xstate_to_user(offset, size, ubuf, xstate_fx_sw_bytes, 0, count);
+	ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, 0, count);
 	if (ret)
 		return ret;
 
-- 
cgit v1.2.3


From becb2bb72ff906cc0d2bac3ee9574f694364823b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 14:59:49 +0200
Subject: x86/fpu: Clean up the parameter definitions of copy_xstate_to_*()

Remove pointless 'const' of non-pointer input parameter.

Remove unnecessary parenthesis that shows uncertainty about arithmetic operator precedence.

Clarify copy_xstate_to_user() description.

No change in functionality.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-7-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/xstate.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 0a299468510f..9647e7256179 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -927,13 +927,13 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 static inline int
 __copy_xstate_to_kernel(void *kbuf,
 			const void *data,
-			unsigned int pos, unsigned int count, const int start_pos, const int end_pos)
+			unsigned int pos, unsigned int count, int start_pos, int end_pos)
 {
 	if ((count == 0) || (pos < start_pos))
 		return 0;
 
 	if (end_pos < 0 || pos < end_pos) {
-		unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos));
+		unsigned int copy = end_pos < 0 ? count : min(count, end_pos - pos);
 
 		memcpy(kbuf + pos, data, copy);
 	}
@@ -1010,13 +1010,13 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po
 }
 
 static inline int
-__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, const int start_pos, const int end_pos)
+__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, int start_pos, int end_pos)
 {
 	if ((count == 0) || (pos < start_pos))
 		return 0;
 
 	if (end_pos < 0 || pos < end_pos) {
-		unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos));
+		unsigned int copy = end_pos < 0 ? count : min(count, end_pos - pos);
 
 		if (__copy_to_user(ubuf + pos, data, copy))
 			return -EFAULT;
@@ -1026,7 +1026,7 @@ __copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, uns
 
 /*
  * Convert from kernel XSAVES compacted format to standard format and copy
- * to a ptrace buffer. It supports partial copy but pos always starts from
+ * to a user-space buffer. It supports partial copy but pos always starts from
  * zero. This is called from xstateregs_get() and there we check the CPU
  * has XSAVES.
  */
-- 
cgit v1.2.3


From 8a5b731889cbf004b406d988dc591c8a7aac773e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 14:59:50 +0200
Subject: x86/fpu: Remove the 'start_pos' parameter from the
 __copy_xstate_to_*() functions

'start_pos' is always 0, so remove it and remove the pointless check of 'pos < 0'
which can not ever be true as 'pos' is unsigned ...

No change in functionality.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-8-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/xstate.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 9647e7256179..1f50fdaf4c5a 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -927,9 +927,9 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 static inline int
 __copy_xstate_to_kernel(void *kbuf,
 			const void *data,
-			unsigned int pos, unsigned int count, int start_pos, int end_pos)
+			unsigned int pos, unsigned int count, int end_pos)
 {
-	if ((count == 0) || (pos < start_pos))
+	if (!count)
 		return 0;
 
 	if (end_pos < 0 || pos < end_pos) {
@@ -972,7 +972,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po
 	offset = offsetof(struct xregs_state, header);
 	size = sizeof(header);
 
-	ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, 0, count);
+	ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, count);
 	if (ret)
 		return ret;
 
@@ -986,7 +986,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po
 			offset = xstate_offsets[i];
 			size = xstate_sizes[i];
 
-			ret = __copy_xstate_to_kernel(kbuf, src, offset, size, 0, count);
+			ret = __copy_xstate_to_kernel(kbuf, src, offset, size, count);
 			if (ret)
 				return ret;
 
@@ -1002,7 +1002,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po
 	offset = offsetof(struct fxregs_state, sw_reserved);
 	size = sizeof(xstate_fx_sw_bytes);
 
-	ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, 0, count);
+	ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, count);
 	if (ret)
 		return ret;
 
@@ -1010,9 +1010,9 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po
 }
 
 static inline int
-__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, int start_pos, int end_pos)
+__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, int end_pos)
 {
-	if ((count == 0) || (pos < start_pos))
+	if (!count)
 		return 0;
 
 	if (end_pos < 0 || pos < end_pos) {
@@ -1055,7 +1055,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i
 	offset = offsetof(struct xregs_state, header);
 	size = sizeof(header);
 
-	ret = __copy_xstate_to_user(ubuf, &header, offset, size, 0, count);
+	ret = __copy_xstate_to_user(ubuf, &header, offset, size, count);
 	if (ret)
 		return ret;
 
@@ -1069,7 +1069,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i
 			offset = xstate_offsets[i];
 			size = xstate_sizes[i];
 
-			ret = __copy_xstate_to_user(ubuf, src, offset, size, 0, count);
+			ret = __copy_xstate_to_user(ubuf, src, offset, size, count);
 			if (ret)
 				return ret;
 
@@ -1085,7 +1085,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i
 	offset = offsetof(struct fxregs_state, sw_reserved);
 	size = sizeof(xstate_fx_sw_bytes);
 
-	ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, 0, count);
+	ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, count);
 	if (ret)
 		return ret;
 
-- 
cgit v1.2.3


From 56583c9a1400fe1935edd55b24b4fbbc779b59cb Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 14:59:51 +0200
Subject: x86/fpu: Clarify parameter names in the copy_xstate_to_*() methods

Right now there's a confusing mixture of 'offset' and 'size' parameters:

 - __copy_xstate_to_*() input parameter 'end_pos' not not really an offset,
   but the full size of the copy to be performed.

 - input parameter 'count' to copy_xstate_to_*() shadows that of
   __copy_xstate_to_*()'s 'count' parameter name - but the roles
   are different: the first one is the total number of bytes to
   be copied, while the second one is a partial copy size.

To unconfuse all this, use a consistent set of parameter names:

 - 'size' is the partial copy size within a single xstate component
 - 'size_total' is the total copy requested
 - 'offset_start' is the requested starting offset.
 - 'offset' is the offset within an xstate component.

No change in functionality.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-9-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/xstate.h |  4 ++--
 arch/x86/kernel/fpu/xstate.c      | 44 +++++++++++++++++++--------------------
 2 files changed, 24 insertions(+), 24 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index e4430b84939d..fed6617a1079 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -48,8 +48,8 @@ void fpu__xstate_clear_all_cpu_caps(void);
 void *get_xsave_addr(struct xregs_state *xsave, int xstate);
 const void *get_xsave_field_ptr(int xstate_field);
 int using_compacted_format(void);
-int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int pos, unsigned int count);
-int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int pos, unsigned int count);
+int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size);
+int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size);
 int copy_user_to_xstate(const void *kbuf, const void __user *ubuf,
 		     struct xregs_state *xsave);
 #endif
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 1f50fdaf4c5a..c13083579655 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -927,15 +927,15 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 static inline int
 __copy_xstate_to_kernel(void *kbuf,
 			const void *data,
-			unsigned int pos, unsigned int count, int end_pos)
+			unsigned int offset, unsigned int size, int size_total)
 {
-	if (!count)
+	if (!size)
 		return 0;
 
-	if (end_pos < 0 || pos < end_pos) {
-		unsigned int copy = end_pos < 0 ? count : min(count, end_pos - pos);
+	if (size_total < 0 || offset < size_total) {
+		unsigned int copy = size_total < 0 ? size : min(size, size_total - offset);
 
-		memcpy(kbuf + pos, data, copy);
+		memcpy(kbuf + offset, data, copy);
 	}
 	return 0;
 }
@@ -947,7 +947,7 @@ __copy_xstate_to_kernel(void *kbuf,
  * It supports partial copy but pos always starts from zero. This is called
  * from xstateregs_get() and there we check the CPU has XSAVES.
  */
-int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int pos, unsigned int count)
+int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total)
 {
 	unsigned int offset, size;
 	int ret, i;
@@ -956,7 +956,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po
 	/*
 	 * Currently copy_regset_to_user() starts from pos 0:
 	 */
-	if (unlikely(pos != 0))
+	if (unlikely(offset_start != 0))
 		return -EFAULT;
 
 	/*
@@ -972,7 +972,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po
 	offset = offsetof(struct xregs_state, header);
 	size = sizeof(header);
 
-	ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, count);
+	ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, size_total);
 	if (ret)
 		return ret;
 
@@ -986,11 +986,11 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po
 			offset = xstate_offsets[i];
 			size = xstate_sizes[i];
 
-			ret = __copy_xstate_to_kernel(kbuf, src, offset, size, count);
+			ret = __copy_xstate_to_kernel(kbuf, src, offset, size, size_total);
 			if (ret)
 				return ret;
 
-			if (offset + size >= count)
+			if (offset + size >= size_total)
 				break;
 		}
 
@@ -1002,7 +1002,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po
 	offset = offsetof(struct fxregs_state, sw_reserved);
 	size = sizeof(xstate_fx_sw_bytes);
 
-	ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, count);
+	ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total);
 	if (ret)
 		return ret;
 
@@ -1010,15 +1010,15 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po
 }
 
 static inline int
-__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, int end_pos)
+__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int offset, unsigned int size, int size_total)
 {
-	if (!count)
+	if (!size)
 		return 0;
 
-	if (end_pos < 0 || pos < end_pos) {
-		unsigned int copy = end_pos < 0 ? count : min(count, end_pos - pos);
+	if (size_total < 0 || offset < size_total) {
+		unsigned int copy = size_total < 0 ? size : min(size, size_total - offset);
 
-		if (__copy_to_user(ubuf + pos, data, copy))
+		if (__copy_to_user(ubuf + offset, data, copy))
 			return -EFAULT;
 	}
 	return 0;
@@ -1030,7 +1030,7 @@ __copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, uns
  * zero. This is called from xstateregs_get() and there we check the CPU
  * has XSAVES.
  */
-int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int pos, unsigned int count)
+int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total)
 {
 	unsigned int offset, size;
 	int ret, i;
@@ -1039,7 +1039,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i
 	/*
 	 * Currently copy_regset_to_user() starts from pos 0:
 	 */
-	if (unlikely(pos != 0))
+	if (unlikely(offset_start != 0))
 		return -EFAULT;
 
 	/*
@@ -1055,7 +1055,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i
 	offset = offsetof(struct xregs_state, header);
 	size = sizeof(header);
 
-	ret = __copy_xstate_to_user(ubuf, &header, offset, size, count);
+	ret = __copy_xstate_to_user(ubuf, &header, offset, size, size_total);
 	if (ret)
 		return ret;
 
@@ -1069,11 +1069,11 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i
 			offset = xstate_offsets[i];
 			size = xstate_sizes[i];
 
-			ret = __copy_xstate_to_user(ubuf, src, offset, size, count);
+			ret = __copy_xstate_to_user(ubuf, src, offset, size, size_total);
 			if (ret)
 				return ret;
 
-			if (offset + size >= count)
+			if (offset + size >= size_total)
 				break;
 		}
 
@@ -1085,7 +1085,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i
 	offset = offsetof(struct fxregs_state, sw_reserved);
 	size = sizeof(xstate_fx_sw_bytes);
 
-	ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, count);
+	ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, size_total);
 	if (ret)
 		return ret;
 
-- 
cgit v1.2.3


From 6ff15f8db7eaf29ef5ead6afbec9b25485fe8703 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 14:59:52 +0200
Subject: x86/fpu: Change 'size_total' parameter to unsigned and standardize
 the size checks in copy_xstate_to_*()

'size_total' is derived from an unsigned input parameter - and then converted
to 'int' and checked for negative ranges:

	if (size_total < 0 || offset < size_total) {

This conversion and the checks are unnecessary obfuscation, reject overly
large requested copy sizes outright and simplify the underlying code.

Reported-by: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-10-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/xstate.c | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index c13083579655..b18c5457065a 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -925,15 +925,11 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
  * the source data pointer or increment pos, count, kbuf, and ubuf.
  */
 static inline int
-__copy_xstate_to_kernel(void *kbuf,
-			const void *data,
-			unsigned int offset, unsigned int size, int size_total)
+__copy_xstate_to_kernel(void *kbuf, const void *data,
+			unsigned int offset, unsigned int size, unsigned int size_total)
 {
-	if (!size)
-		return 0;
-
-	if (size_total < 0 || offset < size_total) {
-		unsigned int copy = size_total < 0 ? size : min(size, size_total - offset);
+	if (offset < size_total) {
+		unsigned int copy = min(size, size_total - offset);
 
 		memcpy(kbuf + offset, data, copy);
 	}
@@ -986,12 +982,13 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of
 			offset = xstate_offsets[i];
 			size = xstate_sizes[i];
 
+			/* The next component has to fit fully into the output buffer: */
+			if (offset + size > size_total)
+				break;
+
 			ret = __copy_xstate_to_kernel(kbuf, src, offset, size, size_total);
 			if (ret)
 				return ret;
-
-			if (offset + size >= size_total)
-				break;
 		}
 
 	}
@@ -1010,13 +1007,13 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of
 }
 
 static inline int
-__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int offset, unsigned int size, int size_total)
+__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int offset, unsigned int size, unsigned int size_total)
 {
 	if (!size)
 		return 0;
 
-	if (size_total < 0 || offset < size_total) {
-		unsigned int copy = size_total < 0 ? size : min(size, size_total - offset);
+	if (offset < size_total) {
+		unsigned int copy = min(size, size_total - offset);
 
 		if (__copy_to_user(ubuf + offset, data, copy))
 			return -EFAULT;
@@ -1069,12 +1066,13 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i
 			offset = xstate_offsets[i];
 			size = xstate_sizes[i];
 
+			/* The next component has to fit fully into the output buffer: */
+			if (offset + size > size_total)
+				break;
+
 			ret = __copy_xstate_to_user(ubuf, src, offset, size, size_total);
 			if (ret)
 				return ret;
-
-			if (offset + size >= size_total)
-				break;
 		}
 
 	}
-- 
cgit v1.2.3


From 8c0817f4a3188ac5485ce14f96f12a175800b881 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 14:59:53 +0200
Subject: x86/fpu: Simplify __copy_xstate_to_kernel() return values

__copy_xstate_to_kernel() can only return 0 (because kernel copies cannot fail),
simplify the code throughout.

No change in functionality.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-11-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/xstate.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index b18c5457065a..00c3b41c3cf1 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -924,7 +924,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
  * This is similar to user_regset_copyout(), but will not add offset to
  * the source data pointer or increment pos, count, kbuf, and ubuf.
  */
-static inline int
+static inline void
 __copy_xstate_to_kernel(void *kbuf, const void *data,
 			unsigned int offset, unsigned int size, unsigned int size_total)
 {
@@ -933,7 +933,6 @@ __copy_xstate_to_kernel(void *kbuf, const void *data,
 
 		memcpy(kbuf + offset, data, copy);
 	}
-	return 0;
 }
 
 /*
@@ -946,8 +945,8 @@ __copy_xstate_to_kernel(void *kbuf, const void *data,
 int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total)
 {
 	unsigned int offset, size;
-	int ret, i;
 	struct xstate_header header;
+	int i;
 
 	/*
 	 * Currently copy_regset_to_user() starts from pos 0:
@@ -968,9 +967,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of
 	offset = offsetof(struct xregs_state, header);
 	size = sizeof(header);
 
-	ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, size_total);
-	if (ret)
-		return ret;
+	__copy_xstate_to_kernel(kbuf, &header, offset, size, size_total);
 
 	for (i = 0; i < XFEATURE_MAX; i++) {
 		/*
@@ -986,9 +983,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of
 			if (offset + size > size_total)
 				break;
 
-			ret = __copy_xstate_to_kernel(kbuf, src, offset, size, size_total);
-			if (ret)
-				return ret;
+			__copy_xstate_to_kernel(kbuf, src, offset, size, size_total);
 		}
 
 	}
@@ -999,9 +994,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of
 	offset = offsetof(struct fxregs_state, sw_reserved);
 	size = sizeof(xstate_fx_sw_bytes);
 
-	ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total);
-	if (ret)
-		return ret;
+	__copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 79fecc2b7506f29fb91becc65e8788e5ae7eba9f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 14:59:54 +0200
Subject: x86/fpu: Split copy_user_to_xstate() into copy_kernel_to_xstate() &
 copy_user_to_xstate()

Similar to:

  x86/fpu: Split copy_xstate_to_user() into copy_xstate_to_kernel() & copy_xstate_to_user()

No change in functionality.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-12-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/xstate.h |  4 +--
 arch/x86/kernel/fpu/regset.c      | 10 ++++--
 arch/x86/kernel/fpu/xstate.c      | 66 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 74 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index fed6617a1079..79af79dbcab6 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -50,6 +50,6 @@ const void *get_xsave_field_ptr(int xstate_field);
 int using_compacted_format(void);
 int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size);
 int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size);
-int copy_user_to_xstate(const void *kbuf, const void __user *ubuf,
-		     struct xregs_state *xsave);
+int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave);
+int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave);
 #endif
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index ec1404194b65..cb45dd81d617 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -134,10 +134,14 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
 
 	fpu__activate_fpstate_write(fpu);
 
-	if (boot_cpu_has(X86_FEATURE_XSAVES))
-		ret = copy_user_to_xstate(kbuf, ubuf, xsave);
-	else
+	if (boot_cpu_has(X86_FEATURE_XSAVES)) {
+		if (kbuf)
+			ret = copy_kernel_to_xstate(kbuf, ubuf, xsave);
+		else
+			ret = copy_user_to_xstate(kbuf, ubuf, xsave);
+	} else {
 		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
+	}
 
 	/*
 	 * In case of failure, mark all states as init:
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 00c3b41c3cf1..1ad25d1b8056 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1084,7 +1084,71 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i
 }
 
 /*
- * Convert from a ptrace standard-format buffer to kernel XSAVES format
+ * Convert from a ptrace standard-format kernel buffer to kernel XSAVES format
+ * and copy to the target thread. This is called from xstateregs_set() and
+ * there we check the CPU has XSAVES and a whole standard-sized buffer
+ * exists.
+ */
+int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf,
+		     struct xregs_state *xsave)
+{
+	unsigned int offset, size;
+	int i;
+	u64 xfeatures;
+	u64 allowed_features;
+
+	offset = offsetof(struct xregs_state, header);
+	size = sizeof(xfeatures);
+
+	if (kbuf) {
+		memcpy(&xfeatures, kbuf + offset, size);
+	} else {
+		if (__copy_from_user(&xfeatures, ubuf + offset, size))
+			return -EFAULT;
+	}
+
+	/*
+	 * Reject if the user sets any disabled or supervisor features:
+	 */
+	allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR;
+
+	if (xfeatures & ~allowed_features)
+		return -EINVAL;
+
+	for (i = 0; i < XFEATURE_MAX; i++) {
+		u64 mask = ((u64)1 << i);
+
+		if (xfeatures & mask) {
+			void *dst = __raw_xsave_addr(xsave, 1 << i);
+
+			offset = xstate_offsets[i];
+			size = xstate_sizes[i];
+
+			if (kbuf) {
+				memcpy(dst, kbuf + offset, size);
+			} else {
+				if (__copy_from_user(dst, ubuf + offset, size))
+					return -EFAULT;
+			}
+		}
+	}
+
+	/*
+	 * The state that came in from userspace was user-state only.
+	 * Mask all the user states out of 'xfeatures':
+	 */
+	xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR;
+
+	/*
+	 * Add back in the features that came in from userspace:
+	 */
+	xsave->header.xfeatures |= xfeatures;
+
+	return 0;
+}
+
+/*
+ * Convert from a ptrace standard-format user-space buffer to kernel XSAVES format
  * and copy to the target thread. This is called from xstateregs_set() and
  * there we check the CPU has XSAVES and a whole standard-sized buffer
  * exists.
-- 
cgit v1.2.3


From 59dffa4edba1f15b2bfdbe608aca1efe664c674c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 14:59:55 +0200
Subject: x86/fpu: Remove 'ubuf' parameter from the copy_kernel_to_xstate() API

No change in functionality.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-13-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/xstate.h |  2 +-
 arch/x86/kernel/fpu/regset.c      |  2 +-
 arch/x86/kernel/fpu/xstate.c      | 17 +++--------------
 3 files changed, 5 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 79af79dbcab6..f10889bc0c88 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -50,6 +50,6 @@ const void *get_xsave_field_ptr(int xstate_field);
 int using_compacted_format(void);
 int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size);
 int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size);
-int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave);
+int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave);
 int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave);
 #endif
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index cb45dd81d617..785302c75f38 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -136,7 +136,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
 
 	if (boot_cpu_has(X86_FEATURE_XSAVES)) {
 		if (kbuf)
-			ret = copy_kernel_to_xstate(kbuf, ubuf, xsave);
+			ret = copy_kernel_to_xstate(kbuf, xsave);
 		else
 			ret = copy_user_to_xstate(kbuf, ubuf, xsave);
 	} else {
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 1ad25d1b8056..71cc8d367fdd 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1089,8 +1089,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i
  * there we check the CPU has XSAVES and a whole standard-sized buffer
  * exists.
  */
-int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf,
-		     struct xregs_state *xsave)
+int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave)
 {
 	unsigned int offset, size;
 	int i;
@@ -1100,12 +1099,7 @@ int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf,
 	offset = offsetof(struct xregs_state, header);
 	size = sizeof(xfeatures);
 
-	if (kbuf) {
-		memcpy(&xfeatures, kbuf + offset, size);
-	} else {
-		if (__copy_from_user(&xfeatures, ubuf + offset, size))
-			return -EFAULT;
-	}
+	memcpy(&xfeatures, kbuf + offset, size);
 
 	/*
 	 * Reject if the user sets any disabled or supervisor features:
@@ -1124,12 +1118,7 @@ int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf,
 			offset = xstate_offsets[i];
 			size = xstate_sizes[i];
 
-			if (kbuf) {
-				memcpy(dst, kbuf + offset, size);
-			} else {
-				if (__copy_from_user(dst, ubuf + offset, size))
-					return -EFAULT;
-			}
+			memcpy(dst, kbuf + offset, size);
 		}
 	}
 
-- 
cgit v1.2.3


From 7b9094c688f807c110a2dab6f6edc5876bfa7b0b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 14:59:56 +0200
Subject: x86/fpu: Remove 'kbuf' parameter from the copy_user_to_xstate() API

No change in functionality.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-14-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/xstate.h |  2 +-
 arch/x86/kernel/fpu/regset.c      |  2 +-
 arch/x86/kernel/fpu/signal.c      | 11 ++++-------
 arch/x86/kernel/fpu/xstate.c      | 19 +++++--------------
 4 files changed, 11 insertions(+), 23 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index f10889bc0c88..4ceb90740d80 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -51,5 +51,5 @@ int using_compacted_format(void);
 int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size);
 int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size);
 int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave);
-int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave);
+int copy_user_to_xstate(const void __user *ubuf, struct xregs_state *xsave);
 #endif
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 785302c75f38..caf723f31737 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -138,7 +138,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
 		if (kbuf)
 			ret = copy_kernel_to_xstate(kbuf, xsave);
 		else
-			ret = copy_user_to_xstate(kbuf, ubuf, xsave);
+			ret = copy_user_to_xstate(ubuf, xsave);
 	} else {
 		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
 	}
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index b1fe9a1fc4e0..2c685b492fd6 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -323,13 +323,10 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 		 */
 		fpu__drop(fpu);
 
-		if (using_compacted_format()) {
-			err = copy_user_to_xstate(NULL, buf_fx,
-					       &fpu->state.xsave);
-		} else {
-			err = __copy_from_user(&fpu->state.xsave,
-					       buf_fx, state_size);
-		}
+		if (using_compacted_format())
+			err = copy_user_to_xstate(buf_fx, &fpu->state.xsave);
+		else
+			err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size);
 
 		if (err || __copy_from_user(&env, buf, sizeof(env))) {
 			fpstate_init(&fpu->state);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 71cc8d367fdd..b1f3e4dae2e3 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1142,8 +1142,7 @@ int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave)
  * there we check the CPU has XSAVES and a whole standard-sized buffer
  * exists.
  */
-int copy_user_to_xstate(const void *kbuf, const void __user *ubuf,
-		     struct xregs_state *xsave)
+int copy_user_to_xstate(const void __user *ubuf, struct xregs_state *xsave)
 {
 	unsigned int offset, size;
 	int i;
@@ -1153,12 +1152,8 @@ int copy_user_to_xstate(const void *kbuf, const void __user *ubuf,
 	offset = offsetof(struct xregs_state, header);
 	size = sizeof(xfeatures);
 
-	if (kbuf) {
-		memcpy(&xfeatures, kbuf + offset, size);
-	} else {
-		if (__copy_from_user(&xfeatures, ubuf + offset, size))
-			return -EFAULT;
-	}
+	if (__copy_from_user(&xfeatures, ubuf + offset, size))
+		return -EFAULT;
 
 	/*
 	 * Reject if the user sets any disabled or supervisor features:
@@ -1177,12 +1172,8 @@ int copy_user_to_xstate(const void *kbuf, const void __user *ubuf,
 			offset = xstate_offsets[i];
 			size = xstate_sizes[i];
 
-			if (kbuf) {
-				memcpy(dst, kbuf + offset, size);
-			} else {
-				if (__copy_from_user(dst, ubuf + offset, size))
-					return -EFAULT;
-			}
+			if (__copy_from_user(dst, ubuf + offset, size))
+				return -EFAULT;
 		}
 	}
 
-- 
cgit v1.2.3


From 6d7f7da5533a3f841eeb1d9657257c9367924274 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 14:59:57 +0200
Subject: x86/fpu: Flip the parameter order in copy_*_to_xstate()

Make it more consistent with regular memcpy() semantics, where the destination
argument comes first.

No change in functionality.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-15-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/xstate.h | 4 ++--
 arch/x86/kernel/fpu/regset.c      | 4 ++--
 arch/x86/kernel/fpu/signal.c      | 2 +-
 arch/x86/kernel/fpu/xstate.c      | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 4ceb90740d80..579ac2358e63 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -50,6 +50,6 @@ const void *get_xsave_field_ptr(int xstate_field);
 int using_compacted_format(void);
 int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size);
 int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size);
-int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave);
-int copy_user_to_xstate(const void __user *ubuf, struct xregs_state *xsave);
+int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf);
+int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf);
 #endif
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index caf723f31737..19a7385a912c 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -136,9 +136,9 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
 
 	if (boot_cpu_has(X86_FEATURE_XSAVES)) {
 		if (kbuf)
-			ret = copy_kernel_to_xstate(kbuf, xsave);
+			ret = copy_kernel_to_xstate(xsave, kbuf);
 		else
-			ret = copy_user_to_xstate(ubuf, xsave);
+			ret = copy_user_to_xstate(xsave, ubuf);
 	} else {
 		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
 	}
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 2c685b492fd6..2d682dac35d4 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -324,7 +324,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 		fpu__drop(fpu);
 
 		if (using_compacted_format())
-			err = copy_user_to_xstate(buf_fx, &fpu->state.xsave);
+			err = copy_user_to_xstate(&fpu->state.xsave, buf_fx);
 		else
 			err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size);
 
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index b1f3e4dae2e3..0ef35040d0ad 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1089,7 +1089,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i
  * there we check the CPU has XSAVES and a whole standard-sized buffer
  * exists.
  */
-int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave)
+int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
 {
 	unsigned int offset, size;
 	int i;
@@ -1142,7 +1142,7 @@ int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave)
  * there we check the CPU has XSAVES and a whole standard-sized buffer
  * exists.
  */
-int copy_user_to_xstate(const void __user *ubuf, struct xregs_state *xsave)
+int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
 {
 	unsigned int offset, size;
 	int i;
-- 
cgit v1.2.3


From b3a163081c28d1a4d1ad76259a9d93b34a82f1da Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 14:59:58 +0200
Subject: x86/fpu: Simplify fpu->fpregs_active use

The fpregs_active() inline function is pretty pointless - in almost
all the callsites it can be replaced with a direct fpu->fpregs_active
access.

Do so and eliminate the extra layer of obfuscation.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-16-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/internal.h | 17 +----------------
 arch/x86/kernel/fpu/core.c          |  2 +-
 arch/x86/kernel/fpu/signal.c        |  9 +++++----
 arch/x86/mm/pkeys.c                 |  2 +-
 4 files changed, 8 insertions(+), 22 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 554cdb205d17..b223c57dd5dc 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -542,21 +542,6 @@ static inline void fpregs_activate(struct fpu *fpu)
 	trace_x86_fpu_regs_activated(fpu);
 }
 
-/*
- * The question "does this thread have fpu access?"
- * is slightly racy, since preemption could come in
- * and revoke it immediately after the test.
- *
- * However, even in that very unlikely scenario,
- * we can just assume we have FPU access - typically
- * to save the FP state - we'll just take a #NM
- * fault and get the FPU access back.
- */
-static inline int fpregs_active(void)
-{
-	return current->thread.fpu.fpregs_active;
-}
-
 /*
  * FPU state switching for scheduling.
  *
@@ -617,7 +602,7 @@ static inline void user_fpu_begin(void)
 	struct fpu *fpu = &current->thread.fpu;
 
 	preempt_disable();
-	if (!fpregs_active())
+	if (!fpu->fpregs_active)
 		fpregs_activate(fpu);
 	preempt_enable();
 }
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index e1114f070c2d..bad57248e5a0 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -367,7 +367,7 @@ void fpu__current_fpstate_write_end(void)
 	 * registers may still be out of date.  Update them with
 	 * an XRSTOR if they are active.
 	 */
-	if (fpregs_active())
+	if (fpu->fpregs_active)
 		copy_kernel_to_fpregs(&fpu->state);
 
 	/*
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 2d682dac35d4..684025654d0c 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -155,7 +155,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
  */
 int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 {
-	struct xregs_state *xsave = &current->thread.fpu.state.xsave;
+	struct fpu *fpu = &current->thread.fpu;
+	struct xregs_state *xsave = &fpu->state.xsave;
 	struct task_struct *tsk = current;
 	int ia32_fxstate = (buf != buf_fx);
 
@@ -170,13 +171,13 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 			sizeof(struct user_i387_ia32_struct), NULL,
 			(struct _fpstate_32 __user *) buf) ? -1 : 1;
 
-	if (fpregs_active() || using_compacted_format()) {
+	if (fpu->fpregs_active || using_compacted_format()) {
 		/* Save the live register state to the user directly. */
 		if (copy_fpregs_to_sigframe(buf_fx))
 			return -1;
 		/* Update the thread's fxstate to save the fsave header. */
 		if (ia32_fxstate)
-			copy_fxregs_to_kernel(&tsk->thread.fpu);
+			copy_fxregs_to_kernel(fpu);
 	} else {
 		/*
 		 * It is a *bug* if kernel uses compacted-format for xsave
@@ -189,7 +190,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 			return -1;
 		}
 
-		fpstate_sanitize_xstate(&tsk->thread.fpu);
+		fpstate_sanitize_xstate(fpu);
 		if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size))
 			return -1;
 	}
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index 2dab69a706ec..e2c23472233e 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -45,7 +45,7 @@ int __execute_only_pkey(struct mm_struct *mm)
 	 */
 	preempt_disable();
 	if (!need_to_set_mm_pkey &&
-	    fpregs_active() &&
+	    current->thread.fpu.fpregs_active &&
 	    !__pkru_allows_read(read_pkru(), execute_only_pkey)) {
 		preempt_enable();
 		return execute_only_pkey;
-- 
cgit v1.2.3


From a10b6a16cdad88170f546d008c77453cddf918e6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 14:59:59 +0200
Subject: x86/fpu: Make the fpu state change in fpu__clear() scheduler-atomic

Do this temporarily only, to make it easier to change the FPU state machine,
in particular this change couples the fpu->fpregs_active and fpu->fpstate_active
states: they are only set/cleared together (as far as the scheduler sees them).

This will be removed by later patches.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-17-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/core.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index bad57248e5a0..b7dc3833d41a 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -462,9 +462,11 @@ void fpu__clear(struct fpu *fpu)
 	 * Make sure fpstate is cleared and initialized.
 	 */
 	if (static_cpu_has(X86_FEATURE_FPU)) {
+		preempt_disable();
 		fpu__activate_curr(fpu);
 		user_fpu_begin();
 		copy_init_fpstate_to_fpregs();
+		preempt_enable();
 	}
 }
 
-- 
cgit v1.2.3


From b6aa85558d7e7b18fc3470d2bc1731d2205dd275 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 15:00:00 +0200
Subject: x86/fpu: Split the state handling in fpu__drop()

Prepare fpu__drop() to use fpu->fpregs_active.

There are two distinct usecases for fpu__drop() in this context:
exit_thread() when called for 'current' in exit(), and when called
for another task in fork().

This patch does not change behavior, it only adds a couple of
debug checks and structures the code to make the ->fpregs_active
change more obviously correct.

All the complications will be removed later on.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-18-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/core.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index b7dc3833d41a..815dfba7781a 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -414,12 +414,19 @@ void fpu__drop(struct fpu *fpu)
 {
 	preempt_disable();
 
-	if (fpu->fpregs_active) {
-		/* Ignore delayed exceptions from user space */
-		asm volatile("1: fwait\n"
-			     "2:\n"
-			     _ASM_EXTABLE(1b, 2b));
-		fpregs_deactivate(fpu);
+	if (fpu == &current->thread.fpu) {
+		WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active);
+
+		if (fpu->fpregs_active) {
+			/* Ignore delayed exceptions from user space */
+			asm volatile("1: fwait\n"
+				     "2:\n"
+				     _ASM_EXTABLE(1b, 2b));
+			if (fpu->fpregs_active)
+				fpregs_deactivate(fpu);
+		}
+	} else {
+		WARN_ON_FPU(fpu->fpregs_active);
 	}
 
 	fpu->fpstate_active = 0;
-- 
cgit v1.2.3


From f1c8cd0176078c7bcafdc89cac447cab672a0b5e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 15:00:01 +0200
Subject: x86/fpu: Change fpu->fpregs_active users to fpu->fpstate_active

We want to simplify the FPU state machine by eliminating fpu->fpregs_active,
and we can do that because the two state flags (::fpregs_active and
::fpstate_active) are set essentially together.

The old lazy FPU switching code used to make a distinction - but there's
no lazy switching code anymore, we always switch in an 'eager' fashion.

Do this by first changing all substantial uses of fpu->fpregs_active
to fpu->fpstate_active and adding a few debug checks to double check
our assumption is correct.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-19-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/internal.h |  4 +++-
 arch/x86/kernel/fpu/core.c          | 16 ++++++++++------
 arch/x86/kernel/fpu/signal.c        |  4 +++-
 arch/x86/mm/pkeys.c                 |  3 +--
 4 files changed, 17 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index b223c57dd5dc..7fa676f93ac1 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -556,7 +556,9 @@ static inline void fpregs_activate(struct fpu *fpu)
 static inline void
 switch_fpu_prepare(struct fpu *old_fpu, int cpu)
 {
-	if (old_fpu->fpregs_active) {
+	WARN_ON_FPU(old_fpu->fpregs_active != old_fpu->fpstate_active);
+
+	if (old_fpu->fpstate_active) {
 		if (!copy_fpregs_to_fpstate(old_fpu))
 			old_fpu->last_cpu = -1;
 		else
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 815dfba7781a..eab244622402 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -100,7 +100,7 @@ void __kernel_fpu_begin(void)
 
 	kernel_fpu_disable();
 
-	if (fpu->fpregs_active) {
+	if (fpu->fpstate_active) {
 		/*
 		 * Ignore return value -- we don't care if reg state
 		 * is clobbered.
@@ -116,7 +116,7 @@ void __kernel_fpu_end(void)
 {
 	struct fpu *fpu = &current->thread.fpu;
 
-	if (fpu->fpregs_active)
+	if (fpu->fpstate_active)
 		copy_kernel_to_fpregs(&fpu->state);
 
 	kernel_fpu_enable();
@@ -147,8 +147,10 @@ void fpu__save(struct fpu *fpu)
 	WARN_ON_FPU(fpu != &current->thread.fpu);
 
 	preempt_disable();
+	WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active);
+
 	trace_x86_fpu_before_save(fpu);
-	if (fpu->fpregs_active) {
+	if (fpu->fpstate_active) {
 		if (!copy_fpregs_to_fpstate(fpu)) {
 			copy_kernel_to_fpregs(&fpu->state);
 		}
@@ -262,11 +264,12 @@ EXPORT_SYMBOL_GPL(fpu__activate_curr);
  */
 void fpu__activate_fpstate_read(struct fpu *fpu)
 {
+	WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active);
 	/*
 	 * If fpregs are active (in the current CPU), then
 	 * copy them to the fpstate:
 	 */
-	if (fpu->fpregs_active) {
+	if (fpu->fpstate_active) {
 		fpu__save(fpu);
 	} else {
 		if (!fpu->fpstate_active) {
@@ -362,12 +365,13 @@ void fpu__current_fpstate_write_end(void)
 {
 	struct fpu *fpu = &current->thread.fpu;
 
+	WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active);
 	/*
 	 * 'fpu' now has an updated copy of the state, but the
 	 * registers may still be out of date.  Update them with
 	 * an XRSTOR if they are active.
 	 */
-	if (fpu->fpregs_active)
+	if (fpu->fpstate_active)
 		copy_kernel_to_fpregs(&fpu->state);
 
 	/*
@@ -417,7 +421,7 @@ void fpu__drop(struct fpu *fpu)
 	if (fpu == &current->thread.fpu) {
 		WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active);
 
-		if (fpu->fpregs_active) {
+		if (fpu->fpstate_active) {
 			/* Ignore delayed exceptions from user space */
 			asm volatile("1: fwait\n"
 				     "2:\n"
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 684025654d0c..a88083ba7f8b 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -171,7 +171,9 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 			sizeof(struct user_i387_ia32_struct), NULL,
 			(struct _fpstate_32 __user *) buf) ? -1 : 1;
 
-	if (fpu->fpregs_active || using_compacted_format()) {
+	WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active);
+
+	if (fpu->fpstate_active || using_compacted_format()) {
 		/* Save the live register state to the user directly. */
 		if (copy_fpregs_to_sigframe(buf_fx))
 			return -1;
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index e2c23472233e..4d24269c071f 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -18,7 +18,6 @@
 
 #include <asm/cpufeature.h>             /* boot_cpu_has, ...            */
 #include <asm/mmu_context.h>            /* vma_pkey()                   */
-#include <asm/fpu/internal.h>           /* fpregs_active()              */
 
 int __execute_only_pkey(struct mm_struct *mm)
 {
@@ -45,7 +44,7 @@ int __execute_only_pkey(struct mm_struct *mm)
 	 */
 	preempt_disable();
 	if (!need_to_set_mm_pkey &&
-	    current->thread.fpu.fpregs_active &&
+	    current->thread.fpu.fpstate_active &&
 	    !__pkru_allows_read(read_pkru(), execute_only_pkey)) {
 		preempt_enable();
 		return execute_only_pkey;
-- 
cgit v1.2.3


From 6cf4edbe0526db311a28734609da888fdfcb3604 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 15:00:02 +0200
Subject: x86/fpu: Decouple fpregs_activate()/fpregs_deactivate() from
 fpu->fpregs_active

The fpregs_activate()/fpregs_deactivate() are currently called in such a pattern:

	if (!fpu->fpregs_active)
		fpregs_activate(fpu);

	...

	if (fpu->fpregs_active)
		fpregs_deactivate(fpu);

But note that it's actually safe to call them without checking the flag first.

This further decouples the fpu->fpregs_active flag from actual FPU logic.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-20-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/internal.h | 7 +------
 arch/x86/kernel/fpu/core.c          | 3 +--
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 7fa676f93ac1..42a601673c09 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -526,8 +526,6 @@ static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
  */
 static inline void fpregs_deactivate(struct fpu *fpu)
 {
-	WARN_ON_FPU(!fpu->fpregs_active);
-
 	fpu->fpregs_active = 0;
 	this_cpu_write(fpu_fpregs_owner_ctx, NULL);
 	trace_x86_fpu_regs_deactivated(fpu);
@@ -535,8 +533,6 @@ static inline void fpregs_deactivate(struct fpu *fpu)
 
 static inline void fpregs_activate(struct fpu *fpu)
 {
-	WARN_ON_FPU(fpu->fpregs_active);
-
 	fpu->fpregs_active = 1;
 	this_cpu_write(fpu_fpregs_owner_ctx, fpu);
 	trace_x86_fpu_regs_activated(fpu);
@@ -604,8 +600,7 @@ static inline void user_fpu_begin(void)
 	struct fpu *fpu = &current->thread.fpu;
 
 	preempt_disable();
-	if (!fpu->fpregs_active)
-		fpregs_activate(fpu);
+	fpregs_activate(fpu);
 	preempt_enable();
 }
 
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index eab244622402..01a47e9edfb4 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -426,8 +426,7 @@ void fpu__drop(struct fpu *fpu)
 			asm volatile("1: fwait\n"
 				     "2:\n"
 				     _ASM_EXTABLE(1b, 2b));
-			if (fpu->fpregs_active)
-				fpregs_deactivate(fpu);
+			fpregs_deactivate(fpu);
 		}
 	} else {
 		WARN_ON_FPU(fpu->fpregs_active);
-- 
cgit v1.2.3


From 99dc26bda233ee722bbd370bddf20beece3ffb93 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 15:00:03 +0200
Subject: x86/fpu: Remove struct fpu::fpregs_active

The previous changes paved the way for the removal of the
fpu::fpregs_active state flag - we now only have the
fpu::fpstate_active and fpu::last_cpu fields left.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-21-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/internal.h |  5 -----
 arch/x86/include/asm/fpu/types.h    | 23 -----------------------
 arch/x86/include/asm/trace/fpu.h    |  5 +----
 arch/x86/kernel/fpu/core.c          |  9 ---------
 arch/x86/kernel/fpu/signal.c        |  2 --
 5 files changed, 1 insertion(+), 43 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 42a601673c09..629e7abcd6c9 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -526,14 +526,12 @@ static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
  */
 static inline void fpregs_deactivate(struct fpu *fpu)
 {
-	fpu->fpregs_active = 0;
 	this_cpu_write(fpu_fpregs_owner_ctx, NULL);
 	trace_x86_fpu_regs_deactivated(fpu);
 }
 
 static inline void fpregs_activate(struct fpu *fpu)
 {
-	fpu->fpregs_active = 1;
 	this_cpu_write(fpu_fpregs_owner_ctx, fpu);
 	trace_x86_fpu_regs_activated(fpu);
 }
@@ -552,8 +550,6 @@ static inline void fpregs_activate(struct fpu *fpu)
 static inline void
 switch_fpu_prepare(struct fpu *old_fpu, int cpu)
 {
-	WARN_ON_FPU(old_fpu->fpregs_active != old_fpu->fpstate_active);
-
 	if (old_fpu->fpstate_active) {
 		if (!copy_fpregs_to_fpstate(old_fpu))
 			old_fpu->last_cpu = -1;
@@ -561,7 +557,6 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu)
 			old_fpu->last_cpu = cpu;
 
 		/* But leave fpu_fpregs_owner_ctx! */
-		old_fpu->fpregs_active = 0;
 		trace_x86_fpu_regs_deactivated(old_fpu);
 	} else
 		old_fpu->last_cpu = -1;
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 3c80f5b9c09d..0c314a397cf5 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -298,29 +298,6 @@ struct fpu {
 	 */
 	unsigned char			fpstate_active;
 
-	/*
-	 * @fpregs_active:
-	 *
-	 * This flag determines whether a given context is actively
-	 * loaded into the FPU's registers and that those registers
-	 * represent the task's current FPU state.
-	 *
-	 * Note the interaction with fpstate_active:
-	 *
-	 *   # task does not use the FPU:
-	 *   fpstate_active == 0
-	 *
-	 *   # task uses the FPU and regs are active:
-	 *   fpstate_active == 1 && fpregs_active == 1
-	 *
-	 *   # the regs are inactive but still match fpstate:
-	 *   fpstate_active == 1 && fpregs_active == 0 && fpregs_owner == fpu
-	 *
-	 * The third state is what we use for the lazy restore optimization
-	 * on lazy-switching CPUs.
-	 */
-	unsigned char			fpregs_active;
-
 	/*
 	 * @state:
 	 *
diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h
index 342e59789fcd..da565aae9fd2 100644
--- a/arch/x86/include/asm/trace/fpu.h
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -12,7 +12,6 @@ DECLARE_EVENT_CLASS(x86_fpu,
 
 	TP_STRUCT__entry(
 		__field(struct fpu *, fpu)
-		__field(bool, fpregs_active)
 		__field(bool, fpstate_active)
 		__field(u64, xfeatures)
 		__field(u64, xcomp_bv)
@@ -20,16 +19,14 @@ DECLARE_EVENT_CLASS(x86_fpu,
 
 	TP_fast_assign(
 		__entry->fpu		= fpu;
-		__entry->fpregs_active	= fpu->fpregs_active;
 		__entry->fpstate_active	= fpu->fpstate_active;
 		if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
 			__entry->xfeatures = fpu->state.xsave.header.xfeatures;
 			__entry->xcomp_bv  = fpu->state.xsave.header.xcomp_bv;
 		}
 	),
-	TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: %d xfeatures: %llx xcomp_bv: %llx",
+	TP_printk("x86/fpu: %p fpstate_active: %d xfeatures: %llx xcomp_bv: %llx",
 			__entry->fpu,
-			__entry->fpregs_active,
 			__entry->fpstate_active,
 			__entry->xfeatures,
 			__entry->xcomp_bv
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 01a47e9edfb4..93103a909c47 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -147,8 +147,6 @@ void fpu__save(struct fpu *fpu)
 	WARN_ON_FPU(fpu != &current->thread.fpu);
 
 	preempt_disable();
-	WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active);
-
 	trace_x86_fpu_before_save(fpu);
 	if (fpu->fpstate_active) {
 		if (!copy_fpregs_to_fpstate(fpu)) {
@@ -191,7 +189,6 @@ EXPORT_SYMBOL_GPL(fpstate_init);
 
 int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
 {
-	dst_fpu->fpregs_active = 0;
 	dst_fpu->last_cpu = -1;
 
 	if (!src_fpu->fpstate_active || !static_cpu_has(X86_FEATURE_FPU))
@@ -264,7 +261,6 @@ EXPORT_SYMBOL_GPL(fpu__activate_curr);
  */
 void fpu__activate_fpstate_read(struct fpu *fpu)
 {
-	WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active);
 	/*
 	 * If fpregs are active (in the current CPU), then
 	 * copy them to the fpstate:
@@ -365,7 +361,6 @@ void fpu__current_fpstate_write_end(void)
 {
 	struct fpu *fpu = &current->thread.fpu;
 
-	WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active);
 	/*
 	 * 'fpu' now has an updated copy of the state, but the
 	 * registers may still be out of date.  Update them with
@@ -419,8 +414,6 @@ void fpu__drop(struct fpu *fpu)
 	preempt_disable();
 
 	if (fpu == &current->thread.fpu) {
-		WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active);
-
 		if (fpu->fpstate_active) {
 			/* Ignore delayed exceptions from user space */
 			asm volatile("1: fwait\n"
@@ -428,8 +421,6 @@ void fpu__drop(struct fpu *fpu)
 				     _ASM_EXTABLE(1b, 2b));
 			fpregs_deactivate(fpu);
 		}
-	} else {
-		WARN_ON_FPU(fpu->fpregs_active);
 	}
 
 	fpu->fpstate_active = 0;
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index a88083ba7f8b..629106e51a29 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -171,8 +171,6 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 			sizeof(struct user_i387_ia32_struct), NULL,
 			(struct _fpstate_32 __user *) buf) ? -1 : 1;
 
-	WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active);
-
 	if (fpu->fpstate_active || using_compacted_format()) {
 		/* Save the live register state to the user directly. */
 		if (copy_fpregs_to_sigframe(buf_fx))
-- 
cgit v1.2.3


From 0852b374173bb57f870d78e6c6839c77b339be5f Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Sat, 23 Sep 2017 15:00:04 +0200
Subject: x86/fpu: Add FPU state copying quirk to handle XRSTOR failure on
 Intel Skylake CPUs

On Skylake CPUs I noticed that XRSTOR is unable to deal with states
created by copyout_from_xsaves() if the xstate has only SSE/YMM state, and
no FP state. That is, xfeatures had XFEATURE_MASK_SSE set, but not
XFEATURE_MASK_FP.

The reason is that part of the SSE/YMM state lives in the MXCSR and
MXCSR_FLAGS fields of the FP state.

Ensure that whenever we copy SSE or YMM state around, the MXCSR and
MXCSR_FLAGS fields are also copied around.

Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170210085445.0f1cc708@annuminas.surriel.com
Link: http://lkml.kernel.org/r/20170923130016.21448-22-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/types.h |  3 +++
 arch/x86/kernel/fpu/xstate.c     | 42 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 0c314a397cf5..71db45ca8870 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -68,6 +68,9 @@ struct fxregs_state {
 /* Default value for fxregs_state.mxcsr: */
 #define MXCSR_DEFAULT		0x1f80
 
+/* Copy both mxcsr & mxcsr_flags with a single u64 memcpy: */
+#define MXCSR_AND_FLAGS_SIZE sizeof(u64)
+
 /*
  * Software based FPU emulation state. This is arbitrary really,
  * it matches the x87 format to make it easier to understand:
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 0ef35040d0ad..41c52256bdce 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -920,6 +920,23 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 }
 #endif /* ! CONFIG_ARCH_HAS_PKEYS */
 
+/*
+ * Weird legacy quirk: SSE and YMM states store information in the
+ * MXCSR and MXCSR_FLAGS fields of the FP area. That means if the FP
+ * area is marked as unused in the xfeatures header, we need to copy
+ * MXCSR and MXCSR_FLAGS if either SSE or YMM are in use.
+ */
+static inline bool xfeatures_mxcsr_quirk(u64 xfeatures)
+{
+	if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM)))
+		return 0;
+
+	if (xfeatures & XFEATURE_MASK_FP)
+		return 0;
+
+	return 1;
+}
+
 /*
  * This is similar to user_regset_copyout(), but will not add offset to
  * the source data pointer or increment pos, count, kbuf, and ubuf.
@@ -988,6 +1005,12 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of
 
 	}
 
+	if (xfeatures_mxcsr_quirk(header.xfeatures)) {
+		offset = offsetof(struct fxregs_state, mxcsr);
+		size = MXCSR_AND_FLAGS_SIZE;
+		__copy_xstate_to_kernel(kbuf, &xsave->i387.mxcsr, offset, size, size_total);
+	}
+
 	/*
 	 * Fill xsave->i387.sw_reserved value for ptrace frame:
 	 */
@@ -1070,6 +1093,12 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i
 
 	}
 
+	if (xfeatures_mxcsr_quirk(header.xfeatures)) {
+		offset = offsetof(struct fxregs_state, mxcsr);
+		size = MXCSR_AND_FLAGS_SIZE;
+		__copy_xstate_to_user(ubuf, &xsave->i387.mxcsr, offset, size, size_total);
+	}
+
 	/*
 	 * Fill xsave->i387.sw_reserved value for ptrace frame:
 	 */
@@ -1122,6 +1151,12 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
 		}
 	}
 
+	if (xfeatures_mxcsr_quirk(xfeatures)) {
+		offset = offsetof(struct fxregs_state, mxcsr);
+		size = MXCSR_AND_FLAGS_SIZE;
+		memcpy(&xsave->i387.mxcsr, kbuf + offset, size);
+	}
+
 	/*
 	 * The state that came in from userspace was user-state only.
 	 * Mask all the user states out of 'xfeatures':
@@ -1177,6 +1212,13 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
 		}
 	}
 
+	if (xfeatures_mxcsr_quirk(xfeatures)) {
+		offset = offsetof(struct fxregs_state, mxcsr);
+		size = MXCSR_AND_FLAGS_SIZE;
+		if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size))
+			return -EFAULT;
+	}
+
 	/*
 	 * The state that came in from userspace was user-state only.
 	 * Mask all the user states out of 'xfeatures':
-- 
cgit v1.2.3


From 4f8cef59bad29344aca0e2e6b0ad18dadd078fd0 Mon Sep 17 00:00:00 2001
From: kbuild test robot <fengguang.wu@intel.com>
Date: Sat, 23 Sep 2017 15:00:05 +0200
Subject: x86/fpu: Fix boolreturn.cocci warnings

arch/x86/kernel/fpu/xstate.c:931:9-10: WARNING: return of 0/1 in function 'xfeatures_mxcsr_quirk' with return type bool

 Return statements in functions returning bool should use true/false instead of 1/0.

Generated by: scripts/coccinelle/misc/boolreturn.cocci

Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Cc: kbuild-all@01.org
Cc: tipbuild@zytor.com
Link: http://lkml.kernel.org/r/20170306004553.GA25764@lkp-wsm-ep1
Link: http://lkml.kernel.org/r/20170923130016.21448-23-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/xstate.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 41c52256bdce..fda1109cc355 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -929,12 +929,12 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 static inline bool xfeatures_mxcsr_quirk(u64 xfeatures)
 {
 	if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM)))
-		return 0;
+		return false;
 
 	if (xfeatures & XFEATURE_MASK_FP)
-		return 0;
+		return false;
 
-	return 1;
+	return true;
 }
 
 /*
-- 
cgit v1.2.3


From 03eaec81ac09814817e9f0307d572ffe8365f980 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Sat, 23 Sep 2017 15:00:06 +0200
Subject: x86/fpu: Turn WARN_ON() in context switch into WARN_ON_FPU()

copy_xregs_to_kernel checks if the alternatives have been already
patched.

This WARN_ON() is always executed in every context switch.

All the other checks in fpu internal.h are WARN_ON_FPU(), but
this one is plain WARN_ON(). I assume it was forgotten to switch it.

So switch it to WARN_ON_FPU() too to avoid some unnecessary code
in the context switch, and a potentially expensive cache line miss for the
global variable.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170329062605.4970-1-andi@firstfloor.org
Link: http://lkml.kernel.org/r/20170923130016.21448-24-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/internal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 629e7abcd6c9..2dca7c65319c 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -350,7 +350,7 @@ static inline void copy_xregs_to_kernel(struct xregs_state *xstate)
 	u32 hmask = mask >> 32;
 	int err;
 
-	WARN_ON(!alternatives_patched);
+	WARN_ON_FPU(!alternatives_patched);
 
 	XSTATE_XSAVE(xstate, lmask, hmask, err);
 
-- 
cgit v1.2.3


From 814fb7bb7db5433757d76f4c4502c96fc53b0b5e Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 23 Sep 2017 15:00:07 +0200
Subject: x86/fpu: Don't let userspace set bogus xcomp_bv

On x86, userspace can use the ptrace() or rt_sigreturn() system calls to
set a task's extended state (xstate) or "FPU" registers.  ptrace() can
set them for another task using the PTRACE_SETREGSET request with
NT_X86_XSTATE, while rt_sigreturn() can set them for the current task.
In either case, registers can be set to any value, but the kernel
assumes that the XSAVE area itself remains valid in the sense that the
CPU can restore it.

However, in the case where the kernel is using the uncompacted xstate
format (which it does whenever the XSAVES instruction is unavailable),
it was possible for userspace to set the xcomp_bv field in the
xstate_header to an arbitrary value.  However, all bits in that field
are reserved in the uncompacted case, so when switching to a task with
nonzero xcomp_bv, the XRSTOR instruction failed with a #GP fault.  This
caused the WARN_ON_FPU(err) in copy_kernel_to_xregs() to be hit.  In
addition, since the error is otherwise ignored, the FPU registers from
the task previously executing on the CPU were leaked.

Fix the bug by checking that the user-supplied value of xcomp_bv is 0 in
the uncompacted case, and returning an error otherwise.

The reason for validating xcomp_bv rather than simply overwriting it
with 0 is that we want userspace to see an error if it (incorrectly)
provides an XSAVE area in compacted format rather than in uncompacted
format.

Note that as before, in case of error we clear the task's FPU state.
This is perhaps non-ideal, especially for PTRACE_SETREGSET; it might be
better to return an error before changing anything.  But it seems the
"clear on error" behavior is fine for now, and it's a little tricky to
do otherwise because it would mean we couldn't simply copy the full
userspace state into kernel memory in one __copy_from_user().

This bug was found by syzkaller, which hit the above-mentioned
WARN_ON_FPU():

    WARNING: CPU: 1 PID: 0 at ./arch/x86/include/asm/fpu/internal.h:373 __switch_to+0x5b5/0x5d0
    CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.13.0 #453
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
    task: ffff9ba2bc8e42c0 task.stack: ffffa78cc036c000
    RIP: 0010:__switch_to+0x5b5/0x5d0
    RSP: 0000:ffffa78cc08bbb88 EFLAGS: 00010082
    RAX: 00000000fffffffe RBX: ffff9ba2b8bf2180 RCX: 00000000c0000100
    RDX: 00000000ffffffff RSI: 000000005cb10700 RDI: ffff9ba2b8bf36c0
    RBP: ffffa78cc08bbbd0 R08: 00000000929fdf46 R09: 0000000000000001
    R10: 0000000000000000 R11: 0000000000000000 R12: ffff9ba2bc8e42c0
    R13: 0000000000000000 R14: ffff9ba2b8bf3680 R15: ffff9ba2bf5d7b40
    FS:  00007f7e5cb10700(0000) GS:ffff9ba2bf400000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: 00000000004005cc CR3: 0000000079fd5000 CR4: 00000000001406e0
    Call Trace:
    Code: 84 00 00 00 00 00 e9 11 fd ff ff 0f ff 66 0f 1f 84 00 00 00 00 00 e9 e7 fa ff ff 0f ff 66 0f 1f 84 00 00 00 00 00 e9 c2 fa ff ff <0f> ff 66 0f 1f 84 00 00 00 00 00 e9 d4 fc ff ff 66 66 2e 0f 1f

Here is a C reproducer.  The expected behavior is that the program spin
forever with no output.  However, on a buggy kernel running on a
processor with the "xsave" feature but without the "xsaves" feature
(e.g. Sandy Bridge through Broadwell for Intel), within a second or two
the program reports that the xmm registers were corrupted, i.e. were not
restored correctly.  With CONFIG_X86_DEBUG_FPU=y it also hits the above
kernel warning.

    #define _GNU_SOURCE
    #include <stdbool.h>
    #include <inttypes.h>
    #include <linux/elf.h>
    #include <stdio.h>
    #include <sys/ptrace.h>
    #include <sys/uio.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
        int pid = fork();
        uint64_t xstate[512];
        struct iovec iov = { .iov_base = xstate, .iov_len = sizeof(xstate) };

        if (pid == 0) {
            bool tracee = true;
            for (int i = 0; i < sysconf(_SC_NPROCESSORS_ONLN) && tracee; i++)
                tracee = (fork() != 0);
            uint32_t xmm0[4] = { [0 ... 3] = tracee ? 0x00000000 : 0xDEADBEEF };
            asm volatile("   movdqu %0, %%xmm0\n"
                         "   mov %0, %%rbx\n"
                         "1: movdqu %%xmm0, %0\n"
                         "   mov %0, %%rax\n"
                         "   cmp %%rax, %%rbx\n"
                         "   je 1b\n"
                         : "+m" (xmm0) : : "rax", "rbx", "xmm0");
            printf("BUG: xmm registers corrupted!  tracee=%d, xmm0=%08X%08X%08X%08X\n",
                   tracee, xmm0[0], xmm0[1], xmm0[2], xmm0[3]);
        } else {
            usleep(100000);
            ptrace(PTRACE_ATTACH, pid, 0, 0);
            wait(NULL);
            ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov);
            xstate[65] = -1;
            ptrace(PTRACE_SETREGSET, pid, NT_X86_XSTATE, &iov);
            ptrace(PTRACE_CONT, pid, 0, 0);
            wait(NULL);
        }
        return 1;
    }

Note: the program only tests for the bug using the ptrace() system call.
The bug can also be reproduced using the rt_sigreturn() system call, but
only when called from a 32-bit program, since for 64-bit programs the
kernel restores the FPU state from the signal frame by doing XRSTOR
directly from userspace memory (with proper error checking).

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: <stable@vger.kernel.org> [v3.17+]
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Kevin Hao <haokexin@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Halcrow <mhalcrow@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Cc: kernel-hardening@lists.openwall.com
Fixes: 0b29643a5843 ("x86/xsaves: Change compacted format xsave area header")
Link: http://lkml.kernel.org/r/20170922174156.16780-2-ebiggers3@gmail.com
Link: http://lkml.kernel.org/r/20170923130016.21448-25-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/regset.c | 4 ++++
 arch/x86/kernel/fpu/signal.c | 9 +++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 19a7385a912c..c764f7405322 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -141,6 +141,10 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
 			ret = copy_user_to_xstate(xsave, ubuf);
 	} else {
 		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
+
+		/* xcomp_bv must be 0 when using uncompacted format */
+		if (!ret && xsave->header.xcomp_bv)
+			ret = -EINVAL;
 	}
 
 	/*
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 629106e51a29..da68ea1c3a44 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -324,11 +324,16 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 		 */
 		fpu__drop(fpu);
 
-		if (using_compacted_format())
+		if (using_compacted_format()) {
 			err = copy_user_to_xstate(&fpu->state.xsave, buf_fx);
-		else
+		} else {
 			err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size);
 
+			/* xcomp_bv must be 0 when using uncompacted format */
+			if (!err && state_size > offsetof(struct xregs_state, header) && fpu->state.xsave.header.xcomp_bv)
+				err = -EINVAL;
+		}
+
 		if (err || __copy_from_user(&env, buf, sizeof(env))) {
 			fpstate_init(&fpu->state);
 			trace_x86_fpu_init_state(fpu);
-- 
cgit v1.2.3


From d5c8028b4788f62b31fb79a331b3ad3e041fa366 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 23 Sep 2017 15:00:09 +0200
Subject: x86/fpu: Reinitialize FPU registers if restoring FPU state fails

Userspace can change the FPU state of a task using the ptrace() or
rt_sigreturn() system calls.  Because reserved bits in the FPU state can
cause the XRSTOR instruction to fail, the kernel has to carefully
validate that no reserved bits or other invalid values are being set.

Unfortunately, there have been bugs in this validation code.  For
example, we were not checking that the 'xcomp_bv' field in the
xstate_header was 0.  As-is, such bugs are exploitable to read the FPU
registers of other processes on the system.  To do so, an attacker can
create a task, assign to it an invalid FPU state, then spin in a loop
and monitor the values of the FPU registers.  Because the task's FPU
registers are not being restored, sometimes the FPU registers will have
the values from another process.

This is likely to continue to be a problem in the future because the
validation done by the CPU instructions like XRSTOR is not immediately
visible to kernel developers.  Nor will invalid FPU states ever be
encountered during ordinary use --- they will only be seen during
fuzzing or exploits.  There can even be reserved bits outside the
xstate_header which are easy to forget about.  For example, the MXCSR
register contains reserved bits, which were not validated by the
KVM_SET_XSAVE ioctl until commit a575813bfe4b ("KVM: x86: Fix load
damaged SSEx MXCSR register").

Therefore, mitigate this class of vulnerability by restoring the FPU
registers from init_fpstate if restoring from the task's state fails.

We actually used to do this, but it was (perhaps unwisely) removed by
commit 9ccc27a5d297 ("x86/fpu: Remove error return values from
copy_kernel_to_*regs() functions").  This new patch is also a bit
different.  First, it only clears the registers, not also the bad
in-memory state; this is simpler and makes it easier to make the
mitigation cover all callers of __copy_kernel_to_fpregs().  Second, it
does the register clearing in an exception handler so that no extra
instructions are added to context switches.  In fact, we *remove*
instructions, since previously we were always zeroing the register
containing 'err' even if CONFIG_X86_DEBUG_FPU was disabled.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Kevin Hao <haokexin@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Halcrow <mhalcrow@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Cc: kernel-hardening@lists.openwall.com
Link: http://lkml.kernel.org/r/20170922174156.16780-4-ebiggers3@gmail.com
Link: http://lkml.kernel.org/r/20170923130016.21448-27-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/internal.h | 51 +++++++++++--------------------------
 arch/x86/mm/extable.c               | 24 +++++++++++++++++
 2 files changed, 39 insertions(+), 36 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 2dca7c65319c..cf290d424e48 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -120,20 +120,11 @@ extern void fpstate_sanitize_xstate(struct fpu *fpu);
 	err;								\
 })
 
-#define check_insn(insn, output, input...)				\
-({									\
-	int err;							\
+#define kernel_insn(insn, output, input...)				\
 	asm volatile("1:" #insn "\n\t"					\
 		     "2:\n"						\
-		     ".section .fixup,\"ax\"\n"				\
-		     "3:  movl $-1,%[err]\n"				\
-		     "    jmp  2b\n"					\
-		     ".previous\n"					\
-		     _ASM_EXTABLE(1b, 3b)				\
-		     : [err] "=r" (err), output				\
-		     : "0"(0), input);					\
-	err;								\
-})
+		     _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore)	\
+		     : output : input)
 
 static inline int copy_fregs_to_user(struct fregs_state __user *fx)
 {
@@ -153,20 +144,16 @@ static inline int copy_fxregs_to_user(struct fxregs_state __user *fx)
 
 static inline void copy_kernel_to_fxregs(struct fxregs_state *fx)
 {
-	int err;
-
 	if (IS_ENABLED(CONFIG_X86_32)) {
-		err = check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+		kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
 	} else {
 		if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) {
-			err = check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+			kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
 		} else {
 			/* See comment in copy_fxregs_to_kernel() below. */
-			err = check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx));
+			kernel_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx));
 		}
 	}
-	/* Copying from a kernel buffer to FPU registers should never fail: */
-	WARN_ON_FPU(err);
 }
 
 static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
@@ -183,9 +170,7 @@ static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
 
 static inline void copy_kernel_to_fregs(struct fregs_state *fx)
 {
-	int err = check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-
-	WARN_ON_FPU(err);
+	kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
 }
 
 static inline int copy_user_to_fregs(struct fregs_state __user *fx)
@@ -281,18 +266,13 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
  * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact
  * XSAVE area format.
  */
-#define XSTATE_XRESTORE(st, lmask, hmask, err)				\
+#define XSTATE_XRESTORE(st, lmask, hmask)				\
 	asm volatile(ALTERNATIVE(XRSTOR,				\
 				 XRSTORS, X86_FEATURE_XSAVES)		\
 		     "\n"						\
-		     "xor %[err], %[err]\n"				\
 		     "3:\n"						\
-		     ".pushsection .fixup,\"ax\"\n"			\
-		     "4: movl $-2, %[err]\n"				\
-		     "jmp 3b\n"						\
-		     ".popsection\n"					\
-		     _ASM_EXTABLE(661b, 4b)				\
-		     : [err] "=r" (err)					\
+		     _ASM_EXTABLE_HANDLE(661b, 3b, ex_handler_fprestore)\
+		     :							\
 		     : "D" (st), "m" (*st), "a" (lmask), "d" (hmask)	\
 		     : "memory")
 
@@ -336,7 +316,10 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate)
 	else
 		XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
 
-	/* We should never fault when copying from a kernel buffer: */
+	/*
+	 * We should never fault when copying from a kernel buffer, and the FPU
+	 * state we set at boot time should be valid.
+	 */
 	WARN_ON_FPU(err);
 }
 
@@ -365,12 +348,8 @@ static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask)
 {
 	u32 lmask = mask;
 	u32 hmask = mask >> 32;
-	int err;
-
-	XSTATE_XRESTORE(xstate, lmask, hmask, err);
 
-	/* We should never fault when copying from a kernel buffer: */
-	WARN_ON_FPU(err);
+	XSTATE_XRESTORE(xstate, lmask, hmask);
 }
 
 /*
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index c076f710de4c..c3521e2be396 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -2,6 +2,7 @@
 #include <linux/uaccess.h>
 #include <linux/sched/debug.h>
 
+#include <asm/fpu/internal.h>
 #include <asm/traps.h>
 #include <asm/kdebug.h>
 
@@ -78,6 +79,29 @@ bool ex_handler_refcount(const struct exception_table_entry *fixup,
 }
 EXPORT_SYMBOL_GPL(ex_handler_refcount);
 
+/*
+ * Handler for when we fail to restore a task's FPU state.  We should never get
+ * here because the FPU state of a task using the FPU (task->thread.fpu.state)
+ * should always be valid.  However, past bugs have allowed userspace to set
+ * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn().
+ * These caused XRSTOR to fail when switching to the task, leaking the FPU
+ * registers of the task previously executing on the CPU.  Mitigate this class
+ * of vulnerability by restoring from the initial state (essentially, zeroing
+ * out all the FPU registers) if we can't restore from the task's FPU state.
+ */
+bool ex_handler_fprestore(const struct exception_table_entry *fixup,
+			  struct pt_regs *regs, int trapnr)
+{
+	regs->ip = ex_fixup_addr(fixup);
+
+	WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
+		  (void *)instruction_pointer(regs));
+
+	__copy_kernel_to_fpregs(&init_fpstate, -1);
+	return true;
+}
+EXPORT_SYMBOL_GPL(ex_handler_fprestore);
+
 bool ex_handler_ext(const struct exception_table_entry *fixup,
 		   struct pt_regs *regs, int trapnr)
 {
-- 
cgit v1.2.3


From a3c4fb7c9c2ebfd50b8c60f6c069932bb319bc37 Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Date: Mon, 4 Sep 2017 10:32:15 +0200
Subject: x86/mm: Fix fault error path using unsafe vma pointer

commit 7b2d0dbac489 ("x86/mm/pkeys: Pass VMA down in to fault signal
generation code") passes down a vma pointer to the error path, but that is
done once the mmap_sem is released when calling mm_fault_error() from
__do_page_fault().

This is dangerous as the vma structure is no more safe to be used once the
mmap_sem has been released. As only the protection key value is required in
the error processing, we could just pass down this value.

Fix it by passing a pointer to a protection key value down to the fault
signal generation code. The use of a pointer allows to keep the check
generating a warning message in fill_sig_info_pkey() when the vma was not
known. If the pointer is valid, the protection value can be accessed by
deferencing the pointer.

[ tglx: Made *pkey u32 as that's the type which is passed in siginfo ]

Fixes: 7b2d0dbac489 ("x86/mm/pkeys: Pass VMA down in to fault signal generation code")
Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/1504513935-12742-1-git-send-email-ldufour@linux.vnet.ibm.com
---
 arch/x86/mm/fault.c | 47 ++++++++++++++++++++++++-----------------------
 1 file changed, 24 insertions(+), 23 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 39567b5c33da..e2baeaa053a5 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -192,8 +192,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
  * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
  *	     faulted on a pte with its pkey=4.
  */
-static void fill_sig_info_pkey(int si_code, siginfo_t *info,
-		struct vm_area_struct *vma)
+static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey)
 {
 	/* This is effectively an #ifdef */
 	if (!boot_cpu_has(X86_FEATURE_OSPKE))
@@ -209,7 +208,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info,
 	 * valid VMA, so we should never reach this without a
 	 * valid VMA.
 	 */
-	if (!vma) {
+	if (!pkey) {
 		WARN_ONCE(1, "PKU fault with no VMA passed in");
 		info->si_pkey = 0;
 		return;
@@ -219,13 +218,12 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info,
 	 * absolutely guranteed to be 100% accurate because of
 	 * the race explained above.
 	 */
-	info->si_pkey = vma_pkey(vma);
+	info->si_pkey = *pkey;
 }
 
 static void
 force_sig_info_fault(int si_signo, int si_code, unsigned long address,
-		     struct task_struct *tsk, struct vm_area_struct *vma,
-		     int fault)
+		     struct task_struct *tsk, u32 *pkey, int fault)
 {
 	unsigned lsb = 0;
 	siginfo_t info;
@@ -240,7 +238,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
 		lsb = PAGE_SHIFT;
 	info.si_addr_lsb = lsb;
 
-	fill_sig_info_pkey(si_code, &info, vma);
+	fill_sig_info_pkey(si_code, &info, pkey);
 
 	force_sig_info(si_signo, &info, tsk);
 }
@@ -762,8 +760,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 	struct task_struct *tsk = current;
 	unsigned long flags;
 	int sig;
-	/* No context means no VMA to pass down */
-	struct vm_area_struct *vma = NULL;
 
 	/* Are we prepared to handle this kernel fault? */
 	if (fixup_exception(regs, X86_TRAP_PF)) {
@@ -788,7 +784,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 
 			/* XXX: hwpoison faults will set the wrong code. */
 			force_sig_info_fault(signal, si_code, address,
-					     tsk, vma, 0);
+					     tsk, NULL, 0);
 		}
 
 		/*
@@ -896,8 +892,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 
 static void
 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
-		       unsigned long address, struct vm_area_struct *vma,
-		       int si_code)
+		       unsigned long address, u32 *pkey, int si_code)
 {
 	struct task_struct *tsk = current;
 
@@ -945,7 +940,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		tsk->thread.error_code	= error_code;
 		tsk->thread.trap_nr	= X86_TRAP_PF;
 
-		force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0);
+		force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0);
 
 		return;
 	}
@@ -958,9 +953,9 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 
 static noinline void
 bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
-		     unsigned long address, struct vm_area_struct *vma)
+		     unsigned long address, u32 *pkey)
 {
-	__bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR);
+	__bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR);
 }
 
 static void
@@ -968,6 +963,10 @@ __bad_area(struct pt_regs *regs, unsigned long error_code,
 	   unsigned long address,  struct vm_area_struct *vma, int si_code)
 {
 	struct mm_struct *mm = current->mm;
+	u32 pkey;
+
+	if (vma)
+		pkey = vma_pkey(vma);
 
 	/*
 	 * Something tried to access memory that isn't in our memory map..
@@ -975,7 +974,8 @@ __bad_area(struct pt_regs *regs, unsigned long error_code,
 	 */
 	up_read(&mm->mmap_sem);
 
-	__bad_area_nosemaphore(regs, error_code, address, vma, si_code);
+	__bad_area_nosemaphore(regs, error_code, address,
+			       (vma) ? &pkey : NULL, si_code);
 }
 
 static noinline void
@@ -1018,7 +1018,7 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
 
 static void
 do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
-	  struct vm_area_struct *vma, unsigned int fault)
+	  u32 *pkey, unsigned int fault)
 {
 	struct task_struct *tsk = current;
 	int code = BUS_ADRERR;
@@ -1045,13 +1045,12 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 		code = BUS_MCEERR_AR;
 	}
 #endif
-	force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault);
+	force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault);
 }
 
 static noinline void
 mm_fault_error(struct pt_regs *regs, unsigned long error_code,
-	       unsigned long address, struct vm_area_struct *vma,
-	       unsigned int fault)
+	       unsigned long address, u32 *pkey, unsigned int fault)
 {
 	if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
 		no_context(regs, error_code, address, 0, 0);
@@ -1075,9 +1074,9 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 	} else {
 		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
 			     VM_FAULT_HWPOISON_LARGE))
-			do_sigbus(regs, error_code, address, vma, fault);
+			do_sigbus(regs, error_code, address, pkey, fault);
 		else if (fault & VM_FAULT_SIGSEGV)
-			bad_area_nosemaphore(regs, error_code, address, vma);
+			bad_area_nosemaphore(regs, error_code, address, pkey);
 		else
 			BUG();
 	}
@@ -1267,6 +1266,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	struct mm_struct *mm;
 	int fault, major = 0;
 	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+	u32 pkey;
 
 	tsk = current;
 	mm = tsk->mm;
@@ -1467,9 +1467,10 @@ good_area:
 		return;
 	}
 
+	pkey = vma_pkey(vma);
 	up_read(&mm->mmap_sem);
 	if (unlikely(fault & VM_FAULT_ERROR)) {
-		mm_fault_error(regs, error_code, address, vma, fault);
+		mm_fault_error(regs, error_code, address, &pkey, fault);
 		return;
 	}
 
-- 
cgit v1.2.3


From 7d7099433d9eaaa5a989a55f1fa354c16a3ad297 Mon Sep 17 00:00:00 2001
From: Sean Fu <fxinrong@gmail.com>
Date: Mon, 11 Sep 2017 08:33:21 +0800
Subject: x86/sysfs: Fix off-by-one error in loop termination

An off-by-one error in loop terminantion conditions in
create_setup_data_nodes() will lead to memory leak when
create_setup_data_node() failed.

Signed-off-by: Sean Fu <fxinrong@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1505090001-1157-1-git-send-email-fxinrong@gmail.com
---
 arch/x86/kernel/ksysfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
index 4b0592ca9e47..8c1cc08f514f 100644
--- a/arch/x86/kernel/ksysfs.c
+++ b/arch/x86/kernel/ksysfs.c
@@ -299,7 +299,7 @@ static int __init create_setup_data_nodes(struct kobject *parent)
 	return 0;
 
 out_clean_nodes:
-	for (j = i - 1; j > 0; j--)
+	for (j = i - 1; j >= 0; j--)
 		cleanup_setup_data_node(*(kobjp + j));
 	kfree(kobjp);
 out_setup_data_kobj:
-- 
cgit v1.2.3


From 5ac751d9e6b187c4a0000879d6598eb2292db949 Mon Sep 17 00:00:00 2001
From: Ville Syrjälä <ville.syrjala@linux.intel.com>
Date: Tue, 12 Sep 2017 19:40:00 +0300
Subject: x86: Don't cast away the __user in __get_user_asm_u64()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Don't cast away the __user in __get_user_asm_u64() on x86-32.
Prevents sparse getting upset.

Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/20170912164000.13745-1-ville.syrjala@linux.intel.com
---
 arch/x86/include/asm/uaccess.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 78e8fcc87d4c..4b892917edeb 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -337,7 +337,7 @@ do {									\
 		     _ASM_EXTABLE(1b, 4b)				\
 		     _ASM_EXTABLE(2b, 4b)				\
 		     : "=r" (retval), "=&A"(x)				\
-		     : "m" (__m(__ptr)), "m" __m(((u32 *)(__ptr)) + 1),	\
+		     : "m" (__m(__ptr)), "m" __m(((u32 __user *)(__ptr)) + 1),	\
 		       "i" (errret), "0" (retval));			\
 })
 
-- 
cgit v1.2.3


From b09c146f8f63c0e03adba74df76bf9c2be466fec Mon Sep 17 00:00:00 2001
From: Kan Liang <Kan.liang@intel.com>
Date: Fri, 8 Sep 2017 17:34:47 -0400
Subject: perf/x86/intel/cstate: Add missing CPU IDs

Skylake server uses the same C-state residency events as Sandy Bridge.

Denverton and Gemini lake use the same C-state residency events as
Apollo Lake.

Signed-off-by: Kan Liang <Kan.liang@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: ak@linux.intel.com
Cc: peterz@infradead.org
Cc: piotr.luc@intel.com
Cc: harry.pan@intel.com
Cc: srinivas.pandruvada@linux.intel.com
Link: http://lkml.kernel.org/r/20170908213449.6224-1-kan.liang@intel.com
---
 arch/x86/events/intel/cstate.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 4cf100ff2a37..72db0664a53d 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -552,6 +552,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
 
 	X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_MOBILE,  snb_cstates),
 	X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_DESKTOP, snb_cstates),
+	X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_X, snb_cstates),
 
 	X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_MOBILE,  snb_cstates),
 	X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_DESKTOP, snb_cstates),
@@ -560,6 +561,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
 	X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates),
 
 	X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT, glm_cstates),
+	X86_CSTATES_MODEL(INTEL_FAM6_ATOM_DENVERTON, glm_cstates),
+
+	X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GEMINI_LAKE, glm_cstates),
 	{ },
 };
 MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
-- 
cgit v1.2.3


From 1aaccc40a1864053da26605b0297be16dd52641e Mon Sep 17 00:00:00 2001
From: Kan Liang <Kan.liang@intel.com>
Date: Fri, 8 Sep 2017 17:34:48 -0400
Subject: perf/x86/msr: Add missing CPU IDs

Goldmont, Glodmont plus and Xeon Phi have MSR_SMI_COUNT as well.

Signed-off-by: Kan Liang <Kan.liang@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: ak@linux.intel.com
Cc: peterz@infradead.org
Cc: piotr.luc@intel.com
Cc: harry.pan@intel.com
Cc: srinivas.pandruvada@linux.intel.com
Link: http://lkml.kernel.org/r/20170908213449.6224-2-kan.liang@intel.com
---
 arch/x86/events/msr.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index 4bb3ec69e8ea..06723671ae4e 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -63,6 +63,14 @@ static bool test_intel(int idx)
 	case INTEL_FAM6_ATOM_SILVERMONT1:
 	case INTEL_FAM6_ATOM_SILVERMONT2:
 	case INTEL_FAM6_ATOM_AIRMONT:
+
+	case INTEL_FAM6_ATOM_GOLDMONT:
+	case INTEL_FAM6_ATOM_DENVERTON:
+
+	case INTEL_FAM6_ATOM_GEMINI_LAKE:
+
+	case INTEL_FAM6_XEON_PHI_KNL:
+	case INTEL_FAM6_XEON_PHI_KNM:
 		if (idx == PERF_MSR_SMI)
 			return true;
 		break;
-- 
cgit v1.2.3


From 450a97893559354b927c935f39ee11126f01f520 Mon Sep 17 00:00:00 2001
From: Kan Liang <Kan.liang@intel.com>
Date: Fri, 8 Sep 2017 17:34:49 -0400
Subject: perf/x86/intel/rapl: Add missing CPU IDs

DENVERTON and GEMINI_LAKE support same RAPL counters as Apollo Lake.

Signed-off-by: Kan Liang <Kan.liang@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: ak@linux.intel.com
Cc: peterz@infradead.org
Cc: piotr.luc@intel.com
Cc: harry.pan@intel.com
Cc: srinivas.pandruvada@linux.intel.com
Link: http://lkml.kernel.org/r/20170908213449.6224-3-kan.liang@intel.com
---
 arch/x86/events/intel/rapl.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index 8e2457cb6b4a..005908ee9333 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -775,6 +775,9 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = {
 	X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_rapl_init),
 
 	X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init),
+	X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_DENVERTON, hsw_rapl_init),
+
+	X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GEMINI_LAKE, hsw_rapl_init),
 	{},
 };
 
-- 
cgit v1.2.3


From 29b46dfb136cdbeece542b3f01115237e43f2855 Mon Sep 17 00:00:00 2001
From: Kan Liang <Kan.liang@intel.com>
Date: Mon, 11 Sep 2017 10:10:15 -0700
Subject: perf/x86/intel/uncore: Correct num_boxes for IIO and IRP

There are 6 IIO/IRP boxes for CBDMA, PCIe0-2, MCP 0 and MCP 1
separately. Correct the num_boxes.

Signed-off-by: Kan Liang <Kan.liang@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: ak@linux.intel.com
Cc: peterz@infradead.org
Cc: eranian@google.com
Cc: acme@kernel.org
Link: http://lkml.kernel.org/r/1505149816-12580-1-git-send-email-kan.liang@intel.com
---
 arch/x86/events/intel/uncore_snbep.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index db1fe377e6dd..a7196818416a 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -3462,7 +3462,7 @@ static struct intel_uncore_ops skx_uncore_iio_ops = {
 static struct intel_uncore_type skx_uncore_iio = {
 	.name			= "iio",
 	.num_counters		= 4,
-	.num_boxes		= 5,
+	.num_boxes		= 6,
 	.perf_ctr_bits		= 48,
 	.event_ctl		= SKX_IIO0_MSR_PMON_CTL0,
 	.perf_ctr		= SKX_IIO0_MSR_PMON_CTR0,
@@ -3492,7 +3492,7 @@ static const struct attribute_group skx_uncore_format_group = {
 static struct intel_uncore_type skx_uncore_irp = {
 	.name			= "irp",
 	.num_counters		= 2,
-	.num_boxes		= 5,
+	.num_boxes		= 6,
 	.perf_ctr_bits		= 48,
 	.event_ctl		= SKX_IRP0_MSR_PMON_CTL0,
 	.perf_ctr		= SKX_IRP0_MSR_PMON_CTR0,
-- 
cgit v1.2.3


From 4618e90965f272fe522f2af2523a60d0d4bc78f3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 15:00:10 +0200
Subject: x86/fpu: Fix fpu__activate_fpstate_read() and update comments

fpu__activate_fpstate_read() can be called for the current task
when coredumping - or for stopped tasks when ptrace-ing.

Implement this properly in the code and update the comments.

This also fixes an incorrect (but harmless) warning introduced by
one of the earlier patches.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-28-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/core.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 93103a909c47..afd3f2a5c64e 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -254,18 +254,21 @@ EXPORT_SYMBOL_GPL(fpu__activate_curr);
 /*
  * This function must be called before we read a task's fpstate.
  *
- * If the task has not used the FPU before then initialize its
- * fpstate.
+ * There's two cases where this gets called:
+ *
+ * - for the current task (when coredumping), in which case we have
+ *   to save the latest FPU registers into the fpstate,
+ *
+ * - or it's called for stopped tasks (ptrace), in which case the
+ *   registers were already saved by the context-switch code when
+ *   the task scheduled out - we only have to initialize the registers
+ *   if they've never been initialized.
  *
  * If the task has used the FPU before then save it.
  */
 void fpu__activate_fpstate_read(struct fpu *fpu)
 {
-	/*
-	 * If fpregs are active (in the current CPU), then
-	 * copy them to the fpstate:
-	 */
-	if (fpu->fpstate_active) {
+	if (fpu == &current->thread.fpu) {
 		fpu__save(fpu);
 	} else {
 		if (!fpu->fpstate_active) {
-- 
cgit v1.2.3


From 685c930d6e58e31e251ec354f9dca3958a4c5040 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 15:00:11 +0200
Subject: x86/fpu: Remove fpu__current_fpstate_write_begin/end()

These functions are not used anymore, so remove them.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Bobby Powers <bobbypowers@gmail.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-29-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/internal.h |  2 --
 arch/x86/kernel/fpu/core.c          | 63 -------------------------------------
 2 files changed, 65 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index cf290d424e48..508e4181c4af 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -26,8 +26,6 @@
 extern void fpu__activate_curr(struct fpu *fpu);
 extern void fpu__activate_fpstate_read(struct fpu *fpu);
 extern void fpu__activate_fpstate_write(struct fpu *fpu);
-extern void fpu__current_fpstate_write_begin(void);
-extern void fpu__current_fpstate_write_end(void);
 extern void fpu__save(struct fpu *fpu);
 extern void fpu__restore(struct fpu *fpu);
 extern int  fpu__restore_sig(void __user *buf, int ia32_frame);
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index afd3f2a5c64e..b2cdeb3b1860 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -316,69 +316,6 @@ void fpu__activate_fpstate_write(struct fpu *fpu)
 	}
 }
 
-/*
- * This function must be called before we write the current
- * task's fpstate.
- *
- * This call gets the current FPU register state and moves
- * it in to the 'fpstate'.  Preemption is disabled so that
- * no writes to the 'fpstate' can occur from context
- * swiches.
- *
- * Must be followed by a fpu__current_fpstate_write_end().
- */
-void fpu__current_fpstate_write_begin(void)
-{
-	struct fpu *fpu = &current->thread.fpu;
-
-	/*
-	 * Ensure that the context-switching code does not write
-	 * over the fpstate while we are doing our update.
-	 */
-	preempt_disable();
-
-	/*
-	 * Move the fpregs in to the fpu's 'fpstate'.
-	 */
-	fpu__activate_fpstate_read(fpu);
-
-	/*
-	 * The caller is about to write to 'fpu'.  Ensure that no
-	 * CPU thinks that its fpregs match the fpstate.  This
-	 * ensures we will not be lazy and skip a XRSTOR in the
-	 * future.
-	 */
-	__fpu_invalidate_fpregs_state(fpu);
-}
-
-/*
- * This function must be paired with fpu__current_fpstate_write_begin()
- *
- * This will ensure that the modified fpstate gets placed back in
- * the fpregs if necessary.
- *
- * Note: This function may be called whether or not an _actual_
- * write to the fpstate occurred.
- */
-void fpu__current_fpstate_write_end(void)
-{
-	struct fpu *fpu = &current->thread.fpu;
-
-	/*
-	 * 'fpu' now has an updated copy of the state, but the
-	 * registers may still be out of date.  Update them with
-	 * an XRSTOR if they are active.
-	 */
-	if (fpu->fpstate_active)
-		copy_kernel_to_fpregs(&fpu->state);
-
-	/*
-	 * Our update is done and the fpregs/fpstate are in sync
-	 * if necessary.  Context switches can happen again.
-	 */
-	preempt_enable();
-}
-
 /*
  * 'fpu__restore()' is called to copy FPU registers from
  * the FPU fpstate to the live hw registers and to activate
-- 
cgit v1.2.3


From e4a81bfcaae1ebbdc6efe74e8ea563144d90e9a9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 26 Sep 2017 09:43:36 +0200
Subject: x86/fpu: Rename fpu::fpstate_active to fpu::initialized

The x86 FPU code used to have a complex state machine where both the FPU
registers and the FPU state context could be 'active' (or inactive)
independently of each other - which enabled features like lazy FPU restore.

Much of this complexity is gone in the current code: now we basically can
have FPU-less tasks (kernel threads) that don't use (and save/restore) FPU
state at all, plus full FPU users that save/restore directly with no laziness
whatsoever.

But the fpu::fpstate_active still carries bits of the old complexity - meanwhile
this flag has become a simple flag that shows whether the FPU context saving
area in the thread struct is initialized and used, or not.

Rename it to fpu::initialized to express this simplicity in the name as well.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-30-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/ia32_signal.c         |  2 +-
 arch/x86/include/asm/fpu/internal.h |  4 ++--
 arch/x86/include/asm/fpu/types.h    |  6 +++---
 arch/x86/include/asm/trace/fpu.h    |  8 ++++----
 arch/x86/kernel/fpu/core.c          | 24 ++++++++++++------------
 arch/x86/kernel/fpu/init.c          |  2 +-
 arch/x86/kernel/fpu/regset.c        |  6 +++---
 arch/x86/kernel/fpu/signal.c        |  8 ++++----
 arch/x86/kernel/fpu/xstate.c        |  2 +-
 arch/x86/kernel/signal.c            |  6 +++---
 arch/x86/mm/pkeys.c                 |  2 +-
 11 files changed, 35 insertions(+), 35 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index e0bb46c02857..0e2a5edbce00 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -231,7 +231,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
 		 ksig->ka.sa.sa_restorer)
 		sp = (unsigned long) ksig->ka.sa.sa_restorer;
 
-	if (fpu->fpstate_active) {
+	if (fpu->initialized) {
 		unsigned long fx_aligned, math_size;
 
 		sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size);
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 508e4181c4af..b26ae05da18a 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -527,7 +527,7 @@ static inline void fpregs_activate(struct fpu *fpu)
 static inline void
 switch_fpu_prepare(struct fpu *old_fpu, int cpu)
 {
-	if (old_fpu->fpstate_active) {
+	if (old_fpu->initialized) {
 		if (!copy_fpregs_to_fpstate(old_fpu))
 			old_fpu->last_cpu = -1;
 		else
@@ -550,7 +550,7 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu)
 static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu)
 {
 	bool preload = static_cpu_has(X86_FEATURE_FPU) &&
-		       new_fpu->fpstate_active;
+		       new_fpu->initialized;
 
 	if (preload) {
 		if (!fpregs_state_valid(new_fpu, cpu))
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 71db45ca8870..a1520575d86b 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -293,13 +293,13 @@ struct fpu {
 	unsigned int			last_cpu;
 
 	/*
-	 * @fpstate_active:
+	 * @initialized:
 	 *
-	 * This flag indicates whether this context is active: if the task
+	 * This flag indicates whether this context is initialized: if the task
 	 * is not running then we can restore from this context, if the task
 	 * is running then we should save into this context.
 	 */
-	unsigned char			fpstate_active;
+	unsigned char			initialized;
 
 	/*
 	 * @state:
diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h
index da565aae9fd2..39f7a27bef13 100644
--- a/arch/x86/include/asm/trace/fpu.h
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -12,22 +12,22 @@ DECLARE_EVENT_CLASS(x86_fpu,
 
 	TP_STRUCT__entry(
 		__field(struct fpu *, fpu)
-		__field(bool, fpstate_active)
+		__field(bool, initialized)
 		__field(u64, xfeatures)
 		__field(u64, xcomp_bv)
 		),
 
 	TP_fast_assign(
 		__entry->fpu		= fpu;
-		__entry->fpstate_active	= fpu->fpstate_active;
+		__entry->initialized	= fpu->initialized;
 		if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
 			__entry->xfeatures = fpu->state.xsave.header.xfeatures;
 			__entry->xcomp_bv  = fpu->state.xsave.header.xcomp_bv;
 		}
 	),
-	TP_printk("x86/fpu: %p fpstate_active: %d xfeatures: %llx xcomp_bv: %llx",
+	TP_printk("x86/fpu: %p initialized: %d xfeatures: %llx xcomp_bv: %llx",
 			__entry->fpu,
-			__entry->fpstate_active,
+			__entry->initialized,
 			__entry->xfeatures,
 			__entry->xcomp_bv
 	)
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index b2cdeb3b1860..c8d6032f04d0 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -100,7 +100,7 @@ void __kernel_fpu_begin(void)
 
 	kernel_fpu_disable();
 
-	if (fpu->fpstate_active) {
+	if (fpu->initialized) {
 		/*
 		 * Ignore return value -- we don't care if reg state
 		 * is clobbered.
@@ -116,7 +116,7 @@ void __kernel_fpu_end(void)
 {
 	struct fpu *fpu = &current->thread.fpu;
 
-	if (fpu->fpstate_active)
+	if (fpu->initialized)
 		copy_kernel_to_fpregs(&fpu->state);
 
 	kernel_fpu_enable();
@@ -148,7 +148,7 @@ void fpu__save(struct fpu *fpu)
 
 	preempt_disable();
 	trace_x86_fpu_before_save(fpu);
-	if (fpu->fpstate_active) {
+	if (fpu->initialized) {
 		if (!copy_fpregs_to_fpstate(fpu)) {
 			copy_kernel_to_fpregs(&fpu->state);
 		}
@@ -191,7 +191,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
 {
 	dst_fpu->last_cpu = -1;
 
-	if (!src_fpu->fpstate_active || !static_cpu_has(X86_FEATURE_FPU))
+	if (!src_fpu->initialized || !static_cpu_has(X86_FEATURE_FPU))
 		return 0;
 
 	WARN_ON_FPU(src_fpu != &current->thread.fpu);
@@ -240,13 +240,13 @@ void fpu__activate_curr(struct fpu *fpu)
 {
 	WARN_ON_FPU(fpu != &current->thread.fpu);
 
-	if (!fpu->fpstate_active) {
+	if (!fpu->initialized) {
 		fpstate_init(&fpu->state);
 		trace_x86_fpu_init_state(fpu);
 
 		trace_x86_fpu_activate_state(fpu);
 		/* Safe to do for the current task: */
-		fpu->fpstate_active = 1;
+		fpu->initialized = 1;
 	}
 }
 EXPORT_SYMBOL_GPL(fpu__activate_curr);
@@ -271,13 +271,13 @@ void fpu__activate_fpstate_read(struct fpu *fpu)
 	if (fpu == &current->thread.fpu) {
 		fpu__save(fpu);
 	} else {
-		if (!fpu->fpstate_active) {
+		if (!fpu->initialized) {
 			fpstate_init(&fpu->state);
 			trace_x86_fpu_init_state(fpu);
 
 			trace_x86_fpu_activate_state(fpu);
 			/* Safe to do for current and for stopped child tasks: */
-			fpu->fpstate_active = 1;
+			fpu->initialized = 1;
 		}
 	}
 }
@@ -303,7 +303,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu)
 	 */
 	WARN_ON_FPU(fpu == &current->thread.fpu);
 
-	if (fpu->fpstate_active) {
+	if (fpu->initialized) {
 		/* Invalidate any lazy state: */
 		__fpu_invalidate_fpregs_state(fpu);
 	} else {
@@ -312,7 +312,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu)
 
 		trace_x86_fpu_activate_state(fpu);
 		/* Safe to do for stopped child tasks: */
-		fpu->fpstate_active = 1;
+		fpu->initialized = 1;
 	}
 }
 
@@ -354,7 +354,7 @@ void fpu__drop(struct fpu *fpu)
 	preempt_disable();
 
 	if (fpu == &current->thread.fpu) {
-		if (fpu->fpstate_active) {
+		if (fpu->initialized) {
 			/* Ignore delayed exceptions from user space */
 			asm volatile("1: fwait\n"
 				     "2:\n"
@@ -363,7 +363,7 @@ void fpu__drop(struct fpu *fpu)
 		}
 	}
 
-	fpu->fpstate_active = 0;
+	fpu->initialized = 0;
 
 	trace_x86_fpu_dropped(fpu);
 
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index d5d44c452624..7affb7e3d9a5 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -240,7 +240,7 @@ static void __init fpu__init_system_ctx_switch(void)
 	WARN_ON_FPU(!on_boot_cpu);
 	on_boot_cpu = 0;
 
-	WARN_ON_FPU(current->thread.fpu.fpstate_active);
+	WARN_ON_FPU(current->thread.fpu.initialized);
 }
 
 /*
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index c764f7405322..19e82334e811 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -16,14 +16,14 @@ int regset_fpregs_active(struct task_struct *target, const struct user_regset *r
 {
 	struct fpu *target_fpu = &target->thread.fpu;
 
-	return target_fpu->fpstate_active ? regset->n : 0;
+	return target_fpu->initialized ? regset->n : 0;
 }
 
 int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset)
 {
 	struct fpu *target_fpu = &target->thread.fpu;
 
-	if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->fpstate_active)
+	if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->initialized)
 		return regset->n;
 	else
 		return 0;
@@ -380,7 +380,7 @@ int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu)
 	struct fpu *fpu = &tsk->thread.fpu;
 	int fpvalid;
 
-	fpvalid = fpu->fpstate_active;
+	fpvalid = fpu->initialized;
 	if (fpvalid)
 		fpvalid = !fpregs_get(tsk, NULL,
 				      0, sizeof(struct user_i387_ia32_struct),
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index da68ea1c3a44..ab2dd24cfea4 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -171,7 +171,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 			sizeof(struct user_i387_ia32_struct), NULL,
 			(struct _fpstate_32 __user *) buf) ? -1 : 1;
 
-	if (fpu->fpstate_active || using_compacted_format()) {
+	if (fpu->initialized || using_compacted_format()) {
 		/* Save the live register state to the user directly. */
 		if (copy_fpregs_to_sigframe(buf_fx))
 			return -1;
@@ -315,12 +315,12 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 		int err = 0;
 
 		/*
-		 * Drop the current fpu which clears fpu->fpstate_active. This ensures
+		 * Drop the current fpu which clears fpu->initialized. This ensures
 		 * that any context-switch during the copy of the new state,
 		 * avoids the intermediate state from getting restored/saved.
 		 * Thus avoiding the new restored state from getting corrupted.
 		 * We will be ready to restore/save the state only after
-		 * fpu->fpstate_active is again set.
+		 * fpu->initialized is again set.
 		 */
 		fpu__drop(fpu);
 
@@ -342,7 +342,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 			sanitize_restored_xstate(tsk, &env, xfeatures, fx_only);
 		}
 
-		fpu->fpstate_active = 1;
+		fpu->initialized = 1;
 		preempt_disable();
 		fpu__restore(fpu);
 		preempt_enable();
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index fda1109cc355..703e76d027ee 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -867,7 +867,7 @@ const void *get_xsave_field_ptr(int xsave_state)
 {
 	struct fpu *fpu = &current->thread.fpu;
 
-	if (!fpu->fpstate_active)
+	if (!fpu->initialized)
 		return NULL;
 	/*
 	 * fpu__save() takes the CPU's xstate registers
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index e04442345fc0..4e188fda5961 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -263,7 +263,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
 		sp = (unsigned long) ka->sa.sa_restorer;
 	}
 
-	if (fpu->fpstate_active) {
+	if (fpu->initialized) {
 		sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32),
 					  &buf_fx, &math_size);
 		*fpstate = (void __user *)sp;
@@ -279,7 +279,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
 		return (void __user *)-1L;
 
 	/* save i387 and extended state */
-	if (fpu->fpstate_active &&
+	if (fpu->initialized &&
 	    copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size) < 0)
 		return (void __user *)-1L;
 
@@ -755,7 +755,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 		/*
 		 * Ensure the signal handler starts with the new fpu state.
 		 */
-		if (fpu->fpstate_active)
+		if (fpu->initialized)
 			fpu__clear(fpu);
 	}
 	signal_setup_done(failed, ksig, stepping);
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index 4d24269c071f..d7bc0eea20a5 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -44,7 +44,7 @@ int __execute_only_pkey(struct mm_struct *mm)
 	 */
 	preempt_disable();
 	if (!need_to_set_mm_pkey &&
-	    current->thread.fpu.fpstate_active &&
+	    current->thread.fpu.initialized &&
 	    !__pkru_allows_read(read_pkru(), execute_only_pkey)) {
 		preempt_enable();
 		return execute_only_pkey;
-- 
cgit v1.2.3


From 7f1487c59b7c6dcb20155f4302985da2659a2997 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 15:00:13 +0200
Subject: x86/fpu: Fix stale comments about lazy FPU logic

We don't do any lazy restore anymore, what we have are two pieces of optimization:

 - no-FPU tasks that don't save/restore the FPU context (kernel threads are such)

 - cached FPU registers maintained via the fpu->last_cpu field. This means that
   if an FPU task context switches to a non-FPU task then we can maintain the
   FPU registers as an in-FPU copies (cache), and skip the restoration of them
   once we switch back to the original FPU-using task.

Update all the comments that still referred to old 'lazy' and 'unlazy' concepts.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-31-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/core.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index c8d6032f04d0..77668d91fdc1 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -205,9 +205,6 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
 	/*
 	 * Save current FPU registers directly into the child
 	 * FPU context, without any memory-to-memory copying.
-	 * In lazy mode, if the FPU context isn't loaded into
-	 * fpregs, CR0.TS will be set and do_device_not_available
-	 * will load the FPU context.
 	 *
 	 * We have to do all this with preemption disabled,
 	 * mostly because of the FNSAVE case, because in that
@@ -285,13 +282,13 @@ void fpu__activate_fpstate_read(struct fpu *fpu)
 /*
  * This function must be called before we write a task's fpstate.
  *
- * If the task has used the FPU before then unlazy it.
+ * If the task has used the FPU before then invalidate any cached FPU registers.
  * If the task has not used the FPU before then initialize its fpstate.
  *
  * After this function call, after registers in the fpstate are
  * modified and the child task has woken up, the child task will
  * restore the modified FPU state from the modified context. If we
- * didn't clear its lazy status here then the lazy in-registers
+ * didn't clear its cached status here then the cached in-registers
  * state pending on its former CPU could be restored, corrupting
  * the modifications.
  */
@@ -304,7 +301,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu)
 	WARN_ON_FPU(fpu == &current->thread.fpu);
 
 	if (fpu->initialized) {
-		/* Invalidate any lazy state: */
+		/* Invalidate any cached state: */
 		__fpu_invalidate_fpregs_state(fpu);
 	} else {
 		fpstate_init(&fpu->state);
-- 
cgit v1.2.3


From e10078eba69859359ce8644dd423b4132a6a8913 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 15:00:14 +0200
Subject: x86/fpu: Simplify and speed up fpu__copy()

fpu__copy() has a preempt_disable()/enable() pair, which it had to do to
be able to atomically unlazy the current task when doing an FNSAVE.

But we don't unlazy tasks anymore, we always do direct saves/restores of
FPU context.

So remove both the unnecessary critical section, and update the comments.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-32-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/core.c | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 77668d91fdc1..52122dd418ae 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -206,22 +206,13 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
 	 * Save current FPU registers directly into the child
 	 * FPU context, without any memory-to-memory copying.
 	 *
-	 * We have to do all this with preemption disabled,
-	 * mostly because of the FNSAVE case, because in that
-	 * case we must not allow preemption in the window
-	 * between the FNSAVE and us marking the context lazy.
-	 *
-	 * It shouldn't be an issue as even FNSAVE is plenty
-	 * fast in terms of critical section length.
+	 * ( The function 'fails' in the FNSAVE case, which destroys
+	 *   register contents so we have to copy them back. )
 	 */
-	preempt_disable();
 	if (!copy_fpregs_to_fpstate(dst_fpu)) {
-		memcpy(&src_fpu->state, &dst_fpu->state,
-		       fpu_kernel_xstate_size);
-
+		memcpy(&src_fpu->state, &dst_fpu->state, fpu_kernel_xstate_size);
 		copy_kernel_to_fpregs(&src_fpu->state);
 	}
-	preempt_enable();
 
 	trace_x86_fpu_copy_src(src_fpu);
 	trace_x86_fpu_copy_dst(dst_fpu);
-- 
cgit v1.2.3


From 2ce03d850b9a2f17d55596ecfa86e72b5687a627 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 15:00:15 +0200
Subject: x86/fpu: Rename fpu__activate_curr() to fpu__initialize()

Rename this function to better express that it's all about
initializing the FPU state of a task which goes hand in hand
with the fpu::initialized field.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20170923130016.21448-33-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/internal.h | 2 +-
 arch/x86/kernel/fpu/core.c          | 8 ++++----
 arch/x86/kernel/fpu/signal.c        | 2 +-
 arch/x86/kvm/x86.c                  | 2 +-
 arch/x86/math-emu/fpu_entry.c       | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index b26ae05da18a..7c980aafb8aa 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -23,7 +23,7 @@
 /*
  * High level FPU state handling functions:
  */
-extern void fpu__activate_curr(struct fpu *fpu);
+extern void fpu__initialize(struct fpu *fpu);
 extern void fpu__activate_fpstate_read(struct fpu *fpu);
 extern void fpu__activate_fpstate_write(struct fpu *fpu);
 extern void fpu__save(struct fpu *fpu);
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 52122dd418ae..07db9d94b68b 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -224,7 +224,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
  * Activate the current task's in-memory FPU context,
  * if it has not been used before:
  */
-void fpu__activate_curr(struct fpu *fpu)
+void fpu__initialize(struct fpu *fpu)
 {
 	WARN_ON_FPU(fpu != &current->thread.fpu);
 
@@ -237,7 +237,7 @@ void fpu__activate_curr(struct fpu *fpu)
 		fpu->initialized = 1;
 	}
 }
-EXPORT_SYMBOL_GPL(fpu__activate_curr);
+EXPORT_SYMBOL_GPL(fpu__initialize);
 
 /*
  * This function must be called before we read a task's fpstate.
@@ -316,7 +316,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu)
  */
 void fpu__restore(struct fpu *fpu)
 {
-	fpu__activate_curr(fpu);
+	fpu__initialize(fpu);
 
 	/* Avoid __kernel_fpu_begin() right after fpregs_activate() */
 	kernel_fpu_disable();
@@ -392,7 +392,7 @@ void fpu__clear(struct fpu *fpu)
 	 */
 	if (static_cpu_has(X86_FEATURE_FPU)) {
 		preempt_disable();
-		fpu__activate_curr(fpu);
+		fpu__initialize(fpu);
 		user_fpu_begin();
 		copy_init_fpstate_to_fpregs();
 		preempt_enable();
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index ab2dd24cfea4..7fa3bdb331e9 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -280,7 +280,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 	if (!access_ok(VERIFY_READ, buf, size))
 		return -EACCES;
 
-	fpu__activate_curr(fpu);
+	fpu__initialize(fpu);
 
 	if (!static_cpu_has(X86_FEATURE_FPU))
 		return fpregs_soft_set(current, NULL,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cd17b7d9a107..03869eb7fcd6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7225,7 +7225,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	int r;
 	sigset_t sigsaved;
 
-	fpu__activate_curr(fpu);
+	fpu__initialize(fpu);
 
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index d4a7df2205b8..220638a4cb94 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -114,7 +114,7 @@ void math_emulate(struct math_emu_info *info)
 	struct desc_struct code_descriptor;
 	struct fpu *fpu = &current->thread.fpu;
 
-	fpu__activate_curr(fpu);
+	fpu__initialize(fpu);
 
 #ifdef RE_ENTRANT_CHECKING
 	if (emulating) {
-- 
cgit v1.2.3


From 369a036de206710ff27a66f9bffe78ef657648c3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 23 Sep 2017 13:37:45 +0200
Subject: x86/fpu: Rename fpu__activate_fpstate_read/write() to
 fpu__prepare_[read|write]()

As per the new nomenclature we don't 'activate' the FPU state
anymore, we initialize it. So drop the _activate_fpstate name
from these functions, which were a bit of a mouthful anyway,
and name them:

	fpu__prepare_read()
	fpu__prepare_write()

Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/internal.h |  4 ++--
 arch/x86/kernel/fpu/core.c          |  4 ++--
 arch/x86/kernel/fpu/regset.c        | 12 ++++++------
 3 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 7c980aafb8aa..e3221ffa304e 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -24,8 +24,8 @@
  * High level FPU state handling functions:
  */
 extern void fpu__initialize(struct fpu *fpu);
-extern void fpu__activate_fpstate_read(struct fpu *fpu);
-extern void fpu__activate_fpstate_write(struct fpu *fpu);
+extern void fpu__prepare_read(struct fpu *fpu);
+extern void fpu__prepare_write(struct fpu *fpu);
 extern void fpu__save(struct fpu *fpu);
 extern void fpu__restore(struct fpu *fpu);
 extern int  fpu__restore_sig(void __user *buf, int ia32_frame);
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 07db9d94b68b..f92a6593de1e 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -254,7 +254,7 @@ EXPORT_SYMBOL_GPL(fpu__initialize);
  *
  * If the task has used the FPU before then save it.
  */
-void fpu__activate_fpstate_read(struct fpu *fpu)
+void fpu__prepare_read(struct fpu *fpu)
 {
 	if (fpu == &current->thread.fpu) {
 		fpu__save(fpu);
@@ -283,7 +283,7 @@ void fpu__activate_fpstate_read(struct fpu *fpu)
  * state pending on its former CPU could be restored, corrupting
  * the modifications.
  */
-void fpu__activate_fpstate_write(struct fpu *fpu)
+void fpu__prepare_write(struct fpu *fpu)
 {
 	/*
 	 * Only stopped child tasks can be used to modify the FPU
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 19e82334e811..ee8d2f049818 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -38,7 +38,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
 	if (!boot_cpu_has(X86_FEATURE_FXSR))
 		return -ENODEV;
 
-	fpu__activate_fpstate_read(fpu);
+	fpu__prepare_read(fpu);
 	fpstate_sanitize_xstate(fpu);
 
 	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
@@ -55,7 +55,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
 	if (!boot_cpu_has(X86_FEATURE_FXSR))
 		return -ENODEV;
 
-	fpu__activate_fpstate_write(fpu);
+	fpu__prepare_write(fpu);
 	fpstate_sanitize_xstate(fpu);
 
 	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
@@ -89,7 +89,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
 
 	xsave = &fpu->state.xsave;
 
-	fpu__activate_fpstate_read(fpu);
+	fpu__prepare_read(fpu);
 
 	if (using_compacted_format()) {
 		if (kbuf)
@@ -132,7 +132,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
 
 	xsave = &fpu->state.xsave;
 
-	fpu__activate_fpstate_write(fpu);
+	fpu__prepare_write(fpu);
 
 	if (boot_cpu_has(X86_FEATURE_XSAVES)) {
 		if (kbuf)
@@ -310,7 +310,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
 	struct fpu *fpu = &target->thread.fpu;
 	struct user_i387_ia32_struct env;
 
-	fpu__activate_fpstate_read(fpu);
+	fpu__prepare_read(fpu);
 
 	if (!boot_cpu_has(X86_FEATURE_FPU))
 		return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
@@ -340,7 +340,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
 	struct user_i387_ia32_struct env;
 	int ret;
 
-	fpu__activate_fpstate_write(fpu);
+	fpu__prepare_write(fpu);
 	fpstate_sanitize_xstate(fpu);
 
 	if (!boot_cpu_has(X86_FEATURE_FPU))
-- 
cgit v1.2.3


From e63e5d5c15c6b1dba26f7cbd1b1089a1d6155db5 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 24 Sep 2017 12:59:04 +0200
Subject: x86/fpu: Introduce validate_xstate_header()

Move validation of user-supplied xstate_header into a helper function,
in preparation of calling it from both the ptrace and sigreturn syscall
paths.

The new function also considers it to be an error if *any* reserved bits
are set, whereas before we were just clearing most of them silently.

This should reduce the chance of bugs that fail to correctly validate
user-supplied XSAVE areas.  It also will expose any broken userspace
programs that set the other reserved bits; this is desirable because
such programs will lose compatibility with future CPUs and kernels if
those bits are ever used for anything.  (There shouldn't be any such
programs, and in fact in the case where the compacted format is in use
we were already validating xfeatures.  But you never know...)

Signed-off-by: Eric Biggers <ebiggers@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kevin Hao <haokexin@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Halcrow <mhalcrow@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Cc: kernel-hardening@lists.openwall.com
Link: http://lkml.kernel.org/r/20170924105913.9157-2-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/xstate.h |  4 ++++
 arch/x86/kernel/fpu/xstate.c      | 24 ++++++++++++++++++++++++
 2 files changed, 28 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 579ac2358e63..83fee2469eb7 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -52,4 +52,8 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of
 int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size);
 int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf);
 int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf);
+
+/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
+extern int validate_xstate_header(const struct xstate_header *hdr);
+
 #endif
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 703e76d027ee..2427aeea33b5 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -483,6 +483,30 @@ int using_compacted_format(void)
 	return boot_cpu_has(X86_FEATURE_XSAVES);
 }
 
+/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
+int validate_xstate_header(const struct xstate_header *hdr)
+{
+	/* No unknown or supervisor features may be set */
+	if (hdr->xfeatures & (~xfeatures_mask | XFEATURE_MASK_SUPERVISOR))
+		return -EINVAL;
+
+	/* Userspace must use the uncompacted format */
+	if (hdr->xcomp_bv)
+		return -EINVAL;
+
+	/*
+	 * If 'reserved' is shrunken to add a new field, make sure to validate
+	 * that new field here!
+	 */
+	BUILD_BUG_ON(sizeof(hdr->reserved) != 48);
+
+	/* No reserved bits may be set */
+	if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
+		return -EINVAL;
+
+	return 0;
+}
+
 static void __xstate_dump_leaves(void)
 {
 	int i;
-- 
cgit v1.2.3


From cf9df81b139b6ebaec188d73758f02ca3b2110e4 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 24 Sep 2017 12:59:05 +0200
Subject: x86/fpu: Use validate_xstate_header() to validate the xstate_header
 in xstateregs_set()

Tighten the checks in xstateregs_set().

Signed-off-by: Eric Biggers <ebiggers@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kevin Hao <haokexin@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Halcrow <mhalcrow@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Cc: kernel-hardening@lists.openwall.com
Link: http://lkml.kernel.org/r/20170924105913.9157-3-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/regset.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index ee8d2f049818..b831d5b9de99 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -141,27 +141,20 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
 			ret = copy_user_to_xstate(xsave, ubuf);
 	} else {
 		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
-
-		/* xcomp_bv must be 0 when using uncompacted format */
-		if (!ret && xsave->header.xcomp_bv)
-			ret = -EINVAL;
+		if (!ret)
+			ret = validate_xstate_header(&xsave->header);
 	}
 
-	/*
-	 * In case of failure, mark all states as init:
-	 */
-	if (ret)
-		fpstate_init(&fpu->state);
-
 	/*
 	 * mxcsr reserved bits must be masked to zero for security reasons.
 	 */
 	xsave->i387.mxcsr &= mxcsr_feature_mask;
-	xsave->header.xfeatures &= xfeatures_mask;
+
 	/*
-	 * These bits must be zero.
+	 * In case of failure, mark all states as init:
 	 */
-	memset(&xsave->header.reserved, 0, 48);
+	if (ret)
+		fpstate_init(&fpu->state);
 
 	return ret;
 }
-- 
cgit v1.2.3


From b11e2e18a7fc8eaa3d592c260d50c7129e094ded Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 24 Sep 2017 12:59:06 +0200
Subject: x86/fpu: Use validate_xstate_header() to validate the xstate_header
 in __fpu__restore_sig()

Tighten the checks in __fpu__restore_sig() and update comments.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kevin Hao <haokexin@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Halcrow <mhalcrow@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Cc: kernel-hardening@lists.openwall.com
Link: http://lkml.kernel.org/r/20170924105913.9157-4-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/signal.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 7fa3bdb331e9..fb639e70048f 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -214,8 +214,11 @@ sanitize_restored_xstate(struct task_struct *tsk,
 	struct xstate_header *header = &xsave->header;
 
 	if (use_xsave()) {
-		/* These bits must be zero. */
-		memset(header->reserved, 0, 48);
+		/*
+		 * Note: we don't need to zero the reserved bits in the
+		 * xstate_header here because we either didn't copy them at all,
+		 * or we checked earlier that they aren't set.
+		 */
 
 		/*
 		 * Init the state that is not present in the memory
@@ -224,7 +227,7 @@ sanitize_restored_xstate(struct task_struct *tsk,
 		if (fx_only)
 			header->xfeatures = XFEATURE_MASK_FPSSE;
 		else
-			header->xfeatures &= (xfeatures_mask & xfeatures);
+			header->xfeatures &= xfeatures;
 	}
 
 	if (use_fxsr()) {
@@ -308,7 +311,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 		/*
 		 * For 32-bit frames with fxstate, copy the user state to the
 		 * thread's fpu state, reconstruct fxstate from the fsave
-		 * header. Sanitize the copied state etc.
+		 * header. Validate and sanitize the copied state.
 		 */
 		struct fpu *fpu = &tsk->thread.fpu;
 		struct user_i387_ia32_struct env;
@@ -329,9 +332,8 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 		} else {
 			err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size);
 
-			/* xcomp_bv must be 0 when using uncompacted format */
-			if (!err && state_size > offsetof(struct xregs_state, header) && fpu->state.xsave.header.xcomp_bv)
-				err = -EINVAL;
+			if (!err && state_size > offsetof(struct xregs_state, header))
+				err = validate_xstate_header(&fpu->state.xsave.header);
 		}
 
 		if (err || __copy_from_user(&env, buf, sizeof(env))) {
-- 
cgit v1.2.3


From 80d8ae86b36791a545ca28ddc95133ea59bba6e0 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 24 Sep 2017 12:59:07 +0200
Subject: x86/fpu: Copy the full state_header in copy_kernel_to_xstate()

This is in preparation to verify the full xstate header as supplied by user-space.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kevin Hao <haokexin@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Halcrow <mhalcrow@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Cc: kernel-hardening@lists.openwall.com
Link: http://lkml.kernel.org/r/20170924105913.9157-5-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/xstate.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 2427aeea33b5..02591b96bb25 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1148,11 +1148,13 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
 	int i;
 	u64 xfeatures;
 	u64 allowed_features;
+	struct xstate_header hdr;
 
 	offset = offsetof(struct xregs_state, header);
-	size = sizeof(xfeatures);
+	size = sizeof(hdr);
 
-	memcpy(&xfeatures, kbuf + offset, size);
+	memcpy(&hdr, kbuf + offset, size);
+	xfeatures = hdr.xfeatures;
 
 	/*
 	 * Reject if the user sets any disabled or supervisor features:
-- 
cgit v1.2.3


From b89eda482d7849a1c146b6d0a42f4e76369bb08e Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 24 Sep 2017 12:59:08 +0200
Subject: x86/fpu: Eliminate the 'xfeatures' local variable in
 copy_kernel_to_xstate()

We have this information in the xstate_header.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kevin Hao <haokexin@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Halcrow <mhalcrow@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Cc: kernel-hardening@lists.openwall.com
Link: http://lkml.kernel.org/r/20170924105913.9157-6-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/xstate.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 02591b96bb25..c97c4a9db52a 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1146,7 +1146,6 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
 {
 	unsigned int offset, size;
 	int i;
-	u64 xfeatures;
 	u64 allowed_features;
 	struct xstate_header hdr;
 
@@ -1154,20 +1153,19 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
 	size = sizeof(hdr);
 
 	memcpy(&hdr, kbuf + offset, size);
-	xfeatures = hdr.xfeatures;
 
 	/*
 	 * Reject if the user sets any disabled or supervisor features:
 	 */
 	allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR;
 
-	if (xfeatures & ~allowed_features)
+	if (hdr.xfeatures & ~allowed_features)
 		return -EINVAL;
 
 	for (i = 0; i < XFEATURE_MAX; i++) {
 		u64 mask = ((u64)1 << i);
 
-		if (xfeatures & mask) {
+		if (hdr.xfeatures & mask) {
 			void *dst = __raw_xsave_addr(xsave, 1 << i);
 
 			offset = xstate_offsets[i];
@@ -1177,7 +1175,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
 		}
 	}
 
-	if (xfeatures_mxcsr_quirk(xfeatures)) {
+	if (xfeatures_mxcsr_quirk(hdr.xfeatures)) {
 		offset = offsetof(struct fxregs_state, mxcsr);
 		size = MXCSR_AND_FLAGS_SIZE;
 		memcpy(&xsave->i387.mxcsr, kbuf + offset, size);
@@ -1192,7 +1190,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
 	/*
 	 * Add back in the features that came in from userspace:
 	 */
-	xsave->header.xfeatures |= xfeatures;
+	xsave->header.xfeatures |= hdr.xfeatures;
 
 	return 0;
 }
-- 
cgit v1.2.3


From af95774b3ca080b0e1e651c0fc7680f3444ddda7 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 24 Sep 2017 12:59:09 +0200
Subject: x86/fpu: Use validate_xstate_header() to validate the xstate_header
 in copy_kernel_to_xstate()

Tighten the checks in copy_kernel_to_xstate().

Signed-off-by: Eric Biggers <ebiggers@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kevin Hao <haokexin@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Halcrow <mhalcrow@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Cc: kernel-hardening@lists.openwall.com
Link: http://lkml.kernel.org/r/20170924105913.9157-7-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/xstate.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index c97c4a9db52a..325db7850335 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1138,15 +1138,12 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i
 
 /*
  * Convert from a ptrace standard-format kernel buffer to kernel XSAVES format
- * and copy to the target thread. This is called from xstateregs_set() and
- * there we check the CPU has XSAVES and a whole standard-sized buffer
- * exists.
+ * and copy to the target thread. This is called from xstateregs_set().
  */
 int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
 {
 	unsigned int offset, size;
 	int i;
-	u64 allowed_features;
 	struct xstate_header hdr;
 
 	offset = offsetof(struct xregs_state, header);
@@ -1154,12 +1151,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
 
 	memcpy(&hdr, kbuf + offset, size);
 
-	/*
-	 * Reject if the user sets any disabled or supervisor features:
-	 */
-	allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR;
-
-	if (hdr.xfeatures & ~allowed_features)
+	if (validate_xstate_header(&hdr))
 		return -EINVAL;
 
 	for (i = 0; i < XFEATURE_MAX; i++) {
-- 
cgit v1.2.3


From af2c4322d986a08a6e793b74b83a62b325019c20 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 24 Sep 2017 12:59:10 +0200
Subject: x86/fpu: Copy the full header in copy_user_to_xstate()

This is in preparation to verify the full xstate header as supplied by user-space.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kevin Hao <haokexin@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Halcrow <mhalcrow@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Cc: kernel-hardening@lists.openwall.com
Link: http://lkml.kernel.org/r/20170924105913.9157-8-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/xstate.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 325db7850335..0cd7b73c25e8 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1199,13 +1199,16 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
 	int i;
 	u64 xfeatures;
 	u64 allowed_features;
+	struct xstate_header hdr;
 
 	offset = offsetof(struct xregs_state, header);
-	size = sizeof(xfeatures);
+	size = sizeof(hdr);
 
-	if (__copy_from_user(&xfeatures, ubuf + offset, size))
+	if (__copy_from_user(&hdr, ubuf + offset, size))
 		return -EFAULT;
 
+	xfeatures = hdr.xfeatures;
+
 	/*
 	 * Reject if the user sets any disabled or supervisor features:
 	 */
-- 
cgit v1.2.3


From 3d703477bcfe8bb57079d97198cf1e342fe1fef9 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 24 Sep 2017 12:59:11 +0200
Subject: x86/fpu: Eliminate the 'xfeatures' local variable in
 copy_user_to_xstate()

We now have this field in hdr.xfeatures.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kevin Hao <haokexin@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Halcrow <mhalcrow@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Cc: kernel-hardening@lists.openwall.com
Link: http://lkml.kernel.org/r/20170924105913.9157-9-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/xstate.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 0cd7b73c25e8..b6d78b78b5c2 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1197,7 +1197,6 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
 {
 	unsigned int offset, size;
 	int i;
-	u64 xfeatures;
 	u64 allowed_features;
 	struct xstate_header hdr;
 
@@ -1207,20 +1206,18 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
 	if (__copy_from_user(&hdr, ubuf + offset, size))
 		return -EFAULT;
 
-	xfeatures = hdr.xfeatures;
-
 	/*
 	 * Reject if the user sets any disabled or supervisor features:
 	 */
 	allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR;
 
-	if (xfeatures & ~allowed_features)
+	if (hdr.xfeatures & ~allowed_features)
 		return -EINVAL;
 
 	for (i = 0; i < XFEATURE_MAX; i++) {
 		u64 mask = ((u64)1 << i);
 
-		if (xfeatures & mask) {
+		if (hdr.xfeatures & mask) {
 			void *dst = __raw_xsave_addr(xsave, 1 << i);
 
 			offset = xstate_offsets[i];
@@ -1231,7 +1228,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
 		}
 	}
 
-	if (xfeatures_mxcsr_quirk(xfeatures)) {
+	if (xfeatures_mxcsr_quirk(hdr.xfeatures)) {
 		offset = offsetof(struct fxregs_state, mxcsr);
 		size = MXCSR_AND_FLAGS_SIZE;
 		if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size))
@@ -1247,7 +1244,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
 	/*
 	 * Add back in the features that came in from userspace:
 	 */
-	xsave->header.xfeatures |= xfeatures;
+	xsave->header.xfeatures |= hdr.xfeatures;
 
 	return 0;
 }
-- 
cgit v1.2.3


From 98c0fad9d60e8b2cd47e15b7bee7df343648f5bb Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 24 Sep 2017 12:59:12 +0200
Subject: x86/fpu: Use validate_xstate_header() to validate the xstate_header
 in copy_user_to_xstate()

Tighten the checks in copy_user_to_xstate().

Signed-off-by: Eric Biggers <ebiggers@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kevin Hao <haokexin@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Halcrow <mhalcrow@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Cc: kernel-hardening@lists.openwall.com
Link: http://lkml.kernel.org/r/20170924105913.9157-10-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/xstate.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index b6d78b78b5c2..f1d5476c9022 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1188,16 +1188,15 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
 }
 
 /*
- * Convert from a ptrace standard-format user-space buffer to kernel XSAVES format
- * and copy to the target thread. This is called from xstateregs_set() and
- * there we check the CPU has XSAVES and a whole standard-sized buffer
- * exists.
+ * Convert from a ptrace or sigreturn standard-format user-space buffer to
+ * kernel XSAVES format and copy to the target thread. This is called from
+ * xstateregs_set(), as well as potentially from the sigreturn() and
+ * rt_sigreturn() system calls.
  */
 int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
 {
 	unsigned int offset, size;
 	int i;
-	u64 allowed_features;
 	struct xstate_header hdr;
 
 	offset = offsetof(struct xregs_state, header);
@@ -1206,12 +1205,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
 	if (__copy_from_user(&hdr, ubuf + offset, size))
 		return -EFAULT;
 
-	/*
-	 * Reject if the user sets any disabled or supervisor features:
-	 */
-	allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR;
-
-	if (hdr.xfeatures & ~allowed_features)
+	if (validate_xstate_header(&hdr))
 		return -EINVAL;
 
 	for (i = 0; i < XFEATURE_MAX; i++) {
-- 
cgit v1.2.3


From 738f48cb5fdd5878d11934f1898aa2bcf1578289 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 24 Sep 2017 12:59:13 +0200
Subject: x86/fpu: Use using_compacted_format() instead of open coded
 X86_FEATURE_XSAVES

This is the canonical method to use.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kevin Hao <haokexin@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Halcrow <mhalcrow@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Cc: kernel-hardening@lists.openwall.com
Link: http://lkml.kernel.org/r/20170924105913.9157-11-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/regset.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index b831d5b9de99..3ea151372389 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -134,7 +134,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
 
 	fpu__prepare_write(fpu);
 
-	if (boot_cpu_has(X86_FEATURE_XSAVES)) {
+	if (using_compacted_format()) {
 		if (kbuf)
 			ret = copy_kernel_to_xstate(xsave, kbuf);
 		else
-- 
cgit v1.2.3


From cd39e1176d320157831ce030b4c869bd2d5eb142 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 6 Jun 2017 12:57:04 +0200
Subject: KVM: VMX: extract __pi_post_block
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Simple code movement patch, preparing for the next one.

Cc: Huangweidong <weidong.huang@huawei.com>
Cc: Gonglei <arei.gonglei@huawei.com>
Cc: wangxin <wangxinxin.wang@huawei.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Tested-by: Longpeng (Mike) <longpeng2@huawei.com>
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 71 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 38 insertions(+), 33 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c83d28b0ab05..0002b14307ab 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11705,6 +11705,43 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
 	kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
 }
 
+static void __pi_post_block(struct kvm_vcpu *vcpu)
+{
+	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+	struct pi_desc old, new;
+	unsigned int dest;
+	unsigned long flags;
+
+	do {
+		old.control = new.control = pi_desc->control;
+
+		dest = cpu_physical_id(vcpu->cpu);
+
+		if (x2apic_enabled())
+			new.ndst = dest;
+		else
+			new.ndst = (dest << 8) & 0xFF00;
+
+		/* Allow posting non-urgent interrupts */
+		new.sn = 0;
+
+		/* set 'NV' to 'notification vector' */
+		new.nv = POSTED_INTR_VECTOR;
+	} while (cmpxchg(&pi_desc->control, old.control,
+			new.control) != old.control);
+
+	if(vcpu->pre_pcpu != -1) {
+		spin_lock_irqsave(
+			&per_cpu(blocked_vcpu_on_cpu_lock,
+			vcpu->pre_pcpu), flags);
+		list_del(&vcpu->blocked_vcpu_list);
+		spin_unlock_irqrestore(
+			&per_cpu(blocked_vcpu_on_cpu_lock,
+			vcpu->pre_pcpu), flags);
+		vcpu->pre_pcpu = -1;
+	}
+}
+
 /*
  * This routine does the following things for vCPU which is going
  * to be blocked if VT-d PI is enabled.
@@ -11798,44 +11835,12 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
 
 static void pi_post_block(struct kvm_vcpu *vcpu)
 {
-	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-	struct pi_desc old, new;
-	unsigned int dest;
-	unsigned long flags;
-
 	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
 		!irq_remapping_cap(IRQ_POSTING_CAP)  ||
 		!kvm_vcpu_apicv_active(vcpu))
 		return;
 
-	do {
-		old.control = new.control = pi_desc->control;
-
-		dest = cpu_physical_id(vcpu->cpu);
-
-		if (x2apic_enabled())
-			new.ndst = dest;
-		else
-			new.ndst = (dest << 8) & 0xFF00;
-
-		/* Allow posting non-urgent interrupts */
-		new.sn = 0;
-
-		/* set 'NV' to 'notification vector' */
-		new.nv = POSTED_INTR_VECTOR;
-	} while (cmpxchg(&pi_desc->control, old.control,
-			new.control) != old.control);
-
-	if(vcpu->pre_pcpu != -1) {
-		spin_lock_irqsave(
-			&per_cpu(blocked_vcpu_on_cpu_lock,
-			vcpu->pre_pcpu), flags);
-		list_del(&vcpu->blocked_vcpu_list);
-		spin_unlock_irqrestore(
-			&per_cpu(blocked_vcpu_on_cpu_lock,
-			vcpu->pre_pcpu), flags);
-		vcpu->pre_pcpu = -1;
-	}
+	__pi_post_block(vcpu);
 }
 
 static void vmx_post_block(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 8b306e2f3c41939ea528e6174c88cfbfff893ce1 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 6 Jun 2017 12:57:05 +0200
Subject: KVM: VMX: avoid double list add with VT-d posted interrupts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In some cases, for example involving hot-unplug of assigned
devices, pi_post_block can forget to remove the vCPU from the
blocked_vcpu_list.  When this happens, the next call to
pi_pre_block corrupts the list.

Fix this in two ways.  First, check vcpu->pre_pcpu in pi_pre_block
and WARN instead of adding the element twice in the list.  Second,
always do the list removal in pi_post_block if vcpu->pre_pcpu is
set (not -1).

The new code keeps interrupts disabled for the whole duration of
pi_pre_block/pi_post_block.  This is not strictly necessary, but
easier to follow.  For the same reason, PI.ON is checked only
after the cmpxchg, and to handle it we just call the post-block
code.  This removes duplication of the list removal code.

Cc: Huangweidong <weidong.huang@huawei.com>
Cc: Gonglei <arei.gonglei@huawei.com>
Cc: wangxin <wangxinxin.wang@huawei.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Tested-by: Longpeng (Mike) <longpeng2@huawei.com>
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 62 ++++++++++++++++++++++--------------------------------
 1 file changed, 25 insertions(+), 37 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0002b14307ab..0bfe97e50a40 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11710,10 +11710,11 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
 	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
 	struct pi_desc old, new;
 	unsigned int dest;
-	unsigned long flags;
 
 	do {
 		old.control = new.control = pi_desc->control;
+		WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
+		     "Wakeup handler not enabled while the VCPU is blocked\n");
 
 		dest = cpu_physical_id(vcpu->cpu);
 
@@ -11730,14 +11731,10 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
 	} while (cmpxchg(&pi_desc->control, old.control,
 			new.control) != old.control);
 
-	if(vcpu->pre_pcpu != -1) {
-		spin_lock_irqsave(
-			&per_cpu(blocked_vcpu_on_cpu_lock,
-			vcpu->pre_pcpu), flags);
+	if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
+		spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
 		list_del(&vcpu->blocked_vcpu_list);
-		spin_unlock_irqrestore(
-			&per_cpu(blocked_vcpu_on_cpu_lock,
-			vcpu->pre_pcpu), flags);
+		spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
 		vcpu->pre_pcpu = -1;
 	}
 }
@@ -11757,7 +11754,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
  */
 static int pi_pre_block(struct kvm_vcpu *vcpu)
 {
-	unsigned long flags;
 	unsigned int dest;
 	struct pi_desc old, new;
 	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
@@ -11767,34 +11763,20 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
 		!kvm_vcpu_apicv_active(vcpu))
 		return 0;
 
-	vcpu->pre_pcpu = vcpu->cpu;
-	spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
-			  vcpu->pre_pcpu), flags);
-	list_add_tail(&vcpu->blocked_vcpu_list,
-		      &per_cpu(blocked_vcpu_on_cpu,
-		      vcpu->pre_pcpu));
-	spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
-			       vcpu->pre_pcpu), flags);
+	WARN_ON(irqs_disabled());
+	local_irq_disable();
+	if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
+		vcpu->pre_pcpu = vcpu->cpu;
+		spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+		list_add_tail(&vcpu->blocked_vcpu_list,
+			      &per_cpu(blocked_vcpu_on_cpu,
+				       vcpu->pre_pcpu));
+		spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+	}
 
 	do {
 		old.control = new.control = pi_desc->control;
 
-		/*
-		 * We should not block the vCPU if
-		 * an interrupt is posted for it.
-		 */
-		if (pi_test_on(pi_desc) == 1) {
-			spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
-					  vcpu->pre_pcpu), flags);
-			list_del(&vcpu->blocked_vcpu_list);
-			spin_unlock_irqrestore(
-					&per_cpu(blocked_vcpu_on_cpu_lock,
-					vcpu->pre_pcpu), flags);
-			vcpu->pre_pcpu = -1;
-
-			return 1;
-		}
-
 		WARN((pi_desc->sn == 1),
 		     "Warning: SN field of posted-interrupts "
 		     "is set before blocking\n");
@@ -11819,7 +11801,12 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
 	} while (cmpxchg(&pi_desc->control, old.control,
 			new.control) != old.control);
 
-	return 0;
+	/* We should not block the vCPU if an interrupt is posted for it.  */
+	if (pi_test_on(pi_desc) == 1)
+		__pi_post_block(vcpu);
+
+	local_irq_enable();
+	return (vcpu->pre_pcpu == -1);
 }
 
 static int vmx_pre_block(struct kvm_vcpu *vcpu)
@@ -11835,12 +11822,13 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
 
 static void pi_post_block(struct kvm_vcpu *vcpu)
 {
-	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
-		!irq_remapping_cap(IRQ_POSTING_CAP)  ||
-		!kvm_vcpu_apicv_active(vcpu))
+	if (vcpu->pre_pcpu == -1)
 		return;
 
+	WARN_ON(irqs_disabled());
+	local_irq_disable();
 	__pi_post_block(vcpu);
+	local_irq_enable();
 }
 
 static void vmx_post_block(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 31afb2ea2b10a7d17ce3db4cdb0a12b63b2fe08a Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 6 Jun 2017 12:57:06 +0200
Subject: KVM: VMX: simplify and fix vmx_vcpu_pi_load
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The simplify part: do not touch pi_desc.nv, we can set it when the
VCPU is first created.  Likewise, pi_desc.sn is only handled by
vmx_vcpu_pi_load, do not touch it in __pi_post_block.

The fix part: do not check kvm_arch_has_assigned_device, instead
check the SN bit to figure out whether vmx_vcpu_pi_put ran before.
This matches what the previous patch did in pi_post_block.

Cc: Huangweidong <weidong.huang@huawei.com>
Cc: Gonglei <arei.gonglei@huawei.com>
Cc: wangxin <wangxinxin.wang@huawei.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Tested-by: Longpeng (Mike) <longpeng2@huawei.com>
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 68 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 35 insertions(+), 33 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0bfe97e50a40..b9d2140eb212 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2202,43 +2202,41 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 	struct pi_desc old, new;
 	unsigned int dest;
 
-	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
-		!irq_remapping_cap(IRQ_POSTING_CAP)  ||
-		!kvm_vcpu_apicv_active(vcpu))
+	/*
+	 * In case of hot-plug or hot-unplug, we may have to undo
+	 * vmx_vcpu_pi_put even if there is no assigned device.  And we
+	 * always keep PI.NDST up to date for simplicity: it makes the
+	 * code easier, and CPU migration is not a fast path.
+	 */
+	if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
+		return;
+
+	/*
+	 * First handle the simple case where no cmpxchg is necessary; just
+	 * allow posting non-urgent interrupts.
+	 *
+	 * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
+	 * PI.NDST: pi_post_block will do it for us and the wakeup_handler
+	 * expects the VCPU to be on the blocked_vcpu_list that matches
+	 * PI.NDST.
+	 */
+	if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
+	    vcpu->cpu == cpu) {
+		pi_clear_sn(pi_desc);
 		return;
+	}
 
+	/* The full case.  */
 	do {
 		old.control = new.control = pi_desc->control;
 
-		/*
-		 * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there
-		 * are two possible cases:
-		 * 1. After running 'pre_block', context switch
-		 *    happened. For this case, 'sn' was set in
-		 *    vmx_vcpu_put(), so we need to clear it here.
-		 * 2. After running 'pre_block', we were blocked,
-		 *    and woken up by some other guy. For this case,
-		 *    we don't need to do anything, 'pi_post_block'
-		 *    will do everything for us. However, we cannot
-		 *    check whether it is case #1 or case #2 here
-		 *    (maybe, not needed), so we also clear sn here,
-		 *    I think it is not a big deal.
-		 */
-		if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) {
-			if (vcpu->cpu != cpu) {
-				dest = cpu_physical_id(cpu);
-
-				if (x2apic_enabled())
-					new.ndst = dest;
-				else
-					new.ndst = (dest << 8) & 0xFF00;
-			}
+		dest = cpu_physical_id(cpu);
 
-			/* set 'NV' to 'notification vector' */
-			new.nv = POSTED_INTR_VECTOR;
-		}
+		if (x2apic_enabled())
+			new.ndst = dest;
+		else
+			new.ndst = (dest << 8) & 0xFF00;
 
-		/* Allow posting non-urgent interrupts */
 		new.sn = 0;
 	} while (cmpxchg(&pi_desc->control, old.control,
 			new.control) != old.control);
@@ -9592,6 +9590,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 
 	vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
 
+	/*
+	 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
+	 * or POSTED_INTR_WAKEUP_VECTOR.
+	 */
+	vmx->pi_desc.nv = POSTED_INTR_VECTOR;
+	vmx->pi_desc.sn = 1;
+
 	return &vmx->vcpu;
 
 free_vmcs:
@@ -11723,9 +11728,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
 		else
 			new.ndst = (dest << 8) & 0xFF00;
 
-		/* Allow posting non-urgent interrupts */
-		new.sn = 0;
-
 		/* set 'NV' to 'notification vector' */
 		new.nv = POSTED_INTR_VECTOR;
 	} while (cmpxchg(&pi_desc->control, old.control,
-- 
cgit v1.2.3


From 0d805ee70a69eabd38160dc199e183ac2f13fe4b Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@oracle.com>
Date: Wed, 27 Sep 2017 02:41:25 -0700
Subject: xen/mmu: Call xen_cleanhighmap() with 4MB aligned for page tables
 mapping

When bootup a PVM guest with large memory(Ex.240GB), XEN provided initial
mapping overlaps with kernel module virtual space. When mapping in this space
is cleared by xen_cleanhighmap(), in certain case there could be an 2MB mapping
left. This is due to XEN initialize 4MB aligned mapping but xen_cleanhighmap()
finish at 2MB boundary.

When module loading is just on top of the 2MB space, got below warning:

WARNING: at mm/vmalloc.c:106 vmap_pte_range+0x14e/0x190()
Call Trace:
 [<ffffffff81117083>] warn_alloc_failed+0xf3/0x160
 [<ffffffff81146022>] __vmalloc_area_node+0x182/0x1c0
 [<ffffffff810ac91e>] ? module_alloc_update_bounds+0x1e/0x80
 [<ffffffff81145df7>] __vmalloc_node_range+0xa7/0x110
 [<ffffffff810ac91e>] ? module_alloc_update_bounds+0x1e/0x80
 [<ffffffff8103ca54>] module_alloc+0x64/0x70
 [<ffffffff810ac91e>] ? module_alloc_update_bounds+0x1e/0x80
 [<ffffffff810ac91e>] module_alloc_update_bounds+0x1e/0x80
 [<ffffffff810ac9a7>] move_module+0x27/0x150
 [<ffffffff810aefa0>] layout_and_allocate+0x120/0x1b0
 [<ffffffff810af0a8>] load_module+0x78/0x640
 [<ffffffff811ff90b>] ? security_file_permission+0x8b/0x90
 [<ffffffff810af6d2>] sys_init_module+0x62/0x1e0
 [<ffffffff815154c2>] system_call_fastpath+0x16/0x1b

Then the mapping of 2MB is cleared, finally oops when the page in that space is
accessed.

BUG: unable to handle kernel paging request at ffff880022600000
IP: [<ffffffff81260877>] clear_page_c_e+0x7/0x10
PGD 1788067 PUD 178c067 PMD 22434067 PTE 0
Oops: 0002 [#1] SMP
Call Trace:
 [<ffffffff81116ef7>] ? prep_new_page+0x127/0x1c0
 [<ffffffff81117d42>] get_page_from_freelist+0x1e2/0x550
 [<ffffffff81133010>] ? ii_iovec_copy_to_user+0x90/0x140
 [<ffffffff81119c9d>] __alloc_pages_nodemask+0x12d/0x230
 [<ffffffff81155516>] alloc_pages_vma+0xc6/0x1a0
 [<ffffffff81006ffd>] ? pte_mfn_to_pfn+0x7d/0x100
 [<ffffffff81134cfb>] do_anonymous_page+0x16b/0x350
 [<ffffffff81139c34>] handle_pte_fault+0x1e4/0x200
 [<ffffffff8100712e>] ? xen_pmd_val+0xe/0x10
 [<ffffffff810052c9>] ? __raw_callee_save_xen_pmd_val+0x11/0x1e
 [<ffffffff81139dab>] handle_mm_fault+0x15b/0x270
 [<ffffffff81510c10>] do_page_fault+0x140/0x470
 [<ffffffff8150d7d5>] page_fault+0x25/0x30

Call xen_cleanhighmap() with 4MB aligned for page tables mapping to fix it.
The unnecessory call of xen_cleanhighmap() in DEBUG mode is also removed.

-v2: add comment about XEN alignment from Juergen.

References: https://lists.xen.org/archives/html/xen-devel/2012-07/msg01562.html
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@oracle.com>
Reviewed-by: Juergen Gross <jgross@suse.com>

[boris: added 'xen/mmu' tag to commit subject]
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/xen/mmu_pv.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 509f560bd0c6..58b09fcadbaa 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1238,21 +1238,16 @@ static void __init xen_pagetable_cleanhighmap(void)
 	 * from _brk_limit way up to the max_pfn_mapped (which is the end of
 	 * the ramdisk). We continue on, erasing PMD entries that point to page
 	 * tables - do note that they are accessible at this stage via __va.
-	 * For good measure we also round up to the PMD - which means that if
+	 * As Xen is aligning the memory end to a 4MB boundary, for good
+	 * measure we also round up to PMD_SIZE * 2 - which means that if
 	 * anybody is using __ka address to the initial boot-stack - and try
 	 * to use it - they are going to crash. The xen_start_info has been
 	 * taken care of already in xen_setup_kernel_pagetable. */
 	addr = xen_start_info->pt_base;
-	size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE);
+	size = xen_start_info->nr_pt_frames * PAGE_SIZE;
 
-	xen_cleanhighmap(addr, addr + size);
+	xen_cleanhighmap(addr, roundup(addr + size, PMD_SIZE * 2));
 	xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base));
-#ifdef DEBUG
-	/* This is superfluous and is not necessary, but you know what
-	 * lets do it. The MODULES_VADDR -> MODULES_END should be clear of
-	 * anything at this stage. */
-	xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
-#endif
 }
 #endif
 
-- 
cgit v1.2.3


From c0a1666bcb2a33e84187a15eabdcd54056be9a97 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 28 Sep 2017 17:58:41 +0200
Subject: KVM: VMX: use cmpxchg64

This fixes a compilation failure on 32-bit systems.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b9d2140eb212..7f62c94196d1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2238,8 +2238,8 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 			new.ndst = (dest << 8) & 0xFF00;
 
 		new.sn = 0;
-	} while (cmpxchg(&pi_desc->control, old.control,
-			new.control) != old.control);
+	} while (cmpxchg64(&pi_desc->control, old.control,
+			   new.control) != old.control);
 }
 
 static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
@@ -11730,8 +11730,8 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
 
 		/* set 'NV' to 'notification vector' */
 		new.nv = POSTED_INTR_VECTOR;
-	} while (cmpxchg(&pi_desc->control, old.control,
-			new.control) != old.control);
+	} while (cmpxchg64(&pi_desc->control, old.control,
+			   new.control) != old.control);
 
 	if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
 		spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
@@ -11800,8 +11800,8 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
 
 		/* set 'NV' to 'wakeup vector' */
 		new.nv = POSTED_INTR_WAKEUP_VECTOR;
-	} while (cmpxchg(&pi_desc->control, old.control,
-			new.control) != old.control);
+	} while (cmpxchg64(&pi_desc->control, old.control,
+			   new.control) != old.control);
 
 	/* We should not block the vCPU if an interrupt is posted for it.  */
 	if (pi_test_on(pi_desc) == 1)
-- 
cgit v1.2.3


From 520a13c530aeb5f63e011d668c42db1af19ed349 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Thu, 28 Sep 2017 16:58:26 -0500
Subject: x86/asm: Fix inline asm call constraints for GCC 4.4

The kernel test bot (run by Xiaolong Ye) reported that the following commit:

  f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang")

is causing double faults in a kernel compiled with GCC 4.4.

Linus subsequently diagnosed the crash pattern and the buggy commit and found that
the issue is with this code:

  register unsigned int __asm_call_sp asm("esp");
  #define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp)

Even on a 64-bit kernel, it's using ESP instead of RSP.  That causes GCC
to produce the following bogus code:

  ffffffff8147461d:       89 e0                   mov    %esp,%eax
  ffffffff8147461f:       4c 89 f7                mov    %r14,%rdi
  ffffffff81474622:       4c 89 fe                mov    %r15,%rsi
  ffffffff81474625:       ba 20 00 00 00          mov    $0x20,%edx
  ffffffff8147462a:       89 c4                   mov    %eax,%esp
  ffffffff8147462c:       e8 bf 52 05 00          callq  ffffffff814c98f0 <copy_user_generic_unrolled>

Despite the absurdity of it backing up and restoring the stack pointer
for no reason, the bug is actually the fact that it's only backing up
and restoring the lower 32 bits of the stack pointer.  The upper 32 bits
are getting cleared out, corrupting the stack pointer.

So change the '__asm_call_sp' register variable to be associated with
the actual full-size stack pointer.

This also requires changing the __ASM_SEL() macro to be based on the
actual compiled arch size, rather than the CONFIG value, because
CONFIG_X86_64 compiles some files with '-m32' (e.g., realmode and vdso).
Otherwise Clang fails to build the kernel because it complains about the
use of a 64-bit register (RSP) in a 32-bit file.

Reported-and-Bisected-and-Tested-by: kernel test robot <xiaolong.ye@intel.com>
Diagnosed-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: LKP <lkp@01.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthias Kaehlcke <mka@chromium.org>
Cc: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang")
Link: http://lkml.kernel.org/r/20170928215826.6sdpmwtkiydiytim@treble
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/asm.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index c1eadbaf1115..30c3c9ac784a 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -11,10 +11,12 @@
 # define __ASM_FORM_COMMA(x) " " #x ","
 #endif
 
-#ifdef CONFIG_X86_32
+#ifndef __x86_64__
+/* 32 bit */
 # define __ASM_SEL(a,b) __ASM_FORM(a)
 # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a)
 #else
+/* 64 bit */
 # define __ASM_SEL(a,b) __ASM_FORM(b)
 # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b)
 #endif
@@ -139,7 +141,7 @@
  * gets set up by the containing function.  If you forget to do this, objtool
  * may print a "call without frame pointer save/setup" warning.
  */
-register unsigned int __asm_call_sp asm("esp");
+register unsigned long __asm_call_sp asm(_ASM_SP);
 #define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp)
 #endif
 
-- 
cgit v1.2.3


From 305d0ab4764d36a02c8e7cddb67099aca65351ce Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Thu, 28 Sep 2017 18:16:44 -0700
Subject: KVM: nVMX: Fix nested #PF intends to break L1's vmlauch/vmresume
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

------------[ cut here ]------------
 WARNING: CPU: 4 PID: 5280 at /home/kernel/linux/arch/x86/kvm//vmx.c:11394 nested_vmx_vmexit+0xc2b/0xd70 [kvm_intel]
 CPU: 4 PID: 5280 Comm: qemu-system-x86 Tainted: G        W  OE   4.13.0+ #17
 RIP: 0010:nested_vmx_vmexit+0xc2b/0xd70 [kvm_intel]
 Call Trace:
  ? emulator_read_emulated+0x15/0x20 [kvm]
  ? segmented_read+0xae/0xf0 [kvm]
  vmx_inject_page_fault_nested+0x60/0x70 [kvm_intel]
  ? vmx_inject_page_fault_nested+0x60/0x70 [kvm_intel]
  x86_emulate_instruction+0x733/0x810 [kvm]
  vmx_handle_exit+0x2f4/0xda0 [kvm_intel]
  ? kvm_arch_vcpu_ioctl_run+0xd2f/0x1c60 [kvm]
  kvm_arch_vcpu_ioctl_run+0xdab/0x1c60 [kvm]
  ? kvm_arch_vcpu_load+0x62/0x230 [kvm]
  kvm_vcpu_ioctl+0x340/0x700 [kvm]
  ? kvm_vcpu_ioctl+0x340/0x700 [kvm]
  ? __fget+0xfc/0x210
  do_vfs_ioctl+0xa4/0x6a0
  ? __fget+0x11d/0x210
  SyS_ioctl+0x79/0x90
  entry_SYSCALL_64_fastpath+0x23/0xc2

A nested #PF is triggered during L0 emulating instruction for L2. However, it
doesn't consider we should not break L1's vmlauch/vmresme. This patch fixes
it by queuing the #PF exception instead ,requesting an immediate VM exit from
L2 and keeping the exception for L1 pending for a subsequent nested VM exit.

This should actually work all the time, making vmx_inject_page_fault_nested
totally unnecessary.  However, that's not working yet, so this patch can work
around the issue in the meanwhile.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7f62c94196d1..5bfa353f6354 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9845,7 +9845,8 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 
 	WARN_ON(!is_guest_mode(vcpu));
 
-	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) {
+	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
+		!to_vmx(vcpu)->nested.nested_run_pending) {
 		vmcs12->vm_exit_intr_error_code = fault->error_code;
 		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
 				  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
-- 
cgit v1.2.3


From b862789aa5186d5ea3a024b7cfe0f80c3a38b980 Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Fri, 29 Sep 2017 19:01:45 +0800
Subject: kvm/x86: Handle async PF in RCU read-side critical sections

Sasha Levin reported a WARNING:

| WARNING: CPU: 0 PID: 6974 at kernel/rcu/tree_plugin.h:329
| rcu_preempt_note_context_switch kernel/rcu/tree_plugin.h:329 [inline]
| WARNING: CPU: 0 PID: 6974 at kernel/rcu/tree_plugin.h:329
| rcu_note_context_switch+0x16c/0x2210 kernel/rcu/tree.c:458
...
| CPU: 0 PID: 6974 Comm: syz-fuzzer Not tainted 4.13.0-next-20170908+ #246
| Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
| 1.10.1-1ubuntu1 04/01/2014
| Call Trace:
...
| RIP: 0010:rcu_preempt_note_context_switch kernel/rcu/tree_plugin.h:329 [inline]
| RIP: 0010:rcu_note_context_switch+0x16c/0x2210 kernel/rcu/tree.c:458
| RSP: 0018:ffff88003b2debc8 EFLAGS: 00010002
| RAX: 0000000000000001 RBX: 1ffff1000765bd85 RCX: 0000000000000000
| RDX: 1ffff100075d7882 RSI: ffffffffb5c7da20 RDI: ffff88003aebc410
| RBP: ffff88003b2def30 R08: dffffc0000000000 R09: 0000000000000001
| R10: 0000000000000000 R11: 0000000000000000 R12: ffff88003b2def08
| R13: 0000000000000000 R14: ffff88003aebc040 R15: ffff88003aebc040
| __schedule+0x201/0x2240 kernel/sched/core.c:3292
| schedule+0x113/0x460 kernel/sched/core.c:3421
| kvm_async_pf_task_wait+0x43f/0x940 arch/x86/kernel/kvm.c:158
| do_async_page_fault+0x72/0x90 arch/x86/kernel/kvm.c:271
| async_page_fault+0x22/0x30 arch/x86/entry/entry_64.S:1069
| RIP: 0010:format_decode+0x240/0x830 lib/vsprintf.c:1996
| RSP: 0018:ffff88003b2df520 EFLAGS: 00010283
| RAX: 000000000000003f RBX: ffffffffb5d1e141 RCX: ffff88003b2df670
| RDX: 0000000000000001 RSI: dffffc0000000000 RDI: ffffffffb5d1e140
| RBP: ffff88003b2df560 R08: dffffc0000000000 R09: 0000000000000000
| R10: ffff88003b2df718 R11: 0000000000000000 R12: ffff88003b2df5d8
| R13: 0000000000000064 R14: ffffffffb5d1e140 R15: 0000000000000000
| vsnprintf+0x173/0x1700 lib/vsprintf.c:2136
| sprintf+0xbe/0xf0 lib/vsprintf.c:2386
| proc_self_get_link+0xfb/0x1c0 fs/proc/self.c:23
| get_link fs/namei.c:1047 [inline]
| link_path_walk+0x1041/0x1490 fs/namei.c:2127
...

This happened when the host hit a page fault, and delivered it as in an
async page fault, while the guest was in an RCU read-side critical
section.  The guest then tries to reschedule in kvm_async_pf_task_wait(),
but rcu_preempt_note_context_switch() would treat the reschedule as a
sleep in RCU read-side critical section, which is not allowed (even in
preemptible RCU).  Thus the WARN.

To cure this, make kvm_async_pf_task_wait() go to the halt path if the
PF happens in a RCU read-side critical section.

Reported-by: Sasha Levin <levinsasha928@gmail.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: stable@vger.kernel.org
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kernel/kvm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index aa60a08b65b1..e675704fa6f7 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -140,7 +140,8 @@ void kvm_async_pf_task_wait(u32 token)
 
 	n.token = token;
 	n.cpu = smp_processor_id();
-	n.halted = is_idle_task(current) || preempt_count() > 1;
+	n.halted = is_idle_task(current) || preempt_count() > 1 ||
+		   rcu_preempt_depth();
 	init_swait_queue_head(&n.wq);
 	hlist_add_head(&n.link, &b->list);
 	raw_spin_unlock(&b->lock);
-- 
cgit v1.2.3


From bc829ee36e0ec92383c9d9b88fe08f00d4d592f8 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Fri, 29 Sep 2017 11:24:19 -0500
Subject: x86/mm: Disable branch profiling in mem_encrypt.c

Some routines in mem_encrypt.c are called very early in the boot process,
e.g. sme_encrypt_kernel(). When CONFIG_TRACE_BRANCH_PROFILING=y is defined
the resulting branch profiling associated with the check to see if SME is
active results in a kernel crash. Disable branch profiling for
mem_encrypt.c by defining DISABLE_BRANCH_PROFILING before including any
header files.

Reported-by: kernel test robot <lkp@01.org>
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Acked-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170929162419.6016.53390.stgit@tlendack-t1.amdoffice.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/mem_encrypt.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 3fcc8e01683b..16c5f37933a2 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -10,6 +10,8 @@
  * published by the Free Software Foundation.
  */
 
+#define DISABLE_BRANCH_PROFILING
+
 #include <linux/linkage.h>
 #include <linux/init.h>
 #include <linux/mm.h>
-- 
cgit v1.2.3


From 196bd485ee4f03ce4c690bfcf38138abfcd0a4bc Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Fri, 29 Sep 2017 17:15:36 +0300
Subject: x86/asm: Use register variable to get stack pointer value

Currently we use current_stack_pointer() function to get the value
of the stack pointer register. Since commit:

  f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang")

... we have a stack register variable declared. It can be used instead of
current_stack_pointer() function which allows to optimize away some
excessive "mov %rsp, %<dst>" instructions:

 -mov    %rsp,%rdx
 -sub    %rdx,%rax
 -cmp    $0x3fff,%rax
 -ja     ffffffff810722fd <ist_begin_non_atomic+0x2d>

 +sub    %rsp,%rax
 +cmp    $0x3fff,%rax
 +ja     ffffffff810722fa <ist_begin_non_atomic+0x2a>

Remove current_stack_pointer(), rename __asm_call_sp to current_stack_pointer
and use it instead of the removed function.

Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170929141537.29167-1-aryabinin@virtuozzo.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/asm.h         |  4 ++--
 arch/x86/include/asm/thread_info.h | 11 -----------
 arch/x86/kernel/irq_32.c           |  6 +++---
 arch/x86/kernel/traps.c            |  2 +-
 arch/x86/mm/tlb.c                  |  2 +-
 5 files changed, 7 insertions(+), 18 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 30c3c9ac784a..b0dc91f4bedc 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -141,8 +141,8 @@
  * gets set up by the containing function.  If you forget to do this, objtool
  * may print a "call without frame pointer save/setup" warning.
  */
-register unsigned long __asm_call_sp asm(_ASM_SP);
-#define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp)
+register unsigned long current_stack_pointer asm(_ASM_SP);
+#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer)
 #endif
 
 #endif /* _ASM_X86_ASM_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 5161da1a0fa0..89e7eeb5cec1 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -158,17 +158,6 @@ struct thread_info {
  */
 #ifndef __ASSEMBLY__
 
-static inline unsigned long current_stack_pointer(void)
-{
-	unsigned long sp;
-#ifdef CONFIG_X86_64
-	asm("mov %%rsp,%0" : "=g" (sp));
-#else
-	asm("mov %%esp,%0" : "=g" (sp));
-#endif
-	return sp;
-}
-
 /*
  * Walks up the stack frames to make sure that the specified object is
  * entirely contained by a single stack frame.
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 1f38d9a4d9de..d4eb450144fd 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -64,7 +64,7 @@ static void call_on_stack(void *func, void *stack)
 
 static inline void *current_stack(void)
 {
-	return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1));
+	return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1));
 }
 
 static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
@@ -88,7 +88,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
 
 	/* Save the next esp at the bottom of the stack */
 	prev_esp = (u32 *)irqstk;
-	*prev_esp = current_stack_pointer();
+	*prev_esp = current_stack_pointer;
 
 	if (unlikely(overflow))
 		call_on_stack(print_stack_overflow, isp);
@@ -139,7 +139,7 @@ void do_softirq_own_stack(void)
 
 	/* Push the previous esp onto the stack */
 	prev_esp = (u32 *)irqstk;
-	*prev_esp = current_stack_pointer();
+	*prev_esp = current_stack_pointer;
 
 	call_on_stack(__do_softirq, isp);
 }
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 34ea3651362e..67db4f43309e 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -142,7 +142,7 @@ void ist_begin_non_atomic(struct pt_regs *regs)
 	 * from double_fault.
 	 */
 	BUG_ON((unsigned long)(current_top_of_stack() -
-			       current_stack_pointer()) >= THREAD_SIZE);
+			       current_stack_pointer) >= THREAD_SIZE);
 
 	preempt_enable_no_resched();
 }
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 93fe97cce581..49d9778376d7 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -191,7 +191,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			 * mapped in the new pgd, we'll double-fault.  Forcibly
 			 * map it.
 			 */
-			unsigned int index = pgd_index(current_stack_pointer());
+			unsigned int index = pgd_index(current_stack_pointer);
 			pgd_t *pgd = next->pgd + index;
 
 			if (unlikely(pgd_none(*pgd)))
-- 
cgit v1.2.3


From ee213fc72fd67d0988525af501534f4cb924d1e9 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Tue, 3 Oct 2017 08:51:43 -0500
Subject: kprobes/x86: Set up frame pointer in kprobe trampoline

Richard Weinberger saw an unwinder warning when running bcc's opensnoop:

  WARNING: kernel stack frame pointer at ffff99ef4076bea0 in opensnoop:2008 has bad value 0000000000000008
  unwind stack type:0 next_sp:          (null) mask:0x2 graph_idx:0
  ...
  ffff99ef4076be88: ffff99ef4076bea0 (0xffff99ef4076bea0)
  ffff99ef4076be90: ffffffffac442721 (optimized_callback +0x81/0x90)
  ...

A lockdep stack trace was initiated from inside a kprobe handler, when
the unwinder noticed a bad frame pointer on the stack.  The bad frame
pointer is related to the fact that the kprobe optprobe trampoline
doesn't save the frame pointer before calling into optimized_callback().

Reported-and-tested-by: Richard Weinberger <richard@sigma-star.at>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S . Miller <davem@davemloft.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/7aef2f8ecd75c2f505ef9b80490412262cf4a44c.1507038547.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/kprobes/common.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
index db2182d63ed0..3fc0f9a794cb 100644
--- a/arch/x86/kernel/kprobes/common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -3,6 +3,15 @@
 
 /* Kprobes and Optprobes common header */
 
+#include <asm/asm.h>
+
+#ifdef CONFIG_FRAME_POINTER
+# define SAVE_RBP_STRING "	push %" _ASM_BP "\n" \
+			 "	mov  %" _ASM_SP ", %" _ASM_BP "\n"
+#else
+# define SAVE_RBP_STRING "	push %" _ASM_BP "\n"
+#endif
+
 #ifdef CONFIG_X86_64
 #define SAVE_REGS_STRING			\
 	/* Skip cs, ip, orig_ax. */		\
@@ -17,7 +26,7 @@
 	"	pushq %r10\n"			\
 	"	pushq %r11\n"			\
 	"	pushq %rbx\n"			\
-	"	pushq %rbp\n"			\
+	SAVE_RBP_STRING				\
 	"	pushq %r12\n"			\
 	"	pushq %r13\n"			\
 	"	pushq %r14\n"			\
@@ -48,7 +57,7 @@
 	"	pushl %es\n"			\
 	"	pushl %ds\n"			\
 	"	pushl %eax\n"			\
-	"	pushl %ebp\n"			\
+	SAVE_RBP_STRING				\
 	"	pushl %edi\n"			\
 	"	pushl %esi\n"			\
 	"	pushl %edx\n"			\
-- 
cgit v1.2.3


From b664d57f39d01e775204d4f1a7e2f8bda77bc549 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 3 Oct 2017 16:18:02 +0900
Subject: kprobes/x86: Remove IRQ disabling from jprobe handlers

Jprobes actually don't need to disable IRQs while calling
handlers, because of how we specify the kernel interface in
Documentation/kprobes.txt:

-----
 Probe handlers are run with preemption disabled.  Depending on the
 architecture and optimization state, handlers may also run with
 interrupts disabled (e.g., kretprobe handlers and optimized kprobe
 handlers run without interrupt disabled on x86/x86-64).
-----

So let's remove IRQ disabling from jprobes too.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E . McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/150701508194.32266.14458959863314097305.stgit@devbox
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/kprobes/core.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index f0153714ddac..0742491cbb73 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -1080,8 +1080,6 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 	 * raw stack chunk with redzones:
 	 */
 	__memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr));
-	regs->flags &= ~X86_EFLAGS_IF;
-	trace_hardirqs_off();
 	regs->ip = (unsigned long)(jp->entry);
 
 	/*
-- 
cgit v1.2.3


From 90caccdd8cc0215705f18b92771b449b01e2474a Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Tue, 3 Oct 2017 15:37:20 -0700
Subject: bpf: fix bpf_tail_call() x64 JIT

- bpf prog_array just like all other types of bpf array accepts 32-bit index.
  Clarify that in the comment.
- fix x64 JIT of bpf_tail_call which was incorrectly loading 8 instead of 4 bytes
- tighten corresponding check in the interpreter to stay consistent

The JIT bug can be triggered after introduction of BPF_F_NUMA_NODE flag
in commit 96eabe7a40aa in 4.14. Before that the map_flags would stay zero and
though JIT code is wrong it will check bounds correctly.
Hence two fixes tags. All other JITs don't have this problem.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Fixes: 96eabe7a40aa ("bpf: Allow selecting numa node during map creation")
Fixes: b52f00e6a715 ("x86: bpf_jit: implement bpf_tail_call() helper")
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/x86/net/bpf_jit_comp.c | 4 ++--
 include/uapi/linux/bpf.h    | 2 +-
 kernel/bpf/core.c           | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 8c9573660d51..0554e8aef4d5 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog)
 	/* if (index >= array->map.max_entries)
 	 *   goto out;
 	 */
-	EMIT4(0x48, 0x8B, 0x46,                   /* mov rax, qword ptr [rsi + 16] */
+	EMIT2(0x89, 0xD2);                        /* mov edx, edx */
+	EMIT3(0x39, 0x56,                         /* cmp dword ptr [rsi + 16], edx */
 	      offsetof(struct bpf_array, map.max_entries));
-	EMIT3(0x48, 0x39, 0xD0);                  /* cmp rax, rdx */
 #define OFFSET1 43 /* number of bytes to jump */
 	EMIT2(X86_JBE, OFFSET1);                  /* jbe out */
 	label1 = cnt;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 43ab5c402f98..f90860d1f897 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -312,7 +312,7 @@ union bpf_attr {
  *     jump into another BPF program
  *     @ctx: context pointer passed to next program
  *     @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY
- *     @index: index inside array that selects specific program to run
+ *     @index: 32-bit index inside array that selects specific program to run
  *     Return: 0 on success or negative error
  *
  * int bpf_clone_redirect(skb, ifindex, flags)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 917cc04a0a94..7b62df86be1d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1022,7 +1022,7 @@ select_insn:
 		struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
 		struct bpf_array *array = container_of(map, struct bpf_array, map);
 		struct bpf_prog *prog;
-		u64 index = BPF_R3;
+		u32 index = BPF_R3;
 
 		if (unlikely(index >= array->map.max_entries))
 			goto out;
-- 
cgit v1.2.3


From a2b7861bb33b2538420bb5d8554153484d3f961f Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Tue, 3 Oct 2017 21:36:51 +0800
Subject: kvm/x86: Avoid async PF preempting the kernel incorrectly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, in PREEMPT_COUNT=n kernel, kvm_async_pf_task_wait() could call
schedule() to reschedule in some cases.  This could result in
accidentally ending the current RCU read-side critical section early,
causing random memory corruption in the guest, or otherwise preempting
the currently running task inside between preempt_disable and
preempt_enable.

The difficulty to handle this well is because we don't know whether an
async PF delivered in a preemptible section or RCU read-side critical section
for PREEMPT_COUNT=n, since preempt_disable()/enable() and rcu_read_lock/unlock()
are both no-ops in that case.

To cure this, we treat any async PF interrupting a kernel context as one
that cannot be preempted, preventing kvm_async_pf_task_wait() from choosing
the schedule() path in that case.

To do so, a second parameter for kvm_async_pf_task_wait() is introduced,
so that we know whether it's called from a context interrupting the
kernel, and the parameter is set properly in all the callsites.

Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: stable@vger.kernel.org
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/include/asm/kvm_para.h |  4 ++--
 arch/x86/kernel/kvm.c           | 14 ++++++++++----
 arch/x86/kvm/mmu.c              |  2 +-
 3 files changed, 13 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index bc62e7cbf1b1..59ad3d132353 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -88,7 +88,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
 bool kvm_para_available(void);
 unsigned int kvm_arch_para_features(void);
 void __init kvm_guest_init(void);
-void kvm_async_pf_task_wait(u32 token);
+void kvm_async_pf_task_wait(u32 token, int interrupt_kernel);
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_pf_reason(void);
 extern void kvm_disable_steal_time(void);
@@ -103,7 +103,7 @@ static inline void kvm_spinlock_init(void)
 
 #else /* CONFIG_KVM_GUEST */
 #define kvm_guest_init() do {} while (0)
-#define kvm_async_pf_task_wait(T) do {} while(0)
+#define kvm_async_pf_task_wait(T, I) do {} while(0)
 #define kvm_async_pf_task_wake(T) do {} while(0)
 
 static inline bool kvm_para_available(void)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index e675704fa6f7..8bb9594d0761 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -117,7 +117,11 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
 	return NULL;
 }
 
-void kvm_async_pf_task_wait(u32 token)
+/*
+ * @interrupt_kernel: Is this called from a routine which interrupts the kernel
+ * 		      (other than user space)?
+ */
+void kvm_async_pf_task_wait(u32 token, int interrupt_kernel)
 {
 	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
 	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
@@ -140,8 +144,10 @@ void kvm_async_pf_task_wait(u32 token)
 
 	n.token = token;
 	n.cpu = smp_processor_id();
-	n.halted = is_idle_task(current) || preempt_count() > 1 ||
-		   rcu_preempt_depth();
+	n.halted = is_idle_task(current) ||
+		   (IS_ENABLED(CONFIG_PREEMPT_COUNT)
+		    ? preempt_count() > 1 || rcu_preempt_depth()
+		    : interrupt_kernel);
 	init_swait_queue_head(&n.wq);
 	hlist_add_head(&n.link, &b->list);
 	raw_spin_unlock(&b->lock);
@@ -269,7 +275,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
 	case KVM_PV_REASON_PAGE_NOT_PRESENT:
 		/* page is swapped out by the host. */
 		prev_state = exception_enter();
-		kvm_async_pf_task_wait((u32)read_cr2());
+		kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs));
 		exception_exit(prev_state);
 		break;
 	case KVM_PV_REASON_PAGE_READY:
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index eca30c1eb1d9..106d4a029a8a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3837,7 +3837,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 	case KVM_PV_REASON_PAGE_NOT_PRESENT:
 		vcpu->arch.apf.host_apf_reason = 0;
 		local_irq_disable();
-		kvm_async_pf_task_wait(fault_address);
+		kvm_async_pf_task_wait(fault_address, 0);
 		local_irq_enable();
 		break;
 	case KVM_PV_REASON_PAGE_READY:
-- 
cgit v1.2.3


From 262e681183ddcdb24d64a2f993e41a226adcec29 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Mon, 2 Oct 2017 11:28:36 +0200
Subject: x86/mce: Hide mca_cfg

Now that lguest is gone, put it in the internal header which should be
used only by MCA/RAS code.

Add missing header guards while at it.

No functional change.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20171002092836.22971-3-bp@alien8.de
---
 arch/x86/include/asm/mce.h                | 1 -
 arch/x86/kernel/cpu/mcheck/mce-internal.h | 7 +++++++
 arch/x86/kernel/cpu/mcheck/mce_amd.c      | 2 ++
 3 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 181264989db5..8edac1de2e35 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -187,7 +187,6 @@ struct mca_msr_regs {
 
 extern struct mce_vendor_flags mce_flags;
 
-extern struct mca_config mca_cfg;
 extern struct mca_msr_regs msr_ops;
 
 enum mce_notifier_prios {
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 098530a93bb7..debb974fd17d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -1,3 +1,6 @@
+#ifndef __X86_MCE_INTERNAL_H__
+#define __X86_MCE_INTERNAL_H__
+
 #include <linux/device.h>
 #include <asm/mce.h>
 
@@ -108,3 +111,7 @@ static inline void mce_work_trigger(void)	{ }
 static inline void mce_register_injector_chain(struct notifier_block *nb)	{ }
 static inline void mce_unregister_injector_chain(struct notifier_block *nb)	{ }
 #endif
+
+extern struct mca_config mca_cfg;
+
+#endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 40e28ed77fbf..486f640b02ef 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -28,6 +28,8 @@
 #include <asm/msr.h>
 #include <asm/trace/irq_vectors.h>
 
+#include "mce-internal.h"
+
 #define NR_BLOCKS         5
 #define THRESHOLD_MAX     0xFFF
 #define INT_TYPE_APIC     0x00020000
-- 
cgit v1.2.3


From f26e60167d8b5b1c67b3efd4cb5672da446bdb0e Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Wed, 4 Oct 2017 10:39:05 -0500
Subject: x86/kvm: Move kvm_fastop_exception to .fixup section
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When compiling the kernel with the '-frecord-gcc-switches' flag, objtool
complains:

  arch/x86/kvm/emulate.o: warning: objtool: .GCC.command.line+0x0: special: can't find new instruction

And also the kernel fails to link.

The problem is that the 'kvm_fastop_exception' code gets placed into the
throwaway '.GCC.command.line' section instead of '.text'.

Exception fixup code is conventionally placed in the '.fixup' section,
so put it there where it belongs.

Reported-and-tested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/emulate.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a36254cbf776..d90cdc77e077 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -425,8 +425,10 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
 	#op " %al \n\t" \
 	FOP_RET
 
-asm(".global kvm_fastop_exception \n"
-    "kvm_fastop_exception: xor %esi, %esi; ret");
+asm(".pushsection .fixup, \"ax\"\n"
+    ".global kvm_fastop_exception \n"
+    "kvm_fastop_exception: xor %esi, %esi; ret\n"
+    ".popsection");
 
 FOP_START(setcc)
 FOP_SETCC(seto)
-- 
cgit v1.2.3


From e42eef4ba38806b18c4a74f0c276fb2e0b548173 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 4 Oct 2017 12:28:18 +0200
Subject: KVM: add X86_LOCAL_APIC dependency
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The rework of the posted interrupt handling broke building without
support for the local APIC:

ERROR: "boot_cpu_physical_apicid" [arch/x86/kvm/kvm-intel.ko] undefined!

That configuration is probably not particularly useful anyway, so
we can avoid the randconfig failures by adding a Kconfig dependency.

Fixes: 8b306e2f3c41 ("KVM: VMX: avoid double list add with VT-d posted interrupts")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 3ea624452f93..3c48bc8bf08c 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -23,6 +23,7 @@ config KVM
 	depends on HIGH_RES_TIMERS
 	# for TASKSTATS/TASK_DELAY_ACCT:
 	depends on NET && MULTIUSER
+	depends on X86_LOCAL_APIC
 	select PREEMPT_NOTIFIERS
 	select MMU_NOTIFIER
 	select ANON_INODES
-- 
cgit v1.2.3


From 924c6b900cfdf376b07bccfd80e62b21914f8a5a Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Sun, 8 Oct 2017 21:53:05 -0700
Subject: x86/mm/64: Fix reboot interaction with CR4.PCIDE

Trying to reboot via real mode fails with PCID on: long mode cannot
be exited while CR4.PCIDE is set.  (No, I have no idea why, but the
SDM and actual CPUs are in agreement here.)  The result is a GPF and
a hang instead of a reboot.

I didn't catch this in testing because neither my computer nor my VM
reboots this way.  I can trigger it with reboot=bios, though.

Fixes: 660da7c9228f ("x86/mm: Enable CR4.PCIDE on supported systems")
Reported-and-tested-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Link: https://lkml.kernel.org/r/f1e7d965998018450a7a70c2823873686a8b21c0.1507524746.git.luto@kernel.org
---
 arch/x86/kernel/reboot.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 54180fa6f66f..add33f600531 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -105,6 +105,10 @@ void __noreturn machine_real_restart(unsigned int type)
 	load_cr3(initial_page_table);
 #else
 	write_cr3(real_mode_header->trampoline_pgd);
+
+	/* Exiting long mode will fail if CR4.PCIDE is set. */
+	if (static_cpu_has(X86_FEATURE_PCID))
+		cr4_clear_bits(X86_CR4_PCIDE);
 #endif
 
 	/* Jump to the identity-mapped low memory code */
-- 
cgit v1.2.3


From 6b32c126d33d5cb379bca280ab8acedc1ca978ff Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@googlemail.com>
Date: Thu, 5 Oct 2017 20:30:12 +0200
Subject: x86/alternatives: Fix alt_max_short macro to really be a max()

The alt_max_short() macro in asm/alternative.h does not work as
intended, leading to nasty bugs. E.g. alt_max_short("1", "3")
evaluates to 3, but alt_max_short("3", "1") evaluates to 1 -- not
exactly the maximum of 1 and 3.

In fact, I had to learn it the hard way by crashing my kernel in not
so funny ways by attempting to make use of the ALTENATIVE_2 macro
with alternatives where the first one was larger than the second
one.

According to [1] and commit dbe4058a6a44 ("x86/alternatives: Fix
ALTERNATIVE_2 padding generation properly") the right handed side
should read "-(-(a < b))" not "-(-(a - b))". Fix that, to make the
macro work as intended.

While at it, fix up the comments regarding the additional "-", too.
It's not about gas' usage of s32 but brain dead logic of having a
"true" value of -1 for the < operator ... *sigh*

Btw., the one in asm/alternative-asm.h is correct. And, apparently,
all current users of ALTERNATIVE_2() pass same sized alternatives,
avoiding to hit the bug.

[1] http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax

Reviewed-and-tested-by: Borislav Petkov <bp@suse.de>
Fixes: dbe4058a6a44 ("x86/alternatives: Fix ALTERNATIVE_2 padding generation properly")
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/1507228213-13095-1-git-send-email-minipli@googlemail.com
---
 arch/x86/include/asm/alternative-asm.h | 4 +++-
 arch/x86/include/asm/alternative.h     | 6 +++---
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index e7636bac7372..6c98821fef5e 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -62,8 +62,10 @@
 #define new_len2		145f-144f
 
 /*
- * max without conditionals. Idea adapted from:
+ * gas compatible max based on the idea from:
  * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
+ *
+ * The additional "-" is needed because gas uses a "true" value of -1.
  */
 #define alt_max_short(a, b)	((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
 
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index c096624137ae..ccbe24e697c4 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -103,12 +103,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
 	alt_end_marker ":\n"
 
 /*
- * max without conditionals. Idea adapted from:
+ * gas compatible max based on the idea from:
  * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
  *
- * The additional "-" is needed because gas works with s32s.
+ * The additional "-" is needed because gas uses a "true" value of -1.
  */
-#define alt_max_short(a, b)	"((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))"
+#define alt_max_short(a, b)	"((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))"
 
 /*
  * Pad the second replacement alternative with additional NOPs if it is
-- 
cgit v1.2.3


From 62dd86ac01f9fb6386d7f8c6b389c3ea4582a50a Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 9 Oct 2017 20:20:02 -0500
Subject: x86/unwind: Fix dereference of untrusted pointer

Tetsuo Handa and Fengguang Wu reported a panic in the unwinder:

  BUG: unable to handle kernel NULL pointer dereference at 000001f2
  IP: update_stack_state+0xd4/0x340
  *pde = 00000000

  Oops: 0000 [#1] PREEMPT SMP
  CPU: 0 PID: 18728 Comm: 01-cpu-hotplug Not tainted 4.13.0-rc4-00170-gb09be67 #592
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-20161025_171302-gandalf 04/01/2014
  task: bb0b53c0 task.stack: bb3ac000
  EIP: update_stack_state+0xd4/0x340
  EFLAGS: 00010002 CPU: 0
  EAX: 0000a570 EBX: bb3adccb ECX: 0000f401 EDX: 0000a570
  ESI: 00000001 EDI: 000001ba EBP: bb3adc6b ESP: bb3adc3f
   DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
  CR0: 80050033 CR2: 000001f2 CR3: 0b3a7000 CR4: 00140690
  DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
  DR6: fffe0ff0 DR7: 00000400
  Call Trace:
   ? unwind_next_frame+0xea/0x400
   ? __unwind_start+0xf5/0x180
   ? __save_stack_trace+0x81/0x160
   ? save_stack_trace+0x20/0x30
   ? __lock_acquire+0xfa5/0x12f0
   ? lock_acquire+0x1c2/0x230
   ? tick_periodic+0x3a/0xf0
   ? _raw_spin_lock+0x42/0x50
   ? tick_periodic+0x3a/0xf0
   ? tick_periodic+0x3a/0xf0
   ? debug_smp_processor_id+0x12/0x20
   ? tick_handle_periodic+0x23/0xc0
   ? local_apic_timer_interrupt+0x63/0x70
   ? smp_trace_apic_timer_interrupt+0x235/0x6a0
   ? trace_apic_timer_interrupt+0x37/0x3c
   ? strrchr+0x23/0x50
  Code: 0f 95 c1 89 c7 89 45 e4 0f b6 c1 89 c6 89 45 dc 8b 04 85 98 cb 74 bc 88 4d e3 89 45 f0 83 c0 01 84 c9 89 04 b5 98 cb 74 bc 74 3b <8b> 47 38 8b 57 34 c6 43 1d 01 25 00 00 02 00 83 e2 03 09 d0 83
  EIP: update_stack_state+0xd4/0x340 SS:ESP: 0068:bb3adc3f
  CR2: 00000000000001f2
  ---[ end trace 0d147fd4aba8ff50 ]---
  Kernel panic - not syncing: Fatal exception in interrupt

On x86-32, after decoding a frame pointer to get a regs address,
regs_size() dereferences the regs pointer when it checks regs->cs to see
if the regs are user mode.  This is dangerous because it's possible that
what looks like a decoded frame pointer is actually a corrupt value, and
we don't want the unwinder to make things worse.

Instead of calling regs_size() on an unsafe pointer, just assume they're
kernel regs to start with.  Later, once it's safe to access the regs, we
can do the user mode check and corresponding safety check for the
remaining two regs.

Reported-and-tested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reported-and-tested-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: LKP <lkp@01.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 5ed8d8bb38c5 ("x86/unwind: Move common code into update_stack_state()")
Link: http://lkml.kernel.org/r/7f95b9a6993dec7674b3f3ab3dcd3294f7b9644d.1507597785.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/unwind_frame.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index d145a0b1f529..d05637726c10 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -184,6 +184,12 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp)
 	return (struct pt_regs *)(regs & ~0x1);
 }
 
+#ifdef CONFIG_X86_32
+#define KERNEL_REGS_SIZE (sizeof(struct pt_regs) - 2*sizeof(long))
+#else
+#define KERNEL_REGS_SIZE (sizeof(struct pt_regs))
+#endif
+
 static bool update_stack_state(struct unwind_state *state,
 			       unsigned long *next_bp)
 {
@@ -202,7 +208,7 @@ static bool update_stack_state(struct unwind_state *state,
 	regs = decode_frame_pointer(next_bp);
 	if (regs) {
 		frame = (unsigned long *)regs;
-		len = regs_size(regs);
+		len = KERNEL_REGS_SIZE;
 		state->got_irq = true;
 	} else {
 		frame = next_bp;
@@ -226,6 +232,14 @@ static bool update_stack_state(struct unwind_state *state,
 	    frame < prev_frame_end)
 		return false;
 
+	/*
+	 * On 32-bit with user mode regs, make sure the last two regs are safe
+	 * to access:
+	 */
+	if (IS_ENABLED(CONFIG_X86_32) && regs && user_mode(regs) &&
+	    !on_stack(info, frame, len + 2*sizeof(long)))
+		return false;
+
 	/* Move state to the next frame: */
 	if (regs) {
 		state->regs = regs;
-- 
cgit v1.2.3


From 5c99b692cfd62f6915ed7ee7ed3c38d65ba3feab Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 9 Oct 2017 20:20:03 -0500
Subject: x86/unwind: Use MSB for frame pointer encoding on 32-bit

On x86-32, Tetsuo Handa and Fengguang Wu reported unwinder warnings
like:

  WARNING: kernel stack regs at f60bb9c8 in swapper:1 has bad 'bp' value 0ba00000

And also there were some stack dumps with a bunch of unreliable '?'
symbols after an apic_timer_interrupt symbol, meaning the unwinder got
confused when it tried to read the regs.

The cause of those issues is that, with GCC 4.8 (and possibly older),
there are cases where GCC misaligns the stack pointer in a leaf function
for no apparent reason:

  c124a388 <acpi_rs_move_data>:
  c124a388:       55                      push   %ebp
  c124a389:       89 e5                   mov    %esp,%ebp
  c124a38b:       57                      push   %edi
  c124a38c:       56                      push   %esi
  c124a38d:       89 d6                   mov    %edx,%esi
  c124a38f:       53                      push   %ebx
  c124a390:       31 db                   xor    %ebx,%ebx
  c124a392:       83 ec 03                sub    $0x3,%esp
  ...
  c124a3e3:       83 c4 03                add    $0x3,%esp
  c124a3e6:       5b                      pop    %ebx
  c124a3e7:       5e                      pop    %esi
  c124a3e8:       5f                      pop    %edi
  c124a3e9:       5d                      pop    %ebp
  c124a3ea:       c3                      ret

If an interrupt occurs in such a function, the regs on the stack will be
unaligned, which breaks the frame pointer encoding assumption.  So on
32-bit, use the MSB instead of the LSB to encode the regs.

This isn't an issue on 64-bit, because interrupts align the stack before
writing to it.

Reported-and-tested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reported-and-tested-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: LKP <lkp@01.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/279a26996a482ca716605c7dbc7f2db9d8d91e81.1507597785.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_32.S      |  4 ++--
 arch/x86/kernel/unwind_frame.c | 12 ++++++++++++
 2 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 8a13d468635a..50e0d2bc4528 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -176,7 +176,7 @@
 /*
  * This is a sneaky trick to help the unwinder find pt_regs on the stack.  The
  * frame pointer is replaced with an encoded pointer to pt_regs.  The encoding
- * is just setting the LSB, which makes it an invalid stack address and is also
+ * is just clearing the MSB, which makes it an invalid stack address and is also
  * a signal to the unwinder that it's a pt_regs pointer in disguise.
  *
  * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the
@@ -185,7 +185,7 @@
 .macro ENCODE_FRAME_POINTER
 #ifdef CONFIG_FRAME_POINTER
 	mov %esp, %ebp
-	orl $0x1, %ebp
+	andl $0x7fffffff, %ebp
 #endif
 .endm
 
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index d05637726c10..4949bbc95f75 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -174,6 +174,7 @@ static bool is_last_task_frame(struct unwind_state *state)
  * This determines if the frame pointer actually contains an encoded pointer to
  * pt_regs on the stack.  See ENCODE_FRAME_POINTER.
  */
+#ifdef CONFIG_X86_64
 static struct pt_regs *decode_frame_pointer(unsigned long *bp)
 {
 	unsigned long regs = (unsigned long)bp;
@@ -183,6 +184,17 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp)
 
 	return (struct pt_regs *)(regs & ~0x1);
 }
+#else
+static struct pt_regs *decode_frame_pointer(unsigned long *bp)
+{
+	unsigned long regs = (unsigned long)bp;
+
+	if (regs & 0x80000000)
+		return NULL;
+
+	return (struct pt_regs *)(regs | 0x80000000);
+}
+#endif
 
 #ifdef CONFIG_X86_32
 #define KERNEL_REGS_SIZE (sizeof(struct pt_regs) - 2*sizeof(long))
-- 
cgit v1.2.3


From 99bd28a49b150e4b938313a63b5532d95ba77885 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 9 Oct 2017 20:20:04 -0500
Subject: x86/unwind: Align stack pointer in unwinder dump

When printing the unwinder dump, the stack pointer could be unaligned,
for one of two reasons:

- stack corruption; or

- GCC created an unaligned stack.

There's no way for the unwinder to tell the difference between the two,
so we have to assume one or the other.  GCC unaligned stacks are very
rare, and have only been spotted before GCC 5.  Presumably, if we're
doing an unwinder stack dump, stack corruption is more likely than a
GCC unaligned stack.  So always align the stack before starting the
dump.

Reported-and-tested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reported-and-tested-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: LKP <lkp@01.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/2f540c515946ab09ed267e1a1d6421202a0cce08.1507597785.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/unwind_frame.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 4949bbc95f75..81aca077fbb6 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -44,7 +44,8 @@ static void unwind_dump(struct unwind_state *state)
 			state->stack_info.type, state->stack_info.next_sp,
 			state->stack_mask, state->graph_idx);
 
-	for (sp = state->orig_sp; sp; sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+	for (sp = PTR_ALIGN(state->orig_sp, sizeof(long)); sp;
+	     sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
 		if (get_stack_info(sp, state->task, &stack_info, &visit_mask))
 			break;
 
-- 
cgit v1.2.3


From d4a2d031dd42f9594107d317e2a9a0c6d73ad46b Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 9 Oct 2017 20:20:05 -0500
Subject: x86/unwind: Disable unwinder warnings on 32-bit

x86-32 doesn't have stack validation, so in most cases it doesn't make
sense to warn about bad frame pointers.

Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: LKP <lkp@01.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/a69658760800bf281e6353248c23e0fa0acf5230.1507597785.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/unwind_frame.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 81aca077fbb6..3dc26f95d46e 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -355,6 +355,13 @@ bad_address:
 	    state->regs->sp < (unsigned long)task_pt_regs(state->task))
 		goto the_end;
 
+	/*
+	 * There are some known frame pointer issues on 32-bit.  Disable
+	 * unwinder warnings on 32-bit until it gets objtool support.
+	 */
+	if (IS_ENABLED(CONFIG_X86_32))
+		goto the_end;
+
 	if (state->regs) {
 		printk_deferred_once(KERN_WARNING
 			"WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n",
-- 
cgit v1.2.3


From 629eb703d3e46aa570c6c91235d38fd09ed8c58b Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Mon, 9 Oct 2017 18:26:55 +0100
Subject: perf/x86/intel/uncore: Fix memory leaks on allocation failures

Currently if an allocation fails then the error return paths
don't free up any currently allocated pmus[].boxes and pmus causing
a memory leak.  Add an error clean up exit path that frees these
objects.

Detected by CoverityScan, CID#711632 ("Resource Leak")

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kernel-janitors@vger.kernel.org
Fixes: 087bfbb03269 ("perf/x86: Add generic Intel uncore PMU support")
Link: http://lkml.kernel.org/r/20171009172655.6132-1-colin.king@canonical.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/events/intel/uncore.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 1c5390f1cf09..d45e06346f14 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -822,7 +822,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
 		pmus[i].type	= type;
 		pmus[i].boxes	= kzalloc(size, GFP_KERNEL);
 		if (!pmus[i].boxes)
-			return -ENOMEM;
+			goto err;
 	}
 
 	type->pmus = pmus;
@@ -836,7 +836,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
 		attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) +
 					sizeof(*attr_group), GFP_KERNEL);
 		if (!attr_group)
-			return -ENOMEM;
+			goto err;
 
 		attrs = (struct attribute **)(attr_group + 1);
 		attr_group->name = "events";
@@ -849,7 +849,15 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
 	}
 
 	type->pmu_group = &uncore_pmu_attr_group;
+
 	return 0;
+
+err:
+	for (i = 0; i < type->num_boxes; i++)
+		kfree(pmus[i].boxes);
+	kfree(pmus);
+
+	return -ENOMEM;
 }
 
 static int __init
-- 
cgit v1.2.3


From a3b7424392924e778b608e30ee321f7b10cc94b8 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Fri, 6 Oct 2017 17:48:54 +0200
Subject: x86/hyperv: Clear vCPU banks between calls to avoid flushing unneeded
 vCPUs

hv_flush_pcpu_ex structures are not cleared between calls for performance
reasons (they're variable size up to PAGE_SIZE each) but we must clear
hv_vp_set.bank_contents part of it to avoid flushing unneeded vCPUs. The
rest of the structure is formed correctly.

To do the clearing in an efficient way stash the maximum possible vCPU
number (this may differ from Linux CPU id).

Reported-by: Jork Loeser <Jork.Loeser@microsoft.com>
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: K. Y. Srinivasan <kys@microsoft.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: devel@linuxdriverproject.org
Link: http://lkml.kernel.org/r/20171006154854.18092-1-vkuznets@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/hyperv/hv_init.c       |  5 +++++
 arch/x86/hyperv/mmu.c           | 17 ++++++++++++-----
 arch/x86/include/asm/mshyperv.h |  1 +
 3 files changed, 18 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 1a8eb550c40f..a5db63f728a2 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -85,6 +85,8 @@ EXPORT_SYMBOL_GPL(hyperv_cs);
 u32 *hv_vp_index;
 EXPORT_SYMBOL_GPL(hv_vp_index);
 
+u32 hv_max_vp_index;
+
 static int hv_cpu_init(unsigned int cpu)
 {
 	u64 msr_vp_index;
@@ -93,6 +95,9 @@ static int hv_cpu_init(unsigned int cpu)
 
 	hv_vp_index[smp_processor_id()] = msr_vp_index;
 
+	if (msr_vp_index > hv_max_vp_index)
+		hv_max_vp_index = msr_vp_index;
+
 	return 0;
 }
 
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index 39e7f6e50919..9502d04c0c95 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -76,6 +76,18 @@ static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush,
 {
 	int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1;
 
+	/* valid_bank_mask can represent up to 64 banks */
+	if (hv_max_vp_index / 64 >= 64)
+		return 0;
+
+	/*
+	 * Clear all banks up to the maximum possible bank as hv_flush_pcpu_ex
+	 * structs are not cleared between calls, we risk flushing unneeded
+	 * vCPUs otherwise.
+	 */
+	for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++)
+		flush->hv_vp_set.bank_contents[vcpu_bank] = 0;
+
 	/*
 	 * Some banks may end up being empty but this is acceptable.
 	 */
@@ -83,11 +95,6 @@ static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush,
 		vcpu = hv_cpu_number_to_vp_number(cpu);
 		vcpu_bank = vcpu / 64;
 		vcpu_offset = vcpu % 64;
-
-		/* valid_bank_mask can represent up to 64 banks */
-		if (vcpu_bank >= 64)
-			return 0;
-
 		__set_bit(vcpu_offset, (unsigned long *)
 			  &flush->hv_vp_set.bank_contents[vcpu_bank]);
 		if (vcpu_bank >= nr_bank)
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 738503e1f80c..530f448fddaf 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -289,6 +289,7 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
  * to this information.
  */
 extern u32 *hv_vp_index;
+extern u32 hv_max_vp_index;
 
 /**
  * hv_cpu_number_to_vp_number() - Map CPU to VP.
-- 
cgit v1.2.3


From 60d73a7c96601434dfdb56d5b9167ff3b850d8d7 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Thu, 5 Oct 2017 13:39:24 +0200
Subject: x86/hyperv: Don't use percpu areas for pcpu_flush/pcpu_flush_ex
 structures

hv_do_hypercall() does virt_to_phys() translation and with some configs
(CONFIG_SLAB) this doesn't work for percpu areas, we pass wrong memory to
hypervisor and get #GP. We could use working slow_virt_to_phys() instead
but doing so kills the performance.

Move pcpu_flush/pcpu_flush_ex structures out of percpu areas and
allocate memory on first call. The additional level of indirection gives
us a small performance penalty, in future we may consider introducing
hypercall functions which avoid virt_to_phys() conversion and cache
physical addresses of pcpu_flush/pcpu_flush_ex structures somewhere.

Reported-by: Simon Xiao <sixiao@microsoft.com>
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Jork Loeser <Jork.Loeser@microsoft.com>
Cc: K. Y. Srinivasan <kys@microsoft.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: devel@linuxdriverproject.org
Link: http://lkml.kernel.org/r/20171005113924.28021-1-vkuznets@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/hyperv/mmu.c | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index 9502d04c0c95..f21cebbb5f6c 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -36,9 +36,9 @@ struct hv_flush_pcpu_ex {
 /* Each gva in gva_list encodes up to 4096 pages to flush */
 #define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
 
-static struct hv_flush_pcpu __percpu *pcpu_flush;
+static struct hv_flush_pcpu __percpu **pcpu_flush;
 
-static struct hv_flush_pcpu_ex __percpu *pcpu_flush_ex;
+static struct hv_flush_pcpu_ex __percpu **pcpu_flush_ex;
 
 /*
  * Fills in gva_list starting from offset. Returns the number of items added.
@@ -109,6 +109,7 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
 				    const struct flush_tlb_info *info)
 {
 	int cpu, vcpu, gva_n, max_gvas;
+	struct hv_flush_pcpu **flush_pcpu;
 	struct hv_flush_pcpu *flush;
 	u64 status = U64_MAX;
 	unsigned long flags;
@@ -123,7 +124,17 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
 
 	local_irq_save(flags);
 
-	flush = this_cpu_ptr(pcpu_flush);
+	flush_pcpu = this_cpu_ptr(pcpu_flush);
+
+	if (unlikely(!*flush_pcpu))
+		*flush_pcpu = page_address(alloc_page(GFP_ATOMIC));
+
+	flush = *flush_pcpu;
+
+	if (unlikely(!flush)) {
+		local_irq_restore(flags);
+		goto do_native;
+	}
 
 	if (info->mm) {
 		flush->address_space = virt_to_phys(info->mm->pgd);
@@ -180,6 +191,7 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
 				       const struct flush_tlb_info *info)
 {
 	int nr_bank = 0, max_gvas, gva_n;
+	struct hv_flush_pcpu_ex **flush_pcpu;
 	struct hv_flush_pcpu_ex *flush;
 	u64 status = U64_MAX;
 	unsigned long flags;
@@ -194,7 +206,17 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
 
 	local_irq_save(flags);
 
-	flush = this_cpu_ptr(pcpu_flush_ex);
+	flush_pcpu = this_cpu_ptr(pcpu_flush_ex);
+
+	if (unlikely(!*flush_pcpu))
+		*flush_pcpu = page_address(alloc_page(GFP_ATOMIC));
+
+	flush = *flush_pcpu;
+
+	if (unlikely(!flush)) {
+		local_irq_restore(flags);
+		goto do_native;
+	}
 
 	if (info->mm) {
 		flush->address_space = virt_to_phys(info->mm->pgd);
@@ -273,7 +295,7 @@ void hyper_alloc_mmu(void)
 		return;
 
 	if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
-		pcpu_flush = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
+		pcpu_flush = alloc_percpu(struct hv_flush_pcpu *);
 	else
-		pcpu_flush_ex = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
+		pcpu_flush_ex = alloc_percpu(struct hv_flush_pcpu_ex *);
 }
-- 
cgit v1.2.3


From ab7ff471aa5db670197070760f022622793da7e5 Mon Sep 17 00:00:00 2001
From: Marcelo Henrique Cerri <marcelo.cerri@canonical.com>
Date: Thu, 5 Oct 2017 10:34:29 -0300
Subject: x86/hyperv: Fix hypercalls with extended CPU ranges for TLB flushing

Do not consider the fixed size of hv_vp_set when passing the variable
header size to hv_do_rep_hypercall().

The Hyper-V hypervisor specification states that for a hypercall with a
variable header only the size of the variable portion should be supplied
via the input control.

For HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX/LIST_EX calls that means the
fixed portion of hv_vp_set should not be considered.

That fixes random failures of some applications that are unexpectedly
killed with SIGBUS or SIGSEGV.

Signed-off-by: Marcelo Henrique Cerri <marcelo.cerri@canonical.com>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Jork Loeser <Jork.Loeser@microsoft.com>
Cc: Josh Poulson <jopoulso@microsoft.com>
Cc: K. Y. Srinivasan <kys@microsoft.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Simon Xiao <sixiao@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: devel@linuxdriverproject.org
Fixes: 628f54cc6451 ("x86/hyper-v: Support extended CPU ranges for TLB flush hypercalls")
Link: http://lkml.kernel.org/r/1507210469-29065-1-git-send-email-marcelo.cerri@canonical.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/hyperv/mmu.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index f21cebbb5f6c..9cc9e1c1e2db 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -251,18 +251,18 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
 		flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
 		status = hv_do_rep_hypercall(
 			HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
-			0, nr_bank + 2, flush, NULL);
+			0, nr_bank, flush, NULL);
 	} else if (info->end &&
 		   ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) {
 		status = hv_do_rep_hypercall(
 			HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
-			0, nr_bank + 2, flush, NULL);
+			0, nr_bank, flush, NULL);
 	} else {
 		gva_n = fill_gva_list(flush->gva_list, nr_bank,
 				      info->start, info->end);
 		status = hv_do_rep_hypercall(
 			HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX,
-			gva_n, nr_bank + 2, flush, NULL);
+			gva_n, nr_bank, flush, NULL);
 	}
 
 	local_irq_restore(flags);
-- 
cgit v1.2.3


From eac779aa509d453a55da0ea4302bdb79c4e0854f Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@oracle.com>
Date: Sun, 8 Oct 2017 19:58:46 -0700
Subject: xen/vcpu: Use a unified name about cpu hotplug state for pv and pvhvm

As xen_cpuhp_setup is called by PV and PVHVM, the name of "x86/xen/hvm_guest"
is confusing.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@oracle.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/xen/enlighten.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 0e7ef69e8531..d669e9d89001 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -93,11 +93,11 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int),
 	int rc;
 
 	rc = cpuhp_setup_state_nocalls(CPUHP_XEN_PREPARE,
-				       "x86/xen/hvm_guest:prepare",
+				       "x86/xen/guest:prepare",
 				       cpu_up_prepare_cb, cpu_dead_cb);
 	if (rc >= 0) {
 		rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
-					       "x86/xen/hvm_guest:online",
+					       "x86/xen/guest:online",
 					       xen_cpu_up_online, NULL);
 		if (rc < 0)
 			cpuhp_remove_state_nocalls(CPUHP_XEN_PREPARE);
-- 
cgit v1.2.3


From fd19d3b45164466a4adce7cbff448ba9189e1427 Mon Sep 17 00:00:00 2001
From: Ladi Prosek <lprosek@redhat.com>
Date: Thu, 5 Oct 2017 11:10:22 +0200
Subject: KVM: nVMX: update last_nonleaf_level when initializing nested EPT

The function updates context->root_level but didn't call
update_last_nonleaf_level so the previous and potentially wrong value
was used for page walks.  For example, a zero value of last_nonleaf_level
would allow a potential out-of-bounds access in arch/x86/mmu/paging_tmpl.h's
walk_addr_generic function (CVE-2017-12188).

Fixes: 155a97a3d7c78b46cef6f1a973c831bc5a4f82bb
Signed-off-by: Ladi Prosek <lprosek@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 106d4a029a8a..3c25f20115bc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4555,6 +4555,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 
 	update_permission_bitmask(vcpu, context, true);
 	update_pkru_bitmask(vcpu, context, true);
+	update_last_nonleaf_level(vcpu, context);
 	reset_rsvds_bits_mask_ept(vcpu, context, execonly);
 	reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
 }
-- 
cgit v1.2.3


From 829ee279aed43faa5cb1e4d65c0cad52f2426c53 Mon Sep 17 00:00:00 2001
From: Ladi Prosek <lprosek@redhat.com>
Date: Thu, 5 Oct 2017 11:10:23 +0200
Subject: KVM: MMU: always terminate page walks at level 1

is_last_gpte() is not equivalent to the pseudo-code given in commit
6bb69c9b69c31 ("KVM: MMU: simplify last_pte_bitmap") because an incorrect
value of last_nonleaf_level may override the result even if level == 1.

It is critical for is_last_gpte() to return true on level == 1 to
terminate page walks. Otherwise memory corruption may occur as level
is used as an index to various data structures throughout the page
walking code.  Even though the actual bug would be wherever the MMU is
initialized (as in the previous patch), be defensive and ensure here
that is_last_gpte() returns the correct value.

This patch is also enough to fix CVE-2017-12188.

Fixes: 6bb69c9b69c315200ddc2bc79aee14c0184cf5b2
Cc: stable@vger.kernel.org
Cc: Andy Honig <ahonig@google.com>
Signed-off-by: Ladi Prosek <lprosek@redhat.com>
[Panic if walk_addr_generic gets an incorrect level; this is a serious
 bug and it's not worth a WARN_ON where the recovery path might hide
 further exploitable issues; suggested by Andrew Honig. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c         | 14 +++++++-------
 arch/x86/kvm/paging_tmpl.h |  3 ++-
 2 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3c25f20115bc..7a69cf053711 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3973,13 +3973,6 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
 static inline bool is_last_gpte(struct kvm_mmu *mmu,
 				unsigned level, unsigned gpte)
 {
-	/*
-	 * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
-	 * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
-	 * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
-	 */
-	gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
-
 	/*
 	 * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
 	 * If it is clear, there are no large pages at this level, so clear
@@ -3987,6 +3980,13 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
 	 */
 	gpte &= level - mmu->last_nonleaf_level;
 
+	/*
+	 * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
+	 * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
+	 * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
+	 */
+	gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
+
 	return gpte & PT_PAGE_SIZE_MASK;
 }
 
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 86b68dc5a649..f18d1f8d332b 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -334,10 +334,11 @@ retry_walk:
 		--walker->level;
 
 		index = PT_INDEX(addr, walker->level);
-
 		table_gfn = gpte_to_gfn(pte);
 		offset    = index * sizeof(pt_element_t);
 		pte_gpa   = gfn_to_gpa(table_gfn) + offset;
+
+		BUG_ON(walker->level < 1);
 		walker->table_gfn[walker->level - 1] = table_gfn;
 		walker->pte_gpa[walker->level - 1] = pte_gpa;
 
-- 
cgit v1.2.3


From 67bb8e999e0aeac285d22f0e53c856b9df5282c6 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Tue, 10 Oct 2017 14:45:04 -0500
Subject: x86/mm: Disable various instrumentations of mm/mem_encrypt.c and
 mm/tlb.c

Some routines in mem_encrypt.c are called very early in the boot process,
e.g. sme_enable().  When CONFIG_KCOV=y is defined the resulting code added
to sme_enable() (and others) for KCOV instrumentation results in a kernel
crash.  Disable the KCOV instrumentation for mem_encrypt.c by adding
KCOV_INSTRUMENT_mem_encrypt.o := n to arch/x86/mm/Makefile.

In order to avoid other possible early boot issues, model mem_encrypt.c
after head64.c in regards to tools. In addition to disabling KCOV as
stated above and a previous patch that disables branch profiling, also
remove the "-pg" CFLAG if CONFIG_FUNCTION_TRACER is enabled and set
KASAN_SANITIZE to "n", each of which are done on a file basis.

Reported-by: kernel test robot <lkp@01.org>
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171010194504.18887.38053.stgit@tlendack-t1.amdoffice.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/Makefile | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 72bf8c01c6e3..e1f095884386 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,12 @@
-# Kernel does not boot with instrumentation of tlb.c.
-KCOV_INSTRUMENT_tlb.o	:= n
+# Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c
+KCOV_INSTRUMENT_tlb.o		:= n
+KCOV_INSTRUMENT_mem_encrypt.o	:= n
+
+KASAN_SANITIZE_mem_encrypt.o	:= n
+
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_mem_encrypt.o	= -pg
+endif
 
 obj-y	:=  init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
 	    pat.o pgtable.o physaddr.o setup_nx.o tlb.o
-- 
cgit v1.2.3


From 8eb3f87d903168bdbd1222776a6b1e281f50513e Mon Sep 17 00:00:00 2001
From: Haozhong Zhang <haozhong.zhang@intel.com>
Date: Tue, 10 Oct 2017 15:01:22 +0800
Subject: KVM: nVMX: fix guest CR4 loading when emulating L2 to L1 exit

When KVM emulates an exit from L2 to L1, it loads L1 CR4 into the
guest CR4. Before this CR4 loading, the guest CR4 refers to L2
CR4. Because these two CR4's are in different levels of guest, we
should vmx_set_cr4() rather than kvm_set_cr4() here. The latter, which
is used to handle guest writes to its CR4, checks the guest change to
CR4 and may fail if the change is invalid.

The failure may cause trouble. Consider we start
  a L1 guest with non-zero L1 PCID in use,
     (i.e. L1 CR4.PCIDE == 1 && L1 CR3.PCID != 0)
and
  a L2 guest with L2 PCID disabled,
     (i.e. L2 CR4.PCIDE == 0)
and following events may happen:

1. If kvm_set_cr4() is used in load_vmcs12_host_state() to load L1 CR4
   into guest CR4 (in VMCS01) for L2 to L1 exit, it will fail because
   of PCID check. As a result, the guest CR4 recorded in L0 KVM (i.e.
   vcpu->arch.cr4) is left to the value of L2 CR4.

2. Later, if L1 attempts to change its CR4, e.g., clearing VMXE bit,
   kvm_set_cr4() in L0 KVM will think L1 also wants to enable PCID,
   because the wrong L2 CR4 is used by L0 KVM as L1 CR4. As L1
   CR3.PCID != 0, L0 KVM will inject GP to L1 guest.

Fixes: 4704d0befb072 ("KVM: nVMX: Exiting from L2 to L1")
Cc: qemu-stable@nongnu.org
Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a2b804e10c95..95a01609d7ee 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11297,7 +11297,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 
 	/* Same as above - no reason to call set_cr4_guest_host_mask().  */
 	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
-	kvm_set_cr4(vcpu, vmcs12->host_cr4);
+	vmx_set_cr4(vcpu, vmcs12->host_cr4);
 
 	nested_ept_uninit_mmu_context(vcpu);
 
-- 
cgit v1.2.3


From cc6afe2240298049585e86b1ade85efc8a7f225d Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 10 Oct 2017 12:12:57 +0200
Subject: x86/apic: Silence "FW_BUG TSC_DEADLINE disabled due to Errata" on
 hypervisors

Commit 594a30fb1242 ("x86/apic: Silence "FW_BUG TSC_DEADLINE disabled
due to Errata" on CPUs without the feature", 2017-08-30) was also about
silencing the warning on VirtualBox; however, KVM does expose the TSC
deadline timer, and it's virtualized so that it is immune from CPU errata.

Therefore, booting 4.13 with "-cpu Haswell" shows this in the logs:

     [    0.000000] [Firmware Bug]: TSC_DEADLINE disabled due to Errata;
                    please update microcode to version: 0xb2 (or later)

Even if you had a hypervisor that does _not_ virtualize the TSC deadline
and rather exposes the hardware one, it should be the hypervisors task
to update microcode and possibly hide the flag from CPUID.  So just
hide the message when running on _any_ hypervisor, not just those that
do not support the TSC deadline timer.

The older check still makes sense, so keep it.

Fixes: bd9240a18e ("x86/apic: Add TSC_DEADLINE quirk due to errata")
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: kvm@vger.kernel.org
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/1507630377-54471-1-git-send-email-pbonzini@redhat.com
---
 arch/x86/kernel/apic/apic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index d705c769f77d..50109eae8cd7 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -600,7 +600,8 @@ static void apic_check_deadline_errata(void)
 	const struct x86_cpu_id *m;
 	u32 rev;
 
-	if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+	if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) ||
+	    boot_cpu_has(X86_FEATURE_HYPERVISOR))
 		return;
 
 	m = x86_match_cpu(deadline_match);
-- 
cgit v1.2.3


From 616dd5872e52493863b0202632703eebd51243dc Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Wed, 11 Oct 2017 17:16:04 -0400
Subject: x86/apic: Update TSC_DEADLINE quirk with additional SKX stepping

SKX stepping-3 fixed the TSC_DEADLINE issue in a different ucode
version number than stepping-4.  Linux needs to know this stepping-3
specific version number to also enable the TSC_DEADLINE on stepping-3.

The steppings and ucode versions are documented in the SKX BIOS update:
https://downloadmirror.intel.com/26978/eng/ReleaseNotes_R00.01.0004.txt

Signed-off-by: Len Brown <len.brown@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: peterz@infradead.org
Link: https://lkml.kernel.org/r/60f2bbf7cf617e212b522e663f84225bfebc50e5.1507756305.git.len.brown@intel.com
---
 arch/x86/kernel/apic/apic.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 50109eae8cd7..ff891772c9f8 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -573,11 +573,21 @@ static u32 bdx_deadline_rev(void)
 	return ~0U;
 }
 
+static u32 skx_deadline_rev(void)
+{
+	switch (boot_cpu_data.x86_mask) {
+	case 0x03: return 0x01000136;
+	case 0x04: return 0x02000014;
+	}
+
+	return ~0U;
+}
+
 static const struct x86_cpu_id deadline_match[] = {
 	DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X,	hsx_deadline_rev),
 	DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X,	0x0b000020),
 	DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D,	bdx_deadline_rev),
-	DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_X,	0x02000014),
+	DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X,	skx_deadline_rev),
 
 	DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE,	0x22),
 	DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT,	0x20),
-- 
cgit v1.2.3


From b956575bed91ecfb136a8300742ecbbf451471ab Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 9 Oct 2017 09:50:49 -0700
Subject: x86/mm: Flush more aggressively in lazy TLB mode

Since commit:

  94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")

x86's lazy TLB mode has been all the way lazy: when running a kernel thread
(including the idle thread), the kernel keeps using the last user mm's
page tables without attempting to maintain user TLB coherence at all.

From a pure semantic perspective, this is fine -- kernel threads won't
attempt to access user pages, so having stale TLB entries doesn't matter.

Unfortunately, I forgot about a subtlety.  By skipping TLB flushes,
we also allow any paging-structure caches that may exist on the CPU
to become incoherent.  This means that we can have a
paging-structure cache entry that references a freed page table, and
the CPU is within its rights to do a speculative page walk starting
at the freed page table.

I can imagine this causing two different problems:

 - A speculative page walk starting from a bogus page table could read
   IO addresses.  I haven't seen any reports of this causing problems.

 - A speculative page walk that involves a bogus page table can install
   garbage in the TLB.  Such garbage would always be at a user VA, but
   some AMD CPUs have logic that triggers a machine check when it notices
   these bogus entries.  I've seen a couple reports of this.

Boris further explains the failure mode:

> It is actually more of an optimization which assumes that paging-structure
> entries are in WB DRAM:
>
> "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables
> performance optimization that assumes PML4, PDP, PDE, and PTE entries
> are in cacheable WB-DRAM; memory type checks may be bypassed, and
> addresses outside of WB-DRAM may result in undefined behavior or NB
> protocol errors. 1=Disables performance optimization and allows PML4,
> PDP, PDE and PTE entries to be in any memory type. Operating systems
> that maintain page tables in memory types other than WB- DRAM must set
> TlbCacheDis to insure proper operation."
>
> The MCE generated is an NB protocol error to signal that
>
> "Link: A specific coherent-only packet from a CPU was issued to an
> IO link. This may be caused by software which addresses page table
> structures in a memory type other than cacheable WB-DRAM without
> properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for
> example, when page table structure addresses are above top of memory. In
> such cases, the NB will generate an MCE if it sees a mismatch between
> the memory operation generated by the core and the link type."
>
> I'm assuming coherent-only packets don't go out on IO links, thus the
> error.

To fix this, reinstate TLB coherence in lazy mode.  With this patch
applied, we do it in one of two ways:

 - If we have PCID, we simply switch back to init_mm's page tables
   when we enter a kernel thread -- this seems to be quite cheap
   except for the cost of serializing the CPU.

 - If we don't have PCID, then we set a flag and switch to init_mm
   the first time we would otherwise need to flush the TLB.

The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed
to override the default mode for benchmarking.

In theory, we could optimize this better by only flushing the TLB in
lazy CPUs when a page table is freed.  Doing that would require
auditing the mm code to make sure that all page table freeing goes
through tlb_remove_page() as well as reworking some data structures
to implement the improved flush logic.

Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Reported-by: Adam Borowski <kilobyte@angband.pl>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Johannes Hirte <johannes.hirte@datenkhaos.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mmu_context.h |   8 +-
 arch/x86/include/asm/tlbflush.h    |  24 ++++++
 arch/x86/mm/tlb.c                  | 153 +++++++++++++++++++++++++++----------
 3 files changed, 136 insertions(+), 49 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index c120b5db178a..3c856a15b98e 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -126,13 +126,7 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
 	DEBUG_LOCKS_WARN_ON(preemptible());
 }
 
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-	int cpu = smp_processor_id();
-
-	if (cpumask_test_cpu(cpu, mm_cpumask(mm)))
-		cpumask_clear_cpu(cpu, mm_cpumask(mm));
-}
+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
 
 static inline int init_new_context(struct task_struct *tsk,
 				   struct mm_struct *mm)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 4893abf7f74f..d362161d3291 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -82,6 +82,13 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
 #endif
 
+/*
+ * If tlb_use_lazy_mode is true, then we try to avoid switching CR3 to point
+ * to init_mm when we switch to a kernel thread (e.g. the idle thread).  If
+ * it's false, then we immediately switch CR3 when entering a kernel thread.
+ */
+DECLARE_STATIC_KEY_TRUE(tlb_use_lazy_mode);
+
 /*
  * 6 because 6 should be plenty and struct tlb_state will fit in
  * two cache lines.
@@ -104,6 +111,23 @@ struct tlb_state {
 	u16 loaded_mm_asid;
 	u16 next_asid;
 
+	/*
+	 * We can be in one of several states:
+	 *
+	 *  - Actively using an mm.  Our CPU's bit will be set in
+	 *    mm_cpumask(loaded_mm) and is_lazy == false;
+	 *
+	 *  - Not using a real mm.  loaded_mm == &init_mm.  Our CPU's bit
+	 *    will not be set in mm_cpumask(&init_mm) and is_lazy == false.
+	 *
+	 *  - Lazily using a real mm.  loaded_mm != &init_mm, our bit
+	 *    is set in mm_cpumask(loaded_mm), but is_lazy == true.
+	 *    We're heuristically guessing that the CR3 load we
+	 *    skipped more than makes up for the overhead added by
+	 *    lazy mode.
+	 */
+	bool is_lazy;
+
 	/*
 	 * Access to this CR4 shadow and to H/W CR4 is protected by
 	 * disabling interrupts when modifying either one.
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 49d9778376d7..658bf0090565 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -30,6 +30,8 @@
 
 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
 
+DEFINE_STATIC_KEY_TRUE(tlb_use_lazy_mode);
+
 static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
 			    u16 *new_asid, bool *need_flush)
 {
@@ -80,7 +82,7 @@ void leave_mm(int cpu)
 		return;
 
 	/* Warn if we're not lazy. */
-	WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));
+	WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
 
 	switch_mm(NULL, &init_mm, NULL);
 }
@@ -142,45 +144,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		__flush_tlb_all();
 	}
 #endif
+	this_cpu_write(cpu_tlbstate.is_lazy, false);
 
 	if (real_prev == next) {
 		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
 			  next->context.ctx_id);
 
-		if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
-			/*
-			 * There's nothing to do: we weren't lazy, and we
-			 * aren't changing our mm.  We don't need to flush
-			 * anything, nor do we need to update CR3, CR4, or
-			 * LDTR.
-			 */
-			return;
-		}
-
-		/* Resume remote flushes and then read tlb_gen. */
-		cpumask_set_cpu(cpu, mm_cpumask(next));
-		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
-
-		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
-		    next_tlb_gen) {
-			/*
-			 * Ideally, we'd have a flush_tlb() variant that
-			 * takes the known CR3 value as input.  This would
-			 * be faster on Xen PV and on hypothetical CPUs
-			 * on which INVPCID is fast.
-			 */
-			this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
-				       next_tlb_gen);
-			write_cr3(build_cr3(next, prev_asid));
-			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
-					TLB_FLUSH_ALL);
-		}
-
 		/*
-		 * We just exited lazy mode, which means that CR4 and/or LDTR
-		 * may be stale.  (Changes to the required CR4 and LDTR states
-		 * are not reflected in tlb_gen.)
+		 * We don't currently support having a real mm loaded without
+		 * our cpu set in mm_cpumask().  We have all the bookkeeping
+		 * in place to figure out whether we would need to flush
+		 * if our cpu were cleared in mm_cpumask(), but we don't
+		 * currently use it.
 		 */
+		if (WARN_ON_ONCE(real_prev != &init_mm &&
+				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
+			cpumask_set_cpu(cpu, mm_cpumask(next));
+
+		return;
 	} else {
 		u16 new_asid;
 		bool need_flush;
@@ -199,10 +180,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		}
 
 		/* Stop remote flushes for the previous mm */
-		if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
-			cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
-
-		VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
+		VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
+				real_prev != &init_mm);
+		cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
 
 		/*
 		 * Start remote flushes and then read tlb_gen.
@@ -232,6 +212,37 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	switch_ldt(real_prev, next);
 }
 
+/*
+ * enter_lazy_tlb() is a hint from the scheduler that we are entering a
+ * kernel thread or other context without an mm.  Acceptable implementations
+ * include doing nothing whatsoever, switching to init_mm, or various clever
+ * lazy tricks to try to minimize TLB flushes.
+ *
+ * The scheduler reserves the right to call enter_lazy_tlb() several times
+ * in a row.  It will notify us that we're going back to a real mm by
+ * calling switch_mm_irqs_off().
+ */
+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
+		return;
+
+	if (static_branch_unlikely(&tlb_use_lazy_mode)) {
+		/*
+		 * There's a significant optimization that may be possible
+		 * here.  We have accurate enough TLB flush tracking that we
+		 * don't need to maintain coherence of TLB per se when we're
+		 * lazy.  We do, however, need to maintain coherence of
+		 * paging-structure caches.  We could, in principle, leave our
+		 * old mm loaded and only switch to init_mm when
+		 * tlb_remove_page() happens.
+		 */
+		this_cpu_write(cpu_tlbstate.is_lazy, true);
+	} else {
+		switch_mm(NULL, &init_mm, NULL);
+	}
+}
+
 /*
  * Call this when reinitializing a CPU.  It fixes the following potential
  * problems:
@@ -303,16 +314,20 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 	/* This code cannot presently handle being reentered. */
 	VM_WARN_ON(!irqs_disabled());
 
+	if (unlikely(loaded_mm == &init_mm))
+		return;
+
 	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
 		   loaded_mm->context.ctx_id);
 
-	if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
+	if (this_cpu_read(cpu_tlbstate.is_lazy)) {
 		/*
-		 * We're in lazy mode -- don't flush.  We can get here on
-		 * remote flushes due to races and on local flushes if a
-		 * kernel thread coincidentally flushes the mm it's lazily
-		 * still using.
+		 * We're in lazy mode.  We need to at least flush our
+		 * paging-structure cache to avoid speculatively reading
+		 * garbage into our TLB.  Since switching to init_mm is barely
+		 * slower than a minimal flush, just switch to init_mm.
 		 */
+		switch_mm_irqs_off(NULL, &init_mm, NULL);
 		return;
 	}
 
@@ -611,3 +626,57 @@ static int __init create_tlb_single_page_flush_ceiling(void)
 	return 0;
 }
 late_initcall(create_tlb_single_page_flush_ceiling);
+
+static ssize_t tlblazy_read_file(struct file *file, char __user *user_buf,
+				 size_t count, loff_t *ppos)
+{
+	char buf[2];
+
+	buf[0] = static_branch_likely(&tlb_use_lazy_mode) ? '1' : '0';
+	buf[1] = '\n';
+
+	return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static ssize_t tlblazy_write_file(struct file *file,
+		 const char __user *user_buf, size_t count, loff_t *ppos)
+{
+	bool val;
+
+	if (kstrtobool_from_user(user_buf, count, &val))
+		return -EINVAL;
+
+	if (val)
+		static_branch_enable(&tlb_use_lazy_mode);
+	else
+		static_branch_disable(&tlb_use_lazy_mode);
+
+	return count;
+}
+
+static const struct file_operations fops_tlblazy = {
+	.read = tlblazy_read_file,
+	.write = tlblazy_write_file,
+	.llseek = default_llseek,
+};
+
+static int __init init_tlb_use_lazy_mode(void)
+{
+	if (boot_cpu_has(X86_FEATURE_PCID)) {
+		/*
+		 * Heuristic: with PCID on, switching to and from
+		 * init_mm is reasonably fast, but remote flush IPIs
+		 * as expensive as ever, so turn off lazy TLB mode.
+		 *
+		 * We can't do this in setup_pcid() because static keys
+		 * haven't been initialized yet, and it would blow up
+		 * badly.
+		 */
+		static_branch_disable(&tlb_use_lazy_mode);
+	}
+
+	debugfs_create_file("tlb_use_lazy_mode", S_IRUSR | S_IWUSR,
+			    arch_debugfs_dir, NULL, &fops_tlblazy);
+	return 0;
+}
+late_initcall(init_tlb_use_lazy_mode);
-- 
cgit v1.2.3


From 1f161f67a272cc4f29f27934dd3f74cb657eb5c4 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Thu, 12 Oct 2017 13:23:16 +0200
Subject: x86/microcode: Do the family check first

On CPUs like AMD's Geode, for example, we shouldn't even try to load
microcode because they do not support the modern microcode loading
interface.

However, we do the family check *after* the other checks whether the
loader has been disabled on the command line or whether we're running in
a guest.

So move the family checks first in order to exit early if we're being
loaded on an unsupported family.

Reported-and-tested-by: Sven Glodowski <glodi1@arcor.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: <stable@vger.kernel.org> # 4.11..
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://bugzilla.suse.com/show_bug.cgi?id=1061396
Link: http://lkml.kernel.org/r/20171012112316.977-1-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/microcode/core.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 86e8f0b2537b..c4fa4a85d4cb 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -122,9 +122,6 @@ static bool __init check_loader_disabled_bsp(void)
 	bool *res = &dis_ucode_ldr;
 #endif
 
-	if (!have_cpuid_p())
-		return *res;
-
 	/*
 	 * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not
 	 * completely accurate as xen pv guests don't see that CPUID bit set but
@@ -166,24 +163,36 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name)
 void __init load_ucode_bsp(void)
 {
 	unsigned int cpuid_1_eax;
+	bool intel = true;
 
-	if (check_loader_disabled_bsp())
+	if (!have_cpuid_p())
 		return;
 
 	cpuid_1_eax = native_cpuid_eax(1);
 
 	switch (x86_cpuid_vendor()) {
 	case X86_VENDOR_INTEL:
-		if (x86_family(cpuid_1_eax) >= 6)
-			load_ucode_intel_bsp();
+		if (x86_family(cpuid_1_eax) < 6)
+			return;
 		break;
+
 	case X86_VENDOR_AMD:
-		if (x86_family(cpuid_1_eax) >= 0x10)
-			load_ucode_amd_bsp(cpuid_1_eax);
+		if (x86_family(cpuid_1_eax) < 0x10)
+			return;
+		intel = false;
 		break;
+
 	default:
-		break;
+		return;
 	}
+
+	if (check_loader_disabled_bsp())
+		return;
+
+	if (intel)
+		load_ucode_intel_bsp();
+	else
+		load_ucode_amd_bsp(cpuid_1_eax);
 }
 
 static bool check_loader_disabled_ap(void)
-- 
cgit v1.2.3