From 1b42f017415b46c317e71d41c34ec088417a1883 Mon Sep 17 00:00:00 2001
From: Pawan Gupta
Date: Wed, 23 Oct 2019 11:30:45 +0200
Subject: x86/speculation/taa: Add mitigation for TSX Async Abort

TSX Async Abort (TAA) is a side channel vulnerability to the internal
buffers in some Intel processors, similar to Microarchitectural Data
Sampling (MDS). In this case, certain loads may speculatively pass
invalid data to dependent operations when an asynchronous abort
condition is pending in a TSX transaction. This includes loads with no
fault or assist condition. Such loads may speculatively expose stale
data from the uarch data structures as in MDS. Exposure is possible
both same-thread and cross-thread.

This issue affects all current processors that support TSX, but do not
have ARCH_CAP_TAA_NO (bit 8) set in MSR_IA32_ARCH_CAPABILITIES.

On CPUs which have the IA32_ARCH_CAPABILITIES MSR bit MDS_NO=0 and
CPUID.MD_CLEAR=1, the MDS mitigation already clears the CPU buffers
using VERW or L1D_FLUSH, so no additional mitigation is needed for TAA.

On affected CPUs with MDS_NO=1 this issue can be mitigated by disabling
the Transactional Synchronization Extensions (TSX) feature. A new MSR,
IA32_TSX_CTRL, available on future processors and on current processors
after a microcode update, can be used to control the TSX feature. There
are two bits in that MSR:

* TSX_CTRL_RTM_DISABLE disables the TSX sub-feature Restricted
  Transactional Memory (RTM).

* TSX_CTRL_CPUID_CLEAR clears the RTM enumeration in CPUID.

The other TSX sub-feature, Hardware Lock Elision (HLE), is
unconditionally disabled with updated microcode but still enumerated as
present by CPUID(EAX=7).EBX{bit4}.

The second mitigation approach is similar to MDS: clearing the affected
CPU buffers on return to user space and when entering a guest. A
relevant microcode update is required for the mitigation to work. More
details on this approach can be found here:

  https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html

The TSX feature can be controlled by the "tsx" command line parameter.
If it is force-enabled then "Clear CPU buffers" (MDS mitigation) is
deployed. The effective mitigation state can be read from sysfs.

[ bp: - massage + comments cleanup
      - s/TAA_MITIGATION_TSX_DISABLE/TAA_MITIGATION_TSX_DISABLED/g - Josh.
      - remove partial TAA mitigation in update_mds_branch_idle() - Josh.
      - s/tsx_async_abort_cmdline/tsx_async_abort_parse_cmdline/g ]

Signed-off-by: Pawan Gupta
Signed-off-by: Borislav Petkov
Signed-off-by: Thomas Gleixner
Reviewed-by: Josh Poimboeuf
---
 arch/x86/include/asm/processor.h | 7 +++++++
 1 file changed, 7 insertions(+)
(limited to 'arch/x86/include/asm/processor.h')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 6e0a3b43d027..54f5d54280f6 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -988,4 +988,11 @@ enum mds_mitigations {
 	MDS_MITIGATION_VMWERV,
 };
 
+enum taa_mitigations {
+	TAA_MITIGATION_OFF,
+	TAA_MITIGATION_UCODE_NEEDED,
+	TAA_MITIGATION_VERW,
+	TAA_MITIGATION_TSX_DISABLED,
+};
+
 #endif /* _ASM_X86_PROCESSOR_H */
--
cgit v1.2.3

From 5494c3a6a0b965906ffdcb620d94079ea4cb69ea Mon Sep 17 00:00:00 2001
From: Kees Cook
Date: Tue, 29 Oct 2019 14:13:49 -0700
Subject: x86/mm: Report which part of kernel image is freed

The memory freeing report wasn't very useful for figuring out which
parts of the kernel image were being freed. Add the details for clearer
reporting in dmesg.
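In outline, the change threads a caller-supplied description through to
the existing free_init_pages() reporting. A minimal sketch of the
resulting calling convention (simplified; the real hunks follow below):

    /* Sketch only; the actual code lives in arch/x86/mm/init.c. */
    void free_kernel_image_pages(const char *what, void *begin, void *end)
    {
            /* "what" shows up verbatim in the dmesg lines quoted below. */
            free_init_pages(what, (unsigned long)begin, (unsigned long)end);
    }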
Before: Freeing unused kernel image memory: 1348K Write protecting the kernel read-only data: 20480k Freeing unused kernel image memory: 2040K Freeing unused kernel image memory: 172K After: Freeing unused kernel image (initmem) memory: 1348K Write protecting the kernel read-only data: 20480k Freeing unused kernel image (text/rodata gap) memory: 2040K Freeing unused kernel image (rodata/data gap) memory: 172K Signed-off-by: Kees Cook Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dave Hansen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: linux-alpha@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-c6x-dev@linux-c6x.org Cc: linux-ia64@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-s390@vger.kernel.org Cc: Michael Ellerman Cc: Michal Simek Cc: Peter Zijlstra Cc: Rick Edgecombe Cc: Segher Boessenkool Cc: Thomas Gleixner Cc: Will Deacon Cc: x86-ml Cc: Yoshinori Sato Link: https://lkml.kernel.org/r/20191029211351.13243-28-keescook@chromium.org --- arch/x86/include/asm/processor.h | 2 +- arch/x86/mm/init.c | 8 ++++---- arch/x86/mm/init_64.c | 6 ++++-- 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/x86/include/asm/processor.h') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6e0a3b43d027..790f250d39a8 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -958,7 +958,7 @@ static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) extern unsigned long arch_align_stack(unsigned long sp); void free_init_pages(const char *what, unsigned long begin, unsigned long end); -extern void free_kernel_image_pages(void *begin, void *end); +extern void free_kernel_image_pages(const char *what, void *begin, void *end); void default_idle(void); #ifdef CONFIG_XEN diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index fd10d91a6115..e7bb483557c9 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -829,14 +829,13 @@ void free_init_pages(const char *what, unsigned long begin, unsigned long end) * used for the kernel image only. free_init_pages() will do the * right thing for either kind of address. */ -void free_kernel_image_pages(void *begin, void *end) +void free_kernel_image_pages(const char *what, void *begin, void *end) { unsigned long begin_ul = (unsigned long)begin; unsigned long end_ul = (unsigned long)end; unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT; - - free_init_pages("unused kernel image", begin_ul, end_ul); + free_init_pages(what, begin_ul, end_ul); /* * PTI maps some of the kernel into userspace. 
For performance, @@ -865,7 +864,8 @@ void __ref free_initmem(void) mem_encrypt_free_decrypted_mem(); - free_kernel_image_pages(&__init_begin, &__init_end); + free_kernel_image_pages("unused kernel image (initmem)", + &__init_begin, &__init_end); } #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index e67ddca8b7a8..dcb9bc961b39 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1334,8 +1334,10 @@ void mark_rodata_ro(void) set_memory_ro(start, (end-start) >> PAGE_SHIFT); #endif - free_kernel_image_pages((void *)text_end, (void *)rodata_start); - free_kernel_image_pages((void *)rodata_end, (void *)_sdata); + free_kernel_image_pages("unused kernel image (text/rodata gap)", + (void *)text_end, (void *)rodata_start); + free_kernel_image_pages("unused kernel image (rodata/data gap)", + (void *)rodata_end, (void *)_sdata); debug_checkwx(); } -- cgit v1.2.3 From db8c33f8b5bea59d00ca12dcd6b65d01b1ea98ef Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 16 Sep 2019 15:39:58 -0700 Subject: x86/cpu: Align the x86_capability array to size of unsigned long The x86_capability array in cpuinfo_x86 is of type u32 and thus is naturally aligned to 4 bytes. But, set_bit() and clear_bit() require the array to be aligned to size of unsigned long (i.e. 8 bytes on 64-bit systems). The array pointer is handed into atomic bit operations. If the access is not aligned to unsigned long then the atomic bit operations can end up crossing a cache line boundary, which causes the CPU to do a full bus lock as it can't lock both cache lines at once. The bus lock operation is heavy weight and can cause severe performance degradation. The upcoming #AC split lock detection mechanism will issue warnings for this kind of access. Force the alignment of the array to unsigned long. This avoids the massive code changes which would be required when converting the array data type to unsigned long. [ tglx: Rewrote changelog so it contains information WHY this is required ] Suggested-by: David Laight Suggested-by: Thomas Gleixner Signed-off-by: Fenghua Yu Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190916223958.27048-4-tony.luck@intel.com --- arch/x86/include/asm/processor.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86/include/asm/processor.h') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6e0a3b43d027..c073534ca485 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -93,7 +93,15 @@ struct cpuinfo_x86 { __u32 extended_cpuid_level; /* Maximum supported CPUID level, -1=no CPUID: */ int cpuid_level; - __u32 x86_capability[NCAPINTS + NBUGINTS]; + /* + * Align to size of unsigned long because the x86_capability array + * is passed to bitops which require the alignment. Use unnamed + * union to enforce the array is aligned to size of unsigned long. + */ + union { + __u32 x86_capability[NCAPINTS + NBUGINTS]; + unsigned long x86_capability_alignment; + }; char x86_vendor_id[16]; char x86_model_id[64]; /* in KB - valid for CPUS which support this call: */ -- cgit v1.2.3 From ecc7e37d4dadd16f6be125ca496feccd05454da4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 11 Nov 2019 23:03:20 +0100 Subject: x86/io: Speedup schedule out of I/O bitmap user There is no requirement to update the TSS I/O bitmap when a thread using it is scheduled out and the incoming thread does not use it. 
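As an aside on the x86_capability union above: the alignment guarantee
it provides can be checked in isolation with a compile-time assert. A
minimal, self-contained sketch (hypothetical struct and field names,
array size illustrative, not kernel code):

    #include <stddef.h>
    #include <stdint.h>

    struct caps_demo {
            int cpuid_level;
            union {
                    uint32_t caps[20];     /* stand-in for x86_capability[] */
                    unsigned long align;   /* sole purpose: force alignment */
            };
    };

    /* The union takes the strictest member alignment, so caps[] is now
     * laid out suitably for the unsigned-long based atomic bitops. */
    _Static_assert(offsetof(struct caps_demo, caps) % sizeof(unsigned long) == 0,
                   "caps[] not aligned to size of unsigned long");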
For the permission check based on the TSS I/O bitmap the CPU calculates
the memory location of the I/O bitmap by the address of the TSS and the
io_bitmap_base member of the tss_struct. The easiest way to invalidate
the I/O bitmap is to switch the offset to an address outside of the TSS
limit. If an I/O instruction is issued from user space the TSS limit
causes #GP to be raised in the same way as a valid I/O bitmap with all
bits set to 1 would do.

This removes the extra work when an I/O bitmap-using task is scheduled
out and puts the burden on the rare I/O bitmap users when they are
scheduled in.

Signed-off-by: Thomas Gleixner
---
 arch/x86/include/asm/processor.h | 38 ++++++++++++++++--------
 arch/x86/kernel/cpu/common.c | 3 +-
 arch/x86/kernel/doublefault.c | 2 +-
 arch/x86/kernel/ioport.c | 4 +++
 arch/x86/kernel/process.c | 63 +++++++++++++++++++++++-----------------
 5 files changed, 69 insertions(+), 41 deletions(-)
(limited to 'arch/x86/include/asm/processor.h')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 6e0a3b43d027..6d0059c21969 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -330,8 +330,23 @@ struct x86_hw_tss {
 #define IO_BITMAP_BITS 65536
 #define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
 #define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
-#define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))
-#define INVALID_IO_BITMAP_OFFSET 0x8000
+
+#define IO_BITMAP_OFFSET_VALID \
+	(offsetof(struct tss_struct, io_bitmap) - \
+	 offsetof(struct tss_struct, x86_tss))
+
+/*
+ * sizeof(unsigned long) coming from an extra "long" at the end
+ * of the iobitmap.
+ *
+ * -1? seg base+limit should be pointing to the address of the
+ * last valid byte
+ */
+#define __KERNEL_TSS_LIMIT \
+	(IO_BITMAP_OFFSET_VALID + IO_BITMAP_BYTES + sizeof(unsigned long) - 1)
+
+/* Base offset outside of TSS_LIMIT so unprivileged IO causes #GP */
+#define IO_BITMAP_OFFSET_INVALID (__KERNEL_TSS_LIMIT + 1)
 
 struct entry_stack {
 	unsigned long words[64];
@@ -349,6 +364,15 @@ struct tss_struct {
 	 */
 	struct x86_hw_tss x86_tss;
 
+	/*
+	 * Store the dirty size of the last io bitmap offender. The next
+	 * one will have to do the cleanup as the switch out to a non io
+	 * bitmap user will just set x86_tss.io_bitmap_base to a value
+	 * outside of the TSS limit. So for sane tasks there is no need to
+	 * actually touch the io_bitmap at all.
+	 */
+	unsigned int io_bitmap_prev_max;
+
 	/*
 	 * The extra 1 is there because the CPU will access an
 	 * additional byte beyond the end of the IO permission
@@ -360,16 +384,6 @@ struct tss_struct {
 
 DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
 
-/*
- * sizeof(unsigned long) coming from an extra "long" at the end
- * of the iobitmap.
- *
- * -1? seg base+limit should be pointing to the address of the
- * last valid byte
- */
-#define __KERNEL_TSS_LIMIT \
-	(IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1)
-
 /* Per CPU interrupt stacks */
 struct irq_stack {
 	char stack[IRQ_STACK_SIZE];

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d52ec1a82120..8c1000a57b55 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1860,7 +1860,8 @@ void cpu_init(void)
 
 	/* Initialize the TSS.
*/ tss_setup_ist(tss); - tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID; + tss->io_bitmap_prev_max = 0; memset(tss->io_bitmap, 0xff, sizeof(tss->io_bitmap)); set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index 0b8cedb20d6d..cedb07d0909c 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c @@ -54,7 +54,7 @@ struct x86_hw_tss doublefault_tss __cacheline_aligned = { .sp0 = STACK_START, .ss0 = __KERNEL_DS, .ldt = 0, - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, + .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, .ip = (unsigned long) doublefault_fn, /* 0x2 bit is always set */ diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 80fa36be2670..eed218a3fd48 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -82,6 +82,10 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) /* Update the TSS */ tss = this_cpu_ptr(&cpu_tss_rw); memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); + /* Store the new end of the zero bits */ + tss->io_bitmap_prev_max = bytes; + /* Make the bitmap base in the TSS valid */ + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_VALID; /* Make sure the TSS limit covers the I/O bitmap. */ refresh_tss_limit(); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 7e07f0bb0def..2444fe296de5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -72,18 +72,9 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { #ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, #endif + .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, }, -#ifdef CONFIG_X86_32 - /* - * Note that the .io_bitmap member must be extra-big. This is because - * the CPU will access an additional byte beyond the end of the IO - * permission bitmap. The extra byte must be all 1 bits, and must - * be within the limit. - */ - .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, -#endif }; EXPORT_PER_CPU_SYMBOL(cpu_tss_rw); @@ -112,18 +103,18 @@ void exit_thread(struct task_struct *tsk) struct thread_struct *t = &tsk->thread; unsigned long *bp = t->io_bitmap_ptr; struct fpu *fpu = &t->fpu; + struct tss_struct *tss; if (bp) { - struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu()); + preempt_disable(); + tss = this_cpu_ptr(&cpu_tss_rw); t->io_bitmap_ptr = NULL; - clear_thread_flag(TIF_IO_BITMAP); - /* - * Careful, clear this in the TSS too: - */ - memset(tss->io_bitmap, 0xff, t->io_bitmap_max); t->io_bitmap_max = 0; - put_cpu(); + clear_thread_flag(TIF_IO_BITMAP); + /* Invalidate the io bitmap base in the TSS */ + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID; + preempt_enable(); kfree(bp); } @@ -369,29 +360,47 @@ void arch_setup_new_exec(void) } } -static inline void switch_to_bitmap(struct thread_struct *prev, - struct thread_struct *next, +static inline void switch_to_bitmap(struct thread_struct *next, unsigned long tifp, unsigned long tifn) { struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); if (tifn & _TIF_IO_BITMAP) { /* - * Copy the relevant range of the IO bitmap. - * Normally this is 128 bytes or less: + * Copy at least the size of the incoming tasks bitmap + * which covers the last permitted I/O port. + * + * If the previous task which used an io bitmap had more + * bits permitted, then the copy needs to cover those as + * well so they get turned off. 
	 */
	memcpy(tss->io_bitmap, next->io_bitmap_ptr,
-	       max(prev->io_bitmap_max, next->io_bitmap_max));
+	       max(tss->io_bitmap_prev_max, next->io_bitmap_max));
+
+	/* Store the new max and set io_bitmap_base valid */
+	tss->io_bitmap_prev_max = next->io_bitmap_max;
+	tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_VALID;
+
 	/*
-	 * Make sure that the TSS limit is correct for the CPU
-	 * to notice the IO bitmap.
+	 * Make sure that the TSS limit is covering the io bitmap.
+	 * It might have been cut down by a VMEXIT to 0x67 which
+	 * would cause a subsequent I/O access from user space to
+	 * trigger a #GP because the bitmap is outside the TSS
+	 * limit.
 	 */
 	refresh_tss_limit();
 } else if (tifp & _TIF_IO_BITMAP) {
 	/*
-	 * Clear any possible leftover bits:
+	 * Do not touch the bitmap. Let the next bitmap using task
+	 * deal with the mess. Just make the io_bitmap_base invalid
+	 * by moving it outside the TSS limit so any subsequent I/O
+	 * access from user space will trigger a #GP.
+	 *
+	 * This is correct even when VMEXIT rewrites the TSS limit
+	 * to 0x67 as the only requirement is that the base points
+	 * outside the limit.
 	 */
-	memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
+	tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID;
 	}
 }
@@ -605,7 +614,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
 	tifn = READ_ONCE(task_thread_info(next_p)->flags);
 	tifp = READ_ONCE(task_thread_info(prev_p)->flags);
 
-	switch_to_bitmap(prev, next, tifp, tifn);
+	switch_to_bitmap(next, tifp, tifn);
 
 	propagate_user_return_notify(prev_p, next_p);
--
cgit v1.2.3

From f5848e5fd2f813c3a8009a642dfbcf635287c199 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 12 Nov 2019 18:45:29 +0100
Subject: x86/tss: Move I/O bitmap data into a separate struct

Move the non-hardware portion of the I/O bitmap data into a separate
struct for readability's sake.

Originally-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
---
 arch/x86/include/asm/processor.h | 35 +++++++++++++++++++--------------
 arch/x86/kernel/cpu/common.c | 4 ++--
 arch/x86/kernel/ioport.c | 4 ++--
 arch/x86/kernel/process.c | 6 +++---
 4 files changed, 28 insertions(+), 21 deletions(-)
(limited to 'arch/x86/include/asm/processor.h')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 6d0059c21969..cd7cd7d10b81 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -328,11 +328,11 @@ struct x86_hw_tss {
  * IO-bitmap sizes:
  */
 #define IO_BITMAP_BITS 65536
-#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
-#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
+#define IO_BITMAP_BYTES (IO_BITMAP_BITS / BITS_PER_BYTE)
+#define IO_BITMAP_LONGS (IO_BITMAP_BYTES / sizeof(long))
 
-#define IO_BITMAP_OFFSET_VALID \
-	(offsetof(struct tss_struct, io_bitmap) - \
+#define IO_BITMAP_OFFSET_VALID \
+	(offsetof(struct tss_struct, io_bitmap.bitmap) - \
 	 offsetof(struct tss_struct, x86_tss))
 
 /*
@@ -356,14 +356,10 @@ struct entry_stack_page {
 	struct entry_stack stack;
 } __aligned(PAGE_SIZE);
 
-struct tss_struct {
-	/*
-	 * The fixed hardware portion. This must not cross a page boundary
-	 * at risk of violating the SDM's advice and potentially triggering
-	 * errata.
-	 */
-	struct x86_hw_tss x86_tss;
-
+/*
+ * All IO bitmap related data stored in the TSS:
+ */
+struct x86_io_bitmap {
 	/*
 	 * Store the dirty size of the last io bitmap offender. The next
 	 * one will have to do the cleanup as the switch out to a non io
 	 * bitmap user will just set x86_tss.io_bitmap_base to a value
 	 * outside of the TSS limit.
So for sane tasks there is no need to * actually touch the io_bitmap at all. */ - unsigned int io_bitmap_prev_max; + unsigned int prev_max; /* * The extra 1 is there because the CPU will access an @@ -379,7 +375,18 @@ struct tss_struct { * bitmap. The extra byte must be all 1 bits, and must * be within the limit. */ - unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; + unsigned long bitmap[IO_BITMAP_LONGS + 1]; +}; + +struct tss_struct { + /* + * The fixed hardware portion. This must not cross a page boundary + * at risk of violating the SDM's advice and potentially triggering + * errata. + */ + struct x86_hw_tss x86_tss; + + struct x86_io_bitmap io_bitmap; } __aligned(PAGE_SIZE); DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8c1000a57b55..3aee167246f7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1861,8 +1861,8 @@ void cpu_init(void) /* Initialize the TSS. */ tss_setup_ist(tss); tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID; - tss->io_bitmap_prev_max = 0; - memset(tss->io_bitmap, 0xff, sizeof(tss->io_bitmap)); + tss->io_bitmap.prev_max = 0; + memset(tss->io_bitmap.bitmap, 0xff, sizeof(tss->io_bitmap.bitmap)); set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index eed218a3fd48..80d99bb826f3 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -81,9 +81,9 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) /* Update the TSS */ tss = this_cpu_ptr(&cpu_tss_rw); - memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); + memcpy(tss->io_bitmap.bitmap, t->io_bitmap_ptr, bytes_updated); /* Store the new end of the zero bits */ - tss->io_bitmap_prev_max = bytes; + tss->io_bitmap.prev_max = bytes; /* Make the bitmap base in the TSS valid */ tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_VALID; /* Make sure the TSS limit covers the I/O bitmap. */ diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 2444fe296de5..35f1c80df7a8 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -374,11 +374,11 @@ static inline void switch_to_bitmap(struct thread_struct *next, * bits permitted, then the copy needs to cover those as * well so they get turned off. */ - memcpy(tss->io_bitmap, next->io_bitmap_ptr, - max(tss->io_bitmap_prev_max, next->io_bitmap_max)); + memcpy(tss->io_bitmap.bitmap, next->io_bitmap_ptr, + max(tss->io_bitmap.prev_max, next->io_bitmap_max)); /* Store the new max and set io_bitmap_base valid */ - tss->io_bitmap_prev_max = next->io_bitmap_max; + tss->io_bitmap.prev_max = next->io_bitmap_max; tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_VALID; /* -- cgit v1.2.3 From 577d5cd7e5851d3832066cd0422475fa7db2ee17 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 11 Nov 2019 23:03:21 +0100 Subject: x86/ioperm: Move iobitmap data into a struct No point in having all the data in thread_struct, especially as upcoming changes add more. Make the bitmap in the new struct accessible as array of longs and as array of characters via a union, so both the bitmap functions and the update logic can avoid type casts. 
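The hunks below only add the array of longs; as a sketch of the
dual-view union described in the changelog (hypothetical shape, not
what the patch actually adds):

    struct io_bitmap_view {
            unsigned int max;
            union {
                    unsigned long bits[IO_BITMAP_LONGS]; /* bitmap_set()/bitmap_clear() view */
                    char bytes[IO_BITMAP_BYTES];         /* memcpy()/byte arithmetic view */
            };
    };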
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_bitmap.h | 13 +++++++++++++ arch/x86/include/asm/processor.h | 6 ++---- arch/x86/kernel/ioport.c | 27 ++++++++++++++------------- arch/x86/kernel/process.c | 38 ++++++++++++++++++++------------------ arch/x86/kernel/ptrace.c | 12 ++++++++---- 5 files changed, 57 insertions(+), 39 deletions(-) create mode 100644 arch/x86/include/asm/io_bitmap.h (limited to 'arch/x86/include/asm/processor.h') diff --git a/arch/x86/include/asm/io_bitmap.h b/arch/x86/include/asm/io_bitmap.h new file mode 100644 index 000000000000..1a12b9ff5e4e --- /dev/null +++ b/arch/x86/include/asm/io_bitmap.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_IOBITMAP_H +#define _ASM_X86_IOBITMAP_H + +#include + +struct io_bitmap { + /* The maximum number of bytes to copy so all zero bits are covered */ + unsigned int max; + unsigned long bitmap[IO_BITMAP_LONGS]; +}; + +#endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index cd7cd7d10b81..c949e0e5cbe6 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -7,6 +7,7 @@ /* Forward declaration, a strange C thing */ struct task_struct; struct mm_struct; +struct io_bitmap; struct vm86; #include @@ -501,10 +502,8 @@ struct thread_struct { struct vm86 *vm86; #endif /* IO permissions: */ - unsigned long *io_bitmap_ptr; + struct io_bitmap *io_bitmap; unsigned long iopl; - /* Max allowed port in the bitmap, in bytes: */ - unsigned io_bitmap_max; mm_segment_t addr_limit; @@ -862,7 +861,6 @@ static inline void spin_lock_prefetch(const void *x) #define INIT_THREAD { \ .sp0 = TOP_OF_INIT_STACK, \ .sysenter_cs = __KERNEL_CS, \ - .io_bitmap_ptr = NULL, \ .addr_limit = KERNEL_DS, \ } diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 80d99bb826f3..05f77f3af46d 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -11,6 +11,7 @@ #include #include +#include #include /* @@ -21,7 +22,7 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) unsigned int i, max_long, bytes, bytes_updated; struct thread_struct *t = ¤t->thread; struct tss_struct *tss; - unsigned long *bitmap; + struct io_bitmap *iobm; if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) return -EINVAL; @@ -34,16 +35,16 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) * IO bitmap up. ioperm() is much less timing critical than clone(), * this is why we delay this operation until now: */ - bitmap = t->io_bitmap_ptr; - if (!bitmap) { + iobm = t->io_bitmap; + if (!iobm) { /* No point to allocate a bitmap just to clear permissions */ if (!turn_on) return 0; - bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); - if (!bitmap) + iobm = kmalloc(sizeof(*iobm), GFP_KERNEL); + if (!iobm) return -ENOMEM; - memset(bitmap, 0xff, IO_BITMAP_BYTES); + memset(iobm->bitmap, 0xff, sizeof(iobm->bitmap)); } /* @@ -52,9 +53,9 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) */ preempt_disable(); if (turn_on) - bitmap_clear(bitmap, from, num); + bitmap_clear(iobm->bitmap, from, num); else - bitmap_set(bitmap, from, num); + bitmap_set(iobm->bitmap, from, num); /* * Search for a (possibly new) maximum. 
This is simple and stupid, @@ -62,26 +63,26 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) */ max_long = 0; for (i = 0; i < IO_BITMAP_LONGS; i++) { - if (bitmap[i] != ~0UL) + if (iobm->bitmap[i] != ~0UL) max_long = i; } bytes = (max_long + 1) * sizeof(unsigned long); - bytes_updated = max(bytes, t->io_bitmap_max); + bytes_updated = max(bytes, t->io_bitmap->max); /* Update the thread data */ - t->io_bitmap_max = bytes; + iobm->max = bytes; /* * Store the bitmap pointer (might be the same if the task already * head one). Set the TIF flag, just in case this is the first * invocation. */ - t->io_bitmap_ptr = bitmap; + t->io_bitmap = iobm; set_thread_flag(TIF_IO_BITMAP); /* Update the TSS */ tss = this_cpu_ptr(&cpu_tss_rw); - memcpy(tss->io_bitmap.bitmap, t->io_bitmap_ptr, bytes_updated); + memcpy(tss->io_bitmap.bitmap, iobm->bitmap, bytes_updated); /* Store the new end of the zero bits */ tss->io_bitmap.prev_max = bytes; /* Make the bitmap base in the TSS valid */ diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 35f1c80df7a8..1504fd2b9bc4 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include "process.h" @@ -101,21 +102,20 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) void exit_thread(struct task_struct *tsk) { struct thread_struct *t = &tsk->thread; - unsigned long *bp = t->io_bitmap_ptr; + struct io_bitmap *iobm = t->io_bitmap; struct fpu *fpu = &t->fpu; struct tss_struct *tss; - if (bp) { + if (iobm) { preempt_disable(); tss = this_cpu_ptr(&cpu_tss_rw); - t->io_bitmap_ptr = NULL; - t->io_bitmap_max = 0; + t->io_bitmap = NULL; clear_thread_flag(TIF_IO_BITMAP); /* Invalidate the io bitmap base in the TSS */ tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID; preempt_enable(); - kfree(bp); + kfree(iobm); } free_vm86(t); @@ -135,25 +135,25 @@ static int set_new_tls(struct task_struct *p, unsigned long tls) static inline int copy_io_bitmap(struct task_struct *tsk) { + struct io_bitmap *iobm = current->thread.io_bitmap; + if (likely(!test_tsk_thread_flag(current, TIF_IO_BITMAP))) return 0; - tsk->thread.io_bitmap_ptr = kmemdup(current->thread.io_bitmap_ptr, - IO_BITMAP_BYTES, GFP_KERNEL); - if (!tsk->thread.io_bitmap_ptr) { - tsk->thread.io_bitmap_max = 0; + tsk->thread.io_bitmap = kmemdup(iobm, sizeof(*iobm), GFP_KERNEL); + + if (!tsk->thread.io_bitmap) return -ENOMEM; - } + set_tsk_thread_flag(tsk, TIF_IO_BITMAP); return 0; } static inline void free_io_bitmap(struct task_struct *tsk) { - if (tsk->thread.io_bitmap_ptr) { - kfree(tsk->thread.io_bitmap_ptr); - tsk->thread.io_bitmap_ptr = NULL; - tsk->thread.io_bitmap_max = 0; + if (tsk->thread.io_bitmap) { + kfree(tsk->thread.io_bitmap); + tsk->thread.io_bitmap = NULL; } } @@ -172,7 +172,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, frame->bp = 0; frame->ret_addr = (unsigned long) ret_from_fork; p->thread.sp = (unsigned long) fork_frame; - p->thread.io_bitmap_ptr = NULL; + p->thread.io_bitmap = NULL; memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); #ifdef CONFIG_X86_64 @@ -366,6 +366,8 @@ static inline void switch_to_bitmap(struct thread_struct *next, struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); if (tifn & _TIF_IO_BITMAP) { + struct io_bitmap *iobm = next->io_bitmap; + /* * Copy at least the size of the incoming tasks bitmap * which covers the last permitted I/O port. 
@@ -374,11 +376,11 @@ static inline void switch_to_bitmap(struct thread_struct *next, * bits permitted, then the copy needs to cover those as * well so they get turned off. */ - memcpy(tss->io_bitmap.bitmap, next->io_bitmap_ptr, - max(tss->io_bitmap.prev_max, next->io_bitmap_max)); + memcpy(tss->io_bitmap.bitmap, next->io_bitmap->bitmap, + max(tss->io_bitmap.prev_max, next->io_bitmap->max)); /* Store the new max and set io_bitmap_base valid */ - tss->io_bitmap.prev_max = next->io_bitmap_max; + tss->io_bitmap.prev_max = next->io_bitmap->max; tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_VALID; /* diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 7c526741215b..066e5b01a7e0 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "tls.h" @@ -697,7 +698,9 @@ static int ptrace_set_debugreg(struct task_struct *tsk, int n, static int ioperm_active(struct task_struct *target, const struct user_regset *regset) { - return DIV_ROUND_UP(target->thread.io_bitmap_max, regset->size); + struct io_bitmap *iobm = target->thread.io_bitmap; + + return iobm ? DIV_ROUND_UP(iobm->max, regset->size) : 0; } static int ioperm_get(struct task_struct *target, @@ -705,12 +708,13 @@ static int ioperm_get(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - if (!target->thread.io_bitmap_ptr) + struct io_bitmap *iobm = target->thread.io_bitmap; + + if (!iobm) return -ENXIO; return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - target->thread.io_bitmap_ptr, - 0, IO_BITMAP_BYTES); + iobm->bitmap, 0, IO_BITMAP_BYTES); } /* -- cgit v1.2.3 From 060aa16fdb7c5078a4159a76e5dc87d6a493af9b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 11 Nov 2019 23:03:22 +0100 Subject: x86/ioperm: Add bitmap sequence number Add a globally unique sequence number which is incremented when ioperm() is changing the I/O bitmap of a task. Store the new sequence number in the io_bitmap structure and compare it with the sequence number of the I/O bitmap which was last loaded on a CPU. Only update the bitmap if the sequence is different. That should further reduce the overhead of I/O bitmap scheduling when there are only a few I/O bitmap users on the system. The 64bit sequence counter is sufficient. A wraparound of the sequence counter assuming an ioperm() call every nanosecond would require about 584 years of uptime. 
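A back-of-the-envelope check of that estimate (assuming exactly one
ioperm() call per nanosecond):

    2^64 ns ≈ 1.845 * 10^19 ns
    1 year ≈ 3.156 * 10^16 ns   (365.25 days)
    (1.845 * 10^19) / (3.156 * 10^16) ≈ 584.5 years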
Suggested-by: Linus Torvalds Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_bitmap.h | 1 + arch/x86/include/asm/processor.h | 3 +++ arch/x86/kernel/cpu/common.c | 1 + arch/x86/kernel/ioport.c | 5 +++++ arch/x86/kernel/process.c | 38 ++++++++++++++++++++++++++++---------- 5 files changed, 38 insertions(+), 10 deletions(-) (limited to 'arch/x86/include/asm/processor.h') diff --git a/arch/x86/include/asm/io_bitmap.h b/arch/x86/include/asm/io_bitmap.h index 1a12b9ff5e4e..d63bd5afe671 100644 --- a/arch/x86/include/asm/io_bitmap.h +++ b/arch/x86/include/asm/io_bitmap.h @@ -5,6 +5,7 @@ #include struct io_bitmap { + u64 sequence; /* The maximum number of bytes to copy so all zero bits are covered */ unsigned int max; unsigned long bitmap[IO_BITMAP_LONGS]; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c949e0e5cbe6..40bb0f7bca3f 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -361,6 +361,9 @@ struct entry_stack_page { * All IO bitmap related data stored in the TSS: */ struct x86_io_bitmap { + /* The sequence number of the last active bitmap. */ + u64 prev_sequence; + /* * Store the dirty size of the last io bitmap offender. The next * one will have to do the cleanup as the switch out to a non io diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3aee167246f7..79dd544bb974 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1862,6 +1862,7 @@ void cpu_init(void) tss_setup_ist(tss); tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID; tss->io_bitmap.prev_max = 0; + tss->io_bitmap.prev_sequence = 0; memset(tss->io_bitmap.bitmap, 0xff, sizeof(tss->io_bitmap.bitmap)); set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 05f77f3af46d..7c1ab5c2b395 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -14,6 +14,8 @@ #include #include +static atomic64_t io_bitmap_sequence; + /* * this changes the io permissions bitmap in the current task. */ @@ -72,6 +74,9 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) /* Update the thread data */ iobm->max = bytes; + /* Update the sequence number to force an update in switch_to() */ + iobm->sequence = atomic64_add_return(1, &io_bitmap_sequence); + /* * Store the bitmap pointer (might be the same if the task already * head one). Set the TIF flag, just in case this is the first diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 1504fd2b9bc4..7c49be90468b 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -360,6 +360,28 @@ void arch_setup_new_exec(void) } } +static void switch_to_update_io_bitmap(struct tss_struct *tss, + struct io_bitmap *iobm) +{ + /* + * Copy at least the byte range of the incoming tasks bitmap which + * covers the permitted I/O ports. + * + * If the previous task which used an I/O bitmap had more bits + * permitted, then the copy needs to cover those as well so they + * get turned off. + */ + memcpy(tss->io_bitmap.bitmap, iobm->bitmap, + max(tss->io_bitmap.prev_max, iobm->max)); + + /* + * Store the new max and the sequence number of this bitmap + * and a pointer to the bitmap itself. 
+	 */
+	tss->io_bitmap.prev_max = iobm->max;
+	tss->io_bitmap.prev_sequence = iobm->sequence;
+}
+
 static inline void switch_to_bitmap(struct thread_struct *next,
 				    unsigned long tifp, unsigned long tifn)
 {
@@ -369,18 +391,14 @@ static inline void switch_to_bitmap(struct thread_struct *next,
 		struct io_bitmap *iobm = next->io_bitmap;
 
 		/*
-		 * Copy at least the size of the incoming tasks bitmap
-		 * which covers the last permitted I/O port.
-		 *
-		 * If the previous task which used an io bitmap had more
-		 * bits permitted, then the copy needs to cover those as
-		 * well so they get turned off.
+		 * Only copy bitmap data when the sequence number
+		 * differs. The update time is accounted to the incoming
+		 * task.
 		 */
-		memcpy(tss->io_bitmap.bitmap, next->io_bitmap->bitmap,
-		       max(tss->io_bitmap.prev_max, next->io_bitmap->max));
+		if (tss->io_bitmap.prev_sequence != iobm->sequence)
+			switch_to_update_io_bitmap(tss, iobm);
 
-		/* Store the new max and set io_bitmap_base valid */
-		tss->io_bitmap.prev_max = next->io_bitmap->max;
+		/* Enable the bitmap */
 		tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_VALID;
 
 		/*
--
cgit v1.2.3

From c8137ace56383688af911fea5934c71ad158135e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 11 Nov 2019 23:03:28 +0100
Subject: x86/iopl: Restrict iopl() permission scope

The access to the full I/O port range can also be provided by the TSS
I/O bitmap, but that would require copying 8k of data on scheduling in
the task. As shown with the sched-out optimization, TSS.io_bitmap_base
can be used to switch the incoming task to a preallocated I/O bitmap
which has all bits zero, i.e. allows access to all I/O ports.

Implementing this makes it possible to provide an iopl() emulation mode
which restricts the IOPL level 3 permissions to I/O port access but
removes the STI/CLI permission which comes with the hardware IOPL
mechanism.

Provide a config option to switch IOPL to emulation mode, make it the
default, and while at it also provide an option to disable IOPL
completely.

Signed-off-by: Thomas Gleixner
Acked-by: Andy Lutomirski
---
 arch/x86/Kconfig | 32 ++++++++++++
 arch/x86/include/asm/pgtable_32_types.h | 2 +-
 arch/x86/include/asm/processor.h | 28 ++++++++---
 arch/x86/kernel/cpu/common.c | 5 ++
 arch/x86/kernel/ioport.c | 87 +++++++++++++++++++++++----------
 arch/x86/kernel/process.c | 32 +++++++-----
 6 files changed, 139 insertions(+), 47 deletions(-)
(limited to 'arch/x86/include/asm/processor.h')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d6e1faa28c58..2aad1cd14cc5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1254,6 +1254,38 @@ config X86_VSYSCALL_EMULATION
 	 Disabling this option saves about 7K of kernel size and
 	 possibly 4K of additional runtime pagetable memory.
 
+choice
+	prompt "IOPL"
+	default X86_IOPL_EMULATION
+
+config X86_IOPL_EMULATION
+	bool "IOPL Emulation"
+	---help---
+	  Legacy IOPL support is an overbroad mechanism which allows user
+	  space aside of accessing all 65536 I/O ports also to disable
+	  interrupts. To gain this access the caller needs CAP_SYS_RAWIO
+	  capabilities and permission from potentially active security
+	  modules.
+
+	  The emulation restricts the functionality of the syscall to
+	  only allowing the full range I/O port access, but prevents the
+	  ability to disable interrupts from user space.
+
+config X86_IOPL_LEGACY
+	bool "IOPL Legacy"
+	---help---
+	  Allow the full IOPL permissions, i.e. user space access to all
+	  65536 I/O ports and also the ability to disable interrupts, which
+	  is overbroad and can result in system lockups.
+ +config X86_IOPL_NONE + bool "IOPL None" + ---help--- + Disable the IOPL permission syscall. That's the safest option as + no sane application should depend on this functionality. + +endchoice + config TOSHIBA tristate "Toshiba Laptop support" depends on X86_32 diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index b0bc0fff5f1f..0fab4bfb4df2 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -44,7 +44,7 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c * to avoid include recursion hell */ -#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 40) +#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 41) #define CPU_ENTRY_AREA_BASE \ ((FIXADDR_TOT_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) \ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 40bb0f7bca3f..b0e02aa3f46a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -332,19 +332,21 @@ struct x86_hw_tss { #define IO_BITMAP_BYTES (IO_BITMAP_BITS / BITS_PER_BYTE) #define IO_BITMAP_LONGS (IO_BITMAP_BYTES / sizeof(long)) -#define IO_BITMAP_OFFSET_VALID \ +#define IO_BITMAP_OFFSET_VALID_MAP \ (offsetof(struct tss_struct, io_bitmap.bitmap) - \ offsetof(struct tss_struct, x86_tss)) +#define IO_BITMAP_OFFSET_VALID_ALL \ + (offsetof(struct tss_struct, io_bitmap.mapall) - \ + offsetof(struct tss_struct, x86_tss)) + /* - * sizeof(unsigned long) coming from an extra "long" at the end - * of the iobitmap. - * - * -1? seg base+limit should be pointing to the address of the - * last valid byte + * sizeof(unsigned long) coming from an extra "long" at the end of the + * iobitmap. The limit is inclusive, i.e. the last valid byte. */ #define __KERNEL_TSS_LIMIT \ - (IO_BITMAP_OFFSET_VALID + IO_BITMAP_BYTES + sizeof(unsigned long) - 1) + (IO_BITMAP_OFFSET_VALID_ALL + IO_BITMAP_BYTES + \ + sizeof(unsigned long) - 1) /* Base offset outside of TSS_LIMIT so unpriviledged IO causes #GP */ #define IO_BITMAP_OFFSET_INVALID (__KERNEL_TSS_LIMIT + 1) @@ -380,6 +382,12 @@ struct x86_io_bitmap { * be within the limit. */ unsigned long bitmap[IO_BITMAP_LONGS + 1]; + + /* + * Special I/O bitmap to emulate IOPL(3). All bytes zero, + * except the additional byte at the end. + */ + unsigned long mapall[IO_BITMAP_LONGS + 1]; }; struct tss_struct { @@ -506,7 +514,13 @@ struct thread_struct { #endif /* IO permissions: */ struct io_bitmap *io_bitmap; + + /* + * IOPL. Priviledge level dependent I/O permission which includes + * user space CLI/STI when granted. + */ unsigned long iopl; + unsigned long iopl_emul; mm_segment_t addr_limit; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 79dd544bb974..7bf402be13bb 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1864,6 +1864,11 @@ void cpu_init(void) tss->io_bitmap.prev_max = 0; tss->io_bitmap.prev_sequence = 0; memset(tss->io_bitmap.bitmap, 0xff, sizeof(tss->io_bitmap.bitmap)); + /* + * Invalidate the extra array entry past the end of the all + * permission bitmap as required by the hardware. 
+ */ + tss->io_bitmap.mapall[IO_BITMAP_LONGS] = ~0UL; set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 3548563b0935..9ed9458e02df 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -17,25 +17,41 @@ static atomic64_t io_bitmap_sequence; void io_bitmap_share(struct task_struct *tsk) - { - /* - * Take a refcount on current's bitmap. It can be used by - * both tasks as long as none of them changes the bitmap. - */ - refcount_inc(¤t->thread.io_bitmap->refcnt); - tsk->thread.io_bitmap = current->thread.io_bitmap; +{ + /* Can be NULL when current->thread.iopl_emul == 3 */ + if (current->thread.io_bitmap) { + /* + * Take a refcount on current's bitmap. It can be used by + * both tasks as long as none of them changes the bitmap. + */ + refcount_inc(¤t->thread.io_bitmap->refcnt); + tsk->thread.io_bitmap = current->thread.io_bitmap; + } set_tsk_thread_flag(tsk, TIF_IO_BITMAP); } +static void task_update_io_bitmap(void) +{ + struct thread_struct *t = ¤t->thread; + + if (t->iopl_emul == 3 || t->io_bitmap) { + /* TSS update is handled on exit to user space */ + set_thread_flag(TIF_IO_BITMAP); + } else { + clear_thread_flag(TIF_IO_BITMAP); + /* Invalidate TSS */ + preempt_disable(); + tss_update_io_bitmap(); + preempt_enable(); + } +} + void io_bitmap_exit(void) { struct io_bitmap *iobm = current->thread.io_bitmap; current->thread.io_bitmap = NULL; - clear_thread_flag(TIF_IO_BITMAP); - preempt_disable(); - tss_update_io_bitmap(); - preempt_enable(); + task_update_io_bitmap(); if (iobm && refcount_dec_and_test(&iobm->refcnt)) kfree(iobm); } @@ -157,36 +173,55 @@ SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on) */ SYSCALL_DEFINE1(iopl, unsigned int, level) { - struct pt_regs *regs = current_pt_regs(); struct thread_struct *t = ¤t->thread; + struct pt_regs *regs = current_pt_regs(); + unsigned int old; /* * Careful: the IOPL bits in regs->flags are undefined under Xen PV * and changing them has no effect. */ - unsigned int old = t->iopl >> X86_EFLAGS_IOPL_BIT; + if (IS_ENABLED(CONFIG_X86_IOPL_NONE)) + return -ENOSYS; if (level > 3) return -EINVAL; + + if (IS_ENABLED(CONFIG_X86_IOPL_EMULATION)) + old = t->iopl_emul; + else + old = t->iopl >> X86_EFLAGS_IOPL_BIT; + + /* No point in going further if nothing changes */ + if (level == old) + return 0; + /* Trying to gain more privileges? */ if (level > old) { if (!capable(CAP_SYS_RAWIO) || security_locked_down(LOCKDOWN_IOPORT)) return -EPERM; } - /* - * Change the flags value on the return stack, which has been set - * up on system-call entry. See also the fork and signal handling - * code how this is handled. - */ - regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | - (level << X86_EFLAGS_IOPL_BIT); - /* Store the new level in the thread struct */ - t->iopl = level << X86_EFLAGS_IOPL_BIT; - /* - * X86_32 switches immediately and XEN handles it via emulation. - */ - set_iopl_mask(t->iopl); + + if (IS_ENABLED(CONFIG_X86_IOPL_EMULATION)) { + t->iopl_emul = level; + task_update_io_bitmap(); + } else { + /* + * Change the flags value on the return stack, which has + * been set up on system-call entry. See also the fork and + * signal handling code how this is handled. + */ + regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | + (level << X86_EFLAGS_IOPL_BIT); + /* Store the new level in the thread struct */ + t->iopl = level << X86_EFLAGS_IOPL_BIT; + /* + * X86_32 switches immediately and XEN handles it via + * emulation. 
+ */ + set_iopl_mask(t->iopl); + } return 0; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 0b19c137c442..8a844a5d5ae8 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -376,21 +376,27 @@ static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm) void tss_update_io_bitmap(void) { struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); + u16 *base = &tss->x86_tss.io_bitmap_base; if (test_thread_flag(TIF_IO_BITMAP)) { - struct io_bitmap *iobm = current->thread.io_bitmap; - - /* - * Only copy bitmap data when the sequence number - * differs. The update time is accounted to the incoming - * task. - */ - if (tss->io_bitmap.prev_sequence != iobm->sequence) - tss_copy_io_bitmap(tss, iobm); - - /* Enable the bitmap */ - tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_VALID; - + struct thread_struct *t = ¤t->thread; + + if (IS_ENABLED(CONFIG_X86_IOPL_EMULATION) && + t->iopl_emul == 3) { + *base = IO_BITMAP_OFFSET_VALID_ALL; + } else { + struct io_bitmap *iobm = t->io_bitmap; + /* + * Only copy bitmap data when the sequence number + * differs. The update time is accounted to the + * incoming task. + */ + if (tss->io_bitmap.prev_sequence != iobm->sequence) + tss_copy_io_bitmap(tss, iobm); + + /* Enable the bitmap */ + *base = IO_BITMAP_OFFSET_VALID_MAP; + } /* * Make sure that the TSS limit is covering the io bitmap. * It might have been cut down by a VMEXIT to 0x67 which -- cgit v1.2.3 From a24ca9976843156eabbc5f2d798954b5674d1b61 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 11 Nov 2019 23:03:29 +0100 Subject: x86/iopl: Remove legacy IOPL option The IOPL emulation via the I/O bitmap is sufficient. Remove the legacy cruft dealing with the (e)flags based IOPL mechanism. Signed-off-by: Thomas Gleixner Reviewed-by: Juergen Gross (Paravirt and Xen parts) Acked-by: Andy Lutomirski --- arch/x86/Kconfig | 23 +++-------------- arch/x86/include/asm/paravirt.h | 4 --- arch/x86/include/asm/paravirt_types.h | 2 -- arch/x86/include/asm/processor.h | 26 +++---------------- arch/x86/include/asm/xen/hypervisor.h | 2 -- arch/x86/kernel/ioport.c | 47 ++++++++--------------------------- arch/x86/kernel/paravirt.c | 2 -- arch/x86/kernel/process_32.c | 9 ------- arch/x86/kernel/process_64.c | 11 -------- arch/x86/xen/enlighten_pv.c | 10 -------- 10 files changed, 17 insertions(+), 119 deletions(-) (limited to 'arch/x86/include/asm/processor.h') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2aad1cd14cc5..1f926e396ec1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1254,12 +1254,9 @@ config X86_VSYSCALL_EMULATION Disabling this option saves about 7K of kernel size and possibly 4K of additional runtime pagetable memory. -choice - prompt "IOPL" - default X86_IOPL_EMULATION - config X86_IOPL_EMULATION bool "IOPL Emulation" + default y ---help--- Legacy IOPL support is an overbroad mechanism which allows user space aside of accessing all 65536 I/O ports also to disable @@ -1269,22 +1266,8 @@ config X86_IOPL_EMULATION The emulation restricts the functionality of the syscall to only allowing the full range I/O port access, but prevents the - ability to disable interrupts from user space. - -config X86_IOPL_LEGACY - bool "IOPL Legacy" - ---help--- - Allow the full IOPL permissions, i.e. user space access to all - 65536 I/O ports and also the ability to disable interrupts, which - is overbroad and can result in system lockups. - -config X86_IOPL_NONE - bool "IOPL None" - ---help--- - Disable the IOPL permission syscall. 
That's the safest option as - no sane application should depend on this functionality. - -endchoice + ability to disable interrupts from user space which would be + granted if the hardware IOPL mechanism would be used. config TOSHIBA tristate "Toshiba Laptop support" diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 69089d46f128..86e7317eb31f 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -294,10 +294,6 @@ static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) { PVOP_VCALL3(cpu.write_idt_entry, dt, entry, g); } -static inline void set_iopl_mask(unsigned mask) -{ - PVOP_VCALL1(cpu.set_iopl_mask, mask); -} static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 70b654f3ffe5..84812964d3dd 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -140,8 +140,6 @@ struct pv_cpu_ops { void (*load_sp0)(unsigned long sp0); - void (*set_iopl_mask)(unsigned mask); - void (*wbinvd)(void); /* cpuid emulation, mostly so that caps bits can be disabled */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b0e02aa3f46a..1387d31c5e07 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -516,10 +516,10 @@ struct thread_struct { struct io_bitmap *io_bitmap; /* - * IOPL. Priviledge level dependent I/O permission which includes - * user space CLI/STI when granted. + * IOPL. Priviledge level dependent I/O permission which is + * emulated via the I/O bitmap to prevent user space from disabling + * interrupts. */ - unsigned long iopl; unsigned long iopl_emul; mm_segment_t addr_limit; @@ -552,25 +552,6 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset, */ #define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ -/* - * Set IOPL bits in EFLAGS from given mask - */ -static inline void native_set_iopl_mask(unsigned mask) -{ -#ifdef CONFIG_X86_32 - unsigned int reg; - - asm volatile ("pushfl;" - "popl %0;" - "andl %1, %0;" - "orl %2, %0;" - "pushl %0;" - "popfl" - : "=&r" (reg) - : "i" (~X86_EFLAGS_IOPL), "r" (mask)); -#endif -} - static inline void native_load_sp0(unsigned long sp0) { @@ -610,7 +591,6 @@ static inline void load_sp0(unsigned long sp0) native_load_sp0(sp0); } -#define set_iopl_mask native_set_iopl_mask #endif /* CONFIG_PARAVIRT_XXL */ /* Free all resources held by a thread. */ diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 42e1245af0d8..ff4b52e37e60 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -62,6 +62,4 @@ void xen_arch_register_cpu(int num); void xen_arch_unregister_cpu(int num); #endif -extern void xen_set_iopl_mask(unsigned mask); - #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 9ed9458e02df..d5dcde972c42 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -153,28 +153,23 @@ SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on) /* * The sys_iopl functionality depends on the level argument, which if - * granted for the task is used by the CPU to check I/O instruction and - * CLI/STI against the current priviledge level (CPL). If CPL is less than - * or equal the tasks IOPL level the instructions take effect. 
If not a #GP
- * is raised. The default IOPL is 0, i.e. no permissions.
+ * granted for the task is used to enable access to all 65536 I/O ports.
  *
- * Setting IOPL to level 0-2 is disabling the userspace access. Only level
- * 3 enables it. If set it allows the user space thread:
+ * This does not use the IOPL mechanism provided by the CPU as that would
+ * also allow the user space task to use the CLI/STI instructions.
  *
- * - Unrestricted access to all 65535 I/O ports
- * - The usage of CLI/STI instructions
+ * Disabling interrupts in a user space task is dangerous as it might lock
+ * up the machine and the semantics vs. syscalls and exceptions is
+ * undefined.
  *
- * The advantage over ioperm is that the context switch does not require to
- * update the I/O bitmap which is especially true when a large number of
- * ports is accessed. But the allowance of CLI/STI in userspace is
- * considered a major problem.
+ * Setting IOPL to level 0-2 is disabling I/O permissions. Level 3
+ * enables them.
  *
  * IOPL is strictly per thread and inherited on fork.
  */
 SYSCALL_DEFINE1(iopl, unsigned int, level)
 {
 	struct thread_struct *t = &current->thread;
-	struct pt_regs *regs = current_pt_regs();
 	unsigned int old;
 
 	/*
@@ -187,10 +182,7 @@ SYSCALL_DEFINE1(iopl, unsigned int, level)
 	if (level > 3)
 		return -EINVAL;
 
-	if (IS_ENABLED(CONFIG_X86_IOPL_EMULATION))
-		old = t->iopl_emul;
-	else
-		old = t->iopl >> X86_EFLAGS_IOPL_BIT;
+	old = t->iopl_emul;
 
 	/* No point in going further if nothing changes */
 	if (level == old)
@@ -203,25 +195,8 @@ SYSCALL_DEFINE1(iopl, unsigned int, level)
 		return -EPERM;
 	}
 
-	if (IS_ENABLED(CONFIG_X86_IOPL_EMULATION)) {
-		t->iopl_emul = level;
-		task_update_io_bitmap();
-	} else {
-		/*
-		 * Change the flags value on the return stack, which has
-		 * been set up on system-call entry. See also the fork and
-		 * signal handling code how this is handled.
-		 */
-		regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) |
-			      (level << X86_EFLAGS_IOPL_BIT);
-		/* Store the new level in the thread struct */
-		t->iopl = level << X86_EFLAGS_IOPL_BIT;
-		/*
-		 * X86_32 switches immediately and XEN handles it via
-		 * emulation.
-		 */
-		set_iopl_mask(t->iopl);
-	}
+	t->iopl_emul = level;
+	task_update_io_bitmap();
 
 	return 0;
 }

diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 59d3d2763a9e..789f5e4f89de 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -341,8 +341,6 @@ struct paravirt_patch_template pv_ops = {
 	.cpu.iret = native_iret,
 	.cpu.swapgs = native_swapgs,
 
-	.cpu.set_iopl_mask = native_set_iopl_mask,
-
 	.cpu.start_context_switch = paravirt_nop,
 	.cpu.end_context_switch = paravirt_nop,

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 6c7d90527156..323499f48858 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -187,15 +187,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 */
 	load_TLS(next, cpu);
 
-	/*
-	 * Restore IOPL if needed. In normal use, the flags restore
-	 * in the switch assembly will handle this. But if the kernel
-	 * is running virtualized at a non-zero CPL, the popf will
-	 * not restore flags, so it must be done in a separate step.
- */ - if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl)) - set_iopl_mask(next->iopl); - switch_to_extra(prev_p, next_p); /* diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e93a1b8fd7f9..506d66830d4d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -497,17 +497,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) switch_to_extra(prev_p, next_p); -#ifdef CONFIG_XEN_PV - /* - * On Xen PV, IOPL bits in pt_regs->flags have no effect, and - * current_pt_regs()->flags may not match the current task's - * intended IOPL. We need to switch it manually. - */ - if (unlikely(static_cpu_has(X86_FEATURE_XENPV) && - prev->iopl != next->iopl)) - xen_set_iopl_mask(next->iopl); -#endif - if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) { /* * AMD CPUs have a misfeature: SYSRET sets the SS selector but diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 5bfea374a160..ae4a41ca19f6 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -837,15 +837,6 @@ static void xen_load_sp0(unsigned long sp0) this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); } -void xen_set_iopl_mask(unsigned mask) -{ - struct physdev_set_iopl set_iopl; - - /* Force the change at ring 0. */ - set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3; - HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); -} - static void xen_io_delay(void) { } @@ -1055,7 +1046,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .write_idt_entry = xen_write_idt_entry, .load_sp0 = xen_load_sp0, - .set_iopl_mask = xen_set_iopl_mask, .io_delay = xen_io_delay, /* Xen takes care of %gs when switching to usermode for us */ -- cgit v1.2.3 From 111e7b15cf10f6e973ccf537c70c66a5de539060 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Nov 2019 21:40:33 +0100 Subject: x86/ioperm: Extend IOPL config to control ioperm() as well If iopl() is disabled, then providing ioperm() does not make much sense. Rename the config option and disable/enable both syscalls with it. Guard the code with #ifdefs where appropriate. Suggested-by: Andy Lutomirski Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 7 +++++-- arch/x86/include/asm/io_bitmap.h | 6 ++++++ arch/x86/include/asm/processor.h | 9 ++++++++- arch/x86/include/asm/thread_info.h | 7 ++++++- arch/x86/kernel/cpu/common.c | 26 +++++++++++++++++--------- arch/x86/kernel/ioport.c | 26 +++++++++++++++++++------- arch/x86/kernel/process.c | 4 ++++ 7 files changed, 65 insertions(+), 20 deletions(-) (limited to 'arch/x86/include/asm/processor.h') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1f926e396ec1..b162ce1482fc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1254,10 +1254,13 @@ config X86_VSYSCALL_EMULATION Disabling this option saves about 7K of kernel size and possibly 4K of additional runtime pagetable memory. -config X86_IOPL_EMULATION - bool "IOPL Emulation" +config X86_IOPL_IOPERM + bool "IOPERM and IOPL Emulation" default y ---help--- + This enables the ioperm() and iopl() syscalls which are necessary + for legacy applications. + Legacy IOPL support is an overbroad mechanism which allows user space aside of accessing all 65536 I/O ports also to disable interrupts. 
To gain this access the caller needs CAP_SYS_RAWIO diff --git a/arch/x86/include/asm/io_bitmap.h b/arch/x86/include/asm/io_bitmap.h index b664baadf736..02c6ef8f7667 100644 --- a/arch/x86/include/asm/io_bitmap.h +++ b/arch/x86/include/asm/io_bitmap.h @@ -15,9 +15,15 @@ struct io_bitmap { struct task_struct; +#ifdef CONFIG_X86_IOPL_IOPERM void io_bitmap_share(struct task_struct *tsk); void io_bitmap_exit(void); void tss_update_io_bitmap(void); +#else +static inline void io_bitmap_share(struct task_struct *tsk) { } +static inline void io_bitmap_exit(void) { } +static inline void tss_update_io_bitmap(void) { } +#endif #endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 1387d31c5e07..45f416a2c1f1 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -340,13 +340,18 @@ struct x86_hw_tss { (offsetof(struct tss_struct, io_bitmap.mapall) - \ offsetof(struct tss_struct, x86_tss)) +#ifdef CONFIG_X86_IOPL_IOPERM /* * sizeof(unsigned long) coming from an extra "long" at the end of the * iobitmap. The limit is inclusive, i.e. the last valid byte. */ -#define __KERNEL_TSS_LIMIT \ +# define __KERNEL_TSS_LIMIT \ (IO_BITMAP_OFFSET_VALID_ALL + IO_BITMAP_BYTES + \ sizeof(unsigned long) - 1) +#else +# define __KERNEL_TSS_LIMIT \ + (offsetof(struct tss_struct, x86_tss) + sizeof(struct x86_hw_tss) - 1) +#endif /* Base offset outside of TSS_LIMIT so unprivileged IO causes #GP */ #define IO_BITMAP_OFFSET_INVALID (__KERNEL_TSS_LIMIT + 1) @@ -398,7 +403,9 @@ struct tss_struct { */ struct x86_hw_tss x86_tss; +#ifdef CONFIG_X86_IOPL_IOPERM struct x86_io_bitmap io_bitmap; +#endif } __aligned(PAGE_SIZE); DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 0accf44878a5..d779366ce3f8 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -156,8 +156,13 @@ struct thread_info { # define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE) #endif -#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW| _TIF_USER_RETURN_NOTIFY | \ _TIF_IO_BITMAP) +#ifdef CONFIG_X86_IOPL_IOPERM +# define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW| _TIF_USER_RETURN_NOTIFY | \ _TIF_IO_BITMAP) +#else +# define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW| _TIF_USER_RETURN_NOTIFY) +#endif + #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) #define STACK_WARN (THREAD_SIZE/8) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7bf402be13bb..6f6ca6bd58d6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1804,6 +1804,22 @@ static inline void gdt_setup_doublefault_tss(int cpu) } #endif /* !CONFIG_X86_64 */ +static inline void tss_setup_io_bitmap(struct tss_struct *tss) +{ + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID; + +#ifdef CONFIG_X86_IOPL_IOPERM + tss->io_bitmap.prev_max = 0; + tss->io_bitmap.prev_sequence = 0; + memset(tss->io_bitmap.bitmap, 0xff, sizeof(tss->io_bitmap.bitmap)); + /* + * Invalidate the extra array entry past the end of the all + * permission bitmap as required by the hardware. + */ + tss->io_bitmap.mapall[IO_BITMAP_LONGS] = ~0UL; +#endif +} + /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT @@ -1860,15 +1876,7 @@ void cpu_init(void) /* Initialize the TSS.
*/ tss_setup_ist(tss); - tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID; - tss->io_bitmap.prev_max = 0; - tss->io_bitmap.prev_sequence = 0; - memset(tss->io_bitmap.bitmap, 0xff, sizeof(tss->io_bitmap.bitmap)); - /* - * Invalidate the extra array entry past the end of the all - * permission bitmap as required by the hardware. - */ - tss->io_bitmap.mapall[IO_BITMAP_LONGS] = ~0UL; + tss_setup_io_bitmap(tss); set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index d5dcde972c42..8abeee0dd7bf 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -14,6 +14,8 @@ #include #include +#ifdef CONFIG_X86_IOPL_IOPERM + static atomic64_t io_bitmap_sequence; void io_bitmap_share(struct task_struct *tsk) @@ -172,13 +174,6 @@ SYSCALL_DEFINE1(iopl, unsigned int, level) struct thread_struct *t = &current->thread; unsigned int old; - /* - * Careful: the IOPL bits in regs->flags are undefined under Xen PV - * and changing them has no effect. - */ - if (IS_ENABLED(CONFIG_X86_IOPL_NONE)) - return -ENOSYS; - if (level > 3) return -EINVAL; @@ -200,3 +195,20 @@ SYSCALL_DEFINE1(iopl, unsigned int, level) return 0; } + +#else /* CONFIG_X86_IOPL_IOPERM */ + +long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + return -ENOSYS; +} +SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on) +{ + return -ENOSYS; +} + +SYSCALL_DEFINE1(iopl, unsigned int, level) +{ + return -ENOSYS; +} +#endif diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 8a844a5d5ae8..7964d7db9366 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -322,6 +322,7 @@ void arch_setup_new_exec(void) } } +#ifdef CONFIG_X86_IOPL_IOPERM static inline void tss_invalidate_io_bitmap(struct tss_struct *tss) { /* @@ -409,6 +410,9 @@ void tss_update_io_bitmap(void) tss_invalidate_io_bitmap(tss); } } +#else /* CONFIG_X86_IOPL_IOPERM */ +static inline void switch_to_bitmap(unsigned long tifp) { } +#endif #ifdef CONFIG_SMP -- cgit v1.2.3 From 0bcd7762727dd8ba9b9b6f828e5a4cbd5da4f725 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 26 Nov 2019 21:49:04 +0100 Subject: x86/iopl: Make 'struct tss_struct' constant size again MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the following commit: 05b042a19443: ("x86/pti/32: Calculate the various PTI cpu_entry_area sizes correctly, make the CPU_ENTRY_AREA_PAGES assert precise") 'struct cpu_entry_area' has to be Kconfig invariant, so that we always have a matching CPU_ENTRY_AREA_PAGES size. This commit added a CONFIG_X86_IOPL_IOPERM dependency to tss_struct: 111e7b15cf10: ("x86/ioperm: Extend IOPL config to control ioperm() as well") Which, if CONFIG_X86_IOPL_IOPERM is turned off, reduces the size of cpu_entry_area by two pages, triggering the assert: ./include/linux/compiler.h:391:38: error: call to ‘__compiletime_assert_202’ declared with attribute error: BUILD_BUG_ON failed: (CPU_ENTRY_AREA_PAGES+1)*PAGE_SIZE != CPU_ENTRY_AREA_MAP_SIZE Simplify the Kconfig dependencies and make cpu_entry_area constant size on 32-bit kernels again.
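The failing check above is an ordinary compile-time size assert. A minimal stand-alone sketch of the BUILD_BUG_ON() idea (illustrative names and sizes, not the kernel's actual macro) shows how a size mismatch becomes a build error rather than a runtime surprise:

#include <stdio.h>

#define PAGE_SIZE_SKETCH 4096
/* Negative-array-size trick: the build fails whenever cond is true. */
#define BUILD_BUG_ON_SKETCH(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))

/* Stand-in for cpu_entry_area; the name and size are illustrative. */
struct entry_area_sketch {
	char pages[2 * PAGE_SIZE_SKETCH];
};

int main(void)
{
	/*
	 * If a config option ever removes a member and shrinks the struct,
	 * compilation stops right here -- the same way the assert quoted
	 * in the message above fired.
	 */
	BUILD_BUG_ON_SKETCH(sizeof(struct entry_area_sketch) !=
			    2 * PAGE_SIZE_SKETCH);
	puts("size check passed at compile time");
	return 0;
}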
Fixes: 05b042a19443: ("x86/pti/32: Calculate the various PTI cpu_entry_area sizes correctly, make the CPU_ENTRY_AREA_PAGES assert precise") Cc: Thomas Gleixner Cc: Borislav Petkov Cc: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Andy Lutomirski Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/include/asm/processor.h') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b4e29d8b9e5a..e51afbb0cbfb 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -411,9 +411,7 @@ struct tss_struct { */ struct x86_hw_tss x86_tss; -#ifdef CONFIG_X86_IOPL_IOPERM struct x86_io_bitmap io_bitmap; -#endif } __aligned(PAGE_SIZE); DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw); -- cgit v1.2.3 From 93efbde2c331004d8053f04b4bf0ca3e630b474a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 20 Nov 2019 22:12:38 -0800 Subject: x86/traps: Disentangle the 32-bit and 64-bit doublefault code The 64-bit doublefault handler is much nicer than the 32-bit one. As a first step toward unifying them, make the 64-bit handler self-contained. This should have no functional effect except in the odd case of x86_64 with CONFIG_DOUBLEFAULT=n, in which case it will change the logging a bit. This also gets rid of CONFIG_DOUBLEFAULT configurability on 64-bit kernels. It didn't do anything useful -- CONFIG_DOUBLEFAULT=n didn't actually disable doublefault handling on x86_64. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 2 +- arch/x86/include/asm/processor.h | 1 - arch/x86/kernel/doublefault.c | 11 ----------- arch/x86/kernel/traps.c | 12 +++--------- 4 files changed, 4 insertions(+), 22 deletions(-) (limited to 'arch/x86/include/asm/processor.h') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 409c00f74e60..c4eab8ed33a3 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -117,7 +117,7 @@ config DEBUG_WX config DOUBLEFAULT default y - bool "Enable doublefault exception handler" if EXPERT + bool "Enable doublefault exception handler" if EXPERT && X86_32 ---help--- This option allows trapping of rare doublefault exceptions that would otherwise cause a system to silently reboot.
Disabling this diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index e51afbb0cbfb..f6c630097d9f 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -997,7 +997,6 @@ bool xen_set_default_idle(void); #endif void stop_this_cpu(void *dummy); -void df_debug(struct pt_regs *regs, long error_code); void microcode_check(void); enum l1tf_mitigations { diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index 0d6c657593f8..0b3c616b61a3 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c @@ -72,15 +72,4 @@ struct x86_hw_tss doublefault_tss __cacheline_aligned = { .__cr3 = __pa_nodebug(swapper_pg_dir), }; -/* dummy for do_double_fault() call */ -void df_debug(struct pt_regs *regs, long error_code) {} - -#else /* !CONFIG_X86_32 */ - -void df_debug(struct pt_regs *regs, long error_code) -{ - pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); - show_regs(regs); - panic("Machine halted."); -} #endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c90312146da0..76381b04dc93 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -411,15 +411,9 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2); #endif -#ifdef CONFIG_DOUBLEFAULT - df_debug(regs, error_code); -#endif - /* - * This is always a kernel trap and never fixable (and thus must - * never return). - */ - for (;;) - die(str, regs, error_code); + pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); + show_regs(regs); + panic("Machine halted."); } #endif -- cgit v1.2.3 From dc4e0021b00b5a4ecba56fae509217776592b0aa Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 26 Nov 2019 18:27:16 +0100 Subject: x86/doublefault/32: Move #DF stack and TSS to cpu_entry_area There are three problems with the current layout of the doublefault stack and TSS. First, the TSS is only cacheline-aligned, which is not enough -- if the hardware portion of the TSS (struct x86_hw_tss) crosses a page boundary, horrible things happen [0]. Second, the stack and TSS are global, so simultaneous double faults on different CPUs will cause massive corruption. Third, the whole mechanism won't work if user CR3 is loaded, resulting in a triple fault [1]. Let the doublefault stack and TSS share a page (which prevents the TSS from spanning a page boundary), make it percpu, and move it into cpu_entry_area. Teach the stack dump code about the doublefault stack. [0] Real hardware will read past the end of the page onto the next *physical* page if a task switch happens. Virtual machines may have any number of bugs, and I would consider it reasonable for a VM to summarily kill the guest if it tries to task-switch to a page-spanning TSS. [1] Real hardware triple faults. At least some VMs seem to hang. I'm not sure what's going on. 
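To illustrate why the shared page fixes the first problem: packing the stack and the TSS into one page-aligned, page-sized struct makes a page-spanning TSS impossible by construction. A stand-alone C sketch with made-up sizes (hw_tss_sketch stands in for struct x86_hw_tss; nothing here is kernel code):

#include <stdio.h>

#define PAGE_SIZE_SKETCH 4096

/* Stand-in for struct x86_hw_tss; 104 bytes here, size illustrative. */
struct hw_tss_sketch {
	unsigned int fields[26];
};

/* The stack consumes whatever the TSS leaves of the page, as above. */
struct df_stack_sketch {
	unsigned long stack[(PAGE_SIZE_SKETCH - sizeof(struct hw_tss_sketch)) /
			    sizeof(unsigned long)];
	struct hw_tss_sketch tss;
} __attribute__((aligned(PAGE_SIZE_SKETCH)));

/* One aligned page total, so the TSS at the tail cannot cross a page. */
_Static_assert(sizeof(struct df_stack_sketch) == PAGE_SIZE_SKETCH,
	       "stack + TSS must fill exactly one page");

int main(void)
{
	struct df_stack_sketch s;

	printf("TSS occupies page offsets %zu..%zu\n",
	       (size_t)((char *)&s.tss - (char *)&s),
	       (size_t)((char *)(&s.tss + 1) - (char *)&s) - 1);
	return 0;
}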
Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpu_entry_area.h | 12 +++++++ arch/x86/include/asm/doublefault.h | 13 ++++++++ arch/x86/include/asm/pgtable_32_types.h | 7 ++-- arch/x86/include/asm/processor.h | 1 - arch/x86/kernel/cpu/common.c | 12 ++----- arch/x86/kernel/doublefault_32.c | 58 ++++++++++++++++++++++----------- arch/x86/kernel/dumpstack_32.c | 30 +++++++++++++++++ arch/x86/mm/cpu_entry_area.c | 14 +++++++- 8 files changed, 113 insertions(+), 34 deletions(-) create mode 100644 arch/x86/include/asm/doublefault.h (limited to 'arch/x86/include/asm/processor.h') diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index ea866c7bf31d..804734058c77 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -65,6 +65,13 @@ enum exception_stack_ordering { #endif +#ifdef CONFIG_X86_32 +struct doublefault_stack { + unsigned long stack[(PAGE_SIZE - sizeof(struct x86_hw_tss)) / sizeof(unsigned long)]; + struct x86_hw_tss tss; +} __aligned(PAGE_SIZE); +#endif + /* * cpu_entry_area is a percpu region that contains things needed by the CPU * and early entry/exit code. Real types aren't used for all fields here @@ -86,6 +93,11 @@ struct cpu_entry_area { #endif struct entry_stack_page entry_stack_page; +#ifdef CONFIG_X86_32 + char guard_doublefault_stack[PAGE_SIZE]; + struct doublefault_stack doublefault_stack; +#endif + /* * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because * we need task switches to work, and task switches write to the TSS. diff --git a/arch/x86/include/asm/doublefault.h b/arch/x86/include/asm/doublefault.h new file mode 100644 index 000000000000..af9a14ac8962 --- /dev/null +++ b/arch/x86/include/asm/doublefault.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_DOUBLEFAULT_H +#define _ASM_X86_DOUBLEFAULT_H + +#if defined(CONFIG_X86_32) && defined(CONFIG_DOUBLEFAULT) +extern void doublefault_init_cpu_tss(void); +#else +static inline void doublefault_init_cpu_tss(void) +{ +} +#endif + +#endif /* _ASM_X86_DOUBLEFAULT_H */ diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index 19f5807260c3..0416d42e5bdd 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -41,10 +41,11 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ #endif /* - * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c - * to avoid include recursion hell + * This is an upper bound on sizeof(struct cpu_entry_area) / PAGE_SIZE. + * Define this here and validate with BUILD_BUG_ON() in cpu_entry_area.c + * to avoid include recursion hell. 
*/ -#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 41) +#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 43) /* The +1 is for the readonly IDT page: */ #define CPU_ENTRY_AREA_BASE \ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f6c630097d9f..0340aad3f2fc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -166,7 +166,6 @@ enum cpuid_regs_idx { extern struct cpuinfo_x86 boot_cpu_data; extern struct cpuinfo_x86 new_cpu_data; -extern struct x86_hw_tss doublefault_tss; extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS]; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index baa2fed8deb6..2e4d90294fe6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -1814,8 +1815,6 @@ static inline void tss_setup_ist(struct tss_struct *tss) tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); } -static inline void gdt_setup_doublefault_tss(int cpu) { } - #else /* CONFIG_X86_64 */ static inline void setup_getcpu(int cpu) { } @@ -1827,13 +1826,6 @@ static inline void ucode_cpu_init(int cpu) static inline void tss_setup_ist(struct tss_struct *tss) { } -static inline void gdt_setup_doublefault_tss(int cpu) -{ -#ifdef CONFIG_DOUBLEFAULT - /* Set up the doublefault TSS pointer in the GDT */ - __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); -#endif -} #endif /* !CONFIG_X86_64 */ static inline void tss_setup_io_bitmap(struct tss_struct *tss) @@ -1923,7 +1915,7 @@ void cpu_init(void) clear_all_debug_regs(); dbg_restore_debug_regs(); - gdt_setup_doublefault_tss(cpu); + doublefault_init_cpu_tss(); fpu__init_cpu(); diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index 61c707ca8a09..4eecfe4825ed 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -10,10 +10,6 @@ #include #include -#define DOUBLEFAULT_STACKSIZE (1024) -static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE]; -#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE) - #define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM) static void doublefault_fn(void) @@ -21,6 +17,8 @@ static void doublefault_fn(void) struct desc_ptr gdt_desc = {0, 0}; unsigned long gdt, tss; + BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE); + native_store_gdt(&gdt_desc); gdt = gdt_desc.address; @@ -48,24 +46,46 @@ static void doublefault_fn(void) cpu_relax(); } -struct x86_hw_tss doublefault_tss __cacheline_aligned = { - .sp0 = STACK_START, - .ss0 = __KERNEL_DS, - .ldt = 0, +DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { + .tss = { + /* + * No sp0 or ss0 -- we never run CPL != 0 with this TSS + * active. sp is filled in later. 
+ */ + .ldt = 0, .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, - .ip = (unsigned long) doublefault_fn, - /* 0x2 bit is always set */ - .flags = X86_EFLAGS_SF | 0x2, - .sp = STACK_START, - .es = __USER_DS, - .cs = __KERNEL_CS, - .ss = __KERNEL_DS, - .ds = __USER_DS, - .fs = __KERNEL_PERCPU, + .ip = (unsigned long) doublefault_fn, + /* 0x2 bit is always set */ + .flags = X86_EFLAGS_SF | 0x2, + .es = __USER_DS, + .cs = __KERNEL_CS, + .ss = __KERNEL_DS, + .ds = __USER_DS, + .fs = __KERNEL_PERCPU, #ifndef CONFIG_X86_32_LAZY_GS - .gs = __KERNEL_STACK_CANARY, + .gs = __KERNEL_STACK_CANARY, #endif - .__cr3 = __pa_nodebug(swapper_pg_dir), + .__cr3 = __pa_nodebug(swapper_pg_dir), + }, }; + +void doublefault_init_cpu_tss(void) +{ + unsigned int cpu = smp_processor_id(); + struct cpu_entry_area *cea = get_cpu_entry_area(cpu); + + /* + * The linker isn't smart enough to initialize percpu variables that + * point to other places in percpu space. + */ + this_cpu_write(doublefault_stack.tss.sp, + (unsigned long)&cea->doublefault_stack.stack + + sizeof(doublefault_stack.stack)); + + /* Set up doublefault TSS pointer in the GDT */ + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, + &get_cpu_entry_area(cpu)->doublefault_stack.tss); + +} diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 64a59d726639..8e3a8fedfa4d 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -29,6 +29,9 @@ const char *stack_type_name(enum stack_type type) if (type == STACK_TYPE_ENTRY) return "ENTRY_TRAMPOLINE"; + if (type == STACK_TYPE_EXCEPTION) + return "#DF"; + return NULL; } @@ -82,6 +85,30 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) return true; } +static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info) +{ +#ifdef CONFIG_DOUBLEFAULT + struct cpu_entry_area *cea = get_cpu_entry_area(raw_smp_processor_id()); + struct doublefault_stack *ss = &cea->doublefault_stack; + + void *begin = ss->stack; + void *end = begin + sizeof(ss->stack); + + if ((void *)stack < begin || (void *)stack >= end) + return false; + + info->type = STACK_TYPE_EXCEPTION; + info->begin = begin; + info->end = end; + info->next_sp = (unsigned long *)this_cpu_read(cpu_tss_rw.x86_tss.sp); + + return true; +#else + return false; +#endif +} + + int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask) { @@ -105,6 +132,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, if (in_softirq_stack(stack, info)) goto recursion_check; + if (in_doublefault_stack(stack, info)) + goto recursion_check; + goto unknown; recursion_check: diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 82ead8e27888..56f9189bbadb 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -17,6 +17,10 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks); DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks); #endif +#if defined(CONFIG_X86_32) && defined(CONFIG_DOUBLEFAULT) +DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack); +#endif + struct cpu_entry_area *get_cpu_entry_area(int cpu) { unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE; @@ -108,7 +112,15 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu) cea_map_stack(MCE); } #else -static inline void percpu_setup_exception_stacks(unsigned int cpu) {} +static inline void 
percpu_setup_exception_stacks(unsigned int cpu) {} +static inline void percpu_setup_exception_stacks(unsigned int cpu) +{ +#ifdef CONFIG_DOUBLEFAULT + struct cpu_entry_area *cea = get_cpu_entry_area(cpu); + + cea_map_percpu_pages(&cea->doublefault_stack, + &per_cpu(doublefault_stack, cpu), 1, PAGE_KERNEL); +#endif +} #endif /* Setup the fixmap mappings only once per-processor */ -- cgit v1.2.3
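As a closing illustration, the in_doublefault_stack() classification added in the dumpstack patch above boils down to a half-open range-containment test on the stack pointer. A stand-alone sketch of that shape, with hypothetical names rather than the kernel's types:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical mirror of the few stack_info fields used here. */
struct stack_info_sketch {
	const char *type;
	void *begin;
	void *end;
};

/* Half-open [begin, end) containment, like in_doublefault_stack(). */
static bool in_stack_range(void *addr, void *begin, void *end,
			   struct stack_info_sketch *info)
{
	uintptr_t a = (uintptr_t)addr;

	if (a < (uintptr_t)begin || a >= (uintptr_t)end)
		return false;

	info->type = "#DF";
	info->begin = begin;
	info->end = end;
	return true;
}

int main(void)
{
	unsigned long df_stack[512];	/* stand-in for the per-CPU #DF stack */
	struct stack_info_sketch info;

	if (in_stack_range(&df_stack[10], df_stack, df_stack + 512, &info))
		printf("address is on the %s stack\n", info.type);
	return 0;
}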