From 02390b87a9459937cdb299e6b34ff33992512ec7 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 14 Feb 2018 14:16:49 +0300 Subject: mm/zsmalloc: Prepare for variable MAX_PHYSMEM_BITS With boot-time switching between paging modes we will have a variable MAX_PHYSMEM_BITS. Let's use the maximum value possible for the CONFIG_X86_5LEVEL=y configuration to define the zsmalloc data structures. The patch introduces MAX_POSSIBLE_PHYSMEM_BITS to cover this case. It is also well suited to handling the PAE special case. Signed-off-by: Kirill A. Shutemov Reviewed-by: Nitin Gupta Acked-by: Minchan Kim Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sergey Senozhatsky Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180214111656.88514-3-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- mm/zsmalloc.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c3013505c305..b7f61cd1c709 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -84,18 +84,19 @@ * This is made more complicated by various memory models and PAE. */ -#ifndef MAX_PHYSMEM_BITS -#ifdef CONFIG_HIGHMEM64G -#define MAX_PHYSMEM_BITS 36 -#else /* !CONFIG_HIGHMEM64G */ +#ifndef MAX_POSSIBLE_PHYSMEM_BITS +#ifdef MAX_PHYSMEM_BITS +#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS +#else /* * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just * be PAGE_SHIFT */ -#define MAX_PHYSMEM_BITS BITS_PER_LONG +#define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG #endif #endif -#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) + +#define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT) /* * Memory for allocating for handle keeps object position by -- cgit v1.2.3 From c65e774fb3f6af212641538694b9778ff9ab4300 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 14 Feb 2018 14:16:53 +0300 Subject: x86/mm: Make PGDIR_SHIFT and PTRS_PER_P4D variable For boot-time switching between 4- and 5-level paging we need to be able to fold the p4d page table level at runtime. It requires a variable PGDIR_SHIFT and PTRS_PER_P4D. The change doesn't affect the kernel image size much: text data bss dec hex filename 8628091 4734304 1368064 14730459 e0c4db vmlinux.before 8628393 4734340 1368064 14730797 e0c62d vmlinux.after Signed-off-by: Kirill A.
Shutemov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180214111656.88514-7-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/kaslr.c | 2 ++ arch/x86/include/asm/pgtable_32.h | 2 ++ arch/x86/include/asm/pgtable_64_types.h | 19 ++++++++++++------- arch/x86/kernel/cpu/mcheck/mce.c | 18 ++++++------------ arch/x86/kernel/head64.c | 6 +++++- arch/x86/mm/dump_pagetables.c | 12 +++++------- arch/x86/mm/init_64.c | 2 +- arch/x86/mm/kasan_init_64.c | 2 +- arch/x86/platform/efi/efi_64.c | 4 ++-- include/asm-generic/5level-fixup.h | 1 + include/asm-generic/pgtable-nop4d.h | 9 +++++---- include/linux/kasan.h | 2 +- mm/kasan/kasan_init.c | 2 +- 13 files changed, 44 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index bd69e1830fbe..b18e8f9512de 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -48,6 +48,8 @@ #ifdef CONFIG_X86_5LEVEL unsigned int pgtable_l5_enabled __ro_after_init = 1; +unsigned int pgdir_shift __ro_after_init = 48; +unsigned int ptrs_per_p4d __ro_after_init = 512; #endif extern unsigned long get_cmd_line_ptr(void); diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index e67c0620aec2..d829360e26bd 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -33,6 +33,8 @@ static inline void pgtable_cache_init(void) { } static inline void check_pgt_cache(void) { } void paging_init(void); +static inline int pgd_large(pgd_t pgd) { return 0; } + /* * Define this if things work differently on an i386 and an i486: * it will (on an i486) warn about kernel memory accesses that are diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 903e4d054bcb..0c48d80e11d4 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -26,6 +26,9 @@ extern unsigned int pgtable_l5_enabled; #define pgtable_l5_enabled 0 #endif +extern unsigned int pgdir_shift; +extern unsigned int ptrs_per_p4d; + #endif /* !__ASSEMBLY__ */ #define SHARED_KERNEL_PMD 0 @@ -35,16 +38,17 @@ extern unsigned int pgtable_l5_enabled; /* * PGDIR_SHIFT determines what a top-level page table entry can map */ -#define PGDIR_SHIFT 48 +#define PGDIR_SHIFT pgdir_shift #define PTRS_PER_PGD 512 /* * 4th level page in 5-level paging case */ -#define P4D_SHIFT 39 -#define PTRS_PER_P4D 512 -#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT) -#define P4D_MASK (~(P4D_SIZE - 1)) +#define P4D_SHIFT 39 +#define MAX_PTRS_PER_P4D 512 +#define PTRS_PER_P4D ptrs_per_p4d +#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT) +#define P4D_MASK (~(P4D_SIZE - 1)) #define MAX_POSSIBLE_PHYSMEM_BITS 52 @@ -53,8 +57,9 @@ extern unsigned int pgtable_l5_enabled; /* * PGDIR_SHIFT determines what a top-level page table entry can map */ -#define PGDIR_SHIFT 39 -#define PTRS_PER_PGD 512 +#define PGDIR_SHIFT 39 +#define PTRS_PER_PGD 512 +#define MAX_PTRS_PER_P4D 1 #endif /* CONFIG_X86_5LEVEL */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3a8e88a611eb..cbb3af721291 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1082,19 +1082,7 @@ void arch_unmap_kpfn(unsigned long pfn) * a legal address. */ -/* - * Build time check to see if we have a spare virtual bit. 
Don't want - * to leave this until run time because most developers don't have a - * system that can exercise this code path. This will only become a - * problem if/when we move beyond 5-level page tables. - * - * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD) - */ -#if PGDIR_SHIFT + 9 < 63 decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); -#else -#error "no unused virtual bit available" -#endif if (set_memory_np(decoy_addr, 1)) pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); @@ -2328,6 +2316,12 @@ static __init int mcheck_init_device(void) { int err; + /* + * Check if we have a spare virtual bit. This will only become + * a problem if/when we move beyond 5-level page tables. + */ + MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63); + if (!mce_available(&boot_cpu_data)) { err = -EIO; goto err_out; diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 17d00d1886de..98b0ff49b220 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -42,6 +42,10 @@ pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); #ifdef CONFIG_X86_5LEVEL unsigned int pgtable_l5_enabled __ro_after_init = 1; EXPORT_SYMBOL(pgtable_l5_enabled); +unsigned int pgdir_shift __ro_after_init = 48; +EXPORT_SYMBOL(pgdir_shift); +unsigned int ptrs_per_p4d __ro_after_init = 512; +EXPORT_SYMBOL(ptrs_per_p4d); #endif #ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT @@ -336,7 +340,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0); BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0); BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); - BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == + MAYBE_BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == (__START_KERNEL & PGDIR_MASK))); BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index a89f2dbc3531..420058b05d39 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -428,14 +428,15 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, #define p4d_none(a) pud_none(__pud(p4d_val(a))) #endif -#if PTRS_PER_P4D > 1 - static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P) { int i; p4d_t *start, *p4d_start; pgprotval_t prot; + if (PTRS_PER_P4D == 1) + return walk_pud_level(m, st, __p4d(pgd_val(addr)), P); + p4d_start = start = (p4d_t *)pgd_page_vaddr(addr); for (i = 0; i < PTRS_PER_P4D; i++) { @@ -455,11 +456,8 @@ static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, } } -#else -#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p) -#define pgd_large(a) p4d_large(__p4d(pgd_val(a))) -#define pgd_none(a) p4d_none(__p4d(pgd_val(a))) -#endif +#define pgd_large(a) (pgtable_l5_enabled ? pgd_large(a) : p4d_large(__p4d(pgd_val(a)))) +#define pgd_none(a) (pgtable_l5_enabled ? pgd_none(a) : p4d_none(__p4d(pgd_val(a)))) static inline bool is_hypervisor_range(int idx) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1ab42c852069..6a4b20bc7527 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -143,7 +143,7 @@ void sync_global_pgds(unsigned long start, unsigned long end) * With folded p4d, pgd_none() is always false, we need to * handle synchonization on p4d level. 
*/ - BUILD_BUG_ON(pgd_none(*pgd_ref)); + MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref)); p4d_ref = p4d_offset(pgd_ref, addr); if (p4d_none(*p4d_ref)) diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index af6f2f9c6a26..12ec90f62457 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -19,7 +19,7 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES]; -static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); +static p4d_t tmp_p4d_table[MAX_PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); static __init void *early_alloc(size_t size, int nid, bool panic) { diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 780460aa5ea5..d52aaa7dc088 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -257,8 +257,8 @@ void efi_sync_low_kernel_mappings(void) * only span a single PGD entry and that the entry also maps * other important kernel regions. */ - BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END)); - BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) != + MAYBE_BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END)); + MAYBE_BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) != (EFI_VA_END & PGDIR_MASK)); pgd_efi = efi_pgd + pgd_index(PAGE_OFFSET); diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h index dfbd9d990637..9c2e0708eb82 100644 --- a/include/asm-generic/5level-fixup.h +++ b/include/asm-generic/5level-fixup.h @@ -8,6 +8,7 @@ #define P4D_SHIFT PGDIR_SHIFT #define P4D_SIZE PGDIR_SIZE #define P4D_MASK PGDIR_MASK +#define MAX_PTRS_PER_P4D 1 #define PTRS_PER_P4D 1 #define p4d_t pgd_t diff --git a/include/asm-generic/pgtable-nop4d.h b/include/asm-generic/pgtable-nop4d.h index 8f22f55de17a..1a29b2a0282b 100644 --- a/include/asm-generic/pgtable-nop4d.h +++ b/include/asm-generic/pgtable-nop4d.h @@ -8,10 +8,11 @@ typedef struct { pgd_t pgd; } p4d_t; -#define P4D_SHIFT PGDIR_SHIFT -#define PTRS_PER_P4D 1 -#define P4D_SIZE (1UL << P4D_SHIFT) -#define P4D_MASK (~(P4D_SIZE-1)) +#define P4D_SHIFT PGDIR_SHIFT +#define MAX_PTRS_PER_P4D 1 +#define PTRS_PER_P4D 1 +#define P4D_SIZE (1UL << P4D_SHIFT) +#define P4D_MASK (~(P4D_SIZE-1)) /* * The "pgd_xxx()" functions here are trivial for a folded two-level diff --git a/include/linux/kasan.h b/include/linux/kasan.h index adc13474a53b..d6459bd1376d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -18,7 +18,7 @@ extern unsigned char kasan_zero_page[PAGE_SIZE]; extern pte_t kasan_zero_pte[PTRS_PER_PTE]; extern pmd_t kasan_zero_pmd[PTRS_PER_PMD]; extern pud_t kasan_zero_pud[PTRS_PER_PUD]; -extern p4d_t kasan_zero_p4d[PTRS_PER_P4D]; +extern p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D]; void kasan_populate_zero_shadow(const void *shadow_start, const void *shadow_end); diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c index 554e4c0f23a2..f436246ccc79 100644 --- a/mm/kasan/kasan_init.c +++ b/mm/kasan/kasan_init.c @@ -31,7 +31,7 @@ unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; #if CONFIG_PGTABLE_LEVELS > 4 -p4d_t kasan_zero_p4d[PTRS_PER_P4D] __page_aligned_bss; +p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss; #endif #if CONFIG_PGTABLE_LEVELS > 3 pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; -- cgit v1.2.3 From 15d9f3d116c02a485441d758d9ca0a2e4f3b30be Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Thu, 15 Feb 2018 10:08:14 -0600 Subject: percpu: match chunk allocator declarations with definitions At some point the function declaration parameters got out of sync with the function 
definitions in percpu-vm.c and percpu-km.c. This patch makes them match again. Signed-off-by: Dennis Zhou Acked-by: Christoph Lameter Signed-off-by: Tejun Heo --- mm/percpu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 50e7fdf84055..e1ea41002173 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1277,8 +1277,10 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, * pcpu_addr_to_page - translate address to physical address * pcpu_verify_alloc_info - check alloc_info is acceptable during init */ -static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size); -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size); +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end); +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end); static struct pcpu_chunk *pcpu_create_chunk(void); static void pcpu_destroy_chunk(struct pcpu_chunk *chunk); static struct page *pcpu_addr_to_page(void *addr); -- cgit v1.2.3 From 47504ee04b9241548ae2c28be7d0b01cff3b7aa6 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Fri, 16 Feb 2018 12:07:19 -0600 Subject: percpu: add __GFP_NORETRY semantics to the percpu balancing path Percpu memory using the vmalloc area based chunk allocator lazily populates chunks by first requesting the full virtual address space required for the chunk and subsequently adding pages as allocations come through. To ensure atomic allocations can succeed, a workqueue item is used to maintain a minimum number of empty pages. In certain scenarios, such as reported in [1], it is possible that physical memory becomes quite scarce, which can result in either a rather long time spent trying to find free pages or, worse, a kernel panic. This patch adds support for __GFP_NORETRY and __GFP_NOWARN, passing them through to the underlying allocators. This should prevent any unnecessary panics potentially caused by the workqueue item. The gfp flags are passed around as additional flags rather than as a full set of flags; the next patch will change this to caller-passed semantics. V2: Added const modifier to gfp flags in the balance path. Removed an extra whitespace.
[1] https://lkml.org/lkml/2018/2/12/551 Signed-off-by: Dennis Zhou Suggested-by: Daniel Borkmann Reported-by: syzbot+adb03f3f0bb57ce3acda@syzkaller.appspotmail.com Acked-by: Christoph Lameter Signed-off-by: Tejun Heo --- mm/percpu-km.c | 8 ++++---- mm/percpu-vm.c | 18 +++++++++++------- mm/percpu.c | 44 +++++++++++++++++++++++++++----------------- 3 files changed, 42 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/percpu-km.c b/mm/percpu-km.c index d2a76642c4ae..0d88d7bd5706 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -34,7 +34,7 @@ #include static int pcpu_populate_chunk(struct pcpu_chunk *chunk, - int page_start, int page_end) + int page_start, int page_end, gfp_t gfp) { return 0; } @@ -45,18 +45,18 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, /* nada */ } -static struct pcpu_chunk *pcpu_create_chunk(void) +static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) { const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT; struct pcpu_chunk *chunk; struct page *pages; int i; - chunk = pcpu_alloc_chunk(); + chunk = pcpu_alloc_chunk(gfp); if (!chunk) return NULL; - pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages)); + pages = alloc_pages(gfp | GFP_KERNEL, order_base_2(nr_pages)); if (!pages) { pcpu_free_chunk(chunk); return NULL; diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 9158e5a81391..0af71eb2fff0 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -37,7 +37,7 @@ static struct page **pcpu_get_pages(void) lockdep_assert_held(&pcpu_alloc_mutex); if (!pages) - pages = pcpu_mem_zalloc(pages_size); + pages = pcpu_mem_zalloc(pages_size, 0); return pages; } @@ -73,18 +73,21 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() * @page_start: page index of the first page to be allocated * @page_end: page index of the last page to be allocated + 1 + * @gfp: allocation flags passed to the underlying allocator * * Allocate pages [@page_start,@page_end) into @pages for all units. * The allocation is for @chunk. Percpu core doesn't care about the * content of @pages and will pass it verbatim to pcpu_map_pages(). */ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, - struct page **pages, int page_start, int page_end) + struct page **pages, int page_start, int page_end, + gfp_t gfp) { - const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM; unsigned int cpu, tcpu; int i; + gfp |= GFP_KERNEL | __GFP_HIGHMEM; + for_each_possible_cpu(cpu) { for (i = page_start; i < page_end; i++) { struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; @@ -262,6 +265,7 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, * @chunk: chunk of interest * @page_start: the start page * @page_end: the end page + * @gfp: allocation flags passed to the underlying memory allocator * * For each cpu, populate and map pages [@page_start,@page_end) into * @chunk. @@ -270,7 +274,7 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, * pcpu_alloc_mutex, does GFP_KERNEL allocation. 
*/ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, - int page_start, int page_end) + int page_start, int page_end, gfp_t gfp) { struct page **pages; @@ -278,7 +282,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, if (!pages) return -ENOMEM; - if (pcpu_alloc_pages(chunk, pages, page_start, page_end)) + if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp)) return -ENOMEM; if (pcpu_map_pages(chunk, pages, page_start, page_end)) { @@ -325,12 +329,12 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, pcpu_free_pages(chunk, pages, page_start, page_end); } -static struct pcpu_chunk *pcpu_create_chunk(void) +static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) { struct pcpu_chunk *chunk; struct vm_struct **vms; - chunk = pcpu_alloc_chunk(); + chunk = pcpu_alloc_chunk(gfp); if (!chunk) return NULL; diff --git a/mm/percpu.c b/mm/percpu.c index e1ea41002173..f97443d488a8 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -447,10 +447,12 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, /** * pcpu_mem_zalloc - allocate memory * @size: bytes to allocate + * @gfp: allocation flags * * Allocate @size bytes. If @size is smaller than PAGE_SIZE, - * kzalloc() is used; otherwise, vzalloc() is used. The returned - * memory is always zeroed. + * kzalloc() is used; otherwise, the equivalent of vzalloc() is used. + * This is to facilitate passing through whitelisted flags. The + * returned memory is always zeroed. * * CONTEXT: * Does GFP_KERNEL allocation. @@ -458,15 +460,16 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ -static void *pcpu_mem_zalloc(size_t size) +static void *pcpu_mem_zalloc(size_t size, gfp_t gfp) { if (WARN_ON_ONCE(!slab_is_available())) return NULL; if (size <= PAGE_SIZE) - return kzalloc(size, GFP_KERNEL); + return kzalloc(size, gfp | GFP_KERNEL); else - return vzalloc(size); + return __vmalloc(size, gfp | GFP_KERNEL | __GFP_ZERO, + PAGE_KERNEL); } /** @@ -1154,12 +1157,12 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, return chunk; } -static struct pcpu_chunk *pcpu_alloc_chunk(void) +static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) { struct pcpu_chunk *chunk; int region_bits; - chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size); + chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp); if (!chunk) return NULL; @@ -1168,17 +1171,17 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) region_bits = pcpu_chunk_map_bits(chunk); chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) * - sizeof(chunk->alloc_map[0])); + sizeof(chunk->alloc_map[0]), gfp); if (!chunk->alloc_map) goto alloc_map_fail; chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) * - sizeof(chunk->bound_map[0])); + sizeof(chunk->bound_map[0]), gfp); if (!chunk->bound_map) goto bound_map_fail; chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) * - sizeof(chunk->md_blocks[0])); + sizeof(chunk->md_blocks[0]), gfp); if (!chunk->md_blocks) goto md_blocks_fail; @@ -1278,10 +1281,10 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, * pcpu_verify_alloc_info - check alloc_info is acceptable during init */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, - int page_start, int page_end); + int page_start, int page_end, gfp_t gfp); static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end); -static struct pcpu_chunk *pcpu_create_chunk(void); +static 
struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp); static void pcpu_destroy_chunk(struct pcpu_chunk *chunk); static struct page *pcpu_addr_to_page(void *addr); static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai); @@ -1423,7 +1426,7 @@ restart: } if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { - chunk = pcpu_create_chunk(); + chunk = pcpu_create_chunk(0); if (!chunk) { err = "failed to allocate new chunk"; goto fail; } @@ -1452,7 +1455,7 @@ area_found: page_start, page_end) { WARN_ON(chunk->immutable); - ret = pcpu_populate_chunk(chunk, rs, re); + ret = pcpu_populate_chunk(chunk, rs, re, 0); spin_lock_irqsave(&pcpu_lock, flags); if (ret) { @@ -1563,10 +1566,17 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) * pcpu_balance_workfn - manage the amount of free chunks and populated pages * @work: unused * - * Reclaim all fully free chunks except for the first one. + * Reclaim all fully free chunks except for the first one. This is also + * responsible for maintaining the pool of empty populated pages. However, + * it is possible that this is called when physical memory is scarce causing + * OOM killer to be triggered. We should avoid doing so until an actual + * allocation causes the failure as it is possible that requests can be + * serviced from already backed regions. */ static void pcpu_balance_workfn(struct work_struct *work) { + /* gfp flags passed to underlying allocators */ + const gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; LIST_HEAD(to_free); struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; @@ -1647,7 +1657,7 @@ retry_pop: chunk->nr_pages) { int nr = min(re - rs, nr_to_pop); - ret = pcpu_populate_chunk(chunk, rs, rs + nr); + ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp); if (!ret) { nr_to_pop -= nr; spin_lock_irq(&pcpu_lock); @@ -1664,7 +1674,7 @@ retry_pop: if (nr_to_pop) { /* ran out of chunks to populate, create a new one and retry */ - chunk = pcpu_create_chunk(); + chunk = pcpu_create_chunk(gfp); if (chunk) { spin_lock_irq(&pcpu_lock); pcpu_chunk_relocate(chunk, -1); -- cgit v1.2.3 From 554fef1c39ee148623a496e04569dabb11463406 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Fri, 16 Feb 2018 12:09:58 -0600 Subject: percpu: allow select gfp to be passed to underlying allocators The prior patch added support for passing gfp flags through to the underlying allocators. This patch allows users to pass along gfp flags (currently only __GFP_NORETRY and __GFP_NOWARN) to the underlying allocators. This should allow users to decide whether they are OK with allocations failing and recovering in a more graceful way. Additionally, gfp passing was done as additional flags in the previous patch. Instead, change this to caller-passed semantics. GFP_KERNEL is also removed as the default flag. It continues to be used for internally caused underlying percpu allocations. V2: Removed gfp_percpu_mask in favor of doing it inline. Removed GFP_KERNEL as a default flag for __alloc_percpu_gfp.
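As an illustration of the new caller-passed semantics (a minimal sketch, not part of the patch; struct foo_stats and the error path are hypothetical), a caller that prefers graceful failure over prolonged reclaim could do:

    struct foo_stats __percpu *stats;

    /* opt in to failing fast: no heavy retry loops, no failure warning */
    stats = alloc_percpu_gfp(struct foo_stats,
                             GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
    if (!stats)
        return -ENOMEM;    /* recover instead of pressing reclaim harder */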
Signed-off-by: Dennis Zhou Suggested-by: Daniel Borkmann Acked-by: Christoph Lameter Signed-off-by: Tejun Heo --- mm/percpu-km.c | 2 +- mm/percpu-vm.c | 4 ++-- mm/percpu.c | 16 +++++++--------- 3 files changed, 10 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 0d88d7bd5706..38de70ab1a0d 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -56,7 +56,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) if (!chunk) return NULL; - pages = alloc_pages(gfp | GFP_KERNEL, order_base_2(nr_pages)); + pages = alloc_pages(gfp, order_base_2(nr_pages)); if (!pages) { pcpu_free_chunk(chunk); return NULL; diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 0af71eb2fff0..d8078de912de 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -37,7 +37,7 @@ static struct page **pcpu_get_pages(void) lockdep_assert_held(&pcpu_alloc_mutex); if (!pages) - pages = pcpu_mem_zalloc(pages_size, 0); + pages = pcpu_mem_zalloc(pages_size, GFP_KERNEL); return pages; } @@ -86,7 +86,7 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, unsigned int cpu, tcpu; int i; - gfp |= GFP_KERNEL | __GFP_HIGHMEM; + gfp |= __GFP_HIGHMEM; for_each_possible_cpu(cpu) { for (i = page_start; i < page_end; i++) { diff --git a/mm/percpu.c b/mm/percpu.c index f97443d488a8..fa3f854634a1 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -454,9 +454,6 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, * This is to facilitate passing through whitelisted flags. The * returned memory is always zeroed. * - * CONTEXT: - * Does GFP_KERNEL allocation. - * * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ @@ -466,10 +463,9 @@ static void *pcpu_mem_zalloc(size_t size, gfp_t gfp) return NULL; if (size <= PAGE_SIZE) - return kzalloc(size, gfp | GFP_KERNEL); + return kzalloc(size, gfp); else - return __vmalloc(size, gfp | GFP_KERNEL | __GFP_ZERO, - PAGE_KERNEL); + return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL); } /** @@ -1344,6 +1340,8 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, gfp_t gfp) { + /* whitelisted flags that can be passed to the backing allocators */ + gfp_t pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; bool do_warn = !(gfp & __GFP_NOWARN); static int warn_limit = 10; @@ -1426,7 +1424,7 @@ restart: } if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { - chunk = pcpu_create_chunk(0); + chunk = pcpu_create_chunk(pcpu_gfp); if (!chunk) { err = "failed to allocate new chunk"; goto fail; @@ -1455,7 +1453,7 @@ area_found: page_start, page_end) { WARN_ON(chunk->immutable); - ret = pcpu_populate_chunk(chunk, rs, re, 0); + ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp); spin_lock_irqsave(&pcpu_lock, flags); if (ret) { @@ -1576,7 +1574,7 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) static void pcpu_balance_workfn(struct work_struct *work) { /* gfp flags passed to underlying allocators */ - const gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; + const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; LIST_HEAD(to_free); struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; -- cgit v1.2.3 From 5f171577b4f35b44795a73bde8cf2c49b4073925 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 24 Oct 2017 16:52:32 +0100 Subject: Drop a bunch of metag references Now that arch/metag/ has been removed, drop a bunch of metag references 
in various codes across the whole tree: - VM_GROWSUP and __VM_ARCH_SPECIFIC_1. - MT_METAG_* ELF note types. - METAG Kconfig dependencies (FRAME_POINTER) and ranges (MAX_STACK_SIZE_MB). - metag cases in tools (checkstack.pl, recordmcount.c, perf). Signed-off-by: James Hogan Acked-by: Steven Rostedt (VMware) Acked-by: Peter Zijlstra (Intel) Reviewed-by: Guenter Roeck Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: linux-mm@kvack.org Cc: linux-metag@vger.kernel.org --- include/linux/cpuhotplug.h | 1 - include/linux/mm.h | 2 -- include/trace/events/mmflags.h | 2 +- include/uapi/linux/elf.h | 3 --- lib/Kconfig.debug | 2 +- mm/Kconfig | 7 +++---- scripts/checkstack.pl | 4 ---- scripts/recordmcount.c | 20 -------------------- tools/perf/perf-sys.h | 4 ---- 9 files changed, 5 insertions(+), 40 deletions(-) (limited to 'mm') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 5172ad0daa7c..c7a950681f3a 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -108,7 +108,6 @@ enum cpuhp_state { CPUHP_AP_PERF_X86_CQM_STARTING, CPUHP_AP_PERF_X86_CSTATE_STARTING, CPUHP_AP_PERF_XTENSA_STARTING, - CPUHP_AP_PERF_METAG_STARTING, CPUHP_AP_MIPS_OP_LOONGSON3_STARTING, CPUHP_AP_ARM_SDEI_STARTING, CPUHP_AP_ARM_VFP_STARTING, diff --git a/include/linux/mm.h b/include/linux/mm.h index ad06d42adb1a..ccac10682ce5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -241,8 +241,6 @@ extern unsigned int kobjsize(const void *objp); # define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ #elif defined(CONFIG_PARISC) # define VM_GROWSUP VM_ARCH_1 -#elif defined(CONFIG_METAG) -# define VM_GROWSUP VM_ARCH_1 #elif defined(CONFIG_IA64) # define VM_GROWSUP VM_ARCH_1 #elif !defined(CONFIG_MMU) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index dbe1bb058c09..a81cffb76d89 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -115,7 +115,7 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" ) #define __VM_ARCH_SPECIFIC_1 {VM_PAT, "pat" } #elif defined(CONFIG_PPC) #define __VM_ARCH_SPECIFIC_1 {VM_SAO, "sao" } -#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64) +#elif defined(CONFIG_PARISC) || defined(CONFIG_IA64) #define __VM_ARCH_SPECIFIC_1 {VM_GROWSUP, "growsup" } #elif !defined(CONFIG_MMU) #define __VM_ARCH_SPECIFIC_1 {VM_MAPPED_COPY,"mappedcopy" } diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 3bf73fb58045..e2535d6dcec7 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -420,9 +420,6 @@ typedef struct elf64_shdr { #define NT_ARM_HW_WATCH 0x403 /* ARM hardware watchpoint registers */ #define NT_ARM_SYSTEM_CALL 0x404 /* ARM system call number */ #define NT_ARM_SVE 0x405 /* ARM Scalable Vector Extension registers */ -#define NT_METAG_CBUF 0x500 /* Metag catch buffer registers */ -#define NT_METAG_RPIPE 0x501 /* Metag read pipeline state */ -#define NT_METAG_TLS 0x502 /* Metag TLS pointer */ #define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ /* Note header in a PT_NOTE section */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 6088408ef26c..d1c523e408e9 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -356,7 +356,7 @@ config FRAME_POINTER bool "Compile the kernel with frame pointers" depends on DEBUG_KERNEL && \ (CRIS || M68K || FRV || UML || \ - SUPERH || BLACKFIN || MN10300 || METAG) || \ + SUPERH || BLACKFIN || MN10300) || \ ARCH_WANT_FRAME_POINTERS default y if 
(DEBUG_INFO && UML) || ARCH_WANT_FRAME_POINTERS help diff --git a/mm/Kconfig b/mm/Kconfig index c782e8fb7235..abefa573bcd8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -627,15 +627,14 @@ config GENERIC_EARLY_IOREMAP config MAX_STACK_SIZE_MB int "Maximum user stack size for 32-bit processes (MB)" default 80 - range 8 256 if METAG range 8 2048 depends on STACK_GROWSUP && (!64BIT || COMPAT) help This is the maximum stack size in Megabytes in the VM layout of 32-bit user processes when the stack grows upwards (currently only on parisc - and metag arch). The stack will be located at the highest memory - address minus the given value, unless the RLIMIT_STACK hard limit is - changed to a smaller value in which case that is used. + arch). The stack will be located at the highest memory address minus + the given value, unless the RLIMIT_STACK hard limit is changed to a + smaller value in which case that is used. A sane initial value is 80 MB. diff --git a/scripts/checkstack.pl b/scripts/checkstack.pl index cb993801e4b2..eeb9ac8dbcfb 100755 --- a/scripts/checkstack.pl +++ b/scripts/checkstack.pl @@ -64,10 +64,6 @@ my (@stack, $re, $dre, $x, $xs, $funcre); # 2b6c: 4e56 fb70 linkw %fp,#-1168 # 1df770: defc ffe4 addaw #-28,%sp $re = qr/.*(?:linkw %fp,|addaw )#-([0-9]{1,4})(?:,%sp)?$/o; - } elsif ($arch eq 'metag') { - #400026fc: 40 00 00 82 ADD A0StP,A0StP,#0x8 - $re = qr/.*ADD.*A0StP,A0StP,\#(0x$x{1,8})/o; - $funcre = qr/^$x* <[^\$](.*)>:$/; } elsif ($arch eq 'mips64') { #8800402c: 67bdfff0 daddiu sp,sp,-16 $re = qr/.*daddiu.*sp,sp,-(([0-9]{2}|[3-9])[0-9]{2})/o; diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 16e086dcc567..8c9691c3329e 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -33,20 +33,6 @@ #include #include -/* - * glibc synced up and added the metag number but didn't add the relocations. - * Work around this in a crude manner for now. - */ -#ifndef EM_METAG -#define EM_METAG 174 -#endif -#ifndef R_METAG_ADDR32 -#define R_METAG_ADDR32 2 -#endif -#ifndef R_METAG_NONE -#define R_METAG_NONE 3 -#endif - #ifndef EM_AARCH64 #define EM_AARCH64 183 #define R_AARCH64_NONE 0 @@ -538,12 +524,6 @@ do_file(char const *const fname) gpfx = '_'; break; case EM_IA_64: reltype = R_IA64_IMM64; gpfx = '_'; break; - case EM_METAG: reltype = R_METAG_ADDR32; - altmcount = "_mcount_wrapper"; - rel_type_nop = R_METAG_NONE; - /* We happen to have the same requirement as MIPS */ - is_fake_mcount32 = MIPS32_is_fake_mcount; - break; case EM_MIPS: /* reltype: e_class */ gpfx = '_'; break; case EM_PPC: reltype = R_PPC_ADDR32; gpfx = '_'; break; case EM_PPC64: reltype = R_PPC64_ADDR64; gpfx = '_'; break; diff --git a/tools/perf/perf-sys.h b/tools/perf/perf-sys.h index 36673f98d66b..3eb7a39169f6 100644 --- a/tools/perf/perf-sys.h +++ b/tools/perf/perf-sys.h @@ -46,10 +46,6 @@ #define CPUINFO_PROC {"Processor"} #endif -#ifdef __metag__ -#define CPUINFO_PROC {"CPU"} -#endif - #ifdef __xtensa__ #define CPUINFO_PROC {"core ID"} #endif -- cgit v1.2.3 From accd4f36a7d11c2d54544007eb65e10604dcf2f5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 23 Feb 2018 08:12:42 -0800 Subject: percpu: add a schedule point in pcpu_balance_workfn() When a large BPF percpu map is destroyed, I have seen pcpu_balance_workfn() holding cpu for hundreds of milliseconds. On KASAN config and 112 hyperthreads, average time to destroy a chunk is ~4 ms. [ 2489.841376] destroy chunk 1 in 4148689 ns ... 
[ 2490.093428] destroy chunk 32 in 4072718 ns Signed-off-by: Eric Dumazet Signed-off-by: Tejun Heo --- mm/percpu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index fa3f854634a1..36e7b65ba6cf 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1610,6 +1610,7 @@ static void pcpu_balance_workfn(struct work_struct *work) spin_unlock_irq(&pcpu_lock); } pcpu_destroy_chunk(chunk); + cond_resched(); } /* -- cgit v1.2.3 From 4704dea36dd9e5b4bf37ed20f7f15e70632ccdd0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 9 Mar 2018 15:50:55 -0800 Subject: hugetlb: fix surplus pages accounting Dan Rue has noticed that the libhugetlbfs test suite fails the counter test: # mount_point="/mnt/hugetlb/" # echo 200 > /proc/sys/vm/nr_hugepages # mkdir -p "${mount_point}" # mount -t hugetlbfs hugetlbfs "${mount_point}" # export LD_LIBRARY_PATH=/root/libhugetlbfs/libhugetlbfs-2.20/obj64 # /root/libhugetlbfs/libhugetlbfs-2.20/tests/obj64/counters Starting testcase "/root/libhugetlbfs/libhugetlbfs-2.20/tests/obj64/counters", pid 3319 Base pool size: 0 Clean... FAIL Line 326: Bad HugePages_Total: expected 0, actual 1 The bug was bisected to 0c397daea1d4 ("mm, hugetlb: further simplify hugetlb allocation API"). The reason is that alloc_surplus_huge_page() misaccounts per-node surplus pages. We should increase surplus_huge_pages_node rather than nr_huge_pages_node, which is already handled by alloc_fresh_huge_page. Link: http://lkml.kernel.org/r/20180221191439.GM2231@dhcp22.suse.cz Fixes: 0c397daea1d4 ("mm, hugetlb: further simplify hugetlb allocation API") Signed-off-by: Michal Hocko Reported-by: Dan Rue Tested-by: Dan Rue Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7c204e3d132b..a963f2034dfc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1583,7 +1583,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, page = NULL; } else { h->surplus_huge_pages++; - h->nr_huge_pages_node[page_to_nid(page)]++; + h->surplus_huge_pages_node[page_to_nid(page)]++; } out_unlock: -- cgit v1.2.3 From 96312e61282ae3f6537a562625706498cbc75594 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 9 Mar 2018 15:51:06 -0800 Subject: mm/gup.c: teach get_user_pages_unlocked to handle FOLL_NOWAIT KVM is hanging during postcopy live migration with userfaultfd because get_user_pages_unlocked is not capable of handling FOLL_NOWAIT. Earlier FOLL_NOWAIT was only ever passed to get_user_pages. Specifically, faultin_page (the callee of the get_user_pages_unlocked caller) doesn't know that if FAULT_FLAG_RETRY_NOWAIT was set in the page fault flags, when VM_FAULT_RETRY is returned, the mmap_sem wasn't actually released (even if nonblocking is not NULL). So it sets *nonblocking to zero and the caller won't release the mmap_sem thinking it was already released, but it wasn't because of FOLL_NOWAIT. Link: http://lkml.kernel.org/r/20180302174343.5421-2-aarcange@redhat.com Fixes: ce53053ce378c ("kvm: switch get_user_page_nowait() to get_user_pages_unlocked()") Signed-off-by: Andrea Arcangeli Reported-by: Dr. David Alan Gilbert Tested-by: Dr.
David Alan Gilbert Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 1b46e6e74881..6afae32571ca 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -516,7 +516,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, } if (ret & VM_FAULT_RETRY) { - if (nonblocking) + if (nonblocking && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) *nonblocking = 0; return -EBUSY; } @@ -890,7 +890,10 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, break; } if (*locked) { - /* VM_FAULT_RETRY didn't trigger */ + /* + * VM_FAULT_RETRY didn't trigger or it was a + * FOLL_NOWAIT. + */ if (!pages_done) pages_done = ret; break; -- cgit v1.2.3 From 379b03b7fa05f7db521b7732a52692448a3c34fe Mon Sep 17 00:00:00 2001 From: Daniel Vacek Date: Fri, 9 Mar 2018 15:51:09 -0800 Subject: mm/memblock.c: hardcode the end_pfn being -1 This is just a cleanup. It aids handling the special end case in the next commit. [akpm@linux-foundation.org: make it work against current -linus, not against -mm] [akpm@linux-foundation.org: make it work against current -linus, not against -mm some more] Link: http://lkml.kernel.org/r/1ca478d4269125a99bcfb1ca04d7b88ac1aee924.1520011944.git.neelx@redhat.com Signed-off-by: Daniel Vacek Cc: Michal Hocko Cc: Vlastimil Babka Cc: Mel Gorman Cc: Pavel Tatashin Cc: Paul Burton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 5a9ca2a1751b..b6ba6b7adadc 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1107,7 +1107,7 @@ unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn, struct memblock_type *type = &memblock.memory; unsigned int right = type->cnt; unsigned int mid, left = 0; - phys_addr_t addr = PFN_PHYS(pfn + 1); + phys_addr_t addr = PFN_PHYS(++pfn); do { mid = (right + left) / 2; @@ -1118,15 +1118,15 @@ unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn, type->regions[mid].size)) left = mid + 1; else { - /* addr is within the region, so pfn + 1 is valid */ - return min(pfn + 1, max_pfn); + /* addr is within the region, so pfn is valid */ + return pfn; } } while (left < right); if (right == type->cnt) - return max_pfn; + return -1UL; else - return min(PHYS_PFN(type->regions[right].base), max_pfn); + return PHYS_PFN(type->regions[right].base); } /** -- cgit v1.2.3 From 864b75f9d6b0100bb24fdd9a20d156e7cda9b5ae Mon Sep 17 00:00:00 2001 From: Daniel Vacek Date: Fri, 9 Mar 2018 15:51:13 -0800 Subject: mm/page_alloc: fix memmap_init_zone pageblock alignment Commit b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns where possible") introduced a bug where move_freepages() triggers a VM_BUG_ON() on uninitialized page structure due to pageblock alignment. To fix this, simply align the skipped pfns in memmap_init_zone() the same way as in move_freepages_block(). Seen in one of the RHEL reports: crash> log | grep -e BUG -e RIP -e Call.Trace -e move_freepages_block -e rmqueue -e freelist -A1 kernel BUG at mm/page_alloc.c:1389! 
invalid opcode: 0000 [#1] SMP -- RIP: 0010:[] [] move_freepages+0x15e/0x160 RSP: 0018:ffff88054d727688 EFLAGS: 00010087 -- Call Trace: [] move_freepages_block+0x73/0x80 [] __rmqueue+0x263/0x460 [] get_page_from_freelist+0x7e1/0x9e0 [] __alloc_pages_nodemask+0x176/0x420 -- RIP [] move_freepages+0x15e/0x160 RSP crash> page_init_bug -v | grep RAM 1000 - 9bfff System RAM (620.00 KiB) 100000 - 430bffff System RAM ( 1.05 GiB = 1071.75 MiB = 1097472.00 KiB) 4b0c8000 - 4bf9cfff System RAM ( 14.83 MiB = 15188.00 KiB) 4bfac000 - 646b1fff System RAM (391.02 MiB = 400408.00 KiB) 7b788000 - 7b7fffff System RAM (480.00 KiB) 100000000 - 67fffffff System RAM ( 22.00 GiB) crash> page_init_bug | head -6 7b788000 - 7b7fffff System RAM (480.00 KiB) 1fffff00000000 0 1 DMA32 4096 1048575 505736 505344 505855 0 0 0 DMA 1 4095 1fffff00000400 0 1 DMA32 4096 1048575 BUG, zones differ! Note that this range follows two not populated sections 68000000-77ffffff in this zone. 7b788000-7b7fffff is the first one after a gap. This makes memmap_init_zone() skip all the pfns up to the beginning of this range. But this range is not pageblock (2M) aligned. In fact no range has to be. crash> kmem -p 77fff000 78000000 7b5ff000 7b600000 7b787000 7b788000 PAGE PHYSICAL MAPPING INDEX CNT FLAGS ffffea0001e00000 78000000 0 0 0 0 ffffea0001ed7fc0 7b5ff000 0 0 0 0 ffffea0001ed8000 7b600000 0 0 0 0 <<<< ffffea0001ede1c0 7b787000 0 0 0 0 ffffea0001ede200 7b788000 0 0 1 1fffff00000000 Top part of page flags should contain nodeid and zonenr, which is not the case for page ffffea0001ed8000 here (<<<<). crash> log | grep -o fffea0001ed[^\ ]* | sort -u fffea0001ed8000 fffea0001eded20 fffea0001edffc0 crash> bt -r | grep -o fffea0001ed[^\ ]* | sort -u fffea0001ed8000 fffea0001eded00 fffea0001eded20 fffea0001edffc0 Initialization of the whole beginning of the section is skipped up to the start of the range due to the commit b92df1de5d28. Now any code calling move_freepages_block() (like reusing the page from a freelist as in this example) with a page from the beginning of the range will get the page rounded down to start_page ffffea0001ed8000 and passed to move_freepages() which crashes on assertion getting wrong zonenr. > VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); Note, page_zone() derives the zone from page flags here. From similar machine before commit b92df1de5d28: crash> kmem -p 77fff000 78000000 7b5ff000 7b600000 7b7fe000 7b7ff000 PAGE PHYSICAL MAPPING INDEX CNT FLAGS fffff73941e00000 78000000 0 0 1 1fffff00000000 fffff73941ed7fc0 7b5ff000 0 0 1 1fffff00000000 fffff73941ed8000 7b600000 0 0 1 1fffff00000000 fffff73941edff80 7b7fe000 0 0 1 1fffff00000000 fffff73941edffc0 7b7ff000 ffff8e67e04d3ae0 ad84 1 1fffff00020068 uptodate,lru,active,mappedtodisk All the pages since the beginning of the section are initialized. move_freepages()' not gonna blow up. The same machine with this fix applied: crash> kmem -p 77fff000 78000000 7b5ff000 7b600000 7b7fe000 7b7ff000 PAGE PHYSICAL MAPPING INDEX CNT FLAGS ffffea0001e00000 78000000 0 0 0 0 ffffea0001e00000 7b5ff000 0 0 0 0 ffffea0001ed8000 7b600000 0 0 1 1fffff00000000 ffffea0001edff80 7b7fe000 0 0 1 1fffff00000000 ffffea0001edffc0 7b7ff000 ffff88017fb13720 8 2 1fffff00020068 uptodate,lru,active,mappedtodisk At least the bare minimum of pages is initialized preventing the crash as well. Customers started to report this as soon as 7.4 (where b92df1de5d28 was merged in RHEL) was released. I remember reports from September/October-ish times. 
It's not easily reproduced and happens on a handful of machines only. I guess that's why. But that does not make it less serious, I think. Though there actually is a report here: https://bugzilla.kernel.org/show_bug.cgi?id=196443 And there are reports for Fedora from July: https://bugzilla.redhat.com/show_bug.cgi?id=1473242 and CentOS: https://bugs.centos.org/view.php?id=13964 and we internally track several dozen reports for RHEL bug https://bugzilla.redhat.com/show_bug.cgi?id=1525121 Link: http://lkml.kernel.org/r/0485727b2e82da7efbce5f6ba42524b429d0391a.1520011945.git.neelx@redhat.com Fixes: b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns where possible") Signed-off-by: Daniel Vacek Cc: Mel Gorman Cc: Michal Hocko Cc: Paul Burton Cc: Pavel Tatashin Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cb416723538f..3d974cb2a1a1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5359,9 +5359,14 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, /* * Skip to the pfn preceding the next valid one (or * end_pfn), such that we hit a valid pfn (or end_pfn) - * on our next iteration of the loop. + * on our next iteration of the loop. Note that it needs + * to be pageblock aligned even when the region itself + * is not. move_freepages_block() can shift ahead of + * the valid region but still depends on correct page + * metadata. */ - pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1; + pfn = (memblock_next_valid_pfn(pfn, end_pfn) & + ~(pageblock_nr_pages-1)) - 1; #endif continue; } -- cgit v1.2.3 From 3e04040df6d4613a8af5a80882d5f7f298f49810 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Mar 2018 19:29:37 +0000 Subject: Revert "mm/page_alloc: fix memmap_init_zone pageblock alignment" This reverts commit 864b75f9d6b0100bb24fdd9a20d156e7cda9b5ae. Commit 864b75f9d6b0 ("mm/page_alloc: fix memmap_init_zone pageblock alignment") modified the logic in memmap_init_zone() to initialize struct pages associated with invalid PFNs, to appease a VM_BUG_ON() in move_freepages(), which is redundant by its own admission, and dereferences struct page fields to obtain the zone without checking whether the struct pages in question are valid to begin with. Commit 864b75f9d6b0 only makes it worse, since the rounding it does may cause pfn to assume the same value it had in a prior iteration of the loop, resulting in an infinite loop and a hang very early in the boot. Also, since it doesn't perform the same rounding on start_pfn itself but only on intermediate values following an invalid PFN, we may still hit the same VM_BUG_ON() as before. So instead, let's fix this at the core, and ensure that the BUG check doesn't dereference struct page fields of invalid pages.
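To see the lack of forward progress with the numbers from the crash report quoted in the reverted commit (an illustrative walk-through, assuming 4K pages so that pageblock_nr_pages == 512): the hole spans pfns 0x78000-0x7b787 and the first valid pfn after it, 0x7b788, is not pageblock aligned.

    pfn  = 0x7b600;                                /* invalid pfn inside the hole */
    next = memblock_next_valid_pfn(pfn, end_pfn);  /* returns 0x7b788             */
    pfn  = (next & ~(pageblock_nr_pages - 1)) - 1; /* 0x7b600 - 1 = 0x7b5ff       */
    /* the loop's pfn++ then lands back on 0x7b600: an infinite loop */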
Fixes: 864b75f9d6b0 ("mm/page_alloc: fix memmap_init_zone pageblock alignment") Tested-by: Jan Glauber Tested-by: Shanker Donthineni Cc: Daniel Vacek Cc: Mel Gorman Cc: Michal Hocko Cc: Paul Burton Cc: Pavel Tatashin Cc: Vlastimil Babka Cc: Andrew Morton Signed-off-by: Ard Biesheuvel Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3d974cb2a1a1..635d7dd29d7f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1910,7 +1910,9 @@ static int move_freepages(struct zone *zone, * Remove at a later date when no bug reports exist related to * grouping pages by mobility */ - VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); + VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) && + pfn_valid(page_to_pfn(end_page)) && + page_zone(start_page) != page_zone(end_page)); #endif if (num_movable) @@ -5359,14 +5361,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, /* * Skip to the pfn preceding the next valid one (or * end_pfn), such that we hit a valid pfn (or end_pfn) - * on our next iteration of the loop. Note that it needs - * to be pageblock aligned even when the region itself - * is not. move_freepages_block() can shift ahead of - * the valid region but still depends on correct page - * metadata. + * on our next iteration of the loop. */ - pfn = (memblock_next_valid_pfn(pfn, end_pfn) & - ~(pageblock_nr_pages-1)) - 1; + pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1; #endif continue; } -- cgit v1.2.3 From 1a8429132e1d2ada3832db5b4a0802c49affb750 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 9 Mar 2018 23:11:26 +0100 Subject: mm: remove blackfin MPU support The CONFIG_MPU option was only defined on blackfin, and that architecture is now being removed, so the respective code can be simplified. A lot of other microcontrollers have an MPU, but I suspect that if we want to bring that support back, we'd do it differently anyway. 
Signed-off-by: Arnd Bergmann --- kernel/module.c | 4 ---- mm/nommu.c | 20 -------------------- 2 files changed, 24 deletions(-) (limited to 'mm') diff --git a/kernel/module.c b/kernel/module.c index ad2d420024f6..2c1df850029b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2181,10 +2181,6 @@ static void free_module(struct module *mod) /* Finally, free the core (containing the module structure) */ disable_ro_nx(&mod->core_layout); module_memfree(mod->core_layout.base); - -#ifdef CONFIG_MPU - update_protections(current->mm); -#endif } void *__symbol_get(const char *symbol) diff --git a/mm/nommu.c b/mm/nommu.c index ebb6e618dade..838a8fdec5c2 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -662,22 +662,6 @@ static void put_nommu_region(struct vm_region *region) __put_nommu_region(region); } -/* - * update protection on a vma - */ -static void protect_vma(struct vm_area_struct *vma, unsigned long flags) -{ -#ifdef CONFIG_MPU - struct mm_struct *mm = vma->vm_mm; - long start = vma->vm_start & PAGE_MASK; - while (start < vma->vm_end) { - protect_page(mm, start, flags); - start += PAGE_SIZE; - } - update_protections(mm); -#endif -} - /* * add a VMA into a process's mm_struct in the appropriate place in the list * and tree and add to the address space's page tree also if not an anonymous @@ -695,8 +679,6 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) mm->map_count++; vma->vm_mm = mm; - protect_vma(vma, vma->vm_flags); - /* add the VMA to the mapping */ if (vma->vm_file) { mapping = vma->vm_file->f_mapping; @@ -757,8 +739,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) struct mm_struct *mm = vma->vm_mm; struct task_struct *curr = current; - protect_vma(vma, 0); - mm->map_count--; for (i = 0; i < VMACACHE_SIZE; i++) { /* if the vma is cached, invalidate the entire cache */ -- cgit v1.2.3 From 79375ea3ec527f746d5beae8c8f6e8a58740d4a8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 9 Mar 2018 23:14:56 +0100 Subject: mm: remove obsolete alloc_remap() Tile was the only remaining architecture to implement alloc_remap(), and since that is being removed, there is no point in keeping this function. Removing all callers simplifies the mem_map handling. 
Reviewed-by: Pavel Tatashin Signed-off-by: Arnd Bergmann --- include/linux/bootmem.h | 9 --------- mm/page_alloc.c | 5 +---- mm/sparse.c | 15 --------------- 3 files changed, 1 insertion(+), 28 deletions(-) (limited to 'mm') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index a53063e9d7d8..7942a96b1a9d 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -364,15 +364,6 @@ static inline void __init memblock_free_late( } #endif /* defined(CONFIG_HAVE_MEMBLOCK) && defined(CONFIG_NO_BOOTMEM) */ -#ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP -extern void *alloc_remap(int nid, unsigned long size); -#else -static inline void *alloc_remap(int nid, unsigned long size) -{ - return NULL; -} -#endif /* CONFIG_HAVE_ARCH_ALLOC_REMAP */ - extern void *alloc_large_system_hash(const char *tablename, unsigned long bucketsize, unsigned long numentries, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cb416723538f..484e21062228 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6199,10 +6199,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); - map = alloc_remap(pgdat->node_id, size); - if (!map) - map = memblock_virt_alloc_node_nopanic(size, - pgdat->node_id); + map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id); pgdat->node_mem_map = map + offset; } pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", diff --git a/mm/sparse.c b/mm/sparse.c index 7af5e7a92528..65bb52599f90 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -427,10 +427,6 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid, struct page *map; unsigned long size; - map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); - if (map) - return map; - size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); map = memblock_virt_alloc_try_nid(size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), @@ -446,17 +442,6 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, unsigned long pnum; unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; - map = alloc_remap(nodeid, size * map_count); - if (map) { - for (pnum = pnum_begin; pnum < pnum_end; pnum++) { - if (!present_section_nr(pnum)) - continue; - map_map[pnum] = map; - map += size; - } - return; - } - size = PAGE_ALIGN(size); map = memblock_virt_alloc_try_nid_raw(size * map_count, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), -- cgit v1.2.3 From 71546d100422bcc2c543dadeb9328728997cd23a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Mar 2018 08:27:26 -0700 Subject: percpu: include linux/sched.h for cond_resched() microblaze build broke due to missing declaration of the cond_resched() invocation added recently. Let's include linux/sched.h explicitly. Signed-off-by: Tejun Heo Reported-by: kbuild test robot --- mm/percpu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 36e7b65ba6cf..15a398c00791 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -80,6 +80,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From f52ba1fef7b92e74d58efef8eae7b6f48c6d218d Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Mar 2018 18:32:10 +0300 Subject: mm: Allow to kill tasks doing pcpu_alloc() and waiting for pcpu_balance_workfn() In case of memory deficit and low percpu memory pages, pcpu_balance_workfn() takes pcpu_alloc_mutex for a long time (as it makes memory allocations itself and waits for memory reclaim). 
If tasks doing pcpu_alloc() are chosen by the OOM killer, they can't exit, because they are waiting for the mutex. The patch makes pcpu_alloc() care about the killing signal and use mutex_lock_killable() when it's allowed by the GFP flags. This guarantees that a task does not miss SIGKILL from the OOM killer. Signed-off-by: Kirill Tkhai Signed-off-by: Tejun Heo --- mm/percpu.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 15a398c00791..9297098519a6 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1373,8 +1373,17 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, return NULL; } - if (!is_atomic) - mutex_lock(&pcpu_alloc_mutex); + if (!is_atomic) { + /* + * pcpu_balance_workfn() allocates memory under this mutex, + * and it may wait for memory reclaim. Allow current task + * to become OOM victim, in case of memory pressure. + */ + if (gfp & __GFP_NOFAIL) + mutex_lock(&pcpu_alloc_mutex); + else if (mutex_lock_killable(&pcpu_alloc_mutex)) + return NULL; + } spin_lock_irqsave(&pcpu_lock, flags); -- cgit v1.2.3 From 8970a63e965b43288c4f5f40efbc2bbf80de7f16 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Thu, 22 Mar 2018 16:17:02 -0700 Subject: mm/mempolicy.c: avoid use of uninitialized preferred_node Alexander reported a use of uninitialized memory in __mpol_equal(), which is caused by incorrect use of preferred_node. When a mempolicy is in mode MPOL_PREFERRED with flag MPOL_F_LOCAL, it uses numa_node_id() instead of preferred_node; however, __mpol_equal() uses preferred_node without checking whether it is MPOL_F_LOCAL or not. [akpm@linux-foundation.org: slight comment tweak] Link: http://lkml.kernel.org/r/4ebee1c2-57f6-bcb8-0e2d-1833d1ee0bb7@huawei.com Fixes: fc36b8d3d819 ("mempolicy: use MPOL_F_LOCAL to Indicate Preferred Local Policy") Signed-off-by: Yisheng Xie Reported-by: Alexander Potapenko Tested-by: Alexander Potapenko Reviewed-by: Andrew Morton Cc: Dmitriy Vyukov Cc: Vlastimil Babka Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d879f1d8a44a..32cba0332787 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2124,6 +2124,9 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_INTERLEAVE: return !!nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: + /* a's ->flags is the same as b's */ + if (a->flags & MPOL_F_LOCAL) + return true; return a->v.preferred_node == b->v.preferred_node; default: BUG(); -- cgit v1.2.3 From 2e517d681632326ed98399cb4dd99519efe3e32c Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 22 Mar 2018 16:17:10 -0700 Subject: lockdep: fix fs_reclaim warning Dave Jones reported fs_reclaim lockdep warnings.
============================================ WARNING: possible recursive locking detected 4.15.0-rc9-backup-debug+ #1 Not tainted -------------------------------------------- sshd/24800 is trying to acquire lock: (fs_reclaim){+.+.}, at: [<0000000084f438c2>] fs_reclaim_acquire.part.102+0x5/0x30 but task is already holding lock: (fs_reclaim){+.+.}, at: [<0000000084f438c2>] fs_reclaim_acquire.part.102+0x5/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(fs_reclaim); lock(fs_reclaim); *** DEADLOCK *** May be due to missing lock nesting notation 2 locks held by sshd/24800: #0: (sk_lock-AF_INET6){+.+.}, at: [<000000001a069652>] tcp_sendmsg+0x19/0x40 #1: (fs_reclaim){+.+.}, at: [<0000000084f438c2>] fs_reclaim_acquire.part.102+0x5/0x30 stack backtrace: CPU: 3 PID: 24800 Comm: sshd Not tainted 4.15.0-rc9-backup-debug+ #1 Call Trace: dump_stack+0xbc/0x13f __lock_acquire+0xa09/0x2040 lock_acquire+0x12e/0x350 fs_reclaim_acquire.part.102+0x29/0x30 kmem_cache_alloc+0x3d/0x2c0 alloc_extent_state+0xa7/0x410 __clear_extent_bit+0x3ea/0x570 try_release_extent_mapping+0x21a/0x260 __btrfs_releasepage+0xb0/0x1c0 btrfs_releasepage+0x161/0x170 try_to_release_page+0x162/0x1c0 shrink_page_list+0x1d5a/0x2fb0 shrink_inactive_list+0x451/0x940 shrink_node_memcg.constprop.88+0x4c9/0x5e0 shrink_node+0x12d/0x260 try_to_free_pages+0x418/0xaf0 __alloc_pages_slowpath+0x976/0x1790 __alloc_pages_nodemask+0x52c/0x5c0 new_slab+0x374/0x3f0 ___slab_alloc.constprop.81+0x47e/0x5a0 __slab_alloc.constprop.80+0x32/0x60 __kmalloc_track_caller+0x267/0x310 __kmalloc_reserve.isra.40+0x29/0x80 __alloc_skb+0xee/0x390 sk_stream_alloc_skb+0xb8/0x340 tcp_sendmsg_locked+0x8e6/0x1d30 tcp_sendmsg+0x27/0x40 inet_sendmsg+0xd0/0x310 sock_write_iter+0x17a/0x240 __vfs_write+0x2ab/0x380 vfs_write+0xfb/0x260 SyS_write+0xb6/0x140 do_syscall_64+0x1e5/0xc05 entry_SYSCALL64_slow_path+0x25/0x25 This warning is caused by commit d92a8cfcb37e ("locking/lockdep: Rework FS_RECLAIM annotation"), which replaced the use of lockdep_{set,clear}_current_reclaim_state() in __perform_reclaim() and lockdep_trace_alloc() in slab_pre_alloc_hook() with fs_reclaim_acquire()/ fs_reclaim_release(). Since __kmalloc_reserve() from __alloc_skb() adds __GFP_NOMEMALLOC | __GFP_NOWARN to gfp_mask, and all reclaim paths simply propagate __GFP_NOMEMALLOC, fs_reclaim_acquire() in slab_pre_alloc_hook() tries to grab the 'fake' lock again when __perform_reclaim() has already grabbed the 'fake' lock. The /* this guy won't enter reclaim */ if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) return false; test which causes slab_pre_alloc_hook() to try to grab the 'fake' lock was added by commit cf40bd16fdad ("lockdep: annotate reclaim context (__GFP_NOFS)"). But that test is outdated, because a PF_MEMALLOC thread won't enter reclaim regardless of __GFP_NOMEMALLOC after commit 341ce06f69ab ("page allocator: calculate the alloc_flags for allocation only once") added the PF_MEMALLOC safeguard ( /* Avoid recursion of direct reclaim */ if (p->flags & PF_MEMALLOC) goto nopage; in __alloc_pages_slowpath()). Thus, let's fix the outdated test by removing the __GFP_NOMEMALLOC check and allowing __need_fs_reclaim() to return false.
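To read the one-line fix more easily than in the diff below, here is a minimal before/after sketch of the guard in __need_fs_reclaim(), condensed from this patch (not a complete function):

	/* Before: for a __GFP_NOMEMALLOC allocation this condition is false,
	 * so a task already inside direct reclaim did not bail out here and
	 * fs_reclaim_acquire() grabbed the 'fake' lock a second time. */
	if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
		return false;

	/* After: any PF_MEMALLOC task skips the annotation entirely, since
	 * __alloc_pages_slowpath() bails out on PF_MEMALLOC and such a task
	 * can never recurse into reclaim, whatever its gfp_mask says. */
	if (current->flags & PF_MEMALLOC)
		return false;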
Link: http://lkml.kernel.org/r/201802280650.FJC73911.FOSOMLJVFFQtHO@I-love.SAKURA.ne.jp Fixes: d92a8cfcb37ecd13 ("locking/lockdep: Rework FS_RECLAIM annotation") Signed-off-by: Tetsuo Handa Reported-by: Dave Jones Tested-by: Dave Jones Cc: Peter Zijlstra Cc: Nick Piggin Cc: Ingo Molnar Cc: Nikolay Borisov Cc: Michal Hocko Cc: [4.14+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 635d7dd29d7f..010dee0f089e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3596,7 +3596,7 @@ static bool __need_fs_reclaim(gfp_t gfp_mask) return false; /* this guy won't enter reclaim */ - if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) + if (current->flags & PF_MEMALLOC) return false; /* We're only interested __GFP_FS allocations for now */ -- cgit v1.2.3 From 63489f8e821144000e0bdca7e65a8d1cc23a7ee7 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 22 Mar 2018 16:17:13 -0700 Subject: hugetlbfs: check for pgoff value overflow A vma with vm_pgoff large enough to overflow a loff_t type when converted to a byte offset can be passed via the remap_file_pages system call. The hugetlbfs mmap routine uses the byte offset to calculate reservations and file size. A sequence such as: mmap(0x20a00000, 0x600000, 0, 0x66033, -1, 0); remap_file_pages(0x20a00000, 0x600000, 0, 0x20000000000000, 0); will result in the following when the task exits or the file is closed: kernel BUG at mm/hugetlb.c:749! Call Trace: hugetlbfs_evict_inode+0x2f/0x40 evict+0xcb/0x190 __dentry_kill+0xcb/0x150 __fput+0x164/0x1e0 task_work_run+0x84/0xa0 exit_to_usermode_loop+0x7d/0x80 do_syscall_64+0x18b/0x190 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 The overflowed pgoff value causes hugetlbfs to try to set up a mapping with a negative range (end < start), leaving invalid state that causes the BUG. The previous overflow fix to this code was incomplete and did not take the remap_file_pages system call into account. [mike.kravetz@oracle.com: v3] Link: http://lkml.kernel.org/r/20180309002726.7248-1-mike.kravetz@oracle.com [akpm@linux-foundation.org: include mmdebug.h] [akpm@linux-foundation.org: fix -ve left shift count on sh] Link: http://lkml.kernel.org/r/20180308210502.15952-1-mike.kravetz@oracle.com Fixes: 045c7a3f53d9 ("hugetlbfs: fix offset overflow in hugetlbfs mmap") Signed-off-by: Mike Kravetz Reported-by: Nic Losby Acked-by: Michal Hocko Cc: "Kirill A . Shutemov" Cc: Yisheng Xie Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 17 ++++++++++++++--- mm/hugetlb.c | 7 +++++++ 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8fe1b0aa2896..b9a254dcc0e7 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -108,6 +108,16 @@ static void huge_pagevec_release(struct pagevec *pvec) pagevec_reinit(pvec); } +/* + * Mask used when checking the page offset value passed in via system + * calls. This value will be converted to a loff_t which is signed. + * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the + * value. The extra bit (- 1 in the shift value) is to take the sign + * bit into account.
+ */ +#define PGOFF_LOFFT_MAX \ + (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1))) + static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); @@ -127,12 +137,13 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_ops = &hugetlb_vm_ops; /* - * Offset passed to mmap (before page shift) could have been - * negative when represented as a (l)off_t. + * page based offset in vm_pgoff could be sufficiently large to + * overflow a (l)off_t when converted to byte offset. */ - if (((loff_t)vma->vm_pgoff << PAGE_SHIFT) < 0) + if (vma->vm_pgoff & PGOFF_LOFFT_MAX) return -EINVAL; + /* must be huge page aligned */ if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a963f2034dfc..976bbc5646fe 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -4374,6 +4375,12 @@ int hugetlb_reserve_pages(struct inode *inode, struct resv_map *resv_map; long gbl_reserve; + /* This should never happen */ + if (from > to) { + VM_WARN(1, "%s called with a negative range\n", __func__); + return -EINVAL; + } + /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page -- cgit v1.2.3 From fece2029a9e65b9a990831afe2a2b83290cbbe26 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 22 Mar 2018 16:17:28 -0700 Subject: mm/khugepaged.c: convert VM_BUG_ON() to collapse fail khugepaged is not yet able to convert PTE-mapped huge pages back to PMD-mapped ones. We do not collapse such pages; see the check in khugepaged_scan_pmd(). But if, between khugepaged_scan_pmd() and __collapse_huge_page_isolate(), somebody managed to instantiate a THP in the range and then split the PMD back to PTEs, we would have a problem -- VM_BUG_ON_PAGE(PageCompound(page)) will get triggered. This is possible since we drop mmap_sem during collapse and re-take it for write. Replace the VM_BUG_ON() with a graceful collapse failure. Link: http://lkml.kernel.org/r/20180315152353.27989-1-kirill.shutemov@linux.intel.com Fixes: b1caa957ae6d ("khugepaged: ignore pmd tables with THP mapped with ptes") Signed-off-by: Kirill A. Shutemov Cc: Laura Abbott Cc: Jerome Marchand Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b7e2268dfc9a..c15da1ea7e63 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -530,7 +530,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } - VM_BUG_ON_PAGE(PageCompound(page), page); + /* TODO: teach khugepaged to collapse THP mapped with pte */ + if (PageCompound(page)) { + result = SCAN_PAGE_COMPOUND; + goto out; + } + VM_BUG_ON_PAGE(!PageAnon(page), page); /* -- cgit v1.2.3 From fa41b900c30b45fab03783724932dc30cd46a6be Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 22 Mar 2018 16:17:31 -0700 Subject: mm/thp: do not wait for lock_page() in deferred_split_scan() deferred_split_scan() gets called from the reclaim path. Waiting for the page lock there may lead to deadlock. Replace lock_page() with trylock_page() and skip the page if we failed to lock it. We will get to the page on the next scan.
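As a condensed sketch, the loop in deferred_split_scan() after this change reads roughly as follows (the exact change is in the diff below); the point is that a shrinker runs in the reclaim path and must back off rather than sleep on a page lock:

	list_for_each_safe(pos, next, &list) {
		page = list_entry((void *)pos, struct page, mapping);
		/* The lock holder may itself be blocked on reclaim;
		 * do not sleep here, just revisit on the next scan. */
		if (!trylock_page(page))
			goto next;
		/* split_huge_page() removes page from list on success */
		if (!split_huge_page(page))
			split++;
		unlock_page(page);
next:
		put_page(page);
	}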
Link: http://lkml.kernel.org/r/20180315150747.31945-1-kirill.shutemov@linux.intel.com Fixes: 9a982250f773 ("thp: introduce deferred_split_huge_page()") Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 87ab9b8f56b5..529cf36b7edb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2783,11 +2783,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, list_for_each_safe(pos, next, &list) { page = list_entry((void *)pos, struct page, mapping); - lock_page(page); + if (!trylock_page(page)) + goto next; /* split_huge_page() removes page from list on success */ if (!split_huge_page(page)) split++; unlock_page(page); +next: put_page(page); } -- cgit v1.2.3 From b3cd54b257ad95d344d121dc563d943ca39b0921 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 22 Mar 2018 16:17:35 -0700 Subject: mm/shmem: do not wait for lock_page() in shmem_unused_huge_shrink() shmem_unused_huge_shrink() gets called from the reclaim path. Waiting for the page lock there may lead to deadlock. There was a bug report that may be attributed to this: http://lkml.kernel.org/r/alpine.LRH.2.11.1801242349220.30642@mail.ewheeler.net Replace lock_page() with trylock_page() and skip the page if we failed to lock it. We will get to the page on the next scan. We can test PageTransHuge() outside the page lock as we only need protection against splitting the page under us. Holding a pin on the page is enough for this. Link: http://lkml.kernel.org/r/20180316210830.43738-1-kirill.shutemov@linux.intel.com Fixes: 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure") Signed-off-by: Kirill A. Shutemov Reported-by: Eric Wheeler Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Tetsuo Handa Cc: Hugh Dickins Cc: [4.8+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 1907688b75ee..b85919243399 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -493,36 +493,45 @@ next: info = list_entry(pos, struct shmem_inode_info, shrinklist); inode = &info->vfs_inode; - if (nr_to_split && split >= nr_to_split) { - iput(inode); - continue; - } + if (nr_to_split && split >= nr_to_split) + goto leave; - page = find_lock_page(inode->i_mapping, + page = find_get_page(inode->i_mapping, (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT); if (!page) goto drop; + /* No huge page at the end of the file: nothing to split */ if (!PageTransHuge(page)) { - unlock_page(page); put_page(page); goto drop; } + /* + * Leave the inode on the list if we failed to lock + * the page at this time. + * + * Waiting for the lock may lead to deadlock in the + * reclaim path.
+ */ + if (!trylock_page(page)) { + put_page(page); + goto leave; + } + ret = split_huge_page(page); unlock_page(page); put_page(page); - if (ret) { - /* split failed: leave it on the list */ - iput(inode); - continue; - } + /* If split failed leave the inode on the list */ + if (ret) + goto leave; split++; drop: list_del_init(&info->shrinklist); removed++; +leave: iput(inode); } -- cgit v1.2.3 From f59f1caf72ba00d519c793c3deb32cd3be32edc2 Mon Sep 17 00:00:00 2001 From: Daniel Vacek Date: Thu, 22 Mar 2018 16:17:38 -0700 Subject: Revert "mm: page_alloc: skip over regions of invalid pfns where possible" This reverts commit b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns where possible"). The commit was meant as a boot-time init speedup, skipping the loop in memmap_init_zone() for invalid pfns. But given certain memory mappings on x86_64 (or, in theory, anywhere but on arm with CONFIG_HAVE_ARCH_PFN_VALID), the implementation also skips valid pfns, which is plain wrong and causes 'kernel BUG at mm/page_alloc.c:1389!' crash> log | grep -e BUG -e RIP -e Call.Trace -e move_freepages_block -e rmqueue -e freelist -A1 kernel BUG at mm/page_alloc.c:1389! invalid opcode: 0000 [#1] SMP -- RIP: 0010: move_freepages+0x15e/0x160 -- Call Trace: move_freepages_block+0x73/0x80 __rmqueue+0x263/0x460 get_page_from_freelist+0x7e1/0x9e0 __alloc_pages_nodemask+0x176/0x420 -- crash> page_init_bug -v | grep RAM 1000 - 9bfff System RAM (620.00 KiB) 100000 - 430bffff System RAM ( 1.05 GiB = 1071.75 MiB = 1097472.00 KiB) 4b0c8000 - 4bf9cfff System RAM ( 14.83 MiB = 15188.00 KiB) 4bfac000 - 646b1fff System RAM (391.02 MiB = 400408.00 KiB) 7b788000 - 7b7fffff System RAM (480.00 KiB) 100000000 - 67fffffff System RAM ( 22.00 GiB) crash> page_init_bug | head -6 7b788000 - 7b7fffff System RAM (480.00 KiB) 1fffff00000000 0 1 DMA32 4096 1048575 505736 505344 505855 0 0 0 DMA 1 4095 1fffff00000400 0 1 DMA32 4096 1048575 BUG, zones differ!
crash> kmem -p 77fff000 78000000 7b5ff000 7b600000 7b787000 7b788000 PAGE PHYSICAL MAPPING INDEX CNT FLAGS ffffea0001e00000 78000000 0 0 0 0 ffffea0001ed7fc0 7b5ff000 0 0 0 0 ffffea0001ed8000 7b600000 0 0 0 0 <<<< ffffea0001ede1c0 7b787000 0 0 0 0 ffffea0001ede200 7b788000 0 0 1 1fffff00000000 Link: http://lkml.kernel.org/r/20180316143855.29838-1-neelx@redhat.com Fixes: b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns where possible") Signed-off-by: Daniel Vacek Acked-by: Ard Biesheuvel Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Vlastimil Babka Cc: Mel Gorman Cc: Pavel Tatashin Cc: Paul Burton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 1 - mm/memblock.c | 28 ---------------------------- mm/page_alloc.c | 11 +---------- 3 files changed, 1 insertion(+), 39 deletions(-) (limited to 'mm') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 8be5077efb5f..f92ea7783652 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -187,7 +187,6 @@ int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, unsigned long *out_end_pfn, int *out_nid); -unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn); /** * for_each_mem_pfn_range - early memory pfn range iterator diff --git a/mm/memblock.c b/mm/memblock.c index b6ba6b7adadc..48376bd33274 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1101,34 +1101,6 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, *out_nid = r->nid; } -unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn, - unsigned long max_pfn) -{ - struct memblock_type *type = &memblock.memory; - unsigned int right = type->cnt; - unsigned int mid, left = 0; - phys_addr_t addr = PFN_PHYS(++pfn); - - do { - mid = (right + left) / 2; - - if (addr < type->regions[mid].base) - right = mid; - else if (addr >= (type->regions[mid].base + - type->regions[mid].size)) - left = mid + 1; - else { - /* addr is within the region, so pfn is valid */ - return pfn; - } - } while (left < right); - - if (right == type->cnt) - return -1UL; - else - return PHYS_PFN(type->regions[right].base); -} - /** * memblock_set_node - set node ID on memblock regions * @base: base of area to set node ID for diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 010dee0f089e..1741dd23e7c1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5356,17 +5356,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, if (context != MEMMAP_EARLY) goto not_early; - if (!early_pfn_valid(pfn)) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - /* - * Skip to the pfn preceding the next valid one (or - * end_pfn), such that we hit a valid pfn (or end_pfn) - * on our next iteration of the loop. - */ - pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1; -#endif + if (!early_pfn_valid(pfn)) continue; - } if (!early_pfn_in_nid(pfn, nid)) continue; if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) -- cgit v1.2.3 From 1c610d5f93c709df56787f50b3576704ac271826 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 22 Mar 2018 16:17:42 -0700 Subject: mm/vmscan: wake up flushers for legacy cgroups too Commit 726d061fbd36 ("mm: vmscan: kick flushers when we encounter dirty pages on the LRU") added flusher invocation to shrink_inactive_list() when many dirty pages on the LRU are encountered. 
However, shrink_inactive_list() doesn't wake up flushers for legacy cgroup reclaim, so the next commit bbef938429f5 ("mm: vmscan: remove old flusher wakeup from direct reclaim path") removed the only source of flusher wakeup in the legacy mem cgroup reclaim path. This leads to premature OOM if there are too many dirty pages in the cgroup: # mkdir /sys/fs/cgroup/memory/test # echo $$ > /sys/fs/cgroup/memory/test/tasks # echo 50M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes # dd if=/dev/zero of=tmp_file bs=1M count=100 Killed dd invoked oom-killer: gfp_mask=0x14000c0(GFP_KERNEL), nodemask=(null), order=0, oom_score_adj=0 Call Trace: dump_stack+0x46/0x65 dump_header+0x6b/0x2ac oom_kill_process+0x21c/0x4a0 out_of_memory+0x2a5/0x4b0 mem_cgroup_out_of_memory+0x3b/0x60 mem_cgroup_oom_synchronize+0x2ed/0x330 pagefault_out_of_memory+0x24/0x54 __do_page_fault+0x521/0x540 page_fault+0x45/0x50 Task in /test killed as a result of limit of /test memory: usage 51200kB, limit 51200kB, failcnt 73 memory+swap: usage 51200kB, limit 9007199254740988kB, failcnt 0 kmem: usage 296kB, limit 9007199254740988kB, failcnt 0 Memory cgroup stats for /test: cache:49632KB rss:1056KB rss_huge:0KB shmem:0KB mapped_file:0KB dirty:49500KB writeback:0KB swap:0KB inactive_anon:0KB active_anon:1168KB inactive_file:24760KB active_file:24960KB unevictable:0KB Memory cgroup out of memory: Kill process 3861 (bash) score 88 or sacrifice child Killed process 3876 (dd) total-vm:8484kB, anon-rss:1052kB, file-rss:1720kB, shmem-rss:0kB oom_reaper: reaped process 3876 (dd), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB Wake up flushers in legacy cgroup reclaim too. Link: http://lkml.kernel.org/r/20180315164553.17856-1-aryabinin@virtuozzo.com Fixes: bbef938429f5 ("mm: vmscan: remove old flusher wakeup from direct reclaim path") Signed-off-by: Andrey Ryabinin Tested-by: Shakeel Butt Acked-by: Michal Hocko Cc: Mel Gorman Cc: Tejun Heo Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index bee53495a829..cd5dc3faaa57 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1779,6 +1779,20 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (stat.nr_writeback && stat.nr_writeback == nr_taken) set_bit(PGDAT_WRITEBACK, &pgdat->flags); + /* + * If dirty pages are scanned that are not queued for IO, it + * implies that flushers are not doing their job. This can + * happen when memory pressure pushes dirty pages to the end of + * the LRU before the dirty limits are breached and the dirty + * data has expired. It can also happen when the proportion of + * dirty pages grows not through writes but through memory + * pressure reclaiming all the clean cache. And in some cases, + * the flushers simply cannot keep up with the allocation + * rate. Nudge the flusher threads in case they are asleep. + */ + if (stat.nr_unqueued_dirty == nr_taken) + wakeup_flusher_threads(WB_REASON_VMSCAN); + /* * Legacy memcg will stall in page writeback so avoid forcibly * stalling here. @@ -1791,22 +1805,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested) set_bit(PGDAT_CONGESTED, &pgdat->flags); - /* - * If dirty pages are scanned that are not queued for IO, it - * implies that flushers are not doing their job.
This can - * happen when memory pressure pushes dirty pages to the end of - * the LRU before the dirty limits are breached and the dirty - * data has expired. It can also happen when the proportion of - * dirty pages grows not through writes but through memory - * pressure reclaiming all the clean cache. And in some cases, - * the flushers simply cannot keep up with the allocation - * rate. Nudge the flusher threads in case they are asleep, but - * also allow kswapd to start writing pages during reclaim. - */ - if (stat.nr_unqueued_dirty == nr_taken) { - wakeup_flusher_threads(WB_REASON_VMSCAN); + /* Allow kswapd to start writing pages during reclaim. */ + if (stat.nr_unqueued_dirty == nr_taken) set_bit(PGDAT_DIRTY, &pgdat->flags); - } /* * If kswapd scans pages marked marked for immediate -- cgit v1.2.3 From 9d3c3354bb85bab4d865fe95039443f09a4c8394 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 22 Mar 2018 16:17:45 -0700 Subject: mm, thp: do not cause memcg oom for thp Commit 2516035499b9 ("mm, thp: remove __GFP_NORETRY from khugepaged and madvised allocations") changed the page allocator to no longer detect thp allocations based on __GFP_NORETRY. It did not, however, modify the mem cgroup try_charge() path to avoid oom kill for either khugepaged collapsing or thp faulting. It is never expected to oom kill a process to allocate a hugepage for thp; reclaim is governed by the thp defrag mode and MADV_HUGEPAGE, but allocations (and charging) should fallback instead of oom killing processes. Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1803191409420.124411@chino.kir.corp.google.com Fixes: 2516035499b9 ("mm, thp: remove __GFP_NORETRY from khugepaged and madvised allocations") Signed-off-by: David Rientjes Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Vlastimil Babka Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 5 +++-- mm/khugepaged.c | 8 ++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 529cf36b7edb..5a68730eebd6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -555,7 +555,8 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) { + if (mem_cgroup_try_charge(page, vma->vm_mm, gfp | __GFP_NORETRY, &memcg, + true)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1316,7 +1317,7 @@ alloc: } if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, - huge_gfp, &memcg, true))) { + huge_gfp | __GFP_NORETRY, &memcg, true))) { put_page(new_page); split_huge_pmd(vma, vmf->pmd, vmf->address); if (page) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index c15da1ea7e63..e42568284e06 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -965,7 +965,9 @@ static void collapse_huge_page(struct mm_struct *mm, goto out_nolock; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { + /* Do not oom kill for khugepaged charges */ + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY, + &memcg, true))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out_nolock; } @@ -1324,7 +1326,9 @@ static void collapse_shmem(struct mm_struct *mm, goto out; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { + /* Do not oom kill for khugepaged charges */ + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY, + &memcg, true))) { result = 
SCAN_CGROUP_CHARGE_FAIL; goto out; } -- cgit v1.2.3 From a687a5337063af99ebd0eebaa6f4b4cf2e07c21b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 7 Mar 2018 23:30:54 +0100 Subject: treewide: simplify Kconfig dependencies for removed archs A lot of Kconfig symbols have architecture specific dependencies. In those cases that depend on architectures we have already removed, they can be omitted. Acked-by: Kalle Valo Acked-by: Alexandre Belloni Signed-off-by: Arnd Bergmann --- block/bounce.c | 2 +- drivers/ide/Kconfig | 2 +- drivers/ide/ide-generic.c | 12 +----------- drivers/input/joystick/analog.c | 2 +- drivers/isdn/hisax/Kconfig | 10 +++++----- drivers/net/ethernet/davicom/Kconfig | 2 +- drivers/net/ethernet/smsc/Kconfig | 6 +++--- drivers/net/wireless/cisco/Kconfig | 2 +- drivers/pwm/Kconfig | 2 +- drivers/rtc/Kconfig | 2 +- drivers/spi/Kconfig | 4 ++-- drivers/usb/musb/Kconfig | 2 +- drivers/video/console/Kconfig | 3 +-- drivers/watchdog/Kconfig | 6 ------ drivers/watchdog/Makefile | 6 ------ fs/Kconfig.binfmt | 5 ++--- fs/minix/Kconfig | 2 +- include/linux/ide.h | 7 +------ init/Kconfig | 5 ++--- lib/Kconfig.debug | 13 +++++-------- lib/test_user_copy.c | 2 -- mm/Kconfig | 7 ------- mm/percpu.c | 4 ---- 23 files changed, 31 insertions(+), 77 deletions(-) (limited to 'mm') diff --git a/block/bounce.c b/block/bounce.c index 6a3e68292273..dd0b93f2a871 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -31,7 +31,7 @@ static struct bio_set *bounce_bio_set, *bounce_bio_split; static mempool_t *page_pool, *isa_page_pool; -#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL) +#if defined(CONFIG_HIGHMEM) static __init int init_emergency_pool(void) { #if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG) diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig index cf1fb3fb5d26..901b8833847f 100644 --- a/drivers/ide/Kconfig +++ b/drivers/ide/Kconfig @@ -200,7 +200,7 @@ comment "IDE chipset support/bugfixes" config IDE_GENERIC tristate "generic/default IDE chipset support" - depends on ALPHA || X86 || IA64 || M32R || MIPS || ARCH_RPC + depends on ALPHA || X86 || IA64 || MIPS || ARCH_RPC default ARM && ARCH_RPC help This is the generic IDE driver. 
This driver attaches to the diff --git a/drivers/ide/ide-generic.c b/drivers/ide/ide-generic.c index 54d7c4685d23..80c0d69b83ac 100644 --- a/drivers/ide/ide-generic.c +++ b/drivers/ide/ide-generic.c @@ -13,13 +13,10 @@ #include #include -/* FIXME: convert arm and m32r to use ide_platform host driver */ +/* FIXME: convert arm to use ide_platform host driver */ #ifdef CONFIG_ARM #include #endif -#ifdef CONFIG_M32R -#include -#endif #define DRV_NAME "ide_generic" @@ -35,13 +32,6 @@ static const struct ide_port_info ide_generic_port_info = { #ifdef CONFIG_ARM static const u16 legacy_bases[] = { 0x1f0 }; static const int legacy_irqs[] = { IRQ_HARDDISK }; -#elif defined(CONFIG_PLAT_M32700UT) || defined(CONFIG_PLAT_MAPPI2) || \ - defined(CONFIG_PLAT_OPSPUT) -static const u16 legacy_bases[] = { 0x1f0 }; -static const int legacy_irqs[] = { PLD_IRQ_CFIREQ }; -#elif defined(CONFIG_PLAT_MAPPI3) -static const u16 legacy_bases[] = { 0x1f0, 0x170 }; -static const int legacy_irqs[] = { PLD_IRQ_CFIREQ, PLD_IRQ_IDEIREQ }; #elif defined(CONFIG_ALPHA) static const u16 legacy_bases[] = { 0x1f0, 0x170, 0x1e8, 0x168 }; static const int legacy_irqs[] = { 14, 15, 11, 10 }; diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c index be1b4921f22a..eefac7978f93 100644 --- a/drivers/input/joystick/analog.c +++ b/drivers/input/joystick/analog.c @@ -163,7 +163,7 @@ static unsigned int get_time_pit(void) #define GET_TIME(x) do { x = (unsigned int)rdtsc(); } while (0) #define DELTA(x,y) ((y)-(x)) #define TIME_NAME "TSC" -#elif defined(__alpha__) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_RISCV) || defined(CONFIG_TILE) +#elif defined(__alpha__) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_RISCV) #define GET_TIME(x) do { x = get_cycles(); } while (0) #define DELTA(x,y) ((y)-(x)) #define TIME_NAME "get_cycles" diff --git a/drivers/isdn/hisax/Kconfig b/drivers/isdn/hisax/Kconfig index eb83d94ab4fe..38cfc8baae19 100644 --- a/drivers/isdn/hisax/Kconfig +++ b/drivers/isdn/hisax/Kconfig @@ -109,7 +109,7 @@ config HISAX_16_3 config HISAX_TELESPCI bool "Teles PCI" - depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || FRV || (XTENSA && !CPU_LITTLE_ENDIAN))) + depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || (XTENSA && !CPU_LITTLE_ENDIAN))) help This enables HiSax support for the Teles PCI. See on how to configure it. 
@@ -237,7 +237,7 @@ config HISAX_MIC config HISAX_NETJET bool "NETjet card" - depends on PCI && (BROKEN || !(PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || FRV || (XTENSA && !CPU_LITTLE_ENDIAN) || MICROBLAZE)) + depends on PCI && (BROKEN || !(PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || (XTENSA && !CPU_LITTLE_ENDIAN) || MICROBLAZE)) depends on VIRT_TO_BUS help This enables HiSax support for the NetJet from Traverse @@ -249,7 +249,7 @@ config HISAX_NETJET config HISAX_NETJET_U bool "NETspider U card" - depends on PCI && (BROKEN || !(PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || FRV || (XTENSA && !CPU_LITTLE_ENDIAN) || MICROBLAZE)) + depends on PCI && (BROKEN || !(PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || (XTENSA && !CPU_LITTLE_ENDIAN) || MICROBLAZE)) depends on VIRT_TO_BUS help This enables HiSax support for the Netspider U interface ISDN card @@ -318,7 +318,7 @@ config HISAX_GAZEL config HISAX_HFC_PCI bool "HFC PCI-Bus cards" - depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || FRV || (XTENSA && !CPU_LITTLE_ENDIAN))) + depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || (XTENSA && !CPU_LITTLE_ENDIAN))) help This enables HiSax support for the HFC-S PCI 2BDS0 based cards. @@ -343,7 +343,7 @@ config HISAX_HFC_SX config HISAX_ENTERNOW_PCI bool "Formula-n enter:now PCI card" - depends on HISAX_NETJET && PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || FRV || (XTENSA && !CPU_LITTLE_ENDIAN))) + depends on HISAX_NETJET && PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || (XTENSA && !CPU_LITTLE_ENDIAN))) help This enables HiSax support for the Formula-n enter:now PCI ISDN card. diff --git a/drivers/net/ethernet/davicom/Kconfig b/drivers/net/ethernet/davicom/Kconfig index 7ec2d74f94d3..680a6d983f37 100644 --- a/drivers/net/ethernet/davicom/Kconfig +++ b/drivers/net/ethernet/davicom/Kconfig @@ -4,7 +4,7 @@ config DM9000 tristate "DM9000 support" - depends on ARM || BLACKFIN || MIPS || COLDFIRE || NIOS2 + depends on ARM || MIPS || COLDFIRE || NIOS2 select CRC32 select MII ---help--- diff --git a/drivers/net/ethernet/smsc/Kconfig b/drivers/net/ethernet/smsc/Kconfig index 948603e9b905..3da0c573d2ab 100644 --- a/drivers/net/ethernet/smsc/Kconfig +++ b/drivers/net/ethernet/smsc/Kconfig @@ -5,8 +5,8 @@ config NET_VENDOR_SMSC bool "SMC (SMSC)/Western Digital devices" default y - depends on ARM || ARM64 || ATARI_ETHERNAT || BLACKFIN || COLDFIRE || \ - ISA || M32R || MAC || MIPS || NIOS2 || PCI || \ + depends on ARM || ARM64 || ATARI_ETHERNAT || COLDFIRE || \ + ISA || MAC || MIPS || NIOS2 || PCI || \ PCMCIA || SUPERH || XTENSA || H8300 ---help--- If you have a network (Ethernet) card belonging to this class, say Y. 
@@ -37,7 +37,7 @@ config SMC91X select CRC32 select MII depends on !OF || GPIOLIB - depends on ARM || ARM64 || ATARI_ETHERNAT || BLACKFIN || COLDFIRE || \ + depends on ARM || ARM64 || ATARI_ETHERNAT || COLDFIRE || \ M32R || MIPS || NIOS2 || SUPERH || XTENSA || H8300 ---help--- This is a driver for SMC's 91x series of Ethernet chipsets, diff --git a/drivers/net/wireless/cisco/Kconfig b/drivers/net/wireless/cisco/Kconfig index b22567dff893..8ed0b154bb33 100644 --- a/drivers/net/wireless/cisco/Kconfig +++ b/drivers/net/wireless/cisco/Kconfig @@ -33,7 +33,7 @@ config AIRO config AIRO_CS tristate "Cisco/Aironet 34X/35X/4500/4800 PCMCIA cards" - depends on CFG80211 && PCMCIA && (BROKEN || !M32R) + depends on CFG80211 && PCMCIA select WIRELESS_EXT select WEXT_SPY select WEXT_PRIV diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig index 763ee50ea57d..f16aad3bf5d6 100644 --- a/drivers/pwm/Kconfig +++ b/drivers/pwm/Kconfig @@ -43,7 +43,7 @@ config PWM_AB8500 config PWM_ATMEL tristate "Atmel PWM support" - depends on ARCH_AT91 || AVR32 + depends on ARCH_AT91 help Generic PWM framework driver for Atmel SoC. diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index be5a3dc99c11..46af10ac45fc 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -868,7 +868,7 @@ comment "Platform RTC drivers" config RTC_DRV_CMOS tristate "PC-style 'CMOS'" - depends on X86 || ARM || M32R || PPC || MIPS || SPARC64 + depends on X86 || ARM || PPC || MIPS || SPARC64 default y if X86 select RTC_MC146818_LIB help diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig index 603783976b81..103c13fcefa0 100644 --- a/drivers/spi/Kconfig +++ b/drivers/spi/Kconfig @@ -72,10 +72,10 @@ config SPI_ARMADA_3700 config SPI_ATMEL tristate "Atmel SPI Controller" depends on HAS_DMA - depends on (ARCH_AT91 || AVR32 || COMPILE_TEST) + depends on ARCH_AT91 || COMPILE_TEST help This selects a driver for the Atmel SPI Controller, present on - many AT32 (AVR32) and AT91 (ARM) chips. + many AT91 ARM chips. config SPI_AU1550 tristate "Au1550/Au1200/Au1300 SPI Controller" diff --git a/drivers/usb/musb/Kconfig b/drivers/usb/musb/Kconfig index 5506a9c03c1f..e757afc1cfd0 100644 --- a/drivers/usb/musb/Kconfig +++ b/drivers/usb/musb/Kconfig @@ -87,7 +87,7 @@ config USB_MUSB_DA8XX config USB_MUSB_TUSB6010 tristate "TUSB6010" depends on HAS_IOMEM - depends on (ARCH_OMAP2PLUS || COMPILE_TEST) && !BLACKFIN + depends on ARCH_OMAP2PLUS || COMPILE_TEST depends on NOP_USB_XCEIV = USB_MUSB_HDRC # both built-in or both modules config USB_MUSB_OMAP2PLUS diff --git a/drivers/video/console/Kconfig b/drivers/video/console/Kconfig index 005ed87c8216..a9e398c144f8 100644 --- a/drivers/video/console/Kconfig +++ b/drivers/video/console/Kconfig @@ -6,8 +6,7 @@ menu "Console display driver support" config VGA_CONSOLE bool "VGA text console" if EXPERT || !X86 - depends on !4xx && !PPC_8xx && !SPARC && !M68K && !PARISC && !FRV && \ - !SUPERH && !BLACKFIN && !AVR32 && !CRIS && \ + depends on !4xx && !PPC_8xx && !SPARC && !M68K && !PARISC && !SUPERH && \ (!ARM || ARCH_FOOTBRIDGE || ARCH_INTEGRATOR || ARCH_NETWINDER) && \ !ARM64 && !ARC && !MICROBLAZE && !OPENRISC default y diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 0e19679348d1..79020ce95de2 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -828,10 +828,6 @@ config BFIN_WDT To compile this driver as a module, choose M here: the module will be called bfin_wdt. 
-# CRIS Architecture - -# FRV Architecture - # X86 (i386 + ia64 + x86_64) Architecture config ACQUIRE_WDT @@ -1431,8 +1427,6 @@ config NIC7018_WDT To compile this driver as a module, choose M here: the module will be called nic7018_wdt. -# M32R Architecture - # M68K Architecture config M54xx_WATCHDOG diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index 0474d38aa854..1f9a0235f22c 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -94,10 +94,6 @@ obj-$(CONFIG_SPRD_WATCHDOG) += sprd_wdt.o # BLACKFIN Architecture obj-$(CONFIG_BFIN_WDT) += bfin_wdt.o -# CRIS Architecture - -# FRV Architecture - # X86 (i386 + ia64 + x86_64) Architecture obj-$(CONFIG_ACQUIRE_WDT) += acquirewdt.o obj-$(CONFIG_ADVANTECH_WDT) += advantechwdt.o @@ -146,8 +142,6 @@ obj-$(CONFIG_INTEL_MEI_WDT) += mei_wdt.o obj-$(CONFIG_NI903X_WDT) += ni903x_wdt.o obj-$(CONFIG_NIC7018_WDT) += nic7018_wdt.o -# M32R Architecture - # M68K Architecture obj-$(CONFIG_M54xx_WATCHDOG) += m54xx_wdt.o diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 58c2bbd385ad..57a27c42b5ac 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -1,6 +1,6 @@ config BINFMT_ELF bool "Kernel support for ELF binaries" - depends on MMU && (BROKEN || !FRV) + depends on MMU select ELFCORE default y ---help--- @@ -35,7 +35,7 @@ config ARCH_BINFMT_ELF_STATE config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y if !BINFMT_ELF - depends on (ARM || FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X) + depends on (ARM || (SUPERH32 && !MMU) || C6X) select ELFCORE help ELF FDPIC binaries are based on ELF, but allow the individual load @@ -90,7 +90,6 @@ config BINFMT_SCRIPT config BINFMT_FLAT bool "Kernel support for flat binaries" depends on !MMU || ARM || M68K - depends on !FRV || BROKEN help Support uClinux FLAT format binaries. 
diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig index f2a0cfcef11d..bcd53a79156f 100644 --- a/fs/minix/Kconfig +++ b/fs/minix/Kconfig @@ -18,7 +18,7 @@ config MINIX_FS config MINIX_FS_NATIVE_ENDIAN def_bool MINIX_FS - depends on M32R || MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU) + depends on MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU) config MINIX_FS_BIG_ENDIAN_16BIT_INDEXED def_bool MINIX_FS diff --git a/include/linux/ide.h b/include/linux/ide.h index 20d42c0d9fb6..1d6f16110eae 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -25,15 +25,10 @@ #include #include -#if defined(CONFIG_CRIS) || defined(CONFIG_FRV) -# define SUPPORT_VLB_SYNC 0 -#else -# define SUPPORT_VLB_SYNC 1 -#endif - /* * Probably not wise to fiddle with these */ +#define SUPPORT_VLB_SYNC 1 #define IDE_DEFAULT_MAX_FAILURES 1 #define ERROR_MAX 8 /* Max read/write errors per sector */ #define ERROR_RESET 3 /* Reset controller every 4th retry */ diff --git a/init/Kconfig b/init/Kconfig index a14bcc9724a2..2852692d7c9c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -998,7 +998,6 @@ config RELAY config BLK_DEV_INITRD bool "Initial RAM filesystem and RAM disk (initramfs/initrd) support" - depends on BROKEN || !FRV help The initial RAM filesystem is a ramfs which is loaded by the boot loader (loadlin or lilo) and that is mounted as root @@ -1108,7 +1107,7 @@ config MULTIUSER config SGETMASK_SYSCALL bool "sgetmask/ssetmask syscalls support" if EXPERT - def_bool PARISC || BLACKFIN || M68K || PPC || MIPS || X86 || SPARC || CRIS || MICROBLAZE || SUPERH + def_bool PARISC || M68K || PPC || MIPS || X86 || SPARC || MICROBLAZE || SUPERH ---help--- sys_sgetmask and sys_ssetmask are obsolete system calls no longer supported in libc but still enabled by default in some @@ -1370,7 +1369,7 @@ config KALLSYMS_ABSOLUTE_PERCPU config KALLSYMS_BASE_RELATIVE bool depends on KALLSYMS - default !IA64 && !(TILE && 64BIT) + default !IA64 help Instead of emitting them as absolute values in the native word size, emit the symbol references in the kallsyms table as 32-bit entries, diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 41ac9d294245..6927c6d8d185 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -165,7 +165,7 @@ config DEBUG_INFO_REDUCED config DEBUG_INFO_SPLIT bool "Produce split debuginfo in .dwo files" - depends on DEBUG_INFO && !FRV + depends on DEBUG_INFO help Generate debug info into separate .dwo files. 
This significantly reduces the build directory size for builds with DEBUG_INFO, @@ -354,10 +354,7 @@ config ARCH_WANT_FRAME_POINTERS config FRAME_POINTER bool "Compile the kernel with frame pointers" - depends on DEBUG_KERNEL && \ - (CRIS || M68K || FRV || UML || \ - SUPERH || BLACKFIN) || \ - ARCH_WANT_FRAME_POINTERS + depends on DEBUG_KERNEL && (M68K || UML || SUPERH) || ARCH_WANT_FRAME_POINTERS default y if (DEBUG_INFO && UML) || ARCH_WANT_FRAME_POINTERS help If you say Y here the resulting kernel image will be slightly @@ -1138,7 +1135,7 @@ config LOCKDEP bool depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT select STACKTRACE - select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE && !ARC && !SCORE && !X86 + select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE && !ARC && !X86 select KALLSYMS select KALLSYMS_ALL @@ -1571,7 +1568,7 @@ config FAULT_INJECTION_STACKTRACE_FILTER depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT depends on !X86_64 select STACKTRACE - select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !SCORE && !X86 + select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !X86 help Provide stacktrace filter for fault-injection capabilities @@ -1969,7 +1966,7 @@ config STRICT_DEVMEM bool "Filter access to /dev/mem" depends on MMU && DEVMEM depends on ARCH_HAS_DEVMEM_IS_ALLOWED - default y if TILE || PPC || X86 || ARM64 + default y if PPC || X86 || ARM64 ---help--- If this option is disabled, you allow userspace (root) access to all of memory, including kernel and userspace memory. Accidental diff --git a/lib/test_user_copy.c b/lib/test_user_copy.c index a6556f3364d1..e161f0498f42 100644 --- a/lib/test_user_copy.c +++ b/lib/test_user_copy.c @@ -31,8 +31,6 @@ * their capability at compile-time, we just have to opt-out certain archs. */ #if BITS_PER_LONG == 64 || (!(defined(CONFIG_ARM) && !defined(MMU)) && \ - !defined(CONFIG_BLACKFIN) && \ - !defined(CONFIG_M32R) && \ !defined(CONFIG_M68K) && \ !defined(CONFIG_MICROBLAZE) && \ !defined(CONFIG_NIOS2) && \ diff --git a/mm/Kconfig b/mm/Kconfig index abefa573bcd8..d5004d82a1d6 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -278,13 +278,6 @@ config BOUNCE by default when ZONE_DMA or HIGHMEM is selected, but you may say n to override this. -# On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often -# have more than 4GB of memory, but we don't currently use the IOTLB to present -# a 32-bit address to OHCI. So we need to use a bounce pool instead. -config NEED_BOUNCE_POOL - bool - default y if TILE && USB_OHCI_HCD - config NR_QUICK int depends on QUICKLIST diff --git a/mm/percpu.c b/mm/percpu.c index 50e7fdf84055..79e3549cab0f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2719,11 +2719,7 @@ void __init setup_per_cpu_areas(void) if (pcpu_setup_first_chunk(ai, fc) < 0) panic("Failed to initialize percpu areas."); -#ifdef CONFIG_CRIS -#warning "the CRIS architecture has physical and virtual addresses confused" -#else pcpu_free_alloc_info(ai); -#endif } #endif /* CONFIG_SMP */ -- cgit v1.2.3 From fc5d1073cae299de4517755a910df4f12a6a438f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 26 Mar 2018 23:27:21 -0700 Subject: x86/mm/32: Remove unused node_memmap_size_bytes() & CONFIG_NEED_NODE_MEMMAP_SIZE logic node_memmap_size_bytes() has been unused since the v3.9 kernel, so remove it. 
Signed-off-by: David Rientjes Cc: Dave Hansen Cc: Laura Abbott Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Fixes: f03574f2d5b2 ("x86-32, mm: Rip out x86_32 NUMA remapping code") Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1803262325540.256524@chino.kir.corp.google.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ---- arch/x86/mm/numa_32.c | 11 ----------- include/linux/mmzone.h | 5 ----- mm/sparse.c | 22 ---------------------- 4 files changed, 42 deletions(-) (limited to 'mm') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 18233e459bff..739aff253d17 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1608,10 +1608,6 @@ config ARCH_HAVE_MEMORY_PRESENT def_bool y depends on X86_32 && DISCONTIGMEM -config NEED_NODE_MEMMAP_SIZE - def_bool y - depends on X86_32 && (DISCONTIGMEM || SPARSEMEM) - config ARCH_FLATMEM_ENABLE def_bool y depends on X86_32 && !NUMA diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index aca6295350f3..e8a4a09e20f1 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -60,17 +60,6 @@ void memory_present(int nid, unsigned long start, unsigned long end) } printk(KERN_CONT "\n"); } - -unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, - unsigned long end_pfn) -{ - unsigned long nr_pages = end_pfn - start_pfn; - - if (!nr_pages) - return 0; - - return (nr_pages + 1) * sizeof(struct page); -} #endif extern unsigned long highend_pfn, highstart_pfn; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7522a6987595..a2db4576e499 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -816,10 +816,6 @@ int local_memory_node(int node_id); static inline int local_memory_node(int node_id) { return node_id; }; #endif -#ifdef CONFIG_NEED_NODE_MEMMAP_SIZE -unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); -#endif - /* * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. */ @@ -1289,7 +1285,6 @@ struct mminit_pfnnid_cache { #endif void memory_present(int nid, unsigned long start, unsigned long end); -unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); /* * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we diff --git a/mm/sparse.c b/mm/sparse.c index 7af5e7a92528..79b26f98d793 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -235,28 +235,6 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) } } -/* - * Only used by the i386 NUMA architecures, but relatively - * generic code. - */ -unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn, - unsigned long end_pfn) -{ - unsigned long pfn; - unsigned long nr_pages = 0; - - mminit_validate_memmodel_limits(&start_pfn, &end_pfn); - for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { - if (nid != early_pfn_to_nid(pfn)) - continue; - - if (pfn_present(pfn)) - nr_pages += PAGES_PER_SECTION; - } - - return nr_pages * sizeof(struct page); -} - /* * Subtle, we encode the real pfn into the mem_map such that * the identity pfn - section_mem_map will return the actual -- cgit v1.2.3 From 880cd276dff17ea29e9a8404275c9502b265afa7 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 28 Mar 2018 16:00:57 -0700 Subject: mm, slab: memcg_link the SLAB's kmem_cache All the root caches are linked into slab_root_caches, which was introduced by commit 510ded33e075 ("slab: implement slab_root_caches list"), but that commit missed adding SLAB's kmem_cache.
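As a minimal sketch of the missing wiring in SLAB's kmem_cache_init() (the actual one-line fix is in the diff below), every root cache must be both on slab_caches and memcg-linked into slab_root_caches:

	/* The statically bootstrapped kmem_cache was only getting
	 * the first of these two registrations. */
	list_add(&kmem_cache->list, &slab_caches);
	memcg_link_cache(kmem_cache);	/* the call this patch adds */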
While experimenting with opt-in/opt-out kmem accounting, I noticed system crashes due to NULL dereference inside cache_from_memcg_idx() while dereferencing kmem_cache.memcg_params.memcg_caches. A clean upstream kernel will not see these crashes, but SLAB should be consistent with SLUB, which does link its boot caches (kmem_cache_node and kmem_cache) into slab_root_caches. Link: http://lkml.kernel.org/r/20180319210020.60289-1-shakeelb@google.com Fixes: 510ded33e075c ("slab: implement slab_root_caches list") Signed-off-by: Shakeel Butt Cc: Tejun Heo Cc: Vladimir Davydov Cc: Greg Thelen Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 324446621b3e..9095c3945425 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1283,6 +1283,7 @@ void __init kmem_cache_init(void) nr_node_ids * sizeof(struct kmem_cache_node *), SLAB_HWCACHE_ALIGN, 0, 0); list_add(&kmem_cache->list, &slab_caches); + memcg_link_cache(kmem_cache); slab_state = PARTIAL; /* -- cgit v1.2.3 From 299815a4fba9f3c7a81434dba0072148f1690608 Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Wed, 28 Mar 2018 16:01:05 -0700 Subject: mm/page_owner: fix recursion bug after changing skip entries This patch fixes commit 5f48f0bd4e36 ("mm, page_owner: skip unnecessary stack_trace entries"). If we skip the first two entries, the logic that checks for a count of 2 to detect recursion is broken, and the code goes one level deep into recursion. So we need to check for only one occurrence of _RET_IP (__set_page_owner) when checking for recursion. Current backtrace while checking for recursion: (save_stack) from (__set_page_owner) // (But recursion returns true here) (__set_page_owner) from (get_page_from_freelist) (get_page_from_freelist) from (__alloc_pages_nodemask) (__alloc_pages_nodemask) from (depot_save_stack) (depot_save_stack) from (save_stack) // recursion should return true here (save_stack) from (__set_page_owner) (__set_page_owner) from (get_page_from_freelist) (get_page_from_freelist) from (__alloc_pages_nodemask+) (__alloc_pages_nodemask) from (depot_save_stack) (depot_save_stack) from (save_stack) (save_stack) from (__set_page_owner) (__set_page_owner) from (get_page_from_freelist) Correct backtrace with the fix: (save_stack) from (__set_page_owner) // recursion returned true here (__set_page_owner) from (get_page_from_freelist) (get_page_from_freelist) from (__alloc_pages_nodemask+) (__alloc_pages_nodemask) from (depot_save_stack) (depot_save_stack) from (save_stack) (save_stack) from (__set_page_owner) (__set_page_owner) from (get_page_from_freelist) Link: http://lkml.kernel.org/r/1521607043-34670-1-git-send-email-maninder1.s@samsung.com Fixes: 5f48f0bd4e36 ("mm, page_owner: skip unnecessary stack_trace entries") Signed-off-by: Maninder Singh Signed-off-by: Vaneet Narang Acked-by: Vlastimil Babka Cc: Michal Hocko Cc: Oscar Salvador Cc: Greg Kroah-Hartman Cc: Ayush Mittal Cc: Prakash Gupta Cc: Vinayak Menon Cc: Vasyl Gomonovych Cc: Amit Sahrawat Cc: Cc: Vaneet Narang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_owner.c b/mm/page_owner.c index 9886c6073828..7172e0a80e13 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -123,13 +123,13 @@ void __reset_page_owner(struct page *page, unsigned int order) static
inline bool check_recursive_alloc(struct stack_trace *trace, unsigned long ip) { - int i, count; + int i; if (!trace->nr_entries) return false; - for (i = 0, count = 0; i < trace->nr_entries; i++) { - if (trace->entries[i] == ip && ++count == 2) + for (i = 0; i < trace->nr_entries; i++) { + if (trace->entries[i] == ip) return true; } -- cgit v1.2.3 From c7f26ccfb2c31eb1bf810ba13d044fcf583232db Mon Sep 17 00:00:00 2001 From: "Steven J. Hill" Date: Wed, 28 Mar 2018 16:01:09 -0700 Subject: mm/vmstat.c: fix vmstat_update() preemption BUG Attempting to hotplug CPUs with CONFIG_VM_EVENT_COUNTERS enabled can cause vmstat_update() to report a BUG due to preemption not being disabled around smp_processor_id(). Discovered on Ubiquiti EdgeRouter Pro with Cavium Octeon II processor. BUG: using smp_processor_id() in preemptible [00000000] code: kworker/1:1/269 caller is vmstat_update+0x50/0xa0 CPU: 0 PID: 269 Comm: kworker/1:1 Not tainted 4.16.0-rc4-Cavium-Octeon-00009-gf83bbd5-dirty #1 Workqueue: mm_percpu_wq vmstat_update Call Trace: show_stack+0x94/0x128 dump_stack+0xa4/0xe0 check_preemption_disabled+0x118/0x120 vmstat_update+0x50/0xa0 process_one_work+0x144/0x348 worker_thread+0x150/0x4b8 kthread+0x110/0x140 ret_from_kernel_thread+0x14/0x1c Link: http://lkml.kernel.org/r/1520881552-25659-1-git-send-email-steven.hill@cavium.com Signed-off-by: Steven J. Hill Reviewed-by: Andrew Morton Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 40b2db6db6b1..33581be705f0 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1839,9 +1839,11 @@ static void vmstat_update(struct work_struct *w) * to occur in the future. Keep on running the * update worker thread. */ + preempt_disable(); queue_delayed_work_on(smp_processor_id(), mm_percpu_wq, this_cpu_ptr(&vmstat_work), round_jiffies_relative(sysctl_stat_interval)); + preempt_enable(); } } -- cgit v1.2.3 From b213b54fbf9d282dc545252313d727f3972be8e0 Mon Sep 17 00:00:00 2001 From: Honglei Wang Date: Wed, 28 Mar 2018 16:01:12 -0700 Subject: mm/memcontrol.c: fix parameter description mismatch There are a couple of places where parameter description and function name do not match the actual code. Fix it. Link: http://lkml.kernel.org/r/1520843448-17347-1-git-send-email-honglei.wang@oracle.com Signed-off-by: Honglei Wang Acked-by: Tejun Heo Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 670e99b68aa6..9ec024b862ac 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -714,9 +714,9 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) * invocations for reference counting, or use mem_cgroup_iter_break() * to cancel a hierarchy walk before the round-trip is complete. * - * Reclaimers can specify a zone and a priority level in @reclaim to + * Reclaimers can specify a node and a priority level in @reclaim to * divide up the memcgs in the hierarchy among all concurrent - * reclaimers operating on the same zone and priority. + * reclaimers operating on the same node and priority. 
*/ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, @@ -2299,7 +2299,7 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep) } /** - * memcg_kmem_charge: charge a kmem page + * memcg_kmem_charge_memcg: charge a kmem page * @page: page to charge * @gfp: reclaim mode * @order: allocation order -- cgit v1.2.3 From 914b6dfff790544d9b77dfd1723adb3745ec9700 Mon Sep 17 00:00:00 2001 From: Vinayak Menon Date: Wed, 28 Mar 2018 16:01:16 -0700 Subject: mm/kmemleak.c: wait for scan completion before disabling free A crash is observed when kmemleak_scan accesses object->pointer, likely due to the following race: TASK A: kmemleak_write (with "scan" and NOT "scan=on") -> kmemleak_scan(); TASK B: create_object -> kmem_cache_alloc fails -> kmemleak_disable -> kmemleak_do_cleanup -> kmemleak_free_enabled = 0; TASK C: kfree -> kmemleak_free bails out (kmemleak_free_enabled is 0) -> slub frees object->pointer; TASK A (scan still running): update_checksum -> crash - object->pointer freed (DEBUG_PAGEALLOC). kmemleak_do_cleanup waits for the scan thread to complete, but not for a direct call to kmemleak_scan() via kmemleak_write(). So add a wait for kmemleak_scan completion before disabling kmemleak_free, and, while at it, fix the comment on stop_scan_thread(). [vinmenon@codeaurora.org: fix stop_scan_thread comment] Link: http://lkml.kernel.org/r/1522219972-22809-1-git-send-email-vinmenon@codeaurora.org Link: http://lkml.kernel.org/r/1522063429-18992-1-git-send-email-vinmenon@codeaurora.org Signed-off-by: Vinayak Menon Reviewed-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index e83987c55a08..46c2290a08f1 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1657,8 +1657,7 @@ static void start_scan_thread(void) } /* - * Stop the automatic memory scanning thread. This function must be called - * with the scan_mutex held. + * Stop the automatic memory scanning thread. */ static void stop_scan_thread(void) { @@ -1921,12 +1920,15 @@ static void kmemleak_do_cleanup(struct work_struct *work) { stop_scan_thread(); + mutex_lock(&scan_mutex); /* - * Once the scan thread has stopped, it is safe to no longer track - * object freeing. Ordering of the scan thread stopping and the memory - * accesses below is guaranteed by the kthread_stop() function. + * Once it is made sure that kmemleak_scan has stopped, it is safe to no + * longer track object freeing. Ordering of the scan thread stopping and + * the memory accesses below is guaranteed by the kthread_stop() + * function. */ kmemleak_free_enabled = 0; + mutex_unlock(&scan_mutex); if (!kmemleak_found_leaks) __kmemleak_do_cleanup(); -- cgit v1.2.3 From b6e9b0babb7a02ae4f00f053974609000f00950e Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sat, 17 Mar 2018 16:00:25 +0100 Subject: mm: add kernel_migrate_pages() helper, move compat syscall to mm/mempolicy.c Move compat_sys_migrate_pages() to mm/mempolicy.c and make it call a newly introduced helper -- kernel_migrate_pages() -- instead of the syscall. This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined.
For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Al Viro Cc: linux-mm@kvack.org Cc: Andrew Morton Signed-off-by: Dominik Brodowski --- kernel/compat.c | 33 --------------------------------- mm/mempolicy.c | 48 ++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 44 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/kernel/compat.c b/kernel/compat.c index 3f5fa8902e7d..51bdf1808943 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -508,39 +508,6 @@ COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, } return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); } - -COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, - compat_ulong_t, maxnode, - const compat_ulong_t __user *, old_nodes, - const compat_ulong_t __user *, new_nodes) -{ - unsigned long __user *old = NULL; - unsigned long __user *new = NULL; - nodemask_t tmp_mask; - unsigned long nr_bits; - unsigned long size; - - nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); - size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - if (old_nodes) { - if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) - return -EFAULT; - old = compat_alloc_user_space(new_nodes ? size * 2 : size); - if (new_nodes) - new = old + size / sizeof(unsigned long); - if (copy_to_user(old, nodes_addr(tmp_mask), size)) - return -EFAULT; - } - if (new_nodes) { - if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) - return -EFAULT; - if (new == NULL) - new = compat_alloc_user_space(size); - if (copy_to_user(new, nodes_addr(tmp_mask), size)) - return -EFAULT; - } - return sys_migrate_pages(pid, nr_bits + 1, old, new); -} #endif /* diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d879f1d8a44a..7399ede02b5f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1377,9 +1377,9 @@ SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, return do_set_mempolicy(mode, flags, &nodes); } -SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, - const unsigned long __user *, old_nodes, - const unsigned long __user *, new_nodes) +static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, + const unsigned long __user *old_nodes, + const unsigned long __user *new_nodes) { struct mm_struct *mm = NULL; struct task_struct *task; @@ -1469,6 +1469,13 @@ out_put: } +SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, + const unsigned long __user *, old_nodes, + const unsigned long __user *, new_nodes) +{ + return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); +} + /* Retrieve NUMA policy */ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, @@ -1571,7 +1578,40 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, return sys_mbind(start, len, mode, nm, nr_bits+1, flags); } -#endif +COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, + compat_ulong_t, maxnode, + const compat_ulong_t __user *, old_nodes, + const compat_ulong_t __user *, new_nodes) +{ + unsigned long __user *old = NULL; + unsigned long __user *new = NULL; + nodemask_t tmp_mask; + unsigned long nr_bits; + unsigned long size; + + nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); + size = ALIGN(nr_bits, BITS_PER_LONG) / 8; + if (old_nodes) { + if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) + return -EFAULT; + old = compat_alloc_user_space(new_nodes ? 
size * 2 : size); + if (new_nodes) + new = old + size / sizeof(unsigned long); + if (copy_to_user(old, nodes_addr(tmp_mask), size)) + return -EFAULT; + } + if (new_nodes) { + if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) + return -EFAULT; + if (new == NULL) + new = compat_alloc_user_space(size); + if (copy_to_user(new, nodes_addr(tmp_mask), size)) + return -EFAULT; + } + return kernel_migrate_pages(pid, nr_bits + 1, old, new); +} + +#endif /* CONFIG_COMPAT */ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr) -- cgit v1.2.3 From 7addf44388255f6fa99c83e3e2ad79cef0813698 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sat, 17 Mar 2018 16:08:03 +0100 Subject: mm: add kernel_move_pages() helper, move compat syscall to mm/migrate.c Move compat_sys_move_pages() to mm/migrate.c and make it call a newly introduced helper -- kernel_move_pages() -- instead of the syscall. This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Al Viro Cc: linux-mm@kvack.org Cc: Andrew Morton Signed-off-by: Dominik Brodowski --- kernel/compat.c | 22 ---------------------- mm/migrate.c | 39 +++++++++++++++++++++++++++++++++++---- 2 files changed, 35 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/kernel/compat.c b/kernel/compat.c index 51bdf1808943..6d21894806b4 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -488,28 +488,6 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) } EXPORT_SYMBOL_GPL(get_compat_sigset); -#ifdef CONFIG_NUMA -COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, - compat_uptr_t __user *, pages32, - const int __user *, nodes, - int __user *, status, - int, flags) -{ - const void __user * __user *pages; - int i; - - pages = compat_alloc_user_space(nr_pages * sizeof(void *)); - for (i = 0; i < nr_pages; i++) { - compat_uptr_t p; - - if (get_user(p, pages32 + i) || - put_user(compat_ptr(p), pages + i)) - return -EFAULT; - } - return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); -} -#endif - /* * Allocate user-space memory for the duration of a single system call, * in order to marshall parameters inside a compat thunk. diff --git a/mm/migrate.c b/mm/migrate.c index 1e5525a25691..003886606a22 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -1745,10 +1746,10 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, * Move a list of pages in the address space of the currently executing * process. 
*/ -SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, - const void __user * __user *, pages, - const int __user *, nodes, - int __user *, status, int, flags) +static int kernel_move_pages(pid_t pid, unsigned long nr_pages, + const void __user * __user *pages, + const int __user *nodes, + int __user *status, int flags) { struct task_struct *task; struct mm_struct *mm; @@ -1807,6 +1808,36 @@ out: return err; } +SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, + const void __user * __user *, pages, + const int __user *, nodes, + int __user *, status, int, flags) +{ + return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, + compat_uptr_t __user *, pages32, + const int __user *, nodes, + int __user *, status, + int, flags) +{ + const void __user * __user *pages; + int i; + + pages = compat_alloc_user_space(nr_pages * sizeof(void *)); + for (i = 0; i < nr_pages; i++) { + compat_uptr_t p; + + if (get_user(p, pages32 + i) || + put_user(compat_ptr(p), pages + i)) + return -EFAULT; + } + return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); +} +#endif /* CONFIG_COMPAT */ + #ifdef CONFIG_NUMA_BALANCING /* * Returns true if this is a safe migration target node for misplaced NUMA -- cgit v1.2.3 From e7dc9ad6e98eb8cc49b454d54e361f91aebc395f Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sat, 17 Mar 2018 16:12:22 +0100 Subject: mm: add kernel_mbind() helper; remove in-kernel call to syscall Using the mm-internal kernel_mbind() helper allows us to get rid of the mm-internal call to the sys_mbind() syscall. This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Al Viro Cc: linux-mm@kvack.org Cc: Andrew Morton Signed-off-by: Dominik Brodowski --- mm/mempolicy.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7399ede02b5f..e4d7d4c0b253 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1336,9 +1336,9 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, return copy_to_user(mask, nodes_addr(*nodes), copy) ? 
-EFAULT : 0; } -SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, - unsigned long, mode, const unsigned long __user *, nmask, - unsigned long, maxnode, unsigned, flags) +static long kernel_mbind(unsigned long start, unsigned long len, + unsigned long mode, const unsigned long __user *nmask, + unsigned long maxnode, unsigned int flags) { nodemask_t nodes; int err; @@ -1357,6 +1357,13 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, return do_mbind(start, len, mode, mode_flags, &nodes, flags); } +SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, + unsigned long, mode, const unsigned long __user *, nmask, + unsigned long, maxnode, unsigned int, flags) +{ + return kernel_mbind(start, len, mode, nmask, maxnode, flags); +} + /* Set the process memory policy */ SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, unsigned long, maxnode) @@ -1575,7 +1582,7 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, return -EFAULT; } - return sys_mbind(start, len, mode, nm, nr_bits+1, flags); + return kernel_mbind(start, len, mode, nm, nr_bits+1, flags); } COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, -- cgit v1.2.3 From af03c4acb728dd9ed850d329a1cff71d52e7f3a8 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sat, 17 Mar 2018 16:20:01 +0100 Subject: mm: add kernel_[sg]et_mempolicy() helpers; remove in-kernel calls to syscalls Using the mm-internal kernel_[sg]et_mempolicy() helper allows us to get rid of the mm-internal calls to the sys_[sg]et_mempolicy() syscalls. This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Al Viro Cc: linux-mm@kvack.org Cc: Andrew Morton Signed-off-by: Dominik Brodowski --- mm/mempolicy.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e4d7d4c0b253..ca817e768d0e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1365,8 +1365,8 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, } /* Set the process memory policy */ -SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, - unsigned long, maxnode) +static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask, + unsigned long maxnode) { int err; nodemask_t nodes; @@ -1384,6 +1384,12 @@ SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, return do_set_mempolicy(mode, flags, &nodes); } +SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, + unsigned long, maxnode) +{ + return kernel_set_mempolicy(mode, nmask, maxnode); +} + static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *old_nodes, const unsigned long __user *new_nodes) @@ -1485,9 +1491,11 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, /* Retrieve NUMA policy */ -SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, - unsigned long __user *, nmask, unsigned long, maxnode, - unsigned long, addr, unsigned long, flags) +static int kernel_get_mempolicy(int __user *policy, + unsigned long __user *nmask, + unsigned long maxnode, + unsigned long addr, + unsigned long flags) { int err; int uninitialized_var(pval); @@ -1510,6 +1518,13 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, return err; } 
+SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, + unsigned long __user *, nmask, unsigned long, maxnode, + unsigned long, addr, unsigned long, flags) +{ + return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); +} + #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, @@ -1528,7 +1543,7 @@ COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, if (nmask) nm = compat_alloc_user_space(alloc_size); - err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); + err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags); if (!err && nmask) { unsigned long copy_size; @@ -1560,7 +1575,7 @@ COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, return -EFAULT; } - return sys_set_mempolicy(mode, nm, nr_bits+1); + return kernel_set_mempolicy(mode, nm, nr_bits+1); } COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, -- cgit v1.2.3 From 9d5b7c956b09daab955fb2a42447d5d89ff15093 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sun, 11 Mar 2018 11:34:45 +0100 Subject: mm: add ksys_fadvise64_64() helper; remove in-kernel call to sys_fadvise64_64() Using the ksys_fadvise64_64() helper allows us to avoid the in-kernel calls to the sys_fadvise64_64() syscall. The ksys_ prefix denotes that this function is meant as a drop-in replacement for the syscall. In particular, it uses the same calling convention as sys_fadvise64_64(). Some compat stubs called sys_fadvise64(), which then just passed through the arguments to sys_fadvise64_64(). Get rid of this indirection, and call ksys_fadvise64_64() directly. This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Andrew Morton Cc: linux-mm@kvack.org Signed-off-by: Dominik Brodowski --- arch/arm/kernel/sys_arm.c | 2 +- arch/mips/kernel/linux32.c | 2 +- arch/parisc/kernel/sys_parisc.c | 2 +- arch/powerpc/kernel/sys_ppc32.c | 4 ++-- arch/powerpc/kernel/syscalls.c | 4 ++-- arch/s390/kernel/compat_linux.c | 5 +++-- arch/sh/kernel/sys_sh32.c | 8 ++++---- arch/sparc/kernel/sys_sparc32.c | 10 +++++----- arch/x86/ia32/sys_ia32.c | 12 ++++++------ arch/xtensa/kernel/syscall.c | 2 +- include/linux/syscalls.h | 9 +++++++++ mm/fadvise.c | 10 ++++++++-- 12 files changed, 43 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/arch/arm/kernel/sys_arm.c b/arch/arm/kernel/sys_arm.c index 3151f5623d0e..bdf7514204ab 100644 --- a/arch/arm/kernel/sys_arm.c +++ b/arch/arm/kernel/sys_arm.c @@ -35,5 +35,5 @@ asmlinkage long sys_arm_fadvise64_64(int fd, int advice, loff_t offset, loff_t len) { - return sys_fadvise64_64(fd, offset, len, advice); + return ksys_fadvise64_64(fd, offset, len, advice); } diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c index 0779d474c8ad..1c5785e72db4 100644 --- a/arch/mips/kernel/linux32.c +++ b/arch/mips/kernel/linux32.c @@ -149,7 +149,7 @@ asmlinkage long sys32_fadvise64_64(int fd, int __pad, unsigned long a4, unsigned long a5, int flags) { - return sys_fadvise64_64(fd, + return ksys_fadvise64_64(fd, merge_64(a2, a3), merge_64(a4, a5), flags); } diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c index 588fab336ddd..f36ab1f09595 100644 --- a/arch/parisc/kernel/sys_parisc.c +++ b/arch/parisc/kernel/sys_parisc.c @@ -352,7 +352,7 @@ asmlinkage long parisc_fadvise64_64(int fd, unsigned int high_off, unsigned int low_off, unsigned
int high_len, unsigned int low_len, int advice) { - return sys_fadvise64_64(fd, (loff_t)high_off << 32 | low_off, + return ksys_fadvise64_64(fd, (loff_t)high_off << 32 | low_off, (loff_t)high_len << 32 | low_len, advice); } diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index 68f11e1065f8..0b95fa13307f 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -113,8 +113,8 @@ asmlinkage int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long h long ppc32_fadvise64(int fd, u32 unused, u32 offset_high, u32 offset_low, size_t len, int advice) { - return sys_fadvise64(fd, (u64)offset_high << 32 | offset_low, len, - advice); + return ksys_fadvise64_64(fd, (u64)offset_high << 32 | offset_low, len, + advice); } asmlinkage long compat_sys_sync_file_range2(int fd, unsigned int flags, diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index a877bf8269fe..ecb981eea74b 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -119,8 +119,8 @@ long ppc64_personality(unsigned long personality) long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low, u32 len_high, u32 len_low) { - return sys_fadvise64(fd, (u64)offset_high << 32 | offset_low, - (u64)len_high << 32 | len_low, advice); + return ksys_fadvise64_64(fd, (u64)offset_high << 32 | offset_low, + (u64)len_high << 32 | len_low, advice); } long sys_switch_endian(void) diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index 039858f9f128..9bb897e443a6 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -483,7 +483,8 @@ COMPAT_SYSCALL_DEFINE5(s390_fadvise64, int, fd, u32, high, u32, low, compat_size advise = POSIX_FADV_DONTNEED; else if (advise == 5) advise = POSIX_FADV_NOREUSE; - return sys_fadvise64(fd, (unsigned long)high << 32 | low, len, advise); + return ksys_fadvise64_64(fd, (unsigned long)high << 32 | low, len, + advise); } struct fadvise64_64_args { @@ -503,7 +504,7 @@ COMPAT_SYSCALL_DEFINE1(s390_fadvise64_64, struct fadvise64_64_args __user *, arg a.advice = POSIX_FADV_DONTNEED; else if (a.advice == 5) a.advice = POSIX_FADV_NOREUSE; - return sys_fadvise64_64(a.fd, a.offset, a.len, a.advice); + return ksys_fadvise64_64(a.fd, a.offset, a.len, a.advice); } COMPAT_SYSCALL_DEFINE6(s390_sync_file_range, int, fd, u32, offhigh, u32, offlow, diff --git a/arch/sh/kernel/sys_sh32.c b/arch/sh/kernel/sys_sh32.c index c37ee3d0c803..9dca568509a5 100644 --- a/arch/sh/kernel/sys_sh32.c +++ b/arch/sh/kernel/sys_sh32.c @@ -52,10 +52,10 @@ asmlinkage int sys_fadvise64_64_wrapper(int fd, u32 offset0, u32 offset1, u32 len0, u32 len1, int advice) { #ifdef __LITTLE_ENDIAN__ - return sys_fadvise64_64(fd, (u64)offset1 << 32 | offset0, - (u64)len1 << 32 | len0, advice); + return ksys_fadvise64_64(fd, (u64)offset1 << 32 | offset0, + (u64)len1 << 32 | len0, advice); #else - return sys_fadvise64_64(fd, (u64)offset0 << 32 | offset1, - (u64)len0 << 32 | len1, advice); + return ksys_fadvise64_64(fd, (u64)offset0 << 32 | offset1, + (u64)len0 << 32 | len1, advice); #endif } diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c index 4ba62d676632..4da66aed50b4 100644 --- a/arch/sparc/kernel/sys_sparc32.c +++ b/arch/sparc/kernel/sys_sparc32.c @@ -225,7 +225,7 @@ long compat_sys_fadvise64(int fd, unsigned long offlo, compat_size_t len, int advice) { - return sys_fadvise64_64(fd, (offhi << 32) | offlo, len, advice); + return ksys_fadvise64_64(fd, (offhi << 
32) | offlo, len, advice); } long compat_sys_fadvise64_64(int fd, @@ -233,10 +233,10 @@ long compat_sys_fadvise64_64(int fd, unsigned long lenhi, unsigned long lenlo, int advice) { - return sys_fadvise64_64(fd, - (offhi << 32) | offlo, - (lenhi << 32) | lenlo, - advice); + return ksys_fadvise64_64(fd, + (offhi << 32) | offlo, + (lenhi << 32) | lenlo, + advice); } long sys32_sync_file_range(unsigned int fd, unsigned long off_high, unsigned long off_low, unsigned long nb_high, unsigned long nb_low, unsigned int flags) diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index df2acb13623f..401bd8ec9cf0 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -194,10 +194,10 @@ COMPAT_SYSCALL_DEFINE6(x86_fadvise64_64, int, fd, __u32, offset_low, __u32, offset_high, __u32, len_low, __u32, len_high, int, advice) { - return sys_fadvise64_64(fd, - (((u64)offset_high)<<32) | offset_low, - (((u64)len_high)<<32) | len_low, - advice); + return ksys_fadvise64_64(fd, + (((u64)offset_high)<<32) | offset_low, + (((u64)len_high)<<32) | len_low, + advice); } COMPAT_SYSCALL_DEFINE4(x86_readahead, int, fd, unsigned int, off_lo, @@ -218,8 +218,8 @@ COMPAT_SYSCALL_DEFINE6(x86_sync_file_range, int, fd, unsigned int, off_low, COMPAT_SYSCALL_DEFINE5(x86_fadvise64, int, fd, unsigned int, offset_lo, unsigned int, offset_hi, size_t, len, int, advice) { - return sys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo, - len, advice); + return ksys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo, + len, advice); } COMPAT_SYSCALL_DEFINE6(x86_fallocate, int, fd, int, mode, diff --git a/arch/xtensa/kernel/syscall.c b/arch/xtensa/kernel/syscall.c index 74afbf02d07e..8201748da05b 100644 --- a/arch/xtensa/kernel/syscall.c +++ b/arch/xtensa/kernel/syscall.c @@ -55,7 +55,7 @@ asmlinkage long xtensa_shmat(int shmid, char __user *shmaddr, int shmflg) asmlinkage long xtensa_fadvise64_64(int fd, int advice, unsigned long long offset, unsigned long long len) { - return sys_fadvise64_64(fd, offset, len, advice); + return ksys_fadvise64_64(fd, offset, len, advice); } #ifdef CONFIG_MMU diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 613b8127834d..466d408deefd 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -970,6 +970,15 @@ ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count, ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos); int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len); +#ifdef CONFIG_ADVISE_SYSCALLS +int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice); +#else +static inline int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, + int advice) +{ + return -EINVAL; +} +#endif /* * The following kernel syscall equivalents are just wrappers to fs-internal diff --git a/mm/fadvise.c b/mm/fadvise.c index 767887f5f3bf..afa41491d324 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -26,7 +26,8 @@ * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could * deactivate the pages and clear PG_Referenced. 
*/ -SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) + +int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) { struct fd f = fdget(fd); struct inode *inode; @@ -185,11 +186,16 @@ out: return ret; } +SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) +{ + return ksys_fadvise64_64(fd, offset, len, advice); +} + #ifdef __ARCH_WANT_SYS_FADVISE64 SYSCALL_DEFINE4(fadvise64, int, fd, loff_t, offset, size_t, len, int, advice) { - return sys_fadvise64_64(fd, offset, len, advice); + return ksys_fadvise64_64(fd, offset, len, advice); } #endif -- cgit v1.2.3 From a90f590a1bee36fc2129cfb38ceec24a555bb12d Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sun, 11 Mar 2018 11:34:46 +0100 Subject: mm: add ksys_mmap_pgoff() helper; remove in-kernel calls to sys_mmap_pgoff() Using this helper allows us to avoid the in-kernel calls to the sys_mmap_pgoff() syscall. The ksys_ prefix denotes that this function is meant as a drop-in replacement for the syscall. In particular, it uses the same calling convention as sys_mmap_pgoff(). This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Andrew Morton Cc: linux-mm@kvack.org Signed-off-by: Dominik Brodowski --- arch/alpha/kernel/osf_sys.c | 2 +- arch/arm64/kernel/sys.c | 2 +- arch/ia64/kernel/sys_ia64.c | 4 ++-- arch/m68k/kernel/sys_m68k.c | 2 +- arch/microblaze/kernel/sys_microblaze.c | 6 +++--- arch/mips/kernel/linux32.c | 4 ++-- arch/mips/kernel/syscall.c | 6 ++++-- arch/parisc/kernel/sys_parisc.c | 6 +++--- arch/powerpc/kernel/syscalls.c | 2 +- arch/riscv/kernel/sys_riscv.c | 4 ++-- arch/s390/kernel/compat_linux.c | 6 +++--- arch/s390/kernel/sys_s390.c | 2 +- arch/sh/kernel/sys_sh.c | 4 ++-- arch/sparc/kernel/sys_sparc_32.c | 6 +++--- arch/sparc/kernel/sys_sparc_64.c | 2 +- arch/um/kernel/syscall.c | 2 +- arch/x86/ia32/sys_ia32.c | 2 +- arch/x86/kernel/sys_x86_64.c | 2 +- include/linux/syscalls.h | 3 +++ mm/mmap.c | 17 ++++++++++++----- mm/nommu.c | 17 ++++++++++++----- 21 files changed, 60 insertions(+), 41 deletions(-) (limited to 'mm') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index fa1a392ca9a2..89faa6f4de47 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -189,7 +189,7 @@ SYSCALL_DEFINE6(osf_mmap, unsigned long, addr, unsigned long, len, goto out; if (off & ~PAGE_MASK) goto out; - ret = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); + ret = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); out: return ret; } diff --git a/arch/arm64/kernel/sys.c b/arch/arm64/kernel/sys.c index 26fe8ea93ea2..72981bae10eb 100644 --- a/arch/arm64/kernel/sys.c +++ b/arch/arm64/kernel/sys.c @@ -34,7 +34,7 @@ asmlinkage long sys_mmap(unsigned long addr, unsigned long len, if (offset_in_page(off) != 0) return -EINVAL; - return sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); } SYSCALL_DEFINE1(arm64_personality, unsigned int, personality) diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c index 085adfcc74a4..9ebe1d633abc 100644 --- a/arch/ia64/kernel/sys_ia64.c +++ b/arch/ia64/kernel/sys_ia64.c @@ -139,7 +139,7 @@ int ia64_mmap_check(unsigned long addr, unsigned long len, asmlinkage unsigned long sys_mmap2 (unsigned long addr, unsigned long 
len, int prot, int flags, int fd, long pgoff) { - addr = sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); + addr = ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); if (!IS_ERR((void *) addr)) force_successful_syscall_return(); return addr; @@ -151,7 +151,7 @@ sys_mmap (unsigned long addr, unsigned long len, int prot, int flags, int fd, lo if (offset_in_page(off) != 0) return -EINVAL; - addr = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); + addr = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); if (!IS_ERR((void *) addr)) force_successful_syscall_return(); return addr; diff --git a/arch/m68k/kernel/sys_m68k.c b/arch/m68k/kernel/sys_m68k.c index 27e10af5153a..6363ec83a290 100644 --- a/arch/m68k/kernel/sys_m68k.c +++ b/arch/m68k/kernel/sys_m68k.c @@ -46,7 +46,7 @@ asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, * so we need to shift the argument down by 1; m68k mmap64(3) * (in libc) expects the last argument of mmap2 in 4Kb units. */ - return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); } /* Convert virtual (user) address VADDR to physical address PADDR */ diff --git a/arch/microblaze/kernel/sys_microblaze.c b/arch/microblaze/kernel/sys_microblaze.c index f1e1f666ddde..ed9f34da1a2a 100644 --- a/arch/microblaze/kernel/sys_microblaze.c +++ b/arch/microblaze/kernel/sys_microblaze.c @@ -40,7 +40,7 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, if (pgoff & ~PAGE_MASK) return -EINVAL; - return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff >> PAGE_SHIFT); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff >> PAGE_SHIFT); } SYSCALL_DEFINE6(mmap2, unsigned long, addr, unsigned long, len, @@ -50,6 +50,6 @@ SYSCALL_DEFINE6(mmap2, unsigned long, addr, unsigned long, len, if (pgoff & (~PAGE_MASK >> 12)) return -EINVAL; - return sys_mmap_pgoff(addr, len, prot, flags, fd, - pgoff >> (PAGE_SHIFT - 12)); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, + pgoff >> (PAGE_SHIFT - 12)); } diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c index 1c5785e72db4..0571ab7b68b0 100644 --- a/arch/mips/kernel/linux32.c +++ b/arch/mips/kernel/linux32.c @@ -67,8 +67,8 @@ SYSCALL_DEFINE6(32_mmap2, unsigned long, addr, unsigned long, len, { if (pgoff & (~PAGE_MASK >> 12)) return -EINVAL; - return sys_mmap_pgoff(addr, len, prot, flags, fd, - pgoff >> (PAGE_SHIFT-12)); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, + pgoff >> (PAGE_SHIFT-12)); } #define RLIM_INFINITY32 0x7fffffff diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index 58c6f634b550..69c17b549fd3 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -63,7 +63,8 @@ SYSCALL_DEFINE6(mips_mmap, unsigned long, addr, unsigned long, len, { if (offset & ~PAGE_MASK) return -EINVAL; - return sys_mmap_pgoff(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, + offset >> PAGE_SHIFT); } SYSCALL_DEFINE6(mips_mmap2, unsigned long, addr, unsigned long, len, @@ -73,7 +74,8 @@ SYSCALL_DEFINE6(mips_mmap2, unsigned long, addr, unsigned long, len, if (pgoff & (~PAGE_MASK >> 12)) return -EINVAL; - return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff >> (PAGE_SHIFT-12)); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, + pgoff >> (PAGE_SHIFT - 12)); } save_static_function(sys_fork); diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c index f36ab1f09595..080d566654ea 100644 --- 
a/arch/parisc/kernel/sys_parisc.c +++ b/arch/parisc/kernel/sys_parisc.c @@ -270,8 +270,8 @@ asmlinkage unsigned long sys_mmap2(unsigned long addr, unsigned long len, { /* Make sure the shift for mmap2 is constant (12), no matter what PAGE_SIZE we have. */ - return sys_mmap_pgoff(addr, len, prot, flags, fd, - pgoff >> (PAGE_SHIFT - 12)); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, + pgoff >> (PAGE_SHIFT - 12)); } asmlinkage unsigned long sys_mmap(unsigned long addr, unsigned long len, @@ -279,7 +279,7 @@ asmlinkage unsigned long sys_mmap(unsigned long addr, unsigned long len, unsigned long offset) { if (!(offset & ~PAGE_MASK)) { - return sys_mmap_pgoff(addr, len, prot, flags, fd, + return ksys_mmap_pgoff(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); } else { return -EINVAL; diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index ecb981eea74b..1ef3b80b62a6 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -57,7 +57,7 @@ static inline long do_mmap2(unsigned long addr, size_t len, off >>= shift; } - ret = sys_mmap_pgoff(addr, len, prot, flags, fd, off); + ret = ksys_mmap_pgoff(addr, len, prot, flags, fd, off); out: return ret; } diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c index 79c78668258e..f7181ed8aafc 100644 --- a/arch/riscv/kernel/sys_riscv.c +++ b/arch/riscv/kernel/sys_riscv.c @@ -24,8 +24,8 @@ static long riscv_sys_mmap(unsigned long addr, unsigned long len, { if (unlikely(offset & (~PAGE_MASK >> page_shift_offset))) return -EINVAL; - return sys_mmap_pgoff(addr, len, prot, flags, fd, - offset >> (PAGE_SHIFT - page_shift_offset)); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, + offset >> (PAGE_SHIFT - page_shift_offset)); } #ifdef CONFIG_64BIT diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index 9bb897e443a6..da5ef7718254 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -442,8 +442,8 @@ COMPAT_SYSCALL_DEFINE1(s390_old_mmap, struct mmap_arg_struct_emu31 __user *, arg return -EFAULT; if (a.offset & ~PAGE_MASK) return -EINVAL; - return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, - a.offset >> PAGE_SHIFT); + return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); } COMPAT_SYSCALL_DEFINE1(s390_mmap2, struct mmap_arg_struct_emu31 __user *, arg) @@ -452,7 +452,7 @@ COMPAT_SYSCALL_DEFINE1(s390_mmap2, struct mmap_arg_struct_emu31 __user *, arg) if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; - return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); + return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); } COMPAT_SYSCALL_DEFINE3(s390_read, unsigned int, fd, char __user *, buf, compat_size_t, count) diff --git a/arch/s390/kernel/sys_s390.c b/arch/s390/kernel/sys_s390.c index 0090037ab148..31cefe0c28c0 100644 --- a/arch/s390/kernel/sys_s390.c +++ b/arch/s390/kernel/sys_s390.c @@ -53,7 +53,7 @@ SYSCALL_DEFINE1(mmap2, struct s390_mmap_arg_struct __user *, arg) if (copy_from_user(&a, arg, sizeof(a))) goto out; - error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); + error = ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); out: return error; } diff --git a/arch/sh/kernel/sys_sh.c b/arch/sh/kernel/sys_sh.c index 724911c59e7d..f8afc014e084 100644 --- a/arch/sh/kernel/sys_sh.c +++ b/arch/sh/kernel/sys_sh.c @@ -35,7 +35,7 @@ asmlinkage int old_mmap(unsigned long addr, unsigned long len, { if (off & ~PAGE_MASK) 
return -EINVAL; - return sys_mmap_pgoff(addr, len, prot, flags, fd, off>>PAGE_SHIFT); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, off>>PAGE_SHIFT); } asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, @@ -51,7 +51,7 @@ asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, pgoff >>= PAGE_SHIFT - 12; - return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); } /* sys_cacheflush -- flush (part of) the processor cache. */ diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c index 990703b7cf4d..d980da4ffd7b 100644 --- a/arch/sparc/kernel/sys_sparc_32.c +++ b/arch/sparc/kernel/sys_sparc_32.c @@ -104,8 +104,8 @@ asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, { /* Make sure the shift for mmap2 is constant (12), no matter what PAGE_SIZE we have. */ - return sys_mmap_pgoff(addr, len, prot, flags, fd, - pgoff >> (PAGE_SHIFT - 12)); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, + pgoff >> (PAGE_SHIFT - 12)); } asmlinkage long sys_mmap(unsigned long addr, unsigned long len, @@ -113,7 +113,7 @@ asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long off) { /* no alignment check? */ - return sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); } long sparc_remap_file_pages(unsigned long start, unsigned long size, diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 55416db482ad..ebb84dc8a5a7 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -458,7 +458,7 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, goto out; if (off & ~PAGE_MASK) goto out; - retval = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); + retval = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); out: return retval; } diff --git a/arch/um/kernel/syscall.c b/arch/um/kernel/syscall.c index 6258676bed85..35f7047bdebc 100644 --- a/arch/um/kernel/syscall.c +++ b/arch/um/kernel/syscall.c @@ -22,7 +22,7 @@ long old_mmap(unsigned long addr, unsigned long len, if (offset & ~PAGE_MASK) goto out; - err = sys_mmap_pgoff(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); + err = ksys_mmap_pgoff(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); out: return err; } diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 401bd8ec9cf0..bff71b9ae3f5 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -166,7 +166,7 @@ COMPAT_SYSCALL_DEFINE1(x86_mmap, struct mmap_arg_struct32 __user *, arg) if (a.offset & ~PAGE_MASK) return -EINVAL; - return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset>>PAGE_SHIFT); } diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 676774b9bb8d..a3f15ed545b5 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -97,7 +97,7 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, if (off & ~PAGE_MASK) goto out; - error = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); + error = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); out: return error; } diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 466d408deefd..ec866c959e7d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -979,6 +979,9 @@ static inline int ksys_fadvise64_64(int 
fd, loff_t offset, loff_t len, return -EINVAL; } #endif +unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff); /* * The following kernel syscall equivalents are just wrappers to fs-internal diff --git a/mm/mmap.c b/mm/mmap.c index 9efdc021ad22..aa0dc8231c0d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1488,9 +1488,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr, return addr; } -SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, - unsigned long, prot, unsigned long, flags, - unsigned long, fd, unsigned long, pgoff) +unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) { struct file *file = NULL; unsigned long retval; @@ -1537,6 +1537,13 @@ out_fput: return retval; } +SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, pgoff) +{ + return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); +} + #ifdef __ARCH_WANT_SYS_OLD_MMAP struct mmap_arg_struct { unsigned long addr; @@ -1556,8 +1563,8 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) if (offset_in_page(a.offset)) return -EINVAL; - return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, - a.offset >> PAGE_SHIFT); + return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); } #endif /* __ARCH_WANT_SYS_OLD_MMAP */ diff --git a/mm/nommu.c b/mm/nommu.c index ebb6e618dade..cad329629530 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1423,9 +1423,9 @@ error_getting_region: return -ENOMEM; } -SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, - unsigned long, prot, unsigned long, flags, - unsigned long, fd, unsigned long, pgoff) +unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) { struct file *file = NULL; unsigned long retval = -EBADF; @@ -1447,6 +1447,13 @@ out: return retval; } +SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, pgoff) +{ + return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); +} + #ifdef __ARCH_WANT_SYS_OLD_MMAP struct mmap_arg_struct { unsigned long addr; @@ -1466,8 +1473,8 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) if (offset_in_page(a.offset)) return -EINVAL; - return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, - a.offset >> PAGE_SHIFT); + return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); } #endif /* __ARCH_WANT_SYS_OLD_MMAP */ -- cgit v1.2.3 From c7b95d5156a9ee70f800bd2e47a9eba677be73e1 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Mon, 19 Mar 2018 17:51:36 +0100 Subject: mm: add ksys_readahead() helper; remove in-kernel calls to sys_readahead() Using this helper allows us to avoid the in-kernel calls to the sys_readahead() syscall. The ksys_ prefix denotes that this function is meant as a drop-in replacement for the syscall. In particular, it uses the same calling convention as sys_readahead(). This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. 
For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Andrew Morton Cc: linux-mm@kvack.org Signed-off-by: Dominik Brodowski --- arch/mips/kernel/linux32.c | 2 +- arch/parisc/kernel/sys_parisc.c | 2 +- arch/powerpc/kernel/sys_ppc32.c | 2 +- arch/s390/kernel/compat_linux.c | 2 +- arch/sparc/kernel/sys_sparc32.c | 2 +- arch/x86/ia32/sys_ia32.c | 2 +- include/linux/syscalls.h | 1 + mm/readahead.c | 7 ++++++- 8 files changed, 13 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c index 0571ab7b68b0..318f1c05c5b3 100644 --- a/arch/mips/kernel/linux32.c +++ b/arch/mips/kernel/linux32.c @@ -131,7 +131,7 @@ SYSCALL_DEFINE1(32_personality, unsigned long, personality) asmlinkage ssize_t sys32_readahead(int fd, u32 pad0, u64 a2, u64 a3, size_t count) { - return sys_readahead(fd, merge_64(a2, a3), count); + return ksys_readahead(fd, merge_64(a2, a3), count); } asmlinkage long sys32_sync_file_range(int fd, int __pad, diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c index 080d566654ea..8c99ebbe2bac 100644 --- a/arch/parisc/kernel/sys_parisc.c +++ b/arch/parisc/kernel/sys_parisc.c @@ -345,7 +345,7 @@ asmlinkage ssize_t parisc_pwrite64(unsigned int fd, const char __user *buf, asmlinkage ssize_t parisc_readahead(int fd, unsigned int high, unsigned int low, size_t count) { - return sys_readahead(fd, (loff_t)high << 32 | low, count); + return ksys_readahead(fd, (loff_t)high << 32 | low, count); } asmlinkage long parisc_fadvise64_64(int fd, diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index 0b95fa13307f..c11c73373691 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -88,7 +88,7 @@ compat_ssize_t compat_sys_pwrite64(unsigned int fd, const char __user *ubuf, com compat_ssize_t compat_sys_readahead(int fd, u32 r4, u32 offhi, u32 offlo, u32 count) { - return sys_readahead(fd, ((loff_t)offhi << 32) | offlo, count); + return ksys_readahead(fd, ((loff_t)offhi << 32) | offlo, count); } asmlinkage int compat_sys_truncate64(const char __user * path, u32 reg4, diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index da5ef7718254..8ac38d51ed7d 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -328,7 +328,7 @@ COMPAT_SYSCALL_DEFINE5(s390_pwrite64, unsigned int, fd, const char __user *, ubu COMPAT_SYSCALL_DEFINE4(s390_readahead, int, fd, u32, high, u32, low, s32, count) { - return sys_readahead(fd, (unsigned long)high << 32 | low, count); + return ksys_readahead(fd, (unsigned long)high << 32 | low, count); } struct stat64_emu31 { diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c index 4da66aed50b4..f166e5bbf506 100644 --- a/arch/sparc/kernel/sys_sparc32.c +++ b/arch/sparc/kernel/sys_sparc32.c @@ -217,7 +217,7 @@ asmlinkage long compat_sys_readahead(int fd, unsigned long offlo, compat_size_t count) { - return sys_readahead(fd, (offhi << 32) | offlo, count); + return ksys_readahead(fd, (offhi << 32) | offlo, count); } long compat_sys_fadvise64(int fd, diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index bff71b9ae3f5..bd8a7020b9a7 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -203,7 +203,7 @@ COMPAT_SYSCALL_DEFINE6(x86_fadvise64_64, int, fd, __u32, offset_low, COMPAT_SYSCALL_DEFINE4(x86_readahead, int, fd, unsigned int, off_lo, unsigned int, off_hi, size_t, count) { - return 
sys_readahead(fd, ((u64)off_hi << 32) | off_lo, count); + return ksys_readahead(fd, ((u64)off_hi << 32) | off_lo, count); } COMPAT_SYSCALL_DEFINE6(x86_sync_file_range, int, fd, unsigned int, off_low, diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index ec866c959e7d..815fbdd9cca1 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -982,6 +982,7 @@ static inline int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff); +ssize_t ksys_readahead(int fd, loff_t offset, size_t count); /* * The following kernel syscall equivalents are just wrappers to fs-internal diff --git a/mm/readahead.c b/mm/readahead.c index c4ca70239233..4d57b4644f98 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -573,7 +573,7 @@ do_readahead(struct address_space *mapping, struct file *filp, return force_page_cache_readahead(mapping, filp, index, nr); } -SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) +ssize_t ksys_readahead(int fd, loff_t offset, size_t count) { ssize_t ret; struct fd f; @@ -592,3 +592,8 @@ SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) } return ret; } + +SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) +{ + return ksys_readahead(fd, offset, count); +} -- cgit v1.2.3
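
The helper-conversion patches above all apply the same mechanical transformation: the body of a syscall moves into an mm-internal kernel_*() or ksys_*() helper with the same calling convention, the SYSCALL_DEFINE*() entry point becomes a one-line wrapper, and in-kernel callers (compat stubs, arch wrappers) switch from the sys_*() entry point to the helper. As a minimal sketch of that shape -- "example" is a made-up syscall used purely for illustration, not one of the syscalls touched in this series:

#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/mm.h>

/*
 * The implementation lives in an internal helper that keeps the same
 * calling convention as the syscall it replaces ...
 */
static long kernel_example(unsigned long addr, unsigned long len)
{
	if (offset_in_page(addr))	/* reject unaligned addresses */
		return -EINVAL;
	/* ... the real work would happen here ... */
	return 0;
}

/* ... the syscall entry point shrinks to a trivial wrapper ... */
SYSCALL_DEFINE2(example, unsigned long, addr, unsigned long, len)
{
	return kernel_example(addr, len);
}

/*
 * ... and in-kernel callers such as compat stubs invoke the helper
 * directly, so no calls to sys_example() remain inside the kernel.
 */
COMPAT_SYSCALL_DEFINE2(example, compat_ulong_t, addr, compat_ulong_t, len)
{
	return kernel_example(addr, len);
}

Because each helper preserves the syscall's calling convention, converting a call site is a pure rename (sys_example() becomes kernel_example()), which is what makes the per-architecture churn in the diffs above safe to apply mechanically.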