From e181ae0c5db9544de9c53239eb22bc012ce75033 Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@oracle.com>
Date: Sat, 14 Jul 2018 09:15:07 -0400
Subject: mm: zero unavailable pages before memmap init

We must zero struct pages for memory that is not backed by physical
memory, or kernel does not have access to.

Recently, there was a change which zeroed all memmap for all holes in
e820.  Unfortunately, it introduced a bug that is discussed here:

  https://www.spinics.net/lists/linux-mm/msg156764.html

Linus, also saw this bug on his machine, and confirmed that reverting
commit 124049decbb1 ("x86/e820: put !E820_TYPE_RAM regions into
memblock.reserved") fixes the issue.

The problem is that we incorrectly zero some struct pages after they
were setup.

The fix is to zero unavailable struct pages prior to initializing of
struct pages.

A more detailed fix should come later that would avoid double zeroing
cases: one in __init_single_page(), the other one in
zero_resv_unavail().

Fixes: 124049decbb1 ("x86/e820: put !E820_TYPE_RAM regions into memblock.reserved")
Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1521100f1e63..5d800d61ddb7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6847,6 +6847,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 	/* Initialise every node */
 	mminit_verify_pageflags_layout();
 	setup_nr_node_ids();
+	zero_resv_unavail();
 	for_each_online_node(nid) {
 		pg_data_t *pgdat = NODE_DATA(nid);
 		free_area_init_node(nid, NULL,
@@ -6857,7 +6858,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 			node_set_state(nid, N_MEMORY);
 		check_for_memory(pgdat, nid);
 	}
-	zero_resv_unavail();
 }
 
 static int __init cmdline_parse_core(char *p, unsigned long *core,
@@ -7033,9 +7033,9 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
 
 void __init free_area_init(unsigned long *zones_size)
 {
+	zero_resv_unavail();
 	free_area_init_node(0, zones_size,
 			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
-	zero_resv_unavail();
 }
 
 static int page_alloc_cpu_dead(unsigned int cpu)
-- 
cgit v1.2.3


From bce73e4842390f7b7309c8e253e139db71288ac3 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Fri, 13 Jul 2018 16:58:52 -0700
Subject: mm: do not drop unused pages when userfaultd is running

KVM guests on s390 can notify the host of unused pages.  This can result
in pte_unused callbacks to be true for KVM guest memory.

If a page is unused (checked with pte_unused) we might drop this page
instead of paging it.  This can have side-effects on userfaultd, when
the page in question was already migrated:

The next access of that page will trigger a fault and a user fault
instead of faulting in a new and empty zero page.  As QEMU does not
expect a userfault on an already migrated page this migration will fail.

The most straightforward solution is to ignore the pte_unused hint if a
userfault context is active for this VMA.

Link: http://lkml.kernel.org/r/20180703171854.63981-1-borntraeger@de.ibm.com
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Cornelia Huck <cohuck@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/rmap.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/rmap.c b/mm/rmap.c
index 6db729dc4c50..eb477809a5c0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -64,6 +64,7 @@
 #include <linux/backing-dev.h>
 #include <linux/page_idle.h>
 #include <linux/memremap.h>
+#include <linux/userfaultfd_k.h>
 
 #include <asm/tlbflush.h>
 
@@ -1481,11 +1482,16 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 				set_pte_at(mm, address, pvmw.pte, pteval);
 			}
 
-		} else if (pte_unused(pteval)) {
+		} else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
 			/*
 			 * The guest indicated that the page content is of no
 			 * interest anymore. Simply discard the pte, vmscan
 			 * will take care of the rest.
+			 * A future reference will then fault in a new zero
+			 * page. When userfaultfd is active, we must not drop
+			 * this page though, as its main user (postcopy
+			 * migration) will not expect userfaults on already
+			 * copied pages.
 			 */
 			dec_mm_counter(mm, mm_counter(page));
 			/* We have to invalidate as we cleared the pte */
-- 
cgit v1.2.3


From e3d301cae0092062cbcd6b4e7ceebbab9d87e263 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 13 Jul 2018 16:59:16 -0700
Subject: mm/memblock.c: do not complain about top-down allocations for
 !MEMORY_HOTREMOVE

Mike Rapoport is converting architectures from bootmem to nobootmem
allocator.  While doing so for m68k Geert has noticed that he gets a
scary looking warning:

  WARNING: CPU: 0 PID: 0 at mm/memblock.c:230
  memblock_find_in_range_node+0x11c/0x1be
  memblock: bottom-up allocation failed, memory hotunplug may be affected
  Modules linked in:
  CPU: 0 PID: 0 Comm: swapper Not tainted
  4.18.0-rc3-atari-01343-gf2fb5f2e09a97a3c-dirty #7
  Call Trace: __warn+0xa8/0xc2
    kernel_pg_dir+0x0/0x1000
    netdev_lower_get_next+0x2/0x22
    warn_slowpath_fmt+0x2e/0x36
    memblock_find_in_range_node+0x11c/0x1be
    memblock_find_in_range_node+0x11c/0x1be
    memblock_find_in_range_node+0x0/0x1be
    vprintk_func+0x66/0x6e
    memblock_virt_alloc_internal+0xd0/0x156
    netdev_lower_get_next+0x2/0x22
    netdev_lower_get_next+0x2/0x22
    kernel_pg_dir+0x0/0x1000
    memblock_virt_alloc_try_nid_nopanic+0x58/0x7a
    netdev_lower_get_next+0x2/0x22
    kernel_pg_dir+0x0/0x1000
    kernel_pg_dir+0x0/0x1000
    EXPTBL+0x234/0x400
    EXPTBL+0x234/0x400
    alloc_node_mem_map+0x4a/0x66
    netdev_lower_get_next+0x2/0x22
    free_area_init_node+0xe2/0x29e
    EXPTBL+0x234/0x400
    paging_init+0x430/0x462
    kernel_pg_dir+0x0/0x1000
    printk+0x0/0x1a
    EXPTBL+0x234/0x400
    setup_arch+0x1b8/0x22c
    start_kernel+0x4a/0x40a
    _sinittext+0x344/0x9e8

The warning is basically saying that a top-down allocation can break
memory hotremove because memblock allocation is not movable.  But m68k
doesn't even support MEMORY_HOTREMOVE so there is no point to warn about
it.

Make the warning conditional only to configurations that care.

Link: http://lkml.kernel.org/r/20180706061750.GH32658@dhcp22.suse.cz
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Tested-by: Geert Uytterhoeven <geert@linux-m68k.org>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Greg Ungerer <gerg@linux-m68k.org>
Cc: Sam Creasey <sammy@sammy.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memblock.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memblock.c b/mm/memblock.c
index 03d48d8835ba..11e46f83e1ad 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -227,7 +227,8 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
 		 * so we use WARN_ONCE() here to see the stack trace if
 		 * fail happens.
 		 */
-		WARN_ONCE(1, "memblock: bottom-up allocation failed, memory hotunplug may be affected\n");
+		WARN_ONCE(IS_ENABLED(CONFIG_MEMORY_HOTREMOVE),
+			  "memblock: bottom-up allocation failed, memory hotremove may be affected\n");
 	}
 
 	return __memblock_find_range_top_down(start, end, size, align, nid,
-- 
cgit v1.2.3


From bb177a732c4369bb58a1fe1df8f552b6f0f7db5f Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 13 Jul 2018 16:59:20 -0700
Subject: mm: do not bug_on on incorrect length in __mm_populate()

syzbot has noticed that a specially crafted library can easily hit
VM_BUG_ON in __mm_populate

  kernel BUG at mm/gup.c:1242!
  invalid opcode: 0000 [#1] SMP
  CPU: 2 PID: 9667 Comm: a.out Not tainted 4.18.0-rc3 #644
  Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 05/19/2017
  RIP: 0010:__mm_populate+0x1e2/0x1f0
  Code: 55 d0 65 48 33 14 25 28 00 00 00 89 d8 75 21 48 83 c4 20 5b 41 5c 41 5d 41 5e 41 5f 5d c3 e8 75 18 f1 ff 0f 0b e8 6e 18 f1 ff <0f> 0b 31 db eb c9 e8 93 06 e0 ff 0f 1f 00 55 48 89 e5 53 48 89 fb
  Call Trace:
     vm_brk_flags+0xc3/0x100
     vm_brk+0x1f/0x30
     load_elf_library+0x281/0x2e0
     __ia32_sys_uselib+0x170/0x1e0
     do_fast_syscall_32+0xca/0x420
     entry_SYSENTER_compat+0x70/0x7f

The reason is that the length of the new brk is not page aligned when we
try to populate the it.  There is no reason to bug on that though.
do_brk_flags already aligns the length properly so the mapping is
expanded as it should.  All we need is to tell mm_populate about it.
Besides that there is absolutely no reason to to bug_on in the first
place.  The worst thing that could happen is that the last page wouldn't
get populated and that is far from putting system into an inconsistent
state.

Fix the issue by moving the length sanitization code from do_brk_flags
up to vm_brk_flags.  The only other caller of do_brk_flags is brk
syscall entry and it makes sure to provide the proper length so t here
is no need for sanitation and so we can use do_brk_flags without it.

Also remove the bogus BUG_ONs.

[osalvador@techadventures.net: fix up vm_brk_flags s@request@len@]
Link: http://lkml.kernel.org/r/20180706090217.GI32658@dhcp22.suse.cz
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: syzbot <syzbot+5dcb560fe12aa5091c06@syzkaller.appspotmail.com>
Tested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Zi Yan <zi.yan@cs.rutgers.edu>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/gup.c  |  2 --
 mm/mmap.c | 29 ++++++++++++-----------------
 2 files changed, 12 insertions(+), 19 deletions(-)

(limited to 'mm')

diff --git a/mm/gup.c b/mm/gup.c
index b70d7ba7cc13..fc5f98069f4e 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1238,8 +1238,6 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
 	int locked = 0;
 	long ret = 0;
 
-	VM_BUG_ON(start & ~PAGE_MASK);
-	VM_BUG_ON(len != PAGE_ALIGN(len));
 	end = start + len;
 
 	for (nstart = start; nstart < end; nstart = nend) {
diff --git a/mm/mmap.c b/mm/mmap.c
index d1eb87ef4b1a..5801b5f0a634 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -186,8 +186,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 	return next;
 }
 
-static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf);
-
+static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags,
+		struct list_head *uf);
 SYSCALL_DEFINE1(brk, unsigned long, brk)
 {
 	unsigned long retval;
@@ -245,7 +245,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 		goto out;
 
 	/* Ok, looks good - let it rip. */
-	if (do_brk(oldbrk, newbrk-oldbrk, &uf) < 0)
+	if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
 		goto out;
 
 set_brk:
@@ -2929,21 +2929,14 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
  *  anonymous maps.  eventually we may be able to do some
  *  brk-specific accounting here.
  */
-static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, struct list_head *uf)
+static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev;
-	unsigned long len;
 	struct rb_node **rb_link, *rb_parent;
 	pgoff_t pgoff = addr >> PAGE_SHIFT;
 	int error;
 
-	len = PAGE_ALIGN(request);
-	if (len < request)
-		return -ENOMEM;
-	if (!len)
-		return 0;
-
 	/* Until we need other flags, refuse anything except VM_EXEC. */
 	if ((flags & (~VM_EXEC)) != 0)
 		return -EINVAL;
@@ -3015,18 +3008,20 @@ out:
 	return 0;
 }
 
-static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf)
-{
-	return do_brk_flags(addr, len, 0, uf);
-}
-
-int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags)
+int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
 {
 	struct mm_struct *mm = current->mm;
+	unsigned long len;
 	int ret;
 	bool populate;
 	LIST_HEAD(uf);
 
+	len = PAGE_ALIGN(request);
+	if (len < request)
+		return -ENOMEM;
+	if (!len)
+		return 0;
+
 	if (down_write_killable(&mm->mmap_sem))
 		return -EINTR;
 
-- 
cgit v1.2.3


From d1b47a7c9efcf3c3384b70f6e3c8f1423b44d8c7 Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@oracle.com>
Date: Mon, 16 Jul 2018 11:16:30 -0400
Subject: mm: don't do zero_resv_unavail if memmap is not allocated

Moving zero_resv_unavail before memmap_init_zone(), caused a regression on
x86-32.

The cause is that we access struct pages before they are allocated when
CONFIG_FLAT_NODE_MEM_MAP is used.

free_area_init_nodes()
  zero_resv_unavail()
    mm_zero_struct_page(pfn_to_page(pfn)); <- struct page is not alloced
  free_area_init_node()
    if CONFIG_FLAT_NODE_MEM_MAP
      alloc_node_mem_map()
        memblock_virt_alloc_node_nopanic() <- struct page alloced here

On the other hand memblock_virt_alloc_node_nopanic() zeroes all the memory
that it returns, so we do not need to do zero_resv_unavail() here.

Fixes: e181ae0c5db9 ("mm: zero unavailable pages before memmap init")
Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Tested-by: Matt Hart <matt@mattface.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 mm/page_alloc.c    | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a0fbb9ffe380..3982c83fdcbf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2132,7 +2132,7 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn,
 					struct mminit_pfnnid_cache *state);
 #endif
 
-#ifdef CONFIG_HAVE_MEMBLOCK
+#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP)
 void zero_resv_unavail(void);
 #else
 static inline void zero_resv_unavail(void) {}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5d800d61ddb7..a790ef4be74e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6383,7 +6383,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 	free_area_init_core(pgdat);
 }
 
-#ifdef CONFIG_HAVE_MEMBLOCK
+#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP)
 /*
  * Only struct pages that are backed by physical memory are zeroed and
  * initialized by going through __init_single_page(). But, there are some
@@ -6421,7 +6421,7 @@ void __paginginit zero_resv_unavail(void)
 	if (pgcnt)
 		pr_info("Reserved but unavailable: %lld pages", pgcnt);
 }
-#endif /* CONFIG_HAVE_MEMBLOCK */
+#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 
-- 
cgit v1.2.3


From 1937367205d930deafb11c387b0a71dd215254d5 Mon Sep 17 00:00:00 2001
From: Mathieu Malaterre <malat@debian.org>
Date: Fri, 20 Jul 2018 17:53:31 -0700
Subject: mm/memblock: add missing include <linux/bootmem.h>

Commit 26f09e9b3a06 ("mm/memblock: add memblock memory allocation apis")
introduced two new function definitions:

  memblock_virt_alloc_try_nid_nopanic()
  memblock_virt_alloc_try_nid()

and commit ea1f5f3712af ("mm: define memblock_virt_alloc_try_nid_raw")
introduced the following function definition:

  memblock_virt_alloc_try_nid_raw()

This commit adds an include of header file <linux/bootmem.h> to provide
the missing function prototypes.  This silences the following gcc warning
(W=1):

  mm/memblock.c:1334:15: warning: no previous prototype for `memblock_virt_alloc_try_nid_raw' [-Wmissing-prototypes]
  mm/memblock.c:1371:15: warning: no previous prototype for `memblock_virt_alloc_try_nid_nopanic' [-Wmissing-prototypes]
  mm/memblock.c:1407:15: warning: no previous prototype for `memblock_virt_alloc_try_nid' [-Wmissing-prototypes]

Also adds #ifdef blockers to prevent compilation failure on mips/ia64
where CONFIG_NO_BOOTMEM=n as could be seen in commit commit 6cc22dc08a24
("revert "mm/memblock: add missing include <linux/bootmem.h>"").

Because Makefile already does:

  obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o

The #ifdef has been simplified from:

  #if defined(CONFIG_HAVE_MEMBLOCK) && defined(CONFIG_NO_BOOTMEM)

to simply:

  #if defined(CONFIG_NO_BOOTMEM)

Link: http://lkml.kernel.org/r/20180626184422.24974-1-malat@debian.org
Signed-off-by: Mathieu Malaterre <malat@debian.org>
Suggested-by: Tony Luck <tony.luck@intel.com>
Suggested-by: Michal Hocko <mhocko@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memblock.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'mm')

diff --git a/mm/memblock.c b/mm/memblock.c
index 11e46f83e1ad..4b5d245fafc1 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -20,6 +20,7 @@
 #include <linux/kmemleak.h>
 #include <linux/seq_file.h>
 #include <linux/memblock.h>
+#include <linux/bootmem.h>
 
 #include <asm/sections.h>
 #include <linux/io.h>
@@ -1225,6 +1226,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
 	return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
 }
 
+#if defined(CONFIG_NO_BOOTMEM)
 /**
  * memblock_virt_alloc_internal - allocate boot memory block
  * @size: size of memory block to be allocated in bytes
@@ -1432,6 +1434,7 @@ void * __init memblock_virt_alloc_try_nid(
 	      (u64)max_addr);
 	return NULL;
 }
+#endif
 
 /**
  * __memblock_free_early - free boot memory block
-- 
cgit v1.2.3


From e1f1b1572e8db87a56609fd05bef76f98f0e456a Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Fri, 20 Jul 2018 17:53:45 -0700
Subject: mm/huge_memory.c: fix data loss when splitting a file pmd

__split_huge_pmd_locked() must check if the cleared huge pmd was dirty,
and propagate that to PageDirty: otherwise, data may be lost when a huge
tmpfs page is modified then split then reclaimed.

How has this taken so long to be noticed?  Because there was no problem
when the huge page is written by a write system call (shmem_write_end()
calls set_page_dirty()), nor when the page is allocated for a write fault
(fault_dirty_shared_page() calls set_page_dirty()); but when allocated for
a read fault (which MAP_POPULATE simulates), no set_page_dirty().

Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1807111741430.1106@eggly.anvils
Fixes: d21b9e57c74c ("thp: handle file pages in split_huge_pmd()")
Signed-off-by: Hugh Dickins <hughd@google.com>
Reported-by: Ashwin Chaugule <ashwinch@google.com>
Reviewed-by: Yang Shi <yang.shi@linux.alibaba.com>
Reviewed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: <stable@vger.kernel.org>	[4.8+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1cd7c1a57a14..25346bd99364 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2084,6 +2084,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		if (vma_is_dax(vma))
 			return;
 		page = pmd_page(_pmd);
+		if (!PageDirty(page) && pmd_dirty(_pmd))
+			set_page_dirty(page);
 		if (!PageReferenced(page) && pmd_young(_pmd))
 			SetPageReferenced(page);
 		page_remove_rmap(page, true);
-- 
cgit v1.2.3


From 9f15bde671355c351cf20d9f879004b234353100 Mon Sep 17 00:00:00 2001
From: Jing Xia <jing.xia.mail@gmail.com>
Date: Fri, 20 Jul 2018 17:53:48 -0700
Subject: mm: memcg: fix use after free in mem_cgroup_iter()

It was reported that a kernel crash happened in mem_cgroup_iter(), which
can be triggered if the legacy cgroup-v1 non-hierarchical mode is used.

Unable to handle kernel paging request at virtual address 6b6b6b6b6b6b8f
......
Call trace:
  mem_cgroup_iter+0x2e0/0x6d4
  shrink_zone+0x8c/0x324
  balance_pgdat+0x450/0x640
  kswapd+0x130/0x4b8
  kthread+0xe8/0xfc
  ret_from_fork+0x10/0x20

  mem_cgroup_iter():
      ......
      if (css_tryget(css))    <-- crash here
	    break;
      ......

The crashing reason is that mem_cgroup_iter() uses the memcg object whose
pointer is stored in iter->position, which has been freed before and
filled with POISON_FREE(0x6b).

And the root cause of the use-after-free issue is that
invalidate_reclaim_iterators() fails to reset the value of iter->position
to NULL when the css of the memcg is released in non- hierarchical mode.

Link: http://lkml.kernel.org/r/1531994807-25639-1-git-send-email-jing.xia@unisoc.com
Fixes: 6df38689e0e9 ("mm: memcontrol: fix possible memcg leak due to interrupted reclaim")
Signed-off-by: Jing Xia <jing.xia.mail@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: <chunyan.zhang@unisoc.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e6f0d5ef320a..8c0280b3143e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -850,7 +850,7 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
 	int nid;
 	int i;
 
-	while ((memcg = parent_mem_cgroup(memcg))) {
+	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
 		for_each_node(nid) {
 			mz = mem_cgroup_nodeinfo(memcg, nid);
 			for (i = 0; i <= DEF_PRIORITY; i++) {
-- 
cgit v1.2.3


From 3928d4f5ee37cdc523894f6e549e6aae521d8980 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 21 Jul 2018 13:48:51 -0700
Subject: mm: use helper functions for allocating and freeing vm_area structs

The vm_area_struct is one of the most fundamental memory management
objects, but the management of it is entirely open-coded evertwhere,
ranging from allocation and freeing (using kmem_cache_[z]alloc and
kmem_cache_free) to initializing all the fields.

We want to unify this in order to end up having some unified
initialization of the vmas, and the first step to this is to at least
have basic allocation functions.

Right now those functions are literally just wrappers around the
kmem_cache_*() calls.  This is a purely mechanical conversion:

    # new vma:
    kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL) -> vm_area_alloc()

    # copy old vma
    kmem_cache_alloc(vm_area_cachep, GFP_KERNEL) -> vm_area_dup(old)

    # free vma
    kmem_cache_free(vm_area_cachep, vma) -> vm_area_free(vma)

to the point where the old vma passed in to the vm_area_dup() function
isn't even used yet (because I've left all the old manual initialization
alone).

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/kernel/perfmon.c |  4 ++--
 arch/ia64/mm/init.c        |  8 ++++----
 fs/exec.c                  |  4 ++--
 include/linux/mm.h         |  4 +++-
 kernel/fork.c              | 21 ++++++++++++++++++---
 mm/mmap.c                  | 22 +++++++++++-----------
 mm/nommu.c                 |  8 ++++----
 7 files changed, 44 insertions(+), 27 deletions(-)

(limited to 'mm')

diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 3b38c717008a..e859246badca 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2278,7 +2278,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
 	DPRINT(("smpl_buf @%p\n", smpl_buf));
 
 	/* allocate vma */
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	vma = vm_area_alloc();
 	if (!vma) {
 		DPRINT(("Cannot allocate vma\n"));
 		goto error_kmem;
@@ -2346,7 +2346,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
 	return 0;
 
 error:
-	kmem_cache_free(vm_area_cachep, vma);
+	vm_area_free(vma);
 error_kmem:
 	pfm_rvfree(smpl_buf, size);
 
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 18278b448530..3f2321bffb72 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -114,7 +114,7 @@ ia64_init_addr_space (void)
 	 * the problem.  When the process attempts to write to the register backing store
 	 * for the first time, it will get a SEGFAULT in this case.
 	 */
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	vma = vm_area_alloc();
 	if (vma) {
 		INIT_LIST_HEAD(&vma->anon_vma_chain);
 		vma->vm_mm = current->mm;
@@ -125,7 +125,7 @@ ia64_init_addr_space (void)
 		down_write(&current->mm->mmap_sem);
 		if (insert_vm_struct(current->mm, vma)) {
 			up_write(&current->mm->mmap_sem);
-			kmem_cache_free(vm_area_cachep, vma);
+			vm_area_free(vma);
 			return;
 		}
 		up_write(&current->mm->mmap_sem);
@@ -133,7 +133,7 @@ ia64_init_addr_space (void)
 
 	/* map NaT-page at address zero to speed up speculative dereferencing of NULL: */
 	if (!(current->personality & MMAP_PAGE_ZERO)) {
-		vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+		vma = vm_area_alloc();
 		if (vma) {
 			INIT_LIST_HEAD(&vma->anon_vma_chain);
 			vma->vm_mm = current->mm;
@@ -144,7 +144,7 @@ ia64_init_addr_space (void)
 			down_write(&current->mm->mmap_sem);
 			if (insert_vm_struct(current->mm, vma)) {
 				up_write(&current->mm->mmap_sem);
-				kmem_cache_free(vm_area_cachep, vma);
+				vm_area_free(vma);
 				return;
 			}
 			up_write(&current->mm->mmap_sem);
diff --git a/fs/exec.c b/fs/exec.c
index 2d4e0075bd24..9bd83989ea25 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -290,7 +290,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	struct vm_area_struct *vma = NULL;
 	struct mm_struct *mm = bprm->mm;
 
-	bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	bprm->vma = vma = vm_area_alloc();
 	if (!vma)
 		return -ENOMEM;
 
@@ -326,7 +326,7 @@ err:
 	up_write(&mm->mmap_sem);
 err_free:
 	bprm->vma = NULL;
-	kmem_cache_free(vm_area_cachep, vma);
+	vm_area_free(vma);
 	return err;
 }
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3982c83fdcbf..de2fd86c6154 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -155,7 +155,9 @@ extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *,
  * mmap() functions).
  */
 
-extern struct kmem_cache *vm_area_cachep;
+struct vm_area_struct *vm_area_alloc(void);
+struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
+void vm_area_free(struct vm_area_struct *);
 
 #ifndef CONFIG_MMU
 extern struct rb_root nommu_region_tree;
diff --git a/kernel/fork.c b/kernel/fork.c
index 9440d61b925c..0e23deb5acfc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -303,11 +303,26 @@ struct kmem_cache *files_cachep;
 struct kmem_cache *fs_cachep;
 
 /* SLAB cache for vm_area_struct structures */
-struct kmem_cache *vm_area_cachep;
+static struct kmem_cache *vm_area_cachep;
 
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
+struct vm_area_struct *vm_area_alloc(void)
+{
+	return kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+}
+
+struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
+{
+	return kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+}
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+	kmem_cache_free(vm_area_cachep, vma);
+}
+
 static void account_kernel_stack(struct task_struct *tsk, int account)
 {
 	void *stack = task_stack_page(tsk);
@@ -455,7 +470,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 				goto fail_nomem;
 			charge = len;
 		}
-		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+		tmp = vm_area_dup(mpnt);
 		if (!tmp)
 			goto fail_nomem;
 		*tmp = *mpnt;
@@ -539,7 +554,7 @@ fail_uprobe_end:
 fail_nomem_anon_vma_fork:
 	mpol_put(vma_policy(tmp));
 fail_nomem_policy:
-	kmem_cache_free(vm_area_cachep, tmp);
+	vm_area_free(tmp);
 fail_nomem:
 	retval = -ENOMEM;
 	vm_unacct_memory(charge);
diff --git a/mm/mmap.c b/mm/mmap.c
index 5801b5f0a634..4286ad2dd1f5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -182,7 +182,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 	if (vma->vm_file)
 		fput(vma->vm_file);
 	mpol_put(vma_policy(vma));
-	kmem_cache_free(vm_area_cachep, vma);
+	vm_area_free(vma);
 	return next;
 }
 
@@ -911,7 +911,7 @@ again:
 			anon_vma_merge(vma, next);
 		mm->map_count--;
 		mpol_put(vma_policy(next));
-		kmem_cache_free(vm_area_cachep, next);
+		vm_area_free(next);
 		/*
 		 * In mprotect's case 6 (see comments on vma_merge),
 		 * we must remove another next too. It would clutter
@@ -1729,7 +1729,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	 * specific mapper. the address has already been validated, but
 	 * not unmapped, but the maps are removed from the list.
 	 */
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	vma = vm_area_alloc();
 	if (!vma) {
 		error = -ENOMEM;
 		goto unacct_error;
@@ -1832,7 +1832,7 @@ allow_write_and_free_vma:
 	if (vm_flags & VM_DENYWRITE)
 		allow_write_access(file);
 free_vma:
-	kmem_cache_free(vm_area_cachep, vma);
+	vm_area_free(vma);
 unacct_error:
 	if (charged)
 		vm_unacct_memory(charged);
@@ -2620,7 +2620,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 			return err;
 	}
 
-	new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+	new = vm_area_dup(vma);
 	if (!new)
 		return -ENOMEM;
 
@@ -2669,7 +2669,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
  out_free_mpol:
 	mpol_put(vma_policy(new));
  out_free_vma:
-	kmem_cache_free(vm_area_cachep, new);
+	vm_area_free(new);
 	return err;
 }
 
@@ -2984,7 +2984,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
 	/*
 	 * create a vma struct for an anonymous mapping
 	 */
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	vma = vm_area_alloc();
 	if (!vma) {
 		vm_unacct_memory(len >> PAGE_SHIFT);
 		return -ENOMEM;
@@ -3202,7 +3202,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 		}
 		*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
 	} else {
-		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+		new_vma = vm_area_dup(vma);
 		if (!new_vma)
 			goto out;
 		*new_vma = *vma;
@@ -3226,7 +3226,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 out_free_mempol:
 	mpol_put(vma_policy(new_vma));
 out_free_vma:
-	kmem_cache_free(vm_area_cachep, new_vma);
+	vm_area_free(new_vma);
 out:
 	return NULL;
 }
@@ -3350,7 +3350,7 @@ static struct vm_area_struct *__install_special_mapping(
 	int ret;
 	struct vm_area_struct *vma;
 
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	vma = vm_area_alloc();
 	if (unlikely(vma == NULL))
 		return ERR_PTR(-ENOMEM);
 
@@ -3376,7 +3376,7 @@ static struct vm_area_struct *__install_special_mapping(
 	return vma;
 
 out:
-	kmem_cache_free(vm_area_cachep, vma);
+	vm_area_free(vma);
 	return ERR_PTR(ret);
 }
 
diff --git a/mm/nommu.c b/mm/nommu.c
index 4452d8bd9ae4..006e3fe65017 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -769,7 +769,7 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
 	if (vma->vm_file)
 		fput(vma->vm_file);
 	put_nommu_region(vma->vm_region);
-	kmem_cache_free(vm_area_cachep, vma);
+	vm_area_free(vma);
 }
 
 /*
@@ -1204,7 +1204,7 @@ unsigned long do_mmap(struct file *file,
 	if (!region)
 		goto error_getting_region;
 
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	vma = vm_area_alloc();
 	if (!vma)
 		goto error_getting_vma;
 
@@ -1368,7 +1368,7 @@ error:
 	kmem_cache_free(vm_region_jar, region);
 	if (vma->vm_file)
 		fput(vma->vm_file);
-	kmem_cache_free(vm_area_cachep, vma);
+	vm_area_free(vma);
 	return ret;
 
 sharing_violation:
@@ -1469,7 +1469,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!region)
 		return -ENOMEM;
 
-	new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+	new = vm_area_dup(vma);
 	if (!new) {
 		kmem_cache_free(vm_region_jar, region);
 		return -ENOMEM;
-- 
cgit v1.2.3


From 95faf6992df468f617edb788da8c21c6eed0dfa7 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 21 Jul 2018 14:48:45 -0700
Subject: mm: make vm_area_dup() actually copy the old vma data

.. and re-initialize th eanon_vma_chain head.

This removes some boiler-plate from the users, and also makes it clear
why it didn't need use the 'zalloc()' version.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 10 +++++++---
 mm/mmap.c     |  7 -------
 mm/nommu.c    |  1 -
 3 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/kernel/fork.c b/kernel/fork.c
index 0e23deb5acfc..67253e41bfb0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -315,7 +315,13 @@ struct vm_area_struct *vm_area_alloc(void)
 
 struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 {
-	return kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+
+	if (new) {
+		*new = *orig;
+		INIT_LIST_HEAD(&new->anon_vma_chain);
+	}
+	return new;
 }
 
 void vm_area_free(struct vm_area_struct *vma)
@@ -473,8 +479,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		tmp = vm_area_dup(mpnt);
 		if (!tmp)
 			goto fail_nomem;
-		*tmp = *mpnt;
-		INIT_LIST_HEAD(&tmp->anon_vma_chain);
 		retval = vma_dup_policy(mpnt, tmp);
 		if (retval)
 			goto fail_nomem_policy;
diff --git a/mm/mmap.c b/mm/mmap.c
index 4286ad2dd1f5..b0ed8ce1b67e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2624,11 +2624,6 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!new)
 		return -ENOMEM;
 
-	/* most fields are the same, copy all, and then fixup */
-	*new = *vma;
-
-	INIT_LIST_HEAD(&new->anon_vma_chain);
-
 	if (new_below)
 		new->vm_end = addr;
 	else {
@@ -3205,13 +3200,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 		new_vma = vm_area_dup(vma);
 		if (!new_vma)
 			goto out;
-		*new_vma = *vma;
 		new_vma->vm_start = addr;
 		new_vma->vm_end = addr + len;
 		new_vma->vm_pgoff = pgoff;
 		if (vma_dup_policy(vma, new_vma))
 			goto out_free_vma;
-		INIT_LIST_HEAD(&new_vma->anon_vma_chain);
 		if (anon_vma_clone(new_vma, vma))
 			goto out_free_mempol;
 		if (new_vma->vm_file)
diff --git a/mm/nommu.c b/mm/nommu.c
index 006e3fe65017..c2560e9cc803 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1476,7 +1476,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	/* most fields are the same, copy all, and then fixup */
-	*new = *vma;
 	*region = *vma->vm_region;
 	new->vm_region = region;
 
-- 
cgit v1.2.3


From 490fc053865c9cc40f1085ef8a5504f5341f79d2 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 21 Jul 2018 15:24:03 -0700
Subject: mm: make vm_area_alloc() initialize core fields

Like vm_area_dup(), it initializes the anon_vma_chain head, and the
basic mm pointer.

The rest of the fields end up being different for different users,
although the plan is to also initialize the 'vm_ops' field to a dummy
entry.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/kernel/perfmon.c |  4 +---
 arch/ia64/mm/init.c        |  8 ++------
 fs/exec.c                  |  4 +---
 include/linux/mm.h         |  2 +-
 kernel/fork.c              | 10 ++++++++--
 mm/mmap.c                  | 12 +++---------
 mm/nommu.c                 |  3 +--
 7 files changed, 17 insertions(+), 26 deletions(-)

(limited to 'mm')

diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index e859246badca..46bff1661836 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2278,17 +2278,15 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
 	DPRINT(("smpl_buf @%p\n", smpl_buf));
 
 	/* allocate vma */
-	vma = vm_area_alloc();
+	vma = vm_area_alloc(mm);
 	if (!vma) {
 		DPRINT(("Cannot allocate vma\n"));
 		goto error_kmem;
 	}
-	INIT_LIST_HEAD(&vma->anon_vma_chain);
 
 	/*
 	 * partially initialize the vma for the sampling buffer
 	 */
-	vma->vm_mm	     = mm;
 	vma->vm_file	     = get_file(filp);
 	vma->vm_flags	     = VM_READ|VM_MAYREAD|VM_DONTEXPAND|VM_DONTDUMP;
 	vma->vm_page_prot    = PAGE_READONLY; /* XXX may need to change */
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 3f2321bffb72..bdb14a369137 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -114,10 +114,8 @@ ia64_init_addr_space (void)
 	 * the problem.  When the process attempts to write to the register backing store
 	 * for the first time, it will get a SEGFAULT in this case.
 	 */
-	vma = vm_area_alloc();
+	vma = vm_area_alloc(current->mm);
 	if (vma) {
-		INIT_LIST_HEAD(&vma->anon_vma_chain);
-		vma->vm_mm = current->mm;
 		vma->vm_start = current->thread.rbs_bot & PAGE_MASK;
 		vma->vm_end = vma->vm_start + PAGE_SIZE;
 		vma->vm_flags = VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT;
@@ -133,10 +131,8 @@ ia64_init_addr_space (void)
 
 	/* map NaT-page at address zero to speed up speculative dereferencing of NULL: */
 	if (!(current->personality & MMAP_PAGE_ZERO)) {
-		vma = vm_area_alloc();
+		vma = vm_area_alloc(current->mm);
 		if (vma) {
-			INIT_LIST_HEAD(&vma->anon_vma_chain);
-			vma->vm_mm = current->mm;
 			vma->vm_end = PAGE_SIZE;
 			vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT);
 			vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO |
diff --git a/fs/exec.c b/fs/exec.c
index 9bd83989ea25..72e961a62adb 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -290,7 +290,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	struct vm_area_struct *vma = NULL;
 	struct mm_struct *mm = bprm->mm;
 
-	bprm->vma = vma = vm_area_alloc();
+	bprm->vma = vma = vm_area_alloc(mm);
 	if (!vma)
 		return -ENOMEM;
 
@@ -298,7 +298,6 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 		err = -EINTR;
 		goto err_free;
 	}
-	vma->vm_mm = mm;
 
 	/*
 	 * Place the stack at the largest stack address the architecture
@@ -311,7 +310,6 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	vma->vm_start = vma->vm_end - PAGE_SIZE;
 	vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
-	INIT_LIST_HEAD(&vma->anon_vma_chain);
 
 	err = insert_vm_struct(mm, vma);
 	if (err)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index de2fd86c6154..d3a3842316b8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -155,7 +155,7 @@ extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *,
  * mmap() functions).
  */
 
-struct vm_area_struct *vm_area_alloc(void);
+struct vm_area_struct *vm_area_alloc(struct mm_struct *);
 struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
 void vm_area_free(struct vm_area_struct *);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 67253e41bfb0..a191c05e757d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -308,9 +308,15 @@ static struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
-struct vm_area_struct *vm_area_alloc(void)
+struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
 {
-	return kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+
+	if (vma) {
+		vma->vm_mm = mm;
+		INIT_LIST_HEAD(&vma->anon_vma_chain);
+	}
+	return vma;
 }
 
 struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
diff --git a/mm/mmap.c b/mm/mmap.c
index b0ed8ce1b67e..ff1944d8d458 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1729,19 +1729,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	 * specific mapper. the address has already been validated, but
 	 * not unmapped, but the maps are removed from the list.
 	 */
-	vma = vm_area_alloc();
+	vma = vm_area_alloc(mm);
 	if (!vma) {
 		error = -ENOMEM;
 		goto unacct_error;
 	}
 
-	vma->vm_mm = mm;
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
 	vma->vm_flags = vm_flags;
 	vma->vm_page_prot = vm_get_page_prot(vm_flags);
 	vma->vm_pgoff = pgoff;
-	INIT_LIST_HEAD(&vma->anon_vma_chain);
 
 	if (file) {
 		if (vm_flags & VM_DENYWRITE) {
@@ -2979,14 +2977,12 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
 	/*
 	 * create a vma struct for an anonymous mapping
 	 */
-	vma = vm_area_alloc();
+	vma = vm_area_alloc(mm);
 	if (!vma) {
 		vm_unacct_memory(len >> PAGE_SHIFT);
 		return -ENOMEM;
 	}
 
-	INIT_LIST_HEAD(&vma->anon_vma_chain);
-	vma->vm_mm = mm;
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
 	vma->vm_pgoff = pgoff;
@@ -3343,12 +3339,10 @@ static struct vm_area_struct *__install_special_mapping(
 	int ret;
 	struct vm_area_struct *vma;
 
-	vma = vm_area_alloc();
+	vma = vm_area_alloc(mm);
 	if (unlikely(vma == NULL))
 		return ERR_PTR(-ENOMEM);
 
-	INIT_LIST_HEAD(&vma->anon_vma_chain);
-	vma->vm_mm = mm;
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
 
diff --git a/mm/nommu.c b/mm/nommu.c
index c2560e9cc803..1d22fdbf7d7c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1204,7 +1204,7 @@ unsigned long do_mmap(struct file *file,
 	if (!region)
 		goto error_getting_region;
 
-	vma = vm_area_alloc();
+	vma = vm_area_alloc(current->mm);
 	if (!vma)
 		goto error_getting_vma;
 
@@ -1212,7 +1212,6 @@ unsigned long do_mmap(struct file *file,
 	region->vm_flags = vm_flags;
 	region->vm_pgoff = pgoff;
 
-	INIT_LIST_HEAD(&vma->anon_vma_chain);
 	vma->vm_flags = vm_flags;
 	vma->vm_pgoff = pgoff;
 
-- 
cgit v1.2.3