From 49f0ce5f92321cdcf741e35f385669a421013cb7 Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Tue, 21 Jan 2014 15:49:14 -0800
Subject: mm: add overcommit_kbytes sysctl variable

Some applications that run on HPC clusters are designed around the
availability of RAM and the overcommit ratio is fine tuned to get the
maximum usage of memory without swapping.  With growing memory, the
1%-of-all-RAM grain provided by overcommit_ratio has become too coarse
for these workload (on a 2TB machine it represents no less than 20GB).

This patch adds the new overcommit_kbytes sysctl variable that allow a
much finer grain.

[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix nommu build]
Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm/mmap.c')

diff --git a/mm/mmap.c b/mm/mmap.c
index 834b2d785f1e..39552de6e1db 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -86,6 +86,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
 
 int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;  /* heuristic overcommit */
 int sysctl_overcommit_ratio __read_mostly = 50;	/* default is 50% */
+unsigned long sysctl_overcommit_kbytes __read_mostly;
 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
 unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
 unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
-- 
cgit v1.2.3


From 363ee17f0f405faff74d9aaf93d21d5f41d5102d Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <davidlohr@hp.com>
Date: Tue, 21 Jan 2014 15:49:15 -0800
Subject: mm/mmap.c: add mlock_future_check() helper

Both do_brk and do_mmap_pgoff verify that we are actually capable of
locking future pages if the corresponding VM_LOCKED flags are used.
Encapsulate this logic into a single mlock_future_check() helper
function.

Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Cc: Rik van Riel <riel@redhat.com>
Reviewed-by: Michel Lespinasse <walken@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c | 45 +++++++++++++++++++++++----------------------
 1 file changed, 23 insertions(+), 22 deletions(-)

(limited to 'mm/mmap.c')

diff --git a/mm/mmap.c b/mm/mmap.c
index 39552de6e1db..a0e7153a79e6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1191,6 +1191,24 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
 	return hint;
 }
 
+static inline int mlock_future_check(struct mm_struct *mm,
+				     unsigned long flags,
+				     unsigned long len)
+{
+	unsigned long locked, lock_limit;
+
+	/*  mlock MCL_FUTURE? */
+	if (flags & VM_LOCKED) {
+		locked = len >> PAGE_SHIFT;
+		locked += mm->locked_vm;
+		lock_limit = rlimit(RLIMIT_MEMLOCK);
+		lock_limit >>= PAGE_SHIFT;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+			return -EAGAIN;
+	}
+	return 0;
+}
+
 /*
  * The caller must hold down_write(&current->mm->mmap_sem).
  */
@@ -1252,16 +1270,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 		if (!can_do_mlock())
 			return -EPERM;
 
-	/* mlock MCL_FUTURE? */
-	if (vm_flags & VM_LOCKED) {
-		unsigned long locked, lock_limit;
-		locked = len >> PAGE_SHIFT;
-		locked += mm->locked_vm;
-		lock_limit = rlimit(RLIMIT_MEMLOCK);
-		lock_limit >>= PAGE_SHIFT;
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-			return -EAGAIN;
-	}
+	if (mlock_future_check(mm, vm_flags, len))
+		return -EAGAIN;
 
 	if (file) {
 		struct inode *inode = file_inode(file);
@@ -2592,18 +2602,9 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 	if (error & ~PAGE_MASK)
 		return error;
 
-	/*
-	 * mlock MCL_FUTURE?
-	 */
-	if (mm->def_flags & VM_LOCKED) {
-		unsigned long locked, lock_limit;
-		locked = len >> PAGE_SHIFT;
-		locked += mm->locked_vm;
-		lock_limit = rlimit(RLIMIT_MEMLOCK);
-		lock_limit >>= PAGE_SHIFT;
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-			return -EAGAIN;
-	}
+	error = mlock_future_check(mm, mm->def_flags, len);
+	if (error)
+		return error;
 
 	/*
 	 * mm->mmap_sem is required to protect against another thread
-- 
cgit v1.2.3


From a64fb3cd610c8e6806512dbac63f3fc45812d8fd Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Thu, 23 Jan 2014 15:53:30 -0800
Subject: mm: audit/fix non-modular users of module_init in core code

Code that is obj-y (always built-in) or dependent on a bool Kconfig
(built-in or absent) can never be modular.  So using module_init as an
alias for __initcall can be somewhat misleading.

Fix these up now, so that we can relocate module_init from init.h into
module.h in the future.  If we don't do this, we'd have to add module.h
to obviously non-modular code, and that would be a worse thing.

The audit targets the following module_init users for change:
 mm/ksm.c                       bool KSM
 mm/mmap.c                      bool MMU
 mm/huge_memory.c               bool TRANSPARENT_HUGEPAGE
 mm/mmu_notifier.c              bool MMU_NOTIFIER

Note that direct use of __initcall is discouraged, vs.  one of the
priority categorized subgroups.  As __initcall gets mapped onto
device_initcall, our use of subsys_initcall (which makes sense for these
files) will thus change this registration from level 6-device to level
4-subsys (i.e.  slightly earlier).

However no observable impact of that difference has been observed during
testing.

One might think that core_initcall (l2) or postcore_initcall (l3) would
be more appropriate for anything in mm/ but if we look at some actual
init functions themselves, we see things like:

mm/huge_memory.c --> hugepage_init     --> hugepage_init_sysfs
mm/mmap.c        --> init_user_reserve --> sysctl_user_reserve_kbytes
mm/ksm.c         --> ksm_init          --> sysfs_create_group

and hence the choice of subsys_initcall (l4) seems reasonable, and at
the same time minimizes the risk of changing the priority too
drastically all at once.  We can adjust further in the future.

Also, several instances of missing ";" at EOL are fixed.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c  | 2 +-
 mm/ksm.c          | 2 +-
 mm/mmap.c         | 6 +++---
 mm/mmu_notifier.c | 3 +--
 4 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'mm/mmap.c')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index afe738358370..65c98eb5483c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -661,7 +661,7 @@ out:
 	hugepage_exit_sysfs(hugepage_kobj);
 	return err;
 }
-module_init(hugepage_init)
+subsys_initcall(hugepage_init);
 
 static int __init setup_transparent_hugepage(char *str)
 {
diff --git a/mm/ksm.c b/mm/ksm.c
index f91ddf5c3688..aa4c7c7250c1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2345,4 +2345,4 @@ out_free:
 out:
 	return err;
 }
-module_init(ksm_init)
+subsys_initcall(ksm_init);
diff --git a/mm/mmap.c b/mm/mmap.c
index a0e7153a79e6..126d8b976bfd 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3142,7 +3142,7 @@ static int init_user_reserve(void)
 	sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
 	return 0;
 }
-module_init(init_user_reserve)
+subsys_initcall(init_user_reserve);
 
 /*
  * Initialise sysctl_admin_reserve_kbytes.
@@ -3163,7 +3163,7 @@ static int init_admin_reserve(void)
 	sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
 	return 0;
 }
-module_init(init_admin_reserve)
+subsys_initcall(init_admin_reserve);
 
 /*
  * Reinititalise user and admin reserves if memory is added or removed.
@@ -3233,4 +3233,4 @@ static int __meminit init_reserve_notifier(void)
 
 	return 0;
 }
-module_init(init_reserve_notifier)
+subsys_initcall(init_reserve_notifier);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 93e6089cb456..41cefdf0aadd 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -329,5 +329,4 @@ static int __init mmu_notifier_init(void)
 {
 	return init_srcu_struct(&srcu);
 }
-
-module_init(mmu_notifier_init);
+subsys_initcall(mmu_notifier_init);
-- 
cgit v1.2.3


From 34228d473efe764d4db7c0536375f0c993e6e06a Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Thu, 23 Jan 2014 15:53:42 -0800
Subject: mm: ignore VM_SOFTDIRTY on VMA merging

The VM_SOFTDIRTY bit affects vma merge routine: if two VMAs has all bits
in vm_flags matched except dirty bit the kernel can't longer merge them
and this forces the kernel to generate new VMAs instead.

It finally may lead to the situation when userspace application reaches
vm.max_map_count limit and get crashed in worse case

 | (gimp:11768): GLib-ERROR **: gmem.c:110: failed to allocate 4096 bytes
 |
 | (file-tiff-load:12038): LibGimpBase-WARNING **: file-tiff-load: gimp_wire_read(): error
 | xinit: connection to X server lost
 |
 | waiting for X server to shut down
 | /usr/lib64/gimp/2.0/plug-ins/file-tiff-load terminated: Hangup
 | /usr/lib64/gimp/2.0/plug-ins/script-fu terminated: Hangup
 | /usr/lib64/gimp/2.0/plug-ins/script-fu terminated: Hangup

  https://bugzilla.kernel.org/show_bug.cgi?id=67651
  https://bugzilla.gnome.org/show_bug.cgi?id=719619#c0

Initial problem came from missed VM_SOFTDIRTY in do_brk() routine but
even if we would set up VM_SOFTDIRTY here, there is still a way to
prevent VMAs from merging: one can call

 | echo 4 > /proc/$PID/clear_refs

and clear all VM_SOFTDIRTY over all VMAs presented in memory map, then
new do_brk() will try to extend old VMA and finds that dirty bit doesn't
match thus new VMA will be generated.

As discussed with Pavel, the right approach should be to ignore
VM_SOFTDIRTY bit when we're trying to merge VMAs and if merge successed
we mark extended VMA with dirty bit where needed.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Reported-by: Bastian Hougaard <gnome@rvzt.net>
Reported-by: Mel Gorman <mgorman@suse.de>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'mm/mmap.c')

diff --git a/mm/mmap.c b/mm/mmap.c
index 126d8b976bfd..20ff0c33274c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -894,7 +894,15 @@ again:			remove_next = 1 + (end > next->vm_end);
 static inline int is_mergeable_vma(struct vm_area_struct *vma,
 			struct file *file, unsigned long vm_flags)
 {
-	if (vma->vm_flags ^ vm_flags)
+	/*
+	 * VM_SOFTDIRTY should not prevent from VMA merging, if we
+	 * match the flags but dirty bit -- the caller should mark
+	 * merged VMA as dirty. If dirty bit won't be excluded from
+	 * comparison, we increase pressue on the memory system forcing
+	 * the kernel to generate new VMAs when old one could be
+	 * extended instead.
+	 */
+	if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
 		return 0;
 	if (vma->vm_file != file)
 		return 0;
@@ -1083,7 +1091,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
 	return a->vm_end == b->vm_start &&
 		mpol_equal(vma_policy(a), vma_policy(b)) &&
 		a->vm_file == b->vm_file &&
-		!((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) &&
+		!((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) &&
 		b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
 }
 
-- 
cgit v1.2.3