From 49f0ce5f92321cdcf741e35f385669a421013cb7 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Tue, 21 Jan 2014 15:49:14 -0800 Subject: mm: add overcommit_kbytes sysctl variable Some applications that run on HPC clusters are designed around the availability of RAM and the overcommit ratio is fine tuned to get the maximum usage of memory without swapping. With growing memory, the 1%-of-all-RAM grain provided by overcommit_ratio has become too coarse for these workload (on a 2TB machine it represents no less than 20GB). This patch adds the new overcommit_kbytes sysctl variable that allow a much finer grain. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix nommu build] Signed-off-by: Jerome Marchand Cc: Dave Hansen Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 834b2d785f1e..39552de6e1db 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -86,6 +86,7 @@ EXPORT_SYMBOL(vm_get_page_prot); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ +unsigned long sysctl_overcommit_kbytes __read_mostly; int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ -- cgit v1.2.3 From 363ee17f0f405faff74d9aaf93d21d5f41d5102d Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 21 Jan 2014 15:49:15 -0800 Subject: mm/mmap.c: add mlock_future_check() helper Both do_brk and do_mmap_pgoff verify that we are actually capable of locking future pages if the corresponding VM_LOCKED flags are used. Encapsulate this logic into a single mlock_future_check() helper function. Signed-off-by: Davidlohr Bueso Cc: Rik van Riel Reviewed-by: Michel Lespinasse Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 39552de6e1db..a0e7153a79e6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1191,6 +1191,24 @@ static inline unsigned long round_hint_to_min(unsigned long hint) return hint; } +static inline int mlock_future_check(struct mm_struct *mm, + unsigned long flags, + unsigned long len) +{ + unsigned long locked, lock_limit; + + /* mlock MCL_FUTURE? */ + if (flags & VM_LOCKED) { + locked = len >> PAGE_SHIFT; + locked += mm->locked_vm; + lock_limit = rlimit(RLIMIT_MEMLOCK); + lock_limit >>= PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + return -EAGAIN; + } + return 0; +} + /* * The caller must hold down_write(¤t->mm->mmap_sem). */ @@ -1252,16 +1270,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (!can_do_mlock()) return -EPERM; - /* mlock MCL_FUTURE? */ - if (vm_flags & VM_LOCKED) { - unsigned long locked, lock_limit; - locked = len >> PAGE_SHIFT; - locked += mm->locked_vm; - lock_limit = rlimit(RLIMIT_MEMLOCK); - lock_limit >>= PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - return -EAGAIN; - } + if (mlock_future_check(mm, vm_flags, len)) + return -EAGAIN; if (file) { struct inode *inode = file_inode(file); @@ -2592,18 +2602,9 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) if (error & ~PAGE_MASK) return error; - /* - * mlock MCL_FUTURE? - */ - if (mm->def_flags & VM_LOCKED) { - unsigned long locked, lock_limit; - locked = len >> PAGE_SHIFT; - locked += mm->locked_vm; - lock_limit = rlimit(RLIMIT_MEMLOCK); - lock_limit >>= PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - return -EAGAIN; - } + error = mlock_future_check(mm, mm->def_flags, len); + if (error) + return error; /* * mm->mmap_sem is required to protect against another thread -- cgit v1.2.3 From a64fb3cd610c8e6806512dbac63f3fc45812d8fd Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 23 Jan 2014 15:53:30 -0800 Subject: mm: audit/fix non-modular users of module_init in core code Code that is obj-y (always built-in) or dependent on a bool Kconfig (built-in or absent) can never be modular. So using module_init as an alias for __initcall can be somewhat misleading. Fix these up now, so that we can relocate module_init from init.h into module.h in the future. If we don't do this, we'd have to add module.h to obviously non-modular code, and that would be a worse thing. The audit targets the following module_init users for change: mm/ksm.c bool KSM mm/mmap.c bool MMU mm/huge_memory.c bool TRANSPARENT_HUGEPAGE mm/mmu_notifier.c bool MMU_NOTIFIER Note that direct use of __initcall is discouraged, vs. one of the priority categorized subgroups. As __initcall gets mapped onto device_initcall, our use of subsys_initcall (which makes sense for these files) will thus change this registration from level 6-device to level 4-subsys (i.e. slightly earlier). However no observable impact of that difference has been observed during testing. One might think that core_initcall (l2) or postcore_initcall (l3) would be more appropriate for anything in mm/ but if we look at some actual init functions themselves, we see things like: mm/huge_memory.c --> hugepage_init --> hugepage_init_sysfs mm/mmap.c --> init_user_reserve --> sysctl_user_reserve_kbytes mm/ksm.c --> ksm_init --> sysfs_create_group and hence the choice of subsys_initcall (l4) seems reasonable, and at the same time minimizes the risk of changing the priority too drastically all at once. We can adjust further in the future. Also, several instances of missing ";" at EOL are fixed. Signed-off-by: Paul Gortmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- mm/ksm.c | 2 +- mm/mmap.c | 6 +++--- mm/mmu_notifier.c | 3 +-- 4 files changed, 6 insertions(+), 7 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index afe738358370..65c98eb5483c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -661,7 +661,7 @@ out: hugepage_exit_sysfs(hugepage_kobj); return err; } -module_init(hugepage_init) +subsys_initcall(hugepage_init); static int __init setup_transparent_hugepage(char *str) { diff --git a/mm/ksm.c b/mm/ksm.c index f91ddf5c3688..aa4c7c7250c1 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2345,4 +2345,4 @@ out_free: out: return err; } -module_init(ksm_init) +subsys_initcall(ksm_init); diff --git a/mm/mmap.c b/mm/mmap.c index a0e7153a79e6..126d8b976bfd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3142,7 +3142,7 @@ static int init_user_reserve(void) sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); return 0; } -module_init(init_user_reserve) +subsys_initcall(init_user_reserve); /* * Initialise sysctl_admin_reserve_kbytes. @@ -3163,7 +3163,7 @@ static int init_admin_reserve(void) sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); return 0; } -module_init(init_admin_reserve) +subsys_initcall(init_admin_reserve); /* * Reinititalise user and admin reserves if memory is added or removed. @@ -3233,4 +3233,4 @@ static int __meminit init_reserve_notifier(void) return 0; } -module_init(init_reserve_notifier) +subsys_initcall(init_reserve_notifier); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 93e6089cb456..41cefdf0aadd 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -329,5 +329,4 @@ static int __init mmu_notifier_init(void) { return init_srcu_struct(&srcu); } - -module_init(mmu_notifier_init); +subsys_initcall(mmu_notifier_init); -- cgit v1.2.3 From 34228d473efe764d4db7c0536375f0c993e6e06a Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 23 Jan 2014 15:53:42 -0800 Subject: mm: ignore VM_SOFTDIRTY on VMA merging The VM_SOFTDIRTY bit affects vma merge routine: if two VMAs has all bits in vm_flags matched except dirty bit the kernel can't longer merge them and this forces the kernel to generate new VMAs instead. It finally may lead to the situation when userspace application reaches vm.max_map_count limit and get crashed in worse case | (gimp:11768): GLib-ERROR **: gmem.c:110: failed to allocate 4096 bytes | | (file-tiff-load:12038): LibGimpBase-WARNING **: file-tiff-load: gimp_wire_read(): error | xinit: connection to X server lost | | waiting for X server to shut down | /usr/lib64/gimp/2.0/plug-ins/file-tiff-load terminated: Hangup | /usr/lib64/gimp/2.0/plug-ins/script-fu terminated: Hangup | /usr/lib64/gimp/2.0/plug-ins/script-fu terminated: Hangup https://bugzilla.kernel.org/show_bug.cgi?id=67651 https://bugzilla.gnome.org/show_bug.cgi?id=719619#c0 Initial problem came from missed VM_SOFTDIRTY in do_brk() routine but even if we would set up VM_SOFTDIRTY here, there is still a way to prevent VMAs from merging: one can call | echo 4 > /proc/$PID/clear_refs and clear all VM_SOFTDIRTY over all VMAs presented in memory map, then new do_brk() will try to extend old VMA and finds that dirty bit doesn't match thus new VMA will be generated. As discussed with Pavel, the right approach should be to ignore VM_SOFTDIRTY bit when we're trying to merge VMAs and if merge successed we mark extended VMA with dirty bit where needed. Signed-off-by: Cyrill Gorcunov Reported-by: Bastian Hougaard Reported-by: Mel Gorman Cc: Pavel Emelyanov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 126d8b976bfd..20ff0c33274c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -894,7 +894,15 @@ again: remove_next = 1 + (end > next->vm_end); static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) { - if (vma->vm_flags ^ vm_flags) + /* + * VM_SOFTDIRTY should not prevent from VMA merging, if we + * match the flags but dirty bit -- the caller should mark + * merged VMA as dirty. If dirty bit won't be excluded from + * comparison, we increase pressue on the memory system forcing + * the kernel to generate new VMAs when old one could be + * extended instead. + */ + if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY) return 0; if (vma->vm_file != file) return 0; @@ -1083,7 +1091,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct * return a->vm_end == b->vm_start && mpol_equal(vma_policy(a), vma_policy(b)) && a->vm_file == b->vm_file && - !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) && + !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) && b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); } -- cgit v1.2.3