From e04c5d76b0cfb66cadd900cf147526f2271884b8 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 10 Jul 2013 22:21:57 -0300 Subject: remove sched notifier for cross-cpu migrations Linux as a guest on KVM hypervisor, the only user of the pvclock vsyscall interface, does not require notification on task migration because: 1. cpu ID number maps 1:1 to per-CPU pvclock time info. 2. per-CPU pvclock time info is updated if the underlying CPU changes. 3. that version is increased whenever underlying CPU changes. Which is sufficient to guarantee nanoseconds counter is calculated properly. Signed-off-by: Marcelo Tosatti Acked-by: Peter Zijlstra Signed-off-by: Gleb Natapov --- include/linux/sched.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 50d04b92ceda..bfc809d51745 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -107,14 +107,6 @@ extern unsigned long this_cpu_load(void); extern void calc_global_load(unsigned long ticks); extern void update_cpu_load_nohz(void); -/* Notifier for when a task gets migrated to a new CPU */ -struct task_migration_notifier { - struct task_struct *task; - int from_cpu; - int to_cpu; -}; -extern void register_task_migration_notifier(struct notifier_block *n); - extern unsigned long get_parent_ip(unsigned long addr); extern void dump_cpu_task(int cpu); -- cgit v1.2.3 From 62470419e993f8d9d93db0effd3af4296ecb79a5 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 4 Jul 2013 12:55:51 +0800 Subject: sched: Implement smarter wake-affine logic The wake-affine scheduler feature is currently always trying to pull the wakee close to the waker. In theory this should be beneficial if the waker's CPU caches hot data for the wakee, and it's also beneficial in the extreme ping-pong high context switch rate case. Testing shows it can benefit hackbench up to 15%. However, the feature is somewhat blind, from which some workloads such as pgbench suffer. It's also time-consuming algorithmically. Testing shows it can damage pgbench up to 50% - far more than the benefit it brings in the best case. So wake-affine should be smarter and it should realize when to stop its thankless effort at trying to find a suitable CPU to wake on. This patch introduces 'wakee_flips', which will be increased each time the task flips (switches) its wakee target. So a high 'wakee_flips' value means the task has more than one wakee, and the bigger the number, the higher the wakeup frequency. Now when making the decision on whether to pull or not, pay attention to the wakee with a high 'wakee_flips', pulling such a task may benefit the wakee. Also imply that the waker will face cruel competition later, it could be very cruel or very fast depends on the story behind 'wakee_flips', waker therefore suffers. Furthermore, if waker also has a high 'wakee_flips', that implies that multiple tasks rely on it, then waker's higher latency will damage all of them, so pulling wakee seems to be a bad deal. Thus, when 'waker->wakee_flips / wakee->wakee_flips' becomes higher and higher, the cost of pulling seems to be worse and worse. The patch therefore helps the wake-affine feature to stop its pulling work when: wakee->wakee_flips > factor && waker->wakee_flips > (factor * wakee->wakee_flips) The 'factor' here is the number of CPUs in the current CPU's NUMA node, so a bigger node will lead to more pulling since the trial becomes more severe. After applying the patch, pgbench shows up to 40% improvements and no regressions. Tested with 12 cpu x86 server and tip 3.10.0-rc7. The percentages in the final column highlight the areas with the biggest wins, all other areas improved as well: pgbench base smart | db_size | clients | tps | | tps | +---------+---------+-------+ +-------+ | 22 MB | 1 | 10598 | | 10796 | | 22 MB | 2 | 21257 | | 21336 | | 22 MB | 4 | 41386 | | 41622 | | 22 MB | 8 | 51253 | | 57932 | | 22 MB | 12 | 48570 | | 54000 | | 22 MB | 16 | 46748 | | 55982 | +19.75% | 22 MB | 24 | 44346 | | 55847 | +25.93% | 22 MB | 32 | 43460 | | 54614 | +25.66% | 7484 MB | 1 | 8951 | | 9193 | | 7484 MB | 2 | 19233 | | 19240 | | 7484 MB | 4 | 37239 | | 37302 | | 7484 MB | 8 | 46087 | | 50018 | | 7484 MB | 12 | 42054 | | 48763 | | 7484 MB | 16 | 40765 | | 51633 | +26.66% | 7484 MB | 24 | 37651 | | 52377 | +39.11% | 7484 MB | 32 | 37056 | | 51108 | +37.92% | 15 GB | 1 | 8845 | | 9104 | | 15 GB | 2 | 19094 | | 19162 | | 15 GB | 4 | 36979 | | 36983 | | 15 GB | 8 | 46087 | | 49977 | | 15 GB | 12 | 41901 | | 48591 | | 15 GB | 16 | 40147 | | 50651 | +26.16% | 15 GB | 24 | 37250 | | 52365 | +40.58% | 15 GB | 32 | 36470 | | 50015 | +37.14% Signed-off-by: Michael Wang Cc: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/51D50057.9000809@linux.vnet.ibm.com [ Improved the changelog. ] Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +++ kernel/sched/fair.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 50d04b92ceda..4f163a8ffabf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1034,6 +1034,9 @@ struct task_struct { #ifdef CONFIG_SMP struct llist_node wake_entry; int on_cpu; + struct task_struct *last_wakee; + unsigned long wakee_flips; + unsigned long wakee_flip_decay_ts; #endif int on_rq; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 765d87acdf05..860063a8c849 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3017,6 +3017,23 @@ static unsigned long cpu_avg_load_per_task(int cpu) return 0; } +static void record_wakee(struct task_struct *p) +{ + /* + * Rough decay (wiping) for cost saving, don't worry + * about the boundary, really active task won't care + * about the loss. + */ + if (jiffies > current->wakee_flip_decay_ts + HZ) { + current->wakee_flips = 0; + current->wakee_flip_decay_ts = jiffies; + } + + if (current->last_wakee != p) { + current->last_wakee = p; + current->wakee_flips++; + } +} static void task_waking_fair(struct task_struct *p) { @@ -3037,6 +3054,7 @@ static void task_waking_fair(struct task_struct *p) #endif se->vruntime -= min_vruntime; + record_wakee(p); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -3155,6 +3173,28 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, #endif +static int wake_wide(struct task_struct *p) +{ + int factor = nr_cpus_node(cpu_to_node(smp_processor_id())); + + /* + * Yeah, it's the switching-frequency, could means many wakee or + * rapidly switch, use factor here will just help to automatically + * adjust the loose-degree, so bigger node will lead to more pull. + */ + if (p->wakee_flips > factor) { + /* + * wakee is somewhat hot, it needs certain amount of cpu + * resource, so if waker is far more hot, prefer to leave + * it alone. + */ + if (current->wakee_flips > (factor * p->wakee_flips)) + return 1; + } + + return 0; +} + static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { s64 this_load, load; @@ -3164,6 +3204,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) unsigned long weight; int balanced; + /* + * If we wake multiple tasks be careful to not bounce + * ourselves around too much. + */ + if (wake_wide(p)) + return 0; + idx = sd->wake_idx; this_cpu = smp_processor_id(); prev_cpu = task_cpu(p); -- cgit v1.2.3 From e1403b8edf669ff49bbdf602cc97fefa2760cb15 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:20:06 -0700 Subject: include/linux/sched.h: don't use task->pid/tgid in same_thread_group/has_group_leader_pid task_struct->pid/tgid should go away. 1. Change same_thread_group() to use task->signal for comparison. 2. Change has_group_leader_pid(task) to compare task_pid(task) with signal->leader_pid. Signed-off-by: Oleg Nesterov Cc: Michal Hocko Cc: Sergey Dyasly Reviewed-by: "Eric W. Biederman" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index ce1e1c0aaa33..45f254dddafc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2169,15 +2169,15 @@ static inline bool thread_group_leader(struct task_struct *p) * all we care about is that we have a task with the appropriate * pid, we don't actually care if we have the right task. */ -static inline int has_group_leader_pid(struct task_struct *p) +static inline bool has_group_leader_pid(struct task_struct *p) { - return p->pid == p->tgid; + return task_pid(p) == p->signal->leader_pid; } static inline -int same_thread_group(struct task_struct *p1, struct task_struct *p2) +bool same_thread_group(struct task_struct *p1, struct task_struct *p2) { - return p1->tgid == p2->tgid; + return p1->signal == p2->signal; } static inline struct task_struct *next_thread(const struct task_struct *p) -- cgit v1.2.3 From 519e52473ebe9db5cdef44670d5a97f1fd53d721 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Sep 2013 15:13:42 -0700 Subject: mm: memcg: enable memcg OOM killer only for user faults System calls and kernel faults (uaccess, gup) can handle an out of memory situation gracefully and just return -ENOMEM. Enable the memcg OOM killer only for user faults, where it's really the only option available. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: azurIt Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 3 +++ mm/filemap.c | 11 ++++++++++- mm/memcontrol.c | 2 +- mm/memory.c | 40 ++++++++++++++++++++++++++++++---------- 5 files changed, 88 insertions(+), 12 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d4d1f9b0dbba..34ac6497d01a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -151,6 +151,37 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, extern void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage); +/** + * mem_cgroup_toggle_oom - toggle the memcg OOM killer for the current task + * @new: true to enable, false to disable + * + * Toggle whether a failed memcg charge should invoke the OOM killer + * or just return -ENOMEM. Returns the previous toggle state. + */ +static inline bool mem_cgroup_toggle_oom(bool new) +{ + bool old; + + old = current->memcg_oom.may_oom; + current->memcg_oom.may_oom = new; + + return old; +} + +static inline void mem_cgroup_enable_oom(void) +{ + bool old = mem_cgroup_toggle_oom(true); + + WARN_ON(old == true); +} + +static inline void mem_cgroup_disable_oom(void) +{ + bool old = mem_cgroup_toggle_oom(false); + + WARN_ON(old == false); +} + #ifdef CONFIG_MEMCG_SWAP extern int do_swap_account; #endif @@ -383,6 +414,19 @@ static inline void mem_cgroup_end_update_page_stat(struct page *page, { } +static inline bool mem_cgroup_toggle_oom(bool new) +{ + return false; +} + +static inline void mem_cgroup_enable_oom(void) +{ +} + +static inline void mem_cgroup_disable_oom(void) +{ +} + static inline void mem_cgroup_inc_page_stat(struct page *page, enum mem_cgroup_page_stat_item idx) { diff --git a/include/linux/sched.h b/include/linux/sched.h index 45f254dddafc..9ce1fa53031f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1393,6 +1393,9 @@ struct task_struct { unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ } memcg_batch; unsigned int memcg_kmem_skip_account; + struct memcg_oom_info { + unsigned int may_oom:1; + } memcg_oom; #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; diff --git a/mm/filemap.c b/mm/filemap.c index e607728db4a8..e3b6fc8c0b7b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1614,6 +1614,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; struct page *page; + bool memcg_oom; pgoff_t size; int ret = 0; @@ -1622,7 +1623,11 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_SIGBUS; /* - * Do we have something in the page cache already? + * Do we have something in the page cache already? Either + * way, try readahead, but disable the memcg OOM killer for it + * as readahead is optional and no errors are propagated up + * the fault stack. The OOM killer is enabled while trying to + * instantiate the faulting page individually below. */ page = find_get_page(mapping, offset); if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { @@ -1630,10 +1635,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * We found the page, so try async readahead before * waiting for the lock. */ + memcg_oom = mem_cgroup_toggle_oom(false); do_async_mmap_readahead(vma, ra, file, page, offset); + mem_cgroup_toggle_oom(memcg_oom); } else if (!page) { /* No page in the page cache at all */ + memcg_oom = mem_cgroup_toggle_oom(false); do_sync_mmap_readahead(vma, ra, file, offset); + mem_cgroup_toggle_oom(memcg_oom); count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c4524458b7d0..0980bbf6438d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2454,7 +2454,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, return CHARGE_RETRY; /* If we don't need to call oom-killer at el, return immediately */ - if (!oom_check) + if (!oom_check || !current->memcg_oom.may_oom) return CHARGE_NOMEM; /* check OOM */ if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize))) diff --git a/mm/memory.c b/mm/memory.c index 2b73dbde2274..a8f9deab8719 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3754,22 +3754,14 @@ unlock: /* * By the time we get here, we already hold the mm semaphore */ -int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, unsigned int flags) +static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, unsigned int flags) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - __set_current_state(TASK_RUNNING); - - count_vm_event(PGFAULT); - mem_cgroup_count_vm_event(mm, PGFAULT); - - /* do counter updates before entering really critical section. */ - check_sync_rss_stat(current); - if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); @@ -3850,6 +3842,34 @@ retry: return handle_pte_fault(mm, vma, address, pte, pmd, flags); } +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + int ret; + + __set_current_state(TASK_RUNNING); + + count_vm_event(PGFAULT); + mem_cgroup_count_vm_event(mm, PGFAULT); + + /* do counter updates before entering really critical section. */ + check_sync_rss_stat(current); + + /* + * Enable the memcg OOM handling for faults triggered in user + * space. Kernel faults are handled more gracefully. + */ + if (flags & FAULT_FLAG_USER) + mem_cgroup_enable_oom(); + + ret = __handle_mm_fault(mm, vma, address, flags); + + if (flags & FAULT_FLAG_USER) + mem_cgroup_disable_oom(); + + return ret; +} + #ifndef __PAGETABLE_PUD_FOLDED /* * Allocate page upper directory. -- cgit v1.2.3 From 3812c8c8f3953921ef18544110dafc3505c1ac62 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Sep 2013 15:13:44 -0700 Subject: mm: memcg: do not trap chargers with full callstack on OOM The memcg OOM handling is incredibly fragile and can deadlock. When a task fails to charge memory, it invokes the OOM killer and loops right there in the charge code until it succeeds. Comparably, any other task that enters the charge path at this point will go to a waitqueue right then and there and sleep until the OOM situation is resolved. The problem is that these tasks may hold filesystem locks and the mmap_sem; locks that the selected OOM victim may need to exit. For example, in one reported case, the task invoking the OOM killer was about to charge a page cache page during a write(), which holds the i_mutex. The OOM killer selected a task that was just entering truncate() and trying to acquire the i_mutex: OOM invoking task: mem_cgroup_handle_oom+0x241/0x3b0 mem_cgroup_cache_charge+0xbe/0xe0 add_to_page_cache_locked+0x4c/0x140 add_to_page_cache_lru+0x22/0x50 grab_cache_page_write_begin+0x8b/0xe0 ext3_write_begin+0x88/0x270 generic_file_buffered_write+0x116/0x290 __generic_file_aio_write+0x27c/0x480 generic_file_aio_write+0x76/0xf0 # takes ->i_mutex do_sync_write+0xea/0x130 vfs_write+0xf3/0x1f0 sys_write+0x51/0x90 system_call_fastpath+0x18/0x1d OOM kill victim: do_truncate+0x58/0xa0 # takes i_mutex do_last+0x250/0xa30 path_openat+0xd7/0x440 do_filp_open+0x49/0xa0 do_sys_open+0x106/0x240 sys_open+0x20/0x30 system_call_fastpath+0x18/0x1d The OOM handling task will retry the charge indefinitely while the OOM killed task is not releasing any resources. A similar scenario can happen when the kernel OOM killer for a memcg is disabled and a userspace task is in charge of resolving OOM situations. In this case, ALL tasks that enter the OOM path will be made to sleep on the OOM waitqueue and wait for userspace to free resources or increase the group's limit. But a userspace OOM handler is prone to deadlock itself on the locks held by the waiting tasks. For example one of the sleeping tasks may be stuck in a brk() call with the mmap_sem held for writing but the userspace handler, in order to pick an optimal victim, may need to read files from /proc/, which tries to acquire the same mmap_sem for reading and deadlocks. This patch changes the way tasks behave after detecting a memcg OOM and makes sure nobody loops or sleeps with locks held: 1. When OOMing in a user fault, invoke the OOM killer and restart the fault instead of looping on the charge attempt. This way, the OOM victim can not get stuck on locks the looping task may hold. 2. When OOMing in a user fault but somebody else is handling it (either the kernel OOM killer or a userspace handler), don't go to sleep in the charge context. Instead, remember the OOMing memcg in the task struct and then fully unwind the page fault stack with -ENOMEM. pagefault_out_of_memory() will then call back into the memcg code to check if the -ENOMEM came from the memcg, and then either put the task to sleep on the memcg's OOM waitqueue or just restart the fault. The OOM victim can no longer get stuck on any lock a sleeping task may hold. Debugged by Michal Hocko. Signed-off-by: Johannes Weiner Reported-by: azurIt Acked-by: Michal Hocko Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 21 +++++++ include/linux/sched.h | 4 ++ mm/memcontrol.c | 154 +++++++++++++++++++++++++++++++-------------- mm/memory.c | 3 + mm/oom_kill.c | 7 ++- 5 files changed, 140 insertions(+), 49 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 34ac6497d01a..89d576cfcc4c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -157,6 +157,10 @@ extern void mem_cgroup_replace_page_cache(struct page *oldpage, * * Toggle whether a failed memcg charge should invoke the OOM killer * or just return -ENOMEM. Returns the previous toggle state. + * + * NOTE: Any path that enables the OOM killer before charging must + * call mem_cgroup_oom_synchronize() afterward to finalize the + * OOM handling and clean up. */ static inline bool mem_cgroup_toggle_oom(bool new) { @@ -182,6 +186,13 @@ static inline void mem_cgroup_disable_oom(void) WARN_ON(old == false); } +static inline bool task_in_memcg_oom(struct task_struct *p) +{ + return p->memcg_oom.in_memcg_oom; +} + +bool mem_cgroup_oom_synchronize(void); + #ifdef CONFIG_MEMCG_SWAP extern int do_swap_account; #endif @@ -427,6 +438,16 @@ static inline void mem_cgroup_disable_oom(void) { } +static inline bool task_in_memcg_oom(struct task_struct *p) +{ + return false; +} + +static inline bool mem_cgroup_oom_synchronize(void) +{ + return false; +} + static inline void mem_cgroup_inc_page_stat(struct page *page, enum mem_cgroup_page_stat_item idx) { diff --git a/include/linux/sched.h b/include/linux/sched.h index 9ce1fa53031f..6682da36b293 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1395,6 +1395,10 @@ struct task_struct { unsigned int memcg_kmem_skip_account; struct memcg_oom_info { unsigned int may_oom:1; + unsigned int in_memcg_oom:1; + unsigned int oom_locked:1; + int wakeups; + struct mem_cgroup *wait_on_memcg; } memcg_oom; #endif #ifdef CONFIG_UPROBES diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 04250cbf46c6..4b5cfb509270 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -255,6 +255,7 @@ struct mem_cgroup { bool oom_lock; atomic_t under_oom; + atomic_t oom_wakeups; int swappiness; /* OOM-Killer disable */ @@ -2020,6 +2021,7 @@ static int memcg_oom_wake_function(wait_queue_t *wait, static void memcg_wakeup_oom(struct mem_cgroup *memcg) { + atomic_inc(&memcg->oom_wakeups); /* for filtering, pass "memcg" as argument. */ __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); } @@ -2031,19 +2033,17 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) } /* - * try to call OOM killer. returns false if we should exit memory-reclaim loop. + * try to call OOM killer */ -static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, - int order) +static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - struct oom_wait_info owait; bool locked; + int wakeups; - owait.memcg = memcg; - owait.wait.flags = 0; - owait.wait.func = memcg_oom_wake_function; - owait.wait.private = current; - INIT_LIST_HEAD(&owait.wait.task_list); + if (!current->memcg_oom.may_oom) + return; + + current->memcg_oom.in_memcg_oom = 1; /* * As with any blocking lock, a contender needs to start @@ -2051,12 +2051,8 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, * otherwise it can miss the wakeup from the unlock and sleep * indefinitely. This is just open-coded because our locking * is so particular to memcg hierarchies. - * - * Even if signal_pending(), we can't quit charge() loop without - * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL - * under OOM is always welcomed, use TASK_KILLABLE here. */ - prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); + wakeups = atomic_read(&memcg->oom_wakeups); mem_cgroup_mark_under_oom(memcg); locked = mem_cgroup_oom_trylock(memcg); @@ -2066,15 +2062,95 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, if (locked && !memcg->oom_kill_disable) { mem_cgroup_unmark_under_oom(memcg); - finish_wait(&memcg_oom_waitq, &owait.wait); mem_cgroup_out_of_memory(memcg, mask, order); + mem_cgroup_oom_unlock(memcg); + /* + * There is no guarantee that an OOM-lock contender + * sees the wakeups triggered by the OOM kill + * uncharges. Wake any sleepers explicitely. + */ + memcg_oom_recover(memcg); } else { - schedule(); - mem_cgroup_unmark_under_oom(memcg); - finish_wait(&memcg_oom_waitq, &owait.wait); + /* + * A system call can just return -ENOMEM, but if this + * is a page fault and somebody else is handling the + * OOM already, we need to sleep on the OOM waitqueue + * for this memcg until the situation is resolved. + * Which can take some time because it might be + * handled by a userspace task. + * + * However, this is the charge context, which means + * that we may sit on a large call stack and hold + * various filesystem locks, the mmap_sem etc. and we + * don't want the OOM handler to deadlock on them + * while we sit here and wait. Store the current OOM + * context in the task_struct, then return -ENOMEM. + * At the end of the page fault handler, with the + * stack unwound, pagefault_out_of_memory() will check + * back with us by calling + * mem_cgroup_oom_synchronize(), possibly putting the + * task to sleep. + */ + current->memcg_oom.oom_locked = locked; + current->memcg_oom.wakeups = wakeups; + css_get(&memcg->css); + current->memcg_oom.wait_on_memcg = memcg; } +} + +/** + * mem_cgroup_oom_synchronize - complete memcg OOM handling + * + * This has to be called at the end of a page fault if the the memcg + * OOM handler was enabled and the fault is returning %VM_FAULT_OOM. + * + * Memcg supports userspace OOM handling, so failed allocations must + * sleep on a waitqueue until the userspace task resolves the + * situation. Sleeping directly in the charge context with all kinds + * of locks held is not a good idea, instead we remember an OOM state + * in the task and mem_cgroup_oom_synchronize() has to be called at + * the end of the page fault to put the task to sleep and clean up the + * OOM state. + * + * Returns %true if an ongoing memcg OOM situation was detected and + * finalized, %false otherwise. + */ +bool mem_cgroup_oom_synchronize(void) +{ + struct oom_wait_info owait; + struct mem_cgroup *memcg; + + /* OOM is global, do not handle */ + if (!current->memcg_oom.in_memcg_oom) + return false; + + /* + * We invoked the OOM killer but there is a chance that a kill + * did not free up any charges. Everybody else might already + * be sleeping, so restart the fault and keep the rampage + * going until some charges are released. + */ + memcg = current->memcg_oom.wait_on_memcg; + if (!memcg) + goto out; + + if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) + goto out_memcg; + + owait.memcg = memcg; + owait.wait.flags = 0; + owait.wait.func = memcg_oom_wake_function; + owait.wait.private = current; + INIT_LIST_HEAD(&owait.wait.task_list); - if (locked) { + prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); + /* Only sleep if we didn't miss any wakeups since OOM */ + if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups) + schedule(); + finish_wait(&memcg_oom_waitq, &owait.wait); +out_memcg: + mem_cgroup_unmark_under_oom(memcg); + if (current->memcg_oom.oom_locked) { mem_cgroup_oom_unlock(memcg); /* * There is no guarantee that an OOM-lock contender @@ -2083,11 +2159,10 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, */ memcg_oom_recover(memcg); } - - if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) - return false; - /* Give chance to dying process */ - schedule_timeout_uninterruptible(1); + css_put(&memcg->css); + current->memcg_oom.wait_on_memcg = NULL; +out: + current->memcg_oom.in_memcg_oom = 0; return true; } @@ -2400,12 +2475,11 @@ enum { CHARGE_RETRY, /* need to retry but retry is not bad */ CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ - CHARGE_OOM_DIE, /* the current is killed because of OOM */ }; static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int nr_pages, unsigned int min_pages, - bool oom_check) + bool invoke_oom) { unsigned long csize = nr_pages * PAGE_SIZE; struct mem_cgroup *mem_over_limit; @@ -2462,14 +2536,10 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, if (mem_cgroup_wait_acct_move(mem_over_limit)) return CHARGE_RETRY; - /* If we don't need to call oom-killer at el, return immediately */ - if (!oom_check || !current->memcg_oom.may_oom) - return CHARGE_NOMEM; - /* check OOM */ - if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize))) - return CHARGE_OOM_DIE; + if (invoke_oom) + mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); - return CHARGE_RETRY; + return CHARGE_NOMEM; } /* @@ -2572,7 +2642,7 @@ again: } do { - bool oom_check; + bool invoke_oom = oom && !nr_oom_retries; /* If killed, bypass charge */ if (fatal_signal_pending(current)) { @@ -2580,14 +2650,8 @@ again: goto bypass; } - oom_check = false; - if (oom && !nr_oom_retries) { - oom_check = true; - nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; - } - - ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, - oom_check); + ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, + nr_pages, invoke_oom); switch (ret) { case CHARGE_OK: break; @@ -2600,16 +2664,12 @@ again: css_put(&memcg->css); goto nomem; case CHARGE_NOMEM: /* OOM routine works */ - if (!oom) { + if (!oom || invoke_oom) { css_put(&memcg->css); goto nomem; } - /* If oom, we never return -ENOMEM */ nr_oom_retries--; break; - case CHARGE_OOM_DIE: /* Killed by OOM Killer */ - css_put(&memcg->css); - goto bypass; } } while (ret != CHARGE_OK); diff --git a/mm/memory.c b/mm/memory.c index a8f9deab8719..5ec6f199e685 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3867,6 +3867,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_USER) mem_cgroup_disable_oom(); + if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))) + mem_cgroup_oom_synchronize(); + return ret; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 98e75f2ac7bc..314e9d274381 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -678,9 +678,12 @@ out: */ void pagefault_out_of_memory(void) { - struct zonelist *zonelist = node_zonelist(first_online_node, - GFP_KERNEL); + struct zonelist *zonelist; + if (mem_cgroup_oom_synchronize()) + return; + + zonelist = node_zonelist(first_online_node, GFP_KERNEL); if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { out_of_memory(NULL, 0, 0, NULL, false); clear_zonelist_oom(zonelist, GFP_KERNEL); -- cgit v1.2.3