From 1b604d75bbb6e28628c5a95a433432973c33d581 Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Mon, 14 Dec 2009 17:57:47 -0800
Subject: oom: dump stack and VM state when oom killer panics

The oom killer header, including information such as the allocation order
and gfp mask, current's cpuset and memory controller, call trace, and VM
state information is currently only shown when the oom killer has selected
a task to kill.

This information is omitted, however, when the oom killer panics either
because of panic_on_oom sysctl settings or when no killable task was found.
It is still relevant to know crucial pieces of information such as the
allocation order and VM state when diagnosing such issues, especially at
boot.

This patch displays the oom killer header whenever it panics so that bug
reports can include pertinent information to debug the issue, if possible.

Signed-off-by: David Rientjes
Reviewed-by: KOSAKI Motohiro
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/oom_kill.c | 40 ++++++++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 16 deletions(-)

(limited to 'mm/oom_kill.c')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ea2147dabba6..492c98624fc1 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -337,6 +337,21 @@ static void dump_tasks(const struct mem_cgroup *mem)
 	} while_each_thread(g, p);
 }
 
+static void dump_header(gfp_t gfp_mask, int order, struct mem_cgroup *mem)
+{
+	pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
+		"oom_adj=%d\n",
+		current->comm, gfp_mask, order, current->signal->oom_adj);
+	task_lock(current);
+	cpuset_print_task_mems_allowed(current);
+	task_unlock(current);
+	dump_stack();
+	mem_cgroup_print_oom_info(mem, current);
+	show_mem();
+	if (sysctl_oom_dump_tasks)
+		dump_tasks(mem);
+}
+
 /*
  * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
  * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
@@ -395,20 +410,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 {
 	struct task_struct *c;
 
-	if (printk_ratelimit()) {
-		printk(KERN_WARNING "%s invoked oom-killer: "
-			"gfp_mask=0x%x, order=%d, oom_adj=%d\n",
-			current->comm, gfp_mask, order,
-			current->signal->oom_adj);
-		task_lock(current);
-		cpuset_print_task_mems_allowed(current);
-		task_unlock(current);
-		dump_stack();
-		mem_cgroup_print_oom_info(mem, current);
-		show_mem();
-		if (sysctl_oom_dump_tasks)
-			dump_tasks(mem);
-	}
+	if (printk_ratelimit())
+		dump_header(gfp_mask, order, mem);
 
 	/*
 	 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -544,6 +547,7 @@ retry:
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!p) {
 		read_unlock(&tasklist_lock);
+		dump_header(gfp_mask, order, NULL);
 		panic("Out of memory and no killable processes...\n");
 	}
 
@@ -609,8 +613,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 		/* Got some memory back in the last second. */
 		return;
 
-	if (sysctl_panic_on_oom == 2)
+	if (sysctl_panic_on_oom == 2) {
+		dump_header(gfp_mask, order, NULL);
 		panic("out of memory. Compulsory panic_on_oom is selected.\n");
+	}
 
 	/*
 	 * Check if there were limitations on the allocation (only relevant for
@@ -626,8 +632,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 		break;
 
 	case CONSTRAINT_NONE:
-		if (sysctl_panic_on_oom)
+		if (sysctl_panic_on_oom) {
+			dump_header(gfp_mask, order, NULL);
 			panic("out of memory. panic_on_oom is selected\n");
+		}
 		/* Fall-through */
 	case CONSTRAINT_CPUSET:
 		__out_of_memory(gfp_mask, order);
--
cgit v1.2.3
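The pattern this patch applies — one helper that emits the full diagnostic
header, called from both the rate-limited kill path and the unconditional
panic paths — can be sketched in plain userspace C. The names and the crude
5-second rate limit below are illustrative stand-ins, not the kernel's
implementation:

/*
 * Minimal userspace sketch (not kernel code): the diagnostic header is
 * factored into one helper so that the noisy kill path and the
 * about-to-die panic paths print the same information.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static void dump_header(unsigned int gfp_mask, int order)
{
	/* One place that prints everything needed to debug an OOM report. */
	fprintf(stderr, "oom: gfp_mask=0x%x, order=%d\n", gfp_mask, order);
	/* ... stack trace, cpuset, memcg and VM state would follow here ... */
}

static int ratelimited(void)
{
	/* Crude stand-in for printk_ratelimit(): at most one dump per 5s. */
	static time_t last;
	time_t now = time(NULL);

	if (now - last < 5)
		return 0;
	last = now;
	return 1;
}

static void kill_path(unsigned int gfp_mask, int order)
{
	if (ratelimited())
		dump_header(gfp_mask, order);	/* noisy path: rate-limited */
	fprintf(stderr, "killing a task\n");
}

static void panic_path(unsigned int gfp_mask, int order)
{
	dump_header(gfp_mask, order);		/* about to die: always dump */
	fprintf(stderr, "panic: out of memory\n");
	exit(1);
}

int main(void)
{
	kill_path(0x201da, 0);
	panic_path(0x201da, 0);
	return 0;
}

Keeping the panic paths unconditional matters because a rate-limited dump
could be suppressed at exactly the moment the machine is about to die, with
no later chance to log anything.
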
From 3b4798cbc13dd8d1150aa6377f97f0e11450a67d Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro
Date: Tue, 15 Dec 2009 16:45:32 -0800
Subject: oom-kill: show virtual size and rss information of the killed process

In a typical oom analysis scenario, the first thing we want to know is
whether the killed process had a memory leak or not.  This patch adds vsz
and rss information to the oom log to help with that analysis and save
debugging time.

example:
===================================================================
rsyslogd invoked oom-killer: gfp_mask=0x201da, order=0, oom_adj=0
Pid: 1308, comm: rsyslogd Not tainted 2.6.32-rc6 #24
Call Trace:
[] ?_spin_unlock+0x2b/0x40
[] oom_kill_process+0xbe/0x2b0
(snip)
492283 pages non-shared
Out of memory: kill process 2341 (memhog) score 527276 or a child
Killed process 2341 (memhog) vsz:1054552kB, anon-rss:970588kB, file-rss:4kB
===========================================================================
                                          ^
                                          |
                                         here

[rientjes@google.com: fix race, add pid & comm to message]
Signed-off-by: KOSAKI Motohiro
Signed-off-by: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/oom_kill.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'mm/oom_kill.c')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 492c98624fc1..6bb8a7a7ec9a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -352,6 +352,8 @@ static void dump_header(gfp_t gfp_mask, int order, struct mem_cgroup *mem)
 		dump_tasks(mem);
 }
 
+#define K(x) ((x) << (PAGE_SHIFT-10))
+
 /*
  * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
  * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
@@ -365,15 +367,23 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
 		return;
 	}
 
+	task_lock(p);
 	if (!p->mm) {
 		WARN_ON(1);
-		printk(KERN_WARNING "tried to kill an mm-less task!\n");
+		printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n",
+			task_pid_nr(p), p->comm);
+		task_unlock(p);
 		return;
 	}
 
 	if (verbose)
-		printk(KERN_ERR "Killed process %d (%s)\n",
-				task_pid_nr(p), p->comm);
+		printk(KERN_ERR "Killed process %d (%s) "
+			"vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
+			task_pid_nr(p), p->comm,
+			K(p->mm->total_vm),
+			K(get_mm_counter(p->mm, anon_rss)),
+			K(get_mm_counter(p->mm, file_rss)));
+	task_unlock(p);
 
 	/*
 	 * We give our sacrificial lamb high priority and access to
--
cgit v1.2.3
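The K() macro introduced above turns a page count into kilobytes with a
shift, since a page is 2^PAGE_SHIFT bytes and a kilobyte is 2^10 bytes. A
minimal userspace sketch of the same arithmetic follows; the page count of
263638 is a made-up value chosen so that, with 4 KiB pages, it reproduces
the vsz:1054552kB figure from the example log:

/*
 * Userspace sketch of K(x) == ((x) << (PAGE_SHIFT - 10)): pages -> kB.
 * PAGE_SHIFT is derived at run time here; in the kernel it is a
 * compile-time constant.
 */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);	/* typically 4096 */
	int page_shift = 0;
	unsigned long total_vm = 263638;	/* pages, hypothetical value */

	while ((1L << page_shift) < page_size)
		page_shift++;

	/* With 4 KiB pages this prints "vsz:1054552kB". */
	printf("vsz:%lukB\n", total_vm << (page_shift - 10));
	return 0;
}
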
From 4365a5676fa3aa1d5ae6c90c22a0044f09ba584e Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki
Date: Tue, 15 Dec 2009 16:45:33 -0800
Subject: oom-kill: fix NUMA constraint check with nodemask

Fix node-oriented allocation handling in oom-kill.c.  I think of this as a
bugfix, not as an enhancement.

These days, things have changed:

- alloc_pages() takes a nodemask as its argument, via
  __alloc_pages_nodemask().
- mempolicy doesn't maintain its own private zonelists.
  (And cpuset doesn't pass a nodemask to __alloc_pages_nodemask().)

So the oom killer's current constraint check is wrong.

This patch:

- checks the nodemask: if a nodemask was passed and it doesn't cover all of
  node_states[N_HIGH_MEMORY], the constraint is CONSTRAINT_MEMORY_POLICY.
- scans all zonelists under the nodemask: if the allocation hits cpuset's
  wall, the failure comes from cpuset.

And it also:

- modifies the caller of out_of_memory() not to invoke the oom killer for
  __GFP_THISNODE allocations.  This doesn't change current behavior: a
  caller that uses __GFP_THISNODE should handle the page allocation failure
  by itself.
- handles the __GFP_NOFAIL + __GFP_THISNODE path.  This is something like a
  FIXME, but that gfp mask combination is not used now.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: KAMEZAWA Hiroyuki
Acked-by: David Rientjes
Cc: Daisuke Nishimura
Cc: KOSAKI Motohiro
Cc: Christoph Lameter
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/char/sysrq.c |  2 +-
 include/linux/oom.h  |  4 +++-
 mm/oom_kill.c        | 46 +++++++++++++++++++++++++++++++++-------------
 mm/page_alloc.c      | 22 ++++++++++++++++------
 4 files changed, 53 insertions(+), 21 deletions(-)

(limited to 'mm/oom_kill.c')

diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index 44203ff599da..1ae2de7d8b4f 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -339,7 +339,7 @@ static struct sysrq_key_op sysrq_term_op = {
 
 static void moom_callback(struct work_struct *ignored)
 {
-	out_of_memory(node_zonelist(0, GFP_KERNEL), GFP_KERNEL, 0);
+	out_of_memory(node_zonelist(0, GFP_KERNEL), GFP_KERNEL, 0, NULL);
 }
 
 static DECLARE_WORK(moom_work, moom_callback);
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 6aac5fe4f6f1..537662315627 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -10,6 +10,7 @@
 #ifdef __KERNEL__
 
 #include 
+#include 
 
 struct zonelist;
 struct notifier_block;
@@ -26,7 +27,8 @@ enum oom_constraint {
 extern int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 
-extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order);
+extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+		int order, nodemask_t *mask);
 
 extern int register_oom_notifier(struct notifier_block *nb);
 extern int unregister_oom_notifier(struct notifier_block *nb);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 6bb8a7a7ec9a..25c679e0288a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -196,27 +196,46 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 /*
  * Determine the type of allocation constraint.
  */
-static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
-						    gfp_t gfp_mask)
-{
 #ifdef CONFIG_NUMA
+static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
+				gfp_t gfp_mask, nodemask_t *nodemask)
+{
 	struct zone *zone;
 	struct zoneref *z;
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
-	nodemask_t nodes = node_states[N_HIGH_MEMORY];
 
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
-		if (cpuset_zone_allowed_softwall(zone, gfp_mask))
-			node_clear(zone_to_nid(zone), nodes);
-		else
-			return CONSTRAINT_CPUSET;
+	/*
+	 * Reach here only when __GFP_NOFAIL is used. So, we should avoid
+	 * to kill current.We have to random task kill in this case.
+	 * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
+	 */
+	if (gfp_mask & __GFP_THISNODE)
+		return CONSTRAINT_NONE;
 
-	if (!nodes_empty(nodes))
+	/*
+	 * The nodemask here is a nodemask passed to alloc_pages(). Now,
+	 * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy
+	 * feature. mempolicy is an only user of nodemask here.
+	 *	check mempolicy's nodemask contains all N_HIGH_MEMORY
+	 */
+	if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask))
 		return CONSTRAINT_MEMORY_POLICY;
-#endif
+
+	/* Check this allocation failure is caused by cpuset's wall function */
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+			high_zoneidx, nodemask)
+		if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
+			return CONSTRAINT_CPUSET;
 
 	return CONSTRAINT_NONE;
 }
+#else
+static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
+				gfp_t gfp_mask, nodemask_t *nodemask)
+{
+	return CONSTRAINT_NONE;
+}
+#endif
 
 /*
  * Simple selection loop. We chose the process with the highest
@@ -613,7 +632,8 @@ rest_and_return:
  * OR try to be smart about which process to kill. Note that we
  * don't have to be perfect here, we just have to be good.
 */
-void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
+void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+		int order, nodemask_t *nodemask)
 {
 	unsigned long freed = 0;
 	enum oom_constraint constraint;
@@ -632,7 +652,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 	 * Check if there were limitations on the allocation (only relevant for
 	 * NUMA) that may require different handling.
 	 */
-	constraint = constrained_alloc(zonelist, gfp_mask);
+	constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
 	read_lock(&tasklist_lock);
 
 	switch (constraint) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 59d2e88fb47c..850c4a7e2fe5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1654,12 +1654,22 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	if (page)
 		goto out;
 
-	/* The OOM killer will not help higher order allocs */
-	if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL))
-		goto out;
-
+	if (!(gfp_mask & __GFP_NOFAIL)) {
+		/* The OOM killer will not help higher order allocs */
+		if (order > PAGE_ALLOC_COSTLY_ORDER)
+			goto out;
+		/*
+		 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
+		 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
+		 * The caller should handle page allocation failure by itself if
+		 * it specifies __GFP_THISNODE.
+		 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
+		 */
+		if (gfp_mask & __GFP_THISNODE)
+			goto out;
+	}
 	/* Exhausted what can be done so it's blamo time */
-	out_of_memory(zonelist, gfp_mask, order);
+	out_of_memory(zonelist, gfp_mask, order, nodemask);
 
 out:
 	clear_zonelist_oom(zonelist, gfp_mask);
@@ -3123,7 +3133,7 @@ static int __cpuinit process_zones(int cpu)
 
 		if (percpu_pagelist_fraction)
 			setup_pagelist_highmark(zone_pcp(zone, cpu),
-			 	(zone->present_pages / percpu_pagelist_fraction));
+				(zone->present_pages / percpu_pagelist_fraction));
 	}
 
 	return 0;
--
cgit v1.2.3
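The heart of the new check is the nodes_subset() test: if the allocation
carried a mempolicy nodemask that excludes some node with memory, the
failure is classified as CONSTRAINT_MEMORY_POLICY rather than treated as a
global OOM. A reduced userspace sketch of that decision, with nodemasks
modelled as plain bitmasks and the cpuset/zonelist walk left out:

/*
 * Userspace sketch of the constraint classification performed by the new
 * constrained_alloc(); names and types are simplified stand-ins.
 */
#include <stdio.h>

enum oom_constraint {
	CONSTRAINT_NONE,
	CONSTRAINT_MEMORY_POLICY,
};

/* Returns 1 if every bit set in "sub" is also set in "super". */
static int nodes_subset(unsigned long sub, unsigned long super)
{
	return (sub & ~super) == 0;
}

static enum oom_constraint classify(unsigned long nodes_with_memory,
				    const unsigned long *alloc_nodemask)
{
	/* No nodemask passed: the allocation was not restricted by mempolicy. */
	if (alloc_nodemask &&
	    !nodes_subset(nodes_with_memory, *alloc_nodemask))
		return CONSTRAINT_MEMORY_POLICY;
	return CONSTRAINT_NONE;
}

int main(void)
{
	unsigned long all = 0x3;	/* nodes 0 and 1 have memory */
	unsigned long bind0 = 0x1;	/* MPOL_BIND to node 0 only */

	printf("%d\n", classify(all, &bind0));	/* 1: CONSTRAINT_MEMORY_POLICY */
	printf("%d\n", classify(all, NULL));	/* 0: CONSTRAINT_NONE */
	return 0;
}
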
From d31f56dbf8bafaacb0c617f9a6f137498d5c7aed Mon Sep 17 00:00:00 2001
From: Daisuke Nishimura
Date: Tue, 15 Dec 2009 16:47:12 -0800
Subject: memcg: avoid oom-killing innocent task in case of use_hierarchy

task_in_mem_cgroup(), which is called by select_bad_process() to check
whether a task can be a candidate for being oom-killed from a memcg's
limit, checks "curr->use_hierarchy" ("curr" is the mem_cgroup the task
belongs to).

But this check returns true (a false positive) when:

	/aa		use_hierarchy == 0	<- hitting limit
	/aa/00		use_hierarchy == 1	<- the task belongs to

This leads to killing an innocent task in aa/00.  This patch is a fix for
this bug.

This patch also fixes the argument of mem_cgroup_print_oom_info(): we
should print the information of the mem_cgroup which the task being
killed, not current, belongs to.

Signed-off-by: Daisuke Nishimura
Acked-by: Balbir Singh
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memcontrol.c | 10 ++++++++--
 mm/oom_kill.c   | 13 +++++++------
 2 files changed, 15 insertions(+), 8 deletions(-)

(limited to 'mm/oom_kill.c')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6273984f2e34..a294b7576070 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -760,7 +760,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 	task_unlock(task);
 	if (!curr)
 		return 0;
-	if (curr->use_hierarchy)
+	/*
+	 * We should check use_hierarchy of "mem" not "curr". Because checking
+	 * use_hierarchy of "curr" here make this function true if hierarchy is
+	 * enabled in "curr" and "curr" is a child of "mem" in *cgroup*
+	 * hierarchy(even if use_hierarchy is disabled in "mem").
+	 */
+	if (mem->use_hierarchy)
 		ret = css_is_ancestor(&curr->css, &mem->css);
 	else
 		ret = (curr == mem);
@@ -1009,7 +1015,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 	static char memcg_name[PATH_MAX];
 	int ret;
 
-	if (!memcg)
+	if (!memcg || !p)
 		return;
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 25c679e0288a..f52481b1c1e5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -356,7 +356,8 @@ static void dump_tasks(const struct mem_cgroup *mem)
 	} while_each_thread(g, p);
 }
 
-static void dump_header(gfp_t gfp_mask, int order, struct mem_cgroup *mem)
+static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
+							struct mem_cgroup *mem)
 {
 	pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
 		"oom_adj=%d\n",
@@ -365,7 +366,7 @@ static void dump_header(gfp_t gfp_mask, int order, struct mem_cgroup *mem)
 	cpuset_print_task_mems_allowed(current);
 	task_unlock(current);
 	dump_stack();
-	mem_cgroup_print_oom_info(mem, current);
+	mem_cgroup_print_oom_info(mem, p);
 	show_mem();
 	if (sysctl_oom_dump_tasks)
 		dump_tasks(mem);
@@ -440,7 +441,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	struct task_struct *c;
 
 	if (printk_ratelimit())
-		dump_header(gfp_mask, order, mem);
+		dump_header(p, gfp_mask, order, mem);
 
 	/*
 	 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -576,7 +577,7 @@ retry:
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!p) {
 		read_unlock(&tasklist_lock);
-		dump_header(gfp_mask, order, NULL);
+		dump_header(NULL, gfp_mask, order, NULL);
 		panic("Out of memory and no killable processes...\n");
 	}
 
@@ -644,7 +645,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 		return;
 
 	if (sysctl_panic_on_oom == 2) {
-		dump_header(gfp_mask, order, NULL);
+		dump_header(NULL, gfp_mask, order, NULL);
 		panic("out of memory. Compulsory panic_on_oom is selected.\n");
 	}
 
@@ -663,7 +664,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 
 	case CONSTRAINT_NONE:
 		if (sysctl_panic_on_oom) {
-			dump_header(gfp_mask, order, NULL);
+			dump_header(NULL, gfp_mask, order, NULL);
 			panic("out of memory. panic_on_oom is selected\n");
 		}
 		/* Fall-through */
--
cgit v1.2.3
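The fix comes down to which cgroup's use_hierarchy flag drives the ancestor
test: the one that hit its limit ("mem"), not the one the task belongs to
("curr"). A reduced userspace sketch, with mem_cgroup modelled as a
parent-pointer tree and hypothetical helper names:

/*
 * Userspace sketch of the task_in_mem_cgroup() decision this patch fixes.
 * The hierarchy flag of the cgroup that hit its limit decides whether
 * descendants are eligible to be killed for it.
 */
#include <stdio.h>

struct mem_cgroup {
	const struct mem_cgroup *parent;
	int use_hierarchy;
};

static int is_ancestor(const struct mem_cgroup *child,
		       const struct mem_cgroup *ancestor)
{
	for (; child; child = child->parent)
		if (child == ancestor)
			return 1;
	return 0;
}

/* 1 if a task in "curr" may be killed because "mem" hit its limit. */
static int task_in_mem_cgroup(const struct mem_cgroup *curr,
			      const struct mem_cgroup *mem)
{
	if (mem->use_hierarchy)		/* fixed: was curr->use_hierarchy */
		return is_ancestor(curr, mem);
	return curr == mem;
}

int main(void)
{
	struct mem_cgroup aa   = { .parent = NULL, .use_hierarchy = 0 };
	struct mem_cgroup aa00 = { .parent = &aa,  .use_hierarchy = 1 };

	/* /aa hit its limit with use_hierarchy == 0: /aa/00 must be spared. */
	printf("%d\n", task_in_mem_cgroup(&aa00, &aa));	/* prints 0 */
	return 0;
}

With the old check (curr->use_hierarchy), the task in /aa/00 would have been
reported as a candidate even though /aa, the group that actually hit its
limit, has hierarchical accounting disabled.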