From 69b27baf00fa9b7b14b3263c105390d1683425b2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 25 Mar 2016 14:20:21 -0700 Subject: sched: add schedule_timeout_idle() This will be needed in the patch "mm, oom: introduce oom reaper". Acked-by: Michal Hocko Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 589c4780b077..478b41de7f7d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -426,6 +426,7 @@ extern signed long schedule_timeout(signed long timeout); extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); +extern signed long schedule_timeout_idle(signed long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); -- cgit v1.2.3 From aac453635549699c13a84ea1456d5b0e574ef855 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 25 Mar 2016 14:20:24 -0700 Subject: mm, oom: introduce oom reaper This patch (of 5): This is based on the idea from Mel Gorman discussed during LSFMM 2015 and independently brought up by Oleg Nesterov. The OOM killer currently allows to kill only a single task in a good hope that the task will terminate in a reasonable time and frees up its memory. Such a task (oom victim) will get an access to memory reserves via mark_oom_victim to allow a forward progress should there be a need for additional memory during exit path. It has been shown (e.g. by Tetsuo Handa) that it is not that hard to construct workloads which break the core assumption mentioned above and the OOM victim might take unbounded amount of time to exit because it might be blocked in the uninterruptible state waiting for an event (e.g. 
lock) which is blocked by another task looping in the page allocator. This patch reduces the probability of such a lockup by introducing a specialized kernel thread (oom_reaper) which tries to reclaim additional memory by preemptively reaping the anonymous or swapped out memory owned by the oom victim, under the assumption that such memory won't be needed once its owner is killed and kicked out of userspace anyway. There is one notable exception to this, though: if the OOM victim was in the process of coredumping, the result would be incomplete. This is considered a reasonable constraint because the overall system health is more important than the debuggability of a particular application. A kernel thread has been chosen because we need a reliable way of invocation, so workqueue context is not appropriate because all the workers might be busy (e.g. allocating memory). Kswapd, which sounds like another good fit, is not appropriate either because it might get blocked on locks during reclaim as well. oom_reaper has to take mmap_sem on the target task for reading, so the solution is not 100% reliable because the semaphore might be held or blocked for write, but the probability is reduced considerably wrt. basically any lock blocking forward progress as described above. In order to avoid blocking on the lock without any forward progress we are using only a trylock and retry 10 times with a short sleep in between. Users of mmap_sem which need it for write should be carefully reviewed to use _killable waiting as much as possible and to reduce allocation requests done with the lock held to the absolute minimum to reduce the risk even further. The API between oom killer and oom reaper is quite trivial. wake_oom_reaper updates mm_to_reap with cmpxchg to guarantee only a NULL->mm transition, and oom_reaper clears this atomically once it is done with the work. This means that only a single mm_struct can be reaped at a time.
As the operation is potentially disruptive we are trying to limit it to the ncessary minimum and the reaper blocks any updates while it operates on an mm. mm_struct is pinned by mm_count to allow parallel exit_mmap and a race is detected by atomic_inc_not_zero(mm_users). Signed-off-by: Michal Hocko Suggested-by: Oleg Nesterov Suggested-by: Mel Gorman Acked-by: Mel Gorman Acked-by: David Rientjes Cc: Mel Gorman Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: Hugh Dickins Cc: Andrea Argangeli Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 + mm/internal.h | 5 ++ mm/memory.c | 17 +++--- mm/oom_kill.c | 151 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 162 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 450fc977ed02..ed6407d1b7b5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1132,6 +1132,8 @@ struct zap_details { struct address_space *check_mapping; /* Check page->mapping if set */ pgoff_t first_index; /* Lowest page->index to unmap */ pgoff_t last_index; /* Highest page->index to unmap */ + bool ignore_dirty; /* Ignore dirty pages */ + bool check_swap_entries; /* Check also swap entries */ }; struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/internal.h b/mm/internal.h index 7449392c6faa..b79abb6721cf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -38,6 +38,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); +void unmap_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct zap_details *details); + extern int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read, unsigned long lookahead_size); diff --git a/mm/memory.c b/mm/memory.c index 81dca0083fcd..098f00d05461 100644 --- 
a/mm/memory.c +++ b/mm/memory.c @@ -1102,6 +1102,12 @@ again: if (!PageAnon(page)) { if (pte_dirty(ptent)) { + /* + * oom_reaper cannot tear down dirty + * pages + */ + if (unlikely(details && details->ignore_dirty)) + continue; force_flush = 1; set_page_dirty(page); } @@ -1120,8 +1126,8 @@ again: } continue; } - /* If details->check_mapping, we leave swap entries. */ - if (unlikely(details)) + /* only check swap_entries if explicitly asked for in details */ + if (unlikely(details && !details->check_swap_entries)) continue; entry = pte_to_swp_entry(ptent); @@ -1226,7 +1232,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, return addr; } -static void unmap_page_range(struct mmu_gather *tlb, +void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details) @@ -1234,9 +1240,6 @@ static void unmap_page_range(struct mmu_gather *tlb, pgd_t *pgd; unsigned long next; - if (details && !details->check_mapping) - details = NULL; - BUG_ON(addr >= end); tlb_start_vma(tlb, vma); pgd = pgd_offset(vma->vm_mm, addr); @@ -2432,7 +2435,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { - struct zap_details details; + struct zap_details details = { }; pgoff_t hba = holebegin >> PAGE_SHIFT; pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 06f7e1707847..f7ed6ece0719 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -35,6 +35,11 @@ #include #include #include +#include +#include + +#include +#include "internal.h" #define CREATE_TRACE_POINTS #include @@ -405,6 +410,133 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); bool oom_killer_disabled __read_mostly; +#ifdef CONFIG_MMU +/* + * OOM Reaper kernel thread which tries to reap the memory used by the OOM + * victim (if that is possible) to help the 
OOM killer to move on. + */ +static struct task_struct *oom_reaper_th; +static struct mm_struct *mm_to_reap; +static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); + +static bool __oom_reap_vmas(struct mm_struct *mm) +{ + struct mmu_gather tlb; + struct vm_area_struct *vma; + struct zap_details details = {.check_swap_entries = true, + .ignore_dirty = true}; + bool ret = true; + + /* We might have raced with exit path */ + if (!atomic_inc_not_zero(&mm->mm_users)) + return true; + + if (!down_read_trylock(&mm->mmap_sem)) { + ret = false; + goto out; + } + + tlb_gather_mmu(&tlb, mm, 0, -1); + for (vma = mm->mmap ; vma; vma = vma->vm_next) { + if (is_vm_hugetlb_page(vma)) + continue; + + /* + * mlocked VMAs require explicit munlocking before unmap. + * Let's keep it simple here and skip such VMAs. + */ + if (vma->vm_flags & VM_LOCKED) + continue; + + /* + * Only anonymous pages have a good chance to be dropped + * without additional steps which we cannot afford as we + * are OOM already. + * + * We do not even care about fs backed pages because all + * which are reclaimable have already been reclaimed and + * we do not want to block exit_mmap by keeping mm ref + * count elevated without a good reason. 
+ */ + if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) + unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end, + &details); + } + tlb_finish_mmu(&tlb, 0, -1); + up_read(&mm->mmap_sem); +out: + mmput(mm); + return ret; +} + +static void oom_reap_vmas(struct mm_struct *mm) +{ + int attempts = 0; + + /* Retry the down_read_trylock(mmap_sem) a few times */ + while (attempts++ < 10 && !__oom_reap_vmas(mm)) + schedule_timeout_idle(HZ/10); + + /* Drop a reference taken by wake_oom_reaper */ + mmdrop(mm); +} + +static int oom_reaper(void *unused) +{ + while (true) { + struct mm_struct *mm; + + wait_event_freezable(oom_reaper_wait, + (mm = READ_ONCE(mm_to_reap))); + oom_reap_vmas(mm); + WRITE_ONCE(mm_to_reap, NULL); + } + + return 0; +} + +static void wake_oom_reaper(struct mm_struct *mm) +{ + struct mm_struct *old_mm; + + if (!oom_reaper_th) + return; + + /* + * Pin the given mm. Use mm_count instead of mm_users because + * we do not want to delay the address space tear down. + */ + atomic_inc(&mm->mm_count); + + /* + * Make sure that only a single mm is ever queued for the reaper + * because multiple are not necessary and the operation might be + * disruptive so better reduce it to the bare minimum. + */ + old_mm = cmpxchg(&mm_to_reap, NULL, mm); + if (!old_mm) + wake_up(&oom_reaper_wait); + else + mmdrop(mm); +} + +static int __init oom_init(void) +{ + oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); + if (IS_ERR(oom_reaper_th)) { + pr_err("Unable to start OOM reaper %ld. 
Continuing regardless\n", + PTR_ERR(oom_reaper_th)); + oom_reaper_th = NULL; + } + return 0; +} +subsys_initcall(oom_init) +#else +static void wake_oom_reaper(struct mm_struct *mm) +{ +} +#endif + /** * mark_oom_victim - mark the given task as OOM victim * @tsk: task to mark @@ -510,6 +642,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, unsigned int victim_points = 0; static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + bool can_oom_reap = true; /* * If the task is already exiting, don't alarm the sysadmin or kill @@ -600,17 +733,23 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, continue; if (same_thread_group(p, victim)) continue; - if (unlikely(p->flags & PF_KTHREAD)) - continue; - if (is_global_init(p)) - continue; - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) || + p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + /* + * We cannot use oom_reaper for the mm shared by this + * process because it wouldn't get killed and so the + * memory might be still used. + */ + can_oom_reap = false; continue; - + } do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); } rcu_read_unlock(); + if (can_oom_reap) + wake_oom_reaper(mm); + mmdrop(mm); put_task_struct(victim); } -- cgit v1.2.3 From 36324a990cf578b57828c04cd85ac62cd25cf5a4 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 25 Mar 2016 14:20:27 -0700 Subject: oom: clear TIF_MEMDIE after oom_reaper managed to unmap the address space When oom_reaper manages to unmap all the eligible vmas there shouldn't be much of the freable memory held by the oom victim left anymore so it makes sense to clear the TIF_MEMDIE flag for the victim and allow the OOM killer to select another task. 
The lack of TIF_MEMDIE also means that the victim cannot access memory reserves anymore, but that shouldn't be a problem because it would get the access again if it needs to allocate and hits the OOM killer again due to the fatal_signal_pending resp. PF_EXITING check. We can safely hide the task from the OOM killer because it is clearly not a good candidate anymore as everything reclaimable has been torn down already. This patch makes it possible to cap the time an OOM victim can keep TIF_MEMDIE and thus hold off further global OOM killer actions, provided the oom reaper is able to take mmap_sem for the associated mm struct. This is not guaranteed now, but further steps should make sure that taking mmap_sem for write blocks in a killable way, which will help to reduce such lock contention. This is not done by this patch. Note that exit_oom_victim might be called on a remote task from __oom_reap_task now, so we have to check and clear the flag atomically; otherwise we might race and underflow oom_victims or wake up waiters too early.
Signed-off-by: Michal Hocko Suggested-by: Johannes Weiner Suggested-by: Tetsuo Handa Cc: Andrea Argangeli Cc: David Rientjes Cc: Hugh Dickins Cc: Mel Gorman Cc: Oleg Nesterov Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 2 +- kernel/exit.c | 2 +- mm/oom_kill.c | 73 +++++++++++++++++++++++++++++++++++------------------ 3 files changed, 50 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index 03e6257321f0..45993b840ed6 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -91,7 +91,7 @@ extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, extern bool out_of_memory(struct oom_control *oc); -extern void exit_oom_victim(void); +extern void exit_oom_victim(struct task_struct *tsk); extern int register_oom_notifier(struct notifier_block *nb); extern int unregister_oom_notifier(struct notifier_block *nb); diff --git a/kernel/exit.c b/kernel/exit.c index 953d1a1c0387..fd90195667e1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -435,7 +435,7 @@ static void exit_mm(struct task_struct *tsk) mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) - exit_oom_victim(); + exit_oom_victim(tsk); } static struct task_struct *find_alive_thread(struct task_struct *p) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f7ed6ece0719..2830b1c6483e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -416,20 +416,36 @@ bool oom_killer_disabled __read_mostly; * victim (if that is possible) to help the OOM killer to move on. 
*/ static struct task_struct *oom_reaper_th; -static struct mm_struct *mm_to_reap; +static struct task_struct *task_to_reap; static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); -static bool __oom_reap_vmas(struct mm_struct *mm) +static bool __oom_reap_task(struct task_struct *tsk) { struct mmu_gather tlb; struct vm_area_struct *vma; + struct mm_struct *mm; + struct task_struct *p; struct zap_details details = {.check_swap_entries = true, .ignore_dirty = true}; bool ret = true; - /* We might have raced with exit path */ - if (!atomic_inc_not_zero(&mm->mm_users)) + /* + * Make sure we find the associated mm_struct even when the particular + * thread has already terminated and cleared its mm. + * We might have race with exit path so consider our work done if there + * is no mm. + */ + p = find_lock_task_mm(tsk); + if (!p) + return true; + + mm = p->mm; + if (!atomic_inc_not_zero(&mm->mm_users)) { + task_unlock(p); return true; + } + + task_unlock(p); if (!down_read_trylock(&mm->mmap_sem)) { ret = false; @@ -464,60 +480,66 @@ static bool __oom_reap_vmas(struct mm_struct *mm) } tlb_finish_mmu(&tlb, 0, -1); up_read(&mm->mmap_sem); + + /* + * Clear TIF_MEMDIE because the task shouldn't be sitting on a + * reasonably reclaimable memory anymore. OOM killer can continue + * by selecting other victim if unmapping hasn't led to any + * improvements. This also means that selecting this task doesn't + * make any sense. 
+ */ + tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN; + exit_oom_victim(tsk); out: mmput(mm); return ret; } -static void oom_reap_vmas(struct mm_struct *mm) +static void oom_reap_task(struct task_struct *tsk) { int attempts = 0; /* Retry the down_read_trylock(mmap_sem) a few times */ - while (attempts++ < 10 && !__oom_reap_vmas(mm)) + while (attempts++ < 10 && !__oom_reap_task(tsk)) schedule_timeout_idle(HZ/10); /* Drop a reference taken by wake_oom_reaper */ - mmdrop(mm); + put_task_struct(tsk); } static int oom_reaper(void *unused) { while (true) { - struct mm_struct *mm; + struct task_struct *tsk; wait_event_freezable(oom_reaper_wait, - (mm = READ_ONCE(mm_to_reap))); - oom_reap_vmas(mm); - WRITE_ONCE(mm_to_reap, NULL); + (tsk = READ_ONCE(task_to_reap))); + oom_reap_task(tsk); + WRITE_ONCE(task_to_reap, NULL); } return 0; } -static void wake_oom_reaper(struct mm_struct *mm) +static void wake_oom_reaper(struct task_struct *tsk) { - struct mm_struct *old_mm; + struct task_struct *old_tsk; if (!oom_reaper_th) return; - /* - * Pin the given mm. Use mm_count instead of mm_users because - * we do not want to delay the address space tear down. - */ - atomic_inc(&mm->mm_count); + get_task_struct(tsk); /* * Make sure that only a single mm is ever queued for the reaper * because multiple are not necessary and the operation might be * disruptive so better reduce it to the bare minimum. 
*/ - old_mm = cmpxchg(&mm_to_reap, NULL, mm); - if (!old_mm) + old_tsk = cmpxchg(&task_to_reap, NULL, tsk); + if (!old_tsk) wake_up(&oom_reaper_wait); else - mmdrop(mm); + put_task_struct(tsk); } static int __init oom_init(void) @@ -532,7 +554,7 @@ static int __init oom_init(void) } subsys_initcall(oom_init) #else -static void wake_oom_reaper(struct mm_struct *mm) +static void wake_oom_reaper(struct task_struct *tsk) { } #endif @@ -563,9 +585,10 @@ void mark_oom_victim(struct task_struct *tsk) /** * exit_oom_victim - note the exit of an OOM victim */ -void exit_oom_victim(void) +void exit_oom_victim(struct task_struct *tsk) { - clear_thread_flag(TIF_MEMDIE); + if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) + return; if (!atomic_dec_return(&oom_victims)) wake_up_all(&oom_victims_wait); @@ -748,7 +771,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, rcu_read_unlock(); if (can_oom_reap) - wake_oom_reaper(mm); + wake_oom_reaper(victim); mmdrop(mm); put_task_struct(victim); -- cgit v1.2.3 From 03049269de433cb5fe2859be9ae4469ceb1163ed Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 25 Mar 2016 14:20:33 -0700 Subject: mm, oom_reaper: implement OOM victims queuing wake_oom_reaper has allowed only 1 oom victim to be queued. The main reason for that was the simplicity as other solutions would require some way of queuing. The current approach is racy and that was deemed sufficient as the oom_reaper is considered a best effort approach to help with oom handling when the OOM victim cannot terminate in a reasonable time. The race could lead to missing an oom victim which can get stuck out_of_memory wake_oom_reaper cmpxchg // OK oom_reaper oom_reap_task __oom_reap_task oom_victim terminates atomic_inc_not_zero // fail out_of_memory wake_oom_reaper cmpxchg // fails task_to_reap = NULL This race requires 2 OOM invocations in a short time period which is not very likely but certainly not impossible. E.g. 
the original victim might have not released a lot of memory for some reason. The situation would improve considerably if wake_oom_reaper used a more robust queuing. This is what this patch implements. This means adding oom_reaper_list list_head into task_struct (eat a hole before embeded thread_struct for that purpose) and a oom_reaper_lock spinlock for queuing synchronization. wake_oom_reaper will then add the task on the queue and oom_reaper will dequeue it. Signed-off-by: Michal Hocko Cc: Vladimir Davydov Cc: Andrea Argangeli Cc: David Rientjes Cc: Hugh Dickins Cc: Johannes Weiner Cc: Mel Gorman Cc: Oleg Nesterov Cc: Rik van Riel Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 +++ mm/oom_kill.c | 36 +++++++++++++++++++----------------- 2 files changed, 22 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 478b41de7f7d..788f223f8f8f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1849,6 +1849,9 @@ struct task_struct { unsigned long task_state_change; #endif int pagefault_disabled; +#ifdef CONFIG_MMU + struct list_head oom_reaper_list; +#endif /* CPU-specific state of this task */ struct thread_struct thread; /* diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 30a60991173a..f6d4ae9f1c69 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -418,8 +418,10 @@ bool oom_killer_disabled __read_mostly; * victim (if that is possible) to help the OOM killer to move on. 
*/ static struct task_struct *oom_reaper_th; -static struct task_struct *task_to_reap; static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); +static LIST_HEAD(oom_reaper_list); +static DEFINE_SPINLOCK(oom_reaper_lock); + static bool __oom_reap_task(struct task_struct *tsk) { @@ -524,12 +526,20 @@ static void oom_reap_task(struct task_struct *tsk) static int oom_reaper(void *unused) { while (true) { - struct task_struct *tsk; + struct task_struct *tsk = NULL; wait_event_freezable(oom_reaper_wait, - (tsk = READ_ONCE(task_to_reap))); - oom_reap_task(tsk); - WRITE_ONCE(task_to_reap, NULL); + (!list_empty(&oom_reaper_list))); + spin_lock(&oom_reaper_lock); + if (!list_empty(&oom_reaper_list)) { + tsk = list_first_entry(&oom_reaper_list, + struct task_struct, oom_reaper_list); + list_del(&tsk->oom_reaper_list); + } + spin_unlock(&oom_reaper_lock); + + if (tsk) + oom_reap_task(tsk); } return 0; @@ -537,23 +547,15 @@ static int oom_reaper(void *unused) static void wake_oom_reaper(struct task_struct *tsk) { - struct task_struct *old_tsk; - if (!oom_reaper_th) return; get_task_struct(tsk); - /* - * Make sure that only a single mm is ever queued for the reaper - * because multiple are not necessary and the operation might be - * disruptive so better reduce it to the bare minimum. 
- */ - old_tsk = cmpxchg(&task_to_reap, NULL, tsk); - if (!old_tsk) - wake_up(&oom_reaper_wait); - else - put_task_struct(tsk); + spin_lock(&oom_reaper_lock); + list_add(&tsk->oom_reaper_list, &oom_reaper_list); + spin_unlock(&oom_reaper_lock); + wake_up(&oom_reaper_wait); } static int __init oom_init(void) -- cgit v1.2.3 From 855b018325737f7691f9b7d86339df40aa4e47c3 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 25 Mar 2016 14:20:36 -0700 Subject: oom, oom_reaper: disable oom_reaper for oom_kill_allocating_task Tetsuo has reported that oom_kill_allocating_task=1 will cause oom_reaper_list corruption because oom_kill_process doesn't follow standard OOM exclusion (aka ignores TIF_MEMDIE) and allows to enqueue the same task multiple times - e.g. by sacrificing the same child multiple times. This patch fixes the issue by introducing a new MMF_OOM_KILLED mm flag which is set in oom_kill_process atomically and oom reaper is disabled if the flag was already set. Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Cc: David Rientjes Cc: Mel Gorman Cc: Oleg Nesterov Cc: Hugh Dickins Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 ++ mm/oom_kill.c | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 788f223f8f8f..c2d2d7c5d463 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -512,6 +512,8 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ +#define MMF_OOM_KILLED 21 /* OOM killer has chosen this mm */ + #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) struct sighand_struct { diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f6d4ae9f1c69..1a21819a8e5e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -680,7 +680,7 @@ void oom_kill_process(struct oom_control *oc, 
struct task_struct *p, unsigned int victim_points = 0; static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - bool can_oom_reap = true; + bool can_oom_reap; /* * If the task is already exiting, don't alarm the sysadmin or kill @@ -742,6 +742,10 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, /* Get a reference to safely compare mm after task_unlock(victim) */ mm = victim->mm; atomic_inc(&mm->mm_count); + + /* Make sure we do not try to oom reap the mm multiple times */ + can_oom_reap = !test_and_set_bit(MMF_OOM_KILLED, &mm->flags); + /* * We should send SIGKILL before setting TIF_MEMDIE in order to prevent * the OOM victim from depleting the memory reserves from the user -- cgit v1.2.3 From 29c696e1c6eceb5db6b21f0c89495fcfcd40c0eb Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 25 Mar 2016 14:20:39 -0700 Subject: oom: make oom_reaper_list single linked Entries are only added/removed from oom_reaper_list at head so we can use a single linked list and hence save a word in task_struct. 
Signed-off-by: Vladimir Davydov Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- mm/oom_kill.c | 15 +++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index c2d2d7c5d463..49b1febcf7c3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1852,7 +1852,7 @@ struct task_struct { #endif int pagefault_disabled; #ifdef CONFIG_MMU - struct list_head oom_reaper_list; + struct task_struct *oom_reaper_list; #endif /* CPU-specific state of this task */ struct thread_struct thread; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1a21819a8e5e..a49638f41e45 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -419,7 +419,7 @@ bool oom_killer_disabled __read_mostly; */ static struct task_struct *oom_reaper_th; static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); -static LIST_HEAD(oom_reaper_list); +static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); @@ -528,13 +528,11 @@ static int oom_reaper(void *unused) while (true) { struct task_struct *tsk = NULL; - wait_event_freezable(oom_reaper_wait, - (!list_empty(&oom_reaper_list))); + wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL); spin_lock(&oom_reaper_lock); - if (!list_empty(&oom_reaper_list)) { - tsk = list_first_entry(&oom_reaper_list, - struct task_struct, oom_reaper_list); - list_del(&tsk->oom_reaper_list); + if (oom_reaper_list != NULL) { + tsk = oom_reaper_list; + oom_reaper_list = tsk->oom_reaper_list; } spin_unlock(&oom_reaper_lock); @@ -553,7 +551,8 @@ static void wake_oom_reaper(struct task_struct *tsk) get_task_struct(tsk); spin_lock(&oom_reaper_lock); - list_add(&tsk->oom_reaper_list, &oom_reaper_list); + tsk->oom_reaper_list = oom_reaper_list; + oom_reaper_list = tsk; spin_unlock(&oom_reaper_lock); wake_up(&oom_reaper_wait); } -- cgit v1.2.3 From 
bb29902a7515208846114b3b36a4281a9bbf766a Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 25 Mar 2016 14:20:44 -0700 Subject: oom, oom_reaper: protect oom_reaper_list using simpler way "oom, oom_reaper: disable oom_reaper for oom_kill_allocating_task" tried to protect oom_reaper_list using MMF_OOM_KILLED flag. But we can do it by simply checking tsk->oom_reaper_list != NULL. Signed-off-by: Tetsuo Handa Signed-off-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 -- mm/oom_kill.c | 8 ++------ 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 49b1febcf7c3..60bba7e032dc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -512,8 +512,6 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ -#define MMF_OOM_KILLED 21 /* OOM killer has chosen this mm */ - #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) struct sighand_struct { diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8cc55f0f0e5c..b34d279a7ee6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -547,7 +547,7 @@ static int oom_reaper(void *unused) static void wake_oom_reaper(struct task_struct *tsk) { - if (!oom_reaper_th) + if (!oom_reaper_th || tsk->oom_reaper_list) return; get_task_struct(tsk); @@ -681,7 +681,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, unsigned int victim_points = 0; static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - bool can_oom_reap; + bool can_oom_reap = true; /* * If the task is already exiting, don't alarm the sysadmin or kill @@ -743,10 +743,6 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, /* Get a reference to safely compare mm after task_unlock(victim) */ mm = victim->mm; atomic_inc(&mm->mm_count); - - 
/* Make sure we do not try to oom reap the mm multiple times */ - can_oom_reap = !test_and_set_bit(MMF_OOM_KILLED, &mm->flags); - /* * We should send SIGKILL before setting TIF_MEMDIE in order to prevent * the OOM victim from depleting the memory reserves from the user -- cgit v1.2.3 From aaf4fb712b8311d8b950e89937479d61e9c25ba8 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 25 Mar 2016 14:21:53 -0700 Subject: include/linux/oom.h: remove undefined oom_kills_count()/note_oom_kill() A leftover from commit c32b3cbe0d06 ("oom, PM: make OOM detection in the freezer path raceless"). Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index 45993b840ed6..628a43242a34 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -76,8 +76,6 @@ extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); -extern int oom_kills_count(void); -extern void note_oom_kill(void); extern void oom_kill_process(struct oom_control *oc, struct task_struct *p, unsigned int points, unsigned long totalpages, struct mem_cgroup *memcg, const char *message); -- cgit v1.2.3 From 7ed2f9e663854db313f177a511145630e398b402 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 25 Mar 2016 14:21:59 -0700 Subject: mm, kasan: SLAB support Add KASAN hooks to SLAB allocator. This patch is based on the "mm: kasan: unified support for SLUB and SLAB allocators" patch originally prepared by Dmitry Chernenkov. 
Signed-off-by: Alexander Potapenko Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Steven Rostedt Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kasan.txt | 5 +-- include/linux/kasan.h | 12 ++++++ include/linux/slab.h | 6 +++ include/linux/slab_def.h | 14 +++++++ include/linux/slub_def.h | 11 +++++ lib/Kconfig.kasan | 4 +- mm/Makefile | 1 + mm/kasan/kasan.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++ mm/kasan/kasan.h | 34 ++++++++++++++++ mm/kasan/report.c | 54 ++++++++++++++++++++----- mm/slab.c | 43 +++++++++++++++++--- mm/slab_common.c | 2 +- 12 files changed, 266 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kasan.txt b/Documentation/kasan.txt index aa1e0c91e368..7dd95b35cd7c 100644 --- a/Documentation/kasan.txt +++ b/Documentation/kasan.txt @@ -12,8 +12,7 @@ KASAN uses compile-time instrumentation for checking every memory access, therefore you will need a GCC version 4.9.2 or later. GCC 5.0 or later is required for detection of out-of-bounds accesses to stack or global variables. -Currently KASAN is supported only for x86_64 architecture and requires the -kernel to be built with the SLUB allocator. +Currently KASAN is supported only for x86_64 architecture. 1. Usage ======== @@ -27,7 +26,7 @@ inline are compiler instrumentation types. The former produces smaller binary the latter is 1.1 - 2 times faster. Inline instrumentation requires a GCC version 5.0 or later. -Currently KASAN works only with the SLUB memory allocator. +KASAN works with both SLUB and SLAB memory allocators. For better bug detection and nicer reporting, enable CONFIG_STACKTRACE. 
To disable instrumentation for specific files or directories, add a line diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 0fdc798e3ff7..839f2007a0f9 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -48,6 +48,9 @@ void kasan_unpoison_task_stack(struct task_struct *task); void kasan_alloc_pages(struct page *page, unsigned int order); void kasan_free_pages(struct page *page, unsigned int order); +void kasan_cache_create(struct kmem_cache *cache, size_t *size, + unsigned long *flags); + void kasan_poison_slab(struct page *page); void kasan_unpoison_object_data(struct kmem_cache *cache, void *object); void kasan_poison_object_data(struct kmem_cache *cache, void *object); @@ -61,6 +64,11 @@ void kasan_krealloc(const void *object, size_t new_size); void kasan_slab_alloc(struct kmem_cache *s, void *object); void kasan_slab_free(struct kmem_cache *s, void *object); +struct kasan_cache { + int alloc_meta_offset; + int free_meta_offset; +}; + int kasan_module_alloc(void *addr, size_t size); void kasan_free_shadow(const struct vm_struct *vm); @@ -76,6 +84,10 @@ static inline void kasan_disable_current(void) {} static inline void kasan_alloc_pages(struct page *page, unsigned int order) {} static inline void kasan_free_pages(struct page *page, unsigned int order) {} +static inline void kasan_cache_create(struct kmem_cache *cache, + size_t *size, + unsigned long *flags) {} + static inline void kasan_poison_slab(struct page *page) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) {} diff --git a/include/linux/slab.h b/include/linux/slab.h index e4b568738ca3..aa61595a1482 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -92,6 +92,12 @@ # define SLAB_ACCOUNT 0x00000000UL #endif +#ifdef CONFIG_KASAN +#define SLAB_KASAN 0x08000000UL +#else +#define SLAB_KASAN 0x00000000UL +#endif + /* The following flags affect the page allocator grouping pages by mobility */ #define SLAB_RECLAIM_ACCOUNT 
0x00020000UL /* Objects are reclaimable */ #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index e878ba35ae91..9edbbf352340 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -76,8 +76,22 @@ struct kmem_cache { #ifdef CONFIG_MEMCG struct memcg_cache_params memcg_params; #endif +#ifdef CONFIG_KASAN + struct kasan_cache kasan_info; +#endif struct kmem_cache_node *node[MAX_NUMNODES]; }; +static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, + void *x) { + void *object = x - (x - page->s_mem) % cache->size; + void *last_object = page->s_mem + (cache->num - 1) * cache->size; + + if (unlikely(object > last_object)) + return last_object; + else + return object; +} + #endif /* _LINUX_SLAB_DEF_H */ diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index ac5143f95ee6..665cd0cd18b8 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -130,4 +130,15 @@ static inline void *virt_to_obj(struct kmem_cache *s, void object_err(struct kmem_cache *s, struct page *page, u8 *object, char *reason); +static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, + void *x) { + void *object = x - (x - page_address(page)) % cache->size; + void *last_object = page_address(page) + + (page->objects - 1) * cache->size; + if (unlikely(object > last_object)) + return last_object; + else + return object; +} + #endif /* _LINUX_SLUB_DEF_H */ diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 0fee5acd5aa0..0e4d2b3b0aee 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -5,7 +5,7 @@ if HAVE_ARCH_KASAN config KASAN bool "KASan: runtime memory debugger" - depends on SLUB_DEBUG + depends on SLUB_DEBUG || (SLAB && !DEBUG_SLAB) select CONSTRUCTORS help Enables kernel address sanitizer - runtime memory debugger, @@ -16,6 +16,8 @@ config KASAN This feature consumes about 1/8 of available memory and brings about ~x3 
performance slowdown. For better error detection enable CONFIG_STACKTRACE. + Currently CONFIG_KASAN doesn't work with CONFIG_DEBUG_SLAB + (the resulting kernel does not boot). choice prompt "Instrumentation type" diff --git a/mm/Makefile b/mm/Makefile index f5e797cbd128..deb467edca2d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -3,6 +3,7 @@ # KASAN_SANITIZE_slab_common.o := n +KASAN_SANITIZE_slab.o := n KASAN_SANITIZE_slub.o := n # These files are disabled because they produce non-interesting and/or diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 1ad20ade8c91..7c82509ef169 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -334,6 +334,59 @@ void kasan_free_pages(struct page *page, unsigned int order) KASAN_FREE_PAGE); } +#ifdef CONFIG_SLAB +/* + * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. + * For larger allocations larger redzones are used. + */ +static size_t optimal_redzone(size_t object_size) +{ + int rz = + object_size <= 64 - 16 ? 16 : + object_size <= 128 - 32 ? 32 : + object_size <= 512 - 64 ? 64 : + object_size <= 4096 - 128 ? 128 : + object_size <= (1 << 14) - 256 ? 256 : + object_size <= (1 << 15) - 512 ? 512 : + object_size <= (1 << 16) - 1024 ? 1024 : 2048; + return rz; +} + +void kasan_cache_create(struct kmem_cache *cache, size_t *size, + unsigned long *flags) +{ + int redzone_adjust; + /* Make sure the adjusted size is still less than + * KMALLOC_MAX_CACHE_SIZE. + * TODO: this check is only useful for SLAB, but not SLUB. We'll need + * to skip it for SLUB when it starts using kasan_cache_create(). + */ + if (*size > KMALLOC_MAX_CACHE_SIZE - + sizeof(struct kasan_alloc_meta) - + sizeof(struct kasan_free_meta)) + return; + *flags |= SLAB_KASAN; + /* Add alloc meta. */ + cache->kasan_info.alloc_meta_offset = *size; + *size += sizeof(struct kasan_alloc_meta); + + /* Add free meta. 
*/ + if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor || + cache->object_size < sizeof(struct kasan_free_meta)) { + cache->kasan_info.free_meta_offset = *size; + *size += sizeof(struct kasan_free_meta); + } + redzone_adjust = optimal_redzone(cache->object_size) - + (*size - cache->object_size); + if (redzone_adjust > 0) + *size += redzone_adjust; + *size = min(KMALLOC_MAX_CACHE_SIZE, + max(*size, + cache->object_size + + optimal_redzone(cache->object_size))); +} +#endif + void kasan_poison_slab(struct page *page) { kasan_poison_shadow(page_address(page), @@ -351,8 +404,36 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object) kasan_poison_shadow(object, round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), KASAN_KMALLOC_REDZONE); +#ifdef CONFIG_SLAB + if (cache->flags & SLAB_KASAN) { + struct kasan_alloc_meta *alloc_info = + get_alloc_info(cache, object); + alloc_info->state = KASAN_STATE_INIT; + } +#endif +} + +static inline void set_track(struct kasan_track *track) +{ + track->cpu = raw_smp_processor_id(); + track->pid = current->pid; + track->when = jiffies; } +#ifdef CONFIG_SLAB +struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, + const void *object) +{ + return (void *)object + cache->kasan_info.alloc_meta_offset; +} + +struct kasan_free_meta *get_free_info(struct kmem_cache *cache, + const void *object) +{ + return (void *)object + cache->kasan_info.free_meta_offset; +} +#endif + void kasan_slab_alloc(struct kmem_cache *cache, void *object) { kasan_kmalloc(cache, object, cache->object_size); @@ -367,6 +448,17 @@ void kasan_slab_free(struct kmem_cache *cache, void *object) if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) return; +#ifdef CONFIG_SLAB + if (cache->flags & SLAB_KASAN) { + struct kasan_free_meta *free_info = + get_free_info(cache, object); + struct kasan_alloc_meta *alloc_info = + get_alloc_info(cache, object); + alloc_info->state = KASAN_STATE_FREE; + set_track(&free_info->track); + } +#endif + 
kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); } @@ -386,6 +478,16 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) kasan_unpoison_shadow(object, size); kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, KASAN_KMALLOC_REDZONE); +#ifdef CONFIG_SLAB + if (cache->flags & SLAB_KASAN) { + struct kasan_alloc_meta *alloc_info = + get_alloc_info(cache, object); + + alloc_info->state = KASAN_STATE_ALLOC; + alloc_info->alloc_size = size; + set_track(&alloc_info->track); + } +#endif } EXPORT_SYMBOL(kasan_kmalloc); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 4f6c62e5c21e..7b9e4ab9b66b 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -54,6 +54,40 @@ struct kasan_global { #endif }; +/** + * Structures to keep alloc and free tracks * + */ + +enum kasan_state { + KASAN_STATE_INIT, + KASAN_STATE_ALLOC, + KASAN_STATE_FREE +}; + +struct kasan_track { + u64 cpu : 6; /* for NR_CPUS = 64 */ + u64 pid : 16; /* 65536 processes */ + u64 when : 42; /* ~140 years */ +}; + +struct kasan_alloc_meta { + u32 state : 2; /* enum kasan_state */ + u32 alloc_size : 30; + struct kasan_track track; +}; + +struct kasan_free_meta { + /* Allocator freelist pointer, unused by KASAN. 
*/ + void **freelist; + struct kasan_track track; +}; + +struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, + const void *object); +struct kasan_free_meta *get_free_info(struct kmem_cache *cache, + const void *object); + + static inline const void *kasan_shadow_to_mem(const void *shadow_addr) { return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 745aa8f36028..3e3385cc97ac 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -115,6 +115,46 @@ static inline bool init_task_stack_addr(const void *addr) sizeof(init_thread_union.stack)); } +#ifdef CONFIG_SLAB +static void print_track(struct kasan_track *track) +{ + pr_err("PID = %u, CPU = %u, timestamp = %lu\n", track->pid, + track->cpu, (unsigned long)track->when); +} + +static void object_err(struct kmem_cache *cache, struct page *page, + void *object, char *unused_reason) +{ + struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); + struct kasan_free_meta *free_info; + + dump_stack(); + pr_err("Object at %p, in cache %s\n", object, cache->name); + if (!(cache->flags & SLAB_KASAN)) + return; + switch (alloc_info->state) { + case KASAN_STATE_INIT: + pr_err("Object not allocated yet\n"); + break; + case KASAN_STATE_ALLOC: + pr_err("Object allocated with size %u bytes.\n", + alloc_info->alloc_size); + pr_err("Allocation:\n"); + print_track(&alloc_info->track); + break; + case KASAN_STATE_FREE: + pr_err("Object freed, allocated with size %u bytes\n", + alloc_info->alloc_size); + free_info = get_free_info(cache, object); + pr_err("Allocation:\n"); + print_track(&alloc_info->track); + pr_err("Deallocation:\n"); + print_track(&free_info->track); + break; + } +} +#endif + static void print_address_description(struct kasan_access_info *info) { const void *addr = info->access_addr; @@ -126,17 +166,10 @@ static void print_address_description(struct kasan_access_info *info) if (PageSlab(page)) { void *object; struct kmem_cache 
*cache = page->slab_cache; - void *last_object; - - object = virt_to_obj(cache, page_address(page), addr); - last_object = page_address(page) + - page->objects * cache->size; - - if (unlikely(object > last_object)) - object = last_object; /* we hit into padding */ - + object = nearest_obj(cache, page, + (void *)info->access_addr); object_err(cache, page, object, - "kasan: bad access detected"); + "kasan: bad access detected"); return; } dump_page(page, "kasan: bad access detected"); @@ -146,7 +179,6 @@ static void print_address_description(struct kasan_access_info *info) if (!init_task_stack_addr(addr)) pr_err("Address belongs to variable %pS\n", addr); } - dump_stack(); } diff --git a/mm/slab.c b/mm/slab.c index e719a5cb3396..7515578471d8 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2086,6 +2086,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) } #endif + kasan_cache_create(cachep, &size, &flags); + size = ALIGN(size, cachep->align); /* * We should restrict the number of objects in a slab to implement @@ -2387,8 +2389,13 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) * cache which they are a constructor for. Otherwise, deadlock. * They must also be threaded. 
*/ - if (cachep->ctor && !(cachep->flags & SLAB_POISON)) + if (cachep->ctor && !(cachep->flags & SLAB_POISON)) { + kasan_unpoison_object_data(cachep, + objp + obj_offset(cachep)); cachep->ctor(objp + obj_offset(cachep)); + kasan_poison_object_data( + cachep, objp + obj_offset(cachep)); + } if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) @@ -2409,6 +2416,7 @@ static void cache_init_objs(struct kmem_cache *cachep, struct page *page) { int i; + void *objp; cache_init_objs_debug(cachep, page); @@ -2419,8 +2427,12 @@ static void cache_init_objs(struct kmem_cache *cachep, for (i = 0; i < cachep->num; i++) { /* constructor could break poison info */ - if (DEBUG == 0 && cachep->ctor) - cachep->ctor(index_to_obj(cachep, page, i)); + if (DEBUG == 0 && cachep->ctor) { + objp = index_to_obj(cachep, page, i); + kasan_unpoison_object_data(cachep, objp); + cachep->ctor(objp); + kasan_poison_object_data(cachep, objp); + } set_free_obj(page, i, i); } @@ -2550,6 +2562,7 @@ static int cache_grow(struct kmem_cache *cachep, slab_map_pages(cachep, page, freelist); + kasan_poison_slab(page); cache_init_objs(cachep, page); if (gfpflags_allow_blocking(local_flags)) @@ -3316,6 +3329,8 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, { struct array_cache *ac = cpu_cache_get(cachep); + kasan_slab_free(cachep, objp); + check_irq_off(); kmemleak_free_recursive(objp, cachep->flags); objp = cache_free_debugcheck(cachep, objp, caller); @@ -3363,6 +3378,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *ret = slab_alloc(cachep, flags, _RET_IP_); + kasan_slab_alloc(cachep, ret); trace_kmem_cache_alloc(_RET_IP_, ret, cachep->object_size, cachep->size, flags); @@ -3428,6 +3444,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) ret = slab_alloc(cachep, flags, _RET_IP_); + kasan_kmalloc(cachep, ret, size); trace_kmalloc(_RET_IP_, ret, size, cachep->size, flags); return ret; @@ -3451,6 
+3468,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); + kasan_slab_alloc(cachep, ret); trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep->object_size, cachep->size, flags, nodeid); @@ -3468,7 +3486,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, void *ret; ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); - + kasan_kmalloc(cachep, ret, size); trace_kmalloc_node(_RET_IP_, ret, size, cachep->size, flags, nodeid); @@ -3481,11 +3499,15 @@ static __always_inline void * __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) { struct kmem_cache *cachep; + void *ret; cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return kmem_cache_alloc_node_trace(cachep, flags, node, size); + ret = kmem_cache_alloc_node_trace(cachep, flags, node, size); + kasan_kmalloc(cachep, ret, size); + + return ret; } void *__kmalloc_node(size_t size, gfp_t flags, int node) @@ -3519,6 +3541,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, return cachep; ret = slab_alloc(cachep, flags, caller); + kasan_kmalloc(cachep, ret, size); trace_kmalloc(caller, ret, size, cachep->size, flags); @@ -4290,10 +4313,18 @@ module_init(slab_proc_init); */ size_t ksize(const void *objp) { + size_t size; + BUG_ON(!objp); if (unlikely(objp == ZERO_SIZE_PTR)) return 0; - return virt_to_cache(objp)->object_size; + size = virt_to_cache(objp)->object_size; + /* We assume that ksize callers could use the whole allocated area, + * so we need to unpoison this area. 
+ */ + kasan_krealloc(objp, size); + + return size; } EXPORT_SYMBOL(ksize); diff --git a/mm/slab_common.c b/mm/slab_common.c index b2e379639a5b..4de72e220c82 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -35,7 +35,7 @@ struct kmem_cache *kmem_cache; */ #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ - SLAB_FAILSLAB) + SLAB_FAILSLAB | SLAB_KASAN) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ SLAB_NOTRACK | SLAB_ACCOUNT) -- cgit v1.2.3 From 505f5dcb1c419e55a9621a01f83eb5745d8d7398 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 25 Mar 2016 14:22:02 -0700 Subject: mm, kasan: add GFP flags to KASAN API Add GFP flags to KASAN hooks for future patches to use. This patch is based on the "mm: kasan: unified support for SLUB and SLAB allocators" patch originally prepared by Dmitry Chernenkov. Signed-off-by: Alexander Potapenko Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Steven Rostedt Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 19 +++++++++++-------- include/linux/slab.h | 4 ++-- mm/kasan/kasan.c | 15 ++++++++------- mm/mempool.c | 16 ++++++++-------- mm/slab.c | 15 ++++++++------- mm/slab.h | 2 +- mm/slab_common.c | 4 ++-- mm/slub.c | 15 ++++++++------- 8 files changed, 48 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 839f2007a0f9..737371b56044 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -55,13 +55,14 @@ void kasan_poison_slab(struct page *page); void kasan_unpoison_object_data(struct kmem_cache *cache, void *object); void kasan_poison_object_data(struct kmem_cache *cache, void *object); -void kasan_kmalloc_large(const void *ptr, size_t size); +void 
kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags); void kasan_kfree_large(const void *ptr); void kasan_kfree(void *ptr); -void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size); -void kasan_krealloc(const void *object, size_t new_size); +void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size, + gfp_t flags); +void kasan_krealloc(const void *object, size_t new_size, gfp_t flags); -void kasan_slab_alloc(struct kmem_cache *s, void *object); +void kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags); void kasan_slab_free(struct kmem_cache *s, void *object); struct kasan_cache { @@ -94,14 +95,16 @@ static inline void kasan_unpoison_object_data(struct kmem_cache *cache, static inline void kasan_poison_object_data(struct kmem_cache *cache, void *object) {} -static inline void kasan_kmalloc_large(void *ptr, size_t size) {} +static inline void kasan_kmalloc_large(void *ptr, size_t size, gfp_t flags) {} static inline void kasan_kfree_large(const void *ptr) {} static inline void kasan_kfree(void *ptr) {} static inline void kasan_kmalloc(struct kmem_cache *s, const void *object, - size_t size) {} -static inline void kasan_krealloc(const void *object, size_t new_size) {} + size_t size, gfp_t flags) {} +static inline void kasan_krealloc(const void *object, size_t new_size, + gfp_t flags) {} -static inline void kasan_slab_alloc(struct kmem_cache *s, void *object) {} +static inline void kasan_slab_alloc(struct kmem_cache *s, void *object, + gfp_t flags) {} static inline void kasan_slab_free(struct kmem_cache *s, void *object) {} static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } diff --git a/include/linux/slab.h b/include/linux/slab.h index aa61595a1482..508bd827e6dc 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -376,7 +376,7 @@ static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s, { void *ret = kmem_cache_alloc(s, flags); - kasan_kmalloc(s, ret, size); + 
kasan_kmalloc(s, ret, size, flags); return ret; } @@ -387,7 +387,7 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s, { void *ret = kmem_cache_alloc_node(s, gfpflags, node); - kasan_kmalloc(s, ret, size); + kasan_kmalloc(s, ret, size, gfpflags); return ret; } #endif /* CONFIG_TRACING */ diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 7c82509ef169..cb998e0ec9d3 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -434,9 +434,9 @@ struct kasan_free_meta *get_free_info(struct kmem_cache *cache, } #endif -void kasan_slab_alloc(struct kmem_cache *cache, void *object) +void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) { - kasan_kmalloc(cache, object, cache->object_size); + kasan_kmalloc(cache, object, cache->object_size, flags); } void kasan_slab_free(struct kmem_cache *cache, void *object) @@ -462,7 +462,8 @@ void kasan_slab_free(struct kmem_cache *cache, void *object) kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); } -void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) +void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, + gfp_t flags) { unsigned long redzone_start; unsigned long redzone_end; @@ -491,7 +492,7 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) } EXPORT_SYMBOL(kasan_kmalloc); -void kasan_kmalloc_large(const void *ptr, size_t size) +void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) { struct page *page; unsigned long redzone_start; @@ -510,7 +511,7 @@ void kasan_kmalloc_large(const void *ptr, size_t size) KASAN_PAGE_REDZONE); } -void kasan_krealloc(const void *object, size_t size) +void kasan_krealloc(const void *object, size_t size, gfp_t flags) { struct page *page; @@ -520,9 +521,9 @@ void kasan_krealloc(const void *object, size_t size) page = virt_to_head_page(object); if (unlikely(!PageSlab(page))) - kasan_kmalloc_large(object, size); + kasan_kmalloc_large(object, size, flags); else - 
kasan_kmalloc(page->slab_cache, object, size); + kasan_kmalloc(page->slab_cache, object, size, flags); } void kasan_kfree(void *ptr) diff --git a/mm/mempool.c b/mm/mempool.c index 07c383ddbbab..9b7a14a791cc 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -112,12 +112,12 @@ static void kasan_poison_element(mempool_t *pool, void *element) kasan_free_pages(element, (unsigned long)pool->pool_data); } -static void kasan_unpoison_element(mempool_t *pool, void *element) +static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags) { if (pool->alloc == mempool_alloc_slab) - kasan_slab_alloc(pool->pool_data, element); + kasan_slab_alloc(pool->pool_data, element, flags); if (pool->alloc == mempool_kmalloc) - kasan_krealloc(element, (size_t)pool->pool_data); + kasan_krealloc(element, (size_t)pool->pool_data, flags); if (pool->alloc == mempool_alloc_pages) kasan_alloc_pages(element, (unsigned long)pool->pool_data); } @@ -130,12 +130,12 @@ static void add_element(mempool_t *pool, void *element) pool->elements[pool->curr_nr++] = element; } -static void *remove_element(mempool_t *pool) +static void *remove_element(mempool_t *pool, gfp_t flags) { void *element = pool->elements[--pool->curr_nr]; BUG_ON(pool->curr_nr < 0); - kasan_unpoison_element(pool, element); + kasan_unpoison_element(pool, element, flags); check_element(pool, element); return element; } @@ -154,7 +154,7 @@ void mempool_destroy(mempool_t *pool) return; while (pool->curr_nr) { - void *element = remove_element(pool); + void *element = remove_element(pool, GFP_KERNEL); pool->free(element, pool->pool_data); } kfree(pool->elements); @@ -250,7 +250,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr) spin_lock_irqsave(&pool->lock, flags); if (new_min_nr <= pool->min_nr) { while (new_min_nr < pool->curr_nr) { - element = remove_element(pool); + element = remove_element(pool, GFP_KERNEL); spin_unlock_irqrestore(&pool->lock, flags); pool->free(element, pool->pool_data); 
spin_lock_irqsave(&pool->lock, flags); @@ -347,7 +347,7 @@ repeat_alloc: spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr)) { - element = remove_element(pool); + element = remove_element(pool, gfp_temp); spin_unlock_irqrestore(&pool->lock, flags); /* paired with rmb in mempool_free(), read comment there */ smp_wmb(); diff --git a/mm/slab.c b/mm/slab.c index 7515578471d8..17e2848979c5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3378,7 +3378,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *ret = slab_alloc(cachep, flags, _RET_IP_); - kasan_slab_alloc(cachep, ret); + kasan_slab_alloc(cachep, ret, flags); trace_kmem_cache_alloc(_RET_IP_, ret, cachep->object_size, cachep->size, flags); @@ -3444,7 +3444,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) ret = slab_alloc(cachep, flags, _RET_IP_); - kasan_kmalloc(cachep, ret, size); + kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(_RET_IP_, ret, size, cachep->size, flags); return ret; @@ -3468,7 +3468,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); - kasan_slab_alloc(cachep, ret); + kasan_slab_alloc(cachep, ret, flags); trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep->object_size, cachep->size, flags, nodeid); @@ -3486,7 +3486,8 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, void *ret; ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); - kasan_kmalloc(cachep, ret, size); + + kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc_node(_RET_IP_, ret, size, cachep->size, flags, nodeid); @@ -3505,7 +3506,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; ret = kmem_cache_alloc_node_trace(cachep, flags, node, size); - kasan_kmalloc(cachep, ret, size); + kasan_kmalloc(cachep, ret, size, flags); return ret; } @@ -3541,7 +3542,7 @@ static 
__always_inline void *__do_kmalloc(size_t size, gfp_t flags, return cachep; ret = slab_alloc(cachep, flags, caller); - kasan_kmalloc(cachep, ret, size); + kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(caller, ret, size, cachep->size, flags); @@ -4323,7 +4324,7 @@ size_t ksize(const void *objp) /* We assume that ksize callers could use the whole allocated area, * so we need to unpoison this area. */ - kasan_krealloc(objp, size); + kasan_krealloc(objp, size, GFP_NOWAIT); return size; } diff --git a/mm/slab.h b/mm/slab.h index ff39a8fc3b3f..5969769fbee6 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -405,7 +405,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); - kasan_slab_alloc(s, object); + kasan_slab_alloc(s, object, flags); } memcg_kmem_put_cache(s); } diff --git a/mm/slab_common.c b/mm/slab_common.c index 4de72e220c82..3239bfd758e6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1013,7 +1013,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) page = alloc_kmem_pages(flags, order); ret = page ? 
page_address(page) : NULL; kmemleak_alloc(ret, size, 1, flags); - kasan_kmalloc_large(ret, size); + kasan_kmalloc_large(ret, size, flags); return ret; } EXPORT_SYMBOL(kmalloc_order); @@ -1192,7 +1192,7 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, ks = ksize(p); if (ks >= new_size) { - kasan_krealloc((void *)p, new_size); + kasan_krealloc((void *)p, new_size, flags); return (void *)p; } diff --git a/mm/slub.c b/mm/slub.c index 7277413ebc8b..4dbb109eb8cd 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1313,7 +1313,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) { kmemleak_alloc(ptr, size, 1, flags); - kasan_kmalloc_large(ptr, size); + kasan_kmalloc_large(ptr, size, flags); } static inline void kfree_hook(const void *x) @@ -2596,7 +2596,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { void *ret = slab_alloc(s, gfpflags, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); - kasan_kmalloc(s, ret, size); + kasan_kmalloc(s, ret, size, gfpflags); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_trace); @@ -2624,7 +2624,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, trace_kmalloc_node(_RET_IP_, ret, size, s->size, gfpflags, node); - kasan_kmalloc(s, ret, size); + kasan_kmalloc(s, ret, size, gfpflags); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node_trace); @@ -3182,7 +3182,8 @@ static void early_kmem_cache_node_alloc(int node) init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif - kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node)); + kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), + GFP_KERNEL); init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, page->objects); @@ -3561,7 +3562,7 @@ void *__kmalloc(size_t size, gfp_t flags) trace_kmalloc(_RET_IP_, ret, size, s->size, flags); - kasan_kmalloc(s, ret, 
size); + kasan_kmalloc(s, ret, size, flags); return ret; } @@ -3606,7 +3607,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); - kasan_kmalloc(s, ret, size); + kasan_kmalloc(s, ret, size, flags); return ret; } @@ -3635,7 +3636,7 @@ size_t ksize(const void *object) size_t size = __ksize(object); /* We assume that ksize callers could use whole allocated area, so we need unpoison this area. */ - kasan_krealloc(object, size); + kasan_krealloc(object, size, GFP_NOWAIT); return size; } EXPORT_SYMBOL(ksize); -- cgit v1.2.3 From be7635e7287e0e8013af3c89a6354a9e0182594c Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 25 Mar 2016 14:22:05 -0700 Subject: arch, ftrace: for KASAN put hard/soft IRQ entries into separate sections KASAN needs to know whether the allocation happens in an IRQ handler. This lets us strip everything below the IRQ entry point to reduce the number of unique stack traces needed to be stored. Move the definition of __irq_entry to <linux/interrupt.h> so that the users don't need to pull in <linux/ftrace.h>. Also introduce the __softirq_entry macro which is similar to __irq_entry, but puts the corresponding functions to the .softirqentry.text section.
Signed-off-by: Alexander Potapenko Acked-by: Steven Rostedt Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/exception.h | 2 +- arch/arm/kernel/vmlinux.lds.S | 1 + arch/arm64/include/asm/exception.h | 2 +- arch/arm64/kernel/vmlinux.lds.S | 1 + arch/blackfin/kernel/vmlinux.lds.S | 1 + arch/c6x/kernel/vmlinux.lds.S | 1 + arch/metag/kernel/vmlinux.lds.S | 1 + arch/microblaze/kernel/vmlinux.lds.S | 1 + arch/mips/kernel/vmlinux.lds.S | 1 + arch/nios2/kernel/vmlinux.lds.S | 1 + arch/openrisc/kernel/vmlinux.lds.S | 1 + arch/parisc/kernel/vmlinux.lds.S | 1 + arch/powerpc/kernel/vmlinux.lds.S | 1 + arch/s390/kernel/vmlinux.lds.S | 1 + arch/sh/kernel/vmlinux.lds.S | 1 + arch/sparc/kernel/vmlinux.lds.S | 1 + arch/tile/kernel/vmlinux.lds.S | 1 + arch/x86/kernel/vmlinux.lds.S | 1 + include/asm-generic/vmlinux.lds.h | 12 +++++++++++- include/linux/ftrace.h | 11 ----------- include/linux/interrupt.h | 20 ++++++++++++++++++++ kernel/softirq.c | 2 +- kernel/trace/trace_functions_graph.c | 1 + 23 files changed, 51 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/include/asm/exception.h b/arch/arm/include/asm/exception.h index 5abaf5bbd985..bf1991263d2d 100644 --- a/arch/arm/include/asm/exception.h +++ b/arch/arm/include/asm/exception.h @@ -7,7 +7,7 @@ #ifndef __ASM_ARM_EXCEPTION_H #define __ASM_ARM_EXCEPTION_H -#include <linux/ftrace.h> +#include <linux/interrupt.h> #define __exception __attribute__((section(".exception.text"))) #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 1fab979daeaf..e2c6da096cef 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -108,6 +108,7 @@ SECTIONS *(.exception.text) __exception_text_end = .; IRQENTRY_TEXT + SOFTIRQENTRY_TEXT TEXT_TEXT SCHED_TEXT
LOCK_TEXT diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index 6cb7e1a6bc02..0c2eec490abf 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -18,7 +18,7 @@ #ifndef __ASM_EXCEPTION_H #define __ASM_EXCEPTION_H -#include +#include #define __exception __attribute__((section(".exception.text"))) #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 37f624df68fa..5a1939a74ff3 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -103,6 +103,7 @@ SECTIONS *(.exception.text) __exception_text_end = .; IRQENTRY_TEXT + SOFTIRQENTRY_TEXT TEXT_TEXT SCHED_TEXT LOCK_TEXT diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S index c9eec84aa258..d920b959ff3a 100644 --- a/arch/blackfin/kernel/vmlinux.lds.S +++ b/arch/blackfin/kernel/vmlinux.lds.S @@ -35,6 +35,7 @@ SECTIONS #endif LOCK_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT KPROBES_TEXT #ifdef CONFIG_ROMKERNEL __sinittext = .; diff --git a/arch/c6x/kernel/vmlinux.lds.S b/arch/c6x/kernel/vmlinux.lds.S index 5a6e141d1641..50bc10f97bcb 100644 --- a/arch/c6x/kernel/vmlinux.lds.S +++ b/arch/c6x/kernel/vmlinux.lds.S @@ -72,6 +72,7 @@ SECTIONS SCHED_TEXT LOCK_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT KPROBES_TEXT *(.fixup) *(.gnu.warning) diff --git a/arch/metag/kernel/vmlinux.lds.S b/arch/metag/kernel/vmlinux.lds.S index e12055e88bfe..150ace92c7ad 100644 --- a/arch/metag/kernel/vmlinux.lds.S +++ b/arch/metag/kernel/vmlinux.lds.S @@ -24,6 +24,7 @@ SECTIONS LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.text.*) *(.gnu.warning) } diff --git a/arch/microblaze/kernel/vmlinux.lds.S b/arch/microblaze/kernel/vmlinux.lds.S index be9488d69734..0a47f0410554 100644 --- a/arch/microblaze/kernel/vmlinux.lds.S +++ b/arch/microblaze/kernel/vmlinux.lds.S @@ -36,6 +36,7 @@ SECTIONS { LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT . 
= ALIGN (4) ; _etext = . ; } diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index 0a93e83cd014..54d653ee17e1 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -58,6 +58,7 @@ SECTIONS LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.text.*) *(.fixup) *(.gnu.warning) diff --git a/arch/nios2/kernel/vmlinux.lds.S b/arch/nios2/kernel/vmlinux.lds.S index 326fab40a9de..e23e89539967 100644 --- a/arch/nios2/kernel/vmlinux.lds.S +++ b/arch/nios2/kernel/vmlinux.lds.S @@ -39,6 +39,7 @@ SECTIONS SCHED_TEXT LOCK_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT KPROBES_TEXT } =0 _etext = .; diff --git a/arch/openrisc/kernel/vmlinux.lds.S b/arch/openrisc/kernel/vmlinux.lds.S index 2d69a853b742..d936de4c07ca 100644 --- a/arch/openrisc/kernel/vmlinux.lds.S +++ b/arch/openrisc/kernel/vmlinux.lds.S @@ -50,6 +50,7 @@ SECTIONS LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.fixup) *(.text.__*) _etext = .; diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index 308f29081d46..f3ead0b6ce46 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -72,6 +72,7 @@ SECTIONS LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.text.do_softirq) *(.text.sys_exit) *(.text.do_sigaltstack) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index d41fd0af8980..2dd91f79de05 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -55,6 +55,7 @@ SECTIONS LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT #ifdef CONFIG_PPC32 *(.got1) diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 445657fe658c..0f41a8286378 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -28,6 +28,7 @@ SECTIONS LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.fixup) *(.gnu.warning) } :text = 0x0700 diff --git a/arch/sh/kernel/vmlinux.lds.S 
b/arch/sh/kernel/vmlinux.lds.S index db88cbf9eafd..235a4101999f 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -39,6 +39,7 @@ SECTIONS LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.fixup) *(.gnu.warning) _etext = .; /* End of text section */ diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S index f1a2f688b28a..aadd321aa05d 100644 --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -48,6 +48,7 @@ SECTIONS LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.gnu.warning) } = 0 _etext = .; diff --git a/arch/tile/kernel/vmlinux.lds.S b/arch/tile/kernel/vmlinux.lds.S index 0e059a0101ea..378f5d8d1ec8 100644 --- a/arch/tile/kernel/vmlinux.lds.S +++ b/arch/tile/kernel/vmlinux.lds.S @@ -45,6 +45,7 @@ SECTIONS LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT __fix_text_end = .; /* tile-cpack won't rearrange before this */ ALIGN_FUNCTION(); *(.hottext*) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index d239639e0c1d..4c941f88d405 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -101,6 +101,7 @@ SECTIONS KPROBES_TEXT ENTRY_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.fixup) *(.gnu.warning) /* End of text section */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 8f5a12ab2f2b..339125bb4d2c 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -456,7 +456,7 @@ *(.entry.text) \ VMLINUX_SYMBOL(__entry_text_end) = .; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) #define IRQENTRY_TEXT \ ALIGN_FUNCTION(); \ VMLINUX_SYMBOL(__irqentry_text_start) = .; \ @@ -466,6 +466,16 @@ #define IRQENTRY_TEXT #endif +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) +#define SOFTIRQENTRY_TEXT \ + ALIGN_FUNCTION(); \ + VMLINUX_SYMBOL(__softirqentry_text_start) = .; \ + 
*(.softirqentry.text) \ + VMLINUX_SYMBOL(__softirqentry_text_end) = .; +#else +#define SOFTIRQENTRY_TEXT +#endif + /* Section used for early init (in .S files) */ #define HEAD_TEXT *(.head.text) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 6d9df3f7e334..dea12a6e413b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -811,16 +811,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, */ #define __notrace_funcgraph notrace -/* - * We want to which function is an entrypoint of a hardirq. - * That will help us to put a signal on output. - */ -#define __irq_entry __attribute__((__section__(".irqentry.text"))) - -/* Limits of hardirq entrypoints */ -extern char __irqentry_text_start[]; -extern char __irqentry_text_end[]; - #define FTRACE_NOTRACE_DEPTH 65536 #define FTRACE_RETFUNC_DEPTH 50 #define FTRACE_RETSTACK_ALLOC_SIZE 32 @@ -857,7 +847,6 @@ static inline void unpause_graph_tracing(void) #else /* !CONFIG_FUNCTION_GRAPH_TRACER */ #define __notrace_funcgraph -#define __irq_entry #define INIT_FTRACE_GRAPH static inline void ftrace_graph_init_task(struct task_struct *t) { } diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 358076eda364..9fcabeb07787 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -683,4 +683,24 @@ extern int early_irq_init(void); extern int arch_probe_nr_irqs(void); extern int arch_early_irq_init(void); +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) +/* + * We want to know which function is an entrypoint of a hardirq or a softirq. 
+ */ +#define __irq_entry __attribute__((__section__(".irqentry.text"))) +#define __softirq_entry \ + __attribute__((__section__(".softirqentry.text"))) + +/* Limits of hardirq entrypoints */ +extern char __irqentry_text_start[]; +extern char __irqentry_text_end[]; +/* Limits of softirq entrypoints */ +extern char __softirqentry_text_start[]; +extern char __softirqentry_text_end[]; + +#else +#define __irq_entry +#define __softirq_entry +#endif + #endif diff --git a/kernel/softirq.c b/kernel/softirq.c index 8aae49dd7da8..17caf4b63342 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -227,7 +227,7 @@ static inline bool lockdep_softirq_start(void) { return false; } static inline void lockdep_softirq_end(bool in_hardirq) { } #endif -asmlinkage __visible void __do_softirq(void) +asmlinkage __visible void __softirq_entry __do_softirq(void) { unsigned long end = jiffies + MAX_SOFTIRQ_TIME; unsigned long old_flags = current->flags; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 91d6a63a2ea7..3a0244ff7ea8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -8,6 +8,7 @@ */ #include #include +#include #include #include -- cgit v1.2.3 From cd11016e5f5212c13c0cec7384a525edc93b4921 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 25 Mar 2016 14:22:08 -0700 Subject: mm, kasan: stackdepot implementation. Enable stackdepot for SLAB Implement the stack depot and provide CONFIG_STACKDEPOT. Stack depot will allow KASAN to store allocation/deallocation stack traces for memory chunks. The stack traces are stored in a hash table and referenced by handles which reside in the kasan_alloc_meta and kasan_free_meta structures in the allocated memory chunks. IRQ stack traces are cut below the IRQ entry point to avoid unnecessary duplication. Right now stackdepot support is only enabled in the SLAB allocator.
Once KASAN features in SLAB are on par with those in SLUB we can switch SLUB to stackdepot as well, thus removing the dependency on SLUB stack bookkeeping, which wastes a lot of memory. This patch is based on the "mm: kasan: stack depots" patch originally prepared by Dmitry Chernenkov. Joonsoo has said that he plans to reuse the stackdepot code for the mm/page_owner.c debugging facility. [akpm@linux-foundation.org: s/depot_stack_handle/depot_stack_handle_t] [aryabinin@virtuozzo.com: comment style fixes] Signed-off-by: Alexander Potapenko Signed-off-by: Andrey Ryabinin Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Steven Rostedt Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/Makefile | 1 + include/linux/stackdepot.h | 32 +++++ lib/Kconfig | 4 + lib/Kconfig.kasan | 1 + lib/Makefile | 3 + lib/stackdepot.c | 284 +++++++++++++++++++++++++++++++++++++++++++++ mm/kasan/kasan.c | 55 ++++++++- mm/kasan/kasan.h | 11 +- mm/kasan/report.c | 12 +- 9 files changed, 391 insertions(+), 12 deletions(-) create mode 100644 include/linux/stackdepot.h create mode 100644 lib/stackdepot.c (limited to 'include/linux') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index adaae2c781c1..616ebd22ef9a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -19,6 +19,7 @@ endif KASAN_SANITIZE_head$(BITS).o := n KASAN_SANITIZE_dumpstack.o := n KASAN_SANITIZE_dumpstack_$(BITS).o := n +KASAN_SANITIZE_stacktrace.o := n OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h new file mode 100644 index 000000000000..7978b3e2c1e1 --- /dev/null +++ b/include/linux/stackdepot.h @@ -0,0 +1,32 @@ +/* + * A generic stack depot implementation + * + * Author: Alexander Potapenko + * Copyright (C) 2016 Google, 
Inc. + * + * Based on code by Dmitry Chernenkov. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef _LINUX_STACKDEPOT_H +#define _LINUX_STACKDEPOT_H + +typedef u32 depot_stack_handle_t; + +struct stack_trace; + +depot_stack_handle_t depot_save_stack(struct stack_trace *trace, gfp_t flags); + +void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace); + +#endif diff --git a/lib/Kconfig b/lib/Kconfig index 133ebc0c1773..3cca1222578e 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -536,4 +536,8 @@ config ARCH_HAS_PMEM_API config ARCH_HAS_MMIO_FLUSH bool +config STACKDEPOT + bool + select STACKTRACE + endmenu diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 0e4d2b3b0aee..67d8c6838ba9 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -7,6 +7,7 @@ config KASAN bool "KASan: runtime memory debugger" depends on SLUB_DEBUG || (SLAB && !DEBUG_SLAB) select CONSTRUCTORS + select STACKDEPOT if SLAB help Enables kernel address sanitizer - runtime memory debugger, designed to find out-of-bounds accesses and use-after-free bugs. 
diff --git a/lib/Makefile b/lib/Makefile index a1de5b61ff40..7bd6fd436c97 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -181,6 +181,9 @@ obj-$(CONFIG_SG_SPLIT) += sg_split.o obj-$(CONFIG_STMP_DEVICE) += stmp_device.o obj-$(CONFIG_IRQ_POLL) += irq_poll.o +obj-$(CONFIG_STACKDEPOT) += stackdepot.o +KASAN_SANITIZE_stackdepot.o := n + libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \ fdt_empty_tree.o $(foreach file, $(libfdt_files), \ diff --git a/lib/stackdepot.c b/lib/stackdepot.c new file mode 100644 index 000000000000..654c9d87e83a --- /dev/null +++ b/lib/stackdepot.c @@ -0,0 +1,284 @@ +/* + * Generic stack depot for storing stack traces. + * + * Some debugging tools need to save stack traces of certain events which can + * be later presented to the user. For example, KASAN needs to save alloc and + * free stacks for each object, but storing two stack traces per object + * requires too much memory (e.g. SLUB_DEBUG needs 256 bytes per object for + * that). + * + * Instead, stack depot maintains a hashtable of unique stacktraces. Since alloc + * and free stacks repeat a lot, we save about 100x space. + * Stacks are never removed from depot, so we store them contiguously one after + * another in a contiguous memory allocation. + * + * Author: Alexander Potapenko + * Copyright (C) 2016 Google, Inc. + * + * Based on code by Dmitry Chernenkov. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details.
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEPOT_STACK_BITS (sizeof(depot_stack_handle_t) * 8) + +#define STACK_ALLOC_ORDER 2 /* 'Slab' size order for stack depot, 4 pages */ +#define STACK_ALLOC_SIZE (1LL << (PAGE_SHIFT + STACK_ALLOC_ORDER)) +#define STACK_ALLOC_ALIGN 4 +#define STACK_ALLOC_OFFSET_BITS (STACK_ALLOC_ORDER + PAGE_SHIFT - \ + STACK_ALLOC_ALIGN) +#define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - STACK_ALLOC_OFFSET_BITS) +#define STACK_ALLOC_SLABS_CAP 1024 +#define STACK_ALLOC_MAX_SLABS \ + (((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_SLABS_CAP) ? \ + (1LL << (STACK_ALLOC_INDEX_BITS)) : STACK_ALLOC_SLABS_CAP) + +/* The compact structure to store the reference to stacks. */ +union handle_parts { + depot_stack_handle_t handle; + struct { + u32 slabindex : STACK_ALLOC_INDEX_BITS; + u32 offset : STACK_ALLOC_OFFSET_BITS; + }; +}; + +struct stack_record { + struct stack_record *next; /* Link in the hashtable */ + u32 hash; /* Hash in the hashtable */ + u32 size; /* Number of frames in the stack */ + union handle_parts handle; + unsigned long entries[1]; /* Variable-sized array of entries. */ +}; + +static void *stack_slabs[STACK_ALLOC_MAX_SLABS]; + +static int depot_index; +static int next_slab_inited; +static size_t depot_offset; +static DEFINE_SPINLOCK(depot_lock); + +static bool init_stack_slab(void **prealloc) +{ + if (!*prealloc) + return false; + /* + * This smp_load_acquire() pairs with smp_store_release() to + * |next_slab_inited| below and in depot_alloc_stack(). + */ + if (smp_load_acquire(&next_slab_inited)) + return true; + if (stack_slabs[depot_index] == NULL) { + stack_slabs[depot_index] = *prealloc; + } else { + stack_slabs[depot_index + 1] = *prealloc; + /* + * This smp_store_release() pairs with smp_load_acquire() from + * |next_slab_inited| above and in depot_save_stack().
+ */ + smp_store_release(&next_slab_inited, 1); + } + *prealloc = NULL; + return true; +} + +/* Allocation of a new stack in raw storage */ +static struct stack_record *depot_alloc_stack(unsigned long *entries, int size, + u32 hash, void **prealloc, gfp_t alloc_flags) +{ + int required_size = offsetof(struct stack_record, entries) + + sizeof(unsigned long) * size; + struct stack_record *stack; + + required_size = ALIGN(required_size, 1 << STACK_ALLOC_ALIGN); + + if (unlikely(depot_offset + required_size > STACK_ALLOC_SIZE)) { + if (unlikely(depot_index + 1 >= STACK_ALLOC_MAX_SLABS)) { + WARN_ONCE(1, "Stack depot reached limit capacity"); + return NULL; + } + depot_index++; + depot_offset = 0; + /* + * smp_store_release() here pairs with smp_load_acquire() from + * |next_slab_inited| in depot_save_stack() and + * init_stack_slab(). + */ + if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) + smp_store_release(&next_slab_inited, 0); + } + init_stack_slab(prealloc); + if (stack_slabs[depot_index] == NULL) + return NULL; + + stack = stack_slabs[depot_index] + depot_offset; + + stack->hash = hash; + stack->size = size; + stack->handle.slabindex = depot_index; + stack->handle.offset = depot_offset >> STACK_ALLOC_ALIGN; + memcpy(stack->entries, entries, size * sizeof(unsigned long)); + depot_offset += required_size; + + return stack; +} + +#define STACK_HASH_ORDER 20 +#define STACK_HASH_SIZE (1L << STACK_HASH_ORDER) +#define STACK_HASH_MASK (STACK_HASH_SIZE - 1) +#define STACK_HASH_SEED 0x9747b28c + +static struct stack_record *stack_table[STACK_HASH_SIZE] = { + [0 ... 
STACK_HASH_SIZE - 1] = NULL +}; + +/* Calculate hash for a stack */ +static inline u32 hash_stack(unsigned long *entries, unsigned int size) +{ + return jhash2((u32 *)entries, + size * sizeof(unsigned long) / sizeof(u32), + STACK_HASH_SEED); +} + +/* Find a stack that is equal to the one stored in entries in the hash */ +static inline struct stack_record *find_stack(struct stack_record *bucket, + unsigned long *entries, int size, + u32 hash) +{ + struct stack_record *found; + + for (found = bucket; found; found = found->next) { + if (found->hash == hash && + found->size == size && + !memcmp(entries, found->entries, + size * sizeof(unsigned long))) { + return found; + } + } + return NULL; +} + +void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace) +{ + union handle_parts parts = { .handle = handle }; + void *slab = stack_slabs[parts.slabindex]; + size_t offset = parts.offset << STACK_ALLOC_ALIGN; + struct stack_record *stack = slab + offset; + + trace->nr_entries = trace->max_entries = stack->size; + trace->entries = stack->entries; + trace->skip = 0; +} + +/** + * depot_save_stack - save stack in a stack depot. + * @trace - the stacktrace to save. + * @alloc_flags - flags for allocating additional memory if required. + * + * Returns the handle of the stack struct stored in depot. + */ +depot_stack_handle_t depot_save_stack(struct stack_trace *trace, + gfp_t alloc_flags) +{ + u32 hash; + depot_stack_handle_t retval = 0; + struct stack_record *found = NULL, **bucket; + unsigned long flags; + struct page *page = NULL; + void *prealloc = NULL; + + if (unlikely(trace->nr_entries == 0)) + goto fast_exit; + + hash = hash_stack(trace->entries, trace->nr_entries); + /* Bad luck, we won't store this stack. */ + if (hash == 0) + goto exit; + + bucket = &stack_table[hash & STACK_HASH_MASK]; + + /* + * Fast path: look the stack trace up without locking. + * The smp_load_acquire() here pairs with smp_store_release() to + * |bucket| below. 
+ */ + found = find_stack(smp_load_acquire(bucket), trace->entries, + trace->nr_entries, hash); + if (found) + goto exit; + + /* + * Check if the current or the next stack slab need to be initialized. + * If so, allocate the memory - we won't be able to do that under the + * lock. + * + * The smp_load_acquire() here pairs with smp_store_release() to + * |next_slab_inited| in depot_alloc_stack() and init_stack_slab(). + */ + if (unlikely(!smp_load_acquire(&next_slab_inited))) { + /* + * Zero out zone modifiers, as we don't have specific zone + * requirements. Keep the flags related to allocation in atomic + * contexts and I/O. + */ + alloc_flags &= ~GFP_ZONEMASK; + alloc_flags &= (GFP_ATOMIC | GFP_KERNEL); + page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER); + if (page) + prealloc = page_address(page); + } + + spin_lock_irqsave(&depot_lock, flags); + + found = find_stack(*bucket, trace->entries, trace->nr_entries, hash); + if (!found) { + struct stack_record *new = + depot_alloc_stack(trace->entries, trace->nr_entries, + hash, &prealloc, alloc_flags); + if (new) { + new->next = *bucket; + /* + * This smp_store_release() pairs with + * smp_load_acquire() from |bucket| above. + */ + smp_store_release(bucket, new); + found = new; + } + } else if (prealloc) { + /* + * We didn't need to store this stack trace, but let's keep + * the preallocated memory for the future. + */ + WARN_ON(!init_stack_slab(&prealloc)); + } + + spin_unlock_irqrestore(&depot_lock, flags); +exit: + if (prealloc) { + /* Nobody used this memory, ok to free it. 
*/ + free_pages((unsigned long)prealloc, STACK_ALLOC_ORDER); + } + if (found) + retval = found->handle.handle; +fast_exit: + return retval; +} diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index cb998e0ec9d3..acb3b6c4dd89 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -17,7 +17,9 @@ #define DISABLE_BRANCH_PROFILING #include +#include #include +#include #include #include #include @@ -32,7 +34,6 @@ #include #include #include -#include #include "kasan.h" #include "../slab.h" @@ -413,23 +414,65 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object) #endif } -static inline void set_track(struct kasan_track *track) +#ifdef CONFIG_SLAB +static inline int in_irqentry_text(unsigned long ptr) +{ + return (ptr >= (unsigned long)&__irqentry_text_start && + ptr < (unsigned long)&__irqentry_text_end) || + (ptr >= (unsigned long)&__softirqentry_text_start && + ptr < (unsigned long)&__softirqentry_text_end); +} + +static inline void filter_irq_stacks(struct stack_trace *trace) +{ + int i; + + if (!trace->nr_entries) + return; + for (i = 0; i < trace->nr_entries; i++) + if (in_irqentry_text(trace->entries[i])) { + /* Include the irqentry function into the stack. 
*/ + trace->nr_entries = i + 1; + break; + } +} + +static inline depot_stack_handle_t save_stack(gfp_t flags) +{ + unsigned long entries[KASAN_STACK_DEPTH]; + struct stack_trace trace = { + .nr_entries = 0, + .entries = entries, + .max_entries = KASAN_STACK_DEPTH, + .skip = 0 + }; + + save_stack_trace(&trace); + filter_irq_stacks(&trace); + if (trace.nr_entries != 0 && + trace.entries[trace.nr_entries-1] == ULONG_MAX) + trace.nr_entries--; + + return depot_save_stack(&trace, flags); +} + +static inline void set_track(struct kasan_track *track, gfp_t flags) { - track->cpu = raw_smp_processor_id(); track->pid = current->pid; - track->when = jiffies; + track->stack = save_stack(flags); } -#ifdef CONFIG_SLAB struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, const void *object) { + BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32); return (void *)object + cache->kasan_info.alloc_meta_offset; } struct kasan_free_meta *get_free_info(struct kmem_cache *cache, const void *object) { + BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); return (void *)object + cache->kasan_info.free_meta_offset; } #endif @@ -486,7 +529,7 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, alloc_info->state = KASAN_STATE_ALLOC; alloc_info->alloc_size = size; - set_track(&alloc_info->track); + set_track(&alloc_info->track, flags); } #endif } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 7b9e4ab9b66b..30a2f0ba0e09 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -2,6 +2,7 @@ #define __MM_KASAN_KASAN_H #include +#include #define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) @@ -64,16 +65,18 @@ enum kasan_state { KASAN_STATE_FREE }; +#define KASAN_STACK_DEPTH 64 + struct kasan_track { - u64 cpu : 6; /* for NR_CPUS = 64 */ - u64 pid : 16; /* 65536 processes */ - u64 when : 42; /* ~140 years */ + u32 pid; + depot_stack_handle_t stack; }; struct kasan_alloc_meta { + struct 
kasan_track track; u32 state : 2; /* enum kasan_state */ u32 alloc_size : 30; - struct kasan_track track; + u32 reserved; }; struct kasan_free_meta { diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 3e3385cc97ac..60869a5a0124 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -118,8 +119,15 @@ static inline bool init_task_stack_addr(const void *addr) #ifdef CONFIG_SLAB static void print_track(struct kasan_track *track) { - pr_err("PID = %u, CPU = %u, timestamp = %lu\n", track->pid, - track->cpu, (unsigned long)track->when); + pr_err("PID = %u\n", track->pid); + if (track->stack) { + struct stack_trace trace; + + depot_fetch_stack(track->stack, &trace); + print_stack_trace(&trace, 0); + } else { + pr_err("(stack is not available)\n"); + } } static void object_err(struct kmem_cache *cache, struct page *page, -- cgit v1.2.3