From 549bfc14270681cd776c6d9b78fe544cbd21673a Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Date: Thu, 23 Jul 2020 21:15:11 -0700
Subject: mm/mmap.c: close race between munmap() and expand_upwards()/downwards()

commit 246c320a8cfe0b11d81a4af38fa9985ef0cc9a4c upstream.

VMAs with the VM_GROWSDOWN or VM_GROWSUP flag set can change their size
under mmap_read_lock().  It can lead to a race with __do_munmap():

  Thread A                         Thread B
  __do_munmap()
    detach_vmas_to_be_unmapped()
    mmap_write_downgrade()
                                   expand_downwards()
                                     vma->vm_start = address;
                                     // The VMA now overlaps with
                                     // VMAs detached by Thread A
                                   // page fault populates expanded part
                                   // of the VMA
    unmap_region()
      // Zaps pagetables partly
      // populated by Thread B

A similar race exists for expand_upwards().

The fix is to avoid downgrading mmap_lock in __do_munmap() if the
detached VMAs are next to a VM_GROWSDOWN or VM_GROWSUP VMA.

[akpm@linux-foundation.org: s/mmap_sem/mmap_lock/ in comment]
Fixes: dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap")
Reported-by: Jann Horn
Signed-off-by: Kirill A. Shutemov
Signed-off-by: Andrew Morton
Reviewed-by: Yang Shi
Acked-by: Vlastimil Babka
Cc: Oleg Nesterov
Cc: Matthew Wilcox
Cc: [4.20+]
Link: http://lkml.kernel.org/r/20200709105309.42495-1-kirill.shutemov@linux.intel.com
Signed-off-by: Linus Torvalds
Signed-off-by: Greg Kroah-Hartman
---
 mm/mmap.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 514cc19c5916..ea1ba2db4f4f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2622,7 +2622,7 @@ static void unmap_region(struct mm_struct *mm,
  * Create a list of vma's touched by the unmap, removing them from the mm's
  * vma list as we go..
  */
-static void
+static bool
 detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
         struct vm_area_struct *prev, unsigned long end)
 {
@@ -2647,6 +2647,17 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 
         /* Kill the cache */
         vmacache_invalidate(mm);
+
+        /*
+         * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
+         * VM_GROWSUP VMA. Such VMAs can change their size under
+         * down_read(mmap_lock) and collide with the VMA we are about to unmap.
+         */
+        if (vma && (vma->vm_flags & VM_GROWSDOWN))
+                return false;
+        if (prev && (prev->vm_flags & VM_GROWSUP))
+                return false;
+        return true;
 }
 
 /*
@@ -2827,7 +2838,8 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
         }
 
         /* Detach vmas from rbtree */
-        detach_vmas_to_be_unmapped(mm, vma, prev, end);
+        if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
+                downgrade = false;
 
         if (downgrade)
                 downgrade_write(&mm->mmap_sem);
--
cgit v1.2.3
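
For context, the window above can be exercised from userspace: one thread
keeps faulting just below a MAP_GROWSDOWN VMA so that the kernel calls
expand_downwards(), while another thread repeatedly unmaps and re-creates
the region underneath it. A minimal, hypothetical sketch (untested; sizes,
layout and names are illustrative, not the original report's reproducer):

/*
 * Hypothetical sketch of the race window. Thread B (toucher) drives
 * expand_downwards(); the main thread plays Thread A's __do_munmap().
 */
#include <pthread.h>
#include <sys/mman.h>

#define PAGE    4096UL
#define LOW_SZ  (16 * PAGE)
#define TOP_SZ  (16 * PAGE)

static char *low;       /* region that keeps being munmap()ed */
static char *top;       /* bottom of the MAP_GROWSDOWN VMA above it */

static void *toucher(void *arg)
{
        for (;;)        /* fault below "top": may trigger expand_downwards() */
                *(volatile char *)(top - PAGE) = 1;
        return NULL;
}

int main(void)
{
        pthread_t t;

        /* Reserve one contiguous range, then carve it in two. */
        low = mmap(NULL, LOW_SZ + TOP_SZ, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        top = low + LOW_SZ;
        mmap(top, TOP_SZ, PROT_READ | PROT_WRITE,
             MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED | MAP_GROWSDOWN, -1, 0);

        pthread_create(&t, NULL, toucher, NULL);

        for (;;) {      /* Thread A: detach, downgrade, zap */
                munmap(low, LOW_SZ);
                mmap(low, LOW_SZ, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
        }
}

Before the fix, the expansion could land inside the range __do_munmap() had
already detached, and unmap_region() would then zap page tables that the
concurrent fault had just populated.
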
From db949f60d98317bafaf3041976c9e63e600af793 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Thu, 23 Jul 2020 21:15:24 -0700
Subject: mm/memcg: fix refcount error while moving and swapping

commit 8d22a9351035ef2ff12ef163a1091b8b8cf1e49c upstream.

It was hard to keep a test running, moving tasks between memcgs with
move_charge_at_immigrate, while swapping: mem_cgroup_id_get_many()'s
refcount is discovered to be 0 (supposedly impossible), so it is then
forced to REFCOUNT_SATURATED, and after thousands of warnings in quick
succession, the test is at last put out of misery by being OOM killed.

This is because of the way moved_swap accounting was saved up until the
task move gets completed in __mem_cgroup_clear_mc(), deferred from when
mem_cgroup_move_swap_account() actually exchanged old and new ids.
Concurrent activity can free up swap quicker than the task is scanned,
bringing the id refcount down to 0 (which should only be possible when
offlining).

Just skip that optimization: do that part of the accounting immediately.

Fixes: 615d66c37c75 ("mm: memcontrol: fix memcg id ref counter on swap charge move")
Signed-off-by: Hugh Dickins
Signed-off-by: Andrew Morton
Reviewed-by: Alex Shi
Cc: Johannes Weiner
Cc: Alex Shi
Cc: Shakeel Butt
Cc: Michal Hocko
Cc:
Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2007071431050.4726@eggly.anvils
Signed-off-by: Linus Torvalds
Signed-off-by: Greg Kroah-Hartman
---
 mm/memcontrol.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a3f4c35bb5fa..402c8bc65e08 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5770,7 +5770,6 @@ static void __mem_cgroup_clear_mc(void)
                 if (!mem_cgroup_is_root(mc.to))
                         page_counter_uncharge(&mc.to->memory, mc.moved_swap);
 
-                mem_cgroup_id_get_many(mc.to, mc.moved_swap);
                 css_put_many(&mc.to->css, mc.moved_swap);
 
                 mc.moved_swap = 0;
@@ -5961,7 +5960,8 @@ put:                    /* get_mctgt_type() gets the page */
                         ent = target.ent;
                         if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
                                 mc.precharge--;
-                                /* we fixup refcnts and charges later. */
+                                mem_cgroup_id_get_many(mc.to, 1);
+                                /* we fixup other refcnts and charges later. */
                                 mc.moved_swap++;
                         }
                         break;
--
cgit v1.2.3
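
The kind of test the changelog describes can be sketched as follows,
assuming two cgroup-v1 memcgs A and B already exist under
/sys/fs/cgroup/memory (hypothetical paths, untested, error handling
omitted); a memory-hungry child should be swapping in the background while
the loop migrates it back and forth:

/*
 * Hypothetical sketch: bounce a swapping task between two memcgs with
 * move_charge_at_immigrate enabled.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

#define MEMCG   "/sys/fs/cgroup/memory"

static void write_file(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);

        if (fd >= 0) {
                write(fd, val, strlen(val));
                close(fd);
        }
}

int main(int argc, char **argv)
{
        const char *pid = argv[1];      /* the swapping workload's pid */

        if (argc < 2)
                return 1;

        /* Bit 0 moves anon charges (and their swap entries), bit 1 file. */
        write_file(MEMCG "/A/memory.move_charge_at_immigrate", "3");
        write_file(MEMCG "/B/memory.move_charge_at_immigrate", "3");

        for (;;) {
                write_file(MEMCG "/A/tasks", pid);
                write_file(MEMCG "/B/tasks", pid);
        }
}

With the fix, each swap entry moved takes its mc.to id reference right at
the mc.moved_swap++ site instead of being batched into
__mem_cgroup_clear_mc(), so concurrent swap-in can no longer drop the
refcount to zero before the deferred accounting runs.
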
From 95750e1edbcd9f9badc53ecc7059a5f40fdedfdc Mon Sep 17 00:00:00 2001
From: Muchun Song
Date: Thu, 23 Jul 2020 21:15:27 -0700
Subject: mm: memcg/slab: fix memory leak at non-root kmem_cache destroy

commit d38a2b7a9c939e6d7329ab92b96559ccebf7b135 upstream.

If the kmem_cache refcount is greater than one, we should not mark the
root kmem_cache as dying.  If we mark the root kmem_cache dying
incorrectly, the non-root kmem_cache can never be destroyed.  This
resulted in a memory leak when the memcg was destroyed.  We can use the
following steps to reproduce it.

1) Use kmem_cache_create() to create a new kmem_cache named A.
2) Coincidentally, kmem_cache A is an alias for an existing kmem_cache
   B, so only B's refcount is increased.
3) Use kmem_cache_destroy() to destroy kmem_cache A; this just
   decreases B's refcount, but still marks B as dying.
4) Create a new memory cgroup and allocate memory from kmem_cache B,
   which leads to the creation of a non-root kmem_cache.
5) When the memory cgroup created in step 4) is destroyed, the
   non-root kmem_cache can never be destroyed.

If we repeat steps 4) and 5), more and more memory leaks.  So mark the
root kmem_cache as dying only when its refcount reaches zero.

Fixes: 92ee383f6daa ("mm: fix race between kmem_cache destroy, create and deactivate")
Signed-off-by: Muchun Song
Signed-off-by: Andrew Morton
Reviewed-by: Shakeel Butt
Acked-by: Roman Gushchin
Cc: Vlastimil Babka
Cc: Christoph Lameter
Cc: Pekka Enberg
Cc: David Rientjes
Cc: Joonsoo Kim
Cc: Shakeel Butt
Cc:
Link: http://lkml.kernel.org/r/20200716165103.83462-1-songmuchun@bytedance.com
Signed-off-by: Linus Torvalds
Signed-off-by: Greg Kroah-Hartman
---
 mm/slab_common.c | 35 ++++++++++++++++++++++++++++-------
 1 file changed, 28 insertions(+), 7 deletions(-)

diff --git a/mm/slab_common.c b/mm/slab_common.c
index 8c1ffbf7de45..e36dd36c7076 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -326,6 +326,14 @@ int slab_unmergeable(struct kmem_cache *s)
         if (s->refcount < 0)
                 return 1;
 
+#ifdef CONFIG_MEMCG_KMEM
+        /*
+         * Skip the dying kmem_cache.
+         */
+        if (s->memcg_params.dying)
+                return 1;
+#endif
+
         return 0;
 }
 
@@ -886,12 +894,15 @@ static int shutdown_memcg_caches(struct kmem_cache *s)
         return 0;
 }
 
-static void flush_memcg_workqueue(struct kmem_cache *s)
+static void memcg_set_kmem_cache_dying(struct kmem_cache *s)
 {
         spin_lock_irq(&memcg_kmem_wq_lock);
         s->memcg_params.dying = true;
         spin_unlock_irq(&memcg_kmem_wq_lock);
+}
 
+static void flush_memcg_workqueue(struct kmem_cache *s)
+{
         /*
          * SLAB and SLUB deactivate the kmem_caches through call_rcu. Make
          * sure all registered rcu callbacks have been invoked.
@@ -923,10 +934,6 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s)
 {
         return 0;
 }
-
-static inline void flush_memcg_workqueue(struct kmem_cache *s)
-{
-}
 #endif /* CONFIG_MEMCG_KMEM */
 
 void slab_kmem_cache_release(struct kmem_cache *s)
@@ -944,8 +951,6 @@ void kmem_cache_destroy(struct kmem_cache *s)
         if (unlikely(!s))
                 return;
 
-        flush_memcg_workqueue(s);
-
         get_online_cpus();
         get_online_mems();
 
@@ -955,6 +960,22 @@ void kmem_cache_destroy(struct kmem_cache *s)
         if (s->refcount)
                 goto out_unlock;
 
+#ifdef CONFIG_MEMCG_KMEM
+        memcg_set_kmem_cache_dying(s);
+
+        mutex_unlock(&slab_mutex);
+
+        put_online_mems();
+        put_online_cpus();
+
+        flush_memcg_workqueue(s);
+
+        get_online_cpus();
+        get_online_mems();
+
+        mutex_lock(&slab_mutex);
+#endif
+
         err = shutdown_memcg_caches(s);
         if (!err)
                 err = shutdown_cache(s);
--
cgit v1.2.3
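
Steps 1)-3) map onto a tiny module like the following (a hedged sketch,
untested; whether the create actually aliases an existing cache B depends
on slab merging being enabled and a cache with compatible size and flags
being present, so the name and size here are arbitrary):

/*
 * Hypothetical kernel-module sketch of the alias scenario.
 */
#include <linux/module.h>
#include <linux/slab.h>

static int __init alias_demo_init(void)
{
        struct kmem_cache *a;

        /*
         * Steps 1)/2): if this matches an existing cache B, slab
         * merging returns B itself with s->refcount incremented.
         */
        a = kmem_cache_create("A", 64, 0, SLAB_ACCOUNT, NULL);
        if (!a)
                return -ENOMEM;

        /*
         * Step 3): before the fix, this decremented B's refcount
         * (still nonzero) yet marked B dying, so per-memcg children
         * of B created afterwards could never be destroyed.
         */
        kmem_cache_destroy(a);
        return 0;
}

static void __exit alias_demo_exit(void)
{
}

module_init(alias_demo_init);
module_exit(alias_demo_exit);
MODULE_LICENSE("GPL");
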
From 40c5836b4a48ab1f08674db5171c7def505842ab Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Date: Thu, 23 Jul 2020 21:15:34 -0700
Subject: khugepaged: fix null-pointer dereference due to race

commit 594cced14ad3903166c8b091ff96adac7552f0b3 upstream.

khugepaged has to drop the mmap lock several times while collapsing a
page.  The situation can change while the lock is dropped, so we need
to re-validate that the VMA is still in place and the PMD is still
eligible for collapse.

But we miss one corner case: while collapsing anonymous pages, the VMA
could be replaced with a file VMA.  If the file VMA doesn't have any
private pages, we get a NULL pointer dereference:

  general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN
  KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007]
  anon_vma_lock_write include/linux/rmap.h:120 [inline]
  collapse_huge_page mm/khugepaged.c:1110 [inline]
  khugepaged_scan_pmd mm/khugepaged.c:1349 [inline]
  khugepaged_scan_mm_slot mm/khugepaged.c:2110 [inline]
  khugepaged_do_scan mm/khugepaged.c:2193 [inline]
  khugepaged+0x3bba/0x5a10 mm/khugepaged.c:2238

The fix is to make sure that the VMA is anonymous in
hugepage_vma_revalidate().  The helper is only used for collapsing
anonymous pages.

Fixes: 99cb0dbd47a1 ("mm,thp: add read-only THP support for (non-shmem) FS")
Reported-by: syzbot+ed318e8b790ca72c5ad0@syzkaller.appspotmail.com
Signed-off-by: Kirill A. Shutemov
Signed-off-by: Andrew Morton
Reviewed-by: David Hildenbrand
Acked-by: Yang Shi
Cc:
Link: http://lkml.kernel.org/r/20200722121439.44328-1-kirill.shutemov@linux.intel.com
Signed-off-by: Linus Torvalds
Signed-off-by: Greg Kroah-Hartman
---
 mm/khugepaged.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index f765475be359..5977f7824a9a 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -876,6 +876,9 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
                 return SCAN_ADDRESS_RANGE;
         if (!hugepage_vma_check(vma, vma->vm_flags))
                 return SCAN_VMA_CHECK;
+        /* Anon VMA expected */
+        if (!vma->anon_vma || vma->vm_ops)
+                return SCAN_VMA_CHECK;
         return 0;
 }
 
--
cgit v1.2.3

From 47e20933814feeb75543eb514437989cea53ec52 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Wed, 1 Apr 2020 21:04:40 -0700
Subject: mm/filemap.c: don't bother dropping mmap_sem for zero size readahead

commit 5c72feee3e45b40a3c96c7145ec422899d0e8964 upstream.

When handling a page fault, we drop mmap_sem to start async readahead
so that we don't block on IO submission with mmap_sem held.  However
there's no point in dropping mmap_sem when readahead is disabled.
Handle that case to avoid pointless dropping of mmap_sem and retrying
the fault.  This was actually reported to block mlockall(MCL_CURRENT)
indefinitely.

Fixes: 6b4c9f446981 ("filemap: drop the mmap_sem for all blocking operations")
Reported-by: Minchan Kim
Reported-by: Robert Stupp
Signed-off-by: Jan Kara
Signed-off-by: Andrew Morton
Reviewed-by: Josef Bacik
Reviewed-by: Minchan Kim
Link: http://lkml.kernel.org/r/20200212101356.30759-1-jack@suse.cz
Signed-off-by: Linus Torvalds
Cc: SeongJae Park
Signed-off-by: Greg Kroah-Hartman
---
 mm/filemap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 1f5731768222..18c1f5830074 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2438,7 +2438,7 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
         pgoff_t offset = vmf->pgoff;
 
         /* If we don't want any read-ahead, don't bother */
-        if (vmf->vma->vm_flags & VM_RAND_READ)
+        if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
                 return fpin;
         if (ra->mmap_miss > 0)
                 ra->mmap_miss--;
--
cgit v1.2.3
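
For the khugepaged fix above, the anonymous-to-file VMA replacement it
guards against can be sketched from userspace (hypothetical and untested;
hitting the window depends on khugepaged's timing, and the backing file
path and sizes are illustrative):

/*
 * Hypothetical sketch: alternate a PMD-sized range between an
 * anonymous VMA that khugepaged may pick for collapse and a file VMA
 * with no private pages (anon_vma == NULL).
 */
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define LEN     (2UL * 1024 * 1024)     /* one PMD-sized range */

int main(void)
{
        int fd = open("/tmp/backing", O_RDWR | O_CREAT, 0600);
        void *addr;

        ftruncate(fd, LEN);
        addr = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        for (;;) {
                /* Anonymous VMA, a candidate for collapse... */
                memset(addr, 1, LEN);
                madvise(addr, LEN, MADV_HUGEPAGE);
                /*
                 * ...replaced in place by a file VMA while khugepaged
                 * may already have dropped the mmap lock.
                 */
                mmap(addr, LEN, PROT_READ, MAP_PRIVATE | MAP_FIXED, fd, 0);
                /* Put the anonymous mapping back and retry. */
                mmap(addr, LEN, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
        }
}

hugepage_vma_revalidate() now rejects such a VMA (no anon_vma, or vm_ops
set) instead of letting collapse_huge_page() dereference a NULL anon_vma.
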
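The mlockall(MCL_CURRENT) hang that motivated the readahead fix above can
be pictured along these lines (hypothetical sketch, untested; the device
and file paths are illustrative). It assumes readahead was disabled on the
backing device first, e.g. with echo 0 > /sys/block/sda/queue/read_ahead_kb:

/*
 * Hypothetical illustration of the failure mode: with ra_pages == 0,
 * every major fault on the mapping used to drop mmap_sem for a
 * readahead that could never happen, and the fault was retried.
 */
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/mnt/data/bigfile", O_RDONLY);
        off_t len = lseek(fd, 0, SEEK_END);

        mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);

        /* Reported to block indefinitely before the fix. */
        mlockall(MCL_CURRENT);
        return 0;
}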