summaryrefslogtreecommitdiff
path: root/mm
AgeCommit message (Collapse)AuthorFilesLines
2023-03-22mm: teach mincore_hugetlb about pte markersJames Houghton1-1/+1
commit 63cf584203f3367c8b073d417c8e5cbbfc450506 upstream. By checking huge_pte_none(), we incorrectly classify PTE markers as "present". Instead, check huge_pte_none_mostly(), classifying PTE markers the same as if the PTE were completely blank. PTE markers, unlike other kinds of swap entries, don't reference any physical page and don't indicate that a physical page was mapped previously. As such, treat them as non-present for the sake of mincore(). Link: https://lkml.kernel.org/r/20230302222404.175303-1-jthoughton@google.com Fixes: 5c041f5d1f23 ("mm: teach core mm about pte markers") Signed-off-by: James Houghton <jthoughton@google.com> Acked-by: Peter Xu <peterx@redhat.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: James Houghton <jthoughton@google.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-03-22mm/userfaultfd: propagate uffd-wp bit when PTE-mapping the huge zeropageDavid Hildenbrand1-2/+4
commit 42b2af2c9b7eede8ef21d0943f84d135e21a32a3 upstream. Currently, we'd lose the userfaultfd-wp marker when PTE-mapping a huge zeropage, resulting in the next write faults in the PMD range not triggering uffd-wp events. Various actions (partial MADV_DONTNEED, partial mremap, partial munmap, partial mprotect) could trigger this. However, most importantly, un-protecting a single sub-page from the userfaultfd-wp handler when processing a uffd-wp event will PTE-map the shared huge zeropage and lose the uffd-wp bit for the remainder of the PMD. Let's properly propagate the uffd-wp bit to the PMDs. #define _GNU_SOURCE #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <stdbool.h> #include <inttypes.h> #include <fcntl.h> #include <unistd.h> #include <errno.h> #include <poll.h> #include <pthread.h> #include <sys/mman.h> #include <sys/syscall.h> #include <sys/ioctl.h> #include <linux/userfaultfd.h> static size_t pagesize; static int uffd; static volatile bool uffd_triggered; #define barrier() __asm__ __volatile__("": : :"memory") static void uffd_wp_range(char *start, size_t size, bool wp) { struct uffdio_writeprotect uffd_writeprotect; uffd_writeprotect.range.start = (unsigned long) start; uffd_writeprotect.range.len = size; if (wp) { uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP; } else { uffd_writeprotect.mode = 0; } if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) { fprintf(stderr, "UFFDIO_WRITEPROTECT failed: %d\n", errno); exit(1); } } static void *uffd_thread_fn(void *arg) { static struct uffd_msg msg; ssize_t nread; while (1) { struct pollfd pollfd; int nready; pollfd.fd = uffd; pollfd.events = POLLIN; nready = poll(&pollfd, 1, -1); if (nready == -1) { fprintf(stderr, "poll() failed: %d\n", errno); exit(1); } nread = read(uffd, &msg, sizeof(msg)); if (nread <= 0) continue; if (msg.event != UFFD_EVENT_PAGEFAULT || !(msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP)) { printf("FAIL: wrong uffd-wp event fired\n"); exit(1); } /* un-protect the single page. */ uffd_triggered = true; uffd_wp_range((char *)(uintptr_t)msg.arg.pagefault.address, pagesize, false); } return arg; } static int setup_uffd(char *map, size_t size) { struct uffdio_api uffdio_api; struct uffdio_register uffdio_register; pthread_t thread; uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); if (uffd < 0) { fprintf(stderr, "syscall() failed: %d\n", errno); return -errno; } uffdio_api.api = UFFD_API; uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP; if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) { fprintf(stderr, "UFFDIO_API failed: %d\n", errno); return -errno; } if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) { fprintf(stderr, "UFFD_FEATURE_WRITEPROTECT missing\n"); return -ENOSYS; } uffdio_register.range.start = (unsigned long) map; uffdio_register.range.len = size; uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) { fprintf(stderr, "UFFDIO_REGISTER failed: %d\n", errno); return -errno; } pthread_create(&thread, NULL, uffd_thread_fn, NULL); return 0; } int main(void) { const size_t size = 4 * 1024 * 1024ull; char *map, *cur; pagesize = getpagesize(); map = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0); if (map == MAP_FAILED) { fprintf(stderr, "mmap() failed\n"); return -errno; } if (madvise(map, size, MADV_HUGEPAGE)) { fprintf(stderr, "MADV_HUGEPAGE failed\n"); return -errno; } if (setup_uffd(map, size)) return 1; /* Read the whole range, populating zeropages. */ madvise(map, size, MADV_POPULATE_READ); /* Write-protect the whole range. */ uffd_wp_range(map, size, true); /* Make sure uffd-wp triggers on each page. */ for (cur = map; cur < map + size; cur += pagesize) { uffd_triggered = false; barrier(); /* Trigger a write fault. */ *cur = 1; barrier(); if (!uffd_triggered) { printf("FAIL: uffd-wp did not trigger\n"); return 1; } } printf("PASS: uffd-wp triggered\n"); return 0; } Link: https://lkml.kernel.org/r/20230302175423.589164-1-david@redhat.com Fixes: e06f1e1dd499 ("userfaultfd: wp: enabled write protection in userfaultfd API") Signed-off-by: David Hildenbrand <david@redhat.com> Acked-by: Peter Xu <peterx@redhat.com> Cc: Mike Rapoport <rppt@linux.vnet.ibm.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Jerome Glisse <jglisse@redhat.com> Cc: Shaohua Li <shli@fb.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-03-10memory tier: release the new_memtier in find_create_memory_tier()Tong Tiangen1-2/+2
commit 93419139fa14124c1c507d804f2b28866ebee28d upstream. In find_create_memory_tier(), if failed to register device, then we should release new_memtier from the tier list and put device instead of memtier. Link: https://lkml.kernel.org/r/20230129040651.1329208-1-tongtiangen@huawei.com Fixes: 9832fb87834e ("mm/demotion: expose memory tier details via sysfs") Signed-off-by: Tong Tiangen <tongtiangen@huawei.com> Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> Cc: Hanjun Guo <guohanjun@huawei.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Guohanjun <guohanjun@huawei.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-03-10mm/thp: check and bail out if page in deferred queue alreadyYin Fengwei1-0/+3
commit 81e506bec9be1eceaf5a2c654e28ba5176ef48d8 upstream. Kernel build regression with LLVM was reported here: https://lore.kernel.org/all/Y1GCYXGtEVZbcv%2F5@dev-arch.thelio-3990X/ with commit f35b5d7d676e ("mm: align larger anonymous mappings on THP boundaries"). And the commit f35b5d7d676e was reverted. It turned out the regression is related with madvise(MADV_DONTNEED) was used by ld.lld. But with none PMD_SIZE aligned parameter len. trace-bpfcc captured: 531607 531732 ld.lld do_madvise.part.0 start: 0x7feca9000000, len: 0x7fb000, behavior: 0x4 531607 531793 ld.lld do_madvise.part.0 start: 0x7fec86a00000, len: 0x7fb000, behavior: 0x4 If the underneath physical page is THP, the madvise(MADV_DONTNEED) can trigger split_queue_lock contention raised significantly. perf showed following data: 14.85% 0.00% ld.lld [kernel.kallsyms] [k] entry_SYSCALL_64_after_hwframe 11.52% entry_SYSCALL_64_after_hwframe do_syscall_64 __x64_sys_madvise do_madvise.part.0 zap_page_range unmap_single_vma unmap_page_range page_remove_rmap deferred_split_huge_page __lock_text_start native_queued_spin_lock_slowpath If THP can't be removed from rmap as whole THP, partial THP will be removed from rmap by removing sub-pages from rmap. Even the THP head page is added to deferred queue already, the split_queue_lock will be acquired and check whether the THP head page is in the queue already. Thus, the contention of split_queue_lock is raised. Before acquire split_queue_lock, check and bail out early if the THP head page is in the queue already. The checking without holding split_queue_lock could race with deferred_split_scan, but it doesn't impact the correctness here. Test result of building kernel with ld.lld: commit 7b5a0b664ebe (parent commit of f35b5d7d676e): time -f "\t%E real,\t%U user,\t%S sys" make LD=ld.lld -skj96 allmodconfig all 6:07.99 real, 26367.77 user, 5063.35 sys commit f35b5d7d676e: time -f "\t%E real,\t%U user,\t%S sys" make LD=ld.lld -skj96 allmodconfig all 7:22.15 real, 26235.03 user, 12504.55 sys commit f35b5d7d676e with the fixing patch: time -f "\t%E real,\t%U user,\t%S sys" make LD=ld.lld -skj96 allmodconfig all 6:08.49 real, 26520.15 user, 5047.91 sys Link: https://lkml.kernel.org/r/20221223135207.2275317-1-fengwei.yin@intel.com Signed-off-by: Yin Fengwei <fengwei.yin@intel.com> Tested-by: Nathan Chancellor <nathan@kernel.org> Acked-by: David Rientjes <rientjes@google.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Feng Tang <feng.tang@intel.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Rik van Riel <riel@surriel.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Cc: Yang Shi <shy828301@gmail.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-03-10mm: memcontrol: deprecate charge movingJohannes Weiner1-0/+4
commit da34a8484d162585e22ed8c1e4114aa2f60e3567 upstream. Charge moving mode in cgroup1 allows memory to follow tasks as they migrate between cgroups. This is, and always has been, a questionable thing to do - for several reasons. First, it's expensive. Pages need to be identified, locked and isolated from various MM operations, and reassigned, one by one. Second, it's unreliable. Once pages are charged to a cgroup, there isn't always a clear owner task anymore. Cache isn't moved at all, for example. Mapped memory is moved - but if trylocking or isolating a page fails, it's arbitrarily left behind. Frequent moving between domains may leave a task's memory scattered all over the place. Third, it isn't really needed. Launcher tasks can kick off workload tasks directly in their target cgroup. Using dedicated per-workload groups allows fine-grained policy adjustments - no need to move tasks and their physical pages between control domains. The feature was never forward-ported to cgroup2, and it hasn't been missed. Despite it being a niche usecase, the maintenance overhead of supporting it is enormous. Because pages are moved while they are live and subject to various MM operations, the synchronization rules are complicated. There are lock_page_memcg() in MM and FS code, which non-cgroup people don't understand. In some cases we've been able to shift code and cgroup API calls around such that we can rely on native locking as much as possible. But that's fragile, and sometimes we need to hold MM locks for longer than we otherwise would (pte lock e.g.). Mark the feature deprecated. Hopefully we can remove it soon. And backport into -stable kernels so that people who develop against earlier kernels are warned about this deprecation as early as possible. [akpm@linux-foundation.org: fix memory.rst underlining] Link: https://lkml.kernel.org/r/Y5COd+qXwk/S+n8N@cmpxchg.org Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Acked-by: Shakeel Butt <shakeelb@google.com> Acked-by: Hugh Dickins <hughd@google.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Muchun Song <songmuchun@bytedance.com> Cc: Roman Gushchin <roman.gushchin@linux.dev> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-03-10mm/hwpoison: convert TTU_IGNORE_HWPOISON to TTU_HWPOISONNaoya Horiguchi2-5/+5
commit 6da6b1d4a7df8c35770186b53ef65d388398e139 upstream. After a memory error happens on a clean folio, a process unexpectedly receives SIGBUS when it accesses the error page. This SIGBUS killing is pointless and simply degrades the level of RAS of the system, because the clean folio can be dropped without any data lost on memory error handling as we do for a clean pagecache. When memory_failure() is called on a clean folio, try_to_unmap() is called twice (one from split_huge_page() and one from hwpoison_user_mappings()). The root cause of the issue is that pte conversion to hwpoisoned entry is now done in the first call of try_to_unmap() because PageHWPoison is already set at this point, while it's actually expected to be done in the second call. This behavior disturbs the error handling operation like removing pagecache, which results in the malfunction described above. So convert TTU_IGNORE_HWPOISON into TTU_HWPOISON and set TTU_HWPOISON only when we really intend to convert pte to hwpoison entry. This can prevent other callers of try_to_unmap() from accidentally converting to hwpoison entries. Link: https://lkml.kernel.org/r/20230221085905.1465385-1-naoya.horiguchi@linux.dev Fixes: a42634a6c07d ("readahead: Use a folio in read_pages()") Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com> Cc: David Hildenbrand <david@redhat.com> Cc: Hugh Dickins <hughd@google.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Miaohe Lin <linmiaohe@huawei.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-03-10mm/damon/paddr: fix missing folio_put()andrew.yang1-4/+3
commit 3f98c9a62c338bbe06a215c9491e6166ea39bf82 upstream. damon_get_folio() would always increase folio _refcount and folio_isolate_lru() would increase folio _refcount if the folio's lru flag is set. If an unevictable folio isolated successfully, there will be two more _refcount. The one from folio_isolate_lru() will be decreased in folio_puback_lru(), but the other one from damon_get_folio() will be left behind. This causes a pin page. Whatever the case, the _refcount from damon_get_folio() should be decreased. Link: https://lkml.kernel.org/r/20230222064223.6735-1-andrew.yang@mediatek.com Fixes: 57223ac29584 ("mm/damon/paddr: support the pageout scheme") Signed-off-by: andrew.yang <andrew.yang@mediatek.com> Reviewed-by: SeongJae Park <sj@kernel.org> Cc: <stable@vger.kernel.org> [5.16.x] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: SeongJae Park <sj@kernel.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-03-10sysctl: fix proc_dobool() usabilityOndrej Mosnacek1-1/+1
[ Upstream commit f1aa2eb5ea05ccd1fd92d235346e60e90a1ed949 ] Currently proc_dobool expects a (bool *) in table->data, but sizeof(int) in table->maxsize, because it uses do_proc_dointvec() directly. This is unsafe for at least two reasons: 1. A sysctl table definition may use { .data = &variable, .maxsize = sizeof(variable) }, not realizing that this makes the sysctl unusable (see the Fixes: tag) and that they need to use the completely counterintuitive sizeof(int) instead. 2. proc_dobool() will currently try to parse an array of values if given .maxsize >= 2*sizeof(int), but will try to write values of type bool by offsets of sizeof(int), so it will not work correctly with neither an (int *) nor a (bool *). There is no .maxsize validation to prevent this. Fix this by: 1. Constraining proc_dobool() to allow only one value and .maxsize == sizeof(bool). 2. Wrapping the original struct ctl_table in a temporary one with .data pointing to a local int variable and .maxsize set to sizeof(int) and passing this one to proc_dointvec(), converting the value to/from bool as needed (using proc_dou8vec_minmax() as an example). 3. Extending sysctl_check_table() to enforce proc_dobool() expectations. 4. Fixing the proc_dobool() docstring (it was just copy-pasted from proc_douintvec, apparently...). 5. Converting all existing proc_dobool() users to set .maxsize to sizeof(bool) instead of sizeof(int). Fixes: 83efeeeb3d04 ("tty: Allow TIOCSTI to be disabled") Fixes: a2071573d634 ("sysctl: introduce new proc handler proc_dobool") Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com> Acked-by: Kees Cook <keescook@chromium.org> Signed-off-by: Luis Chamberlain <mcgrof@kernel.org> Signed-off-by: Sasha Levin <sashal@kernel.org>
2023-02-18Merge tag 'mm-hotfixes-stable-2023-02-17-15-16-2' of ↵Linus Torvalds4-4/+10
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Pull misc fixes from Andrew Morton: "Six hotfixes. Five are cc:stable: four for MM, one for nilfs2. Also a MAINTAINERS update" * tag 'mm-hotfixes-stable-2023-02-17-15-16-2' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: nilfs2: fix underflow in second superblock position calculations hugetlb: check for undefined shift on 32 bit architectures mm/migrate: fix wrongly apply write bit after mkdirty on sparc64 MAINTAINERS: update FPU EMULATOR web page mm/MADV_COLLAPSE: set EAGAIN on unexpected page refcount mm/filemap: fix page end in filemap_get_read_batch
2023-02-18mm/migrate: fix wrongly apply write bit after mkdirty on sparc64Peter Xu2-2/+6
Nick Bowler reported another sparc64 breakage after the young/dirty persistent work for page migration (per "Link:" below). That's after a similar report [2]. It turns out page migration was overlooked, and it wasn't failing before because page migration was not enabled in the initial report test environment. David proposed another way [2] to fix this from sparc64 side, but that patch didn't land somehow. Neither did I check whether there's any other arch that has similar issues. Let's fix it for now as simple as moving the write bit handling to be after dirty, like what we did before. Note: this is based on mm-unstable, because the breakage was since 6.1 and we're at a very late stage of 6.2 (-rc8), so I assume for this specific case we should target this at 6.3. [1] https://lore.kernel.org/all/20221021160603.GA23307@u164.east.ru/ [2] https://lore.kernel.org/all/20221212130213.136267-1-david@redhat.com/ Link: https://lkml.kernel.org/r/20230216153059.256739-1-peterx@redhat.com Fixes: 2e3468778dbe ("mm: remember young/dirty bit for page migrations") Link: https://lore.kernel.org/all/CADyTPExpEqaJiMGoV+Z6xVgL50ZoMJg49B10LcZ=8eg19u34BA@mail.gmail.com/ Signed-off-by: Peter Xu <peterx@redhat.com> Reported-by: Nick Bowler <nbowler@draconx.ca> Acked-by: David Hildenbrand <david@redhat.com> Tested-by: Nick Bowler <nbowler@draconx.ca> Cc: <regressions@lists.linux.dev> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-02-17mm/MADV_COLLAPSE: set EAGAIN on unexpected page refcountZach O'Keefe1-0/+1
During collapse, in a few places we check to see if a given small page has any unaccounted references. If the refcount on the page doesn't match our expectations, it must be there is an unknown user concurrently interested in the page, and so it's not safe to move the contents elsewhere. However, the unaccounted pins are likely an ephemeral state. In this situation, MADV_COLLAPSE returns -EINVAL when it should return -EAGAIN. This could cause userspace to conclude that the syscall failed, when it in fact could succeed by retrying. Link: https://lkml.kernel.org/r/20230125015738.912924-1-zokeefe@google.com Fixes: 7d8faaf15545 ("mm/madvise: introduce MADV_COLLAPSE sync hugepage collapse") Signed-off-by: Zach O'Keefe <zokeefe@google.com> Reported-by: Hugh Dickins <hughd@google.com> Acked-by: Hugh Dickins <hughd@google.com> Reviewed-by: Yang Shi <shy828301@gmail.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-02-17mm/filemap: fix page end in filemap_get_read_batchQian Yingjin1-2/+3
I was running traces of the read code against an RAID storage system to understand why read requests were being misaligned against the underlying RAID strips. I found that the page end offset calculation in filemap_get_read_batch() was off by one. When a read is submitted with end offset 1048575, then it calculates the end page for read of 256 when it should be 255. "last_index" is the index of the page beyond the end of the read and it should be skipped when get a batch of pages for read in @filemap_get_read_batch(). The below simple patch fixes the problem. This code was introduced in kernel 5.12. Link: https://lkml.kernel.org/r/20230208022400.28962-1-coolqyj@163.com Fixes: cbd59c48ae2b ("mm/filemap: use head pages in generic_file_buffered_read") Signed-off-by: Qian Yingjin <qian@ddn.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-02-14Merge tag 'mm-hotfixes-stable-2023-02-13-13-50' of ↵Linus Torvalds9-16/+57
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Pull misc fixes from Andrew Morton: "Twelve hotfixes, mostly against mm/. Five of these fixes are cc:stable" * tag 'mm-hotfixes-stable-2023-02-13-13-50' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: of: reserved_mem: Have kmemleak ignore dynamically allocated reserved mem scripts/gdb: fix 'lx-current' for x86 lib: parser: optimize match_NUMBER apis to use local array mm: shrinkers: fix deadlock in shrinker debugfs mm: hwpoison: support recovery from ksm_might_need_to_copy() kasan: fix Oops due to missing calls to kasan_arch_is_ready() revert "squashfs: harden sanity check in squashfs_read_xattr_id_table" fsdax: dax_unshare_iter() should return a valid length mm/gup: add folio to list when folio_isolate_lru() succeed aio: fix mremap after fork null-deref mailmap: add entry for Alexander Mikhalitsyn mm: extend max struct page size for kmsan
2023-02-12Fix page corruption caused by racy check in __free_pagesDavid Chen1-1/+4
When we upgraded our kernel, we started seeing some page corruption like the following consistently: BUG: Bad page state in process ganesha.nfsd pfn:1304ca page:0000000022261c55 refcount:0 mapcount:-128 mapping:0000000000000000 index:0x0 pfn:0x1304ca flags: 0x17ffffc0000000() raw: 0017ffffc0000000 ffff8a513ffd4c98 ffffeee24b35ec08 0000000000000000 raw: 0000000000000000 0000000000000001 00000000ffffff7f 0000000000000000 page dumped because: nonzero mapcount CPU: 0 PID: 15567 Comm: ganesha.nfsd Kdump: loaded Tainted: P B O 5.10.158-1.nutanix.20221209.el7.x86_64 #1 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/05/2016 Call Trace: dump_stack+0x74/0x96 bad_page.cold+0x63/0x94 check_new_page_bad+0x6d/0x80 rmqueue+0x46e/0x970 get_page_from_freelist+0xcb/0x3f0 ? _cond_resched+0x19/0x40 __alloc_pages_nodemask+0x164/0x300 alloc_pages_current+0x87/0xf0 skb_page_frag_refill+0x84/0x110 ... Sometimes, it would also show up as corruption in the free list pointer and cause crashes. After bisecting the issue, we found the issue started from commit e320d3012d25 ("mm/page_alloc.c: fix freeing non-compound pages"): if (put_page_testzero(page)) free_the_page(page, order); else if (!PageHead(page)) while (order-- > 0) free_the_page(page + (1 << order), order); So the problem is the check PageHead is racy because at this point we already dropped our reference to the page. So even if we came in with compound page, the page can already be freed and PageHead can return false and we will end up freeing all the tail pages causing double free. Fixes: e320d3012d25 ("mm/page_alloc.c: fix freeing non-compound pages") Link: https://lore.kernel.org/lkml/BYAPR02MB448855960A9656EEA81141FC94D99@BYAPR02MB4488.namprd02.prod.outlook.com/ Cc: Andrew Morton <akpm@linux-foundation.org> Cc: stable@vger.kernel.org Signed-off-by: Chunwei Chen <david.chen@nutanix.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2023-02-10mm: shrinkers: fix deadlock in shrinker debugfsQi Zheng2-6/+13
The debugfs_remove_recursive() is invoked by unregister_shrinker(), which is holding the write lock of shrinker_rwsem. It will waits for the handler of debugfs file complete. The handler also needs to hold the read lock of shrinker_rwsem to do something. So it may cause the following deadlock: CPU0 CPU1 debugfs_file_get() shrinker_debugfs_count_show()/shrinker_debugfs_scan_write() unregister_shrinker() --> down_write(&shrinker_rwsem); debugfs_remove_recursive() // wait for (A) --> wait_for_completion(); // wait for (B) --> down_read_killable(&shrinker_rwsem) debugfs_file_put() -- (A) up_write() -- (B) The down_read_killable() can be killed, so that the above deadlock can be recovered. But it still requires an extra kill action, otherwise it will block all subsequent shrinker-related operations, so it's better to fix it. [akpm@linux-foundation.org: fix CONFIG_SHRINKER_DEBUG=n stub] Link: https://lkml.kernel.org/r/20230202105612.64641-1-zhengqi.arch@bytedance.com Fixes: 5035ebc644ae ("mm: shrinkers: introduce debugfs interface for memory shrinkers") Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com> Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev> Cc: Kent Overstreet <kent.overstreet@gmail.com> Cc: Muchun Song <songmuchun@bytedance.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-02-10mm: hwpoison: support recovery from ksm_might_need_to_copy()Kefeng Wang3-8/+22
When the kernel copies a page from ksm_might_need_to_copy(), but runs into an uncorrectable error, it will crash since poisoned page is consumed by kernel, this is similar to the issue recently fixed by Copy-on-write poison recovery. When an error is detected during the page copy, return VM_FAULT_HWPOISON in do_swap_page(), and install a hwpoison entry in unuse_pte() when swapoff, which help us to avoid system crash. Note, memory failure on a KSM page will be skipped, but still call memory_failure_queue() to be consistent with general memory failure process, and we could support KSM page recovery in the feature. [wangkefeng.wang@huawei.com: enhance unuse_pte(), fix issue found by lkp] Link: https://lkml.kernel.org/r/20221213120523.141588-1-wangkefeng.wang@huawei.com [wangkefeng.wang@huawei.com: update changelog, alter ksm_might_need_to_copy(), restore unlikely() in unuse_pte()] Link: https://lkml.kernel.org/r/20230201074433.96641-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20221209072801.193221-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com> Reviewed-by: Naoya Horiguchi <naoya.horiguchi@nec.com> Cc: Miaohe Lin <linmiaohe@huawei.com> Cc: Tony Luck <tony.luck@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-02-10kasan: fix Oops due to missing calls to kasan_arch_is_ready()Christophe Leroy3-1/+21
On powerpc64, you can build a kernel with KASAN as soon as you build it with RADIX MMU support. However if the CPU doesn't have RADIX MMU, KASAN isn't enabled at init and the following Oops is encountered. [ 0.000000][ T0] KASAN not enabled as it requires radix! [ 4.484295][ T26] BUG: Unable to handle kernel data access at 0xc00e000000804a04 [ 4.485270][ T26] Faulting instruction address: 0xc00000000062ec6c [ 4.485748][ T26] Oops: Kernel access of bad area, sig: 11 [#1] [ 4.485920][ T26] BE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries [ 4.486259][ T26] Modules linked in: [ 4.486637][ T26] CPU: 0 PID: 26 Comm: kworker/u2:2 Not tainted 6.2.0-rc3-02590-gf8a023b0a805 #249 [ 4.486907][ T26] Hardware name: IBM pSeries (emulated by qemu) POWER9 (raw) 0x4e1200 0xf000005 of:SLOF,HEAD pSeries [ 4.487445][ T26] Workqueue: eval_map_wq .tracer_init_tracefs_work_func [ 4.488744][ T26] NIP: c00000000062ec6c LR: c00000000062bb84 CTR: c0000000002ebcd0 [ 4.488867][ T26] REGS: c0000000049175c0 TRAP: 0380 Not tainted (6.2.0-rc3-02590-gf8a023b0a805) [ 4.489028][ T26] MSR: 8000000002009032 <SF,VEC,EE,ME,IR,DR,RI> CR: 44002808 XER: 00000000 [ 4.489584][ T26] CFAR: c00000000062bb80 IRQMASK: 0 [ 4.489584][ T26] GPR00: c0000000005624d4 c000000004917860 c000000001cfc000 1800000000804a04 [ 4.489584][ T26] GPR04: c0000000003a2650 0000000000000cc0 c00000000000d3d8 c00000000000d3d8 [ 4.489584][ T26] GPR08: c0000000049175b0 a80e000000000000 0000000000000000 0000000017d78400 [ 4.489584][ T26] GPR12: 0000000044002204 c000000003790000 c00000000435003c c0000000043f1c40 [ 4.489584][ T26] GPR16: c0000000043f1c68 c0000000043501a0 c000000002106138 c0000000043f1c08 [ 4.489584][ T26] GPR20: c0000000043f1c10 c0000000043f1c20 c000000004146c40 c000000002fdb7f8 [ 4.489584][ T26] GPR24: c000000002fdb834 c000000003685e00 c000000004025030 c000000003522e90 [ 4.489584][ T26] GPR28: 0000000000000cc0 c0000000003a2650 c000000004025020 c000000004025020 [ 4.491201][ T26] NIP [c00000000062ec6c] .kasan_byte_accessible+0xc/0x20 [ 4.491430][ T26] LR [c00000000062bb84] .__kasan_check_byte+0x24/0x90 [ 4.491767][ T26] Call Trace: [ 4.491941][ T26] [c000000004917860] [c00000000062ae70] .__kasan_kmalloc+0xc0/0x110 (unreliable) [ 4.492270][ T26] [c0000000049178f0] [c0000000005624d4] .krealloc+0x54/0x1c0 [ 4.492453][ T26] [c000000004917990] [c0000000003a2650] .create_trace_option_files+0x280/0x530 [ 4.492613][ T26] [c000000004917a90] [c000000002050d90] .tracer_init_tracefs_work_func+0x274/0x2c0 [ 4.492771][ T26] [c000000004917b40] [c0000000001f9948] .process_one_work+0x578/0x9f0 [ 4.492927][ T26] [c000000004917c30] [c0000000001f9ebc] .worker_thread+0xfc/0x950 [ 4.493084][ T26] [c000000004917d60] [c00000000020be84] .kthread+0x1a4/0x1b0 [ 4.493232][ T26] [c000000004917e10] [c00000000000d3d8] .ret_from_kernel_thread+0x58/0x60 [ 4.495642][ T26] Code: 60000000 7cc802a6 38a00000 4bfffc78 60000000 7cc802a6 38a00001 4bfffc68 60000000 3d20a80e 7863e8c2 792907c6 <7c6348ae> 20630007 78630fe0 68630001 [ 4.496704][ T26] ---[ end trace 0000000000000000 ]--- The Oops is due to kasan_byte_accessible() not checking the readiness of KASAN. Add missing call to kasan_arch_is_ready() and bail out when not ready. The same problem is observed with ____kasan_kfree_large() so fix it the same. Also, as KASAN is not available and no shadow area is allocated for linear memory mapping, there is no point in allocating shadow mem for vmalloc memory as shown below in /sys/kernel/debug/kernel_page_tables ---[ kasan shadow mem start ]--- 0xc00f000000000000-0xc00f00000006ffff 0x00000000040f0000 448K r w pte valid present dirty accessed 0xc00f000000860000-0xc00f00000086ffff 0x000000000ac10000 64K r w pte valid present dirty accessed 0xc00f3ffffffe0000-0xc00f3fffffffffff 0x0000000004d10000 128K r w pte valid present dirty accessed ---[ kasan shadow mem end ]--- So, also verify KASAN readiness before allocating and poisoning shadow mem for VMAs. Link: https://lkml.kernel.org/r/150768c55722311699fdcf8f5379e8256749f47d.1674716617.git.christophe.leroy@csgroup.eu Fixes: 41b7a347bf14 ("powerpc: Book3S 64-bit outline-only KASAN support") Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu> Reported-by: Nathan Lynch <nathanl@linux.ibm.com> Suggested-by: Michael Ellerman <mpe@ellerman.id.au> Cc: Alexander Potapenko <glider@google.com> Cc: Andrey Konovalov <andreyknvl@gmail.com> Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com> Cc: Dmitry Vyukov <dvyukov@google.com> Cc: Vincenzo Frascino <vincenzo.frascino@arm.com> Cc: <stable@vger.kernel.org> [5.19+] Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-02-07Revert "mm: Always release pages to the buddy allocator in ↵Aaron Thompson1-7/+1
memblock_free_late()." This reverts commit 115d9d77bb0f9152c60b6e8646369fa7f6167593. The pages being freed by memblock_free_late() have already been initialized, but if they are in the deferred init range, __free_one_page() might access nearby uninitialized pages when trying to coalesce buddies. This can, for example, trigger this BUG: BUG: unable to handle page fault for address: ffffe964c02580c8 RIP: 0010:__list_del_entry_valid+0x3f/0x70 <TASK> __free_one_page+0x139/0x410 __free_pages_ok+0x21d/0x450 memblock_free_late+0x8c/0xb9 efi_free_boot_services+0x16b/0x25c efi_enter_virtual_mode+0x403/0x446 start_kernel+0x678/0x714 secondary_startup_64_no_verify+0xd2/0xdb </TASK> A proper fix will be more involved so revert this change for the time being. Fixes: 115d9d77bb0f ("mm: Always release pages to the buddy allocator in memblock_free_late().") Signed-off-by: Aaron Thompson <dev@aaront.org> Link: https://lore.kernel.org/r/20230207082151.1303-1-dev@aaront.org Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
2023-02-04mm/gup: add folio to list when folio_isolate_lru() succeedKuan-Ying Lee1-1/+1
If we call folio_isolate_lru() successfully, we will get return value 0. We need to add this folio to the movable_pages_list. Link: https://lkml.kernel.org/r/20230131063206.28820-1-Kuan-Ying.Lee@mediatek.com Fixes: 67e139b02d99 ("mm/gup.c: refactor check_and_migrate_movable_pages()") Signed-off-by: Kuan-Ying Lee <Kuan-Ying.Lee@mediatek.com> Reviewed-by: Alistair Popple <apopple@nvidia.com> Acked-by: David Hildenbrand <david@redhat.com> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com> Cc: Andrew Yang <andrew.yang@mediatek.com> Cc: Chinwen Chang <chinwen.chang@mediatek.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Matthias Brugger <matthias.bgg@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-02-03Merge tag 'mm-hotfixes-stable-2023-02-02-19-24-2' of ↵Linus Torvalds11-108/+286
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Pull misc fixes from Andrew Morton: "25 hotfixes, mainly for MM. 13 are cc:stable" * tag 'mm-hotfixes-stable-2023-02-02-19-24-2' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (26 commits) mm: memcg: fix NULL pointer in mem_cgroup_track_foreign_dirty_slowpath() Kconfig.debug: fix the help description in SCHED_DEBUG mm/swapfile: add cond_resched() in get_swap_pages() mm: use stack_depot_early_init for kmemleak Squashfs: fix handling and sanity checking of xattr_ids count sh: define RUNTIME_DISCARD_EXIT highmem: round down the address passed to kunmap_flush_on_unmap() migrate: hugetlb: check for hugetlb shared PMD in node migration mm: hugetlb: proc: check for hugetlb shared PMD in /proc/PID/smaps mm/MADV_COLLAPSE: catch !none !huge !bad pmd lookups Revert "mm: kmemleak: alloc gray object for reserved region with direct map" freevxfs: Kconfig: fix spelling maple_tree: should get pivots boundary by type .mailmap: update e-mail address for Eugen Hristev mm, mremap: fix mremap() expanding for vma's with vm_ops->close() squashfs: harden sanity check in squashfs_read_xattr_id_table ia64: fix build error due to switch case label appearing next to declaration mm: multi-gen LRU: fix crash during cgroup migration Revert "mm: add nodes= arg to memory.reclaim" zsmalloc: fix a race with deferred_handles storing ...
2023-02-01mm/swapfile: add cond_resched() in get_swap_pages()Longlong Xia1-0/+1
The softlockup still occurs in get_swap_pages() under memory pressure. 64 CPU cores, 64GB memory, and 28 zram devices, the disksize of each zram device is 50MB with same priority as si. Use the stress-ng tool to increase memory pressure, causing the system to oom frequently. The plist_for_each_entry_safe() loops in get_swap_pages() could reach tens of thousands of times to find available space (extreme case: cond_resched() is not called in scan_swap_map_slots()). Let's add cond_resched() into get_swap_pages() when failed to find available space to avoid softlockup. Link: https://lkml.kernel.org/r/20230128094757.1060525-1-xialonglong1@huawei.com Signed-off-by: Longlong Xia <xialonglong1@huawei.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Chen Wandun <chenwandun@huawei.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Nanyong Sun <sunnanyong@huawei.com> Cc: Hugh Dickins <hughd@google.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-02-01mm: use stack_depot_early_init for kmemleakZhaoyang Huang1-2/+3
Mirsad report the below error which is caused by stack_depot_init() failure in kvcalloc. Solve this by having stackdepot use stack_depot_early_init(). On 1/4/23 17:08, Mirsad Goran Todorovac wrote: I hate to bring bad news again, but there seems to be a problem with the output of /sys/kernel/debug/kmemleak: [root@pc-mtodorov ~]# cat /sys/kernel/debug/kmemleak unreferenced object 0xffff951c118568b0 (size 16): comm "kworker/u12:2", pid 56, jiffies 4294893952 (age 4356.548s) hex dump (first 16 bytes): 6d 65 6d 73 74 69 63 6b 30 00 00 00 00 00 00 00 memstick0....... backtrace: [root@pc-mtodorov ~]# Apparently, backtrace of called functions on the stack is no longer printed with the list of memory leaks. This appeared on Lenovo desktop 10TX000VCR, with AlmaLinux 8.7 and BIOS version M22KT49A (11/10/2022) and 6.2-rc1 and 6.2-rc2 builds. This worked on 6.1 with the same CONFIG_KMEMLEAK=y and MGLRU enabled on a vanilla mainstream kernel from Mr. Torvalds' tree. I don't know if this is deliberate feature for some reason or a bug. Please find attached the config, lshw and kmemleak output. [vbabka@suse.cz: remove stack_depot_init() call] Link: https://lore.kernel.org/all/5272a819-ef74-65ff-be61-4d2d567337de@alu.unizg.hr/ Link: https://lkml.kernel.org/r/1674091345-14799-2-git-send-email-zhaoyang.huang@unisoc.com Fixes: 56a61617dd22 ("mm: use stack_depot for recording kmemleak's backtrace") Reported-by: Mirsad Todorovac <mirsad.todorovac@alu.unizg.hr> Suggested-by: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Acked-by: Catalin Marinas <catalin.marinas@arm.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Tested-by: Borislav Petkov (AMD) <bp@alien8.de> Cc: ke.wang <ke.wang@unisoc.com> Cc: Nathan Chancellor <nathan@kernel.org> Cc: Peter Zijlstra (Intel) <peterz@infradead.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-02-01migrate: hugetlb: check for hugetlb shared PMD in node migrationMike Kravetz1-1/+2
migrate_pages/mempolicy semantics state that CAP_SYS_NICE is required to move pages shared with another process to a different node. page_mapcount > 1 is being used to determine if a hugetlb page is shared. However, a hugetlb page will have a mapcount of 1 if mapped by multiple processes via a shared PMD. As a result, hugetlb pages shared by multiple processes and mapped with a shared PMD can be moved by a process without CAP_SYS_NICE. To fix, check for a shared PMD if mapcount is 1. If a shared PMD is found consider the page shared. Link: https://lkml.kernel.org/r/20230126222721.222195-3-mike.kravetz@oracle.com Fixes: e2d8cf405525 ("migrate: add hugepage migration code to migrate_pages()") Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com> Acked-by: Peter Xu <peterx@redhat.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: James Houghton <jthoughton@google.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Michal Hocko <mhocko@suse.com> Cc: Muchun Song <songmuchun@bytedance.com> Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-02-01mm/MADV_COLLAPSE: catch !none !huge !bad pmd lookupsZach O'Keefe1-0/+8
In commit 34488399fa08 ("mm/madvise: add file and shmem support to MADV_COLLAPSE") we make the following change to find_pmd_or_thp_or_none(): - if (!pmd_present(pmde)) - return SCAN_PMD_NULL; + if (pmd_none(pmde)) + return SCAN_PMD_NONE; This was for-use by MADV_COLLAPSE file/shmem codepaths, where MADV_COLLAPSE might identify a pte-mapped hugepage, only to have khugepaged race-in, free the pte table, and clear the pmd. Such codepaths include: A) If we find a suitably-aligned compound page of order HPAGE_PMD_ORDER already in the pagecache. B) In retract_page_tables(), if we fail to grab mmap_lock for the target mm/address. In these cases, collapse_pte_mapped_thp() really does expect a none (not just !present) pmd, and we want to suitably identify that case separate from the case where no pmd is found, or it's a bad-pmd (of course, many things could happen once we drop mmap_lock, and the pmd could plausibly undergo multiple transitions due to intervening fault, split, etc). Regardless, the code is prepared install a huge-pmd only when the existing pmd entry is either a genuine pte-table-mapping-pmd, or the none-pmd. However, the commit introduces a logical hole; namely, that we've allowed !none- && !huge- && !bad-pmds to be classified as genuine pte-table-mapping-pmds. One such example that could leak through are swap entries. The pmd values aren't checked again before use in pte_offset_map_lock(), which is expecting nothing less than a genuine pte-table-mapping-pmd. We want to put back the !pmd_present() check (below the pmd_none() check), but need to be careful to deal with subtleties in pmd transitions and treatments by various arch. The issue is that __split_huge_pmd_locked() temporarily clears the present bit (or otherwise marks the entry as invalid), but pmd_present() and pmd_trans_huge() still need to return true while the pmd is in this transitory state. For example, x86's pmd_present() also checks the _PAGE_PSE , riscv's version also checks the _PAGE_LEAF bit, and arm64 also checks a PMD_PRESENT_INVALID bit. Covering all 4 cases for x86 (all checks done on the same pmd value): 1) pmd_present() && pmd_trans_huge() All we actually know here is that the PSE bit is set. Either: a) We aren't racing with __split_huge_page(), and PRESENT or PROTNONE is set. => huge-pmd b) We are currently racing with __split_huge_page(). The danger here is that we proceed as-if we have a huge-pmd, but really we are looking at a pte-mapping-pmd. So, what is the risk of this danger? The only relevant path is: madvise_collapse() -> collapse_pte_mapped_thp() Where we might just incorrectly report back "success", when really the memory isn't pmd-backed. This is fine, since split could happen immediately after (actually) successful madvise_collapse(). So, it should be safe to just assume huge-pmd here. 2) pmd_present() && !pmd_trans_huge() Either: a) PSE not set and either PRESENT or PROTNONE is. => pte-table-mapping pmd (or PROT_NONE) b) devmap. This routine can be called immediately after unlocking/locking mmap_lock -- or called with no locks held (see khugepaged_scan_mm_slot()), so previous VMA checks have since been invalidated. 3) !pmd_present() && pmd_trans_huge() Not possible. 4) !pmd_present() && !pmd_trans_huge() Neither PRESENT nor PROTNONE set => not present I've checked all archs that implement pmd_trans_huge() (arm64, riscv, powerpc, longarch, x86, mips, s390) and this logic roughly translates (though devmap treatment is unique to x86 and powerpc, and (3) doesn't necessarily hold in general -- but that doesn't matter since !pmd_present() always takes failure path). Also, add a comment above find_pmd_or_thp_or_none() to help future travelers reason about the validity of the code; namely, the possible mutations that might happen out from under us, depending on how mmap_lock is held (if at all). Link: https://lkml.kernel.org/r/20230125225358.2576151-1-zokeefe@google.com Fixes: 34488399fa08 ("mm/madvise: add file and shmem support to MADV_COLLAPSE") Signed-off-by: Zach O'Keefe <zokeefe@google.com> Reported-by: Hugh Dickins <hughd@google.com> Reviewed-by: Yang Shi <shy828301@gmail.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-02-01mm, mremap: fix mremap() expanding for vma's with vm_ops->close()Vlastimil Babka1-6/+19
Fabian has reported another regression in 6.1 due to ca3d76b0aa80 ("mm: add merging after mremap resize"). The problem is that vma_merge() can fail when vma has a vm_ops->close() method, causing is_mergeable_vma() test to be negative. This was happening for vma mapping a file from fuse-overlayfs, which does have the method. But when we are simply expanding the vma, we never remove it due to the "merge" with the added area, so the test should not prevent the expansion. As a quick fix, check for such vmas and expand them using vma_adjust() directly as was done before commit ca3d76b0aa80. For a more robust long term solution we should try to limit the check for vma_ops->close only to cases that actually result in vma removal, so that no merge would be prevented unnecessarily. [akpm@linux-foundation.org: fix indenting whitespace, reflow comment] Link: https://lkml.kernel.org/r/20230117101939.9753-1-vbabka@suse.cz Fixes: ca3d76b0aa80 ("mm: add merging after mremap resize") Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Reported-by: Fabian Vogt <fvogt@suse.com> Link: https://bugzilla.suse.com/show_bug.cgi?id=1206359#c35 Tested-by: Fabian Vogt <fvogt@suse.com> Cc: Jakub Matěna <matenajakub@gmail.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-02-01mm: multi-gen LRU: fix crash during cgroup migrationYu Zhao1-1/+4
lru_gen_migrate_mm() assumes lru_gen_add_mm() runs prior to itself. This isn't true for the following scenario: CPU 1 CPU 2 clone() cgroup_can_fork() cgroup_procs_write() cgroup_post_fork() task_lock() lru_gen_migrate_mm() task_unlock() task_lock() lru_gen_add_mm() task_unlock() And when the above happens, kernel crashes because of linked list corruption (mm_struct->lru_gen.list). Link: https://lore.kernel.org/r/20230115134651.30028-1-msizanoen@qtmlabs.xyz/ Link: https://lkml.kernel.org/r/20230116034405.2960276-1-yuzhao@google.com Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks") Signed-off-by: Yu Zhao <yuzhao@google.com> Reported-by: msizanoen <msizanoen@qtmlabs.xyz> Tested-by: msizanoen <msizanoen@qtmlabs.xyz> Cc: <stable@vger.kernel.org> [6.1+] Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-02-01Revert "mm: add nodes= arg to memory.reclaim"Michal Hocko2-57/+14
This reverts commit 12a5d3955227b0d7e04fb793ccceeb2a1dd275c5. Although it is recognized that a finer grained pro-active reclaim is something we need and want the semantic of this implementation is really ambiguous. In a follow up discussion it became clear that there are two essential usecases here. One is to use memory.reclaim to pro-actively reclaim memory and expectation is that the requested and reported amount of memory is uncharged from the memcg. Another usecase focuses on pro-active demotion when the memory is merely shuffled around to demotion targets while the overall charged memory stays unchanged. The current implementation considers demoted pages as reclaimed and that break both usecases. [1] has tried to address the reporting part but there are more issues with that summarized in [2] and follow up emails. Let's revert the nodemask based extension of the memcg pro-active reclaim for now until we settle with a more robust semantic. [1] http://lkml.kernel.org/r/http://lkml.kernel.org/r/20221206023406.3182800-1-almasrymina@google.com [2] http://lkml.kernel.org/r/Y5bsmpCyeryu3Zz1@dhcp22.suse.cz Link: https://lkml.kernel.org/r/Y5xASNe1x8cusiTx@dhcp22.suse.cz Fixes: 12a5d3955227b0d ("mm: add nodes= arg to memory.reclaim") Signed-off-by: Michal Hocko <mhocko@suse.com> Cc: Bagas Sanjaya <bagasdotme@gmail.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Mina Almasry <almasrymina@google.com> Cc: Muchun Song <songmuchun@bytedance.com> Cc: Roman Gushchin <roman.gushchin@linux.dev> Cc: Shakeel Butt <shakeelb@google.com> Cc: Tejun Heo <tj@kernel.org> Cc: Wei Xu <weixugc@google.com> Cc: Yang Shi <yang.shi@linux.alibaba.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: zefan li <lizefan.x@bytedance.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-02-01zsmalloc: fix a race with deferred_handles storingNhat Pham1-32/+205
Currently, there is a race between zs_free() and zs_reclaim_page(): zs_reclaim_page() finds a handle to an allocated object, but before the eviction happens, an independent zs_free() call to the same handle could come in and overwrite the object value stored at the handle with the last deferred handle. When zs_reclaim_page() finally gets to call the eviction handler, it will see an invalid object value (i.e the previous deferred handle instead of the original object value). This race happens quite infrequently. We only managed to produce it with out-of-tree developmental code that triggers zsmalloc writeback with a much higher frequency than usual. This patch fixes this race by storing the deferred handle in the object header instead. We differentiate the deferred handle from the other two cases (handle for allocated object, and linkage for free object) with a new tag. If zspage reclamation succeeds, we will free these deferred handles by walking through the zspage objects. On the other hand, if zspage reclamation fails, we reconstruct the zspage freelist (with the deferred handle tag and allocated tag) before trying again with the reclamation. [arnd@arndb.de: avoid unused-function warning] Link: https://lkml.kernel.org/r/20230117170507.2651972-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20230110231701.326724-1-nphamcs@gmail.com Fixes: 9997bc017549 ("zsmalloc: implement writeback mechanism for zsmalloc") Signed-off-by: Nhat Pham <nphamcs@gmail.com> Signed-off-by: Arnd Bergmann <arnd@arndb.de> Suggested-by: Johannes Weiner <hannes@cmpxchg.org> Cc: Dan Streetman <ddstreet@ieee.org> Cc: Minchan Kim <minchan@kernel.org> Cc: Nitin Gupta <ngupta@vflare.org> Cc: Sergey Senozhatsky <senozhatsky@chromium.org> Cc: Seth Jennings <sjenning@redhat.com> Cc: Vitaly Wool <vitaly.wool@konsulko.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-02-01mm/khugepaged: fix ->anon_vma raceJann Horn1-1/+13
If an ->anon_vma is attached to the VMA, collapse_and_free_pmd() requires it to be locked. Page table traversal is allowed under any one of the mmap lock, the anon_vma lock (if the VMA is associated with an anon_vma), and the mapping lock (if the VMA is associated with a mapping); and so to be able to remove page tables, we must hold all three of them. retract_page_tables() bails out if an ->anon_vma is attached, but does this check before holding the mmap lock (as the comment above the check explains). If we racily merged an existing ->anon_vma (shared with a child process) from a neighboring VMA, subsequent rmap traversals on pages belonging to the child will be able to see the page tables that we are concurrently removing while assuming that nothing else can access them. Repeat the ->anon_vma check once we hold the mmap lock to ensure that there really is no concurrent page table access. Hitting this bug causes a lockdep warning in collapse_and_free_pmd(), in the line "lockdep_assert_held_write(&vma->anon_vma->root->rwsem)". It can also lead to use-after-free access. Link: https://lore.kernel.org/linux-mm/CAG48ez3434wZBKFFbdx4M9j6eUwSUVPd4dxhzW_k_POneSDF+A@mail.gmail.com/ Link: https://lkml.kernel.org/r/20230111133351.807024-1-jannh@google.com Fixes: f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages") Signed-off-by: Jann Horn <jannh@google.com> Reported-by: Zach O'Keefe <zokeefe@google.com> Acked-by: Kirill A. Shutemov <kirill.shutemov@intel.linux.com> Reviewed-by: Yang Shi <shy828301@gmail.com> Cc: David Hildenbrand <david@redhat.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-01-29Revert "mm/compaction: fix set skip in fast_find_migrateblock"Vlastimil Babka1-0/+1
This reverts commit 7efc3b7261030da79001c00d92bc3392fd6c664c. We have got openSUSE reports (Link 1) for 6.1 kernel with khugepaged stalling CPU for long periods of time. Investigation of tracepoint data shows that compaction is stuck in repeating fast_find_migrateblock() based migrate page isolation, and then fails to migrate all isolated pages. Commit 7efc3b726103 ("mm/compaction: fix set skip in fast_find_migrateblock") was suspected as it was merged in 6.1 and in theory can indeed remove a termination condition for fast_find_migrateblock() under certain conditions, as it removes a place that always marks a scanned pageblock from being re-scanned. There are other such places, but those can be skipped under certain conditions, which seems to match the tracepoint data. Testing of revert also appears to have resolved the issue, thus revert the commit until a more robust solution for the original problem is developed. It's also likely this will fix qemu stalls with 6.1 kernel reported in Link 2, but that is not yet confirmed. Link: https://bugzilla.suse.com/show_bug.cgi?id=1206848 Link: https://lore.kernel.org/kvm/b8017e09-f336-3035-8344-c549086c2340@kernel.org/ Link: https://lore.kernel.org/lkml/20230125134434.18017-1-mgorman@techsingularity.net/ Fixes: 7efc3b726103 ("mm/compaction: fix set skip in fast_find_migrateblock") Cc: <stable@vger.kernel.org> Tested-by: Pedro Falcato <pedro.falcato@gmail.com> Acked-by: Mel Gorman <mgorman@techsingularity.net> Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2023-01-19Merge tag 'slab-for-6.2-rc5' of ↵Linus Torvalds1-0/+2
git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab Pull slab fix from Vlastimil Babka: "Just a single fix, since the lkp report originally for a slub-tiny commit ended up being a gcov/compiler bug: - periodically resched in SLAB's drain_freelist(), by David Rientjes" * tag 'slab-for-6.2-rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab: mm, slab: periodically resched in drain_freelist()
2023-01-19mm: fix a few rare cases of using swapin error pte markerPeter Xu3-3/+16
This patch should harden commit 15520a3f0469 ("mm: use pte markers for swap errors") on using pte markers for swapin errors on a few corner cases. 1. Propagate swapin errors across fork()s: if there're swapin errors in the parent mm, after fork()s the child should sigbus too when an error page is accessed. 2. Fix a rare condition race in pte_marker_clear() where a uffd-wp pte marker can be quickly switched to a swapin error. 3. Explicitly ignore swapin error pte markers in change_protection(). I mostly don't worry on (2) or (3) at all, but we should still have them. Case (1) is special because it can potentially cause silent data corrupt on child when parent has swapin error triggered with swapoff, but since swapin error is rare itself already it's probably not easy to trigger either. Currently there is a priority difference between the uffd-wp bit and the swapin error entry, in which the swapin error always has higher priority (e.g. we don't need to wr-protect a swapin error pte marker). If there will be a 3rd bit introduced, we'll probably need to consider a more involved approach so we may need to start operate on the bits. Let's leave that for later. This patch is tested with case (1) explicitly where we'll get corrupted data before in the child if there's existing swapin error pte markers, and after patch applied the child can be rightfully killed. We don't need to copy stable for this one since 15520a3f0469 just landed as part of v6.2-rc1, only "Fixes" applied. Link: https://lkml.kernel.org/r/20221214200453.1772655-3-peterx@redhat.com Fixes: 15520a3f0469 ("mm: use pte markers for swap errors") Signed-off-by: Peter Xu <peterx@redhat.com> Acked-by: David Hildenbrand <david@redhat.com> Reviewed-by: Miaohe Lin <linmiaohe@huawei.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Nadav Amit <nadav.amit@gmail.com> Cc: Pengfei Xu <pengfei.xu@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-01-19mm/uffd: fix pte marker when fork() without fork eventPeter Xu1-6/+2
Patch series "mm: Fixes on pte markers". Patch 1 resolves the syzkiller report from Pengfei. Patch 2 further harden pte markers when used with the recent swapin error markers. The major case is we should persist a swapin error marker after fork(), so child shouldn't read a corrupted page. This patch (of 2): When fork(), dst_vma is not guaranteed to have VM_UFFD_WP even if src may have it and has pte marker installed. The warning is improper along with the comment. The right thing is to inherit the pte marker when needed, or keep the dst pte empty. A vague guess is this happened by an accident when there's the prior patch to introduce src/dst vma into this helper during the uffd-wp feature got developed and I probably messed up in the rebase, since if we replace dst_vma with src_vma the warning & comment it all makes sense too. Hugetlb did exactly the right here (copy_hugetlb_page_range()). Fix the general path. Reproducer: https://github.com/xupengfe/syzkaller_logs/blob/main/221208_115556_copy_page_range/repro.c Bugzilla report: https://bugzilla.kernel.org/show_bug.cgi?id=216808 Link: https://lkml.kernel.org/r/20221214200453.1772655-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20221214200453.1772655-2-peterx@redhat.com Fixes: c56d1b62cce8 ("mm/shmem: handle uffd-wp during fork()") Signed-off-by: Peter Xu <peterx@redhat.com> Reported-by: Pengfei Xu <pengfei.xu@intel.com> Acked-by: David Hildenbrand <david@redhat.com> Reviewed-by: Miaohe Lin <linmiaohe@huawei.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Nadav Amit <nadav.amit@gmail.com> Cc: <stable@vger.kernel.org> # 5.19+ Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-01-19Sync with v6.2-rc4Andrew Morton1-1/+7
Merge branch 'master' into mm-hotfixes-stable
2023-01-17Merge tag 'mm-hotfixes-stable-2023-01-16-15-23' of ↵Linus Torvalds7-55/+87
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Pull misc hotfixes from Andrew Morton: "21 hotfixes. Thirteen of these address pre-6.1 issues and hence have the cc:stable tag" * tag 'mm-hotfixes-stable-2023-01-16-15-23' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (21 commits) init/Kconfig: fix typo (usafe -> unsafe) nommu: fix split_vma() map_count error nommu: fix do_munmap() error path nommu: fix memory leak in do_mmap() error path MAINTAINERS: update Robert Foss' email address proc: fix PIE proc-empty-vm, proc-pid-vm tests mm: update mmap_sem comments to refer to mmap_lock include/linux/mm: fix release_pages_arg kernel doc comment lib/win_minmax: use /* notation for regular comments kasan: mark kasan_kunit_executing as static nilfs2: fix general protection fault in nilfs_btree_insert() Docs/admin-guide/mm/zswap: remove zsmalloc's lack of writeback warning mm/hugetlb: pre-allocate pgtable pages for uffd wr-protects hugetlb: unshare some PMDs when splitting VMAs mm: fix vma->anon_name memory leak for anonymous shmem VMAs mm/shmem: restore SHMEM_HUGE_DENY precedence over MADV_COLLAPSE mm/MADV_COLLAPSE: don't expand collapse when vm_end is past requested end mm/userfaultfd: enable writenotify while userfaultfd-wp is enabled for a VMA mm/khugepaged: fix collapse_pte_mapped_thp() to allow anon_vma mm/hugetlb: fix uffd-wp handling for migration entries in hugetlb_change_protection() ...
2023-01-12nommu: fix split_vma() map_count errorLiam Howlett1-1/+3
During the maple tree conversion of nommu, an error in counting the VMAs was introduced by counting the existing VMA again. The counting used to be decremented by one and incremented by two, but now it only increments by two. Fix the counting error by moving the increment outside the setup_vma_to_mm() function to the callers. Link: https://lkml.kernel.org/r/20230109205809.956325-1-Liam.Howlett@oracle.com Fixes: 8220543df148 ("nommu: remove uses of VMA linked list") Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Yu Zhao <yuzhao@google.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-01-12nommu: fix do_munmap() error pathLiam Howlett1-1/+2
When removing a VMA from the tree fails due to no memory, do not free the VMA since a reference still exists. Link: https://lkml.kernel.org/r/20230109205708.956103-1-Liam.Howlett@oracle.com Fixes: 8220543df148 ("nommu: remove uses of VMA linked list") Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Yu Zhao <yuzhao@google.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-01-12nommu: fix memory leak in do_mmap() error pathLiam Howlett1-1/+1
The preallocation of the maple tree nodes may leak if the error path to "error_just_free" is taken. Fix this by moving the freeing of the maple tree nodes to a shared location for all error paths. Link: https://lkml.kernel.org/r/20230109205507.955577-1-Liam.Howlett@oracle.com Fixes: 8220543df148 ("nommu: remove uses of VMA linked list") Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Yu Zhao <yuzhao@google.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-01-12mm: update mmap_sem comments to refer to mmap_lockLorenzo Stoakes3-5/+5
The rename from mm->mmap_sem to mm->mmap_lock was performed in commit da1c55f1b272 ("mmap locking API: rename mmap_sem to mmap_lock") and commit c1e8d7c6a7a6 ("map locking API: convert mmap_sem comments"), however some incorrect comments remain. This patch simply corrects those comments which are obviously incorrect within mm itself. Link: https://lkml.kernel.org/r/33fba04389ab63fc4980e7ba5442f521df6dc657.1673048927.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com> Cc: David Hildenbrand <david@redhat.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Muchun Song <muchun.song@linux.dev> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-01-12kasan: mark kasan_kunit_executing as staticAndrey Konovalov1-1/+1
Mark kasan_kunit_executing as static, as it is only used within mm/kasan/report.c. Link: https://lkml.kernel.org/r/f64778a4683b16a73bba72576f73bf4a2b45a82f.1672794398.git.andreyknvl@google.com Fixes: c8c7016f50c8 ("kasan: fail non-kasan KUnit tests on KASAN reports") Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: Andrey Konovalov <andreyknvl@google.com> Reviewed-by: Marco Elver <elver@google.com> Cc: Alexander Potapenko <glider@google.com> Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com> Cc: Dmitry Vyukov <dvyukov@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-01-12mm/hugetlb: pre-allocate pgtable pages for uffd wr-protectsPeter Xu1-2/+11
Userfaultfd-wp uses pte markers to mark wr-protected pages for both shmem and hugetlb. Shmem has pre-allocation ready for markers, but hugetlb path was overlooked. Doing so by calling huge_pte_alloc() if the initial pgtable walk fails to find the huge ptep. It's possible that huge_pte_alloc() can fail with high memory pressure, in that case stop the loop immediately and fail silently. This is not the most ideal solution but it matches with what we do with shmem meanwhile it avoids the splat in dmesg. Link: https://lkml.kernel.org/r/20230104225207.1066932-2-peterx@redhat.com Fixes: 60dfaad65aa9 ("mm/hugetlb: allow uffd wr-protect none ptes") Signed-off-by: Peter Xu <peterx@redhat.com> Reported-by: James Houghton <jthoughton@google.com> Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com> Acked-by: David Hildenbrand <david@redhat.com> Acked-by: James Houghton <jthoughton@google.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: Muchun Song <songmuchun@bytedance.com> Cc: Nadav Amit <nadav.amit@gmail.com> Cc: <stable@vger.kernel.org> [5.19+] Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-01-12hugetlb: unshare some PMDs when splitting VMAsJames Houghton1-9/+35
PMD sharing can only be done in PUD_SIZE-aligned pieces of VMAs; however, it is possible that HugeTLB VMAs are split without unsharing the PMDs first. Without this fix, it is possible to hit the uffd-wp-related WARN_ON_ONCE in hugetlb_change_protection [1]. The key there is that hugetlb_unshare_all_pmds will not attempt to unshare PMDs in non-PUD_SIZE-aligned sections of the VMA. It might seem ideal to unshare in hugetlb_vm_op_open, but we need to unshare in both the new and old VMAs, so unsharing in hugetlb_vm_op_split seems natural. [1]: https://lore.kernel.org/linux-mm/CADrL8HVeOkj0QH5VZZbRzybNE8CG-tEGFshnA+bG9nMgcWtBSg@mail.gmail.com/ Link: https://lkml.kernel.org/r/20230104231910.1464197-1-jthoughton@google.com Fixes: 6dfeaff93be1 ("hugetlb/userfaultfd: unshare all pmds for hugetlbfs when register wp") Signed-off-by: James Houghton <jthoughton@google.com> Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com> Acked-by: Peter Xu <peterx@redhat.com> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: Muchun Song <songmuchun@bytedance.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-01-12mm/shmem: restore SHMEM_HUGE_DENY precedence over MADV_COLLAPSEZach O'Keefe1-4/+2
SHMEM_HUGE_DENY is for emergency use by the admin, to disable allocation of shmem huge pages if, for example, a dangerous bug is found in their usage: see "deny" in Documentation/mm/transhuge.rst. An app using madvise(,,MADV_COLLAPSE) should not be allowed to override it: restore its precedence over shmem_huge_force. Restore SHMEM_HUGE_DENY precedence over MADV_COLLAPSE. Link: https://lkml.kernel.org/r/20221224082035.3197140-2-zokeefe@google.com Fixes: 7c6c6cc4d3a2 ("mm/shmem: add flag to enforce shmem THP in hugepage_vma_check()") Signed-off-by: Zach O'Keefe <zokeefe@google.com> Suggested-by: Hugh Dickins <hughd@google.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Yang Shi <shy828301@gmail.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-01-12mm/MADV_COLLAPSE: don't expand collapse when vm_end is past requested endZach O'Keefe1-1/+1
MADV_COLLAPSE acts on one hugepage-aligned/sized region at a time, until it has collapsed all eligible memory contained within the bounds supplied by the user. At the top of each hugepage iteration we (re)lock mmap_lock and (re)validate the VMA for eligibility and update variables that might have changed while mmap_lock was dropped. One thing that might occur is that the VMA could be resized, and as such, we refetch vma->vm_end to make sure we don't collapse past the end of the VMA's new end. However, it's possible that when refetching vma->vm_end that we expand the region acted on by MADV_COLLAPSE if vma->vm_end is greater than size+len supplied by the user. The consequence here is that we may attempt to collapse more memory than requested, possibly yielding either "too much success" or "false failure" user-visible results. An example of the former is if we MADV_COLLAPSE the first 4MiB of a 2TiB mmap()'d file, the incorrect refetch would cause the operation to block for much longer than anticipated as we attempt to collapse the entire TiB region. An example of the latter is that applying MADV_COLLPSE to a 4MiB file mapped to the start of a 6MiB VMA will successfully collapse the first 4MiB, then incorrectly attempt to collapse the last hugepage-aligned/sized region -- fail (since readahead/page cache lookup will fail) -- and report a failure to the user. I don't believe there is a kernel stability concern here as we always (re)validate the VMA / region accordingly. Also as Hugh mentions, the user-visible effects are: we try to collapse more memory than requested by the user, and/or failing an operation that should have otherwise succeeded. An example is trying to collapse a 4MiB file contained within a 12MiB VMA. Don't expand the acted-on region when refetching vma->vm_end. Link: https://lkml.kernel.org/r/20221224082035.3197140-1-zokeefe@google.com Fixes: 4d24de9425f7 ("mm: MADV_COLLAPSE: refetch vm_end after reacquiring mmap_lock") Signed-off-by: Zach O'Keefe <zokeefe@google.com> Reported-by: Hugh Dickins <hughd@google.com> Cc: Yang Shi <shy828301@gmail.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-01-12mm/userfaultfd: enable writenotify while userfaultfd-wp is enabled for a VMADavid Hildenbrand1-0/+4
Currently, we don't enable writenotify when enabling userfaultfd-wp on a shared writable mapping (for now only shmem and hugetlb). The consequence is that vma->vm_page_prot will still include write permissions, to be set as default for all PTEs that get remapped (e.g., mprotect(), NUMA hinting, page migration, ...). So far, vma->vm_page_prot is assumed to be a safe default, meaning that we only add permissions (e.g., mkwrite) but not remove permissions (e.g., wrprotect). For example, when enabling softdirty tracking, we enable writenotify. With uffd-wp on shared mappings, that changed. More details on vma->vm_page_prot semantics were summarized in [1]. This is problematic for uffd-wp: we'd have to manually check for a uffd-wp PTEs/PMDs and manually write-protect PTEs/PMDs, which is error prone. Prone to such issues is any code that uses vma->vm_page_prot to set PTE permissions: primarily pte_modify() and mk_pte(). Instead, let's enable writenotify such that PTEs/PMDs/... will be mapped write-protected as default and we will only allow selected PTEs that are definitely safe to be mapped without write-protection (see can_change_pte_writable()) to be writable. In the future, we might want to enable write-bit recovery -- e.g., can_change_pte_writable() -- at more locations, for example, also when removing uffd-wp protection. This fixes two known cases: (a) remove_migration_pte() mapping uffd-wp'ed PTEs writable, resulting in uffd-wp not triggering on write access. (b) do_numa_page() / do_huge_pmd_numa_page() mapping uffd-wp'ed PTEs/PMDs writable, resulting in uffd-wp not triggering on write access. Note that do_numa_page() / do_huge_pmd_numa_page() can be reached even without NUMA hinting (which currently doesn't seem to be applicable to shmem), for example, by using uffd-wp with a PROT_WRITE shmem VMA. On such a VMA, userfaultfd-wp is currently non-functional. Note that when enabling userfaultfd-wp, there is no need to walk page tables to enforce the new default protection for the PTEs: we know that they cannot be uffd-wp'ed yet, because that can only happen after enabling uffd-wp for the VMA in general. Also note that this makes mprotect() on ranges with uffd-wp'ed PTEs not accidentally set the write bit -- which would result in uffd-wp not triggering on later write access. This commit makes uffd-wp on shmem behave just like uffd-wp on anonymous memory in that regard, even though, mixing mprotect with uffd-wp is controversial. [1] https://lkml.kernel.org/r/92173bad-caa3-6b43-9d1e-9a471fdbc184@redhat.com Link: https://lkml.kernel.org/r/20221209080912.7968-1-david@redhat.com Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs") Signed-off-by: David Hildenbrand <david@redhat.com> Reported-by: Ives van Hoorne <ives@codesandbox.io> Debugged-by: Peter Xu <peterx@redhat.com> Acked-by: Peter Xu <peterx@redhat.com> Cc: Hugh Dickins <hughd@google.com> Cc: Alistair Popple <apopple@nvidia.com> Cc: Mike Rapoport <rppt@linux.vnet.ibm.com> Cc: Nadav Amit <nadav.amit@gmail.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-01-12mm/khugepaged: fix collapse_pte_mapped_thp() to allow anon_vmaHugh Dickins1-8/+6
uprobe_write_opcode() uses collapse_pte_mapped_thp() to restore huge pmd, when removing a breakpoint from hugepage text: vma->anon_vma is always set in that case, so undo the prohibition. And MADV_COLLAPSE ought to be able to collapse some page tables in a vma which happens to have anon_vma set from CoWing elsewhere. Is anon_vma lock required? Almost not: if any page other than expected subpage of the non-anon huge page is found in the page table, collapse is aborted without making any change. However, it is possible that an anon page was CoWed from this extent in another mm or vma, in which case a concurrent lookup might look here: so keep it away while clearing pmd (but perhaps we shall go back to using pmd_lock() there in future). Note that collapse_pte_mapped_thp() is exceptional in freeing a page table without having cleared its ptes: I'm uneasy about that, and had thought pte_clear()ing appropriate; but exclusive i_mmap lock does fix the problem, and we would have to move the mmu_notification if clearing those ptes. What this fixes is not a dangerous instability. But I suggest Cc stable because uprobes "healing" has regressed in that way, so this should follow 8d3c106e19e8 into those stable releases where it was backported (and may want adjustment there - I'll supply backports as needed). Link: https://lkml.kernel.org/r/b740c9fb-edba-92ba-59fb-7a5592e5dfc@google.com Fixes: 8d3c106e19e8 ("mm/khugepaged: take the right locks for page table retraction") Signed-off-by: Hugh Dickins <hughd@google.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Jann Horn <jannh@google.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Zach O'Keefe <zokeefe@google.com> Cc: Song Liu <songliubraving@fb.com> Cc: <stable@vger.kernel.org> [5.4+] Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-01-12mm/hugetlb: fix uffd-wp handling for migration entries in ↵David Hildenbrand1-8/+9
hugetlb_change_protection() We have to update the uffd-wp SWP PTE bit independent of the type of migration entry. Currently, if we're unlucky and we want to install/clear the uffd-wp bit just while we're migrating a read-only mapped hugetlb page, we would miss to set/clear the uffd-wp bit. Further, if we're processing a readable-exclusive migration entry and neither want to set or clear the uffd-wp bit, we could currently end up losing the uffd-wp bit. Note that the same would hold for writable migrating entries, however, having a writable migration entry with the uffd-wp bit set would already mean that something went wrong. Note that the change from !is_readable_migration_entry -> writable_migration_entry is harmless and actually cleaner, as raised by Miaohe Lin and discussed in [1]. [1] https://lkml.kernel.org/r/90dd6a93-4500-e0de-2bf0-bf522c311b0c@huawei.com Link: https://lkml.kernel.org/r/20221222205511.675832-3-david@redhat.com Fixes: 60dfaad65aa9 ("mm/hugetlb: allow uffd wr-protect none ptes") Signed-off-by: David Hildenbrand <david@redhat.com> Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com> Cc: Miaohe Lin <linmiaohe@huawei.com> Cc: Muchun Song <muchun.song@linux.dev> Cc: Peter Xu <peterx@redhat.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-01-12mm/hugetlb: fix PTE marker handling in hugetlb_change_protection()David Hildenbrand1-14/+7
Patch series "mm/hugetlb: uffd-wp fixes for hugetlb_change_protection()". Playing with virtio-mem and background snapshots (using uffd-wp) on hugetlb in QEMU, I managed to trigger a VM_BUG_ON(). Looking into the details, hugetlb_change_protection() seems to not handle uffd-wp correctly in all cases. Patch #1 fixes my test case. I don't have reproducers for patch #2, as it requires running into migration entries. I did not yet check in detail yet if !hugetlb code requires similar care. This patch (of 2): There are two problematic cases when stumbling over a PTE marker in hugetlb_change_protection(): (1) We protect an uffd-wp PTE marker a second time using uffd-wp: we will end up in the "!huge_pte_none(pte)" case and mess up the PTE marker. (2) We unprotect a uffd-wp PTE marker: we will similarly end up in the "!huge_pte_none(pte)" case even though we cleared the PTE, because the "pte" variable is stale. We'll mess up the PTE marker. For example, if we later stumble over such a "wrongly modified" PTE marker, we'll treat it like a present PTE that maps some garbage page. This can, for example, be triggered by mapping a memfd backed by huge pages, registering uffd-wp, uffd-wp'ing an unmapped page and (a) uffd-wp'ing it a second time; or (b) uffd-unprotecting it; or (c) unregistering uffd-wp. Then, ff we trigger fallocate(FALLOC_FL_PUNCH_HOLE) on that file range, we will run into a VM_BUG_ON: [ 195.039560] page:00000000ba1f2987 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x0 [ 195.039565] flags: 0x7ffffc0001000(reserved|node=0|zone=0|lastcpupid=0x1fffff) [ 195.039568] raw: 0007ffffc0001000 ffffe742c0000008 ffffe742c0000008 0000000000000000 [ 195.039569] raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000 [ 195.039569] page dumped because: VM_BUG_ON_PAGE(compound && !PageHead(page)) [ 195.039573] ------------[ cut here ]------------ [ 195.039574] kernel BUG at mm/rmap.c:1346! [ 195.039579] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI [ 195.039581] CPU: 7 PID: 4777 Comm: qemu-system-x86 Not tainted 6.0.12-200.fc36.x86_64 #1 [ 195.039583] Hardware name: LENOVO 20WNS1F81N/20WNS1F81N, BIOS N35ET50W (1.50 ) 09/15/2022 [ 195.039584] RIP: 0010:page_remove_rmap+0x45b/0x550 [ 195.039588] Code: [...] [ 195.039589] RSP: 0018:ffffbc03c3633ba8 EFLAGS: 00010292 [ 195.039591] RAX: 0000000000000040 RBX: ffffe742c0000000 RCX: 0000000000000000 [ 195.039592] RDX: 0000000000000002 RSI: ffffffff8e7aac1a RDI: 00000000ffffffff [ 195.039592] RBP: 0000000000000001 R08: 0000000000000000 R09: ffffbc03c3633a08 [ 195.039593] R10: 0000000000000003 R11: ffffffff8f146328 R12: ffff9b04c42754b0 [ 195.039594] R13: ffffffff8fcc6328 R14: ffffbc03c3633c80 R15: ffff9b0484ab9100 [ 195.039595] FS: 00007fc7aaf68640(0000) GS:ffff9b0bbf7c0000(0000) knlGS:0000000000000000 [ 195.039596] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 195.039597] CR2: 000055d402c49110 CR3: 0000000159392003 CR4: 0000000000772ee0 [ 195.039598] PKRU: 55555554 [ 195.039599] Call Trace: [ 195.039600] <TASK> [ 195.039602] __unmap_hugepage_range+0x33b/0x7d0 [ 195.039605] unmap_hugepage_range+0x55/0x70 [ 195.039608] hugetlb_vmdelete_list+0x77/0xa0 [ 195.039611] hugetlbfs_fallocate+0x410/0x550 [ 195.039612] ? _raw_spin_unlock_irqrestore+0x23/0x40 [ 195.039616] vfs_fallocate+0x12e/0x360 [ 195.039618] __x64_sys_fallocate+0x40/0x70 [ 195.039620] do_syscall_64+0x58/0x80 [ 195.039623] ? syscall_exit_to_user_mode+0x17/0x40 [ 195.039624] ? do_syscall_64+0x67/0x80 [ 195.039626] entry_SYSCALL_64_after_hwframe+0x63/0xcd [ 195.039628] RIP: 0033:0x7fc7b590651f [ 195.039653] Code: [...] [ 195.039654] RSP: 002b:00007fc7aaf66e70 EFLAGS: 00000293 ORIG_RAX: 000000000000011d [ 195.039655] RAX: ffffffffffffffda RBX: 0000558ef4b7f370 RCX: 00007fc7b590651f [ 195.039656] RDX: 0000000018000000 RSI: 0000000000000003 RDI: 000000000000000c [ 195.039657] RBP: 0000000008000000 R08: 0000000000000000 R09: 0000000000000073 [ 195.039658] R10: 0000000008000000 R11: 0000000000000293 R12: 0000000018000000 [ 195.039658] R13: 00007fb8bbe00000 R14: 000000000000000c R15: 0000000000001000 [ 195.039661] </TASK> Fix it by not going into the "!huge_pte_none(pte)" case if we stumble over an exclusive marker. spin_unlock() + continue would get the job done. However, instead, make it clearer that there are no fall-through statements: we process each case (hwpoison, migration, marker, !none, none) and then unlock the page table to continue with the next PTE. Let's avoid "continue" statements and use a single spin_unlock() at the end. Link: https://lkml.kernel.org/r/20221222205511.675832-1-david@redhat.com Link: https://lkml.kernel.org/r/20221222205511.675832-2-david@redhat.com Fixes: 60dfaad65aa9 ("mm/hugetlb: allow uffd wr-protect none ptes") Signed-off-by: David Hildenbrand <david@redhat.com> Reviewed-by: Peter Xu <peterx@redhat.com> Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com> Cc: Miaohe Lin <linmiaohe@huawei.com> Cc: Muchun Song <muchun.song@linux.dev> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-01-08mm: Always release pages to the buddy allocator in memblock_free_late().Aaron Thompson1-1/+7
If CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled, memblock_free_pages() only releases pages to the buddy allocator if they are not in the deferred range. This is correct for free pages (as defined by for_each_free_mem_pfn_range_in_zone()) because free pages in the deferred range will be initialized and released as part of the deferred init process. memblock_free_pages() is called by memblock_free_late(), which is used to free reserved ranges after memblock_free_all() has run. All pages in reserved ranges have been initialized at that point, and accordingly, those pages are not touched by the deferred init process. This means that currently, if the pages that memblock_free_late() intends to release are in the deferred range, they will never be released to the buddy allocator. They will forever be reserved. In addition, memblock_free_pages() calls kmsan_memblock_free_pages(), which is also correct for free pages but is not correct for reserved pages. KMSAN metadata for reserved pages is initialized by kmsan_init_shadow(), which runs shortly before memblock_free_all(). For both of these reasons, memblock_free_pages() should only be called for free pages, and memblock_free_late() should call __free_pages_core() directly instead. One case where this issue can occur in the wild is EFI boot on x86_64. The x86 EFI code reserves all EFI boot services memory ranges via memblock_reserve() and frees them later via memblock_free_late() (efi_reserve_boot_services() and efi_free_boot_services(), respectively). If any of those ranges happens to fall within the deferred init range, the pages will not be released and that memory will be unavailable. For example, on an Amazon EC2 t3.micro VM (1 GB) booting via EFI: v6.2-rc2: # grep -E 'Node|spanned|present|managed' /proc/zoneinfo Node 0, zone DMA spanned 4095 present 3999 managed 3840 Node 0, zone DMA32 spanned 246652 present 245868 managed 178867 v6.2-rc2 + patch: # grep -E 'Node|spanned|present|managed' /proc/zoneinfo Node 0, zone DMA spanned 4095 present 3999 managed 3840 Node 0, zone DMA32 spanned 246652 present 245868 managed 222816 # +43,949 pages Fixes: 3a80a7fa7989 ("mm: meminit: initialise a subset of struct pages if CONFIG_DEFERRED_STRUCT_PAGE_INIT is set") Signed-off-by: Aaron Thompson <dev@aaront.org> Link: https://lore.kernel.org/r/01010185892de53e-e379acfb-7044-4b24-b30a-e2657c1ba989-000000@us-west-2.amazonses.com Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
2023-01-04memblock: Fix doc for memblock_phys_freeMiaoqian Lin1-1/+1
memblock_phys_free() is the counterpart to memblock_phys_alloc. Change memblock_alloc_xx() with memblock_phys_alloc_xx() to keep consistency. Signed-off-by: Miaoqian Lin <linmq006@gmail.com> Link: https://lore.kernel.org/r/20221216100304.688209-1-linmq006@gmail.com Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>