author     Linus Torvalds <torvalds@linux-foundation.org>  2020-04-02 23:55:34 +0300
committer  Linus Torvalds <torvalds@linux-foundation.org>  2020-04-02 23:55:34 +0300
commit     6cad420cc695867b4ca710bac21fde21a4102e4b
tree       890d42abc1e82c2cf5cef583584f88ca70116ce9 /mm
parent     7be97138e7276c71cc9ad1752dcb502d28f4400d
parent     77d6b9094819ba55353de0ef92957f3f54f2c36c
download   linux-6cad420cc695867b4ca710bac21fde21a4102e4b.tar.xz
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:
"A large amount of MM, plenty more to come.
Subsystems affected by this patch series:
- tools
- kthread
- kbuild
- scripts
- ocfs2
- vfs
- mm: slub, kmemleak, pagecache, gup, swap, memcg, pagemap, mremap,
sparsemem, kasan, pagealloc, vmscan, compaction, mempolicy,
hugetlbfs, hugetlb"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (155 commits)
include/linux/huge_mm.h: check PageTail in hpage_nr_pages even when !THP
mm/hugetlb: fix build failure with HUGETLB_PAGE but not HUGEBTLBFS
selftests/vm: fix map_hugetlb length used for testing read and write
mm/hugetlb: remove unnecessary memory fetch in PageHeadHuge()
mm/hugetlb.c: clean code by removing unnecessary initialization
hugetlb_cgroup: add hugetlb_cgroup reservation docs
hugetlb_cgroup: add hugetlb_cgroup reservation tests
hugetlb: support file_region coalescing again
hugetlb_cgroup: support noreserve mappings
hugetlb_cgroup: add accounting for shared mappings
hugetlb: disable region_add file_region coalescing
hugetlb_cgroup: add reservation accounting for private mappings
mm/hugetlb_cgroup: fix hugetlb_cgroup migration
hugetlb_cgroup: add interface for charge/uncharge hugetlb reservations
hugetlb_cgroup: add hugetlb_cgroup reservation counter
hugetlbfs: Use i_mmap_rwsem to address page fault/truncate race
hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization
mm/memblock.c: remove redundant assignment to variable max_addr
mm: mempolicy: require at least one nodeid for MPOL_PREFERRED
mm: mempolicy: use VM_BUG_ON_VMA in queue_pages_test_walk()
...
Diffstat (limited to 'mm')
47 files changed, 2275 insertions, 917 deletions
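The gup.c changes in the diff below activate the FOLL_PIN / pin_user_pages*() family alongside the existing FOLL_GET path. The following is a hedged illustration only: a minimal sketch of a driver-style caller (the helper name is made up and a kernel build context is assumed), not code from this series, showing the intended calling convention for short-term "Case 1" (DIO) pins, which must be released with unpin_user_page()/unpin_user_pages() rather than put_page():

#include <linux/mm.h>

/*
 * Hypothetical example: pin a user buffer for a short-term DMA transfer.
 * pin_user_pages_fast() sets FOLL_PIN internally; the caller must not
 * pass FOLL_GET. It returns the number of pages pinned or a negative
 * errno.
 */
static int example_pin_user_buffer(unsigned long uaddr, int nr_pages,
                                   struct page **pages)
{
        int pinned;

        pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE, pages);
        if (pinned < 0)
                return pinned;          /* hard error */
        if (pinned != nr_pages) {
                /* Partial pin: drop what we got and report failure. */
                unpin_user_pages(pages, pinned);
                return -EFAULT;
        }

        /* ... do the I/O ... then release with unpin_user_pages(pages, nr_pages); */
        return 0;
}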
diff --git a/mm/Makefile b/mm/Makefile index 272e66039e70..dbc8346d16ca 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -6,6 +6,7 @@ KASAN_SANITIZE_slab_common.o := n KASAN_SANITIZE_slab.o := n KASAN_SANITIZE_slub.o := n +KCSAN_SANITIZE_kmemleak.o := n # These files are disabled because they produce non-interesting and/or # flaky coverage that is not a function of syscall inputs. E.g. slab is out of diff --git a/mm/compaction.c b/mm/compaction.c index 672d3c78c6ab..df3da2f76fdc 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -894,12 +894,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* * Regardless of being on LRU, compound pages such as THP and - * hugetlbfs are not to be compacted. We can potentially save - * a lot of iterations if we skip them at once. The check is - * racy, but we can consider only valid values and the only - * danger is skipping too much. + * hugetlbfs are not to be compacted unless we are attempting + * an allocation much larger than the huge page size (eg CMA). + * We can potentially save a lot of iterations if we skip them + * at once. The check is racy, but we can consider only valid + * values and the only danger is skipping too much. */ - if (PageCompound(page)) { + if (PageCompound(page) && !cc->alloc_contig) { const unsigned int order = compound_order(page); if (likely(order < MAX_ORDER)) @@ -969,7 +970,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * and it's on LRU. It can only be a THP so the order * is safe to read and it's 0 for tail pages. */ - if (unlikely(PageCompound(page))) { + if (unlikely(PageCompound(page) && !cc->alloc_contig)) { low_pfn += compound_nr(page) - 1; goto isolate_fail; } @@ -981,12 +982,15 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (__isolate_lru_page(page, isolate_mode) != 0) goto isolate_fail; - VM_BUG_ON_PAGE(PageCompound(page), page); + /* The whole page is taken off the LRU; skip the tail pages. */ + if (PageCompound(page)) + low_pfn += compound_nr(page) - 1; /* Successfully isolated */ del_page_from_lru_list(page, lruvec, page_lru(page)); - inc_node_page_state(page, - NR_ISOLATED_ANON + page_is_file_cache(page)); + mod_node_page_state(page_pgdat(page), + NR_ISOLATED_ANON + page_is_file_cache(page), + hpage_nr_pages(page)); isolate_success: list_add(&page->lru, &cc->migratepages); @@ -1590,7 +1594,11 @@ typedef enum { * Allow userspace to control policy on scanning the unevictable LRU for * compactable pages. 
*/ +#ifdef CONFIG_PREEMPT_RT +int sysctl_compact_unevictable_allowed __read_mostly = 0; +#else int sysctl_compact_unevictable_allowed __read_mostly = 1; +#endif static inline void update_fast_start_pfn(struct compact_control *cc, unsigned long pfn) @@ -2174,7 +2182,6 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) ret = COMPACT_CONTENDED; putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; - last_migrated_pfn = 0; goto out; case ISOLATE_NONE: if (update_cached) { @@ -2310,8 +2317,7 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .page = NULL, }; - if (capture) - current->capture_control = &capc; + current->capture_control = &capc; ret = compact_zone(&cc, &capc); @@ -2333,6 +2339,7 @@ int sysctl_extfrag_threshold = 500; * @alloc_flags: The allocation flags of the current allocation * @ac: The context of current allocation * @prio: Determines how hard direct compaction should try to succeed + * @capture: Pointer to free page created by compaction will be stored here * * This is the main entry point for direct page compaction. */ diff --git a/mm/debug.c b/mm/debug.c index ecccd9f17801..2189357f0987 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -44,8 +44,10 @@ const struct trace_print_flags vmaflag_names[] = { void __dump_page(struct page *page, const char *reason) { + struct page *head = compound_head(page); struct address_space *mapping; bool page_poisoned = PagePoisoned(page); + bool compound = PageCompound(page); /* * Accessing the pageblock without the zone lock. It could change to * "isolate" again in the meantime, but since we are just dumping the @@ -66,25 +68,43 @@ void __dump_page(struct page *page, const char *reason) goto hex_only; } - mapping = page_mapping(page); + if (page < head || (page >= head + MAX_ORDER_NR_PAGES)) { + /* Corrupt page, cannot call page_mapping */ + mapping = page->mapping; + head = page; + compound = false; + } else { + mapping = page_mapping(page); + } /* * Avoid VM_BUG_ON() in page_mapcount(). * page->_mapcount space in struct page is used by sl[aou]b pages to * encode own info. */ - mapcount = PageSlab(page) ? 0 : page_mapcount(page); + mapcount = PageSlab(head) ? 
0 : page_mapcount(page); - if (PageCompound(page)) - pr_warn("page:%px refcount:%d mapcount:%d mapping:%px " - "index:%#lx compound_mapcount: %d\n", - page, page_ref_count(page), mapcount, - page->mapping, page_to_pgoff(page), - compound_mapcount(page)); + if (compound) + if (hpage_pincount_available(page)) { + pr_warn("page:%px refcount:%d mapcount:%d mapping:%p " + "index:%#lx head:%px order:%u " + "compound_mapcount:%d compound_pincount:%d\n", + page, page_ref_count(head), mapcount, + mapping, page_to_pgoff(page), head, + compound_order(head), compound_mapcount(page), + compound_pincount(page)); + } else { + pr_warn("page:%px refcount:%d mapcount:%d mapping:%p " + "index:%#lx head:%px order:%u " + "compound_mapcount:%d\n", + page, page_ref_count(head), mapcount, + mapping, page_to_pgoff(page), head, + compound_order(head), compound_mapcount(page)); + } else - pr_warn("page:%px refcount:%d mapcount:%d mapping:%px index:%#lx\n", + pr_warn("page:%px refcount:%d mapcount:%d mapping:%p index:%#lx\n", page, page_ref_count(page), mapcount, - page->mapping, page_to_pgoff(page)); + mapping, page_to_pgoff(page)); if (PageKsm(page)) type = "ksm "; else if (PageAnon(page)) @@ -106,6 +126,10 @@ hex_only: print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32, sizeof(unsigned long), page, sizeof(struct page), false); + if (head != page) + print_hex_dump(KERN_WARNING, "head: ", DUMP_PREFIX_NONE, 32, + sizeof(unsigned long), head, + sizeof(struct page), false); if (reason) pr_warn("page dumped because: %s\n", reason); diff --git a/mm/filemap.c b/mm/filemap.c index 1784478270e1..0fbdc8e30dd2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1386,7 +1386,7 @@ EXPORT_SYMBOL_GPL(__lock_page_killable); int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { - if (flags & FAULT_FLAG_ALLOW_RETRY) { + if (fault_flag_allow_retry_first(flags)) { /* * CAUTION! In this case, mmap_sem is not released * even though return 0. @@ -1536,7 +1536,6 @@ out: return page; } -EXPORT_SYMBOL(find_get_entry); /** * find_lock_entry - locate, pin and lock a page cache entry @@ -1575,42 +1574,39 @@ repeat: EXPORT_SYMBOL(find_lock_entry); /** - * pagecache_get_page - find and get a page reference - * @mapping: the address_space to search - * @offset: the page index - * @fgp_flags: PCG flags - * @gfp_mask: gfp mask to use for the page cache data page allocation - * - * Looks up the page cache slot at @mapping & @offset. + * pagecache_get_page - Find and get a reference to a page. + * @mapping: The address_space to search. + * @index: The page index. + * @fgp_flags: %FGP flags modify how the page is returned. + * @gfp_mask: Memory allocation flags to use if %FGP_CREAT is specified. * - * PCG flags modify how the page is returned. + * Looks up the page cache entry at @mapping & @index. * - * @fgp_flags can be: + * @fgp_flags can be zero or more of these flags: * - * - FGP_ACCESSED: the page will be marked accessed - * - FGP_LOCK: Page is return locked - * - FGP_CREAT: If page is not present then a new page is allocated using - * @gfp_mask and added to the page cache and the VM's LRU - * list. The page is returned locked and with an increased - * refcount. - * - FGP_FOR_MMAP: Similar to FGP_CREAT, only we want to allow the caller to do - * its own locking dance if the page is already in cache, or unlock the page - * before returning if we had to add the page to pagecache. + * * %FGP_ACCESSED - The page will be marked accessed. + * * %FGP_LOCK - The page is returned locked. 
+ * * %FGP_CREAT - If no page is present then a new page is allocated using + * @gfp_mask and added to the page cache and the VM's LRU list. + * The page is returned locked and with an increased refcount. + * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the + * page is already in cache. If the page was allocated, unlock it before + * returning so the caller can do the same dance. * - * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even - * if the GFP flags specified for FGP_CREAT are atomic. + * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even + * if the %GFP flags specified for %FGP_CREAT are atomic. * * If there is a page cache page, it is returned with an increased refcount. * - * Return: the found page or %NULL otherwise. + * Return: The found page or %NULL otherwise. */ -struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, - int fgp_flags, gfp_t gfp_mask) +struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, + int fgp_flags, gfp_t gfp_mask) { struct page *page; repeat: - page = find_get_entry(mapping, offset); + page = find_get_entry(mapping, index); if (xa_is_value(page)) page = NULL; if (!page) @@ -1632,7 +1628,7 @@ repeat: put_page(page); goto repeat; } - VM_BUG_ON_PAGE(page->index != offset, page); + VM_BUG_ON_PAGE(page->index != index, page); } if (fgp_flags & FGP_ACCESSED) @@ -1657,7 +1653,7 @@ no_page: if (fgp_flags & FGP_ACCESSED) __SetPageReferenced(page); - err = add_to_page_cache_lru(page, mapping, offset, gfp_mask); + err = add_to_page_cache_lru(page, mapping, index, gfp_mask); if (unlikely(err)) { put_page(page); page = NULL; @@ -1962,8 +1958,7 @@ EXPORT_SYMBOL(find_get_pages_range_tag); * * It is going insane. Fix it by quickly scaling down the readahead size. */ -static void shrink_readahead_size_eio(struct file *filp, - struct file_ra_state *ra) +static void shrink_readahead_size_eio(struct file_ra_state *ra) { ra->ra_pages /= 4; } @@ -2188,7 +2183,7 @@ readpage: goto find_page; } unlock_page(page); - shrink_readahead_size_eio(filp, ra); + shrink_readahead_size_eio(ra); error = -EIO; goto readpage_error; } @@ -2416,7 +2411,7 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, pgoff_t offset = vmf->pgoff; /* If we don't want any read-ahead, don't bother */ - if (vmf->vma->vm_flags & VM_RAND_READ) + if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) return fpin; if (ra->mmap_miss > 0) ra->mmap_miss--; @@ -2491,7 +2486,7 @@ retry_find: if (!page) { if (fpin) goto out_retry; - return vmf_error(-ENOMEM); + return VM_FAULT_OOM; } } @@ -2560,7 +2555,7 @@ page_not_uptodate: goto retry_find; /* Things didn't work out. Return zero to tell the mm layer so. */ - shrink_readahead_size_eio(file, ra); + shrink_readahead_size_eio(ra); return VM_FAULT_SIGBUS; out_retry: @@ -2823,6 +2818,14 @@ filler: unlock_page(page); goto out; } + + /* + * A previous I/O error may have been due to temporary + * failures. + * Clear page error before actual read, PG_error will be + * set again if read page fails. 
+ */ + ClearPageError(page); goto filler; out: @@ -29,6 +29,22 @@ struct follow_page_context { unsigned int page_mask; }; +static void hpage_pincount_add(struct page *page, int refs) +{ + VM_BUG_ON_PAGE(!hpage_pincount_available(page), page); + VM_BUG_ON_PAGE(page != compound_head(page), page); + + atomic_add(refs, compound_pincount_ptr(page)); +} + +static void hpage_pincount_sub(struct page *page, int refs) +{ + VM_BUG_ON_PAGE(!hpage_pincount_available(page), page); + VM_BUG_ON_PAGE(page != compound_head(page), page); + + atomic_sub(refs, compound_pincount_ptr(page)); +} + /* * Return the compound head page with ref appropriately incremented, * or NULL if that failed. @@ -44,6 +60,195 @@ static inline struct page *try_get_compound_head(struct page *page, int refs) return head; } +/* + * try_grab_compound_head() - attempt to elevate a page's refcount, by a + * flags-dependent amount. + * + * "grab" names in this file mean, "look at flags to decide whether to use + * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount. + * + * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the + * same time. (That's true throughout the get_user_pages*() and + * pin_user_pages*() APIs.) Cases: + * + * FOLL_GET: page's refcount will be incremented by 1. + * FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS. + * + * Return: head page (with refcount appropriately incremented) for success, or + * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's + * considered failure, and furthermore, a likely bug in the caller, so a warning + * is also emitted. + */ +static __maybe_unused struct page *try_grab_compound_head(struct page *page, + int refs, + unsigned int flags) +{ + if (flags & FOLL_GET) + return try_get_compound_head(page, refs); + else if (flags & FOLL_PIN) { + int orig_refs = refs; + + /* + * Can't do FOLL_LONGTERM + FOLL_PIN with CMA in the gup fast + * path, so fail and let the caller fall back to the slow path. + */ + if (unlikely(flags & FOLL_LONGTERM) && + is_migrate_cma_page(page)) + return NULL; + + /* + * When pinning a compound page of order > 1 (which is what + * hpage_pincount_available() checks for), use an exact count to + * track it, via hpage_pincount_add/_sub(). + * + * However, be sure to *also* increment the normal page refcount + * field at least once, so that the page really is pinned. + */ + if (!hpage_pincount_available(page)) + refs *= GUP_PIN_COUNTING_BIAS; + + page = try_get_compound_head(page, refs); + if (!page) + return NULL; + + if (hpage_pincount_available(page)) + hpage_pincount_add(page, refs); + + mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, + orig_refs); + + return page; + } + + WARN_ON_ONCE(1); + return NULL; +} + +/** + * try_grab_page() - elevate a page's refcount by a flag-dependent amount + * + * This might not do anything at all, depending on the flags argument. + * + * "grab" names in this file mean, "look at flags to decide whether to use + * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount. + * + * @page: pointer to page to be grabbed + * @flags: gup flags: these are the FOLL_* flag values. + * + * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same + * time. Cases: + * + * FOLL_GET: page's refcount will be incremented by 1. + * FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS. + * + * Return: true for success, or if no action was required (if neither FOLL_PIN + * nor FOLL_GET was set, nothing is done). 
False for failure: FOLL_GET or + * FOLL_PIN was set, but the page could not be grabbed. + */ +bool __must_check try_grab_page(struct page *page, unsigned int flags) +{ + WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN)); + + if (flags & FOLL_GET) + return try_get_page(page); + else if (flags & FOLL_PIN) { + int refs = 1; + + page = compound_head(page); + + if (WARN_ON_ONCE(page_ref_count(page) <= 0)) + return false; + + if (hpage_pincount_available(page)) + hpage_pincount_add(page, 1); + else + refs = GUP_PIN_COUNTING_BIAS; + + /* + * Similar to try_grab_compound_head(): even if using the + * hpage_pincount_add/_sub() routines, be sure to + * *also* increment the normal page refcount field at least + * once, so that the page really is pinned. + */ + page_ref_add(page, refs); + + mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1); + } + + return true; +} + +#ifdef CONFIG_DEV_PAGEMAP_OPS +static bool __unpin_devmap_managed_user_page(struct page *page) +{ + int count, refs = 1; + + if (!page_is_devmap_managed(page)) + return false; + + if (hpage_pincount_available(page)) + hpage_pincount_sub(page, 1); + else + refs = GUP_PIN_COUNTING_BIAS; + + count = page_ref_sub_return(page, refs); + + mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, 1); + /* + * devmap page refcounts are 1-based, rather than 0-based: if + * refcount is 1, then the page is free and the refcount is + * stable because nobody holds a reference on the page. + */ + if (count == 1) + free_devmap_managed_page(page); + else if (!count) + __put_page(page); + + return true; +} +#else +static bool __unpin_devmap_managed_user_page(struct page *page) +{ + return false; +} +#endif /* CONFIG_DEV_PAGEMAP_OPS */ + +/** + * unpin_user_page() - release a dma-pinned page + * @page: pointer to page to be released + * + * Pages that were pinned via pin_user_pages*() must be released via either + * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so + * that such pages can be separately tracked and uniquely handled. In + * particular, interactions with RDMA and filesystems need special handling. + */ +void unpin_user_page(struct page *page) +{ + int refs = 1; + + page = compound_head(page); + + /* + * For devmap managed pages we need to catch refcount transition from + * GUP_PIN_COUNTING_BIAS to 1, when refcount reach one it means the + * page is free and we need to inform the device driver through + * callback. See include/linux/memremap.h and HMM for details. + */ + if (__unpin_devmap_managed_user_page(page)) + return; + + if (hpage_pincount_available(page)) + hpage_pincount_sub(page, 1); + else + refs = GUP_PIN_COUNTING_BIAS; + + if (page_ref_sub_and_test(page, refs)) + __put_page(page); + + mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, 1); +} +EXPORT_SYMBOL(unpin_user_page); + /** * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages * @pages: array of pages to be maybe marked dirty, and definitely released. @@ -193,6 +398,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, struct page *page; spinlock_t *ptl; pte_t *ptep, pte; + int ret; /* FOLL_GET and FOLL_PIN are mutually exclusive. 
*/ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == @@ -230,10 +436,11 @@ retry: } page = vm_normal_page(vma, address, pte); - if (!page && pte_devmap(pte) && (flags & FOLL_GET)) { + if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) { /* - * Only return device mapping pages in the FOLL_GET case since - * they are only valid while holding the pgmap reference. + * Only return device mapping pages in the FOLL_GET or FOLL_PIN + * case since they are only valid while holding the pgmap + * reference. */ *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap); if (*pgmap) @@ -250,8 +457,6 @@ retry: if (is_zero_pfn(pte_pfn(pte))) { page = pte_page(pte); } else { - int ret; - ret = follow_pfn_pte(vma, address, ptep, flags); page = ERR_PTR(ret); goto out; @@ -259,7 +464,6 @@ retry: } if (flags & FOLL_SPLIT && PageTransCompound(page)) { - int ret; get_page(page); pte_unmap_unlock(ptep, ptl); lock_page(page); @@ -271,9 +475,21 @@ retry: goto retry; } - if (flags & FOLL_GET) { - if (unlikely(!try_get_page(page))) { - page = ERR_PTR(-ENOMEM); + /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */ + if (unlikely(!try_grab_page(page, flags))) { + page = ERR_PTR(-ENOMEM); + goto out; + } + /* + * We need to make the page accessible if and only if we are going + * to access its content (the FOLL_PIN case). Please see + * Documentation/core-api/pin_user_pages.rst for details. + */ + if (flags & FOLL_PIN) { + ret = arch_make_page_accessible(page); + if (ret) { + unpin_user_page(page); + page = ERR_PTR(ret); goto out; } } @@ -537,7 +753,7 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, /* make this handle hugepd */ page = follow_huge_addr(mm, address, flags & FOLL_WRITE); if (!IS_ERR(page)) { - BUG_ON(flags & FOLL_GET); + WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN)); return page; } @@ -630,12 +846,12 @@ unmap: } /* - * mmap_sem must be held on entry. If @nonblocking != NULL and - * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released. - * If it is, *@nonblocking will be set to 0 and -EBUSY returned. + * mmap_sem must be held on entry. If @locked != NULL and *@flags + * does not include FOLL_NOWAIT, the mmap_sem may be released. If it + * is, *@locked will be set to 0 and -EBUSY returned. 
*/ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, - unsigned long address, unsigned int *flags, int *nonblocking) + unsigned long address, unsigned int *flags, int *locked) { unsigned int fault_flags = 0; vm_fault_t ret; @@ -647,12 +863,15 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, fault_flags |= FAULT_FLAG_WRITE; if (*flags & FOLL_REMOTE) fault_flags |= FAULT_FLAG_REMOTE; - if (nonblocking) - fault_flags |= FAULT_FLAG_ALLOW_RETRY; + if (locked) + fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; if (*flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; if (*flags & FOLL_TRIED) { - VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY); + /* + * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED + * can co-exist + */ fault_flags |= FAULT_FLAG_TRIED; } @@ -673,8 +892,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, } if (ret & VM_FAULT_RETRY) { - if (nonblocking && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) - *nonblocking = 0; + if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) + *locked = 0; return -EBUSY; } @@ -751,7 +970,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * only intends to ensure the pages are faulted in. * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. - * @nonblocking: whether waiting for disk IO or mmap_sem contention + * @locked: whether we're still with the mmap_sem held * * Returns either number of pages pinned (which may be less than the * number requested), or an error. Details about the return value: @@ -786,13 +1005,11 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * appropriate) must be called after the page is finished with, and * before put_page is called. * - * If @nonblocking != NULL, __get_user_pages will not wait for disk IO - * or mmap_sem contention, and if waiting is needed to pin all pages, - * *@nonblocking will be set to 0. Further, if @gup_flags does not - * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in - * this case. + * If @locked != NULL, *@locked will be set to 0 when mmap_sem is + * released by an up_read(). That can happen if @gup_flags does not + * have FOLL_NOWAIT. * - * A caller using such a combination of @nonblocking and @gup_flags + * A caller using such a combination of @locked and @gup_flags * must therefore hold the mmap_sem for reading only, and recognize * when it's been released. Otherwise, it must be held for either * reading or writing and will not be released. @@ -804,7 +1021,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *nonblocking) + struct vm_area_struct **vmas, int *locked) { long ret = 0, i = 0; struct vm_area_struct *vma = NULL; @@ -850,7 +1067,17 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, - gup_flags, nonblocking); + gup_flags, locked); + if (locked && *locked == 0) { + /* + * We've got a VM_FAULT_RETRY + * and we've lost mmap_sem. + * We must stop here. 
+ */ + BUG_ON(gup_flags & FOLL_NOWAIT); + BUG_ON(ret != 0); + goto out; + } continue; } } @@ -868,7 +1095,7 @@ retry: page = follow_page_mask(vma, start, foll_flags, &ctx); if (!page) { ret = faultin_page(tsk, vma, start, &foll_flags, - nonblocking); + locked); switch (ret) { case 0: goto retry; @@ -980,7 +1207,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, address = untagged_addr(address); if (unlocked) - fault_flags |= FAULT_FLAG_ALLOW_RETRY; + fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; retry: vma = find_extend_vma(mm, address); @@ -1004,7 +1231,6 @@ retry: down_read(&mm->mmap_sem); if (!(fault_flags & FAULT_FLAG_TRIED)) { *unlocked = true; - fault_flags &= ~FAULT_FLAG_ALLOW_RETRY; fault_flags |= FAULT_FLAG_TRIED; goto retry; } @@ -1088,17 +1314,36 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, if (likely(pages)) pages += ret; start += ret << PAGE_SHIFT; + lock_dropped = true; +retry: /* * Repeat on the address that fired VM_FAULT_RETRY - * without FAULT_FLAG_ALLOW_RETRY but with - * FAULT_FLAG_TRIED. + * with both FAULT_FLAG_ALLOW_RETRY and + * FAULT_FLAG_TRIED. Note that GUP can be interrupted + * by fatal signals, so we need to check it before we + * start trying again otherwise it can loop forever. */ + + if (fatal_signal_pending(current)) + break; + *locked = 1; - lock_dropped = true; - down_read(&mm->mmap_sem); + ret = down_read_killable(&mm->mmap_sem); + if (ret) { + BUG_ON(ret > 0); + if (!pages_done) + pages_done = ret; + break; + } + ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED, - pages, NULL, NULL); + pages, NULL, locked); + if (!*locked) { + /* Continue to retry until we succeeded */ + BUG_ON(ret != 0); + goto retry; + } if (ret != 1) { BUG_ON(ret > 1); if (!pages_done) @@ -1129,7 +1374,7 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, * @vma: target vma * @start: start address * @end: end address - * @nonblocking: + * @locked: whether the mmap_sem is still held * * This takes care of mlocking the pages too if VM_LOCKED is set. * @@ -1137,14 +1382,14 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, * * vma->vm_mm->mmap_sem must be held. * - * If @nonblocking is NULL, it may be held for read or write and will + * If @locked is NULL, it may be held for read or write and will * be unperturbed. * - * If @nonblocking is non-NULL, it must held for read only and may be - * released. If it's released, *@nonblocking will be set to 0. + * If @locked is non-NULL, it must held for read only and may be + * released. If it's released, *@locked will be set to 0. */ long populate_vma_page_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, int *nonblocking) + unsigned long start, unsigned long end, int *locked) { struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; @@ -1179,7 +1424,7 @@ long populate_vma_page_range(struct vm_area_struct *vma, * not result in a stack expansion that recurses back here. 
*/ return __get_user_pages(current, mm, start, nr_pages, gup_flags, - NULL, NULL, nonblocking); + NULL, NULL, locked); } /* @@ -1557,6 +1802,37 @@ static __always_inline long __gup_longterm_locked(struct task_struct *tsk, } #endif /* CONFIG_FS_DAX || CONFIG_CMA */ +#ifdef CONFIG_MMU +static long __get_user_pages_remote(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *locked) +{ + /* + * Parts of FOLL_LONGTERM behavior are incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. However, this only comes up if locked is set, and there are + * callers that do request FOLL_LONGTERM, but do not set locked. So, + * allow what we can. + */ + if (gup_flags & FOLL_LONGTERM) { + if (WARN_ON_ONCE(locked)) + return -EINVAL; + /* + * This will check the vmas (even if our vmas arg is NULL) + * and return -ENOTSUPP if DAX isn't allowed in this case: + */ + return __gup_longterm_locked(tsk, mm, start, nr_pages, pages, + vmas, gup_flags | FOLL_TOUCH | + FOLL_REMOTE); + } + + return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, + locked, + gup_flags | FOLL_TOUCH | FOLL_REMOTE); +} + /* * get_user_pages_remote() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or @@ -1619,7 +1895,6 @@ static __always_inline long __gup_longterm_locked(struct task_struct *tsk, * should use get_user_pages because it cannot pass * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. */ -#ifdef CONFIG_MMU long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, @@ -1632,28 +1907,8 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) return -EINVAL; - /* - * Parts of FOLL_LONGTERM behavior are incompatible with - * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on - * vmas. However, this only comes up if locked is set, and there are - * callers that do request FOLL_LONGTERM, but do not set locked. So, - * allow what we can. - */ - if (gup_flags & FOLL_LONGTERM) { - if (WARN_ON_ONCE(locked)) - return -EINVAL; - /* - * This will check the vmas (even if our vmas arg is NULL) - * and return -ENOTSUPP if DAX isn't allowed in this case: - */ - return __gup_longterm_locked(tsk, mm, start, nr_pages, pages, - vmas, gup_flags | FOLL_TOUCH | - FOLL_REMOTE); - } - - return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, - locked, - gup_flags | FOLL_TOUCH | FOLL_REMOTE); + return __get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags, + pages, vmas, locked); } EXPORT_SYMBOL(get_user_pages_remote); @@ -1665,6 +1920,15 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, { return 0; } + +static long __get_user_pages_remote(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *locked) +{ + return 0; +} #endif /* !CONFIG_MMU */ /* @@ -1804,7 +2068,31 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * This code is based heavily on the PowerPC implementation by Nick Piggin. 
*/ #ifdef CONFIG_HAVE_FAST_GUP + +static void put_compound_head(struct page *page, int refs, unsigned int flags) +{ + if (flags & FOLL_PIN) { + mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, + refs); + + if (hpage_pincount_available(page)) + hpage_pincount_sub(page, refs); + else + refs *= GUP_PIN_COUNTING_BIAS; + } + + VM_BUG_ON_PAGE(page_ref_count(page) < refs, page); + /* + * Calling put_page() for each ref is unnecessarily slow. Only the last + * ref needs a put_page(). + */ + if (refs > 1) + page_ref_sub(page, refs - 1); + put_page(page); +} + #ifdef CONFIG_GUP_GET_PTE_LOW_HIGH + /* * WARNING: only to be used in the get_user_pages_fast() implementation. * @@ -1860,13 +2148,17 @@ static inline pte_t gup_get_pte(pte_t *ptep) #endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, + unsigned int flags, struct page **pages) { while ((*nr) - nr_start) { struct page *page = pages[--(*nr)]; ClearPageReferenced(page); - put_page(page); + if (flags & FOLL_PIN) + unpin_user_page(page); + else + put_page(page); } } @@ -1899,7 +2191,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); if (unlikely(!pgmap)) { - undo_dev_pagemap(nr, nr_start, pages); + undo_dev_pagemap(nr, nr_start, flags, pages); goto pte_unmap; } } else if (pte_special(pte)) @@ -1908,17 +2200,30 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); - head = try_get_compound_head(page, 1); + head = try_grab_compound_head(page, 1, flags); if (!head) goto pte_unmap; if (unlikely(pte_val(pte) != pte_val(*ptep))) { - put_page(head); + put_compound_head(head, 1, flags); goto pte_unmap; } VM_BUG_ON_PAGE(compound_head(page) != head, page); + /* + * We need to make the page accessible if and only if we are + * going to access its content (the FOLL_PIN case). Please + * see Documentation/core-api/pin_user_pages.rst for + * details. 
+ */ + if (flags & FOLL_PIN) { + ret = arch_make_page_accessible(page); + if (ret) { + unpin_user_page(page); + goto pte_unmap; + } + } SetPageReferenced(page); pages[*nr] = page; (*nr)++; @@ -1953,7 +2258,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __gup_device_huge(unsigned long pfn, unsigned long addr, - unsigned long end, struct page **pages, int *nr) + unsigned long end, unsigned int flags, + struct page **pages, int *nr) { int nr_start = *nr; struct dev_pagemap *pgmap = NULL; @@ -1963,12 +2269,15 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, pgmap = get_dev_pagemap(pfn, pgmap); if (unlikely(!pgmap)) { - undo_dev_pagemap(nr, nr_start, pages); + undo_dev_pagemap(nr, nr_start, flags, pages); return 0; } SetPageReferenced(page); pages[*nr] = page; - get_page(page); + if (unlikely(!try_grab_page(page, flags))) { + undo_dev_pagemap(nr, nr_start, flags, pages); + return 0; + } (*nr)++; pfn++; } while (addr += PAGE_SIZE, addr != end); @@ -1979,48 +2288,52 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, } static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, - unsigned long end, struct page **pages, int *nr) + unsigned long end, unsigned int flags, + struct page **pages, int *nr) { unsigned long fault_pfn; int nr_start = *nr; fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - if (!__gup_device_huge(fault_pfn, addr, end, pages, nr)) + if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr)) return 0; if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { - undo_dev_pagemap(nr, nr_start, pages); + undo_dev_pagemap(nr, nr_start, flags, pages); return 0; } return 1; } static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, - unsigned long end, struct page **pages, int *nr) + unsigned long end, unsigned int flags, + struct page **pages, int *nr) { unsigned long fault_pfn; int nr_start = *nr; fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - if (!__gup_device_huge(fault_pfn, addr, end, pages, nr)) + if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr)) return 0; if (unlikely(pud_val(orig) != pud_val(*pudp))) { - undo_dev_pagemap(nr, nr_start, pages); + undo_dev_pagemap(nr, nr_start, flags, pages); return 0; } return 1; } #else static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, - unsigned long end, struct page **pages, int *nr) + unsigned long end, unsigned int flags, + struct page **pages, int *nr) { BUILD_BUG(); return 0; } static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr, - unsigned long end, struct page **pages, int *nr) + unsigned long end, unsigned int flags, + struct page **pages, int *nr) { BUILD_BUG(); return 0; @@ -2038,18 +2351,6 @@ static int record_subpages(struct page *page, unsigned long addr, return nr; } -static void put_compound_head(struct page *page, int refs) -{ - VM_BUG_ON_PAGE(page_ref_count(page) < refs, page); - /* - * Calling put_page() for each ref is unnecessarily slow. Only the last - * ref needs a put_page(). 
- */ - if (refs > 1) - page_ref_sub(page, refs - 1); - put_page(page); -} - #ifdef CONFIG_ARCH_HAS_HUGEPD static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, unsigned long sz) @@ -2083,12 +2384,12 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, page = head + ((addr & (sz-1)) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); - head = try_get_compound_head(head, refs); + head = try_grab_compound_head(head, refs, flags); if (!head) return 0; if (unlikely(pte_val(pte) != pte_val(*ptep))) { - put_compound_head(head, refs); + put_compound_head(head, refs, flags); return 0; } @@ -2136,18 +2437,19 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, if (pmd_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; - return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr); + return __gup_device_huge_pmd(orig, pmdp, addr, end, flags, + pages, nr); } page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); - head = try_get_compound_head(pmd_page(orig), refs); + head = try_grab_compound_head(pmd_page(orig), refs, flags); if (!head) return 0; if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { - put_compound_head(head, refs); + put_compound_head(head, refs, flags); return 0; } @@ -2157,7 +2459,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, } static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, - unsigned long end, unsigned int flags, struct page **pages, int *nr) + unsigned long end, unsigned int flags, + struct page **pages, int *nr) { struct page *head, *page; int refs; @@ -2168,18 +2471,19 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, if (pud_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; - return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr); + return __gup_device_huge_pud(orig, pudp, addr, end, flags, + pages, nr); } page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); - head = try_get_compound_head(pud_page(orig), refs); + head = try_grab_compound_head(pud_page(orig), refs, flags); if (!head) return 0; if (unlikely(pud_val(orig) != pud_val(*pudp))) { - put_compound_head(head, refs); + put_compound_head(head, refs, flags); return 0; } @@ -2203,12 +2507,12 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); - head = try_get_compound_head(pgd_page(orig), refs); + head = try_grab_compound_head(pgd_page(orig), refs, flags); if (!head) return 0; if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) { - put_compound_head(head, refs); + put_compound_head(head, refs, flags); return 0; } @@ -2370,7 +2674,15 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, { unsigned long len, end; unsigned long flags; - int nr = 0; + int nr_pinned = 0; + /* + * Internally (within mm/gup.c), gup fast variants must set FOLL_GET, + * because gup fast is always a "pin with a +1 page refcount" request. 
+ */ + unsigned int gup_flags = FOLL_GET; + + if (write) + gup_flags |= FOLL_WRITE; start = untagged_addr(start) & PAGE_MASK; len = (unsigned long) nr_pages << PAGE_SHIFT; @@ -2396,11 +2708,11 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) { local_irq_save(flags); - gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr); + gup_pgd_range(start, end, gup_flags, pages, &nr_pinned); local_irq_restore(flags); } - return nr; + return nr_pinned; } EXPORT_SYMBOL_GPL(__get_user_pages_fast); @@ -2432,10 +2744,10 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, struct page **pages) { unsigned long addr, len, end; - int nr = 0, ret = 0; + int nr_pinned = 0, ret = 0; if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | - FOLL_FORCE | FOLL_PIN))) + FOLL_FORCE | FOLL_PIN | FOLL_GET))) return -EINVAL; start = untagged_addr(start) & PAGE_MASK; @@ -2451,25 +2763,25 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) { local_irq_disable(); - gup_pgd_range(addr, end, gup_flags, pages, &nr); + gup_pgd_range(addr, end, gup_flags, pages, &nr_pinned); local_irq_enable(); - ret = nr; + ret = nr_pinned; } - if (nr < nr_pages) { + if (nr_pinned < nr_pages) { /* Try to get the remaining pages with get_user_pages */ - start += nr << PAGE_SHIFT; - pages += nr; + start += nr_pinned << PAGE_SHIFT; + pages += nr_pinned; - ret = __gup_longterm_unlocked(start, nr_pages - nr, + ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags, pages); /* Have to be a bit careful with return values */ - if (nr > 0) { + if (nr_pinned > 0) { if (ret < 0) - ret = nr; + ret = nr_pinned; else - ret += nr; + ret += nr_pinned; } } @@ -2478,11 +2790,11 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, /** * get_user_pages_fast() - pin user pages in memory - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @gup_flags: flags modifying pin behaviour - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying pin behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. * * Attempt to pin user pages in memory without taking mm->mmap_sem. * If not successful, it will fall back to taking the lock and @@ -2502,6 +2814,13 @@ int get_user_pages_fast(unsigned long start, int nr_pages, if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) return -EINVAL; + /* + * The caller may or may not have explicitly set FOLL_GET; either way is + * OK. However, internally (within mm/gup.c), gup fast variants must set + * FOLL_GET, because gup fast is always a "pin with a +1 page refcount" + * request. + */ + gup_flags |= FOLL_GET; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast); @@ -2509,9 +2828,18 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); /** * pin_user_pages_fast() - pin user pages in memory without taking locks * - * For now, this is a placeholder function, until various call sites are - * converted to use the correct get_user_pages*() or pin_user_pages*() API. So, - * this is identical to get_user_pages_fast(). 
+ * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying pin behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. + * + * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See + * get_user_pages_fast() for documentation on the function arguments, because + * the arguments here are identical. + * + * FOLL_PIN means that the pages must be released via unpin_user_page(). Please + * see Documentation/vm/pin_user_pages.rst for further details. * * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It * is NOT intended for Case 2 (RDMA: long-term pins). @@ -2519,21 +2847,39 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { - /* - * This is a placeholder, until the pin functionality is activated. - * Until then, just behave like the corresponding get_user_pages*() - * routine. - */ - return get_user_pages_fast(start, nr_pages, gup_flags, pages); + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + return -EINVAL; + + gup_flags |= FOLL_PIN; + return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(pin_user_pages_fast); /** * pin_user_pages_remote() - pin pages of a remote process (task != current) * - * For now, this is a placeholder function, until various call sites are - * converted to use the correct get_user_pages*() or pin_user_pages*() API. So, - * this is identical to get_user_pages_remote(). + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. + * @mm: mm_struct of target mm + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying lookup behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * @locked: pointer to lock flag indicating whether lock is held and + * subsequently whether VM_FAULT_RETRY functionality can be + * utilised. Lock must initially be held. + * + * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See + * get_user_pages_remote() for documentation on the function arguments, because + * the arguments here are identical. + * + * FOLL_PIN means that the pages must be released via unpin_user_page(). Please + * see Documentation/vm/pin_user_pages.rst for details. * * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It * is NOT intended for Case 2 (RDMA: long-term pins). @@ -2543,22 +2889,33 @@ long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { - /* - * This is a placeholder, until the pin functionality is activated. - * Until then, just behave like the corresponding get_user_pages*() - * routine. - */ - return get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags, pages, - vmas, locked); + /* FOLL_GET and FOLL_PIN are mutually exclusive. 
*/ + if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + return -EINVAL; + + gup_flags |= FOLL_PIN; + return __get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags, + pages, vmas, locked); } EXPORT_SYMBOL(pin_user_pages_remote); /** * pin_user_pages() - pin user pages in memory for use by other devices * - * For now, this is a placeholder function, until various call sites are - * converted to use the correct get_user_pages*() or pin_user_pages*() API. So, - * this is identical to get_user_pages(). + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying lookup behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * + * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and + * FOLL_PIN is set. + * + * FOLL_PIN means that the pages must be released via unpin_user_page(). Please + * see Documentation/vm/pin_user_pages.rst for details. * * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It * is NOT intended for Case 2 (RDMA: long-term pins). @@ -2567,11 +2924,12 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { - /* - * This is a placeholder, until the pin functionality is activated. - * Until then, just behave like the corresponding get_user_pages*() - * routine. - */ - return get_user_pages(start, nr_pages, gup_flags, pages, vmas); + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + return -EINVAL; + + gup_flags |= FOLL_PIN; + return __gup_longterm_locked(current, current->mm, start, nr_pages, + pages, vmas, gup_flags); } EXPORT_SYMBOL(pin_user_pages); diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index 8dba38e79a9f..be690fa66a46 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -8,6 +8,8 @@ #define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) #define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark) #define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark) +#define PIN_FAST_BENCHMARK _IOWR('g', 4, struct gup_benchmark) +#define PIN_BENCHMARK _IOWR('g', 5, struct gup_benchmark) struct gup_benchmark { __u64 get_delta_usec; @@ -19,6 +21,48 @@ struct gup_benchmark { __u64 expansion[10]; /* For future use */ }; +static void put_back_pages(unsigned int cmd, struct page **pages, + unsigned long nr_pages) +{ + unsigned long i; + + switch (cmd) { + case GUP_FAST_BENCHMARK: + case GUP_LONGTERM_BENCHMARK: + case GUP_BENCHMARK: + for (i = 0; i < nr_pages; i++) + put_page(pages[i]); + break; + + case PIN_FAST_BENCHMARK: + case PIN_BENCHMARK: + unpin_user_pages(pages, nr_pages); + break; + } +} + +static void verify_dma_pinned(unsigned int cmd, struct page **pages, + unsigned long nr_pages) +{ + unsigned long i; + struct page *page; + + switch (cmd) { + case PIN_FAST_BENCHMARK: + case PIN_BENCHMARK: + for (i = 0; i < nr_pages; i++) { + page = pages[i]; + if (WARN(!page_maybe_dma_pinned(page), + "pages[%lu] is NOT dma-pinned\n", i)) { + + dump_page(page, "gup_benchmark failure"); + break; + } + } + break; + } +} + static int __gup_benchmark_ioctl(unsigned int cmd, struct gup_benchmark *gup) { @@ -66,6 +110,14 @@ static int __gup_benchmark_ioctl(unsigned int cmd, nr = get_user_pages(addr, nr, 
gup->flags, pages + i, NULL); break; + case PIN_FAST_BENCHMARK: + nr = pin_user_pages_fast(addr, nr, gup->flags, + pages + i); + break; + case PIN_BENCHMARK: + nr = pin_user_pages(addr, nr, gup->flags, pages + i, + NULL); + break; default: kvfree(pages); ret = -EINVAL; @@ -78,15 +130,22 @@ static int __gup_benchmark_ioctl(unsigned int cmd, } end_time = ktime_get(); + /* Shifting the meaning of nr_pages: now it is actual number pinned: */ + nr_pages = i; + gup->get_delta_usec = ktime_us_delta(end_time, start_time); gup->size = addr - gup->addr; + /* + * Take an un-benchmark-timed moment to verify DMA pinned + * state: print a warning if any non-dma-pinned pages are found: + */ + verify_dma_pinned(cmd, pages, nr_pages); + start_time = ktime_get(); - for (i = 0; i < nr_pages; i++) { - if (!pages[i]) - break; - put_page(pages[i]); - } + + put_back_pages(cmd, pages, nr_pages); + end_time = ktime_get(); gup->put_delta_usec = ktime_us_delta(end_time, start_time); @@ -105,6 +164,8 @@ static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd, case GUP_FAST_BENCHMARK: case GUP_LONGTERM_BENCHMARK: case GUP_BENCHMARK: + case PIN_FAST_BENCHMARK: + case PIN_BENCHMARK: break; default: return -EINVAL; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 24ad53b4dfc0..b1e069e68189 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -958,6 +958,11 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, */ WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set"); + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == + (FOLL_PIN | FOLL_GET))) + return NULL; + if (flags & FOLL_WRITE && !pmd_write(*pmd)) return NULL; @@ -973,7 +978,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, * device mapped pages can only be returned if the * caller will manage the page reference count. */ - if (!(flags & FOLL_GET)) + if (!(flags & (FOLL_GET | FOLL_PIN))) return ERR_PTR(-EEXIST); pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; @@ -981,7 +986,8 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); - get_page(page); + if (!try_grab_page(page, flags)) + page = ERR_PTR(-ENOMEM); return page; } @@ -1101,6 +1107,11 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, if (flags & FOLL_WRITE && !pud_write(*pud)) return NULL; + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == + (FOLL_PIN | FOLL_GET))) + return NULL; + if (pud_present(*pud) && pud_devmap(*pud)) /* pass */; else @@ -1112,8 +1123,10 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, /* * device mapped pages can only be returned if the * caller will manage the page reference count. 
+ * + * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here: */ - if (!(flags & FOLL_GET)) + if (!(flags & (FOLL_GET | FOLL_PIN))) return ERR_PTR(-EEXIST); pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; @@ -1121,7 +1134,8 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); - get_page(page); + if (!try_grab_page(page, flags)) + page = ERR_PTR(-ENOMEM); return page; } @@ -1497,8 +1511,13 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, page = pmd_page(*pmd); VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); + + if (!try_grab_page(page, flags)) + return ERR_PTR(-ENOMEM); + if (flags & FOLL_TOUCH) touch_pmd(vma, addr, pmd, flags); + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* * We don't mlock() pte-mapped THPs. This way we can avoid @@ -1535,8 +1554,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, skip_mlock: page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); - if (flags & FOLL_GET) - get_page(page); out: return page; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dd8737a94bec..f9ea1e5197b4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -220,132 +220,303 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) return subpool_inode(file_inode(vma->vm_file)); } -/* - * Region tracking -- allows tracking of reservations and instantiated pages - * across the pages in a mapping. - * - * The region data structures are embedded into a resv_map and protected - * by a resv_map's lock. The set of regions within the resv_map represent - * reservations for huge pages, or huge pages that have already been - * instantiated within the map. The from and to elements are huge page - * indicies into the associated mapping. from indicates the starting index - * of the region. to represents the first index past the end of the region. - * - * For example, a file region structure with from == 0 and to == 4 represents - * four huge pages in a mapping. It is important to note that the to element - * represents the first element past the end of the region. This is used in - * arithmetic as 4(to) - 0(from) = 4 huge pages in the region. - * - * Interval notation of the form [from, to) will be used to indicate that - * the endpoint from is inclusive and to is exclusive. +/* Helper that removes a struct file_region from the resv_map cache and returns + * it for use. */ -struct file_region { - struct list_head link; - long from; - long to; -}; +static struct file_region * +get_file_region_entry_from_cache(struct resv_map *resv, long from, long to) +{ + struct file_region *nrg = NULL; + + VM_BUG_ON(resv->region_cache_count <= 0); + + resv->region_cache_count--; + nrg = list_first_entry(&resv->region_cache, struct file_region, link); + VM_BUG_ON(!nrg); + list_del(&nrg->link); + + nrg->from = from; + nrg->to = to; + + return nrg; +} + +static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg, + struct file_region *rg) +{ +#ifdef CONFIG_CGROUP_HUGETLB + nrg->reservation_counter = rg->reservation_counter; + nrg->css = rg->css; + if (rg->css) + css_get(rg->css); +#endif +} + +/* Helper that records hugetlb_cgroup uncharge info. 
*/ +static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg, + struct hstate *h, + struct resv_map *resv, + struct file_region *nrg) +{ +#ifdef CONFIG_CGROUP_HUGETLB + if (h_cg) { + nrg->reservation_counter = + &h_cg->rsvd_hugepage[hstate_index(h)]; + nrg->css = &h_cg->css; + if (!resv->pages_per_hpage) + resv->pages_per_hpage = pages_per_huge_page(h); + /* pages_per_hpage should be the same for all entries in + * a resv_map. + */ + VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h)); + } else { + nrg->reservation_counter = NULL; + nrg->css = NULL; + } +#endif +} + +static bool has_same_uncharge_info(struct file_region *rg, + struct file_region *org) +{ +#ifdef CONFIG_CGROUP_HUGETLB + return rg && org && + rg->reservation_counter == org->reservation_counter && + rg->css == org->css; + +#else + return true; +#endif +} + +static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) +{ + struct file_region *nrg = NULL, *prg = NULL; + + prg = list_prev_entry(rg, link); + if (&prg->link != &resv->regions && prg->to == rg->from && + has_same_uncharge_info(prg, rg)) { + prg->to = rg->to; + + list_del(&rg->link); + kfree(rg); + + coalesce_file_region(resv, prg); + return; + } + + nrg = list_next_entry(rg, link); + if (&nrg->link != &resv->regions && nrg->from == rg->to && + has_same_uncharge_info(nrg, rg)) { + nrg->from = rg->from; + + list_del(&rg->link); + kfree(rg); + + coalesce_file_region(resv, nrg); + return; + } +} /* Must be called with resv->lock held. Calling this with count_only == true * will count the number of pages to be added but will not modify the linked - * list. + * list. If regions_needed != NULL and count_only == true, then regions_needed + * will indicate the number of file_regions needed in the cache to carry out to + * add the regions for this range. */ static long add_reservation_in_range(struct resv_map *resv, long f, long t, + struct hugetlb_cgroup *h_cg, + struct hstate *h, long *regions_needed, bool count_only) { - long chg = 0; + long add = 0; struct list_head *head = &resv->regions; + long last_accounted_offset = f; struct file_region *rg = NULL, *trg = NULL, *nrg = NULL; - /* Locate the region we are before or in. */ - list_for_each_entry(rg, head, link) - if (f <= rg->to) - break; - - /* Round our left edge to the current segment if it encloses us. */ - if (f > rg->from) - f = rg->from; + if (regions_needed) + *regions_needed = 0; - chg = t - f; + /* In this loop, we essentially handle an entry for the range + * [last_accounted_offset, rg->from), at every iteration, with some + * bounds checking. + */ + list_for_each_entry_safe(rg, trg, head, link) { + /* Skip irrelevant regions that start before our range. */ + if (rg->from < f) { + /* If this region ends after the last accounted offset, + * then we need to update last_accounted_offset. + */ + if (rg->to > last_accounted_offset) + last_accounted_offset = rg->to; + continue; + } - /* Check for and consume any regions we now overlap with. */ - nrg = rg; - list_for_each_entry_safe(rg, trg, rg->link.prev, link) { - if (&rg->link == head) - break; + /* When we find a region that starts beyond our range, we've + * finished. + */ if (rg->from > t) break; - /* We overlap with this area, if it extends further than - * us then we must extend ourselves. Account for its - * existing reservation. + /* Add an entry for last_accounted_offset -> rg->from, and + * update last_accounted_offset. 
+ */ + if (rg->from > last_accounted_offset) { + add += rg->from - last_accounted_offset; + if (!count_only) { + nrg = get_file_region_entry_from_cache( + resv, last_accounted_offset, rg->from); + record_hugetlb_cgroup_uncharge_info(h_cg, h, + resv, nrg); + list_add(&nrg->link, rg->link.prev); + coalesce_file_region(resv, nrg); + } else if (regions_needed) + *regions_needed += 1; + } + + last_accounted_offset = rg->to; + } + + /* Handle the case where our range extends beyond + * last_accounted_offset. + */ + if (last_accounted_offset < t) { + add += t - last_accounted_offset; + if (!count_only) { + nrg = get_file_region_entry_from_cache( + resv, last_accounted_offset, t); + record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg); + list_add(&nrg->link, rg->link.prev); + coalesce_file_region(resv, nrg); + } else if (regions_needed) + *regions_needed += 1; + } + + VM_BUG_ON(add < 0); + return add; +} + +/* Must be called with resv->lock acquired. Will drop lock to allocate entries. + */ +static int allocate_file_region_entries(struct resv_map *resv, + int regions_needed) + __must_hold(&resv->lock) +{ + struct list_head allocated_regions; + int to_allocate = 0, i = 0; + struct file_region *trg = NULL, *rg = NULL; + + VM_BUG_ON(regions_needed < 0); + + INIT_LIST_HEAD(&allocated_regions); + + /* + * Check for sufficient descriptors in the cache to accommodate + * the number of in progress add operations plus regions_needed. + * + * This is a while loop because when we drop the lock, some other call + * to region_add or region_del may have consumed some region_entries, + * so we keep looping here until we finally have enough entries for + * (adds_in_progress + regions_needed). + */ + while (resv->region_cache_count < + (resv->adds_in_progress + regions_needed)) { + to_allocate = resv->adds_in_progress + regions_needed - + resv->region_cache_count; + + /* At this point, we should have enough entries in the cache + * for all the existings adds_in_progress. We should only be + * needing to allocate for regions_needed. */ - if (rg->to > t) { - chg += rg->to - t; - t = rg->to; + VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress); + + spin_unlock(&resv->lock); + for (i = 0; i < to_allocate; i++) { + trg = kmalloc(sizeof(*trg), GFP_KERNEL); + if (!trg) + goto out_of_memory; + list_add(&trg->link, &allocated_regions); } - chg -= rg->to - rg->from; - if (!count_only && rg != nrg) { + spin_lock(&resv->lock); + + list_for_each_entry_safe(rg, trg, &allocated_regions, link) { list_del(&rg->link); - kfree(rg); + list_add(&rg->link, &resv->region_cache); + resv->region_cache_count++; } } - if (!count_only) { - nrg->from = f; - nrg->to = t; - } + return 0; - return chg; +out_of_memory: + list_for_each_entry_safe(rg, trg, &allocated_regions, link) { + list_del(&rg->link); + kfree(rg); + } + return -ENOMEM; } /* * Add the huge page range represented by [f, t) to the reserve - * map. Existing regions will be expanded to accommodate the specified - * range, or a region will be taken from the cache. Sufficient regions - * must exist in the cache due to the previous call to region_chg with - * the same range. + * map. Regions will be taken from the cache to fill in this range. + * Sufficient regions should exist in the cache due to the previous + * call to region_chg with the same range, but in some cases the cache will not + * have sufficient entries due to races with other code doing region_add or + * region_del. The extra needed entries will be allocated. 
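The rewritten add_reservation_in_range() above walks the sorted region list once, treating everything between last_accounted_offset and the next region's start as a gap that is either just counted (count_only / regions_needed) or filled with an entry pulled from the cache and then coalesced with its neighbours. A compact userspace sketch of the counting half, using a plain array instead of the kernel's linked list (names are illustrative only):

#include <stdio.h>

struct region { long from, to; };   /* [from, to), sorted, non-overlapping */

/* Count how many pages in [f, t) are NOT yet covered by any region. */
static long count_uncovered(const struct region *rgs, int nr, long f, long t)
{
        long last = f, missing = 0;

        for (int i = 0; i < nr; i++) {
                if (rgs[i].from < f) {          /* starts before our range */
                        if (rgs[i].to > last)
                                last = rgs[i].to;   /* may still cover part */
                        continue;
                }
                if (rgs[i].from > t)
                        break;                  /* past the range: done */
                if (rgs[i].from > last)
                        missing += rgs[i].from - last;  /* gap before it */
                last = rgs[i].to;
        }
        if (last < t)
                missing += t - last;            /* trailing gap */
        return missing;
}

int main(void)
{
        struct region map[] = { {2, 4}, {6, 7} };

        /* [0,10) minus [2,4) and [6,7) leaves 7 uncovered pages. */
        printf("%ld\n", count_uncovered(map, 2, 0, 10));
        return 0;
}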
* - * Return the number of new huge pages added to the map. This - * number is greater than or equal to zero. + * regions_needed is the out value provided by a previous call to region_chg. + * + * Return the number of new huge pages added to the map. This number is greater + * than or equal to zero. If file_region entries needed to be allocated for + * this operation and we were not able to allocate, it ruturns -ENOMEM. + * region_add of regions of length 1 never allocate file_regions and cannot + * fail; region_chg will always allocate at least 1 entry and a region_add for + * 1 page will only require at most 1 entry. */ -static long region_add(struct resv_map *resv, long f, long t) +static long region_add(struct resv_map *resv, long f, long t, + long in_regions_needed, struct hstate *h, + struct hugetlb_cgroup *h_cg) { - struct list_head *head = &resv->regions; - struct file_region *rg, *nrg; - long add = 0; + long add = 0, actual_regions_needed = 0; spin_lock(&resv->lock); - /* Locate the region we are either in or before. */ - list_for_each_entry(rg, head, link) - if (f <= rg->to) - break; +retry: + + /* Count how many regions are actually needed to execute this add. */ + add_reservation_in_range(resv, f, t, NULL, NULL, &actual_regions_needed, + true); /* - * If no region exists which can be expanded to include the - * specified range, pull a region descriptor from the cache - * and use it for this range. + * Check for sufficient descriptors in the cache to accommodate + * this add operation. Note that actual_regions_needed may be greater + * than in_regions_needed, as the resv_map may have been modified since + * the region_chg call. In this case, we need to make sure that we + * allocate extra entries, such that we have enough for all the + * existing adds_in_progress, plus the excess needed for this + * operation. */ - if (&rg->link == head || t < rg->from) { - VM_BUG_ON(resv->region_cache_count <= 0); - - resv->region_cache_count--; - nrg = list_first_entry(&resv->region_cache, struct file_region, - link); - list_del(&nrg->link); + if (actual_regions_needed > in_regions_needed && + resv->region_cache_count < + resv->adds_in_progress + + (actual_regions_needed - in_regions_needed)) { + /* region_add operation of range 1 should never need to + * allocate file_region entries. + */ + VM_BUG_ON(t - f <= 1); - nrg->from = f; - nrg->to = t; - list_add(&nrg->link, rg->link.prev); + if (allocate_file_region_entries( + resv, actual_regions_needed - in_regions_needed)) { + return -ENOMEM; + } - add += t - f; - goto out_locked; + goto retry; } - add = add_reservation_in_range(resv, f, t, false); + add = add_reservation_in_range(resv, f, t, h_cg, h, NULL, false); + + resv->adds_in_progress -= in_regions_needed; -out_locked: - resv->adds_in_progress--; spin_unlock(&resv->lock); VM_BUG_ON(add < 0); return add; @@ -358,46 +529,37 @@ out_locked: * call to region_add that will actually modify the reserve * map to add the specified range [f, t). region_chg does * not change the number of huge pages represented by the - * map. A new file_region structure is added to the cache - * as a placeholder, so that the subsequent region_add - * call will have all the regions it needs and will not fail. + * map. A number of new file_region structures is added to the cache as a + * placeholder, for the subsequent region_add call to use. At least 1 + * file_region structure is added. + * + * out_regions_needed is the number of regions added to the + * resv->adds_in_progress. 
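allocate_file_region_entries() and the retry in region_add() (both above) share one discipline: the shortfall is computed under resv->lock, the lock is dropped to allocate, and after re-taking the lock the condition is re-checked, because concurrent region_add()/region_del() callers may have drained the cache in the meantime. A self-contained sketch of that drop-lock, allocate, re-check loop using pthreads; the counters and structures are stand-ins, not the kernel's:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int cache_count;          /* entries sitting in the cache */
static int in_progress;          /* entries promised to pending adds */

/* Ensure the cache covers all pending adds plus 'needed' new entries.
 * Called with 'lock' held; returns with 'lock' held. */
static int top_up_cache(int needed)
{
        while (cache_count < in_progress + needed) {
                int shortfall = in_progress + needed - cache_count;

                pthread_mutex_unlock(&lock);    /* can't allocate under it */
                void **batch = calloc(shortfall, sizeof(void *));
                pthread_mutex_lock(&lock);

                if (!batch)
                        return -1;
                /* Others may have drained the cache while it was dropped;
                 * the while condition re-checks with the lock held. */
                cache_count += shortfall;
                free(batch);    /* a real cache would keep these entries */
        }
        return 0;
}

int main(void)
{
        pthread_mutex_lock(&lock);
        in_progress = 2;
        if (top_up_cache(3) == 0)
                printf("cache entries: %d\n", cache_count);
        pthread_mutex_unlock(&lock);
        return 0;
}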
This value needs to be provided to a follow up call + * to region_add or region_abort for proper accounting. * * Returns the number of huge pages that need to be added to the existing * reservation map for the range [f, t). This number is greater or equal to * zero. -ENOMEM is returned if a new file_region structure or cache entry * is needed and can not be allocated. */ -static long region_chg(struct resv_map *resv, long f, long t) +static long region_chg(struct resv_map *resv, long f, long t, + long *out_regions_needed) { long chg = 0; spin_lock(&resv->lock); -retry_locked: - resv->adds_in_progress++; - /* - * Check for sufficient descriptors in the cache to accommodate - * the number of in progress add operations. - */ - if (resv->adds_in_progress > resv->region_cache_count) { - struct file_region *trg; - - VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1); - /* Must drop lock to allocate a new descriptor. */ - resv->adds_in_progress--; - spin_unlock(&resv->lock); + /* Count how many hugepages in this range are NOT respresented. */ + chg = add_reservation_in_range(resv, f, t, NULL, NULL, + out_regions_needed, true); - trg = kmalloc(sizeof(*trg), GFP_KERNEL); - if (!trg) - return -ENOMEM; + if (*out_regions_needed == 0) + *out_regions_needed = 1; - spin_lock(&resv->lock); - list_add(&trg->link, &resv->region_cache); - resv->region_cache_count++; - goto retry_locked; - } + if (allocate_file_region_entries(resv, *out_regions_needed)) + return -ENOMEM; - chg = add_reservation_in_range(resv, f, t, true); + resv->adds_in_progress += *out_regions_needed; spin_unlock(&resv->lock); return chg; @@ -408,17 +570,20 @@ retry_locked: * of the resv_map keeps track of the operations in progress between * calls to region_chg and region_add. Operations are sometimes * aborted after the call to region_chg. In such cases, region_abort - * is called to decrement the adds_in_progress counter. + * is called to decrement the adds_in_progress counter. regions_needed + * is the value returned by the region_chg call, it is used to decrement + * the adds_in_progress counter. * * NOTE: The range arguments [f, t) are not needed or used in this * routine. They are kept to make reading the calling code easier as * arguments will match the associated region_chg call. 
*/ -static void region_abort(struct resv_map *resv, long f, long t) +static void region_abort(struct resv_map *resv, long f, long t, + long regions_needed) { spin_lock(&resv->lock); VM_BUG_ON(!resv->region_cache_count); - resv->adds_in_progress--; + resv->adds_in_progress -= regions_needed; spin_unlock(&resv->lock); } @@ -486,11 +651,17 @@ retry: /* New entry for end of split region */ nrg->from = t; nrg->to = rg->to; + + copy_hugetlb_cgroup_uncharge_info(nrg, rg); + INIT_LIST_HEAD(&nrg->link); /* Original entry is trimmed */ rg->to = f; + hugetlb_cgroup_uncharge_file_region( + resv, rg, nrg->to - nrg->from); + list_add(&nrg->link, &rg->link); nrg = NULL; break; @@ -498,6 +669,8 @@ retry: if (f <= rg->from && t >= rg->to) { /* Remove entire region */ del += rg->to - rg->from; + hugetlb_cgroup_uncharge_file_region(resv, rg, + rg->to - rg->from); list_del(&rg->link); kfree(rg); continue; @@ -506,9 +679,15 @@ retry: if (f <= rg->from) { /* Trim beginning of region */ del += t - rg->from; rg->from = t; + + hugetlb_cgroup_uncharge_file_region(resv, rg, + t - rg->from); } else { /* Trim end of region */ del += rg->to - f; rg->to = f; + + hugetlb_cgroup_uncharge_file_region(resv, rg, + rg->to - f); } } @@ -650,6 +829,25 @@ static void set_vma_private_data(struct vm_area_struct *vma, vma->vm_private_data = (void *)value; } +static void +resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map, + struct hugetlb_cgroup *h_cg, + struct hstate *h) +{ +#ifdef CONFIG_CGROUP_HUGETLB + if (!h_cg || !h) { + resv_map->reservation_counter = NULL; + resv_map->pages_per_hpage = 0; + resv_map->css = NULL; + } else { + resv_map->reservation_counter = + &h_cg->rsvd_hugepage[hstate_index(h)]; + resv_map->pages_per_hpage = pages_per_huge_page(h); + resv_map->css = &h_cg->css; + } +#endif +} + struct resv_map *resv_map_alloc(void) { struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); @@ -666,6 +864,13 @@ struct resv_map *resv_map_alloc(void) INIT_LIST_HEAD(&resv_map->regions); resv_map->adds_in_progress = 0; + /* + * Initialize these to 0. On shared mappings, 0's here indicate these + * fields don't do cgroup accounting. On private mappings, these will be + * re-initialized to the proper values, to indicate that hugetlb cgroup + * reservations are to be un-charged from here. 
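region_del() in the hunk above now also hands back the per-region cgroup charge for what it removes, which is why each case (splitting a region around a hole, removing a whole region, trimming either end) gains its own hugetlb_cgroup_uncharge_file_region() call. The interval arithmetic behind those cases, reduced to a standalone function (array-based and illustrative only, not the resv_map code):

#include <stdio.h>

struct region { long from, to; };   /* [from, to) */

/*
 * Delete [f, t) from one region, writing up to two surviving pieces
 * into out[]. Returns how many pages were removed.
 */
static long del_range(struct region rg, long f, long t,
                      struct region out[2], int *nr_out)
{
        *nr_out = 0;
        if (t <= rg.from || f >= rg.to) {       /* no overlap */
                out[(*nr_out)++] = rg;
                return 0;
        }
        if (f > rg.from && t < rg.to) {         /* hole in the middle: split */
                out[(*nr_out)++] = (struct region){ rg.from, f };
                out[(*nr_out)++] = (struct region){ t, rg.to };
                return t - f;
        }
        if (f <= rg.from && t >= rg.to)         /* covers it entirely */
                return rg.to - rg.from;
        if (f <= rg.from) {                     /* trim the beginning */
                out[(*nr_out)++] = (struct region){ t, rg.to };
                return t - rg.from;
        }
        out[(*nr_out)++] = (struct region){ rg.from, f };  /* trim the end */
        return rg.to - f;
}

int main(void)
{
        struct region out[2];
        int n;
        long removed = del_range((struct region){ 0, 10 }, 3, 7, out, &n);

        printf("removed %ld pages, %d piece(s) left\n", removed, n);
        return 0;
}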
+ */ + resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL); INIT_LIST_HEAD(&resv_map->region_cache); list_add(&rg->link, &resv_map->region_cache); @@ -1009,6 +1214,9 @@ static void destroy_compound_gigantic_page(struct page *page, struct page *p = page + 1; atomic_set(compound_mapcount_ptr(page), 0); + if (hpage_pincount_available(page)) + atomic_set(compound_pincount_ptr(page), 0); + for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { clear_compound_head(p); set_page_refcounted(p); @@ -1069,6 +1277,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) 1 << PG_writeback); } VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); + VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); set_compound_page_dtor(page, NULL_COMPOUND_DTOR); set_page_refcounted(page); if (hstate_is_gigantic(h)) { @@ -1180,6 +1389,8 @@ static void __free_huge_page(struct page *page) clear_page_huge_active(page); hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h), page); + hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), + pages_per_huge_page(h), page); if (restore_reserve) h->resv_huge_pages++; @@ -1254,6 +1465,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); spin_lock(&hugetlb_lock); set_hugetlb_cgroup(page, NULL); + set_hugetlb_cgroup_rsvd(page, NULL); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; spin_unlock(&hugetlb_lock); @@ -1287,6 +1499,9 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order) set_compound_head(p, page); } atomic_set(compound_mapcount_ptr(page), -1); + + if (hpage_pincount_available(page)) + atomic_set(compound_pincount_ptr(page), 0); } /* @@ -1313,7 +1528,107 @@ int PageHeadHuge(struct page *page_head) if (!PageHead(page_head)) return 0; - return get_compound_page_dtor(page_head) == free_huge_page; + return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR; +} + +/* + * Find address_space associated with hugetlbfs page. + * Upon entry page is locked and page 'was' mapped although mapped state + * could change. If necessary, use anon_vma to find vma and associated + * address space. The returned mapping may be stale, but it can not be + * invalid as page lock (which is held) is required to destroy mapping. + */ +static struct address_space *_get_hugetlb_page_mapping(struct page *hpage) +{ + struct anon_vma *anon_vma; + pgoff_t pgoff_start, pgoff_end; + struct anon_vma_chain *avc; + struct address_space *mapping = page_mapping(hpage); + + /* Simple file based mapping */ + if (mapping) + return mapping; + + /* + * Even anonymous hugetlbfs mappings are associated with an + * underlying hugetlbfs file (see hugetlb_file_setup in mmap + * code). Find a vma associated with the anonymous vma, and + * use the file pointer to get address_space. + */ + anon_vma = page_lock_anon_vma_read(hpage); + if (!anon_vma) + return mapping; /* NULL */ + + /* Use first found vma */ + pgoff_start = page_to_pgoff(hpage); + pgoff_end = pgoff_start + hpage_nr_pages(hpage) - 1; + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, + pgoff_start, pgoff_end) { + struct vm_area_struct *vma = avc->vma; + + mapping = vma->vm_file->f_mapping; + break; + } + + anon_vma_unlock_read(anon_vma); + return mapping; +} + +/* + * Find and lock address space (mapping) in write mode. + * + * Upon entry, the page is locked which allows us to find the mapping + * even in the case of an anon page. 
However, locking order dictates + * the i_mmap_rwsem be acquired BEFORE the page lock. This is hugetlbfs + * specific. So, we first try to lock the sema while still holding the + * page lock. If this works, great! If not, then we need to drop the + * page lock and then acquire i_mmap_rwsem and reacquire page lock. Of + * course, need to revalidate state along the way. + */ +struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) +{ + struct address_space *mapping, *mapping2; + + mapping = _get_hugetlb_page_mapping(hpage); +retry: + if (!mapping) + return mapping; + + /* + * If no contention, take lock and return + */ + if (i_mmap_trylock_write(mapping)) + return mapping; + + /* + * Must drop page lock and wait on mapping sema. + * Note: Once page lock is dropped, mapping could become invalid. + * As a hack, increase map count until we lock page again. + */ + atomic_inc(&hpage->_mapcount); + unlock_page(hpage); + i_mmap_lock_write(mapping); + lock_page(hpage); + atomic_add_negative(-1, &hpage->_mapcount); + + /* verify page is still mapped */ + if (!page_mapped(hpage)) { + i_mmap_unlock_write(mapping); + return NULL; + } + + /* + * Get address space again and verify it is the same one + * we locked. If not, drop lock and retry. + */ + mapping2 = _get_hugetlb_page_mapping(hpage); + if (mapping2 != mapping) { + i_mmap_unlock_write(mapping); + mapping = mapping2; + goto retry; + } + + return mapping; } pgoff_t __basepage_index(struct page *page) @@ -1870,6 +2185,7 @@ static long __vma_reservation_common(struct hstate *h, struct resv_map *resv; pgoff_t idx; long ret; + long dummy_out_regions_needed; resv = vma_resv_map(vma); if (!resv) @@ -1878,20 +2194,29 @@ static long __vma_reservation_common(struct hstate *h, idx = vma_hugecache_offset(h, vma, addr); switch (mode) { case VMA_NEEDS_RESV: - ret = region_chg(resv, idx, idx + 1); + ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed); + /* We assume that vma_reservation_* routines always operate on + * 1 page, and that adding to resv map a 1 page entry can only + * ever require 1 region. + */ + VM_BUG_ON(dummy_out_regions_needed != 1); break; case VMA_COMMIT_RESV: - ret = region_add(resv, idx, idx + 1); + ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); + /* region_add calls of range 1 should never fail. */ + VM_BUG_ON(ret < 0); break; case VMA_END_RESV: - region_abort(resv, idx, idx + 1); + region_abort(resv, idx, idx + 1, 1); ret = 0; break; case VMA_ADD_RESV: - if (vma->vm_flags & VM_MAYSHARE) - ret = region_add(resv, idx, idx + 1); - else { - region_abort(resv, idx, idx + 1); + if (vma->vm_flags & VM_MAYSHARE) { + ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); + /* region_add calls of range 1 should never fail. */ + VM_BUG_ON(ret < 0); + } else { + region_abort(resv, idx, idx + 1, 1); ret = region_del(resv, idx, idx + 1); } break; @@ -2002,6 +2327,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, long gbl_chg; int ret, idx; struct hugetlb_cgroup *h_cg; + bool deferred_reserve; idx = hstate_index(h); /* @@ -2039,9 +2365,19 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, gbl_chg = 1; } + /* If this allocation is not consuming a reservation, charge it now. 
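hugetlb_page_mapping_lock_write() above works around a lock-ordering problem: i_mmap_rwsem nests outside the page lock, yet the caller already holds the page lock, so the function trylocks first and only on contention drops the page lock, takes both locks in the documented order, and re-validates (page still mapped, mapping unchanged) before trusting anything. That try-then-reorder pattern in a generic, self-contained pthreads form (the lock names and the validity callback are invented for the sketch):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER;  /* nests outside */
static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER;  /* ...this one */

/*
 * Called with 'inner' held.  Returns true with both locks held, or false
 * (with only 'inner' held) if the protected state changed while dropped.
 */
static bool lock_outer_while_holding_inner(bool (*still_valid)(void))
{
        if (pthread_mutex_trylock(&outer) == 0)
                return true;                    /* fast path: no contention */

        /* Slow path: release, take them in the documented order, re-check. */
        pthread_mutex_unlock(&inner);
        pthread_mutex_lock(&outer);
        pthread_mutex_lock(&inner);

        if (!still_valid()) {                   /* the world may have moved */
                pthread_mutex_unlock(&outer);
                return false;
        }
        return true;
}

static bool always_valid(void) { return true; }

int main(void)
{
        pthread_mutex_lock(&inner);
        if (lock_outer_while_holding_inner(always_valid)) {
                printf("both locks held in the right order\n");
                pthread_mutex_unlock(&outer);
        }
        pthread_mutex_unlock(&inner);
        return 0;
}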
+ */ + deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma); + if (deferred_reserve) { + ret = hugetlb_cgroup_charge_cgroup_rsvd( + idx, pages_per_huge_page(h), &h_cg); + if (ret) + goto out_subpool_put; + } + ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); if (ret) - goto out_subpool_put; + goto out_uncharge_cgroup_reservation; spin_lock(&hugetlb_lock); /* @@ -2064,6 +2400,14 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, /* Fall through */ } hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); + /* If allocation is not consuming a reservation, also store the + * hugetlb_cgroup pointer on the page. + */ + if (deferred_reserve) { + hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), + h_cg, page); + } + spin_unlock(&hugetlb_lock); set_page_private(page, (unsigned long)spool); @@ -2088,6 +2432,10 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); +out_uncharge_cgroup_reservation: + if (deferred_reserve) + hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), + h_cg); out_subpool_put: if (map_chg || avoid_reserve) hugepage_subpool_put_pages(spool, 1); @@ -3188,9 +3536,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) end = vma_hugecache_offset(h, vma, vma->vm_end); reserve = (end - start) - region_count(resv, start, end); - - kref_put(&resv->refs, resv_map_release); - + hugetlb_cgroup_uncharge_counter(resv, start, end); if (reserve) { /* * Decrement reserve counts. The global reserve count may be @@ -3199,6 +3545,8 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) gbl_reserve = hugepage_subpool_put_pages(spool, reserve); hugetlb_acct_memory(h, -gbl_reserve); } + + kref_put(&resv->refs, resv_map_release); } static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) @@ -3306,6 +3654,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, int cow; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); + struct address_space *mapping = vma->vm_file->f_mapping; struct mmu_notifier_range range; int ret = 0; @@ -3316,6 +3665,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, vma->vm_start, vma->vm_end); mmu_notifier_invalidate_range_start(&range); + } else { + /* + * For shared mappings i_mmap_rwsem must be held to call + * huge_pte_alloc, otherwise the returned ptep could go + * away if part of a shared pmd and another thread calls + * huge_pmd_unshare. + */ + i_mmap_lock_read(mapping); } for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { @@ -3393,6 +3750,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, if (cow) mmu_notifier_invalidate_range_end(&range); + else + i_mmap_unlock_read(mapping); return ret; } @@ -3812,16 +4171,17 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, } /* - * Use page lock to guard against racing truncation - * before we get page_table_lock. + * We can not race with truncation due to holding i_mmap_rwsem. + * i_size is modified when holding i_mmap_rwsem, so check here + * once for faults beyond end of file. 
*/ + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto out; + retry: page = find_lock_page(mapping, idx); if (!page) { - size = i_size_read(mapping->host) >> huge_page_shift(h); - if (idx >= size) - goto out; - /* * Check for page in userfault range */ @@ -3841,13 +4201,15 @@ retry: }; /* - * hugetlb_fault_mutex must be dropped before - * handling userfault. Reacquire after handling - * fault to make calling code simpler. + * hugetlb_fault_mutex and i_mmap_rwsem must be + * dropped before handling userfault. Reacquire + * after handling fault to make calling code simpler. */ hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); ret = handle_userfault(&vmf, VM_UFFD_MISSING); + i_mmap_lock_read(mapping); mutex_lock(&hugetlb_fault_mutex_table[hash]); goto out; } @@ -3925,10 +4287,6 @@ retry: } ptl = huge_pte_lock(h, mm, ptep); - size = i_size_read(mapping->host) >> huge_page_shift(h); - if (idx >= size) - goto backout; - ret = 0; if (!huge_pte_none(huge_ptep_get(ptep))) goto backout; @@ -4012,6 +4370,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (ptep) { + /* + * Since we hold no locks, ptep could be stale. That is + * OK as we are only making decisions based on content and + * not actually modifying content here. + */ entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { migration_entry_wait_huge(vma, mm, ptep); @@ -4025,14 +4388,31 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; } + /* + * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold + * until finished with ptep. This serves two purposes: + * 1) It prevents huge_pmd_unshare from being called elsewhere + * and making the ptep no longer valid. + * 2) It synchronizes us with i_size modifications during truncation. + * + * ptep could have already be assigned via huge_pte_offset. That + * is OK, as huge_pte_alloc will return the same value unless + * something has changed. + */ mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, haddr); + i_mmap_lock_read(mapping); + ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); + if (!ptep) { + i_mmap_unlock_read(mapping); + return VM_FAULT_OOM; + } /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ + idx = vma_hugecache_offset(h, vma, haddr); hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -4120,6 +4500,7 @@ out_ptl: } out_mutex: mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); /* * Generally it's safe to hold refcount during waiting page lock. 
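Both fault-path hunks above lean on hugetlb_fault_mutex_hash(mapping, idx): concurrent faults on the same (mapping, index) pair serialize on one mutex picked from a table, so unrelated faults do not fight over a single global lock, and the comments spell out that this mutex (and now i_mmap_rwsem) has to be dropped around handle_userfault() and re-taken afterwards. A toy version of such a hashed mutex table; the table size and hash function here are made up for the illustration:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NR_FAULT_MUTEXES 64             /* power of two, arbitrary here */

static pthread_mutex_t fault_mutex[NR_FAULT_MUTEXES];

/* Hash a (mapping, index) pair onto one mutex in the table. */
static unsigned int fault_mutex_hash(const void *mapping, unsigned long idx)
{
        uintptr_t key = (uintptr_t)mapping ^ (idx * 0x9e3779b97f4a7c15ull);

        return (unsigned int)(key & (NR_FAULT_MUTEXES - 1));
}

int main(void)
{
        int dummy_mapping;
        unsigned int hash;

        for (int i = 0; i < NR_FAULT_MUTEXES; i++)
                pthread_mutex_init(&fault_mutex[i], NULL);

        hash = fault_mutex_hash(&dummy_mapping, 42);
        pthread_mutex_lock(&fault_mutex[hash]);
        /* ...handle the fault for (mapping, 42); same-index faults wait... */
        pthread_mutex_unlock(&fault_mutex[hash]);
        printf("used mutex %u\n", hash);
        return 0;
}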
But * here we just wait to defer the next page fault to avoid busy loop and @@ -4266,7 +4647,7 @@ out_release_nounlock: long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, unsigned long *nr_pages, - long i, unsigned int flags, int *nonblocking) + long i, unsigned int flags, int *locked) { unsigned long pfn_offset; unsigned long vaddr = *position; @@ -4337,14 +4718,17 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(ptl); if (flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; - if (nonblocking) - fault_flags |= FAULT_FLAG_ALLOW_RETRY; + if (locked) + fault_flags |= FAULT_FLAG_ALLOW_RETRY | + FAULT_FLAG_KILLABLE; if (flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; if (flags & FOLL_TRIED) { - VM_WARN_ON_ONCE(fault_flags & - FAULT_FLAG_ALLOW_RETRY); + /* + * Note: FAULT_FLAG_ALLOW_RETRY and + * FAULT_FLAG_TRIED can co-exist + */ fault_flags |= FAULT_FLAG_TRIED; } ret = hugetlb_fault(mm, vma, vaddr, fault_flags); @@ -4354,9 +4738,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } if (ret & VM_FAULT_RETRY) { - if (nonblocking && + if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) - *nonblocking = 0; + *locked = 0; *nr_pages = 0; /* * VM_FAULT_RETRY must not return an @@ -4376,19 +4760,6 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pte_page(huge_ptep_get(pte)); /* - * Instead of doing 'try_get_page()' below in the same_page - * loop, just check the count once here. - */ - if (unlikely(page_count(page) <= 0)) { - if (pages) { - spin_unlock(ptl); - remainder = 0; - err = -ENOMEM; - break; - } - } - - /* * If subpage information not requested, update counters * and skip the same_page loop below. */ @@ -4405,7 +4776,22 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, same_page: if (pages) { pages[i] = mem_map_offset(page, pfn_offset); - get_page(pages[i]); + /* + * try_grab_page() should always succeed here, because: + * a) we hold the ptl lock, and b) we've just checked + * that the huge page is present in the page tables. If + * the huge page is present, then the tail pages must + * also be present. The ptl prevents the head page and + * tail pages from being rearranged in any way. So this + * page must be available at this point, unless the page + * refcount overflowed: + */ + if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) { + spin_unlock(ptl); + remainder = 0; + err = -ENOMEM; + break; + } } if (vmas) @@ -4541,11 +4927,12 @@ int hugetlb_reserve_pages(struct inode *inode, struct vm_area_struct *vma, vm_flags_t vm_flags) { - long ret, chg; + long ret, chg, add = -1; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map; - long gbl_reserve; + struct hugetlb_cgroup *h_cg = NULL; + long gbl_reserve, regions_needed = 0; /* This should never happen */ if (from > to) { @@ -4575,9 +4962,10 @@ int hugetlb_reserve_pages(struct inode *inode, */ resv_map = inode_resv_map(inode); - chg = region_chg(resv_map, from, to); + chg = region_chg(resv_map, from, to, ®ions_needed); } else { + /* Private mapping. 
*/ resv_map = resv_map_alloc(); if (!resv_map) return -ENOMEM; @@ -4593,6 +4981,21 @@ int hugetlb_reserve_pages(struct inode *inode, goto out_err; } + ret = hugetlb_cgroup_charge_cgroup_rsvd( + hstate_index(h), chg * pages_per_huge_page(h), &h_cg); + + if (ret < 0) { + ret = -ENOMEM; + goto out_err; + } + + if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) { + /* For private mappings, the hugetlb_cgroup uncharge info hangs + * of the resv_map. + */ + resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h); + } + /* * There must be enough pages in the subpool for the mapping. If * the subpool has a minimum size, there may be some global @@ -4601,7 +5004,7 @@ int hugetlb_reserve_pages(struct inode *inode, gbl_reserve = hugepage_subpool_get_pages(spool, chg); if (gbl_reserve < 0) { ret = -ENOSPC; - goto out_err; + goto out_uncharge_cgroup; } /* @@ -4610,9 +5013,7 @@ int hugetlb_reserve_pages(struct inode *inode, */ ret = hugetlb_acct_memory(h, gbl_reserve); if (ret < 0) { - /* put back original number of pages, chg */ - (void)hugepage_subpool_put_pages(spool, chg); - goto out_err; + goto out_put_pages; } /* @@ -4627,9 +5028,12 @@ int hugetlb_reserve_pages(struct inode *inode, * else has to be done for private mappings here */ if (!vma || vma->vm_flags & VM_MAYSHARE) { - long add = region_add(resv_map, from, to); + add = region_add(resv_map, from, to, regions_needed, h, h_cg); - if (unlikely(chg > add)) { + if (unlikely(add < 0)) { + hugetlb_acct_memory(h, -gbl_reserve); + goto out_put_pages; + } else if (unlikely(chg > add)) { /* * pages in this range were added to the reserve * map between region_chg and region_add. This @@ -4639,17 +5043,29 @@ int hugetlb_reserve_pages(struct inode *inode, */ long rsv_adjust; + hugetlb_cgroup_uncharge_cgroup_rsvd( + hstate_index(h), + (chg - add) * pages_per_huge_page(h), h_cg); + rsv_adjust = hugepage_subpool_put_pages(spool, chg - add); hugetlb_acct_memory(h, -rsv_adjust); } } return 0; +out_put_pages: + /* put back original number of pages, chg */ + (void)hugepage_subpool_put_pages(spool, chg); +out_uncharge_cgroup: + hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), + chg * pages_per_huge_page(h), h_cg); out_err: if (!vma || vma->vm_flags & VM_MAYSHARE) - /* Don't call region_abort if region_chg failed */ - if (chg >= 0) - region_abort(resv_map, from, to); + /* Only call region_abort if the region_chg succeeded but the + * region_add failed or didn't run. + */ + if (chg >= 0 && add < 0) + region_abort(resv_map, from, to, regions_needed); if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) kref_put(&resv_map->refs, resv_map_release); return ret; @@ -4740,7 +5156,7 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { - unsigned long check_addr = *start; + unsigned long check_addr; if (!(vma->vm_flags & VM_MAYSHARE)) return; @@ -4765,10 +5181,12 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the * !shared pmd case because we can allocate the pmd later as well, it makes the - * code much cleaner. pmd allocation is essential for the shared case because - * pud has to be populated inside the same i_mmap_rwsem section - otherwise - * racing tasks could either miss the sharing (see huge_pte_offset) or select a - * bad pmd for sharing. 
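hugetlb_reserve_pages() above now charges the reservation counter before taking subpool pages, so its error unwinding grows the out_put_pages and out_uncharge_cgroup labels, releasing resources in the reverse of the order they were acquired. That goto-unwind idiom, stripped down to a standalone function with placeholder resources (the malloc() calls stand in for the charge/subpool/accounting steps):

#include <stdio.h>
#include <stdlib.h>

/* Acquire three resources; on failure, release what was taken, in reverse. */
static int reserve_everything(void)
{
        void *charge, *subpool, *accounting;

        charge = malloc(16);                 /* stand-in: cgroup charge */
        if (!charge)
                goto out_err;

        subpool = malloc(16);                /* stand-in: subpool pages */
        if (!subpool)
                goto out_uncharge;

        accounting = malloc(16);             /* stand-in: memory accounting */
        if (!accounting)
                goto out_put_pages;

        printf("all three acquired\n");
        free(accounting);                    /* kept in real code; freed here */
        free(subpool);                       /* only so the toy does not leak */
        free(charge);
        return 0;

out_put_pages:
        free(subpool);
out_uncharge:
        free(charge);
out_err:
        return -1;
}

int main(void)
{
        return reserve_everything() ? 1 : 0;
}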
+ * code much cleaner. + * + * This routine must be called with i_mmap_rwsem held in at least read mode. + * For hugetlbfs, this prevents removal of any page table entries associated + * with the address space. This is important as we are setting up sharing + * based on existing page table entries (mappings). */ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) { @@ -4785,7 +5203,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!vma_shareable(vma, addr)) return (pte_t *)pmd_alloc(mm, pud, addr); - i_mmap_lock_read(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; @@ -4815,7 +5232,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); - i_mmap_unlock_read(mapping); return pte; } @@ -4826,7 +5242,7 @@ out: * indicated by page_count > 1, unmap is achieved by clearing pud and * decrementing the ref count. If count == 1, the pte page is not shared. * - * called with page table lock held. + * Called with page table lock held and i_mmap_rwsem held in write mode. * * returns: 1 successfully unmapped a shared pte page * 0 the underlying pte page is not shared, or it is the last user @@ -4965,6 +5381,12 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, struct page *page = NULL; spinlock_t *ptl; pte_t pte; + + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == + (FOLL_PIN | FOLL_GET))) + return NULL; + retry: ptl = pmd_lockptr(mm, pmd); spin_lock(ptl); @@ -4977,8 +5399,18 @@ retry: pte = huge_ptep_get((pte_t *)pmd); if (pte_present(pte)) { page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); - if (flags & FOLL_GET) - get_page(page); + /* + * try_grab_page() should always succeed here, because: a) we + * hold the pmd (ptl) lock, and b) we've just checked that the + * huge pmd (head) page is present in the page tables. The ptl + * prevents the head page and tail pages from being rearranged + * in any way. So this page must be available at this point, + * unless the page refcount overflowed: + */ + if (WARN_ON_ONCE(!try_grab_page(page, flags))) { + page = NULL; + goto out; + } } else { if (is_hugetlb_entry_migration(pte)) { spin_unlock(ptl); @@ -4999,7 +5431,7 @@ struct page * __weak follow_huge_pud(struct mm_struct *mm, unsigned long address, pud_t *pud, int flags) { - if (flags & FOLL_GET) + if (flags & (FOLL_GET | FOLL_PIN)) return NULL; return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); @@ -5008,7 +5440,7 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, struct page * __weak follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags) { - if (flags & FOLL_GET) + if (flags & (FOLL_GET | FOLL_PIN)) return NULL; return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 5280bcf459af..c2d7ae6cabd1 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -23,29 +23,6 @@ #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> -enum hugetlb_memory_event { - HUGETLB_MAX, - HUGETLB_NR_MEMORY_EVENTS, -}; - -struct hugetlb_cgroup { - struct cgroup_subsys_state css; - - /* - * the counter to account for hugepages from hugetlb. 
- */ - struct page_counter hugepage[HUGE_MAX_HSTATE]; - - atomic_long_t events[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS]; - atomic_long_t events_local[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS]; - - /* Handle for "hugetlb.events" */ - struct cgroup_file events_file[HUGE_MAX_HSTATE]; - - /* Handle for "hugetlb.events.local" */ - struct cgroup_file events_local_file[HUGE_MAX_HSTATE]; -}; - #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) @@ -55,6 +32,27 @@ struct hugetlb_cgroup { static struct hugetlb_cgroup *root_h_cgroup __read_mostly; +static inline struct page_counter * +__hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx, + bool rsvd) +{ + if (rsvd) + return &h_cg->rsvd_hugepage[idx]; + return &h_cg->hugepage[idx]; +} + +static inline struct page_counter * +hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx) +{ + return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false); +} + +static inline struct page_counter * +hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx) +{ + return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true); +} + static inline struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) { @@ -83,8 +81,12 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) int idx; for (idx = 0; idx < hugetlb_max_hstate; idx++) { - if (page_counter_read(&h_cg->hugepage[idx])) + if (page_counter_read( + hugetlb_cgroup_counter_from_cgroup(h_cg, idx)) || + page_counter_read(hugetlb_cgroup_counter_from_cgroup_rsvd( + h_cg, idx))) { return true; + } } return false; } @@ -95,18 +97,34 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, int idx; for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) { - struct page_counter *counter = &h_cgroup->hugepage[idx]; - struct page_counter *parent = NULL; + struct page_counter *fault_parent = NULL; + struct page_counter *rsvd_parent = NULL; unsigned long limit; int ret; - if (parent_h_cgroup) - parent = &parent_h_cgroup->hugepage[idx]; - page_counter_init(counter, parent); + if (parent_h_cgroup) { + fault_parent = hugetlb_cgroup_counter_from_cgroup( + parent_h_cgroup, idx); + rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd( + parent_h_cgroup, idx); + } + page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup, + idx), + fault_parent); + page_counter_init( + hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx), + rsvd_parent); limit = round_down(PAGE_COUNTER_MAX, 1 << huge_page_order(&hstates[idx])); - ret = page_counter_set_max(counter, limit); + + ret = page_counter_set_max( + hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx), + limit); + VM_BUG_ON(ret); + ret = page_counter_set_max( + hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx), + limit); VM_BUG_ON(ret); } } @@ -136,7 +154,6 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) kfree(h_cgroup); } - /* * Should be called with hugetlb_lock held. 
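The hugetlb_cgroup changes above keep two page_counter sets per cgroup, the existing fault counters plus the new reservation ("rsvd") counters, and select between them with a bool rsvd threaded through one shared helper so the charge and uncharge paths are not duplicated. The same shape in miniature: one code path, two counter sets, charged up the parent chain (a simplified standalone model, not the kernel's page_counter API, which also rolls back partial charges atomically):

#include <stdbool.h>
#include <stdio.h>

struct counter { long usage, max; };

struct group {
        struct group *parent;
        struct counter faults;   /* existing hugetlb fault-usage side */
        struct counter rsvd;     /* new reservation side */
};

static struct counter *pick(struct group *g, bool rsvd)
{
        return rsvd ? &g->rsvd : &g->faults;
}

/* Charge 'nr' against the chosen counter here and in every ancestor. */
static bool try_charge(struct group *g, long nr, bool rsvd)
{
        for (struct group *it = g; it; it = it->parent)
                if (pick(it, rsvd)->usage + nr > pick(it, rsvd)->max)
                        return false;
        for (struct group *it = g; it; it = it->parent)
                pick(it, rsvd)->usage += nr;
        return true;
}

int main(void)
{
        struct group root  = { .faults = {0, 100}, .rsvd = {0, 100} };
        struct group child = { .parent = &root,
                               .faults = {0, 100}, .rsvd = {0, 10} };

        printf("fault charge: %d\n", try_charge(&child, 50, false)); /* fits */
        printf("rsvd  charge: %d\n", try_charge(&child, 50, true));  /* over */
        return 0;
}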
* Since we are holding hugetlb_lock, pages cannot get moved from @@ -213,8 +230,9 @@ static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx, !hugetlb_cgroup_is_root(hugetlb)); } -int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, - struct hugetlb_cgroup **ptr) +static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, + struct hugetlb_cgroup **ptr, + bool rsvd) { int ret = 0; struct page_counter *counter; @@ -237,50 +255,103 @@ again: } rcu_read_unlock(); - if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, - &counter)) { + if (!page_counter_try_charge( + __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), + nr_pages, &counter)) { ret = -ENOMEM; hugetlb_event(h_cg, idx, HUGETLB_MAX); + css_put(&h_cg->css); + goto done; } - css_put(&h_cg->css); + /* Reservations take a reference to the css because they do not get + * reparented. + */ + if (!rsvd) + css_put(&h_cg->css); done: *ptr = h_cg; return ret; } +int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, + struct hugetlb_cgroup **ptr) +{ + return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false); +} + +int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, + struct hugetlb_cgroup **ptr) +{ + return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true); +} + /* Should be called with hugetlb_lock held */ -void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, - struct hugetlb_cgroup *h_cg, - struct page *page) +static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg, + struct page *page, bool rsvd) { if (hugetlb_cgroup_disabled() || !h_cg) return; - set_hugetlb_cgroup(page, h_cg); + __set_hugetlb_cgroup(page, h_cg, rsvd); return; } +void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg, + struct page *page) +{ + __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, false); +} + +void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg, + struct page *page) +{ + __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, true); +} + /* * Should be called with hugetlb_lock held */ -void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, - struct page *page) +static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, + struct page *page, bool rsvd) { struct hugetlb_cgroup *h_cg; if (hugetlb_cgroup_disabled()) return; lockdep_assert_held(&hugetlb_lock); - h_cg = hugetlb_cgroup_from_page(page); + h_cg = __hugetlb_cgroup_from_page(page, rsvd); if (unlikely(!h_cg)) return; - set_hugetlb_cgroup(page, NULL); - page_counter_uncharge(&h_cg->hugepage[idx], nr_pages); + __set_hugetlb_cgroup(page, NULL, rsvd); + + page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, + rsvd), + nr_pages); + + if (rsvd) + css_put(&h_cg->css); + return; } -void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, - struct hugetlb_cgroup *h_cg) +void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, + struct page *page) +{ + __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, false); +} + +void hugetlb_cgroup_uncharge_page_rsvd(int idx, unsigned long nr_pages, + struct page *page) +{ + __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, true); +} + +static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg, + bool rsvd) { if (hugetlb_cgroup_disabled() || !h_cg) return; @@ -288,34 +359,91 @@ void 
hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) return; - page_counter_uncharge(&h_cg->hugepage[idx], nr_pages); - return; + page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, + rsvd), + nr_pages); + + if (rsvd) + css_put(&h_cg->css); +} + +void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg) +{ + __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false); +} + +void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg) +{ + __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true); +} + +void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start, + unsigned long end) +{ + if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter || + !resv->css) + return; + + page_counter_uncharge(resv->reservation_counter, + (end - start) * resv->pages_per_hpage); + css_put(resv->css); +} + +void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, + struct file_region *rg, + unsigned long nr_pages) +{ + if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages) + return; + + if (rg->reservation_counter && resv->pages_per_hpage && nr_pages > 0 && + !resv->reservation_counter) { + page_counter_uncharge(rg->reservation_counter, + nr_pages * resv->pages_per_hpage); + css_put(rg->css); + } } enum { RES_USAGE, + RES_RSVD_USAGE, RES_LIMIT, + RES_RSVD_LIMIT, RES_MAX_USAGE, + RES_RSVD_MAX_USAGE, RES_FAILCNT, + RES_RSVD_FAILCNT, }; static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { struct page_counter *counter; + struct page_counter *rsvd_counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)]; + rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)]; switch (MEMFILE_ATTR(cft->private)) { case RES_USAGE: return (u64)page_counter_read(counter) * PAGE_SIZE; + case RES_RSVD_USAGE: + return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE; case RES_LIMIT: return (u64)counter->max * PAGE_SIZE; + case RES_RSVD_LIMIT: + return (u64)rsvd_counter->max * PAGE_SIZE; case RES_MAX_USAGE: return (u64)counter->watermark * PAGE_SIZE; + case RES_RSVD_MAX_USAGE: + return (u64)rsvd_counter->watermark * PAGE_SIZE; case RES_FAILCNT: return counter->failcnt; + case RES_RSVD_FAILCNT: + return rsvd_counter->failcnt; default: BUG(); } @@ -337,10 +465,16 @@ static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v) 1 << huge_page_order(&hstates[idx])); switch (MEMFILE_ATTR(cft->private)) { + case RES_RSVD_USAGE: + counter = &h_cg->rsvd_hugepage[idx]; + /* Fall through. */ case RES_USAGE: val = (u64)page_counter_read(counter); seq_printf(seq, "%llu\n", val * PAGE_SIZE); break; + case RES_RSVD_LIMIT: + counter = &h_cg->rsvd_hugepage[idx]; + /* Fall through. */ case RES_LIMIT: val = (u64)counter->max; if (val == limit) @@ -364,6 +498,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, int ret, idx; unsigned long nr_pages; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); + bool rsvd = false; if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */ return -EINVAL; @@ -377,9 +512,14 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, nr_pages = round_down(nr_pages, 1 << huge_page_order(&hstates[idx])); switch (MEMFILE_ATTR(of_cft(of)->private)) { + case RES_RSVD_LIMIT: + rsvd = true; + /* Fall through. 
*/ case RES_LIMIT: mutex_lock(&hugetlb_limit_mutex); - ret = page_counter_set_max(&h_cg->hugepage[idx], nr_pages); + ret = page_counter_set_max( + __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), + nr_pages); mutex_unlock(&hugetlb_limit_mutex); break; default: @@ -405,18 +545,25 @@ static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { int ret = 0; - struct page_counter *counter; + struct page_counter *counter, *rsvd_counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)]; + rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)]; switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_MAX_USAGE: page_counter_reset_watermark(counter); break; + case RES_RSVD_MAX_USAGE: + page_counter_reset_watermark(rsvd_counter); + break; case RES_FAILCNT: counter->failcnt = 0; break; + case RES_RSVD_FAILCNT: + rsvd_counter->failcnt = 0; + break; default: ret = -EINVAL; break; @@ -471,7 +618,7 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx) struct hstate *h = &hstates[idx]; /* format the size */ - mem_fmt(buf, 32, huge_page_size(h)); + mem_fmt(buf, sizeof(buf), huge_page_size(h)); /* Add the limit file */ cft = &h->cgroup_files_dfl[0]; @@ -481,15 +628,30 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx) cft->write = hugetlb_cgroup_write_dfl; cft->flags = CFTYPE_NOT_ON_ROOT; - /* Add the current usage file */ + /* Add the reservation limit file */ cft = &h->cgroup_files_dfl[1]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT); + cft->seq_show = hugetlb_cgroup_read_u64_max; + cft->write = hugetlb_cgroup_write_dfl; + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* Add the current usage file */ + cft = &h->cgroup_files_dfl[2]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.current", buf); cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); cft->seq_show = hugetlb_cgroup_read_u64_max; cft->flags = CFTYPE_NOT_ON_ROOT; + /* Add the current reservation usage file */ + cft = &h->cgroup_files_dfl[3]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.current", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE); + cft->seq_show = hugetlb_cgroup_read_u64_max; + cft->flags = CFTYPE_NOT_ON_ROOT; + /* Add the events file */ - cft = &h->cgroup_files_dfl[2]; + cft = &h->cgroup_files_dfl[4]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events", buf); cft->private = MEMFILE_PRIVATE(idx, 0); cft->seq_show = hugetlb_events_show; @@ -497,7 +659,7 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx) cft->flags = CFTYPE_NOT_ON_ROOT; /* Add the events.local file */ - cft = &h->cgroup_files_dfl[3]; + cft = &h->cgroup_files_dfl[5]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events.local", buf); cft->private = MEMFILE_PRIVATE(idx, 0); cft->seq_show = hugetlb_events_local_show; @@ -506,7 +668,7 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx) cft->flags = CFTYPE_NOT_ON_ROOT; /* NULL terminate the last cft */ - cft = &h->cgroup_files_dfl[4]; + cft = &h->cgroup_files_dfl[6]; memset(cft, 0, sizeof(*cft)); WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, @@ -520,7 +682,7 @@ static void __init __hugetlb_cgroup_file_legacy_init(int idx) struct hstate *h = &hstates[idx]; /* format the size */ - mem_fmt(buf, 32, huge_page_size(h)); + mem_fmt(buf, sizeof(buf), huge_page_size(h)); /* Add the limit file */ cft = &h->cgroup_files_legacy[0]; @@ -529,28 +691,55 @@ static void __init 
__hugetlb_cgroup_file_legacy_init(int idx) cft->read_u64 = hugetlb_cgroup_read_u64; cft->write = hugetlb_cgroup_write_legacy; - /* Add the usage file */ + /* Add the reservation limit file */ cft = &h->cgroup_files_legacy[1]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.limit_in_bytes", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT); + cft->read_u64 = hugetlb_cgroup_read_u64; + cft->write = hugetlb_cgroup_write_legacy; + + /* Add the usage file */ + cft = &h->cgroup_files_legacy[2]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); cft->read_u64 = hugetlb_cgroup_read_u64; + /* Add the reservation usage file */ + cft = &h->cgroup_files_legacy[3]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.usage_in_bytes", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE); + cft->read_u64 = hugetlb_cgroup_read_u64; + /* Add the MAX usage file */ - cft = &h->cgroup_files_legacy[2]; + cft = &h->cgroup_files_legacy[4]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; + /* Add the MAX reservation usage file */ + cft = &h->cgroup_files_legacy[5]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max_usage_in_bytes", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_MAX_USAGE); + cft->write = hugetlb_cgroup_reset; + cft->read_u64 = hugetlb_cgroup_read_u64; + /* Add the failcntfile */ - cft = &h->cgroup_files_legacy[3]; + cft = &h->cgroup_files_legacy[6]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); - cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); + cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); + cft->write = hugetlb_cgroup_reset; + cft->read_u64 = hugetlb_cgroup_read_u64; + + /* Add the reservation failcntfile */ + cft = &h->cgroup_files_legacy[7]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.failcnt", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_FAILCNT); cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; /* NULL terminate the last cft */ - cft = &h->cgroup_files_legacy[4]; + cft = &h->cgroup_files_legacy[8]; memset(cft, 0, sizeof(*cft)); WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, @@ -585,6 +774,7 @@ void __init hugetlb_cgroup_file_init(void) void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) { struct hugetlb_cgroup *h_cg; + struct hugetlb_cgroup *h_cg_rsvd; struct hstate *h = page_hstate(oldhpage); if (hugetlb_cgroup_disabled()) @@ -593,10 +783,13 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage); spin_lock(&hugetlb_lock); h_cg = hugetlb_cgroup_from_page(oldhpage); + h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage); set_hugetlb_cgroup(oldhpage, NULL); + set_hugetlb_cgroup_rsvd(oldhpage, NULL); /* move the h_cg details to new cgroup */ set_hugetlb_cgroup(newhpage, h_cg); + set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd); list_move(&newhpage->lru, &h->hugepage_activelist); spin_unlock(&hugetlb_lock); return; diff --git a/mm/internal.h b/mm/internal.h index 3cf20ab3ca01..2d58ae15a958 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -63,6 +63,29 @@ static inline unsigned long ra_submit(struct file_ra_state *ra, ra->start, ra->size, ra->async_size); } +/** + * page_evictable - test whether a page is evictable + * @page: the page to test + * + * Test whether page is evictable--i.e., should be placed on active/inactive + 
* lists vs unevictable list. + * + * Reasons page might not be evictable: + * (1) page's mapping marked unevictable + * (2) page is part of an mlocked VMA + * + */ +static inline bool page_evictable(struct page *page) +{ + bool ret; + + /* Prevent address_space of inode and swap cache from being freed */ + rcu_read_lock(); + ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); + rcu_read_unlock(); + return ret; +} + /* * Turn a non-refcounted page (->_refcount == 0) into refcounted with * a count of one. @@ -206,6 +229,7 @@ struct compact_control { bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock or sched contention */ bool rescan; /* Rescanning the same pageblock */ + bool alloc_contig; /* alloc_contig_range allocation */ }; /* @@ -377,10 +401,10 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, /* * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or * anything, so we only pin the file and drop the mmap_sem if only - * FAULT_FLAG_ALLOW_RETRY is set. + * FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt. */ - if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) == - FAULT_FLAG_ALLOW_RETRY) { + if (fault_flag_allow_retry_first(flags) && + !(flags & FAULT_FLAG_RETRY_NOWAIT)) { fpin = get_file(vmf->vma->vm_file); up_read(&vmf->vma->vm_mm->mmap_sem); } @@ -532,7 +556,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, #else #define ALLOC_NOFRAGMENT 0x0 #endif -#define ALLOC_KSWAPD 0x200 /* allow waking of kswapd */ +#define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ enum ttu_flags; struct tlbflush_unmap_batch; diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 6aa51723b92b..e61b4a492218 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -105,7 +105,8 @@ EXPORT_SYMBOL(__kasan_check_write); #undef memset void *memset(void *addr, int c, size_t len) { - check_memory_region((unsigned long)addr, len, true, _RET_IP_); + if (!check_memory_region((unsigned long)addr, len, true, _RET_IP_)) + return NULL; return __memset(addr, c, len); } @@ -114,8 +115,9 @@ void *memset(void *addr, int c, size_t len) #undef memmove void *memmove(void *dest, const void *src, size_t len) { - check_memory_region((unsigned long)src, len, false, _RET_IP_); - check_memory_region((unsigned long)dest, len, true, _RET_IP_); + if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) || + !check_memory_region((unsigned long)dest, len, true, _RET_IP_)) + return NULL; return __memmove(dest, src, len); } @@ -124,8 +126,9 @@ void *memmove(void *dest, const void *src, size_t len) #undef memcpy void *memcpy(void *dest, const void *src, size_t len) { - check_memory_region((unsigned long)src, len, false, _RET_IP_); - check_memory_region((unsigned long)dest, len, true, _RET_IP_); + if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) || + !check_memory_region((unsigned long)dest, len, true, _RET_IP_)) + return NULL; return __memcpy(dest, src, len); } @@ -634,12 +637,21 @@ void kasan_free_shadow(const struct vm_struct *vm) #endif extern void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); +extern bool report_enabled(void); -void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) +bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) { unsigned long flags = user_access_save(); - __kasan_report(addr, size, is_write, ip); + bool ret = false; + + 
if (likely(report_enabled())) { + __kasan_report(addr, size, is_write, ip); + ret = true; + } + user_access_restore(flags); + + return ret; } #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 616f9dd82d12..56ff8885fe2e 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -173,17 +173,18 @@ static __always_inline bool check_memory_region_inline(unsigned long addr, if (unlikely(size == 0)) return true; + if (unlikely(addr + size < addr)) + return !kasan_report(addr, size, write, ret_ip); + if (unlikely((void *)addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { - kasan_report(addr, size, write, ret_ip); - return false; + return !kasan_report(addr, size, write, ret_ip); } if (likely(!memory_is_poisoned(addr, size))) return true; - kasan_report(addr, size, write, ret_ip); - return false; + return !kasan_report(addr, size, write, ret_ip); } bool check_memory_region(unsigned long addr, size_t size, bool write, diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c index 2d97efd4954f..e200acb2d292 100644 --- a/mm/kasan/generic_report.c +++ b/mm/kasan/generic_report.c @@ -110,6 +110,17 @@ static const char *get_wild_bug_type(struct kasan_access_info *info) const char *get_bug_type(struct kasan_access_info *info) { + /* + * If access_size is a negative number, then it has reason to be + * defined as out-of-bounds bug type. + * + * Casting negative numbers to size_t would indeed turn up as + * a large size_t and its value will be larger than ULONG_MAX/2, + * so that this can qualify as out-of-bounds. + */ + if (info->access_addr + info->access_size < info->access_addr) + return "out-of-bounds"; + if (addr_has_shadow(info->access_addr)) return get_shadow_bug_type(info); return get_wild_bug_type(info); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 3a083274628e..e8f37199d885 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -153,7 +153,7 @@ bool check_memory_region(unsigned long addr, size_t size, bool write, void *find_first_bad_addr(void *addr, size_t size); const char *get_bug_type(struct kasan_access_info *info); -void kasan_report(unsigned long addr, size_t size, +bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); void kasan_report_invalid_free(void *object, unsigned long ip); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 5ef9f24f566b..cf5c17d5e361 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -446,7 +446,7 @@ static void print_shadow_for_address(const void *addr) } } -static bool report_enabled(void) +bool report_enabled(void) { if (current->kasan_depth) return false; @@ -478,9 +478,6 @@ void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned lon void *untagged_addr; unsigned long flags; - if (likely(!report_enabled())) - return; - disable_trace_on_warning(); tagged_addr = (void *)addr; diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 0e987c9ca052..25b7734e7013 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -86,6 +86,9 @@ bool check_memory_region(unsigned long addr, size_t size, bool write, if (unlikely(size == 0)) return true; + if (unlikely(addr + size < addr)) + return !kasan_report(addr, size, write, ret_ip); + tag = get_tag((const void *)addr); /* @@ -111,15 +114,13 @@ bool check_memory_region(unsigned long addr, size_t size, bool write, untagged_addr = reset_tag((const void *)addr); if (unlikely(untagged_addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { - kasan_report(addr, size, write, ret_ip); - return false; + 
return !kasan_report(addr, size, write, ret_ip); } shadow_first = kasan_mem_to_shadow(untagged_addr); shadow_last = kasan_mem_to_shadow(untagged_addr + size - 1); for (shadow = shadow_first; shadow <= shadow_last; shadow++) { if (*shadow != tag) { - kasan_report(addr, size, write, ret_ip); - return false; + return !kasan_report(addr, size, write, ret_ip); } } diff --git a/mm/kasan/tags_report.c b/mm/kasan/tags_report.c index 969ae08f59d7..bee43717d6f0 100644 --- a/mm/kasan/tags_report.c +++ b/mm/kasan/tags_report.c @@ -60,6 +60,17 @@ const char *get_bug_type(struct kasan_access_info *info) } #endif + /* + * If access_size is a negative number, then it has reason to be + * defined as out-of-bounds bug type. + * + * Casting negative numbers to size_t would indeed turn up as + * a large size_t and its value will be larger than ULONG_MAX/2, + * so that this can qualify as out-of-bounds. + */ + if (info->access_addr + info->access_size < info->access_addr) + return "out-of-bounds"; + return "invalid-access"; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b679908743cb..c659c68728bc 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -308,8 +308,6 @@ struct attribute_group khugepaged_attr_group = { }; #endif /* CONFIG_SYSFS */ -#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) - int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { @@ -423,7 +421,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma, } if (!vma->anon_vma || vma->vm_ops) return false; - if (is_vma_temporary_stack(vma)) + if (vma_is_temporary_stack(vma)) return false; return !(vm_flags & VM_NO_KHUGEPAGED); } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 3a4259eeb5a0..e362dc3d2028 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1947,7 +1947,7 @@ void __init kmemleak_init(void) create_object((unsigned long)__bss_start, __bss_stop - __bss_start, KMEMLEAK_GREY, GFP_ATOMIC); /* only register .data..ro_after_init if not within .data */ - if (__start_ro_after_init < _sdata || __end_ro_after_init > _edata) + if (&__start_ro_after_init < &_sdata || &__end_ro_after_init > &_edata) create_object((unsigned long)__start_ro_after_init, __end_ro_after_init - __start_ro_after_init, KMEMLEAK_GREY, GFP_ATOMIC); diff --git a/mm/list_lru.c b/mm/list_lru.c index 0f1f6b06b7f3..8de5e3784ee4 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -57,16 +57,6 @@ list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) return &nlru->lru; } -static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) -{ - struct page *page; - - if (!memcg_kmem_enabled()) - return NULL; - page = virt_to_head_page(ptr); - return memcg_from_slab_page(page); -} - static inline struct list_lru_one * list_lru_from_kmem(struct list_lru_node *nlru, void *ptr, struct mem_cgroup **memcg_ptr) @@ -77,7 +67,7 @@ list_lru_from_kmem(struct list_lru_node *nlru, void *ptr, if (!nlru->memcg_lrus) goto out; - memcg = mem_cgroup_from_kmem(ptr); + memcg = mem_cgroup_from_obj(ptr); if (!memcg) goto out; diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index 71070dda9643..2c7d03675903 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -111,26 +111,60 @@ static int clean_record_pte(pte_t *pte, unsigned long addr, return 0; } -/* wp_clean_pmd_entry - The pagewalk pmd callback. */ +/* + * wp_clean_pmd_entry - The pagewalk pmd callback. + * + * Dirty-tracking should take place on the PTE level, so + * WARN() if encountering a dirty huge pmd. 
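A quick aside on the "addr + size < addr" guard added to check_memory_region() in both the generic and the tags KASAN modes above: a negative size, once cast to size_t, becomes a value larger than ULONG_MAX/2, so the unsigned addition wraps around and the sum compares below the start address. A minimal user-space sketch of that arithmetic (the helper name is invented for the illustration and is not kernel API):

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

/* Invented helper mirroring the new wrap-around test. */
static bool access_size_wraps(unsigned long addr, size_t size)
{
        /* unsigned overflow is well defined, so this check is safe */
        return addr + size < addr;
}

int main(void)
{
        size_t bogus = (size_t)-4;      /* "negative" size after the cast */

        assert(access_size_wraps(0x1000, bogus));   /* reported as out-of-bounds */
        assert(!access_size_wraps(0x1000, 64));     /* ordinary access is fine */
        return 0;
}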
+ * Furthermore, never split huge pmds, since that currently + * causes dirty info loss. The pagefault handler should do + * that if needed. + */ static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - /* Dirty-tracking should be handled on the pte level */ pmd_t pmdval = pmd_read_atomic(pmd); + if (!pmd_trans_unstable(&pmdval)) + return 0; + + if (pmd_none(pmdval)) { + walk->action = ACTION_AGAIN; + return 0; + } + + /* Huge pmd, present or migrated */ + walk->action = ACTION_CONTINUE; if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval)); return 0; } -/* wp_clean_pud_entry - The pagewalk pud callback. */ +/* + * wp_clean_pud_entry - The pagewalk pud callback. + * + * Dirty-tracking should take place on the PTE level, so + * WARN() if encountering a dirty huge puds. + * Furthermore, never split huge puds, since that currently + * causes dirty info loss. The pagefault handler should do + * that if needed. + */ static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end, struct mm_walk *walk) { - /* Dirty-tracking should be handled on the pte level */ pud_t pudval = READ_ONCE(*pud); + if (!pud_trans_unstable(&pudval)) + return 0; + + if (pud_none(pudval)) { + walk->action = ACTION_AGAIN; + return 0; + } + + /* Huge pud */ + walk->action = ACTION_CONTINUE; if (pud_trans_huge(pudval) || pud_devmap(pudval)) WARN_ON(pud_write(pudval) || pud_dirty(pudval)); diff --git a/mm/memblock.c b/mm/memblock.c index eba94ee3de0b..4d06bbaded0f 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1698,7 +1698,7 @@ static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit) void __init memblock_enforce_memory_limit(phys_addr_t limit) { - phys_addr_t max_addr = PHYS_ADDR_MAX; + phys_addr_t max_addr; if (!limit) return; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7ddf91c4295f..ca194864d802 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -334,7 +334,7 @@ static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg, if (!old) return 0; - new = kvmalloc(sizeof(*new) + size, GFP_KERNEL); + new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); if (!new) return -ENOMEM; @@ -378,7 +378,7 @@ static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) mutex_lock(&memcg_shrinker_map_mutex); size = memcg_shrinker_map_size; for_each_node(nid) { - map = kvzalloc(sizeof(*map) + size, GFP_KERNEL); + map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid); if (!map) { memcg_free_shrinker_maps(memcg); ret = -ENOMEM; @@ -656,7 +656,7 @@ retry: */ __mem_cgroup_remove_exceeded(mz, mctz); if (!soft_limit_excess(mz->memcg) || - !css_tryget_online(&mz->memcg->css)) + !css_tryget(&mz->memcg->css)) goto retry; done: return mz; @@ -759,13 +759,12 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) { - struct page *page = virt_to_head_page(p); - pg_data_t *pgdat = page_pgdat(page); + pg_data_t *pgdat = page_pgdat(virt_to_page(p)); struct mem_cgroup *memcg; struct lruvec *lruvec; rcu_read_lock(); - memcg = memcg_from_slab_page(page); + memcg = mem_cgroup_from_obj(p); /* Untracked pages have no memcg, no lruvec. Update only the node */ if (!memcg || memcg == root_mem_cgroup) { @@ -973,7 +972,8 @@ struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) return NULL; rcu_read_lock(); - if (!memcg || !css_tryget_online(&memcg->css)) + /* Page should not get uncharged and freed memcg under us. 
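For readers unfamiliar with the walk->action protocol used by the rewritten wp_clean_pmd_entry()/wp_clean_pud_entry() callbacks above: per my reading of include/linux/pagewalk.h (not quoted in this diff), ACTION_SUBTREE is the default and descends to the PTE level, ACTION_CONTINUE skips the lower levels of the current entry, and ACTION_AGAIN re-reads the entry and retries. A condensed user-space mock of the pmd-level decision:

#include <stdbool.h>
#include <stdio.h>

/* Mock of the pagewalk actions; the names follow my reading of
 * include/linux/pagewalk.h, and only ACTION_CONTINUE/ACTION_AGAIN
 * actually appear in the hunk above. */
enum walk_action { ACTION_SUBTREE, ACTION_CONTINUE, ACTION_AGAIN };

/* Condensed decision table of the new wp_clean_pmd_entry(). */
static enum walk_action pmd_decision(bool trans_unstable, bool none)
{
        if (!trans_unstable)
                return ACTION_SUBTREE;  /* stable page table: clean the PTEs */
        if (none)
                return ACTION_AGAIN;    /* racing split: re-read the pmd */
        return ACTION_CONTINUE;         /* huge entry: never split it here */
}

int main(void)
{
        printf("%d %d %d\n", pmd_decision(false, false),
               pmd_decision(true, true), pmd_decision(true, false));
        return 0;
}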
*/ + if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css))) memcg = root_mem_cgroup; rcu_read_unlock(); return memcg; @@ -986,10 +986,13 @@ EXPORT_SYMBOL(get_mem_cgroup_from_page); static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) { if (unlikely(current->active_memcg)) { - struct mem_cgroup *memcg = root_mem_cgroup; + struct mem_cgroup *memcg; rcu_read_lock(); - if (css_tryget_online(¤t->active_memcg->css)) + /* current->active_memcg must hold a ref. */ + if (WARN_ON_ONCE(!css_tryget(¤t->active_memcg->css))) + memcg = root_mem_cgroup; + else memcg = current->active_memcg; rcu_read_unlock(); return memcg; @@ -1518,11 +1521,11 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memory)), - K((u64)memcg->memory.max), memcg->memory.failcnt); + K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt); if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->swap)), - K((u64)memcg->swap.max), memcg->swap.failcnt); + K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt); else { pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memsw)), @@ -1549,13 +1552,13 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { unsigned long max; - max = memcg->memory.max; + max = READ_ONCE(memcg->memory.max); if (mem_cgroup_swappiness(memcg)) { unsigned long memsw_max; unsigned long swap_max; memsw_max = memcg->memsw.max; - swap_max = memcg->swap.max; + swap_max = READ_ONCE(memcg->swap.max); swap_max = min(swap_max, (unsigned long)total_swap_pages); max = min(max + swap_max, memsw_max); } @@ -1928,6 +1931,14 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, goto out; /* + * If the victim task has been asynchronously moved to a different + * memory cgroup, we might end up killing tasks outside oom_domain. + * In this case it's better to ignore memory.group.oom. + */ + if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain))) + goto out; + + /* * Traverse the memory cgroup hierarchy from the victim task's * cgroup up to the OOMing cgroup (or root) to find the * highest-level memory cgroup with oom.group set. @@ -2239,7 +2250,7 @@ static void reclaim_high(struct mem_cgroup *memcg, gfp_t gfp_mask) { do { - if (page_counter_read(&memcg->memory) <= memcg->high) + if (page_counter_read(&memcg->memory) <= READ_ONCE(memcg->high)) continue; memcg_memory_event(memcg, MEMCG_HIGH); try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); @@ -2579,7 +2590,7 @@ done_restock: * reclaim, the cost of mismatch is negligible. */ do { - if (page_counter_read(&memcg->memory) > memcg->high) { + if (page_counter_read(&memcg->memory) > READ_ONCE(memcg->high)) { /* Don't bother a random interrupted task */ if (in_interrupt()) { schedule_work(&memcg->high_work); @@ -2882,18 +2893,16 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep) } /** - * __memcg_kmem_charge_memcg: charge a kmem page - * @page: page to charge - * @gfp: reclaim mode - * @order: allocation order + * __memcg_kmem_charge: charge a number of kernel pages to a memcg * @memcg: memory cgroup to charge + * @gfp: reclaim mode + * @nr_pages: number of pages to charge * * Returns 0 on success, an error code on failure. 
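Stepping back to the READ_ONCE()/WRITE_ONCE() conversions scattered through the memcontrol.c and page_counter.c hunks: as I understand it, fields such as memcg->high and the counter min/low/max values are written under a lock but read locklessly, and the annotations prevent load/store tearing while documenting the intentional data race. A generic user-space sketch of the pairing, with invented struct and field names:

#include <stdio.h>

/* Minimal stand-ins for the kernel macros, for illustration only. */
#define WRITE_ONCE(x, val)      (*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)            (*(volatile __typeof__(x) *)&(x))

struct limits {                 /* invented; plays the role of memcg->high */
        unsigned long high;
};

static void set_high(struct limits *l, unsigned long pages)
{
        WRITE_ONCE(l->high, pages);             /* writer, e.g. memory_high_write() */
}

static int over_high(struct limits *l, unsigned long usage)
{
        return usage > READ_ONCE(l->high);      /* lockless reader, e.g. reclaim_high() */
}

int main(void)
{
        struct limits l;

        set_high(&l, 1024);
        printf("%d\n", over_high(&l, 2048));    /* prints 1 */
        return 0;
}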
*/ -int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, - struct mem_cgroup *memcg) +int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, + unsigned int nr_pages) { - unsigned int nr_pages = 1 << order; struct page_counter *counter; int ret; @@ -2920,14 +2929,29 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, } /** - * __memcg_kmem_charge: charge a kmem page to the current memory cgroup + * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg + * @memcg: memcg to uncharge + * @nr_pages: number of pages to uncharge + */ +void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_uncharge(&memcg->kmem, nr_pages); + + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_memsw_account()) + page_counter_uncharge(&memcg->memsw, nr_pages); +} + +/** + * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup * @page: page to charge * @gfp: reclaim mode * @order: allocation order * * Returns 0 on success, an error code on failure. */ -int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { struct mem_cgroup *memcg; int ret = 0; @@ -2937,7 +2961,7 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) memcg = get_mem_cgroup_from_current(); if (!mem_cgroup_is_root(memcg)) { - ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); + ret = __memcg_kmem_charge(memcg, gfp, 1 << order); if (!ret) { page->mem_cgroup = memcg; __SetPageKmemcg(page); @@ -2948,26 +2972,11 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) } /** - * __memcg_kmem_uncharge_memcg: uncharge a kmem page - * @memcg: memcg to uncharge - * @nr_pages: number of pages to uncharge - */ -void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg, - unsigned int nr_pages) -{ - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_uncharge(&memcg->kmem, nr_pages); - - page_counter_uncharge(&memcg->memory, nr_pages); - if (do_memsw_account()) - page_counter_uncharge(&memcg->memsw, nr_pages); -} -/** - * __memcg_kmem_uncharge: uncharge a kmem page + * __memcg_kmem_uncharge_page: uncharge a kmem page * @page: page to uncharge * @order: allocation order */ -void __memcg_kmem_uncharge(struct page *page, int order) +void __memcg_kmem_uncharge_page(struct page *page, int order) { struct mem_cgroup *memcg = page->mem_cgroup; unsigned int nr_pages = 1 << order; @@ -2976,7 +2985,7 @@ void __memcg_kmem_uncharge(struct page *page, int order) return; VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); - __memcg_kmem_uncharge_memcg(memcg, nr_pages); + __memcg_kmem_uncharge(memcg, nr_pages); page->mem_cgroup = NULL; /* slab pages do not have PageKmemcg flag set */ @@ -3067,7 +3076,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, * Make sure that the new limit (memsw or memory limit) doesn't * break our basic invariant rule memory.max <= memsw.max. */ - limits_invariant = memsw ? max >= memcg->memory.max : + limits_invariant = memsw ? 
max >= READ_ONCE(memcg->memory.max) : max <= memcg->memsw.max; if (!limits_invariant) { mutex_unlock(&memcg_max_mutex); @@ -3814,8 +3823,8 @@ static int memcg_stat_show(struct seq_file *m, void *v) /* Hierarchical information */ memory = memsw = PAGE_COUNTER_MAX; for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { - memory = min(memory, mi->memory.max); - memsw = min(memsw, mi->memsw.max); + memory = min(memory, READ_ONCE(mi->memory.max)); + memsw = min(memsw, READ_ONCE(mi->memsw.max)); } seq_printf(m, "hierarchical_memory_limit %llu\n", (u64)memory * PAGE_SIZE); @@ -4324,7 +4333,8 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, *pheadroom = PAGE_COUNTER_MAX; while ((parent = parent_mem_cgroup(memcg))) { - unsigned long ceiling = min(memcg->memory.max, memcg->high); + unsigned long ceiling = min(READ_ONCE(memcg->memory.max), + READ_ONCE(memcg->high)); unsigned long used = page_counter_read(&memcg->memory); *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); @@ -4792,7 +4802,8 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, -#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) +#if defined(CONFIG_MEMCG_KMEM) && \ + (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) { .name = "kmem.slabinfo", .seq_start = memcg_slab_start, @@ -4861,7 +4872,8 @@ static void mem_cgroup_id_remove(struct mem_cgroup *memcg) } } -static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) +static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg, + unsigned int n) { refcount_add(n, &memcg->id.ref); } @@ -5044,7 +5056,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (!memcg) return ERR_PTR(error); - memcg->high = PAGE_COUNTER_MAX; + WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; if (parent) { memcg->swappiness = mem_cgroup_swappiness(parent); @@ -5197,7 +5209,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0); - memcg->high = PAGE_COUNTER_MAX; + WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; memcg_wb_domain_size_changed(memcg); } @@ -6013,7 +6025,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, if (err) return err; - memcg->high = high; + WRITE_ONCE(memcg->high, high); for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); @@ -6236,6 +6248,117 @@ struct cgroup_subsys memory_cgrp_subsys = { .early_init = 0, }; +/* + * This function calculates an individual cgroup's effective + * protection which is derived from its own memory.min/low, its + * parent's and siblings' settings, as well as the actual memory + * distribution in the tree. + * + * The following rules apply to the effective protection values: + * + * 1. At the first level of reclaim, effective protection is equal to + * the declared protection in memory.min and memory.low. + * + * 2. To enable safe delegation of the protection configuration, at + * subsequent levels the effective protection is capped to the + * parent's effective protection. + * + * 3. To make complex and dynamic subtrees easier to configure, the + * user is allowed to overcommit the declared protection at a given + * level. 
If that is the case, the parent's effective protection is + * distributed to the children in proportion to how much protection + * they have declared and how much of it they are utilizing. + * + * This makes distribution proportional, but also work-conserving: + * if one cgroup claims much more protection than it uses memory, + * the unused remainder is available to its siblings. + * + * 4. Conversely, when the declared protection is undercommitted at a + * given level, the distribution of the larger parental protection + * budget is NOT proportional. A cgroup's protection from a sibling + * is capped to its own memory.min/low setting. + * + * 5. However, to allow protecting recursive subtrees from each other + * without having to declare each individual cgroup's fixed share + * of the ancestor's claim to protection, any unutilized - + * "floating" - protection from up the tree is distributed in + * proportion to each cgroup's *usage*. This makes the protection + * neutral wrt sibling cgroups and lets them compete freely over + * the shared parental protection budget, but it protects the + * subtree as a whole from neighboring subtrees. + * + * Note that 4. and 5. are not in conflict: 4. is about protecting + * against immediate siblings whereas 5. is about protecting against + * neighboring subtrees. + */ +static unsigned long effective_protection(unsigned long usage, + unsigned long parent_usage, + unsigned long setting, + unsigned long parent_effective, + unsigned long siblings_protected) +{ + unsigned long protected; + unsigned long ep; + + protected = min(usage, setting); + /* + * If all cgroups at this level combined claim and use more + * protection then what the parent affords them, distribute + * shares in proportion to utilization. + * + * We are using actual utilization rather than the statically + * claimed protection in order to be work-conserving: claimed + * but unused protection is available to siblings that would + * otherwise get a smaller chunk than what they claimed. + */ + if (siblings_protected > parent_effective) + return protected * parent_effective / siblings_protected; + + /* + * Ok, utilized protection of all children is within what the + * parent affords them, so we know whatever this child claims + * and utilizes is effectively protected. + * + * If there is unprotected usage beyond this value, reclaim + * will apply pressure in proportion to that amount. + * + * If there is unutilized protection, the cgroup will be fully + * shielded from reclaim, but we do return a smaller value for + * protection than what the group could enjoy in theory. This + * is okay. With the overcommit distribution above, effective + * protection is always dependent on how memory is actually + * consumed among the siblings anyway. + */ + ep = protected; + + /* + * If the children aren't claiming (all of) the protection + * afforded to them by the parent, distribute the remainder in + * proportion to the (unprotected) memory of each cgroup. That + * way, cgroups that aren't explicitly prioritized wrt each + * other compete freely over the allowance, but they are + * collectively protected from neighboring trees. + * + * We're using unprotected memory for the weight so that if + * some cgroups DO claim explicit protection, we don't protect + * the same bytes twice. 
+ */ + if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)) + return ep; + + if (parent_effective > siblings_protected && usage > protected) { + unsigned long unclaimed; + + unclaimed = parent_effective - siblings_protected; + unclaimed *= usage - protected; + unclaimed /= parent_usage - siblings_protected; + + ep += unclaimed; + } + + return ep; +} + /** * mem_cgroup_protected - check if memory consumption is in the normal range * @root: the top ancestor of the sub-tree being checked @@ -6249,70 +6372,12 @@ struct cgroup_subsys memory_cgrp_subsys = { * MEMCG_PROT_LOW: cgroup memory is protected as long there is * an unprotected supply of reclaimable memory from other cgroups. * MEMCG_PROT_MIN: cgroup memory is protected - * - * @root is exclusive; it is never protected when looked at directly - * - * To provide a proper hierarchical behavior, effective memory.min/low values - * are used. Below is the description of how effective memory.low is calculated. - * Effective memory.min values is calculated in the same way. - * - * Effective memory.low is always equal or less than the original memory.low. - * If there is no memory.low overcommittment (which is always true for - * top-level memory cgroups), these two values are equal. - * Otherwise, it's a part of parent's effective memory.low, - * calculated as a cgroup's memory.low usage divided by sum of sibling's - * memory.low usages, where memory.low usage is the size of actually - * protected memory. - * - * low_usage - * elow = min( memory.low, parent->elow * ------------------ ), - * siblings_low_usage - * - * | memory.current, if memory.current < memory.low - * low_usage = | - * | 0, otherwise. - * - * - * Such definition of the effective memory.low provides the expected - * hierarchical behavior: parent's memory.low value is limiting - * children, unprotected memory is reclaimed first and cgroups, - * which are not using their guarantee do not affect actual memory - * distribution. - * - * For example, if there are memcgs A, A/B, A/C, A/D and A/E: - * - * A A/memory.low = 2G, A/memory.current = 6G - * //\\ - * BC DE B/memory.low = 3G B/memory.current = 2G - * C/memory.low = 1G C/memory.current = 2G - * D/memory.low = 0 D/memory.current = 2G - * E/memory.low = 10G E/memory.current = 0 - * - * and the memory pressure is applied, the following memory distribution - * is expected (approximately): - * - * A/memory.current = 2G - * - * B/memory.current = 1.3G - * C/memory.current = 0.6G - * D/memory.current = 0 - * E/memory.current = 0 - * - * These calculations require constant tracking of the actual low usages - * (see propagate_protected_usage()), as well as recursive calculation of - * effective memory.low values. But as we do call mem_cgroup_protected() - * path for each memory cgroup top-down from the reclaim, - * it's possible to optimize this part, and save calculated elow - * for next usage. This part is intentionally racy, but it's ok, - * as memory.low is a best-effort mechanism. 
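A worked number for the overcommit branch of effective_protection() above: if the parent's effective protection is 8G, the children together claim and use 12G, and this child's protected share (the min of its usage and its memory.low) is 6G, its effective protection becomes 6G * 8G / 12G = 4G. The sketch below just redoes that arithmetic with the same made-up figures:

#include <stdio.h>

/* Overcommit case of effective_protection(), invented numbers in MB. */
int main(void)
{
        unsigned long parent_effective   = 8192;        /* parent's effective low */
        unsigned long siblings_protected = 12288;       /* sum over all children  */
        unsigned long protected          = 6144;        /* min(usage, memory.low) */

        if (siblings_protected > parent_effective)
                printf("effective low: %lu MB\n",       /* prints 4096 */
                       protected * parent_effective / siblings_protected);
        return 0;
}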
*/ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, struct mem_cgroup *memcg) { + unsigned long usage, parent_usage; struct mem_cgroup *parent; - unsigned long emin, parent_emin; - unsigned long elow, parent_elow; - unsigned long usage; if (mem_cgroup_disabled()) return MEMCG_PROT_NONE; @@ -6326,52 +6391,32 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, if (!usage) return MEMCG_PROT_NONE; - emin = memcg->memory.min; - elow = memcg->memory.low; - parent = parent_mem_cgroup(memcg); /* No parent means a non-hierarchical mode on v1 memcg */ if (!parent) return MEMCG_PROT_NONE; - if (parent == root) - goto exit; - - parent_emin = READ_ONCE(parent->memory.emin); - emin = min(emin, parent_emin); - if (emin && parent_emin) { - unsigned long min_usage, siblings_min_usage; - - min_usage = min(usage, memcg->memory.min); - siblings_min_usage = atomic_long_read( - &parent->memory.children_min_usage); - - if (min_usage && siblings_min_usage) - emin = min(emin, parent_emin * min_usage / - siblings_min_usage); + if (parent == root) { + memcg->memory.emin = READ_ONCE(memcg->memory.min); + memcg->memory.elow = memcg->memory.low; + goto out; } - parent_elow = READ_ONCE(parent->memory.elow); - elow = min(elow, parent_elow); - if (elow && parent_elow) { - unsigned long low_usage, siblings_low_usage; + parent_usage = page_counter_read(&parent->memory); - low_usage = min(usage, memcg->memory.low); - siblings_low_usage = atomic_long_read( - &parent->memory.children_low_usage); + WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage, + READ_ONCE(memcg->memory.min), + READ_ONCE(parent->memory.emin), + atomic_long_read(&parent->memory.children_min_usage))); - if (low_usage && siblings_low_usage) - elow = min(elow, parent_elow * low_usage / - siblings_low_usage); - } + WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage, + memcg->memory.low, READ_ONCE(parent->memory.elow), + atomic_long_read(&parent->memory.children_low_usage))); -exit: - memcg->memory.emin = emin; - memcg->memory.elow = elow; - - if (usage <= emin) +out: + if (usage <= memcg->memory.emin) return MEMCG_PROT_MIN; - else if (usage <= elow) + else if (usage <= memcg->memory.elow) return MEMCG_PROT_LOW; else return MEMCG_PROT_NONE; @@ -6759,7 +6804,7 @@ void mem_cgroup_sk_alloc(struct sock *sk) goto out; if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) goto out; - if (css_tryget_online(&memcg->css)) + if (css_tryget(&memcg->css)) sk->sk_memcg = memcg; out: rcu_read_unlock(); @@ -7080,7 +7125,8 @@ bool mem_cgroup_swap_full(struct page *page) return false; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) - if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max) + if (page_counter_read(&memcg->swap) * 2 >= + READ_ONCE(memcg->swap.max)) return true; return false; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 41c634f45d45..1c961cd26c0b 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -954,7 +954,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; struct address_space *mapping; LIST_HEAD(tokill); - bool unmap_success; + bool unmap_success = true; int kill = 1, forcekill; struct page *hpage = *hpagep; bool mlocked = PageMlocked(hpage); @@ -1016,7 +1016,32 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, if (kill) collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); - unmap_success = 
try_to_unmap(hpage, ttu); + if (!PageHuge(hpage)) { + unmap_success = try_to_unmap(hpage, ttu); + } else { + /* + * For hugetlb pages, try_to_unmap could potentially call + * huge_pmd_unshare. Because of this, take semaphore in + * write mode here and set TTU_RMAP_LOCKED to indicate we + * have taken the lock at this higer level. + * + * Note that the call to hugetlb_page_mapping_lock_write + * is necessary even if mapping is already set. It handles + * ugliness of potentially having to drop page lock to obtain + * i_mmap_rwsem. + */ + mapping = hugetlb_page_mapping_lock_write(hpage); + + if (mapping) { + unmap_success = try_to_unmap(hpage, + ttu|TTU_RMAP_LOCKED); + i_mmap_unlock_write(mapping); + } else { + pr_info("Memory failure: %#lx: could not find mapping for mapped huge page\n", + pfn); + unmap_success = false; + } + } if (!unmap_success) pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(hpage)); diff --git a/mm/memory.c b/mm/memory.c index e8bfdf0d9d1d..5c356a57b892 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1939,7 +1939,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, * remap_pfn_range - remap kernel memory to userspace * @vma: user vma to map to * @addr: target user address to start at - * @pfn: physical address of kernel memory + * @pfn: page frame number of kernel physical memory address * @size: size of map area * @prot: page protection flags for this mapping * @@ -2009,7 +2009,7 @@ EXPORT_SYMBOL(remap_pfn_range); /** * vm_iomap_memory - remap memory to userspace * @vma: user vma to map to - * @start: start of area + * @start: start of the physical memory to be mapped * @len: size of area * * This is a simplified io_remap_pfn_range() for common driver use. The diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 977c641f78cf..5fb427aed612 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -557,9 +557,10 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { + int ret = 0; #ifdef CONFIG_HUGETLB_PAGE struct queue_pages *qp = walk->private; - unsigned long flags = qp->flags; + unsigned long flags = (qp->flags & MPOL_MF_VALID); struct page *page; spinlock_t *ptl; pte_t entry; @@ -571,16 +572,44 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, page = pte_page(entry); if (!queue_pages_required(page, qp)) goto unlock; + + if (flags == MPOL_MF_STRICT) { + /* + * STRICT alone means only detecting misplaced page and no + * need to further check other vma. + */ + ret = -EIO; + goto unlock; + } + + if (!vma_migratable(walk->vma)) { + /* + * Must be STRICT with MOVE*, otherwise .test_walk() have + * stopped walking current vma. + * Detecting misplaced page but allow migrating pages which + * have been queued. + */ + ret = 1; + goto unlock; + } + /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ if (flags & (MPOL_MF_MOVE_ALL) || - (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) - isolate_huge_page(page, qp->pagelist); + (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) { + if (!isolate_huge_page(page, qp->pagelist) && + (flags & MPOL_MF_STRICT)) + /* + * Failed to isolate page but allow migrating pages + * which have been queued. 
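As I read the reworked queue_pages_hugetlb() above, its return value now encodes three outcomes: 0 means nothing to report, -EIO means MPOL_MF_STRICT alone found a misplaced page and the caller should simply fail, and 1 means a page could not be queued but the pages already queued should still be migrated before the failure is reported. A small user-space mock of that contract (flag names and the helper are stand-ins, not the kernel's):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define MF_STRICT       0x1     /* stand-in for MPOL_MF_STRICT */
#define MF_MOVE         0x2     /* stand-in for MPOL_MF_MOVE*  */

static int queue_decision(unsigned int flags, bool vma_migratable,
                          bool isolated)
{
        if (flags == MF_STRICT)
                return -EIO;    /* detect-only mode: report and stop */
        if (!vma_migratable)
                return 1;       /* this vma cannot be migrated at all */
        if ((flags & MF_MOVE) && !isolated && (flags & MF_STRICT))
                return 1;       /* isolation failed, but keep what is queued */
        return 0;
}

int main(void)
{
        printf("%d %d %d\n",
               queue_decision(MF_STRICT, true, true),
               queue_decision(MF_STRICT | MF_MOVE, false, true),
               queue_decision(MF_STRICT | MF_MOVE, true, false));
        return 0;
}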
+ */ + ret = 1; + } unlock: spin_unlock(ptl); #else BUG(); #endif - return 0; + return ret; } #ifdef CONFIG_NUMA_BALANCING @@ -621,7 +650,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, unsigned long flags = qp->flags; /* range check first */ - VM_BUG_ON((vma->vm_start > start) || (vma->vm_end < end)); + VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma); if (!qp->first) { qp->first = vma; @@ -1714,6 +1743,34 @@ COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, #endif /* CONFIG_COMPAT */ +bool vma_migratable(struct vm_area_struct *vma) +{ + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + return false; + + /* + * DAX device mappings require predictable access latency, so avoid + * incurring periodic faults. + */ + if (vma_is_dax(vma)) + return false; + + if (is_vm_hugetlb_page(vma) && + !hugepage_migration_supported(hstate_vma(vma))) + return false; + + /* + * Migration allocates pages in the highest zone. If we cannot + * do so then migration (at least from node to node) is not + * possible. + */ + if (vma->vm_file && + gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping)) + < policy_zone) + return false; + return true; +} + struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr) { @@ -2841,7 +2898,9 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) switch (mode) { case MPOL_PREFERRED: /* - * Insist on a nodelist of one node only + * Insist on a nodelist of one node only, although later + * we use first_node(nodes) to grab a single node, so here + * nodelist (or nodes) cannot be empty. */ if (nodelist) { char *rest = nodelist; @@ -2849,6 +2908,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) rest++; if (*rest) goto out; + if (nodes_empty(nodes)) + goto out; } break; case MPOL_INTERLEAVE: diff --git a/mm/migrate.c b/mm/migrate.c index 7605d2c23433..7ded07081be9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1282,6 +1282,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, int page_was_mapped = 0; struct page *new_hpage; struct anon_vma *anon_vma = NULL; + struct address_space *mapping = NULL; /* * Migratability of hugepages depends on architectures and their size. @@ -1329,18 +1330,36 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, goto put_anon; if (page_mapped(hpage)) { + /* + * try_to_unmap could potentially call huge_pmd_unshare. + * Because of this, take semaphore in write mode here and + * set TTU_RMAP_LOCKED to let lower levels know we have + * taken the lock. + */ + mapping = hugetlb_page_mapping_lock_write(hpage); + if (unlikely(!mapping)) + goto unlock_put_anon; + try_to_unmap(hpage, - TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS| + TTU_RMAP_LOCKED); page_was_mapped = 1; + /* + * Leave mapping locked until after subsequent call to + * remove_migration_ptes() + */ } if (!page_mapped(hpage)) rc = move_to_new_page(new_hpage, hpage, mode); - if (page_was_mapped) + if (page_was_mapped) { remove_migration_ptes(hpage, - rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false); + rc == MIGRATEPAGE_SUCCESS ? 
new_hpage : hpage, true); + i_mmap_unlock_write(mapping); + } +unlock_put_anon: unlock_page(new_hpage); put_anon: diff --git a/mm/mmap.c b/mm/mmap.c index d681a20eb4ea..94ae18398c59 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -53,6 +53,9 @@ #include <asm/tlb.h> #include <asm/mmu_context.h> +#define CREATE_TRACE_POINTS +#include <trace/events/mmap.h> + #include "internal.h" #ifndef arch_mmap_check @@ -1848,7 +1851,7 @@ unacct_error: return error; } -unsigned long unmapped_area(struct vm_unmapped_area_info *info) +static unsigned long unmapped_area(struct vm_unmapped_area_info *info) { /* * We implement the search by looking for an rbtree node that @@ -1951,7 +1954,7 @@ found: return gap_start; } -unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) +static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -2050,6 +2053,27 @@ found_highest: return gap_end; } +/* + * Search for an unmapped address range. + * + * We are looking for a range that: + * - does not intersect with any VMA; + * - is contained within the [low_limit, high_limit) interval; + * - is at least the desired size. + * - satisfies (begin_addr & align_mask) == (align_offset & align_mask) + */ +unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) +{ + unsigned long addr; + + if (info->flags & VM_UNMAPPED_AREA_TOPDOWN) + addr = unmapped_area_topdown(info); + else + addr = unmapped_area(info); + + trace_vm_unmapped_area(addr, info); + return addr; +} #ifndef arch_get_mmap_end #define arch_get_mmap_end(addr) (TASK_SIZE) diff --git a/mm/mremap.c b/mm/mremap.c index d28f08a36b96..a7e282ead438 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -133,7 +133,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, * such races: * * - During exec() shift_arg_pages(), we use a specially tagged vma - * which rmap call sites look for using is_vma_temporary_stack(). + * which rmap call sites look for using vma_is_temporary_stack(). * * - During mremap(), new_vma is often known to be placed after vma * in rmap traversal order. 
This ensures rmap will always observe @@ -318,8 +318,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, static unsigned long move_vma(struct vm_area_struct *vma, unsigned long old_addr, unsigned long old_len, unsigned long new_len, unsigned long new_addr, - bool *locked, struct vm_userfaultfd_ctx *uf, - struct list_head *uf_unmap) + bool *locked, unsigned long flags, + struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; @@ -408,11 +408,32 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (unlikely(vma->vm_flags & VM_PFNMAP)) untrack_pfn_moved(vma); + if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) { + if (vm_flags & VM_ACCOUNT) { + /* Always put back VM_ACCOUNT since we won't unmap */ + vma->vm_flags |= VM_ACCOUNT; + + vm_acct_memory(vma_pages(new_vma)); + } + + /* We always clear VM_LOCKED[ONFAULT] on the old vma */ + vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + + /* Because we won't unmap we don't need to touch locked_vm */ + goto out; + } + if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { /* OOM: unable to split vma, just get accounts right */ vm_unacct_memory(excess >> PAGE_SHIFT); excess = 0; } + + if (vm_flags & VM_LOCKED) { + mm->locked_vm += new_len >> PAGE_SHIFT; + *locked = true; + } +out: mm->hiwater_vm = hiwater_vm; /* Restore VM_ACCOUNT if one or two pieces of vma left */ @@ -422,16 +443,12 @@ static unsigned long move_vma(struct vm_area_struct *vma, vma->vm_next->vm_flags |= VM_ACCOUNT; } - if (vm_flags & VM_LOCKED) { - mm->locked_vm += new_len >> PAGE_SHIFT; - *locked = true; - } - return new_addr; } static struct vm_area_struct *vma_to_resize(unsigned long addr, - unsigned long old_len, unsigned long new_len, unsigned long *p) + unsigned long old_len, unsigned long new_len, unsigned long flags, + unsigned long *p) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = find_vma(mm, addr); @@ -453,6 +470,10 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, return ERR_PTR(-EINVAL); } + if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) || + vma->vm_flags & VM_SHARED)) + return ERR_PTR(-EINVAL); + if (is_vm_hugetlb_page(vma)) return ERR_PTR(-EINVAL); @@ -497,7 +518,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, static unsigned long mremap_to(unsigned long addr, unsigned long old_len, unsigned long new_addr, unsigned long new_len, bool *locked, - struct vm_userfaultfd_ctx *uf, + unsigned long flags, struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap_early, struct list_head *uf_unmap) { @@ -505,7 +526,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, struct vm_area_struct *vma; unsigned long ret = -EINVAL; unsigned long charged = 0; - unsigned long map_flags; + unsigned long map_flags = 0; if (offset_in_page(new_addr)) goto out; @@ -534,9 +555,11 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if ((mm->map_count + 2) >= sysctl_max_map_count - 3) return -ENOMEM; - ret = do_munmap(mm, new_addr, new_len, uf_unmap_early); - if (ret) - goto out; + if (flags & MREMAP_FIXED) { + ret = do_munmap(mm, new_addr, new_len, uf_unmap_early); + if (ret) + goto out; + } if (old_len >= new_len) { ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap); @@ -545,13 +568,22 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, old_len = new_len; } - vma = vma_to_resize(addr, old_len, new_len, &charged); + vma = vma_to_resize(addr, old_len, new_len, 
flags, &charged); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; } - map_flags = MAP_FIXED; + /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */ + if (flags & MREMAP_DONTUNMAP && + !may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) { + ret = -ENOMEM; + goto out; + } + + if (flags & MREMAP_FIXED) + map_flags |= MAP_FIXED; + if (vma->vm_flags & VM_MAYSHARE) map_flags |= MAP_SHARED; @@ -561,10 +593,16 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (IS_ERR_VALUE(ret)) goto out1; - ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf, + /* We got a new mapping */ + if (!(flags & MREMAP_FIXED)) + new_addr = ret; + + ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf, uf_unmap); + if (!(offset_in_page(ret))) goto out; + out1: vm_unacct_memory(charged); @@ -618,12 +656,21 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, */ addr = untagged_addr(addr); - if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP)) return ret; if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE)) return ret; + /* + * MREMAP_DONTUNMAP is always a move and it does not allow resizing + * in the process. + */ + if (flags & MREMAP_DONTUNMAP && + (!(flags & MREMAP_MAYMOVE) || old_len != new_len)) + return ret; + + if (offset_in_page(addr)) return ret; @@ -641,9 +688,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (down_write_killable(¤t->mm->mmap_sem)) return -EINTR; - if (flags & MREMAP_FIXED) { + if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) { ret = mremap_to(addr, old_len, new_addr, new_len, - &locked, &uf, &uf_unmap_early, &uf_unmap); + &locked, flags, &uf, &uf_unmap_early, + &uf_unmap); goto out; } @@ -671,7 +719,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* * Ok, we need to grow.. 
*/ - vma = vma_to_resize(addr, old_len, new_len, &charged); + vma = vma_to_resize(addr, old_len, new_len, flags, &charged); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; @@ -721,7 +769,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, } ret = move_vma(vma, addr, old_len, new_len, new_addr, - &locked, &uf, &uf_unmap); + &locked, flags, &uf, &uf_unmap); } out: if (offset_in_page(ret)) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2caf780a42e7..7326b54ab728 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2182,12 +2182,12 @@ int write_cache_pages(struct address_space *mapping, if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { + tag_pages_for_writeback(mapping, index, end); tag = PAGECACHE_TAG_TOWRITE; - else + } else { tag = PAGECACHE_TAG_DIRTY; - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag_pages_for_writeback(mapping, index, end); + } done_index = index; while (!done && (index <= end)) { int i; @@ -2655,7 +2655,7 @@ int clear_page_dirty_for_io(struct page *page) struct address_space *mapping = page_mapping(page); int ret = 0; - BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); if (mapping && mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; @@ -2764,7 +2764,7 @@ int test_clear_page_writeback(struct page *page) int __test_set_page_writeback(struct page *page, bool keep_write) { struct address_space *mapping = page_mapping(page); - int ret; + int ret, access_ret; lock_page_memcg(page); if (mapping && mapping_use_writeback_tags(mapping)) { @@ -2807,6 +2807,13 @@ int __test_set_page_writeback(struct page *page, bool keep_write) inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); } unlock_page_memcg(page); + access_ret = arch_make_page_accessible(page); + /* + * If writeback has been triggered on a page that cannot be made + * accessible, it is too late to recover here. + */ + VM_BUG_ON_PAGE(access_ret != 0, page); + return ret; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3c4eb750a199..e5f76da8cd4e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -95,7 +95,6 @@ DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key); */ DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ EXPORT_PER_CPU_SYMBOL(_numa_mem_); -int _node_numa_mem_[MAX_NUMNODES]; #endif /* work_structs for global per-cpu drains */ @@ -689,6 +688,8 @@ void prep_compound_page(struct page *page, unsigned int order) set_compound_head(p, page); } atomic_set(compound_mapcount_ptr(page), -1); + if (hpage_pincount_available(page)) + atomic_set(compound_pincount_ptr(page), 0); } #ifdef CONFIG_DEBUG_PAGEALLOC @@ -791,32 +792,25 @@ static inline void set_page_order(struct page *page, unsigned int order) * * For recording page's order, we use page_private(page). 
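To put the new MREMAP_DONTUNMAP checks in mm/mremap.c in context: the flag is only accepted together with MREMAP_MAYMOVE, with old_len equal to new_len, and on private anonymous mappings, and it leaves the old range mapped (as I understand it, the pages are moved away, so the old range faults in fresh zero-fill pages, or userfaultfd events, on the next touch). A hedged user-space example; the fallback #define uses what I believe is the uapi value, and kernels without this series will simply fail the call:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MREMAP_DONTUNMAP
#define MREMAP_DONTUNMAP 4      /* uapi value, in case the libc headers lag */
#endif

int main(void)
{
        size_t len = 2 * 1024 * 1024;
        char *old, *new;

        /* Private anonymous only, and the length must stay the same. */
        old = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (old == MAP_FAILED)
                return 1;
        memset(old, 0xaa, len);

        new = mremap(old, len, len, MREMAP_MAYMOVE | MREMAP_DONTUNMAP);
        if (new == MAP_FAILED) {
                perror("mremap");       /* e.g. kernel without the flag */
                return 1;
        }

        printf("pages moved to %p; old range %p is still mapped but empty\n",
               new, old);
        return 0;
}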
*/ -static inline int page_is_buddy(struct page *page, struct page *buddy, +static inline bool page_is_buddy(struct page *page, struct page *buddy, unsigned int order) { - if (page_is_guard(buddy) && page_order(buddy) == order) { - if (page_zone_id(page) != page_zone_id(buddy)) - return 0; - - VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + if (!page_is_guard(buddy) && !PageBuddy(buddy)) + return false; - return 1; - } + if (page_order(buddy) != order) + return false; - if (PageBuddy(buddy) && page_order(buddy) == order) { - /* - * zone check is done late to avoid uselessly - * calculating zone/node ids for pages that could - * never merge. - */ - if (page_zone_id(page) != page_zone_id(buddy)) - return 0; + /* + * zone check is done late to avoid uselessly calculating + * zone/node ids for pages that could never merge. + */ + if (page_zone_id(page) != page_zone_id(buddy)) + return false; - VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); - return 1; - } - return 0; + return true; } #ifdef CONFIG_COMPACTION @@ -1152,7 +1146,7 @@ static __always_inline bool free_pages_prepare(struct page *page, if (PageMappingFlags(page)) page->mapping = NULL; if (memcg_kmem_enabled() && PageKmemcg(page)) - __memcg_kmem_uncharge(page, order); + __memcg_kmem_uncharge_page(page, order); if (check_free) bad += free_pages_check(page); if (bad) @@ -3459,8 +3453,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, return true; } #endif - if (alloc_harder && - !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) + if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC)) return true; } return false; @@ -3535,10 +3528,13 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) static inline unsigned int alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) { - unsigned int alloc_flags = 0; + unsigned int alloc_flags; - if (gfp_mask & __GFP_KSWAPD_RECLAIM) - alloc_flags |= ALLOC_KSWAPD; + /* + * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD + * to save a branch. + */ + alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM); #ifdef CONFIG_ZONE_DMA32 if (!zone) @@ -4174,8 +4170,13 @@ gfp_to_alloc_flags(gfp_t gfp_mask) { unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; - /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ + /* + * __GFP_HIGH is assumed to be the same as ALLOC_HIGH + * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD + * to save two branches. + */ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); + BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD); /* * The caller may dip into page reserves a bit more if the caller @@ -4183,7 +4184,8 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). 
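The gfp_to_alloc_flags() and alloc_flags_nofragment() changes above (together with ALLOC_KSWAPD moving to 0x800 in mm/internal.h earlier) rely on ALLOC_HIGH and ALLOC_KSWAPD having exactly the same bit values as __GFP_HIGH and __GFP_KSWAPD_RECLAIM, so both flags can be transferred with a single mask instead of two conditional branches, with BUILD_BUG_ON() guarding the assumption. A stand-alone illustration of the trick with invented flag values:

#include <assert.h>

/* Invented flags; the point is only that the bit values are kept equal. */
#define GFP_HIGH        0x20u
#define GFP_KSWAPD      0x800u
#define ALLOC_HIGH      0x20u   /* must equal GFP_HIGH   */
#define ALLOC_KSWAPD    0x800u  /* must equal GFP_KSWAPD */

static unsigned int to_alloc_flags(unsigned int gfp_mask)
{
        /* one mask-and-copy replaces two if () branches */
        return gfp_mask & (GFP_HIGH | GFP_KSWAPD);
}

int main(void)
{
        _Static_assert(GFP_HIGH == ALLOC_HIGH, "bit values must match");
        _Static_assert(GFP_KSWAPD == ALLOC_KSWAPD, "bit values must match");

        assert(to_alloc_flags(GFP_KSWAPD | 0x4u) == (unsigned int)ALLOC_KSWAPD);
        assert(to_alloc_flags(GFP_HIGH) == (unsigned int)ALLOC_HIGH);
        return 0;
}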
*/ - alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); + alloc_flags |= (__force int) + (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); if (gfp_mask & __GFP_ATOMIC) { /* @@ -4200,9 +4202,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask) } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; - if (gfp_mask & __GFP_KSWAPD_RECLAIM) - alloc_flags |= ALLOC_KSWAPD; - #ifdef CONFIG_CMA if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; @@ -4745,14 +4744,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, * Restore the original nodemask if it was potentially replaced with * &cpuset_current_mems_allowed to optimize the fast-path attempt. */ - if (unlikely(ac.nodemask != nodemask)) - ac.nodemask = nodemask; + ac.nodemask = nodemask; page = __alloc_pages_slowpath(alloc_mask, order, &ac); out: if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && - unlikely(__memcg_kmem_charge(page, gfp_mask, order) != 0)) { + unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) { __free_pages(page, order); page = NULL; } @@ -7867,8 +7865,8 @@ int __meminit init_per_zone_wmark_min(void) min_free_kbytes = new_min_free_kbytes; if (min_free_kbytes < 128) min_free_kbytes = 128; - if (min_free_kbytes > 65536) - min_free_kbytes = 65536; + if (min_free_kbytes > 262144) + min_free_kbytes = 262144; } else { pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", new_min_free_kbytes, user_min_free_kbytes); @@ -8253,15 +8251,20 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page, /* * Hugepages are not in LRU lists, but they're movable. + * THPs are on the LRU, but need to be counted as #small pages. * We need not scan over tail pages because we don't * handle each tail page individually in migration. 
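For the tail-page skipping in has_unmovable_pages() above, the arithmetic is simply: from wherever the scan entered the compound page, jump over whatever remains of it in one step. A throwaway example with assumed numbers (a 512-page THP entered at its 10th subpage):

#include <stdio.h>

int main(void)
{
        unsigned long compound_nr    = 512;     /* subpages in the THP  */
        unsigned long offset_in_head = 10;      /* page - head at entry */
        unsigned long skip_pages     = compound_nr - offset_in_head;

        /* matches "iter += skip_pages - 1", the loop's own ++ does the rest */
        printf("advance the iterator by %lu\n", skip_pages - 1);  /* 501 */
        return 0;
}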
*/ - if (PageHuge(page)) { + if (PageHuge(page) || PageTransCompound(page)) { struct page *head = compound_head(page); unsigned int skip_pages; - if (!hugepage_migration_supported(page_hstate(head))) + if (PageHuge(page)) { + if (!hugepage_migration_supported(page_hstate(head))) + return page; + } else if (!PageLRU(head) && !__PageMovable(head)) { return page; + } skip_pages = compound_nr(head) - (page - head); iter += skip_pages - 1; @@ -8402,6 +8405,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .ignore_skip_hint = true, .no_set_skip_hint = true, .gfp_mask = current_gfp_context(gfp_mask), + .alloc_contig = true, }; INIT_LIST_HEAD(&cc.migratepages); diff --git a/mm/page_counter.c b/mm/page_counter.c index de31470655f6..c56db2d5e159 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -17,29 +17,24 @@ static void propagate_protected_usage(struct page_counter *c, unsigned long usage) { unsigned long protected, old_protected; + unsigned long low, min; long delta; if (!c->parent) return; - if (c->min || atomic_long_read(&c->min_usage)) { - if (usage <= c->min) - protected = usage; - else - protected = 0; - + min = READ_ONCE(c->min); + if (min || atomic_long_read(&c->min_usage)) { + protected = min(usage, min); old_protected = atomic_long_xchg(&c->min_usage, protected); delta = protected - old_protected; if (delta) atomic_long_add(delta, &c->parent->children_min_usage); } - if (c->low || atomic_long_read(&c->low_usage)) { - if (usage <= c->low) - protected = usage; - else - protected = 0; - + low = READ_ONCE(c->low); + if (low || atomic_long_read(&c->low_usage)) { + protected = min(usage, low); old_protected = atomic_long_xchg(&c->low_usage, protected); delta = protected - old_protected; if (delta) @@ -213,7 +208,7 @@ void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; - counter->min = nr_pages; + WRITE_ONCE(counter->min, nr_pages); for (c = counter; c; c = c->parent) propagate_protected_usage(c, atomic_long_read(&c->usage)); @@ -230,7 +225,7 @@ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; - counter->low = nr_pages; + WRITE_ONCE(counter->low, nr_pages); for (c = counter; c; c = c->parent) propagate_protected_usage(c, atomic_long_read(&c->usage)); diff --git a/mm/page_ext.c b/mm/page_ext.c index 4ade843ff588..08ded037f89f 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -304,7 +304,7 @@ static int __meminit online_page_ext(unsigned long start_pfn, } for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { - if (!pfn_present(pfn)) + if (!pfn_in_present_section(pfn)) continue; fail = init_section_page_ext(pfn, nid); } diff --git a/mm/rmap.c b/mm/rmap.c index b3e381919835..2df75a119c83 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -22,9 +22,10 @@ * * inode->i_mutex (while writing or truncating, not reading or faulting) * mm->mmap_sem - * page->flags PG_locked (lock_page) + * page->flags PG_locked (lock_page) * (see huegtlbfs below) * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) * mapping->i_mmap_rwsem + * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) * anon_vma->rwsem * mm->page_table_lock or pte_lock * pgdat->lru_lock (in mark_page_accessed, isolate_lru_page) @@ -43,6 +44,11 @@ * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) * ->tasklist_lock * pte map lock + * + * * hugetlbfs PageHuge() pages take locks in this order: + * mapping->i_mmap_rwsem + * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) + * 
page->flags PG_locked (lock_page) */ #include <linux/mm.h> @@ -1178,6 +1184,9 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON_PAGE(!PageTransHuge(page), page); /* increment count (starts at -1) */ atomic_set(compound_mapcount_ptr(page), 0); + if (hpage_pincount_available(page)) + atomic_set(compound_pincount_ptr(page), 0); + __inc_node_page_state(page, NR_ANON_THPS); } else { /* Anon THP always mapped first with PMD */ @@ -1406,6 +1415,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* * If sharing is possible, start and end will be adjusted * accordingly. + * + * If called for a huge page, caller must hold i_mmap_rwsem + * in write mode as it is possible to call huge_pmd_unshare. */ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); @@ -1453,6 +1465,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, address = pvmw.address; if (PageHuge(page)) { + /* + * To call huge_pmd_unshare, i_mmap_rwsem must be + * held in write mode. Caller needs to explicitly + * do this outside rmap routines. + */ + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); if (huge_pmd_unshare(mm, &address, pvmw.pte)) { /* * huge_pmd_unshare unmapped an entire PMD @@ -1696,23 +1714,9 @@ discard: return ret; } -bool is_vma_temporary_stack(struct vm_area_struct *vma) -{ - int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); - - if (!maybe_stack) - return false; - - if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == - VM_STACK_INCOMPLETE_SETUP) - return true; - - return false; -} - static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) { - return is_vma_temporary_stack(vma); + return vma_is_temporary_stack(vma); } static int page_mapcount_is_zero(struct page *page) @@ -1974,6 +1978,9 @@ void hugepage_add_new_anon_rmap(struct page *page, { BUG_ON(address < vma->vm_start || address >= vma->vm_end); atomic_set(compound_mapcount_ptr(page), 0); + if (hpage_pincount_available(page)) + atomic_set(compound_pincount_ptr(page), 0); + __page_set_anon_rmap(page, vma, address, 1); } #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/mm/shuffle.c b/mm/shuffle.c index b3fe97fd6654..c716059cbd3c 100644 --- a/mm/shuffle.c +++ b/mm/shuffle.c @@ -72,7 +72,7 @@ static struct page * __meminit shuffle_valid_page(unsigned long pfn, int order) return NULL; /* ...is the pfn in a present section or a hole? */ - if (!pfn_present(pfn)) + if (!pfn_in_present_section(pfn)) return NULL; /* ...is the page free and currently on a free_area list? 
*/ diff --git a/mm/slab.h b/mm/slab.h index 7e94700aa78c..207c83ef6e06 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -348,6 +348,7 @@ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { + unsigned int nr_pages = 1 << order; struct mem_cgroup *memcg; struct lruvec *lruvec; int ret; @@ -360,21 +361,21 @@ static __always_inline int memcg_charge_slab(struct page *page, if (unlikely(!memcg || mem_cgroup_is_root(memcg))) { mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), - (1 << order)); - percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order); + nr_pages); + percpu_ref_get_many(&s->memcg_params.refcnt, nr_pages); return 0; } - ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); + ret = memcg_kmem_charge(memcg, gfp, nr_pages); if (ret) goto out; lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page)); - mod_lruvec_state(lruvec, cache_vmstat_idx(s), 1 << order); + mod_lruvec_state(lruvec, cache_vmstat_idx(s), nr_pages); /* transer try_charge() page references to kmem_cache */ - percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order); - css_put_many(&memcg->css, 1 << order); + percpu_ref_get_many(&s->memcg_params.refcnt, nr_pages); + css_put_many(&memcg->css, nr_pages); out: css_put(&memcg->css); return ret; @@ -387,6 +388,7 @@ out: static __always_inline void memcg_uncharge_slab(struct page *page, int order, struct kmem_cache *s) { + unsigned int nr_pages = 1 << order; struct mem_cgroup *memcg; struct lruvec *lruvec; @@ -394,15 +396,15 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, memcg = READ_ONCE(s->memcg_params.memcg); if (likely(!mem_cgroup_is_root(memcg))) { lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page)); - mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order)); - memcg_kmem_uncharge_memcg(page, order, memcg); + mod_lruvec_state(lruvec, cache_vmstat_idx(s), -nr_pages); + memcg_kmem_uncharge(memcg, nr_pages); } else { mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), - -(1 << order)); + -nr_pages); } rcu_read_unlock(); - percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order); + percpu_ref_put_many(&s->memcg_params.refcnt, nr_pages); } extern void slab_init_memcg_params(struct kmem_cache *); diff --git a/mm/slab_common.c b/mm/slab_common.c index 1907cb2903c7..5282f881d2f5 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1521,7 +1521,7 @@ void dump_unreclaimable_slab(void) mutex_unlock(&slab_mutex); } -#if defined(CONFIG_MEMCG) +#if defined(CONFIG_MEMCG_KMEM) void *memcg_slab_start(struct seq_file *m, loff_t *pos) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); diff --git a/mm/slub.c b/mm/slub.c index 6589b41d5a60..3098e0cf2899 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -259,7 +259,7 @@ static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr, * freepointer to be restored incorrectly. 
*/ return (void *)((unsigned long)ptr ^ s->random ^ - (unsigned long)kasan_reset_tag((void *)ptr_addr)); + swab((unsigned long)kasan_reset_tag((void *)ptr_addr))); #else return ptr; #endif @@ -2205,11 +2205,11 @@ static void unfreeze_partials(struct kmem_cache *s, struct kmem_cache_node *n = NULL, *n2 = NULL; struct page *page, *discard_page = NULL; - while ((page = c->partial)) { + while ((page = slub_percpu_partial(c))) { struct page new; struct page old; - c->partial = page->next; + slub_set_percpu_partial(c, page); n2 = get_node(s, page_to_nid(page)); if (n != n2) { @@ -2282,7 +2282,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) if (oldpage) { pobjects = oldpage->pobjects; pages = oldpage->pages; - if (drain && pobjects > s->cpu_partial) { + if (drain && pobjects > slub_cpu_partial(s)) { unsigned long flags; /* * partial array is full. Move the existing @@ -2307,7 +2307,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); - if (unlikely(!s->cpu_partial)) { + if (unlikely(!slub_cpu_partial(s))) { unsigned long flags; local_irq_save(flags); @@ -3512,15 +3512,15 @@ static void set_cpu_partial(struct kmem_cache *s) * 50% to keep some capacity around for frees. */ if (!kmem_cache_has_cpu_partial(s)) - s->cpu_partial = 0; + slub_set_cpu_partial(s, 0); else if (s->size >= PAGE_SIZE) - s->cpu_partial = 2; + slub_set_cpu_partial(s, 2); else if (s->size >= 1024) - s->cpu_partial = 6; + slub_set_cpu_partial(s, 6); else if (s->size >= 256) - s->cpu_partial = 13; + slub_set_cpu_partial(s, 13); else - s->cpu_partial = 30; + slub_set_cpu_partial(s, 30); #endif } @@ -3581,6 +3581,13 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) */ s->offset = size; size += sizeof(void *); + } else if (size > sizeof(void *)) { + /* + * Store freelist pointer near middle of object to keep + * it away from the edges of the object to avoid small + * sized over/underflows from neighboring allocations. 
+ */ + s->offset = ALIGN(size / 2, sizeof(void *)); } #ifdef CONFIG_SLUB_DEBUG diff --git a/mm/sparse.c b/mm/sparse.c index 65599e8bd636..f1af4d4ee80b 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -664,35 +664,14 @@ static void free_map_bootmem(struct page *memmap) struct page * __meminit populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap) { - struct page *page, *ret; - unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION; - - page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size)); - if (page) - goto got_map_page; - - ret = vmalloc(memmap_size); - if (ret) - goto got_map_ptr; - - return NULL; -got_map_page: - ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); -got_map_ptr: - - return ret; + return kvmalloc_node(array_size(sizeof(struct page), + PAGES_PER_SECTION), GFP_KERNEL, nid); } static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { - struct page *memmap = pfn_to_page(pfn); - - if (is_vmalloc_addr(memmap)) - vfree(memmap); - else - free_pages((unsigned long)memmap, - get_order(sizeof(struct page) * PAGES_PER_SECTION)); + kvfree(pfn_to_page(pfn)); } static void free_map_bootmem(struct page *memmap) @@ -894,7 +873,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, /* Align memmap to section boundary in the subsection case */ if (section_nr_to_pfn(section_nr) != start_pfn) - memmap = pfn_to_kaddr(section_nr_to_pfn(section_nr)); + memmap = pfn_to_page(section_nr_to_pfn(section_nr)); sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0); return 0; diff --git a/mm/swap.c b/mm/swap.c index cf39d24ada2a..a4af8c999963 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -931,7 +931,6 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, VM_BUG_ON_PAGE(PageLRU(page), page); - SetPageLRU(page); /* * Page becomes evictable in two ways: * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()]. @@ -958,7 +957,8 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, * looking at the same page) and the evictable page will be stranded * in an unevictable LRU. 
*/ - smp_mb(); + SetPageLRU(page); + smp_mb__after_atomic(); if (page_evictable(page)) { lru = page_lru(page); @@ -986,7 +986,6 @@ void __pagevec_lru_add(struct pagevec *pvec) { pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL); } -EXPORT_SYMBOL(__pagevec_lru_add); /** * pagevec_lookup_entries - gang pagecache lookup diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 63a7b4563a57..0975adc72253 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -309,7 +309,7 @@ direct_free: swp_entry_t get_swap_page(struct page *page) { - swp_entry_t entry, *pentry; + swp_entry_t entry; struct swap_slots_cache *cache; entry.val = 0; @@ -336,13 +336,11 @@ swp_entry_t get_swap_page(struct page *page) if (cache->slots) { repeat: if (cache->nr) { - pentry = &cache->slots[cache->cur++]; - entry = *pentry; - pentry->val = 0; + entry = cache->slots[cache->cur]; + cache->slots[cache->cur++].val = 0; cache->nr--; - } else { - if (refill_swap_slots_cache(cache)) - goto repeat; + } else if (refill_swap_slots_cache(cache)) { + goto repeat; } } mutex_unlock(&cache->alloc_lock); diff --git a/mm/swap_state.c b/mm/swap_state.c index 8e7ce9a9bc5e..ebed37bbf7a3 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -116,7 +116,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp) struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swp_offset(entry); XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page)); - unsigned long i, nr = compound_nr(page); + unsigned long i, nr = hpage_nr_pages(page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapCache(page), page); diff --git a/mm/swapfile.c b/mm/swapfile.c index be33e6176cd9..273a923c275c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2132,7 +2132,7 @@ int try_to_unuse(unsigned int type, bool frontswap, swp_entry_t entry; unsigned int i; - if (!si->inuse_pages) + if (!READ_ONCE(si->inuse_pages)) return 0; if (!frontswap) @@ -2148,7 +2148,7 @@ retry: spin_lock(&mmlist_lock); p = &init_mm.mmlist; - while (si->inuse_pages && + while (READ_ONCE(si->inuse_pages) && !signal_pending(current) && (p = p->next) != &init_mm.mmlist) { @@ -2177,7 +2177,7 @@ retry: mmput(prev_mm); i = 0; - while (si->inuse_pages && + while (READ_ONCE(si->inuse_pages) && !signal_pending(current) && (i = find_next_to_unuse(si, i, frontswap)) != 0) { @@ -2219,7 +2219,7 @@ retry: * been preempted after get_swap_page(), temporarily hiding that swap. * It's easy and robust (though cpu-intensive) just to keep retrying. */ - if (si->inuse_pages) { + if (READ_ONCE(si->inuse_pages)) { if (!signal_pending(current)) goto retry; retval = -EINTR; @@ -3475,7 +3475,7 @@ int swap_duplicate(swp_entry_t entry) * * Called when allocating swap cache for existing swap entry, * This can return error codes. Returns 0 at success. - * -EBUSY means there is a swap cache. + * -EEXIST means there is a swap cache. * Note: return code is different from swap_duplicate(). */ int swapcache_prepare(swp_entry_t entry) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 1b0d7abad1d4..bd96855f3961 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -276,10 +276,14 @@ retry: BUG_ON(dst_addr >= dst_start + len); /* - * Serialize via hugetlb_fault_mutex + * Serialize via i_mmap_rwsem and hugetlb_fault_mutex. + * i_mmap_rwsem ensures the dst_pte remains valid even + * in the case of shared pmds. fault mutex prevents + * races with other faulting threads. 
*/ - idx = linear_page_index(dst_vma, dst_addr); mapping = dst_vma->vm_file->f_mapping; + i_mmap_lock_read(mapping); + idx = linear_page_index(dst_vma, dst_addr); hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -287,6 +291,7 @@ retry: dst_pte = huge_pte_alloc(dst_mm, dst_addr, vma_hpagesize); if (!dst_pte) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); goto out_unlock; } @@ -294,6 +299,7 @@ retry: dst_pteval = huge_ptep_get(dst_pte); if (!huge_pte_none(dst_pteval)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); goto out_unlock; } @@ -301,6 +307,7 @@ retry: dst_addr, src_addr, &page); mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); vm_alloc_shared = vm_shared; cond_resched(); diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 4bac22fe1aa2..d69019fc3789 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -280,7 +280,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, enum vmpressure_levels level; /* For now, no users for root-level efficiency */ - if (!memcg || memcg == root_mem_cgroup) + if (!memcg || mem_cgroup_is_root(memcg)) return; spin_lock(&vmpr->sr_lock); @@ -371,10 +371,8 @@ int vmpressure_register_event(struct mem_cgroup *memcg, int ret = 0; spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL); - if (!spec) { - ret = -ENOMEM; - goto out; - } + if (!spec) + return -ENOMEM; /* Find required level */ token = strsep(&spec, ","); diff --git a/mm/vmscan.c b/mm/vmscan.c index 876370565455..2e8e690d2813 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1084,9 +1084,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, while (!list_empty(page_list)) { struct address_space *mapping; struct page *page; - int may_enter_fs; enum page_references references = PAGEREF_RECLAIM; - bool dirty, writeback; + bool dirty, writeback, may_enter_fs; unsigned int nr_pages; cond_resched(); @@ -1267,7 +1266,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto activate_locked_split; } - may_enter_fs = 1; + may_enter_fs = true; /* Adding to swap updated mapping */ mapping = page_mapping(page); @@ -2096,7 +2095,7 @@ static void shrink_active_list(unsigned long nr_to_scan, unsigned long reclaim_pages(struct list_head *page_list) { - int nid = -1; + int nid = NUMA_NO_NODE; unsigned long nr_reclaimed = 0; LIST_HEAD(node_page_list); struct reclaim_stat dummy_stat; @@ -2111,7 +2110,7 @@ unsigned long reclaim_pages(struct list_head *page_list) while (!list_empty(page_list)) { page = lru_to_page(page_list); - if (nid == -1) { + if (nid == NUMA_NO_NODE) { nid = page_to_nid(page); INIT_LIST_HEAD(&node_page_list); } @@ -2132,7 +2131,7 @@ unsigned long reclaim_pages(struct list_head *page_list) putback_lru_page(page); } - nid = -1; + nid = NUMA_NO_NODE; } if (!list_empty(&node_page_list)) { @@ -2427,10 +2426,8 @@ out: case SCAN_FILE: case SCAN_ANON: /* Scan one type exclusively */ - if ((scan_balance == SCAN_FILE) != file) { - lruvec_size = 0; + if ((scan_balance == SCAN_FILE) != file) scan = 0; - } break; default: /* Look ma, no brain */ @@ -3096,7 +3093,6 @@ retry: if (sc->memcg_low_skipped) { sc->priority = initial_priority; sc->force_deactivate = 0; - sc->skipped_deactivate = 0; sc->memcg_low_reclaim = 1; sc->memcg_low_skipped = 0; goto retry; @@ -3136,8 +3132,9 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && 
waitqueue_active(&pgdat->kswapd_wait)) { - pgdat->kswapd_classzone_idx = min(pgdat->kswapd_classzone_idx, - (enum zone_type)ZONE_NORMAL); + if (READ_ONCE(pgdat->kswapd_classzone_idx) > ZONE_NORMAL) + WRITE_ONCE(pgdat->kswapd_classzone_idx, ZONE_NORMAL); + wake_up_interruptible(&pgdat->kswapd_wait); } @@ -3769,9 +3766,9 @@ out: static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat, enum zone_type prev_classzone_idx) { - if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES) - return prev_classzone_idx; - return pgdat->kswapd_classzone_idx; + enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx); + + return curr_idx == MAX_NR_ZONES ? prev_classzone_idx : curr_idx; } static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, @@ -3815,8 +3812,11 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * the previous request that slept prematurely. */ if (remaining) { - pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); - pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order); + WRITE_ONCE(pgdat->kswapd_classzone_idx, + kswapd_classzone_idx(pgdat, classzone_idx)); + + if (READ_ONCE(pgdat->kswapd_order) < reclaim_order) + WRITE_ONCE(pgdat->kswapd_order, reclaim_order); } finish_wait(&pgdat->kswapd_wait, &wait); @@ -3893,12 +3893,12 @@ static int kswapd(void *p) tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; set_freezable(); - pgdat->kswapd_order = 0; - pgdat->kswapd_classzone_idx = MAX_NR_ZONES; + WRITE_ONCE(pgdat->kswapd_order, 0); + WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES); for ( ; ; ) { bool ret; - alloc_order = reclaim_order = pgdat->kswapd_order; + alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); kswapd_try_sleep: @@ -3906,10 +3906,10 @@ kswapd_try_sleep: classzone_idx); /* Read the new order and classzone_idx */ - alloc_order = reclaim_order = pgdat->kswapd_order; + alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); - pgdat->kswapd_order = 0; - pgdat->kswapd_classzone_idx = MAX_NR_ZONES; + WRITE_ONCE(pgdat->kswapd_order, 0); + WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES); ret = try_to_freeze(); if (kthread_should_stop()) @@ -3953,20 +3953,23 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, enum zone_type classzone_idx) { pg_data_t *pgdat; + enum zone_type curr_idx; if (!managed_zone(zone)) return; if (!cpuset_zone_allowed(zone, gfp_flags)) return; + pgdat = zone->zone_pgdat; + curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx); + + if (curr_idx == MAX_NR_ZONES || curr_idx < classzone_idx) + WRITE_ONCE(pgdat->kswapd_classzone_idx, classzone_idx); + + if (READ_ONCE(pgdat->kswapd_order) < order) + WRITE_ONCE(pgdat->kswapd_order, order); - if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES) - pgdat->kswapd_classzone_idx = classzone_idx; - else - pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, - classzone_idx); - pgdat->kswapd_order = max(pgdat->kswapd_order, order); if (!waitqueue_active(&pgdat->kswapd_wait)) return; @@ -4030,27 +4033,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) } #endif /* CONFIG_HIBERNATION */ -/* It's optimal to keep kswapds on the same CPUs as their memory, but - not required for correctness. So if the last cpu in a node goes - away, we get changed to run anywhere: as the first one comes back, - restore their cpu bindings. 
*/ -static int kswapd_cpu_online(unsigned int cpu) -{ - int nid; - - for_each_node_state(nid, N_MEMORY) { - pg_data_t *pgdat = NODE_DATA(nid); - const struct cpumask *mask; - - mask = cpumask_of_node(pgdat->node_id); - - if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) - /* One of our CPUs online: restore mask */ - set_cpus_allowed_ptr(pgdat->kswapd, mask); - } - return 0; -} - /* * This kswapd start function will be called by init and node-hot-add. * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. @@ -4090,15 +4072,11 @@ void kswapd_stop(int nid) static int __init kswapd_init(void) { - int nid, ret; + int nid; swap_setup(); for_each_node_state(nid, N_MEMORY) kswapd_run(nid); - ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "mm/vmscan:online", kswapd_cpu_online, - NULL); - WARN_ON(ret < 0); return 0; } @@ -4277,29 +4255,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) } #endif -/* - * page_evictable - test whether a page is evictable - * @page: the page to test - * - * Test whether page is evictable--i.e., should be placed on active/inactive - * lists vs unevictable list. - * - * Reasons page might not be evictable: - * (1) page's mapping marked unevictable - * (2) page is part of an mlocked VMA - * - */ -int page_evictable(struct page *page) -{ - int ret; - - /* Prevent address_space of inode and swap cache from being freed */ - rcu_read_lock(); - ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); - rcu_read_unlock(); - return ret; -} - /** * check_move_unevictable_pages - check pages for evictability and move to * appropriate zone lru list diff --git a/mm/vmstat.c b/mm/vmstat.c index 78d53378db99..c9c0d71f917f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1168,6 +1168,8 @@ const char * const vmstat_text[] = { "nr_dirtied", "nr_written", "nr_kernel_misc_reclaimable", + "nr_foll_pin_acquired", + "nr_foll_pin_released", /* enum writeback_stat_item counters */ "nr_dirty_threshold", |
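
A few of the hunks above lend themselves to short stand-alone illustrations. The sketches that follow are user-space models written for this page, not kernel code; any name in them that does not appear in the diff is an assumption. First, the page_counter.c hunk: propagate_protected_usage() now computes the protected amount simply as min(usage, min) (and min(usage, low)), reads the configured limits once because they can change concurrently, and forwards only the delta against the previously reported value to the parent. A minimal model of that delta propagation, using C11 atomics in place of atomic_long_t:

#include <stdatomic.h>
#include <stdio.h>

/* User-space model of the reworked propagate_protected_usage():
 * protected = min(usage, min), and only the change against the value
 * reported last time is added to the parent's children_min_usage. */
struct counter {
	struct counter *parent;
	unsigned long min;		/* configured protection */
	atomic_long min_usage;		/* last value reported upward */
	atomic_long children_min_usage;	/* sum reported by the children */
};

static void propagate_min(struct counter *c, unsigned long usage)
{
	unsigned long min = c->min;	/* the kernel uses READ_ONCE() here */
	long protected, old, delta;

	if (!c->parent)
		return;
	if (!min && !atomic_load(&c->min_usage))
		return;

	protected = usage < min ? usage : min;
	old = atomic_exchange(&c->min_usage, protected);
	delta = protected - old;
	if (delta)
		atomic_fetch_add(&c->parent->children_min_usage, delta);
}

int main(void)
{
	struct counter root = { 0 };
	struct counter child = { .parent = &root, .min = 100 };

	propagate_min(&child, 60);	/* reports 60 pages of protected usage */
	propagate_min(&child, 150);	/* reports 100, i.e. a delta of +40 */
	printf("children_min_usage = %ld\n",
	       atomic_load(&root.children_min_usage));
	return 0;
}

Running it prints children_min_usage = 100: the second call raises the reported protection from 60 to the 100-page cap, so only the 40-page delta is added to the parent.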
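
The rmap.c comment block and the userfaultfd.c hunk document the same rule: for hugetlbfs pages, mapping->i_mmap_rwsem is taken first (read mode on the userfaultfd path) and the hashed hugetlb_fault_mutex second, and the two are dropped in reverse order. A small pthread sketch of that ordering under assumed names; fault_mutex_hash() here is a plain modulo, whereas the kernel hashes the mapping and index together:

#include <pthread.h>
#include <stdio.h>

#define FAULT_MUTEX_SLOTS 8

static pthread_rwlock_t i_mmap_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t fault_mutex[FAULT_MUTEX_SLOTS];

static unsigned int fault_mutex_hash(unsigned long idx)
{
	return idx % FAULT_MUTEX_SLOTS;	/* stand-in for the kernel's hash */
}

/* Fault path: mapping-wide lock (shared) first, then the per-index mutex,
 * released in the opposite order. */
static void fault_one(unsigned long idx)
{
	unsigned int hash = fault_mutex_hash(idx);

	pthread_rwlock_rdlock(&i_mmap_rwsem);
	pthread_mutex_lock(&fault_mutex[hash]);

	printf("handling fault at index %lu under both locks\n", idx);

	pthread_mutex_unlock(&fault_mutex[hash]);
	pthread_rwlock_unlock(&i_mmap_rwsem);
}

int main(void)
{
	for (int i = 0; i < FAULT_MUTEX_SLOTS; i++)
		pthread_mutex_init(&fault_mutex[i], NULL);

	fault_one(3);
	fault_one(11);
	return 0;
}

A fixed acquisition order like this is what keeps two paths that need both locks from deadlocking against each other.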
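
The slub.c hunks make two related hardening changes: freelist_ptr() now XORs the stored pointer with the per-cache random value and the byte-swapped address of the slot it is stored in, and calculate_sizes() places the free pointer near the middle of the object (ALIGN(size / 2, sizeof(void *))) when it would otherwise sit at the very start, so small overflows from neighbouring allocations are less likely to hit it. A user-space sketch of both ideas; cache_random, fp_encode() and the 64-bit __builtin_bswap64() are assumptions of the sketch, not kernel interfaces:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* The free pointer is stored as  ptr ^ random ^ byteswap(slot address),
 * so decoding is simply the same XOR applied again. */
static uint64_t cache_random = 0x9e3779b97f4a7c15ULL;	/* stand-in for s->random */

static uint64_t fp_encode(uint64_t ptr, uint64_t slot_addr)
{
	return ptr ^ cache_random ^ __builtin_bswap64(slot_addr);
}

static uint64_t fp_decode(uint64_t stored, uint64_t slot_addr)
{
	return fp_encode(stored, slot_addr);	/* XOR is its own inverse */
}

/* The second change: keep the free pointer near the middle of the object,
 * i.e. ALIGN(size / 2, sizeof(void *)), away from both edges. */
static size_t mid_object_offset(size_t object_size)
{
	size_t a = sizeof(void *);

	return ((object_size / 2) + a - 1) & ~(a - 1);
}

int main(void)
{
	uint64_t obj  = 0xffff888012345678ULL;	/* made-up "next free object" */
	uint64_t slot = 0xffff8880deadbe00ULL;	/* made-up address of the slot */
	uint64_t stored = fp_encode(obj, slot);

	printf("stored  %#llx\n", (unsigned long long)stored);
	printf("decoded %#llx\n", (unsigned long long)fp_decode(stored, slot));
	printf("free pointer offset for a 96-byte object: %zu\n",
	       mid_object_offset(96));
	return 0;
}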
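
The swap_slots.c hunk simplifies the per-CPU slot cache pop in get_swap_page(): the entry is taken straight out of slots[cur], that slot is cleared, and an empty cache falls through to a refill and retries. A toy model of that pop-or-refill loop; the cache layout and the refill() stub are illustrative only:

#include <stdio.h>

#define CACHE_SIZE 4

struct slot_cache {
	unsigned long slots[CACHE_SIZE];
	int cur;
	int nr;
};

/* Stand-in for refill_swap_slots_cache(): pretend the first refill finds
 * two entries and later refills find nothing. */
static int refill(struct slot_cache *cache)
{
	static int calls;

	if (calls++)
		return 0;
	cache->slots[0] = 100;
	cache->slots[1] = 101;
	cache->cur = 0;
	cache->nr = 2;
	return 1;
}

/* Pop the next cached entry, refilling once if the cache is empty. */
static unsigned long get_entry(struct slot_cache *cache)
{
	unsigned long entry = 0;

repeat:
	if (cache->nr) {
		entry = cache->slots[cache->cur];
		cache->slots[cache->cur++] = 0;	/* consume the slot */
		cache->nr--;
	} else if (refill(cache)) {
		goto repeat;
	}
	return entry;
}

int main(void)
{
	struct slot_cache cache = { 0 };

	for (int i = 0; i < 4; i++)
		printf("entry: %lu\n", get_entry(&cache));	/* 100, 101, 0, 0 */
	return 0;
}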
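
The vmscan.c hunks wrap every unlocked access to pgdat->kswapd_classzone_idx and pgdat->kswapd_order in READ_ONCE()/WRITE_ONCE(), and kswapd_classzone_idx() now reads the field into a local exactly once before deciding whether to fall back to the previous index. A simplified user-space sketch of that pattern; the volatile-cast macros below are a rough stand-in for the kernel's (GCC/Clang __typeof__ assumed) and the shared variable name is made up:

#include <stdio.h>

/* Simplified stand-ins: force exactly one access and keep the compiler
 * from re-reading or re-writing the shared location. */
#define READ_ONCE(x)	 (*(volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, v) (*(volatile __typeof__(x) *)&(x) = (v))

#define MAX_NR_ZONES	5	/* sentinel meaning "no request pending" */

static int kswapd_classzone_idx_shared = MAX_NR_ZONES;	/* shared with wakers */

/* Read the shared value once, then either use it or keep the previous idx. */
static int classzone_idx(int prev)
{
	int curr = READ_ONCE(kswapd_classzone_idx_shared);

	return curr == MAX_NR_ZONES ? prev : curr;
}

/* Waker side: only ever raise the pending request, never lower it. */
static void wake_request(int idx)
{
	int curr = READ_ONCE(kswapd_classzone_idx_shared);

	if (curr == MAX_NR_ZONES || curr < idx)
		WRITE_ONCE(kswapd_classzone_idx_shared, idx);
}

int main(void)
{
	wake_request(2);
	printf("classzone idx: %d\n", classzone_idx(0));	/* 2 */
	WRITE_ONCE(kswapd_classzone_idx_shared, MAX_NR_ZONES);
	printf("classzone idx: %d\n", classzone_idx(3));	/* falls back to 3 */
	return 0;
}

The point of the single read is that the sentinel test and the returned value come from the same load, so a concurrent writer can no longer make the function hand back the MAX_NR_ZONES sentinel itself.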