diff options
Diffstat (limited to 'mm')
48 files changed, 1260 insertions, 841 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 0331f1461f81..e3fbd0788878 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -23,7 +23,7 @@ menuconfig SWAP in your computer. If unsure say Y. config ZSWAP - bool "Compressed cache for swap pages (EXPERIMENTAL)" + bool "Compressed cache for swap pages" depends on SWAP select FRONTSWAP select CRYPTO @@ -36,12 +36,6 @@ config ZSWAP in the case where decompressing from RAM is faster than swap device reads, can also improve workload performance. - This is marked experimental because it is a new feature (as of - v3.11) that interacts heavily with memory reclaim. While these - interactions don't cause any known issues on simple memory setups, - they have not be fully explored on the large set of potential - configurations and workloads that exist. - config ZSWAP_DEFAULT_ON bool "Enable the compressed cache for swap pages by default" depends on ZSWAP diff --git a/mm/backing-dev.c b/mm/backing-dev.c index de65cb1e5f76..c30419a5e119 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -776,8 +776,6 @@ static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) int bdi_init(struct backing_dev_info *bdi) { - int ret; - bdi->dev = NULL; kref_init(&bdi->refcnt); @@ -788,9 +786,7 @@ int bdi_init(struct backing_dev_info *bdi) INIT_LIST_HEAD(&bdi->wb_list); init_waitqueue_head(&bdi->wb_waitq); - ret = cgwb_bdi_init(bdi); - - return ret; + return cgwb_bdi_init(bdi); } struct backing_dev_info *bdi_alloc(int node_id) diff --git a/mm/cma_debug.c b/mm/cma_debug.c index c3ffe253e055..602fff89b15f 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -163,11 +163,8 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry) { struct dentry *tmp; - char name[CMA_MAX_NAME]; - scnprintf(name, sizeof(name), "cma-%s", cma->name); - - tmp = debugfs_create_dir(name, root_dentry); + tmp = debugfs_create_dir(cma->name, root_dentry); debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops); debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops); diff --git a/mm/compaction.c b/mm/compaction.c index 640fa76228dd..262c4676b32c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1981,9 +1981,21 @@ static inline bool is_via_compact_memory(int order) return order == -1; } +/* + * Determine whether kswapd is (or recently was!) running on this node. + * + * pgdat_kswapd_lock() pins pgdat->kswapd, so a concurrent kswapd_stop() can't + * zero it. + */ static bool kswapd_is_running(pg_data_t *pgdat) { - return pgdat->kswapd && task_is_running(pgdat->kswapd); + bool running; + + pgdat_kswapd_lock(pgdat); + running = pgdat->kswapd && task_is_running(pgdat->kswapd); + pgdat_kswapd_unlock(pgdat); + + return running; } /* diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 573669566f84..45db79d28fdc 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -126,7 +126,7 @@ static void damon_test_split_at(struct kunit *test) t = damon_new_target(); r = damon_new_region(0, 100); damon_add_region(r, t); - damon_split_region_at(c, t, r, 25); + damon_split_region_at(t, r, 25); KUNIT_EXPECT_EQ(test, r->ar.start, 0ul); KUNIT_EXPECT_EQ(test, r->ar.end, 25ul); @@ -219,14 +219,14 @@ static void damon_test_split_regions_of(struct kunit *test) t = damon_new_target(); r = damon_new_region(0, 22); damon_add_region(r, t); - damon_split_regions_of(c, t, 2); + damon_split_regions_of(t, 2); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u); damon_free_target(t); t = damon_new_target(); r = damon_new_region(0, 220); damon_add_region(r, t); - damon_split_regions_of(c, t, 4); + damon_split_regions_of(t, 4); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u); damon_free_target(t); damon_destroy_ctx(c); diff --git a/mm/damon/core.c b/mm/damon/core.c index 7d25dc582fe3..9964b9d00768 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -658,9 +658,8 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) } } -static void damon_split_region_at(struct damon_ctx *ctx, - struct damon_target *t, struct damon_region *r, - unsigned long sz_r); +static void damon_split_region_at(struct damon_target *t, + struct damon_region *r, unsigned long sz_r); static bool __damos_valid_target(struct damon_region *r, struct damos *s) { @@ -726,7 +725,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, continue; sz = DAMON_MIN_REGION; } - damon_split_region_at(c, t, r, sz); + damon_split_region_at(t, r, sz); r = damon_next_region(r); sz = r->ar.end - r->ar.start; } @@ -745,7 +744,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, DAMON_MIN_REGION); if (!sz) goto update_stat; - damon_split_region_at(c, t, r, sz); + damon_split_region_at(t, r, sz); } ktime_get_coarse_ts64(&begin); sz_applied = c->ops.apply_scheme(c, t, r, s); @@ -928,9 +927,8 @@ static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold, * r the region to be split * sz_r size of the first sub-region that will be made */ -static void damon_split_region_at(struct damon_ctx *ctx, - struct damon_target *t, struct damon_region *r, - unsigned long sz_r) +static void damon_split_region_at(struct damon_target *t, + struct damon_region *r, unsigned long sz_r) { struct damon_region *new; @@ -947,8 +945,7 @@ static void damon_split_region_at(struct damon_ctx *ctx, } /* Split every region in the given target into 'nr_subs' regions */ -static void damon_split_regions_of(struct damon_ctx *ctx, - struct damon_target *t, int nr_subs) +static void damon_split_regions_of(struct damon_target *t, int nr_subs) { struct damon_region *r, *next; unsigned long sz_region, sz_sub = 0; @@ -969,7 +966,7 @@ static void damon_split_regions_of(struct damon_ctx *ctx, if (sz_sub == 0 || sz_sub >= sz_region) continue; - damon_split_region_at(ctx, t, r, sz_sub); + damon_split_region_at(t, r, sz_sub); sz_region = sz_sub; } } @@ -1004,7 +1001,7 @@ static void kdamond_split_regions(struct damon_ctx *ctx) nr_subregions = 3; damon_for_each_target(t, ctx) - damon_split_regions_of(ctx, t, nr_subregions); + damon_split_regions_of(t, nr_subregions); last_nr_regions = nr_regions; } diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 4e51466c4e74..652a94deafe3 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -1053,7 +1053,7 @@ static int __init __damon_dbgfs_init(void) fops[i]); dbgfs_fill_ctx_dir(dbgfs_root, dbgfs_ctxs[0]); - dbgfs_dirs = kmalloc_array(1, sizeof(dbgfs_root), GFP_KERNEL); + dbgfs_dirs = kmalloc(sizeof(dbgfs_root), GFP_KERNEL); if (!dbgfs_dirs) { debugfs_remove(dbgfs_root); return -ENOMEM; diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index b1335de200e7..f599838b5f64 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -88,7 +88,7 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr) #define DAMON_MAX_SUBSCORE (100) #define DAMON_MAX_AGE_IN_LOG (32) -int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, +int damon_hot_score(struct damon_ctx *c, struct damon_region *r, struct damos *s) { unsigned int max_nr_accesses; @@ -127,48 +127,14 @@ int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, */ hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE; - /* Return coldness of the region */ - return DAMOS_MAX_SCORE - hotness; + return hotness; } -int damon_hot_score(struct damon_ctx *c, struct damon_region *r, +int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, struct damos *s) { - unsigned int max_nr_accesses; - int freq_subscore; - unsigned int age_in_sec; - int age_in_log, age_subscore; - unsigned int freq_weight = s->quota.weight_nr_accesses; - unsigned int age_weight = s->quota.weight_age; - int hotness; - - max_nr_accesses = c->aggr_interval / c->sample_interval; - freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses; + int hotness = damon_hot_score(c, r, s); - age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000; - for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; - age_in_log++, age_in_sec >>= 1) - ; - - /* If frequency is 0, higher age means it's colder */ - if (freq_subscore == 0) - age_in_log *= -1; - - /* - * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG]. - * Scale it to be in [0, 100] and set it as age subscore. - */ - age_in_log += DAMON_MAX_AGE_IN_LOG; - age_subscore = age_in_log * DAMON_MAX_SUBSCORE / - DAMON_MAX_AGE_IN_LOG / 2; - - hotness = (freq_weight * freq_subscore + age_weight * age_subscore); - if (freq_weight + age_weight) - hotness /= freq_weight + age_weight; - /* - * Transform it to fit in [0, DAMOS_MAX_SCORE] - */ - hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE; - - return hotness; + /* Return coldness of the region */ + return DAMOS_MAX_SCORE - hotness; } diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index dc131c6a5403..6b0d9e6aa677 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -166,8 +166,7 @@ out: return result.accessed; } -static void __damon_pa_check_access(struct damon_ctx *ctx, - struct damon_region *r) +static void __damon_pa_check_access(struct damon_region *r) { static unsigned long last_addr; static unsigned long last_page_sz = PAGE_SIZE; @@ -196,7 +195,7 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) { - __damon_pa_check_access(ctx, r); + __damon_pa_check_access(r); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); } } diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 3c7b9d6dca95..a8505ad47c60 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -302,9 +302,14 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, pte_t *pte; spinlock_t *ptl; - if (pmd_huge(*pmd)) { + if (pmd_trans_huge(*pmd)) { ptl = pmd_lock(walk->mm, pmd); - if (pmd_huge(*pmd)) { + if (!pmd_present(*pmd)) { + spin_unlock(ptl); + return 0; + } + + if (pmd_trans_huge(*pmd)) { damon_pmdp_mkold(pmd, walk->mm, addr); spin_unlock(ptl); return 0; @@ -429,9 +434,14 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, struct damon_young_walk_private *priv = walk->private; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_huge(*pmd)) { + if (pmd_trans_huge(*pmd)) { ptl = pmd_lock(walk->mm, pmd); - if (!pmd_huge(*pmd)) { + if (!pmd_present(*pmd)) { + spin_unlock(ptl); + return 0; + } + + if (!pmd_trans_huge(*pmd)) { spin_unlock(ptl); goto regular_page; } @@ -532,16 +542,15 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr, * mm 'mm_struct' for the given virtual address space * r the region to be checked */ -static void __damon_va_check_access(struct damon_ctx *ctx, - struct mm_struct *mm, struct damon_region *r) +static void __damon_va_check_access(struct mm_struct *mm, + struct damon_region *r, bool same_target) { - static struct mm_struct *last_mm; static unsigned long last_addr; static unsigned long last_page_sz = PAGE_SIZE; static bool last_accessed; /* If the region is in the last checked page, reuse the result */ - if (mm == last_mm && (ALIGN_DOWN(last_addr, last_page_sz) == + if (same_target && (ALIGN_DOWN(last_addr, last_page_sz) == ALIGN_DOWN(r->sampling_addr, last_page_sz))) { if (last_accessed) r->nr_accesses++; @@ -552,7 +561,6 @@ static void __damon_va_check_access(struct damon_ctx *ctx, if (last_accessed) r->nr_accesses++; - last_mm = mm; last_addr = r->sampling_addr; } @@ -562,14 +570,17 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) struct mm_struct *mm; struct damon_region *r; unsigned int max_nr_accesses = 0; + bool same_target; damon_for_each_target(t, ctx) { mm = damon_get_mm(t); if (!mm) continue; + same_target = false; damon_for_each_region(r, t) { - __damon_va_check_access(ctx, mm, r); + __damon_va_check_access(mm, r, same_target); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); + same_target = true; } mmput(mm); } diff --git a/mm/filemap.c b/mm/filemap.c index 15800334147b..8151890e9a00 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1633,24 +1633,26 @@ EXPORT_SYMBOL(folio_end_writeback); */ void page_endio(struct page *page, bool is_write, int err) { + struct folio *folio = page_folio(page); + if (!is_write) { if (!err) { - SetPageUptodate(page); + folio_mark_uptodate(folio); } else { - ClearPageUptodate(page); - SetPageError(page); + folio_clear_uptodate(folio); + folio_set_error(folio); } - unlock_page(page); + folio_unlock(folio); } else { if (err) { struct address_space *mapping; - SetPageError(page); - mapping = page_mapping(page); + folio_set_error(folio); + mapping = folio_mapping(folio); if (mapping) mapping_set_error(mapping, err); } - end_page_writeback(page); + folio_end_writeback(folio); } } EXPORT_SYMBOL_GPL(page_endio); @@ -2195,30 +2197,31 @@ bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max) } /** - * find_get_pages_contig - gang contiguous pagecache lookup + * filemap_get_folios_contig - Get a batch of contiguous folios * @mapping: The address_space to search - * @index: The starting page index - * @nr_pages: The maximum number of pages - * @pages: Where the resulting pages are placed + * @start: The starting page index + * @end: The final page index (inclusive) + * @fbatch: The batch to fill * - * find_get_pages_contig() works exactly like find_get_pages_range(), - * except that the returned number of pages are guaranteed to be - * contiguous. + * filemap_get_folios_contig() works exactly like filemap_get_folios(), + * except the returned folios are guaranteed to be contiguous. This may + * not return all contiguous folios if the batch gets filled up. * - * Return: the number of pages which were found. + * Return: The number of folios found. + * Also update @start to be positioned for traversal of the next folio. */ -unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, - unsigned int nr_pages, struct page **pages) + +unsigned filemap_get_folios_contig(struct address_space *mapping, + pgoff_t *start, pgoff_t end, struct folio_batch *fbatch) { - XA_STATE(xas, &mapping->i_pages, index); + XA_STATE(xas, &mapping->i_pages, *start); + unsigned long nr; struct folio *folio; - unsigned int ret = 0; - - if (unlikely(!nr_pages)) - return 0; rcu_read_lock(); - for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { + + for (folio = xas_load(&xas); folio && xas.xa_index <= end; + folio = xas_next(&xas)) { if (xas_retry(&xas, folio)) continue; /* @@ -2226,33 +2229,45 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, * No current caller is looking for DAX entries. */ if (xa_is_value(folio)) - break; + goto update_start; if (!folio_try_get_rcu(folio)) goto retry; if (unlikely(folio != xas_reload(&xas))) - goto put_page; + goto put_folio; -again: - pages[ret] = folio_file_page(folio, xas.xa_index); - if (++ret == nr_pages) - break; - if (folio_more_pages(folio, xas.xa_index, ULONG_MAX)) { - xas.xa_index++; - folio_ref_inc(folio); - goto again; + if (!folio_batch_add(fbatch, folio)) { + nr = folio_nr_pages(folio); + + if (folio_test_hugetlb(folio)) + nr = 1; + *start = folio->index + nr; + goto out; } continue; -put_page: +put_folio: folio_put(folio); + retry: xas_reset(&xas); } + +update_start: + nr = folio_batch_count(fbatch); + + if (nr) { + folio = fbatch->folios[nr - 1]; + if (folio_test_hugetlb(folio)) + *start = folio->index + 1; + else + *start = folio->index + folio_nr_pages(folio); + } +out: rcu_read_unlock(); - return ret; + return folio_batch_count(fbatch); } -EXPORT_SYMBOL(find_get_pages_contig); +EXPORT_SYMBOL(filemap_get_folios_contig); /** * find_get_pages_range_tag - Find and return head pages matching @tag. @@ -158,6 +158,13 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) else folio_ref_add(folio, refs * (GUP_PIN_COUNTING_BIAS - 1)); + /* + * Adjust the pincount before re-checking the PTE for changes. + * This is essentially a smp_mb() and is paired with a memory + * barrier in page_try_share_anon_rmap(). + */ + smp_mb__after_atomic(); + node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); return folio; @@ -1927,20 +1934,16 @@ struct page *get_dump_page(unsigned long addr) #ifdef CONFIG_MIGRATION /* - * Check whether all pages are pinnable, if so return number of pages. If some - * pages are not pinnable, migrate them, and unpin all pages. Return zero if - * pages were migrated, or if some pages were not successfully isolated. - * Return negative error if migration fails. + * Returns the number of collected pages. Return value is always >= 0. */ -static long check_and_migrate_movable_pages(unsigned long nr_pages, - struct page **pages, - unsigned int gup_flags) +static unsigned long collect_longterm_unpinnable_pages( + struct list_head *movable_page_list, + unsigned long nr_pages, + struct page **pages) { - unsigned long isolation_error_count = 0, i; + unsigned long i, collected = 0; struct folio *prev_folio = NULL; - LIST_HEAD(movable_page_list); - bool drain_allow = true, coherent_pages = false; - int ret = 0; + bool drain_allow = true; for (i = 0; i < nr_pages; i++) { struct folio *folio = page_folio(pages[i]); @@ -1949,45 +1952,16 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, continue; prev_folio = folio; - /* - * Device coherent pages are managed by a driver and should not - * be pinned indefinitely as it prevents the driver moving the - * page. So when trying to pin with FOLL_LONGTERM instead try - * to migrate the page out of device memory. - */ - if (folio_is_device_coherent(folio)) { - /* - * We always want a new GUP lookup with device coherent - * pages. - */ - pages[i] = 0; - coherent_pages = true; - - /* - * Migration will fail if the page is pinned, so convert - * the pin on the source page to a normal reference. - */ - if (gup_flags & FOLL_PIN) { - get_page(&folio->page); - unpin_user_page(&folio->page); - } + if (folio_is_longterm_pinnable(folio)) + continue; - ret = migrate_device_coherent_page(&folio->page); - if (ret) - goto unpin_pages; + collected++; + if (folio_is_device_coherent(folio)) continue; - } - if (folio_is_longterm_pinnable(folio)) - continue; - /* - * Try to move out any movable page before pinning the range. - */ if (folio_test_hugetlb(folio)) { - if (isolate_hugetlb(&folio->page, - &movable_page_list)) - isolation_error_count++; + isolate_hugetlb(&folio->page, movable_page_list); continue; } @@ -1996,63 +1970,124 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, drain_allow = false; } - if (folio_isolate_lru(folio)) { - isolation_error_count++; + if (!folio_isolate_lru(folio)) continue; - } - list_add_tail(&folio->lru, &movable_page_list); + + list_add_tail(&folio->lru, movable_page_list); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } - if (!list_empty(&movable_page_list) || isolation_error_count || - coherent_pages) - goto unpin_pages; + return collected; +} - /* - * If list is empty, and no isolation errors, means that all pages are - * in the correct zone. - */ - return nr_pages; +/* + * Unpins all pages and migrates device coherent pages and movable_page_list. + * Returns -EAGAIN if all pages were successfully migrated or -errno for failure + * (or partial success). + */ +static int migrate_longterm_unpinnable_pages( + struct list_head *movable_page_list, + unsigned long nr_pages, + struct page **pages) +{ + int ret; + unsigned long i; -unpin_pages: - /* - * pages[i] might be NULL if any device coherent pages were found. - */ for (i = 0; i < nr_pages; i++) { - if (!pages[i]) + struct folio *folio = page_folio(pages[i]); + + if (folio_is_device_coherent(folio)) { + /* + * Migration will fail if the page is pinned, so convert + * the pin on the source page to a normal reference. + */ + pages[i] = NULL; + folio_get(folio); + gup_put_folio(folio, 1, FOLL_PIN); + + if (migrate_device_coherent_page(&folio->page)) { + ret = -EBUSY; + goto err; + } + continue; + } - if (gup_flags & FOLL_PIN) - unpin_user_page(pages[i]); - else - put_page(pages[i]); + /* + * We can't migrate pages with unexpected references, so drop + * the reference obtained by __get_user_pages_locked(). + * Migrating pages have been added to movable_page_list after + * calling folio_isolate_lru() which takes a reference so the + * page won't be freed if it's migrating. + */ + unpin_user_page(pages[i]); + pages[i] = NULL; } - if (!list_empty(&movable_page_list)) { + if (!list_empty(movable_page_list)) { struct migration_target_control mtc = { .nid = NUMA_NO_NODE, .gfp_mask = GFP_USER | __GFP_NOWARN, }; - ret = migrate_pages(&movable_page_list, alloc_migration_target, - NULL, (unsigned long)&mtc, MIGRATE_SYNC, - MR_LONGTERM_PIN, NULL); - if (ret > 0) /* number of pages not migrated */ + if (migrate_pages(movable_page_list, alloc_migration_target, + NULL, (unsigned long)&mtc, MIGRATE_SYNC, + MR_LONGTERM_PIN, NULL)) { ret = -ENOMEM; + goto err; + } } - if (ret && !list_empty(&movable_page_list)) - putback_movable_pages(&movable_page_list); + putback_movable_pages(movable_page_list); + + return -EAGAIN; + +err: + for (i = 0; i < nr_pages; i++) + if (pages[i]) + unpin_user_page(pages[i]); + putback_movable_pages(movable_page_list); + return ret; } + +/* + * Check whether all pages are *allowed* to be pinned. Rather confusingly, all + * pages in the range are required to be pinned via FOLL_PIN, before calling + * this routine. + * + * If any pages in the range are not allowed to be pinned, then this routine + * will migrate those pages away, unpin all the pages in the range and return + * -EAGAIN. The caller should re-pin the entire range with FOLL_PIN and then + * call this routine again. + * + * If an error other than -EAGAIN occurs, this indicates a migration failure. + * The caller should give up, and propagate the error back up the call stack. + * + * If everything is OK and all pages in the range are allowed to be pinned, then + * this routine leaves all pages pinned and returns zero for success. + */ +static long check_and_migrate_movable_pages(unsigned long nr_pages, + struct page **pages) +{ + unsigned long collected; + LIST_HEAD(movable_page_list); + + collected = collect_longterm_unpinnable_pages(&movable_page_list, + nr_pages, pages); + if (!collected) + return 0; + + return migrate_longterm_unpinnable_pages(&movable_page_list, nr_pages, + pages); +} #else static long check_and_migrate_movable_pages(unsigned long nr_pages, - struct page **pages, - unsigned int gup_flags) + struct page **pages) { - return nr_pages; + return 0; } #endif /* CONFIG_MIGRATION */ @@ -2068,22 +2103,36 @@ static long __gup_longterm_locked(struct mm_struct *mm, unsigned int gup_flags) { unsigned int flags; - long rc; + long rc, nr_pinned_pages; if (!(gup_flags & FOLL_LONGTERM)) return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL, gup_flags); + + /* + * If we get to this point then FOLL_LONGTERM is set, and FOLL_LONGTERM + * implies FOLL_PIN (although the reverse is not true). Therefore it is + * correct to unconditionally call check_and_migrate_movable_pages() + * which assumes pages have been pinned via FOLL_PIN. + * + * Enforce the above reasoning by asserting that FOLL_PIN is set. + */ + if (WARN_ON(!(gup_flags & FOLL_PIN))) + return -EINVAL; flags = memalloc_pin_save(); do { - rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, - NULL, gup_flags); - if (rc <= 0) + nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages, + pages, vmas, NULL, + gup_flags); + if (nr_pinned_pages <= 0) { + rc = nr_pinned_pages; break; - rc = check_and_migrate_movable_pages(rc, pages, gup_flags); - } while (!rc); + } + rc = check_and_migrate_movable_pages(nr_pinned_pages, pages); + } while (rc == -EAGAIN); memalloc_pin_restore(flags); - return rc; + return rc ? rc : nr_pinned_pages; } static bool is_valid_gup_flags(unsigned int gup_flags) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f42bb51e023a..7bf2299cb24b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -70,9 +70,8 @@ static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; unsigned long huge_zero_pfn __read_mostly = ~0UL; -bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags, - bool smaps, bool in_pf) +bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, + bool smaps, bool in_pf, bool enforce_sysfs) { if (!vma->vm_mm) /* vdso */ return false; @@ -121,11 +120,10 @@ bool hugepage_vma_check(struct vm_area_struct *vma, if (!in_pf && shmem_file(vma->vm_file)) return shmem_huge_enabled(vma); - if (!hugepage_flags_enabled()) - return false; - - /* THP settings require madvise. */ - if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always()) + /* Enforce sysfs THP requirements as necessary */ + if (enforce_sysfs && + (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) && + !hugepage_flags_always()))) return false; /* Only regular file is valid */ @@ -772,8 +770,7 @@ static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, return; entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); - if (pgtable) - pgtable_trans_huge_deposit(mm, pmd, pgtable); + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); mm_inc_nr_ptes(mm); } @@ -1479,7 +1476,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; int page_nid = NUMA_NO_NODE; - int target_nid, last_cpupid = -1; + int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK); bool migrated = false; bool was_writable = pmd_savedwrite(oldpmd); int flags = 0; @@ -1500,7 +1497,12 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) flags |= TNF_NO_GROUP; page_nid = page_to_nid(page); - last_cpupid = page_cpupid_last(page); + /* + * For memory tiering mode, cpupid of slow memory page is used + * to record page access time. So use default value. + */ + if (node_is_toptier(page_nid)) + last_cpupid = page_cpupid_last(page); target_nid = numa_migrate_prep(page, vma, haddr, page_nid, &flags); @@ -1824,6 +1826,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (prot_numa) { struct page *page; + bool toptier; /* * Avoid trapping faults against the zero page. The read-only * data is likely to be read-cached on the local CPU and @@ -1836,13 +1839,18 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, goto unlock; page = pmd_page(*pmd); + toptier = node_is_toptier(page_to_nid(page)); /* * Skip scanning top tier node if normal numa * balancing is disabled */ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && - node_is_toptier(page_to_nid(page))) + toptier) goto unlock; + + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && + !toptier) + xchg_page_access_time(page, jiffies_to_msecs(jiffies)); } /* * In case prot_numa, we are under mmap_read_lock(mm). It's critical @@ -2140,6 +2148,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * * In case we cannot clear PageAnonExclusive(), split the PMD * only and let try_to_migrate_one() fail later. + * + * See page_try_share_anon_rmap(): invalidate PMD first. */ anon_exclusive = PageAnon(page) && PageAnonExclusive(page); if (freeze && anon_exclusive && page_try_share_anon_rmap(page)) @@ -2288,25 +2298,11 @@ out: void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct folio *folio) { - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - - pgd = pgd_offset(vma->vm_mm, address); - if (!pgd_present(*pgd)) - return; + pmd_t *pmd = mm_find_pmd(vma->vm_mm, address); - p4d = p4d_offset(pgd, address); - if (!p4d_present(*p4d)) + if (!pmd) return; - pud = pud_offset(p4d, address); - if (!pud_present(*pud)) - return; - - pmd = pmd_offset(pud, address); - __split_huge_pmd(vma, pmd, address, freeze, folio); } @@ -2649,6 +2645,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) mapping = NULL; anon_vma_lock_write(anon_vma); } else { + gfp_t gfp; + mapping = head->mapping; /* Truncated ? */ @@ -2657,8 +2655,16 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out; } - xas_split_alloc(&xas, head, compound_order(head), - mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK); + gfp = current_gfp_context(mapping_gfp_mask(mapping) & + GFP_RECLAIM_MASK); + + if (folio_test_private(folio) && + !filemap_release_folio(folio, gfp)) { + ret = -EBUSY; + goto out; + } + + xas_split_alloc(&xas, head, compound_order(head), gfp); if (xas_error(&xas)) { ret = xas_error(&xas); goto out; @@ -3175,6 +3181,7 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); pmdval = pmdp_invalidate(vma, address, pvmw->pmd); + /* See page_try_share_anon_rmap(): invalidate PMD first. */ anon_exclusive = PageAnon(page) && PageAnonExclusive(page); if (anon_exclusive && page_try_share_anon_rmap(page)) { set_pmd_at(mm, address, pvmw->pmd, pmdval); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0bdfc7e1c933..2ca4e8c3163e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -456,14 +456,12 @@ static int allocate_file_region_entries(struct resv_map *resv, int regions_needed) __must_hold(&resv->lock) { - struct list_head allocated_regions; + LIST_HEAD(allocated_regions); int to_allocate = 0, i = 0; struct file_region *trg = NULL, *rg = NULL; VM_BUG_ON(regions_needed < 0); - INIT_LIST_HEAD(&allocated_regions); - /* * Check for sufficient descriptors in the cache to accommodate * the number of in progress add operations plus regions_needed. @@ -1506,6 +1504,10 @@ static void add_hugetlb_page(struct hstate *h, struct page *page, set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); set_page_private(page, 0); + /* + * We have to set HPageVmemmapOptimized again as above + * set_page_private(page, 0) cleared it. + */ SetHPageVmemmapOptimized(page); /* @@ -2336,7 +2338,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, static int gather_surplus_pages(struct hstate *h, long delta) __must_hold(&hugetlb_lock) { - struct list_head surplus_list; + LIST_HEAD(surplus_list); struct page *page, *tmp; int ret; long i; @@ -2351,7 +2353,6 @@ static int gather_surplus_pages(struct hstate *h, long delta) } allocated = 0; - INIT_LIST_HEAD(&surplus_list); ret = -ENOMEM; retry: @@ -3474,7 +3475,8 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) * based on pool changes for the demoted page. */ h->max_huge_pages--; - target_hstate->max_huge_pages += pages_per_huge_page(h); + target_hstate->max_huge_pages += + pages_per_huge_page(h) / pages_per_huge_page(target_hstate); return rc; } @@ -3767,8 +3769,7 @@ HSTATE_ATTR_WO(demote); static ssize_t demote_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - int nid; - struct hstate *h = kobj_to_hstate(kobj, &nid); + struct hstate *h = kobj_to_hstate(kobj, NULL); unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K; return sysfs_emit(buf, "%lukB\n", demote_size); @@ -3781,7 +3782,6 @@ static ssize_t demote_size_store(struct kobject *kobj, struct hstate *h, *demote_hstate; unsigned long demote_size; unsigned int demote_order; - int nid; demote_size = (unsigned long)memparse(buf, NULL); @@ -3793,7 +3793,7 @@ static ssize_t demote_size_store(struct kobject *kobj, return -EINVAL; /* demote order must be smaller than hstate order */ - h = kobj_to_hstate(kobj, &nid); + h = kobj_to_hstate(kobj, NULL); if (demote_order >= h->order) return -EINVAL; @@ -3847,15 +3847,22 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, if (retval) { kobject_put(hstate_kobjs[hi]); hstate_kobjs[hi] = NULL; + return retval; } if (h->demote_order) { - if (sysfs_create_group(hstate_kobjs[hi], - &hstate_demote_attr_group)) + retval = sysfs_create_group(hstate_kobjs[hi], + &hstate_demote_attr_group); + if (retval) { pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name); + sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group); + kobject_put(hstate_kobjs[hi]); + hstate_kobjs[hi] = NULL; + return retval; + } } - return retval; + return 0; } static void __init hugetlb_sysfs_init(void) @@ -3941,10 +3948,15 @@ static void hugetlb_unregister_node(struct node *node) for_each_hstate(h) { int idx = hstate_index(h); - if (nhs->hstate_kobjs[idx]) { - kobject_put(nhs->hstate_kobjs[idx]); - nhs->hstate_kobjs[idx] = NULL; - } + struct kobject *hstate_kobj = nhs->hstate_kobjs[idx]; + + if (!hstate_kobj) + continue; + if (h->demote_order) + sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group); + sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group); + kobject_put(hstate_kobj); + nhs->hstate_kobjs[idx] = NULL; } kobject_put(nhs->hugepages_kobj); @@ -4019,6 +4031,14 @@ static void hugetlb_register_all_nodes(void) { } #endif +#ifdef CONFIG_CMA +static void __init hugetlb_cma_check(void); +#else +static inline __init void hugetlb_cma_check(void) +{ +} +#endif + static int __init hugetlb_init(void) { int i; @@ -4118,7 +4138,7 @@ void __init hugetlb_add_hstate(unsigned int order) h->next_nid_to_alloc = first_memory_node; h->next_nid_to_free = first_memory_node; snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", - huge_page_size(h)/1024); + huge_page_size(h)/SZ_1K); parsed_hstate = h; } @@ -4133,11 +4153,11 @@ static void __init hugepages_clear_pages_in_node(void) if (!hugetlb_max_hstate) { default_hstate_max_huge_pages = 0; memset(default_hugepages_in_node, 0, - MAX_NUMNODES * sizeof(unsigned int)); + sizeof(default_hugepages_in_node)); } else { parsed_hstate->max_huge_pages = 0; memset(parsed_hstate->max_huge_pages_node, 0, - MAX_NUMNODES * sizeof(unsigned int)); + sizeof(parsed_hstate->max_huge_pages_node)); } } @@ -4332,18 +4352,34 @@ static int __init default_hugepagesz_setup(char *s) } __setup("default_hugepagesz=", default_hugepagesz_setup); +static nodemask_t *policy_mbind_nodemask(gfp_t gfp) +{ +#ifdef CONFIG_NUMA + struct mempolicy *mpol = get_task_policy(current); + + /* + * Only enforce MPOL_BIND policy which overlaps with cpuset policy + * (from policy_nodemask) specifically for hugetlb case + */ + if (mpol->mode == MPOL_BIND && + (apply_policy_zone(mpol, gfp_zone(gfp)) && + cpuset_nodemask_valid_mems_allowed(&mpol->nodes))) + return &mpol->nodes; +#endif + return NULL; +} + static unsigned int allowed_mems_nr(struct hstate *h) { int node; unsigned int nr = 0; - nodemask_t *mpol_allowed; + nodemask_t *mbind_nodemask; unsigned int *array = h->free_huge_pages_node; gfp_t gfp_mask = htlb_alloc_mask(h); - mpol_allowed = policy_nodemask_current(gfp_mask); - + mbind_nodemask = policy_mbind_nodemask(gfp_mask); for_each_node_mask(node, cpuset_current_mems_allowed) { - if (!mpol_allowed || node_isset(node, *mpol_allowed)) + if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) nr += array[node]; } @@ -4723,7 +4759,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { - pte_t *src_pte, *dst_pte, entry, dst_entry; + pte_t *src_pte, *dst_pte, entry; struct page *ptepage; unsigned long addr; bool cow = is_cow_mapping(src_vma->vm_flags); @@ -4768,15 +4804,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, /* * If the pagetables are shared don't copy or take references. - * dst_pte == src_pte is the common case of src/dest sharing. * + * dst_pte == src_pte is the common case of src/dest sharing. * However, src could have 'unshared' and dst shares with - * another vma. If dst_pte !none, this implies sharing. - * Check here before taking page table lock, and once again - * after taking the lock below. + * another vma. So page_count of ptep page is checked instead + * to reliably determine whether pte is shared. */ - dst_entry = huge_ptep_get(dst_pte); - if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) { + if (page_count(virt_to_page(dst_pte)) > 1) { addr |= last_addr_mask; continue; } @@ -4785,13 +4819,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); - dst_entry = huge_ptep_get(dst_pte); again: - if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) { + if (huge_pte_none(entry)) { /* - * Skip if src entry none. Also, skip in the - * unlikely case dst entry !none as this implies - * sharing with another vma. + * Skip if src entry none. */ ; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) { @@ -4870,7 +4901,7 @@ again: restore_reserve_on_error(h, dst_vma, addr, new); put_page(new); - /* dst_entry won't change as in child */ + /* huge_ptep of dst_pte won't change as in child */ goto again; } hugetlb_install_page(dst_vma, dst_pte, addr, new); @@ -5316,7 +5347,6 @@ retry_avoidcopy: u32 hash; put_page(old_page); - BUG_ON(huge_pte_none(pte)); /* * Drop hugetlb_fault_mutex and i_mmap_rwsem before * unmapping. unmapping needs to hold i_mmap_rwsem @@ -5408,19 +5438,6 @@ out_release_old: return ret; } -/* Return the pagecache page at a given address within a VMA */ -static struct page *hugetlbfs_pagecache_page(struct hstate *h, - struct vm_area_struct *vma, unsigned long address) -{ - struct address_space *mapping; - pgoff_t idx; - - mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, address); - - return find_lock_page(mapping, idx); -} - /* * Return whether there is a pagecache page to back given address within VMA. * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. @@ -5547,7 +5564,6 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (idx >= size) goto out; -retry: new_page = false; page = find_lock_page(mapping, idx); if (!page) { @@ -5587,9 +5603,15 @@ retry: if (vma->vm_flags & VM_MAYSHARE) { int err = huge_add_to_page_cache(page, mapping, idx); if (err) { + /* + * err can't be -EEXIST which implies someone + * else consumed the reservation since hugetlb + * fault mutex is held when add a hugetlb page + * to the page cache. So it's safe to call + * restore_reserve_on_error() here. + */ + restore_reserve_on_error(h, vma, haddr, page); put_page(page); - if (err == -EEXIST) - goto retry; goto out; } new_pagecache_page = true; @@ -5810,7 +5832,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, haddr); - pagecache_page = hugetlbfs_pagecache_page(h, vma, haddr); + pagecache_page = find_lock_page(mapping, idx); } ptl = huge_pte_lock(h, mm, ptep); @@ -6017,8 +6039,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, page_in_pagecache = true; } - ptl = huge_pte_lockptr(h, dst_mm, dst_pte); - spin_lock(ptl); + ptl = huge_pte_lock(h, dst_mm, dst_pte); /* * Recheck the i_size after holding PT lock to make sure not @@ -7334,7 +7355,7 @@ void __init hugetlb_cma_reserve(int order) hugetlb_cma_size = 0; } -void __init hugetlb_cma_check(void) +static void __init hugetlb_cma_check(void) { if (!hugetlb_cma_size || cma_reserve_called) return; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index c86691c431fd..f61d132df52b 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -75,11 +75,11 @@ parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) { - int idx; + struct hstate *h; - for (idx = 0; idx < hugetlb_max_hstate; idx++) { + for_each_hstate(h) { if (page_counter_read( - hugetlb_cgroup_counter_from_cgroup(h_cg, idx))) + hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h)))) return true; } return false; @@ -154,9 +154,9 @@ hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) * function. */ for_each_node(node) { - /* Set node_to_alloc to -1 for offline nodes. */ + /* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */ int node_to_alloc = - node_state(node, N_NORMAL_MEMORY) ? node : -1; + node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE; h_cgroup->nodeinfo[node] = kzalloc_node(sizeof(struct hugetlb_cgroup_per_node), GFP_KERNEL, node_to_alloc); @@ -225,17 +225,14 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); struct hstate *h; struct page *page; - int idx; do { - idx = 0; for_each_hstate(h) { spin_lock_irq(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_activelist, lru) - hugetlb_cgroup_move_parent(idx, h_cg, page); + hugetlb_cgroup_move_parent(hstate_index(h), h_cg, page); spin_unlock_irq(&hugetlb_lock); - idx++; } cond_resched(); } while (hugetlb_cgroup_have_usage(h_cg)); @@ -442,7 +439,7 @@ void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages) return; - if (rg->reservation_counter && resv->pages_per_hpage && nr_pages > 0 && + if (rg->reservation_counter && resv->pages_per_hpage && !resv->reservation_counter) { page_counter_uncharge(rg->reservation_counter, nr_pages * resv->pages_per_hpage); @@ -675,12 +672,12 @@ static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, static char *mem_fmt(char *buf, int size, unsigned long hsize) { - if (hsize >= (1UL << 30)) - snprintf(buf, size, "%luGB", hsize >> 30); - else if (hsize >= (1UL << 20)) - snprintf(buf, size, "%luMB", hsize >> 20); + if (hsize >= SZ_1G) + snprintf(buf, size, "%luGB", hsize / SZ_1G); + else if (hsize >= SZ_1M) + snprintf(buf, size, "%luMB", hsize / SZ_1M); else - snprintf(buf, size, "%luKB", hsize >> 10); + snprintf(buf, size, "%luKB", hsize / SZ_1K); return buf; } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 20f414c0379f..ba2a2596fb4e 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -265,11 +265,10 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, static inline void reset_struct_pages(struct page *start) { - int i; struct page *from = start + NR_RESET_STRUCT_PAGE; - for (i = 0; i < NR_RESET_STRUCT_PAGE; i++) - memcpy(start + i, from, sizeof(*from)); + BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page)); + memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE); } static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, @@ -287,6 +286,11 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, copy_page(to, (void *)walk->reuse_addr); reset_struct_pages(to); + /* + * Makes sure that preceding stores to the page contents become visible + * before the set_pte_at() write. + */ + smp_wmb(); set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); } diff --git a/mm/internal.h b/mm/internal.h index 785409805ed7..55ce10e4d0c0 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -187,7 +187,7 @@ extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason /* * in mm/rmap.c: */ -extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); +pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); /* * in mm/page_alloc.c diff --git a/mm/kfence/core.c b/mm/kfence/core.c index c252081b11df..8c08ae2101d7 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -1003,6 +1003,13 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) return NULL; } + /* + * Skip allocations for this slab, if KFENCE has been disabled for + * this slab. + */ + if (s->flags & SLAB_SKIP_KFENCE) + return NULL; + if (atomic_inc_return(&kfence_allocation_gate) > 1) return NULL; #ifdef CONFIG_KFENCE_STATIC_KEYS diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 70b7ac66411c..0bcba493ebb4 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -28,6 +28,7 @@ enum scan_result { SCAN_FAIL, SCAN_SUCCEED, SCAN_PMD_NULL, + SCAN_PMD_MAPPED, SCAN_EXCEED_NONE_PTE, SCAN_EXCEED_SWAP_PTE, SCAN_EXCEED_SHARED_PTE, @@ -73,6 +74,8 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); * default collapse hugepages if there is at least one pte mapped like * it would have happened if the vma was large enough during page * fault. + * + * Note that these are only respected if collapse was initiated by khugepaged. */ static unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; @@ -85,6 +88,16 @@ static struct kmem_cache *mm_slot_cache __read_mostly; #define MAX_PTE_MAPPED_THP 8 +struct collapse_control { + bool is_khugepaged; + + /* Num pages scanned per node */ + u32 node_load[MAX_NUMNODES]; + + /* Last target selected in hpage_collapse_find_target_node() */ + int last_target_node; +}; + /** * struct mm_slot - hash lookup from mm to mm_slot * @hash: hash collision list @@ -425,7 +438,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm, hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); } -static inline int khugepaged_test_exit(struct mm_struct *mm) +static inline int hpage_collapse_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; } @@ -440,7 +453,7 @@ void __khugepaged_enter(struct mm_struct *mm) return; /* __khugepaged_exit() must not run from under us */ - VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); + VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { free_mm_slot(mm_slot); return; @@ -466,7 +479,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && hugepage_flags_enabled()) { - if (hugepage_vma_check(vma, vm_flags, false, false)) + if (hugepage_vma_check(vma, vm_flags, false, false, true)) __khugepaged_enter(vma->vm_mm); } } @@ -492,11 +505,10 @@ void __khugepaged_exit(struct mm_struct *mm) } else if (mm_slot) { /* * This is required to serialize against - * khugepaged_test_exit() (which is guaranteed to run - * under mmap sem read mode). Stop here (after we - * return all pagetables will be destroyed) until - * khugepaged has finished working on the pagetables - * under the mmap_lock. + * hpage_collapse_test_exit() (which is guaranteed to run + * under mmap sem read mode). Stop here (after we return all + * pagetables will be destroyed) until khugepaged has finished + * working on the pagetables under the mmap_lock. */ mmap_write_lock(mm); mmap_write_unlock(mm); @@ -546,11 +558,12 @@ static bool is_refcount_suitable(struct page *page) static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, pte_t *pte, + struct collapse_control *cc, struct list_head *compound_pagelist) { struct page *page = NULL; pte_t *_pte; - int none_or_zero = 0, shared = 0, result = 0, referenced = 0; + int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; bool writable = false; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; @@ -558,8 +571,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, pte_t pteval = *_pte; if (pte_none(pteval) || (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))) { + ++none_or_zero; if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { + (!cc->is_khugepaged || + none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -579,11 +594,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageAnon(page), page); - if (page_mapcount(page) > 1 && - ++shared > khugepaged_max_ptes_shared) { - result = SCAN_EXCEED_SHARED_PTE; - count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); - goto out; + if (page_mapcount(page) > 1) { + ++shared; + if (cc->is_khugepaged && + shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); + goto out; + } } if (PageCompound(page)) { @@ -646,10 +664,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (PageCompound(page)) list_add_tail(&page->lru, compound_pagelist); next: - /* There should be enough young pte to collapse the page */ - if (pte_young(pteval) || - page_is_young(page) || PageReferenced(page) || - mmu_notifier_test_young(vma->vm_mm, address)) + /* + * If collapse was initiated by khugepaged, check that there is + * enough young pte to justify collapsing the page + */ + if (cc->is_khugepaged && + (pte_young(pteval) || page_is_young(page) || + PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, + address))) referenced++; if (pte_write(pteval)) @@ -658,19 +680,19 @@ next: if (unlikely(!writable)) { result = SCAN_PAGE_RO; - } else if (unlikely(!referenced)) { + } else if (unlikely(cc->is_khugepaged && !referenced)) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; trace_mm_collapse_huge_page_isolate(page, none_or_zero, referenced, writable, result); - return 1; + return result; } out: release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(page, none_or_zero, referenced, writable, result); - return 0; + return result; } static void __collapse_huge_page_copy(pte_t *pte, struct page *page, @@ -735,9 +757,12 @@ static void khugepaged_alloc_sleep(void) remove_wait_queue(&khugepaged_wait, &wait); } -static int khugepaged_node_load[MAX_NUMNODES]; +struct collapse_control khugepaged_collapse_control = { + .is_khugepaged = true, + .last_target_node = NUMA_NO_NODE, +}; -static bool khugepaged_scan_abort(int nid) +static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc) { int i; @@ -749,11 +774,11 @@ static bool khugepaged_scan_abort(int nid) return false; /* If there is a count for this node already, it must be acceptable */ - if (khugepaged_node_load[nid]) + if (cc->node_load[nid]) return false; for (i = 0; i < MAX_NUMNODES; i++) { - if (!khugepaged_node_load[i]) + if (!cc->node_load[i]) continue; if (node_distance(nid, i) > node_reclaim_distance) return true; @@ -772,146 +797,62 @@ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) } #ifdef CONFIG_NUMA -static int khugepaged_find_target_node(void) +static int hpage_collapse_find_target_node(struct collapse_control *cc) { - static int last_khugepaged_target_node = NUMA_NO_NODE; int nid, target_node = 0, max_value = 0; /* find first node with max normal pages hit */ for (nid = 0; nid < MAX_NUMNODES; nid++) - if (khugepaged_node_load[nid] > max_value) { - max_value = khugepaged_node_load[nid]; + if (cc->node_load[nid] > max_value) { + max_value = cc->node_load[nid]; target_node = nid; } /* do some balance if several nodes have the same hit record */ - if (target_node <= last_khugepaged_target_node) - for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; - nid++) - if (max_value == khugepaged_node_load[nid]) { + if (target_node <= cc->last_target_node) + for (nid = cc->last_target_node + 1; nid < MAX_NUMNODES; + nid++) + if (max_value == cc->node_load[nid]) { target_node = nid; break; } - last_khugepaged_target_node = target_node; + cc->last_target_node = target_node; return target_node; } - -static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +#else +static int hpage_collapse_find_target_node(struct collapse_control *cc) { - if (IS_ERR(*hpage)) { - if (!*wait) - return false; - - *wait = false; - *hpage = NULL; - khugepaged_alloc_sleep(); - } else if (*hpage) { - put_page(*hpage); - *hpage = NULL; - } - - return true; + return 0; } +#endif -static struct page * -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) +static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node) { - VM_BUG_ON_PAGE(*hpage, *hpage); - *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - *hpage = ERR_PTR(-ENOMEM); - return NULL; + return false; } prep_transhuge_page(*hpage); count_vm_event(THP_COLLAPSE_ALLOC); - return *hpage; -} -#else -static int khugepaged_find_target_node(void) -{ - return 0; -} - -static inline struct page *alloc_khugepaged_hugepage(void) -{ - struct page *page; - - page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), - HPAGE_PMD_ORDER); - if (page) - prep_transhuge_page(page); - return page; -} - -static struct page *khugepaged_alloc_hugepage(bool *wait) -{ - struct page *hpage; - - do { - hpage = alloc_khugepaged_hugepage(); - if (!hpage) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - if (!*wait) - return NULL; - - *wait = false; - khugepaged_alloc_sleep(); - } else - count_vm_event(THP_COLLAPSE_ALLOC); - } while (unlikely(!hpage) && likely(hugepage_flags_enabled())); - - return hpage; -} - -static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) -{ - /* - * If the hpage allocated earlier was briefly exposed in page cache - * before collapse_file() failed, it is possible that racing lookups - * have not yet completed, and would then be unpleasantly surprised by - * finding the hpage reused for the same mapping at a different offset. - * Just release the previous allocation if there is any danger of that. - */ - if (*hpage && page_count(*hpage) > 1) { - put_page(*hpage); - *hpage = NULL; - } - - if (!*hpage) - *hpage = khugepaged_alloc_hugepage(wait); - - if (unlikely(!*hpage)) - return false; - return true; } -static struct page * -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) -{ - VM_BUG_ON(!*hpage); - - return *hpage; -} -#endif - /* * If mmap_lock temporarily dropped, revalidate vma * before taking mmap_lock. - * Return 0 if succeeds, otherwise return none-zero - * value (scan code). + * Returns enum scan_result value. */ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, - struct vm_area_struct **vmap) + struct vm_area_struct **vmap, + struct collapse_control *cc) { struct vm_area_struct *vma; - if (unlikely(khugepaged_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit(mm))) return SCAN_ANY_PROCESS; *vmap = vma = find_vma(mm, address); @@ -920,7 +861,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!transhuge_vma_suitable(vma, address)) return SCAN_ADDRESS_RANGE; - if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, + cc->is_khugepaged)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -931,21 +873,60 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, */ if (!vma->anon_vma || !vma_is_anonymous(vma)) return SCAN_VMA_CHECK; - return 0; + return SCAN_SUCCEED; +} + +static int find_pmd_or_thp_or_none(struct mm_struct *mm, + unsigned long address, + pmd_t **pmd) +{ + pmd_t pmde; + + *pmd = mm_find_pmd(mm, address); + if (!*pmd) + return SCAN_PMD_NULL; + + pmde = pmd_read_atomic(*pmd); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* See comments in pmd_none_or_trans_huge_or_clear_bad() */ + barrier(); +#endif + if (!pmd_present(pmde)) + return SCAN_PMD_NULL; + if (pmd_trans_huge(pmde)) + return SCAN_PMD_MAPPED; + if (pmd_bad(pmde)) + return SCAN_PMD_NULL; + return SCAN_SUCCEED; +} + +static int check_pmd_still_valid(struct mm_struct *mm, + unsigned long address, + pmd_t *pmd) +{ + pmd_t *new_pmd; + int result = find_pmd_or_thp_or_none(mm, address, &new_pmd); + + if (result != SCAN_SUCCEED) + return result; + if (new_pmd != pmd) + return SCAN_FAIL; + return SCAN_SUCCEED; } /* * Bring missing pages in from swap, to complete THP collapse. - * Only done if khugepaged_scan_pmd believes it is worthwhile. + * Only done if hpage_collapse_scan_pmd believes it is worthwhile. * * Called and returns without pte mapped or spinlocks held. * Note that if false is returned, mmap_lock will be released. */ -static bool __collapse_huge_page_swapin(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long haddr, pmd_t *pmd, - int referenced) +static int __collapse_huge_page_swapin(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long haddr, pmd_t *pmd, + int referenced) { int swapped_in = 0; vm_fault_t ret = 0; @@ -976,12 +957,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, */ if (ret & VM_FAULT_RETRY) { trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; + /* Likely, but not guaranteed, that page lock failed */ + return SCAN_PAGE_LOCK; } if (ret & VM_FAULT_ERROR) { mmap_read_unlock(mm); trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; + return SCAN_FAIL; } swapped_in++; } @@ -991,30 +973,41 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, lru_add_drain(); trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); - return true; + return SCAN_SUCCEED; } -static void collapse_huge_page(struct mm_struct *mm, - unsigned long address, - struct page **hpage, - int node, int referenced, int unmapped) +static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, + struct collapse_control *cc) +{ + /* Only allocate from the target node */ + gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() : + GFP_TRANSHUGE) | __GFP_THISNODE; + int node = hpage_collapse_find_target_node(cc); + + if (!hpage_collapse_alloc_page(hpage, gfp, node)) + return SCAN_ALLOC_HUGE_PAGE_FAIL; + if (unlikely(mem_cgroup_charge(page_folio(*hpage), mm, gfp))) + return SCAN_CGROUP_CHARGE_FAIL; + count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC); + return SCAN_SUCCEED; +} + +static int collapse_huge_page(struct mm_struct *mm, unsigned long address, + int referenced, int unmapped, + struct collapse_control *cc) { LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; pte_t *pte; pgtable_t pgtable; - struct page *new_page; + struct page *hpage; spinlock_t *pmd_ptl, *pte_ptl; - int isolated = 0, result = 0; + int result = SCAN_FAIL; struct vm_area_struct *vma; struct mmu_notifier_range range; - gfp_t gfp; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; - /* * Before allocating the hugepage, release the mmap_lock read lock. * The allocation can take potentially a long time if it involves @@ -1022,40 +1015,34 @@ static void collapse_huge_page(struct mm_struct *mm, * that. We will recheck the vma after taking it again in write mode. */ mmap_read_unlock(mm); - new_page = khugepaged_alloc_page(hpage, gfp, node); - if (!new_page) { - result = SCAN_ALLOC_HUGE_PAGE_FAIL; - goto out_nolock; - } - if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) { - result = SCAN_CGROUP_CHARGE_FAIL; + result = alloc_charge_hpage(&hpage, mm, cc); + if (result != SCAN_SUCCEED) goto out_nolock; - } - count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); mmap_read_lock(mm); - result = hugepage_vma_revalidate(mm, address, &vma); - if (result) { + result = hugepage_vma_revalidate(mm, address, &vma, cc); + if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; } - pmd = mm_find_pmd(mm, address); - if (!pmd) { - result = SCAN_PMD_NULL; + result = find_pmd_or_thp_or_none(mm, address, &pmd); + if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; } - /* - * __collapse_huge_page_swapin will return with mmap_lock released - * when it fails. So we jump out_nolock directly in that case. - * Continuing to collapse causes inconsistency. - */ - if (unmapped && !__collapse_huge_page_swapin(mm, vma, address, - pmd, referenced)) { - goto out_nolock; + if (unmapped) { + /* + * __collapse_huge_page_swapin will return with mmap_lock + * released when it fails. So we jump out_nolock directly in + * that case. Continuing to collapse causes inconsistency. + */ + result = __collapse_huge_page_swapin(mm, vma, address, pmd, + referenced); + if (result != SCAN_SUCCEED) + goto out_nolock; } mmap_read_unlock(mm); @@ -1065,11 +1052,12 @@ static void collapse_huge_page(struct mm_struct *mm, * handled by the anon_vma lock + PG_lock. */ mmap_write_lock(mm); - result = hugepage_vma_revalidate(mm, address, &vma); - if (result) + result = hugepage_vma_revalidate(mm, address, &vma, cc); + if (result != SCAN_SUCCEED) goto out_up_write; /* check if the pmd is still valid */ - if (mm_find_pmd(mm, address) != pmd) + result = check_pmd_still_valid(mm, address, pmd); + if (result != SCAN_SUCCEED) goto out_up_write; anon_vma_lock_write(vma->anon_vma); @@ -1095,11 +1083,11 @@ static void collapse_huge_page(struct mm_struct *mm, mmu_notifier_invalidate_range_end(&range); spin_lock(pte_ptl); - isolated = __collapse_huge_page_isolate(vma, address, pte, - &compound_pagelist); + result = __collapse_huge_page_isolate(vma, address, pte, cc, + &compound_pagelist); spin_unlock(pte_ptl); - if (unlikely(!isolated)) { + if (unlikely(result != SCAN_SUCCEED)) { pte_unmap(pte); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); @@ -1111,7 +1099,6 @@ static void collapse_huge_page(struct mm_struct *mm, pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); - result = SCAN_FAIL; goto out_up_write; } @@ -1121,8 +1108,8 @@ static void collapse_huge_page(struct mm_struct *mm, */ anon_vma_unlock_write(vma->anon_vma); - __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl, - &compound_pagelist); + __collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl, + &compound_pagelist); pte_unmap(pte); /* * spin_lock() below is not the equivalent of smp_wmb(), but @@ -1130,42 +1117,43 @@ static void collapse_huge_page(struct mm_struct *mm, * avoid the copy_huge_page writes to become visible after * the set_pmd_at() write. */ - __SetPageUptodate(new_page); + __SetPageUptodate(hpage); pgtable = pmd_pgtable(_pmd); - _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); + _pmd = mk_huge_pmd(hpage, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - page_add_new_anon_rmap(new_page, vma, address); - lru_cache_add_inactive_or_unevictable(new_page, vma); + page_add_new_anon_rmap(hpage, vma, address); + lru_cache_add_inactive_or_unevictable(hpage, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); spin_unlock(pmd_ptl); - *hpage = NULL; + hpage = NULL; - khugepaged_pages_collapsed++; result = SCAN_SUCCEED; out_up_write: mmap_write_unlock(mm); out_nolock: - if (!IS_ERR_OR_NULL(*hpage)) - mem_cgroup_uncharge(page_folio(*hpage)); - trace_mm_collapse_huge_page(mm, isolated, result); - return; + if (hpage) { + mem_cgroup_uncharge(page_folio(hpage)); + put_page(hpage); + } + trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); + return result; } -static int khugepaged_scan_pmd(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, - struct page **hpage) +static int hpage_collapse_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, bool *mmap_locked, + struct collapse_control *cc) { pmd_t *pmd; pte_t *pte, *_pte; - int ret = 0, result = 0, referenced = 0; + int result = SCAN_FAIL, referenced = 0; int none_or_zero = 0, shared = 0; struct page *page = NULL; unsigned long _address; @@ -1175,19 +1163,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); - pmd = mm_find_pmd(mm, address); - if (!pmd) { - result = SCAN_PMD_NULL; + result = find_pmd_or_thp_or_none(mm, address, &pmd); + if (result != SCAN_SUCCEED) goto out; - } - memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); + memset(cc->node_load, 0, sizeof(cc->node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; if (is_swap_pte(pteval)) { - if (++unmapped <= khugepaged_max_ptes_swap) { + ++unmapped; + if (!cc->is_khugepaged || + unmapped <= khugepaged_max_ptes_swap) { /* * Always be strict with uffd-wp * enabled swap entries. Please see @@ -1205,8 +1193,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, } } if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + ++none_or_zero; if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { + (!cc->is_khugepaged || + none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -1236,27 +1226,30 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, goto out_unmap; } - if (page_mapcount(page) > 1 && - ++shared > khugepaged_max_ptes_shared) { - result = SCAN_EXCEED_SHARED_PTE; - count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); - goto out_unmap; + if (page_mapcount(page) > 1) { + ++shared; + if (cc->is_khugepaged && + shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); + goto out_unmap; + } } page = compound_head(page); /* * Record which node the original page is from and save this - * information to khugepaged_node_load[]. + * information to cc->node_load[]. * Khugepaged will allocate hugepage from the node has the max * hit record. */ node = page_to_nid(page); - if (khugepaged_scan_abort(node)) { + if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; goto out_unmap; } - khugepaged_node_load[node]++; + cc->node_load[node]++; if (!PageLRU(page)) { result = SCAN_PAGE_LRU; goto out_unmap; @@ -1291,31 +1284,38 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, result = SCAN_PAGE_COUNT; goto out_unmap; } - if (pte_young(pteval) || - page_is_young(page) || PageReferenced(page) || - mmu_notifier_test_young(vma->vm_mm, address)) + + /* + * If collapse was initiated by khugepaged, check that there is + * enough young pte to justify collapsing the page + */ + if (cc->is_khugepaged && + (pte_young(pteval) || page_is_young(page) || + PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, + address))) referenced++; } if (!writable) { result = SCAN_PAGE_RO; - } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) { + } else if (cc->is_khugepaged && + (!referenced || + (unmapped && referenced < HPAGE_PMD_NR / 2))) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; - ret = 1; } out_unmap: pte_unmap_unlock(pte, ptl); - if (ret) { - node = khugepaged_find_target_node(); + if (result == SCAN_SUCCEED) { + result = collapse_huge_page(mm, address, referenced, + unmapped, cc); /* collapse_huge_page will return with the mmap_lock released */ - collapse_huge_page(mm, address, hpage, node, - referenced, unmapped); + *mmap_locked = false; } out: trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, none_or_zero, result, unmapped); - return ret; + return result; } static void collect_mm_slot(struct mm_slot *mm_slot) @@ -1324,7 +1324,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) lockdep_assert_held(&khugepaged_mm_lock); - if (khugepaged_test_exit(mm)) { + if (hpage_collapse_test_exit(mm)) { /* free mm_slot */ hash_del(&mm_slot->hash); list_del(&mm_slot->mm_node); @@ -1402,12 +1402,13 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) return; /* - * This vm_flags may not have VM_HUGEPAGE if the page was not - * collapsed by this mm. But we can still collapse if the page is - * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check() - * will not fail the vma for missing VM_HUGEPAGE + * If we are here, we've succeeded in replacing all the native pages + * in the page cache with a single hugepage. If a mm were to fault-in + * this memory (mapped by a suitably aligned VMA), we'd get the hugepage + * and map it by a PMD, regardless of sysfs THP settings. As such, let's + * analogously elide sysfs THP settings here. */ - if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false, false)) + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) return; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -1422,8 +1423,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) if (!PageHead(hpage)) goto drop_hpage; - pmd = mm_find_pmd(mm, haddr); - if (!pmd) + if (find_pmd_or_thp_or_none(mm, haddr, &pmd) != SCAN_SUCCEED) goto drop_hpage; start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); @@ -1497,7 +1497,7 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) if (!mmap_write_trylock(mm)) return; - if (unlikely(khugepaged_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit(mm))) goto out; for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++) @@ -1541,8 +1541,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) if (vma->vm_end < addr + HPAGE_PMD_SIZE) continue; mm = vma->vm_mm; - pmd = mm_find_pmd(mm, addr); - if (!pmd) + if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) continue; /* * We need exclusive mmap_lock to retract page table. @@ -1560,7 +1559,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * it'll always mapped in small page size for uffd-wp * registered ranges. */ - if (!khugepaged_test_exit(mm) && !userfaultfd_wp(vma)) + if (!hpage_collapse_test_exit(mm) && + !userfaultfd_wp(vma)) collapse_and_free_pmd(mm, vma, addr, pmd); mmap_write_unlock(mm); } else { @@ -1577,8 +1577,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * @mm: process address space where collapse happens * @file: file that collapse on * @start: collapse start address - * @hpage: new allocated huge page for collapse - * @node: appointed node the new huge page allocate from + * @cc: collapse context and scratchpad * * Basic scheme is simple, details are more complex: * - allocate and lock a new huge page; @@ -1595,13 +1594,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * + restore gaps in the page cache; * + unlock and free huge page; */ -static void collapse_file(struct mm_struct *mm, - struct file *file, pgoff_t start, - struct page **hpage, int node) +static int collapse_file(struct mm_struct *mm, struct file *file, + pgoff_t start, struct collapse_control *cc) { struct address_space *mapping = file->f_mapping; - gfp_t gfp; - struct page *new_page; + struct page *hpage; pgoff_t index, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); @@ -1612,20 +1609,9 @@ static void collapse_file(struct mm_struct *mm, VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); - /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; - - new_page = khugepaged_alloc_page(hpage, gfp, node); - if (!new_page) { - result = SCAN_ALLOC_HUGE_PAGE_FAIL; - goto out; - } - - if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) { - result = SCAN_CGROUP_CHARGE_FAIL; + result = alloc_charge_hpage(&hpage, mm, cc); + if (result != SCAN_SUCCEED) goto out; - } - count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); /* * Ensure we have slots for all the pages in the range. This is @@ -1643,14 +1629,14 @@ static void collapse_file(struct mm_struct *mm, } } while (1); - __SetPageLocked(new_page); + __SetPageLocked(hpage); if (is_shmem) - __SetPageSwapBacked(new_page); - new_page->index = start; - new_page->mapping = mapping; + __SetPageSwapBacked(hpage); + hpage->index = start; + hpage->mapping = mapping; /* - * At this point the new_page is locked and not up-to-date. + * At this point the hpage is locked and not up-to-date. * It's safe to insert it into the page cache, because nobody would * be able to map it or use it in another way until we unlock it. */ @@ -1678,7 +1664,7 @@ static void collapse_file(struct mm_struct *mm, result = SCAN_FAIL; goto xa_locked; } - xas_store(&xas, new_page); + xas_store(&xas, hpage); nr_none++; continue; } @@ -1820,19 +1806,19 @@ static void collapse_file(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - xas_store(&xas, new_page); + xas_store(&xas, hpage); continue; out_unlock: unlock_page(page); put_page(page); goto xa_unlocked; } - nr = thp_nr_pages(new_page); + nr = thp_nr_pages(hpage); if (is_shmem) - __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr); + __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr); else { - __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr); + __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr); filemap_nr_thps_inc(mapping); /* * Paired with smp_mb() in do_dentry_open() to ensure @@ -1843,21 +1829,21 @@ out_unlock: smp_mb(); if (inode_is_open_for_write(mapping->host)) { result = SCAN_FAIL; - __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr); + __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); goto xa_locked; } } if (nr_none) { - __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none); + __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none); /* nr_none is always 0 for non-shmem. */ - __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); + __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none); } /* Join all the small entries into a single multi-index entry */ xas_set_order(&xas, start, HPAGE_PMD_ORDER); - xas_store(&xas, new_page); + xas_store(&xas, hpage); xa_locked: xas_unlock_irq(&xas); xa_unlocked: @@ -1879,11 +1865,11 @@ xa_unlocked: index = start; list_for_each_entry_safe(page, tmp, &pagelist, lru) { while (index < page->index) { - clear_highpage(new_page + (index % HPAGE_PMD_NR)); + clear_highpage(hpage + (index % HPAGE_PMD_NR)); index++; } - copy_highpage(new_page + (page->index % HPAGE_PMD_NR), - page); + copy_highpage(hpage + (page->index % HPAGE_PMD_NR), + page); list_del(&page->lru); page->mapping = NULL; page_ref_unfreeze(page, 1); @@ -1894,23 +1880,22 @@ xa_unlocked: index++; } while (index < end) { - clear_highpage(new_page + (index % HPAGE_PMD_NR)); + clear_highpage(hpage + (index % HPAGE_PMD_NR)); index++; } - SetPageUptodate(new_page); - page_ref_add(new_page, HPAGE_PMD_NR - 1); + SetPageUptodate(hpage); + page_ref_add(hpage, HPAGE_PMD_NR - 1); if (is_shmem) - set_page_dirty(new_page); - lru_cache_add(new_page); + set_page_dirty(hpage); + lru_cache_add(hpage); /* * Remove pte page tables, so we can re-fault the page as huge. */ retract_page_tables(mapping, start); - *hpage = NULL; - - khugepaged_pages_collapsed++; + unlock_page(hpage); + hpage = NULL; } else { struct page *page; @@ -1949,19 +1934,23 @@ xa_unlocked: VM_BUG_ON(nr_none); xas_unlock_irq(&xas); - new_page->mapping = NULL; + hpage->mapping = NULL; } - unlock_page(new_page); + if (hpage) + unlock_page(hpage); out: VM_BUG_ON(!list_empty(&pagelist)); - if (!IS_ERR_OR_NULL(*hpage)) - mem_cgroup_uncharge(page_folio(*hpage)); + if (hpage) { + mem_cgroup_uncharge(page_folio(hpage)); + put_page(hpage); + } /* TODO: tracepoints */ + return result; } -static void khugepaged_scan_file(struct mm_struct *mm, - struct file *file, pgoff_t start, struct page **hpage) +static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, + pgoff_t start, struct collapse_control *cc) { struct page *page = NULL; struct address_space *mapping = file->f_mapping; @@ -1972,14 +1961,16 @@ static void khugepaged_scan_file(struct mm_struct *mm, present = 0; swap = 0; - memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); + memset(cc->node_load, 0, sizeof(cc->node_load)); rcu_read_lock(); xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) { if (xas_retry(&xas, page)) continue; if (xa_is_value(page)) { - if (++swap > khugepaged_max_ptes_swap) { + ++swap; + if (cc->is_khugepaged && + swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); break; @@ -1997,11 +1988,11 @@ static void khugepaged_scan_file(struct mm_struct *mm, } node = page_to_nid(page); - if (khugepaged_scan_abort(node)) { + if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; break; } - khugepaged_node_load[node]++; + cc->node_load[node]++; if (!PageLRU(page)) { result = SCAN_PAGE_LRU; @@ -2030,20 +2021,21 @@ static void khugepaged_scan_file(struct mm_struct *mm, rcu_read_unlock(); if (result == SCAN_SUCCEED) { - if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { + if (cc->is_khugepaged && + present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { - node = khugepaged_find_target_node(); - collapse_file(mm, file, start, hpage, node); + result = collapse_file(mm, file, start, cc); } } /* TODO: tracepoints */ + return result; } #else -static void khugepaged_scan_file(struct mm_struct *mm, - struct file *file, pgoff_t start, struct page **hpage) +static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, + pgoff_t start, struct collapse_control *cc) { BUILD_BUG(); } @@ -2053,8 +2045,8 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) } #endif -static unsigned int khugepaged_scan_mm_slot(unsigned int pages, - struct page **hpage) +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, + struct collapse_control *cc) __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) { @@ -2065,6 +2057,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, VM_BUG_ON(!pages); lockdep_assert_held(&khugepaged_mm_lock); + *result = SCAN_FAIL; if (khugepaged_scan.mm_slot) mm_slot = khugepaged_scan.mm_slot; @@ -2085,7 +2078,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, vma = NULL; if (unlikely(!mmap_read_trylock(mm))) goto breakouterloop_mmap_lock; - if (likely(!khugepaged_test_exit(mm))) + if (likely(!hpage_collapse_test_exit(mm))) vma = find_vma(mm, khugepaged_scan.address); progress++; @@ -2093,11 +2086,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, unsigned long hstart, hend; cond_resched(); - if (unlikely(khugepaged_test_exit(mm))) { + if (unlikely(hpage_collapse_test_exit(mm))) { progress++; break; } - if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) { + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) { skip: progress++; continue; @@ -2111,9 +2104,10 @@ skip: VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); while (khugepaged_scan.address < hend) { - int ret; + bool mmap_locked = true; + cond_resched(); - if (unlikely(khugepaged_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit(mm))) goto breakouterloop; VM_BUG_ON(khugepaged_scan.address < hstart || @@ -2125,19 +2119,29 @@ skip: khugepaged_scan.address); mmap_read_unlock(mm); - ret = 1; - khugepaged_scan_file(mm, file, pgoff, hpage); + *result = khugepaged_scan_file(mm, file, pgoff, + cc); + mmap_locked = false; fput(file); } else { - ret = khugepaged_scan_pmd(mm, vma, - khugepaged_scan.address, - hpage); + *result = hpage_collapse_scan_pmd(mm, vma, + khugepaged_scan.address, + &mmap_locked, + cc); } + if (*result == SCAN_SUCCEED) + ++khugepaged_pages_collapsed; /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; progress += HPAGE_PMD_NR; - if (ret) - /* we released mmap_lock so break loop */ + if (!mmap_locked) + /* + * We released mmap_lock so break loop. Note + * that we drop mmap_lock before all hugepage + * allocations, so if allocation fails, we are + * guaranteed to break here and report the + * correct result back to caller. + */ goto breakouterloop_mmap_lock; if (progress >= pages) goto breakouterloop; @@ -2153,7 +2157,7 @@ breakouterloop_mmap_lock: * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. */ - if (khugepaged_test_exit(mm) || !vma) { + if (hpage_collapse_test_exit(mm) || !vma) { /* * Make sure that if mm_users is reaching zero while * khugepaged runs here, khugepaged_exit will find @@ -2187,19 +2191,16 @@ static int khugepaged_wait_event(void) kthread_should_stop(); } -static void khugepaged_do_scan(void) +static void khugepaged_do_scan(struct collapse_control *cc) { - struct page *hpage = NULL; unsigned int progress = 0, pass_through_head = 0; unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); bool wait = true; + int result = SCAN_SUCCEED; lru_add_drain_all(); - while (progress < pages) { - if (!khugepaged_prealloc_page(&hpage, &wait)) - break; - + while (true) { cond_resched(); if (unlikely(kthread_should_stop() || try_to_freeze())) @@ -2211,14 +2212,25 @@ static void khugepaged_do_scan(void) if (khugepaged_has_work() && pass_through_head < 2) progress += khugepaged_scan_mm_slot(pages - progress, - &hpage); + &result, cc); else progress = pages; spin_unlock(&khugepaged_mm_lock); - } - if (!IS_ERR_OR_NULL(hpage)) - put_page(hpage); + if (progress >= pages) + break; + + if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) { + /* + * If fail to allocate the first time, try to sleep for + * a while. When hit again, cancel the scan. + */ + if (!wait) + break; + wait = false; + khugepaged_alloc_sleep(); + } + } } static bool khugepaged_should_wakeup(void) @@ -2255,7 +2267,7 @@ static int khugepaged(void *none) set_user_nice(current, MAX_NICE); while (!kthread_should_stop()) { - khugepaged_do_scan(); + khugepaged_do_scan(&khugepaged_collapse_control); khugepaged_wait_work(); } @@ -2354,3 +2366,120 @@ void khugepaged_min_free_kbytes_update(void) set_recommended_min_free_kbytes(); mutex_unlock(&khugepaged_mutex); } + +static int madvise_collapse_errno(enum scan_result r) +{ + /* + * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide + * actionable feedback to caller, so they may take an appropriate + * fallback measure depending on the nature of the failure. + */ + switch (r) { + case SCAN_ALLOC_HUGE_PAGE_FAIL: + return -ENOMEM; + case SCAN_CGROUP_CHARGE_FAIL: + return -EBUSY; + /* Resource temporary unavailable - trying again might succeed */ + case SCAN_PAGE_LOCK: + case SCAN_PAGE_LRU: + return -EAGAIN; + /* + * Other: Trying again likely not to succeed / error intrinsic to + * specified memory range. khugepaged likely won't be able to collapse + * either. + */ + default: + return -EINVAL; + } +} + +int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + struct collapse_control *cc; + struct mm_struct *mm = vma->vm_mm; + unsigned long hstart, hend, addr; + int thps = 0, last_fail = SCAN_FAIL; + bool mmap_locked = true; + + BUG_ON(vma->vm_start > start); + BUG_ON(vma->vm_end < end); + + *prev = vma; + + /* TODO: Support file/shmem */ + if (!vma->anon_vma || !vma_is_anonymous(vma)) + return -EINVAL; + + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) + return -EINVAL; + + cc = kmalloc(sizeof(*cc), GFP_KERNEL); + if (!cc) + return -ENOMEM; + cc->is_khugepaged = false; + cc->last_target_node = NUMA_NO_NODE; + + mmgrab(mm); + lru_add_drain_all(); + + hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = end & HPAGE_PMD_MASK; + + for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { + int result = SCAN_FAIL; + + if (!mmap_locked) { + cond_resched(); + mmap_read_lock(mm); + mmap_locked = true; + result = hugepage_vma_revalidate(mm, addr, &vma, cc); + if (result != SCAN_SUCCEED) { + last_fail = result; + goto out_nolock; + } + } + mmap_assert_locked(mm); + memset(cc->node_load, 0, sizeof(cc->node_load)); + result = hpage_collapse_scan_pmd(mm, vma, addr, &mmap_locked, + cc); + if (!mmap_locked) + *prev = NULL; /* Tell caller we dropped mmap_lock */ + + switch (result) { + case SCAN_SUCCEED: + case SCAN_PMD_MAPPED: + ++thps; + break; + /* Whitelisted set of results where continuing OK */ + case SCAN_PMD_NULL: + case SCAN_PTE_NON_PRESENT: + case SCAN_PTE_UFFD_WP: + case SCAN_PAGE_RO: + case SCAN_LACK_REFERENCED_PAGE: + case SCAN_PAGE_NULL: + case SCAN_PAGE_COUNT: + case SCAN_PAGE_LOCK: + case SCAN_PAGE_COMPOUND: + case SCAN_PAGE_LRU: + last_fail = result; + break; + default: + last_fail = result; + /* Other error, exit */ + goto out_maybelock; + } + } + +out_maybelock: + /* Caller expects us to hold mmap_lock on return */ + if (!mmap_locked) + mmap_read_lock(mm); +out_nolock: + mmap_assert_locked(mm); + mmdrop(mm); + kfree(cc); + + return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0 + : madvise_collapse_errno(last_fail); +} diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 1eddc0132f7f..37af2dc8dac9 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -604,9 +604,8 @@ static int __save_stack_trace(unsigned long *trace) * memory block and add it to the object_list and object_tree_root (or * object_phys_tree_root). */ -static struct kmemleak_object *__create_object(unsigned long ptr, size_t size, - int min_count, gfp_t gfp, - bool is_phys) +static void __create_object(unsigned long ptr, size_t size, + int min_count, gfp_t gfp, bool is_phys) { unsigned long flags; struct kmemleak_object *object, *parent; @@ -618,7 +617,7 @@ static struct kmemleak_object *__create_object(unsigned long ptr, size_t size, if (!object) { pr_warn("Cannot allocate a kmemleak_object structure\n"); kmemleak_disable(); - return NULL; + return; } INIT_LIST_HEAD(&object->object_list); @@ -687,7 +686,6 @@ static struct kmemleak_object *__create_object(unsigned long ptr, size_t size, */ dump_object_info(parent); kmem_cache_free(object_cache, object); - object = NULL; goto out; } } @@ -698,21 +696,20 @@ static struct kmemleak_object *__create_object(unsigned long ptr, size_t size, list_add_tail_rcu(&object->object_list, &object_list); out: raw_spin_unlock_irqrestore(&kmemleak_lock, flags); - return object; } /* Create kmemleak object which allocated with virtual address. */ -static struct kmemleak_object *create_object(unsigned long ptr, size_t size, - int min_count, gfp_t gfp) +static void create_object(unsigned long ptr, size_t size, + int min_count, gfp_t gfp) { - return __create_object(ptr, size, min_count, gfp, false); + __create_object(ptr, size, min_count, gfp, false); } /* Create kmemleak object which allocated with physical address. */ -static struct kmemleak_object *create_object_phys(unsigned long ptr, size_t size, - int min_count, gfp_t gfp) +static void create_object_phys(unsigned long ptr, size_t size, + int min_count, gfp_t gfp) { - return __create_object(ptr, size, min_count, gfp, true); + __create_object(ptr, size, min_count, gfp, true); } /* @@ -1095,6 +1095,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, goto out_unlock; } + /* See page_try_share_anon_rmap(): clear PTE first. */ if (anon_exclusive && page_try_share_anon_rmap(page)) { set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; @@ -1134,6 +1135,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, { struct mm_struct *mm = vma->vm_mm; pmd_t *pmd; + pmd_t pmde; pte_t *ptep; pte_t newpte; spinlock_t *ptl; @@ -1148,6 +1150,15 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, pmd = mm_find_pmd(mm, addr); if (!pmd) goto out; + /* + * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() + * without holding anon_vma lock for write. So when looking for a + * genuine pmde (in which to find pte), test present and !THP together. + */ + pmde = *pmd; + barrier(); + if (!pmd_present(pmde) || pmd_trans_huge(pmde)) + goto out; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); diff --git a/mm/madvise.c b/mm/madvise.c index 9ff51650f4f0..4f86eb7f554d 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -59,6 +59,7 @@ static int madvise_need_mmap_write(int behavior) case MADV_FREE: case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: + case MADV_COLLAPSE: return 0; default: /* be safe, default to 1. list exceptions explicitly */ @@ -1060,6 +1061,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, if (error) goto out; break; + case MADV_COLLAPSE: + return madvise_collapse(vma, prev, start, end); } anon_name = anon_vma_name(vma); @@ -1153,6 +1156,7 @@ madvise_behavior_valid(int behavior) #ifdef CONFIG_TRANSPARENT_HUGEPAGE case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: + case MADV_COLLAPSE: #endif case MADV_DONTDUMP: case MADV_DODUMP: @@ -1169,13 +1173,13 @@ madvise_behavior_valid(int behavior) } } -static bool -process_madvise_behavior_valid(int behavior) +static bool process_madvise_behavior_valid(int behavior) { switch (behavior) { case MADV_COLD: case MADV_PAGEOUT: case MADV_WILLNEED: + case MADV_COLLAPSE: return true; default: return false; @@ -1342,6 +1346,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, * MADV_NOHUGEPAGE - mark the given range as not worth being backed by * transparent huge pages so the existing pages will not be * coalesced into THP and new pages will not be allocated as THP. + * MADV_COLLAPSE - synchronously coalesce pages into new THP. * MADV_DONTDUMP - the application wants to prevent pages in the given range * from being included in its core dump. * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b69979c9ced5..403af5f7a2b9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1143,7 +1143,7 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) } while ((memcg = parent_mem_cgroup(memcg))); /* - * When cgruop1 non-hierarchy mode is used, + * When cgroup1 non-hierarchy mode is used, * parent_mem_cgroup() does not walk all the way up to the * cgroup root (root_mem_cgroup). So we have to handle * dead_memcg from cgroup root separately. @@ -3975,6 +3975,8 @@ static const unsigned int memcg1_stats[] = { NR_FILE_MAPPED, NR_FILE_DIRTY, NR_WRITEBACK, + WORKINGSET_REFAULT_ANON, + WORKINGSET_REFAULT_FILE, MEMCG_SWAP, }; @@ -3988,6 +3990,8 @@ static const char *const memcg1_stat_names[] = { "mapped_file", "dirty", "writeback", + "workingset_refault_anon", + "workingset_refault_file", "swap", }; @@ -4016,7 +4020,8 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; nr = memcg_page_state_local(memcg, memcg1_stats[i]); - seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE); + seq_printf(m, "%s %lu\n", memcg1_stat_names[i], + nr * memcg_page_state_unit(memcg1_stats[i])); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) @@ -4047,7 +4052,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) continue; nr = memcg_page_state(memcg, memcg1_stats[i]); seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], - (u64)nr * PAGE_SIZE); + (u64)nr * memcg_page_state_unit(memcg1_stats[i])); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e7ac570dda75..265378237c22 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -413,7 +413,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail, { struct to_kill *tk, *next; - list_for_each_entry_safe (tk, next, to_kill, nd) { + list_for_each_entry_safe(tk, next, to_kill, nd) { if (forcekill) { /* * In case something went wrong with munmapping @@ -437,6 +437,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail, pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n", pfn, tk->tsk->comm, tk->tsk->pid); } + list_del(&tk->nd); put_task_struct(tk->tsk); kfree(tk); } @@ -1401,7 +1402,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, struct address_space *mapping; LIST_HEAD(tokill); bool unmap_success; - int kill = 1, forcekill; + int forcekill; bool mlocked = PageMlocked(hpage); /* @@ -1442,7 +1443,6 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, if (page_mkclean(hpage)) { SetPageDirty(hpage); } else { - kill = 0; ttu |= TTU_IGNORE_HWPOISON; pr_info("%#lx: corrupted page was clean: dropped without side effects\n", pfn); @@ -1453,12 +1453,8 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * First collect all the processes that have the page * mapped in dirty form. This has to be done before try_to_unmap, * because ttu takes the rmap data structures down. - * - * Error handling: We ignore errors here because - * there's nothing that can be done. */ - if (kill) - collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); + collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); if (PageHuge(hpage) && !PageAnon(hpage)) { /* @@ -1500,7 +1496,8 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. */ - forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL); + forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL) || + !unmap_success; kill_procs(&tokill, forcekill, !unmap_success, pfn, flags); return unmap_success; @@ -1529,20 +1526,18 @@ static int identify_page_state(unsigned long pfn, struct page *p, return page_action(ps, p, pfn); } -static int try_to_split_thp_page(struct page *page, const char *msg) +static int try_to_split_thp_page(struct page *page) { + int ret; + lock_page(page); - if (unlikely(split_huge_page(page))) { - unsigned long pfn = page_to_pfn(page); + ret = split_huge_page(page); + unlock_page(page); - unlock_page(page); - pr_info("%s: %#lx: thp split failed\n", msg, pfn); + if (unlikely(ret)) put_page(page); - return -EBUSY; - } - unlock_page(page); - return 0; + return ret; } static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn, @@ -1867,8 +1862,10 @@ retry: if (hwpoison_filter(p)) { hugetlb_clear_page_hwpoison(head); - res = -EOPNOTSUPP; - goto out; + unlock_page(head); + if (res == 1) + put_page(head); + return -EOPNOTSUPP; } /* @@ -2084,7 +2081,7 @@ try_again: * page is a valid handlable page. */ SetPageHasHWPoisoned(hpage); - if (try_to_split_thp_page(p, "Memory Failure") < 0) { + if (try_to_split_thp_page(p) < 0) { action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); res = -EBUSY; goto unlock_mutex; @@ -2359,7 +2356,7 @@ int unpoison_memory(unsigned long pfn) goto unlock_mutex; } - if (PageSlab(page) || PageTable(page)) + if (PageSlab(page) || PageTable(page) || PageReserved(page)) goto unlock_mutex; ret = get_hwpoison_page(p, MF_UNPOISON); @@ -2383,13 +2380,14 @@ int unpoison_memory(unsigned long pfn) count = free_raw_hwp_pages(page, false); if (count == 0) { ret = -EBUSY; + put_page(page); goto unlock_mutex; } } freeit = !!TestClearPageHWPoison(p); put_page(page); - if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) { + if (freeit) { put_page(page); ret = 0; } @@ -2439,11 +2437,11 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) } /* - * __soft_offline_page handles hugetlb-pages and non-hugetlb pages. + * soft_offline_in_use_page handles hugetlb-pages and non-hugetlb pages. * If the page is a non-dirty unmapped page-cache page, it simply invalidates. * If the page is mapped, it migrates the contents over. */ -static int __soft_offline_page(struct page *page) +static int soft_offline_in_use_page(struct page *page) { long ret = 0; unsigned long pfn = page_to_pfn(page); @@ -2456,6 +2454,14 @@ static int __soft_offline_page(struct page *page) .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, }; + if (!huge && PageTransHuge(hpage)) { + if (try_to_split_thp_page(page)) { + pr_info("soft offline: %#lx: thp split failed\n", pfn); + return -EBUSY; + } + hpage = page; + } + lock_page(page); if (!PageHuge(page)) wait_on_page_writeback(page); @@ -2505,26 +2511,6 @@ static int __soft_offline_page(struct page *page) return ret; } -static int soft_offline_in_use_page(struct page *page) -{ - struct page *hpage = compound_head(page); - - if (!PageHuge(page) && PageTransHuge(hpage)) - if (try_to_split_thp_page(page, "soft offline") < 0) - return -EBUSY; - return __soft_offline_page(page); -} - -static int soft_offline_free_page(struct page *page) -{ - int rc = 0; - - if (!page_handle_poison(page, true, false)) - rc = -EBUSY; - - return rc; -} - static void put_ref_page(struct page *page) { if (page) @@ -2592,8 +2578,6 @@ retry: if (hwpoison_filter(page)) { if (ret > 0) put_page(page); - else - put_ref_page(ref_page); mutex_unlock(&mf_mutex); return -EOPNOTSUPP; @@ -2602,7 +2586,7 @@ retry: if (ret > 0) { ret = soft_offline_in_use_page(page); } else if (ret == 0) { - if (soft_offline_free_page(page) && try_again) { + if (!page_handle_poison(page, true, false) && try_again) { try_again = false; flags &= ~MF_COUNT_INCREASED; goto retry; diff --git a/mm/memory.c b/mm/memory.c index a78814413ac0..e38f9245470c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -74,6 +74,7 @@ #include <linux/perf_event.h> #include <linux/ptrace.h> #include <linux/vmalloc.h> +#include <linux/sched/sysctl.h> #include <trace/events/kmem.h> @@ -4731,8 +4732,16 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED)) flags |= TNF_SHARED; - last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); + /* + * For memory tiering mode, cpupid of slow memory page is used + * to record page access time. So use default value. + */ + if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && + !node_is_toptier(page_nid)) + last_cpupid = (-1 & LAST_CPUPID_MASK); + else + last_cpupid = page_cpupid_last(page); target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, &flags); if (target_nid == NUMA_NO_NODE) { @@ -4991,7 +5000,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - hugepage_vma_check(vma, vm_flags, false, true)) { + hugepage_vma_check(vma, vm_flags, false, true, true)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -5025,7 +5034,7 @@ retry_pud: goto retry_pud; if (pmd_none(*vmf.pmd) && - hugepage_vma_check(vma, vm_flags, false, true)) { + hugepage_vma_check(vma, vm_flags, false, true, true)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fad6d1f2262a..9ae1f98548b1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1940,8 +1940,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, node_states_clear_node(node, &arg); if (arg.status_change_nid >= 0) { - kswapd_stop(node); kcompactd_stop(node); + kswapd_stop(node); } writeback_set_ratelimit(); @@ -1969,11 +1969,10 @@ failed_removal: static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) { - int ret = !is_memblock_offlined(mem); int *nid = arg; *nid = mem->nid; - if (unlikely(ret)) { + if (unlikely(mem->state != MEM_OFFLINE)) { phys_addr_t beginpa, endpa; beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b73d3248d976..a88fd94e18d6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -853,12 +853,14 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, goto out; } + task_lock(current); ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { + task_unlock(current); mpol_put(new); goto out; } - task_lock(current); + old = current->mempolicy; current->mempolicy = new; if (new && new->mode == MPOL_INTERLEAVE) @@ -1803,7 +1805,7 @@ bool vma_policy_mof(struct vm_area_struct *vma) return pol->flags & MPOL_F_MOF; } -static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) +bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) { enum zone_type dynamic_policy_zone = policy_zone; diff --git a/mm/memremap.c b/mm/memremap.c index 58b20c3c300b..25029a474d30 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -454,7 +454,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, /* fall back to slow path lookup */ rcu_read_lock(); pgmap = xa_load(&pgmap_array, PHYS_PFN(phys)); - if (pgmap && !percpu_ref_tryget_live(&pgmap->ref)) + if (pgmap && !percpu_ref_tryget_live_rcu(&pgmap->ref)) pgmap = NULL; rcu_read_unlock(); diff --git a/mm/migrate.c b/mm/migrate.c index 6a1597c92261..ce6a58f3b21f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -560,6 +560,18 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) * future migrations of this same page. */ cpupid = page_cpupid_xchg_last(&folio->page, -1); + /* + * For memory tiering mode, when migrate between slow and fast + * memory node, reset cpupid, because that is used to record + * page access time in slow memory node. + */ + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) { + bool f_toptier = node_is_toptier(page_to_nid(&folio->page)); + bool t_toptier = node_is_toptier(page_to_nid(&newfolio->page)); + + if (f_toptier != t_toptier) + cpupid = -1; + } page_cpupid_xchg_last(&newfolio->page, cpupid); folio_migrate_ksm(newfolio, folio); @@ -1848,6 +1860,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, for (i = 0; i < nr_pages; i++) { unsigned long addr = (unsigned long)(*pages); + unsigned int foll_flags = FOLL_DUMP; struct vm_area_struct *vma; struct page *page; int err = -EFAULT; @@ -1856,8 +1869,12 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, if (!vma) goto set_status; + /* Not all huge page follow APIs support 'FOLL_GET' */ + if (!is_vm_hugetlb_page(vma)) + foll_flags |= FOLL_GET; + /* FOLL_DUMP to ignore special (like zero) pages */ - page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); + page = follow_page(vma, addr, foll_flags); err = PTR_ERR(page); if (IS_ERR(page)) @@ -1865,7 +1882,8 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, if (page && !is_zone_device_page(page)) { err = page_to_nid(page); - put_page(page); + if (foll_flags & FOLL_GET) + put_page(page); } else { err = -ENOENT; } diff --git a/mm/migrate_device.c b/mm/migrate_device.c index dbf6c7a7a7c9..d8efd5a0eb40 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -186,9 +186,16 @@ again: get_page(page); /* - * Optimize for the common case where page is only mapped once - * in one process. If we can lock the page, then we can safely - * set up a special migration page table entry now. + * We rely on trylock_page() to avoid deadlock between + * concurrent migrations where each is waiting on the others + * page lock. If we can't immediately lock the page we fail this + * migration as it is only best effort anyway. + * + * If we can lock the page it's safe to set up a migration entry + * now. In the common case where the page is mapped once in a + * single process setting up the migration entry now is an + * optimisation to avoid walking the rmap later with + * try_to_migrate(). */ if (trylock_page(page)) { bool anon_exclusive; diff --git a/mm/mmap.c b/mm/mmap.c index 9d780f415be3..dd25a2aa94f7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2232,6 +2232,9 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, */ pgoff = 0; get_area = shmem_get_unmapped_area; + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + /* Ensures that larger anonymous mappings are THP aligned. */ + get_area = thp_get_unmapped_area; } addr = get_area(file, addr, len, pgoff, flags); diff --git a/mm/mprotect.c b/mm/mprotect.c index bc6bddd156ca..ed013f836b4a 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -121,6 +121,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, if (prot_numa) { struct page *page; int nid; + bool toptier; /* Avoid TLB flush if possible */ if (pte_protnone(oldpte)) @@ -150,14 +151,19 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, nid = page_to_nid(page); if (target_node == nid) continue; + toptier = node_is_toptier(nid); /* * Skip scanning top tier node if normal numa * balancing is disabled */ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && - node_is_toptier(nid)) + toptier) continue; + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && + !toptier) + xchg_page_access_time(page, + jiffies_to_msecs(jiffies)); } oldpte = ptep_modify_prot_start(vma, addr, pte); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d04211f0ef0b..262896bd1a90 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -482,6 +482,8 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) { static unsigned long prev_end_pfn, nr_initialised; + if (early_page_ext_enabled()) + return false; /* * prev_end_pfn static that contains the end of previous zone * No need to protect because called very early in boot before smp_init. @@ -3010,7 +3012,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, * i.e. orders < pageblock_order. If there are no local zones free, * the zonelists will be reiterated without ALLOC_NOFRAGMENT. */ - if (alloc_flags & ALLOC_NOFRAGMENT) + if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT) min_order = pageblock_order; /* @@ -3598,16 +3600,11 @@ EXPORT_SYMBOL_GPL(split_page); int __isolate_free_page(struct page *page, unsigned int order) { - unsigned long watermark; - struct zone *zone; - int mt; - - BUG_ON(!PageBuddy(page)); - - zone = page_zone(page); - mt = get_pageblock_migratetype(page); + struct zone *zone = page_zone(page); + int mt = get_pageblock_migratetype(page); if (!is_migrate_isolate(mt)) { + unsigned long watermark; /* * Obey watermarks as if the page was being allocated. We can * emulate a high-order watermark check with a raised order-0 @@ -3621,8 +3618,6 @@ int __isolate_free_page(struct page *page, unsigned int order) __mod_zone_freepage_state(zone, -(1UL << order), mt); } - /* Remove page from free list */ - del_page_from_free_list(page, zone, order); /* @@ -3643,7 +3638,6 @@ int __isolate_free_page(struct page *page, unsigned int order) } } - return 1UL << order; } @@ -3777,8 +3771,7 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, /* Lock and remove page from the per-cpu list */ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct zone *zone, unsigned int order, - gfp_t gfp_flags, int migratetype, - unsigned int alloc_flags) + int migratetype, unsigned int alloc_flags) { struct per_cpu_pages *pcp; struct list_head *list; @@ -3839,7 +3832,7 @@ struct page *rmqueue(struct zone *preferred_zone, if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA || migratetype != MIGRATE_MOVABLE) { page = rmqueue_pcplist(preferred_zone, zone, order, - gfp_flags, migratetype, alloc_flags); + migratetype, alloc_flags); if (likely(page)) goto out; } @@ -7659,6 +7652,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) int i; pgdat_resize_init(pgdat); + pgdat_kswapd_lock_init(pgdat); pgdat_init_split_queue(pgdat); pgdat_init_kcompactd(pgdat); @@ -7953,17 +7947,6 @@ unsigned long __init node_map_pfn_alignment(void) return ~accl_mask + 1; } -/** - * find_min_pfn_with_active_regions - Find the minimum PFN registered - * - * Return: the minimum PFN based on information provided via - * memblock_set_node(). - */ -unsigned long __init find_min_pfn_with_active_regions(void) -{ - return PHYS_PFN(memblock_start_of_DRAM()); -} - /* * early_calculate_totalpages() * Sum pages in active regions for movable zone. @@ -8256,7 +8239,7 @@ void __init free_area_init(unsigned long *max_zone_pfn) memset(arch_zone_highest_possible_pfn, 0, sizeof(arch_zone_highest_possible_pfn)); - start_pfn = find_min_pfn_with_active_regions(); + start_pfn = PHYS_PFN(memblock_start_of_DRAM()); descending = arch_has_descending_max_zone_pfns(); for (i = 0; i < MAX_NR_ZONES; i++) { @@ -9019,7 +9002,7 @@ void *__init alloc_large_system_hash(const char *tablename, { unsigned long long max = high_limit; unsigned long log2qty, size; - void *table = NULL; + void *table; gfp_t gfp_flags; bool virt; bool huge; diff --git a/mm/page_counter.c b/mm/page_counter.c index eb156ff5d603..db20d6452b71 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -17,24 +17,23 @@ static void propagate_protected_usage(struct page_counter *c, unsigned long usage) { unsigned long protected, old_protected; - unsigned long low, min; long delta; if (!c->parent) return; - min = READ_ONCE(c->min); - if (min || atomic_long_read(&c->min_usage)) { - protected = min(usage, min); + protected = min(usage, READ_ONCE(c->min)); + old_protected = atomic_long_read(&c->min_usage); + if (protected != old_protected) { old_protected = atomic_long_xchg(&c->min_usage, protected); delta = protected - old_protected; if (delta) atomic_long_add(delta, &c->parent->children_min_usage); } - low = READ_ONCE(c->low); - if (low || atomic_long_read(&c->low_usage)) { - protected = min(usage, low); + protected = min(usage, READ_ONCE(c->low)); + old_protected = atomic_long_read(&c->low_usage); + if (protected != old_protected) { old_protected = atomic_long_xchg(&c->low_usage, protected); delta = protected - old_protected; if (delta) @@ -193,7 +192,7 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) old = xchg(&counter->max, nr_pages); - if (page_counter_read(counter) <= usage) + if (page_counter_read(counter) <= usage || nr_pages >= old) return 0; counter->max = old; diff --git a/mm/page_ext.c b/mm/page_ext.c index 3dc715d7ac29..affe80243b6d 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -9,6 +9,7 @@ #include <linux/page_owner.h> #include <linux/page_idle.h> #include <linux/page_table_check.h> +#include <linux/rcupdate.h> /* * struct page extension @@ -59,6 +60,10 @@ * can utilize this callback to initialize the state of it correctly. */ +#ifdef CONFIG_SPARSEMEM +#define PAGE_EXT_INVALID (0x1) +#endif + #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) static bool need_page_idle(void) { @@ -84,6 +89,15 @@ static struct page_ext_operations *page_ext_ops[] __initdata = { unsigned long page_ext_size = sizeof(struct page_ext); static unsigned long total_usage; +static struct page_ext *lookup_page_ext(const struct page *page); + +bool early_page_ext; +static int __init setup_early_page_ext(char *str) +{ + early_page_ext = true; + return 0; +} +early_param("early_page_ext", setup_early_page_ext); static bool __init invoke_need_callbacks(void) { @@ -125,6 +139,48 @@ static inline struct page_ext *get_entry(void *base, unsigned long index) return base + page_ext_size * index; } +/** + * page_ext_get() - Get the extended information for a page. + * @page: The page we're interested in. + * + * Ensures that the page_ext will remain valid until page_ext_put() + * is called. + * + * Return: NULL if no page_ext exists for this page. + * Context: Any context. Caller may not sleep until they have called + * page_ext_put(). + */ +struct page_ext *page_ext_get(struct page *page) +{ + struct page_ext *page_ext; + + rcu_read_lock(); + page_ext = lookup_page_ext(page); + if (!page_ext) { + rcu_read_unlock(); + return NULL; + } + + return page_ext; +} + +/** + * page_ext_put() - Working with page extended information is done. + * @page_ext - Page extended information received from page_ext_get(). + * + * The page extended information of the page may not be valid after this + * function is called. + * + * Return: None. + * Context: Any context with corresponding page_ext_get() is called. + */ +void page_ext_put(struct page_ext *page_ext) +{ + if (unlikely(!page_ext)) + return; + + rcu_read_unlock(); +} #ifndef CONFIG_SPARSEMEM @@ -133,12 +189,13 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) pgdat->node_page_ext = NULL; } -struct page_ext *lookup_page_ext(const struct page *page) +static struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); unsigned long index; struct page_ext *base; + WARN_ON_ONCE(!rcu_read_lock_held()); base = NODE_DATA(page_to_nid(page))->node_page_ext; /* * The sanity checks the page allocator does upon freeing a @@ -206,20 +263,27 @@ fail: } #else /* CONFIG_SPARSEMEM */ +static bool page_ext_invalid(struct page_ext *page_ext) +{ + return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID); +} -struct page_ext *lookup_page_ext(const struct page *page) +static struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); + struct page_ext *page_ext = READ_ONCE(section->page_ext); + + WARN_ON_ONCE(!rcu_read_lock_held()); /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. */ - if (!section->page_ext) + if (page_ext_invalid(page_ext)) return NULL; - return get_entry(section->page_ext, pfn); + return get_entry(page_ext, pfn); } static void *__meminit alloc_page_ext(size_t size, int nid) @@ -298,9 +362,30 @@ static void __free_page_ext(unsigned long pfn) ms = __pfn_to_section(pfn); if (!ms || !ms->page_ext) return; - base = get_entry(ms->page_ext, pfn); + + base = READ_ONCE(ms->page_ext); + /* + * page_ext here can be valid while doing the roll back + * operation in online_page_ext(). + */ + if (page_ext_invalid(base)) + base = (void *)base - PAGE_EXT_INVALID; + WRITE_ONCE(ms->page_ext, NULL); + + base = get_entry(base, pfn); free_page_ext(base); - ms->page_ext = NULL; +} + +static void __invalidate_page_ext(unsigned long pfn) +{ + struct mem_section *ms; + void *val; + + ms = __pfn_to_section(pfn); + if (!ms || !ms->page_ext) + return; + val = (void *)ms->page_ext + PAGE_EXT_INVALID; + WRITE_ONCE(ms->page_ext, val); } static int __meminit online_page_ext(unsigned long start_pfn, @@ -336,13 +421,27 @@ static int __meminit online_page_ext(unsigned long start_pfn, } static int __meminit offline_page_ext(unsigned long start_pfn, - unsigned long nr_pages, int nid) + unsigned long nr_pages) { unsigned long start, end, pfn; start = SECTION_ALIGN_DOWN(start_pfn); end = SECTION_ALIGN_UP(start_pfn + nr_pages); + /* + * Freeing of page_ext is done in 3 steps to avoid + * use-after-free of it: + * 1) Traverse all the sections and mark their page_ext + * as invalid. + * 2) Wait for all the existing users of page_ext who + * started before invalidation to finish. + * 3) Free the page_ext. + */ + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) + __invalidate_page_ext(pfn); + + synchronize_rcu(); + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_ext(pfn); return 0; @@ -362,11 +461,11 @@ static int __meminit page_ext_callback(struct notifier_block *self, break; case MEM_OFFLINE: offline_page_ext(mn->start_pfn, - mn->nr_pages, mn->status_change_nid); + mn->nr_pages); break; case MEM_CANCEL_ONLINE: offline_page_ext(mn->start_pfn, - mn->nr_pages, mn->status_change_nid); + mn->nr_pages); break; case MEM_GOING_OFFLINE: break; diff --git a/mm/page_io.c b/mm/page_io.c index 68318134dc92..68d53fc27598 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -28,7 +28,7 @@ #include <linux/delayacct.h> #include "swap.h" -void end_swap_bio_write(struct bio *bio) +static void end_swap_bio_write(struct bio *bio) { struct page *page = bio_first_page_all(bio); @@ -202,7 +202,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) end_page_writeback(page); goto out; } - ret = __swap_writepage(page, wbc, end_swap_bio_write); + ret = __swap_writepage(page, wbc); out: return ret; } @@ -332,8 +332,7 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc) return 0; } -int __swap_writepage(struct page *page, struct writeback_control *wbc, - bio_end_io_t end_write_func) +int __swap_writepage(struct page *page, struct writeback_control *wbc) { struct bio *bio; int ret; @@ -358,7 +357,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc), GFP_NOIO); bio->bi_iter.bi_sector = swap_page_sector(page); - bio->bi_end_io = end_write_func; + bio->bi_end_io = end_swap_bio_write; bio_add_page(bio, page, thp_size(page), 0); bio_associate_blkg_from_page(bio, page); diff --git a/mm/page_owner.c b/mm/page_owner.c index e4c6f3f1695b..90023f938c19 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -141,7 +141,7 @@ void __reset_page_owner(struct page *page, unsigned short order) struct page_owner *page_owner; u64 free_ts_nsec = local_clock(); - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; @@ -153,6 +153,7 @@ void __reset_page_owner(struct page *page, unsigned short order) page_owner->free_ts_nsec = free_ts_nsec; page_ext = page_ext_next(page_ext); } + page_ext_put(page_ext); } static inline void __set_page_owner_handle(struct page_ext *page_ext, @@ -183,19 +184,21 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext, noinline void __set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext; depot_stack_handle_t handle; + handle = save_stack(gfp_mask); + + page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; - - handle = save_stack(gfp_mask); __set_page_owner_handle(page_ext, handle, order, gfp_mask); + page_ext_put(page_ext); } void __set_page_owner_migrate_reason(struct page *page, int reason) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); struct page_owner *page_owner; if (unlikely(!page_ext)) @@ -203,12 +206,13 @@ void __set_page_owner_migrate_reason(struct page *page, int reason) page_owner = get_page_owner(page_ext); page_owner->last_migrate_reason = reason; + page_ext_put(page_ext); } void __split_page_owner(struct page *page, unsigned int nr) { int i; - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); struct page_owner *page_owner; if (unlikely(!page_ext)) @@ -219,16 +223,24 @@ void __split_page_owner(struct page *page, unsigned int nr) page_owner->order = 0; page_ext = page_ext_next(page_ext); } + page_ext_put(page_ext); } void __folio_copy_owner(struct folio *newfolio, struct folio *old) { - struct page_ext *old_ext = lookup_page_ext(&old->page); - struct page_ext *new_ext = lookup_page_ext(&newfolio->page); + struct page_ext *old_ext; + struct page_ext *new_ext; struct page_owner *old_page_owner, *new_page_owner; - if (unlikely(!old_ext || !new_ext)) + old_ext = page_ext_get(&old->page); + if (unlikely(!old_ext)) + return; + + new_ext = page_ext_get(&newfolio->page); + if (unlikely(!new_ext)) { + page_ext_put(old_ext); return; + } old_page_owner = get_page_owner(old_ext); new_page_owner = get_page_owner(new_ext); @@ -254,6 +266,8 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) */ __set_bit(PAGE_EXT_OWNER, &new_ext->flags); __set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags); + page_ext_put(new_ext); + page_ext_put(old_ext); } void pagetypeinfo_showmixedcount_print(struct seq_file *m, @@ -307,12 +321,12 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, if (PageReserved(page)) continue; - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) - continue; + goto ext_put_continue; page_owner = get_page_owner(page_ext); page_mt = gfp_migratetype(page_owner->gfp_mask); @@ -323,9 +337,12 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, count[pageblock_mt]++; pfn = block_end_pfn; + page_ext_put(page_ext); break; } pfn += (1UL << page_owner->order) - 1; +ext_put_continue: + page_ext_put(page_ext); } } @@ -435,7 +452,7 @@ err: void __dump_page_owner(const struct page *page) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get((void *)page); struct page_owner *page_owner; depot_stack_handle_t handle; gfp_t gfp_mask; @@ -452,6 +469,7 @@ void __dump_page_owner(const struct page *page) if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { pr_alert("page_owner info is not present (never set?)\n"); + page_ext_put(page_ext); return; } @@ -482,6 +500,7 @@ void __dump_page_owner(const struct page *page) if (page_owner->last_migrate_reason != -1) pr_alert("page has been migrated, last migrate reason: %s\n", migrate_reason_names[page_owner->last_migrate_reason]); + page_ext_put(page_ext); } static ssize_t @@ -497,8 +516,10 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) return -EINVAL; page = NULL; - pfn = min_low_pfn + *ppos; - + if (*ppos == 0) + pfn = min_low_pfn; + else + pfn = *ppos; /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */ while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) pfn++; @@ -508,6 +529,14 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) /* Find an allocated page */ for (; pfn < max_pfn; pfn++) { /* + * This temporary page_owner is required so + * that we can avoid the context switches while holding + * the rcu lock and copying the page owner information to + * user through copy_to_user() or GFP_KERNEL allocations. + */ + struct page_owner page_owner_tmp; + + /* * If the new page is in a new MAX_ORDER_NR_PAGES area, * validate the area as existing, skip it if not */ @@ -525,7 +554,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) continue; } - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; @@ -534,14 +563,14 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) * because we don't hold the zone lock. */ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) - continue; + goto ext_put_continue; /* * Although we do have the info about past allocation of free * pages, it's not relevant for current memory usage. */ if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) - continue; + goto ext_put_continue; page_owner = get_page_owner(page_ext); @@ -550,7 +579,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) * would inflate the stats. */ if (!IS_ALIGNED(pfn, 1 << page_owner->order)) - continue; + goto ext_put_continue; /* * Access to page_ext->handle isn't synchronous so we should @@ -558,18 +587,37 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) */ handle = READ_ONCE(page_owner->handle); if (!handle) - continue; + goto ext_put_continue; /* Record the next PFN to read in the file offset */ - *ppos = (pfn - min_low_pfn) + 1; + *ppos = pfn + 1; + page_owner_tmp = *page_owner; + page_ext_put(page_ext); return print_page_owner(buf, count, pfn, page, - page_owner, handle); + &page_owner_tmp, handle); +ext_put_continue: + page_ext_put(page_ext); } return 0; } +static loff_t lseek_page_owner(struct file *file, loff_t offset, int orig) +{ + switch (orig) { + case SEEK_SET: + file->f_pos = offset; + break; + case SEEK_CUR: + file->f_pos += offset; + break; + default: + return -EINVAL; + } + return file->f_pos; +} + static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) { unsigned long pfn = zone->zone_start_pfn; @@ -617,18 +665,20 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) if (PageReserved(page)) continue; - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; /* Maybe overlapping zone */ if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) - continue; + goto ext_put_continue; /* Found early allocated page */ __set_page_owner_handle(page_ext, early_handle, 0, 0); count++; +ext_put_continue: + page_ext_put(page_ext); } cond_resched(); } @@ -660,6 +710,7 @@ static void init_early_allocated_pages(void) static const struct file_operations proc_page_owner_operations = { .read = read_page_owner, + .llseek = lseek_page_owner, }; static int __init pageowner_init(void) diff --git a/mm/page_table_check.c b/mm/page_table_check.c index e2062748791a..903db62794d3 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -68,7 +68,7 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, return; page = pfn_to_page(pfn); - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); anon = PageAnon(page); for (i = 0; i < pgcnt; i++) { @@ -83,6 +83,7 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, } page_ext = page_ext_next(page_ext); } + page_ext_put(page_ext); } /* @@ -103,7 +104,7 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr, return; page = pfn_to_page(pfn); - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); anon = PageAnon(page); for (i = 0; i < pgcnt; i++) { @@ -118,6 +119,7 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr, } page_ext = page_ext_next(page_ext); } + page_ext_put(page_ext); } /* @@ -126,9 +128,10 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr, */ void __page_table_check_zero(struct page *page, unsigned int order) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext; unsigned long i; + page_ext = page_ext_get(page); BUG_ON(!page_ext); for (i = 0; i < (1ul << order); i++) { struct page_table_check *ptc = get_page_table_check(page_ext); @@ -137,6 +140,7 @@ void __page_table_check_zero(struct page *page, unsigned int order) BUG_ON(atomic_read(&ptc->file_map_count)); page_ext = page_ext_next(page_ext); } + page_ext_put(page_ext); } void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 9b3db11a4d1d..908ec1577f40 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -479,7 +479,15 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, return err; } -/* +/** + * walk_page_range_novma - walk a range of pagetables not backed by a vma + * @mm: mm_struct representing the target process of page table walk + * @start: start address of the virtual address range + * @end: end address of the virtual address range + * @ops: operation to call during the walk + * @pgd: pgd to walk if different from mm->pgd + * @private: private data for callbacks' usage + * * Similar to walk_page_range() but can walk any page tables even if they are * not backed by VMAs. Because 'unusual' entries may be walked this function * will also not lock the PTEs for the pte_entry() callback. This is useful for diff --git a/mm/rmap.c b/mm/rmap.c index edc06c52bc82..6781f693df50 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -767,13 +767,17 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) return vma_address(page, vma); } +/* + * Returns the actual pmd_t* where we expect 'address' to be mapped from, or + * NULL if it doesn't exist. No guarantees / checks on what the pmd_t* + * represents. + */ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd = NULL; - pmd_t pmde; pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -788,15 +792,6 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) goto out; pmd = pmd_offset(pud, address); - /* - * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() - * without holding anon_vma lock for write. So when looking for a - * genuine pmde (in which to find pte), test present and !THP together. - */ - pmde = *pmd; - barrier(); - if (!pmd_present(pmde) || pmd_trans_huge(pmde)) - pmd = NULL; out: return pmd; } @@ -1579,11 +1574,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); - /* - * Nuke the page table entry. When having to clear - * PageAnonExclusive(), we always have to flush. - */ - if (should_defer_flush(mm, flags) && !anon_exclusive) { + /* Nuke the page table entry. */ + if (should_defer_flush(mm, flags)) { /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the folio. @@ -1714,6 +1706,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, page_vma_mapped_walk_done(&pvmw); break; } + + /* See page_try_share_anon_rmap(): clear PTE first. */ if (anon_exclusive && page_try_share_anon_rmap(subpage)) { swap_free(entry); @@ -2045,6 +2039,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, } VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) && !anon_exclusive, subpage); + + /* See page_try_share_anon_rmap(): clear PTE first. */ if (anon_exclusive && page_try_share_anon_rmap(subpage)) { if (folio_test_hugetlb(folio)) diff --git a/mm/slub.c b/mm/slub.c index 862dbd9af4f5..6953c3367bc2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5745,6 +5745,29 @@ STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); #endif /* CONFIG_SLUB_STATS */ +#ifdef CONFIG_KFENCE +static ssize_t skip_kfence_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_SKIP_KFENCE)); +} + +static ssize_t skip_kfence_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + int ret = length; + + if (buf[0] == '0') + s->flags &= ~SLAB_SKIP_KFENCE; + else if (buf[0] == '1') + s->flags |= SLAB_SKIP_KFENCE; + else + ret = -EINVAL; + + return ret; +} +SLAB_ATTR(skip_kfence); +#endif + static struct attribute *slab_attrs[] = { &slab_size_attr.attr, &object_size_attr.attr, @@ -5812,6 +5835,9 @@ static struct attribute *slab_attrs[] = { &failslab_attr.attr, #endif &usersize_attr.attr, +#ifdef CONFIG_KFENCE + &skip_kfence_attr.attr, +#endif NULL }; diff --git a/mm/swap.h b/mm/swap.h index 17936e068c1c..0ffa5b478051 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -18,9 +18,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug) } void swap_write_unplug(struct swap_iocb *sio); int swap_writepage(struct page *page, struct writeback_control *wbc); -void end_swap_bio_write(struct bio *bio); -int __swap_writepage(struct page *page, struct writeback_control *wbc, - bio_end_io_t end_write_func); +int __swap_writepage(struct page *page, struct writeback_control *wbc); /* linux/mm/swap_state.c */ /* One swap address space for each 64M swap space */ diff --git a/mm/util.c b/mm/util.c index c9439c66d8cf..8d944ce71e94 100644 --- a/mm/util.c +++ b/mm/util.c @@ -850,10 +850,10 @@ int folio_mapcount(struct folio *folio) return atomic_read(&folio->_mapcount) + 1; compound = folio_entire_mapcount(folio); - nr = folio_nr_pages(folio); if (folio_test_hugetlb(folio)) return compound; ret = compound; + nr = folio_nr_pages(folio); for (i = 0; i < nr; i++) ret += atomic_read(&folio_page(folio, i)->_mapcount) + 1; /* File pages has compound_mapcount included in _mapcount */ @@ -1052,6 +1052,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; error: + pr_warn_ratelimited("%s: pid: %d, comm: %s, no enough memory for the allocation\n", + __func__, current->pid, current->comm); vm_unacct_memory(pages); return -ENOMEM; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index dd6cdb201195..a991b909866f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -590,7 +590,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, int err; err = vmap_range_noflush(addr, addr + (1UL << page_shift), - __pa(page_address(pages[i])), prot, + page_to_phys(pages[i]), prot, page_shift); if (err) return err; @@ -1300,12 +1300,12 @@ find_vmap_lowest_match(struct rb_root *root, unsigned long size, #include <linux/random.h> static struct vmap_area * -find_vmap_lowest_linear_match(unsigned long size, +find_vmap_lowest_linear_match(struct list_head *head, unsigned long size, unsigned long align, unsigned long vstart) { struct vmap_area *va; - list_for_each_entry(va, &free_vmap_area_list, list) { + list_for_each_entry(va, head, list) { if (!is_within_this_va(va, size, align, vstart)) continue; @@ -1316,7 +1316,8 @@ find_vmap_lowest_linear_match(unsigned long size, } static void -find_vmap_lowest_match_check(unsigned long size, unsigned long align) +find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head, + unsigned long size, unsigned long align) { struct vmap_area *va_1, *va_2; unsigned long vstart; @@ -1325,8 +1326,8 @@ find_vmap_lowest_match_check(unsigned long size, unsigned long align) get_random_bytes(&rnd, sizeof(rnd)); vstart = VMALLOC_START + rnd; - va_1 = find_vmap_lowest_match(size, align, vstart, false); - va_2 = find_vmap_lowest_linear_match(size, align, vstart); + va_1 = find_vmap_lowest_match(root, size, align, vstart, false); + va_2 = find_vmap_lowest_linear_match(head, size, align, vstart); if (va_1 != va_2) pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n", @@ -1513,7 +1514,7 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head, return vend; #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK - find_vmap_lowest_match_check(size, align); + find_vmap_lowest_match_check(root, head, size, align); #endif return nva_start_addr; diff --git a/mm/vmscan.c b/mm/vmscan.c index 382dbe97329f..c879694b41fe 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3228,22 +3228,22 @@ again: if (!sc->force_deactivate) { unsigned long refaults; + /* + * When refaults are being observed, it means a new + * workingset is being established. Deactivate to get + * rid of any stale active pages quickly. + */ refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); - if (refaults != target_lruvec->refaults[0] || + if (refaults != target_lruvec->refaults[WORKINGSET_ANON] || inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) sc->may_deactivate |= DEACTIVATE_ANON; else sc->may_deactivate &= ~DEACTIVATE_ANON; - /* - * When refaults are being observed, it means a new - * workingset is being established. Deactivate to get - * rid of any stale active pages quickly. - */ refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE); - if (refaults != target_lruvec->refaults[1] || + if (refaults != target_lruvec->refaults[WORKINGSET_FILE] || inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) sc->may_deactivate |= DEACTIVATE_FILE; else @@ -3559,9 +3559,9 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); - target_lruvec->refaults[0] = refaults; + target_lruvec->refaults[WORKINGSET_ANON] = refaults; refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE); - target_lruvec->refaults[1] = refaults; + target_lruvec->refaults[WORKINGSET_FILE] = refaults; } /* @@ -4643,16 +4643,17 @@ void kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); - if (pgdat->kswapd) - return; - - pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); - if (IS_ERR(pgdat->kswapd)) { - /* failure at boot is fatal */ - BUG_ON(system_state < SYSTEM_RUNNING); - pr_err("Failed to start kswapd on node %d\n", nid); - pgdat->kswapd = NULL; + pgdat_kswapd_lock(pgdat); + if (!pgdat->kswapd) { + pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); + if (IS_ERR(pgdat->kswapd)) { + /* failure at boot is fatal */ + BUG_ON(system_state < SYSTEM_RUNNING); + pr_err("Failed to start kswapd on node %d\n", nid); + pgdat->kswapd = NULL; + } } + pgdat_kswapd_unlock(pgdat); } /* @@ -4661,12 +4662,16 @@ void kswapd_run(int nid) */ void kswapd_stop(int nid) { - struct task_struct *kswapd = NODE_DATA(nid)->kswapd; + pg_data_t *pgdat = NODE_DATA(nid); + struct task_struct *kswapd; + pgdat_kswapd_lock(pgdat); + kswapd = pgdat->kswapd; if (kswapd) { kthread_stop(kswapd); - NODE_DATA(nid)->kswapd = NULL; + pgdat->kswapd = NULL; } + pgdat_kswapd_unlock(pgdat); } static int __init kswapd_init(void) diff --git a/mm/vmstat.c b/mm/vmstat.c index 90af9a8572f5..c109167a669c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1252,6 +1252,7 @@ const char * const vmstat_text[] = { #endif #ifdef CONFIG_NUMA_BALANCING "pgpromote_success", + "pgpromote_candidate", #endif /* enum writeback_stat_item counters */ diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 907c9b1e1e61..12eb11e70939 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1555,6 +1555,13 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, d_off += size; d_size -= size; + /* + * Calling kunmap_atomic(d_addr) is necessary. kunmap_atomic() + * calls must occurs in reverse order of calls to kmap_atomic(). + * So, to call kunmap_atomic(s_addr) we should first call + * kunmap_atomic(d_addr). For more details see + * Documentation/mm/highmem.rst. + */ if (s_off >= PAGE_SIZE) { kunmap_atomic(d_addr); kunmap_atomic(s_addr); @@ -2103,8 +2110,6 @@ unsigned long zs_compact(struct zs_pool *pool) for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { class = pool->size_class[i]; - if (!class) - continue; if (class->index != i) continue; pages_freed += __zs_compact(pool, class); @@ -2149,8 +2154,6 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker, for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { class = pool->size_class[i]; - if (!class) - continue; if (class->index != i) continue; @@ -2308,9 +2311,6 @@ void zs_destroy_pool(struct zs_pool *pool) int fg; struct size_class *class = pool->size_class[i]; - if (!class) - continue; - if (class->index != i) continue; diff --git a/mm/zswap.c b/mm/zswap.c index 104835b379ec..2d48fd59cc7a 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1026,7 +1026,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) SetPageReclaim(page); /* start writeback */ - __swap_writepage(page, &wbc, end_swap_bio_write); + __swap_writepage(page, &wbc); put_page(page); zswap_written_back_pages++; |