diff options
Diffstat (limited to 'mm')
99 files changed, 4435 insertions, 4240 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index e3454087fd31..e00df648c91d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -46,6 +46,22 @@ config ZSWAP_DEFAULT_ON The selection made here can be overridden by using the kernel command line 'zswap.enabled=' option. +config ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON + bool "Invalidate zswap entries when pages are loaded" + depends on ZSWAP + help + If selected, exclusive loads for zswap will be enabled at boot, + otherwise it will be disabled. + + If exclusive loads are enabled, when a page is loaded from zswap, + the zswap entry is invalidated at once, as opposed to leaving it + in zswap until the swap entry is freed. + + This avoids having two copies of the same page in memory + (compressed and uncompressed) after faulting in a page from zswap. + The cost is that if the page was never dirtied and needs to be + swapped out again, it will be re-compressed. + choice prompt "Default compressor" depends on ZSWAP diff --git a/mm/Makefile b/mm/Makefile index e29afc890cde..678530a07326 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -51,7 +51,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o percpu.o slab_common.o \ - compaction.o \ + compaction.o show_mem.o\ interval_tree.o list_lru.o workingset.o \ debug.o gup.o mmap_lock.o $(mmu-y) @@ -89,6 +89,7 @@ obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_KFENCE) += kfence/ obj-$(CONFIG_KMSAN) += kmsan/ obj-$(CONFIG_FAILSLAB) += failslab.o +obj-$(CONFIG_FAIL_PAGE_ALLOC) += fail_page_alloc.o obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_NUMA) += memory-tiers.o @@ -123,6 +124,7 @@ obj-$(CONFIG_SECRETMEM) += secretmem.o obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o +obj-$(CONFIG_DEBUG_PAGEALLOC) += debug_page_alloc.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o obj-$(CONFIG_DAMON) += damon/ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 7da9727fcdf3..3ffc3cfa7a14 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -20,7 +20,6 @@ struct backing_dev_info noop_backing_dev_info; EXPORT_SYMBOL_GPL(noop_backing_dev_info); -static struct class *bdi_class; static const char *bdi_unknown_name = "(unknown)"; /* @@ -345,13 +344,19 @@ static struct attribute *bdi_dev_attrs[] = { }; ATTRIBUTE_GROUPS(bdi_dev); +static const struct class bdi_class = { + .name = "bdi", + .dev_groups = bdi_dev_groups, +}; + static __init int bdi_class_init(void) { - bdi_class = class_create("bdi"); - if (IS_ERR(bdi_class)) - return PTR_ERR(bdi_class); + int ret; + + ret = class_register(&bdi_class); + if (ret) + return ret; - bdi_class->dev_groups = bdi_dev_groups; bdi_debug_init(); return 0; @@ -1001,7 +1006,7 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) return 0; vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args); - dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name); + dev = device_create(&bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name); if (IS_ERR(dev)) return PTR_ERR(dev); @@ -483,8 +483,8 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, if (ret != -EBUSY) break; - pr_debug("%s(): memory range at %p is busy, retrying\n", - __func__, pfn_to_page(pfn)); + pr_debug("%s(): memory range at pfn 0x%lx %p is busy, retrying\n", + __func__, pfn, pfn_to_page(pfn)); trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn), count, align); diff --git a/mm/compaction.c b/mm/compaction.c index c8bcdea15f5f..dbc9f86b1934 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -229,6 +229,33 @@ static void reset_cached_positions(struct zone *zone) pageblock_start_pfn(zone_end_pfn(zone) - 1); } +#ifdef CONFIG_SPARSEMEM +/* + * If the PFN falls into an offline section, return the start PFN of the + * next online section. If the PFN falls into an online section or if + * there is no next online section, return 0. + */ +static unsigned long skip_offline_sections(unsigned long start_pfn) +{ + unsigned long start_nr = pfn_to_section_nr(start_pfn); + + if (online_section_nr(start_nr)) + return 0; + + while (++start_nr <= __highest_present_section_nr) { + if (online_section_nr(start_nr)) + return section_nr_to_pfn(start_nr); + } + + return 0; +} +#else +static unsigned long skip_offline_sections(unsigned long start_pfn) +{ + return 0; +} +#endif + /* * Compound pages of >= pageblock_order should consistently be skipped until * released. It is always pointless to compact pages of such order (if they are @@ -392,18 +419,14 @@ void reset_isolation_suitable(pg_data_t *pgdat) * Sets the pageblock skip bit if it was clear. Note that this is a hint as * locks are not required for read/writers. Returns true if it was already set. */ -static bool test_and_set_skip(struct compact_control *cc, struct page *page, - unsigned long pfn) +static bool test_and_set_skip(struct compact_control *cc, struct page *page) { bool skip; - /* Do no update if skip hint is being ignored */ + /* Do not update if skip hint is being ignored */ if (cc->ignore_skip_hint) return false; - if (!pageblock_aligned(pfn)) - return false; - skip = get_pageblock_skip(page); if (!skip && !cc->no_set_skip_hint) set_pageblock_skip(page); @@ -440,9 +463,6 @@ static void update_pageblock_skip(struct compact_control *cc, if (cc->no_set_skip_hint) return; - if (!page) - return; - set_pageblock_skip(page); /* Update where async and sync compaction should restart */ @@ -470,8 +490,7 @@ static void update_cached_migrate(struct compact_control *cc, unsigned long pfn) { } -static bool test_and_set_skip(struct compact_control *cc, struct page *page, - unsigned long pfn) +static bool test_and_set_skip(struct compact_control *cc, struct page *page) { return false; } @@ -745,8 +764,9 @@ isolate_freepages_range(struct compact_control *cc, } /* Similar to reclaim, but different enough that they don't share logic */ -static bool too_many_isolated(pg_data_t *pgdat) +static bool too_many_isolated(struct compact_control *cc) { + pg_data_t *pgdat = cc->zone->zone_pgdat; bool too_many; unsigned long active, inactive, isolated; @@ -758,6 +778,17 @@ static bool too_many_isolated(pg_data_t *pgdat) isolated = node_page_state(pgdat, NR_ISOLATED_FILE) + node_page_state(pgdat, NR_ISOLATED_ANON); + /* + * Allow GFP_NOFS to isolate past the limit set for regular + * compaction runs. This prevents an ABBA deadlock when other + * compactors have already isolated to the limit, but are + * blocked on filesystem locks held by the GFP_NOFS thread. + */ + if (cc->gfp_mask & __GFP_FS) { + inactive >>= 3; + active >>= 3; + } + too_many = isolated > (inactive + active) / 2; if (!too_many) wake_throttle_isolated(pgdat); @@ -791,6 +822,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, struct lruvec *lruvec; unsigned long flags = 0; struct lruvec *locked = NULL; + struct folio *folio = NULL; struct page *page = NULL, *valid_page = NULL; struct address_space *mapping; unsigned long start_pfn = low_pfn; @@ -806,7 +838,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * list by either parallel reclaimers or compaction. If there are, * delay for some time until fewer pages are isolated */ - while (unlikely(too_many_isolated(pgdat))) { + while (unlikely(too_many_isolated(cc))) { /* stop isolation if there are still pages not migrated */ if (cc->nr_migratepages) return -EAGAIN; @@ -887,7 +919,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (!valid_page && pageblock_aligned(low_pfn)) { if (!isolation_suitable(cc, page)) { low_pfn = end_pfn; - page = NULL; + folio = NULL; goto isolate_abort; } valid_page = page; @@ -919,7 +951,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * Hugepage was successfully isolated and placed * on the cc->migratepages list. */ - low_pfn += compound_nr(page) - 1; + folio = page_folio(page); + low_pfn += folio_nr_pages(folio) - 1; goto isolate_success_no_list; } @@ -987,8 +1020,10 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, locked = NULL; } - if (isolate_movable_page(page, mode)) + if (isolate_movable_page(page, mode)) { + folio = page_folio(page); goto isolate_success; + } } goto isolate_fail; @@ -999,7 +1034,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * sure the page is not being freed elsewhere -- the * page release code relies on it. */ - if (unlikely(!get_page_unless_zero(page))) + folio = folio_get_nontail_page(page); + if (unlikely(!folio)) goto isolate_fail; /* @@ -1007,8 +1043,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * so avoid taking lru_lock and isolating it unnecessarily in an * admittedly racy check. */ - mapping = page_mapping(page); - if (!mapping && (page_count(page) - 1) > total_mapcount(page)) + mapping = folio_mapping(folio); + if (!mapping && (folio_ref_count(folio) - 1) > folio_mapcount(folio)) goto isolate_fail_put; /* @@ -1019,11 +1055,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, goto isolate_fail_put; /* Only take pages on LRU: a check now makes later tests safe */ - if (!PageLRU(page)) + if (!folio_test_lru(folio)) goto isolate_fail_put; /* Compaction might skip unevictable pages but CMA takes them */ - if (!(mode & ISOLATE_UNEVICTABLE) && PageUnevictable(page)) + if (!(mode & ISOLATE_UNEVICTABLE) && folio_test_unevictable(folio)) goto isolate_fail_put; /* @@ -1032,10 +1068,10 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * it will be able to migrate without blocking - clean pages * for the most part. PageWriteback would require blocking. */ - if ((mode & ISOLATE_ASYNC_MIGRATE) && PageWriteback(page)) + if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_writeback(folio)) goto isolate_fail_put; - if ((mode & ISOLATE_ASYNC_MIGRATE) && PageDirty(page)) { + if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_dirty(folio)) { bool migrate_dirty; /* @@ -1047,22 +1083,22 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * the page lock until after the page is removed * from the page cache. */ - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto isolate_fail_put; - mapping = page_mapping(page); + mapping = folio_mapping(folio); migrate_dirty = !mapping || mapping->a_ops->migrate_folio; - unlock_page(page); + folio_unlock(folio); if (!migrate_dirty) goto isolate_fail_put; } - /* Try isolate the page */ - if (!TestClearPageLRU(page)) + /* Try isolate the folio */ + if (!folio_test_clear_lru(folio)) goto isolate_fail_put; - lruvec = folio_lruvec(page_folio(page)); + lruvec = folio_lruvec(folio); /* If we already hold the lock, we can skip some rechecking */ if (lruvec != locked) { @@ -1072,44 +1108,49 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, compact_lock_irqsave(&lruvec->lru_lock, &flags, cc); locked = lruvec; - lruvec_memcg_debug(lruvec, page_folio(page)); + lruvec_memcg_debug(lruvec, folio); - /* Try get exclusive access under lock */ - if (!skip_updated) { + /* + * Try get exclusive access under lock. If marked for + * skip, the scan is aborted unless the current context + * is a rescan to reach the end of the pageblock. + */ + if (!skip_updated && valid_page) { skip_updated = true; - if (test_and_set_skip(cc, page, low_pfn)) + if (test_and_set_skip(cc, valid_page) && + !cc->finish_pageblock) { goto isolate_abort; + } } /* - * Page become compound since the non-locked check, - * and it's on LRU. It can only be a THP so the order - * is safe to read and it's 0 for tail pages. + * folio become large since the non-locked check, + * and it's on LRU. */ - if (unlikely(PageCompound(page) && !cc->alloc_contig)) { - low_pfn += compound_nr(page) - 1; - nr_scanned += compound_nr(page) - 1; - SetPageLRU(page); + if (unlikely(folio_test_large(folio) && !cc->alloc_contig)) { + low_pfn += folio_nr_pages(folio) - 1; + nr_scanned += folio_nr_pages(folio) - 1; + folio_set_lru(folio); goto isolate_fail_put; } } - /* The whole page is taken off the LRU; skip the tail pages. */ - if (PageCompound(page)) - low_pfn += compound_nr(page) - 1; + /* The folio is taken off the LRU */ + if (folio_test_large(folio)) + low_pfn += folio_nr_pages(folio) - 1; /* Successfully isolated */ - del_page_from_lru_list(page, lruvec); - mod_node_page_state(page_pgdat(page), - NR_ISOLATED_ANON + page_is_file_lru(page), - thp_nr_pages(page)); + lruvec_del_folio(lruvec, folio); + node_stat_mod_folio(folio, + NR_ISOLATED_ANON + folio_is_file_lru(folio), + folio_nr_pages(folio)); isolate_success: - list_add(&page->lru, &cc->migratepages); + list_add(&folio->lru, &cc->migratepages); isolate_success_no_list: - cc->nr_migratepages += compound_nr(page); - nr_isolated += compound_nr(page); - nr_scanned += compound_nr(page) - 1; + cc->nr_migratepages += folio_nr_pages(folio); + nr_isolated += folio_nr_pages(folio); + nr_scanned += folio_nr_pages(folio) - 1; /* * Avoid isolating too much unless this block is being @@ -1131,7 +1172,7 @@ isolate_fail_put: unlock_page_lruvec_irqrestore(locked, flags); locked = NULL; } - put_page(page); + folio_put(folio); isolate_fail: if (!skip_on_failure && ret != -ENOMEM) @@ -1172,14 +1213,14 @@ isolate_fail: if (unlikely(low_pfn > end_pfn)) low_pfn = end_pfn; - page = NULL; + folio = NULL; isolate_abort: if (locked) unlock_page_lruvec_irqrestore(locked, flags); - if (page) { - SetPageLRU(page); - put_page(page); + if (folio) { + folio_set_lru(folio); + folio_put(folio); } /* @@ -1191,7 +1232,7 @@ isolate_abort: * rescanned twice in a row. */ if (low_pfn == end_pfn && (!nr_isolated || cc->finish_pageblock)) { - if (valid_page && !skip_updated) + if (!cc->no_set_skip_hint && valid_page && !skip_updated) set_pageblock_skip(valid_page); update_cached_migrate(cc, low_pfn); } @@ -1379,7 +1420,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn) isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); /* Skip this pageblock in the future as it's full or nearly full */ - if (cc->nr_freepages < cc->nr_migratepages) + if (start_pfn == end_pfn) set_pageblock_skip(page); return; @@ -1403,11 +1444,10 @@ static int next_search_order(struct compact_control *cc, int order) return order; } -static unsigned long -fast_isolate_freepages(struct compact_control *cc) +static void fast_isolate_freepages(struct compact_control *cc) { unsigned int limit = max(1U, freelist_scan_limit(cc) >> 1); - unsigned int nr_scanned = 0; + unsigned int nr_scanned = 0, total_isolated = 0; unsigned long low_pfn, min_pfn, highest = 0; unsigned long nr_isolated = 0; unsigned long distance; @@ -1417,7 +1457,7 @@ fast_isolate_freepages(struct compact_control *cc) /* Full compaction passes in a negative order */ if (cc->order <= 0) - return cc->free_pfn; + return; /* * If starting the scan, use a deeper search and use the highest @@ -1506,6 +1546,7 @@ fast_isolate_freepages(struct compact_control *cc) set_page_private(page, order); nr_isolated = 1 << order; nr_scanned += nr_isolated - 1; + total_isolated += nr_isolated; cc->nr_freepages += nr_isolated; list_add_tail(&page->lru, &cc->freepages); count_compact_events(COMPACTISOLATED, nr_isolated); @@ -1518,6 +1559,10 @@ fast_isolate_freepages(struct compact_control *cc) spin_unlock_irqrestore(&cc->zone->lock, flags); + /* Skip fast search if enough freepages isolated */ + if (cc->nr_freepages >= cc->nr_migratepages) + break; + /* * Smaller scan on next order so the total scan is related * to freelist_scan_limit. @@ -1526,6 +1571,9 @@ fast_isolate_freepages(struct compact_control *cc) limit = max(1U, limit >> 1); } + trace_mm_compaction_fast_isolate_freepages(min_pfn, cc->free_pfn, + nr_scanned, total_isolated); + if (!page) { cc->fast_search_fail++; if (scan_start) { @@ -1556,11 +1604,10 @@ fast_isolate_freepages(struct compact_control *cc) cc->total_free_scanned += nr_scanned; if (!page) - return cc->free_pfn; + return; low_pfn = page_to_pfn(page); fast_isolate_around(cc, low_pfn); - return low_pfn; } /* @@ -1684,11 +1731,10 @@ splitmap: * This is a migrate-callback that "allocates" freepages by taking pages * from the isolated freelists in the block we are migrating to. */ -static struct page *compaction_alloc(struct page *migratepage, - unsigned long data) +static struct folio *compaction_alloc(struct folio *src, unsigned long data) { struct compact_control *cc = (struct compact_control *)data; - struct page *freepage; + struct folio *dst; if (list_empty(&cc->freepages)) { isolate_freepages(cc); @@ -1697,11 +1743,11 @@ static struct page *compaction_alloc(struct page *migratepage, return NULL; } - freepage = list_entry(cc->freepages.next, struct page, lru); - list_del(&freepage->lru); + dst = list_entry(cc->freepages.next, struct folio, lru); + list_del(&dst->lru); cc->nr_freepages--; - return freepage; + return dst; } /* @@ -1709,11 +1755,11 @@ static struct page *compaction_alloc(struct page *migratepage, * freelist. All pages on the freelist are from the same zone, so there is no * special handling needed for NUMA. */ -static void compaction_free(struct page *page, unsigned long data) +static void compaction_free(struct folio *dst, unsigned long data) { struct compact_control *cc = (struct compact_control *)data; - list_add(&page->lru, &cc->freepages); + list_add(&dst->lru, &cc->freepages); cc->nr_freepages++; } @@ -1736,6 +1782,7 @@ static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNE */ static unsigned int __read_mostly sysctl_compaction_proactiveness = 20; static int sysctl_extfrag_threshold = 500; +static int __read_mostly sysctl_compact_memory; static inline void update_fast_start_pfn(struct compact_control *cc, unsigned long pfn) @@ -1864,7 +1911,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) pfn = cc->zone->zone_start_pfn; cc->fast_search_fail = 0; found_block = true; - set_pageblock_skip(freepage); break; } } @@ -1940,8 +1986,14 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, cc->zone); - if (!page) + if (!page) { + unsigned long next_pfn; + + next_pfn = skip_offline_sections(block_start_pfn); + if (next_pfn) + block_end_pfn = min(next_pfn, cc->free_pfn); continue; + } /* * If isolation recently failed, do not retry. Only check the @@ -2193,25 +2245,11 @@ static enum compact_result compact_finished(struct compact_control *cc) return ret; } -static enum compact_result __compaction_suitable(struct zone *zone, int order, - unsigned int alloc_flags, - int highest_zoneidx, - unsigned long wmark_target) +static bool __compaction_suitable(struct zone *zone, int order, + int highest_zoneidx, + unsigned long wmark_target) { unsigned long watermark; - - if (is_via_compact_memory(order)) - return COMPACT_CONTINUE; - - watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); - /* - * If watermarks for high-order allocation are already met, there - * should be no need for compaction at all. - */ - if (zone_watermark_ok(zone, order, watermark, highest_zoneidx, - alloc_flags)) - return COMPACT_SUCCESS; - /* * Watermarks for order-0 must be met for compaction to be able to * isolate free pages for migration targets. This means that the @@ -2229,29 +2267,20 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? low_wmark_pages(zone) : min_wmark_pages(zone); watermark += compact_gap(order); - if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx, - ALLOC_CMA, wmark_target)) - return COMPACT_SKIPPED; - - return COMPACT_CONTINUE; + return __zone_watermark_ok(zone, 0, watermark, highest_zoneidx, + ALLOC_CMA, wmark_target); } /* * compaction_suitable: Is this suitable to run compaction on this zone now? - * Returns - * COMPACT_SKIPPED - If there are too few free pages for compaction - * COMPACT_SUCCESS - If the allocation would succeed without compaction - * COMPACT_CONTINUE - If compaction should run now */ -enum compact_result compaction_suitable(struct zone *zone, int order, - unsigned int alloc_flags, - int highest_zoneidx) +bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx) { - enum compact_result ret; - int fragindex; + enum compact_result compact_result; + bool suitable; - ret = __compaction_suitable(zone, order, alloc_flags, highest_zoneidx, - zone_page_state(zone, NR_FREE_PAGES)); + suitable = __compaction_suitable(zone, order, highest_zoneidx, + zone_page_state(zone, NR_FREE_PAGES)); /* * fragmentation index determines if allocation failures are due to * low memory or external fragmentation @@ -2268,17 +2297,24 @@ enum compact_result compaction_suitable(struct zone *zone, int order, * excessive compaction for costly orders, but it should not be at the * expense of system stability. */ - if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) { - fragindex = fragmentation_index(zone, order); - if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) - ret = COMPACT_NOT_SUITABLE_ZONE; + if (suitable) { + compact_result = COMPACT_CONTINUE; + if (order > PAGE_ALLOC_COSTLY_ORDER) { + int fragindex = fragmentation_index(zone, order); + + if (fragindex >= 0 && + fragindex <= sysctl_extfrag_threshold) { + suitable = false; + compact_result = COMPACT_NOT_SUITABLE_ZONE; + } + } + } else { + compact_result = COMPACT_SKIPPED; } - trace_mm_compaction_suitable(zone, order, ret); - if (ret == COMPACT_NOT_SUITABLE_ZONE) - ret = COMPACT_SKIPPED; + trace_mm_compaction_suitable(zone, order, compact_result); - return ret; + return suitable; } bool compaction_zonelist_suitable(struct alloc_context *ac, int order, @@ -2294,7 +2330,6 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->highest_zoneidx, ac->nodemask) { unsigned long available; - enum compact_result compact_result; /* * Do not consider all the reclaimable memory because we do not @@ -2304,9 +2339,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, */ available = zone_reclaimable_pages(zone) / order; available += zone_page_state_snapshot(zone, NR_FREE_PAGES); - compact_result = __compaction_suitable(zone, order, alloc_flags, - ac->highest_zoneidx, available); - if (compact_result == COMPACT_CONTINUE) + if (__compaction_suitable(zone, order, ac->highest_zoneidx, + available)) return true; } @@ -2336,11 +2370,22 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) INIT_LIST_HEAD(&cc->migratepages); cc->migratetype = gfp_migratetype(cc->gfp_mask); - ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, - cc->highest_zoneidx); - /* Compaction is likely to fail */ - if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) - return ret; + + if (!is_via_compact_memory(cc->order)) { + unsigned long watermark; + + /* Allocation can already succeed, nothing to do */ + watermark = wmark_pages(cc->zone, + cc->alloc_flags & ALLOC_WMARK_MASK); + if (zone_watermark_ok(cc->zone, cc->order, watermark, + cc->highest_zoneidx, cc->alloc_flags)) + return COMPACT_SUCCESS; + + /* Compaction is likely to fail */ + if (!compaction_suitable(cc->zone, cc->order, + cc->highest_zoneidx)) + return COMPACT_SKIPPED; + } /* * Clear pageblock skip if there were failures recently and compaction @@ -2456,7 +2501,8 @@ rescan: } /* * If an ASYNC or SYNC_LIGHT fails to migrate a page - * within the current order-aligned block, scan the + * within the current order-aligned block and + * fast_find_migrateblock may be used then scan the * remainder of the pageblock. This will mark the * pageblock "skip" to avoid rescanning in the near * future. This will isolate more pages than necessary @@ -2464,8 +2510,9 @@ rescan: * fast_find_migrateblock revisiting blocks that were * recently partially scanned. */ - if (cc->direct_compaction && !cc->finish_pageblock && - (cc->mode < MIGRATE_SYNC)) { + if (!pageblock_aligned(cc->migrate_pfn) && + !cc->ignore_skip_hint && !cc->finish_pageblock && + (cc->mode < MIGRATE_SYNC)) { cc->finish_pageblock = true; /* @@ -2780,6 +2827,15 @@ static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int static int sysctl_compaction_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { + int ret; + + ret = proc_dointvec(table, write, buffer, length, ppos); + if (ret) + return ret; + + if (sysctl_compact_memory != 1) + return -EINVAL; + if (write) compact_nodes(); @@ -2833,8 +2889,14 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) if (!populated_zone(zone)) continue; - if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0, - highest_zoneidx) == COMPACT_CONTINUE) + /* Allocation can already succeed, check other zones */ + if (zone_watermark_ok(zone, pgdat->kcompactd_max_order, + min_wmark_pages(zone), + highest_zoneidx, 0)) + continue; + + if (compaction_suitable(zone, pgdat->kcompactd_max_order, + highest_zoneidx)) return true; } @@ -2871,8 +2933,12 @@ static void kcompactd_do_work(pg_data_t *pgdat) if (compaction_deferred(zone, cc.order)) continue; - if (compaction_suitable(zone, cc.order, 0, zoneid) != - COMPACT_CONTINUE) + /* Allocation can already succeed, nothing to do */ + if (zone_watermark_ok(zone, cc.order, + min_wmark_pages(zone), zoneid, 0)) + continue; + + if (!compaction_suitable(zone, cc.order, zoneid)) continue; if (kthread_should_stop()) @@ -3021,7 +3087,7 @@ static int kcompactd(void *p) * This kcompactd start function will be called by init and node-hot-add. * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added. */ -void kcompactd_run(int nid) +void __meminit kcompactd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); @@ -3039,7 +3105,7 @@ void kcompactd_run(int nid) * Called by memory hotplug when all memory in a node is offlined. Caller must * be holding mem_hotplug_begin/done(). */ -void kcompactd_stop(int nid) +void __meminit kcompactd_stop(int nid) { struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd; @@ -3095,7 +3161,7 @@ static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, static struct ctl_table vm_compaction[] = { { .procname = "compact_memory", - .data = NULL, + .data = &sysctl_compact_memory, .maxlen = sizeof(int), .mode = 0200, .proc_handler = sysctl_compaction_handler, diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index fae64d32b925..c11210124344 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -318,6 +318,29 @@ static void damon_test_update_monitoring_result(struct kunit *test) KUNIT_EXPECT_EQ(test, r->age, 20); } +static void damon_test_set_attrs(struct kunit *test) +{ + struct damon_ctx ctx; + struct damon_attrs valid_attrs = { + .min_nr_regions = 10, .max_nr_regions = 1000, + .sample_interval = 5000, .aggr_interval = 100000,}; + struct damon_attrs invalid_attrs; + + KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &valid_attrs), 0); + + invalid_attrs = valid_attrs; + invalid_attrs.min_nr_regions = 1; + KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &invalid_attrs), -EINVAL); + + invalid_attrs = valid_attrs; + invalid_attrs.max_nr_regions = 9; + KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &invalid_attrs), -EINVAL); + + invalid_attrs = valid_attrs; + invalid_attrs.aggr_interval = 4999; + KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &invalid_attrs), -EINVAL); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -329,6 +352,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_ops_registration), KUNIT_CASE(damon_test_set_regions), KUNIT_CASE(damon_test_update_monitoring_result), + KUNIT_CASE(damon_test_set_attrs), {}, }; diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index cc63cf953636..e940802a15a4 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -37,51 +37,29 @@ struct folio *damon_get_folio(unsigned long pfn) return folio; } -void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr) +void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr) { - bool referenced = false; - struct folio *folio = damon_get_folio(pte_pfn(*pte)); + struct folio *folio = damon_get_folio(pte_pfn(ptep_get(pte))); if (!folio) return; - if (pte_young(*pte)) { - referenced = true; - *pte = pte_mkold(*pte); - } - -#ifdef CONFIG_MMU_NOTIFIER - if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE)) - referenced = true; -#endif /* CONFIG_MMU_NOTIFIER */ - - if (referenced) + if (ptep_clear_young_notify(vma, addr, pte)) folio_set_young(folio); folio_set_idle(folio); folio_put(folio); } -void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr) +void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - bool referenced = false; struct folio *folio = damon_get_folio(pmd_pfn(*pmd)); if (!folio) return; - if (pmd_young(*pmd)) { - referenced = true; - *pmd = pmd_mkold(*pmd); - } - -#ifdef CONFIG_MMU_NOTIFIER - if (mmu_notifier_clear_young(mm, addr, addr + HPAGE_PMD_SIZE)) - referenced = true; -#endif /* CONFIG_MMU_NOTIFIER */ - - if (referenced) + if (pmdp_clear_young_notify(vma, addr, pmd)) folio_set_young(folio); folio_set_idle(folio); diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h index 14f4bc69f29b..18d837d11bce 100644 --- a/mm/damon/ops-common.h +++ b/mm/damon/ops-common.h @@ -9,8 +9,8 @@ struct folio *damon_get_folio(unsigned long pfn); -void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr); -void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr); +void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr); +void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr); int damon_cold_score(struct damon_ctx *c, struct damon_region *r, struct damos *s); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 467b99166b43..40801e38fcf0 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -24,9 +24,9 @@ static bool __damon_pa_mkold(struct folio *folio, struct vm_area_struct *vma, while (page_vma_mapped_walk(&pvmw)) { addr = pvmw.address; if (pvmw.pte) - damon_ptep_mkold(pvmw.pte, vma->vm_mm, addr); + damon_ptep_mkold(pvmw.pte, vma, addr); else - damon_pmdp_mkold(pvmw.pmd, vma->vm_mm, addr); + damon_pmdp_mkold(pvmw.pmd, vma, addr); } return true; } @@ -89,7 +89,7 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, while (page_vma_mapped_walk(&pvmw)) { addr = pvmw.address; if (pvmw.pte) { - *accessed = pte_young(*pvmw.pte) || + *accessed = pte_young(ptep_get(pvmw.pte)) || !folio_test_idle(folio) || mmu_notifier_test_young(vma->vm_mm, addr); } else { diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 1fec16d7263e..2fcc9731528a 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -311,19 +311,21 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, } if (pmd_trans_huge(*pmd)) { - damon_pmdp_mkold(pmd, walk->mm, addr); + damon_pmdp_mkold(pmd, walk->vma, addr); spin_unlock(ptl); return 0; } spin_unlock(ptl); } - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - return 0; pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); - if (!pte_present(*pte)) + if (!pte) { + walk->action = ACTION_AGAIN; + return 0; + } + if (!pte_present(ptep_get(pte))) goto out; - damon_ptep_mkold(pte, walk->mm, addr); + damon_ptep_mkold(pte, walk->vma, addr); out: pte_unmap_unlock(pte, ptl); return 0; @@ -431,6 +433,7 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t *pte; + pte_t ptent; spinlock_t *ptl; struct folio *folio; struct damon_young_walk_private *priv = walk->private; @@ -464,15 +467,18 @@ huge_out: regular_page: #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - return -EINVAL; pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); - if (!pte_present(*pte)) + if (!pte) { + walk->action = ACTION_AGAIN; + return 0; + } + ptent = ptep_get(pte); + if (!pte_present(ptent)) goto out; - folio = damon_get_folio(pte_pfn(*pte)); + folio = damon_get_folio(pte_pfn(ptent)); if (!folio) goto out; - if (pte_young(*pte) || !folio_test_idle(folio) || + if (pte_young(ptent) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) priv->young = true; *priv->folio_sz = folio_size(folio); diff --git a/mm/debug.c b/mm/debug.c index c7b228097bd9..ee533a5ceb79 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -268,4 +268,13 @@ void page_init_poison(struct page *page, size_t size) if (page_init_poisoning) memset(page, PAGE_POISON_PATTERN, size); } + +void vma_iter_dump_tree(const struct vma_iterator *vmi) +{ +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + mas_dump(&vmi->mas); + mt_dump(vmi->mas.tree, mt_dump_hex); +#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ +} + #endif /* CONFIG_DEBUG_VM */ diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c new file mode 100644 index 000000000000..f9d145730fd1 --- /dev/null +++ b/mm/debug_page_alloc.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/mm.h> +#include <linux/page-isolation.h> + +unsigned int _debug_guardpage_minorder; + +bool _debug_pagealloc_enabled_early __read_mostly + = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); +EXPORT_SYMBOL(_debug_pagealloc_enabled_early); +DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); +EXPORT_SYMBOL(_debug_pagealloc_enabled); + +DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled); + +static int __init early_debug_pagealloc(char *buf) +{ + return kstrtobool(buf, &_debug_pagealloc_enabled_early); +} +early_param("debug_pagealloc", early_debug_pagealloc); + +static int __init debug_guardpage_minorder_setup(char *buf) +{ + unsigned long res; + + if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { + pr_err("Bad debug_guardpage_minorder value\n"); + return 0; + } + _debug_guardpage_minorder = res; + pr_info("Setting debug_guardpage_minorder to %lu\n", res); + return 0; +} +early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); + +bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, + int migratetype) +{ + if (order >= debug_guardpage_minorder()) + return false; + + __SetPageGuard(page); + INIT_LIST_HEAD(&page->buddy_list); + set_page_private(page, order); + /* Guard pages are not available for any usage */ + if (!is_migrate_isolate(migratetype)) + __mod_zone_freepage_state(zone, -(1 << order), migratetype); + + return true; +} + +void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order, + int migratetype) +{ + __ClearPageGuard(page); + + set_page_private(page, 0); + if (!is_migrate_isolate(migratetype)) + __mod_zone_freepage_state(zone, (1 << order), migratetype); +} diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index c54177aabebd..ee119e33fef1 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -138,6 +138,9 @@ static void __init pte_advanced_tests(struct pgtable_debug_args *args) return; pr_debug("Validating PTE advanced\n"); + if (WARN_ON(!args->ptep)) + return; + pte = pfn_pte(args->pte_pfn, args->page_prot); set_pte_at(args->mm, args->vaddr, args->ptep, pte); flush_dcache_page(page); @@ -619,6 +622,9 @@ static void __init pte_clear_tests(struct pgtable_debug_args *args) * the unexpected overhead of cache flushing is acceptable. */ pr_debug("Validating PTE clear\n"); + if (WARN_ON(!args->ptep)) + return; + #ifndef CONFIG_RISCV pte = __pte(pte_val(pte) | RANDOM_ORVALUE); #endif @@ -1377,7 +1383,8 @@ static int __init debug_vm_pgtable(void) args.ptep = pte_offset_map_lock(args.mm, args.pmdp, args.vaddr, &ptl); pte_clear_tests(&args); pte_advanced_tests(&args); - pte_unmap_unlock(args.ptep, ptl); + if (args.ptep) + pte_unmap_unlock(args.ptep, ptl); ptl = pmd_lock(args.mm, args.pmdp); pmd_clear_tests(&args); diff --git a/mm/dmapool.c b/mm/dmapool.c index d2b0f8fc9649..a151a21e571b 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -226,7 +226,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, { struct dma_pool *retval; size_t allocation; - bool empty = false; + bool empty; if (!dev) return NULL; @@ -276,8 +276,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, */ mutex_lock(&pools_reg_lock); mutex_lock(&pools_lock); - if (list_empty(&dev->dma_pools)) - empty = true; + empty = list_empty(&dev->dma_pools); list_add(&retval->pools, &dev->dma_pools); mutex_unlock(&pools_lock); if (empty) { @@ -361,7 +360,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) void dma_pool_destroy(struct dma_pool *pool) { struct dma_page *page, *tmp; - bool empty = false, busy = false; + bool empty, busy = false; if (unlikely(!pool)) return; @@ -369,8 +368,7 @@ void dma_pool_destroy(struct dma_pool *pool) mutex_lock(&pools_reg_lock); mutex_lock(&pools_lock); list_del(&pool->pools); - if (list_empty(&pool->dev->dma_pools)) - empty = true; + empty = list_empty(&pool->dev->dma_pools); mutex_unlock(&pools_lock); if (empty) device_remove_file(pool->dev, &dev_attr_pools); diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index 9bc12e526ed0..ce06b2884789 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -72,12 +72,10 @@ void __init early_ioremap_setup(void) { int i; - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) - if (WARN_ON(prev_map[i])) - break; - - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + WARN_ON_ONCE(prev_map[i]); slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); + } } static int __init check_early_ioremap_leak(void) diff --git a/mm/fadvise.c b/mm/fadvise.c index fb7c5f43fd2a..6c39d42f16dc 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -14,7 +14,6 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/backing-dev.h> -#include <linux/pagevec.h> #include <linux/fadvise.h> #include <linux/writeback.h> #include <linux/syscalls.h> @@ -143,7 +142,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) } if (end_index >= start_index) { - unsigned long nr_pagevec = 0; + unsigned long nr_failed = 0; /* * It's common to FADV_DONTNEED right after @@ -156,17 +155,15 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) */ lru_add_drain(); - invalidate_mapping_pagevec(mapping, - start_index, end_index, - &nr_pagevec); + mapping_try_invalidate(mapping, start_index, end_index, + &nr_failed); /* - * If fewer pages were invalidated than expected then - * it is possible that some of the pages were on - * a per-cpu pagevec for a remote CPU. Drain all - * pagevecs and try again. + * The failures may be due to the folio being + * in the LRU cache of a remote CPU. Drain all + * caches and try again. */ - if (nr_pagevec) { + if (nr_failed) { lru_add_drain_all(); invalidate_mapping_pages(mapping, start_index, end_index); diff --git a/mm/fail_page_alloc.c b/mm/fail_page_alloc.c new file mode 100644 index 000000000000..b1b09cce9394 --- /dev/null +++ b/mm/fail_page_alloc.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/fault-inject.h> +#include <linux/mm.h> + +static struct { + struct fault_attr attr; + + bool ignore_gfp_highmem; + bool ignore_gfp_reclaim; + u32 min_order; +} fail_page_alloc = { + .attr = FAULT_ATTR_INITIALIZER, + .ignore_gfp_reclaim = true, + .ignore_gfp_highmem = true, + .min_order = 1, +}; + +static int __init setup_fail_page_alloc(char *str) +{ + return setup_fault_attr(&fail_page_alloc.attr, str); +} +__setup("fail_page_alloc=", setup_fail_page_alloc); + +bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +{ + int flags = 0; + + if (order < fail_page_alloc.min_order) + return false; + if (gfp_mask & __GFP_NOFAIL) + return false; + if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) + return false; + if (fail_page_alloc.ignore_gfp_reclaim && + (gfp_mask & __GFP_DIRECT_RECLAIM)) + return false; + + /* See comment in __should_failslab() */ + if (gfp_mask & __GFP_NOWARN) + flags |= FAULT_NOWARN; + + return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags); +} + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + +static int __init fail_page_alloc_debugfs(void) +{ + umode_t mode = S_IFREG | 0600; + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_page_alloc", NULL, + &fail_page_alloc.attr); + + debugfs_create_bool("ignore-gfp-wait", mode, dir, + &fail_page_alloc.ignore_gfp_reclaim); + debugfs_create_bool("ignore-gfp-highmem", mode, dir, + &fail_page_alloc.ignore_gfp_highmem); + debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); + + return 0; +} + +late_initcall(fail_page_alloc_debugfs); + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ diff --git a/mm/filemap.c b/mm/filemap.c index 83dda76d1fc3..9e44a49bbd74 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -22,6 +22,7 @@ #include <linux/mm.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/syscalls.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/file.h> @@ -58,6 +59,8 @@ #include <asm/mman.h> +#include "swap.h" + /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. @@ -114,7 +117,7 @@ * ->i_pages lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) - * ->memcg->move_lock (page_remove_rmap->lock_page_memcg) + * ->memcg->move_lock (page_remove_rmap->folio_memcg_lock) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->block_dirty_folio) @@ -1359,8 +1362,6 @@ repeat: /** * migration_entry_wait_on_locked - Wait for a migration entry to be removed * @entry: migration swap entry. - * @ptep: mapped pte pointer. Will return with the ptep unmapped. Only required - * for pte entries, pass NULL for pmd entries. * @ptl: already locked ptl. This function will drop the lock. * * Wait for a migration entry referencing the given page to be removed. This is @@ -1369,13 +1370,13 @@ repeat: * should be called while holding the ptl for the migration entry referencing * the page. * - * Returns after unmapping and unlocking the pte/ptl with pte_unmap_unlock(). + * Returns after unlocking the ptl. * * This follows the same logic as folio_wait_bit_common() so see the comments * there. */ -void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep, - spinlock_t *ptl) +void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) + __releases(ptl) { struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; @@ -1409,10 +1410,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep, * a valid reference to the page, and it must take the ptl to remove the * migration entry. So the page is valid until the ptl is dropped. */ - if (ptep) - pte_unmap_unlock(ptep, ptl); - else - spin_unlock(ptl); + spin_unlock(ptl); for (;;) { unsigned int flags; @@ -1625,36 +1623,6 @@ void folio_end_writeback(struct folio *folio) } EXPORT_SYMBOL(folio_end_writeback); -/* - * After completing I/O on a page, call this routine to update the page - * flags appropriately - */ -void page_endio(struct page *page, bool is_write, int err) -{ - struct folio *folio = page_folio(page); - - if (!is_write) { - if (!err) { - folio_mark_uptodate(folio); - } else { - folio_clear_uptodate(folio); - folio_set_error(folio); - } - folio_unlock(folio); - } else { - if (err) { - struct address_space *mapping; - - folio_set_error(folio); - mapping = folio_mapping(folio); - if (mapping) - mapping_set_error(mapping, err); - } - folio_end_writeback(folio); - } -} -EXPORT_SYMBOL_GPL(page_endio); - /** * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it. * @folio: The folio to lock @@ -1760,9 +1728,7 @@ bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm, * * Return: The index of the gap if found, otherwise an index outside the * range specified (in which case 'return - index >= max_scan' will be true). - * In the rare case of index wrap-around, 0 will be returned. 0 will also - * be returned if index == 0 and there is a gap at the index. We can not - * wrap-around if passed index == 0. + * In the rare case of index wrap-around, 0 will be returned. */ pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) @@ -1772,13 +1738,12 @@ pgoff_t page_cache_next_miss(struct address_space *mapping, while (max_scan--) { void *entry = xas_next(&xas); if (!entry || xa_is_value(entry)) - return xas.xa_index; - if (xas.xa_index == 0 && index != 0) - return xas.xa_index; + break; + if (xas.xa_index == 0) + break; } - /* No gaps in range and no wrap-around, return index beyond range */ - return xas.xa_index + 1; + return xas.xa_index; } EXPORT_SYMBOL(page_cache_next_miss); @@ -1799,9 +1764,7 @@ EXPORT_SYMBOL(page_cache_next_miss); * * Return: The index of the gap if found, otherwise an index outside the * range specified (in which case 'index - return >= max_scan' will be true). - * In the rare case of wrap-around, ULONG_MAX will be returned. ULONG_MAX - * will also be returned if index == ULONG_MAX and there is a gap at the - * index. We can not wrap-around if passed index == ULONG_MAX. + * In the rare case of wrap-around, ULONG_MAX will be returned. */ pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) @@ -1811,13 +1774,12 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, while (max_scan--) { void *entry = xas_prev(&xas); if (!entry || xa_is_value(entry)) - return xas.xa_index; - if (xas.xa_index == ULONG_MAX && index != ULONG_MAX) - return xas.xa_index; + break; + if (xas.xa_index == ULONG_MAX) + break; } - /* No gaps in range and no wrap-around, return index beyond range */ - return xas.xa_index - 1; + return xas.xa_index; } EXPORT_SYMBOL(page_cache_prev_miss); @@ -2693,8 +2655,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(iocb->ki_pos >= i_size_read(inode))) break; - error = filemap_get_pages(iocb, iter->count, &fbatch, - iov_iter_is_pipe(iter)); + error = filemap_get_pages(iocb, iter->count, &fbatch, false); if (error < 0) break; @@ -2768,6 +2729,48 @@ put_folios: } EXPORT_SYMBOL_GPL(filemap_read); +int kiocb_write_and_wait(struct kiocb *iocb, size_t count) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + loff_t pos = iocb->ki_pos; + loff_t end = pos + count - 1; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_needs_writeback(mapping, pos, end)) + return -EAGAIN; + return 0; + } + + return filemap_write_and_wait_range(mapping, pos, end); +} + +int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + loff_t pos = iocb->ki_pos; + loff_t end = pos + count - 1; + int ret; + + if (iocb->ki_flags & IOCB_NOWAIT) { + /* we could block if there are any pages in the range */ + if (filemap_range_has_page(mapping, pos, end)) + return -EAGAIN; + } else { + ret = filemap_write_and_wait_range(mapping, pos, end); + if (ret) + return ret; + } + + /* + * After a write we want buffered reads to be sure to go to disk to get + * the new data. We invalidate clean cached page from the region we're + * about to write. We do this *before* the write so that we can return + * without clobbering -EIOCBQUEUED from ->direct_IO(). + */ + return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, + end >> PAGE_SHIFT); +} + /** * generic_file_read_iter - generic filesystem read routine * @iocb: kernel I/O control block @@ -2803,18 +2806,9 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_needs_writeback(mapping, iocb->ki_pos, - iocb->ki_pos + count - 1)) - return -EAGAIN; - } else { - retval = filemap_write_and_wait_range(mapping, - iocb->ki_pos, - iocb->ki_pos + count - 1); - if (retval < 0) - return retval; - } - + retval = kiocb_write_and_wait(iocb, count); + if (retval < 0) + return retval; file_accessed(file); retval = mapping->a_ops->direct_IO(iocb, iter); @@ -2878,9 +2872,24 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, return spliced; } -/* - * Splice folios from the pagecache of a buffered (ie. non-O_DIRECT) file into - * a pipe. +/** + * filemap_splice_read - Splice data from a file's pagecache into a pipe + * @in: The file to read from + * @ppos: Pointer to the file position to read from + * @pipe: The pipe to splice into + * @len: The amount to splice + * @flags: The SPLICE_F_* flags + * + * This function gets folios from a file's pagecache and splices them into the + * pipe. Readahead will be called as necessary to fill more folios. This may + * be used for blockdevs also. + * + * Return: On success, the number of bytes read will be returned and *@ppos + * will be updated if appropriate; 0 will be returned if there is no more data + * to be read; -EAGAIN will be returned if the pipe had no space, and some + * other negative error code will be returned on error. A short read may occur + * if the pipe has insufficient space, we reach the end of the data or we hit a + * hole. */ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, @@ -2893,6 +2902,9 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, bool writably_mapped; int i, error = 0; + if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes)) + return 0; + init_sync_kiocb(&iocb, in); iocb.ki_pos = *ppos; @@ -2906,7 +2918,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, do { cond_resched(); - if (*ppos >= i_size_read(file_inode(in))) + if (*ppos >= i_size_read(in->f_mapping->host)) break; iocb.ki_pos = *ppos; @@ -2922,7 +2934,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though). */ - isize = i_size_read(file_inode(in)); + isize = i_size_read(in->f_mapping->host); if (unlikely(*ppos >= isize)) break; end_offset = min_t(loff_t, isize, *ppos + len); @@ -3419,13 +3431,6 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio, if (pmd_none(*vmf->pmd)) pmd_install(mm, vmf->pmd, &vmf->prealloc_pte); - /* See comment in handle_pte_fault() */ - if (pmd_devmap_trans_unstable(vmf->pmd)) { - folio_unlock(folio); - folio_put(folio); - return true; - } - return false; } @@ -3512,6 +3517,11 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); + if (!vmf->pte) { + folio_unlock(folio); + folio_put(folio); + goto out; + } do { again: page = folio_file_page(folio, xas.xa_index); @@ -3530,7 +3540,7 @@ again: * handled in the specific fault path, and it'll prohibit the * fault-around logic. */ - if (!pte_none(*vmf->pte)) + if (!pte_none(ptep_get(vmf->pte))) goto unlock; /* We're about to handle the fault */ @@ -3789,7 +3799,7 @@ EXPORT_SYMBOL(read_cache_page_gfp); /* * Warn about a page cache invalidation failure during a direct I/O write. */ -void dio_warn_stale_pagecache(struct file *filp) +static void dio_warn_stale_pagecache(struct file *filp) { static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); char pathname[128]; @@ -3806,48 +3816,33 @@ void dio_warn_stale_pagecache(struct file *filp) } } -ssize_t -generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) +void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - loff_t pos = iocb->ki_pos; - ssize_t written; - size_t write_len; - pgoff_t end; + struct address_space *mapping = iocb->ki_filp->f_mapping; - write_len = iov_iter_count(from); - end = (pos + write_len - 1) >> PAGE_SHIFT; + if (mapping->nrpages && + invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, + (iocb->ki_pos + count - 1) >> PAGE_SHIFT)) + dio_warn_stale_pagecache(iocb->ki_filp); +} - if (iocb->ki_flags & IOCB_NOWAIT) { - /* If there are pages to writeback, return */ - if (filemap_range_has_page(file->f_mapping, pos, - pos + write_len - 1)) - return -EAGAIN; - } else { - written = filemap_write_and_wait_range(mapping, pos, - pos + write_len - 1); - if (written) - goto out; - } +ssize_t +generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + size_t write_len = iov_iter_count(from); + ssize_t written; /* - * After a write we want buffered reads to be sure to go to disk to get - * the new data. We invalidate clean cached page from the region we're - * about to write. We do this *before* the write so that we can return - * without clobbering -EIOCBQUEUED from ->direct_IO(). - */ - written = invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); - /* * If a page can not be invalidated, return 0 to fall back * to buffered write. */ + written = kiocb_invalidate_pages(iocb, write_len); if (written) { if (written == -EBUSY) return 0; - goto out; + return written; } written = mapping->a_ops->direct_IO(iocb, from); @@ -3869,11 +3864,11 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) * * Skip invalidation for async writes or if mapping has no pages. */ - if (written > 0 && mapping->nrpages && - invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end)) - dio_warn_stale_pagecache(file); - if (written > 0) { + struct inode *inode = mapping->host; + loff_t pos = iocb->ki_pos; + + kiocb_invalidate_post_direct_write(iocb, written); pos += written; write_len -= written; if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { @@ -3884,7 +3879,6 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) } if (written != -EIOCBQUEUED) iov_iter_revert(from, write_len - iov_iter_count(from)); -out: return written; } EXPORT_SYMBOL(generic_file_direct_write); @@ -3963,7 +3957,10 @@ again: balance_dirty_pages_ratelimited(mapping); } while (iov_iter_count(i)); - return written ? written : status; + if (!written) + return status; + iocb->ki_pos += written; + return written; } EXPORT_SYMBOL(generic_perform_write); @@ -3992,25 +3989,19 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t written = 0; - ssize_t err; - ssize_t status; - - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); - err = file_remove_privs(file); - if (err) - goto out; + struct inode *inode = mapping->host; + ssize_t ret; - err = file_update_time(file); - if (err) - goto out; + ret = file_remove_privs(file); + if (ret) + return ret; - if (iocb->ki_flags & IOCB_DIRECT) { - loff_t pos, endbyte; + ret = file_update_time(file); + if (ret) + return ret; - written = generic_file_direct_write(iocb, from); + if (iocb->ki_flags & IOCB_DIRECT) { + ret = generic_file_direct_write(iocb, from); /* * If the write stopped short of completing, fall back to * buffered writes. Some filesystems do this for writes to @@ -4018,49 +4009,13 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) * not succeed (even if it did, DAX does not handle dirty * page-cache pages correctly). */ - if (written < 0 || !iov_iter_count(from) || IS_DAX(inode)) - goto out; - - pos = iocb->ki_pos; - status = generic_perform_write(iocb, from); - /* - * If generic_perform_write() returned a synchronous error - * then we want to return the number of bytes which were - * direct-written, or the error code if that was zero. Note - * that this differs from normal direct-io semantics, which - * will return -EFOO even if some bytes were written. - */ - if (unlikely(status < 0)) { - err = status; - goto out; - } - /* - * We need to ensure that the page cache pages are written to - * disk and invalidated to preserve the expected O_DIRECT - * semantics. - */ - endbyte = pos + status - 1; - err = filemap_write_and_wait_range(mapping, pos, endbyte); - if (err == 0) { - iocb->ki_pos = endbyte + 1; - written += status; - invalidate_mapping_pages(mapping, - pos >> PAGE_SHIFT, - endbyte >> PAGE_SHIFT); - } else { - /* - * We don't know how much we wrote, so just return - * the number of bytes which were direct-written - */ - } - } else { - written = generic_perform_write(iocb, from); - if (likely(written > 0)) - iocb->ki_pos += written; + if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode)) + return ret; + return direct_write_fallback(iocb, from, ret, + generic_perform_write(iocb, from)); } -out: - current->backing_dev_info = NULL; - return written ? written : err; + + return generic_perform_write(iocb, from); } EXPORT_SYMBOL(__generic_file_write_iter); @@ -4125,3 +4080,171 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp) return try_to_free_buffers(folio); } EXPORT_SYMBOL(filemap_release_folio); + +#ifdef CONFIG_CACHESTAT_SYSCALL +/** + * filemap_cachestat() - compute the page cache statistics of a mapping + * @mapping: The mapping to compute the statistics for. + * @first_index: The starting page cache index. + * @last_index: The final page index (inclusive). + * @cs: the cachestat struct to write the result to. + * + * This will query the page cache statistics of a mapping in the + * page range of [first_index, last_index] (inclusive). The statistics + * queried include: number of dirty pages, number of pages marked for + * writeback, and the number of (recently) evicted pages. + */ +static void filemap_cachestat(struct address_space *mapping, + pgoff_t first_index, pgoff_t last_index, struct cachestat *cs) +{ + XA_STATE(xas, &mapping->i_pages, first_index); + struct folio *folio; + + rcu_read_lock(); + xas_for_each(&xas, folio, last_index) { + unsigned long nr_pages; + pgoff_t folio_first_index, folio_last_index; + + if (xas_retry(&xas, folio)) + continue; + + if (xa_is_value(folio)) { + /* page is evicted */ + void *shadow = (void *)folio; + bool workingset; /* not used */ + int order = xa_get_order(xas.xa, xas.xa_index); + + nr_pages = 1 << order; + folio_first_index = round_down(xas.xa_index, 1 << order); + folio_last_index = folio_first_index + nr_pages - 1; + + /* Folios might straddle the range boundaries, only count covered pages */ + if (folio_first_index < first_index) + nr_pages -= first_index - folio_first_index; + + if (folio_last_index > last_index) + nr_pages -= folio_last_index - last_index; + + cs->nr_evicted += nr_pages; + +#ifdef CONFIG_SWAP /* implies CONFIG_MMU */ + if (shmem_mapping(mapping)) { + /* shmem file - in swap cache */ + swp_entry_t swp = radix_to_swp_entry(folio); + + shadow = get_shadow_from_swap_cache(swp); + } +#endif + if (workingset_test_recent(shadow, true, &workingset)) + cs->nr_recently_evicted += nr_pages; + + goto resched; + } + + nr_pages = folio_nr_pages(folio); + folio_first_index = folio_pgoff(folio); + folio_last_index = folio_first_index + nr_pages - 1; + + /* Folios might straddle the range boundaries, only count covered pages */ + if (folio_first_index < first_index) + nr_pages -= first_index - folio_first_index; + + if (folio_last_index > last_index) + nr_pages -= folio_last_index - last_index; + + /* page is in cache */ + cs->nr_cache += nr_pages; + + if (folio_test_dirty(folio)) + cs->nr_dirty += nr_pages; + + if (folio_test_writeback(folio)) + cs->nr_writeback += nr_pages; + +resched: + if (need_resched()) { + xas_pause(&xas); + cond_resched_rcu(); + } + } + rcu_read_unlock(); +} + +/* + * The cachestat(2) system call. + * + * cachestat() returns the page cache statistics of a file in the + * bytes range specified by `off` and `len`: number of cached pages, + * number of dirty pages, number of pages marked for writeback, + * number of evicted pages, and number of recently evicted pages. + * + * An evicted page is a page that is previously in the page cache + * but has been evicted since. A page is recently evicted if its last + * eviction was recent enough that its reentry to the cache would + * indicate that it is actively being used by the system, and that + * there is memory pressure on the system. + * + * `off` and `len` must be non-negative integers. If `len` > 0, + * the queried range is [`off`, `off` + `len`]. If `len` == 0, + * we will query in the range from `off` to the end of the file. + * + * The `flags` argument is unused for now, but is included for future + * extensibility. User should pass 0 (i.e no flag specified). + * + * Currently, hugetlbfs is not supported. + * + * Because the status of a page can change after cachestat() checks it + * but before it returns to the application, the returned values may + * contain stale information. + * + * return values: + * zero - success + * -EFAULT - cstat or cstat_range points to an illegal address + * -EINVAL - invalid flags + * -EBADF - invalid file descriptor + * -EOPNOTSUPP - file descriptor is of a hugetlbfs file + */ +SYSCALL_DEFINE4(cachestat, unsigned int, fd, + struct cachestat_range __user *, cstat_range, + struct cachestat __user *, cstat, unsigned int, flags) +{ + struct fd f = fdget(fd); + struct address_space *mapping; + struct cachestat_range csr; + struct cachestat cs; + pgoff_t first_index, last_index; + + if (!f.file) + return -EBADF; + + if (copy_from_user(&csr, cstat_range, + sizeof(struct cachestat_range))) { + fdput(f); + return -EFAULT; + } + + /* hugetlbfs is not supported */ + if (is_file_hugepages(f.file)) { + fdput(f); + return -EOPNOTSUPP; + } + + if (flags != 0) { + fdput(f); + return -EINVAL; + } + + first_index = csr.off >> PAGE_SHIFT; + last_index = + csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT; + memset(&cs, 0, sizeof(struct cachestat)); + mapping = f.file->f_mapping; + filemap_cachestat(mapping, first_index, last_index, &cs); + fdput(f); + + if (copy_to_user(cstat, &cs, sizeof(struct cachestat))) + return -EFAULT; + + return 0; +} +#endif /* CONFIG_CACHESTAT_SYSCALL */ diff --git a/mm/frontswap.c b/mm/frontswap.c index 279e55b4ed87..2fb5df3384b8 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -206,6 +206,7 @@ int __frontswap_load(struct page *page) int type = swp_type(entry); struct swap_info_struct *sis = swap_info[type]; pgoff_t offset = swp_offset(entry); + bool exclusive = false; VM_BUG_ON(!frontswap_ops); VM_BUG_ON(!PageLocked(page)); @@ -215,9 +216,14 @@ int __frontswap_load(struct page *page) return -1; /* Try loading from each implementation, until one succeeds. */ - ret = frontswap_ops->load(type, offset, page); - if (ret == 0) + ret = frontswap_ops->load(type, offset, page, &exclusive); + if (ret == 0) { inc_frontswap_loads(); + if (exclusive) { + SetPageDirty(page); + __frontswap_clear(sis, offset); + } + } return ret; } @@ -18,6 +18,7 @@ #include <linux/migrate.h> #include <linux/mm_inline.h> #include <linux/sched/mm.h> +#include <linux/shmem_fs.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> @@ -51,7 +52,8 @@ static inline void sanity_check_pinned_pages(struct page **pages, struct page *page = *pages; struct folio *folio = page_folio(page); - if (!folio_test_anon(folio)) + if (is_zero_page(page) || + !folio_test_anon(folio)) continue; if (!folio_test_large(folio) || folio_test_hugetlb(folio)) VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page); @@ -123,63 +125,72 @@ retry: */ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) { + struct folio *folio; + + if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0)) + return NULL; + if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) return NULL; if (flags & FOLL_GET) return try_get_folio(page, refs); - else if (flags & FOLL_PIN) { - struct folio *folio; - /* - * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a - * right zone, so fail and let the caller fall back to the slow - * path. - */ - if (unlikely((flags & FOLL_LONGTERM) && - !is_longterm_pinnable_page(page))) - return NULL; + /* FOLL_PIN is set */ - /* - * CAUTION: Don't use compound_head() on the page before this - * point, the result won't be stable. - */ - folio = try_get_folio(page, refs); - if (!folio) - return NULL; - - /* - * When pinning a large folio, use an exact count to track it. - * - * However, be sure to *also* increment the normal folio - * refcount field at least once, so that the folio really - * is pinned. That's why the refcount from the earlier - * try_get_folio() is left intact. - */ - if (folio_test_large(folio)) - atomic_add(refs, &folio->_pincount); - else - folio_ref_add(folio, - refs * (GUP_PIN_COUNTING_BIAS - 1)); - /* - * Adjust the pincount before re-checking the PTE for changes. - * This is essentially a smp_mb() and is paired with a memory - * barrier in page_try_share_anon_rmap(). - */ - smp_mb__after_atomic(); + /* + * Don't take a pin on the zero page - it's not going anywhere + * and it is used in a *lot* of places. + */ + if (is_zero_page(page)) + return page_folio(page); - node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); + folio = try_get_folio(page, refs); + if (!folio) + return NULL; - return folio; + /* + * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a + * right zone, so fail and let the caller fall back to the slow + * path. + */ + if (unlikely((flags & FOLL_LONGTERM) && + !folio_is_longterm_pinnable(folio))) { + if (!put_devmap_managed_page_refs(&folio->page, refs)) + folio_put_refs(folio, refs); + return NULL; } - WARN_ON_ONCE(1); - return NULL; + /* + * When pinning a large folio, use an exact count to track it. + * + * However, be sure to *also* increment the normal folio + * refcount field at least once, so that the folio really + * is pinned. That's why the refcount from the earlier + * try_get_folio() is left intact. + */ + if (folio_test_large(folio)) + atomic_add(refs, &folio->_pincount); + else + folio_ref_add(folio, + refs * (GUP_PIN_COUNTING_BIAS - 1)); + /* + * Adjust the pincount before re-checking the PTE for changes. + * This is essentially a smp_mb() and is paired with a memory + * barrier in page_try_share_anon_rmap(). + */ + smp_mb__after_atomic(); + + node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); + + return folio; } static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) { if (flags & FOLL_PIN) { + if (is_zero_folio(folio)) + return; node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs); if (folio_test_large(folio)) atomic_sub(refs, &folio->_pincount); @@ -225,6 +236,13 @@ int __must_check try_grab_page(struct page *page, unsigned int flags) folio_ref_inc(folio); else if (flags & FOLL_PIN) { /* + * Don't take a pin on the zero page - it's not going anywhere + * and it is used in a *lot* of places. + */ + if (is_zero_page(page)) + return 0; + + /* * Similar to try_grab_folio(): be sure to *also* * increment the normal page refcount field at least once, * so that the page really is pinned. @@ -258,6 +276,33 @@ void unpin_user_page(struct page *page) } EXPORT_SYMBOL(unpin_user_page); +/** + * folio_add_pin - Try to get an additional pin on a pinned folio + * @folio: The folio to be pinned + * + * Get an additional pin on a folio we already have a pin on. Makes no change + * if the folio is a zero_page. + */ +void folio_add_pin(struct folio *folio) +{ + if (is_zero_folio(folio)) + return; + + /* + * Similar to try_grab_folio(): be sure to *also* increment the normal + * page refcount field at least once, so that the page really is + * pinned. + */ + if (folio_test_large(folio)) { + WARN_ON_ONCE(atomic_read(&folio->_pincount) < 1); + folio_ref_inc(folio); + atomic_inc(&folio->_pincount); + } else { + WARN_ON_ONCE(folio_ref_count(folio) < GUP_PIN_COUNTING_BIAS); + folio_ref_add(folio, GUP_PIN_COUNTING_BIAS); + } +} + static inline struct folio *gup_folio_range_next(struct page *start, unsigned long npages, unsigned long i, unsigned int *ntails) { @@ -476,13 +521,14 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, pte_t *pte, unsigned int flags) { if (flags & FOLL_TOUCH) { - pte_t entry = *pte; + pte_t orig_entry = ptep_get(pte); + pte_t entry = orig_entry; if (flags & FOLL_WRITE) entry = pte_mkdirty(entry); entry = pte_mkyoung(entry); - if (!pte_same(*pte, entry)) { + if (!pte_same(orig_entry, entry)) { set_pte_at(vma->vm_mm, address, pte, entry); update_mmu_cache(vma, address, pte); } @@ -544,11 +590,11 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) return ERR_PTR(-EINVAL); - if (unlikely(pmd_bad(*pmd))) - return no_page_table(vma, flags); ptep = pte_offset_map_lock(mm, pmd, address, &ptl); - pte = *ptep; + if (!ptep) + return no_page_table(vma, flags); + pte = ptep_get(ptep); if (!pte_present(pte)) goto no_page; if (pte_protnone(pte) && !gup_can_follow_protnone(flags)) @@ -653,11 +699,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; pmd = pmd_offset(pudp, address); - /* - * The READ_ONCE() will stabilize the pmdval in a register or - * on the stack so that it will stop changing under the code. - */ - pmdval = READ_ONCE(*pmd); + pmdval = pmdp_get_lockless(pmd); if (pmd_none(pmdval)) return no_page_table(vma, flags); if (!pmd_present(pmdval)) @@ -685,21 +727,10 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } if (flags & FOLL_SPLIT_PMD) { - int ret; - page = pmd_page(*pmd); - if (is_huge_zero_page(page)) { - spin_unlock(ptl); - ret = 0; - split_huge_pmd(vma, pmd, address); - if (pmd_trans_unstable(pmd)) - ret = -EBUSY; - } else { - spin_unlock(ptl); - split_huge_pmd(vma, pmd, address); - ret = pte_alloc(mm, pmd) ? -ENOMEM : 0; - } - - return ret ? ERR_PTR(ret) : + spin_unlock(ptl); + split_huge_pmd(vma, pmd, address); + /* If pmd was left empty, stuff a page table in there quickly */ + return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) : follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } page = follow_trans_huge_pmd(vma, address, pmd, flags); @@ -835,6 +866,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, pud_t *pud; pmd_t *pmd; pte_t *pte; + pte_t entry; int ret = -EFAULT; /* user gate pages are read-only */ @@ -855,18 +887,20 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return -EFAULT; - VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map(pmd, address); - if (pte_none(*pte)) + if (!pte) + return -EFAULT; + entry = ptep_get(pte); + if (pte_none(entry)) goto unmap; *vma = get_gate_vma(mm); if (!page) goto out; - *page = vm_normal_page(*vma, address, *pte); + *page = vm_normal_page(*vma, address, entry); if (!*page) { - if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) + if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry))) goto unmap; - *page = pte_page(*pte); + *page = pte_page(entry); } ret = try_grab_page(*page, gup_flags); if (unlikely(ret)) @@ -959,16 +993,54 @@ static int faultin_page(struct vm_area_struct *vma, return 0; } +/* + * Writing to file-backed mappings which require folio dirty tracking using GUP + * is a fundamentally broken operation, as kernel write access to GUP mappings + * do not adhere to the semantics expected by a file system. + * + * Consider the following scenario:- + * + * 1. A folio is written to via GUP which write-faults the memory, notifying + * the file system and dirtying the folio. + * 2. Later, writeback is triggered, resulting in the folio being cleaned and + * the PTE being marked read-only. + * 3. The GUP caller writes to the folio, as it is mapped read/write via the + * direct mapping. + * 4. The GUP caller, now done with the page, unpins it and sets it dirty + * (though it does not have to). + * + * This results in both data being written to a folio without writenotify, and + * the folio being dirtied unexpectedly (if the caller decides to do so). + */ +static bool writable_file_mapping_allowed(struct vm_area_struct *vma, + unsigned long gup_flags) +{ + /* + * If we aren't pinning then no problematic write can occur. A long term + * pin is the most egregious case so this is the case we disallow. + */ + if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) != + (FOLL_PIN | FOLL_LONGTERM)) + return true; + + /* + * If the VMA does not require dirty tracking then no problematic write + * can occur either. + */ + return !vma_needs_dirty_tracking(vma); +} + static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) { vm_flags_t vm_flags = vma->vm_flags; int write = (gup_flags & FOLL_WRITE); int foreign = (gup_flags & FOLL_REMOTE); + bool vma_anon = vma_is_anonymous(vma); if (vm_flags & (VM_IO | VM_PFNMAP)) return -EFAULT; - if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma)) + if ((gup_flags & FOLL_ANON) && !vma_anon) return -EFAULT; if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma)) @@ -978,6 +1050,10 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) return -EFAULT; if (write) { + if (!vma_anon && + !writable_file_mapping_allowed(vma, gup_flags)) + return -EFAULT; + if (!(vm_flags & VM_WRITE)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; @@ -1024,8 +1100,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. * @locked: whether we're still with the mmap_lock held * * Returns either number of pages pinned (which may be less than the @@ -1039,8 +1113,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * * The caller is responsible for releasing returned @pages, via put_page(). * - * @vmas are valid only as long as mmap_lock is held. - * * Must be called with mmap_lock held. It may be released. See below. * * __get_user_pages walks a process's page tables and takes a reference to @@ -1076,7 +1148,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) static long __get_user_pages(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) + int *locked) { long ret = 0, i = 0; struct vm_area_struct *vma = NULL; @@ -1120,9 +1192,9 @@ static long __get_user_pages(struct mm_struct *mm, goto out; if (is_vm_hugetlb_page(vma)) { - i = follow_hugetlb_page(mm, vma, pages, vmas, - &start, &nr_pages, i, - gup_flags, locked); + i = follow_hugetlb_page(mm, vma, pages, + &start, &nr_pages, i, + gup_flags, locked); if (!*locked) { /* * We've got a VM_FAULT_RETRY @@ -1187,10 +1259,6 @@ retry: ctx.page_mask = 0; } next_page: - if (vmas) { - vmas[i] = vma; - ctx.page_mask = 0; - } page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); if (page_increm > nr_pages) page_increm = nr_pages; @@ -1349,7 +1417,6 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, - struct vm_area_struct **vmas, int *locked, unsigned int flags) { @@ -1387,7 +1454,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, pages_done = 0; for (;;) { ret = __get_user_pages(mm, start, nr_pages, flags, pages, - vmas, locked); + locked); if (!(flags & FOLL_UNLOCKABLE)) { /* VM_FAULT_RETRY couldn't trigger, bypass */ pages_done = ret; @@ -1451,7 +1518,7 @@ retry: *locked = 1; ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED, - pages, NULL, locked); + pages, locked); if (!*locked) { /* Continue to retry until we succeeded */ BUG_ON(ret != 0); @@ -1549,7 +1616,7 @@ long populate_vma_page_range(struct vm_area_struct *vma, * not result in a stack expansion that recurses back here. */ ret = __get_user_pages(mm, start, nr_pages, gup_flags, - NULL, NULL, locked ? locked : &local_locked); + NULL, locked ? locked : &local_locked); lru_add_drain(); return ret; } @@ -1607,7 +1674,7 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, return -EINVAL; ret = __get_user_pages(mm, start, nr_pages, gup_flags, - NULL, NULL, locked); + NULL, locked); lru_add_drain(); return ret; } @@ -1675,8 +1742,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) #else /* CONFIG_MMU */ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, - struct vm_area_struct **vmas, int *locked, - unsigned int foll_flags) + int *locked, unsigned int foll_flags) { struct vm_area_struct *vma; bool must_unlock = false; @@ -1720,8 +1786,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, if (pages[i]) get_page(pages[i]); } - if (vmas) - vmas[i] = vma; + start = (start + PAGE_SIZE) & PAGE_MASK; } @@ -1902,8 +1967,7 @@ struct page *get_dump_page(unsigned long addr) int locked = 0; int ret; - ret = __get_user_pages_locked(current->mm, addr, 1, &page, NULL, - &locked, + ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked, FOLL_FORCE | FOLL_DUMP | FOLL_GET); return (ret == 1) ? page : NULL; } @@ -2076,7 +2140,6 @@ static long __gup_longterm_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, - struct vm_area_struct **vmas, int *locked, unsigned int gup_flags) { @@ -2084,13 +2147,13 @@ static long __gup_longterm_locked(struct mm_struct *mm, long rc, nr_pinned_pages; if (!(gup_flags & FOLL_LONGTERM)) - return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, + return __get_user_pages_locked(mm, start, nr_pages, pages, locked, gup_flags); flags = memalloc_pin_save(); do { nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages, - pages, vmas, locked, + pages, locked, gup_flags); if (nr_pinned_pages <= 0) { rc = nr_pinned_pages; @@ -2108,9 +2171,8 @@ static long __gup_longterm_locked(struct mm_struct *mm, * Check that the given flags are valid for the exported gup/pup interface, and * update them with the required flags that the caller must have set. */ -static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, - int *locked, unsigned int *gup_flags_p, - unsigned int to_set) +static bool is_valid_gup_args(struct page **pages, int *locked, + unsigned int *gup_flags_p, unsigned int to_set) { unsigned int gup_flags = *gup_flags_p; @@ -2152,13 +2214,6 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, (gup_flags & FOLL_PCI_P2PDMA))) return false; - /* - * Can't use VMAs with locked, as locked allows GUP to unlock - * which invalidates the vmas array - */ - if (WARN_ON_ONCE(vmas && (gup_flags & FOLL_UNLOCKABLE))) - return false; - *gup_flags_p = gup_flags; return true; } @@ -2173,8 +2228,6 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. * @locked: pointer to lock flag indicating whether lock is held and * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. @@ -2189,8 +2242,6 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, * * The caller is responsible for releasing returned @pages, via put_page(). * - * @vmas are valid only as long as mmap_lock is held. - * * Must be called with mmap_lock held for read or write. * * get_user_pages_remote walks a process's page tables and takes a reference @@ -2227,15 +2278,15 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) + int *locked) { int local_locked = 1; - if (!is_valid_gup_args(pages, vmas, locked, &gup_flags, + if (!is_valid_gup_args(pages, locked, &gup_flags, FOLL_TOUCH | FOLL_REMOTE)) return -EINVAL; - return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, + return __get_user_pages_locked(mm, start, nr_pages, pages, locked ? locked : &local_locked, gup_flags); } @@ -2245,7 +2296,7 @@ EXPORT_SYMBOL(get_user_pages_remote); long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) + int *locked) { return 0; } @@ -2259,8 +2310,6 @@ long get_user_pages_remote(struct mm_struct *mm, * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. * * This is the same as get_user_pages_remote(), just with a less-flexible * calling convention where we assume that the mm being operated on belongs to @@ -2268,16 +2317,15 @@ long get_user_pages_remote(struct mm_struct *mm, * obviously don't pass FOLL_REMOTE in here. */ long get_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) + unsigned int gup_flags, struct page **pages) { int locked = 1; - if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_TOUCH)) + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, - vmas, &locked, gup_flags); + &locked, gup_flags); } EXPORT_SYMBOL(get_user_pages); @@ -2301,12 +2349,12 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, { int locked = 0; - if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH | FOLL_UNLOCKABLE)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, - NULL, &locked, gup_flags); + &locked, gup_flags); } EXPORT_SYMBOL(get_user_pages_unlocked); @@ -2345,6 +2393,82 @@ EXPORT_SYMBOL(get_user_pages_unlocked); */ #ifdef CONFIG_HAVE_FAST_GUP +/* + * Used in the GUP-fast path to determine whether a pin is permitted for a + * specific folio. + * + * This call assumes the caller has pinned the folio, that the lowest page table + * level still points to this folio, and that interrupts have been disabled. + * + * Writing to pinned file-backed dirty tracked folios is inherently problematic + * (see comment describing the writable_file_mapping_allowed() function). We + * therefore try to avoid the most egregious case of a long-term mapping doing + * so. + * + * This function cannot be as thorough as that one as the VMA is not available + * in the fast path, so instead we whitelist known good cases and if in doubt, + * fall back to the slow path. + */ +static bool folio_fast_pin_allowed(struct folio *folio, unsigned int flags) +{ + struct address_space *mapping; + unsigned long mapping_flags; + + /* + * If we aren't pinning then no problematic write can occur. A long term + * pin is the most egregious case so this is the one we disallow. + */ + if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) != + (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) + return true; + + /* The folio is pinned, so we can safely access folio fields. */ + + if (WARN_ON_ONCE(folio_test_slab(folio))) + return false; + + /* hugetlb mappings do not require dirty-tracking. */ + if (folio_test_hugetlb(folio)) + return true; + + /* + * GUP-fast disables IRQs. When IRQS are disabled, RCU grace periods + * cannot proceed, which means no actions performed under RCU can + * proceed either. + * + * inodes and thus their mappings are freed under RCU, which means the + * mapping cannot be freed beneath us and thus we can safely dereference + * it. + */ + lockdep_assert_irqs_disabled(); + + /* + * However, there may be operations which _alter_ the mapping, so ensure + * we read it once and only once. + */ + mapping = READ_ONCE(folio->mapping); + + /* + * The mapping may have been truncated, in any case we cannot determine + * if this mapping is safe - fall back to slow path to determine how to + * proceed. + */ + if (!mapping) + return false; + + /* Anonymous folios pose no problem. */ + mapping_flags = (unsigned long)mapping & PAGE_MAPPING_FLAGS; + if (mapping_flags) + return mapping_flags & PAGE_MAPPING_ANON; + + /* + * At this point, we know the mapping is non-null and points to an + * address_space object. The only remaining whitelisted file system is + * shmem. + */ + return shmem_mapping(mapping); +} + static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, unsigned int flags, struct page **pages) @@ -2389,6 +2513,8 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, pte_t *ptep, *ptem; ptem = ptep = pte_offset_map(&pmd, addr); + if (!ptep) + return 0; do { pte_t pte = ptep_get_lockless(ptep); struct page *page; @@ -2425,7 +2551,12 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, } if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) || - unlikely(pte_val(pte) != pte_val(*ptep))) { + unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { + gup_put_folio(folio, 1, flags); + goto pte_unmap; + } + + if (!folio_fast_pin_allowed(folio, flags)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } @@ -2617,7 +2748,12 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, if (!folio) return 0; - if (unlikely(pte_val(pte) != pte_val(*ptep))) { + if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { + gup_put_folio(folio, refs, flags); + return 0; + } + + if (!folio_fast_pin_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } @@ -2688,6 +2824,10 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; } + if (!folio_fast_pin_allowed(folio, flags)) { + gup_put_folio(folio, refs, flags); + return 0; + } if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; @@ -2728,6 +2868,11 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 0; } + if (!folio_fast_pin_allowed(folio, flags)) { + gup_put_folio(folio, refs, flags); + return 0; + } + if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; @@ -2763,6 +2908,16 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, return 0; } + if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { + gup_put_folio(folio, refs, flags); + return 0; + } + + if (!folio_fast_pin_allowed(folio, flags)) { + gup_put_folio(folio, refs, flags); + return 0; + } + *nr += refs; folio_set_referenced(folio); return 1; @@ -2977,7 +3132,7 @@ static int internal_get_user_pages_fast(unsigned long start, start = untagged_addr(start) & PAGE_MASK; len = nr_pages << PAGE_SHIFT; if (check_add_overflow(start, len, &end)) - return 0; + return -EOVERFLOW; if (end > TASK_SIZE_MAX) return -EFAULT; if (unlikely(!access_ok((void __user *)start, len))) @@ -2991,7 +3146,7 @@ static int internal_get_user_pages_fast(unsigned long start, start += nr_pinned << PAGE_SHIFT; pages += nr_pinned; ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned, - pages, NULL, &locked, + pages, &locked, gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE); if (ret < 0) { /* @@ -3033,7 +3188,7 @@ int get_user_pages_fast_only(unsigned long start, int nr_pages, * FOLL_FAST_ONLY is required in order to match the API description of * this routine: no fall back to regular ("slow") GUP. */ - if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET | FOLL_FAST_ONLY)) return -EINVAL; @@ -3066,7 +3221,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, * FOLL_GET, because gup fast is always a "pin with a +1 page refcount" * request. */ - if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_GET)) + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET)) return -EINVAL; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } @@ -3087,11 +3242,14 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for further details. + * + * Note that if a zero_page is amongst the returned pages, it will not have + * pins in it and unpin_user_page() will not remove pins from it. */ int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { - if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_PIN)) + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN)) return -EINVAL; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } @@ -3106,8 +3264,6 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast); * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. * @locked: pointer to lock flag indicating whether lock is held and * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. @@ -3118,18 +3274,21 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast); * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. + * + * Note that if a zero_page is amongst the returned pages, it will not have + * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) + int *locked) { int local_locked = 1; - if (!is_valid_gup_args(pages, vmas, locked, &gup_flags, + if (!is_valid_gup_args(pages, locked, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE)) return 0; - return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, + return __gup_longterm_locked(mm, start, nr_pages, pages, locked ? locked : &local_locked, gup_flags); } @@ -3143,25 +3302,25 @@ EXPORT_SYMBOL(pin_user_pages_remote); * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. * * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and * FOLL_PIN is set. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. + * + * Note that if a zero_page is amongst the returned pages, it will not have + * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) + unsigned int gup_flags, struct page **pages) { int locked = 1; - if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_PIN)) + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN)) return 0; return __gup_longterm_locked(current->mm, start, nr_pages, - pages, vmas, &locked, gup_flags); + pages, &locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages); @@ -3169,17 +3328,20 @@ EXPORT_SYMBOL(pin_user_pages); * pin_user_pages_unlocked() is the FOLL_PIN variant of * get_user_pages_unlocked(). Behavior is the same, except that this one sets * FOLL_PIN and rejects FOLL_GET. + * + * Note that if a zero_page is amongst the returned pages, it will not have + * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { int locked = 0; - if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE)) return 0; - return __gup_longterm_locked(current->mm, start, nr_pages, pages, NULL, + return __gup_longterm_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages_unlocked); diff --git a/mm/gup_test.c b/mm/gup_test.c index c0421b786dcd..eeb3f4d87c51 100644 --- a/mm/gup_test.c +++ b/mm/gup_test.c @@ -40,24 +40,25 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages, unsigned long nr_pages) { unsigned long i; - struct page *page; + struct folio *folio; switch (cmd) { case PIN_FAST_BENCHMARK: case PIN_BASIC_TEST: case PIN_LONGTERM_BENCHMARK: for (i = 0; i < nr_pages; i++) { - page = pages[i]; - if (WARN(!page_maybe_dma_pinned(page), + folio = page_folio(pages[i]); + + if (WARN(!folio_maybe_dma_pinned(folio), "pages[%lu] is NOT dma-pinned\n", i)) { - dump_page(page, "gup_test failure"); + dump_page(&folio->page, "gup_test failure"); break; } else if (cmd == PIN_LONGTERM_BENCHMARK && - WARN(!is_longterm_pinnable_page(page), + WARN(!folio_is_longterm_pinnable(folio), "pages[%lu] is NOT pinnable but pinned\n", i)) { - dump_page(page, "gup_test failure"); + dump_page(&folio->page, "gup_test failure"); break; } } @@ -139,29 +140,27 @@ static int __gup_test_ioctl(unsigned int cmd, pages + i); break; case GUP_BASIC_TEST: - nr = get_user_pages(addr, nr, gup->gup_flags, pages + i, - NULL); + nr = get_user_pages(addr, nr, gup->gup_flags, pages + i); break; case PIN_FAST_BENCHMARK: nr = pin_user_pages_fast(addr, nr, gup->gup_flags, pages + i); break; case PIN_BASIC_TEST: - nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i, - NULL); + nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i); break; case PIN_LONGTERM_BENCHMARK: nr = pin_user_pages(addr, nr, gup->gup_flags | FOLL_LONGTERM, - pages + i, NULL); + pages + i); break; case DUMP_USER_PAGES_TEST: if (gup->test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) nr = pin_user_pages(addr, nr, gup->gup_flags, - pages + i, NULL); + pages + i); else nr = get_user_pages(addr, nr, gup->gup_flags, - pages + i, NULL); + pages + i); break; default: ret = -EINVAL; @@ -271,7 +270,7 @@ static inline int pin_longterm_test_start(unsigned long arg) gup_flags, pages); else cur_pages = pin_user_pages(addr, remaining_pages, - gup_flags, pages, NULL); + gup_flags, pages); if (cur_pages < 0) { pin_longterm_test_stop(); ret = cur_pages; diff --git a/mm/highmem.c b/mm/highmem.c index db251e77f98f..e19269093a93 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -161,7 +161,7 @@ struct page *__kmap_to_page(void *vaddr) /* kmap() mappings */ if (WARN_ON_ONCE(addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP))) - return pte_page(pkmap_page_table[PKMAP_NR(addr)]); + return pte_page(ptep_get(&pkmap_page_table[PKMAP_NR(addr)])); /* kmap_local_page() mappings */ if (WARN_ON_ONCE(base >= __fix_to_virt(FIX_KMAP_END) && @@ -191,6 +191,7 @@ static void flush_all_zero_pkmaps(void) for (i = 0; i < LAST_PKMAP; i++) { struct page *page; + pte_t ptent; /* * zero means we don't have anything to do, @@ -203,7 +204,8 @@ static void flush_all_zero_pkmaps(void) pkmap_count[i] = 0; /* sanity check */ - BUG_ON(pte_none(pkmap_page_table[i])); + ptent = ptep_get(&pkmap_page_table[i]); + BUG_ON(pte_none(ptent)); /* * Don't need an atomic fetch-and-clear op here; @@ -212,7 +214,7 @@ static void flush_all_zero_pkmaps(void) * getting the kmap_lock (which is held here). * So no dangers, even with speculative execution. */ - page = pte_page(pkmap_page_table[i]); + page = pte_page(ptent); pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]); set_page_address(page, NULL); @@ -511,7 +513,7 @@ static inline bool kmap_high_unmap_local(unsigned long vaddr) { #ifdef ARCH_NEEDS_KMAP_HIGH_GET if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { - kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); + kunmap_high(pte_page(ptep_get(&pkmap_page_table[PKMAP_NR(vaddr)]))); return true; } #endif @@ -548,7 +550,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) idx = arch_kmap_local_map_idx(kmap_local_idx_push(), pfn); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); kmap_pte = kmap_get_pte(vaddr, idx); - BUG_ON(!pte_none(*kmap_pte)); + BUG_ON(!pte_none(ptep_get(kmap_pte))); pteval = pfn_pte(pfn, prot); arch_kmap_local_set_pte(&init_mm, vaddr, kmap_pte, pteval); arch_kmap_local_post_map(vaddr, pteval); @@ -228,7 +228,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, struct hmm_range *range = hmm_vma_walk->range; unsigned int required_fault; unsigned long cpu_flags; - pte_t pte = *ptep; + pte_t pte = ptep_get(ptep); uint64_t pfn_req_flags = *hmm_pfn; if (pte_none_mostly(pte)) { @@ -332,7 +332,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, pmd_t pmd; again: - pmd = READ_ONCE(*pmdp); + pmd = pmdp_get_lockless(pmdp); if (pmd_none(pmd)) return hmm_vma_walk_hole(start, end, -1, walk); @@ -381,6 +381,8 @@ again: } ptep = pte_offset_map(pmdp, addr); + if (!ptep) + goto again; for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) { int r; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 624671aaa60d..eb3678360b97 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -583,7 +583,7 @@ void prep_transhuge_page(struct page *page) VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); INIT_LIST_HEAD(&folio->_deferred_list); - set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); + folio_set_compound_dtor(folio, TRANSHUGE_PAGE_DTOR); } static inline bool is_transparent_hugepage(struct page *page) @@ -1344,7 +1344,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) /* * See do_wp_page(): we can only reuse the folio exclusively if * there are no additional references. Note that we always drain - * the LRU pagevecs immediately after adding a THP. + * the LRU cache immediately after adding a THP. */ if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio) * folio_nr_pages(folio)) @@ -1760,9 +1760,10 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, /* * The destination pmd shouldn't be established, free_pgtables() - * should have release it. + * should have released it; but move_page_tables() might have already + * inserted a page table, if racing against shmem/file collapse. */ - if (WARN_ON(!pmd_none(*new_pmd))) { + if (!pmd_none(*new_pmd)) { VM_BUG_ON(pmd_trans_huge(*new_pmd)); return false; } @@ -2036,6 +2037,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; pgtable_t pgtable; pmd_t _pmd, old_pmd; + unsigned long addr; + pte_t *pte; int i; /* @@ -2051,17 +2054,20 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); - for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { - pte_t *pte, entry; - entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); + pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte); + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { + pte_t entry; + + entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot); entry = pte_mkspecial(entry); if (pmd_uffd_wp(old_pmd)) entry = pte_mkuffd_wp(entry); - pte = pte_offset_map(&_pmd, haddr); - VM_BUG_ON(!pte_none(*pte)); - set_pte_at(mm, haddr, pte, entry); - pte_unmap(pte); + VM_BUG_ON(!pte_none(ptep_get(pte))); + set_pte_at(mm, addr, pte, entry); + pte++; } + pte_unmap(pte - 1); smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); } @@ -2076,6 +2082,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; bool anon_exclusive = false, dirty = false; unsigned long addr; + pte_t *pte; int i; VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); @@ -2204,8 +2211,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); + pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte); for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { - pte_t entry, *pte; + pte_t entry; /* * Note that NUMA hinting access restrictions are not * transferred to avoid any possibility of altering @@ -2248,11 +2257,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_mkuffd_wp(entry); page_add_anon_rmap(page + i, vma, addr, false); } - pte = pte_offset_map(&_pmd, addr); - BUG_ON(!pte_none(*pte)); + VM_BUG_ON(!pte_none(ptep_get(pte))); set_pte_at(mm, addr, pte, entry); - pte_unmap(pte); + pte++; } + pte_unmap(pte - 1); if (!pmd_migration) page_remove_rmap(page, vma, true); @@ -2792,12 +2801,19 @@ void free_transhuge_page(struct page *page) struct deferred_split *ds_queue = get_deferred_split_queue(folio); unsigned long flags; - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); - if (!list_empty(&folio->_deferred_list)) { - ds_queue->split_queue_len--; - list_del(&folio->_deferred_list); + /* + * At this point, there is no one trying to add the folio to + * deferred_list. If folio is not in deferred_list, it's safe + * to check without acquiring the split_queue_lock. + */ + if (data_race(!list_empty(&folio->_deferred_list))) { + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + if (!list_empty(&folio->_deferred_list)) { + ds_queue->split_queue_len--; + list_del(&folio->_deferred_list); + } + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); } - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); free_compound_page(page); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f154019e6b84..bce28cca73a1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1489,7 +1489,6 @@ static void __destroy_compound_gigantic_folio(struct folio *folio, set_page_refcounted(p); } - folio_set_order(folio, 0); __folio_clear_head(folio); } @@ -1951,9 +1950,6 @@ static bool __prep_compound_gigantic_folio(struct folio *folio, struct page *p; __folio_clear_reserved(folio); - __folio_set_head(folio); - /* we rely on prep_new_hugetlb_folio to set the destructor */ - folio_set_order(folio, order); for (i = 0; i < nr_pages; i++) { p = folio_page(folio, i); @@ -1999,6 +1995,9 @@ static bool __prep_compound_gigantic_folio(struct folio *folio, if (i != 0) set_compound_head(p, &folio->page); } + __folio_set_head(folio); + /* we rely on prep_new_hugetlb_folio to set the destructor */ + folio_set_order(folio, order); atomic_set(&folio->_entire_mapcount, -1); atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); @@ -2017,8 +2016,6 @@ out_error: p = folio_page(folio, j); __ClearPageReserved(p); } - folio_set_order(folio, 0); - __folio_clear_head(folio); return false; } @@ -5016,7 +5013,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *src_vma) { pte_t *src_pte, *dst_pte, entry; - struct page *ptepage; + struct folio *pte_folio; unsigned long addr; bool cow = is_cow_mapping(src_vma->vm_flags); struct hstate *h = hstate_vma(src_vma); @@ -5115,8 +5112,8 @@ again: set_huge_pte_at(dst, addr, dst_pte, entry); } else { entry = huge_ptep_get(src_pte); - ptepage = pte_page(entry); - get_page(ptepage); + pte_folio = page_folio(pte_page(entry)); + folio_get(pte_folio); /* * Failing to duplicate the anon rmap is a rare case @@ -5128,10 +5125,10 @@ again: * need to be without the pgtable locks since we could * sleep during the process. */ - if (!PageAnon(ptepage)) { - page_dup_file_rmap(ptepage, true); - } else if (page_try_dup_anon_rmap(ptepage, true, - src_vma)) { + if (!folio_test_anon(pte_folio)) { + page_dup_file_rmap(&pte_folio->page, true); + } else if (page_try_dup_anon_rmap(&pte_folio->page, + true, src_vma)) { pte_t src_pte_old = entry; struct folio *new_folio; @@ -5140,14 +5137,14 @@ again: /* Do not use reserve as it's private owned */ new_folio = alloc_hugetlb_folio(dst_vma, addr, 1); if (IS_ERR(new_folio)) { - put_page(ptepage); + folio_put(pte_folio); ret = PTR_ERR(new_folio); break; } ret = copy_user_large_folio(new_folio, - page_folio(ptepage), - addr, dst_vma); - put_page(ptepage); + pte_folio, + addr, dst_vma); + folio_put(pte_folio); if (ret) { folio_put(new_folio); break; @@ -5540,7 +5537,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, const bool unshare = flags & FAULT_FLAG_UNSHARE; pte_t pte = huge_ptep_get(ptep); struct hstate *h = hstate_vma(vma); - struct page *old_page; + struct folio *old_folio; struct folio *new_folio; int outside_reserve = 0; vm_fault_t ret = 0; @@ -5571,7 +5568,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } - old_page = pte_page(pte); + old_folio = page_folio(pte_page(pte)); delayacct_wpcopy_start(); @@ -5580,17 +5577,17 @@ retry_avoidcopy: * If no-one else is actually using this page, we're the exclusive * owner and can reuse this page. */ - if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { - if (!PageAnonExclusive(old_page)) - page_move_anon_rmap(old_page, vma); + if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) { + if (!PageAnonExclusive(&old_folio->page)) + page_move_anon_rmap(&old_folio->page, vma); if (likely(!unshare)) set_huge_ptep_writable(vma, haddr, ptep); delayacct_wpcopy_end(); return 0; } - VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page), - old_page); + VM_BUG_ON_PAGE(folio_test_anon(old_folio) && + PageAnonExclusive(&old_folio->page), &old_folio->page); /* * If the process that created a MAP_PRIVATE mapping is about to @@ -5602,10 +5599,10 @@ retry_avoidcopy: * of the full address range. */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && - page_folio(old_page) != pagecache_folio) + old_folio != pagecache_folio) outside_reserve = 1; - get_page(old_page); + folio_get(old_folio); /* * Drop page table lock as buddy allocator may be called. It will @@ -5627,7 +5624,7 @@ retry_avoidcopy: pgoff_t idx; u32 hash; - put_page(old_page); + folio_put(old_folio); /* * Drop hugetlb_fault_mutex and vma_lock before * unmapping. unmapping needs to hold vma_lock @@ -5642,7 +5639,7 @@ retry_avoidcopy: hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - unmap_ref_private(mm, vma, old_page, haddr); + unmap_ref_private(mm, vma, &old_folio->page, haddr); mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vma_lock_read(vma); @@ -5672,7 +5669,7 @@ retry_avoidcopy: goto out_release_all; } - if (copy_user_large_folio(new_folio, page_folio(old_page), address, vma)) { + if (copy_user_large_folio(new_folio, old_folio, address, vma)) { ret = VM_FAULT_HWPOISON_LARGE; goto out_release_all; } @@ -5694,14 +5691,14 @@ retry_avoidcopy: /* Break COW or unshare */ huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); - page_remove_rmap(old_page, vma, true); + page_remove_rmap(&old_folio->page, vma, true); hugepage_add_new_anon_rmap(new_folio, vma, haddr); if (huge_pte_uffd_wp(pte)) newpte = huge_pte_mkuffd_wp(newpte); set_huge_pte_at(mm, haddr, ptep, newpte); folio_set_hugetlb_migratable(new_folio); /* Make the old page be freed below */ - new_folio = page_folio(old_page); + new_folio = old_folio; } spin_unlock(ptl); mmu_notifier_invalidate_range_end(&range); @@ -5710,11 +5707,11 @@ out_release_all: * No restore in case of successful pagetable update (Break COW or * unshare) */ - if (new_folio != page_folio(old_page)) + if (new_folio != old_folio) restore_reserve_on_error(h, vma, haddr, new_folio); folio_put(new_folio); out_release_old: - put_page(old_page); + folio_put(old_folio); spin_lock(ptl); /* Caller expects lock to be held */ @@ -5731,13 +5728,13 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = vma_hugecache_offset(h, vma, address); - bool present; - - rcu_read_lock(); - present = page_cache_next_miss(mapping, idx, 1) != idx; - rcu_read_unlock(); + struct folio *folio; - return present; + folio = filemap_get_folio(mapping, idx); + if (IS_ERR(folio)) + return false; + folio_put(folio); + return true; } int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, @@ -6062,7 +6059,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, vm_fault_t ret; u32 hash; pgoff_t idx; - struct page *page = NULL; + struct folio *folio = NULL; struct folio *pagecache_folio = NULL; struct hstate *h = hstate_vma(vma); struct address_space *mapping; @@ -6179,16 +6176,16 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* * hugetlb_wp() requires page locks of pte_page(entry) and * pagecache_folio, so here we need take the former one - * when page != pagecache_folio or !pagecache_folio. + * when folio != pagecache_folio or !pagecache_folio. */ - page = pte_page(entry); - if (page_folio(page) != pagecache_folio) - if (!trylock_page(page)) { + folio = page_folio(pte_page(entry)); + if (folio != pagecache_folio) + if (!folio_trylock(folio)) { need_wait_lock = 1; goto out_ptl; } - get_page(page); + folio_get(folio); if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!huge_pte_write(entry)) { @@ -6204,9 +6201,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, haddr, ptep); out_put_page: - if (page_folio(page) != pagecache_folio) - unlock_page(page); - put_page(page); + if (folio != pagecache_folio) + folio_unlock(folio); + folio_put(folio); out_ptl: spin_unlock(ptl); @@ -6225,7 +6222,7 @@ out_mutex: * here without taking refcount. */ if (need_wait_lock) - wait_on_page_locked(page); + folio_wait_locked(folio); return ret; } @@ -6425,17 +6422,14 @@ out_release_nounlock: } #endif /* CONFIG_USERFAULTFD */ -static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma, - int refs, struct page **pages, - struct vm_area_struct **vmas) +static void record_subpages(struct page *page, struct vm_area_struct *vma, + int refs, struct page **pages) { int nr; for (nr = 0; nr < refs; nr++) { if (likely(pages)) pages[nr] = nth_page(page, nr); - if (vmas) - vmas[nr] = vma; } } @@ -6508,9 +6502,9 @@ out_unlock: } long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct page **pages, struct vm_area_struct **vmas, - unsigned long *position, unsigned long *nr_pages, - long i, unsigned int flags, int *locked) + struct page **pages, unsigned long *position, + unsigned long *nr_pages, long i, unsigned int flags, + int *locked) { unsigned long pfn_offset; unsigned long vaddr = *position; @@ -6638,7 +6632,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * If subpage information not requested, update counters * and skip the same_page loop below. */ - if (!pages && !vmas && !pfn_offset && + if (!pages && !pfn_offset && (vaddr + huge_page_size(h) < vma->vm_end) && (remainder >= pages_per_huge_page(h))) { vaddr += huge_page_size(h); @@ -6653,11 +6647,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, refs = min3(pages_per_huge_page(h) - pfn_offset, remainder, (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT); - if (pages || vmas) - record_subpages_vmas(nth_page(page, pfn_offset), - vma, refs, - likely(pages) ? pages + i : NULL, - vmas ? vmas + i : NULL); + if (pages) + record_subpages(nth_page(page, pfn_offset), + vma, refs, + likely(pages) ? pages + i : NULL); if (pages) { /* @@ -7137,7 +7130,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long saddr; pte_t *spte = NULL; pte_t *pte; - spinlock_t *ptl; i_mmap_lock_read(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { @@ -7158,7 +7150,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, if (!spte) goto out; - ptl = huge_pte_lock(hstate_vma(vma), mm, spte); + spin_lock(&mm->page_table_lock); if (pud_none(*pud)) { pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); @@ -7166,7 +7158,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, } else { put_page(virt_to_page(spte)); } - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); i_mmap_unlock_read(mapping); @@ -7254,7 +7246,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, pte = (pte_t *)pmd_alloc(mm, pud, addr); } } - BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte)); + BUG_ON(pte && pte_present(ptep_get(pte)) && !pte_huge(ptep_get(pte))); return pte; } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 27f001e0f0a2..c2007ef5e9b0 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -105,7 +105,7 @@ static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, * remapping (which is calling @walk->remap_pte). */ if (!walk->reuse_page) { - walk->reuse_page = pte_page(*pte); + walk->reuse_page = pte_page(ptep_get(pte)); /* * Because the reuse address is part of the range that we are * walking, skip the reuse address range. @@ -239,7 +239,7 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, * to the tail pages. */ pgprot_t pgprot = PAGE_KERNEL_RO; - struct page *page = pte_page(*pte); + struct page *page = pte_page(ptep_get(pte)); pte_t entry; /* Remapping the head page requires r/w */ @@ -286,7 +286,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, struct page *page; void *to; - BUG_ON(pte_page(*pte) != walk->reuse_page); + BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page); page = list_first_entry(walk->vmemmap_pages, struct page, lru); list_del(&page->lru); @@ -384,8 +384,9 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, } static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, - gfp_t gfp_mask, struct list_head *list) + struct list_head *list) { + gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_THISNODE; unsigned long nr_pages = (end - start) >> PAGE_SHIFT; int nid = page_to_nid((struct page *)start); struct page *page, *next; @@ -413,12 +414,11 @@ out: * @end: end address of the vmemmap virtual address range that we want to * remap. * @reuse: reuse address. - * @gfp_mask: GFP flag for allocating vmemmap pages. * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_alloc(unsigned long start, unsigned long end, - unsigned long reuse, gfp_t gfp_mask) + unsigned long reuse) { LIST_HEAD(vmemmap_pages); struct vmemmap_remap_walk walk = { @@ -430,7 +430,7 @@ static int vmemmap_remap_alloc(unsigned long start, unsigned long end, /* See the comment in the vmemmap_remap_free(). */ BUG_ON(start - reuse != PAGE_SIZE); - if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages)) + if (alloc_vmemmap_page_list(start, end, &vmemmap_pages)) return -ENOMEM; mmap_read_lock(&init_mm); @@ -476,8 +476,7 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) * When a HugeTLB page is freed to the buddy allocator, previously * discarded vmemmap pages must be allocated and remapping. */ - ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, - GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); + ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse); if (!ret) { ClearHPageVmemmapOptimized(head); static_branch_dec(&hugetlb_optimize_vmemmap_key); diff --git a/mm/internal.h b/mm/internal.h index 68410c6d97ac..a7d9e980429a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -133,8 +133,8 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio); bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end); long invalidate_inode_page(struct page *page); -unsigned long invalidate_mapping_pagevec(struct address_space *mapping, - pgoff_t start, pgoff_t end, unsigned long *nr_pagevec); +unsigned long mapping_try_invalidate(struct address_space *mapping, + pgoff_t start, pgoff_t end, unsigned long *nr_failed); /** * folio_evictable - Test whether a folio is evictable. @@ -179,12 +179,6 @@ extern unsigned long highest_memmap_pfn; #define MAX_RECLAIM_RETRIES 16 /* - * in mm/early_ioremap.c - */ -pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, - unsigned long size, pgprot_t prot); - -/* * in mm/vmscan.c: */ bool isolate_lru_page(struct page *page); @@ -208,10 +202,12 @@ extern char * const zone_names[MAX_NR_ZONES]; /* perform sanity checks on struct pages being allocated or freed */ DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled); -static inline bool is_check_pages_enabled(void) -{ - return static_branch_unlikely(&check_pages_enabled); -} +extern int min_free_kbytes; + +void setup_per_zone_wmarks(void); +void calculate_min_free_kbytes(void); +int __meminit init_per_zone_wmark_min(void); +void page_alloc_sysctl_init(void); /* * Structure for holding the mostly immutable allocation parameters passed @@ -371,6 +367,13 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, return __pageblock_pfn_to_page(start_pfn, end_pfn, zone); } +void set_zone_contiguous(struct zone *zone); + +static inline void clear_zone_contiguous(struct zone *zone) +{ + zone->contiguous = false; +} + extern int __isolate_free_page(struct page *page, unsigned int order); extern void __putback_isolated_page(struct page *page, unsigned int order, int mt); @@ -378,12 +381,27 @@ extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); extern void __free_pages_core(struct page *page, unsigned int order); +/* + * This will have no effect, other than possibly generating a warning, if the + * caller passes in a non-large folio. + */ +static inline void folio_set_order(struct folio *folio, unsigned int order) +{ + if (WARN_ON_ONCE(!order || !folio_test_large(folio))) + return; + + folio->_folio_order = order; +#ifdef CONFIG_64BIT + folio->_folio_nr_pages = 1U << order; +#endif +} + static inline void prep_compound_head(struct page *page, unsigned int order) { struct folio *folio = (struct folio *)page; - set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); - set_compound_order(page, order); + folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR); + folio_set_order(folio, order); atomic_set(&folio->_entire_mapcount, -1); atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); @@ -416,27 +434,12 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, int nid, bool exact_nid); -int split_free_page(struct page *free_page, - unsigned int order, unsigned long split_pfn_offset); +void memmap_init_range(unsigned long, int, unsigned long, unsigned long, + unsigned long, enum meminit_context, struct vmem_altmap *, int); -/* - * This will have no effect, other than possibly generating a warning, if the - * caller passes in a non-large folio. - */ -static inline void folio_set_order(struct folio *folio, unsigned int order) -{ - if (WARN_ON_ONCE(!folio_test_large(folio))) - return; - folio->_folio_order = order; -#ifdef CONFIG_64BIT - /* - * When hugetlb dissolves a folio, we need to clear the tail - * page, rather than setting nr_pages to 1. - */ - folio->_folio_nr_pages = order ? 1U << order : 0; -#endif -} +int split_free_page(struct page *free_page, + unsigned int order, unsigned long split_pfn_offset); #if defined CONFIG_COMPACTION || defined CONFIG_CMA @@ -563,8 +566,8 @@ extern long populate_vma_page_range(struct vm_area_struct *vma, extern long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool write, int *locked); -extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, - unsigned long len); +extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, + unsigned long bytes); /* * mlock_vma_folio() and munlock_vma_folio(): * should be called with vma's mmap_lock held for read or write, @@ -1047,17 +1050,17 @@ static inline void vma_iter_store(struct vma_iterator *vmi, { #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) - if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.index > vma->vm_start)) { - printk("%lu > %lu\n", vmi->mas.index, vma->vm_start); - printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end); - printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last); - mt_dump(vmi->mas.tree); + if (MAS_WARN_ON(&vmi->mas, vmi->mas.node != MAS_START && + vmi->mas.index > vma->vm_start)) { + pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n", + vmi->mas.index, vma->vm_start, vma->vm_start, + vma->vm_end, vmi->mas.index, vmi->mas.last); } - if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.last < vma->vm_start)) { - printk("%lu < %lu\n", vmi->mas.last, vma->vm_start); - printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end); - printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last); - mt_dump(vmi->mas.tree); + if (MAS_WARN_ON(&vmi->mas, vmi->mas.node != MAS_START && + vmi->mas.last < vma->vm_start)) { + pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n", + vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end, + vmi->mas.index, vmi->mas.last); } #endif diff --git a/mm/kasan/common.c b/mm/kasan/common.c index b376a5d055e5..256930da578a 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -445,7 +445,7 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag bool __kasan_check_byte(const void *address, unsigned long ip) { if (!kasan_byte_accessible(address)) { - kasan_report((unsigned long)address, 1, false, ip); + kasan_report(address, 1, false, ip); return false; } return true; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index e5eef670735e..5b4c97baa656 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -40,39 +40,39 @@ * depending on memory access size X. */ -static __always_inline bool memory_is_poisoned_1(unsigned long addr) +static __always_inline bool memory_is_poisoned_1(const void *addr) { - s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr); + s8 shadow_value = *(s8 *)kasan_mem_to_shadow(addr); if (unlikely(shadow_value)) { - s8 last_accessible_byte = addr & KASAN_GRANULE_MASK; + s8 last_accessible_byte = (unsigned long)addr & KASAN_GRANULE_MASK; return unlikely(last_accessible_byte >= shadow_value); } return false; } -static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr, +static __always_inline bool memory_is_poisoned_2_4_8(const void *addr, unsigned long size) { - u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr); + u8 *shadow_addr = (u8 *)kasan_mem_to_shadow(addr); /* * Access crosses 8(shadow size)-byte boundary. Such access maps * into 2 shadow bytes, so we need to check them both. */ - if (unlikely(((addr + size - 1) & KASAN_GRANULE_MASK) < size - 1)) + if (unlikely((((unsigned long)addr + size - 1) & KASAN_GRANULE_MASK) < size - 1)) return *shadow_addr || memory_is_poisoned_1(addr + size - 1); return memory_is_poisoned_1(addr + size - 1); } -static __always_inline bool memory_is_poisoned_16(unsigned long addr) +static __always_inline bool memory_is_poisoned_16(const void *addr) { - u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); + u16 *shadow_addr = (u16 *)kasan_mem_to_shadow(addr); /* Unaligned 16-bytes access maps into 3 shadow bytes. */ - if (unlikely(!IS_ALIGNED(addr, KASAN_GRANULE_SIZE))) + if (unlikely(!IS_ALIGNED((unsigned long)addr, KASAN_GRANULE_SIZE))) return *shadow_addr || memory_is_poisoned_1(addr + 15); return *shadow_addr; @@ -120,26 +120,25 @@ static __always_inline unsigned long memory_is_nonzero(const void *start, return bytes_is_nonzero(start, (end - start) % 8); } -static __always_inline bool memory_is_poisoned_n(unsigned long addr, - size_t size) +static __always_inline bool memory_is_poisoned_n(const void *addr, size_t size) { unsigned long ret; - ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr), - kasan_mem_to_shadow((void *)addr + size - 1) + 1); + ret = memory_is_nonzero(kasan_mem_to_shadow(addr), + kasan_mem_to_shadow(addr + size - 1) + 1); if (unlikely(ret)) { - unsigned long last_byte = addr + size - 1; - s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); + const void *last_byte = addr + size - 1; + s8 *last_shadow = (s8 *)kasan_mem_to_shadow(last_byte); if (unlikely(ret != (unsigned long)last_shadow || - ((long)(last_byte & KASAN_GRANULE_MASK) >= *last_shadow))) + (((long)last_byte & KASAN_GRANULE_MASK) >= *last_shadow))) return true; } return false; } -static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) +static __always_inline bool memory_is_poisoned(const void *addr, size_t size) { if (__builtin_constant_p(size)) { switch (size) { @@ -159,7 +158,7 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) return memory_is_poisoned_n(addr, size); } -static __always_inline bool check_region_inline(unsigned long addr, +static __always_inline bool check_region_inline(const void *addr, size_t size, bool write, unsigned long ret_ip) { @@ -172,7 +171,7 @@ static __always_inline bool check_region_inline(unsigned long addr, if (unlikely(addr + size < addr)) return !kasan_report(addr, size, write, ret_ip); - if (unlikely(!addr_has_metadata((void *)addr))) + if (unlikely(!addr_has_metadata(addr))) return !kasan_report(addr, size, write, ret_ip); if (likely(!memory_is_poisoned(addr, size))) @@ -181,7 +180,7 @@ static __always_inline bool check_region_inline(unsigned long addr, return !kasan_report(addr, size, write, ret_ip); } -bool kasan_check_range(unsigned long addr, size_t size, bool write, +bool kasan_check_range(const void *addr, size_t size, bool write, unsigned long ret_ip) { return check_region_inline(addr, size, write, ret_ip); @@ -221,36 +220,37 @@ static void register_global(struct kasan_global *global) KASAN_GLOBAL_REDZONE, false); } -void __asan_register_globals(struct kasan_global *globals, size_t size) +void __asan_register_globals(void *ptr, ssize_t size) { int i; + struct kasan_global *globals = ptr; for (i = 0; i < size; i++) register_global(&globals[i]); } EXPORT_SYMBOL(__asan_register_globals); -void __asan_unregister_globals(struct kasan_global *globals, size_t size) +void __asan_unregister_globals(void *ptr, ssize_t size) { } EXPORT_SYMBOL(__asan_unregister_globals); #define DEFINE_ASAN_LOAD_STORE(size) \ - void __asan_load##size(unsigned long addr) \ + void __asan_load##size(void *addr) \ { \ check_region_inline(addr, size, false, _RET_IP_); \ } \ EXPORT_SYMBOL(__asan_load##size); \ __alias(__asan_load##size) \ - void __asan_load##size##_noabort(unsigned long); \ + void __asan_load##size##_noabort(void *); \ EXPORT_SYMBOL(__asan_load##size##_noabort); \ - void __asan_store##size(unsigned long addr) \ + void __asan_store##size(void *addr) \ { \ check_region_inline(addr, size, true, _RET_IP_); \ } \ EXPORT_SYMBOL(__asan_store##size); \ __alias(__asan_store##size) \ - void __asan_store##size##_noabort(unsigned long); \ + void __asan_store##size##_noabort(void *); \ EXPORT_SYMBOL(__asan_store##size##_noabort) DEFINE_ASAN_LOAD_STORE(1); @@ -259,24 +259,24 @@ DEFINE_ASAN_LOAD_STORE(4); DEFINE_ASAN_LOAD_STORE(8); DEFINE_ASAN_LOAD_STORE(16); -void __asan_loadN(unsigned long addr, size_t size) +void __asan_loadN(void *addr, ssize_t size) { kasan_check_range(addr, size, false, _RET_IP_); } EXPORT_SYMBOL(__asan_loadN); __alias(__asan_loadN) -void __asan_loadN_noabort(unsigned long, size_t); +void __asan_loadN_noabort(void *, ssize_t); EXPORT_SYMBOL(__asan_loadN_noabort); -void __asan_storeN(unsigned long addr, size_t size) +void __asan_storeN(void *addr, ssize_t size) { kasan_check_range(addr, size, true, _RET_IP_); } EXPORT_SYMBOL(__asan_storeN); __alias(__asan_storeN) -void __asan_storeN_noabort(unsigned long, size_t); +void __asan_storeN_noabort(void *, ssize_t); EXPORT_SYMBOL(__asan_storeN_noabort); /* to shut up compiler complaints */ @@ -284,7 +284,7 @@ void __asan_handle_no_return(void) {} EXPORT_SYMBOL(__asan_handle_no_return); /* Emitted by compiler to poison alloca()ed objects. */ -void __asan_alloca_poison(unsigned long addr, size_t size) +void __asan_alloca_poison(void *addr, ssize_t size) { size_t rounded_up_size = round_up(size, KASAN_GRANULE_SIZE); size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) - @@ -295,7 +295,7 @@ void __asan_alloca_poison(unsigned long addr, size_t size) KASAN_ALLOCA_REDZONE_SIZE); const void *right_redzone = (const void *)(addr + rounded_up_size); - WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE)); + WARN_ON(!IS_ALIGNED((unsigned long)addr, KASAN_ALLOCA_REDZONE_SIZE)); kasan_unpoison((const void *)(addr + rounded_down_size), size - rounded_down_size, false); @@ -307,18 +307,18 @@ void __asan_alloca_poison(unsigned long addr, size_t size) EXPORT_SYMBOL(__asan_alloca_poison); /* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. */ -void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom) +void __asan_allocas_unpoison(void *stack_top, ssize_t stack_bottom) { - if (unlikely(!stack_top || stack_top > stack_bottom)) + if (unlikely(!stack_top || stack_top > (void *)stack_bottom)) return; - kasan_unpoison(stack_top, stack_bottom - stack_top, false); + kasan_unpoison(stack_top, (void *)stack_bottom - stack_top, false); } EXPORT_SYMBOL(__asan_allocas_unpoison); /* Emitted by the compiler to [un]poison local variables. */ #define DEFINE_ASAN_SET_SHADOW(byte) \ - void __asan_set_shadow_##byte(const void *addr, size_t size) \ + void __asan_set_shadow_##byte(const void *addr, ssize_t size) \ { \ __memset((void *)addr, 0x##byte, size); \ } \ @@ -488,7 +488,7 @@ static void __kasan_record_aux_stack(void *addr, bool can_alloc) return; alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0]; - alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT, can_alloc); + alloc_meta->aux_stack[0] = kasan_save_stack(0, can_alloc); } void kasan_record_aux_stack(void *addr) @@ -518,7 +518,7 @@ void kasan_save_free_info(struct kmem_cache *cache, void *object) if (!free_meta) return; - kasan_set_track(&free_meta->free_track, GFP_NOWAIT); + kasan_set_track(&free_meta->free_track, 0); /* The object was freed and has free track set. */ *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREETRACK; } diff --git a/mm/kasan/init.c b/mm/kasan/init.c index cc64ed6858c6..dcfec277e839 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -286,7 +286,7 @@ static void kasan_free_pte(pte_t *pte_start, pmd_t *pmd) for (i = 0; i < PTRS_PER_PTE; i++) { pte = pte_start + i; - if (!pte_none(*pte)) + if (!pte_none(ptep_get(pte))) return; } @@ -343,16 +343,19 @@ static void kasan_remove_pte_table(pte_t *pte, unsigned long addr, unsigned long end) { unsigned long next; + pte_t ptent; for (; addr < end; addr = next, pte++) { next = (addr + PAGE_SIZE) & PAGE_MASK; if (next > end) next = end; - if (!pte_present(*pte)) + ptent = ptep_get(pte); + + if (!pte_present(ptent)) continue; - if (WARN_ON(!kasan_early_shadow_page_entry(*pte))) + if (WARN_ON(!kasan_early_shadow_page_entry(ptent))) continue; pte_clear(&init_mm, addr, pte); } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index f5e4f5f2ba20..b799f11e45dc 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -198,13 +198,13 @@ enum kasan_report_type { struct kasan_report_info { /* Filled in by kasan_report_*(). */ enum kasan_report_type type; - void *access_addr; + const void *access_addr; size_t access_size; bool is_write; unsigned long ip; /* Filled in by the common reporting code. */ - void *first_bad_addr; + const void *first_bad_addr; struct kmem_cache *cache; void *object; size_t alloc_size; @@ -311,7 +311,7 @@ static __always_inline bool addr_has_metadata(const void *addr) * @ret_ip: return address * @return: true if access was valid, false if invalid */ -bool kasan_check_range(unsigned long addr, size_t size, bool write, +bool kasan_check_range(const void *addr, size_t size, bool write, unsigned long ret_ip); #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ @@ -323,7 +323,7 @@ static __always_inline bool addr_has_metadata(const void *addr) #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ -void *kasan_find_first_bad_addr(void *addr, size_t size); +const void *kasan_find_first_bad_addr(const void *addr, size_t size); size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache); void kasan_complete_mode_report_info(struct kasan_report_info *info); void kasan_metadata_fetch_row(char *buffer, void *row); @@ -346,7 +346,7 @@ void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object); static inline void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object) { } #endif -bool kasan_report(unsigned long addr, size_t size, +bool kasan_report(const void *addr, size_t size, bool is_write, unsigned long ip); void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report_type type); @@ -571,79 +571,82 @@ void kasan_restore_multi_shot(bool enabled); */ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark); -void __asan_register_globals(struct kasan_global *globals, size_t size); -void __asan_unregister_globals(struct kasan_global *globals, size_t size); +void __asan_register_globals(void *globals, ssize_t size); +void __asan_unregister_globals(void *globals, ssize_t size); void __asan_handle_no_return(void); -void __asan_alloca_poison(unsigned long addr, size_t size); -void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom); - -void __asan_load1(unsigned long addr); -void __asan_store1(unsigned long addr); -void __asan_load2(unsigned long addr); -void __asan_store2(unsigned long addr); -void __asan_load4(unsigned long addr); -void __asan_store4(unsigned long addr); -void __asan_load8(unsigned long addr); -void __asan_store8(unsigned long addr); -void __asan_load16(unsigned long addr); -void __asan_store16(unsigned long addr); -void __asan_loadN(unsigned long addr, size_t size); -void __asan_storeN(unsigned long addr, size_t size); - -void __asan_load1_noabort(unsigned long addr); -void __asan_store1_noabort(unsigned long addr); -void __asan_load2_noabort(unsigned long addr); -void __asan_store2_noabort(unsigned long addr); -void __asan_load4_noabort(unsigned long addr); -void __asan_store4_noabort(unsigned long addr); -void __asan_load8_noabort(unsigned long addr); -void __asan_store8_noabort(unsigned long addr); -void __asan_load16_noabort(unsigned long addr); -void __asan_store16_noabort(unsigned long addr); -void __asan_loadN_noabort(unsigned long addr, size_t size); -void __asan_storeN_noabort(unsigned long addr, size_t size); - -void __asan_report_load1_noabort(unsigned long addr); -void __asan_report_store1_noabort(unsigned long addr); -void __asan_report_load2_noabort(unsigned long addr); -void __asan_report_store2_noabort(unsigned long addr); -void __asan_report_load4_noabort(unsigned long addr); -void __asan_report_store4_noabort(unsigned long addr); -void __asan_report_load8_noabort(unsigned long addr); -void __asan_report_store8_noabort(unsigned long addr); -void __asan_report_load16_noabort(unsigned long addr); -void __asan_report_store16_noabort(unsigned long addr); -void __asan_report_load_n_noabort(unsigned long addr, size_t size); -void __asan_report_store_n_noabort(unsigned long addr, size_t size); - -void __asan_set_shadow_00(const void *addr, size_t size); -void __asan_set_shadow_f1(const void *addr, size_t size); -void __asan_set_shadow_f2(const void *addr, size_t size); -void __asan_set_shadow_f3(const void *addr, size_t size); -void __asan_set_shadow_f5(const void *addr, size_t size); -void __asan_set_shadow_f8(const void *addr, size_t size); - -void *__asan_memset(void *addr, int c, size_t len); -void *__asan_memmove(void *dest, const void *src, size_t len); -void *__asan_memcpy(void *dest, const void *src, size_t len); - -void __hwasan_load1_noabort(unsigned long addr); -void __hwasan_store1_noabort(unsigned long addr); -void __hwasan_load2_noabort(unsigned long addr); -void __hwasan_store2_noabort(unsigned long addr); -void __hwasan_load4_noabort(unsigned long addr); -void __hwasan_store4_noabort(unsigned long addr); -void __hwasan_load8_noabort(unsigned long addr); -void __hwasan_store8_noabort(unsigned long addr); -void __hwasan_load16_noabort(unsigned long addr); -void __hwasan_store16_noabort(unsigned long addr); -void __hwasan_loadN_noabort(unsigned long addr, size_t size); -void __hwasan_storeN_noabort(unsigned long addr, size_t size); - -void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size); - -void *__hwasan_memset(void *addr, int c, size_t len); -void *__hwasan_memmove(void *dest, const void *src, size_t len); -void *__hwasan_memcpy(void *dest, const void *src, size_t len); +void __asan_alloca_poison(void *, ssize_t size); +void __asan_allocas_unpoison(void *stack_top, ssize_t stack_bottom); + +void __asan_load1(void *); +void __asan_store1(void *); +void __asan_load2(void *); +void __asan_store2(void *); +void __asan_load4(void *); +void __asan_store4(void *); +void __asan_load8(void *); +void __asan_store8(void *); +void __asan_load16(void *); +void __asan_store16(void *); +void __asan_loadN(void *, ssize_t size); +void __asan_storeN(void *, ssize_t size); + +void __asan_load1_noabort(void *); +void __asan_store1_noabort(void *); +void __asan_load2_noabort(void *); +void __asan_store2_noabort(void *); +void __asan_load4_noabort(void *); +void __asan_store4_noabort(void *); +void __asan_load8_noabort(void *); +void __asan_store8_noabort(void *); +void __asan_load16_noabort(void *); +void __asan_store16_noabort(void *); +void __asan_loadN_noabort(void *, ssize_t size); +void __asan_storeN_noabort(void *, ssize_t size); + +void __asan_report_load1_noabort(void *); +void __asan_report_store1_noabort(void *); +void __asan_report_load2_noabort(void *); +void __asan_report_store2_noabort(void *); +void __asan_report_load4_noabort(void *); +void __asan_report_store4_noabort(void *); +void __asan_report_load8_noabort(void *); +void __asan_report_store8_noabort(void *); +void __asan_report_load16_noabort(void *); +void __asan_report_store16_noabort(void *); +void __asan_report_load_n_noabort(void *, ssize_t size); +void __asan_report_store_n_noabort(void *, ssize_t size); + +void __asan_set_shadow_00(const void *addr, ssize_t size); +void __asan_set_shadow_f1(const void *addr, ssize_t size); +void __asan_set_shadow_f2(const void *addr, ssize_t size); +void __asan_set_shadow_f3(const void *addr, ssize_t size); +void __asan_set_shadow_f5(const void *addr, ssize_t size); +void __asan_set_shadow_f8(const void *addr, ssize_t size); + +void *__asan_memset(void *addr, int c, ssize_t len); +void *__asan_memmove(void *dest, const void *src, ssize_t len); +void *__asan_memcpy(void *dest, const void *src, ssize_t len); + +void __hwasan_load1_noabort(void *); +void __hwasan_store1_noabort(void *); +void __hwasan_load2_noabort(void *); +void __hwasan_store2_noabort(void *); +void __hwasan_load4_noabort(void *); +void __hwasan_store4_noabort(void *); +void __hwasan_load8_noabort(void *); +void __hwasan_store8_noabort(void *); +void __hwasan_load16_noabort(void *); +void __hwasan_store16_noabort(void *); +void __hwasan_loadN_noabort(void *, ssize_t size); +void __hwasan_storeN_noabort(void *, ssize_t size); + +void __hwasan_tag_memory(void *, u8 tag, ssize_t size); + +void *__hwasan_memset(void *addr, int c, ssize_t len); +void *__hwasan_memmove(void *dest, const void *src, ssize_t len); +void *__hwasan_memcpy(void *dest, const void *src, ssize_t len); + +void kasan_tag_mismatch(void *addr, unsigned long access_info, + unsigned long ret_ip); #endif /* __MM_KASAN_KASAN_H */ diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 892a9dc9d4d3..ca4b6ff080a6 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -43,6 +43,7 @@ enum kasan_arg_fault { KASAN_ARG_FAULT_DEFAULT, KASAN_ARG_FAULT_REPORT, KASAN_ARG_FAULT_PANIC, + KASAN_ARG_FAULT_PANIC_ON_WRITE, }; static enum kasan_arg_fault kasan_arg_fault __ro_after_init = KASAN_ARG_FAULT_DEFAULT; @@ -57,6 +58,8 @@ static int __init early_kasan_fault(char *arg) kasan_arg_fault = KASAN_ARG_FAULT_REPORT; else if (!strcmp(arg, "panic")) kasan_arg_fault = KASAN_ARG_FAULT_PANIC; + else if (!strcmp(arg, "panic_on_write")) + kasan_arg_fault = KASAN_ARG_FAULT_PANIC_ON_WRITE; else return -EINVAL; @@ -211,7 +214,7 @@ static void start_report(unsigned long *flags, bool sync) pr_err("==================================================================\n"); } -static void end_report(unsigned long *flags, void *addr) +static void end_report(unsigned long *flags, const void *addr, bool is_write) { if (addr) trace_error_report_end(ERROR_DETECTOR_KASAN, @@ -220,8 +223,18 @@ static void end_report(unsigned long *flags, void *addr) spin_unlock_irqrestore(&report_lock, *flags); if (!test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) check_panic_on_warn("KASAN"); - if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC) + switch (kasan_arg_fault) { + case KASAN_ARG_FAULT_DEFAULT: + case KASAN_ARG_FAULT_REPORT: + break; + case KASAN_ARG_FAULT_PANIC: panic("kasan.fault=panic set ...\n"); + break; + case KASAN_ARG_FAULT_PANIC_ON_WRITE: + if (is_write) + panic("kasan.fault=panic_on_write set ...\n"); + break; + } add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); lockdep_on(); report_suppress_stop(); @@ -450,8 +463,8 @@ static void print_memory_metadata(const void *addr) static void print_report(struct kasan_report_info *info) { - void *addr = kasan_reset_tag(info->access_addr); - u8 tag = get_tag(info->access_addr); + void *addr = kasan_reset_tag((void *)info->access_addr); + u8 tag = get_tag((void *)info->access_addr); print_error_description(info); if (addr_has_metadata(addr)) @@ -468,12 +481,12 @@ static void print_report(struct kasan_report_info *info) static void complete_report_info(struct kasan_report_info *info) { - void *addr = kasan_reset_tag(info->access_addr); + void *addr = kasan_reset_tag((void *)info->access_addr); struct slab *slab; if (info->type == KASAN_REPORT_ACCESS) info->first_bad_addr = kasan_find_first_bad_addr( - info->access_addr, info->access_size); + (void *)info->access_addr, info->access_size); else info->first_bad_addr = addr; @@ -536,7 +549,11 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty print_report(&info); - end_report(&flags, ptr); + /* + * Invalid free is considered a "write" since the allocator's metadata + * updates involves writes. + */ + end_report(&flags, ptr, true); } /* @@ -544,11 +561,10 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty * user_access_save/restore(): kasan_report_invalid_free() cannot be called * from a UACCESS region, and kasan_report_async() is not used on x86. */ -bool kasan_report(unsigned long addr, size_t size, bool is_write, +bool kasan_report(const void *addr, size_t size, bool is_write, unsigned long ip) { bool ret = true; - void *ptr = (void *)addr; unsigned long ua_flags = user_access_save(); unsigned long irq_flags; struct kasan_report_info info; @@ -562,7 +578,7 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write, memset(&info, 0, sizeof(info)); info.type = KASAN_REPORT_ACCESS; - info.access_addr = ptr; + info.access_addr = addr; info.access_size = size; info.is_write = is_write; info.ip = ip; @@ -571,7 +587,7 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write, print_report(&info); - end_report(&irq_flags, ptr); + end_report(&irq_flags, (void *)addr, is_write); out: user_access_restore(ua_flags); @@ -597,7 +613,11 @@ void kasan_report_async(void) pr_err("Asynchronous fault: no details available\n"); pr_err("\n"); dump_stack_lvl(KERN_ERR); - end_report(&flags, NULL); + /* + * Conservatively set is_write=true, because no details are available. + * In this mode, kasan.fault=panic_on_write is like kasan.fault=panic. + */ + end_report(&flags, NULL, true); } #endif /* CONFIG_KASAN_HW_TAGS */ diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 87d39bc0a673..51a1e8a8877f 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -30,9 +30,9 @@ #include "kasan.h" #include "../slab.h" -void *kasan_find_first_bad_addr(void *addr, size_t size) +const void *kasan_find_first_bad_addr(const void *addr, size_t size) { - void *p = addr; + const void *p = addr; if (!addr_has_metadata(p)) return p; @@ -362,14 +362,14 @@ void kasan_print_address_stack_frame(const void *addr) #endif /* CONFIG_KASAN_STACK */ #define DEFINE_ASAN_REPORT_LOAD(size) \ -void __asan_report_load##size##_noabort(unsigned long addr) \ +void __asan_report_load##size##_noabort(void *addr) \ { \ kasan_report(addr, size, false, _RET_IP_); \ } \ EXPORT_SYMBOL(__asan_report_load##size##_noabort) #define DEFINE_ASAN_REPORT_STORE(size) \ -void __asan_report_store##size##_noabort(unsigned long addr) \ +void __asan_report_store##size##_noabort(void *addr) \ { \ kasan_report(addr, size, true, _RET_IP_); \ } \ @@ -386,13 +386,13 @@ DEFINE_ASAN_REPORT_STORE(4); DEFINE_ASAN_REPORT_STORE(8); DEFINE_ASAN_REPORT_STORE(16); -void __asan_report_load_n_noabort(unsigned long addr, size_t size) +void __asan_report_load_n_noabort(void *addr, ssize_t size) { kasan_report(addr, size, false, _RET_IP_); } EXPORT_SYMBOL(__asan_report_load_n_noabort); -void __asan_report_store_n_noabort(unsigned long addr, size_t size) +void __asan_report_store_n_noabort(void *addr, ssize_t size) { kasan_report(addr, size, true, _RET_IP_); } diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c index 32e80f78de7d..065e1b2fc484 100644 --- a/mm/kasan/report_hw_tags.c +++ b/mm/kasan/report_hw_tags.c @@ -15,7 +15,7 @@ #include "kasan.h" -void *kasan_find_first_bad_addr(void *addr, size_t size) +const void *kasan_find_first_bad_addr(const void *addr, size_t size) { /* * Hardware Tag-Based KASAN only calls this function for normal memory diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c index 8b1f5a73ee6d..689e94f9fe3c 100644 --- a/mm/kasan/report_sw_tags.c +++ b/mm/kasan/report_sw_tags.c @@ -30,7 +30,7 @@ #include "kasan.h" #include "../slab.h" -void *kasan_find_first_bad_addr(void *addr, size_t size) +const void *kasan_find_first_bad_addr(const void *addr, size_t size) { u8 tag = get_tag(addr); void *p = kasan_reset_tag(addr); diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index c8b86f3273b5..dd772f9d0f08 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -28,13 +28,13 @@ bool __kasan_check_read(const volatile void *p, unsigned int size) { - return kasan_check_range((unsigned long)p, size, false, _RET_IP_); + return kasan_check_range((void *)p, size, false, _RET_IP_); } EXPORT_SYMBOL(__kasan_check_read); bool __kasan_check_write(const volatile void *p, unsigned int size) { - return kasan_check_range((unsigned long)p, size, true, _RET_IP_); + return kasan_check_range((void *)p, size, true, _RET_IP_); } EXPORT_SYMBOL(__kasan_check_write); @@ -50,7 +50,7 @@ EXPORT_SYMBOL(__kasan_check_write); #undef memset void *memset(void *addr, int c, size_t len) { - if (!kasan_check_range((unsigned long)addr, len, true, _RET_IP_)) + if (!kasan_check_range(addr, len, true, _RET_IP_)) return NULL; return __memset(addr, c, len); @@ -60,8 +60,8 @@ void *memset(void *addr, int c, size_t len) #undef memmove void *memmove(void *dest, const void *src, size_t len) { - if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) || - !kasan_check_range((unsigned long)dest, len, true, _RET_IP_)) + if (!kasan_check_range(src, len, false, _RET_IP_) || + !kasan_check_range(dest, len, true, _RET_IP_)) return NULL; return __memmove(dest, src, len); @@ -71,17 +71,17 @@ void *memmove(void *dest, const void *src, size_t len) #undef memcpy void *memcpy(void *dest, const void *src, size_t len) { - if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) || - !kasan_check_range((unsigned long)dest, len, true, _RET_IP_)) + if (!kasan_check_range(src, len, false, _RET_IP_) || + !kasan_check_range(dest, len, true, _RET_IP_)) return NULL; return __memcpy(dest, src, len); } #endif -void *__asan_memset(void *addr, int c, size_t len) +void *__asan_memset(void *addr, int c, ssize_t len) { - if (!kasan_check_range((unsigned long)addr, len, true, _RET_IP_)) + if (!kasan_check_range(addr, len, true, _RET_IP_)) return NULL; return __memset(addr, c, len); @@ -89,10 +89,10 @@ void *__asan_memset(void *addr, int c, size_t len) EXPORT_SYMBOL(__asan_memset); #ifdef __HAVE_ARCH_MEMMOVE -void *__asan_memmove(void *dest, const void *src, size_t len) +void *__asan_memmove(void *dest, const void *src, ssize_t len) { - if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) || - !kasan_check_range((unsigned long)dest, len, true, _RET_IP_)) + if (!kasan_check_range(src, len, false, _RET_IP_) || + !kasan_check_range(dest, len, true, _RET_IP_)) return NULL; return __memmove(dest, src, len); @@ -100,10 +100,10 @@ void *__asan_memmove(void *dest, const void *src, size_t len) EXPORT_SYMBOL(__asan_memmove); #endif -void *__asan_memcpy(void *dest, const void *src, size_t len) +void *__asan_memcpy(void *dest, const void *src, ssize_t len) { - if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) || - !kasan_check_range((unsigned long)dest, len, true, _RET_IP_)) + if (!kasan_check_range(src, len, false, _RET_IP_) || + !kasan_check_range(dest, len, true, _RET_IP_)) return NULL; return __memcpy(dest, src, len); @@ -111,13 +111,13 @@ void *__asan_memcpy(void *dest, const void *src, size_t len) EXPORT_SYMBOL(__asan_memcpy); #ifdef CONFIG_KASAN_SW_TAGS -void *__hwasan_memset(void *addr, int c, size_t len) __alias(__asan_memset); +void *__hwasan_memset(void *addr, int c, ssize_t len) __alias(__asan_memset); EXPORT_SYMBOL(__hwasan_memset); #ifdef __HAVE_ARCH_MEMMOVE -void *__hwasan_memmove(void *dest, const void *src, size_t len) __alias(__asan_memmove); +void *__hwasan_memmove(void *dest, const void *src, ssize_t len) __alias(__asan_memmove); EXPORT_SYMBOL(__hwasan_memmove); #endif -void *__hwasan_memcpy(void *dest, const void *src, size_t len) __alias(__asan_memcpy); +void *__hwasan_memcpy(void *dest, const void *src, ssize_t len) __alias(__asan_memcpy); EXPORT_SYMBOL(__hwasan_memcpy); #endif @@ -226,7 +226,7 @@ static bool shadow_mapped(unsigned long addr) if (pmd_bad(*pmd)) return true; pte = pte_offset_kernel(pmd, addr); - return !pte_none(*pte); + return !pte_none(ptep_get(pte)); } static int __meminit kasan_mem_notifier(struct notifier_block *nb, @@ -317,7 +317,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, unsigned long page; pte_t pte; - if (likely(!pte_none(*ptep))) + if (likely(!pte_none(ptep_get(ptep)))) return 0; page = __get_free_page(GFP_KERNEL); @@ -328,7 +328,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL); spin_lock(&init_mm.page_table_lock); - if (likely(pte_none(*ptep))) { + if (likely(pte_none(ptep_get(ptep)))) { set_pte_at(&init_mm, addr, ptep, pte); page = 0; } @@ -418,11 +418,11 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, { unsigned long page; - page = (unsigned long)__va(pte_pfn(*ptep) << PAGE_SHIFT); + page = (unsigned long)__va(pte_pfn(ptep_get(ptep)) << PAGE_SHIFT); spin_lock(&init_mm.page_table_lock); - if (likely(!pte_none(*ptep))) { + if (likely(!pte_none(ptep_get(ptep)))) { pte_clear(&init_mm, addr, ptep); free_page(page); } diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 30da65fa02a1..220b5d4c6876 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -70,8 +70,8 @@ u8 kasan_random_tag(void) return (u8)(state % (KASAN_TAG_MAX + 1)); } -bool kasan_check_range(unsigned long addr, size_t size, bool write, - unsigned long ret_ip) +bool kasan_check_range(const void *addr, size_t size, bool write, + unsigned long ret_ip) { u8 tag; u8 *shadow_first, *shadow_last, *shadow; @@ -133,12 +133,12 @@ bool kasan_byte_accessible(const void *addr) } #define DEFINE_HWASAN_LOAD_STORE(size) \ - void __hwasan_load##size##_noabort(unsigned long addr) \ + void __hwasan_load##size##_noabort(void *addr) \ { \ - kasan_check_range(addr, size, false, _RET_IP_); \ + kasan_check_range(addr, size, false, _RET_IP_); \ } \ EXPORT_SYMBOL(__hwasan_load##size##_noabort); \ - void __hwasan_store##size##_noabort(unsigned long addr) \ + void __hwasan_store##size##_noabort(void *addr) \ { \ kasan_check_range(addr, size, true, _RET_IP_); \ } \ @@ -150,25 +150,25 @@ DEFINE_HWASAN_LOAD_STORE(4); DEFINE_HWASAN_LOAD_STORE(8); DEFINE_HWASAN_LOAD_STORE(16); -void __hwasan_loadN_noabort(unsigned long addr, unsigned long size) +void __hwasan_loadN_noabort(void *addr, ssize_t size) { kasan_check_range(addr, size, false, _RET_IP_); } EXPORT_SYMBOL(__hwasan_loadN_noabort); -void __hwasan_storeN_noabort(unsigned long addr, unsigned long size) +void __hwasan_storeN_noabort(void *addr, ssize_t size) { kasan_check_range(addr, size, true, _RET_IP_); } EXPORT_SYMBOL(__hwasan_storeN_noabort); -void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size) +void __hwasan_tag_memory(void *addr, u8 tag, ssize_t size) { - kasan_poison((void *)addr, size, tag, false); + kasan_poison(addr, size, tag, false); } EXPORT_SYMBOL(__hwasan_tag_memory); -void kasan_tag_mismatch(unsigned long addr, unsigned long access_info, +void kasan_tag_mismatch(void *addr, unsigned long access_info, unsigned long ret_ip) { kasan_report(addr, 1 << (access_info & 0xf), access_info & 0x10, diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 67a222586846..7dcfe341d48e 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -140,5 +140,5 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) void kasan_save_free_info(struct kmem_cache *cache, void *object) { - save_stack_info(cache, object, GFP_NOWAIT, true); + save_stack_info(cache, object, 0, true); } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6b9d39d65b73..3beb4ad2ee5e 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -88,7 +88,7 @@ static unsigned int khugepaged_max_ptes_swap __read_mostly; static unsigned int khugepaged_max_ptes_shared __read_mostly; #define MM_SLOTS_HASH_BITS 10 -static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); +static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct kmem_cache *mm_slot_cache __read_mostly; @@ -422,19 +422,17 @@ void __khugepaged_enter(struct mm_struct *mm) struct mm_slot *slot; int wakeup; + /* __khugepaged_exit() must not run from under us */ + VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); + if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) + return; + mm_slot = mm_slot_alloc(mm_slot_cache); if (!mm_slot) return; slot = &mm_slot->slot; - /* __khugepaged_exit() must not run from under us */ - VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); - if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { - mm_slot_free(mm_slot_cache, mm_slot); - return; - } - spin_lock(&khugepaged_mm_lock); mm_slot_insert(mm_slots_hash, mm, slot); /* @@ -513,7 +511,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, struct folio *folio, *tmp; while (--_pte >= pte) { - pte_t pteval = *_pte; + pte_t pteval = ptep_get(_pte); unsigned long pfn; if (pte_none(pteval)) @@ -557,7 +555,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { - pte_t pteval = *_pte; + pte_t pteval = ptep_get(_pte); if (pte_none(pteval) || (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))) { ++none_or_zero; @@ -701,7 +699,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { - pteval = *_pte; + pteval = ptep_get(_pte); if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); if (is_zero_pfn(pte_pfn(pteval))) { @@ -799,7 +797,7 @@ static int __collapse_huge_page_copy(pte_t *pte, */ for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR; _pte++, page++, _address += PAGE_SIZE) { - pteval = *_pte; + pteval = ptep_get(_pte); if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { clear_user_highpage(page, _address); continue; @@ -946,10 +944,6 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, return SCAN_SUCCEED; } -/* - * See pmd_trans_unstable() for how the result may change out from - * underneath us, even if we hold mmap_lock in read. - */ static int find_pmd_or_thp_or_none(struct mm_struct *mm, unsigned long address, pmd_t **pmd) @@ -961,11 +955,6 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm, return SCAN_PMD_NULL; pmde = pmdp_get_lockless(*pmd); - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - /* See comments in pmd_none_or_trans_huge_or_clear_bad() */ - barrier(); -#endif if (pmd_none(pmde)) return SCAN_PMD_NONE; if (!pmd_present(pmde)) @@ -998,9 +987,8 @@ static int check_pmd_still_valid(struct mm_struct *mm, * Only done if hpage_collapse_scan_pmd believes it is worthwhile. * * Called and returns without pte mapped or spinlocks held. - * Note that if false is returned, mmap_lock will be released. + * Returns result: if not SCAN_SUCCEED, mmap_lock has been released. */ - static int __collapse_huge_page_swapin(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, @@ -1009,23 +997,37 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, int swapped_in = 0; vm_fault_t ret = 0; unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE); + int result; + pte_t *pte = NULL; + spinlock_t *ptl; for (address = haddr; address < end; address += PAGE_SIZE) { struct vm_fault vmf = { .vma = vma, .address = address, - .pgoff = linear_page_index(vma, haddr), + .pgoff = linear_page_index(vma, address), .flags = FAULT_FLAG_ALLOW_RETRY, .pmd = pmd, }; - vmf.pte = pte_offset_map(pmd, address); - vmf.orig_pte = *vmf.pte; - if (!is_swap_pte(vmf.orig_pte)) { - pte_unmap(vmf.pte); - continue; + if (!pte++) { + pte = pte_offset_map_nolock(mm, pmd, address, &ptl); + if (!pte) { + mmap_read_unlock(mm); + result = SCAN_PMD_NULL; + goto out; + } } + + vmf.orig_pte = ptep_get_lockless(pte); + if (!is_swap_pte(vmf.orig_pte)) + continue; + + vmf.pte = pte; + vmf.ptl = ptl; ret = do_swap_page(&vmf); + /* Which unmaps pte (after perhaps re-checking the entry) */ + pte = NULL; /* * do_swap_page returns VM_FAULT_RETRY with released mmap_lock. @@ -1034,24 +1036,29 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, * resulting in later failure. */ if (ret & VM_FAULT_RETRY) { - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); /* Likely, but not guaranteed, that page lock failed */ - return SCAN_PAGE_LOCK; + result = SCAN_PAGE_LOCK; + goto out; } if (ret & VM_FAULT_ERROR) { mmap_read_unlock(mm); - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return SCAN_FAIL; + result = SCAN_FAIL; + goto out; } swapped_in++; } - /* Drain LRU add pagevec to remove extra pin on the swapped in pages */ + if (pte) + pte_unmap(pte); + + /* Drain LRU cache to remove extra pin on the swapped in pages */ if (swapped_in) lru_add_drain(); - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); - return SCAN_SUCCEED; + result = SCAN_SUCCEED; +out: + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result); + return result; } static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, @@ -1151,9 +1158,6 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, address + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); - pte = pte_offset_map(pmd, address); - pte_ptl = pte_lockptr(mm, pmd); - pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ /* * This removes any huge TLB entry from the CPU so we won't allow @@ -1168,13 +1172,18 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, mmu_notifier_invalidate_range_end(&range); tlb_remove_table_sync_one(); - spin_lock(pte_ptl); - result = __collapse_huge_page_isolate(vma, address, pte, cc, - &compound_pagelist); - spin_unlock(pte_ptl); + pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl); + if (pte) { + result = __collapse_huge_page_isolate(vma, address, pte, cc, + &compound_pagelist); + spin_unlock(pte_ptl); + } else { + result = SCAN_PMD_NULL; + } if (unlikely(result != SCAN_SUCCEED)) { - pte_unmap(pte); + if (pte) + pte_unmap(pte); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); /* @@ -1258,9 +1267,14 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); pte = pte_offset_map_lock(mm, pmd, address, &ptl); + if (!pte) { + result = SCAN_PMD_NULL; + goto out; + } + for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { - pte_t pteval = *_pte; + pte_t pteval = ptep_get(_pte); if (is_swap_pte(pteval)) { ++unmapped; if (!cc->is_khugepaged || @@ -1627,25 +1641,28 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, * lockless_pages_from_mm() and the hardware page walker can access page * tables while all the high-level locks are held in write mode. */ - start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); result = SCAN_FAIL; + start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); + if (!start_pte) + goto drop_immap; /* step 1: check all mapped PTEs are to the right huge page */ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { struct page *page; + pte_t ptent = ptep_get(pte); /* empty pte, skip */ - if (pte_none(*pte)) + if (pte_none(ptent)) continue; /* page swapped out, abort */ - if (!pte_present(*pte)) { + if (!pte_present(ptent)) { result = SCAN_PTE_NON_PRESENT; goto abort; } - page = vm_normal_page(vma, addr, *pte); + page = vm_normal_page(vma, addr, ptent); if (WARN_ON_ONCE(page && is_zone_device_page(page))) page = NULL; /* @@ -1661,10 +1678,11 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { struct page *page; + pte_t ptent = ptep_get(pte); - if (pte_none(*pte)) + if (pte_none(ptent)) continue; - page = vm_normal_page(vma, addr, *pte); + page = vm_normal_page(vma, addr, ptent); if (WARN_ON_ONCE(page && is_zone_device_page(page))) goto abort; page_remove_rmap(page, vma, false); @@ -1702,6 +1720,7 @@ drop_hpage: abort: pte_unmap_unlock(start_pte, ptl); +drop_immap: i_mmap_unlock_write(vma->vm_file->f_mapping); goto drop_hpage; } @@ -1953,7 +1972,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, result = SCAN_FAIL; goto xa_unlocked; } - /* drain pagevecs to help isolate_lru_page() */ + /* drain lru cache to help isolate_lru_page() */ lru_add_drain(); page = folio_file_page(folio, index); } else if (trylock_page(page)) { @@ -1969,7 +1988,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, page_cache_sync_readahead(mapping, &file->f_ra, file, index, end - index); - /* drain pagevecs to help isolate_lru_page() */ + /* drain lru cache to help isolate_lru_page() */ lru_add_drain(); page = find_lock_page(mapping, index); if (unlikely(page == NULL)) { @@ -2070,7 +2089,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); xas_lock_irq(&xas); - xas_set(&xas, index); VM_BUG_ON_PAGE(page != xas_load(&xas), page); diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index 7d1e4aa30bae..3adb4c1d3b19 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -74,7 +74,7 @@ depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0); /* Don't sleep. */ - flags &= ~__GFP_DIRECT_RECLAIM; + flags &= ~(__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM); handle = __stack_depot_save(entries, nr_entries, flags, true); return stack_depot_set_extra_bits(handle, extra); @@ -245,7 +245,7 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id) extra_bits = kmsan_extra_bits(depth, uaf); entries[0] = KMSAN_CHAIN_MAGIC_ORIGIN; - entries[1] = kmsan_save_stack_with_flags(GFP_ATOMIC, 0); + entries[1] = kmsan_save_stack_with_flags(__GFP_HIGH, 0); entries[2] = id; /* * @entries is a local var in non-instrumented code, so KMSAN does not @@ -253,7 +253,7 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id) * positives when __stack_depot_save() passes it to instrumented code. */ kmsan_internal_unpoison_memory(entries, sizeof(entries), false); - handle = __stack_depot_save(entries, ARRAY_SIZE(entries), GFP_ATOMIC, + handle = __stack_depot_save(entries, ARRAY_SIZE(entries), __GFP_HIGH, true); return stack_depot_set_extra_bits(handle, extra_bits); } diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c index cf12e9616b24..cc3907a9c33a 100644 --- a/mm/kmsan/instrumentation.c +++ b/mm/kmsan/instrumentation.c @@ -282,7 +282,7 @@ void __msan_poison_alloca(void *address, uintptr_t size, char *descr) /* stack_depot_save() may allocate memory. */ kmsan_enter_runtime(); - handle = stack_depot_save(entries, ARRAY_SIZE(entries), GFP_ATOMIC); + handle = stack_depot_save(entries, ARRAY_SIZE(entries), __GFP_HIGH); kmsan_leave_runtime(); kmsan_internal_set_shadow_origin(address, size, -1, handle, @@ -429,16 +429,17 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex struct page *page = NULL; spinlock_t *ptl; pte_t *pte; + pte_t ptent; int ret; - if (pmd_leaf(*pmd) || !pmd_present(*pmd)) - return 0; - pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); - if (pte_present(*pte)) { - page = vm_normal_page(walk->vma, addr, *pte); - } else if (!pte_none(*pte)) { - swp_entry_t entry = pte_to_swp_entry(*pte); + if (!pte) + return 0; + ptent = ptep_get(pte); + if (pte_present(ptent)) { + page = vm_normal_page(walk->vma, addr, ptent); + } else if (!pte_none(ptent)) { + swp_entry_t entry = pte_to_swp_entry(ptent); /* * As KSM pages remain KSM pages until freed, no need to wait @@ -931,7 +932,7 @@ static int remove_stable_node(struct ksm_stable_node *stable_node) * The stable node did not yet appear stale to get_ksm_page(), * since that allows for an unmapped ksm page to be recognized * right up until it is freed; but the node is safe to remove. - * This page might be in a pagevec waiting to be freed, + * This page might be in an LRU cache waiting to be freed, * or it might be PageSwapCache (perhaps under writeback), * or it might have been removed from swapcache a moment ago. */ @@ -1086,6 +1087,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, int err = -EFAULT; struct mmu_notifier_range range; bool anon_exclusive; + pte_t entry; pvmw.address = page_address_in_vma(page, vma); if (pvmw.address == -EFAULT) @@ -1103,10 +1105,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, goto out_unlock; anon_exclusive = PageAnonExclusive(page); - if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || + entry = ptep_get(pvmw.pte); + if (pte_write(entry) || pte_dirty(entry) || anon_exclusive || mm_tlb_flush_pending(mm)) { - pte_t entry; - swapped = PageSwapCache(page); flush_cache_page(vma, pvmw.address, page_to_pfn(page)); /* @@ -1148,7 +1149,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); } - *orig_pte = *pvmw.pte; + *orig_pte = entry; err = 0; out_unlock: @@ -1194,8 +1195,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, * without holding anon_vma lock for write. So when looking for a * genuine pmde (in which to find pte), test present and !THP together. */ - pmde = *pmd; - barrier(); + pmde = pmdp_get_lockless(pmd); if (!pmd_present(pmde) || pmd_trans_huge(pmde)) goto out; @@ -1204,7 +1204,9 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, mmu_notifier_invalidate_range_start(&range); ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); - if (!pte_same(*ptep, orig_pte)) { + if (!ptep) + goto out_mn; + if (!pte_same(ptep_get(ptep), orig_pte)) { pte_unmap_unlock(ptep, ptl); goto out_mn; } @@ -1231,7 +1233,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, dec_mm_counter(mm, MM_ANONPAGES); } - flush_cache_page(vma, addr, pte_pfn(*ptep)); + flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep))); /* * No need to notify as we are replacing a read only page with another * read only page with the same content. @@ -2301,8 +2303,8 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items); /* - * A number of pages can hang around indefinitely on per-cpu - * pagevecs, raised page count preventing write_protect_page + * A number of pages can hang around indefinitely in per-cpu + * LRU cache, raised page count preventing write_protect_page * from merging them. Though it doesn't really matter much, * it is puzzling to see some stuck in pages_volatile until * other activity jostles them out, and they also prevented diff --git a/mm/madvise.c b/mm/madvise.c index b5ffbaf616f5..886f06066622 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -188,37 +188,43 @@ success: #ifdef CONFIG_SWAP static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, - unsigned long end, struct mm_walk *walk) + unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->private; - unsigned long index; struct swap_iocb *splug = NULL; + pte_t *ptep = NULL; + spinlock_t *ptl; + unsigned long addr; - if (pmd_none_or_trans_huge_or_clear_bad(pmd)) - return 0; - - for (index = start; index != end; index += PAGE_SIZE) { + for (addr = start; addr < end; addr += PAGE_SIZE) { pte_t pte; swp_entry_t entry; struct page *page; - spinlock_t *ptl; - pte_t *ptep; - ptep = pte_offset_map_lock(vma->vm_mm, pmd, index, &ptl); - pte = *ptep; - pte_unmap_unlock(ptep, ptl); + if (!ptep++) { + ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (!ptep) + break; + } + pte = ptep_get(ptep); if (!is_swap_pte(pte)) continue; entry = pte_to_swp_entry(pte); if (unlikely(non_swap_entry(entry))) continue; + pte_unmap_unlock(ptep, ptl); + ptep = NULL; + page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, - vma, index, false, &splug); + vma, addr, false, &splug); if (page) put_page(page); } + + if (ptep) + pte_unmap_unlock(ptep, ptl); swap_read_unplug(splug); cond_resched(); @@ -229,30 +235,34 @@ static const struct mm_walk_ops swapin_walk_ops = { .pmd_entry = swapin_walk_pmd_entry, }; -static void force_shm_swapin_readahead(struct vm_area_struct *vma, +static void shmem_swapin_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct address_space *mapping) { XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start)); - pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1); + pgoff_t end_index = linear_page_index(vma, end) - 1; struct page *page; struct swap_iocb *splug = NULL; rcu_read_lock(); xas_for_each(&xas, page, end_index) { - swp_entry_t swap; + unsigned long addr; + swp_entry_t entry; if (!xa_is_value(page)) continue; - swap = radix_to_swp_entry(page); + entry = radix_to_swp_entry(page); /* There might be swapin error entries in shmem mapping. */ - if (non_swap_entry(swap)) + if (non_swap_entry(entry)) continue; + + addr = vma->vm_start + + ((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT); xas_pause(&xas); rcu_read_unlock(); - page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, - NULL, 0, false, &splug); + page = read_swap_cache_async(entry, mapping_gfp_mask(mapping), + vma, addr, false, &splug); if (page) put_page(page); @@ -260,8 +270,6 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, } rcu_read_unlock(); swap_read_unplug(splug); - - lru_add_drain(); /* Push any new pages onto the LRU now */ } #endif /* CONFIG_SWAP */ @@ -285,8 +293,8 @@ static long madvise_willneed(struct vm_area_struct *vma, } if (shmem_mapping(file->f_mapping)) { - force_shm_swapin_readahead(vma, start, end, - file->f_mapping); + shmem_swapin_range(vma, start, end, file->f_mapping); + lru_add_drain(); /* Push any new pages onto the LRU now */ return 0; } #else @@ -340,7 +348,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, bool pageout = private->pageout; struct mm_struct *mm = tlb->mm; struct vm_area_struct *vma = walk->vma; - pte_t *orig_pte, *pte, ptent; + pte_t *start_pte, *pte, ptent; spinlock_t *ptl; struct folio *folio = NULL; LIST_HEAD(folio_list); @@ -422,15 +430,15 @@ huge_unlock: } regular_folio: - if (pmd_trans_unstable(pmd)) - return 0; #endif tlb_change_page_size(tlb, PAGE_SIZE); - orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (!start_pte) + return 0; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr < end; pte++, addr += PAGE_SIZE) { - ptent = *pte; + ptent = ptep_get(pte); if (pte_none(ptent)) continue; @@ -447,25 +455,28 @@ regular_folio: * are sure it's worth. Split it if we are only owner. */ if (folio_test_large(folio)) { + int err; + if (folio_mapcount(folio) != 1) break; if (pageout_anon_only_filter && !folio_test_anon(folio)) break; - folio_get(folio); - if (!folio_trylock(folio)) { - folio_put(folio); - break; - } - pte_unmap_unlock(orig_pte, ptl); - if (split_folio(folio)) { - folio_unlock(folio); - folio_put(folio); - orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!folio_trylock(folio)) break; - } + folio_get(folio); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(start_pte, ptl); + start_pte = NULL; + err = split_folio(folio); folio_unlock(folio); folio_put(folio); - orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (err) + break; + start_pte = pte = + pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!start_pte) + break; + arch_enter_lazy_mmu_mode(); pte--; addr -= PAGE_SIZE; continue; @@ -510,8 +521,10 @@ regular_folio: folio_deactivate(folio); } - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(orig_pte, ptl); + if (start_pte) { + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(start_pte, ptl); + } if (pageout) reclaim_pages(&folio_list); cond_resched(); @@ -612,7 +625,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, struct mm_struct *mm = tlb->mm; struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; - pte_t *orig_pte, *pte, ptent; + pte_t *start_pte, *pte, ptent; struct folio *folio; int nr_swap = 0; unsigned long next; @@ -620,17 +633,16 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next)) - goto next; - - if (pmd_trans_unstable(pmd)) - return 0; + return 0; tlb_change_page_size(tlb, PAGE_SIZE); - orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!start_pte) + return 0; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr != end; pte++, addr += PAGE_SIZE) { - ptent = *pte; + ptent = ptep_get(pte); if (pte_none(ptent)) continue; @@ -664,23 +676,26 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, * deactivate all pages. */ if (folio_test_large(folio)) { + int err; + if (folio_mapcount(folio) != 1) - goto out; + break; + if (!folio_trylock(folio)) + break; folio_get(folio); - if (!folio_trylock(folio)) { - folio_put(folio); - goto out; - } - pte_unmap_unlock(orig_pte, ptl); - if (split_folio(folio)) { - folio_unlock(folio); - folio_put(folio); - orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); - goto out; - } + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(start_pte, ptl); + start_pte = NULL; + err = split_folio(folio); folio_unlock(folio); folio_put(folio); - orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (err) + break; + start_pte = pte = + pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!start_pte) + break; + arch_enter_lazy_mmu_mode(); pte--; addr -= PAGE_SIZE; continue; @@ -725,17 +740,18 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, } folio_mark_lazyfree(folio); } -out: + if (nr_swap) { if (current->mm == mm) sync_mm_rss(mm); - add_mm_counter(mm, MM_SWAPENTS, nr_swap); } - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(orig_pte, ptl); + if (start_pte) { + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(start_pte, ptl); + } cond_resched(); -next: + return 0; } diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index e1eb33f49059..a26dd8bcfcdb 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -35,7 +35,7 @@ static int wp_pte(pte_t *pte, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct wp_walk *wpwalk = walk->private; - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); if (pte_write(ptent)) { pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte); @@ -91,7 +91,7 @@ static int clean_record_pte(pte_t *pte, unsigned long addr, { struct wp_walk *wpwalk = walk->private; struct clean_walk *cwalk = to_clean_walk(wpwalk); - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); if (pte_dirty(ptent)) { pgoff_t pgoff = ((addr - walk->vma->vm_start) >> PAGE_SHIFT) + @@ -128,19 +128,11 @@ static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end, { pmd_t pmdval = pmdp_get_lockless(pmd); - if (!pmd_trans_unstable(&pmdval)) - return 0; - - if (pmd_none(pmdval)) { - walk->action = ACTION_AGAIN; - return 0; - } - - /* Huge pmd, present or migrated */ - walk->action = ACTION_CONTINUE; - if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) + /* Do not split a huge pmd, present or migrated */ + if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) { WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval)); - + walk->action = ACTION_CONTINUE; + } return 0; } @@ -156,23 +148,15 @@ static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end, static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end, struct mm_walk *walk) { +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD pud_t pudval = READ_ONCE(*pud); - if (!pud_trans_unstable(&pudval)) - return 0; - - if (pud_none(pudval)) { - walk->action = ACTION_AGAIN; - return 0; - } - -#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD - /* Huge pud */ - walk->action = ACTION_CONTINUE; - if (pud_trans_huge(pudval) || pud_devmap(pudval)) + /* Do not split a huge pud */ + if (pud_trans_huge(pudval) || pud_devmap(pudval)) { WARN_ON(pud_write(pudval) || pud_dirty(pudval)); + walk->action = ACTION_CONTINUE; + } #endif - return 0; } diff --git a/mm/memblock.c b/mm/memblock.c index 3feafea06ab2..388bc0c78998 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1436,6 +1436,15 @@ done: */ kmemleak_alloc_phys(found, size, 0); + /* + * Some Virtual Machine platforms, such as Intel TDX or AMD SEV-SNP, + * require memory to be accepted before it can be used by the + * guest. + * + * Accept the memory of the allocated buffer. + */ + accept_memory(found, found + size); + return found; } @@ -2082,19 +2091,30 @@ static void __init memmap_init_reserved_pages(void) { struct memblock_region *region; phys_addr_t start, end; - u64 i; + int nid; + + /* + * set nid on all reserved pages and also treat struct + * pages for the NOMAP regions as PageReserved + */ + for_each_mem_region(region) { + nid = memblock_get_region_node(region); + start = region->base; + end = start + region->size; + + if (memblock_is_nomap(region)) + reserve_bootmem_region(start, end, nid); + + memblock_set_node(start, end, &memblock.reserved, nid); + } /* initialize struct pages for the reserved regions */ - for_each_reserved_mem_range(i, &start, &end) - reserve_bootmem_region(start, end); + for_each_reserved_mem_region(region) { + nid = memblock_get_region_node(region); + start = region->base; + end = start + region->size; - /* and also treat struct pages for the NOMAP regions as PageReserved */ - for_each_mem_region(region) { - if (memblock_is_nomap(region)) { - start = region->base; - end = start + region->size; - reserve_bootmem_region(start, end); - } + reserve_bootmem_region(start, end, nid); } } @@ -2122,7 +2142,7 @@ static unsigned long __init free_low_memory_core_early(void) static int reset_managed_pages_done __initdata; -void reset_node_managed_pages(pg_data_t *pgdat) +static void __init reset_node_managed_pages(pg_data_t *pgdat) { struct zone *z; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4b27e245a055..e8ca4bdcb03c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -485,7 +485,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) if (lru_gen_enabled()) { if (soft_limit_excess(memcg)) - lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec); + lru_gen_soft_reclaim(memcg, nid); return; } @@ -639,7 +639,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) } } -static void do_flush_stats(bool atomic) +static void do_flush_stats(void) { /* * We always flush the entire tree, so concurrent flushers can just @@ -652,30 +652,16 @@ static void do_flush_stats(bool atomic) WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME); - if (atomic) - cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup); - else - cgroup_rstat_flush(root_mem_cgroup->css.cgroup); + cgroup_rstat_flush(root_mem_cgroup->css.cgroup); atomic_set(&stats_flush_threshold, 0); atomic_set(&stats_flush_ongoing, 0); } -static bool should_flush_stats(void) -{ - return atomic_read(&stats_flush_threshold) > num_online_cpus(); -} - void mem_cgroup_flush_stats(void) { - if (should_flush_stats()) - do_flush_stats(false); -} - -void mem_cgroup_flush_stats_atomic(void) -{ - if (should_flush_stats()) - do_flush_stats(true); + if (atomic_read(&stats_flush_threshold) > num_online_cpus()) + do_flush_stats(); } void mem_cgroup_flush_stats_ratelimited(void) @@ -690,7 +676,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w) * Always flush here so that flushing in latency-sensitive paths is * as cheap as possible. */ - do_flush_stats(false); + do_flush_stats(); queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); } @@ -1273,13 +1259,13 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) * * This function iterates over tasks attached to @memcg or to any of its * descendants and calls @fn for each task. If @fn returns a non-zero - * value, the function breaks the iteration loop and returns the value. - * Otherwise, it will iterate over all tasks and return 0. + * value, the function breaks the iteration loop. Otherwise, it will iterate + * over all tasks and return 0. * * This function must not be called for the root memory cgroup. */ -int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, - int (*fn)(struct task_struct *, void *), void *arg) +void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, + int (*fn)(struct task_struct *, void *), void *arg) { struct mem_cgroup *iter; int ret = 0; @@ -1299,7 +1285,6 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, break; } } - return ret; } #ifdef CONFIG_DEBUG_VM @@ -1580,13 +1565,10 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, return memcg_page_state(memcg, item) * memcg_page_state_unit(item); } -static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) +static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) { - struct seq_buf s; int i; - seq_buf_init(&s, buf, bufsize); - /* * Provide statistics on the state of the memory subsystem as * well as cumulative event counters that show past behavior. @@ -1603,21 +1585,21 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) u64 size; size = memcg_page_state_output(memcg, memory_stats[i].idx); - seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size); + seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size); if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) { size += memcg_page_state_output(memcg, NR_SLAB_RECLAIMABLE_B); - seq_buf_printf(&s, "slab %llu\n", size); + seq_buf_printf(s, "slab %llu\n", size); } } /* Accumulated memory events */ - seq_buf_printf(&s, "pgscan %lu\n", + seq_buf_printf(s, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) + memcg_events(memcg, PGSCAN_DIRECT) + memcg_events(memcg, PGSCAN_KHUGEPAGED)); - seq_buf_printf(&s, "pgsteal %lu\n", + seq_buf_printf(s, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) + memcg_events(memcg, PGSTEAL_DIRECT) + memcg_events(memcg, PGSTEAL_KHUGEPAGED)); @@ -1627,13 +1609,24 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) memcg_vm_event_stat[i] == PGPGOUT) continue; - seq_buf_printf(&s, "%s %lu\n", + seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg_vm_event_stat[i]), memcg_events(memcg, memcg_vm_event_stat[i])); } /* The above should easily fit into one page */ - WARN_ON_ONCE(seq_buf_has_overflowed(&s)); + WARN_ON_ONCE(seq_buf_has_overflowed(s)); +} + +static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s); + +static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) +{ + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + memcg_stat_format(memcg, s); + else + memcg1_stat_format(memcg, s); + WARN_ON_ONCE(seq_buf_has_overflowed(s)); } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -1671,6 +1664,7 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { /* Use static buffer, for the caller is holding oom_lock. */ static char buf[PAGE_SIZE]; + struct seq_buf s; lockdep_assert_held(&oom_lock); @@ -1693,8 +1687,9 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) pr_info("Memory cgroup stats for "); pr_cont_cgroup_path(memcg->css.cgroup); pr_cont(":"); - memory_stat_format(memcg, buf, sizeof(buf)); - pr_info("%s", buf); + seq_buf_init(&s, buf, sizeof(buf)); + memory_stat_format(memcg, &s); + seq_buf_do_printk(&s, KERN_INFO); } /* @@ -2028,26 +2023,12 @@ bool mem_cgroup_oom_synchronize(bool handle) if (locked) mem_cgroup_oom_notify(memcg); - if (locked && !READ_ONCE(memcg->oom_kill_disable)) { - mem_cgroup_unmark_under_oom(memcg); - finish_wait(&memcg_oom_waitq, &owait.wait); - mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask, - current->memcg_oom_order); - } else { - schedule(); - mem_cgroup_unmark_under_oom(memcg); - finish_wait(&memcg_oom_waitq, &owait.wait); - } + schedule(); + mem_cgroup_unmark_under_oom(memcg); + finish_wait(&memcg_oom_waitq, &owait.wait); - if (locked) { + if (locked) mem_cgroup_oom_unlock(memcg); - /* - * There is no guarantee that an OOM-lock contender - * sees the wakeups triggered by the OOM kill - * uncharges. Wake any sleepers explicitly. - */ - memcg_oom_recover(memcg); - } cleanup: current->memcg_in_oom = NULL; css_put(&memcg->css); @@ -2166,17 +2147,12 @@ again: * When charge migration first begins, we can have multiple * critical sections holding the fast-path RCU lock and one * holding the slowpath move_lock. Track the task who has the - * move_lock for unlock_page_memcg(). + * move_lock for folio_memcg_unlock(). */ memcg->move_lock_task = current; memcg->move_lock_flags = flags; } -void lock_page_memcg(struct page *page) -{ - folio_memcg_lock(page_folio(page)); -} - static void __folio_memcg_unlock(struct mem_cgroup *memcg) { if (memcg && memcg->move_lock_task == current) { @@ -2204,11 +2180,6 @@ void folio_memcg_unlock(struct folio *folio) __folio_memcg_unlock(folio_memcg(folio)); } -void unlock_page_memcg(struct page *page) -{ - folio_memcg_unlock(page_folio(page)); -} - struct memcg_stock_pcp { local_lock_t stock_lock; struct mem_cgroup *cached; /* this never be root cgroup */ @@ -2275,7 +2246,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); - if (memcg == stock->cached && stock->nr_pages >= nr_pages) { + if (memcg == READ_ONCE(stock->cached) && stock->nr_pages >= nr_pages) { stock->nr_pages -= nr_pages; ret = true; } @@ -2290,7 +2261,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) */ static void drain_stock(struct memcg_stock_pcp *stock) { - struct mem_cgroup *old = stock->cached; + struct mem_cgroup *old = READ_ONCE(stock->cached); if (!old) return; @@ -2303,7 +2274,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) } css_put(&old->css); - stock->cached = NULL; + WRITE_ONCE(stock->cached, NULL); } static void drain_local_stock(struct work_struct *dummy) @@ -2338,10 +2309,10 @@ static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) struct memcg_stock_pcp *stock; stock = this_cpu_ptr(&memcg_stock); - if (stock->cached != memcg) { /* reset if necessary */ + if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */ drain_stock(stock); css_get(&memcg->css); - stock->cached = memcg; + WRITE_ONCE(stock->cached, memcg); } stock->nr_pages += nr_pages; @@ -2383,7 +2354,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) bool flush = false; rcu_read_lock(); - memcg = stock->cached; + memcg = READ_ONCE(stock->cached); if (memcg && stock->nr_pages && mem_cgroup_is_descendant(memcg, root_memcg)) flush = true; @@ -2884,7 +2855,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) * * - the page lock * - LRU isolation - * - lock_page_memcg() + * - folio_memcg_lock() * - exclusive reference * - mem_cgroup_trylock_pages() */ @@ -3208,12 +3179,12 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, * accumulating over a page of vmstat data or when pgdat or idx * changes. */ - if (stock->cached_objcg != objcg) { + if (READ_ONCE(stock->cached_objcg) != objcg) { old = drain_obj_stock(stock); obj_cgroup_get(objcg); stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; - stock->cached_objcg = objcg; + WRITE_ONCE(stock->cached_objcg, objcg); stock->cached_pgdat = pgdat; } else if (stock->cached_pgdat != pgdat) { /* Flush the existing cached vmstat data */ @@ -3267,7 +3238,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); - if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { + if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { stock->nr_bytes -= nr_bytes; ret = true; } @@ -3279,7 +3250,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) { - struct obj_cgroup *old = stock->cached_objcg; + struct obj_cgroup *old = READ_ONCE(stock->cached_objcg); if (!old) return NULL; @@ -3332,7 +3303,7 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) stock->cached_pgdat = NULL; } - stock->cached_objcg = NULL; + WRITE_ONCE(stock->cached_objcg, NULL); /* * The `old' objects needs to be released by the caller via * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock. @@ -3343,10 +3314,11 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg) { + struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg); struct mem_cgroup *memcg; - if (stock->cached_objcg) { - memcg = obj_cgroup_memcg(stock->cached_objcg); + if (objcg) { + memcg = obj_cgroup_memcg(objcg); if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) return true; } @@ -3365,10 +3337,10 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); - if (stock->cached_objcg != objcg) { /* reset if necessary */ + if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ old = drain_obj_stock(stock); obj_cgroup_get(objcg); - stock->cached_objcg = objcg; + WRITE_ONCE(stock->cached_objcg, objcg); stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; allow_uncharge = true; /* Allow uncharge when objcg changes */ @@ -3699,27 +3671,13 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) if (mem_cgroup_is_root(memcg)) { /* - * We can reach here from irq context through: - * uncharge_batch() - * |--memcg_check_events() - * |--mem_cgroup_threshold() - * |--__mem_cgroup_threshold() - * |--mem_cgroup_usage - * - * rstat flushing is an expensive operation that should not be - * done from irq context; use stale stats in this case. - * Arguably, usage threshold events are not reliable on the root - * memcg anyway since its usage is ill-defined. - * - * Additionally, other call paths through memcg_check_events() - * disable irqs, so make sure we are flushing stats atomically. + * Approximate root's usage from global state. This isn't + * perfect, but the root usage was always an approximation. */ - if (in_task()) - mem_cgroup_flush_stats_atomic(); - val = memcg_page_state(memcg, NR_FILE_PAGES) + - memcg_page_state(memcg, NR_ANON_MAPPED); + val = global_node_page_state(NR_FILE_PAGES) + + global_node_page_state(NR_ANON_MAPPED); if (swap) - val += memcg_page_state(memcg, MEMCG_SWAP); + val += total_swap_pages - get_nr_swap_pages(); } else { if (!swap) val = page_counter_read(&memcg->memory); @@ -4135,9 +4093,8 @@ static const unsigned int memcg1_events[] = { PGMAJFAULT, }; -static int memcg_stat_show(struct seq_file *m, void *v) +static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) { - struct mem_cgroup *memcg = mem_cgroup_from_seq(m); unsigned long memory, memsw; struct mem_cgroup *mi; unsigned int i; @@ -4152,18 +4109,18 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; nr = memcg_page_state_local(memcg, memcg1_stats[i]); - seq_printf(m, "%s %lu\n", memcg1_stat_names[i], + seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr * memcg_page_state_unit(memcg1_stats[i])); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) - seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]), - memcg_events_local(memcg, memcg1_events[i])); + seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]), + memcg_events_local(memcg, memcg1_events[i])); for (i = 0; i < NR_LRU_LISTS; i++) - seq_printf(m, "%s %lu\n", lru_list_name(i), - memcg_page_state_local(memcg, NR_LRU_BASE + i) * - PAGE_SIZE); + seq_buf_printf(s, "%s %lu\n", lru_list_name(i), + memcg_page_state_local(memcg, NR_LRU_BASE + i) * + PAGE_SIZE); /* Hierarchical information */ memory = memsw = PAGE_COUNTER_MAX; @@ -4171,11 +4128,11 @@ static int memcg_stat_show(struct seq_file *m, void *v) memory = min(memory, READ_ONCE(mi->memory.max)); memsw = min(memsw, READ_ONCE(mi->memsw.max)); } - seq_printf(m, "hierarchical_memory_limit %llu\n", - (u64)memory * PAGE_SIZE); + seq_buf_printf(s, "hierarchical_memory_limit %llu\n", + (u64)memory * PAGE_SIZE); if (do_memsw_account()) - seq_printf(m, "hierarchical_memsw_limit %llu\n", - (u64)memsw * PAGE_SIZE); + seq_buf_printf(s, "hierarchical_memsw_limit %llu\n", + (u64)memsw * PAGE_SIZE); for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; @@ -4183,19 +4140,19 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; nr = memcg_page_state(memcg, memcg1_stats[i]); - seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], + seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i], (u64)nr * memcg_page_state_unit(memcg1_stats[i])); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) - seq_printf(m, "total_%s %llu\n", - vm_event_name(memcg1_events[i]), - (u64)memcg_events(memcg, memcg1_events[i])); + seq_buf_printf(s, "total_%s %llu\n", + vm_event_name(memcg1_events[i]), + (u64)memcg_events(memcg, memcg1_events[i])); for (i = 0; i < NR_LRU_LISTS; i++) - seq_printf(m, "total_%s %llu\n", lru_list_name(i), - (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * - PAGE_SIZE); + seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i), + (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * + PAGE_SIZE); #ifdef CONFIG_DEBUG_VM { @@ -4210,12 +4167,10 @@ static int memcg_stat_show(struct seq_file *m, void *v) anon_cost += mz->lruvec.anon_cost; file_cost += mz->lruvec.file_cost; } - seq_printf(m, "anon_cost %lu\n", anon_cost); - seq_printf(m, "file_cost %lu\n", file_cost); + seq_buf_printf(s, "anon_cost %lu\n", anon_cost); + seq_buf_printf(s, "file_cost %lu\n", file_cost); } #endif - - return 0; } static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, @@ -4648,11 +4603,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - /* - * wb_writeback() takes a spinlock and calls - * wb_over_bg_thresh()->mem_cgroup_wb_stats(). Do not sleep. - */ - mem_cgroup_flush_stats_atomic(); + mem_cgroup_flush_stats(); *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); @@ -5059,6 +5010,8 @@ static int mem_cgroup_slab_show(struct seq_file *m, void *p) } #endif +static int memory_stat_show(struct seq_file *m, void *v); + static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", @@ -5091,7 +5044,7 @@ static struct cftype mem_cgroup_legacy_files[] = { }, { .name = "stat", - .seq_show = memcg_stat_show, + .seq_show = memory_stat_show, }, { .name = "force_empty", @@ -5464,7 +5417,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) if (unlikely(mem_cgroup_is_root(memcg))) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, - 2UL*HZ); + FLUSH_TIME); lru_gen_online_memcg(memcg); return 0; offline_kmem: @@ -5865,7 +5818,7 @@ static int mem_cgroup_move_account(struct page *page, * with (un)charging, migration, LRU putback, or anything else * that would rely on a stable page's memory cgroup. * - * Note that lock_page_memcg is a memcg lock, not a page lock, + * Note that folio_memcg_lock is a memcg lock, not a page lock, * to save space. As soon as we switch page's memory cgroup to a * new memcg that isn't locked, the above state can change * concurrently again. Make sure we're truly done with it. @@ -6057,11 +6010,11 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, return 0; } - if (pmd_trans_unstable(pmd)) - return 0; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (!pte) + return 0; for (; addr != end; pte++, addr += PAGE_SIZE) - if (get_mctgt_type(vma, addr, *pte, NULL)) + if (get_mctgt_type(vma, addr, ptep_get(pte), NULL)) mc.precharge++; /* increment precharge temporarily */ pte_unmap_unlock(pte - 1, ptl); cond_resched(); @@ -6277,12 +6230,12 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, return 0; } - if (pmd_trans_unstable(pmd)) - return 0; retry: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (!pte) + return 0; for (; addr != end; addr += PAGE_SIZE) { - pte_t ptent = *(pte++); + pte_t ptent = ptep_get(pte++); bool device = false; swp_entry_t ent; @@ -6356,7 +6309,7 @@ static void mem_cgroup_move_charge(void) { lru_add_drain_all(); /* - * Signal lock_page_memcg() to take the memcg's move_lock + * Signal folio_memcg_lock() to take the memcg's move_lock * while we're moving its pages to another memcg. Then wait * for already started RCU-only updates to finish. */ @@ -6634,10 +6587,12 @@ static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + struct seq_buf s; if (!buf) return -ENOMEM; - memory_stat_format(memcg, buf, PAGE_SIZE); + seq_buf_init(&s, buf, PAGE_SIZE); + memory_stat_format(memcg, &s); seq_puts(m, buf); kfree(buf); return 0; @@ -6896,7 +6851,7 @@ static unsigned long effective_protection(unsigned long usage, protected = min(usage, setting); /* * If all cgroups at this level combined claim and use more - * protection then what the parent affords them, distribute + * protection than what the parent affords them, distribute * shares in proportion to utilization. * * We are using actual utilization rather than the statically @@ -7421,8 +7376,7 @@ static int __init mem_cgroup_init(void) for_each_node(node) { struct mem_cgroup_tree_per_node *rtpn; - rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, - node_online(node) ? node : NUMA_NO_NODE); + rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node); rtpn->rb_root = RB_ROOT; rtpn->rb_rightmost = NULL; @@ -7656,6 +7610,14 @@ static u64 swap_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; } +static u64 swap_peak_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return (u64)memcg->swap.watermark * PAGE_SIZE; +} + static int swap_high_show(struct seq_file *m, void *v) { return seq_puts_memcg_tunable(m, @@ -7735,6 +7697,11 @@ static struct cftype swap_files[] = { .write = swap_max_write, }, { + .name = "swap.peak", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = swap_peak_read, + }, + { .name = "swap.events", .flags = CFTYPE_NOT_ON_ROOT, .file_offset = offsetof(struct mem_cgroup, swap_events_file), diff --git a/mm/memfd.c b/mm/memfd.c index 69b90c31d38c..e763e76f1106 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -371,12 +371,15 @@ SYSCALL_DEFINE2(memfd_create, inode->i_mode &= ~0111; file_seals = memfd_file_seals_ptr(file); - *file_seals &= ~F_SEAL_SEAL; - *file_seals |= F_SEAL_EXEC; + if (file_seals) { + *file_seals &= ~F_SEAL_SEAL; + *file_seals |= F_SEAL_EXEC; + } } else if (flags & MFD_ALLOW_SEALING) { /* MFD_EXEC and MFD_ALLOW_SEALING are set */ file_seals = memfd_file_seals_ptr(file); - *file_seals &= ~F_SEAL_SEAL; + if (file_seals) + *file_seals &= ~F_SEAL_SEAL; } fd_install(fd, file); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 5b663eca1f29..e245191e6b04 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -6,16 +6,16 @@ * High level machine check handler. Handles pages reported by the * hardware as being corrupted usually due to a multi-bit ECC memory or cache * failure. - * + * * In addition there is a "soft offline" entry point that allows stop using * not-yet-corrupted-by-suspicious pages without killing anything. * * Handles page cache pages in various states. The tricky part - * here is that we can access any page asynchronously in respect to - * other VM users, because memory failures could happen anytime and - * anywhere. This could violate some of their assumptions. This is why - * this code has to be extremely careful. Generally it tries to use - * normal locking rules, as in get the standard locks, even if that means + * here is that we can access any page asynchronously in respect to + * other VM users, because memory failures could happen anytime and + * anywhere. This could violate some of their assumptions. This is why + * this code has to be extremely careful. Generally it tries to use + * normal locking rules, as in get the standard locks, even if that means * the error handling takes potentially a long time. * * It can be very tempting to add handling for obscure cases here. @@ -25,12 +25,12 @@ * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/ * - The case actually shows up as a frequent (top 10) page state in * tools/mm/page-types when running a real workload. - * + * * There are several operations here with exponential complexity because - * of unsuitable VM data structures. For example the operation to map back - * from RMAP chains to processes has to walk the complete process list and + * of unsuitable VM data structures. For example the operation to map back + * from RMAP chains to processes has to walk the complete process list and * has non linear complexity with the number. But since memory corruptions - * are rare we hope to get away with this. This avoids impacting the core + * are rare we hope to get away with this. This avoids impacting the core * VM. */ @@ -123,7 +123,6 @@ const struct attribute_group memory_failure_attr_group = { .attrs = memory_failure_attr, }; -#ifdef CONFIG_SYSCTL static struct ctl_table memory_failure_table[] = { { .procname = "memory_failure_early_kill", @@ -146,14 +145,6 @@ static struct ctl_table memory_failure_table[] = { { } }; -static int __init memory_failure_sysctl_init(void) -{ - register_sysctl_init("vm", memory_failure_table); - return 0; -} -late_initcall(memory_failure_sysctl_init); -#endif /* CONFIG_SYSCTL */ - /* * Return values: * 1: the page is dissolved (if needed) and taken off from buddy, @@ -395,6 +386,7 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, pud_t *pud; pmd_t *pmd; pte_t *pte; + pte_t ptent; VM_BUG_ON_VMA(address == -EFAULT, vma); pgd = pgd_offset(vma->vm_mm, address); @@ -414,7 +406,10 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, if (pmd_devmap(*pmd)) return PMD_SHIFT; pte = pte_offset_map(pmd, address); - if (pte_present(*pte) && pte_devmap(*pte)) + if (!pte) + return 0; + ptent = ptep_get(pte); + if (pte_present(ptent) && pte_devmap(ptent)) ret = PAGE_SHIFT; pte_unmap(pte); return ret; @@ -800,13 +795,13 @@ static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, goto out; } - if (pmd_trans_unstable(pmdp)) - goto out; - mapped_pte = ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, addr, &ptl); + if (!ptep) + goto out; + for (; addr != end; ptep++, addr += PAGE_SIZE) { - ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT, + ret = check_hwpoisoned_entry(ptep_get(ptep), addr, PAGE_SHIFT, hwp->pfn, &hwp->tk); if (ret == 1) break; @@ -2441,6 +2436,8 @@ static int __init memory_failure_init(void) INIT_WORK(&mf_cpu->work, memory_failure_work_func); } + register_sysctl_init("vm", memory_failure_table); + return 0; } core_initcall(memory_failure_init); diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index e593e56e530b..a516e303e304 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -366,7 +366,7 @@ static void establish_demotion_targets(void) lockdep_assert_held_once(&memory_tier_lock); - if (!node_demotion || !IS_ENABLED(CONFIG_MIGRATION)) + if (!node_demotion) return; disable_all_demotion_targets(); @@ -451,7 +451,6 @@ static void establish_demotion_targets(void) } #else -static inline void disable_all_demotion_targets(void) {} static inline void establish_demotion_targets(void) {} #endif /* CONFIG_MIGRATION */ diff --git a/mm/memory.c b/mm/memory.c index 5ce82a76201d..d8a9a770b1f1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -77,6 +77,7 @@ #include <linux/ptrace.h> #include <linux/vmalloc.h> #include <linux/sched/sysctl.h> +#include <linux/net_mm.h> #include <trace/events/kmem.h> @@ -699,15 +700,17 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, struct page *page, unsigned long address, pte_t *ptep) { + pte_t orig_pte; pte_t pte; swp_entry_t entry; + orig_pte = ptep_get(ptep); pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); - if (pte_swp_soft_dirty(*ptep)) + if (pte_swp_soft_dirty(orig_pte)) pte = pte_mksoft_dirty(pte); - entry = pte_to_swp_entry(*ptep); - if (pte_swp_uffd_wp(*ptep)) + entry = pte_to_swp_entry(orig_pte); + if (pte_swp_uffd_wp(orig_pte)) pte = pte_mkuffd_wp(pte); else if (is_writable_device_exclusive_entry(entry)) pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -744,7 +747,7 @@ static int try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma, unsigned long addr) { - swp_entry_t entry = pte_to_swp_entry(*src_pte); + swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte)); struct page *page = pfn_swap_entry_to_page(entry); if (trylock_page(page)) { @@ -768,9 +771,10 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, struct vm_area_struct *src_vma, unsigned long addr, int *rss) { unsigned long vm_flags = dst_vma->vm_flags; - pte_t pte = *src_pte; + pte_t orig_pte = ptep_get(src_pte); + pte_t pte = orig_pte; struct page *page; - swp_entry_t entry = pte_to_swp_entry(pte); + swp_entry_t entry = pte_to_swp_entry(orig_pte); if (likely(!non_swap_entry(entry))) { if (swap_duplicate(entry) < 0) @@ -785,8 +789,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, spin_unlock(&mmlist_lock); } /* Mark the swap entry as shared. */ - if (pte_swp_exclusive(*src_pte)) { - pte = pte_swp_clear_exclusive(*src_pte); + if (pte_swp_exclusive(orig_pte)) { + pte = pte_swp_clear_exclusive(orig_pte); set_pte_at(src_mm, addr, src_pte, pte); } rss[MM_SWAPENTS]++; @@ -805,9 +809,9 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, entry = make_readable_migration_entry( swp_offset(entry)); pte = swp_entry_to_pte(entry); - if (pte_swp_soft_dirty(*src_pte)) + if (pte_swp_soft_dirty(orig_pte)) pte = pte_swp_mksoft_dirty(pte); - if (pte_swp_uffd_wp(*src_pte)) + if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } @@ -840,7 +844,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, entry = make_readable_device_private_entry( swp_offset(entry)); pte = swp_entry_to_pte(entry); - if (pte_swp_uffd_wp(*src_pte)) + if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } @@ -904,7 +908,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma /* All done, just insert the new page copy in the child */ pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot); pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); - if (userfaultfd_pte_wp(dst_vma, *src_pte)) + if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte))) /* Uffd-wp needs to be delivered to dest pte as well */ pte = pte_mkuffd_wp(pte); set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); @@ -922,7 +926,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, { struct mm_struct *src_mm = src_vma->vm_mm; unsigned long vm_flags = src_vma->vm_flags; - pte_t pte = *src_pte; + pte_t pte = ptep_get(src_pte); struct page *page; struct folio *folio; @@ -1002,6 +1006,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, struct mm_struct *src_mm = src_vma->vm_mm; pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; + pte_t ptent; spinlock_t *src_ptl, *dst_ptl; int progress, ret = 0; int rss[NR_MM_COUNTERS]; @@ -1012,13 +1017,25 @@ again: progress = 0; init_rss_vec(rss); + /* + * copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the + * error handling here, assume that exclusive mmap_lock on dst and src + * protects anon from unexpected THP transitions; with shmem and file + * protected by mmap_lock-less collapse skipping areas with anon_vma + * (whereas vma_needs_copy() skips areas without anon_vma). A rework + * can remove such assumptions later, but this is good enough for now. + */ dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) { ret = -ENOMEM; goto out; } - src_pte = pte_offset_map(src_pmd, addr); - src_ptl = pte_lockptr(src_mm, src_pmd); + src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl); + if (!src_pte) { + pte_unmap_unlock(dst_pte, dst_ptl); + /* ret == 0 */ + goto out; + } spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); orig_src_pte = src_pte; orig_dst_pte = dst_pte; @@ -1035,17 +1052,18 @@ again: spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) break; } - if (pte_none(*src_pte)) { + ptent = ptep_get(src_pte); + if (pte_none(ptent)) { progress++; continue; } - if (unlikely(!pte_present(*src_pte))) { + if (unlikely(!pte_present(ptent))) { ret = copy_nonpresent_pte(dst_mm, src_mm, dst_pte, src_pte, dst_vma, src_vma, addr, rss); if (ret == -EIO) { - entry = pte_to_swp_entry(*src_pte); + entry = pte_to_swp_entry(ptep_get(src_pte)); break; } else if (ret == -EBUSY) { break; @@ -1083,8 +1101,7 @@ again: } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); - spin_unlock(src_ptl); - pte_unmap(orig_src_pte); + pte_unmap_unlock(orig_src_pte, src_ptl); add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); @@ -1388,14 +1405,15 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, swp_entry_t entry; tlb_change_page_size(tlb, PAGE_SIZE); -again: init_rss_vec(rss); - start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); - pte = start_pte; + start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return addr; + flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); struct page *page; if (pte_none(ptent)) @@ -1507,17 +1525,10 @@ again: * If we forced a TLB flush (either due to running out of * batch buffers or because we needed to flush dirty TLB * entries before releasing the ptl), free the batched - * memory too. Restart if we didn't do everything. + * memory too. Come back again if we didn't do everything. */ - if (force_flush) { - force_flush = 0; + if (force_flush) tlb_flush_mmu(tlb); - } - - if (addr != end) { - cond_resched(); - goto again; - } return addr; } @@ -1536,8 +1547,10 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) __split_huge_pmd(vma, pmd, addr, false, NULL); - else if (zap_huge_pmd(tlb, vma, pmd, addr)) - goto next; + else if (zap_huge_pmd(tlb, vma, pmd, addr)) { + addr = next; + continue; + } /* fall through */ } else if (details && details->single_folio && folio_test_pmd_mappable(details->single_folio) && @@ -1550,20 +1563,14 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, */ spin_unlock(ptl); } - - /* - * Here there can be other concurrent MADV_DONTNEED or - * trans huge page faults running, and if the pmd is - * none or trans huge it can change under us. This is - * because MADV_DONTNEED holds the mmap_lock in read - * mode. - */ - if (pmd_none_or_trans_huge_or_clear_bad(pmd)) - goto next; - next = zap_pte_range(tlb, vma, pmd, addr, next, details); -next: - cond_resched(); - } while (pmd++, addr = next, addr != end); + if (pmd_none(*pmd)) { + addr = next; + continue; + } + addr = zap_pte_range(tlb, vma, pmd, addr, next, details); + if (addr != next) + pmd--; + } while (pmd++, cond_resched(), addr != end); return addr; } @@ -1821,7 +1828,7 @@ static int validate_page_before_insert(struct page *page) static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { - if (!pte_none(*pte)) + if (!pte_none(ptep_get(pte))) return -EBUSY; /* Ok, finally just insert the thing.. */ get_page(page); @@ -1905,6 +1912,10 @@ more: const int batch_size = min_t(int, pages_to_write_in_pmd, 8); start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); + if (!start_pte) { + ret = -EFAULT; + goto out; + } for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { int err = insert_page_in_batch_locked(vma, pte, addr, pages[curr_page_idx], prot); @@ -2111,7 +2122,8 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, pte = get_locked_pte(mm, addr, &ptl); if (!pte) return VM_FAULT_OOM; - if (!pte_none(*pte)) { + entry = ptep_get(pte); + if (!pte_none(entry)) { if (mkwrite) { /* * For read faults on private mappings the PFN passed @@ -2123,11 +2135,11 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, * allocation and mapping invalidation so just skip the * update. */ - if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) { - WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte))); + if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) { + WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry))); goto out_unlock; } - entry = pte_mkyoung(*pte); + entry = pte_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, addr, pte, entry, 1)) update_mmu_cache(vma, addr, pte); @@ -2339,7 +2351,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, return -ENOMEM; arch_enter_lazy_mmu_mode(); do { - BUG_ON(!pte_none(*pte)); + BUG_ON(!pte_none(ptep_get(pte))); if (!pfn_modify_allowed(pfn, prot)) { err = -EACCES; break; @@ -2572,15 +2584,15 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, mapped_pte = pte = (mm == &init_mm) ? pte_offset_kernel(pmd, addr) : pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return -EINVAL; } - BUG_ON(pmd_huge(*pmd)); - arch_enter_lazy_mmu_mode(); if (fn) { do { - if (create || !pte_none(*pte)) { + if (create || !pte_none(ptep_get(pte))) { err = fn(pte++, addr, data); if (err) break; @@ -2781,10 +2793,9 @@ static inline int pte_unmap_same(struct vm_fault *vmf) int same = 1; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION) if (sizeof(pte_t) > sizeof(unsigned long)) { - spinlock_t *ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); - spin_lock(ptl); - same = pte_same(*vmf->pte, vmf->orig_pte); - spin_unlock(ptl); + spin_lock(vmf->ptl); + same = pte_same(ptep_get(vmf->pte), vmf->orig_pte); + spin_unlock(vmf->ptl); } #endif pte_unmap(vmf->pte); @@ -2804,7 +2815,6 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src, int ret; void *kaddr; void __user *uaddr; - bool locked = false; struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; unsigned long addr = vmf->address; @@ -2830,17 +2840,18 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src, * On architectures with software "accessed" bits, we would * take a double page fault, so mark it accessed here. */ + vmf->pte = NULL; if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) { pte_t entry; vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); - locked = true; - if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { + if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { /* * Other thread has already handled the fault * and update local tlb only */ - update_mmu_tlb(vma, addr, vmf->pte); + if (vmf->pte) + update_mmu_tlb(vma, addr, vmf->pte); ret = -EAGAIN; goto pte_unlock; } @@ -2857,15 +2868,15 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src, * zeroes. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { - if (locked) + if (vmf->pte) goto warn; /* Re-validate under PTL if the page is still mapped */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); - locked = true; - if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { + if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { /* The PTE changed under us, update local tlb */ - update_mmu_tlb(vma, addr, vmf->pte); + if (vmf->pte) + update_mmu_tlb(vma, addr, vmf->pte); ret = -EAGAIN; goto pte_unlock; } @@ -2888,7 +2899,7 @@ warn: ret = 0; pte_unlock: - if (locked) + if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); kunmap_atomic(kaddr); flush_dcache_page(dst); @@ -3110,7 +3121,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * Re-check the pte - we dropped the lock */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); - if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { + if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { if (old_folio) { if (!folio_test_anon(old_folio)) { dec_mm_counter(mm, mm_counter_file(&old_folio->page)); @@ -3178,19 +3189,20 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) /* Free the old page.. */ new_folio = old_folio; page_copied = 1; - } else { + pte_unmap_unlock(vmf->pte, vmf->ptl); + } else if (vmf->pte) { update_mmu_tlb(vma, vmf->address, vmf->pte); + pte_unmap_unlock(vmf->pte, vmf->ptl); } - if (new_folio) - folio_put(new_folio); - - pte_unmap_unlock(vmf->pte, vmf->ptl); /* * No need to double call mmu_notifier->invalidate_range() callback as * the above ptep_clear_flush_notify() did already call it. */ mmu_notifier_invalidate_range_only_end(&range); + + if (new_folio) + folio_put(new_folio); if (old_folio) { if (page_copied) free_swap_cache(&old_folio->page); @@ -3230,11 +3242,13 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); + if (!vmf->pte) + return VM_FAULT_NOPAGE; /* * We might have raced with another page fault while we released the * pte_offset_map_lock. */ - if (!pte_same(*vmf->pte, vmf->orig_pte)) { + if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) { update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return VM_FAULT_NOPAGE; @@ -3329,7 +3343,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) struct folio *folio = NULL; if (likely(!unshare)) { - if (userfaultfd_pte_wp(vma, *vmf->pte)) { + if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_WP); } @@ -3388,8 +3402,8 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) goto copy; if (!folio_test_lru(folio)) /* - * Note: We cannot easily detect+handle references from - * remote LRU pagevecs or references to LRU folios. + * We cannot easily detect+handle references from + * remote LRU caches or references to LRU folios. */ lru_add_drain(); if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio)) @@ -3591,10 +3605,11 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (likely(pte_same(*vmf->pte, vmf->orig_pte))) + if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte); - pte_unmap_unlock(vmf->pte, vmf->ptl); + if (vmf->pte) + pte_unmap_unlock(vmf->pte, vmf->ptl); folio_unlock(folio); folio_put(folio); @@ -3625,6 +3640,8 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) { vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); + if (!vmf->pte) + return 0; /* * Be careful so that we will only recover a special uffd-wp pte into a * none pte. Otherwise it means the pte could have changed, so retry. @@ -3633,7 +3650,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR. * So is_pte_marker() check is not enough to safely drop the pte. */ - if (pte_same(vmf->orig_pte, *vmf->pte)) + if (pte_same(vmf->orig_pte, ptep_get(vmf->pte))) pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; @@ -3728,10 +3745,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vmf->page = pfn_swap_entry_to_page(entry); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { - spin_unlock(vmf->ptl); - goto out; - } + if (unlikely(!vmf->pte || + !pte_same(ptep_get(vmf->pte), + vmf->orig_pte))) + goto unlock; /* * Get a page reference while we know the page can't be @@ -3807,7 +3824,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (likely(pte_same(*vmf->pte, vmf->orig_pte))) + if (likely(vmf->pte && + pte_same(ptep_get(vmf->pte), vmf->orig_pte))) ret = VM_FAULT_OOM; goto unlock; } @@ -3863,7 +3881,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * If we want to map a page that's in the swapcache writable, we * have to detect via the refcount if we're really the exclusive * owner. Try removing the extra reference from the local LRU - * pagevecs if required. + * caches if required. */ if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache && !folio_test_ksm(folio) && !folio_test_lru(folio)) @@ -3877,7 +3895,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) + if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) goto out_nomap; if (unlikely(!folio_test_uptodate(folio))) { @@ -4003,13 +4021,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, vmf->address, vmf->pte); unlock: - pte_unmap_unlock(vmf->pte, vmf->ptl); + if (vmf->pte) + pte_unmap_unlock(vmf->pte, vmf->ptl); out: if (si) put_swap_device(si); return ret; out_nomap: - pte_unmap_unlock(vmf->pte, vmf->ptl); + if (vmf->pte) + pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: folio_unlock(folio); out_release: @@ -4041,22 +4061,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) return VM_FAULT_SIGBUS; /* - * Use pte_alloc() instead of pte_alloc_map(). We can't run - * pte_offset_map() on pmds where a huge pmd might be created - * from a different thread. - * - * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when - * parallel threads are excluded by other means. - * - * Here we only have mmap_read_lock(mm). + * Use pte_alloc() instead of pte_alloc_map(), so that OOM can + * be distinguished from a transient failure of pte_offset_map(). */ if (pte_alloc(vma->vm_mm, vmf->pmd)) return VM_FAULT_OOM; - /* See comment in handle_pte_fault() */ - if (unlikely(pmd_trans_unstable(vmf->pmd))) - return 0; - /* Use the zero-page for reads */ if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { @@ -4064,6 +4074,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) vma->vm_page_prot)); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); + if (!vmf->pte) + goto unlock; if (vmf_pte_changed(vmf)) { update_mmu_tlb(vma, vmf->address, vmf->pte); goto unlock; @@ -4104,6 +4116,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); + if (!vmf->pte) + goto release; if (vmf_pte_changed(vmf)) { update_mmu_tlb(vma, vmf->address, vmf->pte); goto release; @@ -4131,7 +4145,8 @@ setpte: /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, vmf->address, vmf->pte); unlock: - pte_unmap_unlock(vmf->pte, vmf->ptl); + if (vmf->pte) + pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; release: folio_put(folio); @@ -4325,9 +4340,9 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) static bool vmf_pte_changed(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID) - return !pte_same(*vmf->pte, vmf->orig_pte); + return !pte_same(ptep_get(vmf->pte), vmf->orig_pte); - return !pte_none(*vmf->pte); + return !pte_none(ptep_get(vmf->pte)); } /** @@ -4380,15 +4395,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf) return VM_FAULT_OOM; } - /* - * See comment in handle_pte_fault() for how this scenario happens, we - * need to return NOPAGE so that we drop this page. - */ - if (pmd_devmap_trans_unstable(vmf->pmd)) - return VM_FAULT_NOPAGE; - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); + if (!vmf->pte) + return VM_FAULT_NOPAGE; /* Re-check under ptl */ if (likely(!vmf_pte_changed(vmf))) { @@ -4630,17 +4640,11 @@ static vm_fault_t do_fault(struct vm_fault *vmf) * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ if (!vma->vm_ops->fault) { - /* - * If we find a migration pmd entry or a none pmd entry, which - * should never happen, return SIGBUS - */ - if (unlikely(!pmd_present(*vmf->pmd))) + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (unlikely(!vmf->pte)) ret = VM_FAULT_SIGBUS; else { - vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, - vmf->pmd, - vmf->address, - &vmf->ptl); /* * Make sure this is not a temporary clearing of pte * by holding ptl and checking again. A R/M/W update @@ -4648,7 +4652,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf) * we don't have concurrent modification by hardware * followed by an update. */ - if (unlikely(pte_none(*vmf->pte))) + if (unlikely(pte_none(ptep_get(vmf->pte)))) ret = VM_FAULT_SIGBUS; else ret = VM_FAULT_NOPAGE; @@ -4703,9 +4707,8 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * validation through pte_unmap_same(). It's of NUMA type but * the pfn may be screwed if the read is non atomic. */ - vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd); spin_lock(vmf->ptl); - if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { + if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } @@ -4774,9 +4777,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) flags |= TNF_MIGRATED; } else { flags |= TNF_MIGRATE_FAIL; - vmf->pte = pte_offset_map(vmf->pmd, vmf->address); - spin_lock(vmf->ptl); - if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (unlikely(!vmf->pte)) + goto out; + if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } @@ -4905,38 +4910,18 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID; } else { /* - * If a huge pmd materialized under us just retry later. Use - * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead - * of pmd_trans_huge() to ensure the pmd didn't become - * pmd_trans_huge under us and then back to pmd_none, as a - * result of MADV_DONTNEED running immediately after a huge pmd - * fault in a different thread of this mm, in turn leading to a - * misleading pmd_trans_huge() retval. All we have to ensure is - * that it is a regular pmd that we can walk with - * pte_offset_map() and we can do that through an atomic read - * in C, which is what pmd_trans_unstable() provides. - */ - if (pmd_devmap_trans_unstable(vmf->pmd)) - return 0; - /* * A regular pmd is established and it can't morph into a huge - * pmd from under us anymore at this point because we hold the - * mmap_lock read mode and khugepaged takes it in write mode. - * So now it's safe to run pte_offset_map(). + * pmd by anon khugepaged, since that takes mmap_lock in write + * mode; but shmem or file collapse to THP could still morph + * it into a huge pmd: just retry later if so. */ - vmf->pte = pte_offset_map(vmf->pmd, vmf->address); - vmf->orig_pte = *vmf->pte; + vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (unlikely(!vmf->pte)) + return 0; + vmf->orig_pte = ptep_get_lockless(vmf->pte); vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID; - /* - * some architectures can have larger ptes than wordsize, - * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and - * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic - * accesses. The code below just needs a consistent view - * for the ifs and we later double check anyway with the - * ptl lock held. So here a barrier will do. - */ - barrier(); if (pte_none(vmf->orig_pte)) { pte_unmap(vmf->pte); vmf->pte = NULL; @@ -4952,10 +4937,9 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) return do_numa_page(vmf); - vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); spin_lock(vmf->ptl); entry = vmf->orig_pte; - if (unlikely(!pte_same(*vmf->pte, entry))) { + if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) { update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); goto unlock; } @@ -5060,9 +5044,8 @@ retry_pud: if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - vmf.orig_pmd = *vmf.pmd; + vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); - barrier(); if (unlikely(is_swap_pmd(vmf.orig_pmd))) { VM_BUG_ON(thp_migration_supported() && !is_pmd_migration_entry(vmf.orig_pmd)); @@ -5399,12 +5382,12 @@ retry: if (!vma) goto inval; - /* Only anonymous vmas are supported for now */ - if (!vma_is_anonymous(vma)) + /* Only anonymous and tcp vmas are supported for now */ + if (!vma_is_anonymous(vma) && !vma_is_tcp(vma)) goto inval; /* find_mergeable_anon_vma uses adjacent vmas which are not locked */ - if (!vma->anon_vma) + if (!vma->anon_vma && !vma_is_tcp(vma)) goto inval; if (!vma_start_read(vma)) @@ -5558,11 +5541,10 @@ int follow_pte(struct mm_struct *mm, unsigned long address, pmd = pmd_offset(pud, address); VM_BUG_ON(pmd_trans_huge(*pmd)); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - goto out; - ptep = pte_offset_map_lock(mm, pmd, address, ptlp); - if (!pte_present(*ptep)) + if (!ptep) + goto out; + if (!pte_present(ptep_get(ptep))) goto unlock; *ptepp = ptep; return 0; @@ -5599,7 +5581,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address, ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); if (ret) return ret; - *pfn = pte_pfn(*ptep); + *pfn = pte_pfn(ptep_get(ptep)); pte_unmap_unlock(ptep, ptl); return 0; } @@ -5619,7 +5601,7 @@ int follow_phys(struct vm_area_struct *vma, if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) goto out; - pte = *ptep; + pte = ptep_get(ptep); if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; @@ -5663,7 +5645,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, retry: if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) return -EINVAL; - pte = *ptep; + pte = ptep_get(ptep); pte_unmap_unlock(ptep, ptl); prot = pgprot_val(pte_pgprot(pte)); @@ -5679,7 +5661,7 @@ retry: if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) goto out_unmap; - if (!pte_same(pte, *ptep)) { + if (!pte_same(pte, ptep_get(ptep))) { pte_unmap_unlock(ptep, ptl); iounmap(maddr); @@ -5706,47 +5688,47 @@ EXPORT_SYMBOL_GPL(generic_access_phys); int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { - struct vm_area_struct *vma; void *old_buf = buf; int write = gup_flags & FOLL_WRITE; if (mmap_read_lock_killable(mm)) return 0; - /* We might need to expand the stack to access it */ - vma = vma_lookup(mm, addr); - if (!vma) { - vma = expand_stack(mm, addr); - if (!vma) - return 0; - } - /* ignore errors, just check how much was successfully transferred */ while (len) { - int bytes, ret, offset; + int bytes, offset; void *maddr; - struct page *page = NULL; + struct vm_area_struct *vma = NULL; + struct page *page = get_user_page_vma_remote(mm, addr, + gup_flags, &vma); + + if (IS_ERR_OR_NULL(page)) { + /* We might need to expand the stack to access it */ + vma = vma_lookup(mm, addr); + if (!vma) { + vma = expand_stack(mm, addr); + + /* mmap_lock was dropped on failure */ + if (!vma) + return buf - old_buf; + + /* Try again if stack expansion worked */ + continue; + } + - ret = get_user_pages_remote(mm, addr, 1, - gup_flags, &page, &vma, NULL); - if (ret <= 0) { -#ifndef CONFIG_HAVE_IOREMAP_PROT - break; -#else /* * Check if this is a VM_IO | VM_PFNMAP VMA, which * we can access using slightly different code. */ - vma = vma_lookup(mm, addr); - if (!vma) - break; + bytes = 0; +#ifdef CONFIG_HAVE_IOREMAP_PROT if (vma->vm_ops && vma->vm_ops->access) - ret = vma->vm_ops->access(vma, addr, buf, - len, write); - if (ret <= 0) - break; - bytes = ret; + bytes = vma->vm_ops->access(vma, addr, buf, + len, write); #endif + if (bytes <= 0) + break; } else { bytes = len; offset = addr & (PAGE_SIZE-1); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8e0fa209d533..3f231cf1b410 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -13,7 +13,6 @@ #include <linux/pagemap.h> #include <linux/compiler.h> #include <linux/export.h> -#include <linux/pagevec.h> #include <linux/writeback.h> #include <linux/slab.h> #include <linux/sysctl.h> @@ -325,7 +324,7 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, } if (check_pfn_span(pfn, nr_pages)) { - WARN(1, "Misaligned %s start: %#lx end: #%lx\n", __func__, pfn, pfn + nr_pages - 1); + WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1); return -EINVAL; } @@ -492,18 +491,6 @@ void __ref remove_pfn_range_from_zone(struct zone *zone, set_zone_contiguous(zone); } -static void __remove_section(unsigned long pfn, unsigned long nr_pages, - unsigned long map_offset, - struct vmem_altmap *altmap) -{ - struct mem_section *ms = __pfn_to_section(pfn); - - if (WARN_ON_ONCE(!valid_section(ms))) - return; - - sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap); -} - /** * __remove_pages() - remove sections of pages * @pfn: starting pageframe (must be aligned to start of a section) @@ -520,12 +507,9 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages, { const unsigned long end_pfn = pfn + nr_pages; unsigned long cur_nr_pages; - unsigned long map_offset = 0; - - map_offset = vmem_altmap_offset(altmap); if (check_pfn_span(pfn, nr_pages)) { - WARN(1, "Misaligned %s start: %#lx end: #%lx\n", __func__, pfn, pfn + nr_pages - 1); + WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1); return; } @@ -534,8 +518,7 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages, /* Select all remaining pages up to the next section boundary */ cur_nr_pages = min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn); - __remove_section(pfn, cur_nr_pages, map_offset, altmap); - map_offset = 0; + sparse_remove_section(pfn, cur_nr_pages, altmap); } } @@ -1172,16 +1155,6 @@ failed_addition: return ret; } -static void reset_node_present_pages(pg_data_t *pgdat) -{ - struct zone *z; - - for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) - z->present_pages = 0; - - pgdat->node_present_pages = 0; -} - /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ static pg_data_t __ref *hotadd_init_pgdat(int nid) { @@ -1204,15 +1177,6 @@ static pg_data_t __ref *hotadd_init_pgdat(int nid) */ build_all_zonelists(pgdat); - /* - * When memory is hot-added, all the memory is in offline state. So - * clear all zones' present_pages because they will be updated in - * online_pages() and offline_pages(). - * TODO: should be in free_area_init_core_hotplug? - */ - reset_node_managed_pages(pgdat); - reset_node_present_pages(pgdat); - return pgdat; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1756389a0609..edc25195f5bd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -508,20 +508,23 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long flags = qp->flags; bool has_unmovable = false; pte_t *pte, *mapped_pte; + pte_t ptent; spinlock_t *ptl; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) return queue_folios_pmd(pmd, ptl, addr, end, walk); - if (pmd_trans_unstable(pmd)) - return 0; - mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!pte) { + walk->action = ACTION_AGAIN; + return 0; + } for (; addr != end; pte++, addr += PAGE_SIZE) { - if (!pte_present(*pte)) + ptent = ptep_get(pte); + if (!pte_present(ptent)) continue; - folio = vm_normal_folio(vma, addr, *pte); + folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; /* @@ -1195,24 +1198,22 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, * list of pages handed to migrate_pages()--which is how we get here-- * is in virtual address order. */ -static struct page *new_page(struct page *page, unsigned long start) +static struct folio *new_folio(struct folio *src, unsigned long start) { - struct folio *dst, *src = page_folio(page); struct vm_area_struct *vma; unsigned long address; VMA_ITERATOR(vmi, current->mm, start); gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL; for_each_vma(vmi, vma) { - address = page_address_in_vma(page, vma); + address = page_address_in_vma(&src->page, vma); if (address != -EFAULT) break; } if (folio_test_hugetlb(src)) { - dst = alloc_hugetlb_folio_vma(folio_hstate(src), + return alloc_hugetlb_folio_vma(folio_hstate(src), vma, address); - return &dst->page; } if (folio_test_large(src)) @@ -1221,9 +1222,8 @@ static struct page *new_page(struct page *page, unsigned long start) /* * if !vma, vma_alloc_folio() will use task or system default policy */ - dst = vma_alloc_folio(gfp, folio_order(src), vma, address, + return vma_alloc_folio(gfp, folio_order(src), vma, address, folio_test_large(src)); - return &dst->page; } #else @@ -1239,7 +1239,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, return -ENOSYS; } -static struct page *new_page(struct page *page, unsigned long start) +static struct folio *new_folio(struct folio *src, unsigned long start) { return NULL; } @@ -1334,7 +1334,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); - nr_failed = migrate_pages(&pagelist, new_page, NULL, + nr_failed = migrate_pages(&pagelist, new_folio, NULL, start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); if (nr_failed) putback_movable_pages(&pagelist); diff --git a/mm/migrate.c b/mm/migrate.c index 01cac26a3127..24baad2571e3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -21,7 +21,6 @@ #include <linux/buffer_head.h> #include <linux/mm_inline.h> #include <linux/nsproxy.h> -#include <linux/pagevec.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/topology.h> @@ -188,6 +187,7 @@ static bool remove_migration_pte(struct folio *folio, while (page_vma_mapped_walk(&pvmw)) { rmap_t rmap_flags = RMAP_NONE; + pte_t old_pte; pte_t pte; swp_entry_t entry; struct page *new; @@ -210,17 +210,18 @@ static bool remove_migration_pte(struct folio *folio, folio_get(folio); pte = mk_pte(new, READ_ONCE(vma->vm_page_prot)); - if (pte_swp_soft_dirty(*pvmw.pte)) + old_pte = ptep_get(pvmw.pte); + if (pte_swp_soft_dirty(old_pte)) pte = pte_mksoft_dirty(pte); - entry = pte_to_swp_entry(*pvmw.pte); + entry = pte_to_swp_entry(old_pte); if (!is_migration_entry_young(entry)) pte = pte_mkold(pte); if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) pte = pte_mkdirty(pte); if (is_writable_migration_entry(entry)) pte = pte_mkwrite(pte); - else if (pte_swp_uffd_wp(*pvmw.pte)) + else if (pte_swp_uffd_wp(old_pte)) pte = pte_mkuffd_wp(pte); if (folio_test_anon(folio) && !is_readable_migration_entry(entry)) @@ -234,9 +235,9 @@ static bool remove_migration_pte(struct folio *folio, entry = make_readable_device_private_entry( page_to_pfn(new)); pte = swp_entry_to_pte(entry); - if (pte_swp_soft_dirty(*pvmw.pte)) + if (pte_swp_soft_dirty(old_pte)) pte = pte_swp_mksoft_dirty(pte); - if (pte_swp_uffd_wp(*pvmw.pte)) + if (pte_swp_uffd_wp(old_pte)) pte = pte_swp_mkuffd_wp(pte); } @@ -296,14 +297,21 @@ void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked) * get to the page and wait until migration is finished. * When we return from this function the fault will be retried. */ -void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, - spinlock_t *ptl) +void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) { + spinlock_t *ptl; + pte_t *ptep; pte_t pte; swp_entry_t entry; - spin_lock(ptl); - pte = *ptep; + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + if (!ptep) + return; + + pte = ptep_get(ptep); + pte_unmap(ptep); + if (!is_swap_pte(pte)) goto out; @@ -311,18 +319,10 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, if (!is_migration_entry(entry)) goto out; - migration_entry_wait_on_locked(entry, ptep, ptl); + migration_entry_wait_on_locked(entry, ptl); return; out: - pte_unmap_unlock(ptep, ptl); -} - -void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, - unsigned long address) -{ - spinlock_t *ptl = pte_lockptr(mm, pmd); - pte_t *ptep = pte_offset_map(pmd, address); - __migration_entry_wait(mm, ptep, ptl); + spin_unlock(ptl); } #ifdef CONFIG_HUGETLB_PAGE @@ -332,9 +332,9 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, * * This function will release the vma lock before returning. */ -void __migration_entry_wait_huge(struct vm_area_struct *vma, - pte_t *ptep, spinlock_t *ptl) +void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *ptep) { + spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, ptep); pte_t pte; hugetlb_vma_assert_locked(vma); @@ -352,16 +352,9 @@ void __migration_entry_wait_huge(struct vm_area_struct *vma, * lock release in migration_entry_wait_on_locked(). */ hugetlb_vma_unlock_read(vma); - migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl); + migration_entry_wait_on_locked(pte_to_swp_entry(pte), ptl); } } - -void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) -{ - spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte); - - __migration_entry_wait_huge(vma, pte, ptl); -} #endif #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION @@ -372,7 +365,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) ptl = pmd_lock(mm, pmd); if (!is_pmd_migration_entry(*pmd)) goto unlock; - migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), NULL, ptl); + migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), ptl); return; unlock: spin_unlock(ptl); @@ -492,6 +485,11 @@ int folio_migrate_mapping(struct address_space *mapping, if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) { __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr); __mod_lruvec_state(new_lruvec, NR_SHMEM, nr); + + if (folio_test_pmd_mappable(folio)) { + __mod_lruvec_state(old_lruvec, NR_SHMEM_THPS, -nr); + __mod_lruvec_state(new_lruvec, NR_SHMEM_THPS, nr); + } } #ifdef CONFIG_SWAP if (folio_test_swapcache(folio)) { @@ -692,37 +690,32 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head, enum migrate_mode mode) { struct buffer_head *bh = head; + struct buffer_head *failed_bh; - /* Simple case, sync compaction */ - if (mode != MIGRATE_ASYNC) { - do { - lock_buffer(bh); - bh = bh->b_this_page; - - } while (bh != head); - - return true; - } - - /* async case, we cannot block on lock_buffer so use trylock_buffer */ do { if (!trylock_buffer(bh)) { - /* - * We failed to lock the buffer and cannot stall in - * async migration. Release the taken locks - */ - struct buffer_head *failed_bh = bh; - bh = head; - while (bh != failed_bh) { - unlock_buffer(bh); - bh = bh->b_this_page; - } - return false; + if (mode == MIGRATE_ASYNC) + goto unlock; + if (mode == MIGRATE_SYNC_LIGHT && !buffer_uptodate(bh)) + goto unlock; + lock_buffer(bh); } bh = bh->b_this_page; } while (bh != head); + return true; + +unlock: + /* We failed to lock the buffer and cannot stall. */ + failed_bh = bh; + bh = head; + while (bh != failed_bh) { + unlock_buffer(bh); + bh = bh->b_this_page; + } + + return false; } static int __buffer_migrate_folio(struct address_space *mapping, @@ -1072,15 +1065,13 @@ static void migrate_folio_undo_src(struct folio *src, } /* Restore the destination folio to the original state upon failure */ -static void migrate_folio_undo_dst(struct folio *dst, - bool locked, - free_page_t put_new_page, - unsigned long private) +static void migrate_folio_undo_dst(struct folio *dst, bool locked, + free_folio_t put_new_folio, unsigned long private) { if (locked) folio_unlock(dst); - if (put_new_page) - put_new_page(&dst->page, private); + if (put_new_folio) + put_new_folio(dst, private); else folio_put(dst); } @@ -1104,14 +1095,13 @@ static void migrate_folio_done(struct folio *src, } /* Obtain the lock on page, remove all ptes. */ -static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page, - unsigned long private, struct folio *src, - struct folio **dstp, enum migrate_mode mode, - enum migrate_reason reason, struct list_head *ret) +static int migrate_folio_unmap(new_folio_t get_new_folio, + free_folio_t put_new_folio, unsigned long private, + struct folio *src, struct folio **dstp, enum migrate_mode mode, + enum migrate_reason reason, struct list_head *ret) { struct folio *dst; int rc = -EAGAIN; - struct page *newpage = NULL; int page_was_mapped = 0; struct anon_vma *anon_vma = NULL; bool is_lru = !__PageMovable(&src->page); @@ -1128,10 +1118,9 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page return MIGRATEPAGE_SUCCESS; } - newpage = get_new_page(&src->page, private); - if (!newpage) + dst = get_new_folio(src, private); + if (!dst) return -ENOMEM; - dst = page_folio(newpage); *dstp = dst; dst->private = NULL; @@ -1156,6 +1145,14 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page if (current->flags & PF_MEMALLOC) goto out; + /* + * In "light" mode, we can wait for transient locks (eg + * inserting a page into the page table), but it's not + * worth waiting for I/O. + */ + if (mode == MIGRATE_SYNC_LIGHT && !folio_test_uptodate(src)) + goto out; + folio_lock(src); } locked = true; @@ -1251,13 +1248,13 @@ out: ret = NULL; migrate_folio_undo_src(src, page_was_mapped, anon_vma, locked, ret); - migrate_folio_undo_dst(dst, dst_locked, put_new_page, private); + migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private); return rc; } /* Migrate the folio to the newly allocated folio in dst. */ -static int migrate_folio_move(free_page_t put_new_page, unsigned long private, +static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, struct folio *src, struct folio *dst, enum migrate_mode mode, enum migrate_reason reason, struct list_head *ret) @@ -1329,7 +1326,7 @@ out: } migrate_folio_undo_src(src, page_was_mapped, anon_vma, true, ret); - migrate_folio_undo_dst(dst, true, put_new_page, private); + migrate_folio_undo_dst(dst, true, put_new_folio, private); return rc; } @@ -1352,16 +1349,14 @@ out: * because then pte is replaced with migration swap entry and direct I/O code * will wait in the page fault for migration to complete. */ -static int unmap_and_move_huge_page(new_page_t get_new_page, - free_page_t put_new_page, unsigned long private, - struct page *hpage, int force, - enum migrate_mode mode, int reason, - struct list_head *ret) +static int unmap_and_move_huge_page(new_folio_t get_new_folio, + free_folio_t put_new_folio, unsigned long private, + struct folio *src, int force, enum migrate_mode mode, + int reason, struct list_head *ret) { - struct folio *dst, *src = page_folio(hpage); + struct folio *dst; int rc = -EAGAIN; int page_was_mapped = 0; - struct page *new_hpage; struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; @@ -1371,10 +1366,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, return MIGRATEPAGE_SUCCESS; } - new_hpage = get_new_page(hpage, private); - if (!new_hpage) + dst = get_new_folio(src, private); + if (!dst) return -ENOMEM; - dst = page_folio(new_hpage); if (!folio_trylock(src)) { if (!force) @@ -1415,7 +1409,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, * semaphore in write mode here and set TTU_RMAP_LOCKED * to let lower levels know we have taken the lock. */ - mapping = hugetlb_page_mapping_lock_write(hpage); + mapping = hugetlb_page_mapping_lock_write(&src->page); if (unlikely(!mapping)) goto unlock_put_anon; @@ -1445,7 +1439,7 @@ put_anon: if (rc == MIGRATEPAGE_SUCCESS) { move_hugetlb_state(src, dst, reason); - put_new_page = NULL; + put_new_folio = NULL; } out_unlock: @@ -1461,8 +1455,8 @@ out: * it. Otherwise, put_page() will drop the reference grabbed during * isolation. */ - if (put_new_page) - put_new_page(new_hpage, private); + if (put_new_folio) + put_new_folio(dst, private); else folio_putback_active_hugetlb(dst); @@ -1509,8 +1503,8 @@ struct migrate_pages_stats { * exist any more. It is caller's responsibility to call putback_movable_pages() * only if ret != 0. */ -static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page, - free_page_t put_new_page, unsigned long private, +static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio, + free_folio_t put_new_folio, unsigned long private, enum migrate_mode mode, int reason, struct migrate_pages_stats *stats, struct list_head *ret_folios) @@ -1548,9 +1542,9 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page, continue; } - rc = unmap_and_move_huge_page(get_new_page, - put_new_page, private, - &folio->page, pass > 2, mode, + rc = unmap_and_move_huge_page(get_new_folio, + put_new_folio, private, + folio, pass > 2, mode, reason, ret_folios); /* * The rules are: @@ -1607,20 +1601,17 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page, * deadlock (e.g., for loop device). So, if mode != MIGRATE_ASYNC, the * length of the from list must be <= 1. */ -static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, - free_page_t put_new_page, unsigned long private, - enum migrate_mode mode, int reason, struct list_head *ret_folios, - struct list_head *split_folios, struct migrate_pages_stats *stats, - int nr_pass) +static int migrate_pages_batch(struct list_head *from, + new_folio_t get_new_folio, free_folio_t put_new_folio, + unsigned long private, enum migrate_mode mode, int reason, + struct list_head *ret_folios, struct list_head *split_folios, + struct migrate_pages_stats *stats, int nr_pass) { int retry = 1; - int large_retry = 1; int thp_retry = 1; int nr_failed = 0; int nr_retry_pages = 0; - int nr_large_failed = 0; int pass = 0; - bool is_large = false; bool is_thp = false; struct folio *folio, *folio2, *dst = NULL, *dst2; int rc, rc_saved = 0, nr_pages; @@ -1631,20 +1622,13 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC && !list_empty(from) && !list_is_singular(from)); - for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) { + for (pass = 0; pass < nr_pass && retry; pass++) { retry = 0; - large_retry = 0; thp_retry = 0; nr_retry_pages = 0; list_for_each_entry_safe(folio, folio2, from, lru) { - /* - * Large folio statistics is based on the source large - * folio. Capture required information that might get - * lost during migration. - */ - is_large = folio_test_large(folio); - is_thp = is_large && folio_test_pmd_mappable(folio); + is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio); nr_pages = folio_nr_pages(folio); cond_resched(); @@ -1660,7 +1644,7 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, * list is processed. */ if (!thp_migration_supported() && is_thp) { - nr_large_failed++; + nr_failed++; stats->nr_thp_failed++; if (!try_split_folio(folio, split_folios)) { stats->nr_thp_split++; @@ -1671,8 +1655,9 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, continue; } - rc = migrate_folio_unmap(get_new_page, put_new_page, private, - folio, &dst, mode, reason, ret_folios); + rc = migrate_folio_unmap(get_new_folio, put_new_folio, + private, folio, &dst, mode, reason, + ret_folios); /* * The rules are: * Success: folio will be freed @@ -1688,38 +1673,33 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, * When memory is low, don't bother to try to migrate * other folios, move unmapped folios, then exit. */ - if (is_large) { - nr_large_failed++; - stats->nr_thp_failed += is_thp; - /* Large folio NUMA faulting doesn't split to retry. */ - if (!nosplit) { - int ret = try_split_folio(folio, split_folios); - - if (!ret) { - stats->nr_thp_split += is_thp; - break; - } else if (reason == MR_LONGTERM_PIN && - ret == -EAGAIN) { - /* - * Try again to split large folio to - * mitigate the failure of longterm pinning. - */ - large_retry++; - thp_retry += is_thp; - nr_retry_pages += nr_pages; - /* Undo duplicated failure counting. */ - nr_large_failed--; - stats->nr_thp_failed -= is_thp; - break; - } + nr_failed++; + stats->nr_thp_failed += is_thp; + /* Large folio NUMA faulting doesn't split to retry. */ + if (folio_test_large(folio) && !nosplit) { + int ret = try_split_folio(folio, split_folios); + + if (!ret) { + stats->nr_thp_split += is_thp; + break; + } else if (reason == MR_LONGTERM_PIN && + ret == -EAGAIN) { + /* + * Try again to split large folio to + * mitigate the failure of longterm pinning. + */ + retry++; + thp_retry += is_thp; + nr_retry_pages += nr_pages; + /* Undo duplicated failure counting. */ + nr_failed--; + stats->nr_thp_failed -= is_thp; + break; } - } else { - nr_failed++; } stats->nr_failed_pages += nr_pages + nr_retry_pages; /* nr_failed isn't updated for not used */ - nr_large_failed += large_retry; stats->nr_thp_failed += thp_retry; rc_saved = rc; if (list_empty(&unmap_folios)) @@ -1727,12 +1707,8 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, else goto move; case -EAGAIN: - if (is_large) { - large_retry++; - thp_retry += is_thp; - } else { - retry++; - } + retry++; + thp_retry += is_thp; nr_retry_pages += nr_pages; break; case MIGRATEPAGE_SUCCESS: @@ -1750,20 +1726,14 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, * removed from migration folio list and not * retried in the next outer loop. */ - if (is_large) { - nr_large_failed++; - stats->nr_thp_failed += is_thp; - } else { - nr_failed++; - } - + nr_failed++; + stats->nr_thp_failed += is_thp; stats->nr_failed_pages += nr_pages; break; } } } nr_failed += retry; - nr_large_failed += large_retry; stats->nr_thp_failed += thp_retry; stats->nr_failed_pages += nr_retry_pages; move: @@ -1771,22 +1741,20 @@ move: try_to_unmap_flush(); retry = 1; - for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) { + for (pass = 0; pass < nr_pass && retry; pass++) { retry = 0; - large_retry = 0; thp_retry = 0; nr_retry_pages = 0; dst = list_first_entry(&dst_folios, struct folio, lru); dst2 = list_next_entry(dst, lru); list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) { - is_large = folio_test_large(folio); - is_thp = is_large && folio_test_pmd_mappable(folio); + is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio); nr_pages = folio_nr_pages(folio); cond_resched(); - rc = migrate_folio_move(put_new_page, private, + rc = migrate_folio_move(put_new_folio, private, folio, dst, mode, reason, ret_folios); /* @@ -1797,12 +1765,8 @@ move: */ switch(rc) { case -EAGAIN: - if (is_large) { - large_retry++; - thp_retry += is_thp; - } else { - retry++; - } + retry++; + thp_retry += is_thp; nr_retry_pages += nr_pages; break; case MIGRATEPAGE_SUCCESS: @@ -1810,13 +1774,8 @@ move: stats->nr_thp_succeeded += is_thp; break; default: - if (is_large) { - nr_large_failed++; - stats->nr_thp_failed += is_thp; - } else { - nr_failed++; - } - + nr_failed++; + stats->nr_thp_failed += is_thp; stats->nr_failed_pages += nr_pages; break; } @@ -1825,14 +1784,10 @@ move: } } nr_failed += retry; - nr_large_failed += large_retry; stats->nr_thp_failed += thp_retry; stats->nr_failed_pages += nr_retry_pages; - if (rc_saved) - rc = rc_saved; - else - rc = nr_failed + nr_large_failed; + rc = rc_saved ? : nr_failed; out: /* Cleanup remaining folios */ dst = list_first_entry(&dst_folios, struct folio, lru); @@ -1845,7 +1800,7 @@ out: migrate_folio_undo_src(folio, page_was_mapped, anon_vma, true, ret_folios); list_del(&dst->lru); - migrate_folio_undo_dst(dst, true, put_new_page, private); + migrate_folio_undo_dst(dst, true, put_new_folio, private); dst = dst2; dst2 = list_next_entry(dst, lru); } @@ -1853,10 +1808,11 @@ out: return rc; } -static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page, - free_page_t put_new_page, unsigned long private, - enum migrate_mode mode, int reason, struct list_head *ret_folios, - struct list_head *split_folios, struct migrate_pages_stats *stats) +static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio, + free_folio_t put_new_folio, unsigned long private, + enum migrate_mode mode, int reason, + struct list_head *ret_folios, struct list_head *split_folios, + struct migrate_pages_stats *stats) { int rc, nr_failed = 0; LIST_HEAD(folios); @@ -1864,7 +1820,7 @@ static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page, memset(&astats, 0, sizeof(astats)); /* Try to migrate in batch with MIGRATE_ASYNC mode firstly */ - rc = migrate_pages_batch(from, get_new_page, put_new_page, private, MIGRATE_ASYNC, + rc = migrate_pages_batch(from, get_new_folio, put_new_folio, private, MIGRATE_ASYNC, reason, &folios, split_folios, &astats, NR_MAX_MIGRATE_ASYNC_RETRY); stats->nr_succeeded += astats.nr_succeeded; @@ -1886,7 +1842,7 @@ static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page, list_splice_tail_init(&folios, from); while (!list_empty(from)) { list_move(from->next, &folios); - rc = migrate_pages_batch(&folios, get_new_page, put_new_page, + rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio, private, mode, reason, ret_folios, split_folios, stats, NR_MAX_MIGRATE_SYNC_RETRY); list_splice_tail_init(&folios, ret_folios); @@ -1903,11 +1859,11 @@ static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page, * supplied as the target for the page migration * * @from: The list of folios to be migrated. - * @get_new_page: The function used to allocate free folios to be used + * @get_new_folio: The function used to allocate free folios to be used * as the target of the folio migration. - * @put_new_page: The function used to free target folios if migration + * @put_new_folio: The function used to free target folios if migration * fails, or NULL if no special handling is necessary. - * @private: Private data to be passed on to get_new_page() + * @private: Private data to be passed on to get_new_folio() * @mode: The migration mode that specifies the constraints for * folio migration, if any. * @reason: The reason for folio migration. @@ -1924,8 +1880,8 @@ static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page, * considered as the number of non-migrated large folio, no matter how many * split folios of the large folio are migrated successfully. */ -int migrate_pages(struct list_head *from, new_page_t get_new_page, - free_page_t put_new_page, unsigned long private, +int migrate_pages(struct list_head *from, new_folio_t get_new_folio, + free_folio_t put_new_folio, unsigned long private, enum migrate_mode mode, int reason, unsigned int *ret_succeeded) { int rc, rc_gather; @@ -1940,7 +1896,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, memset(&stats, 0, sizeof(stats)); - rc_gather = migrate_hugetlbs(from, get_new_page, put_new_page, private, + rc_gather = migrate_hugetlbs(from, get_new_folio, put_new_folio, private, mode, reason, &stats, &ret_folios); if (rc_gather < 0) goto out; @@ -1963,12 +1919,14 @@ again: else list_splice_init(from, &folios); if (mode == MIGRATE_ASYNC) - rc = migrate_pages_batch(&folios, get_new_page, put_new_page, private, - mode, reason, &ret_folios, &split_folios, &stats, - NR_MAX_MIGRATE_PAGES_RETRY); + rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio, + private, mode, reason, &ret_folios, + &split_folios, &stats, + NR_MAX_MIGRATE_PAGES_RETRY); else - rc = migrate_pages_sync(&folios, get_new_page, put_new_page, private, - mode, reason, &ret_folios, &split_folios, &stats); + rc = migrate_pages_sync(&folios, get_new_folio, put_new_folio, + private, mode, reason, &ret_folios, + &split_folios, &stats); list_splice_tail_init(&folios, &ret_folios); if (rc < 0) { rc_gather = rc; @@ -1981,8 +1939,9 @@ again: * is counted as 1 failure already. And, we only try to migrate * with minimal effort, force MIGRATE_ASYNC mode and retry once. */ - migrate_pages_batch(&split_folios, get_new_page, put_new_page, private, - MIGRATE_ASYNC, reason, &ret_folios, NULL, &stats, 1); + migrate_pages_batch(&split_folios, get_new_folio, + put_new_folio, private, MIGRATE_ASYNC, reason, + &ret_folios, NULL, &stats, 1); list_splice_tail_init(&split_folios, &ret_folios); } rc_gather += rc; @@ -2017,14 +1976,11 @@ out: return rc_gather; } -struct page *alloc_migration_target(struct page *page, unsigned long private) +struct folio *alloc_migration_target(struct folio *src, unsigned long private) { - struct folio *folio = page_folio(page); struct migration_target_control *mtc; gfp_t gfp_mask; unsigned int order = 0; - struct folio *hugetlb_folio = NULL; - struct folio *new_folio = NULL; int nid; int zidx; @@ -2032,33 +1988,30 @@ struct page *alloc_migration_target(struct page *page, unsigned long private) gfp_mask = mtc->gfp_mask; nid = mtc->nid; if (nid == NUMA_NO_NODE) - nid = folio_nid(folio); + nid = folio_nid(src); - if (folio_test_hugetlb(folio)) { - struct hstate *h = folio_hstate(folio); + if (folio_test_hugetlb(src)) { + struct hstate *h = folio_hstate(src); gfp_mask = htlb_modify_alloc_mask(h, gfp_mask); - hugetlb_folio = alloc_hugetlb_folio_nodemask(h, nid, + return alloc_hugetlb_folio_nodemask(h, nid, mtc->nmask, gfp_mask); - return &hugetlb_folio->page; } - if (folio_test_large(folio)) { + if (folio_test_large(src)) { /* * clear __GFP_RECLAIM to make the migration callback * consistent with regular THP allocations. */ gfp_mask &= ~__GFP_RECLAIM; gfp_mask |= GFP_TRANSHUGE; - order = folio_order(folio); + order = folio_order(src); } - zidx = zone_idx(folio_zone(folio)); + zidx = zone_idx(folio_zone(src)); if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE) gfp_mask |= __GFP_HIGHMEM; - new_folio = __folio_alloc(gfp_mask, order, nid, mtc->nmask); - - return &new_folio->page; + return __folio_alloc(gfp_mask, order, nid, mtc->nmask); } #ifdef CONFIG_NUMA @@ -2509,13 +2462,12 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, return false; } -static struct page *alloc_misplaced_dst_page(struct page *page, +static struct folio *alloc_misplaced_dst_folio(struct folio *src, unsigned long data) { int nid = (int) data; - int order = compound_order(page); + int order = folio_order(src); gfp_t gfp = __GFP_THISNODE; - struct folio *new; if (order > 0) gfp |= GFP_TRANSHUGE_LIGHT; @@ -2524,9 +2476,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, __GFP_NOWARN; gfp &= ~__GFP_RECLAIM; } - new = __folio_alloc_node(gfp, order, nid); - - return &new->page; + return __folio_alloc_node(gfp, order, nid); } static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) @@ -2604,7 +2554,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, goto out; list_add(&page->lru, &migratepages); - nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, + nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio, NULL, node, MIGRATE_ASYNC, MR_NUMA_MISPLACED, &nr_succeeded); if (nr_remaining) { diff --git a/mm/migrate_device.c b/mm/migrate_device.c index d30c9de60b0d..8365158460ed 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -83,9 +83,6 @@ again: if (is_huge_zero_page(page)) { spin_unlock(ptl); split_huge_pmd(vma, pmdp, addr); - if (pmd_trans_unstable(pmdp)) - return migrate_vma_collect_skip(start, end, - walk); } else { int ret; @@ -100,16 +97,12 @@ again: if (ret) return migrate_vma_collect_skip(start, end, walk); - if (pmd_none(*pmdp)) - return migrate_vma_collect_hole(start, end, -1, - walk); } } - if (unlikely(pmd_bad(*pmdp))) - return migrate_vma_collect_skip(start, end, walk); - ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; arch_enter_lazy_mmu_mode(); for (; addr < end; addr += PAGE_SIZE, ptep++) { @@ -118,7 +111,7 @@ again: swp_entry_t entry; pte_t pte; - pte = *ptep; + pte = ptep_get(ptep); if (pte_none(pte)) { if (vma_is_anonymous(vma)) { @@ -201,7 +194,7 @@ again: bool anon_exclusive; pte_t swp_pte; - flush_cache_page(vma, addr, pte_pfn(*ptep)); + flush_cache_page(vma, addr, pte_pfn(pte)); anon_exclusive = PageAnon(page) && PageAnonExclusive(page); if (anon_exclusive) { pte = ptep_clear_flush(vma, addr, ptep); @@ -383,7 +376,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, /* ZONE_DEVICE pages are not on LRU */ if (!is_zone_device_page(page)) { if (!PageLRU(page) && allow_drain) { - /* Drain CPU's pagevec */ + /* Drain CPU's lru cache */ lru_add_drain_all(); allow_drain = false; } @@ -580,6 +573,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, pud_t *pudp; pmd_t *pmdp; pte_t *ptep; + pte_t orig_pte; /* Only allow populating anonymous memory */ if (!vma_is_anonymous(vma)) @@ -595,27 +589,10 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, pmdp = pmd_alloc(mm, pudp, addr); if (!pmdp) goto abort; - if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) goto abort; - - /* - * Use pte_alloc() instead of pte_alloc_map(). We can't run - * pte_offset_map() on pmds where a huge pmd might be created - * from a different thread. - * - * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when - * parallel threads are excluded by other means. - * - * Here we only have mmap_read_lock(mm). - */ if (pte_alloc(mm, pmdp)) goto abort; - - /* See the comment in pte_alloc_one_map() */ - if (unlikely(pmd_trans_unstable(pmdp))) - goto abort; - if (unlikely(anon_vma_prepare(vma))) goto abort; if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL)) @@ -650,17 +627,20 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, } ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto abort; + orig_pte = ptep_get(ptep); if (check_stable_address_space(mm)) goto unlock_abort; - if (pte_present(*ptep)) { - unsigned long pfn = pte_pfn(*ptep); + if (pte_present(orig_pte)) { + unsigned long pfn = pte_pfn(orig_pte); if (!is_zero_pfn(pfn)) goto unlock_abort; flush = true; - } else if (!pte_none(*ptep)) + } else if (!pte_none(orig_pte)) goto unlock_abort; /* @@ -677,7 +657,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, get_page(page); if (flush) { - flush_cache_page(vma, addr, pte_pfn(*ptep)); + flush_cache_page(vma, addr, pte_pfn(orig_pte)); ptep_clear_flush_notify(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, entry); update_mmu_cache(vma, addr, ptep); diff --git a/mm/mincore.c b/mm/mincore.c index 2d5be013a25a..b7f7a516b26c 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -113,14 +113,13 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, goto out; } - if (pmd_trans_unstable(pmd)) { - __mincore_unmapped_range(addr, end, vma, vec); - goto out; - } - ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!ptep) { + walk->action = ACTION_AGAIN; + return 0; + } for (; addr != end; ptep++, addr += PAGE_SIZE) { - pte_t pte = *ptep; + pte_t pte = ptep_get(ptep); /* We need to do cache lookup too for pte markers */ if (pte_none_mostly(pte)) diff --git a/mm/mlock.c b/mm/mlock.c index 40b43f8740df..d7db94519884 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -312,6 +312,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *start_pte, *pte; + pte_t ptent; struct folio *folio; ptl = pmd_trans_huge_lock(pmd, vma); @@ -329,10 +330,15 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, } start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (!start_pte) { + walk->action = ACTION_AGAIN; + return 0; + } for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) { - if (!pte_present(*pte)) + ptent = ptep_get(pte); + if (!pte_present(ptent)) continue; - folio = vm_normal_folio(vma, addr, *pte); + folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; if (folio_test_large(folio)) diff --git a/mm/mm_init.c b/mm/mm_init.c index 7f7f9c677854..a1963c3322af 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -259,6 +259,8 @@ static int __init cmdline_parse_core(char *p, unsigned long *core, return 0; } +bool mirrored_kernelcore __initdata_memblock; + /* * kernelcore=size sets the amount of memory for use for allocations that * cannot be reclaimed or migrated. @@ -644,10 +646,8 @@ static inline void pgdat_set_deferred_range(pg_data_t *pgdat) } /* Returns true if the struct page for the pfn is initialised */ -static inline bool __meminit early_page_initialised(unsigned long pfn) +static inline bool __meminit early_page_initialised(unsigned long pfn, int nid) { - int nid = early_pfn_to_nid(pfn); - if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) return false; @@ -693,15 +693,14 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } -static void __meminit init_reserved_page(unsigned long pfn) +static void __meminit init_reserved_page(unsigned long pfn, int nid) { pg_data_t *pgdat; - int nid, zid; + int zid; - if (early_page_initialised(pfn)) + if (early_page_initialised(pfn, nid)) return; - nid = early_pfn_to_nid(pfn); pgdat = NODE_DATA(nid); for (zid = 0; zid < MAX_NR_ZONES; zid++) { @@ -715,7 +714,7 @@ static void __meminit init_reserved_page(unsigned long pfn) #else static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} -static inline bool early_page_initialised(unsigned long pfn) +static inline bool early_page_initialised(unsigned long pfn, int nid) { return true; } @@ -725,7 +724,7 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } -static inline void init_reserved_page(unsigned long pfn) +static inline void init_reserved_page(unsigned long pfn, int nid) { } #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ @@ -736,7 +735,8 @@ static inline void init_reserved_page(unsigned long pfn) * marks the pages PageReserved. The remaining valid pages are later * sent to the buddy page allocator. */ -void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) +void __meminit reserve_bootmem_region(phys_addr_t start, + phys_addr_t end, int nid) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_UP(end); @@ -745,7 +745,7 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) if (pfn_valid(start_pfn)) { struct page *page = pfn_to_page(start_pfn); - init_reserved_page(start_pfn); + init_reserved_page(start_pfn, nid); /* Avoid false-positive PageTail() */ INIT_LIST_HEAD(&page->lru); @@ -1166,24 +1166,15 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn, /* Return the number of page frames in holes in a zone on a node */ static unsigned long __init zone_absent_pages_in_node(int nid, unsigned long zone_type, - unsigned long node_start_pfn, - unsigned long node_end_pfn) + unsigned long zone_start_pfn, + unsigned long zone_end_pfn) { - unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; - unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; - unsigned long zone_start_pfn, zone_end_pfn; unsigned long nr_absent; - /* When hotadd a new node from cpu_up(), the node should be empty */ - if (!node_start_pfn && !node_end_pfn) + /* zone is empty, we don't have any absent pages */ + if (zone_start_pfn == zone_end_pfn) return 0; - zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); - zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); - - adjust_zone_range_for_zone_movable(nid, zone_type, - node_start_pfn, node_end_pfn, - &zone_start_pfn, &zone_end_pfn); nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); /* @@ -1227,9 +1218,6 @@ static unsigned long __init zone_spanned_pages_in_node(int nid, { unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; - /* When hotadd a new node from cpu_up(), the node should be empty */ - if (!node_start_pfn && !node_end_pfn) - return 0; /* Get the start and end of the zone */ *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); @@ -1250,6 +1238,24 @@ static unsigned long __init zone_spanned_pages_in_node(int nid, return *zone_end_pfn - *zone_start_pfn; } +static void __init reset_memoryless_node_totalpages(struct pglist_data *pgdat) +{ + struct zone *z; + + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) { + z->zone_start_pfn = 0; + z->spanned_pages = 0; + z->present_pages = 0; +#if defined(CONFIG_MEMORY_HOTPLUG) + z->present_early_pages = 0; +#endif + } + + pgdat->node_spanned_pages = 0; + pgdat->node_present_pages = 0; + pr_debug("On node %d totalpages: 0\n", pgdat->node_id); +} + static void __init calculate_node_totalpages(struct pglist_data *pgdat, unsigned long node_start_pfn, unsigned long node_end_pfn) @@ -1261,7 +1267,7 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, struct zone *zone = pgdat->node_zones + i; unsigned long zone_start_pfn, zone_end_pfn; unsigned long spanned, absent; - unsigned long size, real_size; + unsigned long real_size; spanned = zone_spanned_pages_in_node(pgdat->node_id, i, node_start_pfn, @@ -1269,23 +1275,22 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, &zone_start_pfn, &zone_end_pfn); absent = zone_absent_pages_in_node(pgdat->node_id, i, - node_start_pfn, - node_end_pfn); + zone_start_pfn, + zone_end_pfn); - size = spanned; - real_size = size - absent; + real_size = spanned - absent; - if (size) + if (spanned) zone->zone_start_pfn = zone_start_pfn; else zone->zone_start_pfn = 0; - zone->spanned_pages = size; + zone->spanned_pages = spanned; zone->present_pages = real_size; #if defined(CONFIG_MEMORY_HOTPLUG) zone->present_early_pages = real_size; #endif - totalpages += size; + totalpages += spanned; realtotalpages += real_size; } @@ -1375,6 +1380,10 @@ static void __meminit zone_init_free_lists(struct zone *zone) INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); zone->free_area[order].nr_free = 0; } + +#ifdef CONFIG_UNACCEPTED_MEMORY + INIT_LIST_HEAD(&zone->unaccepted_pages); +#endif } void __meminit init_currently_empty_zone(struct zone *zone, @@ -1502,6 +1511,8 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat) pgdat->kswapd_order = 0; pgdat->kswapd_highest_zoneidx = 0; pgdat->node_start_pfn = 0; + pgdat->node_present_pages = 0; + for_each_online_cpu(cpu) { struct per_cpu_nodestat *p; @@ -1509,8 +1520,17 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat) memset(p, 0, sizeof(*p)); } - for (z = 0; z < MAX_NR_ZONES; z++) - zone_init_internals(&pgdat->node_zones[z], z, nid, 0); + /* + * When memory is hot-added, all the memory is in offline state. So + * clear all zones' present_pages and managed_pages because they will + * be updated in online_pages() and offline_pages(). + */ + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = pgdat->node_zones + z; + + zone->present_pages = 0; + zone_init_internals(zone, z, nid, 0); + } } #endif @@ -1578,7 +1598,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat) if (!size) continue; - set_pageblock_order(); setup_usemap(zone); init_currently_empty_zone(zone, zone->zone_start_pfn, size); } @@ -1702,11 +1721,13 @@ static void __init free_area_init_node(int nid) pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, (u64)start_pfn << PAGE_SHIFT, end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); + + calculate_node_totalpages(pgdat, start_pfn, end_pfn); } else { pr_info("Initmem setup node %d as memoryless\n", nid); - } - calculate_node_totalpages(pgdat, start_pfn, end_pfn); + reset_memoryless_node_totalpages(pgdat); + } alloc_node_mem_map(pgdat); pgdat_set_deferred_range(pgdat); @@ -1716,7 +1737,7 @@ static void __init free_area_init_node(int nid) } /* Any regular or high memory on that node ? */ -static void check_for_memory(pg_data_t *pgdat, int nid) +static void check_for_memory(pg_data_t *pgdat) { enum zone_type zone_type; @@ -1724,9 +1745,9 @@ static void check_for_memory(pg_data_t *pgdat, int nid) struct zone *zone = &pgdat->node_zones[zone_type]; if (populated_zone(zone)) { if (IS_ENABLED(CONFIG_HIGHMEM)) - node_set_state(nid, N_HIGH_MEMORY); + node_set_state(pgdat->node_id, N_HIGH_MEMORY); if (zone_type <= ZONE_NORMAL) - node_set_state(nid, N_NORMAL_MEMORY); + node_set_state(pgdat->node_id, N_NORMAL_MEMORY); break; } } @@ -1745,11 +1766,6 @@ void __init setup_nr_node_ids(void) } #endif -static void __init free_area_init_memoryless_node(int nid) -{ - free_area_init_node(nid); -} - /* * Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For * such cases we allow max_zone_pfn sorted in the descending order @@ -1848,6 +1864,8 @@ void __init free_area_init(unsigned long *max_zone_pfn) /* Initialise every node */ mminit_verify_pageflags_layout(); setup_nr_node_ids(); + set_pageblock_order(); + for_each_node(nid) { pg_data_t *pgdat; @@ -1860,7 +1878,7 @@ void __init free_area_init(unsigned long *max_zone_pfn) panic("Cannot allocate %zuB for node %d.\n", sizeof(*pgdat), nid); arch_refresh_nodedata(nid, pgdat); - free_area_init_memoryless_node(nid); + free_area_init_node(nid); /* * We do not want to confuse userspace by sysfs @@ -1881,7 +1899,7 @@ void __init free_area_init(unsigned long *max_zone_pfn) /* Any memory on that node */ if (pgdat->node_present_pages) node_set_state(nid, N_MEMORY); - check_for_memory(pgdat, nid); + check_for_memory(pgdat); } memmap_init(); @@ -1960,6 +1978,9 @@ static void __init deferred_free_range(unsigned long pfn, return; } + /* Accept chunks smaller than MAX_ORDER upfront */ + accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages)); + for (i = 0; i < nr_pages; i++, page++, pfn++) { if (pageblock_aligned(pfn)) set_pageblock_migratetype(page, MIGRATE_MOVABLE); @@ -2328,6 +2349,28 @@ void __init init_cma_reserved_pageblock(struct page *page) } #endif +void set_zone_contiguous(struct zone *zone) +{ + unsigned long block_start_pfn = zone->zone_start_pfn; + unsigned long block_end_pfn; + + block_end_pfn = pageblock_end_pfn(block_start_pfn); + for (; block_start_pfn < zone_end_pfn(zone); + block_start_pfn = block_end_pfn, + block_end_pfn += pageblock_nr_pages) { + + block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); + + if (!__pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, zone)) + return; + cond_resched(); + } + + /* We confirm that there is no hole */ + zone->contiguous = true; +} + void __init page_alloc_init_late(void) { struct zone *zone; @@ -2368,6 +2411,8 @@ void __init page_alloc_init_late(void) /* Initialize page ext after all struct pages are initialized. */ if (deferred_struct_pages) page_ext_init(); + + page_alloc_sysctl_init(); } #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES @@ -2532,8 +2577,14 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) { - if (!early_page_initialised(pfn)) - return; + + if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) { + int nid = early_pfn_to_nid(pfn); + + if (!early_page_initialised(pfn, nid)) + return; + } + if (!kmsan_memblock_free_pages(page, order)) { /* KMSAN will take care of these pages. */ return; @@ -2541,6 +2592,12 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, __free_pages_core(page, order); } +DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); +EXPORT_SYMBOL(init_on_alloc); + +DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); +EXPORT_SYMBOL(init_on_free); + static bool _init_on_alloc_enabled_early __read_mostly = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON); static int __init early_init_on_alloc(char *buf) diff --git a/mm/mmap.c b/mm/mmap.c index bc510361acec..9b5188b65800 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -182,7 +182,8 @@ static int check_brk_limits(unsigned long addr, unsigned long len) if (IS_ERR_VALUE(mapped_addr)) return mapped_addr; - return mlock_future_check(current->mm, current->mm->def_flags, len); + return mlock_future_ok(current->mm, current->mm->def_flags, len) + ? 0 : -EAGAIN; } static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, unsigned long addr, unsigned long request, unsigned long flags); @@ -300,61 +301,40 @@ out: } #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) -extern void mt_validate(struct maple_tree *mt); -extern void mt_dump(const struct maple_tree *mt); - -/* Validate the maple tree */ -static void validate_mm_mt(struct mm_struct *mm) -{ - struct maple_tree *mt = &mm->mm_mt; - struct vm_area_struct *vma_mt; - - MA_STATE(mas, mt, 0, 0); - - mt_validate(&mm->mm_mt); - mas_for_each(&mas, vma_mt, ULONG_MAX) { - if ((vma_mt->vm_start != mas.index) || - (vma_mt->vm_end - 1 != mas.last)) { - pr_emerg("issue in %s\n", current->comm); - dump_stack(); - dump_vma(vma_mt); - pr_emerg("mt piv: %p %lu - %lu\n", vma_mt, - mas.index, mas.last); - pr_emerg("mt vma: %p %lu - %lu\n", vma_mt, - vma_mt->vm_start, vma_mt->vm_end); - - mt_dump(mas.tree); - if (vma_mt->vm_end != mas.last + 1) { - pr_err("vma: %p vma_mt %lu-%lu\tmt %lu-%lu\n", - mm, vma_mt->vm_start, vma_mt->vm_end, - mas.index, mas.last); - mt_dump(mas.tree); - } - VM_BUG_ON_MM(vma_mt->vm_end != mas.last + 1, mm); - if (vma_mt->vm_start != mas.index) { - pr_err("vma: %p vma_mt %p %lu - %lu doesn't match\n", - mm, vma_mt, vma_mt->vm_start, vma_mt->vm_end); - mt_dump(mas.tree); - } - VM_BUG_ON_MM(vma_mt->vm_start != mas.index, mm); - } - } -} - static void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; struct vm_area_struct *vma; - MA_STATE(mas, &mm->mm_mt, 0, 0); - - validate_mm_mt(mm); + VMA_ITERATOR(vmi, mm, 0); - mas_for_each(&mas, vma, ULONG_MAX) { + mt_validate(&mm->mm_mt); + for_each_vma(vmi, vma) { #ifdef CONFIG_DEBUG_VM_RB struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; +#endif + unsigned long vmi_start, vmi_end; + bool warn = 0; + + vmi_start = vma_iter_addr(&vmi); + vmi_end = vma_iter_end(&vmi); + if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm)) + warn = 1; + + if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm)) + warn = 1; + if (warn) { + pr_emerg("issue in %s\n", current->comm); + dump_stack(); + dump_vma(vma); + pr_emerg("tree range: %px start %lx end %lx\n", vma, + vmi_start, vmi_end - 1); + vma_iter_dump_tree(&vmi); + } + +#ifdef CONFIG_DEBUG_VM_RB if (anon_vma) { anon_vma_lock_read(anon_vma); list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) @@ -365,14 +345,13 @@ static void validate_mm(struct mm_struct *mm) i++; } if (i != mm->map_count) { - pr_emerg("map_count %d mas_for_each %d\n", mm->map_count, i); + pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); bug = 1; } VM_BUG_ON_MM(bug, mm); } #else /* !CONFIG_DEBUG_VM_MAPLE_TREE */ -#define validate_mm_mt(root) do { } while (0) #define validate_mm(mm) do { } while (0) #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ @@ -1167,21 +1146,21 @@ static inline unsigned long round_hint_to_min(unsigned long hint) return hint; } -int mlock_future_check(struct mm_struct *mm, unsigned long flags, - unsigned long len) +bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, + unsigned long bytes) { - unsigned long locked, lock_limit; + unsigned long locked_pages, limit_pages; - /* mlock MCL_FUTURE? */ - if (flags & VM_LOCKED) { - locked = len >> PAGE_SHIFT; - locked += mm->locked_vm; - lock_limit = rlimit(RLIMIT_MEMLOCK); - lock_limit >>= PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - return -EAGAIN; - } - return 0; + if (!(flags & VM_LOCKED) || capable(CAP_IPC_LOCK)) + return true; + + locked_pages = bytes >> PAGE_SHIFT; + locked_pages += mm->locked_vm; + + limit_pages = rlimit(RLIMIT_MEMLOCK); + limit_pages >>= PAGE_SHIFT; + + return locked_pages <= limit_pages; } static inline u64 file_mmap_size_max(struct file *file, struct inode *inode) @@ -1293,7 +1272,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (!can_do_mlock()) return -EPERM; - if (mlock_future_check(mm, vm_flags, len)) + if (!mlock_future_ok(mm, vm_flags, len)) return -EAGAIN; if (file) { @@ -1475,6 +1454,48 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) } #endif /* __ARCH_WANT_SYS_OLD_MMAP */ +static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) +{ + return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite); +} + +static bool vma_is_shared_writable(struct vm_area_struct *vma) +{ + return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == + (VM_WRITE | VM_SHARED); +} + +static bool vma_fs_can_writeback(struct vm_area_struct *vma) +{ + /* No managed pages to writeback. */ + if (vma->vm_flags & VM_PFNMAP) + return false; + + return vma->vm_file && vma->vm_file->f_mapping && + mapping_can_writeback(vma->vm_file->f_mapping); +} + +/* + * Does this VMA require the underlying folios to have their dirty state + * tracked? + */ +bool vma_needs_dirty_tracking(struct vm_area_struct *vma) +{ + /* Only shared, writable VMAs require dirty tracking. */ + if (!vma_is_shared_writable(vma)) + return false; + + /* Does the filesystem need to be notified? */ + if (vm_ops_needs_writenotify(vma->vm_ops)) + return true; + + /* + * Even if the filesystem doesn't indicate a need for writenotify, if it + * can writeback, dirty tracking is still required. + */ + return vma_fs_can_writeback(vma); +} + /* * Some shared mappings will want the pages marked read-only * to track write events. If so, we'll downgrade vm_page_prot @@ -1483,21 +1504,18 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) */ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) { - vm_flags_t vm_flags = vma->vm_flags; - const struct vm_operations_struct *vm_ops = vma->vm_ops; - /* If it was private or non-writable, the write bit is already clear */ - if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) + if (!vma_is_shared_writable(vma)) return 0; /* The backer wishes to know when pages are first written to? */ - if (vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite)) + if (vm_ops_needs_writenotify(vma->vm_ops)) return 1; /* The open routine did something to the protections that pgprot_modify * won't preserve? */ if (pgprot_val(vm_page_prot) != - pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags))) + pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) return 0; /* @@ -1511,13 +1529,8 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) if (userfaultfd_wp(vma)) return 1; - /* Specialty mapping? */ - if (vm_flags & VM_PFNMAP) - return 0; - /* Can the mapping track the dirty pages? */ - return vma->vm_file && vma->vm_file->f_mapping && - mapping_can_writeback(vma->vm_file->f_mapping); + return vma_fs_can_writeback(vma); } /* @@ -1911,7 +1924,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, return -ENOMEM; /* mlock limit tests */ - if (mlock_future_check(mm, vma->vm_flags, grow << PAGE_SHIFT)) + if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT)) return -ENOMEM; /* Check to ensure the stack will not grow into a hugetlb-only region */ @@ -2323,7 +2336,7 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct *new; int err; - validate_mm_mt(vma->vm_mm); + validate_mm(vma->vm_mm); WARN_ON(vma->vm_start >= addr); WARN_ON(vma->vm_end <= addr); @@ -2381,7 +2394,7 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, /* Success. */ if (new_below) vma_next(vmi); - validate_mm_mt(vma->vm_mm); + validate_mm(vma->vm_mm); return 0; out_free_mpol: @@ -2390,7 +2403,7 @@ out_free_vmi: vma_iter_free(vmi); out_free_vma: vm_area_free(new); - validate_mm_mt(vma->vm_mm); + validate_mm(vma->vm_mm); return err; } @@ -2483,28 +2496,32 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, locked_vm += vma_pages(next); count++; + if (unlikely(uf)) { + /* + * If userfaultfd_unmap_prep returns an error the vmas + * will remain split, but userland will get a + * highly unexpected error anyway. This is no + * different than the case where the first of the two + * __split_vma fails, but we don't undo the first + * split, despite we could. This is unlikely enough + * failure that it's not worth optimizing it for. + */ + error = userfaultfd_unmap_prep(next, start, end, uf); + + if (error) + goto userfaultfd_error; + } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE BUG_ON(next->vm_start < start); BUG_ON(next->vm_start > end); #endif } - next = vma_next(vmi); - if (unlikely(uf)) { - /* - * If userfaultfd_unmap_prep returns an error the vmas - * will remain split, but userland will get a - * highly unexpected error anyway. This is no - * different than the case where the first of the two - * __split_vma fails, but we don't undo the first - * split, despite we could. This is unlikely enough - * failure that it's not worth optimizing it for. - */ - error = userfaultfd_unmap_prep(mm, start, end, uf); + if (vma_iter_end(vmi) > end) + next = vma_iter_load(vmi); - if (error) - goto userfaultfd_error; - } + if (!next) + next = vma_next(vmi); #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) /* Make sure no VMAs are about to be lost. */ @@ -2709,6 +2726,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } cannot_expand: + if (prev) + vma_iter_next_range(&vmi); + /* * Determine the object being mapped and call the appropriate * specific mapper. the address has already been validated, but @@ -3022,7 +3042,7 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, arch_unmap(mm, start, end); ret = do_vmi_align_munmap(vmi, vma, mm, start, end, uf, downgrade); - validate_mm_mt(mm); + validate_mm(mm); return ret; } @@ -3044,7 +3064,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm = current->mm; struct vma_prepare vp; - validate_mm_mt(mm); + validate_mm(mm); /* * Check against address space limits by the changed size * Note: This happens *after* clearing old mappings in some code paths. @@ -3285,7 +3305,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, bool faulted_in_anon_vma = true; VMA_ITERATOR(vmi, mm, addr); - validate_mm_mt(mm); + validate_mm(mm); /* * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging. @@ -3344,7 +3364,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, goto out_vma_link; *need_rmap_locks = false; } - validate_mm_mt(mm); + validate_mm(mm); return new_vma; out_vma_link: @@ -3360,7 +3380,7 @@ out_free_mempol: out_free_vma: vm_area_free(new_vma); out: - validate_mm_mt(mm); + validate_mm(mm); return NULL; } @@ -3497,7 +3517,7 @@ static struct vm_area_struct *__install_special_mapping( int ret; struct vm_area_struct *vma; - validate_mm_mt(mm); + validate_mm(mm); vma = vm_area_alloc(mm); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); @@ -3520,12 +3540,12 @@ static struct vm_area_struct *__install_special_mapping( perf_event_mmap(vma); - validate_mm_mt(mm); + validate_mm(mm); return vma; out: vm_area_free(vma); - validate_mm_mt(mm); + validate_mm(mm); return ERR_PTR(ret); } diff --git a/mm/mprotect.c b/mm/mprotect.c index 92d3d3ca390a..6f658d483704 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -93,22 +93,9 @@ static long change_pte_range(struct mmu_gather *tlb, bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; tlb_change_page_size(tlb, PAGE_SIZE); - - /* - * Can be called with only the mmap_lock for reading by - * prot_numa so we must check the pmd isn't constantly - * changing from under us from pmd_none to pmd_trans_huge - * and/or the other way around. - */ - if (pmd_trans_unstable(pmd)) - return 0; - - /* - * The pmd points to a regular pte so the pmd can't change - * from under us even if the mmap_lock is only hold for - * reading. - */ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (!pte) + return -EAGAIN; /* Get target node for single threaded private VMAs */ if (prot_numa && !(vma->vm_flags & VM_SHARED) && @@ -118,7 +105,7 @@ static long change_pte_range(struct mmu_gather *tlb, flush_tlb_batched_pending(vma->vm_mm); arch_enter_lazy_mmu_mode(); do { - oldpte = *pte; + oldpte = ptep_get(pte); if (pte_present(oldpte)) { pte_t ptent; @@ -302,31 +289,6 @@ static long change_pte_range(struct mmu_gather *tlb, } /* - * Used when setting automatic NUMA hinting protection where it is - * critical that a numa hinting PMD is not confused with a bad PMD. - */ -static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd) -{ - pmd_t pmdval = pmdp_get_lockless(pmd); - - /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - barrier(); -#endif - - if (pmd_none(pmdval)) - return 1; - if (pmd_trans_huge(pmdval)) - return 0; - if (unlikely(pmd_bad(pmdval))) { - pmd_clear_bad(pmd); - return 1; - } - - return 0; -} - -/* * Return true if we want to split THPs into PTE mappings in change * protection procedure, false otherwise. */ @@ -403,7 +365,8 @@ static inline long change_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { long ret; - + pmd_t _pmd; +again: next = pmd_addr_end(addr, end); ret = change_pmd_prepare(vma, pmd, cp_flags); @@ -411,16 +374,8 @@ static inline long change_pmd_range(struct mmu_gather *tlb, pages = ret; break; } - /* - * Automatic NUMA balancing walks the tables with mmap_lock - * held for read. It's possible a parallel update to occur - * between pmd_trans_huge() and a pmd_none_or_clear_bad() - * check leading to a false positive and clearing. - * Hence, it's necessary to atomically read the PMD value - * for all the checks. - */ - if (!is_swap_pmd(*pmd) && !pmd_devmap(*pmd) && - pmd_none_or_clear_bad_unless_trans_huge(pmd)) + + if (pmd_none(*pmd)) goto next; /* invoke the mmu notifier if the pmd is populated */ @@ -431,7 +386,8 @@ static inline long change_pmd_range(struct mmu_gather *tlb, mmu_notifier_invalidate_range_start(&range); } - if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + _pmd = pmdp_get_lockless(pmd); + if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) { if ((next - addr != HPAGE_PMD_SIZE) || pgtable_split_needed(vma, cp_flags)) { __split_huge_pmd(vma, pmd, addr, false, NULL); @@ -446,15 +402,10 @@ static inline long change_pmd_range(struct mmu_gather *tlb, break; } } else { - /* - * change_huge_pmd() does not defer TLB flushes, - * so no need to propagate the tlb argument. - */ - int nr_ptes = change_huge_pmd(tlb, vma, pmd, + ret = change_huge_pmd(tlb, vma, pmd, addr, newprot, cp_flags); - - if (nr_ptes) { - if (nr_ptes == HPAGE_PMD_NR) { + if (ret) { + if (ret == HPAGE_PMD_NR) { pages += HPAGE_PMD_NR; nr_huge_updates++; } @@ -465,8 +416,12 @@ static inline long change_pmd_range(struct mmu_gather *tlb, } /* fall through, the trans huge pmd just split */ } - pages += change_pte_range(tlb, vma, pmd, addr, next, - newprot, cp_flags); + + ret = change_pte_range(tlb, vma, pmd, addr, next, newprot, + cp_flags); + if (ret < 0) + goto again; + pages += ret; next: cond_resched(); } while (pmd++, addr = next, addr != end); @@ -589,7 +544,8 @@ long change_protection(struct mmu_gather *tlb, static int prot_none_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { - return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? + return pfn_modify_allowed(pte_pfn(ptep_get(pte)), + *(pgprot_t *)(walk->private)) ? 0 : -EACCES; } @@ -597,7 +553,8 @@ static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long next, struct mm_walk *walk) { - return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? + return pfn_modify_allowed(pte_pfn(ptep_get(pte)), + *(pgprot_t *)(walk->private)) ? 0 : -EACCES; } @@ -867,7 +824,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, } tlb_finish_mmu(&tlb); - if (!error && vma_iter_end(&vmi) < end) + if (!error && tmp < end) error = -ENOMEM; out: diff --git a/mm/mremap.c b/mm/mremap.c index b11ce6c92099..fe6b722ae633 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -133,7 +133,7 @@ static pte_t move_soft_dirty_pte(pte_t pte) return pte; } -static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, +static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long old_addr, unsigned long old_end, struct vm_area_struct *new_vma, pmd_t *new_pmd, unsigned long new_addr, bool need_rmap_locks) @@ -143,6 +143,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, spinlock_t *old_ptl, *new_ptl; bool force_flush = false; unsigned long len = old_end - old_addr; + int err = 0; /* * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma @@ -170,8 +171,16 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, * pte locks because exclusive mmap_lock prevents deadlock. */ old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); - new_pte = pte_offset_map(new_pmd, new_addr); - new_ptl = pte_lockptr(mm, new_pmd); + if (!old_pte) { + err = -EAGAIN; + goto out; + } + new_pte = pte_offset_map_nolock(mm, new_pmd, new_addr, &new_ptl); + if (!new_pte) { + pte_unmap_unlock(old_pte, old_ptl); + err = -EAGAIN; + goto out; + } if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); flush_tlb_batched_pending(vma->vm_mm); @@ -179,7 +188,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, new_pte++, new_addr += PAGE_SIZE) { - if (pte_none(*old_pte)) + if (pte_none(ptep_get(old_pte))) continue; pte = ptep_get_and_clear(mm, old_addr, old_pte); @@ -208,8 +217,10 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, spin_unlock(new_ptl); pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); +out: if (need_rmap_locks) drop_rmap_locks(vma); + return err; } #ifndef arch_supports_page_table_move @@ -537,6 +548,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); if (!new_pmd) break; +again: if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || pmd_devmap(*old_pmd)) { if (extent == HPAGE_PMD_SIZE && @@ -544,8 +556,6 @@ unsigned long move_page_tables(struct vm_area_struct *vma, old_pmd, new_pmd, need_rmap_locks)) continue; split_huge_pmd(vma, old_pmd, old_addr); - if (pmd_trans_unstable(old_pmd)) - continue; } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) && extent == PMD_SIZE) { /* @@ -556,11 +566,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma, old_pmd, new_pmd, true)) continue; } - + if (pmd_none(*old_pmd)) + continue; if (pte_alloc(new_vma->vm_mm, new_pmd)) break; - move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma, - new_pmd, new_addr, need_rmap_locks); + if (move_ptes(vma, old_pmd, old_addr, old_addr + extent, + new_vma, new_pmd, new_addr, need_rmap_locks) < 0) + goto again; } mmu_notifier_invalidate_range_end(&range); @@ -775,7 +787,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) return ERR_PTR(-EFAULT); - if (mlock_future_check(mm, vma->vm_flags, new_len - old_len)) + if (!mlock_future_ok(mm, vma->vm_flags, new_len - old_len)) return ERR_PTR(-EAGAIN); if (!may_expand_vm(mm, vma->vm_flags, @@ -914,7 +926,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, * mapping address intact. A non-zero tag will cause the subsequent * range checks to reject the address as invalid. * - * See Documentation/arm64/tagged-address-abi.rst for more information. + * See Documentation/arch/arm64/tagged-address-abi.rst for more + * information. */ addr = untagged_addr(addr); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 044e1eed720e..612b5597d3af 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1130,12 +1130,10 @@ bool out_of_memory(struct oom_control *oc) /* * The OOM killer does not compensate for IO-less reclaim. - * pagefault_out_of_memory lost its gfp context so we have to - * make sure exclude 0 mask - all other users should have at least - * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to - * invoke the OOM killer even if it is a GFP_NOFS allocation. + * But mem_cgroup_oom() has to invoke the OOM killer even + * if it is a GFP_NOFS allocation. */ - if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc)) + if (!(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc)) return true; /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index db7943999007..1d17fb1ec863 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2597,7 +2597,7 @@ EXPORT_SYMBOL(noop_dirty_folio); /* * Helper function for set_page_dirty family. * - * Caller must hold lock_page_memcg(). + * Caller must hold folio_memcg_lock(). * * NOTE: This relies on being atomic wrt interrupts. */ @@ -2631,7 +2631,7 @@ static void folio_account_dirtied(struct folio *folio, /* * Helper function for deaccounting dirty page without writeback. * - * Caller must hold lock_page_memcg(). + * Caller must hold folio_memcg_lock(). */ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) { @@ -2650,7 +2650,7 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) * If warn is true, then emit a warning if the folio is not uptodate and has * not been truncated. * - * The caller must hold lock_page_memcg(). Most callers have the folio + * The caller must hold folio_memcg_lock(). Most callers have the folio * locked. A few have the folio blocked from truncation through other * means (eg zap_vma_pages() has it mapped and is holding the page table * lock). This can also be called from mark_buffer_dirty(), which I diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 47421bedc12b..7d3460c7a480 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -18,21 +18,14 @@ #include <linux/stddef.h> #include <linux/mm.h> #include <linux/highmem.h> -#include <linux/swap.h> -#include <linux/swapops.h> #include <linux/interrupt.h> -#include <linux/pagemap.h> #include <linux/jiffies.h> -#include <linux/memblock.h> #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/kasan.h> #include <linux/kmsan.h> #include <linux/module.h> #include <linux/suspend.h> -#include <linux/pagevec.h> -#include <linux/blkdev.h> -#include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/oom.h> #include <linux/topology.h> @@ -41,19 +34,8 @@ #include <linux/cpuset.h> #include <linux/memory_hotplug.h> #include <linux/nodemask.h> -#include <linux/vmalloc.h> #include <linux/vmstat.h> -#include <linux/mempolicy.h> -#include <linux/memremap.h> -#include <linux/stop_machine.h> -#include <linux/random.h> -#include <linux/sort.h> -#include <linux/pfn.h> -#include <linux/backing-dev.h> #include <linux/fault-inject.h> -#include <linux/page-isolation.h> -#include <linux/debugobjects.h> -#include <linux/kmemleak.h> #include <linux/compaction.h> #include <trace/events/kmem.h> #include <trace/events/oom.h> @@ -61,26 +43,19 @@ #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/migrate.h> -#include <linux/hugetlb.h> -#include <linux/sched/rt.h> #include <linux/sched/mm.h> #include <linux/page_owner.h> #include <linux/page_table_check.h> -#include <linux/kthread.h> #include <linux/memcontrol.h> #include <linux/ftrace.h> #include <linux/lockdep.h> -#include <linux/nmi.h> #include <linux/psi.h> #include <linux/khugepaged.h> #include <linux/delayacct.h> -#include <asm/sections.h> -#include <asm/tlbflush.h> #include <asm/div64.h> #include "internal.h" #include "shuffle.h" #include "page_reporting.h" -#include "swap.h" /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */ typedef int __bitwise fpi_t; @@ -227,18 +202,7 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { }; EXPORT_SYMBOL(node_states); -atomic_long_t _totalram_pages __read_mostly; -EXPORT_SYMBOL(_totalram_pages); -unsigned long totalreserve_pages __read_mostly; -unsigned long totalcma_pages __read_mostly; - -int percpu_pagelist_high_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; -DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); -EXPORT_SYMBOL(init_on_alloc); - -DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); -EXPORT_SYMBOL(init_on_free); /* * A cached value of the page's pageblock's migratetype, used when the page is @@ -258,44 +222,6 @@ static inline void set_pcppage_migratetype(struct page *page, int migratetype) page->index = migratetype; } -#ifdef CONFIG_PM_SLEEP -/* - * The following functions are used by the suspend/hibernate code to temporarily - * change gfp_allowed_mask in order to avoid using I/O during memory allocations - * while devices are suspended. To avoid races with the suspend/hibernate code, - * they should always be called with system_transition_mutex held - * (gfp_allowed_mask also should only be modified with system_transition_mutex - * held, unless the suspend/hibernate code is guaranteed not to run in parallel - * with that modification). - */ - -static gfp_t saved_gfp_mask; - -void pm_restore_gfp_mask(void) -{ - WARN_ON(!mutex_is_locked(&system_transition_mutex)); - if (saved_gfp_mask) { - gfp_allowed_mask = saved_gfp_mask; - saved_gfp_mask = 0; - } -} - -void pm_restrict_gfp_mask(void) -{ - WARN_ON(!mutex_is_locked(&system_transition_mutex)); - WARN_ON(saved_gfp_mask); - saved_gfp_mask = gfp_allowed_mask; - gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); -} - -bool pm_suspended_storage(void) -{ - if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) - return false; - return true; -} -#endif /* CONFIG_PM_SLEEP */ - #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE unsigned int pageblock_order __read_mostly; #endif @@ -314,7 +240,7 @@ static void __free_pages_ok(struct page *page, unsigned int order, * TBD: should special case ZONE_DMA32 machines here - in those we normally * don't need any ZONE_NORMAL reservation */ -int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = { +static int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = { #ifdef CONFIG_ZONE_DMA [ZONE_DMA] = 256, #endif @@ -358,7 +284,7 @@ const char * const migratetype_names[MIGRATE_TYPES] = { #endif }; -compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { +static compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { [NULL_COMPOUND_DTOR] = NULL, [COMPOUND_PAGE_DTOR] = free_compound_page, #ifdef CONFIG_HUGETLB_PAGE @@ -371,10 +297,8 @@ compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; -int watermark_boost_factor __read_mostly = 15000; -int watermark_scale_factor = 10; - -bool mirrored_kernelcore __initdata_memblock; +static int watermark_boost_factor __read_mostly = 15000; +static int watermark_scale_factor = 10; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -387,6 +311,12 @@ EXPORT_SYMBOL(nr_node_ids); EXPORT_SYMBOL(nr_online_nodes); #endif +static bool page_contains_unaccepted(struct page *page, unsigned int order); +static void accept_page(struct page *page, unsigned int order); +static bool try_to_accept_memory(struct zone *zone, unsigned int order); +static inline bool has_unaccepted_memory(void); +static bool __free_unaccepted(struct page *page); + int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT @@ -550,13 +480,6 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) return ret; } -static int page_is_consistent(struct zone *zone, struct page *page) -{ - if (zone != page_zone(page)) - return 0; - - return 1; -} /* * Temporary debugging check for pages not lying within a given zone. */ @@ -564,7 +487,7 @@ static int __maybe_unused bad_range(struct zone *zone, struct page *page) { if (page_outside_zone_boundaries(zone, page)) return 1; - if (!page_is_consistent(zone, page)) + if (zone != page_zone(page)) return 1; return 0; @@ -704,75 +627,6 @@ void destroy_large_folio(struct folio *folio) compound_page_dtors[dtor](&folio->page); } -#ifdef CONFIG_DEBUG_PAGEALLOC -unsigned int _debug_guardpage_minorder; - -bool _debug_pagealloc_enabled_early __read_mostly - = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); -EXPORT_SYMBOL(_debug_pagealloc_enabled_early); -DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); -EXPORT_SYMBOL(_debug_pagealloc_enabled); - -DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled); - -static int __init early_debug_pagealloc(char *buf) -{ - return kstrtobool(buf, &_debug_pagealloc_enabled_early); -} -early_param("debug_pagealloc", early_debug_pagealloc); - -static int __init debug_guardpage_minorder_setup(char *buf) -{ - unsigned long res; - - if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { - pr_err("Bad debug_guardpage_minorder value\n"); - return 0; - } - _debug_guardpage_minorder = res; - pr_info("Setting debug_guardpage_minorder to %lu\n", res); - return 0; -} -early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); - -static inline bool set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) -{ - if (!debug_guardpage_enabled()) - return false; - - if (order >= debug_guardpage_minorder()) - return false; - - __SetPageGuard(page); - INIT_LIST_HEAD(&page->buddy_list); - set_page_private(page, order); - /* Guard pages are not available for any usage */ - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, -(1 << order), migratetype); - - return true; -} - -static inline void clear_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) -{ - if (!debug_guardpage_enabled()) - return; - - __ClearPageGuard(page); - - set_page_private(page, 0); - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, (1 << order), migratetype); -} -#else -static inline bool set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) { return false; } -static inline void clear_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) {} -#endif - static inline void set_buddy_order(struct page *page, unsigned int order) { set_page_private(page, order); @@ -879,7 +733,7 @@ static inline struct page *get_page_from_free_area(struct free_area *area, int migratetype) { return list_first_entry_or_null(&area->free_list[migratetype], - struct page, lru); + struct page, buddy_list); } /* @@ -1131,6 +985,11 @@ static inline bool free_page_is_bad(struct page *page) return true; } +static inline bool is_check_pages_enabled(void) +{ + return static_branch_unlikely(&check_pages_enabled); +} + static int free_tail_page_prepare(struct page *head_page, struct page *page) { struct folio *folio = (struct folio *)head_page; @@ -1142,7 +1001,7 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page) */ BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); - if (!static_branch_unlikely(&check_pages_enabled)) { + if (!is_check_pages_enabled()) { ret = 0; goto out; } @@ -1481,6 +1340,13 @@ void __free_pages_core(struct page *page, unsigned int order) atomic_long_add(nr_pages, &page_zone(page)->managed_pages); + if (page_contains_unaccepted(page, order)) { + if (order == MAX_ORDER && __free_unaccepted(page)) + return; + + accept_page(page, order); + } + /* * Bypass PCP and place fresh pages right to the tail, primarily * relevant for memory onlining. @@ -1521,7 +1387,7 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn, /* end_pfn is one past the range we are checking */ end_pfn--; - if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) + if (!pfn_valid(end_pfn)) return NULL; start_page = pfn_to_online_page(start_pfn); @@ -1540,33 +1406,6 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn, return start_page; } -void set_zone_contiguous(struct zone *zone) -{ - unsigned long block_start_pfn = zone->zone_start_pfn; - unsigned long block_end_pfn; - - block_end_pfn = pageblock_end_pfn(block_start_pfn); - for (; block_start_pfn < zone_end_pfn(zone); - block_start_pfn = block_end_pfn, - block_end_pfn += pageblock_nr_pages) { - - block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); - - if (!__pageblock_pfn_to_page(block_start_pfn, - block_end_pfn, zone)) - return; - cond_resched(); - } - - /* We confirm that there is no hole */ - zone->contiguous = true; -} - -void clear_zone_contiguous(struct zone *zone) -{ - zone->contiguous = false; -} - /* * The order of subdivision here is critical for the IO subsystem. * Please do not alter this order without good reasons and regression @@ -2501,61 +2340,6 @@ void drain_all_pages(struct zone *zone) __drain_all_pages(zone, false); } -#ifdef CONFIG_HIBERNATION - -/* - * Touch the watchdog for every WD_PAGE_COUNT pages. - */ -#define WD_PAGE_COUNT (128*1024) - -void mark_free_pages(struct zone *zone) -{ - unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; - unsigned long flags; - unsigned int order, t; - struct page *page; - - if (zone_is_empty(zone)) - return; - - spin_lock_irqsave(&zone->lock, flags); - - max_zone_pfn = zone_end_pfn(zone); - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - - if (!--page_count) { - touch_nmi_watchdog(); - page_count = WD_PAGE_COUNT; - } - - if (page_zone(page) != zone) - continue; - - if (!swsusp_page_is_forbidden(page)) - swsusp_unset_page_free(page); - } - - for_each_migratetype_order(order, t) { - list_for_each_entry(page, - &zone->free_area[order].free_list[t], buddy_list) { - unsigned long i; - - pfn = page_to_pfn(page); - for (i = 0; i < (1UL << order); i++) { - if (!--page_count) { - touch_nmi_watchdog(); - page_count = WD_PAGE_COUNT; - } - swsusp_set_page_free(pfn_to_page(pfn + i)); - } - } - } - spin_unlock_irqrestore(&zone->lock, flags); -} -#endif /* CONFIG_PM */ - static bool free_unref_page_prepare(struct page *page, unsigned long pfn, unsigned int order) { @@ -3052,7 +2836,8 @@ struct page *rmqueue(struct zone *preferred_zone, out: /* Separate test+clear to avoid unnecessary atomics */ - if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) { + if ((alloc_flags & ALLOC_KSWAPD) && + unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) { clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); wakeup_kswapd(zone, 0, 0, zone_idx(zone)); } @@ -3061,80 +2846,6 @@ out: return page; } -#ifdef CONFIG_FAIL_PAGE_ALLOC - -static struct { - struct fault_attr attr; - - bool ignore_gfp_highmem; - bool ignore_gfp_reclaim; - u32 min_order; -} fail_page_alloc = { - .attr = FAULT_ATTR_INITIALIZER, - .ignore_gfp_reclaim = true, - .ignore_gfp_highmem = true, - .min_order = 1, -}; - -static int __init setup_fail_page_alloc(char *str) -{ - return setup_fault_attr(&fail_page_alloc.attr, str); -} -__setup("fail_page_alloc=", setup_fail_page_alloc); - -static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) -{ - int flags = 0; - - if (order < fail_page_alloc.min_order) - return false; - if (gfp_mask & __GFP_NOFAIL) - return false; - if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) - return false; - if (fail_page_alloc.ignore_gfp_reclaim && - (gfp_mask & __GFP_DIRECT_RECLAIM)) - return false; - - /* See comment in __should_failslab() */ - if (gfp_mask & __GFP_NOWARN) - flags |= FAULT_NOWARN; - - return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags); -} - -#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS - -static int __init fail_page_alloc_debugfs(void) -{ - umode_t mode = S_IFREG | 0600; - struct dentry *dir; - - dir = fault_create_debugfs_attr("fail_page_alloc", NULL, - &fail_page_alloc.attr); - - debugfs_create_bool("ignore-gfp-wait", mode, dir, - &fail_page_alloc.ignore_gfp_reclaim); - debugfs_create_bool("ignore-gfp-highmem", mode, dir, - &fail_page_alloc.ignore_gfp_highmem); - debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); - - return 0; -} - -late_initcall(fail_page_alloc_debugfs); - -#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ - -#else /* CONFIG_FAIL_PAGE_ALLOC */ - -static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) -{ - return false; -} - -#endif /* CONFIG_FAIL_PAGE_ALLOC */ - noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { return __should_fail_alloc_page(gfp_mask, order); @@ -3159,6 +2870,9 @@ static inline long __zone_watermark_unusable_free(struct zone *z, if (!(alloc_flags & ALLOC_CMA)) unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES); #endif +#ifdef CONFIG_UNACCEPTED_MEMORY + unusable_free += zone_page_state(z, NR_UNACCEPTED); +#endif return unusable_free; } @@ -3458,6 +3172,11 @@ retry: gfp_mask)) { int ret; + if (has_unaccepted_memory()) { + if (try_to_accept_memory(zone, order)) + goto try_this_zone; + } + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * Watermark failed for this zone, but see if we can @@ -3510,6 +3229,11 @@ try_this_zone: return page; } else { + if (has_unaccepted_memory()) { + if (try_to_accept_memory(zone, order)) + goto try_this_zone; + } + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* Try again if zone has deferred pages */ if (deferred_pages_enabled()) { @@ -3768,56 +3492,41 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, if (fatal_signal_pending(current)) return false; - if (compaction_made_progress(compact_result)) - (*compaction_retries)++; - - /* - * compaction considers all the zone as desperately out of memory - * so it doesn't really make much sense to retry except when the - * failure could be caused by insufficient priority - */ - if (compaction_failed(compact_result)) - goto check_priority; - /* - * compaction was skipped because there are not enough order-0 pages - * to work with, so we retry only if it looks like reclaim can help. + * Compaction was skipped due to a lack of free order-0 + * migration targets. Continue if reclaim can help. */ - if (compaction_needs_reclaim(compact_result)) { + if (compact_result == COMPACT_SKIPPED) { ret = compaction_zonelist_suitable(ac, order, alloc_flags); goto out; } /* - * make sure the compaction wasn't deferred or didn't bail out early - * due to locks contention before we declare that we should give up. - * But the next retry should use a higher priority if allowed, so - * we don't just keep bailing out endlessly. + * Compaction managed to coalesce some page blocks, but the + * allocation failed presumably due to a race. Retry some. */ - if (compaction_withdrawn(compact_result)) { - goto check_priority; - } + if (compact_result == COMPACT_SUCCESS) { + /* + * !costly requests are much more important than + * __GFP_RETRY_MAYFAIL costly ones because they are de + * facto nofail and invoke OOM killer to move on while + * costly can fail and users are ready to cope with + * that. 1/4 retries is rather arbitrary but we would + * need much more detailed feedback from compaction to + * make a better decision. + */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + max_retries /= 4; - /* - * !costly requests are much more important than __GFP_RETRY_MAYFAIL - * costly ones because they are de facto nofail and invoke OOM - * killer to move on while costly can fail and users are ready - * to cope with that. 1/4 retries is rather arbitrary but we - * would need much more detailed feedback from compaction to - * make a better decision. - */ - if (order > PAGE_ALLOC_COSTLY_ORDER) - max_retries /= 4; - if (*compaction_retries <= max_retries) { - ret = true; - goto out; + if (++(*compaction_retries) <= max_retries) { + ret = true; + goto out; + } } /* - * Make sure there are attempts at the highest priority if we exhausted - * all retries or failed at the lower priorities. + * Compaction failed. Retry with increasing priority. */ -check_priority: min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; @@ -5137,383 +4846,6 @@ unsigned long nr_free_buffer_pages(void) } EXPORT_SYMBOL_GPL(nr_free_buffer_pages); -static inline void show_node(struct zone *zone) -{ - if (IS_ENABLED(CONFIG_NUMA)) - printk("Node %d ", zone_to_nid(zone)); -} - -long si_mem_available(void) -{ - long available; - unsigned long pagecache; - unsigned long wmark_low = 0; - unsigned long pages[NR_LRU_LISTS]; - unsigned long reclaimable; - struct zone *zone; - int lru; - - for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) - pages[lru] = global_node_page_state(NR_LRU_BASE + lru); - - for_each_zone(zone) - wmark_low += low_wmark_pages(zone); - - /* - * Estimate the amount of memory available for userspace allocations, - * without causing swapping or OOM. - */ - available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; - - /* - * Not all the page cache can be freed, otherwise the system will - * start swapping or thrashing. Assume at least half of the page - * cache, or the low watermark worth of cache, needs to stay. - */ - pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; - pagecache -= min(pagecache / 2, wmark_low); - available += pagecache; - - /* - * Part of the reclaimable slab and other kernel memory consists of - * items that are in use, and cannot be freed. Cap this estimate at the - * low watermark. - */ - reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + - global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); - available += reclaimable - min(reclaimable / 2, wmark_low); - - if (available < 0) - available = 0; - return available; -} -EXPORT_SYMBOL_GPL(si_mem_available); - -void si_meminfo(struct sysinfo *val) -{ - val->totalram = totalram_pages(); - val->sharedram = global_node_page_state(NR_SHMEM); - val->freeram = global_zone_page_state(NR_FREE_PAGES); - val->bufferram = nr_blockdev_pages(); - val->totalhigh = totalhigh_pages(); - val->freehigh = nr_free_highpages(); - val->mem_unit = PAGE_SIZE; -} - -EXPORT_SYMBOL(si_meminfo); - -#ifdef CONFIG_NUMA -void si_meminfo_node(struct sysinfo *val, int nid) -{ - int zone_type; /* needs to be signed */ - unsigned long managed_pages = 0; - unsigned long managed_highpages = 0; - unsigned long free_highpages = 0; - pg_data_t *pgdat = NODE_DATA(nid); - - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) - managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); - val->totalram = managed_pages; - val->sharedram = node_page_state(pgdat, NR_SHMEM); - val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); -#ifdef CONFIG_HIGHMEM - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { - struct zone *zone = &pgdat->node_zones[zone_type]; - - if (is_highmem(zone)) { - managed_highpages += zone_managed_pages(zone); - free_highpages += zone_page_state(zone, NR_FREE_PAGES); - } - } - val->totalhigh = managed_highpages; - val->freehigh = free_highpages; -#else - val->totalhigh = managed_highpages; - val->freehigh = free_highpages; -#endif - val->mem_unit = PAGE_SIZE; -} -#endif - -/* - * Determine whether the node should be displayed or not, depending on whether - * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). - */ -static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask) -{ - if (!(flags & SHOW_MEM_FILTER_NODES)) - return false; - - /* - * no node mask - aka implicit memory numa policy. Do not bother with - * the synchronization - read_mems_allowed_begin - because we do not - * have to be precise here. - */ - if (!nodemask) - nodemask = &cpuset_current_mems_allowed; - - return !node_isset(nid, *nodemask); -} - -static void show_migration_types(unsigned char type) -{ - static const char types[MIGRATE_TYPES] = { - [MIGRATE_UNMOVABLE] = 'U', - [MIGRATE_MOVABLE] = 'M', - [MIGRATE_RECLAIMABLE] = 'E', - [MIGRATE_HIGHATOMIC] = 'H', -#ifdef CONFIG_CMA - [MIGRATE_CMA] = 'C', -#endif -#ifdef CONFIG_MEMORY_ISOLATION - [MIGRATE_ISOLATE] = 'I', -#endif - }; - char tmp[MIGRATE_TYPES + 1]; - char *p = tmp; - int i; - - for (i = 0; i < MIGRATE_TYPES; i++) { - if (type & (1 << i)) - *p++ = types[i]; - } - - *p = '\0'; - printk(KERN_CONT "(%s) ", tmp); -} - -static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx) -{ - int zone_idx; - for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++) - if (zone_managed_pages(pgdat->node_zones + zone_idx)) - return true; - return false; -} - -/* - * Show free area list (used inside shift_scroll-lock stuff) - * We also calculate the percentage fragmentation. We do this by counting the - * memory on each free list with the exception of the first item on the list. - * - * Bits in @filter: - * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's - * cpuset. - */ -void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) -{ - unsigned long free_pcp = 0; - int cpu, nid; - struct zone *zone; - pg_data_t *pgdat; - - for_each_populated_zone(zone) { - if (zone_idx(zone) > max_zone_idx) - continue; - if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) - continue; - - for_each_online_cpu(cpu) - free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; - } - - printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" - " active_file:%lu inactive_file:%lu isolated_file:%lu\n" - " unevictable:%lu dirty:%lu writeback:%lu\n" - " slab_reclaimable:%lu slab_unreclaimable:%lu\n" - " mapped:%lu shmem:%lu pagetables:%lu\n" - " sec_pagetables:%lu bounce:%lu\n" - " kernel_misc_reclaimable:%lu\n" - " free:%lu free_pcp:%lu free_cma:%lu\n", - global_node_page_state(NR_ACTIVE_ANON), - global_node_page_state(NR_INACTIVE_ANON), - global_node_page_state(NR_ISOLATED_ANON), - global_node_page_state(NR_ACTIVE_FILE), - global_node_page_state(NR_INACTIVE_FILE), - global_node_page_state(NR_ISOLATED_FILE), - global_node_page_state(NR_UNEVICTABLE), - global_node_page_state(NR_FILE_DIRTY), - global_node_page_state(NR_WRITEBACK), - global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B), - global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B), - global_node_page_state(NR_FILE_MAPPED), - global_node_page_state(NR_SHMEM), - global_node_page_state(NR_PAGETABLE), - global_node_page_state(NR_SECONDARY_PAGETABLE), - global_zone_page_state(NR_BOUNCE), - global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE), - global_zone_page_state(NR_FREE_PAGES), - free_pcp, - global_zone_page_state(NR_FREE_CMA_PAGES)); - - for_each_online_pgdat(pgdat) { - if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) - continue; - if (!node_has_managed_zones(pgdat, max_zone_idx)) - continue; - - printk("Node %d" - " active_anon:%lukB" - " inactive_anon:%lukB" - " active_file:%lukB" - " inactive_file:%lukB" - " unevictable:%lukB" - " isolated(anon):%lukB" - " isolated(file):%lukB" - " mapped:%lukB" - " dirty:%lukB" - " writeback:%lukB" - " shmem:%lukB" -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - " shmem_thp: %lukB" - " shmem_pmdmapped: %lukB" - " anon_thp: %lukB" -#endif - " writeback_tmp:%lukB" - " kernel_stack:%lukB" -#ifdef CONFIG_SHADOW_CALL_STACK - " shadow_call_stack:%lukB" -#endif - " pagetables:%lukB" - " sec_pagetables:%lukB" - " all_unreclaimable? %s" - "\n", - pgdat->node_id, - K(node_page_state(pgdat, NR_ACTIVE_ANON)), - K(node_page_state(pgdat, NR_INACTIVE_ANON)), - K(node_page_state(pgdat, NR_ACTIVE_FILE)), - K(node_page_state(pgdat, NR_INACTIVE_FILE)), - K(node_page_state(pgdat, NR_UNEVICTABLE)), - K(node_page_state(pgdat, NR_ISOLATED_ANON)), - K(node_page_state(pgdat, NR_ISOLATED_FILE)), - K(node_page_state(pgdat, NR_FILE_MAPPED)), - K(node_page_state(pgdat, NR_FILE_DIRTY)), - K(node_page_state(pgdat, NR_WRITEBACK)), - K(node_page_state(pgdat, NR_SHMEM)), -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - K(node_page_state(pgdat, NR_SHMEM_THPS)), - K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)), - K(node_page_state(pgdat, NR_ANON_THPS)), -#endif - K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), - node_page_state(pgdat, NR_KERNEL_STACK_KB), -#ifdef CONFIG_SHADOW_CALL_STACK - node_page_state(pgdat, NR_KERNEL_SCS_KB), -#endif - K(node_page_state(pgdat, NR_PAGETABLE)), - K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), - pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? - "yes" : "no"); - } - - for_each_populated_zone(zone) { - int i; - - if (zone_idx(zone) > max_zone_idx) - continue; - if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) - continue; - - free_pcp = 0; - for_each_online_cpu(cpu) - free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; - - show_node(zone); - printk(KERN_CONT - "%s" - " free:%lukB" - " boost:%lukB" - " min:%lukB" - " low:%lukB" - " high:%lukB" - " reserved_highatomic:%luKB" - " active_anon:%lukB" - " inactive_anon:%lukB" - " active_file:%lukB" - " inactive_file:%lukB" - " unevictable:%lukB" - " writepending:%lukB" - " present:%lukB" - " managed:%lukB" - " mlocked:%lukB" - " bounce:%lukB" - " free_pcp:%lukB" - " local_pcp:%ukB" - " free_cma:%lukB" - "\n", - zone->name, - K(zone_page_state(zone, NR_FREE_PAGES)), - K(zone->watermark_boost), - K(min_wmark_pages(zone)), - K(low_wmark_pages(zone)), - K(high_wmark_pages(zone)), - K(zone->nr_reserved_highatomic), - K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), - K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), - K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), - K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), - K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), - K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), - K(zone->present_pages), - K(zone_managed_pages(zone)), - K(zone_page_state(zone, NR_MLOCK)), - K(zone_page_state(zone, NR_BOUNCE)), - K(free_pcp), - K(this_cpu_read(zone->per_cpu_pageset->count)), - K(zone_page_state(zone, NR_FREE_CMA_PAGES))); - printk("lowmem_reserve[]:"); - for (i = 0; i < MAX_NR_ZONES; i++) - printk(KERN_CONT " %ld", zone->lowmem_reserve[i]); - printk(KERN_CONT "\n"); - } - - for_each_populated_zone(zone) { - unsigned int order; - unsigned long nr[MAX_ORDER + 1], flags, total = 0; - unsigned char types[MAX_ORDER + 1]; - - if (zone_idx(zone) > max_zone_idx) - continue; - if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) - continue; - show_node(zone); - printk(KERN_CONT "%s: ", zone->name); - - spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order <= MAX_ORDER; order++) { - struct free_area *area = &zone->free_area[order]; - int type; - - nr[order] = area->nr_free; - total += nr[order] << order; - - types[order] = 0; - for (type = 0; type < MIGRATE_TYPES; type++) { - if (!free_area_empty(area, type)) - types[order] |= 1 << type; - } - } - spin_unlock_irqrestore(&zone->lock, flags); - for (order = 0; order <= MAX_ORDER; order++) { - printk(KERN_CONT "%lu*%lukB ", - nr[order], K(1UL) << order); - if (nr[order]) - show_migration_types(types[order]); - } - printk(KERN_CONT "= %lukB\n", K(total)); - } - - for_each_online_node(nid) { - if (show_mem_node_skip(filter, nid, nodemask)) - continue; - hugetlb_show_meminfo_node(nid); - } - - printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); - - show_swap_cache_info(); -} - static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) { zoneref->zone = zone; @@ -5560,12 +4892,12 @@ static int __parse_numa_zonelist_order(char *s) return 0; } -char numa_zonelist_order[] = "Node"; - +static char numa_zonelist_order[] = "Node"; +#define NUMA_ZONELIST_ORDER_LEN 16 /* * sysctl handler for numa_zonelist_order */ -int numa_zonelist_order_handler(struct ctl_table *table, int write, +static int numa_zonelist_order_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { if (write) @@ -5573,7 +4905,6 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write, return proc_dostring(table, write, buffer, length, ppos); } - static int node_load[MAX_NUMNODES]; /** @@ -5976,6 +5307,7 @@ static int zone_batchsize(struct zone *zone) #endif } +static int percpu_pagelist_high_fraction; static int zone_highsize(struct zone *zone, int batch, int cpu_online) { #ifdef CONFIG_MMU @@ -6505,7 +5837,7 @@ postcore_initcall(init_per_zone_wmark_min) * that we can call two helper functions whenever min_free_kbytes * changes. */ -int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, +static int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -6521,7 +5853,7 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, return 0; } -int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, +static int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -6551,7 +5883,7 @@ static void setup_min_unmapped_ratio(void) } -int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, +static int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -6578,7 +5910,7 @@ static void setup_min_slab_ratio(void) sysctl_min_slab_ratio) / 100; } -int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, +static int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -6602,8 +5934,8 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, * minimum watermarks. The lowmem reserve ratio can only make sense * if in function of the boot time zone sizes. */ -int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, - void *buffer, size_t *length, loff_t *ppos) +static int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, + int write, void *buffer, size_t *length, loff_t *ppos) { int i; @@ -6623,7 +5955,7 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, * cpu. It is the fraction of total pages in each zone that a hot per cpu * pagelist can have before it gets flushed back to buddy allocator. */ -int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table, +static int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { struct zone *zone; @@ -6656,9 +5988,83 @@ out: return ret; } +static struct ctl_table page_alloc_sysctl_table[] = { + { + .procname = "min_free_kbytes", + .data = &min_free_kbytes, + .maxlen = sizeof(min_free_kbytes), + .mode = 0644, + .proc_handler = min_free_kbytes_sysctl_handler, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "watermark_boost_factor", + .data = &watermark_boost_factor, + .maxlen = sizeof(watermark_boost_factor), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "watermark_scale_factor", + .data = &watermark_scale_factor, + .maxlen = sizeof(watermark_scale_factor), + .mode = 0644, + .proc_handler = watermark_scale_factor_sysctl_handler, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_THREE_THOUSAND, + }, + { + .procname = "percpu_pagelist_high_fraction", + .data = &percpu_pagelist_high_fraction, + .maxlen = sizeof(percpu_pagelist_high_fraction), + .mode = 0644, + .proc_handler = percpu_pagelist_high_fraction_sysctl_handler, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "lowmem_reserve_ratio", + .data = &sysctl_lowmem_reserve_ratio, + .maxlen = sizeof(sysctl_lowmem_reserve_ratio), + .mode = 0644, + .proc_handler = lowmem_reserve_ratio_sysctl_handler, + }, +#ifdef CONFIG_NUMA + { + .procname = "numa_zonelist_order", + .data = &numa_zonelist_order, + .maxlen = NUMA_ZONELIST_ORDER_LEN, + .mode = 0644, + .proc_handler = numa_zonelist_order_handler, + }, + { + .procname = "min_unmapped_ratio", + .data = &sysctl_min_unmapped_ratio, + .maxlen = sizeof(sysctl_min_unmapped_ratio), + .mode = 0644, + .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, + { + .procname = "min_slab_ratio", + .data = &sysctl_min_slab_ratio, + .maxlen = sizeof(sysctl_min_slab_ratio), + .mode = 0644, + .proc_handler = sysctl_min_slab_ratio_sysctl_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, +#endif + {} +}; + +void __init page_alloc_sysctl_init(void) +{ + register_sysctl_init("vm", page_alloc_sysctl_table); +} + #ifdef CONFIG_CONTIG_ALLOC -#if defined(CONFIG_DYNAMIC_DEBUG) || \ - (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* Usage: See admin-guide/dynamic-debug-howto.rst */ static void alloc_contig_dump_pages(struct list_head *page_list) { @@ -6672,11 +6078,6 @@ static void alloc_contig_dump_pages(struct list_head *page_list) dump_page(page, "migration failure"); } } -#else -static inline void alloc_contig_dump_pages(struct list_head *page_list) -{ -} -#endif /* [start, end) must belong to a single zone. */ int __alloc_contig_migrate_range(struct compact_control *cc, @@ -7215,3 +6616,150 @@ bool has_managed_dma(void) return false; } #endif /* CONFIG_ZONE_DMA */ + +#ifdef CONFIG_UNACCEPTED_MEMORY + +/* Counts number of zones with unaccepted pages. */ +static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages); + +static bool lazy_accept = true; + +static int __init accept_memory_parse(char *p) +{ + if (!strcmp(p, "lazy")) { + lazy_accept = true; + return 0; + } else if (!strcmp(p, "eager")) { + lazy_accept = false; + return 0; + } else { + return -EINVAL; + } +} +early_param("accept_memory", accept_memory_parse); + +static bool page_contains_unaccepted(struct page *page, unsigned int order) +{ + phys_addr_t start = page_to_phys(page); + phys_addr_t end = start + (PAGE_SIZE << order); + + return range_contains_unaccepted_memory(start, end); +} + +static void accept_page(struct page *page, unsigned int order) +{ + phys_addr_t start = page_to_phys(page); + + accept_memory(start, start + (PAGE_SIZE << order)); +} + +static bool try_to_accept_memory_one(struct zone *zone) +{ + unsigned long flags; + struct page *page; + bool last; + + if (list_empty(&zone->unaccepted_pages)) + return false; + + spin_lock_irqsave(&zone->lock, flags); + page = list_first_entry_or_null(&zone->unaccepted_pages, + struct page, lru); + if (!page) { + spin_unlock_irqrestore(&zone->lock, flags); + return false; + } + + list_del(&page->lru); + last = list_empty(&zone->unaccepted_pages); + + __mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); + spin_unlock_irqrestore(&zone->lock, flags); + + accept_page(page, MAX_ORDER); + + __free_pages_ok(page, MAX_ORDER, FPI_TO_TAIL); + + if (last) + static_branch_dec(&zones_with_unaccepted_pages); + + return true; +} + +static bool try_to_accept_memory(struct zone *zone, unsigned int order) +{ + long to_accept; + int ret = false; + + /* How much to accept to get to high watermark? */ + to_accept = high_wmark_pages(zone) - + (zone_page_state(zone, NR_FREE_PAGES) - + __zone_watermark_unusable_free(zone, order, 0)); + + /* Accept at least one page */ + do { + if (!try_to_accept_memory_one(zone)) + break; + ret = true; + to_accept -= MAX_ORDER_NR_PAGES; + } while (to_accept > 0); + + return ret; +} + +static inline bool has_unaccepted_memory(void) +{ + return static_branch_unlikely(&zones_with_unaccepted_pages); +} + +static bool __free_unaccepted(struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long flags; + bool first = false; + + if (!lazy_accept) + return false; + + spin_lock_irqsave(&zone->lock, flags); + first = list_empty(&zone->unaccepted_pages); + list_add_tail(&page->lru, &zone->unaccepted_pages); + __mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); + spin_unlock_irqrestore(&zone->lock, flags); + + if (first) + static_branch_inc(&zones_with_unaccepted_pages); + + return true; +} + +#else + +static bool page_contains_unaccepted(struct page *page, unsigned int order) +{ + return false; +} + +static void accept_page(struct page *page, unsigned int order) +{ +} + +static bool try_to_accept_memory(struct zone *zone, unsigned int order) +{ + return false; +} + +static inline bool has_unaccepted_memory(void) +{ + return false; +} + +static bool __free_unaccepted(struct page *page) +{ + BUILD_BUG(); + return false; +} + +#endif /* CONFIG_UNACCEPTED_MEMORY */ diff --git a/mm/page_io.c b/mm/page_io.c index 87b682d18850..684cd3c7b59b 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -338,7 +338,7 @@ static void swap_writepage_bdev_sync(struct page *page, bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc)); bio.bi_iter.bi_sector = swap_page_sector(page); - bio_add_page(&bio, page, thp_size(page), 0); + __bio_add_page(&bio, page, thp_size(page), 0); bio_associate_blkg_from_page(&bio, page); count_swpout_vm_event(page); @@ -360,7 +360,7 @@ static void swap_writepage_bdev_async(struct page *page, GFP_NOIO); bio->bi_iter.bi_sector = swap_page_sector(page); bio->bi_end_io = end_swap_bio_write; - bio_add_page(bio, page, thp_size(page), 0); + __bio_add_page(bio, page, thp_size(page), 0); bio_associate_blkg_from_page(bio, page); count_swpout_vm_event(page); @@ -468,7 +468,7 @@ static void swap_readpage_bdev_sync(struct page *page, bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ); bio.bi_iter.bi_sector = swap_page_sector(page); - bio_add_page(&bio, page, thp_size(page), 0); + __bio_add_page(&bio, page, thp_size(page), 0); /* * Keep this task valid during swap readpage because the oom killer may * attempt to access it in the page fault retry time check. @@ -488,7 +488,7 @@ static void swap_readpage_bdev_async(struct page *page, bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL); bio->bi_iter.bi_sector = swap_page_sector(page); bio->bi_end_io = end_swap_bio_read; - bio_add_page(bio, page, thp_size(page), 0); + __bio_add_page(bio, page, thp_size(page), 0); count_vm_event(PSWPIN); submit_bio(bio); } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c6f3605e37ab..6599cc965e21 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -481,10 +481,9 @@ failed: } /** - * start_isolate_page_range() - make page-allocation-type of range of pages to - * be MIGRATE_ISOLATE. - * @start_pfn: The lower PFN of the range to be isolated. - * @end_pfn: The upper PFN of the range to be isolated. + * start_isolate_page_range() - mark page range MIGRATE_ISOLATE + * @start_pfn: The first PFN of the range to be isolated. + * @end_pfn: The last PFN of the range to be isolated. * @migratetype: Migrate type to set in error recovery. * @flags: The following flags are allowed (they can be combined in * a bit mask) @@ -571,8 +570,14 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, return 0; } -/* - * Make isolated pages available again. +/** + * undo_isolate_page_range - undo effects of start_isolate_page_range() + * @start_pfn: The first PFN of the isolated range + * @end_pfn: The last PFN of the isolated range + * @migratetype: New migrate type to set on the range + * + * This finds every MIGRATE_ISOLATE page block in the given range + * and switches it to @migratetype. */ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, int migratetype) @@ -631,7 +636,21 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, return pfn; } -/* Caller should ensure that requested range is in a single zone */ +/** + * test_pages_isolated - check if pageblocks in range are isolated + * @start_pfn: The first PFN of the isolated range + * @end_pfn: The first PFN *after* the isolated range + * @isol_flags: Testing mode flags + * + * This tests if all in the specified range are free. + * + * If %MEMORY_OFFLINE is specified in @flags, it will consider + * poisoned and offlined pages free as well. + * + * Caller must ensure the requested range doesn't span zones. + * + * Returns 0 if true, -EBUSY if one or more pages are in use. + */ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, int isol_flags) { diff --git a/mm/page_owner.c b/mm/page_owner.c index 31169b3e7f06..c93baef0148f 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -418,7 +418,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, pageblock_mt = get_pageblock_migratetype(page); page_mt = gfp_migratetype(page_owner->gfp_mask); ret += scnprintf(kbuf + ret, count - ret, - "PFN %lu type %s Block %lu type %s Flags %pGp\n", + "PFN 0x%lx type %s Block %lu type %s Flags %pGp\n", pfn, migratetype_names[page_mt], pfn >> pageblock_order, diff --git a/mm/page_table_check.c b/mm/page_table_check.c index f2baf97d5f38..93ec7690a0d8 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -196,7 +196,7 @@ void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, if (&init_mm == mm) return; - __page_table_check_pte_clear(mm, addr, *ptep); + __page_table_check_pte_clear(mm, addr, ptep_get(ptep)); if (pte_user_accessible_page(pte)) { page_table_check_set(mm, addr, pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT, @@ -246,8 +246,10 @@ void __page_table_check_pte_clear_range(struct mm_struct *mm, pte_t *ptep = pte_offset_map(&pmd, addr); unsigned long i; + if (WARN_ON(!ptep)) + return; for (i = 0; i < PTRS_PER_PTE; i++) { - __page_table_check_pte_clear(mm, addr, *ptep); + __page_table_check_pte_clear(mm, addr, ptep_get(ptep)); addr += PAGE_SIZE; ptep++; } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 4e448cfbc6ef..49e0d28f0379 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -13,42 +13,61 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw) return false; } -static bool map_pte(struct page_vma_mapped_walk *pvmw) +static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) { - pvmw->pte = pte_offset_map(pvmw->pmd, pvmw->address); - if (!(pvmw->flags & PVMW_SYNC)) { - if (pvmw->flags & PVMW_MIGRATION) { - if (!is_swap_pte(*pvmw->pte)) - return false; - } else { - /* - * We get here when we are trying to unmap a private - * device page from the process address space. Such - * page is not CPU accessible and thus is mapped as - * a special swap entry, nonetheless it still does - * count as a valid regular mapping for the page (and - * is accounted as such in page maps count). - * - * So handle this special case as if it was a normal - * page mapping ie lock CPU page table and returns - * true. - * - * For more details on device private memory see HMM - * (include/linux/hmm.h or mm/hmm.c). - */ - if (is_swap_pte(*pvmw->pte)) { - swp_entry_t entry; + pte_t ptent; - /* Handle un-addressable ZONE_DEVICE memory */ - entry = pte_to_swp_entry(*pvmw->pte); - if (!is_device_private_entry(entry) && - !is_device_exclusive_entry(entry)) - return false; - } else if (!pte_present(*pvmw->pte)) - return false; - } + if (pvmw->flags & PVMW_SYNC) { + /* Use the stricter lookup */ + pvmw->pte = pte_offset_map_lock(pvmw->vma->vm_mm, pvmw->pmd, + pvmw->address, &pvmw->ptl); + *ptlp = pvmw->ptl; + return !!pvmw->pte; } - pvmw->ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd); + + /* + * It is important to return the ptl corresponding to pte, + * in case *pvmw->pmd changes underneath us; so we need to + * return it even when choosing not to lock, in case caller + * proceeds to loop over next ptes, and finds a match later. + * Though, in most cases, page lock already protects this. + */ + pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd, + pvmw->address, ptlp); + if (!pvmw->pte) + return false; + + ptent = ptep_get(pvmw->pte); + + if (pvmw->flags & PVMW_MIGRATION) { + if (!is_swap_pte(ptent)) + return false; + } else if (is_swap_pte(ptent)) { + swp_entry_t entry; + /* + * Handle un-addressable ZONE_DEVICE memory. + * + * We get here when we are trying to unmap a private + * device page from the process address space. Such + * page is not CPU accessible and thus is mapped as + * a special swap entry, nonetheless it still does + * count as a valid regular mapping for the page + * (and is accounted as such in page maps count). + * + * So handle this special case as if it was a normal + * page mapping ie lock CPU page table and return true. + * + * For more details on device private memory see HMM + * (include/linux/hmm.h or mm/hmm.c). + */ + entry = pte_to_swp_entry(ptent); + if (!is_device_private_entry(entry) && + !is_device_exclusive_entry(entry)) + return false; + } else if (!pte_present(ptent)) { + return false; + } + pvmw->ptl = *ptlp; spin_lock(pvmw->ptl); return true; } @@ -75,33 +94,34 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw) static bool check_pte(struct page_vma_mapped_walk *pvmw) { unsigned long pfn; + pte_t ptent = ptep_get(pvmw->pte); if (pvmw->flags & PVMW_MIGRATION) { swp_entry_t entry; - if (!is_swap_pte(*pvmw->pte)) + if (!is_swap_pte(ptent)) return false; - entry = pte_to_swp_entry(*pvmw->pte); + entry = pte_to_swp_entry(ptent); if (!is_migration_entry(entry) && !is_device_exclusive_entry(entry)) return false; pfn = swp_offset_pfn(entry); - } else if (is_swap_pte(*pvmw->pte)) { + } else if (is_swap_pte(ptent)) { swp_entry_t entry; /* Handle un-addressable ZONE_DEVICE memory */ - entry = pte_to_swp_entry(*pvmw->pte); + entry = pte_to_swp_entry(ptent); if (!is_device_private_entry(entry) && !is_device_exclusive_entry(entry)) return false; pfn = swp_offset_pfn(entry); } else { - if (!pte_present(*pvmw->pte)) + if (!pte_present(ptent)) return false; - pfn = pte_pfn(*pvmw->pte); + pfn = pte_pfn(ptent); } return (pfn - pvmw->pfn) < pvmw->nr_pages; @@ -153,6 +173,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) struct vm_area_struct *vma = pvmw->vma; struct mm_struct *mm = vma->vm_mm; unsigned long end; + spinlock_t *ptl; pgd_t *pgd; p4d_t *p4d; pud_t *pud; @@ -210,7 +231,7 @@ restart: * compiler and used as a stale value after we've observed a * subsequent update. */ - pmde = READ_ONCE(*pvmw->pmd); + pmde = pmdp_get_lockless(pvmw->pmd); if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde) || (pmd_present(pmde) && pmd_devmap(pmde))) { @@ -254,8 +275,11 @@ restart: step_forward(pvmw, PMD_SIZE); continue; } - if (!map_pte(pvmw)) + if (!map_pte(pvmw, &ptl)) { + if (!pvmw->pte) + goto restart; goto next_pte; + } this_pte: if (check_pte(pvmw)) return true; @@ -275,14 +299,10 @@ next_pte: goto restart; } pvmw->pte++; - if ((pvmw->flags & PVMW_SYNC) && !pvmw->ptl) { - pvmw->ptl = pte_lockptr(mm, pvmw->pmd); - spin_lock(pvmw->ptl); - } - } while (pte_none(*pvmw->pte)); + } while (pte_none(ptep_get(pvmw->pte))); if (!pvmw->ptl) { - pvmw->ptl = pte_lockptr(mm, pvmw->pmd); + pvmw->ptl = ptl; spin_lock(pvmw->ptl); } goto this_pte; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index cb23f8a15c13..64437105fe0d 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -46,15 +46,27 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, spinlock_t *ptl; if (walk->no_vma) { - pte = pte_offset_map(pmd, addr); - err = walk_pte_range_inner(pte, addr, end, walk); - pte_unmap(pte); + /* + * pte_offset_map() might apply user-specific validation. + */ + if (walk->mm == &init_mm) + pte = pte_offset_kernel(pmd, addr); + else + pte = pte_offset_map(pmd, addr); + if (pte) { + err = walk_pte_range_inner(pte, addr, end, walk); + if (walk->mm != &init_mm) + pte_unmap(pte); + } } else { pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); - err = walk_pte_range_inner(pte, addr, end, walk); - pte_unmap_unlock(pte, ptl); + if (pte) { + err = walk_pte_range_inner(pte, addr, end, walk); + pte_unmap_unlock(pte, ptl); + } } - + if (!pte) + walk->action = ACTION_AGAIN; return err; } @@ -141,11 +153,8 @@ again: !(ops->pte_entry)) continue; - if (walk->vma) { + if (walk->vma) split_huge_pmd(walk->vma, pmd, addr); - if (pmd_trans_unstable(pmd)) - goto again; - } if (is_hugepd(__hugepd(pmd_val(*pmd)))) err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT); @@ -153,6 +162,10 @@ again: err = walk_pte_range(pmd, addr, next, walk); if (err) break; + + if (walk->action == ACTION_AGAIN) + goto again; + } while (pmd++, addr = next, addr != end); return err; diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index f9847c131998..cdd0aa597a81 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -41,10 +41,17 @@ struct pcpu_chunk { struct list_head list; /* linked to pcpu_slot lists */ int free_bytes; /* free bytes in the chunk */ struct pcpu_block_md chunk_md; - void *base_addr; /* base address of this chunk */ + unsigned long *bound_map; /* boundary map */ + + /* + * base_addr is the base address of this chunk. + * To reduce false sharing, current layout is optimized to make sure + * base_addr locate in the different cacheline with free_bytes and + * chunk_md. + */ + void *base_addr ____cacheline_aligned_in_smp; unsigned long *alloc_map; /* allocation map */ - unsigned long *bound_map; /* boundary map */ struct pcpu_block_md *md_blocks; /* metadata blocks */ void *data; /* chunk data */ diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index d2fc52bffafc..4d454953046f 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -10,6 +10,8 @@ #include <linux/pagemap.h> #include <linux/hugetlb.h> #include <linux/pgtable.h> +#include <linux/swap.h> +#include <linux/swapops.h> #include <linux/mm_inline.h> #include <asm/tlb.h> @@ -66,7 +68,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { - int changed = !pte_same(*ptep, entry); + int changed = !pte_same(ptep_get(ptep), entry); if (changed) { set_pte_at(vma->vm_mm, address, ptep, entry); flush_tlb_fix_spurious_fault(vma, address, ptep); @@ -229,3 +231,57 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, } #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) +{ + pmd_t pmdval; + + /* rcu_read_lock() to be added later */ + pmdval = pmdp_get_lockless(pmd); + if (pmdvalp) + *pmdvalp = pmdval; + if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval))) + goto nomap; + if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval))) + goto nomap; + if (unlikely(pmd_bad(pmdval))) { + pmd_clear_bad(pmd); + goto nomap; + } + return __pte_map(&pmdval, addr); +nomap: + /* rcu_read_unlock() to be added later */ + return NULL; +} + +pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, spinlock_t **ptlp) +{ + pmd_t pmdval; + pte_t *pte; + + pte = __pte_offset_map(pmd, addr, &pmdval); + if (likely(pte)) + *ptlp = pte_lockptr(mm, &pmdval); + return pte; +} + +pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, spinlock_t **ptlp) +{ + spinlock_t *ptl; + pmd_t pmdval; + pte_t *pte; +again: + pte = __pte_offset_map(pmd, addr, &pmdval); + if (unlikely(!pte)) + return pte; + ptl = pte_lockptr(mm, &pmdval); + spin_lock(ptl); + if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) { + *ptlp = ptl; + return pte; + } + pte_unmap_unlock(pte, ptl); + goto again; +} diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 78dfaf9e8990..0523edab03a6 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -104,7 +104,7 @@ static int process_vm_rw_single_vec(unsigned long addr, mmap_read_lock(mm); pinned_pages = pin_user_pages_remote(mm, pa, pinned_pages, flags, process_pages, - NULL, &locked); + &locked); if (locked) mmap_read_unlock(mm); if (pinned_pages <= 0) diff --git a/mm/ptdump.c b/mm/ptdump.c index 8adab455a68b..03c1bdae4a43 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -119,7 +119,7 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct ptdump_state *st = walk->private; - pte_t val = ptep_get(pte); + pte_t val = ptep_get_lockless(pte); if (st->effective_prot) st->effective_prot(st, 4, pte_val(val)); diff --git a/mm/readahead.c b/mm/readahead.c index 47afbca1d122..a9c999aa19af 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -120,7 +120,6 @@ #include <linux/export.h> #include <linux/backing-dev.h> #include <linux/task_io_accounting_ops.h> -#include <linux/pagevec.h> #include <linux/pagemap.h> #include <linux/psi.h> #include <linux/syscalls.h> diff --git a/mm/rmap.c b/mm/rmap.c index 19392e090bec..0c0d8857dfce 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -826,7 +826,8 @@ static bool folio_referenced_one(struct folio *folio, } if (pvmw.pte) { - if (lru_gen_enabled() && pte_young(*pvmw.pte)) { + if (lru_gen_enabled() && + pte_young(ptep_get(pvmw.pte))) { lru_gen_look_around(&pvmw); referenced++; } @@ -956,13 +957,13 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) address = pvmw->address; if (pvmw->pte) { - pte_t entry; pte_t *pte = pvmw->pte; + pte_t entry = ptep_get(pte); - if (!pte_dirty(*pte) && !pte_write(*pte)) + if (!pte_dirty(entry) && !pte_write(entry)) continue; - flush_cache_page(vma, address, pte_pfn(*pte)); + flush_cache_page(vma, address, pte_pfn(entry)); entry = ptep_clear_flush(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); @@ -1137,7 +1138,7 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) * @folio: Folio which contains page. * @page: Page to add to rmap. * @vma: VM area to add page to. - * @address: User virtual address of the mapping + * @address: User virtual address of the mapping * @exclusive: the page is exclusively owned by the current process */ static void __page_set_anon_rmap(struct folio *folio, struct page *page, @@ -1458,6 +1459,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, bool anon_exclusive, ret = true; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; + unsigned long pfn; /* * When racing against e.g. zap_pte_range() on another cpu, @@ -1508,8 +1510,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, break; } - subpage = folio_page(folio, - pte_pfn(*pvmw.pte) - folio_pfn(folio)); + pfn = pte_pfn(ptep_get(pvmw.pte)); + subpage = folio_page(folio, pfn - folio_pfn(folio)); address = pvmw.address; anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(subpage); @@ -1571,7 +1573,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, } pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { - flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); + flush_cache_page(vma, address, pfn); /* Nuke the page table entry. */ if (should_defer_flush(mm, flags)) { /* @@ -1818,6 +1820,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, bool anon_exclusive, ret = true; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; + unsigned long pfn; /* * When racing against e.g. zap_pte_range() on another cpu, @@ -1877,6 +1880,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); + pfn = pte_pfn(ptep_get(pvmw.pte)); + if (folio_is_zone_device(folio)) { /* * Our PTE is a non-present device exclusive entry and @@ -1891,8 +1896,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio); subpage = &folio->page; } else { - subpage = folio_page(folio, - pte_pfn(*pvmw.pte) - folio_pfn(folio)); + subpage = folio_page(folio, pfn - folio_pfn(folio)); } address = pvmw.address; anon_exclusive = folio_test_anon(folio) && @@ -1952,7 +1956,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, /* Nuke the hugetlb page table entry */ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { - flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); + flush_cache_page(vma, address, pfn); /* Nuke the page table entry. */ if (should_defer_flush(mm, flags)) { /* @@ -2187,6 +2191,7 @@ static bool page_make_device_exclusive_one(struct folio *folio, struct mmu_notifier_range range; swp_entry_t entry; pte_t swp_pte; + pte_t ptent; mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma->vm_mm, address, min(vma->vm_end, @@ -2198,18 +2203,19 @@ static bool page_make_device_exclusive_one(struct folio *folio, /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); - if (!pte_present(*pvmw.pte)) { + ptent = ptep_get(pvmw.pte); + if (!pte_present(ptent)) { ret = false; page_vma_mapped_walk_done(&pvmw); break; } subpage = folio_page(folio, - pte_pfn(*pvmw.pte) - folio_pfn(folio)); + pte_pfn(ptent) - folio_pfn(folio)); address = pvmw.address; /* Nuke the page table entry. */ - flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); + flush_cache_page(vma, address, pte_pfn(ptent)); pteval = ptep_clear_flush(vma, address, pvmw.pte); /* Set the dirty flag on the folio now the pte is gone. */ @@ -2328,7 +2334,7 @@ int make_device_exclusive_range(struct mm_struct *mm, unsigned long start, npages = get_user_pages_remote(mm, start, npages, FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD, - pages, NULL, NULL); + pages, NULL); if (npages < 0) return npages; diff --git a/mm/secretmem.c b/mm/secretmem.c index 0b502625cd30..86442a15d12f 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -35,7 +35,7 @@ #define SECRETMEM_MODE_MASK (0x0) #define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK -static bool secretmem_enable __ro_after_init; +static bool secretmem_enable __ro_after_init = 1; module_param_named(enable, secretmem_enable, bool, 0400); MODULE_PARM_DESC(secretmem_enable, "Enable secretmem and memfd_secret(2) system call"); @@ -125,7 +125,7 @@ static int secretmem_mmap(struct file *file, struct vm_area_struct *vma) if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) return -EINVAL; - if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) + if (!mlock_future_ok(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) return -EAGAIN; vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP); diff --git a/mm/shmem.c b/mm/shmem.c index e40a08c5c6d7..2f2e0e618072 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2731,6 +2731,138 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return retval ? retval : error; } +static bool zero_pipe_buf_get(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return true; +} + +static void zero_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ +} + +static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return false; +} + +static const struct pipe_buf_operations zero_pipe_buf_ops = { + .release = zero_pipe_buf_release, + .try_steal = zero_pipe_buf_try_steal, + .get = zero_pipe_buf_get, +}; + +static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe, + loff_t fpos, size_t size) +{ + size_t offset = fpos & ~PAGE_MASK; + + size = min_t(size_t, size, PAGE_SIZE - offset); + + if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + struct pipe_buffer *buf = pipe_head_buf(pipe); + + *buf = (struct pipe_buffer) { + .ops = &zero_pipe_buf_ops, + .page = ZERO_PAGE(0), + .offset = offset, + .len = size, + }; + pipe->head++; + } + + return size; +} + +static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct inode *inode = file_inode(in); + struct address_space *mapping = inode->i_mapping; + struct folio *folio = NULL; + size_t total_spliced = 0, used, npages, n, part; + loff_t isize; + int error = 0; + + /* Work out how much data we can actually add into the pipe */ + used = pipe_occupancy(pipe->head, pipe->tail); + npages = max_t(ssize_t, pipe->max_usage - used, 0); + len = min_t(size_t, len, npages * PAGE_SIZE); + + do { + if (*ppos >= i_size_read(inode)) + break; + + error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio, SGP_READ); + if (error) { + if (error == -EINVAL) + error = 0; + break; + } + if (folio) { + folio_unlock(folio); + + if (folio_test_hwpoison(folio)) { + error = -EIO; + break; + } + } + + /* + * i_size must be checked after we know the pages are Uptodate. + * + * Checking i_size after the check allows us to calculate + * the correct value for "nr", which means the zero-filled + * part of the page is not copied back to userspace (unless + * another truncate extends the file - this is desired though). + */ + isize = i_size_read(inode); + if (unlikely(*ppos >= isize)) + break; + part = min_t(loff_t, isize - *ppos, len); + + if (folio) { + /* + * If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (mapping_writably_mapped(mapping)) + flush_dcache_folio(folio); + folio_mark_accessed(folio); + /* + * Ok, we have the page, and it's up-to-date, so we can + * now splice it into the pipe. + */ + n = splice_folio_into_pipe(pipe, folio, *ppos, part); + folio_put(folio); + folio = NULL; + } else { + n = splice_zeropage_into_pipe(pipe, *ppos, len); + } + + if (!n) + break; + len -= n; + total_spliced += n; + *ppos += n; + in->f_ra.prev_pos = *ppos; + if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + break; + + cond_resched(); + } while (len); + + if (folio) + folio_put(folio); + + file_accessed(in); + return total_spliced ? total_spliced : error; +} + static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) { struct address_space *mapping = file->f_mapping; @@ -3726,6 +3858,7 @@ out: static int shmem_show_options(struct seq_file *seq, struct dentry *root) { struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); + struct mempolicy *mpol; if (sbinfo->max_blocks != shmem_default_max_blocks()) seq_printf(seq, ",size=%luk", @@ -3768,7 +3901,9 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) if (sbinfo->huge) seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); #endif - shmem_show_mpol(seq, sbinfo->mpol); + mpol = shmem_get_sbmpol(sbinfo); + shmem_show_mpol(seq, mpol); + mpol_put(mpol); if (sbinfo->noswap) seq_printf(seq, ",noswap"); return 0; @@ -3971,7 +4106,7 @@ static const struct file_operations shmem_file_operations = { .read_iter = shmem_file_read_iter, .write_iter = generic_file_write_iter, .fsync = noop_fsync, - .splice_read = generic_file_splice_read, + .splice_read = shmem_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = shmem_fallocate, #endif @@ -4196,7 +4331,7 @@ static struct file_system_type shmem_fs_type = { .name = "tmpfs", .init_fs_context = ramfs_init_fs_context, .parameters = ramfs_fs_parameters, - .kill_sb = kill_litter_super, + .kill_sb = ramfs_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; diff --git a/mm/show_mem.c b/mm/show_mem.c new file mode 100644 index 000000000000..01f8e9905817 --- /dev/null +++ b/mm/show_mem.c @@ -0,0 +1,429 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Generic show_mem() implementation + * + * Copyright (C) 2008 Johannes Weiner <hannes@saeurebad.de> + */ + +#include <linux/blkdev.h> +#include <linux/cma.h> +#include <linux/cpuset.h> +#include <linux/highmem.h> +#include <linux/hugetlb.h> +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/swap.h> +#include <linux/vmstat.h> + +#include "internal.h" +#include "swap.h" + +atomic_long_t _totalram_pages __read_mostly; +EXPORT_SYMBOL(_totalram_pages); +unsigned long totalreserve_pages __read_mostly; +unsigned long totalcma_pages __read_mostly; + +static inline void show_node(struct zone *zone) +{ + if (IS_ENABLED(CONFIG_NUMA)) + printk("Node %d ", zone_to_nid(zone)); +} + +long si_mem_available(void) +{ + long available; + unsigned long pagecache; + unsigned long wmark_low = 0; + unsigned long pages[NR_LRU_LISTS]; + unsigned long reclaimable; + struct zone *zone; + int lru; + + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) + pages[lru] = global_node_page_state(NR_LRU_BASE + lru); + + for_each_zone(zone) + wmark_low += low_wmark_pages(zone); + + /* + * Estimate the amount of memory available for userspace allocations, + * without causing swapping or OOM. + */ + available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; + + /* + * Not all the page cache can be freed, otherwise the system will + * start swapping or thrashing. Assume at least half of the page + * cache, or the low watermark worth of cache, needs to stay. + */ + pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; + pagecache -= min(pagecache / 2, wmark_low); + available += pagecache; + + /* + * Part of the reclaimable slab and other kernel memory consists of + * items that are in use, and cannot be freed. Cap this estimate at the + * low watermark. + */ + reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); + available += reclaimable - min(reclaimable / 2, wmark_low); + + if (available < 0) + available = 0; + return available; +} +EXPORT_SYMBOL_GPL(si_mem_available); + +void si_meminfo(struct sysinfo *val) +{ + val->totalram = totalram_pages(); + val->sharedram = global_node_page_state(NR_SHMEM); + val->freeram = global_zone_page_state(NR_FREE_PAGES); + val->bufferram = nr_blockdev_pages(); + val->totalhigh = totalhigh_pages(); + val->freehigh = nr_free_highpages(); + val->mem_unit = PAGE_SIZE; +} + +EXPORT_SYMBOL(si_meminfo); + +#ifdef CONFIG_NUMA +void si_meminfo_node(struct sysinfo *val, int nid) +{ + int zone_type; /* needs to be signed */ + unsigned long managed_pages = 0; + unsigned long managed_highpages = 0; + unsigned long free_highpages = 0; + pg_data_t *pgdat = NODE_DATA(nid); + + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) + managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); + val->totalram = managed_pages; + val->sharedram = node_page_state(pgdat, NR_SHMEM); + val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); +#ifdef CONFIG_HIGHMEM + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { + struct zone *zone = &pgdat->node_zones[zone_type]; + + if (is_highmem(zone)) { + managed_highpages += zone_managed_pages(zone); + free_highpages += zone_page_state(zone, NR_FREE_PAGES); + } + } + val->totalhigh = managed_highpages; + val->freehigh = free_highpages; +#else + val->totalhigh = managed_highpages; + val->freehigh = free_highpages; +#endif + val->mem_unit = PAGE_SIZE; +} +#endif + +/* + * Determine whether the node should be displayed or not, depending on whether + * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). + */ +static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask) +{ + if (!(flags & SHOW_MEM_FILTER_NODES)) + return false; + + /* + * no node mask - aka implicit memory numa policy. Do not bother with + * the synchronization - read_mems_allowed_begin - because we do not + * have to be precise here. + */ + if (!nodemask) + nodemask = &cpuset_current_mems_allowed; + + return !node_isset(nid, *nodemask); +} + +static void show_migration_types(unsigned char type) +{ + static const char types[MIGRATE_TYPES] = { + [MIGRATE_UNMOVABLE] = 'U', + [MIGRATE_MOVABLE] = 'M', + [MIGRATE_RECLAIMABLE] = 'E', + [MIGRATE_HIGHATOMIC] = 'H', +#ifdef CONFIG_CMA + [MIGRATE_CMA] = 'C', +#endif +#ifdef CONFIG_MEMORY_ISOLATION + [MIGRATE_ISOLATE] = 'I', +#endif + }; + char tmp[MIGRATE_TYPES + 1]; + char *p = tmp; + int i; + + for (i = 0; i < MIGRATE_TYPES; i++) { + if (type & (1 << i)) + *p++ = types[i]; + } + + *p = '\0'; + printk(KERN_CONT "(%s) ", tmp); +} + +static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx) +{ + int zone_idx; + for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++) + if (zone_managed_pages(pgdat->node_zones + zone_idx)) + return true; + return false; +} + +/* + * Show free area list (used inside shift_scroll-lock stuff) + * We also calculate the percentage fragmentation. We do this by counting the + * memory on each free list with the exception of the first item on the list. + * + * Bits in @filter: + * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's + * cpuset. + */ +void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) +{ + unsigned long free_pcp = 0; + int cpu, nid; + struct zone *zone; + pg_data_t *pgdat; + + for_each_populated_zone(zone) { + if (zone_idx(zone) > max_zone_idx) + continue; + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) + continue; + + for_each_online_cpu(cpu) + free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; + } + + printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" + " active_file:%lu inactive_file:%lu isolated_file:%lu\n" + " unevictable:%lu dirty:%lu writeback:%lu\n" + " slab_reclaimable:%lu slab_unreclaimable:%lu\n" + " mapped:%lu shmem:%lu pagetables:%lu\n" + " sec_pagetables:%lu bounce:%lu\n" + " kernel_misc_reclaimable:%lu\n" + " free:%lu free_pcp:%lu free_cma:%lu\n", + global_node_page_state(NR_ACTIVE_ANON), + global_node_page_state(NR_INACTIVE_ANON), + global_node_page_state(NR_ISOLATED_ANON), + global_node_page_state(NR_ACTIVE_FILE), + global_node_page_state(NR_INACTIVE_FILE), + global_node_page_state(NR_ISOLATED_FILE), + global_node_page_state(NR_UNEVICTABLE), + global_node_page_state(NR_FILE_DIRTY), + global_node_page_state(NR_WRITEBACK), + global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B), + global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B), + global_node_page_state(NR_FILE_MAPPED), + global_node_page_state(NR_SHMEM), + global_node_page_state(NR_PAGETABLE), + global_node_page_state(NR_SECONDARY_PAGETABLE), + global_zone_page_state(NR_BOUNCE), + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE), + global_zone_page_state(NR_FREE_PAGES), + free_pcp, + global_zone_page_state(NR_FREE_CMA_PAGES)); + + for_each_online_pgdat(pgdat) { + if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) + continue; + if (!node_has_managed_zones(pgdat, max_zone_idx)) + continue; + + printk("Node %d" + " active_anon:%lukB" + " inactive_anon:%lukB" + " active_file:%lukB" + " inactive_file:%lukB" + " unevictable:%lukB" + " isolated(anon):%lukB" + " isolated(file):%lukB" + " mapped:%lukB" + " dirty:%lukB" + " writeback:%lukB" + " shmem:%lukB" +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + " shmem_thp: %lukB" + " shmem_pmdmapped: %lukB" + " anon_thp: %lukB" +#endif + " writeback_tmp:%lukB" + " kernel_stack:%lukB" +#ifdef CONFIG_SHADOW_CALL_STACK + " shadow_call_stack:%lukB" +#endif + " pagetables:%lukB" + " sec_pagetables:%lukB" + " all_unreclaimable? %s" + "\n", + pgdat->node_id, + K(node_page_state(pgdat, NR_ACTIVE_ANON)), + K(node_page_state(pgdat, NR_INACTIVE_ANON)), + K(node_page_state(pgdat, NR_ACTIVE_FILE)), + K(node_page_state(pgdat, NR_INACTIVE_FILE)), + K(node_page_state(pgdat, NR_UNEVICTABLE)), + K(node_page_state(pgdat, NR_ISOLATED_ANON)), + K(node_page_state(pgdat, NR_ISOLATED_FILE)), + K(node_page_state(pgdat, NR_FILE_MAPPED)), + K(node_page_state(pgdat, NR_FILE_DIRTY)), + K(node_page_state(pgdat, NR_WRITEBACK)), + K(node_page_state(pgdat, NR_SHMEM)), +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + K(node_page_state(pgdat, NR_SHMEM_THPS)), + K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)), + K(node_page_state(pgdat, NR_ANON_THPS)), +#endif + K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), + node_page_state(pgdat, NR_KERNEL_STACK_KB), +#ifdef CONFIG_SHADOW_CALL_STACK + node_page_state(pgdat, NR_KERNEL_SCS_KB), +#endif + K(node_page_state(pgdat, NR_PAGETABLE)), + K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), + pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? + "yes" : "no"); + } + + for_each_populated_zone(zone) { + int i; + + if (zone_idx(zone) > max_zone_idx) + continue; + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) + continue; + + free_pcp = 0; + for_each_online_cpu(cpu) + free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; + + show_node(zone); + printk(KERN_CONT + "%s" + " free:%lukB" + " boost:%lukB" + " min:%lukB" + " low:%lukB" + " high:%lukB" + " reserved_highatomic:%luKB" + " active_anon:%lukB" + " inactive_anon:%lukB" + " active_file:%lukB" + " inactive_file:%lukB" + " unevictable:%lukB" + " writepending:%lukB" + " present:%lukB" + " managed:%lukB" + " mlocked:%lukB" + " bounce:%lukB" + " free_pcp:%lukB" + " local_pcp:%ukB" + " free_cma:%lukB" + "\n", + zone->name, + K(zone_page_state(zone, NR_FREE_PAGES)), + K(zone->watermark_boost), + K(min_wmark_pages(zone)), + K(low_wmark_pages(zone)), + K(high_wmark_pages(zone)), + K(zone->nr_reserved_highatomic), + K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), + K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), + K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), + K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), + K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), + K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), + K(zone->present_pages), + K(zone_managed_pages(zone)), + K(zone_page_state(zone, NR_MLOCK)), + K(zone_page_state(zone, NR_BOUNCE)), + K(free_pcp), + K(this_cpu_read(zone->per_cpu_pageset->count)), + K(zone_page_state(zone, NR_FREE_CMA_PAGES))); + printk("lowmem_reserve[]:"); + for (i = 0; i < MAX_NR_ZONES; i++) + printk(KERN_CONT " %ld", zone->lowmem_reserve[i]); + printk(KERN_CONT "\n"); + } + + for_each_populated_zone(zone) { + unsigned int order; + unsigned long nr[MAX_ORDER + 1], flags, total = 0; + unsigned char types[MAX_ORDER + 1]; + + if (zone_idx(zone) > max_zone_idx) + continue; + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) + continue; + show_node(zone); + printk(KERN_CONT "%s: ", zone->name); + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order <= MAX_ORDER; order++) { + struct free_area *area = &zone->free_area[order]; + int type; + + nr[order] = area->nr_free; + total += nr[order] << order; + + types[order] = 0; + for (type = 0; type < MIGRATE_TYPES; type++) { + if (!free_area_empty(area, type)) + types[order] |= 1 << type; + } + } + spin_unlock_irqrestore(&zone->lock, flags); + for (order = 0; order <= MAX_ORDER; order++) { + printk(KERN_CONT "%lu*%lukB ", + nr[order], K(1UL) << order); + if (nr[order]) + show_migration_types(types[order]); + } + printk(KERN_CONT "= %lukB\n", K(total)); + } + + for_each_online_node(nid) { + if (show_mem_node_skip(filter, nid, nodemask)) + continue; + hugetlb_show_meminfo_node(nid); + } + + printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); + + show_swap_cache_info(); +} + +void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) +{ + unsigned long total = 0, reserved = 0, highmem = 0; + struct zone *zone; + + printk("Mem-Info:\n"); + __show_free_areas(filter, nodemask, max_zone_idx); + + for_each_populated_zone(zone) { + + total += zone->present_pages; + reserved += zone->present_pages - zone_managed_pages(zone); + + if (is_highmem(zone)) + highmem += zone->present_pages; + } + + printk("%lu pages RAM\n", total); + printk("%lu pages HighMem/MovableOnly\n", highmem); + printk("%lu pages reserved\n", reserved); +#ifdef CONFIG_CMA + printk("%lu pages cma reserved\n", totalcma_pages); +#endif +#ifdef CONFIG_MEMORY_FAILURE + printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); +#endif +} diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index fe10436d9911..3ab53fad8876 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -5,12 +5,10 @@ #include <linux/seq_file.h> #include <linux/shrinker.h> #include <linux/memcontrol.h> -#include <linux/srcu.h> /* defined in vmscan.c */ -extern struct mutex shrinker_mutex; +extern struct rw_semaphore shrinker_rwsem; extern struct list_head shrinker_list; -extern struct srcu_struct shrinker_srcu; static DEFINE_IDA(shrinker_debugfs_ida); static struct dentry *shrinker_debugfs_root; @@ -51,13 +49,18 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) struct mem_cgroup *memcg; unsigned long total; bool memcg_aware; - int ret = 0, nid, srcu_idx; + int ret, nid; count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL); if (!count_per_node) return -ENOMEM; - srcu_idx = srcu_read_lock(&shrinker_srcu); + ret = down_read_killable(&shrinker_rwsem); + if (ret) { + kfree(count_per_node); + return ret; + } + rcu_read_lock(); memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE; @@ -88,7 +91,8 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) } } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); - srcu_read_unlock(&shrinker_srcu, srcu_idx); + rcu_read_unlock(); + up_read(&shrinker_rwsem); kfree(count_per_node); return ret; @@ -111,8 +115,9 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, .gfp_mask = GFP_KERNEL, }; struct mem_cgroup *memcg = NULL; - int nid, srcu_idx; + int nid; char kbuf[72]; + ssize_t ret; read_len = size < (sizeof(kbuf) - 1) ? size : (sizeof(kbuf) - 1); if (copy_from_user(kbuf, buf, read_len)) @@ -141,7 +146,11 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, return -EINVAL; } - srcu_idx = srcu_read_lock(&shrinker_srcu); + ret = down_read_killable(&shrinker_rwsem); + if (ret) { + mem_cgroup_put(memcg); + return ret; + } sc.nid = nid; sc.memcg = memcg; @@ -150,7 +159,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, shrinker->scan_objects(shrinker, &sc); - srcu_read_unlock(&shrinker_srcu, srcu_idx); + up_read(&shrinker_rwsem); mem_cgroup_put(memcg); return size; @@ -168,7 +177,7 @@ int shrinker_debugfs_add(struct shrinker *shrinker) char buf[128]; int id; - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); /* debugfs isn't initialized yet, add debugfs entries later. */ if (!shrinker_debugfs_root) @@ -211,7 +220,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) if (!new) return -ENOMEM; - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); old = shrinker->name; shrinker->name = new; @@ -229,7 +238,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) shrinker->debugfs_entry = entry; } - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); kfree_const(old); @@ -242,7 +251,7 @@ struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, { struct dentry *entry = shrinker->debugfs_entry; - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); kfree_const(shrinker->name); shrinker->name = NULL; @@ -271,14 +280,14 @@ static int __init shrinker_debugfs_init(void) shrinker_debugfs_root = dentry; /* Create debugfs entries for shrinkers registered at boot */ - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); list_for_each_entry(shrinker, &shrinker_list, list) if (!shrinker->debugfs_entry) { ret = shrinker_debugfs_add(shrinker); if (ret) break; } - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return ret; } diff --git a/mm/slab.c b/mm/slab.c index bb57f7fdbae1..b7817dcba63e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1240,11 +1240,7 @@ void __init kmem_cache_init(void) * Initialize the caches that provide memory for the kmem_cache_node * structures first. Without this, further allocations will bug. */ - kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache( - kmalloc_info[INDEX_NODE].name[KMALLOC_NORMAL], - kmalloc_info[INDEX_NODE].size, - ARCH_KMALLOC_FLAGS, 0, - kmalloc_info[INDEX_NODE].size); + new_kmalloc_cache(INDEX_NODE, KMALLOC_NORMAL, ARCH_KMALLOC_FLAGS); slab_state = PARTIAL_NODE; setup_kmalloc_cache_index_table(); diff --git a/mm/slab.h b/mm/slab.h index f01ac256a8f5..a59c8e5d2441 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -6,6 +6,38 @@ */ void __init kmem_cache_init(void); +#ifdef CONFIG_64BIT +# ifdef system_has_cmpxchg128 +# define system_has_freelist_aba() system_has_cmpxchg128() +# define try_cmpxchg_freelist try_cmpxchg128 +# endif +#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg128 +typedef u128 freelist_full_t; +#else /* CONFIG_64BIT */ +# ifdef system_has_cmpxchg64 +# define system_has_freelist_aba() system_has_cmpxchg64() +# define try_cmpxchg_freelist try_cmpxchg64 +# endif +#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg64 +typedef u64 freelist_full_t; +#endif /* CONFIG_64BIT */ + +#if defined(system_has_freelist_aba) && !defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) +#undef system_has_freelist_aba +#endif + +/* + * Freelist pointer and counter to cmpxchg together, avoids the typical ABA + * problems with cmpxchg of just a pointer. + */ +typedef union { + struct { + void *freelist; + unsigned long counter; + }; + freelist_full_t full; +} freelist_aba_t; + /* Reuses the bits in struct page */ struct slab { unsigned long __page_flags; @@ -38,14 +70,21 @@ struct slab { #endif }; /* Double-word boundary */ - void *freelist; /* first free object */ union { - unsigned long counters; struct { - unsigned inuse:16; - unsigned objects:15; - unsigned frozen:1; + void *freelist; /* first free object */ + union { + unsigned long counters; + struct { + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + }; }; +#ifdef system_has_freelist_aba + freelist_aba_t freelist_counter; +#endif }; }; struct rcu_head rcu_head; @@ -72,8 +111,8 @@ SLAB_MATCH(memcg_data, memcg_data); #endif #undef SLAB_MATCH static_assert(sizeof(struct slab) <= sizeof(struct page)); -#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && defined(CONFIG_SLUB) -static_assert(IS_ALIGNED(offsetof(struct slab, freelist), 2*sizeof(void *))); +#if defined(system_has_freelist_aba) && defined(CONFIG_SLUB) +static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t))); #endif /** @@ -255,9 +294,8 @@ gfp_t kmalloc_fix_flags(gfp_t flags); /* Functions provided by the slab allocators */ int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); -struct kmem_cache *create_kmalloc_cache(const char *name, unsigned int size, - slab_flags_t flags, unsigned int useroffset, - unsigned int usersize); +void __init new_kmalloc_cache(int idx, enum kmalloc_cache_type type, + slab_flags_t flags); extern void create_boot_cache(struct kmem_cache *, const char *name, unsigned int size, slab_flags_t flags, unsigned int useroffset, unsigned int usersize); diff --git a/mm/slab_common.c b/mm/slab_common.c index 607249785c07..43c008165f56 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -17,6 +17,8 @@ #include <linux/cpu.h> #include <linux/uaccess.h> #include <linux/seq_file.h> +#include <linux/dma-mapping.h> +#include <linux/swiotlb.h> #include <linux/proc_fs.h> #include <linux/debugfs.h> #include <linux/kasan.h> @@ -658,17 +660,16 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, s->refcount = -1; /* Exempt from merging for now */ } -struct kmem_cache *__init create_kmalloc_cache(const char *name, - unsigned int size, slab_flags_t flags, - unsigned int useroffset, unsigned int usersize) +static struct kmem_cache *__init create_kmalloc_cache(const char *name, + unsigned int size, + slab_flags_t flags) { struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); if (!s) panic("Out of memory when creating slab %s\n", name); - create_boot_cache(s, name, size, flags | SLAB_KMALLOC, useroffset, - usersize); + create_boot_cache(s, name, size, flags | SLAB_KMALLOC, 0, size); list_add(&s->list, &slab_caches); s->refcount = 1; return s; @@ -863,9 +864,22 @@ void __init setup_kmalloc_cache_index_table(void) } } -static void __init +static unsigned int __kmalloc_minalign(void) +{ +#ifdef CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC + if (io_tlb_default_mem.nslabs) + return ARCH_KMALLOC_MINALIGN; +#endif + return dma_get_cache_alignment(); +} + +void __init new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags) { + unsigned int minalign = __kmalloc_minalign(); + unsigned int aligned_size = kmalloc_info[idx].size; + int aligned_idx = idx; + if ((KMALLOC_RECLAIM != KMALLOC_NORMAL) && (type == KMALLOC_RECLAIM)) { flags |= SLAB_RECLAIM_ACCOUNT; } else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) { @@ -878,10 +892,17 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags) flags |= SLAB_CACHE_DMA; } - kmalloc_caches[type][idx] = create_kmalloc_cache( - kmalloc_info[idx].name[type], - kmalloc_info[idx].size, flags, 0, - kmalloc_info[idx].size); + if (minalign > ARCH_KMALLOC_MINALIGN) { + aligned_size = ALIGN(aligned_size, minalign); + aligned_idx = __kmalloc_index(aligned_size, false); + } + + if (!kmalloc_caches[type][aligned_idx]) + kmalloc_caches[type][aligned_idx] = create_kmalloc_cache( + kmalloc_info[aligned_idx].name[type], + aligned_size, flags); + if (idx != aligned_idx) + kmalloc_caches[type][idx] = kmalloc_caches[type][aligned_idx]; /* * If CONFIG_MEMCG_KMEM is enabled, disable cache merging for diff --git a/mm/slub.c b/mm/slub.c index c87628cd8a9a..7529626bbec2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -292,7 +292,12 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) /* Poison object */ #define __OBJECT_POISON ((slab_flags_t __force)0x80000000U) /* Use cmpxchg_double */ + +#ifdef system_has_freelist_aba #define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U) +#else +#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0U) +#endif /* * Tracking user of a slab. @@ -512,6 +517,40 @@ static __always_inline void slab_unlock(struct slab *slab) __bit_spin_unlock(PG_locked, &page->flags); } +static inline bool +__update_freelist_fast(struct slab *slab, + void *freelist_old, unsigned long counters_old, + void *freelist_new, unsigned long counters_new) +{ +#ifdef system_has_freelist_aba + freelist_aba_t old = { .freelist = freelist_old, .counter = counters_old }; + freelist_aba_t new = { .freelist = freelist_new, .counter = counters_new }; + + return try_cmpxchg_freelist(&slab->freelist_counter.full, &old.full, new.full); +#else + return false; +#endif +} + +static inline bool +__update_freelist_slow(struct slab *slab, + void *freelist_old, unsigned long counters_old, + void *freelist_new, unsigned long counters_new) +{ + bool ret = false; + + slab_lock(slab); + if (slab->freelist == freelist_old && + slab->counters == counters_old) { + slab->freelist = freelist_new; + slab->counters = counters_new; + ret = true; + } + slab_unlock(slab); + + return ret; +} + /* * Interrupts must be disabled (for the fallback code to work right), typically * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is @@ -519,33 +558,25 @@ static __always_inline void slab_unlock(struct slab *slab) * allocation/ free operation in hardirq context. Therefore nothing can * interrupt the operation. */ -static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, +static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab, void *freelist_old, unsigned long counters_old, void *freelist_new, unsigned long counters_new, const char *n) { + bool ret; + if (USE_LOCKLESS_FAST_PATH()) lockdep_assert_irqs_disabled(); -#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ - defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) + if (s->flags & __CMPXCHG_DOUBLE) { - if (cmpxchg_double(&slab->freelist, &slab->counters, - freelist_old, counters_old, - freelist_new, counters_new)) - return true; - } else -#endif - { - slab_lock(slab); - if (slab->freelist == freelist_old && - slab->counters == counters_old) { - slab->freelist = freelist_new; - slab->counters = counters_new; - slab_unlock(slab); - return true; - } - slab_unlock(slab); + ret = __update_freelist_fast(slab, freelist_old, counters_old, + freelist_new, counters_new); + } else { + ret = __update_freelist_slow(slab, freelist_old, counters_old, + freelist_new, counters_new); } + if (likely(ret)) + return true; cpu_relax(); stat(s, CMPXCHG_DOUBLE_FAIL); @@ -557,36 +588,26 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab return false; } -static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, +static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab, void *freelist_old, unsigned long counters_old, void *freelist_new, unsigned long counters_new, const char *n) { -#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ - defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) + bool ret; + if (s->flags & __CMPXCHG_DOUBLE) { - if (cmpxchg_double(&slab->freelist, &slab->counters, - freelist_old, counters_old, - freelist_new, counters_new)) - return true; - } else -#endif - { + ret = __update_freelist_fast(slab, freelist_old, counters_old, + freelist_new, counters_new); + } else { unsigned long flags; local_irq_save(flags); - slab_lock(slab); - if (slab->freelist == freelist_old && - slab->counters == counters_old) { - slab->freelist = freelist_new; - slab->counters = counters_new; - slab_unlock(slab); - local_irq_restore(flags); - return true; - } - slab_unlock(slab); + ret = __update_freelist_slow(slab, freelist_old, counters_old, + freelist_new, counters_new); local_irq_restore(flags); } + if (likely(ret)) + return true; cpu_relax(); stat(s, CMPXCHG_DOUBLE_FAIL); @@ -2228,7 +2249,7 @@ static inline void *acquire_slab(struct kmem_cache *s, VM_BUG_ON(new.frozen); new.frozen = 1; - if (!__cmpxchg_double_slab(s, slab, + if (!__slab_update_freelist(s, slab, freelist, counters, new.freelist, new.counters, "acquire_slab")) @@ -2554,7 +2575,7 @@ redo: } - if (!cmpxchg_double_slab(s, slab, + if (!slab_update_freelist(s, slab, old.freelist, old.counters, new.freelist, new.counters, "unfreezing slab")) { @@ -2611,7 +2632,7 @@ static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab) new.frozen = 0; - } while (!__cmpxchg_double_slab(s, slab, + } while (!__slab_update_freelist(s, slab, old.freelist, old.counters, new.freelist, new.counters, "unfreezing slab")); @@ -3008,6 +3029,18 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags) } #ifndef CONFIG_SLUB_TINY +static inline bool +__update_cpu_freelist_fast(struct kmem_cache *s, + void *freelist_old, void *freelist_new, + unsigned long tid) +{ + freelist_aba_t old = { .freelist = freelist_old, .counter = tid }; + freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) }; + + return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid.full, + &old.full, new.full); +} + /* * Check the slab->freelist and either transfer the freelist to the * per cpu freelist or deactivate the slab. @@ -3034,7 +3067,7 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) new.inuse = slab->objects; new.frozen = freelist != NULL; - } while (!__cmpxchg_double_slab(s, slab, + } while (!__slab_update_freelist(s, slab, freelist, counters, NULL, new.counters, "get_freelist")); @@ -3359,11 +3392,7 @@ redo: * against code executing on this cpu *not* from access by * other cpus. */ - if (unlikely(!this_cpu_cmpxchg_double( - s->cpu_slab->freelist, s->cpu_slab->tid, - object, tid, - next_object, next_tid(tid)))) { - + if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) { note_cmpxchg_failure("slab_alloc", s, tid); goto redo; } @@ -3631,7 +3660,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, } } - } while (!cmpxchg_double_slab(s, slab, + } while (!slab_update_freelist(s, slab, prior, counters, head, new.counters, "__slab_free")); @@ -3736,11 +3765,7 @@ redo: set_freepointer(s, tail_obj, freelist); - if (unlikely(!this_cpu_cmpxchg_double( - s->cpu_slab->freelist, s->cpu_slab->tid, - freelist, tid, - head, next_tid(tid)))) { - + if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) { note_cmpxchg_failure("slab_free", s, tid); goto redo; } @@ -4505,11 +4530,11 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) } } -#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ - defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) - if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0) +#ifdef system_has_freelist_aba + if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) { /* Enable fast mode */ s->flags |= __CMPXCHG_DOUBLE; + } #endif /* diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 10d73a0dfcec..a044a130405b 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -133,7 +133,7 @@ static void * __meminit altmap_alloc_block_buf(unsigned long size, void __meminit vmemmap_verify(pte_t *pte, int node, unsigned long start, unsigned long end) { - unsigned long pfn = pte_pfn(*pte); + unsigned long pfn = pte_pfn(ptep_get(pte)); int actual_node = early_pfn_to_nid(pfn); if (node_distance(actual_node, node) > LOCAL_DISTANCE) @@ -146,7 +146,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, struct page *reuse) { pte_t *pte = pte_offset_kernel(pmd, addr); - if (pte_none(*pte)) { + if (pte_none(ptep_get(pte))) { pte_t entry; void *p; @@ -414,7 +414,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, * with just tail struct pages. */ return vmemmap_populate_range(start, end, node, NULL, - pte_page(*pte)); + pte_page(ptep_get(pte))); } size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page)); @@ -438,7 +438,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, */ next += PAGE_SIZE; rc = vmemmap_populate_range(next, last, node, NULL, - pte_page(*pte)); + pte_page(ptep_get(pte))); if (rc) return -ENOMEM; } diff --git a/mm/sparse.c b/mm/sparse.c index c2afdb26039e..297a8b772e8d 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -701,7 +701,7 @@ static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) return rc; } #else -struct page * __meminit populate_section_memmap(unsigned long pfn, +static struct page * __meminit populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { @@ -922,10 +922,14 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, return 0; } -void sparse_remove_section(struct mem_section *ms, unsigned long pfn, - unsigned long nr_pages, unsigned long map_offset, - struct vmem_altmap *altmap) +void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap) { + struct mem_section *ms = __pfn_to_section(pfn); + + if (WARN_ON_ONCE(!valid_section(ms))) + return; + section_deactivate(pfn, nr_pages, altmap); } #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/mm/swap.c b/mm/swap.c index 423199ee8478..cd8f0150ba3a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -76,7 +76,7 @@ static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { /* * This path almost never happens for VM activity - pages are normally freed - * via pagevecs. But it gets used by networking - and for compound pages. + * in batches. But it gets used by networking - and for compound pages. */ static void __page_cache_release(struct folio *folio) { @@ -1044,25 +1044,25 @@ void release_pages(release_pages_arg arg, int nr) EXPORT_SYMBOL(release_pages); /* - * The pages which we're about to release may be in the deferred lru-addition + * The folios which we're about to release may be in the deferred lru-addition * queues. That would prevent them from really being freed right now. That's - * OK from a correctness point of view but is inefficient - those pages may be + * OK from a correctness point of view but is inefficient - those folios may be * cache-warm and we want to give them back to the page allocator ASAP. * - * So __pagevec_release() will drain those queues here. + * So __folio_batch_release() will drain those queues here. * folio_batch_move_lru() calls folios_put() directly to avoid * mutual recursion. */ -void __pagevec_release(struct pagevec *pvec) +void __folio_batch_release(struct folio_batch *fbatch) { - if (!pvec->percpu_pvec_drained) { + if (!fbatch->percpu_pvec_drained) { lru_add_drain(); - pvec->percpu_pvec_drained = true; + fbatch->percpu_pvec_drained = true; } - release_pages(pvec->pages, pagevec_count(pvec)); - pagevec_reinit(pvec); + release_pages(fbatch->folios, folio_batch_count(fbatch)); + folio_batch_reinit(fbatch); } -EXPORT_SYMBOL(__pagevec_release); +EXPORT_SYMBOL(__folio_batch_release); /** * folio_batch_remove_exceptionals() - Prune non-folios from a batch. diff --git a/mm/swap_state.c b/mm/swap_state.c index b76a65ac28b3..f8ea7015bad4 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -16,7 +16,6 @@ #include <linux/pagemap.h> #include <linux/backing-dev.h> #include <linux/blkdev.h> -#include <linux/pagevec.h> #include <linux/migrate.h> #include <linux/vmalloc.h> #include <linux/swap_slots.h> @@ -275,9 +274,9 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, } } -/* - * If we are the only user, then try to free up the swap cache. - * +/* + * If we are the only user, then try to free up the swap cache. + * * Its ok to check the swapcache flag without the folio lock * here because we are going to recheck again inside * folio_free_swap() _with_ the lock. @@ -294,7 +293,7 @@ void free_swap_cache(struct page *page) } } -/* +/* * Perform a free_page(), also freeing any swap cache associated with * this page if it is the last user of the page. */ @@ -417,9 +416,13 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, { struct swap_info_struct *si; struct folio *folio; + struct page *page; void *shadow = NULL; *new_page_allocated = false; + si = get_swap_device(entry); + if (!si) + return NULL; for (;;) { int err; @@ -428,14 +431,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * called after swap_cache_get_folio() failed, re-calling * that would confuse statistics. */ - si = get_swap_device(entry); - if (!si) - return NULL; folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); - put_swap_device(si); - if (!IS_ERR(folio)) - return folio_file_page(folio, swp_offset(entry)); + if (!IS_ERR(folio)) { + page = folio_file_page(folio, swp_offset(entry)); + goto got_page; + } /* * Just skip read ahead for unused swap slot. @@ -445,8 +446,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * as SWAP_HAS_CACHE. That's done in later part of code or * else swap_off will be aborted if we return NULL. */ - if (!__swp_swapcount(entry) && swap_slot_cache_enabled) - return NULL; + if (!swap_swapcount(si, entry) && swap_slot_cache_enabled) + goto fail_put_swap; /* * Get a new page to read into from swap. Allocate it now, @@ -455,7 +456,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, */ folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false); if (!folio) - return NULL; + goto fail_put_swap; /* * Swap entry may have been freed since our caller observed it. @@ -466,7 +467,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, folio_put(folio); if (err != -EEXIST) - return NULL; + goto fail_put_swap; /* * We might race against __delete_from_swap_cache(), and @@ -500,12 +501,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* Caller will initiate read into locked folio */ folio_add_lru(folio); *new_page_allocated = true; - return &folio->page; + page = &folio->page; +got_page: + put_swap_device(si); + return page; fail_unlock: put_swap_folio(folio, entry); folio_unlock(folio); folio_put(folio); +fail_put_swap: + put_swap_device(si); return NULL; } @@ -514,6 +520,10 @@ fail_unlock: * and reading the disk if it is not already cached. * A failure return means that either the page allocation failed or that * the swap entry is no longer in use. + * + * get/put_swap_device() aren't needed to call this function, because + * __read_swap_cache_async() call them and swap_readpage() holds the + * swap cache folio lock. */ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, @@ -698,6 +708,14 @@ void exit_swap_address_space(unsigned int type) swapper_spaces[type] = NULL; } +#define SWAP_RA_ORDER_CEILING 5 + +struct vma_swap_readahead { + unsigned short win; + unsigned short offset; + unsigned short nr_pte; +}; + static void swap_ra_info(struct vm_fault *vmf, struct vma_swap_readahead *ra_info) { @@ -705,11 +723,7 @@ static void swap_ra_info(struct vm_fault *vmf, unsigned long ra_val; unsigned long faddr, pfn, fpfn, lpfn, rpfn; unsigned long start, end; - pte_t *pte, *orig_pte; unsigned int max_win, hits, prev_win, win; -#ifndef CONFIG_64BIT - pte_t *tpte; -#endif max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster), SWAP_RA_ORDER_CEILING); @@ -728,12 +742,9 @@ static void swap_ra_info(struct vm_fault *vmf, max_win, prev_win); atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0)); - if (win == 1) return; - /* Copy the PTEs because the page table may be unmapped */ - orig_pte = pte = pte_offset_map(vmf->pmd, faddr); if (fpfn == pfn + 1) { lpfn = fpfn; rpfn = fpfn + win; @@ -753,15 +764,6 @@ static void swap_ra_info(struct vm_fault *vmf, ra_info->nr_pte = end - start; ra_info->offset = fpfn - start; - pte -= ra_info->offset; -#ifdef CONFIG_64BIT - ra_info->ptes = pte; -#else - tpte = ra_info->ptes; - for (pfn = start; pfn != end; pfn++) - *tpte++ = *pte++; -#endif - pte_unmap(orig_pte); } /** @@ -785,7 +787,8 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, struct swap_iocb *splug = NULL; struct vm_area_struct *vma = vmf->vma; struct page *page; - pte_t *pte, pentry; + pte_t *pte = NULL, pentry; + unsigned long addr; swp_entry_t entry; unsigned int i; bool page_allocated; @@ -797,17 +800,25 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, if (ra_info.win == 1) goto skip; + addr = vmf->address - (ra_info.offset * PAGE_SIZE); + blk_start_plug(&plug); - for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte; - i++, pte++) { - pentry = *pte; + for (i = 0; i < ra_info.nr_pte; i++, addr += PAGE_SIZE) { + if (!pte++) { + pte = pte_offset_map(vmf->pmd, addr); + if (!pte) + break; + } + pentry = ptep_get_lockless(pte); if (!is_swap_pte(pentry)) continue; entry = pte_to_swp_entry(pentry); if (unlikely(non_swap_entry(entry))) continue; + pte_unmap(pte); + pte = NULL; page = __read_swap_cache_async(entry, gfp_mask, vma, - vmf->address, &page_allocated); + addr, &page_allocated); if (!page) continue; if (page_allocated) { @@ -819,6 +830,8 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, } put_page(page); } + if (pte) + pte_unmap(pte); blk_finish_plug(&plug); swap_read_unplug(splug); lru_add_drain(); diff --git a/mm/swapfile.c b/mm/swapfile.c index 274bbf797480..8e6dde68b389 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -41,6 +41,7 @@ #include <linux/swap_slots.h> #include <linux/sort.h> #include <linux/completion.h> +#include <linux/suspend.h> #include <asm/tlbflush.h> #include <linux/swapops.h> @@ -1219,6 +1220,13 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, } /* + * When we get a swap entry, if there aren't some other ways to + * prevent swapoff, such as the folio in swap cache is locked, page + * table lock is held, etc., the swap entry may become invalid because + * of swapoff. Then, we need to enclose all swap related functions + * with get_swap_device() and put_swap_device(), unless the swap + * functions call get/put_swap_device() by themselves. + * * Check whether swap entry is valid in the swap device. If so, * return pointer to swap_info_struct, and keep the swap entry valid * via preventing the swap device from being swapoff, until @@ -1227,9 +1235,8 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, * Notice that swapoff or swapoff+swapon can still happen before the * percpu_ref_tryget_live() in get_swap_device() or after the * percpu_ref_put() in put_swap_device() if there isn't any other way - * to prevent swapoff, such as page lock, page table lock, etc. The - * caller must be prepared for that. For example, the following - * situation is possible. + * to prevent swapoff. The caller must be prepared for that. For + * example, the following situation is possible. * * CPU1 CPU2 * do_swap_page() @@ -1432,16 +1439,10 @@ void swapcache_free_entries(swp_entry_t *entries, int n) int __swap_count(swp_entry_t entry) { - struct swap_info_struct *si; + struct swap_info_struct *si = swp_swap_info(entry); pgoff_t offset = swp_offset(entry); - int count = 0; - si = get_swap_device(entry); - if (si) { - count = swap_count(si->swap_map[offset]); - put_swap_device(si); - } - return count; + return swap_count(si->swap_map[offset]); } /* @@ -1449,7 +1450,7 @@ int __swap_count(swp_entry_t entry) * This does not give an exact answer when swap count is continued, * but does include the high COUNT_CONTINUED flag to allow for that. */ -static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) +int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) { pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; @@ -1463,24 +1464,6 @@ static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) /* * How many references to @entry are currently swapped out? - * This does not give an exact answer when swap count is continued, - * but does include the high COUNT_CONTINUED flag to allow for that. - */ -int __swp_swapcount(swp_entry_t entry) -{ - int count = 0; - struct swap_info_struct *si; - - si = get_swap_device(entry); - if (si) { - count = swap_swapcount(si, entry); - put_swap_device(si); - } - return count; -} - -/* - * How many references to @entry are currently swapped out? * This considers COUNT_CONTINUED so it returns exact answer. */ int swp_swapcount(swp_entry_t entry) @@ -1762,7 +1745,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, struct page *page = folio_file_page(folio, swp_offset(entry)); struct page *swapcache; spinlock_t *ptl; - pte_t *pte, new_pte; + pte_t *pte, new_pte, old_pte; bool hwposioned = false; int ret = 1; @@ -1774,11 +1757,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, hwposioned = true; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { + if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte), + swp_entry_to_pte(entry)))) { ret = 0; goto out; } + old_pte = ptep_get(pte); + if (unlikely(hwposioned || !PageUptodate(page))) { swp_entry_t swp_entry; @@ -1810,7 +1796,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, * call and have the page locked. */ VM_BUG_ON_PAGE(PageWriteback(page), page); - if (pte_swp_exclusive(*pte)) + if (pte_swp_exclusive(old_pte)) rmap_flags |= RMAP_EXCLUSIVE; page_add_anon_rmap(page, vma, addr, rmap_flags); @@ -1819,15 +1805,16 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, lru_cache_add_inactive_or_unevictable(page, vma); } new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); - if (pte_swp_soft_dirty(*pte)) + if (pte_swp_soft_dirty(old_pte)) new_pte = pte_mksoft_dirty(new_pte); - if (pte_swp_uffd_wp(*pte)) + if (pte_swp_uffd_wp(old_pte)) new_pte = pte_mkuffd_wp(new_pte); setpte: set_pte_at(vma->vm_mm, addr, pte, new_pte); swap_free(entry); out: - pte_unmap_unlock(pte, ptl); + if (pte) + pte_unmap_unlock(pte, ptl); if (page != swapcache) { unlock_page(page); put_page(page); @@ -1839,27 +1826,37 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned int type) { - swp_entry_t entry; - pte_t *pte; + pte_t *pte = NULL; struct swap_info_struct *si; - int ret = 0; si = swap_info[type]; - pte = pte_offset_map(pmd, addr); do { struct folio *folio; unsigned long offset; unsigned char swp_count; + swp_entry_t entry; + int ret; + pte_t ptent; + + if (!pte++) { + pte = pte_offset_map(pmd, addr); + if (!pte) + break; + } + + ptent = ptep_get_lockless(pte); - if (!is_swap_pte(*pte)) + if (!is_swap_pte(ptent)) continue; - entry = pte_to_swp_entry(*pte); + entry = pte_to_swp_entry(ptent); if (swp_type(entry) != type) continue; offset = swp_offset(entry); pte_unmap(pte); + pte = NULL; + folio = swap_cache_get_folio(entry, vma, addr); if (!folio) { struct page *page; @@ -1878,8 +1875,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (!folio) { swp_count = READ_ONCE(si->swap_map[offset]); if (swp_count == 0 || swp_count == SWAP_MAP_BAD) - goto try_next; - + continue; return -ENOMEM; } @@ -1889,20 +1885,17 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (ret < 0) { folio_unlock(folio); folio_put(folio); - goto out; + return ret; } folio_free_swap(folio); folio_unlock(folio); folio_put(folio); -try_next: - pte = pte_offset_map(pmd, addr); - } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap(pte - 1); + } while (addr += PAGE_SIZE, addr != end); - ret = 0; -out: - return ret; + if (pte) + pte_unmap(pte); + return 0; } static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, @@ -1917,8 +1910,6 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, do { cond_resched(); next = pmd_addr_end(addr, end); - if (pmd_none_or_trans_huge_or_clear_bad(pmd)) - continue; ret = unuse_pte_range(vma, pmd, addr, next, type); if (ret) return ret; @@ -2539,7 +2530,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct block_device *bdev = I_BDEV(inode); set_blocksize(bdev, old_block_size); - blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(bdev, p); } inode_lock(inode); @@ -2770,7 +2761,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) if (S_ISBLK(inode->i_mode)) { p->bdev = blkdev_get_by_dev(inode->i_rdev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, p); + BLK_OPEN_READ | BLK_OPEN_WRITE, p, NULL); if (IS_ERR(p->bdev)) { error = PTR_ERR(p->bdev); p->bdev = NULL; @@ -3221,7 +3212,7 @@ bad_swap: p->cluster_next_cpu = NULL; if (inode && S_ISBLK(inode->i_mode) && p->bdev) { set_blocksize(p->bdev, p->old_block_size); - blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(p->bdev, p); } inode = NULL; destroy_swap_extents(p); @@ -3288,9 +3279,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) unsigned char has_cache; int err; - p = get_swap_device(entry); - if (!p) - return -EINVAL; + p = swp_swap_info(entry); offset = swp_offset(entry); ci = lock_cluster_or_swap_info(p, offset); @@ -3337,7 +3326,6 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) unlock_out: unlock_cluster_or_swap_info(p, ci); - put_swap_device(p); return err; } @@ -3468,11 +3456,6 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) goto out; } - /* - * We are fortunate that although vmalloc_to_page uses pte_offset_map, - * no architecture is using highmem pages for kernel page tables: so it - * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps. - */ head = vmalloc_to_page(si->swap_map + offset); offset &= ~PAGE_MASK; diff --git a/mm/truncate.c b/mm/truncate.c index 86de31ed4d32..95d1291d269b 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -486,18 +486,17 @@ void truncate_inode_pages_final(struct address_space *mapping) EXPORT_SYMBOL(truncate_inode_pages_final); /** - * invalidate_mapping_pagevec - Invalidate all the unlocked pages of one inode - * @mapping: the address_space which holds the pages to invalidate + * mapping_try_invalidate - Invalidate all the evictable folios of one inode + * @mapping: the address_space which holds the folios to invalidate * @start: the offset 'from' which to invalidate * @end: the offset 'to' which to invalidate (inclusive) - * @nr_pagevec: invalidate failed page number for caller + * @nr_failed: How many folio invalidations failed * - * This helper is similar to invalidate_mapping_pages(), except that it accounts - * for pages that are likely on a pagevec and counts them in @nr_pagevec, which - * will be used by the caller. + * This function is similar to invalidate_mapping_pages(), except that it + * returns the number of folios which could not be evicted in @nr_failed. */ -unsigned long invalidate_mapping_pagevec(struct address_space *mapping, - pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) +unsigned long mapping_try_invalidate(struct address_space *mapping, + pgoff_t start, pgoff_t end, unsigned long *nr_failed) { pgoff_t indices[PAGEVEC_SIZE]; struct folio_batch fbatch; @@ -527,9 +526,9 @@ unsigned long invalidate_mapping_pagevec(struct address_space *mapping, */ if (!ret) { deactivate_file_folio(folio); - /* It is likely on the pagevec of a remote CPU */ - if (nr_pagevec) - (*nr_pagevec)++; + /* Likely in the lru cache of a remote CPU */ + if (nr_failed) + (*nr_failed)++; } count += ret; } @@ -552,12 +551,12 @@ unsigned long invalidate_mapping_pagevec(struct address_space *mapping, * If you want to remove all the pages of one inode, regardless of * their use and writeback state, use truncate_inode_pages(). * - * Return: the number of the cache entries that were invalidated + * Return: The number of indices that had their contents invalidated */ unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { - return invalidate_mapping_pagevec(mapping, start, end, NULL); + return mapping_try_invalidate(mapping, start, end, NULL); } EXPORT_SYMBOL(invalidate_mapping_pages); @@ -566,7 +565,7 @@ EXPORT_SYMBOL(invalidate_mapping_pages); * refcount. We do this because invalidate_inode_pages2() needs stronger * invalidation guarantees, and cannot afford to leave pages behind because * shrink_page_list() has a temp ref on them, or because they're transiently - * sitting in the folio_add_lru() pagevecs. + * sitting in the folio_add_lru() caches. */ static int invalidate_complete_folio2(struct address_space *mapping, struct folio *folio) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e97a0b4889fc..a2bf37ee276d 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -76,7 +76,10 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, if (flags & MFILL_ATOMIC_WP) _dst_pte = pte_mkuffd_wp(_dst_pte); + ret = -EAGAIN; dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (!dst_pte) + goto out; if (vma_is_shmem(dst_vma)) { /* serialize against truncate with the page table lock */ @@ -94,7 +97,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, * registered, we firstly wr-protect a none pte which has no page cache * page backing it, then access the page. */ - if (!pte_none_mostly(*dst_pte)) + if (!pte_none_mostly(ptep_get(dst_pte))) goto out_unlock; folio = page_folio(page); @@ -121,6 +124,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, ret = 0; out_unlock: pte_unmap_unlock(dst_pte, ptl); +out: return ret; } @@ -212,7 +216,10 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), dst_vma->vm_page_prot)); + ret = -EAGAIN; dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl); + if (!dst_pte) + goto out; if (dst_vma->vm_file) { /* the shmem MAP_PRIVATE case requires checking the i_size */ inode = dst_vma->vm_file->f_inode; @@ -223,7 +230,7 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, goto out_unlock; } ret = -EEXIST; - if (!pte_none(*dst_pte)) + if (!pte_none(ptep_get(dst_pte))) goto out_unlock; set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ @@ -231,6 +238,7 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, ret = 0; out_unlock: pte_unmap_unlock(dst_pte, ptl); +out: return ret; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9683573f1225..93cf99aba335 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -103,7 +103,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (!pte) return -ENOMEM; do { - BUG_ON(!pte_none(*pte)); + BUG_ON(!pte_none(ptep_get(pte))); #ifdef CONFIG_HUGETLB_PAGE size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift); @@ -472,7 +472,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, do { struct page *page = pages[*nr]; - if (WARN_ON(!pte_none(*pte))) + if (WARN_ON(!pte_none(ptep_get(pte)))) return -EBUSY; if (WARN_ON(!page)) return -ENOMEM; @@ -703,11 +703,10 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) if (WARN_ON_ONCE(pmd_bad(*pmd))) return NULL; - ptep = pte_offset_map(pmd, addr); - pte = *ptep; + ptep = pte_offset_kernel(pmd, addr); + pte = ptep_get(ptep); if (pte_present(pte)) page = pte_page(pte); - pte_unmap(ptep); return page; } @@ -791,7 +790,7 @@ get_subtree_max_size(struct rb_node *node) RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb, struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size) -static void purge_vmap_area_lazy(void); +static void reclaim_and_purge_vmap_areas(void); static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); static void drain_vmap_area_work(struct work_struct *work); static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work); @@ -1649,7 +1648,7 @@ retry: overflow: if (!purged) { - purge_vmap_area_lazy(); + reclaim_and_purge_vmap_areas(); purged = 1; goto retry; } @@ -1785,9 +1784,10 @@ out: } /* - * Kick off a purge of the outstanding lazy areas. + * Reclaim vmap areas by purging fragmented blocks and purge_vmap_area_list. */ -static void purge_vmap_area_lazy(void) +static void reclaim_and_purge_vmap_areas(void) + { mutex_lock(&vmap_purge_lock); purge_fragmented_blocks_allcpus(); @@ -1908,6 +1908,12 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr) #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) +/* + * Purge threshold to prevent overeager purging of fragmented blocks for + * regular operations: Purge if vb->free is less than 1/4 of the capacity. + */ +#define VMAP_PURGE_THRESHOLD (VMAP_BBMAP_BITS / 4) + #define VMAP_RAM 0x1 /* indicates vm_map_ram area*/ #define VMAP_BLOCK 0x2 /* mark out the vmap_block sub-type*/ #define VMAP_FLAGS_MASK 0x3 @@ -2086,39 +2092,62 @@ static void free_vmap_block(struct vmap_block *vb) kfree_rcu(vb, rcu_head); } +static bool purge_fragmented_block(struct vmap_block *vb, + struct vmap_block_queue *vbq, struct list_head *purge_list, + bool force_purge) +{ + if (vb->free + vb->dirty != VMAP_BBMAP_BITS || + vb->dirty == VMAP_BBMAP_BITS) + return false; + + /* Don't overeagerly purge usable blocks unless requested */ + if (!(force_purge || vb->free < VMAP_PURGE_THRESHOLD)) + return false; + + /* prevent further allocs after releasing lock */ + WRITE_ONCE(vb->free, 0); + /* prevent purging it again */ + WRITE_ONCE(vb->dirty, VMAP_BBMAP_BITS); + vb->dirty_min = 0; + vb->dirty_max = VMAP_BBMAP_BITS; + spin_lock(&vbq->lock); + list_del_rcu(&vb->free_list); + spin_unlock(&vbq->lock); + list_add_tail(&vb->purge, purge_list); + return true; +} + +static void free_purged_blocks(struct list_head *purge_list) +{ + struct vmap_block *vb, *n_vb; + + list_for_each_entry_safe(vb, n_vb, purge_list, purge) { + list_del(&vb->purge); + free_vmap_block(vb); + } +} + static void purge_fragmented_blocks(int cpu) { LIST_HEAD(purge); struct vmap_block *vb; - struct vmap_block *n_vb; struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); rcu_read_lock(); list_for_each_entry_rcu(vb, &vbq->free, free_list) { + unsigned long free = READ_ONCE(vb->free); + unsigned long dirty = READ_ONCE(vb->dirty); - if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS)) + if (free + dirty != VMAP_BBMAP_BITS || + dirty == VMAP_BBMAP_BITS) continue; spin_lock(&vb->lock); - if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { - vb->free = 0; /* prevent further allocs after releasing lock */ - vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ - vb->dirty_min = 0; - vb->dirty_max = VMAP_BBMAP_BITS; - spin_lock(&vbq->lock); - list_del_rcu(&vb->free_list); - spin_unlock(&vbq->lock); - spin_unlock(&vb->lock); - list_add_tail(&vb->purge, &purge); - } else - spin_unlock(&vb->lock); + purge_fragmented_block(vb, vbq, &purge, true); + spin_unlock(&vb->lock); } rcu_read_unlock(); - - list_for_each_entry_safe(vb, n_vb, &purge, purge) { - list_del(&vb->purge); - free_vmap_block(vb); - } + free_purged_blocks(&purge); } static void purge_fragmented_blocks_allcpus(void) @@ -2153,6 +2182,9 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) list_for_each_entry_rcu(vb, &vbq->free, free_list) { unsigned long pages_off; + if (READ_ONCE(vb->free) < (1UL << order)) + continue; + spin_lock(&vb->lock); if (vb->free < (1UL << order)) { spin_unlock(&vb->lock); @@ -2161,7 +2193,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) pages_off = VMAP_BBMAP_BITS - vb->free; vaddr = vmap_block_vaddr(vb->va->va_start, pages_off); - vb->free -= 1UL << order; + WRITE_ONCE(vb->free, vb->free - (1UL << order)); bitmap_set(vb->used_map, pages_off, (1UL << order)); if (vb->free == 0) { spin_lock(&vbq->lock); @@ -2211,11 +2243,11 @@ static void vb_free(unsigned long addr, unsigned long size) spin_lock(&vb->lock); - /* Expand dirty range */ + /* Expand the not yet TLB flushed dirty range */ vb->dirty_min = min(vb->dirty_min, offset); vb->dirty_max = max(vb->dirty_max, offset + (1UL << order)); - vb->dirty += 1UL << order; + WRITE_ONCE(vb->dirty, vb->dirty + (1UL << order)); if (vb->dirty == VMAP_BBMAP_BITS) { BUG_ON(vb->free); spin_unlock(&vb->lock); @@ -2226,21 +2258,30 @@ static void vb_free(unsigned long addr, unsigned long size) static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) { + LIST_HEAD(purge_list); int cpu; if (unlikely(!vmap_initialized)) return; - might_sleep(); + mutex_lock(&vmap_purge_lock); for_each_possible_cpu(cpu) { struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); struct vmap_block *vb; + unsigned long idx; rcu_read_lock(); - list_for_each_entry_rcu(vb, &vbq->free, free_list) { + xa_for_each(&vbq->vmap_blocks, idx, vb) { spin_lock(&vb->lock); - if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) { + + /* + * Try to purge a fragmented block first. If it's + * not purgeable, check whether there is dirty + * space to be flushed. + */ + if (!purge_fragmented_block(vb, vbq, &purge_list, false) && + vb->dirty_max && vb->dirty != VMAP_BBMAP_BITS) { unsigned long va_start = vb->va->va_start; unsigned long s, e; @@ -2250,15 +2291,18 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) start = min(s, start); end = max(e, end); + /* Prevent that this is flushed again */ + vb->dirty_min = VMAP_BBMAP_BITS; + vb->dirty_max = 0; + flush = 1; } spin_unlock(&vb->lock); } rcu_read_unlock(); } + free_purged_blocks(&purge_list); - mutex_lock(&vmap_purge_lock); - purge_fragmented_blocks_allcpus(); if (!__purge_vmap_area_lazy(start, end) && flush) flush_tlb_kernel_range(start, end); mutex_unlock(&vmap_purge_lock); @@ -2899,10 +2943,16 @@ struct vmap_pfn_data { static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private) { struct vmap_pfn_data *data = private; + unsigned long pfn = data->pfns[data->idx]; + pte_t ptent; - if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx]))) + if (WARN_ON_ONCE(pfn_valid(pfn))) return -EINVAL; - *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot)); + + ptent = pte_mkspecial(pfn_pte(pfn, data->prot)); + set_pte_at(&init_mm, addr, pte, ptent); + + data->idx++; return 0; } @@ -3098,11 +3148,20 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, * allocation request, free them via vfree() if any. */ if (area->nr_pages != nr_small_pages) { - /* vm_area_alloc_pages() can also fail due to a fatal signal */ - if (!fatal_signal_pending(current)) + /* + * vm_area_alloc_pages() can fail due to insufficient memory but + * also:- + * + * - a pending fatal signal + * - insufficient huge page-order pages + * + * Since we always retry allocations at order-0 in the huge page + * case a warning for either is spurious. + */ + if (!fatal_signal_pending(current) && page_order == 0) warn_alloc(gfp_mask, NULL, - "vmalloc error: size %lu, page order %u, failed to allocate pages", - area->nr_pages * PAGE_SIZE, page_order); + "vmalloc error: size %lu, failed to allocate pages", + area->nr_pages * PAGE_SIZE); goto fail; } @@ -3511,7 +3570,7 @@ static size_t zero_iter(struct iov_iter *iter, size_t count) while (remains > 0) { size_t num, copied; - num = remains < PAGE_SIZE ? remains : PAGE_SIZE; + num = min_t(size_t, remains, PAGE_SIZE); copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter); remains -= copied; @@ -4142,7 +4201,7 @@ recovery: overflow: spin_unlock(&free_vmap_area_lock); if (!purged) { - purge_vmap_area_lazy(); + reclaim_and_purge_vmap_areas(); purged = true; /* Before "retry", check if we recover. */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 6d0cd2840cf0..1080209a568b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -35,7 +35,7 @@ #include <linux/cpuset.h> #include <linux/compaction.h> #include <linux/notifier.h> -#include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/freezer.h> @@ -57,7 +57,6 @@ #include <linux/khugepaged.h> #include <linux/rculist_nulls.h> #include <linux/random.h> -#include <linux/srcu.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -190,9 +189,7 @@ struct scan_control { int vm_swappiness = 60; LIST_HEAD(shrinker_list); -DEFINE_MUTEX(shrinker_mutex); -DEFINE_SRCU(shrinker_srcu); -static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0); +DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG static int shrinker_nr_max; @@ -211,21 +208,8 @@ static inline int shrinker_defer_size(int nr_items) static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, int nid) { - return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info, - &shrinker_srcu, - lockdep_is_held(&shrinker_mutex)); -} - -static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg, - int nid) -{ - return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info, - &shrinker_srcu); -} - -static void free_shrinker_info_rcu(struct rcu_head *head) -{ - kvfree(container_of(head, struct shrinker_info, rcu)); + return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, + lockdep_is_held(&shrinker_rwsem)); } static int expand_one_shrinker_info(struct mem_cgroup *memcg, @@ -266,7 +250,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, defer_size - old_defer_size); rcu_assign_pointer(pn->shrinker_info, new); - call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu); + kvfree_rcu(old, rcu); } return 0; @@ -292,7 +276,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) int nid, size, ret = 0; int map_size, defer_size = 0; - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); map_size = shrinker_map_size(shrinker_nr_max); defer_size = shrinker_defer_size(shrinker_nr_max); size = map_size + defer_size; @@ -308,7 +292,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) info->map_nr_max = shrinker_nr_max; rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return ret; } @@ -324,7 +308,7 @@ static int expand_shrinker_info(int new_id) if (!root_mem_cgroup) goto out; - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); map_size = shrinker_map_size(new_nr_max); defer_size = shrinker_defer_size(new_nr_max); @@ -352,16 +336,15 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { struct shrinker_info *info; - int srcu_idx; - srcu_idx = srcu_read_lock(&shrinker_srcu); - info = shrinker_info_srcu(memcg, nid); + rcu_read_lock(); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { /* Pairs with smp mb in shrink_slab() */ smp_mb__before_atomic(); set_bit(shrinker_id, info->map); } - srcu_read_unlock(&shrinker_srcu, srcu_idx); + rcu_read_unlock(); } } @@ -374,7 +357,8 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) if (mem_cgroup_disabled()) return -ENOSYS; - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); + /* This may call shrinker, so it must use down_read_trylock() */ id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; @@ -388,7 +372,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) shrinker->id = id; ret = 0; unlock: - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return ret; } @@ -398,7 +382,7 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) BUG_ON(id < 0); - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); idr_remove(&shrinker_idr, id); } @@ -408,7 +392,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, { struct shrinker_info *info; - info = shrinker_info_srcu(memcg, nid); + info = shrinker_info_protected(memcg, nid); return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); } @@ -417,7 +401,7 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, { struct shrinker_info *info; - info = shrinker_info_srcu(memcg, nid); + info = shrinker_info_protected(memcg, nid); return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); } @@ -433,7 +417,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) parent = root_mem_cgroup; /* Prevent from concurrent shrinker_info expand */ - mutex_lock(&shrinker_mutex); + down_read(&shrinker_rwsem); for_each_node(nid) { child_info = shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); @@ -442,15 +426,20 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) atomic_long_add(nr, &parent_info->nr_deferred[i]); } } - mutex_unlock(&shrinker_mutex); + up_read(&shrinker_rwsem); } +/* Returns true for reclaim through cgroup limits or cgroup interfaces. */ static bool cgroup_reclaim(struct scan_control *sc) { return sc->target_mem_cgroup; } -static bool global_reclaim(struct scan_control *sc) +/* + * Returns true for reclaim on the root cgroup. This is true for direct + * allocator reclaim and reclaim through cgroup interfaces on the root cgroup. + */ +static bool root_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup); } @@ -505,7 +494,7 @@ static bool cgroup_reclaim(struct scan_control *sc) return false; } -static bool global_reclaim(struct scan_control *sc) +static bool root_reclaim(struct scan_control *sc) { return true; } @@ -562,7 +551,7 @@ static void flush_reclaim_state(struct scan_control *sc) * memcg reclaim, to make reporting more accurate and reduce * underestimation, but it's probably not worth the complexity for now. */ - if (current->reclaim_state && global_reclaim(sc)) { + if (current->reclaim_state && root_reclaim(sc)) { sc->nr_reclaimed += current->reclaim_state->reclaimed; current->reclaim_state->reclaimed = 0; } @@ -743,9 +732,9 @@ void free_prealloced_shrinker(struct shrinker *shrinker) shrinker->name = NULL; #endif if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); unregister_memcg_shrinker(shrinker); - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return; } @@ -755,11 +744,11 @@ void free_prealloced_shrinker(struct shrinker *shrinker) void register_shrinker_prepared(struct shrinker *shrinker) { - mutex_lock(&shrinker_mutex); - list_add_tail_rcu(&shrinker->list, &shrinker_list); + down_write(&shrinker_rwsem); + list_add_tail(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; shrinker_debugfs_add(shrinker); - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); } static int __register_shrinker(struct shrinker *shrinker) @@ -810,16 +799,13 @@ void unregister_shrinker(struct shrinker *shrinker) if (!(shrinker->flags & SHRINKER_REGISTERED)) return; - mutex_lock(&shrinker_mutex); - list_del_rcu(&shrinker->list); + down_write(&shrinker_rwsem); + list_del(&shrinker->list); shrinker->flags &= ~SHRINKER_REGISTERED; if (shrinker->flags & SHRINKER_MEMCG_AWARE) unregister_memcg_shrinker(shrinker); debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); - mutex_unlock(&shrinker_mutex); - - atomic_inc(&shrinker_srcu_generation); - synchronize_srcu(&shrinker_srcu); + up_write(&shrinker_rwsem); shrinker_debugfs_remove(debugfs_entry, debugfs_id); @@ -831,13 +817,15 @@ EXPORT_SYMBOL(unregister_shrinker); /** * synchronize_shrinkers - Wait for all running shrinkers to complete. * - * This is useful to guarantee that all shrinker invocations have seen an - * update, before freeing memory. + * This is equivalent to calling unregister_shrink() and register_shrinker(), + * but atomically and with less overhead. This is useful to guarantee that all + * shrinker invocations have seen an update, before freeing memory, similar to + * rcu. */ void synchronize_shrinkers(void) { - atomic_inc(&shrinker_srcu_generation); - synchronize_srcu(&shrinker_srcu); + down_write(&shrinker_rwsem); + up_write(&shrinker_rwsem); } EXPORT_SYMBOL(synchronize_shrinkers); @@ -946,20 +934,19 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, { struct shrinker_info *info; unsigned long ret, freed = 0; - int srcu_idx, generation; - int i = 0; + int i; if (!mem_cgroup_online(memcg)) return 0; -again: - srcu_idx = srcu_read_lock(&shrinker_srcu); - info = shrinker_info_srcu(memcg, nid); + if (!down_read_trylock(&shrinker_rwsem)) + return 0; + + info = shrinker_info_protected(memcg, nid); if (unlikely(!info)) goto unlock; - generation = atomic_read(&shrinker_srcu_generation); - for_each_set_bit_from(i, info->map, info->map_nr_max) { + for_each_set_bit(i, info->map, info->map_nr_max) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, @@ -1005,14 +992,14 @@ again: set_shrinker_bit(memcg, nid, i); } freed += ret; - if (atomic_read(&shrinker_srcu_generation) != generation) { - srcu_read_unlock(&shrinker_srcu, srcu_idx); - i++; - goto again; + + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; } } unlock: - srcu_read_unlock(&shrinker_srcu, srcu_idx); + up_read(&shrinker_rwsem); return freed; } #else /* CONFIG_MEMCG */ @@ -1049,7 +1036,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, { unsigned long ret, freed = 0; struct shrinker *shrinker; - int srcu_idx, generation; /* * The root memcg might be allocated even though memcg is disabled @@ -1061,11 +1047,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) return shrink_slab_memcg(gfp_mask, nid, memcg, priority); - srcu_idx = srcu_read_lock(&shrinker_srcu); + if (!down_read_trylock(&shrinker_rwsem)) + goto out; - generation = atomic_read(&shrinker_srcu_generation); - list_for_each_entry_srcu(shrinker, &shrinker_list, list, - srcu_read_lock_held(&shrinker_srcu)) { + list_for_each_entry(shrinker, &shrinker_list, list) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, @@ -1076,14 +1061,19 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (ret == SHRINK_EMPTY) ret = 0; freed += ret; - - if (atomic_read(&shrinker_srcu_generation) != generation) { + /* + * Bail out if someone want to register a new shrinker to + * prevent the registration from being stalled for long periods + * by parallel ongoing shrinking. + */ + if (rwsem_is_contended(&shrinker_rwsem)) { freed = freed ? : 1; break; } } - srcu_read_unlock(&shrinker_srcu, srcu_idx); + up_read(&shrinker_rwsem); +out: cond_resched(); return freed; } @@ -1621,9 +1611,10 @@ static void folio_check_dirty_writeback(struct folio *folio, mapping->a_ops->is_dirty_writeback(folio, dirty, writeback); } -static struct page *alloc_demote_page(struct page *page, unsigned long private) +static struct folio *alloc_demote_folio(struct folio *src, + unsigned long private) { - struct page *target_page; + struct folio *dst; nodemask_t *allowed_mask; struct migration_target_control *mtc; @@ -1641,14 +1632,14 @@ static struct page *alloc_demote_page(struct page *page, unsigned long private) */ mtc->nmask = NULL; mtc->gfp_mask |= __GFP_THISNODE; - target_page = alloc_migration_target(page, (unsigned long)mtc); - if (target_page) - return target_page; + dst = alloc_migration_target(src, (unsigned long)mtc); + if (dst) + return dst; mtc->gfp_mask &= ~__GFP_THISNODE; mtc->nmask = allowed_mask; - return alloc_migration_target(page, (unsigned long)mtc); + return alloc_migration_target(src, (unsigned long)mtc); } /* @@ -1683,7 +1674,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, node_get_allowed_targets(pgdat, &allowed_mask); /* Demotion ignores all cpuset and mempolicy settings */ - migrate_pages(demote_folios, alloc_demote_page, NULL, + migrate_pages(demote_folios, alloc_demote_folio, NULL, (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); @@ -2270,6 +2261,25 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, } +#ifdef CONFIG_CMA +/* + * It is waste of effort to scan and reclaim CMA pages if it is not available + * for current allocation context. Kswapd can not be enrolled as it can not + * distinguish this scenario by using sc->gfp_mask = GFP_KERNEL + */ +static bool skip_cma(struct folio *folio, struct scan_control *sc) +{ + return !current_is_kswapd() && + gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE && + get_pageblock_migratetype(&folio->page) == MIGRATE_CMA; +} +#else +static bool skip_cma(struct folio *folio, struct scan_control *sc) +{ + return false; +} +#endif + /* * Isolating page from the lruvec to fill in @dst list by nr_to_scan times. * @@ -2316,7 +2326,8 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan, nr_pages = folio_nr_pages(folio); total_scan += nr_pages; - if (folio_zonenum(folio) > sc->reclaim_idx) { + if (folio_zonenum(folio) > sc->reclaim_idx || + skip_cma(folio, sc)) { nr_skipped[folio_zonenum(folio)] += nr_pages; move_to = &folios_skipped; goto move; @@ -2458,7 +2469,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, * won't get blocked by normal direct-reclaimers, forming a circular * deadlock. */ - if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) + if (gfp_has_io_fs(sc->gfp_mask)) inactive >>= 3; too_many = isolated > inactive; @@ -3233,6 +3244,16 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); #define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap]) #endif +static bool should_walk_mmu(void) +{ + return arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK); +} + +static bool should_clear_pmd_young(void) +{ + return arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG); +} + /****************************************************************************** * shorthand helpers ******************************************************************************/ @@ -3993,28 +4014,29 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); - VM_WARN_ON_ONCE(pmd_leaf(*pmd)); - - ptl = pte_lockptr(args->mm, pmd); - if (!spin_trylock(ptl)) + pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl); + if (!pte) + return false; + if (!spin_trylock(ptl)) { + pte_unmap(pte); return false; + } arch_enter_lazy_mmu_mode(); - - pte = pte_offset_map(pmd, start & PMD_MASK); restart: for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; struct folio *folio; + pte_t ptent = ptep_get(pte + i); total++; walk->mm_stats[MM_LEAF_TOTAL]++; - pfn = get_pte_pfn(pte[i], args->vma, addr); + pfn = get_pte_pfn(ptent, args->vma, addr); if (pfn == -1) continue; - if (!pte_young(pte[i])) { + if (!pte_young(ptent)) { walk->mm_stats[MM_LEAF_OLD]++; continue; } @@ -4029,7 +4051,7 @@ restart: young++; walk->mm_stats[MM_LEAF_YOUNG]++; - if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && + if (pte_dirty(ptent) && !folio_test_dirty(folio) && !(folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio))) folio_mark_dirty(folio); @@ -4042,10 +4064,8 @@ restart: if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) goto restart; - pte_unmap(pte); - arch_leave_lazy_mmu_mode(); - spin_unlock(ptl); + pte_unmap_unlock(pte, ptl); return suitable_to_scan(total, young); } @@ -4097,7 +4117,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area goto next; if (!pmd_trans_huge(pmd[i])) { - if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) + if (should_clear_pmd_young()) pmdp_test_and_clear_young(vma, addr, pmd + i); goto next; } @@ -4143,7 +4163,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, unsigned long next; unsigned long addr; struct vm_area_struct *vma; - unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)]; + DECLARE_BITMAP(bitmap, MIN_LRU_BATCH); unsigned long first = -1; struct lru_gen_mm_walk *walk = args->private; @@ -4190,7 +4210,7 @@ restart: #endif walk->mm_stats[MM_NONLEAF_TOTAL]++; - if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) { + if (should_clear_pmd_young()) { if (!pmd_young(val)) continue; @@ -4492,7 +4512,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, * handful of PTEs. Spreading the work out over a period of time usually * is less efficient, but it avoids bursty page faults. */ - if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) { + if (!should_walk_mmu()) { success = iterate_mm_list_nowalk(lruvec, max_seq); goto done; } @@ -4674,12 +4694,13 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; + pte_t ptent = ptep_get(pte + i); - pfn = get_pte_pfn(pte[i], pvmw->vma, addr); + pfn = get_pte_pfn(ptent, pvmw->vma, addr); if (pfn == -1) continue; - if (!pte_young(pte[i])) + if (!pte_young(ptent)) continue; folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap); @@ -4691,7 +4712,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) young++; - if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && + if (pte_dirty(ptent) && !folio_test_dirty(folio) && !(folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio))) folio_mark_dirty(folio); @@ -4743,10 +4764,11 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) { int seg; int old, new; + unsigned long flags; int bin = get_random_u32_below(MEMCG_NR_BINS); struct pglist_data *pgdat = lruvec_pgdat(lruvec); - spin_lock(&pgdat->memcg_lru.lock); + spin_lock_irqsave(&pgdat->memcg_lru.lock, flags); VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); @@ -4781,7 +4803,7 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); - spin_unlock(&pgdat->memcg_lru.lock); + spin_unlock_irqrestore(&pgdat->memcg_lru.lock, flags); } void lru_gen_online_memcg(struct mem_cgroup *memcg) @@ -4794,7 +4816,7 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg) struct pglist_data *pgdat = NODE_DATA(nid); struct lruvec *lruvec = get_lruvec(memcg, nid); - spin_lock(&pgdat->memcg_lru.lock); + spin_lock_irq(&pgdat->memcg_lru.lock); VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); @@ -4805,7 +4827,7 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg) lruvec->lrugen.gen = gen; - spin_unlock(&pgdat->memcg_lru.lock); + spin_unlock_irq(&pgdat->memcg_lru.lock); } } @@ -4829,7 +4851,7 @@ void lru_gen_release_memcg(struct mem_cgroup *memcg) struct pglist_data *pgdat = NODE_DATA(nid); struct lruvec *lruvec = get_lruvec(memcg, nid); - spin_lock(&pgdat->memcg_lru.lock); + spin_lock_irq(&pgdat->memcg_lru.lock); VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); @@ -4841,12 +4863,14 @@ void lru_gen_release_memcg(struct mem_cgroup *memcg) if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); - spin_unlock(&pgdat->memcg_lru.lock); + spin_unlock_irq(&pgdat->memcg_lru.lock); } } -void lru_gen_soft_reclaim(struct lruvec *lruvec) +void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + /* see the comment on MEMCG_NR_GENS */ if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); @@ -4912,7 +4936,6 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) WRITE_ONCE(lrugen->protected[hist][type][tier - 1], lrugen->protected[hist][type][tier - 1] + delta); - __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); return true; } @@ -5307,7 +5330,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool static unsigned long get_nr_to_reclaim(struct scan_control *sc) { /* don't abort memcg reclaim to ensure fairness */ - if (!global_reclaim(sc)) + if (!root_reclaim(sc)) return -1; return max(sc->nr_to_reclaim, compact_gap(sc->order)); @@ -5459,7 +5482,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc { struct blk_plug plug; - VM_WARN_ON_ONCE(global_reclaim(sc)); + VM_WARN_ON_ONCE(root_reclaim(sc)); VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap); lru_add_drain(); @@ -5520,7 +5543,7 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control * struct blk_plug plug; unsigned long reclaimed = sc->nr_reclaimed; - VM_WARN_ON_ONCE(!global_reclaim(sc)); + VM_WARN_ON_ONCE(!root_reclaim(sc)); /* * Unmapped clean folios are already prioritized. Scanning for more of @@ -5727,10 +5750,10 @@ static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, c if (get_cap(LRU_GEN_CORE)) caps |= BIT(LRU_GEN_CORE); - if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK)) + if (should_walk_mmu()) caps |= BIT(LRU_GEN_MM_WALK); - if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) + if (should_clear_pmd_young()) caps |= BIT(LRU_GEN_NONLEAF_YOUNG); return sysfs_emit(buf, "0x%04x\n", caps); @@ -6242,7 +6265,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) bool proportional_reclaim; struct blk_plug plug; - if (lru_gen_enabled() && !global_reclaim(sc)) { + if (lru_gen_enabled() && !root_reclaim(sc)) { lru_gen_shrink_lruvec(lruvec, sc); return; } @@ -6398,14 +6421,13 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, if (!managed_zone(zone)) continue; - switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { - case COMPACT_SUCCESS: - case COMPACT_CONTINUE: + /* Allocation can already succeed, nothing to do */ + if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), + sc->reclaim_idx, 0)) + return false; + + if (compaction_suitable(zone, sc->order, sc->reclaim_idx)) return false; - default: - /* check next zone */ - ; - } } /* @@ -6484,7 +6506,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) struct lruvec *target_lruvec; bool reclaimable = false; - if (lru_gen_enabled() && global_reclaim(sc)) { + if (lru_gen_enabled() && root_reclaim(sc)) { lru_gen_shrink_node(pgdat, sc); return; } @@ -6556,10 +6578,13 @@ again: * Legacy memcg will stall in page writeback so avoid forcibly * stalling in reclaim_throttle(). */ - if ((current_is_kswapd() || - (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) && - sc->nr.dirty && sc->nr.dirty == sc->nr.congested) - set_bit(LRUVEC_CONGESTED, &target_lruvec->flags); + if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) { + if (cgroup_reclaim(sc) && writeback_throttling_sane(sc)) + set_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags); + + if (current_is_kswapd()) + set_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags); + } /* * Stall direct reclaim for IO completions if the lruvec is @@ -6569,7 +6594,8 @@ again: */ if (!current_is_kswapd() && current_may_throttle() && !sc->hibernation_mode && - test_bit(LRUVEC_CONGESTED, &target_lruvec->flags)) + (test_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags) || + test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags))) reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED); if (should_continue_reclaim(pgdat, nr_node_reclaimed, sc)) @@ -6593,14 +6619,14 @@ again: static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { unsigned long watermark; - enum compact_result suitable; - suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx); - if (suitable == COMPACT_SUCCESS) - /* Allocation should succeed already. Don't reclaim. */ + /* Allocation can already succeed, nothing to do */ + if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), + sc->reclaim_idx, 0)) return true; - if (suitable == COMPACT_SKIPPED) - /* Compaction cannot yet proceed. Do reclaim. */ + + /* Compaction cannot yet proceed. Do reclaim. */ + if (!compaction_suitable(zone, sc->order, sc->reclaim_idx)) return false; /* @@ -6826,7 +6852,7 @@ retry: lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, zone->zone_pgdat); - clear_bit(LRUVEC_CONGESTED, &lruvec->flags); + clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); } } @@ -6887,7 +6913,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) continue; pfmemalloc_reserve += min_wmark_pages(zone); - free_pages += zone_page_state(zone, NR_FREE_PAGES); + free_pages += zone_page_state_snapshot(zone, NR_FREE_PAGES); } /* If there are no reserves (unexpected config) then do not throttle */ @@ -7215,7 +7241,8 @@ static void clear_pgdat_congested(pg_data_t *pgdat) { struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); - clear_bit(LRUVEC_CONGESTED, &lruvec->flags); + clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags); + clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); clear_bit(PGDAT_DIRTY, &pgdat->flags); clear_bit(PGDAT_WRITEBACK, &pgdat->flags); } @@ -7840,7 +7867,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) /* * This kswapd start function will be called by init and node-hot-add. */ -void kswapd_run(int nid) +void __meminit kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); @@ -7861,7 +7888,7 @@ void kswapd_run(int nid) * Called by memory hotplug when all memory in a node is offlined. Caller must * be holding mem_hotplug_begin/done(). */ -void kswapd_stop(int nid) +void __meminit kswapd_stop(int nid) { pg_data_t *pgdat = NODE_DATA(nid); struct task_struct *kswapd; @@ -8058,23 +8085,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) } #endif -void check_move_unevictable_pages(struct pagevec *pvec) -{ - struct folio_batch fbatch; - unsigned i; - - folio_batch_init(&fbatch); - for (i = 0; i < pvec->nr; i++) { - struct page *page = pvec->pages[i]; - - if (PageTransTail(page)) - continue; - folio_batch_add(&fbatch, page_folio(page)); - } - check_move_unevictable_folios(&fbatch); -} -EXPORT_SYMBOL_GPL(check_move_unevictable_pages); - /** * check_move_unevictable_folios - Move evictable folios to appropriate zone * lru list diff --git a/mm/vmstat.c b/mm/vmstat.c index c28046371b45..b731d57996c5 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -28,6 +28,7 @@ #include <linux/mm_inline.h> #include <linux/page_ext.h> #include <linux/page_owner.h> +#include <linux/sched/isolation.h> #include "internal.h" @@ -1180,6 +1181,9 @@ const char * const vmstat_text[] = { "nr_zspages", #endif "nr_free_cma", +#ifdef CONFIG_UNACCEPTED_MEMORY + "nr_unaccepted", +#endif /* enum numa_stat_item counters */ #ifdef CONFIG_NUMA @@ -2022,6 +2026,20 @@ static void vmstat_shepherd(struct work_struct *w) for_each_online_cpu(cpu) { struct delayed_work *dw = &per_cpu(vmstat_work, cpu); + /* + * In kernel users of vmstat counters either require the precise value and + * they are using zone_page_state_snapshot interface or they can live with + * an imprecision as the regular flushing can happen at arbitrary time and + * cumulative error can grow (see calculate_normal_threshold). + * + * From that POV the regular flushing can be postponed for CPUs that have + * been isolated from the kernel interference without critical + * infrastructure ever noticing. Skip regular flushing from vmstat_shepherd + * for all isolated CPUs to avoid interference with the isolated workload. + */ + if (cpu_is_isolated(cpu)) + continue; + if (!delayed_work_pending(dw) && need_update(cpu)) queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0); diff --git a/mm/workingset.c b/mm/workingset.c index 817758951886..4686ae363000 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -255,45 +255,58 @@ static void *lru_gen_eviction(struct folio *folio) return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs); } +/* + * Tests if the shadow entry is for a folio that was recently evicted. + * Fills in @lruvec, @token, @workingset with the values unpacked from shadow. + */ +static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec, + unsigned long *token, bool *workingset) +{ + int memcg_id; + unsigned long min_seq; + struct mem_cgroup *memcg; + struct pglist_data *pgdat; + + unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset); + + memcg = mem_cgroup_from_id(memcg_id); + *lruvec = mem_cgroup_lruvec(memcg, pgdat); + + min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]); + return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)); +} + static void lru_gen_refault(struct folio *folio, void *shadow) { + bool recent; int hist, tier, refs; - int memcg_id; bool workingset; unsigned long token; - unsigned long min_seq; struct lruvec *lruvec; struct lru_gen_folio *lrugen; - struct mem_cgroup *memcg; - struct pglist_data *pgdat; int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); - unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset); - - if (pgdat != folio_pgdat(folio)) - return; - rcu_read_lock(); - memcg = folio_memcg_rcu(folio); - if (memcg_id != mem_cgroup_id(memcg)) + recent = lru_gen_test_recent(shadow, type, &lruvec, &token, &workingset); + if (lruvec != folio_lruvec(folio)) goto unlock; - lruvec = mem_cgroup_lruvec(memcg, pgdat); - lrugen = &lruvec->lrugen; + mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); - min_seq = READ_ONCE(lrugen->min_seq[type]); - if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH))) + if (!recent) goto unlock; - hist = lru_hist_from_seq(min_seq); + lrugen = &lruvec->lrugen; + + hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type])); /* see the comment in folio_lru_refs() */ refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset; tier = lru_tier_from_refs(refs); atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); - mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); + mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); /* * Count the following two cases as stalls: @@ -317,6 +330,12 @@ static void *lru_gen_eviction(struct folio *folio) return NULL; } +static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec, + unsigned long *token, bool *workingset) +{ + return false; +} + static void lru_gen_refault(struct folio *folio, void *shadow) { } @@ -385,42 +404,33 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) } /** - * workingset_refault - Evaluate the refault of a previously evicted folio. - * @folio: The freshly allocated replacement folio. - * @shadow: Shadow entry of the evicted folio. - * - * Calculates and evaluates the refault distance of the previously - * evicted folio in the context of the node and the memcg whose memory - * pressure caused the eviction. + * workingset_test_recent - tests if the shadow entry is for a folio that was + * recently evicted. Also fills in @workingset with the value unpacked from + * shadow. + * @shadow: the shadow entry to be tested. + * @file: whether the corresponding folio is from the file lru. + * @workingset: where the workingset value unpacked from shadow should + * be stored. + * + * Return: true if the shadow is for a recently evicted folio; false otherwise. */ -void workingset_refault(struct folio *folio, void *shadow) +bool workingset_test_recent(void *shadow, bool file, bool *workingset) { - bool file = folio_is_file_lru(folio); struct mem_cgroup *eviction_memcg; struct lruvec *eviction_lruvec; unsigned long refault_distance; unsigned long workingset_size; - struct pglist_data *pgdat; - struct mem_cgroup *memcg; - unsigned long eviction; - struct lruvec *lruvec; unsigned long refault; - bool workingset; int memcgid; - long nr; + struct pglist_data *pgdat; + unsigned long eviction; - if (lru_gen_enabled()) { - lru_gen_refault(folio, shadow); - return; - } + if (lru_gen_enabled()) + return lru_gen_test_recent(shadow, file, &eviction_lruvec, &eviction, workingset); - unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset); eviction <<= bucket_order; - /* Flush stats (and potentially sleep) before holding RCU read lock */ - mem_cgroup_flush_stats_ratelimited(); - - rcu_read_lock(); /* * Look up the memcg associated with the stored ID. It might * have been deleted since the folio's eviction. @@ -439,7 +449,8 @@ void workingset_refault(struct folio *folio, void *shadow) */ eviction_memcg = mem_cgroup_from_id(memcgid); if (!mem_cgroup_disabled() && !eviction_memcg) - goto out; + return false; + eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); refault = atomic_long_read(&eviction_lruvec->nonresident_age); @@ -462,20 +473,6 @@ void workingset_refault(struct folio *folio, void *shadow) refault_distance = (refault - eviction) & EVICTION_MASK; /* - * The activation decision for this folio is made at the level - * where the eviction occurred, as that is where the LRU order - * during folio reclaim is being determined. - * - * However, the cgroup that will own the folio is the one that - * is actually experiencing the refault event. - */ - nr = folio_nr_pages(folio); - memcg = folio_memcg(folio); - pgdat = folio_pgdat(folio); - lruvec = mem_cgroup_lruvec(memcg, pgdat); - - mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); - /* * Compare the distance to the existing workingset size. We * don't activate pages that couldn't stay resident even if * all the memory was available to the workingset. Whether @@ -495,7 +492,54 @@ void workingset_refault(struct folio *folio, void *shadow) NR_INACTIVE_ANON); } } - if (refault_distance > workingset_size) + + return refault_distance <= workingset_size; +} + +/** + * workingset_refault - Evaluate the refault of a previously evicted folio. + * @folio: The freshly allocated replacement folio. + * @shadow: Shadow entry of the evicted folio. + * + * Calculates and evaluates the refault distance of the previously + * evicted folio in the context of the node and the memcg whose memory + * pressure caused the eviction. + */ +void workingset_refault(struct folio *folio, void *shadow) +{ + bool file = folio_is_file_lru(folio); + struct pglist_data *pgdat; + struct mem_cgroup *memcg; + struct lruvec *lruvec; + bool workingset; + long nr; + + if (lru_gen_enabled()) { + lru_gen_refault(folio, shadow); + return; + } + + /* Flush stats (and potentially sleep) before holding RCU read lock */ + mem_cgroup_flush_stats_ratelimited(); + + rcu_read_lock(); + + /* + * The activation decision for this folio is made at the level + * where the eviction occurred, as that is where the LRU order + * during folio reclaim is being determined. + * + * However, the cgroup that will own the folio is the one that + * is actually experiencing the refault event. + */ + nr = folio_nr_pages(folio); + memcg = folio_memcg(folio); + pgdat = folio_pgdat(folio); + lruvec = mem_cgroup_lruvec(memcg, pgdat); + + mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); + + if (!workingset_test_recent(shadow, file, &workingset)) goto out; folio_set_active(folio); diff --git a/mm/z3fold.c b/mm/z3fold.c index 0cef845d397b..e84de91ecccb 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -125,13 +125,11 @@ struct z3fold_header { /** * struct z3fold_pool - stores metadata for each z3fold pool * @name: pool name - * @lock: protects pool unbuddied/lru lists + * @lock: protects pool unbuddied lists * @stale_lock: protects pool stale page list * @unbuddied: per-cpu array of lists tracking z3fold pages that contain 2- * buddies; the list each z3fold page is added to depends on * the size of its free region. - * @lru: list tracking the z3fold pages in LRU order by most recently - * added buddy. * @stale: list of pages marked for freeing * @pages_nr: number of z3fold pages in the pool. * @c_handle: cache for z3fold_buddy_slots allocation @@ -149,12 +147,9 @@ struct z3fold_pool { spinlock_t lock; spinlock_t stale_lock; struct list_head *unbuddied; - struct list_head lru; struct list_head stale; atomic64_t pages_nr; struct kmem_cache *c_handle; - struct zpool *zpool; - const struct zpool_ops *zpool_ops; struct workqueue_struct *compact_wq; struct workqueue_struct *release_wq; struct work_struct work; @@ -329,7 +324,6 @@ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, struct z3fold_header *zhdr = page_address(page); struct z3fold_buddy_slots *slots; - INIT_LIST_HEAD(&page->lru); clear_bit(PAGE_HEADLESS, &page->private); clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); clear_bit(NEEDS_COMPACTING, &page->private); @@ -451,8 +445,6 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) set_bit(PAGE_STALE, &page->private); clear_bit(NEEDS_COMPACTING, &page->private); spin_lock(&pool->lock); - if (!list_empty(&page->lru)) - list_del_init(&page->lru); spin_unlock(&pool->lock); if (locked) @@ -930,7 +922,6 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp) for_each_unbuddied_list(i, 0) INIT_LIST_HEAD(&unbuddied[i]); } - INIT_LIST_HEAD(&pool->lru); INIT_LIST_HEAD(&pool->stale); atomic64_set(&pool->pages_nr, 0); pool->name = name; @@ -1073,12 +1064,6 @@ found: headless: spin_lock(&pool->lock); - /* Add/move z3fold page to beginning of LRU */ - if (!list_empty(&page->lru)) - list_del(&page->lru); - - list_add(&page->lru, &pool->lru); - *handle = encode_handle(zhdr, bud); spin_unlock(&pool->lock); if (bud != HEADLESS) @@ -1115,9 +1100,6 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) * immediately so we don't care about its value any more. */ if (!page_claimed) { - spin_lock(&pool->lock); - list_del(&page->lru); - spin_unlock(&pool->lock); put_z3fold_header(zhdr); free_z3fold_page(page, true); atomic64_dec(&pool->pages_nr); @@ -1173,194 +1155,6 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) } /** - * z3fold_reclaim_page() - evicts allocations from a pool page and frees it - * @pool: pool from which a page will attempt to be evicted - * @retries: number of pages on the LRU list for which eviction will - * be attempted before failing - * - * z3fold reclaim is different from normal system reclaim in that it is done - * from the bottom, up. This is because only the bottom layer, z3fold, has - * information on how the allocations are organized within each z3fold page. - * This has the potential to create interesting locking situations between - * z3fold and the user, however. - * - * To avoid these, this is how z3fold_reclaim_page() should be called: - * - * The user detects a page should be reclaimed and calls z3fold_reclaim_page(). - * z3fold_reclaim_page() will remove a z3fold page from the pool LRU list and - * call the user-defined eviction handler with the pool and handle as - * arguments. - * - * If the handle can not be evicted, the eviction handler should return - * non-zero. z3fold_reclaim_page() will add the z3fold page back to the - * appropriate list and try the next z3fold page on the LRU up to - * a user defined number of retries. - * - * If the handle is successfully evicted, the eviction handler should - * return 0 _and_ should have called z3fold_free() on the handle. z3fold_free() - * contains logic to delay freeing the page if the page is under reclaim, - * as indicated by the setting of the PG_reclaim flag on the underlying page. - * - * If all buddies in the z3fold page are successfully evicted, then the - * z3fold page can be freed. - * - * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are - * no pages to evict or an eviction handler is not registered, -EAGAIN if - * the retry limit was hit. - */ -static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) -{ - int i, ret = -1; - struct z3fold_header *zhdr = NULL; - struct page *page = NULL; - struct list_head *pos; - unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; - struct z3fold_buddy_slots slots __attribute__((aligned(SLOTS_ALIGN))); - - rwlock_init(&slots.lock); - slots.pool = (unsigned long)pool | (1 << HANDLES_NOFREE); - - spin_lock(&pool->lock); - for (i = 0; i < retries; i++) { - if (list_empty(&pool->lru)) { - spin_unlock(&pool->lock); - return -EINVAL; - } - list_for_each_prev(pos, &pool->lru) { - page = list_entry(pos, struct page, lru); - - zhdr = page_address(page); - if (test_bit(PAGE_HEADLESS, &page->private)) { - /* - * For non-headless pages, we wait to do this - * until we have the page lock to avoid racing - * with __z3fold_alloc(). Headless pages don't - * have a lock (and __z3fold_alloc() will never - * see them), but we still need to test and set - * PAGE_CLAIMED to avoid racing with - * z3fold_free(), so just do it now before - * leaving the loop. - */ - if (test_and_set_bit(PAGE_CLAIMED, &page->private)) - continue; - - break; - } - - if (!z3fold_page_trylock(zhdr)) { - zhdr = NULL; - continue; /* can't evict at this point */ - } - - /* test_and_set_bit is of course atomic, but we still - * need to do it under page lock, otherwise checking - * that bit in __z3fold_alloc wouldn't make sense - */ - if (zhdr->foreign_handles || - test_and_set_bit(PAGE_CLAIMED, &page->private)) { - z3fold_page_unlock(zhdr); - zhdr = NULL; - continue; /* can't evict such page */ - } - list_del_init(&zhdr->buddy); - zhdr->cpu = -1; - /* See comment in __z3fold_alloc. */ - kref_get(&zhdr->refcount); - break; - } - - if (!zhdr) - break; - - list_del_init(&page->lru); - spin_unlock(&pool->lock); - - if (!test_bit(PAGE_HEADLESS, &page->private)) { - /* - * We need encode the handles before unlocking, and - * use our local slots structure because z3fold_free - * can zero out zhdr->slots and we can't do much - * about that - */ - first_handle = 0; - last_handle = 0; - middle_handle = 0; - memset(slots.slot, 0, sizeof(slots.slot)); - if (zhdr->first_chunks) - first_handle = __encode_handle(zhdr, &slots, - FIRST); - if (zhdr->middle_chunks) - middle_handle = __encode_handle(zhdr, &slots, - MIDDLE); - if (zhdr->last_chunks) - last_handle = __encode_handle(zhdr, &slots, - LAST); - /* - * it's safe to unlock here because we hold a - * reference to this page - */ - z3fold_page_unlock(zhdr); - } else { - first_handle = encode_handle(zhdr, HEADLESS); - last_handle = middle_handle = 0; - } - /* Issue the eviction callback(s) */ - if (middle_handle) { - ret = pool->zpool_ops->evict(pool->zpool, middle_handle); - if (ret) - goto next; - } - if (first_handle) { - ret = pool->zpool_ops->evict(pool->zpool, first_handle); - if (ret) - goto next; - } - if (last_handle) { - ret = pool->zpool_ops->evict(pool->zpool, last_handle); - if (ret) - goto next; - } -next: - if (test_bit(PAGE_HEADLESS, &page->private)) { - if (ret == 0) { - free_z3fold_page(page, true); - atomic64_dec(&pool->pages_nr); - return 0; - } - spin_lock(&pool->lock); - list_add(&page->lru, &pool->lru); - spin_unlock(&pool->lock); - clear_bit(PAGE_CLAIMED, &page->private); - } else { - struct z3fold_buddy_slots *slots = zhdr->slots; - z3fold_page_lock(zhdr); - if (kref_put(&zhdr->refcount, - release_z3fold_page_locked)) { - kmem_cache_free(pool->c_handle, slots); - return 0; - } - /* - * if we are here, the page is still not completely - * free. Take the global pool lock then to be able - * to add it back to the lru list - */ - spin_lock(&pool->lock); - list_add(&page->lru, &pool->lru); - spin_unlock(&pool->lock); - if (list_empty(&zhdr->buddy)) - add_to_unbuddied(pool, zhdr); - clear_bit(PAGE_CLAIMED, &page->private); - z3fold_page_unlock(zhdr); - } - - /* We started off locked to we need to lock the pool back */ - spin_lock(&pool->lock); - } - spin_unlock(&pool->lock); - return -EAGAIN; -} - -/** * z3fold_map() - maps the allocation associated with the given handle * @pool: pool in which the allocation resides * @handle: handle associated with the allocation to be mapped @@ -1470,8 +1264,6 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) spin_lock(&pool->lock); if (!list_empty(&zhdr->buddy)) list_del_init(&zhdr->buddy); - if (!list_empty(&page->lru)) - list_del_init(&page->lru); spin_unlock(&pool->lock); kref_get(&zhdr->refcount); @@ -1531,9 +1323,6 @@ static int z3fold_page_migrate(struct page *newpage, struct page *page, encode_handle(new_zhdr, MIDDLE); set_bit(NEEDS_COMPACTING, &newpage->private); new_zhdr->cpu = smp_processor_id(); - spin_lock(&pool->lock); - list_add(&newpage->lru, &pool->lru); - spin_unlock(&pool->lock); __SetPageMovable(newpage, &z3fold_mops); z3fold_page_unlock(new_zhdr); @@ -1559,9 +1348,6 @@ static void z3fold_page_putback(struct page *page) INIT_LIST_HEAD(&page->lru); if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) return; - spin_lock(&pool->lock); - list_add(&page->lru, &pool->lru); - spin_unlock(&pool->lock); if (list_empty(&zhdr->buddy)) add_to_unbuddied(pool, zhdr); clear_bit(PAGE_CLAIMED, &page->private); @@ -1578,18 +1364,9 @@ static const struct movable_operations z3fold_mops = { * zpool ****************/ -static void *z3fold_zpool_create(const char *name, gfp_t gfp, - const struct zpool_ops *zpool_ops, - struct zpool *zpool) +static void *z3fold_zpool_create(const char *name, gfp_t gfp) { - struct z3fold_pool *pool; - - pool = z3fold_create_pool(name, gfp); - if (pool) { - pool->zpool = zpool; - pool->zpool_ops = zpool_ops; - } - return pool; + return z3fold_create_pool(name, gfp); } static void z3fold_zpool_destroy(void *pool) @@ -1607,25 +1384,6 @@ static void z3fold_zpool_free(void *pool, unsigned long handle) z3fold_free(pool, handle); } -static int z3fold_zpool_shrink(void *pool, unsigned int pages, - unsigned int *reclaimed) -{ - unsigned int total = 0; - int ret = -EINVAL; - - while (total < pages) { - ret = z3fold_reclaim_page(pool, 8); - if (ret < 0) - break; - total++; - } - - if (reclaimed) - *reclaimed = total; - - return ret; -} - static void *z3fold_zpool_map(void *pool, unsigned long handle, enum zpool_mapmode mm) { @@ -1649,7 +1407,6 @@ static struct zpool_driver z3fold_zpool_driver = { .destroy = z3fold_zpool_destroy, .malloc = z3fold_zpool_malloc, .free = z3fold_zpool_free, - .shrink = z3fold_zpool_shrink, .map = z3fold_zpool_map, .unmap = z3fold_zpool_unmap, .total_size = z3fold_zpool_total_size, diff --git a/mm/zbud.c b/mm/zbud.c index 3acd26193920..2190cc1f37b3 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -83,11 +83,7 @@ struct zbud_pool; * its free region. * @buddied: list tracking the zbud pages that contain two buddies; * these zbud pages are full - * @lru: list tracking the zbud pages in LRU order by most recently - * added buddy. * @pages_nr: number of zbud pages in the pool. - * @zpool: zpool driver - * @zpool_ops: zpool operations structure with an evict callback * * This structure is allocated at pool creation time and maintains metadata * pertaining to a particular zbud pool. @@ -102,26 +98,20 @@ struct zbud_pool { struct list_head buddied; struct list_head unbuddied[NCHUNKS]; }; - struct list_head lru; u64 pages_nr; - struct zpool *zpool; - const struct zpool_ops *zpool_ops; }; /* * struct zbud_header - zbud page metadata occupying the first chunk of each * zbud page. * @buddy: links the zbud page into the unbuddied/buddied lists in the pool - * @lru: links the zbud page into the lru list in the pool * @first_chunks: the size of the first buddy in chunks, 0 if free * @last_chunks: the size of the last buddy in chunks, 0 if free */ struct zbud_header { struct list_head buddy; - struct list_head lru; unsigned int first_chunks; unsigned int last_chunks; - bool under_reclaim; }; /***************** @@ -149,8 +139,6 @@ static struct zbud_header *init_zbud_page(struct page *page) zhdr->first_chunks = 0; zhdr->last_chunks = 0; INIT_LIST_HEAD(&zhdr->buddy); - INIT_LIST_HEAD(&zhdr->lru); - zhdr->under_reclaim = false; return zhdr; } @@ -221,7 +209,6 @@ static struct zbud_pool *zbud_create_pool(gfp_t gfp) for_each_unbuddied_list(i, 0) INIT_LIST_HEAD(&pool->unbuddied[i]); INIT_LIST_HEAD(&pool->buddied); - INIT_LIST_HEAD(&pool->lru); pool->pages_nr = 0; return pool; } @@ -310,11 +297,6 @@ found: list_add(&zhdr->buddy, &pool->buddied); } - /* Add/move zbud page to beginning of LRU */ - if (!list_empty(&zhdr->lru)) - list_del(&zhdr->lru); - list_add(&zhdr->lru, &pool->lru); - *handle = encode_handle(zhdr, bud); spin_unlock(&pool->lock); @@ -325,11 +307,6 @@ found: * zbud_free() - frees the allocation associated with the given handle * @pool: pool in which the allocation resided * @handle: handle associated with the allocation returned by zbud_alloc() - * - * In the case that the zbud page in which the allocation resides is under - * reclaim, as indicated by the PG_reclaim flag being set, this function - * only sets the first|last_chunks to 0. The page is actually freed - * once both buddies are evicted (see zbud_reclaim_page() below). */ static void zbud_free(struct zbud_pool *pool, unsigned long handle) { @@ -345,18 +322,11 @@ static void zbud_free(struct zbud_pool *pool, unsigned long handle) else zhdr->first_chunks = 0; - if (zhdr->under_reclaim) { - /* zbud page is under reclaim, reclaim will free */ - spin_unlock(&pool->lock); - return; - } - /* Remove from existing buddy list */ list_del(&zhdr->buddy); if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { /* zbud page is empty, free */ - list_del(&zhdr->lru); free_zbud_page(zhdr); pool->pages_nr--; } else { @@ -369,110 +339,6 @@ static void zbud_free(struct zbud_pool *pool, unsigned long handle) } /** - * zbud_reclaim_page() - evicts allocations from a pool page and frees it - * @pool: pool from which a page will attempt to be evicted - * @retries: number of pages on the LRU list for which eviction will - * be attempted before failing - * - * zbud reclaim is different from normal system reclaim in that the reclaim is - * done from the bottom, up. This is because only the bottom layer, zbud, has - * information on how the allocations are organized within each zbud page. This - * has the potential to create interesting locking situations between zbud and - * the user, however. - * - * To avoid these, this is how zbud_reclaim_page() should be called: - * - * The user detects a page should be reclaimed and calls zbud_reclaim_page(). - * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call - * the user-defined eviction handler with the pool and handle as arguments. - * - * If the handle can not be evicted, the eviction handler should return - * non-zero. zbud_reclaim_page() will add the zbud page back to the - * appropriate list and try the next zbud page on the LRU up to - * a user defined number of retries. - * - * If the handle is successfully evicted, the eviction handler should - * return 0 _and_ should have called zbud_free() on the handle. zbud_free() - * contains logic to delay freeing the page if the page is under reclaim, - * as indicated by the setting of the PG_reclaim flag on the underlying page. - * - * If all buddies in the zbud page are successfully evicted, then the - * zbud page can be freed. - * - * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are - * no pages to evict or an eviction handler is not registered, -EAGAIN if - * the retry limit was hit. - */ -static int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries) -{ - int i, ret, freechunks; - struct zbud_header *zhdr; - unsigned long first_handle = 0, last_handle = 0; - - spin_lock(&pool->lock); - if (list_empty(&pool->lru)) { - spin_unlock(&pool->lock); - return -EINVAL; - } - for (i = 0; i < retries; i++) { - zhdr = list_last_entry(&pool->lru, struct zbud_header, lru); - list_del(&zhdr->lru); - list_del(&zhdr->buddy); - /* Protect zbud page against free */ - zhdr->under_reclaim = true; - /* - * We need encode the handles before unlocking, since we can - * race with free that will set (first|last)_chunks to 0 - */ - first_handle = 0; - last_handle = 0; - if (zhdr->first_chunks) - first_handle = encode_handle(zhdr, FIRST); - if (zhdr->last_chunks) - last_handle = encode_handle(zhdr, LAST); - spin_unlock(&pool->lock); - - /* Issue the eviction callback(s) */ - if (first_handle) { - ret = pool->zpool_ops->evict(pool->zpool, first_handle); - if (ret) - goto next; - } - if (last_handle) { - ret = pool->zpool_ops->evict(pool->zpool, last_handle); - if (ret) - goto next; - } -next: - spin_lock(&pool->lock); - zhdr->under_reclaim = false; - if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { - /* - * Both buddies are now free, free the zbud page and - * return success. - */ - free_zbud_page(zhdr); - pool->pages_nr--; - spin_unlock(&pool->lock); - return 0; - } else if (zhdr->first_chunks == 0 || - zhdr->last_chunks == 0) { - /* add to unbuddied list */ - freechunks = num_free_chunks(zhdr); - list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); - } else { - /* add to buddied list */ - list_add(&zhdr->buddy, &pool->buddied); - } - - /* add to beginning of LRU */ - list_add(&zhdr->lru, &pool->lru); - } - spin_unlock(&pool->lock); - return -EAGAIN; -} - -/** * zbud_map() - maps the allocation associated with the given handle * @pool: pool in which the allocation resides * @handle: handle associated with the allocation to be mapped @@ -514,18 +380,9 @@ static u64 zbud_get_pool_size(struct zbud_pool *pool) * zpool ****************/ -static void *zbud_zpool_create(const char *name, gfp_t gfp, - const struct zpool_ops *zpool_ops, - struct zpool *zpool) +static void *zbud_zpool_create(const char *name, gfp_t gfp) { - struct zbud_pool *pool; - - pool = zbud_create_pool(gfp); - if (pool) { - pool->zpool = zpool; - pool->zpool_ops = zpool_ops; - } - return pool; + return zbud_create_pool(gfp); } static void zbud_zpool_destroy(void *pool) @@ -543,25 +400,6 @@ static void zbud_zpool_free(void *pool, unsigned long handle) zbud_free(pool, handle); } -static int zbud_zpool_shrink(void *pool, unsigned int pages, - unsigned int *reclaimed) -{ - unsigned int total = 0; - int ret = -EINVAL; - - while (total < pages) { - ret = zbud_reclaim_page(pool, 8); - if (ret < 0) - break; - total++; - } - - if (reclaimed) - *reclaimed = total; - - return ret; -} - static void *zbud_zpool_map(void *pool, unsigned long handle, enum zpool_mapmode mm) { @@ -585,7 +423,6 @@ static struct zpool_driver zbud_zpool_driver = { .destroy = zbud_zpool_destroy, .malloc = zbud_zpool_malloc, .free = zbud_zpool_free, - .shrink = zbud_zpool_shrink, .map = zbud_zpool_map, .unmap = zbud_zpool_unmap, .total_size = zbud_zpool_total_size, diff --git a/mm/zpool.c b/mm/zpool.c index 6a19c4a58f77..846410479c2f 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -133,7 +133,6 @@ EXPORT_SYMBOL(zpool_has_pool); * @type: The type of the zpool to create (e.g. zbud, zsmalloc) * @name: The name of the zpool (e.g. zram0, zswap) * @gfp: The GFP flags to use when allocating the pool. - * @ops: The optional ops callback. * * This creates a new zpool of the specified type. The gfp flags will be * used when allocating memory, if the implementation supports it. If the @@ -145,8 +144,7 @@ EXPORT_SYMBOL(zpool_has_pool); * * Returns: New zpool on success, NULL on failure. */ -struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, - const struct zpool_ops *ops) +struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp) { struct zpool_driver *driver; struct zpool *zpool; @@ -173,7 +171,7 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, } zpool->driver = driver; - zpool->pool = driver->create(name, gfp, ops, zpool); + zpool->pool = driver->create(name, gfp); if (!zpool->pool) { pr_err("couldn't create %s pool\n", type); @@ -280,30 +278,6 @@ void zpool_free(struct zpool *zpool, unsigned long handle) } /** - * zpool_shrink() - Shrink the pool size - * @zpool: The zpool to shrink. - * @pages: The number of pages to shrink the pool. - * @reclaimed: The number of pages successfully evicted. - * - * This attempts to shrink the actual memory size of the pool - * by evicting currently used handle(s). If the pool was - * created with no zpool_ops, or the evict call fails for any - * of the handles, this will fail. If non-NULL, the @reclaimed - * parameter will be set to the number of pages reclaimed, - * which may be more than the number of pages requested. - * - * Implementations must guarantee this to be thread-safe. - * - * Returns: 0 on success, negative value on error/failure. - */ -int zpool_shrink(struct zpool *zpool, unsigned int pages, - unsigned int *reclaimed) -{ - return zpool->driver->shrink ? - zpool->driver->shrink(zpool->pool, pages, reclaimed) : -EINVAL; -} - -/** * zpool_map_handle() - Map a previously allocated handle into memory * @zpool: The zpool that the handle was allocated from * @handle: The handle to map @@ -360,24 +334,6 @@ u64 zpool_get_total_size(struct zpool *zpool) } /** - * zpool_evictable() - Test if zpool is potentially evictable - * @zpool: The zpool to test - * - * Zpool is only potentially evictable when it's created with struct - * zpool_ops.evict and its driver implements struct zpool_driver.shrink. - * - * However, it doesn't necessarily mean driver will use zpool_ops.evict - * in its implementation of zpool_driver.shrink. It could do internal - * defragmentation instead. - * - * Returns: true if potentially evictable; false otherwise. - */ -bool zpool_evictable(struct zpool *zpool) -{ - return zpool->driver->shrink; -} - -/** * zpool_can_sleep_mapped - Test if zpool can sleep when do mapped. * @zpool: The zpool to test * diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 02f7f414aade..3f057970504e 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -107,21 +107,8 @@ */ #define OBJ_ALLOCATED_TAG 1 -#ifdef CONFIG_ZPOOL -/* - * The second least-significant bit in the object's header identifies if the - * value stored at the header is a deferred handle from the last reclaim - * attempt. - * - * As noted above, this is valid because we have room for two bits. - */ -#define OBJ_DEFERRED_HANDLE_TAG 2 -#define OBJ_TAG_BITS 2 -#define OBJ_TAG_MASK (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG) -#else #define OBJ_TAG_BITS 1 #define OBJ_TAG_MASK OBJ_ALLOCATED_TAG -#endif /* CONFIG_ZPOOL */ #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) @@ -227,12 +214,6 @@ struct link_free { * Handle of allocated object. */ unsigned long handle; -#ifdef CONFIG_ZPOOL - /* - * Deferred handle of a reclaimed object. - */ - unsigned long deferred_handle; -#endif }; }; @@ -250,13 +231,6 @@ struct zs_pool { /* Compact classes */ struct shrinker shrinker; -#ifdef CONFIG_ZPOOL - /* List tracking the zspages in LRU order by most recently added object */ - struct list_head lru; - struct zpool *zpool; - const struct zpool_ops *zpool_ops; -#endif - #ifdef CONFIG_ZSMALLOC_STAT struct dentry *stat_dentry; #endif @@ -279,13 +253,6 @@ struct zspage { unsigned int freeobj; struct page *first_page; struct list_head list; /* fullness list */ - -#ifdef CONFIG_ZPOOL - /* links the zspage to the lru list in the pool */ - struct list_head lru; - bool under_reclaim; -#endif - struct zs_pool *pool; rwlock_t lock; }; @@ -384,23 +351,14 @@ static void record_obj(unsigned long handle, unsigned long obj) #ifdef CONFIG_ZPOOL -static void *zs_zpool_create(const char *name, gfp_t gfp, - const struct zpool_ops *zpool_ops, - struct zpool *zpool) +static void *zs_zpool_create(const char *name, gfp_t gfp) { /* * Ignore global gfp flags: zs_malloc() may be invoked from * different contexts and its caller must provide a valid * gfp mask. */ - struct zs_pool *pool = zs_create_pool(name); - - if (pool) { - pool->zpool = zpool; - pool->zpool_ops = zpool_ops; - } - - return pool; + return zs_create_pool(name); } static void zs_zpool_destroy(void *pool) @@ -422,27 +380,6 @@ static void zs_zpool_free(void *pool, unsigned long handle) zs_free(pool, handle); } -static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries); - -static int zs_zpool_shrink(void *pool, unsigned int pages, - unsigned int *reclaimed) -{ - unsigned int total = 0; - int ret = -EINVAL; - - while (total < pages) { - ret = zs_reclaim_page(pool, 8); - if (ret < 0) - break; - total++; - } - - if (reclaimed) - *reclaimed = total; - - return ret; -} - static void *zs_zpool_map(void *pool, unsigned long handle, enum zpool_mapmode mm) { @@ -481,7 +418,6 @@ static struct zpool_driver zs_zpool_driver = { .malloc_support_movable = true, .malloc = zs_zpool_malloc, .free = zs_zpool_free, - .shrink = zs_zpool_shrink, .map = zs_zpool_map, .unmap = zs_zpool_unmap, .total_size = zs_zpool_total_size, @@ -884,14 +820,6 @@ static inline bool obj_allocated(struct page *page, void *obj, unsigned long *ph return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG); } -#ifdef CONFIG_ZPOOL -static bool obj_stores_deferred_handle(struct page *page, void *obj, - unsigned long *phandle) -{ - return obj_tagged(page, obj, phandle, OBJ_DEFERRED_HANDLE_TAG); -} -#endif - static void reset_page(struct page *page) { __ClearPageMovable(page); @@ -922,39 +850,6 @@ unlock: return 0; } -#ifdef CONFIG_ZPOOL -static unsigned long find_deferred_handle_obj(struct size_class *class, - struct page *page, int *obj_idx); - -/* - * Free all the deferred handles whose objects are freed in zs_free. - */ -static void free_handles(struct zs_pool *pool, struct size_class *class, - struct zspage *zspage) -{ - int obj_idx = 0; - struct page *page = get_first_page(zspage); - unsigned long handle; - - while (1) { - handle = find_deferred_handle_obj(class, page, &obj_idx); - if (!handle) { - page = get_next_page(page); - if (!page) - break; - obj_idx = 0; - continue; - } - - cache_free_handle(pool, handle); - obj_idx++; - } -} -#else -static inline void free_handles(struct zs_pool *pool, struct size_class *class, - struct zspage *zspage) {} -#endif - static void __free_zspage(struct zs_pool *pool, struct size_class *class, struct zspage *zspage) { @@ -969,9 +864,6 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, VM_BUG_ON(get_zspage_inuse(zspage)); VM_BUG_ON(fg != ZS_INUSE_RATIO_0); - /* Free all deferred handles from zs_free */ - free_handles(pool, class, zspage); - next = page = get_first_page(zspage); do { VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -1006,9 +898,6 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class, } remove_zspage(class, zspage, ZS_INUSE_RATIO_0); -#ifdef CONFIG_ZPOOL - list_del(&zspage->lru); -#endif __free_zspage(pool, class, zspage); } @@ -1054,11 +943,6 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) off %= PAGE_SIZE; } -#ifdef CONFIG_ZPOOL - INIT_LIST_HEAD(&zspage->lru); - zspage->under_reclaim = false; -#endif - set_freeobj(zspage, 0); } @@ -1341,7 +1225,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, spin_unlock(&pool->lock); class = zspage_class(pool, zspage); - off = (class->size * obj_idx) & ~PAGE_MASK; + off = offset_in_page(class->size * obj_idx); local_lock(&zs_map_area.lock); area = this_cpu_ptr(&zs_map_area); @@ -1381,7 +1265,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) obj_to_location(obj, &page, &obj_idx); zspage = get_zspage(page); class = zspage_class(pool, zspage); - off = (class->size * obj_idx) & ~PAGE_MASK; + off = offset_in_page(class->size * obj_idx); area = this_cpu_ptr(&zs_map_area); if (off + class->size <= PAGE_SIZE) @@ -1438,7 +1322,7 @@ static unsigned long obj_malloc(struct zs_pool *pool, offset = obj * class->size; nr_page = offset >> PAGE_SHIFT; - m_offset = offset & ~PAGE_MASK; + m_offset = offset_in_page(offset); m_page = get_first_page(zspage); for (i = 0; i < nr_page; i++) @@ -1525,20 +1409,13 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) /* We completely set up zspage so mark them as movable */ SetZsPageMovable(pool, zspage); out: -#ifdef CONFIG_ZPOOL - /* Add/move zspage to beginning of LRU */ - if (!list_empty(&zspage->lru)) - list_del(&zspage->lru); - list_add(&zspage->lru, &pool->lru); -#endif - spin_unlock(&pool->lock); return handle; } EXPORT_SYMBOL_GPL(zs_malloc); -static void obj_free(int class_size, unsigned long obj, unsigned long *handle) +static void obj_free(int class_size, unsigned long obj) { struct link_free *link; struct zspage *zspage; @@ -1548,31 +1425,18 @@ static void obj_free(int class_size, unsigned long obj, unsigned long *handle) void *vaddr; obj_to_location(obj, &f_page, &f_objidx); - f_offset = (class_size * f_objidx) & ~PAGE_MASK; + f_offset = offset_in_page(class_size * f_objidx); zspage = get_zspage(f_page); vaddr = kmap_atomic(f_page); link = (struct link_free *)(vaddr + f_offset); - if (handle) { -#ifdef CONFIG_ZPOOL - /* Stores the (deferred) handle in the object's header */ - *handle |= OBJ_DEFERRED_HANDLE_TAG; - *handle &= ~OBJ_ALLOCATED_TAG; - - if (likely(!ZsHugePage(zspage))) - link->deferred_handle = *handle; - else - f_page->index = *handle; -#endif - } else { - /* Insert this object in containing zspage's freelist */ - if (likely(!ZsHugePage(zspage))) - link->next = get_freeobj(zspage) << OBJ_TAG_BITS; - else - f_page->index = 0; - set_freeobj(zspage, f_objidx); - } + /* Insert this object in containing zspage's freelist */ + if (likely(!ZsHugePage(zspage))) + link->next = get_freeobj(zspage) << OBJ_TAG_BITS; + else + f_page->index = 0; + set_freeobj(zspage, f_objidx); kunmap_atomic(vaddr); mod_zspage_inuse(zspage, -1); @@ -1600,21 +1464,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle) class = zspage_class(pool, zspage); class_stat_dec(class, ZS_OBJS_INUSE, 1); - -#ifdef CONFIG_ZPOOL - if (zspage->under_reclaim) { - /* - * Reclaim needs the handles during writeback. It'll free - * them along with the zspage when it's done with them. - * - * Record current deferred handle in the object's header. - */ - obj_free(class->size, obj, &handle); - spin_unlock(&pool->lock); - return; - } -#endif - obj_free(class->size, obj, NULL); + obj_free(class->size, obj); fullness = fix_fullness_group(class, zspage); if (fullness == ZS_INUSE_RATIO_0) @@ -1640,8 +1490,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, obj_to_location(src, &s_page, &s_objidx); obj_to_location(dst, &d_page, &d_objidx); - s_off = (class->size * s_objidx) & ~PAGE_MASK; - d_off = (class->size * d_objidx) & ~PAGE_MASK; + s_off = offset_in_page(class->size * s_objidx); + d_off = offset_in_page(class->size * d_objidx); if (s_off + class->size > PAGE_SIZE) s_size = PAGE_SIZE - s_off; @@ -1735,18 +1585,6 @@ static unsigned long find_alloced_obj(struct size_class *class, return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG); } -#ifdef CONFIG_ZPOOL -/* - * Find object storing a deferred handle in header in zspage from index object - * and return handle. - */ -static unsigned long find_deferred_handle_obj(struct size_class *class, - struct page *page, int *obj_idx) -{ - return find_tagged_obj(class, page, obj_idx, OBJ_DEFERRED_HANDLE_TAG); -} -#endif - struct zs_compact_control { /* Source spage for migration which could be a subpage of zspage */ struct page *s_page; @@ -1786,7 +1624,7 @@ static void migrate_zspage(struct zs_pool *pool, struct size_class *class, zs_object_copy(class, free_obj, used_obj); obj_idx++; record_obj(handle, free_obj); - obj_free(class->size, used_obj, NULL); + obj_free(class->size, used_obj); } /* Remember last position in this iteration */ @@ -1846,7 +1684,7 @@ static int putback_zspage(struct size_class *class, struct zspage *zspage) return fullness; } -#if defined(CONFIG_ZPOOL) || defined(CONFIG_COMPACTION) +#ifdef CONFIG_COMPACTION /* * To prevent zspage destroy during migration, zspage freeing should * hold locks of all pages in the zspage. @@ -1888,24 +1726,7 @@ static void lock_zspage(struct zspage *zspage) } migrate_read_unlock(zspage); } -#endif /* defined(CONFIG_ZPOOL) || defined(CONFIG_COMPACTION) */ - -#ifdef CONFIG_ZPOOL -/* - * Unlocks all the pages of the zspage. - * - * pool->lock must be held before this function is called - * to prevent the underlying pages from migrating. - */ -static void unlock_zspage(struct zspage *zspage) -{ - struct page *page = get_first_page(zspage); - - do { - unlock_page(page); - } while ((page = get_next_page(page)) != NULL); -} -#endif /* CONFIG_ZPOOL */ +#endif /* CONFIG_COMPACTION */ static void migrate_lock_init(struct zspage *zspage) { @@ -2126,9 +1947,6 @@ static void async_free_zspage(struct work_struct *work) VM_BUG_ON(fullness != ZS_INUSE_RATIO_0); class = pool->size_class[class_idx]; spin_lock(&pool->lock); -#ifdef CONFIG_ZPOOL - list_del(&zspage->lru); -#endif __free_zspage(pool, class, zspage); spin_unlock(&pool->lock); } @@ -2474,10 +2292,6 @@ struct zs_pool *zs_create_pool(const char *name) */ zs_register_shrinker(pool); -#ifdef CONFIG_ZPOOL - INIT_LIST_HEAD(&pool->lru); -#endif - return pool; err: @@ -2520,190 +2334,6 @@ void zs_destroy_pool(struct zs_pool *pool) } EXPORT_SYMBOL_GPL(zs_destroy_pool); -#ifdef CONFIG_ZPOOL -static void restore_freelist(struct zs_pool *pool, struct size_class *class, - struct zspage *zspage) -{ - unsigned int obj_idx = 0; - unsigned long handle, off = 0; /* off is within-page offset */ - struct page *page = get_first_page(zspage); - struct link_free *prev_free = NULL; - void *prev_page_vaddr = NULL; - - /* in case no free object found */ - set_freeobj(zspage, (unsigned int)(-1UL)); - - while (page) { - void *vaddr = kmap_atomic(page); - struct page *next_page; - - while (off < PAGE_SIZE) { - void *obj_addr = vaddr + off; - - /* skip allocated object */ - if (obj_allocated(page, obj_addr, &handle)) { - obj_idx++; - off += class->size; - continue; - } - - /* free deferred handle from reclaim attempt */ - if (obj_stores_deferred_handle(page, obj_addr, &handle)) - cache_free_handle(pool, handle); - - if (prev_free) - prev_free->next = obj_idx << OBJ_TAG_BITS; - else /* first free object found */ - set_freeobj(zspage, obj_idx); - - prev_free = (struct link_free *)vaddr + off / sizeof(*prev_free); - /* if last free object in a previous page, need to unmap */ - if (prev_page_vaddr) { - kunmap_atomic(prev_page_vaddr); - prev_page_vaddr = NULL; - } - - obj_idx++; - off += class->size; - } - - /* - * Handle the last (full or partial) object on this page. - */ - next_page = get_next_page(page); - if (next_page) { - if (!prev_free || prev_page_vaddr) { - /* - * There is no free object in this page, so we can safely - * unmap it. - */ - kunmap_atomic(vaddr); - } else { - /* update prev_page_vaddr since prev_free is on this page */ - prev_page_vaddr = vaddr; - } - } else { /* this is the last page */ - if (prev_free) { - /* - * Reset OBJ_TAG_BITS bit to last link to tell - * whether it's allocated object or not. - */ - prev_free->next = -1UL << OBJ_TAG_BITS; - } - - /* unmap previous page (if not done yet) */ - if (prev_page_vaddr) { - kunmap_atomic(prev_page_vaddr); - prev_page_vaddr = NULL; - } - - kunmap_atomic(vaddr); - } - - page = next_page; - off %= PAGE_SIZE; - } -} - -static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries) -{ - int i, obj_idx, ret = 0; - unsigned long handle; - struct zspage *zspage; - struct page *page; - int fullness; - - /* Lock LRU and fullness list */ - spin_lock(&pool->lock); - if (list_empty(&pool->lru)) { - spin_unlock(&pool->lock); - return -EINVAL; - } - - for (i = 0; i < retries; i++) { - struct size_class *class; - - zspage = list_last_entry(&pool->lru, struct zspage, lru); - list_del(&zspage->lru); - - /* zs_free may free objects, but not the zspage and handles */ - zspage->under_reclaim = true; - - class = zspage_class(pool, zspage); - fullness = get_fullness_group(class, zspage); - - /* Lock out object allocations and object compaction */ - remove_zspage(class, zspage, fullness); - - spin_unlock(&pool->lock); - cond_resched(); - - /* Lock backing pages into place */ - lock_zspage(zspage); - - obj_idx = 0; - page = get_first_page(zspage); - while (1) { - handle = find_alloced_obj(class, page, &obj_idx); - if (!handle) { - page = get_next_page(page); - if (!page) - break; - obj_idx = 0; - continue; - } - - /* - * This will write the object and call zs_free. - * - * zs_free will free the object, but the - * under_reclaim flag prevents it from freeing - * the zspage altogether. This is necessary so - * that we can continue working with the - * zspage potentially after the last object - * has been freed. - */ - ret = pool->zpool_ops->evict(pool->zpool, handle); - if (ret) - goto next; - - obj_idx++; - } - -next: - /* For freeing the zspage, or putting it back in the pool and LRU list. */ - spin_lock(&pool->lock); - zspage->under_reclaim = false; - - if (!get_zspage_inuse(zspage)) { - /* - * Fullness went stale as zs_free() won't touch it - * while the page is removed from the pool. Fix it - * up for the check in __free_zspage(). - */ - zspage->fullness = ZS_INUSE_RATIO_0; - - __free_zspage(pool, class, zspage); - spin_unlock(&pool->lock); - return 0; - } - - /* - * Eviction fails on one of the handles, so we need to restore zspage. - * We need to rebuild its freelist (and free stored deferred handles), - * put it back to the correct size class, and add it to the LRU list. - */ - restore_freelist(pool, class, zspage); - putback_zspage(class, zspage); - list_add(&zspage->lru, &pool->lru); - unlock_zspage(zspage); - } - - spin_unlock(&pool->lock); - return -EAGAIN; -} -#endif /* CONFIG_ZPOOL */ - static int __init zs_init(void) { int ret; diff --git a/mm/zswap.c b/mm/zswap.c index 30092d9a3b23..62195f72bf56 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -37,6 +37,7 @@ #include <linux/workqueue.h> #include "swap.h" +#include "internal.h" /********************************* * statistics @@ -137,6 +138,10 @@ static bool zswap_non_same_filled_pages_enabled = true; module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled, bool, 0644); +static bool zswap_exclusive_loads_enabled = IS_ENABLED( + CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON); +module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644); + /********************************* * data structures **********************************/ @@ -149,6 +154,12 @@ struct crypto_acomp_ctx { struct mutex *mutex; }; +/* + * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock. + * The only case where lru_lock is not acquired while holding tree.lock is + * when a zswap_entry is taken off the lru for writeback, in that case it + * needs to be verified that it's still valid in the tree. + */ struct zswap_pool { struct zpool *zpool; struct crypto_acomp_ctx __percpu *acomp_ctx; @@ -158,6 +169,8 @@ struct zswap_pool { struct work_struct shrink_work; struct hlist_node node; char tfm_name[CRYPTO_MAX_ALG_NAME]; + struct list_head lru; + spinlock_t lru_lock; }; /* @@ -175,14 +188,16 @@ struct zswap_pool { * be held while changing the refcount. Since the lock must * be held, there is no reason to also make refcount atomic. * length - the length in bytes of the compressed page data. Needed during - * decompression. For a same value filled page length is 0. + * decompression. For a same value filled page length is 0, and both + * pool and lru are invalid and must be ignored. * pool - the zswap_pool the entry's data is in * handle - zpool allocation handle that stores the compressed page data * value - value of the same-value filled pages which have same content + * lru - handle to the pool's lru used to evict pages. */ struct zswap_entry { struct rb_node rbnode; - pgoff_t offset; + swp_entry_t swpentry; int refcount; unsigned int length; struct zswap_pool *pool; @@ -191,10 +206,7 @@ struct zswap_entry { unsigned long value; }; struct obj_cgroup *objcg; -}; - -struct zswap_header { - swp_entry_t swpentry; + struct list_head lru; }; /* @@ -238,14 +250,11 @@ static bool zswap_has_pool; pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ zpool_get_type((p)->zpool)) -static int zswap_writeback_entry(struct zpool *pool, unsigned long handle); +static int zswap_writeback_entry(struct zswap_entry *entry, + struct zswap_tree *tree); static int zswap_pool_get(struct zswap_pool *pool); static void zswap_pool_put(struct zswap_pool *pool); -static const struct zpool_ops zswap_zpool_ops = { - .evict = zswap_writeback_entry -}; - static bool zswap_is_full(void) { return totalram_pages() * zswap_max_pool_percent / 100 < @@ -302,12 +311,14 @@ static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) { struct rb_node *node = root->rb_node; struct zswap_entry *entry; + pgoff_t entry_offset; while (node) { entry = rb_entry(node, struct zswap_entry, rbnode); - if (entry->offset > offset) + entry_offset = swp_offset(entry->swpentry); + if (entry_offset > offset) node = node->rb_left; - else if (entry->offset < offset) + else if (entry_offset < offset) node = node->rb_right; else return entry; @@ -324,13 +335,15 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, { struct rb_node **link = &root->rb_node, *parent = NULL; struct zswap_entry *myentry; + pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry); while (*link) { parent = *link; myentry = rb_entry(parent, struct zswap_entry, rbnode); - if (myentry->offset > entry->offset) + myentry_offset = swp_offset(myentry->swpentry); + if (myentry_offset > entry_offset) link = &(*link)->rb_left; - else if (myentry->offset < entry->offset) + else if (myentry_offset < entry_offset) link = &(*link)->rb_right; else { *dupentry = myentry; @@ -342,12 +355,14 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, return 0; } -static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) +static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) { if (!RB_EMPTY_NODE(&entry->rbnode)) { rb_erase(&entry->rbnode, root); RB_CLEAR_NODE(&entry->rbnode); + return true; } + return false; } /* @@ -363,6 +378,9 @@ static void zswap_free_entry(struct zswap_entry *entry) if (!entry->length) atomic_dec(&zswap_same_filled_pages); else { + spin_lock(&entry->pool->lru_lock); + list_del(&entry->lru); + spin_unlock(&entry->pool->lru_lock); zpool_free(entry->pool->zpool, entry->handle); zswap_pool_put(entry->pool); } @@ -583,13 +601,95 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) return NULL; } +/* + * If the entry is still valid in the tree, drop the initial ref and remove it + * from the tree. This function must be called with an additional ref held, + * otherwise it may race with another invalidation freeing the entry. + */ +static void zswap_invalidate_entry(struct zswap_tree *tree, + struct zswap_entry *entry) +{ + if (zswap_rb_erase(&tree->rbroot, entry)) + zswap_entry_put(tree, entry); +} + +static int zswap_reclaim_entry(struct zswap_pool *pool) +{ + struct zswap_entry *entry; + struct zswap_tree *tree; + pgoff_t swpoffset; + int ret; + + /* Get an entry off the LRU */ + spin_lock(&pool->lru_lock); + if (list_empty(&pool->lru)) { + spin_unlock(&pool->lru_lock); + return -EINVAL; + } + entry = list_last_entry(&pool->lru, struct zswap_entry, lru); + list_del_init(&entry->lru); + /* + * Once the lru lock is dropped, the entry might get freed. The + * swpoffset is copied to the stack, and entry isn't deref'd again + * until the entry is verified to still be alive in the tree. + */ + swpoffset = swp_offset(entry->swpentry); + tree = zswap_trees[swp_type(entry->swpentry)]; + spin_unlock(&pool->lru_lock); + + /* Check for invalidate() race */ + spin_lock(&tree->lock); + if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) { + ret = -EAGAIN; + goto unlock; + } + /* Hold a reference to prevent a free during writeback */ + zswap_entry_get(entry); + spin_unlock(&tree->lock); + + ret = zswap_writeback_entry(entry, tree); + + spin_lock(&tree->lock); + if (ret) { + /* Writeback failed, put entry back on LRU */ + spin_lock(&pool->lru_lock); + list_move(&entry->lru, &pool->lru); + spin_unlock(&pool->lru_lock); + goto put_unlock; + } + + /* + * Writeback started successfully, the page now belongs to the + * swapcache. Drop the entry from zswap - unless invalidate already + * took it out while we had the tree->lock released for IO. + */ + zswap_invalidate_entry(tree, entry); + +put_unlock: + /* Drop local reference */ + zswap_entry_put(tree, entry); +unlock: + spin_unlock(&tree->lock); + return ret ? -EAGAIN : 0; +} + static void shrink_worker(struct work_struct *w) { struct zswap_pool *pool = container_of(w, typeof(*pool), shrink_work); + int ret, failures = 0; - if (zpool_shrink(pool->zpool, 1, NULL)) - zswap_reject_reclaim_fail++; + do { + ret = zswap_reclaim_entry(pool); + if (ret) { + zswap_reject_reclaim_fail++; + if (ret != -EAGAIN) + break; + if (++failures == MAX_RECLAIM_RETRIES) + break; + } + cond_resched(); + } while (!zswap_can_accept()); zswap_pool_put(pool); } @@ -618,7 +718,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) /* unique name for each pool specifically required by zsmalloc */ snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); - pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops); + pool->zpool = zpool_create_pool(type, name, gfp); if (!pool->zpool) { pr_err("%s zpool not available\n", type); goto error; @@ -644,6 +744,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) */ kref_init(&pool->kref); INIT_LIST_HEAD(&pool->list); + INIT_LIST_HEAD(&pool->lru); + spin_lock_init(&pool->lru_lock); INIT_WORK(&pool->shrink_work, shrink_worker); zswap_pool_debug("created", pool); @@ -964,16 +1066,14 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, * the swap cache, the compressed version stored by zswap can be * freed. */ -static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) +static int zswap_writeback_entry(struct zswap_entry *entry, + struct zswap_tree *tree) { - struct zswap_header *zhdr; - swp_entry_t swpentry; - struct zswap_tree *tree; - pgoff_t offset; - struct zswap_entry *entry; + swp_entry_t swpentry = entry->swpentry; struct page *page; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; + struct zpool *pool = entry->pool->zpool; u8 *src, *tmp = NULL; unsigned int dlen; @@ -988,25 +1088,6 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) return -ENOMEM; } - /* extract swpentry from data */ - zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); - swpentry = zhdr->swpentry; /* here */ - tree = zswap_trees[swp_type(swpentry)]; - offset = swp_offset(swpentry); - zpool_unmap_handle(pool, handle); - - /* find and ref zswap entry */ - spin_lock(&tree->lock); - entry = zswap_entry_find_get(&tree->rbroot, offset); - if (!entry) { - /* entry was invalidated */ - spin_unlock(&tree->lock); - kfree(tmp); - return 0; - } - spin_unlock(&tree->lock); - BUG_ON(offset != entry->offset); - /* try to allocate swap cache page */ switch (zswap_get_swap_cache_page(swpentry, &page)) { case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */ @@ -1028,7 +1109,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) * writing. */ spin_lock(&tree->lock); - if (zswap_rb_search(&tree->rbroot, entry->offset) != entry) { + if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { spin_unlock(&tree->lock); delete_from_swap_cache(page_folio(page)); ret = -ENOMEM; @@ -1040,12 +1121,11 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); dlen = PAGE_SIZE; - zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); - src = (u8 *)zhdr + sizeof(struct zswap_header); + src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO); if (!zpool_can_sleep_mapped(pool)) { memcpy(tmp, src, entry->length); src = tmp; - zpool_unmap_handle(pool, handle); + zpool_unmap_handle(pool, entry->handle); } mutex_lock(acomp_ctx->mutex); @@ -1060,7 +1140,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) if (!zpool_can_sleep_mapped(pool)) kfree(tmp); else - zpool_unmap_handle(pool, handle); + zpool_unmap_handle(pool, entry->handle); BUG_ON(ret); BUG_ON(dlen != PAGE_SIZE); @@ -1077,23 +1157,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) put_page(page); zswap_written_back_pages++; - spin_lock(&tree->lock); - /* drop local reference */ - zswap_entry_put(tree, entry); - - /* - * There are two possible situations for entry here: - * (1) refcount is 1(normal case), entry is valid and on the tree - * (2) refcount is 0, entry is freed and not on the tree - * because invalidate happened during writeback - * search the tree and free the entry if find entry - */ - if (entry == zswap_rb_search(&tree->rbroot, offset)) - zswap_entry_put(tree, entry); - spin_unlock(&tree->lock); - return ret; - fail: if (!zpool_can_sleep_mapped(pool)) kfree(tmp); @@ -1102,13 +1166,8 @@ fail: * if we get here due to ZSWAP_SWAPCACHE_EXIST * a load may be happening concurrently. * it is safe and okay to not free the entry. - * if we free the entry in the following put * it is also okay to return !0 */ - spin_lock(&tree->lock); - zswap_entry_put(tree, entry); - spin_unlock(&tree->lock); - return ret; } @@ -1156,11 +1215,10 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, struct obj_cgroup *objcg = NULL; struct zswap_pool *pool; int ret; - unsigned int hlen, dlen = PAGE_SIZE; + unsigned int dlen = PAGE_SIZE; unsigned long handle, value; char *buf; u8 *src, *dst; - struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) }; gfp_t gfp; /* THP isn't supported */ @@ -1195,7 +1253,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, if (zswap_pool_reached_full) { if (!zswap_can_accept()) { ret = -ENOMEM; - goto reject; + goto shrink; } else zswap_pool_reached_full = false; } @@ -1212,7 +1270,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, src = kmap_atomic(page); if (zswap_is_page_same_filled(src, &value)) { kunmap_atomic(src); - entry->offset = offset; + entry->swpentry = swp_entry(type, offset); entry->length = 0; entry->value = value; atomic_inc(&zswap_same_filled_pages); @@ -1266,11 +1324,10 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, } /* store */ - hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0; gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; if (zpool_malloc_support_movable(entry->pool->zpool)) gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; - ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle); + ret = zpool_malloc(entry->pool->zpool, dlen, gfp, &handle); if (ret == -ENOSPC) { zswap_reject_compress_poor++; goto put_dstmem; @@ -1280,13 +1337,12 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, goto put_dstmem; } buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO); - memcpy(buf, &zhdr, hlen); - memcpy(buf + hlen, dst, dlen); + memcpy(buf, dst, dlen); zpool_unmap_handle(entry->pool->zpool, handle); mutex_unlock(acomp_ctx->mutex); /* populate entry */ - entry->offset = offset; + entry->swpentry = swp_entry(type, offset); entry->handle = handle; entry->length = dlen; @@ -1309,6 +1365,11 @@ insert_entry: zswap_entry_put(tree, dupentry); } } while (ret == -EEXIST); + if (entry->length) { + spin_lock(&entry->pool->lru_lock); + list_add(&entry->lru, &entry->pool->lru); + spin_unlock(&entry->pool->lru_lock); + } spin_unlock(&tree->lock); /* update stats */ @@ -1341,7 +1402,7 @@ shrink: * return -1 on entry not found or error */ static int zswap_frontswap_load(unsigned type, pgoff_t offset, - struct page *page) + struct page *page, bool *exclusive) { struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry; @@ -1380,8 +1441,6 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, /* decompress */ dlen = PAGE_SIZE; src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO); - if (zpool_evictable(entry->pool->zpool)) - src += sizeof(struct zswap_header); if (!zpool_can_sleep_mapped(entry->pool->zpool)) { memcpy(tmp, src, entry->length); @@ -1410,6 +1469,14 @@ stats: count_objcg_event(entry->objcg, ZSWPIN); freeentry: spin_lock(&tree->lock); + if (!ret && zswap_exclusive_loads_enabled) { + zswap_invalidate_entry(tree, entry); + *exclusive = true; + } else if (entry->length) { + spin_lock(&entry->pool->lru_lock); + list_move(&entry->lru, &entry->pool->lru); + spin_unlock(&entry->pool->lru_lock); + } zswap_entry_put(tree, entry); spin_unlock(&tree->lock); @@ -1430,13 +1497,7 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) spin_unlock(&tree->lock); return; } - - /* remove from rbtree */ - zswap_rb_erase(&tree->rbroot, entry); - - /* drop the initial reference from entry creation */ - zswap_entry_put(tree, entry); - + zswap_invalidate_entry(tree, entry); spin_unlock(&tree->lock); } |